remove old unused stuff
This commit is contained in:
@@ -7,12 +7,8 @@ add_pim_library(OMONNXToSpatial
|
|||||||
Patterns/Math/Conv.cpp
|
Patterns/Math/Conv.cpp
|
||||||
Patterns/Math/MatMul.cpp
|
Patterns/Math/MatMul.cpp
|
||||||
Patterns/NN/Pool.cpp
|
Patterns/NN/Pool.cpp
|
||||||
Patterns/NN/ReduceMean.cpp
|
|
||||||
Patterns/Tensor/Concat.cpp
|
Patterns/Tensor/Concat.cpp
|
||||||
Patterns/Tensor/Reshape.cpp
|
Patterns/Tensor/Reshape.cpp
|
||||||
Utils/SpatialReducer.cpp
|
|
||||||
Utils/WeightSubdivider.cpp
|
|
||||||
Utils/AnnotateReplication.cpp
|
|
||||||
ONNXToSpatialPass.cpp
|
ONNXToSpatialPass.cpp
|
||||||
Common.cpp
|
Common.cpp
|
||||||
|
|
||||||
|
|||||||
@@ -57,8 +57,6 @@ inline auto getFilterCount(const ShapedType& shapedType) {
|
|||||||
return shapedType.getDimSize(0);
|
return shapedType.getDimSize(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline constexpr mlir::StringRef REPLICATION_ATTR_NAME = "replication_factor";
|
|
||||||
|
|
||||||
using HSliceId = size_t;
|
using HSliceId = size_t;
|
||||||
using CoreId = size_t;
|
using CoreId = size_t;
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,6 @@
|
|||||||
#include <fstream>
|
#include <fstream>
|
||||||
|
|
||||||
#include "Common/PimCommon.hpp"
|
#include "Common/PimCommon.hpp"
|
||||||
#include "Conversion/ONNXToSpatial/Utils/AnnotateReplication.hpp"
|
|
||||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Patterns.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Patterns.hpp"
|
||||||
#include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp"
|
#include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp"
|
||||||
@@ -68,11 +67,6 @@ void ONNXToSpatialPass::runOnOperation() {
|
|||||||
signalPassFailure();
|
signalPassFailure();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (annotateReplication(*entryFunc, rewriter).failed()) {
|
|
||||||
llvm::dbgs() << "Failed during annotation for replication analysis\n";
|
|
||||||
signalPassFailure();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
ConversionTarget target(*ctx);
|
ConversionTarget target(*ctx);
|
||||||
target.addLegalDialect<spatial::SpatialDialect, ONNXDialect, tensor::TensorDialect, arith::ArithDialect>();
|
target.addLegalDialect<spatial::SpatialDialect, ONNXDialect, tensor::TensorDialect, arith::ArithDialect>();
|
||||||
@@ -98,7 +92,6 @@ void ONNXToSpatialPass::runOnOperation() {
|
|||||||
populateReshapeConversionPattern(patterns, ctx);
|
populateReshapeConversionPattern(patterns, ctx);
|
||||||
|
|
||||||
populateONNXConcatToTensorConcatPattern(patterns, ctx);
|
populateONNXConcatToTensorConcatPattern(patterns, ctx);
|
||||||
populateReduceMeanConversionPattern(patterns, ctx);
|
|
||||||
|
|
||||||
if (failed(applyPartialConversion(moduleOp, target, std::move(patterns)))) {
|
if (failed(applyPartialConversion(moduleOp, target, std::move(patterns)))) {
|
||||||
signalPassFailure();
|
signalPassFailure();
|
||||||
|
|||||||
@@ -17,6 +17,4 @@ void populateONNXConcatToTensorConcatPattern(mlir::RewritePatternSet& patterns,
|
|||||||
|
|
||||||
void populateReshapeConversionPattern(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);
|
void populateReshapeConversionPattern(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);
|
||||||
|
|
||||||
void populateReduceMeanConversionPattern(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);
|
|
||||||
|
|
||||||
} // namespace onnx_mlir
|
} // namespace onnx_mlir
|
||||||
|
|||||||
@@ -5,14 +5,12 @@
|
|||||||
#include "mlir/Support/LogicalResult.h"
|
#include "mlir/Support/LogicalResult.h"
|
||||||
#include "mlir/Transforms/DialectConversion.h"
|
#include "mlir/Transforms/DialectConversion.h"
|
||||||
|
|
||||||
#include "llvm/ADT/STLExtras.h"
|
|
||||||
#include "llvm/ADT/SmallVector.h"
|
#include "llvm/ADT/SmallVector.h"
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
|
||||||
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
|
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp"
|
|
||||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||||
|
|
||||||
@@ -21,12 +19,8 @@ using namespace mlir;
|
|||||||
namespace onnx_mlir {
|
namespace onnx_mlir {
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
constexpr StringRef COMPUTE_HAS_SOFTMAX_DIVISOR_ATTRNAME = "computeWithSoftmaxDivisor";
|
static FailureOr<Value>
|
||||||
|
materializeScaledConstantTensor(Value value, float factor, ConversionPatternRewriter& rewriter, Location loc) {
|
||||||
static FailureOr<Value> materializeScaledConstantTensor(Value value,
|
|
||||||
float factor,
|
|
||||||
ConversionPatternRewriter& rewriter,
|
|
||||||
Location loc) {
|
|
||||||
if (factor == 1.0f)
|
if (factor == 1.0f)
|
||||||
return value;
|
return value;
|
||||||
|
|
||||||
@@ -70,16 +64,6 @@ struct GemvToSpatialCompute : OpConversionPattern<ONNXGemmOp> {
|
|||||||
LogicalResult matchAndRewrite(ONNXGemmOp gemmOp,
|
LogicalResult matchAndRewrite(ONNXGemmOp gemmOp,
|
||||||
ONNXGemmOpAdaptor gemmOpAdaptor,
|
ONNXGemmOpAdaptor gemmOpAdaptor,
|
||||||
ConversionPatternRewriter& rewriter) const override;
|
ConversionPatternRewriter& rewriter) const override;
|
||||||
|
|
||||||
private:
|
|
||||||
static Value resolveONNXExpOpFromUseChain(Value startValue);
|
|
||||||
|
|
||||||
static LogicalResult softmaxReductionApplication(SmallVector<OpAndResNum>& outputOpsAndResNums,
|
|
||||||
Value& softmaxChannel,
|
|
||||||
ConversionPatternRewriter& rewriter,
|
|
||||||
SpatialReducer& reducer,
|
|
||||||
ONNXGemmOp& gemmOp,
|
|
||||||
Location& loc);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
@@ -122,7 +106,13 @@ LogicalResult GemmToManyGemv::matchAndRewrite(ONNXGemmOp gemmOp,
|
|||||||
// Expand rank-1 bias [N] to rank-2 [1, N] for uniform handling
|
// Expand rank-1 bias [N] to rank-2 [1, N] for uniform handling
|
||||||
if (cType.getRank() == 1) {
|
if (cType.getRank() == 1) {
|
||||||
auto expandedType = RankedTensorType::get({1, cType.getDimSize(0)}, cType.getElementType());
|
auto expandedType = RankedTensorType::get({1, cType.getDimSize(0)}, cType.getElementType());
|
||||||
c = tensor::ExpandShapeOp::create(rewriter, loc, expandedType, c, SmallVector<ReassociationIndices>{{0, 1}});
|
c = tensor::ExpandShapeOp::create(rewriter,
|
||||||
|
loc,
|
||||||
|
expandedType,
|
||||||
|
c,
|
||||||
|
SmallVector<ReassociationIndices> {
|
||||||
|
{0, 1}
|
||||||
|
});
|
||||||
cType = expandedType;
|
cType = expandedType;
|
||||||
}
|
}
|
||||||
assert("Only support rank 2 tensor for C" && cType.getRank() == 2);
|
assert("Only support rank 2 tensor for C" && cType.getRank() == 2);
|
||||||
@@ -208,7 +198,13 @@ LogicalResult GemvToSpatialCompute::matchAndRewrite(ONNXGemmOp gemmOp,
|
|||||||
// Expand rank-1 bias [N] to rank-2 [1, N] for uniform handling
|
// Expand rank-1 bias [N] to rank-2 [1, N] for uniform handling
|
||||||
if (cType.getRank() == 1) {
|
if (cType.getRank() == 1) {
|
||||||
auto expandedType = RankedTensorType::get({1, cType.getDimSize(0)}, cType.getElementType());
|
auto expandedType = RankedTensorType::get({1, cType.getDimSize(0)}, cType.getElementType());
|
||||||
c = tensor::ExpandShapeOp::create(rewriter, gemmLoc, expandedType, c, SmallVector<ReassociationIndices>{{0, 1}});
|
c = tensor::ExpandShapeOp::create(rewriter,
|
||||||
|
gemmLoc,
|
||||||
|
expandedType,
|
||||||
|
c,
|
||||||
|
SmallVector<ReassociationIndices> {
|
||||||
|
{0, 1}
|
||||||
|
});
|
||||||
cType = expandedType;
|
cType = expandedType;
|
||||||
}
|
}
|
||||||
assert("Only support rank 2 tensor for C" && cType.getRank() == 2);
|
assert("Only support rank 2 tensor for C" && cType.getRank() == 2);
|
||||||
@@ -356,124 +352,6 @@ LogicalResult GemvToSpatialCompute::matchAndRewrite(ONNXGemmOp gemmOp,
|
|||||||
return success();
|
return success();
|
||||||
}
|
}
|
||||||
|
|
||||||
Value GemvToSpatialCompute::resolveONNXExpOpFromUseChain(Value startValue) {
|
|
||||||
Value walker = startValue;
|
|
||||||
|
|
||||||
while (!llvm::isa<ONNXExpOp>(walker.getDefiningOp())) {
|
|
||||||
walker = walker.getDefiningOp()->getOperand(0);
|
|
||||||
|
|
||||||
assert(walker && walker.getDefiningOp()
|
|
||||||
&& "Unwinded the whole chain of operations while trying to "
|
|
||||||
"find ONNXExpOp, but did not find it");
|
|
||||||
}
|
|
||||||
|
|
||||||
// Make sure the dividend is actually produced by an ONNXExpOp
|
|
||||||
assert(llvm::isa<ONNXExpOp>(walker.getDefiningOp())
|
|
||||||
&& "Old output tile (softmax reducer) is not produced by an "
|
|
||||||
"ONNXExpOp");
|
|
||||||
|
|
||||||
return walker;
|
|
||||||
}
|
|
||||||
|
|
||||||
LogicalResult GemvToSpatialCompute::softmaxReductionApplication(SmallVector<OpAndResNum>& outputOpsAndResNums,
|
|
||||||
Value& softmaxChannel,
|
|
||||||
ConversionPatternRewriter& rewriter,
|
|
||||||
SpatialReducer& reducer,
|
|
||||||
ONNXGemmOp& gemmOp,
|
|
||||||
Location& loc) {
|
|
||||||
// TODO: Check case with one compute op
|
|
||||||
|
|
||||||
// Cast vector of Value into vector of ComputeOp
|
|
||||||
SmallVector<ComputeAndResNum> softmaxOpsToReduce =
|
|
||||||
llvm::to_vector(llvm::map_range(outputOpsAndResNums, [&](OpAndResNum computeAndResNum) {
|
|
||||||
return std::make_pair(cast<spatial::SpatWeightedCompute>(computeAndResNum.first), computeAndResNum.second);
|
|
||||||
}));
|
|
||||||
|
|
||||||
RankedTensorType::Builder tensorTypeBuilder({1}, Float32Type::get(rewriter.getContext()), nullptr);
|
|
||||||
const TensorType scalarTensorType = tensorTypeBuilder;
|
|
||||||
|
|
||||||
reducer.applyReducePattern(
|
|
||||||
softmaxOpsToReduce,
|
|
||||||
[&](Value a, Value b) { return spatial::SpatVAddOp::create(rewriter, loc, scalarTensorType, a, b); },
|
|
||||||
/* preprocess = */
|
|
||||||
[&](Value a) { return spatial::SpatSumOp::create(rewriter, loc, scalarTensorType, a); },
|
|
||||||
[&](Value softmaxDivisor) {
|
|
||||||
// Signal that this is the compute with the softmax divisor
|
|
||||||
auto computeOp = cast<spatial::SpatWeightedCompute>(softmaxDivisor.getDefiningOp()->getParentOp());
|
|
||||||
computeOp->setAttr(COMPUTE_HAS_SOFTMAX_DIVISOR_ATTRNAME, rewriter.getUnitAttr());
|
|
||||||
|
|
||||||
// Broadcast the divisor to all the cores
|
|
||||||
rewriter.setInsertionPointAfterValue(softmaxDivisor);
|
|
||||||
spatial::SpatChannelBroadcastSendOp::create(rewriter, loc, softmaxChannel, softmaxDivisor);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* softmaxDividend = onnx.exp (...)
|
|
||||||
* sum = spat.SumOp(softmaxDividend)
|
|
||||||
* [following can be repeated N times, thus walk the use chain]
|
|
||||||
* softmaxDivisor = spat.sadd(sum, ...)
|
|
||||||
*/
|
|
||||||
Value softmaxDividend = resolveONNXExpOpFromUseChain(softmaxDivisor.getDefiningOp()->getOperand(0));
|
|
||||||
|
|
||||||
// Make sure the dividend is actually produced by an ONNXExpOp
|
|
||||||
assert(llvm::isa<ONNXExpOp>(softmaxDividend.getDefiningOp())
|
|
||||||
&& "Dividend of softmax reduction is not an ONNXExpOp");
|
|
||||||
|
|
||||||
// Do not divide here, divide after this
|
|
||||||
return softmaxDivisor;
|
|
||||||
});
|
|
||||||
|
|
||||||
// In all the cores, insert a ChannelRecvOp and divide the output tile by
|
|
||||||
// the reduced denominator.
|
|
||||||
outputOpsAndResNums.clear();
|
|
||||||
outputOpsAndResNums.reserve(softmaxOpsToReduce.size());
|
|
||||||
for (auto& computeToDivideOpAndResNum : softmaxOpsToReduce) {
|
|
||||||
|
|
||||||
auto yieldOp = cast<spatial::SpatYieldOp>(computeToDivideOpAndResNum.first.getBody().front().getTerminator());
|
|
||||||
|
|
||||||
Value divisor;
|
|
||||||
|
|
||||||
// Check if this compute contains the softmax divisor: if so, find the
|
|
||||||
// ChannelBroadcastSendOp, otherwise receive the value from the channel
|
|
||||||
// using ChannelBroadcastReceiveOp
|
|
||||||
if (computeToDivideOpAndResNum.first->hasAttr(COMPUTE_HAS_SOFTMAX_DIVISOR_ATTRNAME)) {
|
|
||||||
|
|
||||||
bool found = false;
|
|
||||||
for (auto broadcastOp :
|
|
||||||
computeToDivideOpAndResNum.first.getBody().front().getOps<spatial::SpatChannelBroadcastSendOp>()) {
|
|
||||||
assert(found == false
|
|
||||||
&& "More than one ChannelBroadcastSendOp in "
|
|
||||||
"compute? How is this possible?");
|
|
||||||
found = true;
|
|
||||||
|
|
||||||
divisor = broadcastOp.getData();
|
|
||||||
}
|
|
||||||
|
|
||||||
assert(found
|
|
||||||
&& "No ChannelBroadcastSendOp in compute where softmax "
|
|
||||||
"divisor was specified to be?");
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
rewriter.setInsertionPoint(yieldOp);
|
|
||||||
divisor = spatial::SpatChannelBroadcastReceiveOp::create(rewriter, loc, scalarTensorType, softmaxChannel);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Walk the chain of operations until we find the ONNXExpOp: this is
|
|
||||||
// needed because some some may have a different amount of `VAddOp`s due
|
|
||||||
// to the tree reduction (e.g. some may have no VAddOp, some may have
|
|
||||||
// multiples)
|
|
||||||
Value oldOutputTile = resolveONNXExpOpFromUseChain(yieldOp->getOperand(computeToDivideOpAndResNum.second));
|
|
||||||
|
|
||||||
rewriter.setInsertionPoint(yieldOp);
|
|
||||||
Value newOutputTile = spatial::SpatVSDivOp::create(rewriter, loc, oldOutputTile.getType(), oldOutputTile, divisor);
|
|
||||||
auto yieldOperandNum = yieldOp->getNumOperands();
|
|
||||||
yieldOp->insertOperands(yieldOperandNum, newOutputTile);
|
|
||||||
|
|
||||||
outputOpsAndResNums.push_back({computeToDivideOpAndResNum.first, yieldOperandNum});
|
|
||||||
}
|
|
||||||
|
|
||||||
return success();
|
|
||||||
}
|
|
||||||
|
|
||||||
void populateOnnxGemmOpPatterns(RewritePatternSet& patterns, MLIRContext* ctx) {
|
void populateOnnxGemmOpPatterns(RewritePatternSet& patterns, MLIRContext* ctx) {
|
||||||
patterns.insert<GemmToManyGemv>(ctx);
|
patterns.insert<GemmToManyGemv>(ctx);
|
||||||
patterns.insert<GemvToSpatialCompute>(ctx);
|
patterns.insert<GemvToSpatialCompute>(ctx);
|
||||||
|
|||||||
@@ -1,89 +0,0 @@
|
|||||||
#include "mlir/Transforms/DialectConversion.h"
|
|
||||||
|
|
||||||
#include "Conversion/ONNXToSpatial/Patterns.hpp"
|
|
||||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
|
||||||
|
|
||||||
using namespace mlir;
|
|
||||||
namespace onnx_mlir {
|
|
||||||
|
|
||||||
struct ReduceMeanConversionPattern : public OpConversionPattern<ONNXReduceMeanV13Op> {
|
|
||||||
|
|
||||||
ReduceMeanConversionPattern(MLIRContext* ctx)
|
|
||||||
: OpConversionPattern(ctx) {}
|
|
||||||
|
|
||||||
LogicalResult matchAndRewrite(ONNXReduceMeanV13Op reduceMean,
|
|
||||||
ONNXReduceMeanV13OpAdaptor adaptor,
|
|
||||||
ConversionPatternRewriter& rewriter) const final {
|
|
||||||
|
|
||||||
// Get the input tensor.
|
|
||||||
Value inputTensor = adaptor.getData();
|
|
||||||
auto inputTensorType = cast<RankedTensorType>(inputTensor.getType());
|
|
||||||
|
|
||||||
// This pattern will substitute the ONNXReduceMeanV13Op with a
|
|
||||||
// ONNXAveragePoolOp with the same input tensor and an appropriate kernel
|
|
||||||
// shape and strides.
|
|
||||||
|
|
||||||
// To get the stride and shape of the kernel, we need to read the tensor
|
|
||||||
// shape.
|
|
||||||
int image_height = inputTensorType.getShape()[2];
|
|
||||||
int image_width = inputTensorType.getShape()[3];
|
|
||||||
|
|
||||||
// Define the kernel shape and strides.
|
|
||||||
SmallVector<int64_t> kernelShapeVals = {image_height, image_width};
|
|
||||||
SmallVector<int64_t> stridesVals = {image_height, image_width};
|
|
||||||
SmallVector<int64_t> dilationsVals = {1, 1};
|
|
||||||
|
|
||||||
// Set the pads to 0.
|
|
||||||
SmallVector<int64_t> padsVals = {0, 0, 0, 0};
|
|
||||||
|
|
||||||
// Create the ArrayAttrs
|
|
||||||
auto kernelShape = mlir::ArrayAttr::get(
|
|
||||||
rewriter.getContext(), llvm::to_vector(llvm::map_range(kernelShapeVals, [&](int64_t v) -> mlir::Attribute {
|
|
||||||
return rewriter.getI64IntegerAttr(v);
|
|
||||||
})));
|
|
||||||
|
|
||||||
auto strides = mlir::ArrayAttr::get(rewriter.getContext(),
|
|
||||||
llvm::to_vector(llvm::map_range(stridesVals, [&](int64_t v) -> mlir::Attribute {
|
|
||||||
return rewriter.getI64IntegerAttr(v);
|
|
||||||
})));
|
|
||||||
|
|
||||||
auto dilations = mlir::ArrayAttr::get(
|
|
||||||
rewriter.getContext(), llvm::to_vector(llvm::map_range(dilationsVals, [&](int64_t v) -> mlir::Attribute {
|
|
||||||
return rewriter.getI64IntegerAttr(v);
|
|
||||||
})));
|
|
||||||
|
|
||||||
auto pads = mlir::ArrayAttr::get(rewriter.getContext(),
|
|
||||||
llvm::to_vector(llvm::map_range(padsVals, [&](int64_t v) -> mlir::Attribute {
|
|
||||||
return rewriter.getI64IntegerAttr(v);
|
|
||||||
})));
|
|
||||||
|
|
||||||
// Create the resulting tensor type.
|
|
||||||
auto resultType = RankedTensorType::get(
|
|
||||||
/*shape=*/ {inputTensorType.getShape()[0], inputTensorType.getShape()[1], 1, 1},
|
|
||||||
/*elementType=*/inputTensorType.getElementType());
|
|
||||||
|
|
||||||
// Create the ONNXAveragePoolOp.
|
|
||||||
auto averagePool = ONNXAveragePoolOp::create(rewriter,
|
|
||||||
reduceMean.getLoc(),
|
|
||||||
resultType,
|
|
||||||
inputTensor,
|
|
||||||
/*auto_pad=*/"NOTSET",
|
|
||||||
/*ceil_mode=*/0,
|
|
||||||
/*count_include_pad=*/1,
|
|
||||||
dilations,
|
|
||||||
/*kernel_shape=*/kernelShape,
|
|
||||||
/*pads=*/pads,
|
|
||||||
/*strides=*/strides);
|
|
||||||
|
|
||||||
// Replace the ONNXReduceMeanV13Op with the ONNXAveragePoolOp.
|
|
||||||
rewriter.replaceOp(reduceMean, averagePool.getResult());
|
|
||||||
|
|
||||||
return success();
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
void populateReduceMeanConversionPattern(RewritePatternSet& patterns, MLIRContext* ctx) {
|
|
||||||
patterns.insert<ReduceMeanConversionPattern>(ctx);
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace onnx_mlir
|
|
||||||
@@ -1,119 +0,0 @@
|
|||||||
#include <queue>
|
|
||||||
|
|
||||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common.hpp"
|
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/AnnotateReplication.hpp"
|
|
||||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
|
||||||
|
|
||||||
using namespace mlir;
|
|
||||||
|
|
||||||
namespace onnx_mlir {
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief Structure that describes the replication of a convolution operation,
|
|
||||||
* along the image height axis.
|
|
||||||
*/
|
|
||||||
struct ConvReplication {
|
|
||||||
ONNXConvOp convOp; // Convolution operation
|
|
||||||
size_t input_w; // Width of the input image
|
|
||||||
size_t replicationFactor; // Replication factor on the image height axis
|
|
||||||
size_t coresNeededPerReplica; // Number of cores needed for each replica
|
|
||||||
|
|
||||||
friend bool operator<(const ConvReplication& a, const ConvReplication& b) {
|
|
||||||
return a.input_w / a.replicationFactor < b.input_w / b.replicationFactor;
|
|
||||||
}
|
|
||||||
|
|
||||||
ConvReplication(ONNXConvOp convOp, size_t input_w, size_t replicationFactor, size_t coresNeededPerReplica)
|
|
||||||
: convOp(convOp),
|
|
||||||
input_w(input_w),
|
|
||||||
replicationFactor(replicationFactor),
|
|
||||||
coresNeededPerReplica(coresNeededPerReplica) {}
|
|
||||||
};
|
|
||||||
|
|
||||||
LogicalResult annotateReplication(mlir::func::FuncOp funcOp, mlir::IRRewriter& rewriter) {
|
|
||||||
|
|
||||||
if (coresCount == -1) {
|
|
||||||
// No need for annotation, implicitly set replication to 1
|
|
||||||
return success();
|
|
||||||
}
|
|
||||||
|
|
||||||
std::priority_queue<struct ConvReplication> convOpsReplicationQueue;
|
|
||||||
|
|
||||||
size_t minimumCores = 0;
|
|
||||||
|
|
||||||
for (auto& op : funcOp.getFunctionBody().begin()->getOperations()) {
|
|
||||||
if (auto convOp = dyn_cast<ONNXConvOp>(op)) {
|
|
||||||
// Convolution layer
|
|
||||||
|
|
||||||
Value X = convOp.getX(), W = convOp.getW();
|
|
||||||
ShapedType xShape = mlir::cast<ShapedType>(X.getType());
|
|
||||||
ShapedType wShape = mlir::cast<ShapedType>(W.getType());
|
|
||||||
|
|
||||||
size_t input_w = getImageWidth(xShape);
|
|
||||||
size_t krn_h = getKernelHeight(wShape);
|
|
||||||
size_t krn_w = getKernelWidth(wShape);
|
|
||||||
|
|
||||||
size_t inputTileCount = ceilIntegerDivide(getImageChannel(xShape), crossbarSize.getValue());
|
|
||||||
size_t outputTileCount = ceilIntegerDivide(wShape.getDimSize(0), crossbarSize.getValue());
|
|
||||||
|
|
||||||
auto neededXbars = krn_h * krn_w * inputTileCount * outputTileCount;
|
|
||||||
auto neededCores = ceilIntegerDivide(neededXbars, crossbarCountInCore.getValue());
|
|
||||||
|
|
||||||
minimumCores += neededCores;
|
|
||||||
|
|
||||||
convOpsReplicationQueue.emplace(convOp, input_w, 1, neededCores);
|
|
||||||
}
|
|
||||||
else if (auto gemmOp = dyn_cast<ONNXGemmOp>(op)) {
|
|
||||||
// Fully connected layer
|
|
||||||
auto matrixTensorShape = cast<ShapedType>(gemmOp.getB().getType());
|
|
||||||
auto inputSize = matrixTensorShape.getDimSize(0);
|
|
||||||
auto outputSize = matrixTensorShape.getDimSize(1);
|
|
||||||
if (gemmOp.getTransB())
|
|
||||||
std::swap(inputSize, outputSize);
|
|
||||||
|
|
||||||
const size_t inputTilesCount = ceilIntegerDivide(inputSize, crossbarSize.getValue());
|
|
||||||
const size_t outputTilesCount = ceilIntegerDivide(outputSize, crossbarSize.getValue());
|
|
||||||
|
|
||||||
// Each output tile is computed by `coresPerOutputTile` cores. The
|
|
||||||
// entire input is given to each of these cores.
|
|
||||||
const size_t coresPerOutputTile = ceilIntegerDivide(inputTilesCount, crossbarCountInCore.getValue());
|
|
||||||
|
|
||||||
auto neededCores = coresPerOutputTile * outputTilesCount;
|
|
||||||
|
|
||||||
minimumCores += neededCores;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (static_cast<size_t>(coresCount) < minimumCores) {
|
|
||||||
return funcOp->emitError("Not enough cores for this network: ")
|
|
||||||
<< minimumCores << " cores needed, but only " << static_cast<size_t>(coresCount) << " available.";
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t availableCores = static_cast<size_t>(coresCount) - minimumCores;
|
|
||||||
|
|
||||||
// Consume all the elements in the queue
|
|
||||||
while (!convOpsReplicationQueue.empty()) {
|
|
||||||
auto convOpReplication = convOpsReplicationQueue.top();
|
|
||||||
convOpsReplicationQueue.pop();
|
|
||||||
|
|
||||||
// Check if we can replicate this convolution (e.g. we have enough cores)
|
|
||||||
if (availableCores > convOpReplication.coresNeededPerReplica * (convOpReplication.replicationFactor + 1)) {
|
|
||||||
// We can replicate this convolution: increment replicationFactor and put
|
|
||||||
// back in queue
|
|
||||||
availableCores -= convOpReplication.coresNeededPerReplica;
|
|
||||||
convOpReplication.replicationFactor++;
|
|
||||||
|
|
||||||
convOpsReplicationQueue.push(convOpReplication);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
// Cannot replicate this convolution anymore, annotate the operation
|
|
||||||
// with the replication factor
|
|
||||||
convOpReplication.convOp->setAttr(REPLICATION_ATTR_NAME,
|
|
||||||
rewriter.getI64IntegerAttr(convOpReplication.replicationFactor));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return success();
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace onnx_mlir
|
|
||||||
@@ -1,10 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "mlir/Dialect/Func/IR/FuncOps.h"
|
|
||||||
#include "mlir/IR/PatternMatch.h"
|
|
||||||
|
|
||||||
namespace onnx_mlir {
|
|
||||||
|
|
||||||
mlir::LogicalResult annotateReplication(mlir::func::FuncOp funcOp, mlir::IRRewriter& rewriter);
|
|
||||||
|
|
||||||
} // namespace onnx_mlir
|
|
||||||
@@ -1,348 +0,0 @@
|
|||||||
#include "mlir/IR/BuiltinAttributes.h"
|
|
||||||
#include "mlir/IR/Value.h"
|
|
||||||
|
|
||||||
#include "llvm/Support/raw_ostream.h"
|
|
||||||
|
|
||||||
#include <cassert>
|
|
||||||
#include <unordered_map>
|
|
||||||
#include <utility>
|
|
||||||
|
|
||||||
#include "SpatialReducer.hpp"
|
|
||||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
|
||||||
|
|
||||||
#define GET_COMP(computeOpAndResNum) std::get<0>(computeOpAndResNum)
|
|
||||||
#define GET_RES_NUM(computeOpAndResNum) std::get<1>(computeOpAndResNum)
|
|
||||||
|
|
||||||
namespace onnx_mlir {
|
|
||||||
|
|
||||||
llvm::SmallPtrSet<mlir::Operation*, 16> onnx_mlir::SpatialReducer::oldComputeOpsReplaced;
|
|
||||||
|
|
||||||
ResNum SpatialReducer::applyResultProcessing(ComputeAndResNum computeOpAndResNum,
|
|
||||||
std::function<mlir::Value(const mlir::Value&)> processFun,
|
|
||||||
mlir::ConversionPatternRewriter& rewriter) {
|
|
||||||
assert(processFun);
|
|
||||||
|
|
||||||
auto computeOp = GET_COMP(computeOpAndResNum);
|
|
||||||
auto resultNum = GET_RES_NUM(computeOpAndResNum);
|
|
||||||
|
|
||||||
spatial::SpatYieldOp yieldOp = mlir::cast<spatial::SpatYieldOp>(computeOp.getBody().front().getTerminator());
|
|
||||||
|
|
||||||
mlir::Value result = yieldOp->getOperand(resultNum);
|
|
||||||
rewriter.setInsertionPointAfterValue(result);
|
|
||||||
mlir::Value processedResult = processFun(result);
|
|
||||||
if (processedResult == result) {
|
|
||||||
// Sometimes we want processedResult to return the same value but do
|
|
||||||
// something else with it (e.g. in softmax we want to broadcast the value
|
|
||||||
// using a channel). In this case, we can just return the same value.
|
|
||||||
return resultNum;
|
|
||||||
}
|
|
||||||
|
|
||||||
yieldOp->insertOperands(yieldOp->getNumOperands(), processedResult);
|
|
||||||
|
|
||||||
return yieldOp.getNumOperands() - 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
OpAndResNum
|
|
||||||
SpatialReducer::applyReducePattern(llvm::SmallVector<ComputeAndResNum>& computeOpsAndResNum,
|
|
||||||
std::function<mlir::Value(const mlir::Value&, const mlir::Value&)> reduce,
|
|
||||||
std::function<mlir::Value(const mlir::Value&)> preprocess,
|
|
||||||
std::function<mlir::Value(const mlir::Value&)> postprocess) {
|
|
||||||
|
|
||||||
if (preprocess)
|
|
||||||
for (auto& computeOpAndResNum : computeOpsAndResNum)
|
|
||||||
GET_RES_NUM(computeOpAndResNum) = applyResultProcessing(computeOpAndResNum, preprocess, rewriter);
|
|
||||||
|
|
||||||
// It is possible that `computeOpsAndResNum` contains two entries for the same
|
|
||||||
// computeOp. In this case, we need to apply the reduction within-computef
|
|
||||||
|
|
||||||
// Keep a map between a computeOp and the last Value for this reduction
|
|
||||||
std::unordered_map<mlir::Operation*, mlir::Value> lastValueForCompute;
|
|
||||||
for (auto& computeOpAndResNum : computeOpsAndResNum) {
|
|
||||||
auto computeOp = GET_COMP(computeOpAndResNum);
|
|
||||||
auto yieldOp = mlir::cast<spatial::SpatYieldOp>(computeOp.getBody().front().getTerminator());
|
|
||||||
mlir::Value valueWithinCompute = yieldOp->getOperand(GET_RES_NUM(computeOpAndResNum));
|
|
||||||
|
|
||||||
auto it = lastValueForCompute.find(computeOp.getOperation());
|
|
||||||
|
|
||||||
if (it != lastValueForCompute.end()) {
|
|
||||||
// If we have already seen this computeOp, apply the reduction
|
|
||||||
// within-compute
|
|
||||||
mlir::Value lastWithinComputeValue = it->second;
|
|
||||||
|
|
||||||
assert(valueWithinCompute.getDefiningOp() && lastWithinComputeValue.getDefiningOp());
|
|
||||||
|
|
||||||
if (valueWithinCompute.getDefiningOp()->isBeforeInBlock(lastWithinComputeValue.getDefiningOp()))
|
|
||||||
rewriter.setInsertionPointAfterValue(lastWithinComputeValue);
|
|
||||||
else
|
|
||||||
rewriter.setInsertionPointAfterValue(valueWithinCompute);
|
|
||||||
valueWithinCompute = reduce(lastWithinComputeValue, valueWithinCompute);
|
|
||||||
lastValueForCompute[computeOp.getOperation()] = valueWithinCompute;
|
|
||||||
}
|
|
||||||
|
|
||||||
lastValueForCompute[computeOp.getOperation()] = valueWithinCompute;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Now, reconstruct from the map the computeOpsAndResNum list
|
|
||||||
computeOpsAndResNum.clear();
|
|
||||||
computeOpsAndResNum.reserve(lastValueForCompute.size());
|
|
||||||
for (auto& entry : lastValueForCompute) {
|
|
||||||
auto computeOp = mlir::cast<spatial::SpatWeightedCompute>(entry.first);
|
|
||||||
auto valueWithinCompute = entry.second;
|
|
||||||
|
|
||||||
// We check if `valueWithinCompute` is already used by the yieldOp, in that
|
|
||||||
// case no need to add it
|
|
||||||
auto yieldOp = mlir::cast<spatial::SpatYieldOp>(computeOp.getBody().front().getTerminator());
|
|
||||||
bool yieldOpUseFound = false;
|
|
||||||
for (auto& use : valueWithinCompute.getUses()) {
|
|
||||||
if (use.getOwner() == yieldOp.getOperation()) {
|
|
||||||
// If the value is already used by the yieldOp, we can just use it
|
|
||||||
computeOpsAndResNum.push_back({computeOp, use.getOperandNumber()});
|
|
||||||
yieldOpUseFound = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (yieldOpUseFound)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
// If this result is not used within a yieldOp, then add it
|
|
||||||
auto resultNum = yieldOp->getNumOperands();
|
|
||||||
yieldOp->insertOperands(resultNum, valueWithinCompute);
|
|
||||||
|
|
||||||
computeOpsAndResNum.push_back({computeOp, resultNum});
|
|
||||||
}
|
|
||||||
|
|
||||||
mlir::Location loc = GET_COMP(computeOpsAndResNum[0])->getLoc();
|
|
||||||
|
|
||||||
// Recursive algorithm to reduce the inputs to a single one:
|
|
||||||
// - Take two inputs at a time, and reduce them into a single one, updating
|
|
||||||
// the computeOpsAndResNum list which becomes half the size.
|
|
||||||
// - Repeat until there is only one input left.
|
|
||||||
llvm::OwningArrayRef<ComputeAndResNum> computeOpsRef(computeOpsAndResNum);
|
|
||||||
while (computeOpsRef.size() > 1) {
|
|
||||||
llvm::SmallVector<ComputeAndResNum> nextComputeOps;
|
|
||||||
nextComputeOps.reserve(computeOpsRef.size() / 2);
|
|
||||||
for (size_t i = 0; i < computeOpsRef.size() - 1; i += 2) {
|
|
||||||
auto [firstCompute, firstResultNum] = computeOpsRef[i];
|
|
||||||
auto [secondCompute, secondResultNum] = computeOpsRef[i + 1];
|
|
||||||
|
|
||||||
if (secondCompute->isBeforeInBlock(firstCompute)) {
|
|
||||||
std::swap(firstCompute, secondCompute);
|
|
||||||
std::swap(firstResultNum, secondResultNum);
|
|
||||||
}
|
|
||||||
|
|
||||||
// We do not immediately alter the computeOps results/operands, instead we
|
|
||||||
// do it in a delayed manner, to avoid invalidating the references to the
|
|
||||||
// computeOps (which must be replaced by a cloned ComputeOp when changing
|
|
||||||
// the number of results)
|
|
||||||
// See below `reducerChanges.push_back` and `finalizeReduceUpdates`
|
|
||||||
|
|
||||||
auto yieldOpFirstCompute = mlir::cast<spatial::SpatYieldOp>(firstCompute.getBody().front().getTerminator());
|
|
||||||
|
|
||||||
// Add a new operand to the block of the second computeOp
|
|
||||||
mlir::Block& secondBlock = secondCompute.getBody().front();
|
|
||||||
mlir::Value formerRes1 = secondBlock.addArgument(yieldOpFirstCompute->getOperand(firstResultNum).getType(), loc);
|
|
||||||
|
|
||||||
auto secondComputeWeightsNum =
|
|
||||||
secondCompute->getAttrOfType<mlir::DenseI32ArrayAttr>(secondCompute.getOperandSegmentSizesAttrName())[0];
|
|
||||||
auto secondComputeOperandNum = secondComputeWeightsNum + secondBlock.getNumArguments() - 1;
|
|
||||||
|
|
||||||
// Take the "former-result" from the second computeOp
|
|
||||||
spatial::SpatYieldOp secondYield = mlir::cast<spatial::SpatYieldOp>(secondBlock.getTerminator());
|
|
||||||
mlir::Value formerRes2 = secondYield.getOperand(secondResultNum);
|
|
||||||
|
|
||||||
// Apply reduction operation
|
|
||||||
rewriter.setInsertionPoint(secondYield);
|
|
||||||
mlir::Value reduced = reduce(formerRes2, formerRes1);
|
|
||||||
|
|
||||||
// Unfortunately, it is not possible to update the result in place,
|
|
||||||
// because we may have already referenced it by <computeOp, resultNum>
|
|
||||||
// outside of this function, thus replacing it would invalidate the
|
|
||||||
// reference. Therefore, we need to append a new result to the yieldOp,
|
|
||||||
// and then at a later stage update the computeOp accordingly.
|
|
||||||
|
|
||||||
// Add `reduced` to the second yieldOp
|
|
||||||
auto secondYieldOperandNum = secondYield.getNumOperands();
|
|
||||||
secondYield->insertOperands(secondYieldOperandNum, reduced);
|
|
||||||
secondResultNum = secondYieldOperandNum;
|
|
||||||
|
|
||||||
// We should also add an entry for updating the results of the last
|
|
||||||
// operation (the one which never becomes a `firstCompute`): because it is
|
|
||||||
// not tracked by reducerChanges as `fromOp`
|
|
||||||
reducerChanges.push_back(
|
|
||||||
{firstCompute.getOperation(), firstResultNum, secondCompute.getOperation(), secondComputeOperandNum});
|
|
||||||
nextComputeOps.push_back(std::make_pair(secondCompute, secondResultNum));
|
|
||||||
}
|
|
||||||
|
|
||||||
// If we have an odd number of inputs, we need to add the last one to the
|
|
||||||
// newInputs list.
|
|
||||||
if (computeOpsRef.size() % 2 == 1)
|
|
||||||
nextComputeOps.push_back(computeOpsRef.back());
|
|
||||||
|
|
||||||
// Replace the inputOps list with the new one.
|
|
||||||
computeOpsRef = llvm::OwningArrayRef<ComputeAndResNum>(std::move(nextComputeOps));
|
|
||||||
}
|
|
||||||
|
|
||||||
assert(computeOpsRef.size() == 1 && "Internal error: expected a single input at this point.");
|
|
||||||
|
|
||||||
auto finalComputeAndResNum = computeOpsRef[0];
|
|
||||||
|
|
||||||
// Force the update of the results of this computeOp, when finalizing
|
|
||||||
computeOpNeedingResUpdate.push_back(GET_COMP(finalComputeAndResNum));
|
|
||||||
|
|
||||||
if (postprocess)
|
|
||||||
GET_RES_NUM(finalComputeAndResNum) = applyResultProcessing(finalComputeAndResNum, postprocess, rewriter);
|
|
||||||
|
|
||||||
return std::make_pair(GET_COMP(finalComputeAndResNum).getOperation(), GET_RES_NUM(finalComputeAndResNum));
|
|
||||||
}
|
|
||||||
|
|
||||||
void SpatialReducer::finalizeReduceUpdates() {
|
|
||||||
assert(reducesFinalized == false && "Cannot finalize two times.");
|
|
||||||
|
|
||||||
reducesFinalized = true;
|
|
||||||
|
|
||||||
// First, add the results to the computeOps
|
|
||||||
for (auto& reduceChange : reducerChanges)
|
|
||||||
updateResultsOfCompute(reduceChange.fromOp);
|
|
||||||
|
|
||||||
for (auto& c : computeOpNeedingResUpdate)
|
|
||||||
updateResultsOfCompute(c.getOperation());
|
|
||||||
|
|
||||||
for (auto& reducerChange : this->reducerChanges) {
|
|
||||||
auto fromOp = reducerChange.fromOp;
|
|
||||||
auto toOp = reducerChange.toOp;
|
|
||||||
auto fromOpResNum = reducerChange.fromOpResNum;
|
|
||||||
auto toOpOperandNum = reducerChange.toOpOperandNum;
|
|
||||||
|
|
||||||
auto fromComputeOp = opToReplacedCompute[fromOp];
|
|
||||||
assert(fromComputeOp && "fromOp should have been mapped before!");
|
|
||||||
|
|
||||||
// toComputeOp could be the existing pointer, or we have to remap it with
|
|
||||||
// `opToReplacedCompute`
|
|
||||||
auto toComputeOp = opToReplacedCompute[toOp];
|
|
||||||
if (!toComputeOp)
|
|
||||||
toComputeOp = mlir::cast<spatial::SpatWeightedCompute>(toOp);
|
|
||||||
|
|
||||||
assert(toComputeOp != fromComputeOp && "Oops should have caught this earlier!");
|
|
||||||
|
|
||||||
assert(toComputeOp->getNumOperands() == toOpOperandNum
|
|
||||||
&& "toOpOperandNum should be the last operand of toComputeOp, are the "
|
|
||||||
"operations in the right order?");
|
|
||||||
|
|
||||||
// Add the new operand to `toComputeOp`
|
|
||||||
auto fromResult = fromComputeOp.getResult(fromOpResNum);
|
|
||||||
toComputeOp->insertOperands(toOpOperandNum, fromResult);
|
|
||||||
incrementWeightedComputeInputsSegmentSize(toComputeOp, 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Turns an `<operation, result number>` reference into the actual SSA value.
///
/// If the operation has an entry in `opToReplacedCompute`, the mapped compute
/// op is used; otherwise the referenced operation itself is used. The chosen
/// operation is cast to `spatial::SpatWeightedCompute` and its result at index
/// `opAndResNum.second` is returned.
///
/// Only valid after `finalizeReduceUpdates` has run (asserted).
mlir::Value SpatialReducer::resolveValueFromOpAndResNum(OpAndResNum& opAndResNum) {
  assert(reducesFinalized && "Cannot create resolve values before finalizing the reduce updates.");

  // Start from the referenced operation and switch to the mapped one, if any.
  mlir::Operation* resolvedOp = opAndResNum.first;
  if (auto entry = opToReplacedCompute.find(resolvedOp); entry != opToReplacedCompute.end())
    resolvedOp = entry->second;

  auto resolvedCompute = mlir::cast<spatial::SpatWeightedCompute>(resolvedOp);
  return resolvedCompute.getResult(opAndResNum.second);
}
|
|
||||||
|
|
||||||
/// Makes the result list of a compute op match the operands of its yield op.
///
/// - If `computeOp` already has an entry in `opToReplacedCompute`, nothing is
///   done.
/// - If the yield op has as many operands as the compute op has results, the
///   op is registered in `opToReplacedCompute` as its own replacement.
/// - Otherwise a new `SpatWeightedCompute` is created right before the old
///   one, with the yield operand types as result types and the same weights
///   and inputs. It takes over the old body, the uses of the old results are
///   redirected to it (except uses inside ops listed in
///   `oldComputeOpsReplaced`), and the old op is erased.
void SpatialReducer::updateResultsOfCompute(mlir::Operation* computeOp) {
  // Already processed (replaced or registered as-is): nothing left to do.
  if (opToReplacedCompute.count(computeOp) != 0)
    return;

  auto staleCompute = mlir::cast<spatial::SpatWeightedCompute>(computeOp);
  auto operandCountBefore = staleCompute->getNumOperands();
  auto terminator = mlir::cast<spatial::SpatYieldOp>(staleCompute.getBody().front().getTerminator());

  // No result was added: the op stands in for itself in the map.
  if (terminator.getNumOperands() == staleCompute->getNumResults()) {
    opToReplacedCompute[staleCompute.getOperation()] = staleCompute;
    return;
  }

  // The yield operands dictate the new result types.
  auto updatedResultTypes = terminator.getOperandTypes();

  // Build the replacement next to the old op: new results, same operands.
  rewriter.setInsertionPoint(staleCompute);
  auto freshCompute = spatial::SpatWeightedCompute::create(
      rewriter, staleCompute->getLoc(), updatedResultTypes, staleCompute.getWeights(), staleCompute.getInputs());

  freshCompute.getBody().takeBody(staleCompute.getBody());

  auto operandCountAfter = freshCompute->getNumOperands();
  assert(operandCountBefore == operandCountAfter);

  // Redirect the uses of every pre-existing result to the replacement, leaving
  // alone the uses that sit in previously replaced compute ops.
  for (size_t resultIdx = 0; resultIdx < staleCompute.getNumResults(); ++resultIdx) {
    mlir::Value staleResult = staleCompute.getResult(resultIdx);
    mlir::Value freshResult = freshCompute.getResult(resultIdx);
    rewriter.replaceAllUsesExcept(staleResult, freshResult, oldComputeOpsReplaced);
  }

  // Record the replacement, then drop the old op.
  opToReplacedCompute[staleCompute.getOperation()] = freshCompute;
  oldComputeOpsReplaced.insert(staleCompute.getOperation());
  rewriter.setInsertionPoint(staleCompute);
  rewriter.eraseOp(staleCompute);
}
|
|
||||||
|
|
||||||
/// Resolves a 3-level grid of `<operation, result number>` references into
/// values and forwards them to the free function
/// `::onnx_mlir::createImgConcatOp`.
///
/// `outputTiles` is indexed as `[channelTile][x][y]`. The grid dimensions are
/// read from `outputTiles`, `outputTiles[0]` and `outputTiles[0][0]`, so the
/// input must be non-empty at every level, and every entry is indexed with
/// those same bounds.
///
/// Only valid after `finalizeReduceUpdates` has run (asserted).
mlir::Value
SpatialReducer::createImgConcatOp(llvm::SmallVector<llvm::SmallVector<llvm::SmallVector<OpAndResNum>>>& outputTiles,
                                  mlir::Location& loc,
                                  mlir::Type outputType) {
  assert(reducesFinalized && "Cannot create ImgConcatOp before finalizing the reduce updates.");

  using ValueColumn = llvm::SmallVector<mlir::Value>;
  using ValuePlane = llvm::SmallVector<ValueColumn>;

  // Grid extents, taken from the first entry of each level.
  const size_t channelTiles = outputTiles.size();
  const size_t columns = outputTiles[0].size();
  const size_t rows = outputTiles[0][0].size();

  // Pre-sized grid of values with the same extents as the input.
  llvm::SmallVector<ValuePlane> resolvedTiles(channelTiles, ValuePlane(columns, ValueColumn(rows)));

  for (size_t tile = 0; tile < channelTiles; ++tile) {
    for (size_t col = 0; col < columns; ++col) {
      for (size_t row = 0; row < rows; ++row)
        resolvedTiles[tile][col][row] = resolveValueFromOpAndResNum(outputTiles[tile][col][row]);
    }
  }

  return ::onnx_mlir::createImgConcatOp(resolvedTiles, rewriter, loc, outputType);
}
|
|
||||||
|
|
||||||
/// Reduces `computeOps` with `SpatVAddOp` as the combining operation, through
/// `applyReducePattern`, with no preprocessing step.
///
/// When `mapOp` is not `MapOperations::None`, a postprocessing step is
/// supplied: it first adds `biasTile` to the reduced value (only if `biasTile`
/// is non-null) and then passes the value to `createMapOperation`. When
/// `mapOp` is `MapOperations::None`, no postprocessing is supplied and
/// `biasTile` is not used.
///
/// Note that the `rewriter` parameter shadows the member of the same name
/// within this function.
OpAndResNum SpatialReducer::applyAddMapReduction(llvm::SmallVector<ComputeAndResNum>& computeOps,
                                                 mlir::ConversionPatternRewriter& rewriter,
                                                 mlir::Value biasTile,
                                                 MapOperations mapOp) {
  // An empty std::function means "no postprocessing".
  std::function<mlir::Value(const mlir::Value&)> finishResult;

  if (mapOp != MapOperations::None) {
    finishResult = [&rewriter, &biasTile, &mapOp](const mlir::Value reducedValue) {
      mlir::Value mapInput = reducedValue;
      if (biasTile) {
        mapInput =
            spatial::SpatVAddOp::create(rewriter, reducedValue.getLoc(), reducedValue.getType(), reducedValue, biasTile);
      }
      return createMapOperation(rewriter, mapOp, mapInput);
    };
  }

  // Pairwise combination step: element-wise addition of the two values.
  auto addPair = [&rewriter](mlir::Value lhs, mlir::Value rhs) {
    return spatial::SpatVAddOp::create(rewriter, lhs.getLoc(), lhs.getType(), lhs, rhs);
  };

  return this->applyReducePattern(computeOps, addPair, /* preprocess = */ nullptr, finishResult);
}
|
|
||||||
|
|
||||||
} // namespace onnx_mlir
|
|
||||||
@@ -1,88 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "llvm/ADT/SmallPtrSet.h"
|
|
||||||
#include "llvm/Support/Casting.h"
|
|
||||||
|
|
||||||
#include <functional>
|
|
||||||
#include <unordered_map>
|
|
||||||
#include <utility>
|
|
||||||
|
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common.hpp"
|
|
||||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
|
||||||
|
|
||||||
namespace onnx_mlir {
|
|
||||||
|
|
||||||
using ResNum = unsigned int;
|
|
||||||
|
|
||||||
using ComputeAndResNum = std::pair<spatial::SpatWeightedCompute, ResNum>;
|
|
||||||
|
|
||||||
struct SpatialReducerChange {
|
|
||||||
mlir::Operation* fromOp;
|
|
||||||
unsigned int fromOpResNum;
|
|
||||||
mlir::Operation* toOp;
|
|
||||||
unsigned int toOpOperandNum;
|
|
||||||
};
|
|
||||||
|
|
||||||
using OpAndResNum = std::pair<mlir::Operation*, ResNum>;
|
|
||||||
|
|
||||||
class SpatialReducer {
|
|
||||||
|
|
||||||
public:
|
|
||||||
SpatialReducer(mlir::ConversionPatternRewriter& rewriter)
|
|
||||||
: rewriter(rewriter) {}
|
|
||||||
|
|
||||||
OpAndResNum applyReducePattern(llvm::SmallVector<ComputeAndResNum>& computeOpsAndResNum,
|
|
||||||
std::function<mlir::Value(const mlir::Value&, const mlir::Value&)> reduce,
|
|
||||||
std::function<mlir::Value(const mlir::Value&)> preprocess,
|
|
||||||
std::function<mlir::Value(const mlir::Value&)> postprocess);
|
|
||||||
|
|
||||||
OpAndResNum applyAddMapReduction(llvm::SmallVector<ComputeAndResNum>& computeOps,
|
|
||||||
mlir::ConversionPatternRewriter& rewriter,
|
|
||||||
mlir::Value biasTile,
|
|
||||||
MapOperations mapOp);
|
|
||||||
|
|
||||||
void finalizeReduceUpdates();
|
|
||||||
|
|
||||||
~SpatialReducer() {
|
|
||||||
if (!reducesFinalized)
|
|
||||||
finalizeReduceUpdates();
|
|
||||||
}
|
|
||||||
|
|
||||||
mlir::Value createImgConcatOp(llvm::SmallVector<llvm::SmallVector<llvm::SmallVector<OpAndResNum>>>& outputTiles,
|
|
||||||
mlir::Location& loc,
|
|
||||||
mlir::Type outputType);
|
|
||||||
|
|
||||||
mlir::Value resolveValueFromOpAndResNum(OpAndResNum& opAndResNum);
|
|
||||||
|
|
||||||
private:
|
|
||||||
[[nodiscard("computeOp result number gets updated")]] ResNum
|
|
||||||
applyResultProcessing(ComputeAndResNum computeOpAndResNum,
|
|
||||||
std::function<mlir::Value(const mlir::Value&)> processFun,
|
|
||||||
mlir::ConversionPatternRewriter& rewriter);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief Update the results of a ComputeOp.
|
|
||||||
*
|
|
||||||
* This function updates the results of a ComputeOp by taking a look at the
|
|
||||||
operands of its yieldOp.
|
|
||||||
* If the ComputeOp was replaced, it updates `opToReplacedCompute` with the
|
|
||||||
replaced ComputeOp.
|
|
||||||
*
|
|
||||||
* @param computeOp The ComputeOp to update the results of.
|
|
||||||
*/
|
|
||||||
void updateResultsOfCompute(mlir::Operation* computeOp);
|
|
||||||
|
|
||||||
mlir::ConversionPatternRewriter& rewriter;
|
|
||||||
bool reducesFinalized = false;
|
|
||||||
|
|
||||||
// List of changes to be applied after the reduction is finalized
|
|
||||||
llvm::SmallVector<SpatialReducerChange, 4> reducerChanges;
|
|
||||||
// List of computeOps that need to be replaced with new results
|
|
||||||
llvm::SmallVector<spatial::SpatWeightedCompute> computeOpNeedingResUpdate;
|
|
||||||
|
|
||||||
std::unordered_map<mlir::Operation*, spatial::SpatWeightedCompute> opToReplacedCompute;
|
|
||||||
|
|
||||||
static llvm::SmallPtrSet<mlir::Operation*, 16> oldComputeOpsReplaced;
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace onnx_mlir
|
|
||||||
@@ -1,53 +0,0 @@
|
|||||||
#include <cassert>
|
|
||||||
|
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/WeightSubdivider.hpp"
|
|
||||||
|
|
||||||
namespace onnx_mlir {
|
|
||||||
|
|
||||||
/// Takes ownership of the weight map (outer key: input tile, inner key: output
/// tile). The by-value argument is moved into the member, not copied again.
WeightSubdivider::WeightSubdivider(std::map<long, std::map<long, llvm::SmallVector<mlir::Value>>> weights)
    : weights(std::move(weights)) {
}
|
|
||||||
|
|
||||||
/// Returns true when no input tile is left in the weight map.
bool WeightSubdivider::isEmpty() const {
  return weights.size() == 0;
}
|
|
||||||
|
|
||||||
/// Extracts up to `amount` weights from the first (input tile, output tile)
/// entry of the weight map.
///
/// The extracted weights are removed from the map. When the entry is consumed
/// entirely it is erased, and so is its input tile once it has no output tiles
/// left. `crossbarsUsed` grows by the number of extracted weights.
///
/// Preconditions (asserted): the map is not empty, and its first input tile
/// has at least one output tile.
///
/// @param amount Maximum number of weights to extract.
/// @return The input/output tile of the entry, the value `crossbarsUsed` had
///         before this call, and the extracted weights.
TaggedWeights WeightSubdivider::popGroup(size_t amount) {
  assert(!weights.empty() && "No weights to extract.");

  auto inputIt = weights.begin();
  auto& perOutputTile = inputIt->second;
  // Dereferencing begin() of an empty inner map would be undefined behaviour.
  assert(!perOutputTile.empty() && "Input tile without any output tile.");

  auto outputIt = perOutputTile.begin();
  llvm::SmallVector<mlir::Value>& values = outputIt->second;

  const long inputTile = inputIt->first;
  const long outputTile = outputIt->first;

  const size_t n = std::min(amount, values.size());
  const size_t startingCrossbarIndex = crossbarsUsed;
  crossbarsUsed += n;

  llvm::SmallVector<mlir::Value> result(values.begin(), values.begin() + n);

  if (n < values.size()) {
    // Entry only partially consumed: keep the remaining weights in place.
    values.erase(values.begin(), values.begin() + n);
  }
  else {
    // Entry fully consumed: erase through the iterators already at hand.
    // `values` and `perOutputTile` must not be touched after these erasures.
    perOutputTile.erase(outputIt);
    if (perOutputTile.empty())
      weights.erase(inputIt);
  }

  return {inputTile, outputTile, startingCrossbarIndex, std::move(result)};
}
|
|
||||||
|
|
||||||
/// Extracts groups of weights until `n` weights have been collected in total
/// or the weight map runs empty, whichever comes first.
///
/// `crossbarsUsed` is reset to zero at the start of every call, so the
/// `startingCrossbarIndex` of the returned groups counts from zero within the
/// call.
llvm::SmallVector<TaggedWeights> WeightSubdivider::popGroups(size_t n) {
  crossbarsUsed = 0;

  llvm::SmallVector<TaggedWeights> groups;
  for (size_t stillNeeded = n; stillNeeded > 0 && !weights.empty();) {
    groups.push_back(popGroup(stillNeeded));
    // popGroup never hands out more than requested, so this cannot underflow.
    stillNeeded -= groups.back().weights.size();
  }
  return groups;
}
|
|
||||||
|
|
||||||
} // namespace onnx_mlir
|
|
||||||
@@ -1,46 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "mlir/IR/Value.h"
|
|
||||||
|
|
||||||
#include "llvm/ADT/SmallVector.h"
|
|
||||||
|
|
||||||
#include <cstddef>
|
|
||||||
#include <map>
|
|
||||||
|
|
||||||
namespace onnx_mlir {
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief A helper struct to store a group of weights.
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
struct TaggedWeights {
|
|
||||||
long inputTile;
|
|
||||||
long outputTile;
|
|
||||||
size_t startingCrossbarIndex;
|
|
||||||
llvm::SmallVector<mlir::Value> weights;
|
|
||||||
};
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief A helper class to subdivide weights into groups.
|
|
||||||
*
|
|
||||||
* Weights are stored as a map of maps of SmallVectors. The outer map is indexed
|
|
||||||
* by input tile, the inner map is indexed by output tile, and the SmallVector
|
|
||||||
* contains the weights for the filter. This class allows us to extract groups
|
|
||||||
* of weights from the map until we've extracted a certain number of elements,
|
|
||||||
* namely as many as we need to fill a compute unit.
|
|
||||||
*/
|
|
||||||
class WeightSubdivider {
|
|
||||||
private:
|
|
||||||
std::map<long, std::map<long, llvm::SmallVector<mlir::Value>>> weights;
|
|
||||||
size_t crossbarsUsed = 0;
|
|
||||||
|
|
||||||
TaggedWeights popGroup(size_t amount);
|
|
||||||
|
|
||||||
public:
|
|
||||||
WeightSubdivider(std::map<long, std::map<long, llvm::SmallVector<mlir::Value>>> weights);
|
|
||||||
|
|
||||||
bool isEmpty() const;
|
|
||||||
llvm::SmallVector<TaggedWeights> popGroups(size_t n);
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace onnx_mlir
|
|
||||||
Reference in New Issue
Block a user