diff --git a/src/PIM/Conversion/ONNXToSpatial/CMakeLists.txt b/src/PIM/Conversion/ONNXToSpatial/CMakeLists.txt index 383f7f3..1e383c8 100644 --- a/src/PIM/Conversion/ONNXToSpatial/CMakeLists.txt +++ b/src/PIM/Conversion/ONNXToSpatial/CMakeLists.txt @@ -7,12 +7,8 @@ add_pim_library(OMONNXToSpatial Patterns/Math/Conv.cpp Patterns/Math/MatMul.cpp Patterns/NN/Pool.cpp - Patterns/NN/ReduceMean.cpp Patterns/Tensor/Concat.cpp Patterns/Tensor/Reshape.cpp - Utils/SpatialReducer.cpp - Utils/WeightSubdivider.cpp - Utils/AnnotateReplication.cpp ONNXToSpatialPass.cpp Common.cpp diff --git a/src/PIM/Conversion/ONNXToSpatial/Common.hpp b/src/PIM/Conversion/ONNXToSpatial/Common.hpp index 2676698..3acddb5 100644 --- a/src/PIM/Conversion/ONNXToSpatial/Common.hpp +++ b/src/PIM/Conversion/ONNXToSpatial/Common.hpp @@ -57,8 +57,6 @@ inline auto getFilterCount(const ShapedType& shapedType) { return shapedType.getDimSize(0); } -inline constexpr mlir::StringRef REPLICATION_ATTR_NAME = "replication_factor"; - using HSliceId = size_t; using CoreId = size_t; diff --git a/src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPass.cpp b/src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPass.cpp index 2ba333e..fb65649 100644 --- a/src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPass.cpp +++ b/src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPass.cpp @@ -11,7 +11,6 @@ #include #include "Common/PimCommon.hpp" -#include "Conversion/ONNXToSpatial/Utils/AnnotateReplication.hpp" #include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp" #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Patterns.hpp" #include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp" @@ -68,11 +67,6 @@ void ONNXToSpatialPass::runOnOperation() { signalPassFailure(); return; } - if (annotateReplication(*entryFunc, rewriter).failed()) { - llvm::dbgs() << "Failed during annotation for replication analysis\n"; - signalPassFailure(); - return; - } ConversionTarget target(*ctx); target.addLegalDialect(); @@ -98,7 +92,6 @@ void ONNXToSpatialPass::runOnOperation() { populateReshapeConversionPattern(patterns, ctx); populateONNXConcatToTensorConcatPattern(patterns, ctx); - populateReduceMeanConversionPattern(patterns, ctx); if (failed(applyPartialConversion(moduleOp, target, std::move(patterns)))) { signalPassFailure(); diff --git a/src/PIM/Conversion/ONNXToSpatial/Patterns.hpp b/src/PIM/Conversion/ONNXToSpatial/Patterns.hpp index 2b29f4a..de4ae31 100644 --- a/src/PIM/Conversion/ONNXToSpatial/Patterns.hpp +++ b/src/PIM/Conversion/ONNXToSpatial/Patterns.hpp @@ -17,6 +17,4 @@ void populateONNXConcatToTensorConcatPattern(mlir::RewritePatternSet& patterns, void populateReshapeConversionPattern(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx); -void populateReduceMeanConversionPattern(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx); - } // namespace onnx_mlir diff --git a/src/PIM/Conversion/ONNXToSpatial/Patterns/Math/Gemm.cpp b/src/PIM/Conversion/ONNXToSpatial/Patterns/Math/Gemm.cpp index ab67abf..d8c7cb0 100644 --- a/src/PIM/Conversion/ONNXToSpatial/Patterns/Math/Gemm.cpp +++ b/src/PIM/Conversion/ONNXToSpatial/Patterns/Math/Gemm.cpp @@ -5,14 +5,12 @@ #include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/DialectConversion.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include #include "src/Accelerators/PIM/Common/PimCommon.hpp" #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common.hpp" -#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp" #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp" #include "src/Dialect/ONNX/ONNXOps.hpp" @@ -21,12 +19,8 @@ using namespace mlir; namespace onnx_mlir { namespace { -constexpr StringRef COMPUTE_HAS_SOFTMAX_DIVISOR_ATTRNAME = "computeWithSoftmaxDivisor"; - -static FailureOr materializeScaledConstantTensor(Value value, - float factor, - ConversionPatternRewriter& rewriter, - Location loc) { +static FailureOr +materializeScaledConstantTensor(Value value, float factor, ConversionPatternRewriter& rewriter, Location loc) { if (factor == 1.0f) return value; @@ -70,16 +64,6 @@ struct GemvToSpatialCompute : OpConversionPattern { LogicalResult matchAndRewrite(ONNXGemmOp gemmOp, ONNXGemmOpAdaptor gemmOpAdaptor, ConversionPatternRewriter& rewriter) const override; - -private: - static Value resolveONNXExpOpFromUseChain(Value startValue); - - static LogicalResult softmaxReductionApplication(SmallVector& outputOpsAndResNums, - Value& softmaxChannel, - ConversionPatternRewriter& rewriter, - SpatialReducer& reducer, - ONNXGemmOp& gemmOp, - Location& loc); }; } // namespace @@ -122,7 +106,13 @@ LogicalResult GemmToManyGemv::matchAndRewrite(ONNXGemmOp gemmOp, // Expand rank-1 bias [N] to rank-2 [1, N] for uniform handling if (cType.getRank() == 1) { auto expandedType = RankedTensorType::get({1, cType.getDimSize(0)}, cType.getElementType()); - c = tensor::ExpandShapeOp::create(rewriter, loc, expandedType, c, SmallVector{{0, 1}}); + c = tensor::ExpandShapeOp::create(rewriter, + loc, + expandedType, + c, + SmallVector { + {0, 1} + }); cType = expandedType; } assert("Only support rank 2 tensor for C" && cType.getRank() == 2); @@ -208,7 +198,13 @@ LogicalResult GemvToSpatialCompute::matchAndRewrite(ONNXGemmOp gemmOp, // Expand rank-1 bias [N] to rank-2 [1, N] for uniform handling if (cType.getRank() == 1) { auto expandedType = RankedTensorType::get({1, cType.getDimSize(0)}, cType.getElementType()); - c = tensor::ExpandShapeOp::create(rewriter, gemmLoc, expandedType, c, SmallVector{{0, 1}}); + c = tensor::ExpandShapeOp::create(rewriter, + gemmLoc, + expandedType, + c, + SmallVector { + {0, 1} + }); cType = expandedType; } assert("Only support rank 2 tensor for C" && cType.getRank() == 2); @@ -356,124 +352,6 @@ LogicalResult GemvToSpatialCompute::matchAndRewrite(ONNXGemmOp gemmOp, return success(); } -Value GemvToSpatialCompute::resolveONNXExpOpFromUseChain(Value startValue) { - Value walker = startValue; - - while (!llvm::isa(walker.getDefiningOp())) { - walker = walker.getDefiningOp()->getOperand(0); - - assert(walker && walker.getDefiningOp() - && "Unwinded the whole chain of operations while trying to " - "find ONNXExpOp, but did not find it"); - } - - // Make sure the dividend is actually produced by an ONNXExpOp - assert(llvm::isa(walker.getDefiningOp()) - && "Old output tile (softmax reducer) is not produced by an " - "ONNXExpOp"); - - return walker; -} - -LogicalResult GemvToSpatialCompute::softmaxReductionApplication(SmallVector& outputOpsAndResNums, - Value& softmaxChannel, - ConversionPatternRewriter& rewriter, - SpatialReducer& reducer, - ONNXGemmOp& gemmOp, - Location& loc) { - // TODO: Check case with one compute op - - // Cast vector of Value into vector of ComputeOp - SmallVector softmaxOpsToReduce = - llvm::to_vector(llvm::map_range(outputOpsAndResNums, [&](OpAndResNum computeAndResNum) { - return std::make_pair(cast(computeAndResNum.first), computeAndResNum.second); - })); - - RankedTensorType::Builder tensorTypeBuilder({1}, Float32Type::get(rewriter.getContext()), nullptr); - const TensorType scalarTensorType = tensorTypeBuilder; - - reducer.applyReducePattern( - softmaxOpsToReduce, - [&](Value a, Value b) { return spatial::SpatVAddOp::create(rewriter, loc, scalarTensorType, a, b); }, - /* preprocess = */ - [&](Value a) { return spatial::SpatSumOp::create(rewriter, loc, scalarTensorType, a); }, - [&](Value softmaxDivisor) { - // Signal that this is the compute with the softmax divisor - auto computeOp = cast(softmaxDivisor.getDefiningOp()->getParentOp()); - computeOp->setAttr(COMPUTE_HAS_SOFTMAX_DIVISOR_ATTRNAME, rewriter.getUnitAttr()); - - // Broadcast the divisor to all the cores - rewriter.setInsertionPointAfterValue(softmaxDivisor); - spatial::SpatChannelBroadcastSendOp::create(rewriter, loc, softmaxChannel, softmaxDivisor); - - /* - * softmaxDividend = onnx.exp (...) - * sum = spat.SumOp(softmaxDividend) - * [following can be repeated N times, thus walk the use chain] - * softmaxDivisor = spat.sadd(sum, ...) - */ - Value softmaxDividend = resolveONNXExpOpFromUseChain(softmaxDivisor.getDefiningOp()->getOperand(0)); - - // Make sure the dividend is actually produced by an ONNXExpOp - assert(llvm::isa(softmaxDividend.getDefiningOp()) - && "Dividend of softmax reduction is not an ONNXExpOp"); - - // Do not divide here, divide after this - return softmaxDivisor; - }); - - // In all the cores, insert a ChannelRecvOp and divide the output tile by - // the reduced denominator. - outputOpsAndResNums.clear(); - outputOpsAndResNums.reserve(softmaxOpsToReduce.size()); - for (auto& computeToDivideOpAndResNum : softmaxOpsToReduce) { - - auto yieldOp = cast(computeToDivideOpAndResNum.first.getBody().front().getTerminator()); - - Value divisor; - - // Check if this compute contains the softmax divisor: if so, find the - // ChannelBroadcastSendOp, otherwise receive the value from the channel - // using ChannelBroadcastReceiveOp - if (computeToDivideOpAndResNum.first->hasAttr(COMPUTE_HAS_SOFTMAX_DIVISOR_ATTRNAME)) { - - bool found = false; - for (auto broadcastOp : - computeToDivideOpAndResNum.first.getBody().front().getOps()) { - assert(found == false - && "More than one ChannelBroadcastSendOp in " - "compute? How is this possible?"); - found = true; - - divisor = broadcastOp.getData(); - } - - assert(found - && "No ChannelBroadcastSendOp in compute where softmax " - "divisor was specified to be?"); - } - else { - rewriter.setInsertionPoint(yieldOp); - divisor = spatial::SpatChannelBroadcastReceiveOp::create(rewriter, loc, scalarTensorType, softmaxChannel); - } - - // Walk the chain of operations until we find the ONNXExpOp: this is - // needed because some some may have a different amount of `VAddOp`s due - // to the tree reduction (e.g. some may have no VAddOp, some may have - // multiples) - Value oldOutputTile = resolveONNXExpOpFromUseChain(yieldOp->getOperand(computeToDivideOpAndResNum.second)); - - rewriter.setInsertionPoint(yieldOp); - Value newOutputTile = spatial::SpatVSDivOp::create(rewriter, loc, oldOutputTile.getType(), oldOutputTile, divisor); - auto yieldOperandNum = yieldOp->getNumOperands(); - yieldOp->insertOperands(yieldOperandNum, newOutputTile); - - outputOpsAndResNums.push_back({computeToDivideOpAndResNum.first, yieldOperandNum}); - } - - return success(); -} - void populateOnnxGemmOpPatterns(RewritePatternSet& patterns, MLIRContext* ctx) { patterns.insert(ctx); patterns.insert(ctx); diff --git a/src/PIM/Conversion/ONNXToSpatial/Patterns/NN/ReduceMean.cpp b/src/PIM/Conversion/ONNXToSpatial/Patterns/NN/ReduceMean.cpp deleted file mode 100644 index 859fa7b..0000000 --- a/src/PIM/Conversion/ONNXToSpatial/Patterns/NN/ReduceMean.cpp +++ /dev/null @@ -1,89 +0,0 @@ -#include "mlir/Transforms/DialectConversion.h" - -#include "Conversion/ONNXToSpatial/Patterns.hpp" -#include "src/Dialect/ONNX/ONNXOps.hpp" - -using namespace mlir; -namespace onnx_mlir { - -struct ReduceMeanConversionPattern : public OpConversionPattern { - - ReduceMeanConversionPattern(MLIRContext* ctx) - : OpConversionPattern(ctx) {} - - LogicalResult matchAndRewrite(ONNXReduceMeanV13Op reduceMean, - ONNXReduceMeanV13OpAdaptor adaptor, - ConversionPatternRewriter& rewriter) const final { - - // Get the input tensor. - Value inputTensor = adaptor.getData(); - auto inputTensorType = cast(inputTensor.getType()); - - // This pattern will substitute the ONNXReduceMeanV13Op with a - // ONNXAveragePoolOp with the same input tensor and an appropriate kernel - // shape and strides. - - // To get the stride and shape of the kernel, we need to read the tensor - // shape. - int image_height = inputTensorType.getShape()[2]; - int image_width = inputTensorType.getShape()[3]; - - // Define the kernel shape and strides. - SmallVector kernelShapeVals = {image_height, image_width}; - SmallVector stridesVals = {image_height, image_width}; - SmallVector dilationsVals = {1, 1}; - - // Set the pads to 0. - SmallVector padsVals = {0, 0, 0, 0}; - - // Create the ArrayAttrs - auto kernelShape = mlir::ArrayAttr::get( - rewriter.getContext(), llvm::to_vector(llvm::map_range(kernelShapeVals, [&](int64_t v) -> mlir::Attribute { - return rewriter.getI64IntegerAttr(v); - }))); - - auto strides = mlir::ArrayAttr::get(rewriter.getContext(), - llvm::to_vector(llvm::map_range(stridesVals, [&](int64_t v) -> mlir::Attribute { - return rewriter.getI64IntegerAttr(v); - }))); - - auto dilations = mlir::ArrayAttr::get( - rewriter.getContext(), llvm::to_vector(llvm::map_range(dilationsVals, [&](int64_t v) -> mlir::Attribute { - return rewriter.getI64IntegerAttr(v); - }))); - - auto pads = mlir::ArrayAttr::get(rewriter.getContext(), - llvm::to_vector(llvm::map_range(padsVals, [&](int64_t v) -> mlir::Attribute { - return rewriter.getI64IntegerAttr(v); - }))); - - // Create the resulting tensor type. - auto resultType = RankedTensorType::get( - /*shape=*/ {inputTensorType.getShape()[0], inputTensorType.getShape()[1], 1, 1}, - /*elementType=*/inputTensorType.getElementType()); - - // Create the ONNXAveragePoolOp. - auto averagePool = ONNXAveragePoolOp::create(rewriter, - reduceMean.getLoc(), - resultType, - inputTensor, - /*auto_pad=*/"NOTSET", - /*ceil_mode=*/0, - /*count_include_pad=*/1, - dilations, - /*kernel_shape=*/kernelShape, - /*pads=*/pads, - /*strides=*/strides); - - // Replace the ONNXReduceMeanV13Op with the ONNXAveragePoolOp. - rewriter.replaceOp(reduceMean, averagePool.getResult()); - - return success(); - } -}; - -void populateReduceMeanConversionPattern(RewritePatternSet& patterns, MLIRContext* ctx) { - patterns.insert(ctx); -} - -} // namespace onnx_mlir diff --git a/src/PIM/Conversion/ONNXToSpatial/Utils/AnnotateReplication.cpp b/src/PIM/Conversion/ONNXToSpatial/Utils/AnnotateReplication.cpp deleted file mode 100644 index 289eda4..0000000 --- a/src/PIM/Conversion/ONNXToSpatial/Utils/AnnotateReplication.cpp +++ /dev/null @@ -1,119 +0,0 @@ -#include - -#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp" -#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common.hpp" -#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/AnnotateReplication.hpp" -#include "src/Dialect/ONNX/ONNXOps.hpp" - -using namespace mlir; - -namespace onnx_mlir { - -/** - * @brief Structure that describes the replication of a convolution operation, - * along the image height axis. - */ -struct ConvReplication { - ONNXConvOp convOp; // Convolution operation - size_t input_w; // Width of the input image - size_t replicationFactor; // Replication factor on the image height axis - size_t coresNeededPerReplica; // Number of cores needed for each replica - - friend bool operator<(const ConvReplication& a, const ConvReplication& b) { - return a.input_w / a.replicationFactor < b.input_w / b.replicationFactor; - } - - ConvReplication(ONNXConvOp convOp, size_t input_w, size_t replicationFactor, size_t coresNeededPerReplica) - : convOp(convOp), - input_w(input_w), - replicationFactor(replicationFactor), - coresNeededPerReplica(coresNeededPerReplica) {} -}; - -LogicalResult annotateReplication(mlir::func::FuncOp funcOp, mlir::IRRewriter& rewriter) { - - if (coresCount == -1) { - // No need for annotation, implicitly set replication to 1 - return success(); - } - - std::priority_queue convOpsReplicationQueue; - - size_t minimumCores = 0; - - for (auto& op : funcOp.getFunctionBody().begin()->getOperations()) { - if (auto convOp = dyn_cast(op)) { - // Convolution layer - - Value X = convOp.getX(), W = convOp.getW(); - ShapedType xShape = mlir::cast(X.getType()); - ShapedType wShape = mlir::cast(W.getType()); - - size_t input_w = getImageWidth(xShape); - size_t krn_h = getKernelHeight(wShape); - size_t krn_w = getKernelWidth(wShape); - - size_t inputTileCount = ceilIntegerDivide(getImageChannel(xShape), crossbarSize.getValue()); - size_t outputTileCount = ceilIntegerDivide(wShape.getDimSize(0), crossbarSize.getValue()); - - auto neededXbars = krn_h * krn_w * inputTileCount * outputTileCount; - auto neededCores = ceilIntegerDivide(neededXbars, crossbarCountInCore.getValue()); - - minimumCores += neededCores; - - convOpsReplicationQueue.emplace(convOp, input_w, 1, neededCores); - } - else if (auto gemmOp = dyn_cast(op)) { - // Fully connected layer - auto matrixTensorShape = cast(gemmOp.getB().getType()); - auto inputSize = matrixTensorShape.getDimSize(0); - auto outputSize = matrixTensorShape.getDimSize(1); - if (gemmOp.getTransB()) - std::swap(inputSize, outputSize); - - const size_t inputTilesCount = ceilIntegerDivide(inputSize, crossbarSize.getValue()); - const size_t outputTilesCount = ceilIntegerDivide(outputSize, crossbarSize.getValue()); - - // Each output tile is computed by `coresPerOutputTile` cores. The - // entire input is given to each of these cores. - const size_t coresPerOutputTile = ceilIntegerDivide(inputTilesCount, crossbarCountInCore.getValue()); - - auto neededCores = coresPerOutputTile * outputTilesCount; - - minimumCores += neededCores; - } - } - - if (static_cast(coresCount) < minimumCores) { - return funcOp->emitError("Not enough cores for this network: ") - << minimumCores << " cores needed, but only " << static_cast(coresCount) << " available."; - } - - size_t availableCores = static_cast(coresCount) - minimumCores; - - // Consume all the elements in the queue - while (!convOpsReplicationQueue.empty()) { - auto convOpReplication = convOpsReplicationQueue.top(); - convOpsReplicationQueue.pop(); - - // Check if we can replicate this convolution (e.g. we have enough cores) - if (availableCores > convOpReplication.coresNeededPerReplica * (convOpReplication.replicationFactor + 1)) { - // We can replicate this convolution: increment replicationFactor and put - // back in queue - availableCores -= convOpReplication.coresNeededPerReplica; - convOpReplication.replicationFactor++; - - convOpsReplicationQueue.push(convOpReplication); - } - else { - // Cannot replicate this convolution anymore, annotate the operation - // with the replication factor - convOpReplication.convOp->setAttr(REPLICATION_ATTR_NAME, - rewriter.getI64IntegerAttr(convOpReplication.replicationFactor)); - } - } - - return success(); -} - -} // namespace onnx_mlir diff --git a/src/PIM/Conversion/ONNXToSpatial/Utils/AnnotateReplication.hpp b/src/PIM/Conversion/ONNXToSpatial/Utils/AnnotateReplication.hpp deleted file mode 100644 index ebd859d..0000000 --- a/src/PIM/Conversion/ONNXToSpatial/Utils/AnnotateReplication.hpp +++ /dev/null @@ -1,10 +0,0 @@ -#pragma once - -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/IR/PatternMatch.h" - -namespace onnx_mlir { - -mlir::LogicalResult annotateReplication(mlir::func::FuncOp funcOp, mlir::IRRewriter& rewriter); - -} // namespace onnx_mlir diff --git a/src/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.cpp b/src/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.cpp deleted file mode 100644 index 36d0573..0000000 --- a/src/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.cpp +++ /dev/null @@ -1,348 +0,0 @@ -#include "mlir/IR/BuiltinAttributes.h" -#include "mlir/IR/Value.h" - -#include "llvm/Support/raw_ostream.h" - -#include -#include -#include - -#include "SpatialReducer.hpp" -#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp" - -#define GET_COMP(computeOpAndResNum) std::get<0>(computeOpAndResNum) -#define GET_RES_NUM(computeOpAndResNum) std::get<1>(computeOpAndResNum) - -namespace onnx_mlir { - -llvm::SmallPtrSet onnx_mlir::SpatialReducer::oldComputeOpsReplaced; - -ResNum SpatialReducer::applyResultProcessing(ComputeAndResNum computeOpAndResNum, - std::function processFun, - mlir::ConversionPatternRewriter& rewriter) { - assert(processFun); - - auto computeOp = GET_COMP(computeOpAndResNum); - auto resultNum = GET_RES_NUM(computeOpAndResNum); - - spatial::SpatYieldOp yieldOp = mlir::cast(computeOp.getBody().front().getTerminator()); - - mlir::Value result = yieldOp->getOperand(resultNum); - rewriter.setInsertionPointAfterValue(result); - mlir::Value processedResult = processFun(result); - if (processedResult == result) { - // Sometimes we want processedResult to return the same value but do - // something else with it (e.g. in softmax we want to broadcast the value - // using a channel). In this case, we can just return the same value. - return resultNum; - } - - yieldOp->insertOperands(yieldOp->getNumOperands(), processedResult); - - return yieldOp.getNumOperands() - 1; -} - -OpAndResNum -SpatialReducer::applyReducePattern(llvm::SmallVector& computeOpsAndResNum, - std::function reduce, - std::function preprocess, - std::function postprocess) { - - if (preprocess) - for (auto& computeOpAndResNum : computeOpsAndResNum) - GET_RES_NUM(computeOpAndResNum) = applyResultProcessing(computeOpAndResNum, preprocess, rewriter); - - // It is possible that `computeOpsAndResNum` contains two entries for the same - // computeOp. In this case, we need to apply the reduction within-computef - - // Keep a map between a computeOp and the last Value for this reduction - std::unordered_map lastValueForCompute; - for (auto& computeOpAndResNum : computeOpsAndResNum) { - auto computeOp = GET_COMP(computeOpAndResNum); - auto yieldOp = mlir::cast(computeOp.getBody().front().getTerminator()); - mlir::Value valueWithinCompute = yieldOp->getOperand(GET_RES_NUM(computeOpAndResNum)); - - auto it = lastValueForCompute.find(computeOp.getOperation()); - - if (it != lastValueForCompute.end()) { - // If we have already seen this computeOp, apply the reduction - // within-compute - mlir::Value lastWithinComputeValue = it->second; - - assert(valueWithinCompute.getDefiningOp() && lastWithinComputeValue.getDefiningOp()); - - if (valueWithinCompute.getDefiningOp()->isBeforeInBlock(lastWithinComputeValue.getDefiningOp())) - rewriter.setInsertionPointAfterValue(lastWithinComputeValue); - else - rewriter.setInsertionPointAfterValue(valueWithinCompute); - valueWithinCompute = reduce(lastWithinComputeValue, valueWithinCompute); - lastValueForCompute[computeOp.getOperation()] = valueWithinCompute; - } - - lastValueForCompute[computeOp.getOperation()] = valueWithinCompute; - } - - // Now, reconstruct from the map the computeOpsAndResNum list - computeOpsAndResNum.clear(); - computeOpsAndResNum.reserve(lastValueForCompute.size()); - for (auto& entry : lastValueForCompute) { - auto computeOp = mlir::cast(entry.first); - auto valueWithinCompute = entry.second; - - // We check if `valueWithinCompute` is already used by the yieldOp, in that - // case no need to add it - auto yieldOp = mlir::cast(computeOp.getBody().front().getTerminator()); - bool yieldOpUseFound = false; - for (auto& use : valueWithinCompute.getUses()) { - if (use.getOwner() == yieldOp.getOperation()) { - // If the value is already used by the yieldOp, we can just use it - computeOpsAndResNum.push_back({computeOp, use.getOperandNumber()}); - yieldOpUseFound = true; - break; - } - } - if (yieldOpUseFound) - continue; - - // If this result is not used within a yieldOp, then add it - auto resultNum = yieldOp->getNumOperands(); - yieldOp->insertOperands(resultNum, valueWithinCompute); - - computeOpsAndResNum.push_back({computeOp, resultNum}); - } - - mlir::Location loc = GET_COMP(computeOpsAndResNum[0])->getLoc(); - - // Recursive algorithm to reduce the inputs to a single one: - // - Take two inputs at a time, and reduce them into a single one, updating - // the computeOpsAndResNum list which becomes half the size. - // - Repeat until there is only one input left. - llvm::OwningArrayRef computeOpsRef(computeOpsAndResNum); - while (computeOpsRef.size() > 1) { - llvm::SmallVector nextComputeOps; - nextComputeOps.reserve(computeOpsRef.size() / 2); - for (size_t i = 0; i < computeOpsRef.size() - 1; i += 2) { - auto [firstCompute, firstResultNum] = computeOpsRef[i]; - auto [secondCompute, secondResultNum] = computeOpsRef[i + 1]; - - if (secondCompute->isBeforeInBlock(firstCompute)) { - std::swap(firstCompute, secondCompute); - std::swap(firstResultNum, secondResultNum); - } - - // We do not immediately alter the computeOps results/operands, instead we - // do it in a delayed manner, to avoid invalidating the references to the - // computeOps (which must be replaced by a cloned ComputeOp when changing - // the number of results) - // See below `reducerChanges.push_back` and `finalizeReduceUpdates` - - auto yieldOpFirstCompute = mlir::cast(firstCompute.getBody().front().getTerminator()); - - // Add a new operand to the block of the second computeOp - mlir::Block& secondBlock = secondCompute.getBody().front(); - mlir::Value formerRes1 = secondBlock.addArgument(yieldOpFirstCompute->getOperand(firstResultNum).getType(), loc); - - auto secondComputeWeightsNum = - secondCompute->getAttrOfType(secondCompute.getOperandSegmentSizesAttrName())[0]; - auto secondComputeOperandNum = secondComputeWeightsNum + secondBlock.getNumArguments() - 1; - - // Take the "former-result" from the second computeOp - spatial::SpatYieldOp secondYield = mlir::cast(secondBlock.getTerminator()); - mlir::Value formerRes2 = secondYield.getOperand(secondResultNum); - - // Apply reduction operation - rewriter.setInsertionPoint(secondYield); - mlir::Value reduced = reduce(formerRes2, formerRes1); - - // Unfortunately, it is not possible to update the result in place, - // because we may have already referenced it by - // outside of this function, thus replacing it would invalidate the - // reference. Therefore, we need to append a new result to the yieldOp, - // and then at a later stage update the computeOp accordingly. - - // Add `reduced` to the second yieldOp - auto secondYieldOperandNum = secondYield.getNumOperands(); - secondYield->insertOperands(secondYieldOperandNum, reduced); - secondResultNum = secondYieldOperandNum; - - // We should also add an entry for updating the results of the last - // operation (the one which never becomes a `firstCompute`): because it is - // not tracked by reducerChanges as `fromOp` - reducerChanges.push_back( - {firstCompute.getOperation(), firstResultNum, secondCompute.getOperation(), secondComputeOperandNum}); - nextComputeOps.push_back(std::make_pair(secondCompute, secondResultNum)); - } - - // If we have an odd number of inputs, we need to add the last one to the - // newInputs list. - if (computeOpsRef.size() % 2 == 1) - nextComputeOps.push_back(computeOpsRef.back()); - - // Replace the inputOps list with the new one. - computeOpsRef = llvm::OwningArrayRef(std::move(nextComputeOps)); - } - - assert(computeOpsRef.size() == 1 && "Internal error: expected a single input at this point."); - - auto finalComputeAndResNum = computeOpsRef[0]; - - // Force the update of the results of this computeOp, when finalizing - computeOpNeedingResUpdate.push_back(GET_COMP(finalComputeAndResNum)); - - if (postprocess) - GET_RES_NUM(finalComputeAndResNum) = applyResultProcessing(finalComputeAndResNum, postprocess, rewriter); - - return std::make_pair(GET_COMP(finalComputeAndResNum).getOperation(), GET_RES_NUM(finalComputeAndResNum)); -} - -void SpatialReducer::finalizeReduceUpdates() { - assert(reducesFinalized == false && "Cannot finalize two times."); - - reducesFinalized = true; - - // First, add the results to the computeOps - for (auto& reduceChange : reducerChanges) - updateResultsOfCompute(reduceChange.fromOp); - - for (auto& c : computeOpNeedingResUpdate) - updateResultsOfCompute(c.getOperation()); - - for (auto& reducerChange : this->reducerChanges) { - auto fromOp = reducerChange.fromOp; - auto toOp = reducerChange.toOp; - auto fromOpResNum = reducerChange.fromOpResNum; - auto toOpOperandNum = reducerChange.toOpOperandNum; - - auto fromComputeOp = opToReplacedCompute[fromOp]; - assert(fromComputeOp && "fromOp should have been mapped before!"); - - // toComputeOp could be the existing pointer, or we have to remap it with - // `opToReplacedCompute` - auto toComputeOp = opToReplacedCompute[toOp]; - if (!toComputeOp) - toComputeOp = mlir::cast(toOp); - - assert(toComputeOp != fromComputeOp && "Oops should have caught this earlier!"); - - assert(toComputeOp->getNumOperands() == toOpOperandNum - && "toOpOperandNum should be the last operand of toComputeOp, are the " - "operations in the right order?"); - - // Add the new operand to `toComputeOp` - auto fromResult = fromComputeOp.getResult(fromOpResNum); - toComputeOp->insertOperands(toOpOperandNum, fromResult); - incrementWeightedComputeInputsSegmentSize(toComputeOp, 1); - } -} - -mlir::Value SpatialReducer::resolveValueFromOpAndResNum(OpAndResNum& opAndResNum) { - assert(reducesFinalized && "Cannot create resolve values before finalizing the reduce updates."); - - mlir::Operation* opToCast; - auto it = opToReplacedCompute.find(opAndResNum.first); - if (it != opToReplacedCompute.end()) - opToCast = it->second; - else - opToCast = opAndResNum.first; - - auto computeOp = mlir::cast(opToCast); - - return computeOp.getResult(opAndResNum.second); -} - -void SpatialReducer::updateResultsOfCompute(mlir::Operation* computeOp) { - if (opToReplacedCompute.find(computeOp) != opToReplacedCompute.end()) { - // If we have already replaced the fromOp, we do not need to do it again - return; - } - auto oldComputeOp = mlir::cast(computeOp); - - auto oldComputeOpNum = oldComputeOp->getNumOperands(); - - auto yieldOp = mlir::cast(oldComputeOp.getBody().front().getTerminator()); - - if (yieldOp.getNumOperands() == oldComputeOp->getNumResults()) { - // No result was added, just add itself to the map - opToReplacedCompute[oldComputeOp.getOperation()] = oldComputeOp; - return; - } - - // Add the results by inspecting its YieldOp - auto newResultTypes = yieldOp.getOperandTypes(); - - // Create a new ComputeOp with the new result type, but same operands - rewriter.setInsertionPoint(oldComputeOp); - auto newComputeOp = spatial::SpatWeightedCompute::create( - rewriter, oldComputeOp->getLoc(), newResultTypes, oldComputeOp.getWeights(), oldComputeOp.getInputs()); - - newComputeOp.getBody().takeBody(oldComputeOp.getBody()); - - auto newComputeOpNum = newComputeOp->getNumOperands(); - - assert(oldComputeOpNum == newComputeOpNum); - - // Since we replaced the old ComputeOp with a new one, we need to replace - // all its results' uses - for (size_t i = 0; i < oldComputeOp.getNumResults(); i++) { - mlir::Value oldResult = oldComputeOp.getResult(i); - mlir::Value newResult = newComputeOp.getResult(i); - - // Replace the uses, except the uses of the compute ops which got deleted - // previously - rewriter.replaceAllUsesExcept(oldResult, newResult, oldComputeOpsReplaced); - } - - // Finally, erase the old computeOp and update the map - opToReplacedCompute[oldComputeOp.getOperation()] = newComputeOp; - oldComputeOpsReplaced.insert(oldComputeOp.getOperation()); - rewriter.setInsertionPoint(oldComputeOp); - rewriter.eraseOp(oldComputeOp); -} - -mlir::Value -SpatialReducer::createImgConcatOp(llvm::SmallVector>>& outputTiles, - mlir::Location& loc, - mlir::Type outputType) { - - assert(reducesFinalized && "Cannot create ImgConcatOp before finalizing the reduce updates."); - - // outputTiles are indexed like this: [channelTile][x][y] - auto tilesCount = outputTiles.size(); - auto width = outputTiles[0].size(); - auto height = outputTiles[0][0].size(); - - llvm::SmallVector>> remappedOutputTiles( - tilesCount, llvm::SmallVector>(width, llvm::SmallVector(height))); - - for (size_t t = 0; t < tilesCount; t++) - for (size_t x = 0; x < width; x++) - for (size_t y = 0; y < height; y++) - remappedOutputTiles[t][x][y] = resolveValueFromOpAndResNum(outputTiles[t][x][y]); - - return ::onnx_mlir::createImgConcatOp(remappedOutputTiles, rewriter, loc, outputType); -} - -OpAndResNum SpatialReducer::applyAddMapReduction(llvm::SmallVector& computeOps, - mlir::ConversionPatternRewriter& rewriter, - mlir::Value biasTile, - MapOperations mapOp) { - - std::function postprocessing = nullptr; - - if (mapOp != MapOperations::None) { - postprocessing = [&](const mlir::Value a) { - mlir::Value mapOperand = a; - if (biasTile) - mapOperand = spatial::SpatVAddOp::create(rewriter, a.getLoc(), a.getType(), a, biasTile); - return createMapOperation(rewriter, mapOp, mapOperand); - }; - } - - return this->applyReducePattern( - computeOps, - [&](mlir::Value a, mlir::Value b) { return spatial::SpatVAddOp::create(rewriter, a.getLoc(), a.getType(), a, b); }, - /* preprocess = */ nullptr, - postprocessing); -} - -} // namespace onnx_mlir diff --git a/src/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp b/src/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp deleted file mode 100644 index 15b26d7..0000000 --- a/src/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp +++ /dev/null @@ -1,88 +0,0 @@ -#pragma once - -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Support/Casting.h" - -#include -#include -#include - -#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common.hpp" -#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp" - -namespace onnx_mlir { - -using ResNum = unsigned int; - -using ComputeAndResNum = std::pair; - -struct SpatialReducerChange { - mlir::Operation* fromOp; - unsigned int fromOpResNum; - mlir::Operation* toOp; - unsigned int toOpOperandNum; -}; - -using OpAndResNum = std::pair; - -class SpatialReducer { - -public: - SpatialReducer(mlir::ConversionPatternRewriter& rewriter) - : rewriter(rewriter) {} - - OpAndResNum applyReducePattern(llvm::SmallVector& computeOpsAndResNum, - std::function reduce, - std::function preprocess, - std::function postprocess); - - OpAndResNum applyAddMapReduction(llvm::SmallVector& computeOps, - mlir::ConversionPatternRewriter& rewriter, - mlir::Value biasTile, - MapOperations mapOp); - - void finalizeReduceUpdates(); - - ~SpatialReducer() { - if (!reducesFinalized) - finalizeReduceUpdates(); - } - - mlir::Value createImgConcatOp(llvm::SmallVector>>& outputTiles, - mlir::Location& loc, - mlir::Type outputType); - - mlir::Value resolveValueFromOpAndResNum(OpAndResNum& opAndResNum); - -private: - [[nodiscard("computeOp result number gets updated")]] ResNum - applyResultProcessing(ComputeAndResNum computeOpAndResNum, - std::function processFun, - mlir::ConversionPatternRewriter& rewriter); - - /** - * @brief Update the results of a ComputeOp. - * - * This function updates the results of a ComputeOp by taking a look at the - operands of its yieldOp. - * If the ComputeOp was replaced, it updates `opToReplacedCompute` with the - replaced ComputeOp. - * - * @param computeOp The ComputeOp to update the results of. - */ - void updateResultsOfCompute(mlir::Operation* computeOp); - - mlir::ConversionPatternRewriter& rewriter; - bool reducesFinalized = false; - - // List of changes to be applied after the reduction is finalized - llvm::SmallVector reducerChanges; - // List of computeOps that need to be replaced with new results - llvm::SmallVector computeOpNeedingResUpdate; - - std::unordered_map opToReplacedCompute; - - static llvm::SmallPtrSet oldComputeOpsReplaced; -}; - -} // namespace onnx_mlir diff --git a/src/PIM/Conversion/ONNXToSpatial/Utils/WeightSubdivider.cpp b/src/PIM/Conversion/ONNXToSpatial/Utils/WeightSubdivider.cpp deleted file mode 100644 index 6affb23..0000000 --- a/src/PIM/Conversion/ONNXToSpatial/Utils/WeightSubdivider.cpp +++ /dev/null @@ -1,53 +0,0 @@ -#include - -#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/WeightSubdivider.hpp" - -namespace onnx_mlir { - -WeightSubdivider::WeightSubdivider(std::map>> weights) -: weights(std::move(weights)) {} - -bool WeightSubdivider::isEmpty() const { return weights.empty(); } - -TaggedWeights WeightSubdivider::popGroup(size_t amount) { - assert(!weights.empty() && "No weights to extract."); - - auto it = weights.begin(); - llvm::SmallVector& values = it->second.begin()->second; - - long inputTile = it->first; - long outputTile = it->second.begin()->first; - - size_t n = std::min(amount, values.size()); - crossbarsUsed += n; - - llvm::SmallVector result; - result.assign(values.begin(), values.begin() + n); - - if (n < values.size()) { - values.erase(values.begin(), values.begin() + n); - } - else { - it->second.erase(outputTile); - if (it->second.empty()) - weights.erase(inputTile); - } - - return {inputTile, outputTile, crossbarsUsed - n, result}; -} - -llvm::SmallVector WeightSubdivider::popGroups(size_t n) { - crossbarsUsed = 0; - llvm::SmallVector result; - size_t remaining = n; - - while (remaining > 0 && !weights.empty()) { - auto group = popGroup(remaining); - result.push_back(group); - remaining -= group.weights.size(); - } - - return result; -} - -} // namespace onnx_mlir diff --git a/src/PIM/Conversion/ONNXToSpatial/Utils/WeightSubdivider.hpp b/src/PIM/Conversion/ONNXToSpatial/Utils/WeightSubdivider.hpp deleted file mode 100644 index eaa8320..0000000 --- a/src/PIM/Conversion/ONNXToSpatial/Utils/WeightSubdivider.hpp +++ /dev/null @@ -1,46 +0,0 @@ -#pragma once - -#include "mlir/IR/Value.h" - -#include "llvm/ADT/SmallVector.h" - -#include -#include - -namespace onnx_mlir { - -/** - * @brief A helper struct to store a group of weights. - * - */ -struct TaggedWeights { - long inputTile; - long outputTile; - size_t startingCrossbarIndex; - llvm::SmallVector weights; -}; - -/** - * @brief A helper class to subdivide weights into groups. - * - * Weights are stored as a map of maps of SmallVectors. The outer map is indexed - * by input tile, the inner map is indexed by output tile, and the SmallVector - * contains the weights for the filter. This class allows us to extract groups - * of weights from the map until we've extracted a certain number of elements, - * namely as many as we need to fill a compute unit. - */ -class WeightSubdivider { -private: - std::map>> weights; - size_t crossbarsUsed = 0; - - TaggedWeights popGroup(size_t amount); - -public: - WeightSubdivider(std::map>> weights); - - bool isEmpty() const; - llvm::SmallVector popGroups(size_t n); -}; - -} // namespace onnx_mlir