//===----------------------------------------------------------------------===//
// Experimental tiled lowering of ONNX pooling ops to the Spatial dialect.
//
// NOTE(review): this file was recovered from a copy in which every
// angle-bracketed token (template parameter lists, template arguments, and
// `#include <...>` header names) had been stripped.  All `<...>` contents
// below are reconstructions inferred from the surrounding code.  In
// particular the Spatial-dialect identifiers (spatial::ComputeOp,
// spatial::ConstOp, spatial::DivOp, spatial::YieldOp, spatial::ImgConcatOp,
// spatial::MaxOp, spatial::AddOp) and the ONNX op names used in
// populateExperimentalPoolingTilingPattern must be confirmed against
// SpatialOps.hpp / ONNXOps.hpp.
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Tosa/IR/TosaOps.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypeInterfaces.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/Value.h"
#include "mlir/IR/ValueRange.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

// NOTE(review): the three standard headers here were garbled to bare
// `#include` directives; <algorithm>, <cstdlib> and <map> cover the std::min,
// std::div / ldiv_t and std::map uses below -- confirm against history.
#include <algorithm>
#include <cstdlib>
#include <map>

#include "src/Accelerators/PIM/Common/PIMCommon.hpp"
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp"
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
#include "src/Dialect/ONNX/ONNXOps.hpp"

using namespace mlir;

namespace onnx_mlir {

/// Whether the pooling op needs a post-processing step after the reduction
/// tree.  Only average pooling does: the summed window must be divided by the
/// window size.  Max pooling (the primary template) needs none.
template <typename PoolOp>
bool hasPostProcessExperimentalPoolingWindow() {
  return false;
}

template <>
bool hasPostProcessExperimentalPoolingWindow<ONNXAveragePoolOp>() {
  return true;
}

/// Post-processes one reduced pooling window.  The primary template is a
/// no-op (returns a null Value); the ONNXAveragePoolOp specialization divides
/// the window sum by the effective divisor.
///
/// NOTE(review): this helper does not appear to be called from the pattern
/// below -- matchAndRewrite inlines an equivalent (but simpler) divide that
/// ignores count_include_pad / tilesSkippedByPadding.  Verify whether the two
/// code paths were meant to be unified.
template <typename PoolOp>
Value postProcessExperimentalPoolingWindow(ConversionPatternRewriter &rewriter,
    Location loc, PoolOp poolOp, Value valueToDivide, size_t krn_size,
    size_t tilesSkippedByPadding) {
  return nullptr;
}

template <>
Value postProcessExperimentalPoolingWindow<ONNXAveragePoolOp>(
    ConversionPatternRewriter &rewriter, Location loc, ONNXAveragePoolOp poolOp,
    Value valueToDivide, size_t krn_size, size_t tilesSkippedByPadding) {
  // ONNX count_include_pad: when set, padded elements count toward the
  // divisor; otherwise only the real (non-padding) window elements do.
  bool countIncludePad = poolOp.getCountIncludePad() == 1;
  size_t divisorNumber =
      countIncludePad ? krn_size : krn_size - tilesSkippedByPadding;
  RankedTensorType scalarTensor =
      RankedTensorType::get({1}, rewriter.getF32Type());
  // Put a spat.const before the computeOp, and use its value. We do this to be
  // compatible with the current code generation, which assumes constant to be
  // loaded in global memory, which is allocated by adding a spat.const OP
  // directly under func.func (i.e. alongside ComputeOps).
  auto computeOp =
      cast<spatial::ComputeOp>(valueToDivide.getDefiningOp()->getParentOp());
  rewriter.setInsertionPoint(computeOp);
  auto divisorValue = rewriter.create<spatial::ConstOp>(loc, scalarTensor,
      rewriter.getI64IntegerAttr(divisorNumber),
      /* should_allocate = */ rewriter.getBoolAttr(true));
  rewriter.setInsertionPointAfterValue(valueToDivide);
  return rewriter.create<spatial::DivOp>(
      loc, valueToDivide.getType(), valueToDivide, divisorValue);
}

/// Reduces a list of equally-typed tile Values into a single Value with a
/// balanced binary tree of ReduceOp (e.g. spatial add/max) operations.
/// Recursion depth is O(log n); the result type is taken from the first tile.
template <typename ReduceOp>
Value reduceInputTiles(
    SmallVector<Value> &inputTiles, ConversionPatternRewriter &rewriter) {
  if (inputTiles.size() == 1)
    return inputTiles[0];
  if (inputTiles.size() == 2) {
    return rewriter.create<ReduceOp>(inputTiles[0].getLoc(),
        inputTiles[0].getType(), inputTiles[0], inputTiles[1]);
  }
  // Split in half and reduce each side recursively.
  SmallVector<Value> left(
      inputTiles.begin(), inputTiles.begin() + inputTiles.size() / 2);
  SmallVector<Value> right(
      inputTiles.begin() + inputTiles.size() / 2, inputTiles.end());
  Value leftReduced = reduceInputTiles<ReduceOp>(left, rewriter);
  Value rightReduced = reduceInputTiles<ReduceOp>(right, rewriter);
  return rewriter.create<ReduceOp>(inputTiles[0].getLoc(),
      leftReduced.getType(), leftReduced, rightReduced);
}

/// Converts a 2-D ONNX pooling op whose input is a tensor.concat of
/// channel tiles into a single spatial.compute region: each output pixel is
/// produced by a ReduceOp tree over the 1x1 input tiles covered by its
/// pooling window, then (for average pooling) divided by the window size.
///
/// NOTE(review): the template arguments of this struct were lost in the
/// garbled copy; PoolOp + ReduceOp (with the adaptor derived from PoolOp) is
/// the minimal reconstruction consistent with the body.
template <typename PoolOp, typename ReduceOp>
struct ExperimentalPoolingBaseConverter : public OpConversionPattern<PoolOp> {
  using PoolOpAdaptor = typename PoolOp::Adaptor;

  ExperimentalPoolingBaseConverter(MLIRContext *ctx)
      : OpConversionPattern<PoolOp>(ctx) {}

  LogicalResult matchAndRewrite(PoolOp poolOp, PoolOpAdaptor adaptor,
      ConversionPatternRewriter &rewriter) const final {
    Value X = adaptor.getX();
    ShapedType xShape = mlir::cast<ShapedType>(X.getType());
    Value Y = poolOp.getResult();
    ShapedType yShape = mlir::cast<ShapedType>(Y.getType());

    size_t stride_x, stride_y, dilation_x, dilation_y, krn_w, krn_h;
    unpackOptionalPairVector(adaptor.getStrides(), stride_x, stride_y);
    unpackOptionalPairVector(adaptor.getDilations(), dilation_x, dilation_y);
    unpackOptionalPairVector(adaptor.getKernelShape(), krn_w, krn_h);

    if (adaptor.getAutoPad() != "NOTSET")
      return rewriter.notifyMatchFailure(
          poolOp, "auto_pad != NOTSET is deprecated.");

    size_t pad_x, pad_y;
    auto padUnpackError =
        unpackOptionalPadsVector(adaptor.getPads(), pad_x, pad_y);
    if (padUnpackError.has_value())
      return rewriter.notifyMatchFailure(poolOp, padUnpackError.value());

    Location loc = poolOp.getLoc();
    size_t input_h = GET_IMAGE_HEIGHT(xShape);
    size_t input_w = GET_IMAGE_WIDTH(xShape);
    size_t output_h = GET_IMAGE_HEIGHT(yShape);
    size_t output_w = GET_IMAGE_WIDTH(yShape);
    // How many crossbar-sized channel tiles the input splits into (quot full
    // tiles plus one remainder tile when rem > 0).
    // NOTE(review): crossbarSize is assumed to come from PIMCommon /
    // PimCompilerOptions -- confirm its declaration.
    ldiv_t tileCount = std::div(GET_IMAGE_CHANNEL(xShape), crossbarSize);

    // Assert that the input is a tensor.ConcatOp.
    auto concat = X.getDefiningOp<tensor::ConcatOp>();
    if (!concat)
      return rewriter.notifyMatchFailure(
          poolOp, "Expected input to be a tensor.ConcatOp");

    // Create a [channel_tile][x][y] array to store the input tiles.
    std::map<size_t, std::map<size_t, std::map<size_t, Value>>> inputTiles;
    // For each argument of the tensor.ConcatOp, resolve the input tiles by
    // slicing a 1x1 pixel out of each channel tile.
    for (size_t y = 0; y < input_h; ++y) {
      for (size_t x = 0; x < input_w; ++x) {
        for (long it = 0; it < tileCount.quot + (tileCount.rem > 0); ++it) {
          size_t tilingSize =
              it == tileCount.quot ? tileCount.rem : crossbarSize;
          SmallVector<OpFoldResult> strides(4, rewriter.getIndexAttr(1));
          SmallVector<OpFoldResult> offsets = {
              /* 0 */ rewriter.getIndexAttr(0),
              /* 1 */ rewriter.getIndexAttr(0),
              /* 2 */ rewriter.getIndexAttr(x),
              /* 3 */ rewriter.getIndexAttr(y)};
          SmallVector<OpFoldResult> sizes = {
              /* 0 */ rewriter.getIndexAttr(1), // Batch size is always 1.
              /* 1 */ rewriter.getIndexAttr(tilingSize),
              /* 2 */ rewriter.getIndexAttr(1),
              /* 3 */ rewriter.getIndexAttr(1)};
          // Get the concat's operand that we want to slice.
          Value concatInput = concat.getOperand(it);
          Value slicedTile = rewriter.create<tensor::ExtractSliceOp>(
              loc, concatInput, offsets, sizes, strides);
          inputTiles[it][x][y] = slicedTile;
        }
      }
    }

    // Prepare the shape of the compute's output.
    ldiv_t itc = tileCount;
    SmallVector<Type> outputTileTypes;
    for (size_t y = 0; y < output_h; ++y) {
      for (size_t x = 0; x < output_w; ++x) {
        for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
          SmallVector<int64_t> outputShapeArray{
              /* 0 */ 1, // Batch size is always 1.
              /* 1 */ cast<ShapedType>(inputTiles[it][0][0].getType())
                  .getShape()[1],
              /* 2 */ 1,
              /* 3 */ 1};
          auto elementType = xShape.getElementType();
          outputTileTypes.push_back(
              RankedTensorType::get(outputShapeArray, elementType));
        }
      }
    }

    // Create a plain value list of the input tiles.
    // NOTE(review): tiles were stored above as inputTiles[it][x][y] but are
    // read here (and mirrored into inputTilesArgs below) as [it][y][x] -- for
    // non-square inputs this transposes the image.  Preserved as found;
    // verify the intended indexing convention.
    SmallVector<Value> inputTilesList;
    for (size_t y = 0; y < input_h; ++y) {
      for (size_t x = 0; x < input_w; ++x)
        for (long it = 0; it < itc.quot + (itc.rem > 0); ++it)
          inputTilesList.push_back(inputTiles[it][y][x]);
    }

    // Create a single compute to calculate the output.
    auto computeOp = rewriter.create<spatial::ComputeOp>(
        loc, outputTileTypes, SmallVector<Value>(), inputTilesList);

    // Create a new block for the compute unit and add the operands.
    Block *block = rewriter.createBlock(&computeOp.getRegion());

    // Fill the block arguments and keep a reference to them, addressed as
    // [channel_tile][y][x] in the same linear order as inputTilesList.
    std::map<size_t, std::map<size_t, std::map<size_t, Value>>> inputTilesArgs;
    for (size_t y = 0; y < input_h; ++y) {
      for (size_t x = 0; x < input_w; ++x) {
        for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
          auto tileIndex = y * input_w * (itc.quot + (itc.rem > 0)) +
                           x * (itc.quot + (itc.rem > 0)) + it;
          inputTilesArgs[it][y][x] = block->addArgument(
              computeOp->getOperand(tileIndex).getType(), loc);
        }
      }
    }

    // Begin writing in the block.
    rewriter.setInsertionPointToStart(block);

    // Go through all pooling blocks: reduce every window with a ReduceOp
    // tree, then optionally divide (average pooling).
    SmallVector<Value> outputTiles;
    for (size_t y = 0; y < output_h; ++y) {
      for (size_t x = 0; x < output_w; ++x) {
        for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
          size_t start_x = x * stride_x;
          size_t start_y = y * stride_y;
          // Clamp the window to the input; windows may run off the edge.
          size_t end_x = std::min(start_x + krn_w, input_w);
          size_t end_y = std::min(start_y + krn_h, input_h);
          SmallVector<Value> inputTilesToReduce;
          for (size_t ky = start_y; ky < end_y; ++ky)
            for (size_t kx = start_x; kx < end_x; ++kx)
              inputTilesToReduce.push_back(inputTilesArgs[it][ky][kx]);
          auto reduceResult =
              reduceInputTiles<ReduceOp>(inputTilesToReduce, rewriter);
          // If the reduce op is add, we need to divide the result by the
          // number of elements in the pooling window.
          // NOTE(review): unlike postProcessExperimentalPoolingWindow, this
          // inline path always divides by the full krn_w * krn_h, ignoring
          // count_include_pad and clipped windows -- confirm intended.
          if (hasPostProcessExperimentalPoolingWindow<PoolOp>()) {
            // Add a spat.const before the computeOp (constants must live in
            // global memory, allocated directly under func.func).
            rewriter.setInsertionPoint(computeOp);
            auto divisorValue = rewriter.create<spatial::ConstOp>(loc,
                RankedTensorType::get({1}, rewriter.getF32Type()),
                rewriter.getI64IntegerAttr(krn_w * krn_h),
                rewriter.getBoolAttr(true));
            rewriter.setInsertionPointAfter(reduceResult.getDefiningOp());
            reduceResult = rewriter.create<spatial::DivOp>(
                loc, reduceResult.getType(), reduceResult, divisorValue);
          }
          outputTiles.push_back(reduceResult);
        }
      }
    }

    // Create a YieldOp to return the output tiles.
    rewriter.create<spatial::YieldOp>(loc, outputTiles);

    // Set the rewrite cursor right after the computeOp.
    rewriter.setInsertionPointAfter(computeOp);

    // Index the compute results back as [channel_tile][y][x].
    std::map<size_t, std::map<size_t, std::map<size_t, Value>>> computeOutput;
    for (size_t y = 0; y < output_h; ++y) {
      for (size_t x = 0; x < output_w; ++x) {
        for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
          auto tileIndex = y * output_w * (itc.quot + (itc.rem > 0)) +
                           x * (itc.quot + (itc.rem > 0)) + it;
          computeOutput[it][y][x] = computeOp.getResult(tileIndex);
        }
      }
    }

    // We'll now create spat.img.concat ops to concatenate the output tiles.
    SmallVector<Value> outputTilesList;
    for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
      SmallVector<Value> imgConcatTiles;
      for (size_t y = 0; y < output_h; ++y)
        for (size_t x = 0; x < output_w; ++x)
          imgConcatTiles.push_back(computeOutput[it][y][x]);
      size_t tilingSize = it == tileCount.quot ? tileCount.rem : crossbarSize;
      SmallVector<int64_t> outputShapeArray{
          /* 0 */ 1, // Batch size is always 1.
          /* 1 */ (long)tilingSize,
          /* 2 */ (long)output_w,
          /* 3 */ (long)output_h};
      auto elementType = xShape.getElementType();
      outputTilesList.push_back(rewriter.create<spatial::ImgConcatOp>(loc,
          RankedTensorType::get(outputShapeArray, elementType),
          imgConcatTiles));
    }

    // Create a new tensor.ConcatOp to concatenate the output tiles along the
    // channel dimension (dim 1).
    Value outputTensor =
        rewriter.create<tensor::ConcatOp>(loc, /*dim=*/1, outputTilesList);
    rewriter.replaceOp(poolOp, outputTensor);
    return success();
  }
};

/// Registers the experimental tiled pooling patterns.
/// NOTE(review): the pattern template arguments were stripped in the garbled
/// copy; max pooling reducing with a spatial max op and average pooling
/// reducing with a spatial add op (then dividing) is the reconstruction --
/// confirm the exact ONNX/Spatial op names.
void populateExperimentalPoolingTilingPattern(
    RewritePatternSet &patterns, MLIRContext *ctx) {
  patterns.insert<
      ExperimentalPoolingBaseConverter<ONNXMaxPoolSingleOutOp, spatial::MaxOp>>(
      ctx);
  patterns.insert<
      ExperimentalPoolingBaseConverter<ONNXAveragePoolOp, spatial::AddOp>>(
      ctx);
}

} // namespace onnx_mlir