add constant folding and verification pass for pim host operations
better validation scripts output big refactors
This commit is contained in:
@@ -1,300 +0,0 @@
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/Dialect/Tosa/IR/TosaOps.h"
|
||||
#include "mlir/IR/BuiltinAttributes.h"
|
||||
#include "mlir/IR/BuiltinTypeInterfaces.h"
|
||||
#include "mlir/IR/BuiltinTypes.h"
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
#include "mlir/IR/Value.h"
|
||||
#include "mlir/IR/ValueRange.h"
|
||||
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstddef>
|
||||
|
||||
#include "src/Accelerators/PIM/Common/PIMCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp"
|
||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
/// Trait: whether the reduced pooling window of PoolOp needs a
/// post-processing step (see postProcessExperimentalPoolingWindow).
/// Defaults to "no"; pooling ops that need one (e.g. average pooling, which
/// must divide the summed window) opt in via template specialization.
/// Marked constexpr so callers can branch at compile time, and [[nodiscard]]
/// because ignoring the answer is always a bug.
template <typename PoolOp>
[[nodiscard]] constexpr bool hasPostProcessExperimentalPoolingWindow() {
  return false;
}
|
||||
|
||||
template <>
|
||||
bool hasPostProcessExperimentalPoolingWindow<ONNXAveragePoolOp>() {
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename PoolOp>
|
||||
Value postProcessExperimentalPoolingWindow(ConversionPatternRewriter& rewriter,
|
||||
Location loc,
|
||||
PoolOp poolOp,
|
||||
Value valueToDivide,
|
||||
size_t krn_size,
|
||||
size_t tilesSkippedByPadding) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
/// Average-pooling post-processing: divides the summed window tile by the
/// divisor dictated by the op's count_include_pad attribute.
///
/// \param rewriter              rewriter; the insertion point is modified and
///                              left just after the emitted division.
/// \param loc                   location attached to the created ops.
/// \param poolOp                the ONNXAveragePoolOp being lowered.
/// \param valueToDivide         summed window tile produced inside a
///                              spatial::SpatWeightedCompute region.
/// \param krn_size              total element count of the kernel window.
/// \param tilesSkippedByPadding window elements that fell into padding.
/// \return the divided tile value.
template <>
Value postProcessExperimentalPoolingWindow<ONNXAveragePoolOp>(ConversionPatternRewriter& rewriter,
    Location loc,
    ONNXAveragePoolOp poolOp,
    Value valueToDivide,
    size_t krn_size,
    size_t tilesSkippedByPadding) {
  // count_include_pad == 1: padded positions count toward the divisor.
  bool countIncludePad = poolOp.getCountIncludePad() == 1;

  // NOTE(review): size_t arithmetic — this underflows if
  // tilesSkippedByPadding ever exceeds krn_size; confirm callers guarantee
  // otherwise (and that the divisor cannot be zero).
  size_t divisorNumber = countIncludePad ? krn_size : krn_size - tilesSkippedByPadding;

  // NOTE(review): the tensor type is f32 while the attribute below is i64 —
  // presumably downstream codegen reconciles the two; verify.
  RankedTensorType scalarTensor = RankedTensorType::get({1}, rewriter.getF32Type());

  // Put a spat.const before the computeOp, and use its value. We do this to be
  // compatible with the current code generation, which assumes constant to be
  // loaded in global memory, which is allocated by adding a spat.const OP
  // directly under func.func (i.e. alongside ComputeOps)
  // NOTE(review): getDefiningOp() is null when valueToDivide is a block
  // argument (single-tile window) — confirm callers never pass one.
  auto computeOp = cast<spatial::SpatWeightedCompute>(valueToDivide.getDefiningOp()->getParentOp());
  rewriter.setInsertionPoint(computeOp);
  auto divisorValue = rewriter.create<spatial::SpatConstantOp>(loc,
      scalarTensor,
      rewriter.getI64IntegerAttr(divisorNumber),
      /* should_allocate = */ rewriter.getBoolAttr(true));

  // Emit the division back inside the compute region, right after the value
  // it consumes.
  rewriter.setInsertionPointAfterValue(valueToDivide);
  return rewriter.create<spatial::SpatVSDivOp>(loc, valueToDivide.getType(), valueToDivide, divisorValue);
}
|
||||
|
||||
template <typename ReductionOp>
|
||||
Value reduceInputTiles(SmallVector<Value>& inputTiles, ConversionPatternRewriter& rewriter) {
|
||||
if (inputTiles.size() == 1)
|
||||
return inputTiles[0];
|
||||
|
||||
if (inputTiles.size() == 2) {
|
||||
return rewriter.create<spatial::SpatVMaxOp>(
|
||||
inputTiles[0].getLoc(), inputTiles[0].getType(), inputTiles[0], inputTiles[1]);
|
||||
}
|
||||
|
||||
SmallVector<Value> left(inputTiles.begin(), inputTiles.begin() + inputTiles.size() / 2);
|
||||
SmallVector<Value> right(inputTiles.begin() + inputTiles.size() / 2, inputTiles.end());
|
||||
|
||||
Value leftReduced = reduceInputTiles<ReductionOp>(left, rewriter);
|
||||
Value rightReduced = reduceInputTiles<ReductionOp>(right, rewriter);
|
||||
|
||||
return rewriter.create<ReductionOp>(inputTiles[0].getLoc(), leftReduced.getType(), leftReduced, rightReduced);
|
||||
}
|
||||
|
||||
template <typename PoolOp, typename PoolOpAdaptor, typename ReduceOp>
|
||||
struct ExperimentalPoolingBaseConverter : public OpConversionPattern<PoolOp> {
|
||||
ExperimentalPoolingBaseConverter(MLIRContext* ctx)
|
||||
: OpConversionPattern<PoolOp>(ctx) {}
|
||||
|
||||
LogicalResult matchAndRewrite(PoolOp poolOp, PoolOpAdaptor adaptor, ConversionPatternRewriter& rewriter) const final {
|
||||
Value X = adaptor.getX();
|
||||
ShapedType xShape = mlir::cast<ShapedType>(X.getType());
|
||||
Value Y = poolOp.getResult();
|
||||
ShapedType yShape = mlir::cast<ShapedType>(Y.getType());
|
||||
|
||||
size_t stride_x, stride_y, dilation_x, dilation_y, krn_w, krn_h;
|
||||
unpackOptionalPairVector(adaptor.getStrides(), stride_x, stride_y);
|
||||
unpackOptionalPairVector(adaptor.getDilations(), dilation_x, dilation_y);
|
||||
unpackOptionalPairVector(adaptor.getKernelShape(), krn_w, krn_h);
|
||||
|
||||
if (adaptor.getAutoPad() != "NOTSET")
|
||||
return rewriter.notifyMatchFailure(poolOp, "auto_pad != NOTSET is deprecated.");
|
||||
|
||||
size_t pad_x, pad_y;
|
||||
auto padUnpackError = unpackOptionalPadsVector(adaptor.getPads(), pad_x, pad_y);
|
||||
if (padUnpackError.has_value())
|
||||
return rewriter.notifyMatchFailure(poolOp, padUnpackError.value());
|
||||
|
||||
Location loc = poolOp.getLoc();
|
||||
|
||||
size_t input_h = GET_IMAGE_HEIGHT(xShape);
|
||||
size_t input_w = GET_IMAGE_WIDTH(xShape);
|
||||
size_t output_h = GET_IMAGE_HEIGHT(yShape);
|
||||
size_t output_w = GET_IMAGE_WIDTH(yShape);
|
||||
|
||||
ldiv_t tileCount = std::div(GET_IMAGE_CHANNEL(xShape), crossbarSize);
|
||||
|
||||
// Assert that the input is a tensor.ConcatOp.
|
||||
auto concat = X.getDefiningOp<tensor::ConcatOp>();
|
||||
if (!concat)
|
||||
return rewriter.notifyMatchFailure(poolOp, "Expected input to be a tensor.ConcatOp");
|
||||
|
||||
// Create a [channel_tile][x][y] array to store the input tiles.
|
||||
std::map<long, std::map<long, std::map<long, Value>>> inputTiles;
|
||||
|
||||
// For each argument of the tensor.ConcatOp, resolve the input tiles.
|
||||
for (size_t y = 0; y < input_h; ++y) {
|
||||
for (size_t x = 0; x < input_w; ++x) {
|
||||
for (long it = 0; it < tileCount.quot + (tileCount.rem > 0); ++it) {
|
||||
size_t tilingSize = it == tileCount.quot ? tileCount.rem : crossbarSize;
|
||||
|
||||
SmallVector<OpFoldResult> strides(4, rewriter.getIndexAttr(1));
|
||||
SmallVector<OpFoldResult> offsets = {/* 0 */ rewriter.getIndexAttr(0),
|
||||
/* 1 */ rewriter.getIndexAttr(0),
|
||||
/* 2 */ rewriter.getIndexAttr(x),
|
||||
/* 3 */ rewriter.getIndexAttr(y)};
|
||||
SmallVector<OpFoldResult> sizes = {/* 0 */ rewriter.getIndexAttr(1), // Batch size is always 1.
|
||||
/* 1 */ rewriter.getIndexAttr(tilingSize),
|
||||
/* 2 */ rewriter.getIndexAttr(1),
|
||||
/* 3 */ rewriter.getIndexAttr(1)};
|
||||
|
||||
// Get the concat's operand that we want to slice.
|
||||
Value concatInput = concat.getOperand(it);
|
||||
Value slicedTile = rewriter.create<tensor::ExtractSliceOp>(loc, concatInput, offsets, sizes, strides);
|
||||
|
||||
inputTiles[it][x][y] = slicedTile;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Prepare the shape of the compute's output.
|
||||
ldiv_t itc = tileCount;
|
||||
SmallVector<Type> outputTileTypes;
|
||||
for (size_t y = 0; y < output_h; ++y) {
|
||||
for (size_t x = 0; x < output_w; ++x) {
|
||||
for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
|
||||
SmallVector<int64_t> outputShapeArray {/* 0 */ 1, // Batch size is always 1.
|
||||
/* 1 */
|
||||
cast<RankedTensorType>(inputTiles[it][0][0].getType()).getShape()[1],
|
||||
/* 2 */ 1,
|
||||
/* 3 */ 1};
|
||||
|
||||
auto elementType = dyn_cast<RankedTensorType>(xShape).getElementType();
|
||||
|
||||
outputTileTypes.push_back(RankedTensorType::get(outputShapeArray, elementType));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create a plain value list of the input tiles.
|
||||
SmallVector<Value> inputTilesList;
|
||||
for (size_t y = 0; y < input_h; ++y) {
|
||||
for (size_t x = 0; x < input_w; ++x)
|
||||
for (long it = 0; it < itc.quot + (itc.rem > 0); ++it)
|
||||
inputTilesList.push_back(inputTiles[it][y][x]);
|
||||
}
|
||||
|
||||
// Create a single compute to calculate the output.
|
||||
auto computeOp =
|
||||
rewriter.create<spatial::SpatWeightedCompute>(loc, outputTileTypes, SmallVector<Value>(), inputTilesList);
|
||||
|
||||
// Create a new block for the compute unit and add the operands.
|
||||
Block* block = rewriter.createBlock(&computeOp.getRegion());
|
||||
|
||||
// Fill the block arguments and keep a reference to them.
|
||||
std::map<size_t, std::map<size_t, std::map<size_t, Value>>> inputTilesArgs;
|
||||
for (size_t y = 0; y < input_h; ++y) {
|
||||
for (size_t x = 0; x < input_w; ++x) {
|
||||
for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
|
||||
auto tileIndex = y * input_w * (itc.quot + (itc.rem > 0)) + x * (itc.quot + (itc.rem > 0)) + it;
|
||||
inputTilesArgs[it][y][x] = block->addArgument(computeOp->getOperand(tileIndex).getType(), loc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Begin writing in the block.
|
||||
rewriter.setInsertionPointToStart(block);
|
||||
|
||||
// Go through all pooling blocks.
|
||||
SmallVector<Value> outputTiles;
|
||||
for (size_t y = 0; y < output_h; ++y) {
|
||||
for (size_t x = 0; x < output_w; ++x) {
|
||||
for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
|
||||
size_t start_x = x * stride_x;
|
||||
size_t start_y = y * stride_y;
|
||||
size_t end_x = std::min(start_x + krn_w, input_w);
|
||||
size_t end_y = std::min(start_y + krn_h, input_h);
|
||||
|
||||
SmallVector<Value> inputTilesToReduce;
|
||||
for (size_t ky = start_y; ky < end_y; ++ky)
|
||||
for (size_t kx = start_x; kx < end_x; ++kx)
|
||||
inputTilesToReduce.push_back(inputTilesArgs[it][ky][kx]);
|
||||
|
||||
auto reduceResult = reduceInputTiles<ReduceOp>(inputTilesToReduce, rewriter);
|
||||
|
||||
// If the reduce op is add, we need to divide the result by the
|
||||
// number of elements in the pooling window.
|
||||
if (hasPostProcessExperimentalPoolingWindow<PoolOp>()) {
|
||||
// Add a spat.const before the computeOp.
|
||||
rewriter.setInsertionPoint(computeOp);
|
||||
auto divisorValue =
|
||||
rewriter.create<spatial::SpatConstantOp>(loc,
|
||||
RankedTensorType::get({1}, rewriter.getF32Type()),
|
||||
rewriter.getI64IntegerAttr(krn_w * krn_h),
|
||||
rewriter.getBoolAttr(true));
|
||||
|
||||
rewriter.setInsertionPointAfter(reduceResult.getDefiningOp());
|
||||
reduceResult =
|
||||
rewriter.create<spatial::SpatVSDivOp>(loc, reduceResult.getType(), reduceResult, divisorValue);
|
||||
}
|
||||
outputTiles.push_back(reduceResult);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create a YieldOp to return the output tiles.
|
||||
rewriter.create<spatial::SpatYieldOp>(loc, outputTiles);
|
||||
|
||||
// Set the rewrite cursor right after the computeOp.
|
||||
rewriter.setInsertionPointAfter(computeOp);
|
||||
|
||||
std::map<size_t, std::map<size_t, std::map<size_t, Value>>> computeOutput;
|
||||
for (size_t y = 0; y < output_h; ++y) {
|
||||
for (size_t x = 0; x < output_w; ++x) {
|
||||
for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
|
||||
auto tileIndex = y * output_w * (itc.quot + (itc.rem > 0)) + x * (itc.quot + (itc.rem > 0)) + it;
|
||||
computeOutput[it][y][x] = computeOp.getResult(tileIndex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We'll now create spat.img.concat ops to concatenate the output tiles.
|
||||
SmallVector<Value> outputTilesList;
|
||||
for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
|
||||
SmallVector<Value> imgConcatTiles;
|
||||
for (size_t y = 0; y < output_h; ++y)
|
||||
for (size_t x = 0; x < output_w; ++x)
|
||||
imgConcatTiles.push_back(computeOutput[it][y][x]);
|
||||
|
||||
size_t tilingSize = it == tileCount.quot ? tileCount.rem : crossbarSize;
|
||||
|
||||
SmallVector<int64_t> outputShapeArray {/* 0 */ 1, // Batch size is always 1.
|
||||
/* 1 */ (long) tilingSize,
|
||||
/* 2 */ (long) output_w,
|
||||
/* 3 */ (long) output_h};
|
||||
|
||||
auto elementType = dyn_cast<RankedTensorType>(xShape).getElementType();
|
||||
|
||||
outputTilesList.push_back(rewriter.create<spatial::SpatImgConcatOp>(
|
||||
loc, RankedTensorType::get(outputShapeArray, elementType), imgConcatTiles));
|
||||
}
|
||||
|
||||
// Create a new tensor.ConcatOp to concatenate the output tiles.
|
||||
Value outputTensor = rewriter.create<tensor::ConcatOp>(loc, 1, outputTilesList);
|
||||
|
||||
rewriter.replaceOp(poolOp, outputTensor);
|
||||
|
||||
return success();
|
||||
}
|
||||
};
|
||||
|
||||
/// Registers the experimental pooling tiling conversion patterns with the
/// given pattern set. Max pooling reduces window tiles with element-wise max
/// (SpatVMaxOp); average pooling sums them (SpatVAddOp) and divides in a
/// post-processing step.
void populateExperimentalPoolingTilingPattern(RewritePatternSet& patterns, MLIRContext* ctx) {
  patterns.insert<
      ExperimentalPoolingBaseConverter<ONNXMaxPoolSingleOutOp, ONNXMaxPoolSingleOutOpAdaptor, spatial::SpatVMaxOp>,
      ExperimentalPoolingBaseConverter<ONNXAveragePoolOp, ONNXAveragePoolOpAdaptor, spatial::SpatVAddOp>>(ctx);
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
@@ -26,8 +26,6 @@ using namespace mlir;
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
// NOTE(review): file-scope mutable set — presumably records compute ops that
// were already replaced so later pattern applications skip them; confirm
// against its uses. Consider an anonymous namespace for internal linkage.
llvm::SmallPtrSet<Operation*, 16> oldComputeOpsReplaced;
|
||||
|
||||
Value applyReducePatternNew(SmallVector<Value>& valuesToReduce,
|
||||
ConversionPatternRewriter& rewriter,
|
||||
std::function<Value(const Value&, const Value&)> reduce,
|
||||
@@ -225,12 +223,12 @@ struct PoolingBaseConverter : public OpConversionPattern<PoolOp> {
|
||||
|
||||
Location loc = poolOp.getLoc();
|
||||
|
||||
size_t input_h = GET_IMAGE_HEIGHT(xShape);
|
||||
size_t input_w = GET_IMAGE_WIDTH(xShape);
|
||||
size_t output_h = GET_IMAGE_HEIGHT(yShape);
|
||||
size_t output_w = GET_IMAGE_WIDTH(yShape);
|
||||
size_t channelTileCount = ceilIntegerDivide(GET_IMAGE_CHANNEL(xShape), crossbarSize.getValue());
|
||||
size_t channelTileRest = GET_IMAGE_CHANNEL(xShape) % crossbarSize;
|
||||
size_t input_h = getImageHeight(xShape);
|
||||
size_t input_w = getImageWidth(xShape);
|
||||
size_t output_h = getImageHeight(yShape);
|
||||
size_t output_w = getImageWidth(yShape);
|
||||
size_t channelTileCount = ceilIntegerDivide(getImageChannel(xShape), crossbarSize.getValue());
|
||||
size_t channelTileRest = getImageChannel(xShape) % crossbarSize;
|
||||
|
||||
// 1: Tile the input tensor
|
||||
// Input tiles need to be indexed by:
|
||||
|
||||
Reference in New Issue
Block a user