remove old unused stuff

2026-03-23 20:00:09 +01:00
parent f2d593f749
commit f869925b64
12 changed files with 16 additions and 906 deletions
@@ -7,12 +7,8 @@ add_pim_library(OMONNXToSpatial
  Patterns/Math/Conv.cpp
  Patterns/Math/MatMul.cpp
  Patterns/NN/Pool.cpp
  Patterns/NN/ReduceMean.cpp
  Patterns/Tensor/Concat.cpp
  Patterns/Tensor/Reshape.cpp
  Utils/SpatialReducer.cpp
  Utils/WeightSubdivider.cpp
  Utils/AnnotateReplication.cpp
  ONNXToSpatialPass.cpp
  Common.cpp
@@ -57,8 +57,6 @@ inline auto getFilterCount(const ShapedType& shapedType) {
  return shapedType.getDimSize(0);
 }
 inline constexpr mlir::StringRef REPLICATION_ATTR_NAME = "replication_factor";
 using HSliceId = size_t;
 using CoreId = size_t;
@@ -11,7 +11,6 @@
 #include <fstream>
 #include "Common/PimCommon.hpp"
 #include "Conversion/ONNXToSpatial/Utils/AnnotateReplication.hpp"
 #include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Patterns.hpp"
 #include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp"
@@ -68,11 +67,6 @@ void ONNXToSpatialPass::runOnOperation() {
    signalPassFailure();
    return;
  }
  if (annotateReplication(*entryFunc, rewriter).failed()) {
    llvm::dbgs() << "Failed during annotation for replication analysis\n";
    signalPassFailure();
    return;
  }
  ConversionTarget target(*ctx);
  target.addLegalDialect<spatial::SpatialDialect, ONNXDialect, tensor::TensorDialect, arith::ArithDialect>();
@@ -98,7 +92,6 @@ void ONNXToSpatialPass::runOnOperation() {
  populateReshapeConversionPattern(patterns, ctx);
  populateONNXConcatToTensorConcatPattern(patterns, ctx);
  populateReduceMeanConversionPattern(patterns, ctx);
  if (failed(applyPartialConversion(moduleOp, target, std::move(patterns)))) {
    signalPassFailure();
@@ -17,6 +17,4 @@ void populateONNXConcatToTensorConcatPattern(mlir::RewritePatternSet& patterns,
 void populateReshapeConversionPattern(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);
 void populateReduceMeanConversionPattern(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);
 } // namespace onnx_mlir
@@ -5,14 +5,12 @@
 #include "mlir/Support/LogicalResult.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include <cassert>
 #include "src/Accelerators/PIM/Common/PimCommon.hpp"
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common.hpp"
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp"
 #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
 #include "src/Dialect/ONNX/ONNXOps.hpp"
@@ -21,12 +19,8 @@ using namespace mlir;
 namespace onnx_mlir {
 namespace {
-constexpr StringRef COMPUTE_HAS_SOFTMAX_DIVISOR_ATTRNAME = "computeWithSoftmaxDivisor";
+static FailureOr<Value>
-
+materializeScaledConstantTensor(Value value, float factor, ConversionPatternRewriter& rewriter, Location loc) {
 static FailureOr<Value> materializeScaledConstantTensor(Value value,
                                                        float factor,
                                                        ConversionPatternRewriter& rewriter,
                                                        Location loc) {
  if (factor == 1.0f)
    return value;
@@ -70,16 +64,6 @@ struct GemvToSpatialCompute : OpConversionPattern<ONNXGemmOp> {
  LogicalResult matchAndRewrite(ONNXGemmOp gemmOp,
                                ONNXGemmOpAdaptor gemmOpAdaptor,
                                ConversionPatternRewriter& rewriter) const override;
 private:
  static Value resolveONNXExpOpFromUseChain(Value startValue);
  static LogicalResult softmaxReductionApplication(SmallVector<OpAndResNum>& outputOpsAndResNums,
                                                   Value& softmaxChannel,
                                                   ConversionPatternRewriter& rewriter,
                                                   SpatialReducer& reducer,
                                                   ONNXGemmOp& gemmOp,
                                                   Location& loc);
 };
 } // namespace
@@ -122,7 +106,13 @@ LogicalResult GemmToManyGemv::matchAndRewrite(ONNXGemmOp gemmOp,
    // Expand rank-1 bias [N] to rank-2 [1, N] for uniform handling
    if (cType.getRank() == 1) {
      auto expandedType = RankedTensorType::get({1, cType.getDimSize(0)}, cType.getElementType());
-      c = tensor::ExpandShapeOp::create(rewriter, loc, expandedType, c, SmallVector<ReassociationIndices>{{0, 1}});
+      c = tensor::ExpandShapeOp::create(rewriter,
                                        loc,
                                        expandedType,
                                        c,
                                        SmallVector<ReassociationIndices> {
                                          {0, 1}
      });
      cType = expandedType;
    }
    assert("Only support rank 2 tensor for C" && cType.getRank() == 2);
@@ -208,7 +198,13 @@ LogicalResult GemvToSpatialCompute::matchAndRewrite(ONNXGemmOp gemmOp,
    // Expand rank-1 bias [N] to rank-2 [1, N] for uniform handling
    if (cType.getRank() == 1) {
      auto expandedType = RankedTensorType::get({1, cType.getDimSize(0)}, cType.getElementType());
-      c = tensor::ExpandShapeOp::create(rewriter, gemmLoc, expandedType, c, SmallVector<ReassociationIndices>{{0, 1}});
+      c = tensor::ExpandShapeOp::create(rewriter,
                                        gemmLoc,
                                        expandedType,
                                        c,
                                        SmallVector<ReassociationIndices> {
                                          {0, 1}
      });
      cType = expandedType;
    }
    assert("Only support rank 2 tensor for C" && cType.getRank() == 2);
@@ -356,124 +352,6 @@ LogicalResult GemvToSpatialCompute::matchAndRewrite(ONNXGemmOp gemmOp,
  return success();
 }
 /// Walks backwards from `startValue` through operand 0 of each defining
 /// operation until a value produced by an ONNXExpOp is reached, and returns
 /// that value.
 ///
 /// The chain is expected to end in an ONNXExpOp; violations are reported via
 /// `assert`. Every link is validated *before* it is dereferenced, so a start
 /// value without a defining op (e.g. a block argument) or an operation
 /// without operands triggers a descriptive assertion instead of an invalid
 /// access inside `isa<>` / `getOperand`.
 Value GemvToSpatialCompute::resolveONNXExpOpFromUseChain(Value startValue) {
  Value walker = startValue;
  while (true) {
    Operation* definingOp = walker ? walker.getDefiningOp() : nullptr;
    assert(definingOp
           && "Unwinded the whole chain of operations while trying to "
              "find ONNXExpOp, but did not find it");
    if (llvm::isa<ONNXExpOp>(definingOp))
      return walker;
    // Only operand 0 is followed, so the operation must have at least one.
    assert(definingOp->getNumOperands() > 0
           && "Old output tile (softmax reducer) is not produced by an "
              "ONNXExpOp");
    walker = definingOp->getOperand(0);
  }
 }
 /// Applies the softmax normalisation across a set of weighted-compute ops.
 ///
 /// Steps performed:
 ///  1. Every entry of `outputOpsAndResNums` is cast to a
 ///     `spatial::SpatWeightedCompute` (the cast asserts on any other op).
 ///  2. `reducer.applyReducePattern` sums each selected result with
 ///     `SpatSumOp` (preprocess), combines the partial sums with `SpatVAddOp`
 ///     (reduce), and in the postprocess step tags the compute holding the
 ///     final sum and broadcasts that sum on `softmaxChannel`.
 ///  3. For every compute left in the reduced list, the divisor is obtained
 ///     (from the local broadcast-send op, or through a newly created
 ///     broadcast-receive op), the value produced by the ONNXExpOp is divided
 ///     by it with `SpatVSDivOp`, and the quotient is appended as a new yield
 ///     operand.
 ///
 /// @param outputOpsAndResNums In: ops/result indices to normalise. Out:
 ///        cleared and refilled with (compute, index of the appended yield
 ///        operand) pairs.
 /// @param softmaxChannel Channel value passed to the broadcast send/receive ops.
 /// @param rewriter Rewriter used to create all new operations.
 /// @param reducer Reducer that performs the cross-compute reduction.
 /// @param gemmOp Not referenced in this body.
 /// @param loc Location attached to every created operation.
 /// @return Always `success()`; unexpected IR shapes are reported via `assert`.
 LogicalResult GemvToSpatialCompute::softmaxReductionApplication(SmallVector<OpAndResNum>& outputOpsAndResNums,
                                                                 Value& softmaxChannel,
                                                                 ConversionPatternRewriter& rewriter,
                                                                 SpatialReducer& reducer,
                                                                 ONNXGemmOp& gemmOp,
                                                                 Location& loc) {
  // TODO: Check case with one compute op
  // Cast vector of Value into vector of ComputeOp
  SmallVector<ComputeAndResNum> softmaxOpsToReduce =
    llvm::to_vector(llvm::map_range(outputOpsAndResNums, [&](OpAndResNum computeAndResNum) {
      return std::make_pair(cast<spatial::SpatWeightedCompute>(computeAndResNum.first), computeAndResNum.second);
    }));
  // Single-element f32 tensor type used for the partial sums and the divisor.
  RankedTensorType::Builder tensorTypeBuilder({1}, Float32Type::get(rewriter.getContext()), nullptr);
  const TensorType scalarTensorType = tensorTypeBuilder;
  reducer.applyReducePattern(
    softmaxOpsToReduce,
    [&](Value a, Value b) { return spatial::SpatVAddOp::create(rewriter, loc, scalarTensorType, a, b); },
    /* preprocess = */
    [&](Value a) { return spatial::SpatSumOp::create(rewriter, loc, scalarTensorType, a); },
    [&](Value softmaxDivisor) {
      // Signal that this is the compute with the softmax divisor
      auto computeOp = cast<spatial::SpatWeightedCompute>(softmaxDivisor.getDefiningOp()->getParentOp());
      computeOp->setAttr(COMPUTE_HAS_SOFTMAX_DIVISOR_ATTRNAME, rewriter.getUnitAttr());
      // Broadcast the divisor to all the cores
      rewriter.setInsertionPointAfterValue(softmaxDivisor);
      spatial::SpatChannelBroadcastSendOp::create(rewriter, loc, softmaxChannel, softmaxDivisor);
      /*
       * softmaxDividend = onnx.exp (...)
       * sum = spat.SumOp(softmaxDividend)
       * [following can be repeated N times, thus walk the use chain]
       * softmaxDivisor = spat.sadd(sum, ...)
       */
      // `softmaxDividend` is only consumed by the assertion below (sanity check).
      Value softmaxDividend = resolveONNXExpOpFromUseChain(softmaxDivisor.getDefiningOp()->getOperand(0));
      // Make sure the dividend is actually produced by an ONNXExpOp
      assert(llvm::isa<ONNXExpOp>(softmaxDividend.getDefiningOp())
             && "Dividend of softmax reduction is not an ONNXExpOp");
      // Do not divide here, divide after this
      return softmaxDivisor;
    });
  // In all the cores, insert a ChannelRecvOp and divide the output tile by
  // the reduced denominator.
  outputOpsAndResNums.clear();
  outputOpsAndResNums.reserve(softmaxOpsToReduce.size());
  for (auto& computeToDivideOpAndResNum : softmaxOpsToReduce) {
    auto yieldOp = cast<spatial::SpatYieldOp>(computeToDivideOpAndResNum.first.getBody().front().getTerminator());
    Value divisor;
    // Check if this compute contains the softmax divisor: if so, find the
    // ChannelBroadcastSendOp, otherwise receive the value from the channel
    // using ChannelBroadcastReceiveOp
    if (computeToDivideOpAndResNum.first->hasAttr(COMPUTE_HAS_SOFTMAX_DIVISOR_ATTRNAME)) {
      bool found = false;
      for (auto broadcastOp :
           computeToDivideOpAndResNum.first.getBody().front().getOps<spatial::SpatChannelBroadcastSendOp>()) {
        assert(found == false
               && "More than one ChannelBroadcastSendOp in "
                  "compute? How is this possible?");
        found = true;
        divisor = broadcastOp.getData();
      }
      assert(found
             && "No ChannelBroadcastSendOp in compute where softmax "
                "divisor was specified to be?");
    }
    else {
      rewriter.setInsertionPoint(yieldOp);
      divisor = spatial::SpatChannelBroadcastReceiveOp::create(rewriter, loc, scalarTensorType, softmaxChannel);
    }
    // Walk the chain of operations until we find the ONNXExpOp: this is
    // needed because some computes may have a different amount of `VAddOp`s
    // due to the tree reduction (e.g. some may have no VAddOp, some may have
    // multiples)
    Value oldOutputTile = resolveONNXExpOpFromUseChain(yieldOp->getOperand(computeToDivideOpAndResNum.second));
    rewriter.setInsertionPoint(yieldOp);
    Value newOutputTile = spatial::SpatVSDivOp::create(rewriter, loc, oldOutputTile.getType(), oldOutputTile, divisor);
    // Append the normalised tile as a new yield operand and report its index.
    auto yieldOperandNum = yieldOp->getNumOperands();
    yieldOp->insertOperands(yieldOperandNum, newOutputTile);
    outputOpsAndResNums.push_back({computeToDivideOpAndResNum.first, yieldOperandNum});
  }
  return success();
 }
 void populateOnnxGemmOpPatterns(RewritePatternSet& patterns, MLIRContext* ctx) {
  patterns.insert<GemmToManyGemv>(ctx);
  patterns.insert<GemvToSpatialCompute>(ctx);
@@ -1,89 +0,0 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "Conversion/ONNXToSpatial/Patterns.hpp"
 #include "src/Dialect/ONNX/ONNXOps.hpp"
 using namespace mlir;
 namespace onnx_mlir {
 /// Rewrites an ONNXReduceMeanV13Op into an ONNXAveragePoolOp whose kernel and
 /// strides both equal dimensions 2 and 3 of the input tensor, so that the
 /// result has shape [dim0, dim1, 1, 1].
 ///
 /// The pattern only matches ranked 4-D inputs with static dimensions 2 and 3;
 /// anything else is reported as a match failure instead of indexing past the
 /// end of the shape or building a kernel from a dynamic-size sentinel.
 ///
 /// NOTE(review): the `axes` / `keepdims` attributes of the reduce op are not
 /// inspected here; the rewrite presumably assumes a reduction over dims 2 and
 /// 3 with kept dimensions -- confirm against the producers of this op.
 struct ReduceMeanConversionPattern : public OpConversionPattern<ONNXReduceMeanV13Op> {
  ReduceMeanConversionPattern(MLIRContext* ctx)
  : OpConversionPattern(ctx) {}

  LogicalResult matchAndRewrite(ONNXReduceMeanV13Op reduceMean,
                                ONNXReduceMeanV13OpAdaptor adaptor,
                                ConversionPatternRewriter& rewriter) const final {
    Value inputTensor = adaptor.getData();

    // The kernel is derived from dims 2 and 3, so a ranked 4-D tensor is required.
    auto inputTensorType = dyn_cast<RankedTensorType>(inputTensor.getType());
    if (!inputTensorType || inputTensorType.getRank() != 4)
      return rewriter.notifyMatchFailure(reduceMean, "expected a ranked 4-D input tensor");
    if (inputTensorType.isDynamicDim(2) || inputTensorType.isDynamicDim(3))
      return rewriter.notifyMatchFailure(reduceMean, "expected static dimensions 2 and 3");

    // Keep the extents as int64_t: narrowing to int would truncate large dims.
    ArrayRef<int64_t> inputShape = inputTensorType.getShape();
    const int64_t imageHeight = inputShape[2];
    const int64_t imageWidth = inputShape[3];

    // A single pooling window covering the whole image, without padding.
    auto kernelShape = rewriter.getI64ArrayAttr({imageHeight, imageWidth});
    auto strides = rewriter.getI64ArrayAttr({imageHeight, imageWidth});
    auto dilations = rewriter.getI64ArrayAttr({1, 1});
    auto pads = rewriter.getI64ArrayAttr({0, 0, 0, 0});

    auto resultType = RankedTensorType::get(
      /*shape=*/ {inputShape[0], inputShape[1], 1, 1},
      /*elementType=*/inputTensorType.getElementType());

    auto averagePool = ONNXAveragePoolOp::create(rewriter,
                                                 reduceMean.getLoc(),
                                                 resultType,
                                                 inputTensor,
                                                 /*auto_pad=*/"NOTSET",
                                                 /*ceil_mode=*/0,
                                                 /*count_include_pad=*/1,
                                                 dilations,
                                                 /*kernel_shape=*/kernelShape,
                                                 /*pads=*/pads,
                                                 /*strides=*/strides);

    rewriter.replaceOp(reduceMean, averagePool.getResult());
    return success();
  }
 };
 /// Registers ReduceMeanConversionPattern in `patterns`, constructed with `ctx`.
 void populateReduceMeanConversionPattern(RewritePatternSet& patterns, MLIRContext* ctx) {
  patterns.add<ReduceMeanConversionPattern>(ctx);
 }
 } // namespace onnx_mlir
@@ -1,119 +0,0 @@
 #include <queue>
 #include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common.hpp"
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/AnnotateReplication.hpp"
 #include "src/Dialect/ONNX/ONNXOps.hpp"
 using namespace mlir;
 namespace onnx_mlir {
 /**
 * @brief Structure that describes the replication of a convolution operation,
 * along the image height axis.
 */
 struct ConvReplication {
  ONNXConvOp convOp;            // Convolution operation
  size_t input_w;               // Width of the input image
  size_t replicationFactor;     // Replication factor on the image height axis
  size_t coresNeededPerReplica; // Number of cores needed for each replica
  friend bool operator<(const ConvReplication& a, const ConvReplication& b) {
    return a.input_w / a.replicationFactor < b.input_w / b.replicationFactor;
  }
  ConvReplication(ONNXConvOp convOp, size_t input_w, size_t replicationFactor, size_t coresNeededPerReplica)
  : convOp(convOp),
    input_w(input_w),
    replicationFactor(replicationFactor),
    coresNeededPerReplica(coresNeededPerReplica) {}
 };
 /// Annotates every ONNXConvOp in the first block of `funcOp` with a
 /// replication factor (attribute `REPLICATION_ATTR_NAME`, i64).
 ///
 /// The cores required by one instance of every ONNXConvOp and ONNXGemmOp are
 /// summed into `minimumCores`. If the `coresCount` option is smaller than that
 /// sum an error is emitted on `funcOp`. The remaining cores are then handed
 /// out greedily: the convolution at the top of the priority queue (largest
 /// `input_w / replicationFactor`) gets one more replica while the check at
 /// the head of the loop passes; otherwise it is annotated and leaves the
 /// queue.
 ///
 /// @param funcOp Function whose first-block operations are analysed.
 /// @param rewriter Only used to build the integer attribute.
 /// @return `success()` when `coresCount == -1` (no annotation is done) or when
 ///         annotation completes; the emitted error otherwise.
 LogicalResult annotateReplication(mlir::func::FuncOp funcOp, mlir::IRRewriter& rewriter) {
  if (coresCount == -1) {
    // No need for annotation, implicitly set replication to 1
    return success();
  }
  std::priority_queue<struct ConvReplication> convOpsReplicationQueue;
  // Cores needed when every layer is instantiated exactly once.
  size_t minimumCores = 0;
  for (auto& op : funcOp.getFunctionBody().begin()->getOperations()) {
    if (auto convOp = dyn_cast<ONNXConvOp>(op)) {
      // Convolution layer
      Value X = convOp.getX(), W = convOp.getW();
      ShapedType xShape = mlir::cast<ShapedType>(X.getType());
      ShapedType wShape = mlir::cast<ShapedType>(W.getType());
      size_t input_w = getImageWidth(xShape);
      size_t krn_h = getKernelHeight(wShape);
      size_t krn_w = getKernelWidth(wShape);
      size_t inputTileCount = ceilIntegerDivide(getImageChannel(xShape), crossbarSize.getValue());
      size_t outputTileCount = ceilIntegerDivide(wShape.getDimSize(0), crossbarSize.getValue());
      auto neededXbars = krn_h * krn_w * inputTileCount * outputTileCount;
      auto neededCores = ceilIntegerDivide(neededXbars, crossbarCountInCore.getValue());
      minimumCores += neededCores;
      // Every convolution starts with a replication factor of 1.
      convOpsReplicationQueue.emplace(convOp, input_w, 1, neededCores);
    }
    else if (auto gemmOp = dyn_cast<ONNXGemmOp>(op)) {
      // Fully connected layer
      auto matrixTensorShape = cast<ShapedType>(gemmOp.getB().getType());
      auto inputSize = matrixTensorShape.getDimSize(0);
      auto outputSize = matrixTensorShape.getDimSize(1);
      if (gemmOp.getTransB())
        std::swap(inputSize, outputSize);
      const size_t inputTilesCount = ceilIntegerDivide(inputSize, crossbarSize.getValue());
      const size_t outputTilesCount = ceilIntegerDivide(outputSize, crossbarSize.getValue());
      // Each output tile is computed by `coresPerOutputTile` cores. The
      // entire input is given to each of these cores.
      const size_t coresPerOutputTile = ceilIntegerDivide(inputTilesCount, crossbarCountInCore.getValue());
      auto neededCores = coresPerOutputTile * outputTilesCount;
      // Gemm ops only contribute to the core budget; they are never replicated here.
      minimumCores += neededCores;
    }
  }
  if (static_cast<size_t>(coresCount) < minimumCores) {
    return funcOp->emitError("Not enough cores for this network: ")
        << minimumCores << " cores needed, but only " << static_cast<size_t>(coresCount) << " available.";
  }
  // Cores left over once every layer has a single instance.
  size_t availableCores = static_cast<size_t>(coresCount) - minimumCores;
  // Consume all the elements in the queue
  while (!convOpsReplicationQueue.empty()) {
    auto convOpReplication = convOpsReplicationQueue.top();
    convOpsReplicationQueue.pop();
    // Check if we can replicate this convolution (e.g. we have enough cores)
    // NOTE(review): the spare-core budget is compared against the cores of
    // *all* replicas (factor + 1), while only one replica's worth is
    // subtracted below -- confirm this conservative bound is intended. An
    // entry with coresNeededPerReplica == 0 would be re-queued indefinitely
    // while availableCores > 0.
    if (availableCores > convOpReplication.coresNeededPerReplica * (convOpReplication.replicationFactor + 1)) {
      // We can replicate this convolution: increment replicationFactor and put
      // back in queue
      availableCores -= convOpReplication.coresNeededPerReplica;
      convOpReplication.replicationFactor++;
      convOpsReplicationQueue.push(convOpReplication);
    }
    else {
      // Cannot replicate this convolution anymore, annotate the operation
      // with the replication factor
      convOpReplication.convOp->setAttr(REPLICATION_ATTR_NAME,
                                        rewriter.getI64IntegerAttr(convOpReplication.replicationFactor));
    }
  }
  return success();
 }
 } // namespace onnx_mlir
@@ -1,10 +0,0 @@
 #pragma once
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/PatternMatch.h"
 namespace onnx_mlir {
 mlir::LogicalResult annotateReplication(mlir::func::FuncOp funcOp, mlir::IRRewriter& rewriter);
 } // namespace onnx_mlir
@@ -1,348 +0,0 @@
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/Value.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cassert>
 #include <unordered_map>
 #include <utility>
 #include "SpatialReducer.hpp"
 #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
 #define GET_COMP(computeOpAndResNum) std::get<0>(computeOpAndResNum)
 #define GET_RES_NUM(computeOpAndResNum) std::get<1>(computeOpAndResNum)
 namespace onnx_mlir {
 llvm::SmallPtrSet<mlir::Operation*, 16> onnx_mlir::SpatialReducer::oldComputeOpsReplaced;
 /// Runs `processFun` on the yield operand selected by `computeOpAndResNum`.
 ///
 /// The insertion point of `rewriter` is moved right after the selected value
 /// before the callback is invoked. If the callback hands back the very same
 /// value, the original result index is returned unchanged; otherwise the new
 /// value is appended to the yield op and the index of that appended operand
 /// is returned.
 ResNum SpatialReducer::applyResultProcessing(ComputeAndResNum computeOpAndResNum,
                                              std::function<mlir::Value(const mlir::Value&)> processFun,
                                              mlir::ConversionPatternRewriter& rewriter) {
  assert(processFun);

  auto [compute, originalResNum] = computeOpAndResNum;
  auto terminator = mlir::cast<spatial::SpatYieldOp>(compute.getBody().front().getTerminator());

  mlir::Value current = terminator->getOperand(originalResNum);
  rewriter.setInsertionPointAfterValue(current);
  mlir::Value processed = processFun(current);

  // A callback may return its input untouched when it only needs the side
  // effects (e.g. broadcasting the value on a channel): nothing to append.
  if (processed == current)
    return originalResNum;

  const auto appendedIndex = terminator->getNumOperands();
  terminator->insertOperands(appendedIndex, processed);
  return appendedIndex;
 }
 /// Reduces the selected results of several compute ops into a single value.
 ///
 /// Phases:
 ///  1. If `preprocess` is set, it is applied to every selected result.
 ///  2. Entries that refer to the same compute op are combined inside that op
 ///     with `reduce`; the list is then rebuilt with one entry per compute op.
 ///  3. Entries are combined pairwise until one is left: the result of the
 ///     first op of each pair becomes a new block argument of the second op,
 ///     `reduce` is emitted before the second op's yield, and the reduced
 ///     value is appended to that yield. The operand/result rewiring of the
 ///     compute ops themselves is only recorded in `reducerChanges` and is
 ///     performed later by `finalizeReduceUpdates`.
 ///  4. If `postprocess` is set, it is applied to the final value.
 ///
 /// @param computeOpsAndResNum In/out: cleared and rebuilt during phase 2.
 ///        Must not be empty (element 0 is read unconditionally).
 /// @param reduce Binary combiner; creates IR at the current insertion point.
 /// @param preprocess Optional per-input transformation (may be null).
 /// @param postprocess Optional transformation of the final value (may be null).
 /// @return The compute op holding the final value and its yield operand index.
 OpAndResNum
 SpatialReducer::applyReducePattern(llvm::SmallVector<ComputeAndResNum>& computeOpsAndResNum,
                                    std::function<mlir::Value(const mlir::Value&, const mlir::Value&)> reduce,
                                    std::function<mlir::Value(const mlir::Value&)> preprocess,
                                    std::function<mlir::Value(const mlir::Value&)> postprocess) {
  if (preprocess)
    for (auto& computeOpAndResNum : computeOpsAndResNum)
      GET_RES_NUM(computeOpAndResNum) = applyResultProcessing(computeOpAndResNum, preprocess, rewriter);
  // It is possible that `computeOpsAndResNum` contains two entries for the same
  // computeOp. In this case, we need to apply the reduction within-compute.
  // Keep a map between a computeOp and the last Value for this reduction
  std::unordered_map<mlir::Operation*, mlir::Value> lastValueForCompute;
  for (auto& computeOpAndResNum : computeOpsAndResNum) {
    auto computeOp = GET_COMP(computeOpAndResNum);
    auto yieldOp = mlir::cast<spatial::SpatYieldOp>(computeOp.getBody().front().getTerminator());
    mlir::Value valueWithinCompute = yieldOp->getOperand(GET_RES_NUM(computeOpAndResNum));
    auto it = lastValueForCompute.find(computeOp.getOperation());
    if (it != lastValueForCompute.end()) {
      // If we have already seen this computeOp, apply the reduction
      // within-compute
      mlir::Value lastWithinComputeValue = it->second;
      assert(valueWithinCompute.getDefiningOp() && lastWithinComputeValue.getDefiningOp());
      // Insert after whichever of the two values is defined later in the block.
      if (valueWithinCompute.getDefiningOp()->isBeforeInBlock(lastWithinComputeValue.getDefiningOp()))
        rewriter.setInsertionPointAfterValue(lastWithinComputeValue);
      else
        rewriter.setInsertionPointAfterValue(valueWithinCompute);
      valueWithinCompute = reduce(lastWithinComputeValue, valueWithinCompute);
      // NOTE(review): this store is repeated unconditionally right after the
      // `if`, so it is redundant here.
      lastValueForCompute[computeOp.getOperation()] = valueWithinCompute;
    }
    lastValueForCompute[computeOp.getOperation()] = valueWithinCompute;
  }
  // Now, reconstruct from the map the computeOpsAndResNum list
  // NOTE(review): iteration order of std::unordered_map is unspecified, so the
  // order of the rebuilt list (and thus the pairing below) is not tied to the
  // input order.
  computeOpsAndResNum.clear();
  computeOpsAndResNum.reserve(lastValueForCompute.size());
  for (auto& entry : lastValueForCompute) {
    auto computeOp = mlir::cast<spatial::SpatWeightedCompute>(entry.first);
    auto valueWithinCompute = entry.second;
    // We check if `valueWithinCompute` is already used by the yieldOp, in that
    // case no need to add it
    auto yieldOp = mlir::cast<spatial::SpatYieldOp>(computeOp.getBody().front().getTerminator());
    bool yieldOpUseFound = false;
    for (auto& use : valueWithinCompute.getUses()) {
      if (use.getOwner() == yieldOp.getOperation()) {
        // If the value is already used by the yieldOp, we can just use it
        computeOpsAndResNum.push_back({computeOp, use.getOperandNumber()});
        yieldOpUseFound = true;
        break;
      }
    }
    if (yieldOpUseFound)
      continue;
    // If this result is not used within a yieldOp, then add it
    auto resultNum = yieldOp->getNumOperands();
    yieldOp->insertOperands(resultNum, valueWithinCompute);
    computeOpsAndResNum.push_back({computeOp, resultNum});
  }
  // Reads element 0: the list must contain at least one entry at this point.
  mlir::Location loc = GET_COMP(computeOpsAndResNum[0])->getLoc();
  // Recursive algorithm to reduce the inputs to a single one:
  // - Take two inputs at a time, and reduce them into a single one, updating
  // the computeOpsAndResNum list which becomes half the size.
  // - Repeat until there is only one input left.
  llvm::OwningArrayRef<ComputeAndResNum> computeOpsRef(computeOpsAndResNum);
  while (computeOpsRef.size() > 1) {
    llvm::SmallVector<ComputeAndResNum> nextComputeOps;
    nextComputeOps.reserve(computeOpsRef.size() / 2);
    for (size_t i = 0; i < computeOpsRef.size() - 1; i += 2) {
      auto [firstCompute, firstResultNum] = computeOpsRef[i];
      auto [secondCompute, secondResultNum] = computeOpsRef[i + 1];
      // Make `firstCompute` the op that appears earlier in the block, so its
      // result can be consumed by `secondCompute`.
      if (secondCompute->isBeforeInBlock(firstCompute)) {
        std::swap(firstCompute, secondCompute);
        std::swap(firstResultNum, secondResultNum);
      }
      // We do not immediately alter the computeOps results/operands, instead we
      // do it in a delayed manner, to avoid invalidating the references to the
      // computeOps (which must be replaced by a cloned ComputeOp when changing
      // the number of results)
      // See below `reducerChanges.push_back` and `finalizeReduceUpdates`
      auto yieldOpFirstCompute = mlir::cast<spatial::SpatYieldOp>(firstCompute.getBody().front().getTerminator());
      // Add a new operand to the block of the second computeOp
      mlir::Block& secondBlock = secondCompute.getBody().front();
      mlir::Value formerRes1 = secondBlock.addArgument(yieldOpFirstCompute->getOperand(firstResultNum).getType(), loc);
      // Element 0 of the operand-segment-sizes attribute is used as the number
      // of weight operands; the new input operand goes after the weights and
      // the existing block arguments.
      auto secondComputeWeightsNum =
        secondCompute->getAttrOfType<mlir::DenseI32ArrayAttr>(secondCompute.getOperandSegmentSizesAttrName())[0];
      auto secondComputeOperandNum = secondComputeWeightsNum + secondBlock.getNumArguments() - 1;
      // Take the "former-result" from the second computeOp
      spatial::SpatYieldOp secondYield = mlir::cast<spatial::SpatYieldOp>(secondBlock.getTerminator());
      mlir::Value formerRes2 = secondYield.getOperand(secondResultNum);
      // Apply reduction operation
      rewriter.setInsertionPoint(secondYield);
      mlir::Value reduced = reduce(formerRes2, formerRes1);
      // Unfortunately, it is not possible to update the result in place,
      // because we may have already referenced it by <computeOp, resultNum>
      // outside of this function, thus replacing it would invalidate the
      // reference. Therefore, we need to append a new result to the yieldOp,
      // and then at a later stage update the computeOp accordingly.
      // Add `reduced` to the second yieldOp
      auto secondYieldOperandNum = secondYield.getNumOperands();
      secondYield->insertOperands(secondYieldOperandNum, reduced);
      secondResultNum = secondYieldOperandNum;
      // We should also add an entry for updating the results of the last
      // operation (the one which never becomes a `firstCompute`): because it is
      // not tracked by reducerChanges as `fromOp`
      reducerChanges.push_back(
        {firstCompute.getOperation(), firstResultNum, secondCompute.getOperation(), secondComputeOperandNum});
      nextComputeOps.push_back(std::make_pair(secondCompute, secondResultNum));
    }
    // If we have an odd number of inputs, we need to add the last one to the
    // newInputs list.
    if (computeOpsRef.size() % 2 == 1)
      nextComputeOps.push_back(computeOpsRef.back());
    // Replace the inputOps list with the new one.
    computeOpsRef = llvm::OwningArrayRef<ComputeAndResNum>(std::move(nextComputeOps));
  }
  assert(computeOpsRef.size() == 1 && "Internal error: expected a single input at this point.");
  auto finalComputeAndResNum = computeOpsRef[0];
  // Force the update of the results of this computeOp, when finalizing
  computeOpNeedingResUpdate.push_back(GET_COMP(finalComputeAndResNum));
  if (postprocess)
    GET_RES_NUM(finalComputeAndResNum) = applyResultProcessing(finalComputeAndResNum, postprocess, rewriter);
  return std::make_pair(GET_COMP(finalComputeAndResNum).getOperation(), GET_RES_NUM(finalComputeAndResNum));
 }
 /// Applies all changes recorded by `applyReducePattern`.
 ///
 /// First every compute op that produced a forwarded value (and every op in
 /// `computeOpNeedingResUpdate`) gets its result list synchronised with its
 /// yield op via `updateResultsOfCompute`. Then, for each recorded change, the
 /// result `fromOpResNum` of the (possibly replaced) source compute op is
 /// appended as an input operand of the destination compute op.
 ///
 /// May only be called once; a second call trips the assertion below.
 void SpatialReducer::finalizeReduceUpdates() {
  assert(reducesFinalized == false && "Cannot finalize two times.");
  reducesFinalized = true;
  // First, add the results to the computeOps
  for (auto& reduceChange : reducerChanges)
    updateResultsOfCompute(reduceChange.fromOp);
  for (auto& c : computeOpNeedingResUpdate)
    updateResultsOfCompute(c.getOperation());
  // Then wire the new results into the consuming compute ops, in recording order.
  for (auto& reducerChange : this->reducerChanges) {
    auto fromOp = reducerChange.fromOp;
    auto toOp = reducerChange.toOp;
    auto fromOpResNum = reducerChange.fromOpResNum;
    auto toOpOperandNum = reducerChange.toOpOperandNum;
    auto fromComputeOp = opToReplacedCompute[fromOp];
    assert(fromComputeOp && "fromOp should have been mapped before!");
    // toComputeOp could be the existing pointer, or we have to remap it with
    // `opToReplacedCompute`
    auto toComputeOp = opToReplacedCompute[toOp];
    if (!toComputeOp)
      toComputeOp = mlir::cast<spatial::SpatWeightedCompute>(toOp);
    assert(toComputeOp != fromComputeOp && "Oops should have caught this earlier!");
    // The recorded operand index must equal the current operand count, i.e.
    // the new operand is appended at the end.
    assert(toComputeOp->getNumOperands() == toOpOperandNum
           && "toOpOperandNum should be the last operand of toComputeOp, are the "
              "operations in the right order?");
    // Add the new operand to `toComputeOp`
    auto fromResult = fromComputeOp.getResult(fromOpResNum);
    toComputeOp->insertOperands(toOpOperandNum, fromResult);
    incrementWeightedComputeInputsSegmentSize(toComputeOp, 1);
  }
 }
 /// Translates an (operation, result index) pair into the corresponding result
 /// value, following `opToReplacedCompute` when the operation has an entry in
 /// that map. Must only be called after the reduce updates were finalized.
 mlir::Value SpatialReducer::resolveValueFromOpAndResNum(OpAndResNum& opAndResNum) {
  assert(reducesFinalized && "Cannot create resolve values before finalizing the reduce updates.");

  // Start from the recorded operation and switch to its mapped compute op, if any.
  mlir::Operation* resolvedOp = opAndResNum.first;
  if (auto replacement = opToReplacedCompute.find(opAndResNum.first); replacement != opToReplacedCompute.end())
    resolvedOp = replacement->second;

  auto resolvedCompute = mlir::cast<spatial::SpatWeightedCompute>(resolvedOp);
  return resolvedCompute.getResult(opAndResNum.second);
 }
 /// Makes the result list of `computeOp` match the operands of its yield op.
 ///
 /// If the op was already handled, nothing happens. If the yield op has as
 /// many operands as the op has results, the op is mapped to itself in
 /// `opToReplacedCompute`. Otherwise a new `SpatWeightedCompute` with the
 /// yield's operand types as result types is created right before the old op,
 /// the old body is moved into it, uses of the old results are redirected,
 /// and the old op is erased.
 void SpatialReducer::updateResultsOfCompute(mlir::Operation* computeOp) {
  if (opToReplacedCompute.find(computeOp) != opToReplacedCompute.end()) {
    // If we have already replaced the fromOp, we do not need to do it again
    return;
  }
  auto oldComputeOp = mlir::cast<spatial::SpatWeightedCompute>(computeOp);
  auto oldComputeOpNum = oldComputeOp->getNumOperands();
  auto yieldOp = mlir::cast<spatial::SpatYieldOp>(oldComputeOp.getBody().front().getTerminator());
  if (yieldOp.getNumOperands() == oldComputeOp->getNumResults()) {
    // No result was added, just add itself to the map
    opToReplacedCompute[oldComputeOp.getOperation()] = oldComputeOp;
    return;
  }
  // Add the results by inspecting its YieldOp
  auto newResultTypes = yieldOp.getOperandTypes();
  // Create a new ComputeOp with the new result type, but same operands
  rewriter.setInsertionPoint(oldComputeOp);
  auto newComputeOp = spatial::SpatWeightedCompute::create(
    rewriter, oldComputeOp->getLoc(), newResultTypes, oldComputeOp.getWeights(), oldComputeOp.getInputs());
  // Move (not copy) the region from the old op into the new one.
  newComputeOp.getBody().takeBody(oldComputeOp.getBody());
  auto newComputeOpNum = newComputeOp->getNumOperands();
  assert(oldComputeOpNum == newComputeOpNum);
  // Since we replaced the old ComputeOp with a new one, we need to replace
  // all its results' uses
  // Only the pre-existing results are remapped; results beyond the old count
  // are new and have no uses yet.
  for (size_t i = 0; i < oldComputeOp.getNumResults(); i++) {
    mlir::Value oldResult = oldComputeOp.getResult(i);
    mlir::Value newResult = newComputeOp.getResult(i);
    // Replace the uses, except the uses of the compute ops which got deleted
    // previously
    rewriter.replaceAllUsesExcept(oldResult, newResult, oldComputeOpsReplaced);
  }
  // Finally, erase the old computeOp and update the map
  opToReplacedCompute[oldComputeOp.getOperation()] = newComputeOp;
  // NOTE(review): `oldComputeOpsReplaced` is a static set of raw pointers to
  // ops that get erased -- confirm the entries are never dereferenced later.
  oldComputeOpsReplaced.insert(oldComputeOp.getOperation());
  rewriter.setInsertionPoint(oldComputeOp);
  rewriter.eraseOp(oldComputeOp);
 }
 /// Resolves every (operation, result index) entry of `outputTiles` to its
 /// current result value and forwards the resolved tiles to the free function
 /// `onnx_mlir::createImgConcatOp`.
 ///
 /// `outputTiles` is indexed as [channelTile][x][y]; the width and height are
 /// taken from the first tile / first column, so `outputTiles` and
 /// `outputTiles[0]` must not be empty. Requires finalized reduce updates.
 mlir::Value
 SpatialReducer::createImgConcatOp(llvm::SmallVector<llvm::SmallVector<llvm::SmallVector<OpAndResNum>>>& outputTiles,
                                   mlir::Location& loc,
                                   mlir::Type outputType) {
  assert(reducesFinalized && "Cannot create ImgConcatOp before finalizing the reduce updates.");

  using ValueColumn = llvm::SmallVector<mlir::Value>;
  using ValuePlane = llvm::SmallVector<ValueColumn>;

  const auto channelTiles = outputTiles.size();
  const auto columns = outputTiles[0].size();
  const auto rows = outputTiles[0][0].size();

  // Pre-size the whole [channelTile][x][y] grid with null values, then fill it.
  llvm::SmallVector<ValuePlane> resolvedTiles(channelTiles, ValuePlane(columns, ValueColumn(rows)));
  for (size_t tile = 0; tile < channelTiles; tile++) {
    for (size_t col = 0; col < columns; col++) {
      for (size_t row = 0; row < rows; row++)
        resolvedTiles[tile][col][row] = resolveValueFromOpAndResNum(outputTiles[tile][col][row]);
    }
  }

  return ::onnx_mlir::createImgConcatOp(resolvedTiles, rewriter, loc, outputType);
 }
 /// Sum-reduces `computeOps` with `SpatVAddOp` and, when `mapOp` is not
 /// `MapOperations::None`, post-processes the reduced value: `biasTile` is
 /// added first (if it is non-null) and the map operation is applied on top.
 /// When `mapOp` is `None`, no post-processing happens and `biasTile` is unused.
 OpAndResNum SpatialReducer::applyAddMapReduction(llvm::SmallVector<ComputeAndResNum>& computeOps,
                                                  mlir::ConversionPatternRewriter& rewriter,
                                                  mlir::Value biasTile,
                                                  MapOperations mapOp) {
  // Pairwise combiner used by the reduction tree.
  auto addPair = [&](mlir::Value lhs, mlir::Value rhs) {
    return spatial::SpatVAddOp::create(rewriter, lhs.getLoc(), lhs.getType(), lhs, rhs);
  };

  // Stays empty (null) unless a map operation was requested.
  std::function<mlir::Value(const mlir::Value&)> finalStep = nullptr;
  if (mapOp != MapOperations::None) {
    finalStep = [&](const mlir::Value reduced) {
      mlir::Value mapInput = reduced;
      if (biasTile)
        mapInput = spatial::SpatVAddOp::create(rewriter, reduced.getLoc(), reduced.getType(), reduced, biasTile);
      return createMapOperation(rewriter, mapOp, mapInput);
    };
  }

  return this->applyReducePattern(computeOps, addPair, /* preprocess = */ nullptr, finalStep);
 }
 } // namespace onnx_mlir
@@ -1,88 +0,0 @@
 #pragma once
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Support/Casting.h"
 #include <functional>
 #include <unordered_map>
 #include <utility>
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common.hpp"
 #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
 namespace onnx_mlir {
 // Result number: identifies one result of an operation.
 using ResNum = unsigned int;
 // A weighted-compute op paired with the number of one of its results.
 using ComputeAndResNum = std::pair<spatial::SpatWeightedCompute, ResNum>;
 /**
  * @brief One pending change recorded by SpatialReducer.
  *
  * NOTE(review): presumably result number `fromOpResNum` of `fromOp` is to be
  * connected to operand number `toOpOperandNum` of `toOp` -- confirm against
  * SpatialReducer::finalizeReduceUpdates.
  *
  * All members have default initializers so a default-constructed change never
  * holds indeterminate pointers or indices; aggregate initialization with four
  * values keeps working.
  */
 struct SpatialReducerChange {
  mlir::Operation* fromOp = nullptr;
  unsigned int fromOpResNum = 0;
  mlir::Operation* toOp = nullptr;
  unsigned int toOpOperandNum = 0;
 };
 // A generic operation paired with the number of one of its results.
 using OpAndResNum = std::pair<mlir::Operation*, ResNum>;
 /**
  * @brief Helper that applies reduce patterns over the results of
  * weighted-compute ops and keeps track of the changes that must be applied
  * once the reduction is finalized.
  *
  * Pending changes are applied by finalizeReduceUpdates(); the destructor calls
  * it if it has not been called yet.
  */
 class SpatialReducer {
 public:
  SpatialReducer(mlir::ConversionPatternRewriter& rewriter)
  : rewriter(rewriter) {}
  // Non-copyable: the destructor may run finalizeReduceUpdates(), so a copy
  // would carry the same pending changes and finalize them a second time.
  SpatialReducer(const SpatialReducer&) = delete;
  SpatialReducer& operator=(const SpatialReducer&) = delete;
  /**
   * @brief Reduce the given compute results with a caller-supplied function.
   *
   * @param computeOpsAndResNum Compute ops and the result number to reduce.
   * @param reduce Combines two values into one.
   * @param preprocess Optional step applied to values; may be empty/nullptr.
   * @param postprocess Optional step applied to values; may be empty/nullptr.
   * @return The operation and result number holding the reduced value.
   */
  OpAndResNum applyReducePattern(llvm::SmallVector<ComputeAndResNum>& computeOpsAndResNum,
                                 std::function<mlir::Value(const mlir::Value&, const mlir::Value&)> reduce,
                                 std::function<mlir::Value(const mlir::Value&)> preprocess,
                                 std::function<mlir::Value(const mlir::Value&)> postprocess);
  /**
   * @brief Add-reduce the given compute results, optionally followed by a
   * bias add and a map operation.
   */
  OpAndResNum applyAddMapReduction(llvm::SmallVector<ComputeAndResNum>& computeOps,
                                   mlir::ConversionPatternRewriter& rewriter,
                                   mlir::Value biasTile,
                                   MapOperations mapOp);
  /// Applies the pending reduce updates.
  void finalizeReduceUpdates();
  /// Finalizes the pending reduce updates if that has not happened yet.
  ~SpatialReducer() {
    if (!reducesFinalized)
      finalizeReduceUpdates();
  }
  /**
   * @brief Resolve a [channelTile][x][y] grid of tiles to values and build an
   * image concat from it. Requires the reduce updates to be finalized.
   */
  mlir::Value createImgConcatOp(llvm::SmallVector<llvm::SmallVector<llvm::SmallVector<OpAndResNum>>>& outputTiles,
                                mlir::Location& loc,
                                mlir::Type outputType);
  /// Returns the value designated by an (operation, result number) pair.
  mlir::Value resolveValueFromOpAndResNum(OpAndResNum& opAndResNum);
 private:
  /**
   * @brief Apply `processFun` to one result of a compute op.
   *
   * @return The result number to use afterwards; it must not be discarded
   * because the result number of the compute op gets updated.
   */
  [[nodiscard("computeOp result number gets updated")]] ResNum
  applyResultProcessing(ComputeAndResNum computeOpAndResNum,
                        std::function<mlir::Value(const mlir::Value&)> processFun,
                        mlir::ConversionPatternRewriter& rewriter);
  /**
   * @brief Update the results of a ComputeOp.
   *
   * This function updates the results of a ComputeOp by taking a look at the
   * operands of its yieldOp.
   * If the ComputeOp was replaced, it updates `opToReplacedCompute` with the
   * replaced ComputeOp.
   *
   * @param computeOp The ComputeOp to update the results of.
   */
  void updateResultsOfCompute(mlir::Operation* computeOp);
  mlir::ConversionPatternRewriter& rewriter;
  // Set once the pending updates have been applied; checked by the destructor.
  bool reducesFinalized = false;
  // List of changes to be applied after the reduction is finalized
  llvm::SmallVector<SpatialReducerChange, 4> reducerChanges;
  // List of computeOps that need to be replaced with new results
  llvm::SmallVector<spatial::SpatWeightedCompute> computeOpNeedingResUpdate;
  // Maps an original operation to the compute op that replaced it.
  std::unordered_map<mlir::Operation*, spatial::SpatWeightedCompute> opToReplacedCompute;
  // Static, hence shared by every SpatialReducer instance.
  static llvm::SmallPtrSet<mlir::Operation*, 16> oldComputeOpsReplaced;
 };
 } // namespace onnx_mlir
@@ -1,53 +0,0 @@
 #include <cassert>
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/WeightSubdivider.hpp"
 namespace onnx_mlir {
 /**
  * @brief Take ownership of the weights, indexed as [inputTile][outputTile].
  *
  * Input tiles whose inner map is empty are dropped up front. popGroup reads
  * the first entry of the first inner map without checking for emptiness, so
  * such an entry would otherwise lead to dereferencing an end iterator while
  * isEmpty() still reported that weights are available.
  *
  * @param weights Weights to subdivide; moved into the object.
  */
 WeightSubdivider::WeightSubdivider(std::map<long, std::map<long, llvm::SmallVector<mlir::Value>>> weights)
 : weights(std::move(weights)) {
  // The parameter shadows the member and has been moved from: use this->weights.
  for (auto it = this->weights.begin(); it != this->weights.end();) {
    if (it->second.empty())
      it = this->weights.erase(it);
    else
      ++it;
  }
 }
 /// Returns true when the weight map holds no input-tile entry any more.
 bool WeightSubdivider::isEmpty() const {
  const bool noEntriesLeft = weights.empty();
  return noEntriesLeft;
 }
 /**
  * @brief Extract up to `amount` weights from the first stored group.
  *
  * The first group is the first output-tile entry of the first input-tile
  * entry. If the group holds more than `amount` weights, only the leading
  * `amount` are removed and the rest stays for a later call; otherwise the
  * whole group is removed, together with its input-tile entry once that has no
  * output tiles left.
  *
  * Precondition: `weights` is non-empty (asserted) and its first inner map is
  * non-empty (not checked).
  *
  * @param amount Maximum number of weights to extract.
  * @return The extracted weights tagged with their input/output tile and with
  * the value `crossbarsUsed` had before this call.
  */
 TaggedWeights WeightSubdivider::popGroup(size_t amount) {
  assert(!weights.empty() && "No weights to extract.");
  auto inputIt = weights.begin();
  auto outputIt = inputIt->second.begin();
  llvm::SmallVector<mlir::Value>& pending = outputIt->second;
  const long inputTile = inputIt->first;
  const long outputTile = outputIt->first;
  // Crossbar index at which this group's weights start.
  const size_t firstCrossbar = crossbarsUsed;
  const size_t taken = std::min(amount, pending.size());
  crossbarsUsed += taken;
  llvm::SmallVector<mlir::Value> popped(pending.begin(), pending.begin() + taken);
  if (taken == pending.size()) {
    // Group exhausted: drop it; `pending` must not be used past this point.
    inputIt->second.erase(outputIt);
    if (inputIt->second.empty())
      weights.erase(inputIt);
  }
  else {
    pending.erase(pending.begin(), pending.begin() + taken);
  }
  return {inputTile, outputTile, firstCrossbar, popped};
 }
 /**
  * @brief Extract groups of weights until `n` weights have been collected or
  * no weights are left.
  *
  * The crossbar counter is reset at the start of every call, so the
  * startingCrossbarIndex of the returned groups counts from zero.
  *
  * @param n Maximum total number of weights to extract.
  * @return The extracted groups, in extraction order.
  */
 llvm::SmallVector<TaggedWeights> WeightSubdivider::popGroups(size_t n) {
  crossbarsUsed = 0;
  llvm::SmallVector<TaggedWeights> groups;
  for (size_t budget = n; budget > 0 && !weights.empty();) {
    TaggedWeights next = popGroup(budget);
    // Read the size before the group is moved into the result list.
    budget -= next.weights.size();
    groups.push_back(std::move(next));
  }
  return groups;
 }
 } // namespace onnx_mlir
@@ -1,46 +0,0 @@
 #pragma once
 #include "mlir/IR/Value.h"
 #include "llvm/ADT/SmallVector.h"
 #include <cstddef>
 #include <map>
 namespace onnx_mlir {
 /**
  * @brief A helper struct to store a group of weights.
  *
  * The scalar members carry default initializers so a default-constructed
  * instance never holds indeterminate values; aggregate initialization with
  * four values keeps working.
  */
 struct TaggedWeights {
  // Key of the group in the outer (input tile) map.
  long inputTile = 0;
  // Key of the group in the inner (output tile) map.
  long outputTile = 0;
  // Number of crossbars already handed out before this group's weights.
  size_t startingCrossbarIndex = 0;
  // The weights extracted for this group.
  llvm::SmallVector<mlir::Value> weights;
 };
 /**
  * @brief Hands out stored weights in groups of bounded total size.
  *
  * The weights live in a two-level map: the outer key is the input tile, the
  * inner key is the output tile, and each inner entry is a SmallVector with
  * the weights for the filter. Groups are extracted from that map until a
  * requested number of elements has been reached, namely as many as are needed
  * to fill a compute unit.
  */
 class WeightSubdivider {
 public:
  // Takes the [inputTile][outputTile] -> weights map to subdivide.
  WeightSubdivider(std::map<long, std::map<long, llvm::SmallVector<mlir::Value>>> weights);
  // True when no weights are left to extract.
  bool isEmpty() const;
  // Extracts groups holding at most `n` weights in total.
  llvm::SmallVector<TaggedWeights> popGroups(size_t n);
 private:
  // Extracts at most `amount` weights from the first stored group.
  TaggedWeights popGroup(size_t amount);
  std::map<long, std::map<long, llvm::SmallVector<mlir::Value>>> weights;
  // Running count of weights handed out; reported as starting crossbar index.
  size_t crossbarsUsed = 0;
 };
 } // namespace onnx_mlir