add PIM accelerator

2026-02-24 15:09:18 +01:00
parent b24a0df8d7
commit a6e928bdd7
67 changed files with 9109 additions and 1 deletions
@@ -0,0 +1,119 @@
+#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
+#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
+#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/AnnotateReplication.hpp"
+#include "src/Dialect/ONNX/ONNXOps.hpp"
+
+#include <queue>
+
+using namespace mlir;
+
+namespace onnx_mlir {
+
+/**
+ * @brief Bookkeeping record for one convolution while replication factors are
+ * being handed out.
+ *
+ * Ordering: `a < b` holds when the integer quotient
+ * `a.input_w / a.replicationFactor` is smaller than the same quotient for `b`.
+ * Used as the element type of a `std::priority_queue` (a max-heap), the entry
+ * with the largest quotient is therefore served first.
+ *
+ * NOTE(review): the original description speaks of replication along the image
+ * height axis, while the ordering is computed from the image width — confirm
+ * which axis is intended.
+ */
+struct ConvReplication {
+  ONNXConvOp convOp;            // The convolution this record describes
+  size_t input_w;               // Width of the convolution's input image
+  size_t replicationFactor;     // Number of replicas currently assigned
+  size_t coresNeededPerReplica; // Cores consumed by every single replica
+
+  ConvReplication(ONNXConvOp convOp, size_t input_w, size_t replicationFactor, size_t coresNeededPerReplica)
+  : convOp(convOp),
+    input_w(input_w),
+    replicationFactor(replicationFactor),
+    coresNeededPerReplica(coresNeededPerReplica) {}
+
+  friend bool operator<(const ConvReplication& a, const ConvReplication& b) {
+    // Width each replica is left with (integer division).
+    const size_t lhsShare = a.input_w / a.replicationFactor;
+    const size_t rhsShare = b.input_w / b.replicationFactor;
+    return lhsShare < rhsShare;
+  }
+};
+
+/**
+ * @brief Distribute the available cores among the convolutions of `funcOp` and
+ * record the outcome as the `REPLICATION_ATTR_NAME` attribute on each
+ * ONNXConvOp.
+ *
+ * Steps:
+ *  1. If `coresCount` is -1, nothing is annotated and success is returned.
+ *  2. Every ONNXConvOp / ONNXGemmOp in the first block of the function body is
+ *     visited to compute the minimum number of cores the network needs (one
+ *     replica per convolution). Only convolutions are candidates for
+ *     replication; Gemm ops just contribute to the minimum.
+ *  3. If `coresCount` is smaller than that minimum an error is emitted on
+ *     `funcOp` and failure is returned.
+ *  4. The spare cores are handed out greedily: the convolution with the
+ *     largest `input_w / replicationFactor` receives one more replica at a
+ *     time until the check below rejects it, at which point its final factor
+ *     is written as an I64 attribute.
+ *
+ * @param funcOp   Function whose first block is scanned.
+ * @param rewriter Only used to build the integer attribute.
+ * @return success() when annotation is done or not needed, failure() when
+ *         there are not enough cores.
+ */
+LogicalResult annotateReplication(mlir::func::FuncOp funcOp, mlir::IRRewriter& rewriter) {
+
+  if (coresCount == -1) {
+    // No need for annotation, implicitly set replication to 1
+    return success();
+  }
+
+  // A function without a body (a declaration) has no block to scan;
+  // dereferencing the first block of an empty region is undefined behaviour.
+  if (funcOp.getFunctionBody().empty()) {
+    return success();
+  }
+
+  std::priority_queue<struct ConvReplication> convOpsReplicationQueue;
+
+  // Cores required when every layer is instantiated exactly once
+  size_t minimumCores = 0;
+
+  for (auto& op : funcOp.getFunctionBody().front().getOperations()) {
+    if (auto convOp = dyn_cast<ONNXConvOp>(op)) {
+      // Convolution layer
+
+      Value X = convOp.getX(), W = convOp.getW();
+      ShapedType xShape = mlir::cast<ShapedType>(X.getType());
+      ShapedType wShape = mlir::cast<ShapedType>(W.getType());
+
+      size_t input_w = GET_IMAGE_WIDTH(xShape);
+      size_t krn_h = GET_KERNEL_HEIGHT(wShape);
+      size_t krn_w = GET_KERNEL_WIDTH(wShape);
+
+      size_t inputTileCount = ceilIntegerDivide(GET_IMAGE_CHANNEL(xShape), crossbarSize.getValue());
+      size_t outputTileCount = ceilIntegerDivide(wShape.getDimSize(0), crossbarSize.getValue());
+
+      // One crossbar per (kernel position, input tile, output tile) triple
+      auto neededXbars = krn_h * krn_w * inputTileCount * outputTileCount;
+      auto neededCores = ceilIntegerDivide(neededXbars, crossbarCountInCore.getValue());
+
+      minimumCores += neededCores;
+
+      // Every convolution starts with a single replica
+      convOpsReplicationQueue.emplace(convOp, input_w, 1, neededCores);
+    }
+    else if (auto gemmOp = dyn_cast<ONNXGemmOp>(op)) {
+      // Fully connected layer
+      auto matrixTensorShape = cast<ShapedType>(gemmOp.getB().getType());
+      auto inputSize = matrixTensorShape.getDimSize(0);
+      auto outputSize = matrixTensorShape.getDimSize(1);
+      if (gemmOp.getTransB())
+        std::swap(inputSize, outputSize);
+
+      const size_t inputTilesCount = ceilIntegerDivide(inputSize, crossbarSize.getValue());
+      const size_t outputTilesCount = ceilIntegerDivide(outputSize, crossbarSize.getValue());
+
+      // Each output tile is computed by `coresPerOutputTile` cores. The
+      // entire input is given to each of these cores.
+      const size_t coresPerOutputTile = ceilIntegerDivide(inputTilesCount, crossbarCountInCore.getValue());
+
+      auto neededCores = coresPerOutputTile * outputTilesCount;
+
+      minimumCores += neededCores;
+    }
+  }
+
+  if (static_cast<size_t>(coresCount) < minimumCores) {
+    return funcOp->emitError("Not enough cores for this network: ")
+        << minimumCores << " cores needed, but only " << static_cast<size_t>(coresCount) << " available.";
+  }
+
+  // Cores left over once every layer has its single mandatory replica
+  size_t availableCores = static_cast<size_t>(coresCount) - minimumCores;
+
+  // Consume all the elements in the queue
+  while (!convOpsReplicationQueue.empty()) {
+    auto convOpReplication = convOpsReplicationQueue.top();
+    convOpsReplicationQueue.pop();
+
+    // Check if we can replicate this convolution (e.g. we have enough cores)
+    // NOTE(review): `availableCores` already excludes the cores of the
+    // existing replicas, so this test is stricter than "one more replica
+    // fits" (`availableCores >= coresNeededPerReplica`) — confirm whether the
+    // extra headroom is intentional.
+    if (availableCores > convOpReplication.coresNeededPerReplica * (convOpReplication.replicationFactor + 1)) {
+      // We can replicate this convolution: increment replicationFactor and put
+      // back in queue
+      availableCores -= convOpReplication.coresNeededPerReplica;
+      convOpReplication.replicationFactor++;
+
+      convOpsReplicationQueue.push(convOpReplication);
+    }
+    else {
+      // Cannot replicate this convolution anymore, annotate the operation
+      // with the replication factor
+      convOpReplication.convOp->setAttr(REPLICATION_ATTR_NAME,
+                                        rewriter.getI64IntegerAttr(convOpReplication.replicationFactor));
+    }
+  }
+
+  return success();
+}
+
+} // namespace onnx_mlir
@@ -0,0 +1,11 @@
+#pragma once
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/PatternMatch.h"
+
+namespace onnx_mlir {
+
+/// Annotates the ONNX convolutions found in `funcOp` with a replication
+/// factor; `rewriter` is used to build the attribute. Returns failure when the
+/// configured number of cores is too small for the network.
+mlir::LogicalResult annotateReplication(mlir::func::FuncOp funcOp, mlir::IRRewriter &rewriter);
+
+} // namespace onnx_mlir
@@ -0,0 +1,382 @@
+
+#include "SpatialReducer.hpp"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Value.h"
+#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <unordered_map>
+#include <utility>
+
+// Accessors for a (computeOp, resultNumber) pair/tuple: element 0 is the
+// compute op, element 1 the result number. Both expand to lvalues when given
+// an lvalue, so they can also appear on the left of an assignment.
+#define GET_COMP(entry) std::get<0>(entry)
+#define GET_RES_NUM(entry) std::get<1>(entry)
+
+namespace onnx_mlir {
+
+// Storage for the class-wide (static) set of compute ops that
+// updateResultsOfCompute has replaced and erased.
+llvm::SmallPtrSet<Operation *, 16> SpatialReducer::oldComputeOpsReplaced;
+
+// Runs `processFun` on one yielded value of a compute op and returns the index
+// of the yield operand that holds the processed value.
+//
+// The insertion point of `rewriter` is placed right after the value being
+// processed before `processFun` is invoked. When `processFun` hands back a
+// different value, that value is appended to the yield op and its (new, last)
+// operand index is returned; otherwise the original index is returned.
+ResNum SpatialReducer::applyResultProcessing(
+    ComputeAndResNum computeOpAndResNum,
+    std::function<Value(const Value &)> processFun,
+    ConversionPatternRewriter &rewriter) {
+  assert(processFun);
+
+  auto [compute, originalResNum] = computeOpAndResNum;
+
+  auto yield =
+      cast<spatial::SpatYieldOp>(compute.getBody().front().getTerminator());
+  Value original = yield->getOperand(originalResNum);
+
+  rewriter.setInsertionPointAfterValue(original);
+  Value processed = processFun(original);
+
+  if (processed != original) {
+    // A genuinely new value: expose it as an additional yield operand.
+    yield->insertOperands(yield->getNumOperands(), processed);
+    return yield.getNumOperands() - 1;
+  }
+
+  // The callback returned its input unchanged (it may still have emitted
+  // other operations, e.g. a broadcast over a channel): keep the old index.
+  return originalResNum;
+}
+
+// Reduces a list of (computeOp, resultNum) entries to a single entry.
+//
+// Phases:
+//  1. Optional `preprocess` on every entry (may append a yield operand).
+//  2. Entries that belong to the same compute op are combined inside that op
+//     with `reduce`, so that each compute op contributes one value.
+//  3. Pairwise reduction across compute ops: the earlier op's value becomes a
+//     new block argument of the later op, where `reduce` is applied and the
+//     outcome appended to the later op's yield. The matching operand/result
+//     updates on the compute ops themselves are only recorded (in
+//     `reducerChanges` / `computeOpNeedingResUpdate`) and applied later by
+//     `finalizeReduceUpdates`.
+//  4. Optional `postprocess` on the final value.
+//
+// `computeOpsAndResNum` must not be empty; it is rewritten in place during
+// phase 2. Returns the operation and yield-operand index of the final value.
+OpAndResNum SpatialReducer::applyReducePattern(
+    SmallVector<ComputeAndResNum> &computeOpsAndResNum,
+    std::function<Value(const Value &, const Value &)> reduce,
+    std::function<Value(const Value &)> preprocess,
+    std::function<Value(const Value &)> postprocess) {
+
+  // The first entry is dereferenced unconditionally further down.
+  assert(!computeOpsAndResNum.empty() &&
+         "applyReducePattern needs at least one compute result to reduce.");
+
+  if (preprocess) {
+    for (auto &computeOpAndResNum : computeOpsAndResNum) {
+      GET_RES_NUM(computeOpAndResNum) =
+          applyResultProcessing(computeOpAndResNum, preprocess, rewriter);
+    }
+  }
+
+  // It is possible that `computeOpsAndResNum` contains two entries for the same
+  // computeOp. In this case, we need to apply the reduction within-compute
+
+  // Keep a map between a computeOp and the last Value for this reduction
+  std::unordered_map<Operation *, Value> lastValueForCompute;
+  for (auto &computeOpAndResNum : computeOpsAndResNum) {
+    auto computeOp = GET_COMP(computeOpAndResNum);
+    auto yieldOp =
+        cast<spatial::SpatYieldOp>(computeOp.getBody().front().getTerminator());
+    Value valueWithinCompute =
+        yieldOp->getOperand(GET_RES_NUM(computeOpAndResNum));
+
+    auto it = lastValueForCompute.find(computeOp.getOperation());
+
+    if (it != lastValueForCompute.end()) {
+      // If we have already seen this computeOp, apply the reduction
+      // within-compute
+      Value lastWithinComputeValue = it->second;
+
+      assert(valueWithinCompute.getDefiningOp() &&
+             lastWithinComputeValue.getDefiningOp());
+
+      // Insert after whichever of the two values is defined later, so that
+      // both dominate the reduction.
+      if (valueWithinCompute.getDefiningOp()->isBeforeInBlock(
+              lastWithinComputeValue.getDefiningOp())) {
+        rewriter.setInsertionPointAfterValue(lastWithinComputeValue);
+      } else {
+        rewriter.setInsertionPointAfterValue(valueWithinCompute);
+      }
+      valueWithinCompute = reduce(lastWithinComputeValue, valueWithinCompute);
+    }
+
+    lastValueForCompute[computeOp.getOperation()] = valueWithinCompute;
+  }
+
+  // Now, reconstruct from the map the computeOpsAndResNum list
+  // NOTE(review): the map is keyed by pointer, so the order of the rebuilt
+  // list follows the unordered_map iteration order.
+  computeOpsAndResNum.clear();
+  computeOpsAndResNum.reserve(lastValueForCompute.size());
+  for (auto &entry : lastValueForCompute) {
+    auto computeOp = cast<spatial::SpatWeightedCompute>(entry.first);
+    auto valueWithinCompute = entry.second;
+
+    // We check if `valueWithinCompute` is already used by the yieldOp, in that
+    // case no need to add it
+    auto yieldOp =
+        cast<spatial::SpatYieldOp>(computeOp.getBody().front().getTerminator());
+    bool yieldOpUseFound = false;
+    for (auto &use : valueWithinCompute.getUses()) {
+      if (use.getOwner() == yieldOp.getOperation()) {
+        // If the value is already used by the yieldOp, we can just use it
+        computeOpsAndResNum.push_back({computeOp, use.getOperandNumber()});
+        yieldOpUseFound = true;
+        break;
+      }
+    }
+    if (yieldOpUseFound) {
+      continue;
+    }
+
+    // If this result is not used within a yieldOp, then add it
+    auto resultNum = yieldOp->getNumOperands();
+    yieldOp->insertOperands(resultNum, valueWithinCompute);
+
+    computeOpsAndResNum.push_back({computeOp, resultNum});
+  }
+
+  Location loc = GET_COMP(computeOpsAndResNum[0])->getLoc();
+
+  // Iterative pairwise reduction of the inputs to a single one:
+  // - Take two inputs at a time, and reduce them into a single one, updating
+  // the computeOpsAndResNum list which becomes half the size.
+  // - Repeat until there is only one input left.
+  llvm::OwningArrayRef<ComputeAndResNum> computeOpsRef(computeOpsAndResNum);
+  while (computeOpsRef.size() > 1) {
+    SmallVector<ComputeAndResNum> nextComputeOps;
+    nextComputeOps.reserve(computeOpsRef.size() / 2);
+    for (size_t i = 0; i < computeOpsRef.size() - 1; i += 2) {
+      auto [firstCompute, firstResultNum] = computeOpsRef[i];
+      auto [secondCompute, secondResultNum] = computeOpsRef[i + 1];
+
+      // The value always flows from the earlier op into the later one
+      if (secondCompute->isBeforeInBlock(firstCompute)) {
+        std::swap(firstCompute, secondCompute);
+        std::swap(firstResultNum, secondResultNum);
+      }
+
+      // We do not immediately alter the computeOps results/operands, instead we
+      // do it in a delayed manner, to avoid invalidating the references to the
+      // computeOps (which must be replaced by a cloned ComputeOp when changing
+      // the number of results)
+      // See below `reducerChanges.push_back` and `finalizeReduceUpdates`
+
+      auto yieldOpFirstCompute = cast<spatial::SpatYieldOp>(
+          firstCompute.getBody().front().getTerminator());
+
+      // Add a new operand to the block of the second computeOp
+      Block &secondBlock = secondCompute.getBody().front();
+      Value formerRes1 = secondBlock.addArgument(
+          yieldOpFirstCompute->getOperand(firstResultNum).getType(), loc);
+
+      // Operand index the new input will occupy on the second computeOp:
+      // weights come first, then one operand per block argument.
+      auto secondComputeWeightsNum =
+          secondCompute->getAttrOfType<DenseI32ArrayAttr>(
+              secondCompute.getOperandSegmentSizesAttrName())[0];
+      auto secondComputeOperandNum =
+          secondComputeWeightsNum + secondBlock.getNumArguments() - 1;
+
+      // Take the "former-result" from the second computeOp
+      spatial::SpatYieldOp secondYield =
+          cast<spatial::SpatYieldOp>(secondBlock.getTerminator());
+      Value formerRes2 = secondYield.getOperand(secondResultNum);
+
+      // Apply reduction operation
+      rewriter.setInsertionPoint(secondYield);
+      Value reduced = reduce(formerRes2, formerRes1);
+
+      // Unfortunately, it is not possible to update the result in place,
+      // because we may have already referenced it by <computeOp, resultNum>
+      // outside of this function, thus replacing it would invalidate the
+      // reference. Therefore, we need to append a new result to the yieldOp,
+      // and then at a later stage update the computeOp accordingly.
+
+      // Add `reduced` to the second yieldOp
+      auto secondYieldOperandNum = secondYield.getNumOperands();
+      secondYield->insertOperands(secondYieldOperandNum, reduced);
+      secondResultNum = secondYieldOperandNum;
+
+      // We should also add an entry for updating the results of the last
+      // operation (the one which never becomes a `firstCompute`): because it is
+      // not tracked by reducerChanges as `fromOp`
+      reducerChanges.push_back({firstCompute.getOperation(), firstResultNum,
+          secondCompute.getOperation(), secondComputeOperandNum});
+      nextComputeOps.push_back(std::make_pair(secondCompute, secondResultNum));
+    }
+
+    // If we have an odd number of inputs, we need to add the last one to the
+    // newInputs list.
+    if (computeOpsRef.size() % 2 == 1) {
+      nextComputeOps.push_back(computeOpsRef.back());
+    }
+
+    // Replace the inputOps list with the new one.
+    computeOpsRef =
+        llvm::OwningArrayRef<ComputeAndResNum>(std::move(nextComputeOps));
+  }
+
+  assert(computeOpsRef.size() == 1 &&
+         "Internal error: expected a single input at this point.");
+
+  auto finalComputeAndResNum = computeOpsRef[0];
+
+  // Force the update of the results of this computeOp, when finalizing
+  computeOpNeedingResUpdate.push_back(GET_COMP(finalComputeAndResNum));
+
+  if (postprocess) {
+    GET_RES_NUM(finalComputeAndResNum) =
+        applyResultProcessing(finalComputeAndResNum, postprocess, rewriter);
+  }
+
+  return std::make_pair(GET_COMP(finalComputeAndResNum).getOperation(),
+      GET_RES_NUM(finalComputeAndResNum));
+}
+
+// Applies every change that applyReducePattern deferred.
+//
+// First the compute ops whose yield gained operands are rebuilt with matching
+// results (updateResultsOfCompute); then, for each recorded change, the
+// producer's result is appended as a new input operand of the consumer.
+// May be called only once per reducer.
+void SpatialReducer::finalizeReduceUpdates() {
+  assert(reducesFinalized == false && "Cannot finalize two times.");
+
+  reducesFinalized = true;
+
+  // First, add the results to the computeOps
+  for (auto &reduceChange : reducerChanges) {
+    updateResultsOfCompute(reduceChange.fromOp);
+  }
+
+  for (auto &c : computeOpNeedingResUpdate) {
+    updateResultsOfCompute(c.getOperation());
+  }
+
+  for (auto &reducerChange : this->reducerChanges) {
+    auto fromOp = reducerChange.fromOp;
+    auto toOp = reducerChange.toOp;
+    auto fromOpResNum = reducerChange.fromOpResNum;
+    auto toOpOperandNum = reducerChange.toOpOperandNum;
+
+    // Look the ops up with `find`: `operator[]` would silently insert a null
+    // entry on a miss, and resolveValueFromOpAndResNum would later try to
+    // cast that null entry.
+    auto fromIt = opToReplacedCompute.find(fromOp);
+    assert(fromIt != opToReplacedCompute.end() && fromIt->second &&
+           "fromOp should have been mapped before!");
+    auto fromComputeOp = fromIt->second;
+
+    // toComputeOp could be the existing pointer, or we have to remap it with
+    // `opToReplacedCompute`
+    auto toIt = opToReplacedCompute.find(toOp);
+    auto toComputeOp = (toIt != opToReplacedCompute.end() && toIt->second)
+                           ? toIt->second
+                           : cast<spatial::SpatWeightedCompute>(toOp);
+
+    assert(toComputeOp != fromComputeOp &&
+           "Oops should have caught this earlier!");
+
+    assert(toComputeOp->getNumOperands() == toOpOperandNum &&
+           "toOpOperandNum should be the last operand of toComputeOp, are the "
+           "operations in the right order?");
+
+    // Add the new operand to `toComputeOp`
+    auto fromResult = fromComputeOp.getResult(fromOpResNum);
+    toComputeOp->insertOperands(toOpOperandNum, fromResult);
+    incrementWeightedComputeInputsSegmentSize(toComputeOp, 1);
+  }
+}
+
+// Turns an (operation, result number) reference into the actual result Value.
+// When the operation has an entry in `opToReplacedCompute`, the mapped compute
+// op is used instead of the recorded pointer. Valid only after
+// finalizeReduceUpdates has run.
+Value SpatialReducer::resolveValueFromOpAndResNum(OpAndResNum &opAndResNum) {
+  assert(reducesFinalized &&
+         "Cannot create resolve values before finalizing the reduce updates.");
+
+  Operation *resolvedOp = opAndResNum.first;
+
+  const auto replaced = opToReplacedCompute.find(opAndResNum.first);
+  if (replaced != opToReplacedCompute.end())
+    resolvedOp = replaced->second;
+
+  return cast<spatial::SpatWeightedCompute>(resolvedOp)
+      .getResult(opAndResNum.second);
+}
+
+// Makes the results of a compute op agree with the operands of its yield op.
+//
+// If the op was already handled nothing happens. If the yield has as many
+// operands as the op has results, the op is simply recorded as mapping to
+// itself. Otherwise a new compute op with the yield's operand types as result
+// types is created in front of the old one, takes over its body, inherits the
+// uses of the old results, and the old op is erased.
+void SpatialReducer::updateResultsOfCompute(Operation *computeOp) {
+  // Each op is processed at most once
+  if (opToReplacedCompute.count(computeOp) != 0)
+    return;
+
+  auto staleCompute = cast<spatial::SpatWeightedCompute>(computeOp);
+  auto operandCountBefore = staleCompute->getNumOperands();
+
+  auto terminator = cast<spatial::SpatYieldOp>(
+      staleCompute.getBody().front().getTerminator());
+
+  if (terminator.getNumOperands() == staleCompute->getNumResults()) {
+    // Results already match the yield: the op stands in for itself
+    opToReplacedCompute[staleCompute.getOperation()] = staleCompute;
+    return;
+  }
+
+  // The result types of the rebuilt op are dictated by the yield operands
+  auto rebuiltResultTypes = terminator.getOperandTypes();
+
+  // Same weights and inputs, new result list
+  rewriter.setInsertionPoint(staleCompute);
+  auto rebuiltCompute = rewriter.create<spatial::SpatWeightedCompute>(
+      staleCompute->getLoc(), rebuiltResultTypes, staleCompute.getWeights(),
+      staleCompute.getInputs());
+
+  rebuiltCompute.getBody().takeBody(staleCompute.getBody());
+
+  auto operandCountAfter = rebuiltCompute->getNumOperands();
+  assert(operandCountBefore == operandCountAfter);
+
+  // Redirect the users of every pre-existing result to the rebuilt op,
+  // skipping users that are compute ops replaced earlier
+  const size_t formerResultCount = staleCompute.getNumResults();
+  for (size_t resultIdx = 0; resultIdx < formerResultCount; resultIdx++) {
+    Value formerResult = staleCompute.getResult(resultIdx);
+    Value rebuiltResult = rebuiltCompute.getResult(resultIdx);
+    rewriter.replaceAllUsesExcept(
+        formerResult, rebuiltResult, oldComputeOpsReplaced);
+  }
+
+  // Record the replacement, then drop the old op
+  opToReplacedCompute[staleCompute.getOperation()] = rebuiltCompute;
+  oldComputeOpsReplaced.insert(staleCompute.getOperation());
+  rewriter.setInsertionPoint(staleCompute);
+  rewriter.eraseOp(staleCompute);
+}
+
+// Resolves every (operation, result number) tile reference to its Value and
+// forwards the resulting 3-D tile grid to the free function
+// ::onnx_mlir::createImgConcatOp. Valid only after finalizeReduceUpdates.
+//
+// The grid dimensions are read from the first channel tile and its first
+// column, so `outputTiles` is expected to be non-empty and rectangular.
+Value SpatialReducer::createImgConcatOp(
+    SmallVector<SmallVector<SmallVector<OpAndResNum>>> &outputTiles,
+    Location &loc, Type outputType) {
+
+  assert(reducesFinalized &&
+         "Cannot create ImgConcatOp before finalizing the reduce updates.");
+
+  // Indexing convention: outputTiles[channelTile][x][y]
+  const auto channelTiles = outputTiles.size();
+  const auto columns = outputTiles[0].size();
+  const auto rows = outputTiles[0][0].size();
+
+  using ValueColumn = SmallVector<Value>;
+  using ValuePlane = SmallVector<ValueColumn>;
+  SmallVector<ValuePlane> resolvedTiles(
+      channelTiles, ValuePlane(columns, ValueColumn(rows)));
+
+  for (size_t tile = 0; tile < channelTiles; tile++) {
+    for (size_t col = 0; col < columns; col++) {
+      for (size_t row = 0; row < rows; row++) {
+        resolvedTiles[tile][col][row] =
+            resolveValueFromOpAndResNum(outputTiles[tile][col][row]);
+      }
+    }
+  }
+
+  return ::onnx_mlir::createImgConcatOp(
+      resolvedTiles, rewriter, loc, outputType);
+}
+
+// Sum-reduces the given compute results with SpatVAddOp and, when `mapOp` is
+// not MapOperations::None, post-processes the total: `biasTile` (if set) is
+// added first, then the map operation is applied.
+//
+// NOTE(review): when `mapOp` is None no post-processing is installed, so
+// `biasTile` is not added in that case — confirm callers handle the bias
+// themselves.
+// NOTE(review): the `rewriter` parameter shadows the member of the same name;
+// the lambdas use the parameter while applyReducePattern positions the member
+// — presumably both refer to the same rewriter.
+OpAndResNum SpatialReducer::applyAddMapReduction(
+    SmallVector<ComputeAndResNum> &computeOps,
+    ConversionPatternRewriter &rewriter, Value biasTile, MapOperations mapOp) {
+
+  // Pairwise reduction step: element-wise addition
+  auto addValues = [&](Value a, Value b) {
+    return rewriter.create<spatial::SpatVAddOp>(a.getLoc(), a.getType(), a, b);
+  };
+
+  // Stays empty (no post-processing) unless a map operation is requested
+  std::function<Value(const Value &)> postprocessing;
+  if (mapOp != MapOperations::None) {
+    postprocessing = [&](const Value a) {
+      Value mapOperand = a;
+      if (biasTile) {
+        mapOperand = rewriter.create<spatial::SpatVAddOp>(
+            a.getLoc(), a.getType(), a, biasTile);
+      }
+      return createMapOperation(rewriter, mapOp, mapOperand);
+    };
+  }
+
+  return this->applyReducePattern(
+      computeOps, addValues, /* preprocess = */ nullptr, postprocessing);
+}
+
+} // namespace onnx_mlir
@@ -0,0 +1,83 @@
+#pragma once
+
+#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
+#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Support/Casting.h"
+
+namespace onnx_mlir {
+
+/// Index of a result (equivalently, of a yield operand) of a compute op.
+using ResNum = unsigned int;
+
+/// A weighted compute op together with one of its result indices.
+using ComputeAndResNum = std::pair<spatial::SpatWeightedCompute, ResNum>;
+
+/// Same pairing, but holding the generic operation pointer.
+using OpAndResNum = std::pair<Operation *, ResNum>;
+
+/// A deferred connection recorded during reduction: result `fromOpResNum` of
+/// `fromOp` has to become operand number `toOpOperandNum` of `toOp`.
+struct SpatialReducerChange {
+  Operation *fromOp;
+  ResNum fromOpResNum;
+  Operation *toOp;
+  unsigned toOpOperandNum;
+};
+
+/// Helper that reduces the results of several compute ops into one.
+///
+/// The reductions edit the bodies of the compute ops right away, but the
+/// matching changes to the ops' operands and results are queued and applied by
+/// finalizeReduceUpdates(), which also runs from the destructor if it was not
+/// called explicitly.
+class SpatialReducer {
+
+public:
+  SpatialReducer(ConversionPatternRewriter &rewriter) : rewriter(rewriter) {}
+
+  /// Reduces `computeOpsAndResNum` to a single entry using `reduce`;
+  /// `preprocess` / `postprocess` are optional (may be empty).
+  OpAndResNum applyReducePattern(
+      SmallVector<ComputeAndResNum> &computeOpsAndResNum,
+      std::function<Value(const Value &, const Value &)> reduce,
+      std::function<Value(const Value &)> preprocess,
+      std::function<Value(const Value &)> postprocess);
+
+  /// Addition-based reduction followed by an optional bias + map step.
+  OpAndResNum applyAddMapReduction(SmallVector<ComputeAndResNum> &computeOps,
+      ConversionPatternRewriter &rewriter, Value biasTile, MapOperations mapOp);
+
+  /// Applies all queued updates; must not be called twice.
+  void finalizeReduceUpdates();
+
+  /// Finalizes pending updates if the user did not do so.
+  ~SpatialReducer() {
+    if (!reducesFinalized)
+      finalizeReduceUpdates();
+  }
+
+  /// Resolves the tile references and builds the image concatenation.
+  /// Requires finalizeReduceUpdates() to have run.
+  Value createImgConcatOp(
+      llvm::SmallVector<llvm::SmallVector<llvm::SmallVector<OpAndResNum>>>
+          &outputTiles,
+      Location &loc, Type outputType);
+
+  /// Maps an (operation, result number) reference to the result Value.
+  /// Requires finalizeReduceUpdates() to have run.
+  Value resolveValueFromOpAndResNum(OpAndResNum &opAndResNum);
+
+private:
+  [[nodiscard("computeOp result number gets updated")]] ResNum
+  applyResultProcessing(ComputeAndResNum computeOpAndResNum,
+      std::function<Value(const Value &)> processFun,
+      ConversionPatternRewriter &rewriter);
+
+  /**
+   * @brief Update the results of a ComputeOp.
+   *
+   * The results are brought in line with the operands of the op's yieldOp.
+   * When this requires replacing the ComputeOp, `opToReplacedCompute` is
+   * updated with the replacement.
+   *
+   * @param computeOp The ComputeOp to update the results of.
+   */
+  void updateResultsOfCompute(Operation *computeOp);
+
+  ConversionPatternRewriter &rewriter;
+  bool reducesFinalized{false};
+
+  // Operand connections queued until the reduction is finalized
+  SmallVector<SpatialReducerChange, 4> reducerChanges;
+  // Compute ops whose result list must be refreshed when finalizing
+  SmallVector<spatial::SpatWeightedCompute> computeOpNeedingResUpdate;
+
+  // Original operation -> compute op that now stands in for it
+  std::unordered_map<Operation *, spatial::SpatWeightedCompute> opToReplacedCompute;
+
+  // Class-wide set of operations that have been replaced and erased
+  static llvm::SmallPtrSet<Operation *, 16> oldComputeOpsReplaced;
+};
+
+} // namespace onnx_mlir
@@ -0,0 +1,53 @@
+#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/WeightSubdivider.hpp"
+#include <cassert>
+
+namespace onnx_mlir {
+
+// Takes the nested tile map by value and moves it into the member.
+WeightSubdivider::WeightSubdivider(std::map<long, std::map<long, SmallVector<Value>>> weights)
+    : weights(std::move(weights)) {}
+
+// Reports whether the outer (input tile) map has no entries left.
+bool WeightSubdivider::isEmpty() const {
+  return weights.size() == 0;
+}
+
+// Removes up to `amount` weights from the first output tile of the first input
+// tile and returns them tagged with both tile indices and with the value
+// `crossbarsUsed` had before this call. Emptied inner and outer map entries
+// are dropped. The map must not be empty.
+TaggedWeights WeightSubdivider::popGroup(size_t amount) {
+  assert(!weights.empty() && "No weights to extract.");
+
+  // Always serve the lowest (inputTile, outputTile) pair still present
+  auto inputIt = weights.begin();
+  auto &outputMap = inputIt->second;
+  auto outputIt = outputMap.begin();
+
+  const long inputTile = inputIt->first;
+  const long outputTile = outputIt->first;
+  SmallVector<Value> &pending = outputIt->second;
+
+  const size_t taken = std::min(amount, pending.size());
+  const size_t startingCrossbarIndex = crossbarsUsed;
+  crossbarsUsed += taken;
+
+  SmallVector<Value> extracted(pending.begin(), pending.begin() + taken);
+
+  if (taken == pending.size()) {
+    // The whole vector was consumed: remove it, and its parent if now empty
+    outputMap.erase(outputIt);
+    if (outputMap.empty())
+      weights.erase(inputIt);
+  } else {
+    pending.erase(pending.begin(), pending.begin() + taken);
+  }
+
+  return {inputTile, outputTile, startingCrossbarIndex, extracted};
+}
+
+// Extracts groups until `n` weights have been collected in total or no weights
+// remain. The crossbar counter restarts at zero on every call, so the
+// `startingCrossbarIndex` of the returned groups is relative to this call.
+SmallVector<TaggedWeights> WeightSubdivider::popGroups(size_t n) {
+  crossbarsUsed = 0;
+
+  SmallVector<TaggedWeights> groups;
+  for (size_t budget = n; budget > 0 && !weights.empty();) {
+    groups.push_back(popGroup(budget));
+    budget -= groups.back().weights.size();
+  }
+
+  return groups;
+}
+
+} // namespace onnx_mlir
@@ -0,0 +1,46 @@
+#pragma once
+
+#include "mlir/IR/Value.h"
+#include "llvm/ADT/SmallVector.h"
+#include <map>
+
+using namespace mlir;
+using namespace std;
+
+namespace onnx_mlir {
+
+/**
+ * @brief A group of weights together with the tile coordinates it was taken
+ * from.
+ */
+struct TaggedWeights {
+  long inputTile;               // Key of the outer map the group came from
+  long outputTile;              // Key of the inner map the group came from
+  size_t startingCrossbarIndex; // Crossbars already handed out before this group
+  SmallVector<Value> weights;   // The extracted weight values
+};
+
+/**
+ * @brief Hands out weights in bounded-size groups.
+ *
+ * The weights live in a two-level map: the outer key is the input tile, the
+ * inner key is the output tile, and each leaf is a SmallVector with the
+ * weights for the filter. Groups are extracted from that map until the
+ * requested number of elements has been collected — namely as many as are
+ * needed to fill a compute unit.
+ */
+class WeightSubdivider {
+public:
+  WeightSubdivider(std::map<long, std::map<long, SmallVector<Value>>> weights);
+
+  /// True when no weights are left in the map.
+  bool isEmpty() const;
+
+  /// Extracts groups totalling at most `n` weights.
+  SmallVector<TaggedWeights> popGroups(size_t n);
+
+private:
+  /// Extracts at most `amount` weights from the first remaining leaf.
+  TaggedWeights popGroup(size_t amount);
+
+  std::map<long, std::map<long, SmallVector<Value>>> weights;
+  size_t crossbarsUsed{0};
+};
+
+} // namespace onnx_mlir