add constant folding and verification pass for pim host operations

better validation scripts output big refactors
2026-03-20 12:08:12 +01:00
parent 4e50e056e3
commit 6e1de865bb
64 changed files with 1364 additions and 2265 deletions
@@ -3,21 +3,15 @@ mlir_tablegen(ONNXToSpatial.hpp.inc -gen-rewriters "-I${ONNX_MLIR_SRC_ROOT}")
 add_public_tablegen_target(ONNXToSpatialIncGen)

 add_onnx_mlir_library(OMONNXToSpatial
-  Math/Gemm.hpp
  Math/Gemm.cpp
-  Math/Conv.hpp
  Math/Conv.cpp
-  Math/ExperimentalConv.cpp
-  Math/ExperimentalGemm.cpp
  NN/Pooling.cpp
-  NN/ExperimentalPooling.cpp
  NN/ReduceMean.cpp
  Tensor/ONNXConcatToTensorConcat.cpp
  Tensor/RemoveUnusedHelperOps.cpp
  Utils/SpatialReducer.cpp
  Utils/WeightSubdivider.cpp
  Utils/AnnotateReplication.cpp
-  ONNXToSpatialPass.hpp
  ONNXToSpatialPass.cpp
  ONNXToSpatialCommon.cpp

@@ -242,6 +242,6 @@ LogicalResult ConvToGemm::matchAndRewrite(ONNXConvOp convOp,
  return success();
 }

-void populateTilingConvOpPattern(RewritePatternSet& patterns, MLIRContext* ctx) { patterns.insert<ConvToGemm>(ctx); }
+void populateConvOpPatterns(RewritePatternSet& patterns, MLIRContext* ctx) { patterns.insert<ConvToGemm>(ctx); }

 } // namespace onnx_mlir
@@ -18,6 +18,6 @@ struct ConvToGemm : mlir::OpConversionPattern<mlir::ONNXConvOp> {
                                      mlir::ConversionPatternRewriter& rewriter) const override;
 };

-void populateTilingConvOpPattern(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);
+void populateConvOpPatterns(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);

 } // namespace onnx_mlir
@@ -1,583 +0,0 @@
-#include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/Dialect/Tosa/IR/TosaOps.h"
-#include "mlir/IR/Block.h"
-#include "mlir/IR/BuiltinAttributes.h"
-#include "mlir/IR/BuiltinTypeInterfaces.h"
-#include "mlir/IR/BuiltinTypes.h"
-#include "mlir/IR/IRMapping.h"
-#include "mlir/IR/Location.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/IR/Types.h"
-#include "mlir/IR/Value.h"
-#include "mlir/Support/LLVM.h"
-#include "mlir/Transforms/DialectConversion.h"
-
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/LogicalResult.h"
-
-#include <cstddef>
-#include <memory>
-#include <unordered_map>
-#include <vector>
-
-#include "src/Accelerators/PIM/Common/PIMCommon.hpp"
-#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
-#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
-#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
-#include "src/Dialect/ONNX/ONNXOps.hpp"
-
-using namespace mlir;
-using namespace std;
-
-namespace onnx_mlir {
-
-// NOTE:
-// It might be useful to re-implement this with for loops in mind.
-// neededXbars = krn_h * krn_w * inputTileCount * outputTileCount;
-
-/**
- * @brief A momentary representation of a core, to be used within the tiling of
- * a convolution operation.
- */
-class Core {
-public:
-  Core(const size_t coreId, ConversionPatternRewriter& rewriter)
-  : coreId(coreId), rewriter(rewriter) {}
-
-  /**
-   * @brief Add a MVM operation to the core.
-   *
-   * @param inputTile The input tile to the MVM operation.
-   * @param xbarIndex The index of the crossbar weight to use.
-   * @param outputTileId The id of the output tile.
-   * @param mvmOutType The result's shape.
-   * @return Value The result of the MVM operation.
-   */
-  Value addMVM(Value inputTile, size_t xbarIndex, size_t outputTileId, Type mvmOutType) {
-    // Use the inputTile as the reference location for the MVM operation.
-    Location loc = inputTile.getLoc();
-
-    // Move the insertion point to the end of the block.
-    rewriter.setInsertionPointToEnd(block.get());
-
-    // Add the inputTile to the block arguments, and to the operands.
-    Value operand = operandMap.lookupOrNull(inputTile);
-    if (not operand) {
-      operand = block->addArgument(inputTile.getType(), loc);
-      operands.push_back(inputTile);
-      operandMap.map(inputTile, operand);
-    }
-
-    // TODO: Compute the output type using the matrix, and check if `mvmOutType`
-    // is correct.
-
-    // Construct the MVM operation
-    Value result = rewriter.create<spatial::SpatWeightedMVMOp>(loc, mvmOutType, xbarIndex, operand);
-
-    // Since we are within the same core and no computation can happen in
-    // parallel, we can just apply a linear reduction in case we have multiple
-    // MVM operations for the same outputTile.
-    auto lastMVM = outputTileToMVM.find(outputTileId);
-
-    // If an entry for this outputTile already exists, apply reduction.
-    if (lastMVM != outputTileToMVM.end()) {
-      // MVM results should have the same type for reduction.
-      assert(lastMVM->second.getType() == result.getType());
-      result = rewriter.create<spatial::SpatVAddOp>(loc, mvmOutType, lastMVM->second, result);
-    }
-
-    outputTileToMVM[outputTileId] = result;
-    return result;
-  }
-
-  /**
-   * @brief Mark a result as remappable, and return a shared pointer to it.
-   *
-   * This function marks a result as remappable, and returns a shared pointer to
-   * it. We need to keep track of these values to generate the YieldOp at a
-   * later stage.
-   *
-   * @param result A result to track, for later remapping.
-   * @return shared_ptr<Value> A shared pointer to the result.
-   */
-  shared_ptr<Value> makeResultRemappable(Value result) {
-    // Verify that the result is present in the block.
-    assert(result.getDefiningOp()->getBlock() == block.get());
-
-    shared_ptr<mlir::Value> remappableResult = make_shared<Value>(result);
-
-    resultsToRemap.push_back(remappableResult);
-    results.push_back(result);
-
-    return remappableResult;
-  }
-
-  /**
-   * @brief Add a remappable operand to the core, to merge partial results
-   * inter-core.
-   *
-   * @param operand The operand to add.
-   * @return Value The block argument representing the operand.
-   */
-  Value addRemappableOperand(std::shared_ptr<Value> operand) {
-    // Check that the operand is not already there.
-    assert(not operandMap.contains(*operand));
-
-    Value argument = block->addArgument(operand->getType(), operand->getLoc());
-    remappableOperands.push_back(operand);
-    return argument;
-  }
-
-  /**
-   * @brief Generate a spatial::SpatWeightedCompute operation from the core.
-   *
-   * @param loc The location of the operation.
-   * @return spatial::SpatWeightedCompute
-   */
-  spatial::SpatWeightedCompute createWComputeOp(Location loc) {
-    // Get the shape of the results.
-    SmallVector<Type> resultTypes;
-    for (const auto& value : results)
-      resultTypes.push_back(value.getType());
-
-    // Create the WComputeOp, with non-remappable operands only.
-    wcomputeOp = rewriter.create<spatial::SpatWeightedCompute>(loc, resultTypes, xbarWeights, operands);
-
-    // Add the body to the WComputeOp.
-    Block* releasedBlock = block.release();
-    wcomputeOp.getBody().push_back(releasedBlock);
-
-    // Add the `yieldOp` at the end, with the results.
-    rewriter.setInsertionPointToEnd(releasedBlock);
-    rewriter.create<spatial::SpatYieldOp>(loc, results);
-
-    return wcomputeOp;
-  }
-
-  /**
-   * @brief Remap the results to the WComputeOp results.
-   */
-  void remapResults() {
-    // Remap all the results to the WComputeOp results.
-    assert(resultsToRemap.size() == wcomputeOp->getNumResults());
-    for (size_t i = 0; i < resultsToRemap.size(); i++)
-      *resultsToRemap[i] = wcomputeOp.getResult(i);
-  }
-
-  void addRemappedOperands() {
-    // Insert the remappableOperands (which were remapped in
-    // `addRemappableOperand` of another Core)
-    for (auto remappedValue : remappableOperands)
-      wcomputeOp->insertOperands(wcomputeOp->getNumOperands(), *remappedValue);
-
-    // Update the wcomputeOp operandSegmentSize
-    incrementWeightedComputeInputsSegmentSize(wcomputeOp, static_cast<int>(remappableOperands.size()));
-  }
-
-  size_t addXbarWeight(Value weight) {
-    assert(!isXbarsFull());
-    xbarWeights.push_back(weight);
-    return xbarWeights.size() - 1;
-  }
-
-  bool isXbarsFull() {
-    assert(xbarWeights.size() <= crossbarCountInCore);
-    return xbarWeights.size() == crossbarCountInCore;
-  }
-
-  bool isCoreEmpty() { return block->empty(); }
-
-  void dump() {
-    // Print the coreId
-    llvm::outs() << "Core " << coreId << ":\n";
-    // Print the weights
-    llvm::outs() << "Xbar Weights:\n";
-    for (auto weight : xbarWeights)
-      weight.dump();
-    // Print the operands
-    llvm::outs() << "Operands:\n";
-    for (auto operand : operands)
-      llvm::outs() << operand << "\n";
-
-    // Dump the body block
-    for (auto& op : block->getOperations())
-      op.dump();
-
-    // Print the results
-    llvm::outs() << "Results:\n";
-    for (auto result : results)
-      llvm::outs() << result << "\n";
-  }
-
-  const size_t coreId;
-
-private:
-  ConversionPatternRewriter& rewriter;
-
-  // Should these be set<Value> instead? But I need to keep the order
-  vector<Value> operands;
-  vector<std::shared_ptr<Value>> remappableOperands;
-
-  vector<Value> results;
-  vector<std::shared_ptr<Value>> resultsToRemap;
-
-  // Maps from input tiles to the block operand
-  IRMapping operandMap;
-
-  // Map from outputTileId to MVM operation producing it
-  unordered_map<size_t, Value> outputTileToMVM;
-
-  vector<Value> xbarWeights;
-
-  unique_ptr<mlir::Block> block = make_unique<Block>();
-
-  spatial::SpatWeightedCompute wcomputeOp;
-};
-
-struct ConvToManyGemms : public OpConversionPattern<ONNXConvOp> {
-  ConvToManyGemms(MLIRContext* ctx)
-  : OpConversionPattern(ctx) {}
-
-  struct Producer_t {
-    Value value;
-    shared_ptr<Core> core;
-  };
-
-  LogicalResult
-  matchAndRewrite(ONNXConvOp conv, ONNXConvOpAdaptor convAdaptor, ConversionPatternRewriter& rewriter) const final {
-    ShapedType xShape = mlir::cast<ShapedType>(convAdaptor.getX().getType());
-    ShapedType wShape = mlir::cast<ShapedType>(convAdaptor.getW().getType());
-    ShapedType bShape = mlir::cast<ShapedType>(convAdaptor.getB().getType());
-    ShapedType yShape = mlir::cast<ShapedType>(conv.getY().getType());
-
-    size_t stride_x, stride_y, dilation_x, dilation_y, pad_x, pad_y;
-    unpackOptionalPairVector(conv.getStrides(), stride_x, stride_y);
-    unpackOptionalPairVector(conv.getDilations(), dilation_x, dilation_y);
-
-    auto padUnpackError = unpackOptionalPadsVector(convAdaptor.getPads(), pad_x, pad_y);
-    if (padUnpackError.has_value())
-      return rewriter.notifyMatchFailure(conv, padUnpackError.value());
-
-    // TODO: Pad value at beginning and end of each dimension could be
-    // different. We should handle this case.
-
-    // MapOperations mapOperation = MapOperations::None;
-    //
-    // // If we have just one user, and it is an activation function (or more in
-    // // general a mapping operation) just inline it in the computeOps
-    // auto firstUserOp = *conv->getUsers().begin();
-    // if (conv->hasOneUse()) {
-    //   mapOperation = mlirOpToMapOperationEnum(firstUserOp);
-    //
-    //   if (mapOperation == MapOperations::ONNXSoftmaxOp) {
-    //     return rewriter.notifyMatchFailure(
-    //         conv, "Softmax not supported as activation for convolutions.");
-    //   }
-    // }
-
-    size_t input_h = GET_IMAGE_HEIGHT(xShape);
-    size_t input_w = GET_IMAGE_WIDTH(xShape);
-    size_t output_h = GET_IMAGE_HEIGHT(yShape);
-    size_t output_w = GET_IMAGE_WIDTH(yShape);
-    size_t krn_h = GET_KERNEL_HEIGHT(wShape);
-    size_t krn_w = GET_KERNEL_WIDTH(wShape);
-
-    Location loc = conv.getLoc();
-
-    size_t inputTileCount = ceilIntegerDivide(GET_IMAGE_CHANNEL(xShape), crossbarSize.getValue());
-    size_t inputTileRemainder = GET_IMAGE_CHANNEL(xShape) % crossbarSize;
-    size_t outputTileCount = ceilIntegerDivide(GET_IMAGE_CHANNEL(yShape), crossbarSize.getValue());
-    size_t outputTileRemainder = GET_IMAGE_CHANNEL(yShape) % crossbarSize;
-
-    // Tile the input tensor
-    // Input tiles need to be indexed by:
-    //    a. Channel Tile
-    //    b. Pixel `x` position
-    //    c. Pixel `y` position
-    // For example: inputTiles[channelTile][x][y]
-    // Example complete input tensor: tensor<1x3x6x6xf32> (NxCxWxH)
-    SmallVector<SmallVector<SmallVector<Value>>> inputTiles(
-      inputTileCount, SmallVector<SmallVector<Value>>(input_w, SmallVector<Value>(input_h)));
-
-    auto resolveErrorOpt = resolveImgInputTiles(
-      convAdaptor.getX(), inputTiles, inputTileCount, inputTileRemainder, input_h, input_h, rewriter);
-    if (resolveErrorOpt.has_value())
-      return rewriter.notifyMatchFailure(conv, *resolveErrorOpt);
-
-    SmallVector<OpFoldResult> strides = SmallVector<OpFoldResult>(4, rewriter.getIndexAttr(1));
-    SmallVector<OpFoldResult> offsets = SmallVector<OpFoldResult>(4, rewriter.getIndexAttr(0));
-    SmallVector<OpFoldResult> sizes = SmallVector<OpFoldResult> {rewriter.getIndexAttr(1),
-                                                                 rewriter.getIndexAttr(crossbarSize),
-                                                                 rewriter.getIndexAttr(1),
-                                                                 rewriter.getIndexAttr(1)};
-
-    // Tile the weight tensor
-    // Weight tiles need to be indexed by:
-    //  a. Filter Tile
-    //  b. Channel Tile
-    //  c. Kernel `x` position
-    //  d. Kernel `y` position
-    // For example: weightTiles[filterTile][channelTile][x][y]
-    // Example complete weight tensor: tensor<32x3x3x3xf32> (FxCxWxH)
-    SmallVector<SmallVector<SmallVector<SmallVector<Value>>>> weightTiles(
-      outputTileCount,
-      SmallVector<SmallVector<SmallVector<Value>>>(inputTileCount,
-                                                   SmallVector<SmallVector<Value>>(krn_w, SmallVector<Value>(krn_h))));
-    strides = SmallVector<OpFoldResult>(4, rewriter.getIndexAttr(1));
-    offsets = SmallVector<OpFoldResult>(4, rewriter.getIndexAttr(0));
-    sizes = {rewriter.getIndexAttr(crossbarSize),
-             rewriter.getIndexAttr(crossbarSize),
-             rewriter.getIndexAttr(1),
-             rewriter.getIndexAttr(1)};
-    for (size_t i = 0; i < outputTileCount; i++) {
-      if (i == outputTileCount - 1 && outputTileRemainder != 0)
-        sizes[0] = rewriter.getIndexAttr(outputTileRemainder);
-      sizes[1] = rewriter.getIndexAttr(crossbarSize);
-      offsets[0] = rewriter.getIndexAttr(i * crossbarSize);
-      for (size_t j = 0; j < inputTileCount; j++) {
-        if (j == inputTileCount - 1 && inputTileRemainder != 0)
-          sizes[1] = rewriter.getIndexAttr(inputTileRemainder);
-        for (size_t x = 0; x < krn_w; x++) {
-          for (size_t y = 0; y < krn_h; y++) {
-            offsets[1] = rewriter.getIndexAttr(j * crossbarSize);
-            offsets[2] = rewriter.getIndexAttr(x);
-            offsets[3] = rewriter.getIndexAttr(y);
-            weightTiles[i][j][x][y] =
-              rewriter.create<tensor::ExtractSliceOp>(loc, convAdaptor.getW(), offsets, sizes, strides);
-          }
-        }
-      }
-    }
-
-    /* Distribute the computation among many compute cores
-     * Try to compute in-core the computation for each output tile, and reduce
-     * over as few cores as possible
-     */
-
-    // Tile the output tensor
-    // Output tiles need to be indexed by:
-    //  a. Filter Tile
-    //  b. Pixel `x` position
-    //  c. Pixel `y` position
-    // For example: outputTiles[filterTile][x][y]
-    // Example complete output tensor: tensor<1x32x3x3xf32> (NxFxWxH)
-    SmallVector<SmallVector<SmallVector<shared_ptr<Value>>>> outputTiles(
-      outputTileCount,
-      SmallVector<SmallVector<shared_ptr<Value>>>(output_w, SmallVector<shared_ptr<Value>>(output_h, nullptr)));
-
-    size_t replicationFactor;
-    if (!conv->hasAttr(REPLICATION_ATTR_NAME))
-      replicationFactor = 1;
-    else
-      replicationFactor = conv->getAttrOfType<IntegerAttr>(REPLICATION_ATTR_NAME).getInt();
-    // producers[outTile][out_x][out_y][producerIndex]
-    vector<vector<vector<vector<Producer_t>>>> producers = vector<vector<vector<vector<Producer_t>>>>(
-      outputTileCount,
-      vector<vector<vector<Producer_t>>>(output_w, vector<vector<Producer_t>>(output_h, vector<Producer_t>())));
-
-    // Schedule in cores
-    size_t coreId = 0;
-    vector<shared_ptr<Core>> curCores(replicationFactor);
-    for (size_t i = 0; i < replicationFactor; i++)
-      curCores[i] = make_shared<Core>(coreId++, rewriter);
-
-    vector<shared_ptr<Core>> cores;
-
-    const size_t replicationSliceSize = ceilIntegerDivide(input_w, replicationFactor);
-
-    for (size_t krn_x = 0; krn_x < krn_h; krn_x++) {
-      for (size_t krn_y = 0; krn_y < krn_w; krn_y++) {
-
-        RankedTensorType mvmOutType =
-          RankedTensorType::get({1, static_cast<long>(crossbarSize), 1, 1}, bShape.getElementType());
-
-        for (size_t outTile = 0; outTile < outputTileCount; outTile++) {
-
-          if (outTile == outputTileCount - 1 && outputTileRemainder != 0)
-            mvmOutType = mvmOutType.clone({1, static_cast<long>(outputTileRemainder), 1, 1});
-
-          for (size_t inTile = 0; inTile < inputTileCount; inTile++) {
-
-            vector<size_t> xbarIndexes(replicationFactor);
-            for (size_t i = 0; i < replicationFactor; i++)
-              xbarIndexes[i] = curCores[i]->addXbarWeight(weightTiles[outTile][inTile][krn_x][krn_y]);
-
-            size_t out_x = 0;
-            for (size_t in_x = 0; in_x < input_w; in_x += stride_x) {
-              size_t out_y = 0;
-
-              // I use `replicationFactor` cores. I divide the input_w into
-              // `replicationFactor` slices, and each slice is distributed to a
-              // core. `coreIndex` is the index of the core that will be used
-              // for this slice
-              size_t coreIndex = in_x / replicationSliceSize;
-              assert(coreIndex < replicationFactor);
-
-              for (size_t in_y = 0; in_y < input_h; in_y += stride_y) {
-                // Adjust the input based on the kernel
-                int actual_in_x = in_x - ((int) krn_w / 2) + krn_x * dilation_x;
-                int actual_in_y = in_y - ((int) krn_h / 2) + krn_y * dilation_y;
-
-                // Check if we are within the input image
-                if (verifyWithinBoundsAndPaddings(input_w, input_h, actual_in_x, actual_in_y, pad_x, pad_y).failed()) {
-                  out_y++;
-                  continue;
-                }
-
-                size_t outTileId = outTile * output_w * output_h + out_x * output_h + out_y;
-                auto mvm = curCores[coreIndex]->addMVM(
-                  inputTiles[inTile][actual_in_x][actual_in_y], xbarIndexes[coreIndex], outTileId, mvmOutType);
-
-                producers[outTile][out_x][out_y].push_back({mvm, curCores[coreIndex]});
-
-                out_y++;
-              }
-              out_x++;
-            }
-
-            // Computations for these crossbars are done, check if the cores
-            // crossbars are fully used. If full, swap with new core
-            for (size_t i = 0; i < replicationFactor; i++) {
-              if (curCores[i]->isXbarsFull()) {
-                cores.emplace_back(std::move(curCores[i]));
-                curCores[i] = make_shared<Core>(coreId++, rewriter);
-              }
-            }
-          }
-        }
-      }
-    }
-
-    for (auto& curCore : curCores)
-      if (curCore->isCoreEmpty() == false)
-        cores.emplace_back(std::move(curCore));
-    curCores.clear();
-    // Now, do the reduction of each output pixel tile
-    for (size_t outTile = 0; outTile < outputTileCount; outTile++) {
-      for (size_t out_x = 0; out_x < output_w; out_x++) {
-        for (size_t out_y = 0; out_y < output_h; out_y++) {
-          // First, check if some producers are within the same core. If this is
-          // true, `Core::addMVM` has already done the reduction within-core.
-          // This means that we only need to consider the last producer for that
-          // core.
-
-          std::unordered_map<size_t, Producer_t> withinCoreReducedProducers;
-          for (auto producer : producers[outTile][out_x][out_y])
-            withinCoreReducedProducers[producer.core->coreId] = producer;
-
-          // Now, we need to apply inter-core reduction
-
-          // Base case with one producer
-          if (withinCoreReducedProducers.size() == 1) {
-            // TODO: Add the bias and apply mapping (if present)
-
-            auto singleProducer = withinCoreReducedProducers.begin()->second;
-            // Use last producer as the final result
-            auto reducedValue = singleProducer.core->makeResultRemappable(singleProducer.value);
-            outputTiles[outTile][out_x][out_y] = reducedValue;
-            continue;
-          }
-
-          // TODO: This is a linear reduction, not a tree reduction. We can do
-          // better: a tree reduction would make more computations happen in
-          // parallel.
-
-          Producer_t lastProducer = withinCoreReducedProducers.begin()->second;
-
-          auto it = withinCoreReducedProducers.begin();
-          it++;
-          while (it != withinCoreReducedProducers.end()) {
-
-            Producer_t curProducer = it->second;
-
-            shared_ptr<Core> core1;
-            shared_ptr<Core> core2;
-            Value core1Value;
-            Value core2Value;
-
-            auto lastProducerCoreId = lastProducer.core->coreId;
-            auto curProducerCoreId = curProducer.core->coreId;
-
-            assert(lastProducerCoreId != curProducerCoreId
-                   && "We should have already applied within-core reduction, how "
-                      "could we have same cores here?");
-
-            // Sort the cores by coreId
-            if (curProducerCoreId < lastProducerCoreId) {
-              core1 = curProducer.core;
-              core1Value = curProducer.value;
-              core2 = lastProducer.core;
-              core2Value = lastProducer.value;
-            }
-            else {
-              core1 = lastProducer.core;
-              core1Value = lastProducer.value;
-              core2 = curProducer.core;
-              core2Value = curProducer.value;
-            }
-
-            auto newCoreRes = core1->makeResultRemappable(core1Value);
-            auto secondCoreBlockArg = core2->addRemappableOperand(newCoreRes);
-
-            rewriter.setInsertionPointAfterValue(core2Value);
-            Value vaddRes = rewriter.create<spatial::SpatVAddOp>(
-              core2Value.getLoc(), core2Value.getType(), core2Value, secondCoreBlockArg);
-
-            lastProducer = {vaddRes, core2};
-
-            it++;
-          }
-
-          // TODO: Add the bias and apply mapping (if present)
-
-          // Use last producer as the final result
-          auto reducedValue = lastProducer.core->makeResultRemappable(lastProducer.value);
-          outputTiles[outTile][out_x][out_y] = reducedValue;
-        }
-      }
-    }
-
-    // Now, we need to turn the cores into a spatial::SpatWeightedCompute.
-    rewriter.setInsertionPointAfter(conv);
-    spatial::SpatWeightedCompute lastWComputeOp;
-    for (auto& core : cores) {
-      lastWComputeOp = core->createWComputeOp(loc);
-      core->remapResults();
-      rewriter.setInsertionPointAfter(lastWComputeOp);
-    }
-
-    for (auto& core : cores)
-      core->addRemappedOperands();
-
-    // Set the insertion point after the last WComputeOp.
-    rewriter.setInsertionPointAfter(lastWComputeOp);
-    SmallVector<Value> tilesToConcat;
-    tilesToConcat.reserve(output_h * output_w * outputTileCount * crossbarSize);
-    for (size_t outX = 0; outX < output_h; outX++)
-      for (size_t outY = 0; outY < output_w; outY++)
-        for (size_t outTile = 0; outTile < outputTileCount; outTile++)
-          tilesToConcat.push_back(*outputTiles[outTile][outX][outY]);
-
-    Value outputImage = rewriter.create<spatial::SpatImgConcatOp>(loc, conv.getY().getType(), tilesToConcat);
-
-    // Value outputImage =
-    //     createImgConcatOp(outputTiles, rewriter, loc, Y.getType());
-
-    // If no mapping (activation) was applied, just replace ConvOp
-    // if (mapOperation == MapOperations::None) {
-    //   rewriter.replaceOp(conv, outputImage);
-    // } else {
-    //   // If mapping was applied, erase ConvOp and replace the mapping op
-    //   rewriter.eraseOp(conv);
-    //   rewriter.replaceOp(firstUserOp, outputImage);
-    // }
-
-    return success();
-  }
-};
-
-void populateTilingConvOpPattern(RewritePatternSet& patterns, MLIRContext* ctx) {
-  patterns.insert<ConvToManyGemms>(ctx);
-}
-
-} // namespace onnx_mlir
@@ -1,400 +0,0 @@
-#include "mlir/IR/BuiltinTypes.h"
-#include "mlir/IR/Operation.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/IR/Types.h"
-#include "mlir/IR/Value.h"
-#include "mlir/Support/LLVM.h"
-#include "mlir/Transforms/DialectConversion.h"
-
-#include "llvm/ADT/SmallVector.h"
-
-#include <algorithm>
-#include <cstddef>
-#include <unistd.h>
-
-#include "Compiler/PimCompilerOptions.hpp"
-#include "Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
-#include "Dialect/Spatial/SpatialOps.hpp"
-#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/WeightSubdivider.hpp"
-#include "src/Dialect/ONNX/ONNXOps.hpp"
-
-using namespace mlir;
-using namespace std;
-
-namespace onnx_mlir {
-
-/**
- * @brief A pattern to tile the convolution operation into a series of compute
- *        units, each one of which applies filters to a subset of the input
- *        tensor. Results are also reduced and concatenated to form the final
- *        output tensor.
- */
-struct ExperimentalONNXConvOpTile : public OpConversionPattern<ONNXConvOp> {
-  ExperimentalONNXConvOpTile(MLIRContext* ctx)
-  : OpConversionPattern(ctx) {}
-
-  LogicalResult
-  matchAndRewrite(ONNXConvOp conv, ONNXConvOpAdaptor convAdaptor, ConversionPatternRewriter& rewriter) const final {
-
-    // --------------------------------- //
-    // --- READ OPERATION PARAMETERS --- //
-    // --------------------------------- //
-
-    // To get each crossbar's weights, we need to slice the weights tensor.
-    //   - Along the input tiles.
-    //   - Along the output tiles.
-    //   - Along the filter x position.
-    //   - Along the filter y position.
-    ShapedType inputType = cast<ShapedType>(convAdaptor.getX().getType());
-    ShapedType outputType = cast<ShapedType>(conv.getY().getType());
-    ShapedType weightsType = cast<ShapedType>(convAdaptor.getW().getType());
-
-    // TODO: Address bigger batches.
-    assert(GET_IMAGE_N(inputType) == 1
-           && "Batch size must be 1"
-              "for convolution.");
-
-    // TODO: Address replication.
-    assert(coresCount.getValue() == -1 && "Replication is not yet supported for convolution.");
-
-    // TODO: Address bias addition.
-
-    ldiv_t inputTileCount = div(GET_IMAGE_CHANNEL(inputType), crossbarSize);
-    ldiv_t outputTileCount = div(GET_IMAGE_CHANNEL(outputType), crossbarSize);
-    size_t kernelWidth = GET_KERNEL_WIDTH(weightsType);
-    size_t kernelHeight = GET_KERNEL_HEIGHT(weightsType);
-
-    // Assert that the kernel is square.
-    assert(kernelWidth == kernelHeight && "Only square kernels are supported.");
-
-    // -------------------------------- //
-    // --- SLICE THE WEIGHTS TENSOR --- //
-    // -------------------------------- //
-
-    // The core idea of this stage is classifying the weights by input and
-    // output tile. This is because we want the applyFilters operations to be
-    // tile agnostic, to keep the subsequent lowering stages as simple as
-    // possible. This data structure does this weight classification:
-    //   - The outer map is indexed by input tile.
-    //   - The inner map is indexed by output tile.
-    //   - The SmallVector contains the weights for the filter.
-    map<long, map<long, SmallVector<Value>>> weightsGroups;
-
-    // During all slicing operations within this stage, we'll use the same
-    // strides for all dimensions.
-    SmallVector<OpFoldResult> slicingStrides(4, rewriter.getIndexAttr(1));
-
-    ldiv_t itc = inputTileCount;
-    ldiv_t otc = outputTileCount;
-
-    // - Slicing along the input tiles.
-    // - Slicing along the output tiles.
-    for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
-      long crossbarWidth = it == itc.quot ? itc.rem : crossbarSize;
-      for (long ot = 0; ot < otc.quot + (otc.rem > 0); ++ot) {
-        long crossbarHeight = ot == otc.quot ? otc.rem : crossbarSize;
-
-        // The loop above also sets the crossbar's used width and height,
-        // checking if we're at the last crossbar and if it's incomplete.
-
-        long outputTile = ot;
-        long inputTile = it;
-
-        // Create the slicing sizes.
-        SmallVector<OpFoldResult> slicingSizes {/* 0 */ rewriter.getIndexAttr(crossbarHeight),
-                                                /* 1 */ rewriter.getIndexAttr(crossbarWidth),
-                                                /* 2 */ rewriter.getIndexAttr(1),
-                                                /* 3 */ rewriter.getIndexAttr(1)};
-
-        // - Slicing along the filter x position.
-        // - Slicing along the filter y position.
-        for (size_t filterX = 0; filterX < kernelWidth; ++filterX) {
-          for (size_t filterY = 0; filterY < kernelHeight; ++filterY) {
-
-            // Create the slicing offsets.
-            SmallVector<OpFoldResult> slicingOffsets {/* 0 */ rewriter.getIndexAttr(outputTile * crossbarSize),
-                                                      /* 1 */ rewriter.getIndexAttr(inputTile * crossbarSize),
-                                                      /* 2 */ rewriter.getIndexAttr(filterX),
-                                                      /* 3 */ rewriter.getIndexAttr(filterY)};
-
-            // Create the slice extraction operation.
-            auto extractSliceOp = rewriter.create<tensor::ExtractSliceOp>(
-              conv.getLoc(), convAdaptor.getW(), slicingOffsets, slicingSizes, slicingStrides);
-
-            // Add a note to the extractSliceOp, with the filterX and filterY.
-            weightsGroups[inputTile][outputTile].push_back(extractSliceOp);
-          }
-        }
-      }
-    }
-
-    // TODO: Tree reduction for compute reduction should be implemented.
-
-    // -------------------------------- //
-    // --- CREATE ALL COMPUTE UNITS --- //
-    // -------------------------------- //
-
-    // Keep track of input slicing operations to avoid duplication across
-    // all compute units (global slices).
-    map<long, Value> globalSlices;
-
-    // Keep track of all partial compute results.
-    map<long, Value> globalPartialResults;
-
-    // Use a weight subdivider to extract groups of weights for each compute
-    // unit. We'll keep extracting groups until no more weights are left.
-    WeightSubdivider weightSubdivider(weightsGroups);
-    while (!weightSubdivider.isEmpty()) {
-
-      // -------------------------------- //
-      // --- BEGIN A NEW COMPUTE UNIT --- //
-      // -------------------------------- //
-
-      // Get the next group of weights for the compute unit.
-      SmallVector<TaggedWeights> weightsGroups = weightSubdivider.popGroups(crossbarCountInCore.getValue());
-
-      SmallVector<Value> computeWeights;
-      SmallVector<Value> computeOperands;
-
-      // ------------------------------ //
-      // --- SLICE THE INPUT TENSOR --- //
-      // ------------------------------ //
-
-      // Note each tile's index in the compute unit arguments.
-      map<long, size_t> inputTileIndices;
-      map<long, size_t> outputTileIndices;
-      map<long, size_t> reductionTileIndices; // Incoming partial results.
-
-      // Iterate over all weights groups for this compute unit.
-      map<long, Value> localSlices; // WRT the current compute unit.
-      for (auto group : weightsGroups) {
-        for (Value weight : group.weights)
-          computeWeights.push_back(weight);
-
-        // There might be multiple weight groups for the same input tile, so if
-        // we've already added the input tile, skip it.
-        if (localSlices.find(group.inputTile) != localSlices.end())
-          continue;
-
-        // We might have already sliced the input tensor for some other compute
-        // unit, so if we have, reuse the slicing operation without creating a
-        // new one.
-        if (globalSlices.find(group.inputTile) != globalSlices.end()) {
-          computeOperands.push_back(globalSlices[group.inputTile]);
-          localSlices[group.inputTile] = globalSlices[group.inputTile];
-          continue;
-        }
-
-        // Create the input tensor slicing offsets.
-        SmallVector<OpFoldResult> slicingOffsets {/* 0 */ rewriter.getIndexAttr(0), // No offset along the batch axis.
-                                                  /* 1 */ rewriter.getIndexAttr(group.inputTile * crossbarSize),
-                                                  /* 2 */ rewriter.getIndexAttr(0),
-                                                  /* 3 */ rewriter.getIndexAttr(0)};
-
-        // Create the input tensor slicing sizes.
-        size_t tilingSize = group.inputTile == inputTileCount.quot ? inputTileCount.rem : crossbarSize;
-        SmallVector<OpFoldResult> slicingSizes {/* 0 */ rewriter.getIndexAttr(1), // Batch size is always 1.
-                                                /* 1 */ rewriter.getIndexAttr(tilingSize),
-                                                /* 2 */ rewriter.getIndexAttr(GET_IMAGE_WIDTH(inputType)),
-                                                /* 3 */ rewriter.getIndexAttr(GET_IMAGE_HEIGHT(inputType))};
-
-        // Create the slice extraction operation.
-        auto extractSliceOp = rewriter.create<tensor::ExtractSliceOp>(
-          conv.getLoc(), convAdaptor.getX(), slicingOffsets, slicingSizes, slicingStrides);
-
-        computeOperands.push_back(extractSliceOp);
-
-        // Update slicing maps.
-        globalSlices[group.inputTile] = extractSliceOp;
-        localSlices[group.inputTile] = extractSliceOp;
-
-        // Update the input tile index.
-        inputTileIndices[group.inputTile] = computeOperands.size() - 1;
-      }
-
-      // ------------------------------- //
-      // --- PREPARE THE OUTPUT TYPE --- //
-      // ------------------------------- //
-
-      // Fill the compute output's type by looking at the output tiles.
-      SmallVector<Type> computeOutputType;
-      for (TaggedWeights group : weightsGroups) {
-
-        // There might be multiple weight groups for the same output tile, so if
-        // we've already added the output tile, skip it.
-        if (outputTileIndices.find(group.outputTile) != outputTileIndices.end())
-          continue;
-
-        // Additionally, after adding the input slices as operands, also add any
-        // compatible partial results from previous compute units.
-        if (globalPartialResults.find(group.outputTile) != globalPartialResults.end()) {
-          computeOperands.push_back(globalPartialResults[group.outputTile]);
-          reductionTileIndices[group.outputTile] = computeOperands.size() - 1;
-        }
-
-        // Define the output shape for this group.
-        long outputTileSize = group.outputTile == outputTileCount.quot ? outputTileCount.rem : crossbarSize;
-
-        // TODO: Address non-same padding.
-        SmallVector<int64_t> outputShapeArray {/* 0 */ 1,                           // Batch size is always 1.
-                                               /* 1 */ outputTileSize,
-                                               /* 2 */ GET_IMAGE_WIDTH(outputType), // Same padding assumed.
-                                               /* 3 */ GET_IMAGE_HEIGHT(outputType)};
-
-        auto elementType = dyn_cast<RankedTensorType>(conv.getY().getType()).getElementType();
-
-        computeOutputType.push_back(RankedTensorType::get(outputShapeArray, elementType));
-
-        outputTileIndices[group.outputTile] = computeOutputType.size() - 1;
-      }
-
-      // ----------------------------- //
-      // --- FILL THE COMPUTE UNIT --- //
-      // ----------------------------- //
-
-      // Create the compute unit.
-      spatial::SpatWeightedCompute currentCompute = rewriter.create<spatial::SpatWeightedCompute>(
-        conv.getLoc(), computeOutputType, computeWeights, computeOperands);
-
-      // Create a new block for the compute unit and add the operands.
-      Block* block = rewriter.createBlock(&currentCompute.getRegion());
-      rewriter.setInsertionPointToStart(block);
-      for (Value operand : computeOperands)
-        block->addArgument(operand.getType(), conv->getLoc());
-
-      // Initialize a map of local partial results.
-      map<long, Value> localPartialResults; // WRT the current compute unit.
-
-      // If we have any reduction tiles, add them to the local partial results.
-      for (auto reductionTileIndex : reductionTileIndices)
-        localPartialResults[reductionTileIndex.first] = block->getArgument(reductionTileIndex.second);
-
-      // Add all the applyFilters operations to the block.
-      for (TaggedWeights group : weightsGroups) {
-
-        // Get the outputType for this group.
-        Type outputType = computeOutputType[outputTileIndices[group.outputTile]];
-
-        // Create an apply filters operation.
-        BlockArgument blockArgument = block->getArgument(inputTileIndices[group.inputTile]);
-
-        // The list of weight indices is group.startingCrossbarIndex + 0, 1, 2,
-        // ... As many weights as the size of group.weights.
-        SmallVector<long> weightIndices;
-        for (size_t i = 0; i < group.weights.size(); ++i)
-          weightIndices.push_back(group.startingCrossbarIndex + i);
-
-        SmallVector<int64_t> xKerPos;
-        SmallVector<int64_t> yKerPos;
-        for (auto weight : group.weights) {
-          // Assert that the weight is an extract_slice operation.
-          auto extractSliceOp = weight.getDefiningOp<tensor::ExtractSliceOp>();
-          assert(extractSliceOp && "Weight is not an extract_slice operation.");
-
-          // Get the filter x and y positions from the extract_slice operation.
-          auto offsets = extractSliceOp.getStaticOffsets();
-          xKerPos.push_back(offsets[2]);
-          yKerPos.push_back(offsets[3]);
-        }
-
-        ArrayAttr weightIndicesAttr = rewriter.getI64ArrayAttr(weightIndices);
-        ArrayAttr xKerPosAttr = rewriter.getI64ArrayAttr(xKerPos);
-        ArrayAttr yKerPosAttr = rewriter.getI64ArrayAttr(yKerPos);
-
-        Value result = rewriter.create<spatial::SpatApplyFiltersOp>(
-          conv.getLoc(), outputType, weightIndicesAttr, xKerPosAttr, yKerPosAttr, blockArgument);
-
-        // Perform local reduction if necessary.
-        if (localPartialResults.find(group.outputTile) != localPartialResults.end()) {
-
-          result = rewriter.create<spatial::SpatVAddOp>(
-            conv.getLoc(), result.getType(), localPartialResults[group.outputTile], result);
-        }
-
-        // Update the partial results map.
-        localPartialResults[group.outputTile] = result;
-      }
-
-      // Add a yield operation to the block by concatenating the partial
-      // results.
-      SmallVector<Value> applyFiltersResults;
-      for (size_t i = 0; i < computeOutputType.size(); ++i) {
-        long outputTile;
-
-        // Given an output tile index, find the corresponding output tile.
-        for (auto outputTileIndex : outputTileIndices) {
-          if (outputTileIndex.second == i) {
-            outputTile = outputTileIndex.first;
-            break;
-          }
-        }
-
-        // Get that tile's partial result and add it to the list.
-        applyFiltersResults.push_back(localPartialResults[outputTile]);
-      }
-
-      // Create the yield operation with the given results.
-      rewriter.create<spatial::SpatYieldOp>(conv.getLoc(), applyFiltersResults);
-
-      // Update the global partial results map.
-      for (size_t i = 0; i < applyFiltersResults.size(); ++i) {
-        long outputTile;
-
-        // Given an output tile index, find the corresponding output tile.
-        for (auto outputTileIndex : outputTileIndices) {
-          if (outputTileIndex.second == i) {
-            outputTile = outputTileIndex.first;
-            break;
-          }
-        }
-
-        globalPartialResults[outputTile] = currentCompute.getResult(i);
-      }
-
-      // Move the rewrite cursor out of the block.
-      rewriter.setInsertionPointAfter(currentCompute);
-    }
-
-    // ------------------------------ //
-    // --- CONCATENATE THE OUTPUT --- //
-    // ------------------------------ //
-
-    // Turn the values into a SmallVector.
-    SmallVector<Value> outputValues;
-    for (long i = 0; i < outputTileCount.quot + (outputTileCount.rem > 0); ++i)
-      outputValues.push_back(globalPartialResults[i]);
-
-    // Assert that the number of output values is correct.
-    assert(outputValues.size() > 0 && "No output values were generated for the convolution.");
-
-    // If the conv's user is a ReLU...
-    if (conv->hasOneUse()) {
-      Operation* user = *conv->getUsers().begin();
-      if (auto relu = dyn_cast<ONNXReluOp>(user)) {
-        // ...then we can just replace the ReLU with the concatenation.
-        rewriter.replaceOp(relu, rewriter.create<tensor::ConcatOp>(conv.getLoc(), 1, outputValues));
-
-        // And erase the convolution.
-        rewriter.eraseOp(conv);
-        return success();
-      }
-    }
-
-    // Return the final output.
-    rewriter.replaceOp(conv, rewriter.create<tensor::ConcatOp>(conv.getLoc(), 1, outputValues));
-
-    return success();
-  }
-};
-
-/**
- * @brief Register the ExperimentalONNXConvOpTile rewrite pattern.
- *
- * @param patterns The pattern set that receives the pattern.
- * @param ctx The MLIR context the pattern is constructed with.
- */
-void populateExperimentalTilingConvOpPattern(RewritePatternSet& patterns, MLIRContext* ctx) {
-  patterns.insert<ExperimentalONNXConvOpTile>(ctx);
-}
-
-} // namespace onnx_mlir
@@ -1,365 +0,0 @@
-#include "mlir/IR/BuiltinAttributes.h"
-#include "mlir/Transforms/DialectConversion.h"
-
-#include <cstdlib>
-
-#include "Compiler/PimCompilerOptions.hpp"
-#include "Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
-#include "Conversion/ONNXToSpatial/ONNXToSpatialPatterns.hpp"
-#include "Conversion/ONNXToSpatial/Utils/WeightSubdivider.hpp"
-#include "src/Dialect/ONNX/ONNXOps.hpp"
-
-using namespace mlir;
-using namespace std;
-
-namespace onnx_mlir {
-
-struct ExperimentalGemmConversionPattern : public OpConversionPattern<ONNXGemmOp> { // Rewrites an ONNXGemmOp into tiled spatial::SpatWeightedCompute ops whose results are concatenated.
-  ExperimentalGemmConversionPattern(MLIRContext* ctx)
-  : OpConversionPattern(ctx) {}
-
-  LogicalResult
-  matchAndRewrite(ONNXGemmOp gemmOp, ONNXGemmOpAdaptor adaptor, ConversionPatternRewriter& rewriter) const final { // The only return is success(); unsupported inputs are caught by asserts only.
-
-    // --------------------------------- //
-    // --- READ OPERATION PARAMETERS --- //
-    // --------------------------------- //
-
-    // To get each crossbar's weights, we need to slice the weights tensor.
-    //   - Along the input tiles.
-    //   - Along the output tiles.
-    //   - Along the filter x position (a single position here: kernelWidth is 1).
-    //   - Along the filter y position (a single position here: kernelHeight is 1).
-    ShapedType inputType = cast<ShapedType>(adaptor.getA().getType());
-    ShapedType outputType = cast<ShapedType>(gemmOp.getY().getType());
-    ShapedType matrixType = cast<ShapedType>(adaptor.getB().getType());
-
-    // TODO: Address bigger batches.
-    assert(inputType.getShape()[0] == 1 && "Only batch size of 1 is supported for GEMM.");
-
-    // TODO: Address replication.
-    assert(coresCount.getValue() == -1 && "Replication is not yet supported for GEMM.");
-
-    // TODO: Address bias addition.
-
-    assert(inputType.getShape()[1] == matrixType.getShape()[0] && "Input tile size must match the matrix's row size.");
-
-    ldiv_t inputTileCount = div(inputType.getShape()[1], crossbarSize);
-    ldiv_t outputTileCount = div(outputType.getShape()[1], crossbarSize);
-    size_t kernelWidth = 1;
-    size_t kernelHeight = 1;
-
-    // Assert that the kernel is square.
-    assert(kernelWidth == kernelHeight && "Only square kernels are supported.");
-
-    // -------------------------------- //
-    // --- SLICE THE WEIGHTS TENSOR --- //
-    // -------------------------------- //
-
-    // The core idea of this stage is classifying the weights by input and
-    // output tile. This is because we want the applyFilters operations to be
-    // tile agnostic, to keep the subsequent lowering stages as simple as
-    // possible. This data structure does this weight classification:
-    //   - The outer map is indexed by input tile.
-    //   - The inner map is indexed by output tile.
-    //   - The SmallVector contains the weights for the filter.
-    map<long, map<long, SmallVector<Value>>> weightsGroups;
-
-    // During all slicing operations within this stage, we'll use the same
-    // strides for all dimensions.
-    SmallVector<OpFoldResult> slicingStrides(2, rewriter.getIndexAttr(1));
-
-    ldiv_t itc = inputTileCount;
-    ldiv_t otc = outputTileCount;
-
-    // - Slicing along the input tiles.
-    // - Slicing along the output tiles.
-    for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
-      long crossbarWidth = it == itc.quot ? itc.rem : crossbarSize;
-      for (long ot = 0; ot < otc.quot + (otc.rem > 0); ++ot) {
-        long crossbarHeight = ot == otc.quot ? otc.rem : crossbarSize;
-
-        // The loop above also sets the crossbar's used width and height,
-        // checking if we're at the last crossbar and if it's incomplete.
-
-        long outputTile = ot;
-        long inputTile = it;
-
-        // Create the slicing sizes.
-        SmallVector<OpFoldResult> slicingSizes {/* 0 */ rewriter.getIndexAttr(crossbarHeight),
-                                                /* 1 */ rewriter.getIndexAttr(crossbarWidth),
-                                                /* 2 */ /* rewriter.getIndexAttr(1), */
-                                                /* 3 */ /* rewriter.getIndexAttr(1) */};
-
-        // - Slicing along the filter x position.
-        // - Slicing along the filter y position.
-        for (size_t filterX = 0; filterX < kernelWidth; ++filterX) {
-          for (size_t filterY = 0; filterY < kernelHeight; ++filterY) {
-
-            // Create the slicing offsets. NOTE(review): dim 0 is offset by the output tile and dim 1 by the input tile, while the assert above ties B's dim 0 to the input width — confirm B's layout.
-            SmallVector<OpFoldResult> slicingOffsets {/* 0 */ rewriter.getIndexAttr(outputTile * crossbarSize),
-                                                      /* 1 */ rewriter.getIndexAttr(inputTile * crossbarSize),
-                                                      /* 2 */ /* rewriter.getIndexAttr(filterX), */
-                                                      /* 3 */ /* rewriter.getIndexAttr(filterY) */};
-
-            // Create the slice extraction operation.
-            auto extractSliceOp = rewriter.create<tensor::ExtractSliceOp>(
-              gemmOp.getLoc(), adaptor.getB(), slicingOffsets, slicingSizes, slicingStrides);
-
-            // Record the slice in the group for this (inputTile, outputTile) pair.
-            weightsGroups[inputTile][outputTile].push_back(extractSliceOp);
-          }
-        }
-      }
-    }
-
-    // TODO: Tree reduction for compute reduction should be implemented.
-
-    // -------------------------------- //
-    // --- CREATE ALL COMPUTE UNITS --- //
-    // -------------------------------- //
-
-    // Keep track of input slicing operations to avoid duplication across
-    // all compute units (global slices).
-    map<long, Value> globalSlices;
-
-    // Keep track of all partial compute results.
-    map<long, Value> globalPartialResults;
-
-    // Use a weight subdivider to extract groups of weights for each compute
-    // unit. We'll keep extracting groups until no more weights are left.
-    WeightSubdivider weightSubdivider(weightsGroups);
-    while (!weightSubdivider.isEmpty()) {
-
-      // -------------------------------- //
-      // --- BEGIN A NEW COMPUTE UNIT --- //
-      // -------------------------------- //
-
-      // Get the next group of weights for the compute unit.
-      SmallVector<TaggedWeights> weightsGroups = weightSubdivider.popGroups(crossbarCountInCore.getValue()); // Shadows the outer weightsGroups map.
-
-      SmallVector<Value> computeWeights;
-      SmallVector<Value> computeOperands;
-
-      // ------------------------------ //
-      // --- SLICE THE INPUT TENSOR --- //
-      // ------------------------------ //
-
-      // Note each tile's index in the compute unit arguments.
-      map<long, size_t> inputTileIndices;
-      map<long, size_t> outputTileIndices;
-      map<long, size_t> reductionTileIndices; // Incoming partial results.
-
-      // Iterate over all weights groups for this compute unit.
-      map<long, Value> localSlices; // WRT the current compute unit.
-      for (auto group : weightsGroups) {
-        for (Value weight : group.weights)
-          computeWeights.push_back(weight);
-
-        // There might be multiple weight groups for the same input tile, so if
-        // we've already added the input tile, skip it.
-        if (localSlices.find(group.inputTile) != localSlices.end())
-          continue;
-
-        // If another compute unit already sliced this input tile, reuse that
-        // slice instead of creating a new one. NOTE(review): this path does not
-        // record inputTileIndices[group.inputTile], which is read later — confirm.
-        if (globalSlices.find(group.inputTile) != globalSlices.end()) {
-          computeOperands.push_back(globalSlices[group.inputTile]);
-          localSlices[group.inputTile] = globalSlices[group.inputTile];
-          continue;
-        }
-
-        // Create the input tensor slicing offsets.
-        SmallVector<OpFoldResult> slicingOffsets {/* 0 */ rewriter.getIndexAttr(0), // No offset along the batch axis.
-                                                  /* 1 */ rewriter.getIndexAttr(group.inputTile * crossbarSize),
-                                                  /* 2 */                           /* rewriter.getIndexAttr(0), */
-                                                  /* 3 */ /* rewriter.getIndexAttr(0) */};
-
-        // Create the input tensor slicing sizes.
-        size_t tilingSize = group.inputTile == inputTileCount.quot ? inputTileCount.rem : crossbarSize;
-        SmallVector<OpFoldResult> slicingSizes {/* 0 */ rewriter.getIndexAttr(1), // Batch size is always 1.
-                                                /* 1 */ rewriter.getIndexAttr(tilingSize),
-                                                /* 2 */ /* rewriter.getIndexAttr(GET_IMAGE_WIDTH(inputType)), */
-                                                /* 3 */ /* rewriter.getIndexAttr(GET_IMAGE_HEIGHT(inputType)) */};
-
-        // Create the slice extraction operation.
-        auto extractSliceOp = rewriter.create<tensor::ExtractSliceOp>(
-          gemmOp.getLoc(), adaptor.getA(), slicingOffsets, slicingSizes, slicingStrides);
-
-        computeOperands.push_back(extractSliceOp);
-
-        // Update slicing maps.
-        globalSlices[group.inputTile] = extractSliceOp;
-        localSlices[group.inputTile] = extractSliceOp;
-
-        // Update the input tile index.
-        inputTileIndices[group.inputTile] = computeOperands.size() - 1;
-      }
-
-      // ------------------------------- //
-      // --- PREPARE THE OUTPUT TYPE --- //
-      // ------------------------------- //
-
-      // Fill the compute output's type by looking at the output tiles.
-      SmallVector<Type> computeOutputType;
-      for (TaggedWeights group : weightsGroups) {
-
-        // There might be multiple weight groups for the same output tile, so if
-        // we've already added the output tile, skip it.
-        if (outputTileIndices.find(group.outputTile) != outputTileIndices.end())
-          continue;
-
-        // Additionally, after adding the input slices as operands, also add any
-        // compatible partial results from previous compute units.
-        if (globalPartialResults.find(group.outputTile) != globalPartialResults.end()) {
-          computeOperands.push_back(globalPartialResults[group.outputTile]);
-          reductionTileIndices[group.outputTile] = computeOperands.size() - 1;
-        }
-
-        // Define the output shape for this group.
-        long outputTileSize = group.outputTile == outputTileCount.quot ? outputTileCount.rem : crossbarSize;
-
-        // TODO: Address non-same padding.
-        SmallVector<int64_t> outputShapeArray {/* 0 */ 1,                                 // Batch size is always 1.
-                                               /* 1 */ outputTileSize,
-                                               /* 2 */ /* GET_IMAGE_WIDTH(outputType), */ // Same padding assumed.
-                                               /* 3 */ /* GET_IMAGE_HEIGHT(outputType) */};
-
-        auto elementType = dyn_cast<RankedTensorType>(gemmOp.getY().getType()).getElementType(); // NOTE(review): the dyn_cast result is used without a null check.
-
-        computeOutputType.push_back(RankedTensorType::get(outputShapeArray, elementType));
-
-        outputTileIndices[group.outputTile] = computeOutputType.size() - 1;
-      }
-
-      // ----------------------------- //
-      // --- FILL THE COMPUTE UNIT --- //
-      // ----------------------------- //
-
-      // Create the compute unit.
-      spatial::SpatWeightedCompute currentCompute = rewriter.create<spatial::SpatWeightedCompute>(
-        gemmOp.getLoc(), computeOutputType, computeWeights, computeOperands);
-
-      // Create a new block for the compute unit and add the operands.
-      Block* block = rewriter.createBlock(&currentCompute.getRegion());
-      rewriter.setInsertionPointToStart(block);
-      for (Value operand : computeOperands)
-        block->addArgument(operand.getType(), gemmOp->getLoc());
-
-      // Initialize a map of local partial results.
-      map<long, Value> localPartialResults; // WRT the current compute unit.
-
-      // If we have any reduction tiles, add them to the local partial results.
-      for (auto reductionTileIndex : reductionTileIndices)
-        localPartialResults[reductionTileIndex.first] = block->getArgument(reductionTileIndex.second);
-
-      // Add all the applyFilters operations to the block.
-      for (TaggedWeights group : weightsGroups) {
-
-        // Get the outputType for this group.
-        Type outputType = computeOutputType[outputTileIndices[group.outputTile]];
-
-        // Create an apply filters operation.
-        BlockArgument blockArgument = block->getArgument(inputTileIndices[group.inputTile]);
-
-        // The list of weight indices is group.startingCrossbarIndex + 0, 1, 2,
-        // ... As many weights as the size of group.weights.
-        SmallVector<long> weightIndices;
-        for (size_t i = 0; i < group.weights.size(); ++i)
-          weightIndices.push_back(group.startingCrossbarIndex + i);
-
-        SmallVector<int64_t> xKerPos;
-        SmallVector<int64_t> yKerPos;
-        for (auto weight : group.weights) {
-          // Assert that the weight is an extract_slice operation.
-          auto extractSliceOp = weight.getDefiningOp<tensor::ExtractSliceOp>();
-          assert(extractSliceOp && "Weight is not an extract_slice operation.");
-
-          // The kernel is 1x1 here, so both kernel coordinates are recorded as 0.
-          xKerPos.push_back(0);
-          yKerPos.push_back(0);
-        }
-
-        ArrayAttr weightIndicesAttr = rewriter.getI64ArrayAttr(weightIndices);
-        ArrayAttr xKerPosAttr = rewriter.getI64ArrayAttr(xKerPos);
-        ArrayAttr yKerPosAttr = rewriter.getI64ArrayAttr(yKerPos);
-
-        Value result = rewriter.create<spatial::SpatApplyFiltersOp>(
-          gemmOp.getLoc(), outputType, weightIndicesAttr, xKerPosAttr, yKerPosAttr, blockArgument);
-
-        // Perform local reduction if necessary.
-        if (localPartialResults.find(group.outputTile) != localPartialResults.end()) {
-
-          result = rewriter.create<spatial::SpatVAddOp>(
-            gemmOp.getLoc(), result.getType(), localPartialResults[group.outputTile], result);
-        }
-
-        // Update the partial results map.
-        localPartialResults[group.outputTile] = result;
-      }
-
-      // Add a yield operation to the block by concatenating the partial
-      // results.
-      SmallVector<Value> applyFiltersResults;
-      for (size_t i = 0; i < computeOutputType.size(); ++i) {
-        long outputTile; // Stays uninitialized if no entry of outputTileIndices maps to i.
-
-        // Given an output tile index, find the corresponding output tile.
-        for (auto outputTileIndex : outputTileIndices) {
-          if (outputTileIndex.second == i) {
-            outputTile = outputTileIndex.first;
-            break;
-          }
-        }
-
-        // Get that tile's partial result and add it to the list.
-        applyFiltersResults.push_back(localPartialResults[outputTile]);
-      }
-
-      // Create the yield operation with the given results.
-      rewriter.create<spatial::SpatYieldOp>(gemmOp.getLoc(), applyFiltersResults);
-
-      // Update the global partial results map.
-      for (size_t i = 0; i < applyFiltersResults.size(); ++i) {
-        long outputTile; // Stays uninitialized if no entry of outputTileIndices maps to i.
-
-        // Given an output tile index, find the corresponding output tile.
-        for (auto outputTileIndex : outputTileIndices) {
-          if (outputTileIndex.second == i) {
-            outputTile = outputTileIndex.first;
-            break;
-          }
-        }
-
-        globalPartialResults[outputTile] = currentCompute.getResult(i);
-      }
-
-      // Move the rewrite cursor out of the block.
-      rewriter.setInsertionPointAfter(currentCompute);
-    }
-
-    // ------------------------------ //
-    // --- CONCATENATE THE OUTPUT --- //
-    // ------------------------------ //
-
-    // Turn the values into a SmallVector.
-    SmallVector<Value> outputValues;
-    for (long i = 0; i < outputTileCount.quot + (outputTileCount.rem > 0); ++i)
-      outputValues.push_back(globalPartialResults[i]);
-
-    // Assert that the number of output values is correct.
-    assert(outputValues.size() > 0 && "No output values were generated for the GEMM operation.");
-
-    // Return the final output.
-    rewriter.replaceOp(gemmOp, rewriter.create<tensor::ConcatOp>(gemmOp.getLoc(), 1, outputValues));
-
-    return success();
-  }
-};
-
-void populateGemmToConvConversionPattern(RewritePatternSet& patterns, MLIRContext* ctx) { // Registers ExperimentalGemmConversionPattern in the given pattern set.
-  patterns.insert<ExperimentalGemmConversionPattern>(ctx);
-}
-
-} // namespace onnx_mlir
@@ -10,7 +10,6 @@

 #include <cassert>

-#include "Gemm.hpp"
 #include "src/Accelerators/PIM/Common/PIMCommon.hpp"
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp"
@@ -20,6 +19,38 @@
 using namespace mlir;

 namespace onnx_mlir {
+namespace {
+
+constexpr StringRef COMPUTE_HAS_SOFTMAX_DIVISOR_ATTRNAME = "computeWithSoftmaxDivisor"; // File-local via the enclosing anonymous namespace; presumably used as an attribute name — confirm at use sites.
+
+struct GemmToManyGemv : OpConversionPattern<ONNXGemmOp> { // Conversion pattern on ONNXGemmOp; matchAndRewrite is defined out of line.
+  using OpConversionPattern::OpConversionPattern; // NOTE(review): inherits the base constructors, so no explicit benefit is passed here; the declaration removed from Gemm.hpp passed 2 — confirm the priority change is intended.
+
+  LogicalResult matchAndRewrite(ONNXGemmOp gemmOp,
+                                ONNXGemmOpAdaptor gemmOpAdaptor,
+                                ConversionPatternRewriter& rewriter) const override;
+};
+
+struct GemvToSpatialCompute : OpConversionPattern<ONNXGemmOp> { // Conversion pattern on ONNXGemmOp; matchAndRewrite is defined out of line.
+  GemvToSpatialCompute(MLIRContext* ctx)
+  : OpConversionPattern(ctx, 1) {} // Constructed with pattern benefit 1.
+
+  LogicalResult matchAndRewrite(ONNXGemmOp gemmOp,
+                                ONNXGemmOpAdaptor gemmOpAdaptor,
+                                ConversionPatternRewriter& rewriter) const override;
+
+private:
+  static Value resolveONNXExpOpFromUseChain(Value startValue); // Per the former header comment: walks the use chain of startValue until an ONNXExpOp is found and returns its value (implementation not visible here — confirm).
+
+  static LogicalResult softmaxReductionApplication(SmallVector<OpAndResNum>& outputOpsAndResNums, // Per the former header comment: reduce-sums the exp'd tiles and divides each tile by that sum, sent back over a broadcast channel (implementation not visible here — confirm).
+                                                   Value& softmaxChannel,
+                                                   ConversionPatternRewriter& rewriter,
+                                                   SpatialReducer& reducer,
+                                                   ONNXGemmOp& gemmOp,
+                                                   Location& loc);
+};
+
+} // namespace

 LogicalResult GemmToManyGemv::matchAndRewrite(ONNXGemmOp gemmOp,
                                              ONNXGemmOpAdaptor gemmOpAdaptor,
@@ -1,54 +0,0 @@
-#pragma once
-
-#include "Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp"
-#include "src/Dialect/ONNX/ONNXOps.hpp"
-
-namespace onnx_mlir {
-
-constexpr mlir::StringRef COMPUTE_HAS_SOFTMAX_DIVISOR_ATTRNAME = "computeWithSoftmaxDivisor"; // Presumably an attribute name (going by the identifier) — confirm at use sites.
-
-struct GemmToManyGemv : mlir::OpConversionPattern<mlir::ONNXGemmOp> { // Conversion pattern on ONNXGemmOp; matchAndRewrite is only declared here.
-  GemmToManyGemv(mlir::MLIRContext* ctx)
-  : OpConversionPattern(ctx, 2) {} // Constructed with pattern benefit 2, higher than the 1 used by GemvToSpatialCompute.
-
-  mlir::LogicalResult matchAndRewrite(mlir::ONNXGemmOp gemmOp,
-                                      mlir::ONNXGemmOpAdaptor gemmOpAdaptor,
-                                      mlir::ConversionPatternRewriter& rewriter) const override;
-};
-
-struct GemvToSpatialCompute : mlir::OpConversionPattern<mlir::ONNXGemmOp> { // Conversion pattern on ONNXGemmOp; matchAndRewrite is only declared here.
-  GemvToSpatialCompute(mlir::MLIRContext* ctx)
-  : OpConversionPattern(ctx, 1) {} // Constructed with pattern benefit 1.
-
-  llvm::LogicalResult matchAndRewrite(mlir::ONNXGemmOp gemmOp,
-                                      mlir::ONNXGemmOpAdaptor gemmOpAdaptor,
-                                      mlir::ConversionPatternRewriter& rewriter) const override;
-
-private:
-  /**
-   * Resolves the ONNXExpOp from the use chain of the given start value.
-   *
-   * This function traverses the use chain of the start value until it finds an
-   * ONNXExpOp. It returns the value of the ONNXExpOp.
-   *
-   * @param startValue The starting value of the use chain.
-   * @return The value of the ONNXExpOp found in the use chain.
-   */
-  static mlir::Value resolveONNXExpOpFromUseChain(mlir::Value startValue);
-
-  // Softmax is a special case, as it requires another reduction after the
-  // first one. In the cores, `applyReducePattern` already applied
-  // f(x) = exp(x) to each tile. This means that now we just need to
-  // reduce-sum these tiles, and then divide each tile by the reduced sum,
-  // which is propagated back to the cores via a broadcast channel.
-  static llvm::LogicalResult softmaxReductionApplication(llvm::SmallVector<OpAndResNum>& outputOpsAndResNums,
-                                                         Value& softmaxChannel, // NOTE(review): Value, ConversionPatternRewriter, ONNXGemmOp and Location are unqualified here, unlike the mlir::-qualified names above — presumably a using-directive is visible via the includes; confirm.
-                                                         ConversionPatternRewriter& rewriter,
-                                                         SpatialReducer& reducer,
-                                                         ONNXGemmOp& gemmOp,
-                                                         Location& loc);
-};
-
-void populateOnnxGemmOpPatterns(RewritePatternSet& patterns, MLIRContext* ctx); // Declaration only. NOTE(review): RewritePatternSet and MLIRContext are unqualified although the structs in this header spell out mlir:: — confirm a using-directive is visible.
-
-} // namespace onnx_mlir
@@ -1,300 +0,0 @@
-#include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/Dialect/Tosa/IR/TosaOps.h"
-#include "mlir/IR/BuiltinAttributes.h"
-#include "mlir/IR/BuiltinTypeInterfaces.h"
-#include "mlir/IR/BuiltinTypes.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/IR/Value.h"
-#include "mlir/IR/ValueRange.h"
-
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstddef>
-
-#include "src/Accelerators/PIM/Common/PIMCommon.hpp"
-#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
-#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
-#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp"
-#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
-#include "src/Dialect/ONNX/ONNXOps.hpp"
-
-using namespace mlir;
-
-namespace onnx_mlir {
-
-template <typename PoolOp>
-bool hasPostProcessExperimentalPoolingWindow() {
-  return false;
-}
-
-template <>
-bool hasPostProcessExperimentalPoolingWindow<ONNXAveragePoolOp>() {
-  return true;
-}
-
-template <typename PoolOp>
-Value postProcessExperimentalPoolingWindow(ConversionPatternRewriter& rewriter,
-                                           Location loc,
-                                           PoolOp poolOp,
-                                           Value valueToDivide,
-                                           size_t krn_size,
-                                           size_t tilesSkippedByPadding) {
-  return nullptr;
-}
-
-template <>
-Value postProcessExperimentalPoolingWindow<ONNXAveragePoolOp>(ConversionPatternRewriter& rewriter,
-                                                              Location loc,
-                                                              ONNXAveragePoolOp poolOp,
-                                                              Value valueToDivide,
-                                                              size_t krn_size,
-                                                              size_t tilesSkippedByPadding) {
-  bool countIncludePad = poolOp.getCountIncludePad() == 1;
-
-  size_t divisorNumber = countIncludePad ? krn_size : krn_size - tilesSkippedByPadding;
-
-  RankedTensorType scalarTensor = RankedTensorType::get({1}, rewriter.getF32Type());
-
-  // Put a spat.const before the computeOp and use its value. We do this to be
-  // compatible with the current code generation, which assumes constants are
-  // loaded in global memory; that memory is allocated by adding a spat.const op
-  // directly under func.func (i.e. alongside the ComputeOps).
-  auto computeOp = cast<spatial::SpatWeightedCompute>(valueToDivide.getDefiningOp()->getParentOp());
-  rewriter.setInsertionPoint(computeOp);
-  auto divisorValue = rewriter.create<spatial::SpatConstantOp>(loc,
-                                                               scalarTensor,
-                                                               rewriter.getI64IntegerAttr(divisorNumber),
-                                                               /* should_allocate = */ rewriter.getBoolAttr(true));
-
-  rewriter.setInsertionPointAfterValue(valueToDivide);
-  return rewriter.create<spatial::SpatVSDivOp>(loc, valueToDivide.getType(), valueToDivide, divisorValue);
-}
-
-template <typename ReductionOp>
-Value reduceInputTiles(SmallVector<Value>& inputTiles, ConversionPatternRewriter& rewriter) {
-  if (inputTiles.size() == 1)
-    return inputTiles[0];
-
-  if (inputTiles.size() == 2) {
-    return rewriter.create<spatial::SpatVMaxOp>(
-      inputTiles[0].getLoc(), inputTiles[0].getType(), inputTiles[0], inputTiles[1]);
-  }
-
-  SmallVector<Value> left(inputTiles.begin(), inputTiles.begin() + inputTiles.size() / 2);
-  SmallVector<Value> right(inputTiles.begin() + inputTiles.size() / 2, inputTiles.end());
-
-  Value leftReduced = reduceInputTiles<ReductionOp>(left, rewriter);
-  Value rightReduced = reduceInputTiles<ReductionOp>(right, rewriter);
-
-  return rewriter.create<ReductionOp>(inputTiles[0].getLoc(), leftReduced.getType(), leftReduced, rightReduced);
-}
-
-template <typename PoolOp, typename PoolOpAdaptor, typename ReduceOp>
-struct ExperimentalPoolingBaseConverter : public OpConversionPattern<PoolOp> {
-  ExperimentalPoolingBaseConverter(MLIRContext* ctx)
-  : OpConversionPattern<PoolOp>(ctx) {}
-
-  LogicalResult matchAndRewrite(PoolOp poolOp, PoolOpAdaptor adaptor, ConversionPatternRewriter& rewriter) const final {
-    Value X = adaptor.getX();
-    ShapedType xShape = mlir::cast<ShapedType>(X.getType());
-    Value Y = poolOp.getResult();
-    ShapedType yShape = mlir::cast<ShapedType>(Y.getType());
-
-    size_t stride_x, stride_y, dilation_x, dilation_y, krn_w, krn_h;
-    unpackOptionalPairVector(adaptor.getStrides(), stride_x, stride_y);
-    unpackOptionalPairVector(adaptor.getDilations(), dilation_x, dilation_y);
-    unpackOptionalPairVector(adaptor.getKernelShape(), krn_w, krn_h);
-
-    if (adaptor.getAutoPad() != "NOTSET")
-      return rewriter.notifyMatchFailure(poolOp, "auto_pad != NOTSET is deprecated.");
-
-    size_t pad_x, pad_y;
-    auto padUnpackError = unpackOptionalPadsVector(adaptor.getPads(), pad_x, pad_y);
-    if (padUnpackError.has_value())
-      return rewriter.notifyMatchFailure(poolOp, padUnpackError.value());
-
-    Location loc = poolOp.getLoc();
-
-    size_t input_h = GET_IMAGE_HEIGHT(xShape);
-    size_t input_w = GET_IMAGE_WIDTH(xShape);
-    size_t output_h = GET_IMAGE_HEIGHT(yShape);
-    size_t output_w = GET_IMAGE_WIDTH(yShape);
-
-    ldiv_t tileCount = std::div(GET_IMAGE_CHANNEL(xShape), crossbarSize);
-
-    // Require that the input is produced by a tensor.ConcatOp.
-    auto concat = X.getDefiningOp<tensor::ConcatOp>();
-    if (!concat)
-      return rewriter.notifyMatchFailure(poolOp, "Expected input to be a tensor.ConcatOp");
-
-    // Create a [channel_tile][x][y] array to store the input tiles.
-    std::map<long, std::map<long, std::map<long, Value>>> inputTiles;
-
-    // For each argument of the tensor.ConcatOp, resolve the input tiles.
-    for (size_t y = 0; y < input_h; ++y) {
-      for (size_t x = 0; x < input_w; ++x) {
-        for (long it = 0; it < tileCount.quot + (tileCount.rem > 0); ++it) {
-          size_t tilingSize = it == tileCount.quot ? tileCount.rem : crossbarSize;
-
-          SmallVector<OpFoldResult> strides(4, rewriter.getIndexAttr(1));
-          SmallVector<OpFoldResult> offsets = {/* 0 */ rewriter.getIndexAttr(0),
-                                               /* 1 */ rewriter.getIndexAttr(0),
-                                               /* 2 */ rewriter.getIndexAttr(x),
-                                               /* 3 */ rewriter.getIndexAttr(y)};
-          SmallVector<OpFoldResult> sizes = {/* 0 */ rewriter.getIndexAttr(1), // Batch size is always 1.
-                                             /* 1 */ rewriter.getIndexAttr(tilingSize),
-                                             /* 2 */ rewriter.getIndexAttr(1),
-                                             /* 3 */ rewriter.getIndexAttr(1)};
-
-          // Get the concat's operand that we want to slice.
-          Value concatInput = concat.getOperand(it);
-          Value slicedTile = rewriter.create<tensor::ExtractSliceOp>(loc, concatInput, offsets, sizes, strides);
-
-          inputTiles[it][x][y] = slicedTile;
-        }
-      }
-    }
-
-    // Prepare the shape of the compute's output.
-    ldiv_t itc = tileCount;
-    SmallVector<Type> outputTileTypes;
-    for (size_t y = 0; y < output_h; ++y) {
-      for (size_t x = 0; x < output_w; ++x) {
-        for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
-          SmallVector<int64_t> outputShapeArray {/* 0 */ 1, // Batch size is always 1.
-                                                            /* 1 */
-                                                 cast<RankedTensorType>(inputTiles[it][0][0].getType()).getShape()[1],
-                                                 /* 2 */ 1,
-                                                 /* 3 */ 1};
-
-          auto elementType = dyn_cast<RankedTensorType>(xShape).getElementType();
-
-          outputTileTypes.push_back(RankedTensorType::get(outputShapeArray, elementType));
-        }
-      }
-    }
-
-    // Create a plain value list of the input tiles.
-    SmallVector<Value> inputTilesList;
-    for (size_t y = 0; y < input_h; ++y) {
-      for (size_t x = 0; x < input_w; ++x)
-        for (long it = 0; it < itc.quot + (itc.rem > 0); ++it)
-          inputTilesList.push_back(inputTiles[it][y][x]);
-    }
-
-    // Create a single compute to calculate the output.
-    auto computeOp =
-      rewriter.create<spatial::SpatWeightedCompute>(loc, outputTileTypes, SmallVector<Value>(), inputTilesList);
-
-    // Create a new block for the compute unit and add the operands.
-    Block* block = rewriter.createBlock(&computeOp.getRegion());
-
-    // Fill the block arguments and keep a reference to them.
-    std::map<size_t, std::map<size_t, std::map<size_t, Value>>> inputTilesArgs;
-    for (size_t y = 0; y < input_h; ++y) {
-      for (size_t x = 0; x < input_w; ++x) {
-        for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
-          auto tileIndex = y * input_w * (itc.quot + (itc.rem > 0)) + x * (itc.quot + (itc.rem > 0)) + it;
-          inputTilesArgs[it][y][x] = block->addArgument(computeOp->getOperand(tileIndex).getType(), loc);
-        }
-      }
-    }
-
-    // Begin writing in the block.
-    rewriter.setInsertionPointToStart(block);
-
-    // Go through all pooling blocks.
-    SmallVector<Value> outputTiles;
-    for (size_t y = 0; y < output_h; ++y) {
-      for (size_t x = 0; x < output_w; ++x) {
-        for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
-          size_t start_x = x * stride_x;
-          size_t start_y = y * stride_y;
-          size_t end_x = std::min(start_x + krn_w, input_w);
-          size_t end_y = std::min(start_y + krn_h, input_h);
-
-          SmallVector<Value> inputTilesToReduce;
-          for (size_t ky = start_y; ky < end_y; ++ky)
-            for (size_t kx = start_x; kx < end_x; ++kx)
-              inputTilesToReduce.push_back(inputTilesArgs[it][ky][kx]);
-
-          auto reduceResult = reduceInputTiles<ReduceOp>(inputTilesToReduce, rewriter);
-
-          // If the reduce op is add, we need to divide the result by the
-          // number of elements in the pooling window.
-          if (hasPostProcessExperimentalPoolingWindow<PoolOp>()) {
-            // Add a spat.const before the computeOp.
-            rewriter.setInsertionPoint(computeOp);
-            auto divisorValue =
-              rewriter.create<spatial::SpatConstantOp>(loc,
-                                                       RankedTensorType::get({1}, rewriter.getF32Type()),
-                                                       rewriter.getI64IntegerAttr(krn_w * krn_h),
-                                                       rewriter.getBoolAttr(true));
-
-            rewriter.setInsertionPointAfter(reduceResult.getDefiningOp());
-            reduceResult =
-              rewriter.create<spatial::SpatVSDivOp>(loc, reduceResult.getType(), reduceResult, divisorValue);
-          }
-          outputTiles.push_back(reduceResult);
-        }
-      }
-    }
-
-    // Create a YieldOp to return the output tiles.
-    rewriter.create<spatial::SpatYieldOp>(loc, outputTiles);
-
-    // Set the rewrite cursor right after the computeOp.
-    rewriter.setInsertionPointAfter(computeOp);
-
-    std::map<size_t, std::map<size_t, std::map<size_t, Value>>> computeOutput;
-    for (size_t y = 0; y < output_h; ++y) {
-      for (size_t x = 0; x < output_w; ++x) {
-        for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
-          auto tileIndex = y * output_w * (itc.quot + (itc.rem > 0)) + x * (itc.quot + (itc.rem > 0)) + it;
-          computeOutput[it][y][x] = computeOp.getResult(tileIndex);
-        }
-      }
-    }
-
-    // We'll now create spat.img.concat ops to concatenate the output tiles.
-    SmallVector<Value> outputTilesList;
-    for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
-      SmallVector<Value> imgConcatTiles;
-      for (size_t y = 0; y < output_h; ++y)
-        for (size_t x = 0; x < output_w; ++x)
-          imgConcatTiles.push_back(computeOutput[it][y][x]);
-
-      size_t tilingSize = it == tileCount.quot ? tileCount.rem : crossbarSize;
-
-      SmallVector<int64_t> outputShapeArray {/* 0 */ 1, // Batch size is always 1.
-                                             /* 1 */ (long) tilingSize,
-                                             /* 2 */ (long) output_w,
-                                             /* 3 */ (long) output_h};
-
-      auto elementType = dyn_cast<RankedTensorType>(xShape).getElementType();
-
-      outputTilesList.push_back(rewriter.create<spatial::SpatImgConcatOp>(
-        loc, RankedTensorType::get(outputShapeArray, elementType), imgConcatTiles));
-    }
-
-    // Create a new tensor.ConcatOp to concatenate the output tiles.
-    Value outputTensor = rewriter.create<tensor::ConcatOp>(loc, 1, outputTilesList);
-
-    rewriter.replaceOp(poolOp, outputTensor);
-
-    return success();
-  }
-};
-
-void populateExperimentalPoolingTilingPattern(RewritePatternSet& patterns, MLIRContext* ctx) {
-  patterns.insert<
-    ExperimentalPoolingBaseConverter<ONNXMaxPoolSingleOutOp, ONNXMaxPoolSingleOutOpAdaptor, spatial::SpatVMaxOp>>(ctx);
-  patterns.insert<ExperimentalPoolingBaseConverter<ONNXAveragePoolOp, ONNXAveragePoolOpAdaptor, spatial::SpatVAddOp>>(
-    ctx);
-}
-
-} // namespace onnx_mlir
@@ -26,8 +26,6 @@ using namespace mlir;

 namespace onnx_mlir {

-llvm::SmallPtrSet<Operation*, 16> oldComputeOpsReplaced;
-
 Value applyReducePatternNew(SmallVector<Value>& valuesToReduce,
                            ConversionPatternRewriter& rewriter,
                            std::function<Value(const Value&, const Value&)> reduce,
@@ -225,12 +223,12 @@ struct PoolingBaseConverter : public OpConversionPattern<PoolOp> {

    Location loc = poolOp.getLoc();

-    size_t input_h = GET_IMAGE_HEIGHT(xShape);
-    size_t input_w = GET_IMAGE_WIDTH(xShape);
-    size_t output_h = GET_IMAGE_HEIGHT(yShape);
-    size_t output_w = GET_IMAGE_WIDTH(yShape);
-    size_t channelTileCount = ceilIntegerDivide(GET_IMAGE_CHANNEL(xShape), crossbarSize.getValue());
-    size_t channelTileRest = GET_IMAGE_CHANNEL(xShape) % crossbarSize;
+    size_t input_h = getImageHeight(xShape);
+    size_t input_w = getImageWidth(xShape);
+    size_t output_h = getImageHeight(yShape);
+    size_t output_w = getImageWidth(yShape);
+    size_t channelTileCount = ceilIntegerDivide(getImageChannel(xShape), crossbarSize.getValue());
+    size_t channelTileRest = getImageChannel(xShape) % crossbarSize;

    // 1: Tile the input tensor
    // Input tiles need to be indexed by:
@@ -13,9 +13,7 @@ def onnxToArithConstantOp : Pat<
  (Arith_ConstantOp $value)
 >;

-//===----------------------------------------------------------------------===//
 // ONNXMatMulOp to ONNXGemmOp patterns
-//===----------------------------------------------------------------------===//

 def matMulAddToGemmPattern : Pat<
  (ONNXAddOp (ONNXMatMulOp:$matmulres $A, $B), $C),
@@ -39,9 +37,7 @@ def matMulToGemmPattern : Pat<
  )
 >;

-//===----------------------------------------------------------------------===//
 // ONNXConvOp + ONNXAddOp to ONNXConvOp pattern
-//===----------------------------------------------------------------------===//

 // This pattern is used to fuse an ONNXConvOp and an ONNXAddOp into a single
 // ONNXConvOp with a bias.
@@ -55,9 +51,7 @@ def convAddToConvWithBiasPatternRight : Pat<
  (ONNXConvOp $x, $w, $add_operand, $auto_pad, $dilations, $group, $kernel_shape, $pad, $strides)
 >;

-//===----------------------------------------------------------------------===//
 // Operation to ignore (i.e. remove)
-//===----------------------------------------------------------------------===//

 def replaceWithOperationOfValue : NativeCodeCall<"$0">;

@@ -180,10 +180,10 @@ void tileImageTensorByChannel(Value imageTensor,
                              ConversionPatternRewriter& rewriter) {
  ShapedType imageShape = mlir::cast<ShapedType>(imageTensor.getType());

-  size_t input_h = GET_IMAGE_HEIGHT(imageShape);
-  size_t input_w = GET_IMAGE_WIDTH(imageShape);
-  size_t tileCount = ceilIntegerDivide(GET_IMAGE_CHANNEL(imageShape), tileSize);
-  size_t tileRest = GET_IMAGE_CHANNEL(imageShape) % tileSize;
+  size_t input_h = getImageHeight(imageShape);
+  size_t input_w = getImageWidth(imageShape);
+  size_t tileCount = ceilIntegerDivide(getImageChannel(imageShape), tileSize);
+  size_t tileRest = getImageChannel(imageShape) % tileSize;

  SmallVector<OpFoldResult> strides(4, rewriter.getIndexAttr(1));
  SmallVector<OpFoldResult> offsets(4, rewriter.getIndexAttr(0));
@@ -9,24 +9,55 @@

 #include "llvm/Support/LogicalResult.h"

+#include <cassert>
+#include <cstddef>
+#include <optional>
+#include <type_traits>
+#include <utility>
+
 #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
 #include "src/Dialect/ONNX/ONNXOps.hpp"

 #define DEFINE_MAP_OP(opname) opname,

-#define GET_IMAGE_WIDTH(shapedType) shapedType.getDimSize(2)
-#define GET_IMAGE_HEIGHT(shapedType) shapedType.getDimSize(3)
-#define GET_IMAGE_CHANNEL(shapedType) shapedType.getDimSize(1)
-#define GET_IMAGE_N(shapedType) shapedType.getDimSize(0)
-#define GET_KERNEL_WIDTH(shapedType) shapedType.getDimSize(2)
-#define GET_KERNEL_HEIGHT(shapedType) shapedType.getDimSize(3)
-#define GET_FILTER_COUNT(shapedType) shapedType.getDimSize(0)
-
-using namespace mlir;
-
 namespace onnx_mlir {

-const StringRef REPLICATION_ATTR_NAME = "replication_factor";
+template <class ShapedType>
+inline auto getImageWidth(const ShapedType& shapedType) {
+  return shapedType.getDimSize(2);
+}
+
+template <class ShapedType>
+inline auto getImageHeight(const ShapedType& shapedType) {
+  return shapedType.getDimSize(3);
+}
+
+template <class ShapedType>
+inline auto getImageChannel(const ShapedType& shapedType) {
+  return shapedType.getDimSize(1);
+}
+
+template <class ShapedType>
+inline auto getImageN(const ShapedType& shapedType) {
+  return shapedType.getDimSize(0);
+}
+
+template <class ShapedType>
+inline auto getKernelWidth(const ShapedType& shapedType) {
+  return shapedType.getDimSize(2);
+}
+
+template <class ShapedType>
+inline auto getKernelHeight(const ShapedType& shapedType) {
+  return shapedType.getDimSize(3);
+}
+
+template <class ShapedType>
+inline auto getFilterCount(const ShapedType& shapedType) {
+  return shapedType.getDimSize(0);
+}
+
+inline constexpr mlir::StringRef REPLICATION_ATTR_NAME = "replication_factor";

 using HSliceId = size_t;
 using CoreId = size_t;
@@ -58,51 +89,64 @@ constexpr std::pair<C, C> ceilIntegerDivideWithRemainder(A a, B b) {
 }

 template <class T>
-bool isVectorShape(const ArrayRef<T> shape) {
+bool isVectorShape(mlir::ArrayRef<T> shape) {
  return shape.size() == 2 && (shape[0] == 1 || shape[1] == 1);
 }

 template <class T>
-bool isMatrixShape(const ArrayRef<T> shape) {
+bool isMatrixShape(mlir::ArrayRef<T> shape) {
  return shape.size() == 2;
 }

 template <class T>
-bool isHVectorShape(const ArrayRef<T> shape) {
+bool isHVectorShape(mlir::ArrayRef<T> shape) {
  return shape.size() == 2 && shape[0] == 1;
 }

 template <class T>
-bool isVVectorShape(const ArrayRef<T> shape) {
+bool isVVectorShape(mlir::ArrayRef<T> shape) {
  return shape.size() == 2 && shape[1] == 1;
 }

 template <class T>
-T getVectorLength(const ArrayRef<T> shape) {
+T getVectorLength(mlir::ArrayRef<T> shape) {
  assert(isVectorShape(shape));
  return shape[0] != 1 ? shape[0] : shape[1];
 }

-inline auto getTensorShape(const Value tensor) { return cast<RankedTensorType>(tensor.getType()).getShape(); }
+inline auto getTensorShape(mlir::Value tensor) {
+  return mlir::cast<mlir::RankedTensorType>(tensor.getType()).getShape();
+}

-SmallVector<Value> sliceTensor(
-  const Value& tensorToSlice, size_t axis, int64_t sliceSize, ConversionPatternRewriter& rewriter, Location loc);
+llvm::SmallVector<mlir::Value> sliceTensor(const mlir::Value& tensorToSlice,
+                                           size_t axis,
+                                           int64_t sliceSize,
+                                           mlir::ConversionPatternRewriter& rewriter,
+                                           mlir::Location loc);

-SmallVector<Value>
-sliceVector(const Value& vectorToSlice, int64_t sliceSize, ConversionPatternRewriter& rewriter, Location loc);
+llvm::SmallVector<mlir::Value> sliceVector(const mlir::Value& vectorToSlice,
+                                           int64_t sliceSize,
+                                           mlir::ConversionPatternRewriter& rewriter,
+                                           mlir::Location loc);

-DenseMap<CoreId, SmallVector<Value>>
-sliceVectorPerCrossbarPerCore(const Value& vectorToSlice, ConversionPatternRewriter& rewriter, Location loc);
+llvm::DenseMap<CoreId, llvm::SmallVector<mlir::Value>> sliceVectorPerCrossbarPerCore(
+  const mlir::Value& vectorToSlice, mlir::ConversionPatternRewriter& rewriter, mlir::Location loc);

-DenseMap<HSliceId, DenseMap<CoreId, SmallVector<Value>>> tileMatrix(
-  Value& matrixToTile, int64_t hSliceSize, int64_t vSliceSize, ConversionPatternRewriter& rewriter, Location& loc);
+llvm::DenseMap<HSliceId, llvm::DenseMap<CoreId, llvm::SmallVector<mlir::Value>>>
+tileMatrix(mlir::Value& matrixToTile,
+           int64_t hSliceSize,
+           int64_t vSliceSize,
+           mlir::ConversionPatternRewriter& rewriter,
+           mlir::Location& loc);

-tensor::SplatOp
-broadcastToVector(Value scalarToBroadcast, int64_t length, ConversionPatternRewriter& rewriter, Location loc);
+mlir::tensor::SplatOp broadcastToVector(mlir::Value scalarToBroadcast,
+                                        int64_t length,
+                                        mlir::ConversionPatternRewriter& rewriter,
+                                        mlir::Location loc);

-Value sumTensors(ArrayRef<Value> tensors, ConversionPatternRewriter& rewriter);
+mlir::Value sumTensors(mlir::ArrayRef<mlir::Value> tensors, mlir::ConversionPatternRewriter& rewriter);

-Value createMapOperation(PatternRewriter& rewriter, MapOperations mapOp, const Value& input);
+mlir::Value createMapOperation(mlir::PatternRewriter& rewriter, MapOperations mapOp, const mlir::Value& input);

 /**
 * Unpacks an optional pair vector into two size_t values.
@@ -126,7 +170,8 @@ void unpackOptionalPairVector(std::optional<mlir::ArrayAttr> valuesArray, size_t
 *
 * @return std::optional<llvm::Twine> The error message if the pads are invalid
 */
-std::optional<Twine> unpackOptionalPadsVector(std::optional<mlir::ArrayAttr> valuesArray, size_t& pad_x, size_t& pad_y);
+std::optional<llvm::Twine>
+unpackOptionalPadsVector(std::optional<mlir::ArrayAttr> valuesArray, size_t& pad_x, size_t& pad_y);

 /**
 * Tiles the image tensor by channel.
@@ -140,10 +185,10 @@ std::optional<Twine> unpackOptionalPadsVector(std::optional<mlir::ArrayAttr> val
 * @param tileSize The size of each tile.
 * @param rewriter The ConversionPatternRewriter used for creating operations.
 */
-void tileImageTensorByChannel(Value imageTensor,
-                              SmallVector<SmallVector<SmallVector<Value>>>& tiles,
+void tileImageTensorByChannel(mlir::Value imageTensor,
+                              llvm::SmallVector<llvm::SmallVector<llvm::SmallVector<mlir::Value>>>& tiles,
                              size_t tileSize,
-                              ConversionPatternRewriter& rewriter);
+                              mlir::ConversionPatternRewriter& rewriter);

 /**
 * Creates an ImgConcatOp based on the given tiles.
@@ -159,10 +204,10 @@ void tileImageTensorByChannel(Value imageTensor,
 *
 * @return The created ImgConcatOp.
 */
-Value createImgConcatOp(SmallVector<SmallVector<SmallVector<Value>>>& outputTiles,
-                        ConversionPatternRewriter& rewriter,
-                        Location& loc,
-                        Type outputType);
+mlir::Value createImgConcatOp(llvm::SmallVector<llvm::SmallVector<llvm::SmallVector<mlir::Value>>>& outputTiles,
+                              mlir::ConversionPatternRewriter& rewriter,
+                              mlir::Location& loc,
+                              mlir::Type outputType);

 /**
 * @brief Verifies if the given input coordinates and padding values are within
@@ -177,7 +222,7 @@ Value createImgConcatOp(SmallVector<SmallVector<SmallVector<Value>>>& outputTile
 * @return LogicalResult Returns success if the coordinates and padding are
 * within bounds, failure otherwise.
 */
-LogicalResult
+mlir::LogicalResult
 verifyWithinBoundsAndPaddings(size_t input_w, size_t input_h, int inX, int inY, size_t pad_x, size_t pad_y);

 /**
@@ -207,13 +252,14 @@ verifyWithinBoundsAndPaddings(size_t input_w, size_t input_h, int inX, int inY,
 * @return std::optional<llvm::Twine> An error message if the input tensor could
 * not be resolved into tiles.
 */
-std::optional<Twine> resolveImgInputTiles(Value wholeInputTensor,
-                                          SmallVector<SmallVector<SmallVector<Value>>>& inputTiles,
-                                          size_t channelTileCount,
-                                          size_t channelTileRest,
-                                          size_t input_w,
-                                          size_t input_h,
-                                          mlir::ConversionPatternRewriter& rewriter);
+std::optional<llvm::Twine>
+resolveImgInputTiles(mlir::Value wholeInputTensor,
+                     llvm::SmallVector<llvm::SmallVector<llvm::SmallVector<mlir::Value>>>& inputTiles,
+                     size_t channelTileCount,
+                     size_t channelTileRest,
+                     size_t input_w,
+                     size_t input_h,
+                     mlir::ConversionPatternRewriter& rewriter);

 /**
 * Computes the boundaries of an image kernel application.
@@ -258,6 +304,6 @@ void incrementWeightedComputeInputsSegmentSize(spatial::SpatWeightedCompute wcom
 * @return The index of the result of the operation that produces the specified
 * value.
 */
-int getResultIndex(Operation* op, Value v);
+int getResultIndex(mlir::Operation* op, mlir::Value v);

 }; // namespace onnx_mlir
@@ -1,3 +1,4 @@
+#include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Tosa/IR/TosaOps.h"
 #include "mlir/Pass/Pass.h"
@@ -10,19 +11,39 @@

 #include "Common/PIMCommon.hpp"
 #include "Conversion/ONNXToSpatial/Utils/AnnotateReplication.hpp"
-#include "Math/Conv.hpp"
-#include "ONNXToSpatialPass.hpp"
 #include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPatterns.hpp"
-#include "src/Accelerators/PIM/Dialect/PIM/PimOps.hpp"
+#include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp"
 #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
+#include "src/Accelerators/PIM/Pass/PimPasses.hpp"
 #include "src/Compiler/CompilerOptions.hpp"
+#include "src/Dialect/ONNX/ONNXOps.hpp"

 using namespace mlir;

 namespace onnx_mlir {

-namespace spatial {
+bool haveSameStaticShape(Value lhs, Value rhs);
+
+namespace {
+
+#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatial.hpp.inc"
+
+struct ONNXToSpatialPass : PassWrapper<ONNXToSpatialPass, OperationPass<ModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ONNXToSpatialPass)
+  StringRef getArgument() const override { return "convert-onnx-to-spatial"; }
+  StringRef getDescription() const override { return "Lower ONNX ops to Spatial ops."; }
+
+  ONNXToSpatialPass() = default;
+  ONNXToSpatialPass(const ONNXToSpatialPass& pass) {}
+
+  void runOnOperation() override;
+
+private:
+  void annotateWeightsConstants(func::FuncOp funcOp) const;
+};
+
+} // namespace

 void ONNXToSpatialPass::runOnOperation() {
  ModuleOp moduleOp = getOperation();
@@ -40,15 +61,19 @@ void ONNXToSpatialPass::runOnOperation() {
    llvm::dbgs() << "Failed to merge activation patterns, continuing...\n";

  IRRewriter rewriter(moduleOp);
-  func::FuncOp funcOp = *moduleOp.getOps<func::FuncOp>().begin();
-  if (annotateReplication(funcOp, rewriter).failed()) {
+  auto entryFunc = getPimEntryFunc(moduleOp);
+  if (failed(entryFunc)) {
+    signalPassFailure();
+    return;
+  }
+  if (annotateReplication(*entryFunc, rewriter).failed()) {
    llvm::dbgs() << "Failed during annotation for replication analysis\n";
    signalPassFailure();
    return;
  }

  ConversionTarget target(*ctx);
-  target.addLegalDialect<ONNXDialect, SpatialDialect, tensor::TensorDialect, arith::ArithDialect, tosa::TosaDialect>();
+  target.addLegalDialect<spatial::SpatialDialect, ONNXDialect, tensor::TensorDialect, arith::ArithDialect>();
  target.addIllegalOp<ONNXMatMulOp>();
  target.addIllegalOp<ONNXGemmOp>();
  target.addIllegalOp<ONNXConvOp>();
@@ -62,16 +87,9 @@ void ONNXToSpatialPass::runOnOperation() {
  RewritePatternSet patterns(ctx);
  patterns.add<removeLRNPattern>(ctx);

-  if (useExperimentalConvImpl) {
-    populateExperimentalTilingConvOpPattern(patterns, ctx);
-    populateExperimentalPoolingTilingPattern(patterns, ctx);
-    populateGemmToConvConversionPattern(patterns, ctx);
-  }
-  else {
-    populateTilingConvOpPattern(patterns, ctx);
-    populatePoolingTilingPattern(patterns, ctx);
-    populateOnnxGemmOpPatterns(patterns, ctx);
-  }
+  populateConvOpPatterns(patterns, ctx);
+  populatePoolingTilingPattern(patterns, ctx);
+  populateOnnxGemmOpPatterns(patterns, ctx);

  populateONNXConcatToTensorConcatPattern(patterns, ctx);
  populateReduceMeanConversionPattern(patterns, ctx);
@@ -84,8 +102,8 @@ void ONNXToSpatialPass::runOnOperation() {
  // Count the number of compute ops and check they do not exceed the core count
  if (coresCount != -1) {
    int computeOpsCount = 0;
-    for (auto& op : funcOp.getFunctionBody().front().getOperations())
-      if (isa<SpatWeightedCompute>(op))
+    for (auto& op : entryFunc->getFunctionBody().front().getOperations())
+      if (isa<spatial::SpatWeightedCompute>(op))
        computeOpsCount++;

    if (computeOpsCount > coresCount) {
@@ -102,22 +120,21 @@ void ONNXToSpatialPass::runOnOperation() {
  if (failed(applyPatternsGreedily(moduleOp, std::move(removeUnusedHelperOpsPatterns))))
    llvm::dbgs() << "Failed to remove unused helper ops, continuing...\n";

-  annotateWeightsConstants(funcOp);
+  annotateWeightsConstants(*entryFunc);

  // Dump to file for debug
  dumpModule(moduleOp, "spatial");
 }

 void ONNXToSpatialPass::annotateWeightsConstants(func::FuncOp funcOp) const {
-  MLIRContext* ctx = funcOp.getContext();
  funcOp.walk([&](arith::ConstantOp constantOp) {
    bool isAlwaysWeight =
-      llvm::all_of(constantOp->getUsers(), [](auto user) -> bool { return isa<SpatWeightedCompute>(user); });
+      llvm::all_of(constantOp->getUsers(), [](auto user) -> bool { return isa<spatial::SpatWeightedCompute>(user); });
    if (isAlwaysWeight)
-      constantOp->setAttr("weightAlways", UnitAttr::get(ctx));
+      markWeightAlways(constantOp);
  });
 }

-} // namespace spatial
+std::unique_ptr<Pass> createONNXToSpatialPass() { return std::make_unique<ONNXToSpatialPass>(); }

 } // namespace onnx_mlir
@@ -1,34 +0,0 @@
-#pragma once
-
-#include "mlir/Pass/Pass.h"
-
-#include "src/Dialect/ONNX/ONNXOps.hpp"
-
-namespace onnx_mlir {
-
-using namespace mlir;
-extern bool haveSameStaticShape(Value lhs, Value rhs);
-
-namespace spatial {
-
-#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatial.hpp.inc"
-
-struct ONNXToSpatialPass : PassWrapper<ONNXToSpatialPass, OperationPass<ModuleOp>> {
-  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ONNXToSpatialPass)
-  StringRef getArgument() const override { return "convert-onnx-to-spatial"; }
-  StringRef getDescription() const override { return "Lower ONNX ops to Spatial ops."; }
-
-  ONNXToSpatialPass() = default;
-  ONNXToSpatialPass(const ONNXToSpatialPass& pass) {}
-
-  void runOnOperation() override;
-
-private:
-  void annotateWeightsConstants(func::FuncOp funcOp) const;
-};
-
-} // namespace spatial
-
-std::unique_ptr<Pass> createONNXToSpatialPass() { return std::make_unique<spatial::ONNXToSpatialPass>(); }
-
-} // namespace onnx_mlir
@@ -1,27 +1,20 @@
 #pragma once
-#include "mlir/IR/PatternMatch.h"
+
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/Transforms/DialectConversion.h"

 namespace onnx_mlir {

-void populateLoweringONNXMatMulOpToSpatialPattern(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);
+void populateConvOpPatterns(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);

 void populateOnnxGemmOpPatterns(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);

 void populatePoolingTilingPattern(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);

-void populateDistributeReducePattern(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);
-
-void populateFoldComputePattern(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);
-
 void populateONNXConcatToTensorConcatPattern(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);

 void populateRemoveUnusedHelperOpsPatterns(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);

 void populateReduceMeanConversionPattern(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);

-// Experimental patterns.
-void populateExperimentalTilingConvOpPattern(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);
-void populateGemmToConvConversionPattern(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);
-void populateExperimentalPoolingTilingPattern(mlir::RewritePatternSet& patterns, mlir::MLIRContext* ctx);
-
 } // namespace onnx_mlir
@@ -10,7 +10,7 @@ using namespace mlir;
 namespace onnx_mlir {

 template <typename OpTy, typename OpAdaptorTy>
-struct RemoveUnusedHelperOps : public OpRewritePattern<OpTy> {
+struct RemoveUnusedHelperOps : OpRewritePattern<OpTy> {
  RemoveUnusedHelperOps(MLIRContext* ctx)
  : OpRewritePattern<OpTy>(ctx) {}

@@ -49,11 +49,11 @@ LogicalResult annotateReplication(mlir::func::FuncOp funcOp, mlir::IRRewriter& r
      ShapedType xShape = mlir::cast<ShapedType>(X.getType());
      ShapedType wShape = mlir::cast<ShapedType>(W.getType());

-      size_t input_w = GET_IMAGE_WIDTH(xShape);
-      size_t krn_h = GET_KERNEL_HEIGHT(wShape);
-      size_t krn_w = GET_KERNEL_WIDTH(wShape);
+      size_t input_w = getImageWidth(xShape);
+      size_t krn_h = getKernelHeight(wShape);
+      size_t krn_w = getKernelWidth(wShape);

-      size_t inputTileCount = ceilIntegerDivide(GET_IMAGE_CHANNEL(xShape), crossbarSize.getValue());
+      size_t inputTileCount = ceilIntegerDivide(getImageChannel(xShape), crossbarSize.getValue());
      size_t outputTileCount = ceilIntegerDivide(wShape.getDimSize(0), crossbarSize.getValue());

      auto neededXbars = krn_h * krn_w * inputTileCount * outputTileCount;
@@ -15,21 +15,21 @@

 namespace onnx_mlir {

-llvm::SmallPtrSet<Operation*, 16> onnx_mlir::SpatialReducer::oldComputeOpsReplaced;
+llvm::SmallPtrSet<mlir::Operation*, 16> onnx_mlir::SpatialReducer::oldComputeOpsReplaced;

 ResNum SpatialReducer::applyResultProcessing(ComputeAndResNum computeOpAndResNum,
-                                             std::function<Value(const Value&)> processFun,
-                                             ConversionPatternRewriter& rewriter) {
+                                             std::function<mlir::Value(const mlir::Value&)> processFun,
+                                             mlir::ConversionPatternRewriter& rewriter) {
  assert(processFun);

  auto computeOp = GET_COMP(computeOpAndResNum);
  auto resultNum = GET_RES_NUM(computeOpAndResNum);

-  spatial::SpatYieldOp yieldOp = cast<spatial::SpatYieldOp>(computeOp.getBody().front().getTerminator());
+  spatial::SpatYieldOp yieldOp = mlir::cast<spatial::SpatYieldOp>(computeOp.getBody().front().getTerminator());

-  Value result = yieldOp->getOperand(resultNum);
+  mlir::Value result = yieldOp->getOperand(resultNum);
  rewriter.setInsertionPointAfterValue(result);
-  Value processedResult = processFun(result);
+  mlir::Value processedResult = processFun(result);
  if (processedResult == result) {
    // Sometimes we want processedResult to return the same value but do
    // something else with it (e.g. in softmax we want to broadcast the value
@@ -42,10 +42,11 @@ ResNum SpatialReducer::applyResultProcessing(ComputeAndResNum computeOpAndResNum
  return yieldOp.getNumOperands() - 1;
 }

-OpAndResNum SpatialReducer::applyReducePattern(SmallVector<ComputeAndResNum>& computeOpsAndResNum,
-                                               std::function<Value(const Value&, const Value&)> reduce,
-                                               std::function<Value(const Value&)> preprocess,
-                                               std::function<Value(const Value&)> postprocess) {
+OpAndResNum
+SpatialReducer::applyReducePattern(llvm::SmallVector<ComputeAndResNum>& computeOpsAndResNum,
+                                   std::function<mlir::Value(const mlir::Value&, const mlir::Value&)> reduce,
+                                   std::function<mlir::Value(const mlir::Value&)> preprocess,
+                                   std::function<mlir::Value(const mlir::Value&)> postprocess) {

  if (preprocess)
    for (auto& computeOpAndResNum : computeOpsAndResNum)
@@ -55,18 +56,18 @@ OpAndResNum SpatialReducer::applyReducePattern(SmallVector<ComputeAndResNum>& co
  // computeOp. In this case, we need to apply the reduction within-compute

  // Keep a map between a computeOp and the last Value for this reduction
-  std::unordered_map<Operation*, Value> lastValueForCompute;
+  std::unordered_map<mlir::Operation*, mlir::Value> lastValueForCompute;
  for (auto& computeOpAndResNum : computeOpsAndResNum) {
    auto computeOp = GET_COMP(computeOpAndResNum);
-    auto yieldOp = cast<spatial::SpatYieldOp>(computeOp.getBody().front().getTerminator());
-    Value valueWithinCompute = yieldOp->getOperand(GET_RES_NUM(computeOpAndResNum));
+    auto yieldOp = mlir::cast<spatial::SpatYieldOp>(computeOp.getBody().front().getTerminator());
+    mlir::Value valueWithinCompute = yieldOp->getOperand(GET_RES_NUM(computeOpAndResNum));

    auto it = lastValueForCompute.find(computeOp.getOperation());

    if (it != lastValueForCompute.end()) {
      // If we have already seen this computeOp, apply the reduction
      // within-compute
-      Value lastWithinComputeValue = it->second;
+      mlir::Value lastWithinComputeValue = it->second;

      assert(valueWithinCompute.getDefiningOp() && lastWithinComputeValue.getDefiningOp());

@@ -85,12 +86,12 @@ OpAndResNum SpatialReducer::applyReducePattern(SmallVector<ComputeAndResNum>& co
  computeOpsAndResNum.clear();
  computeOpsAndResNum.reserve(lastValueForCompute.size());
  for (auto& entry : lastValueForCompute) {
-    auto computeOp = cast<spatial::SpatWeightedCompute>(entry.first);
+    auto computeOp = mlir::cast<spatial::SpatWeightedCompute>(entry.first);
    auto valueWithinCompute = entry.second;

    // We check if `valueWithinCompute` is already used by the yieldOp, in that
    // case no need to add it
-    auto yieldOp = cast<spatial::SpatYieldOp>(computeOp.getBody().front().getTerminator());
+    auto yieldOp = mlir::cast<spatial::SpatYieldOp>(computeOp.getBody().front().getTerminator());
    bool yieldOpUseFound = false;
    for (auto& use : valueWithinCompute.getUses()) {
      if (use.getOwner() == yieldOp.getOperation()) {
@@ -110,7 +111,7 @@ OpAndResNum SpatialReducer::applyReducePattern(SmallVector<ComputeAndResNum>& co
    computeOpsAndResNum.push_back({computeOp, resultNum});
  }

-  Location loc = GET_COMP(computeOpsAndResNum[0])->getLoc();
+  mlir::Location loc = GET_COMP(computeOpsAndResNum[0])->getLoc();

  // Recursive algorithm to reduce the inputs to a single one:
  // - Take two inputs at a time, and reduce them into a single one, updating
@@ -118,7 +119,7 @@ OpAndResNum SpatialReducer::applyReducePattern(SmallVector<ComputeAndResNum>& co
  // - Repeat until there is only one input left.
  llvm::OwningArrayRef<ComputeAndResNum> computeOpsRef(computeOpsAndResNum);
  while (computeOpsRef.size() > 1) {
-    SmallVector<ComputeAndResNum> nextComputeOps;
+    llvm::SmallVector<ComputeAndResNum> nextComputeOps;
    nextComputeOps.reserve(computeOpsRef.size() / 2);
    for (size_t i = 0; i < computeOpsRef.size() - 1; i += 2) {
      auto [firstCompute, firstResultNum] = computeOpsRef[i];
@@ -135,23 +136,23 @@ OpAndResNum SpatialReducer::applyReducePattern(SmallVector<ComputeAndResNum>& co
      // the number of results)
      // See below `reducerChanges.push_back` and `finalizeReduceUpdates`

-      auto yieldOpFirstCompute = cast<spatial::SpatYieldOp>(firstCompute.getBody().front().getTerminator());
+      auto yieldOpFirstCompute = mlir::cast<spatial::SpatYieldOp>(firstCompute.getBody().front().getTerminator());

      // Add a new operand to the block of the second computeOp
-      Block& secondBlock = secondCompute.getBody().front();
-      Value formerRes1 = secondBlock.addArgument(yieldOpFirstCompute->getOperand(firstResultNum).getType(), loc);
+      mlir::Block& secondBlock = secondCompute.getBody().front();
+      mlir::Value formerRes1 = secondBlock.addArgument(yieldOpFirstCompute->getOperand(firstResultNum).getType(), loc);

      auto secondComputeWeightsNum =
-        secondCompute->getAttrOfType<DenseI32ArrayAttr>(secondCompute.getOperandSegmentSizesAttrName())[0];
+        secondCompute->getAttrOfType<mlir::DenseI32ArrayAttr>(secondCompute.getOperandSegmentSizesAttrName())[0];
      auto secondComputeOperandNum = secondComputeWeightsNum + secondBlock.getNumArguments() - 1;

      // Take the "former-result" from the second computeOp
-      spatial::SpatYieldOp secondYield = cast<spatial::SpatYieldOp>(secondBlock.getTerminator());
-      Value formerRes2 = secondYield.getOperand(secondResultNum);
+      spatial::SpatYieldOp secondYield = mlir::cast<spatial::SpatYieldOp>(secondBlock.getTerminator());
+      mlir::Value formerRes2 = secondYield.getOperand(secondResultNum);

      // Apply reduction operation
      rewriter.setInsertionPoint(secondYield);
-      Value reduced = reduce(formerRes2, formerRes1);
+      mlir::Value reduced = reduce(formerRes2, formerRes1);

      // Unfortunately, it is not possible to update the result in place,
      // because we may have already referenced it by <computeOp, resultNum>
@@ -219,7 +220,7 @@ void SpatialReducer::finalizeReduceUpdates() {
    // `opToReplacedCompute`
    auto toComputeOp = opToReplacedCompute[toOp];
    if (!toComputeOp)
-      toComputeOp = cast<spatial::SpatWeightedCompute>(toOp);
+      toComputeOp = mlir::cast<spatial::SpatWeightedCompute>(toOp);

    assert(toComputeOp != fromComputeOp && "Oops should have caught this earlier!");

@@ -234,31 +235,31 @@ void SpatialReducer::finalizeReduceUpdates() {
  }
 }

-Value SpatialReducer::resolveValueFromOpAndResNum(OpAndResNum& opAndResNum) {
+mlir::Value SpatialReducer::resolveValueFromOpAndResNum(OpAndResNum& opAndResNum) {
  assert(reducesFinalized && "Cannot create resolve values before finalizing the reduce updates.");

-  Operation* opToCast;
+  mlir::Operation* opToCast;
  auto it = opToReplacedCompute.find(opAndResNum.first);
  if (it != opToReplacedCompute.end())
    opToCast = it->second;
  else
    opToCast = opAndResNum.first;

-  auto computeOp = cast<spatial::SpatWeightedCompute>(opToCast);
+  auto computeOp = mlir::cast<spatial::SpatWeightedCompute>(opToCast);

  return computeOp.getResult(opAndResNum.second);
 }

-void SpatialReducer::updateResultsOfCompute(Operation* computeOp) {
+void SpatialReducer::updateResultsOfCompute(mlir::Operation* computeOp) {
  if (opToReplacedCompute.find(computeOp) != opToReplacedCompute.end()) {
    // If we have already replaced the fromOp, we do not need to do it again
    return;
  }
-  auto oldComputeOp = cast<spatial::SpatWeightedCompute>(computeOp);
+  auto oldComputeOp = mlir::cast<spatial::SpatWeightedCompute>(computeOp);

  auto oldComputeOpNum = oldComputeOp->getNumOperands();

-  auto yieldOp = cast<spatial::SpatYieldOp>(oldComputeOp.getBody().front().getTerminator());
+  auto yieldOp = mlir::cast<spatial::SpatYieldOp>(oldComputeOp.getBody().front().getTerminator());

  if (yieldOp.getNumOperands() == oldComputeOp->getNumResults()) {
    // No result was added, just add itself to the map
@@ -283,8 +284,8 @@ void SpatialReducer::updateResultsOfCompute(Operation* computeOp) {
  // Since we replaced the old ComputeOp with a new one, we need to replace
  // all its results' uses
  for (size_t i = 0; i < oldComputeOp.getNumResults(); i++) {
-    Value oldResult = oldComputeOp.getResult(i);
-    Value newResult = newComputeOp.getResult(i);
+    mlir::Value oldResult = oldComputeOp.getResult(i);
+    mlir::Value newResult = newComputeOp.getResult(i);

    // Replace the uses, except the uses of the compute ops which got deleted
    // previously
@@ -298,9 +299,10 @@ void SpatialReducer::updateResultsOfCompute(Operation* computeOp) {
  rewriter.eraseOp(oldComputeOp);
 }

-Value SpatialReducer::createImgConcatOp(SmallVector<SmallVector<SmallVector<OpAndResNum>>>& outputTiles,
-                                        Location& loc,
-                                        Type outputType) {
+mlir::Value
+SpatialReducer::createImgConcatOp(llvm::SmallVector<llvm::SmallVector<llvm::SmallVector<OpAndResNum>>>& outputTiles,
+                                  mlir::Location& loc,
+                                  mlir::Type outputType) {

  assert(reducesFinalized && "Cannot create ImgConcatOp before finalizing the reduce updates.");

@@ -309,8 +311,8 @@ Value SpatialReducer::createImgConcatOp(SmallVector<SmallVector<SmallVector<OpAn
  auto width = outputTiles[0].size();
  auto height = outputTiles[0][0].size();

-  SmallVector<SmallVector<SmallVector<Value>>> remappedOutputTiles(
-    tilesCount, SmallVector<SmallVector<Value>>(width, SmallVector<Value>(height)));
+  llvm::SmallVector<llvm::SmallVector<llvm::SmallVector<mlir::Value>>> remappedOutputTiles(
+    tilesCount, llvm::SmallVector<llvm::SmallVector<mlir::Value>>(width, llvm::SmallVector<mlir::Value>(height)));

  for (size_t t = 0; t < tilesCount; t++)
    for (size_t x = 0; x < width; x++)
@@ -320,16 +322,16 @@ Value SpatialReducer::createImgConcatOp(SmallVector<SmallVector<SmallVector<OpAn
  return ::onnx_mlir::createImgConcatOp(remappedOutputTiles, rewriter, loc, outputType);
 }

-OpAndResNum SpatialReducer::applyAddMapReduction(SmallVector<ComputeAndResNum>& computeOps,
-                                                 ConversionPatternRewriter& rewriter,
-                                                 Value biasTile,
+OpAndResNum SpatialReducer::applyAddMapReduction(llvm::SmallVector<ComputeAndResNum>& computeOps,
+                                                 mlir::ConversionPatternRewriter& rewriter,
+                                                 mlir::Value biasTile,
                                                 MapOperations mapOp) {

-  std::function<Value(const Value&)> postprocessing = nullptr;
+  std::function<mlir::Value(const mlir::Value&)> postprocessing = nullptr;

  if (mapOp != MapOperations::None) {
-    postprocessing = [&](const Value a) {
-      Value mapOperand = a;
+    postprocessing = [&](const mlir::Value a) {
+      mlir::Value mapOperand = a;
      if (biasTile)
        mapOperand = rewriter.create<spatial::SpatVAddOp>(a.getLoc(), a.getType(), a, biasTile);
      return createMapOperation(rewriter, mapOp, mapOperand);
@@ -338,7 +340,7 @@ OpAndResNum SpatialReducer::applyAddMapReduction(SmallVector<ComputeAndResNum>&

  return this->applyReducePattern(
    computeOps,
-    [&](Value a, Value b) { return rewriter.create<spatial::SpatVAddOp>(a.getLoc(), a.getType(), a, b); },
+    [&](mlir::Value a, mlir::Value b) { return rewriter.create<spatial::SpatVAddOp>(a.getLoc(), a.getType(), a, b); },
    /* preprocess = */ nullptr,
    postprocessing);
 }
@@ -3,6 +3,10 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Support/Casting.h"

+#include <functional>
+#include <unordered_map>
+#include <utility>
+
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
 #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"

@@ -13,28 +17,28 @@ using ResNum = unsigned int;
 using ComputeAndResNum = std::pair<spatial::SpatWeightedCompute, ResNum>;

 struct SpatialReducerChange {
-  Operation* fromOp;
+  mlir::Operation* fromOp;
  unsigned int fromOpResNum;
-  Operation* toOp;
+  mlir::Operation* toOp;
  unsigned int toOpOperandNum;
 };

-using OpAndResNum = std::pair<Operation*, ResNum>;
+using OpAndResNum = std::pair<mlir::Operation*, ResNum>;

 class SpatialReducer {

 public:
-  SpatialReducer(ConversionPatternRewriter& rewriter)
+  SpatialReducer(mlir::ConversionPatternRewriter& rewriter)
  : rewriter(rewriter) {}

-  OpAndResNum applyReducePattern(SmallVector<ComputeAndResNum>& computeOpsAndResNum,
-                                 std::function<Value(const Value&, const Value&)> reduce,
-                                 std::function<Value(const Value&)> preprocess,
-                                 std::function<Value(const Value&)> postprocess);
+  OpAndResNum applyReducePattern(llvm::SmallVector<ComputeAndResNum>& computeOpsAndResNum,
+                                 std::function<mlir::Value(const mlir::Value&, const mlir::Value&)> reduce,
+                                 std::function<mlir::Value(const mlir::Value&)> preprocess,
+                                 std::function<mlir::Value(const mlir::Value&)> postprocess);

-  OpAndResNum applyAddMapReduction(SmallVector<ComputeAndResNum>& computeOps,
-                                   ConversionPatternRewriter& rewriter,
-                                   Value biasTile,
+  OpAndResNum applyAddMapReduction(llvm::SmallVector<ComputeAndResNum>& computeOps,
+                                   mlir::ConversionPatternRewriter& rewriter,
+                                   mlir::Value biasTile,
                                   MapOperations mapOp);

  void finalizeReduceUpdates();
@@ -44,17 +48,17 @@ public:
      finalizeReduceUpdates();
  }

-  Value createImgConcatOp(llvm::SmallVector<llvm::SmallVector<llvm::SmallVector<OpAndResNum>>>& outputTiles,
-                          Location& loc,
-                          Type outputType);
+  mlir::Value createImgConcatOp(llvm::SmallVector<llvm::SmallVector<llvm::SmallVector<OpAndResNum>>>& outputTiles,
+                                mlir::Location& loc,
+                                mlir::Type outputType);

-  Value resolveValueFromOpAndResNum(OpAndResNum& opAndResNum);
+  mlir::Value resolveValueFromOpAndResNum(OpAndResNum& opAndResNum);

 private:
  [[nodiscard("computeOp result number gets updated")]] ResNum
  applyResultProcessing(ComputeAndResNum computeOpAndResNum,
-                        std::function<Value(const Value&)> processFun,
-                        ConversionPatternRewriter& rewriter);
+                        std::function<mlir::Value(const mlir::Value&)> processFun,
+                        mlir::ConversionPatternRewriter& rewriter);

  /**
   * @brief Update the results of a ComputeOp.
@@ -66,19 +70,19 @@ private:
   *
   * @param computeOp The ComputeOp to update the results of.
   */
-  void updateResultsOfCompute(Operation* computeOp);
+  void updateResultsOfCompute(mlir::Operation* computeOp);

-  ConversionPatternRewriter& rewriter;
+  mlir::ConversionPatternRewriter& rewriter;
  bool reducesFinalized = false;

  // List of changes to be applied after the reduction is finalized
-  SmallVector<SpatialReducerChange, 4> reducerChanges;
+  llvm::SmallVector<SpatialReducerChange, 4> reducerChanges;
  // List of computeOps that need to be replaced with new results
-  SmallVector<spatial::SpatWeightedCompute> computeOpNeedingResUpdate;
+  llvm::SmallVector<spatial::SpatWeightedCompute> computeOpNeedingResUpdate;

-  std::unordered_map<Operation*, spatial::SpatWeightedCompute> opToReplacedCompute;
+  std::unordered_map<mlir::Operation*, spatial::SpatWeightedCompute> opToReplacedCompute;

-  static llvm::SmallPtrSet<Operation*, 16> oldComputeOpsReplaced;
+  static llvm::SmallPtrSet<mlir::Operation*, 16> oldComputeOpsReplaced;
 };

 } // namespace onnx_mlir
@@ -4,7 +4,7 @@

 namespace onnx_mlir {

-WeightSubdivider::WeightSubdivider(map<long, map<long, SmallVector<Value>>> weights)
+WeightSubdivider::WeightSubdivider(std::map<long, std::map<long, llvm::SmallVector<mlir::Value>>> weights)
 : weights(std::move(weights)) {}

 bool WeightSubdivider::isEmpty() const { return weights.empty(); }
@@ -13,7 +13,7 @@ TaggedWeights WeightSubdivider::popGroup(size_t amount) {
  assert(!weights.empty() && "No weights to extract.");

  auto it = weights.begin();
-  SmallVector<Value>& values = it->second.begin()->second;
+  llvm::SmallVector<mlir::Value>& values = it->second.begin()->second;

  long inputTile = it->first;
  long outputTile = it->second.begin()->first;
@@ -21,7 +21,7 @@ TaggedWeights WeightSubdivider::popGroup(size_t amount) {
  size_t n = std::min(amount, values.size());
  crossbarsUsed += n;

-  SmallVector<Value> result;
+  llvm::SmallVector<mlir::Value> result;
  result.assign(values.begin(), values.begin() + n);

  if (n < values.size()) {
@@ -36,9 +36,9 @@ TaggedWeights WeightSubdivider::popGroup(size_t amount) {
  return {inputTile, outputTile, crossbarsUsed - n, result};
 }

-SmallVector<TaggedWeights> WeightSubdivider::popGroups(size_t n) {
+llvm::SmallVector<TaggedWeights> WeightSubdivider::popGroups(size_t n) {
  crossbarsUsed = 0;
-  SmallVector<TaggedWeights> result;
+  llvm::SmallVector<TaggedWeights> result;
  size_t remaining = n;

  while (remaining > 0 && !weights.empty()) {
@@ -4,11 +4,9 @@

 #include "llvm/ADT/SmallVector.h"

+#include <cstddef>
 #include <map>

-using namespace mlir;
-using namespace std;
-
 namespace onnx_mlir {

 /**
@@ -19,7 +17,7 @@ struct TaggedWeights {
  long inputTile;
  long outputTile;
  size_t startingCrossbarIndex;
-  SmallVector<Value> weights;
+  llvm::SmallVector<mlir::Value> weights;
 };

 /**
@@ -33,16 +31,16 @@ struct TaggedWeights {
 */
 class WeightSubdivider {
 private:
-  map<long, map<long, SmallVector<Value>>> weights;
+  std::map<long, std::map<long, llvm::SmallVector<mlir::Value>>> weights;
  size_t crossbarsUsed = 0;

  TaggedWeights popGroup(size_t amount);

 public:
-  WeightSubdivider(map<long, map<long, SmallVector<Value>>> weights);
+  WeightSubdivider(std::map<long, std::map<long, llvm::SmallVector<mlir::Value>>> weights);

  bool isEmpty() const;
-  SmallVector<TaggedWeights> popGroups(size_t n);
+  llvm::SmallVector<TaggedWeights> popGroups(size_t n);
 };

 } // namespace onnx_mlir