add PIM accelerator

2026-02-24 15:09:18 +01:00
parent b24a0df8d7
commit a6e928bdd7
67 changed files with 9109 additions and 1 deletions
@@ -0,0 +1,327 @@
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Tosa/IR/TosaOps.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypeInterfaces.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Value.h"
+#include "mlir/IR/ValueRange.h"
+#include "src/Accelerators/PIM/Common/PIMCommon.hpp"
+#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
+#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
+#include "src/Dialect/ONNX/ONNXOps.hpp"
+
+#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
+#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+
+using namespace mlir;
+
+namespace onnx_mlir {
+
+/// Tells the experimental pooling converter whether the reduced window of
+/// `PoolOp` needs an additional post-processing step. The primary template
+/// answers "no".
+template <typename PoolOp>
+bool hasPostProcessExperimentalPoolingWindow() {
+  constexpr bool needsPostProcessing = false;
+  return needsPostProcessing;
+}
+
+/// ONNXAveragePoolOp is the one pooling op that asks for post-processing.
+template <>
+bool hasPostProcessExperimentalPoolingWindow<ONNXAveragePoolOp>() {
+  constexpr bool needsPostProcessing = true;
+  return needsPostProcessing;
+}
+
+/// Post-processing hook applied to the reduced value of a pooling window.
+/// The primary template does nothing and returns a null Value.
+template <typename PoolOp>
+Value postProcessExperimentalPoolingWindow(ConversionPatternRewriter &rewriter,
+    Location loc, PoolOp poolOp, Value valueToDivide, size_t krn_size,
+    size_t tilesSkippedByPadding) {
+  return nullptr;
+}
+
+/// AveragePool post-processing: divides `valueToDivide` by the window size.
+///
+/// The divisor is `krn_size` when `count_include_pad` is 1, otherwise
+/// `krn_size - tilesSkippedByPadding`. It is materialised as a spat.const
+/// placed right before the enclosing SpatWeightedCompute, and the returned
+/// value is the result of a spat VSDiv created right after `valueToDivide`.
+///
+/// `valueToDivide` must live inside a SpatWeightedCompute body; it may be
+/// either an operation result or a block argument of that body.
+template <>
+Value postProcessExperimentalPoolingWindow<ONNXAveragePoolOp>(
+    ConversionPatternRewriter &rewriter, Location loc, ONNXAveragePoolOp poolOp,
+    Value valueToDivide, size_t krn_size, size_t tilesSkippedByPadding) {
+  bool countIncludePad = poolOp.getCountIncludePad() == 1;
+
+  size_t divisorNumber =
+      countIncludePad ? krn_size : krn_size - tilesSkippedByPadding;
+
+  RankedTensorType scalarTensor =
+      RankedTensorType::get({1}, rewriter.getF32Type());
+
+  // Put a spat.const before the computeOp, and use its value. We do this to be
+  // compatible with the current code generation, which assumes constant to be
+  // loaded in global memory, which is allocated by adding a spat.const OP
+  // directly under func.func (i.e. alongside ComputeOps)
+  //
+  // The enclosing compute is looked up through the parent block rather than
+  // through getDefiningOp(), which is null for block arguments.
+  auto computeOp = cast<spatial::SpatWeightedCompute>(
+      valueToDivide.getParentBlock()->getParentOp());
+  rewriter.setInsertionPoint(computeOp);
+  auto divisorValue = rewriter.create<spatial::SpatConstantOp>(loc, scalarTensor,
+      rewriter.getI64IntegerAttr(divisorNumber),
+      /* should_allocate = */ rewriter.getBoolAttr(true));
+
+  rewriter.setInsertionPointAfterValue(valueToDivide);
+  return rewriter.create<spatial::SpatVSDivOp>(
+      loc, valueToDivide.getType(), valueToDivide, divisorValue);
+}
+
+/// Reduces `inputTiles` to a single value with a balanced binary tree of
+/// `ReductionOp` operations created at the rewriter's current insertion point.
+///
+/// A single tile is returned unchanged. An empty list is a caller error: it
+/// asserts in debug builds and yields a null Value otherwise.
+template <typename ReductionOp>
+Value reduceInputTiles(
+    SmallVector<Value> &inputTiles, ConversionPatternRewriter &rewriter) {
+  assert(!inputTiles.empty() && "Cannot reduce an empty list of tiles.");
+  if (inputTiles.empty()) {
+    return Value();
+  }
+
+  if (inputTiles.size() == 1) {
+    return inputTiles[0];
+  }
+
+  // Split in two halves and reduce each one recursively. The two-tile case
+  // needs no special handling: both halves hold one tile and are combined
+  // below with `ReductionOp` (not with a hard-coded max).
+  auto middle = inputTiles.begin() + inputTiles.size() / 2;
+  SmallVector<Value> left(inputTiles.begin(), middle);
+  SmallVector<Value> right(middle, inputTiles.end());
+
+  Value leftReduced = reduceInputTiles<ReductionOp>(left, rewriter);
+  Value rightReduced = reduceInputTiles<ReductionOp>(right, rewriter);
+
+  return rewriter.create<ReductionOp>(
+      inputTiles[0].getLoc(), leftReduced.getType(), leftReduced, rightReduced);
+}
+
+/// Experimental lowering of an ONNX pooling op (`PoolOp`) to a single
+/// SpatWeightedCompute.
+///
+/// The input must be produced by a tensor.ConcatOp with one operand per
+/// channel tile. Every (channel tile, x, y) pixel is sliced out, passed as an
+/// operand to one compute op, reduced per output pixel with `ReduceOp` and
+/// yielded; the results are reassembled with spat.img.concat and a final
+/// tensor.ConcatOp.
+///
+/// NOTE(review): pads and dilations are unpacked but not used by the window
+/// computation below, and the average divisor is always krn_w * krn_h.
+template <typename PoolOp, typename PoolOpAdaptor, typename ReduceOp>
+struct ExperimentalPoolingBaseConverter : public OpConversionPattern<PoolOp> {
+  ExperimentalPoolingBaseConverter(MLIRContext *ctx)
+      : OpConversionPattern<PoolOp>(ctx) {}
+
+  LogicalResult matchAndRewrite(PoolOp poolOp, PoolOpAdaptor adaptor,
+      ConversionPatternRewriter &rewriter) const final {
+    Value X = adaptor.getX();
+    ShapedType xShape = mlir::cast<ShapedType>(X.getType());
+    Value Y = poolOp.getResult();
+    ShapedType yShape = mlir::cast<ShapedType>(Y.getType());
+
+    size_t stride_x, stride_y, dilation_x, dilation_y, krn_w, krn_h;
+    unpackOptionalPairVector(adaptor.getStrides(), stride_x, stride_y);
+    unpackOptionalPairVector(adaptor.getDilations(), dilation_x, dilation_y);
+    unpackOptionalPairVector(adaptor.getKernelShape(), krn_w, krn_h);
+
+    if (adaptor.getAutoPad() != "NOTSET") {
+      return rewriter.notifyMatchFailure(
+          poolOp, "auto_pad != NOTSET is deprecated.");
+    }
+
+    size_t pad_x, pad_y;
+    auto padUnpackError =
+        unpackOptionalPadsVector(adaptor.getPads(), pad_x, pad_y);
+    if (padUnpackError.has_value()) {
+      return rewriter.notifyMatchFailure(poolOp, padUnpackError.value());
+    }
+
+    Location loc = poolOp.getLoc();
+
+    size_t input_h = GET_IMAGE_HEIGHT(xShape);
+    size_t input_w = GET_IMAGE_WIDTH(xShape);
+    size_t output_h = GET_IMAGE_HEIGHT(yShape);
+    size_t output_w = GET_IMAGE_WIDTH(yShape);
+
+    ldiv_t tileCount = std::div(GET_IMAGE_CHANNEL(xShape), crossbarSize);
+    // Number of channel tiles, counting the trailing partial tile if any.
+    const long channelTiles = tileCount.quot + (tileCount.rem > 0);
+
+    // Assert that the input is a tensor.ConcatOp.
+    auto concat = X.getDefiningOp<tensor::ConcatOp>();
+    if (!concat) {
+      return rewriter.notifyMatchFailure(
+          poolOp, "Expected input to be a tensor.ConcatOp");
+    }
+
+    // Each channel tile is read from its own concat operand: bail out before
+    // touching the IR if there are not enough of them.
+    if (static_cast<long>(concat->getNumOperands()) < channelTiles) {
+      return rewriter.notifyMatchFailure(
+          poolOp, "tensor.ConcatOp has fewer operands than channel tiles");
+    }
+
+    Type elementType = xShape.getElementType();
+
+    // Create a [channel_tile][x][y] array to store the input tiles.
+    std::map<long, std::map<long, std::map<long, Value>>> inputTiles;
+
+    // For each argument of the tensor.ConcatOp, resolve the input tiles.
+    for (size_t y = 0; y < input_h; ++y) {
+      for (size_t x = 0; x < input_w; ++x) {
+        for (long it = 0; it < channelTiles; ++it) {
+          size_t tilingSize =
+              it == tileCount.quot ? tileCount.rem : crossbarSize;
+
+          SmallVector<OpFoldResult> strides(4, rewriter.getIndexAttr(1));
+          SmallVector<OpFoldResult> offsets = {/* 0 */ rewriter.getIndexAttr(0),
+              /* 1 */ rewriter.getIndexAttr(0),
+              /* 2 */ rewriter.getIndexAttr(x),
+              /* 3 */ rewriter.getIndexAttr(y)};
+          SmallVector<OpFoldResult> sizes = {
+              /* 0 */ rewriter.getIndexAttr(1), // Batch size is always 1.
+              /* 1 */ rewriter.getIndexAttr(tilingSize),
+              /* 2 */ rewriter.getIndexAttr(1),
+              /* 3 */ rewriter.getIndexAttr(1)};
+
+          // Get the concat's operand that we want to slice.
+          Value concatInput = concat.getOperand(it);
+          Value slicedTile = rewriter.create<tensor::ExtractSliceOp>(
+              loc, concatInput, offsets, sizes, strides);
+
+          inputTiles[it][x][y] = slicedTile;
+        }
+      }
+    }
+
+    // Prepare the shape of the compute's output.
+    SmallVector<Type> outputTileTypes;
+    for (size_t y = 0; y < output_h; ++y) {
+      for (size_t x = 0; x < output_w; ++x) {
+        for (long it = 0; it < channelTiles; ++it) {
+          SmallVector<int64_t> outputShapeArray{
+              /* 0 */ 1, // Batch size is always 1.
+                         /* 1 */
+              cast<RankedTensorType>(inputTiles[it][0][0].getType())
+                  .getShape()[1],
+              /* 2 */ 1,
+              /* 3 */ 1};
+
+          outputTileTypes.push_back(
+              RankedTensorType::get(outputShapeArray, elementType));
+        }
+      }
+    }
+
+    // Create a plain value list of the input tiles, ordered by y, then x,
+    // then channel tile. The tiles were stored as [channel_tile][x][y], so
+    // they must be read back with the same index order.
+    SmallVector<Value> inputTilesList;
+    for (size_t y = 0; y < input_h; ++y) {
+      for (size_t x = 0; x < input_w; ++x) {
+        for (long it = 0; it < channelTiles; ++it) {
+          inputTilesList.push_back(inputTiles[it][x][y]);
+        }
+      }
+    }
+
+    // Create a single compute to calculate the output.
+    auto computeOp = rewriter.create<spatial::SpatWeightedCompute>(
+        loc, outputTileTypes, SmallVector<Value>(), inputTilesList);
+
+    // Create a new block for the compute unit and add the operands.
+    Block *block = rewriter.createBlock(&computeOp.getRegion());
+
+    // Fill the block arguments and keep a reference to them, indexed as
+    // [channel_tile][y][x]. Arguments follow the operand order above.
+    std::map<size_t, std::map<size_t, std::map<size_t, Value>>> inputTilesArgs;
+    for (size_t y = 0; y < input_h; ++y) {
+      for (size_t x = 0; x < input_w; ++x) {
+        for (long it = 0; it < channelTiles; ++it) {
+          auto tileIndex = y * input_w * channelTiles + x * channelTiles + it;
+          inputTilesArgs[it][y][x] = block->addArgument(
+              computeOp->getOperand(tileIndex).getType(), loc);
+        }
+      }
+    }
+
+    // Begin writing in the block.
+    rewriter.setInsertionPointToStart(block);
+
+    // Go through all pooling blocks.
+    SmallVector<Value> outputTiles;
+    for (size_t y = 0; y < output_h; ++y) {
+      for (size_t x = 0; x < output_w; ++x) {
+        for (long it = 0; it < channelTiles; ++it) {
+          size_t start_x = x * stride_x;
+          size_t start_y = y * stride_y;
+          size_t end_x = std::min(start_x + krn_w, input_w);
+          size_t end_y = std::min(start_y + krn_h, input_h);
+
+          SmallVector<Value> inputTilesToReduce;
+          for (size_t ky = start_y; ky < end_y; ++ky) {
+            for (size_t kx = start_x; kx < end_x; ++kx) {
+              inputTilesToReduce.push_back(inputTilesArgs[it][ky][kx]);
+            }
+          }
+
+          assert(!inputTilesToReduce.empty() &&
+                 "Pooling window does not cover any input tile.");
+
+          Value reduceResult =
+              reduceInputTiles<ReduceOp>(inputTilesToReduce, rewriter);
+
+          // If the reduce op is add, we need to divide the result by the
+          // number of elements in the pooling window.
+          if (hasPostProcessExperimentalPoolingWindow<PoolOp>()) {
+            Value divisorValue;
+            {
+              // Add a spat.const before the computeOp, then come back to the
+              // current position inside the compute body.
+              OpBuilder::InsertionGuard guard(rewriter);
+              rewriter.setInsertionPoint(computeOp);
+              divisorValue = rewriter.create<spatial::SpatConstantOp>(loc,
+                  RankedTensorType::get({1}, rewriter.getF32Type()),
+                  rewriter.getI64IntegerAttr(krn_w * krn_h),
+                  rewriter.getBoolAttr(true));
+            }
+
+            // The division is emitted at the current position, i.e. right
+            // after the reduction. This also works when the window holds a
+            // single tile and the reduced value is a block argument.
+            reduceResult = rewriter.create<spatial::SpatVSDivOp>(
+                loc, reduceResult.getType(), reduceResult, divisorValue);
+          }
+          outputTiles.push_back(reduceResult);
+        }
+      }
+    }
+
+    // Create a YieldOp to return the output tiles.
+    rewriter.setInsertionPointToEnd(block);
+    rewriter.create<spatial::SpatYieldOp>(loc, outputTiles);
+
+    // Set the rewrite cursor right after the computeOp.
+    rewriter.setInsertionPointAfter(computeOp);
+
+    std::map<size_t, std::map<size_t, std::map<size_t, Value>>> computeOutput;
+    for (size_t y = 0; y < output_h; ++y) {
+      for (size_t x = 0; x < output_w; ++x) {
+        for (long it = 0; it < channelTiles; ++it) {
+          auto tileIndex = y * output_w * channelTiles + x * channelTiles + it;
+          computeOutput[it][y][x] = computeOp.getResult(tileIndex);
+        }
+      }
+    }
+
+    // We'll now create spat.img.concat ops to concatenate the output tiles.
+    SmallVector<Value> outputTilesList;
+    for (long it = 0; it < channelTiles; ++it) {
+      SmallVector<Value> imgConcatTiles;
+      for (size_t y = 0; y < output_h; ++y) {
+        for (size_t x = 0; x < output_w; ++x) {
+          imgConcatTiles.push_back(computeOutput[it][y][x]);
+        }
+      }
+
+      size_t tilingSize = it == tileCount.quot ? tileCount.rem : crossbarSize;
+
+      SmallVector<int64_t> outputShapeArray{
+          /* 0 */ 1, // Batch size is always 1.
+          /* 1 */ (long)tilingSize,
+          /* 2 */ (long)output_w,
+          /* 3 */ (long)output_h};
+
+      outputTilesList.push_back(rewriter.create<spatial::SpatImgConcatOp>(loc,
+          RankedTensorType::get(outputShapeArray, elementType),
+          imgConcatTiles));
+    }
+
+    // Create a new tensor.ConcatOp to concatenate the output tiles.
+    Value outputTensor =
+        rewriter.create<tensor::ConcatOp>(loc, 1, outputTilesList);
+
+    rewriter.replaceOp(poolOp, outputTensor);
+
+    return success();
+  }
+};
+
+/// Registers the experimental pooling lowerings in `patterns`: one for
+/// ONNXMaxPoolSingleOutOp (reduced with SpatVMaxOp) and one for
+/// ONNXAveragePoolOp (reduced with SpatVAddOp).
+void populateExperimentalPoolingTilingPattern(
+    RewritePatternSet &patterns, MLIRContext *ctx) {
+  using MaxPoolLowering =
+      ExperimentalPoolingBaseConverter<ONNXMaxPoolSingleOutOp,
+          ONNXMaxPoolSingleOutOpAdaptor, spatial::SpatVMaxOp>;
+  using AveragePoolLowering =
+      ExperimentalPoolingBaseConverter<ONNXAveragePoolOp,
+          ONNXAveragePoolOpAdaptor, spatial::SpatVAddOp>;
+
+  patterns.insert<MaxPoolLowering>(ctx);
+  patterns.insert<AveragePoolLowering>(ctx);
+}
+
+} // namespace onnx_mlir
@@ -0,0 +1,452 @@
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Tosa/IR/TosaOps.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypeInterfaces.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Value.h"
+#include "mlir/IR/ValueRange.h"
+#include "src/Accelerators/PIM/Common/PIMCommon.hpp"
+#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
+#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
+#include "src/Dialect/ONNX/ONNXOps.hpp"
+
+#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
+#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+
+using namespace mlir;
+
+namespace onnx_mlir {
+
+llvm::SmallPtrSet<Operation *, 16> oldComputeOpsReplaced; // NOTE(review): not referenced by the code visible here; confirm it is still needed.
+
+/// Reduces `valuesToReduce` to a single value using `reduce`.
+///
+/// Every value must live inside the body of a SpatWeightedCompute. Values
+/// that share a compute are first combined inside that compute; the remaining
+/// per-compute values are then combined pairwise, sending the value of the
+/// earlier compute through a new channel and reducing it inside the later one,
+/// until one value is left.
+///
+/// \param valuesToReduce Values to reduce; must not be empty. When it holds
+///        more than one value it is overwritten with one value per compute.
+/// \param rewriter       Rewriter used to create ops; its insertion point is
+///                       moved by this function.
+/// \param reduce         Builds the op combining two values at the current
+///                       insertion point.
+/// \param preprocess     Optional; applied to every input value.
+/// \param postprocess    Optional; applied to the final value.
+/// \return The reduced value.
+///
+/// NOTE(review): when there is exactly one input it is returned as is, and
+/// neither `preprocess` nor `postprocess` is applied.
+Value applyReducePatternNew(SmallVector<Value> &valuesToReduce,
+    ConversionPatternRewriter &rewriter,
+    std::function<Value(const Value &, const Value &)> reduce,
+    std::function<Value(const Value &)> preprocess,
+    std::function<Value(const Value &)> postprocess) {
+  assert(!valuesToReduce.empty() && "Expected at least one value to reduce.");
+
+  // Simple case: if we have only one input, just return it
+  if (valuesToReduce.size() == 1) {
+    return valuesToReduce[0];
+  }
+
+  if (preprocess) {
+    for (auto &valToReduce : valuesToReduce) {
+      rewriter.setInsertionPointAfterValue(valToReduce);
+      valToReduce = preprocess(valToReduce);
+    }
+  }
+
+  // It is possible that `valuesToReduce` contains two entries for the same
+  // computeOp. In this case, we need to apply the reduction within-compute.
+
+  // Keep a map between a computeOp and the last Value for this reduction.
+  // `computeOrder` remembers the order in which computes were first seen, so
+  // the generated IR does not depend on pointer hashing.
+  std::unordered_map<Operation *, Value> lastValueForCompute;
+  SmallVector<Operation *> computeOrder;
+  for (auto &valToReduce : valuesToReduce) {
+    Operation *computeOp = valToReduce.getParentBlock()->getParentOp();
+
+    assert(isa<spatial::SpatWeightedCompute>(computeOp) && "Expected a ComputeOp");
+
+    auto it = lastValueForCompute.find(computeOp);
+    if (it == lastValueForCompute.end()) {
+      computeOrder.push_back(computeOp);
+      lastValueForCompute.emplace(computeOp, valToReduce);
+      continue;
+    }
+
+    // We have already seen this computeOp: apply the reduction
+    // within-compute, right after whichever of the two values comes last.
+    // A block argument has no defining op and precedes every operation.
+    Value lastWithinComputeValue = it->second;
+    Operation *currentDef = valToReduce.getDefiningOp();
+    Operation *lastDef = lastWithinComputeValue.getDefiningOp();
+    bool currentIsEarlier =
+        !currentDef || (lastDef && currentDef->isBeforeInBlock(lastDef));
+
+    if (currentIsEarlier) {
+      rewriter.setInsertionPointAfterValue(lastWithinComputeValue);
+    } else {
+      rewriter.setInsertionPointAfterValue(valToReduce);
+    }
+    valToReduce = reduce(lastWithinComputeValue, valToReduce);
+    it->second = valToReduce;
+  }
+
+  // Now, reconstruct from the map the valuesToReduce list
+  valuesToReduce.clear();
+  valuesToReduce.reserve(computeOrder.size());
+  for (Operation *computeOp : computeOrder) {
+    valuesToReduce.push_back(lastValueForCompute[computeOp]);
+  }
+
+  Location loc = valuesToReduce[0].getLoc();
+  auto channelType = spatial::SpatChannelType::get(rewriter.getContext());
+
+  // Iterative algorithm to reduce the inputs to a single one:
+  // - Take two inputs at a time, and reduce them into a single one, producing
+  // a list which is half the size.
+  // - Repeat until there is only one input left.
+  SmallVector<Value> currentValues(valuesToReduce.begin(), valuesToReduce.end());
+  while (currentValues.size() > 1) {
+    SmallVector<Value> nextValuesToReduce;
+    nextValuesToReduce.reserve((currentValues.size() + 1) / 2);
+    for (size_t i = 0; i + 1 < currentValues.size(); i += 2) {
+      auto firstValue = currentValues[i];
+      auto secondValue = currentValues[i + 1];
+
+      auto firstCompute = firstValue.getParentBlock()->getParentOp();
+      auto secondCompute = secondValue.getParentBlock()->getParentOp();
+
+      assert(isa<spatial::SpatWeightedCompute>(firstCompute));
+      assert(isa<spatial::SpatWeightedCompute>(secondCompute));
+
+      // Always send from the earlier compute to the later one.
+      if (secondCompute->isBeforeInBlock(firstCompute)) {
+        std::swap(firstValue, secondValue);
+        std::swap(firstCompute, secondCompute);
+      }
+
+      // 1. Add a channel before the first computeOp
+      rewriter.setInsertionPoint(firstCompute);
+      auto channel = rewriter.create<spatial::SpatChannelNewOp>(loc, channelType);
+
+      // 2. Add a sendOp after the first value
+      rewriter.setInsertionPointAfterValue(firstValue);
+      rewriter.create<spatial::SpatChannelSendOp>(loc, channel, firstValue);
+
+      // 3. Add a receiveOp after the second value
+      rewriter.setInsertionPointAfterValue(secondValue);
+      auto receivedValue = rewriter.create<spatial::SpatChannelReceiveOp>(
+          loc, secondValue.getType(), channel);
+
+      // 4. Apply reduction between second value and received value
+      rewriter.setInsertionPointAfterValue(receivedValue);
+      Value reduced = reduce(receivedValue, secondValue);
+
+      nextValuesToReduce.push_back(reduced);
+    }
+
+    // If we have an odd number of inputs, the last one is carried over to the
+    // next round unchanged.
+    if (currentValues.size() % 2 == 1) {
+      nextValuesToReduce.push_back(currentValues.back());
+    }
+
+    // Replace the current list with the new one.
+    currentValues = std::move(nextValuesToReduce);
+  }
+
+  assert(currentValues.size() == 1 &&
+         "Internal error: expected a single input at this point.");
+
+  auto finalValue = currentValues[0];
+
+  if (postprocess) {
+    rewriter.setInsertionPointAfterValue(finalValue);
+    finalValue = postprocess(finalValue);
+  }
+
+  return finalValue;
+}
+
+/// Tells the pooling converter whether the reduced window of `PoolOp` needs an
+/// additional post-processing step. The primary template answers "no".
+template <typename PoolOp>
+bool hasPostProcessPoolingWindow() {
+  constexpr bool needsPostProcessing = false;
+  return needsPostProcessing;
+}
+
+/// ONNXAveragePoolOp is the one pooling op that asks for post-processing.
+template <>
+bool hasPostProcessPoolingWindow<ONNXAveragePoolOp>() {
+  constexpr bool needsPostProcessing = true;
+  return needsPostProcessing;
+}
+
+/// Post-processing hook applied to the reduced value of a pooling window.
+/// The primary template does nothing and returns a null Value.
+template <typename PoolOp>
+Value postProcessPoolingWindow(ConversionPatternRewriter &rewriter,
+    Location loc, PoolOp poolOp, Value valueToDivide, size_t krn_size,
+    size_t tilesSkippedByPadding) {
+  return nullptr;
+}
+
+/// AveragePool post-processing: divides `valueToDivide` by the window size.
+///
+/// The divisor is `krn_size` when `count_include_pad` is 1, otherwise
+/// `krn_size - tilesSkippedByPadding`. It is materialised as a spat.const
+/// placed right before the enclosing SpatWeightedCompute, and the returned
+/// value is the result of a spat VSDiv created right after `valueToDivide`.
+///
+/// `valueToDivide` must live inside a SpatWeightedCompute body; it may be
+/// either an operation result or a block argument of that body.
+template <>
+Value postProcessPoolingWindow<ONNXAveragePoolOp>(
+    ConversionPatternRewriter &rewriter, Location loc, ONNXAveragePoolOp poolOp,
+    Value valueToDivide, size_t krn_size, size_t tilesSkippedByPadding) {
+  bool countIncludePad = poolOp.getCountIncludePad() == 1;
+
+  size_t divisorNumber =
+      countIncludePad ? krn_size : krn_size - tilesSkippedByPadding;
+
+  RankedTensorType scalarTensor =
+      RankedTensorType::get({1}, rewriter.getF32Type());
+
+  // Put a spat.const before the computeOp, and use its value. We do this to be
+  // compatible with the current code generation, which assumes constant to be
+  // loaded in global memory, which is allocated by adding a spat.const OP
+  // directly under func.func (i.e. alongside ComputeOps)
+  //
+  // The enclosing compute is looked up through the parent block rather than
+  // through getDefiningOp(), which is null for block arguments.
+  auto computeOp = cast<spatial::SpatWeightedCompute>(
+      valueToDivide.getParentBlock()->getParentOp());
+  rewriter.setInsertionPoint(computeOp);
+  auto divisorValue = rewriter.create<spatial::SpatConstantOp>(loc, scalarTensor,
+      rewriter.getI64IntegerAttr(divisorNumber),
+      /* should_allocate = */ rewriter.getBoolAttr(true));
+
+  rewriter.setInsertionPointAfterValue(valueToDivide);
+  return rewriter.create<spatial::SpatVSDivOp>(
+      loc, valueToDivide.getType(), valueToDivide, divisorValue);
+}
+
+/// Lowers an ONNX pooling op (`PoolOp`) onto the spatial dialect.
+///
+/// The input image is resolved into per-pixel channel tiles. For every output
+/// pixel tile, the values feeding the pooling window are located inside the
+/// SpatWeightedCompute ops that produce them, reduced with `ReduceOp` through
+/// applyReducePatternNew (plus the AveragePool division when applicable), sent
+/// through a new channel and received right after the compute holding the
+/// result. The received tiles are assembled with createImgConcatOp and replace
+/// the pooling op.
+template <typename PoolOp, typename PoolOpAdaptor, typename ReduceOp>
+struct PoolingBaseConverter : public OpConversionPattern<PoolOp> {
+  PoolingBaseConverter(MLIRContext *ctx) : OpConversionPattern<PoolOp>(ctx) {}
+
+  LogicalResult matchAndRewrite(PoolOp poolOp, PoolOpAdaptor adaptor,
+      ConversionPatternRewriter &rewriter) const final {
+    Value X = adaptor.getX();
+    ShapedType xShape = mlir::cast<ShapedType>(X.getType());
+    Value Y = poolOp.getResult();
+    ShapedType yShape = mlir::cast<ShapedType>(Y.getType());
+
+    size_t stride_x, stride_y, dilation_x, dilation_y, krn_w, krn_h;
+    unpackOptionalPairVector(adaptor.getStrides(), stride_x, stride_y);
+    unpackOptionalPairVector(adaptor.getDilations(), dilation_x, dilation_y);
+    unpackOptionalPairVector(adaptor.getKernelShape(), krn_w, krn_h);
+
+    if (adaptor.getAutoPad() != "NOTSET") {
+      return rewriter.notifyMatchFailure(
+          poolOp, "auto_pad != NOTSET is deprecated.");
+    }
+
+    size_t pad_x, pad_y;
+    auto padUnpackError =
+        unpackOptionalPadsVector(adaptor.getPads(), pad_x, pad_y);
+    if (padUnpackError.has_value()) {
+      return rewriter.notifyMatchFailure(poolOp, padUnpackError.value());
+    }
+
+    Location loc = poolOp.getLoc();
+
+    size_t input_h = GET_IMAGE_HEIGHT(xShape);
+    size_t input_w = GET_IMAGE_WIDTH(xShape);
+    size_t output_h = GET_IMAGE_HEIGHT(yShape);
+    size_t output_w = GET_IMAGE_WIDTH(yShape);
+    size_t channelTileCount =
+        ceilIntegerDivide(GET_IMAGE_CHANNEL(xShape), crossbarSize.getValue());
+    size_t channelTileRest = GET_IMAGE_CHANNEL(xShape) % crossbarSize;
+
+    // 1: Tile the input tensor
+    // Input tiles need to be indexed by:
+    //    a. Channel Tile
+    //    b. Pixel `x` position
+    //    c. Pixel `y` position
+    // For example: inputTiles[channelTile][x][y]
+    // Example complete input tensor: tensor<1x3x12x12xf32> (NxCxWxH)
+    // Suppose that the input tensor is produced by concatenating the results of
+    // many ComputeOps. Get the result tiles from these ComputeOps.
+    SmallVector<SmallVector<SmallVector<Value>>> inputTiles(channelTileCount,
+        SmallVector<SmallVector<Value>>(input_w, SmallVector<Value>(input_h)));
+
+    auto resolveErrorOpt = resolveImgInputTiles(X, inputTiles, channelTileCount,
+        channelTileRest, input_w, input_h, rewriter);
+    if (resolveErrorOpt.has_value()) {
+      return rewriter.notifyMatchFailure(poolOp, *resolveErrorOpt);
+    }
+
+    // TODO: This requires a core for each input tile, which is not ideal. We
+    // can do better.
+    // If some input tiles come from the func.func operands, load
+    // them into a computeOp and yield them
+    for (size_t t = 0; t < channelTileCount; t++) {
+      for (size_t x = 0; x < input_w; x++) {
+        for (size_t y = 0; y < input_h; y++) {
+          auto extractSliceOp =
+              inputTiles[t][x][y].getDefiningOp<tensor::ExtractSliceOp>();
+          if (!extractSliceOp) {
+            continue;
+          }
+
+          Location tileLoc = extractSliceOp.getLoc();
+
+          auto tempComputeOp = rewriter.create<spatial::SpatWeightedCompute>(
+              tileLoc, extractSliceOp.getResultType(),
+              /* xbarWeights =*/ValueRange(), extractSliceOp.getResult());
+
+          // Let the rewriter create the body block so that the conversion
+          // driver is aware of it.
+          Block *tempComputeOpBlock =
+              rewriter.createBlock(&tempComputeOp.getBody());
+          auto tempComputeOpBlockArg = tempComputeOpBlock->addArgument(
+              extractSliceOp.getType(), tileLoc);
+
+          // The pass-through compute simply yields its block argument.
+          rewriter.setInsertionPointToStart(tempComputeOpBlock);
+          rewriter.create<spatial::SpatYieldOp>(tileLoc, tempComputeOpBlockArg);
+          rewriter.setInsertionPointAfter(tempComputeOp);
+          inputTiles[t][x][y] = tempComputeOp.getResult(0);
+        }
+      }
+    }
+
+    // 2: Tile the output tensor
+    // Output tiles need to be indexed by:
+    //  a. Channel Tile
+    //  b. Pixel `x` position
+    //  c. Pixel `y` position
+    // For example: outputTiles[channelTile][x][y]
+    // Example complete output tensor: tensor<1x3x6x6xf32> (NxCxWxH)
+    SmallVector<SmallVector<SmallVector<Value>>> outputTiles(
+        channelTileCount, SmallVector<SmallVector<Value>>(
+                              output_w, SmallVector<Value>(output_h, nullptr)));
+
+    // List of values to pool for each output pixel
+    SmallVector<Value> valuesToPool;
+
+    // Iterate each output tile
+    for (size_t outTile = 0; outTile < channelTileCount; outTile++) {
+      // Iterate each output pixel
+      for (size_t outX = 0; outX < output_w; outX++) {
+        for (size_t outY = 0; outY < output_h; outY++) {
+
+          // Each output pixel tile is computed by pooling a window of input
+          // pixel tiles
+          valuesToPool.clear();
+          size_t tilesSkippedByPadding = 0;
+
+          auto [start_x, end_x] = kernel_get_start_and_end(
+              outX, input_w, krn_w, stride_x, dilation_x, pad_x);
+          auto [start_y, end_y] = kernel_get_start_and_end(
+              outY, input_h, krn_h, stride_y, dilation_y, pad_y);
+
+          for (size_t inX = start_x; inX < end_x; inX += dilation_x) {
+            for (size_t inY = start_y; inY < end_y; inY += dilation_y) {
+              if (failed(verifyWithinBoundsAndPaddings(
+                      input_w, input_h, inX, inY, pad_x, pad_y))) {
+                tilesSkippedByPadding++;
+                continue;
+              }
+
+              Value inputTile = inputTiles[outTile][inX][inY];
+
+              // Find the value inside a compute body that corresponds to this
+              // input tile.
+              Value valueToPool;
+              if (auto computeProducer =
+                      inputTile.getDefiningOp<spatial::SpatWeightedCompute>()) {
+
+                int resultNumber = getResultIndex(computeProducer, inputTile);
+
+                auto yieldInComputeOp = cast<spatial::SpatYieldOp>(
+                    computeProducer.getBody().front().getTerminator());
+                valueToPool = yieldInComputeOp.getOperand(resultNumber);
+              } else if (auto receiveProducer =
+                             inputTile
+                                 .getDefiningOp<spatial::SpatChannelReceiveOp>()) {
+                auto sendOpOpt =
+                    getOtherEndOfChannel(receiveProducer, true, rewriter);
+                if (failed(sendOpOpt)) {
+                  return rewriter.notifyMatchFailure(poolOp,
+                      "ChannelReceiveOp does not have a matching "
+                      "ChannelSendOp.");
+                }
+                auto sendOp = cast<spatial::SpatChannelSendOp>(*sendOpOpt);
+
+                valueToPool = sendOp.getData();
+              } else {
+                return rewriter.notifyMatchFailure(poolOp,
+                    "Input tile for Pooling is not produced by a "
+                    "WeightedComputeOp nor a receiveOp");
+              }
+
+              valuesToPool.push_back(valueToPool);
+            }
+          }
+
+          assert(valuesToPool.size() != 0 &&
+                 "Pooling computed on zero tiles make no sense.");
+
+          std::function<Value(const Value &)> postProcessFn = nullptr;
+          if (hasPostProcessPoolingWindow<PoolOp>()) {
+            postProcessFn = [&](const Value prevFinalRes) {
+              return postProcessPoolingWindow(rewriter, loc, poolOp,
+                  prevFinalRes, krn_h * krn_w, tilesSkippedByPadding);
+            };
+          }
+
+          Value reducedWithinCompute = applyReducePatternNew(
+              valuesToPool, rewriter,
+              [&](const Value lhs, const Value rhs) {
+                return rewriter.create<ReduceOp>(loc, lhs.getType(), lhs, rhs);
+              },
+              nullptr, postProcessFn);
+
+          // Send this value through a channel, and receive it in the
+          // `func.func`. During lowering, we will need to "move it" into the
+          // users computeOps
+          //
+          // The reduced value may be a block argument of the compute body
+          // (e.g. a one-tile window fed by a pass-through compute), so the
+          // compute is looked up through the parent block rather than through
+          // getDefiningOp().
+          auto computeOpOfReduced = cast<spatial::SpatWeightedCompute>(
+              reducedWithinCompute.getParentBlock()->getParentOp());
+
+          // Create a new channel before the computeOp
+          rewriter.setInsertionPoint(computeOpOfReduced);
+          auto reduceChannel = rewriter.create<spatial::SpatChannelNewOp>(
+              loc, spatial::SpatChannelType::get(rewriter.getContext()));
+
+          // Send value through the channel
+          rewriter.setInsertionPointAfterValue(reducedWithinCompute);
+          rewriter.create<spatial::SpatChannelSendOp>(
+              loc, reduceChannel, reducedWithinCompute);
+
+          // Receive after the computeOp
+          rewriter.setInsertionPointAfter(computeOpOfReduced);
+          auto receivedValue = rewriter.create<spatial::SpatChannelReceiveOp>(
+              loc, reducedWithinCompute.getType(), reduceChannel);
+
+          outputTiles[outTile][outX][outY] = receivedValue;
+        }
+      }
+    }
+
+    // TODO: outputTiles are not the results of the computeOps! We need to add
+    // them!
+
+    Value outputImage =
+        createImgConcatOp(outputTiles, rewriter, loc, poolOp.getType());
+
+    rewriter.replaceOp(poolOp, outputImage);
+
+    return success();
+  }
+};
+
+/// Registers the pooling lowerings in `patterns`: one for
+/// ONNXMaxPoolSingleOutOp (reduced with SpatVMaxOp) and one for
+/// ONNXAveragePoolOp (reduced with SpatVAddOp).
+void populatePoolingTilingPattern(
+    RewritePatternSet &patterns, MLIRContext *ctx) {
+  using MaxPoolLowering = PoolingBaseConverter<ONNXMaxPoolSingleOutOp,
+      ONNXMaxPoolSingleOutOpAdaptor, spatial::SpatVMaxOp>;
+  using AveragePoolLowering = PoolingBaseConverter<ONNXAveragePoolOp,
+      ONNXAveragePoolOpAdaptor, spatial::SpatVAddOp>;
+
+  patterns.insert<MaxPoolLowering>(ctx);
+  patterns.insert<AveragePoolLowering>(ctx);
+}
+
+} // namespace onnx_mlir
@@ -0,0 +1,90 @@
+
+
+#include "Conversion/ONNXToSpatial/ONNXToSpatialPatterns.hpp"
+#include "mlir/Transforms/DialectConversion.h"
+#include "src/Dialect/ONNX/ONNXOps.hpp"
+
+using namespace mlir;
+namespace onnx_mlir {
+
+/// Rewrites an ONNXReduceMeanV13Op into an ONNXAveragePoolOp whose kernel and
+/// strides span dimensions 2 and 3 of the input, with no padding and unit
+/// dilations, producing a tensor of shape [dim0, dim1, 1, 1].
+///
+/// The pattern only applies to ranked 4-D inputs whose dimensions 2 and 3 are
+/// static, and is rejected when the declared result shape of the op cannot be
+/// [dim0, dim1, 1, 1] (for instance a different rank).
+struct ReduceMeanConversionPattern
+    : public OpConversionPattern<ONNXReduceMeanV13Op> {
+
+  ReduceMeanConversionPattern(MLIRContext *ctx) : OpConversionPattern(ctx) {}
+
+  LogicalResult matchAndRewrite(ONNXReduceMeanV13Op reduceMean,
+      ONNXReduceMeanV13OpAdaptor adaptor,
+      ConversionPatternRewriter &rewriter) const final {
+
+    // Get the input tensor.
+    Value inputTensor = adaptor.getData();
+    auto inputTensorType = dyn_cast<RankedTensorType>(inputTensor.getType());
+    if (!inputTensorType || inputTensorType.getRank() != 4) {
+      return rewriter.notifyMatchFailure(
+          reduceMean, "Expected a ranked 4-D input tensor.");
+    }
+
+    // The kernel covers dimensions 2 and 3 entirely, so their size must be
+    // known at compile time.
+    if (inputTensorType.isDynamicDim(2) || inputTensorType.isDynamicDim(3)) {
+      return rewriter.notifyMatchFailure(
+          reduceMean, "Expected static sizes for dimensions 2 and 3.");
+    }
+
+    ArrayRef<int64_t> inputShape = inputTensorType.getShape();
+    const int64_t image_height = inputShape[2];
+    const int64_t image_width = inputShape[3];
+
+    // Create the resulting tensor type.
+    SmallVector<int64_t> resultShape = {inputShape[0], inputShape[1], 1, 1};
+    auto resultType =
+        RankedTensorType::get(resultShape, inputTensorType.getElementType());
+
+    // The average pool below always produces [dim0, dim1, 1, 1]. If the op
+    // declares a ranked result that contradicts it, this rewrite would change
+    // the meaning of the program, so refuse to apply.
+    if (auto declaredType = dyn_cast<RankedTensorType>(
+            reduceMean->getResult(0).getType())) {
+      ArrayRef<int64_t> declaredShape = declaredType.getShape();
+      bool compatible = declaredShape.size() == resultShape.size();
+      for (size_t i = 0; compatible && i < resultShape.size(); ++i) {
+        bool bothStatic = !ShapedType::isDynamic(declaredShape[i]) &&
+                          !ShapedType::isDynamic(resultShape[i]);
+        compatible = !bothStatic || declaredShape[i] == resultShape[i];
+      }
+      if (!compatible) {
+        return rewriter.notifyMatchFailure(reduceMean,
+            "Result shape is not compatible with an average over "
+            "dimensions 2 and 3.");
+      }
+    }
+
+    // Kernel and strides span the whole image; no dilation, no padding.
+    auto kernelShape = rewriter.getI64ArrayAttr({image_height, image_width});
+    auto strides = rewriter.getI64ArrayAttr({image_height, image_width});
+    auto dilations = rewriter.getI64ArrayAttr({1, 1});
+    auto pads = rewriter.getI64ArrayAttr({0, 0, 0, 0});
+
+    // Create the ONNXAveragePoolOp.
+    auto averagePool = rewriter.create<ONNXAveragePoolOp>(reduceMean.getLoc(),
+        resultType, inputTensor, /*auto_pad=*/"NOTSET",
+        /*ceil_mode=*/0, /*count_include_pad=*/1, dilations,
+        /*kernel_shape=*/kernelShape,
+        /*pads=*/pads, /*strides=*/strides);
+
+    // Replace the ONNXReduceMeanV13Op with the ONNXAveragePoolOp.
+    rewriter.replaceOp(reduceMean, averagePool.getResult());
+
+    return success();
+  }
+};
+
+/// Registers ReduceMeanConversionPattern in `patterns`.
+void populateReduceMeanConversionPattern(
+    RewritePatternSet &patterns, MLIRContext *ctx) {
+  using ReduceMeanLowering = ReduceMeanConversionPattern;
+  patterns.insert<ReduceMeanLowering>(ctx);
+}
+
+} // namespace onnx_mlir