add PIM accelerator

This commit is contained in:
NiccoloN
2026-02-24 15:09:18 +01:00
parent b24a0df8d7
commit a6e928bdd7
67 changed files with 9109 additions and 1 deletions

View File

@@ -0,0 +1,493 @@
#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Tosa/IR/TosaOps.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypeInterfaces.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/LogicalResult.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include "src/Accelerators/PIM/Common/PIMCommon.hpp"
#include "src/Accelerators/PIM/Dialect/PIM/PimOps.hpp"
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
#include "src/Accelerators/PIM/Dialect/Spatial/Transforms/SpatialBufferizableOpInterface.hpp"
#include "src/Dialect/ONNX/ONNXOps.hpp"
using namespace mlir;
using namespace bufferization;
namespace onnx_mlir {
namespace spatial {
// Allocate an uninitialized memref buffer whose shape and element type mirror
// the given shaped (tensor or memref) type, at the rewriter's current
// insertion point.
memref::AllocOp createEmptyFromType(Type resultType, Location loc, RewriterBase& rewriter) {
  auto shapedTy = cast<ShapedType>(resultType);
  auto allocTy = MemRefType::get(shapedTy.getShape(), shapedTy.getElementType());
  return rewriter.create<memref::AllocOp>(loc, allocTy);
}
// Name of the attribute used to cache the peer core id on the opposite
// channel endpoint (see getCoreIdOfOtherEndOfChannel below).
const llvm::StringRef PRECOMPUTED_OTHER_CORE_ID_ATTR_NAME("precomp_other_core_id");
// Returns the coreId of the pim::PimCoreOp enclosing the op at the other end
// of the channel used by `op`. `opIsReceive` tells whether `op` is the
// receiving endpoint. Fails when the matching endpoint cannot be found.
llvm::FailureOr<uint32_t> getCoreIdOfOtherEndOfChannel(Operation* op, bool opIsReceive, RewriterBase& rewriter) {
// This function requires the existence of ChannelNewOp and the other
// Receive/Send operation. However, during bufferization, the first of the
// Receive/Send operation that is processed gets removed. As such, we need to
// "precompute" the coreId needed for the other op, and save it as attribute
auto precomputedOtherCoreId = op->getAttr(PRECOMPUTED_OTHER_CORE_ID_ATTR_NAME);
if (precomputedOtherCoreId)
return cast<IntegerAttr>(precomputedOtherCoreId).getInt();
auto notOpUserOpt = getOtherEndOfChannel(op, opIsReceive, rewriter);
if (failed(notOpUserOpt))
return failure();
Operation* notOpUser = *notOpUserOpt;
// Save the coreId for this op into the other op as attribute
// (this op is about to be erased by bufferization, so when the peer is
// processed later it can no longer look this op up).
auto opCoreIdAttr = cast<pim::PimCoreOp>(op->getParentOp()).getCoreIdAttr();
notOpUser->setAttr(PRECOMPUTED_OTHER_CORE_ID_ATTR_NAME, opCoreIdAttr);
return cast<pim::PimCoreOp>(notOpUser->getParentOp()).getCoreId();
}
// Bufferization model for SpatWeightedCompute: the op itself is kept; only the
// entry block of its region has its signature converted from tensors to
// memrefs. Nested ops are bufferized by their own external models.
struct WComputeOpInterface : BufferizableOpInterface::ExternalModel<WComputeOpInterface, SpatWeightedCompute> {
// Input tensor to the compute OP are always read into its local memory
bool bufferizesToMemoryRead(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { return true; }
// Input tensor to the compute OP are _never_ written into its local memory
bool bufferizesToMemoryWrite(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { return false; }
// In general, no tensor is aliased with any other tensor in the compute OP
AliasingValueList getAliasingValues(Operation* op, OpOperand& opOperand, const AnalysisState& state) const {
// TODO: Is it an empty list or a list of "UNKNOWN" values?
return {};
}
// Convert the region entry block's tensor arguments to memref arguments.
LogicalResult bufferize(Operation* op, RewriterBase& rewriter, const BufferizationOptions& options, BufferizationState &state) const {
// Bufferize its block
auto& block = op->getRegion(0).front();
return bufferizeBlockSignature(&block, rewriter, options, state);
}
};
/*
 * Reusable bufferization model for operations that take a single variadic
 * group of tensors and produce one output of the same shape.
 * Example: VAdd, VSub, VExp
 */
template <typename InterfaceName, typename OpTy, typename ToTy>
struct VariadicArgumentElementWiseOpInterface : BufferizableOpInterface::ExternalModel<InterfaceName, OpTy> {
  // Every tensor operand of the OP is read
  bool bufferizesToMemoryRead(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { return true; }
  // No tensor operand of the OP is ever written in place
  bool bufferizesToMemoryWrite(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { return false; }
  // No tensor aliases any other tensor of the OP
  AliasingValueList getAliasingValues(Operation* op, OpOperand& opOperand, const AnalysisState& state) const {
    return {};
  }
  // Replace the tensor-based OP with its memref-based PIM counterpart.
  LogicalResult bufferize(Operation* op, RewriterBase& rewriter, const BufferizationOptions& options, BufferizationState &state) const {
    // Resolve every tensor operand to its backing buffer
    SmallVector<Value> bufferArgs;
    bufferArgs.reserve(op->getNumOperands());
    for (Value in : op->getOperands()) {
      auto buf = getBuffer(rewriter, in, options, state);
      if (failed(buf))
        return failure();
      bufferArgs.push_back(*buf);
    }
    // TODO: Support addition with more than 2 operands
    if (bufferArgs.size() > 2) {
      op->emitError("VariadicArgumentElementWiseOpInterface only supports OPs "
                    "with 1 or 2 operands, for now.");
      return failure();
    }
    // Allocate the destination buffer and append it as the trailing operand
    Value destBuffer = createEmptyFromType(op->getResult(0).getType(), op->getLoc(), rewriter);
    bufferArgs.push_back(destBuffer);
    auto pimOp = rewriter.create<ToTy>(op->getLoc(), destBuffer.getType(), bufferArgs);
    replaceOpWithBufferizedValues(rewriter, op, pimOp.getOutRes());
    return success();
  }
};
// Reusable bufferization model for the weighted matrix/vector multiplication
// ops: one tensor input (read-only), one weightIndex attribute, one output
// buffer allocated here.
template <typename InterfaceName, typename OpTy, typename ToTy>
struct WeightedMultiplicationsOpInterface : BufferizableOpInterface::ExternalModel<InterfaceName, OpTy> {
  // The tensor input of the OP is read
  bool bufferizesToMemoryRead(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { return true; }
  // The tensor input of the OP is never written in place
  bool bufferizesToMemoryWrite(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { return false; }
  // No tensor aliases any other tensor of the OP
  AliasingValueList getAliasingValues(Operation* op, OpOperand& opOperand, const AnalysisState& state) const {
    return {};
  }
  // Replace the tensor-based OP with its memref-based PIM counterpart.
  LogicalResult bufferize(Operation* op, RewriterBase& rewriter, const BufferizationOptions& options, BufferizationState &state) const {
    auto inputBuf = getBuffer(rewriter, op->getOperand(0), options, state);
    if (failed(inputBuf))
      return failure();
    // Allocate the destination buffer
    Value destBuffer = createEmptyFromType(op->getResult(0).getType(), op->getLoc(), rewriter);
    // Forward the weight index attribute of the source op unchanged
    auto pimOp = rewriter.create<ToTy>(
        op->getLoc(), destBuffer.getType(), cast<OpTy>(op).getWeightIndexAttr(), *inputBuf, destBuffer);
    replaceOpWithBufferizedValues(rewriter, op, pimOp.getOutRes());
    return success();
  }
};
struct ChannelReceiveOpInterface
    : BufferizableOpInterface::ExternalModel<ChannelReceiveOpInterface, SpatChannelReceiveOp> {
  // The only operand is the channel handle: neither read nor written (it acts
  // more like an attribute)
  bool bufferizesToMemoryRead(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { return false; }
  // See above
  bool bufferizesToMemoryWrite(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { return false; }
  // See above
  AliasingValueList getAliasingValues(Operation* op, OpOperand& opOperand, const AnalysisState& state) const {
    // TODO: Is it an empty list or a list of "UNKNOWN" values?
    return {};
  }
  /*
   * Turn the channel receive to pim.recv
   */
  LogicalResult bufferize(Operation* op, RewriterBase& rewriter, const BufferizationOptions& options, BufferizationState &state) const {
    // Allocate the landing buffer for the received data
    auto destAlloc = createEmptyFromType(op->getResult(0).getType(), op->getLoc(), rewriter);
    auto destShapedTy = cast<ShapedType>(destAlloc.getType());
    // Payload size in bytes = element count * element byte width
    auto elemCount = destShapedTy.getNumElements();
    auto bytesPerElem = destShapedTy.getElementTypeBitWidth() / 8;
    // The source core is the one enclosing the matching send op
    auto srcCoreId = getCoreIdOfOtherEndOfChannel(op, true, rewriter);
    if (failed(srcCoreId))
      return failure();
    auto recvOp = rewriter.create<pim::PimReceiveOp>(op->getLoc(),
        destAlloc.getType(),
        destAlloc,
        rewriter.getI32IntegerAttr(elemCount * bytesPerElem),
        rewriter.getI32IntegerAttr(*srcCoreId));
    replaceOpWithBufferizedValues(rewriter, op, recvOp.getOut());
    return success();
  }
};
struct ChannelSendOpInterface : BufferizableOpInterface::ExternalModel<ChannelSendOpInterface, SpatChannelSendOp> {
  // Operand 0 is the channel handle (not read/written, it acts like an
  // attribute); operand 1 is the tensor to send, which is read.
  bool bufferizesToMemoryRead(Operation* op, OpOperand& opOperand, const AnalysisState& state) const {
    // Fixed: the tensor being sent is operand index 1 (bufferize() below reads
    // op->getOperand(1)). The previous check against index 2 could never match
    // a (channel, tensor) operand list, so the analysis never saw the send as
    // reading its source tensor.
    return opOperand.getOperandNumber() == 1;
  }
  // Neither operand is written
  bool bufferizesToMemoryWrite(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { return false; }
  // See above
  AliasingValueList getAliasingValues(Operation* op, OpOperand& opOperand, const AnalysisState& state) const {
    // TODO: Is it an empty list or a list of "UNKNOWN" values?
    return {};
  }
  /*
   * Turn the channel send to pim.send
   */
  LogicalResult bufferize(Operation* op, RewriterBase& rewriter, const BufferizationOptions& options, BufferizationState &state) const {
    auto srcTensor = op->getOperand(1);
    auto srcTensorOpt = getBuffer(rewriter, srcTensor, options, state);
    if (failed(srcTensorOpt))
      return failure();
    auto srcMemRef = *srcTensorOpt;
    // Payload size in bytes = element count * element byte width
    auto numElements = cast<ShapedType>(srcTensor.getType()).getNumElements();
    auto elementSize = cast<ShapedType>(srcTensor.getType()).getElementTypeBitWidth() / 8;
    // The destination core is the one enclosing the matching receive op
    auto dstCoreId = getCoreIdOfOtherEndOfChannel(op, false, rewriter);
    if (failed(dstCoreId))
      return failure();
    replaceOpWithNewBufferizedOp<pim::PimSendOp>(rewriter,
        op,
        srcMemRef,
        rewriter.getI32IntegerAttr(numElements * elementSize),
        rewriter.getI32IntegerAttr(dstCoreId.value()));
    return success();
  }
};
struct ChannelBroadcastReceiveOpInterface
    : BufferizableOpInterface::ExternalModel<ChannelBroadcastReceiveOpInterface, SpatChannelBroadcastReceiveOp> {
  // Input value is the channel (not read/written, its more of an attribute)
  bool bufferizesToMemoryRead(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { return false; }
  // See above
  bool bufferizesToMemoryWrite(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { return false; }
  // See above
  AliasingValueList getAliasingValues(Operation* op, OpOperand& opOperand, const AnalysisState& state) const {
    // TODO: Is it an empty list or a list of "UNKNOWN" values?
    return {};
  }
  /*
   * Turn the channel broadcast receive into a pim host->device copy out of a
   * shared buffer allocated right after the channel creation op.
   */
  LogicalResult bufferize(Operation* op, RewriterBase& rewriter, const BufferizationOptions& options, BufferizationState &state) const {
    auto outputTensor = createEmptyFromType(op->getResult(0).getType(), op->getLoc(), rewriter);
    auto outputSize = cast<ShapedType>(outputTensor.getType()).getNumElements();
    auto channelNewOp = op->getOperand(0).getDefiningOp<SpatChannelNewOp>();
    if (!channelNewOp) {
      op->emitError("ChannelBroadcastReceiveOp does not use a channel as operand");
      return failure();
    }
    // The first 'broadcast' operation creates the buffer just after the
    // channelNewOp, while the other 'broadcast' operations need to find this
    // buffer allocation just after the channelNewOp.
    // Fixed: getNextNode() returns nullptr when channelNewOp is the last op in
    // its block; dyn_cast_or_null tolerates that instead of asserting.
    Value bufferAllocation;
    if (auto allocOpAfterChannel = dyn_cast_or_null<memref::AllocOp>(channelNewOp->getNextNode())) {
      // Buffer already allocated, load from this buffer
      bufferAllocation = allocOpAfterChannel;
    } else {
      // Buffer was not allocated previously, allocate it after channelNewOp
      rewriter.setInsertionPointAfter(channelNewOp);
      bufferAllocation = createEmptyFromType(op->getResult(0).getType(), op->getLoc(), rewriter);
    }
    // Restore the insertion point before emitting the copy at the op's place
    rewriter.setInsertionPoint(op);
    auto memCopyHostToDevOp = rewriter.create<pim::PimMemCopyHostToDevOp>(op->getLoc(),
        outputTensor.getType(),
        outputTensor,
        bufferAllocation,
        rewriter.getI32IntegerAttr(0),
        rewriter.getI32IntegerAttr(0),
        rewriter.getI32IntegerAttr(outputSize));
    replaceOpWithBufferizedValues(rewriter, op, memCopyHostToDevOp.getDeviceDst());
    return success();
  }
};
struct ChannelBroadcastSendOpInterface
    : BufferizableOpInterface::ExternalModel<ChannelBroadcastSendOpInterface, SpatChannelBroadcastSendOp> {
  // Operand 0 is the channel handle (not read/written, it acts like an
  // attribute); operand 1 is the tensor to send, which is read.
  bool bufferizesToMemoryRead(Operation* op, OpOperand& opOperand, const AnalysisState& state) const {
    // Fixed: the tensor being sent is operand index 1 (bufferize() below reads
    // op->getOperand(1)). The previous check against index 2 could never match
    // a (channel, tensor) operand list, so the analysis never saw the send as
    // reading its source tensor.
    return opOperand.getOperandNumber() == 1;
  }
  // Neither operand is written
  bool bufferizesToMemoryWrite(Operation* op, OpOperand& opOperand, const AnalysisState& state) const { return false; }
  // See above
  AliasingValueList getAliasingValues(Operation* op, OpOperand& opOperand, const AnalysisState& state) const {
    // TODO: Is it an empty list or a list of "UNKNOWN" values?
    return {};
  }
  /*
   * Replace the broadcast send with the shared buffer (allocated right after
   * the channel creation op) plus the source memref.
   */
  LogicalResult bufferize(Operation* op, RewriterBase& rewriter, const BufferizationOptions& options, BufferizationState &state) const {
    auto srcTensor = op->getOperand(1);
    auto srcTensorOpt = getBuffer(rewriter, srcTensor, options, state);
    if (failed(srcTensorOpt))
      return failure();
    auto srcMemRef = *srcTensorOpt;
    auto channelNewOp = op->getOperand(0).getDefiningOp<SpatChannelNewOp>();
    if (!channelNewOp) {
      op->emitError("SpatChannelBroadcastSendOp does not use a channel as operand");
      return failure();
    }
    // The first 'broadcast' operation creates the buffer just after the
    // channelNewOp, while the other 'broadcast' operations need to find this
    // buffer allocation just after the channelNewOp.
    // Fixed: getNextNode() returns nullptr when channelNewOp is the last op in
    // its block; dyn_cast_or_null tolerates that instead of asserting.
    Value bufferAllocation;
    if (auto allocOpAfterChannel = dyn_cast_or_null<memref::AllocOp>(channelNewOp->getNextNode())) {
      // Buffer already allocated, load from this buffer
      bufferAllocation = allocOpAfterChannel;
    } else {
      // Buffer was not allocated previously, allocate it after channelNewOp
      rewriter.setInsertionPointAfter(channelNewOp);
      bufferAllocation = createEmptyFromType(srcTensor.getType(), op->getLoc(), rewriter);
    }
    // Restore the insertion point before replacing the op
    rewriter.setInsertionPoint(op);
    replaceOpWithBufferizedValues(rewriter, op, {bufferAllocation, srcMemRef});
    return success();
  }
};
// Concrete bufferization models instantiated from the templates above: each
// maps one Spatial tensor op to its memref-based PIM counterpart.
struct VAddOpInterfaceFromTemplate
: VariadicArgumentElementWiseOpInterface<VAddOpInterfaceFromTemplate, SpatVAddOp, pim::PimVAddOp> {};
struct WVMMOpInterface : WeightedMultiplicationsOpInterface<WVMMOpInterface, SpatWeightedVMMOp, pim::PimVMMOp> {};
struct WMVMOpInterface : WeightedMultiplicationsOpInterface<WMVMOpInterface, SpatWeightedMVMOp, pim::PimMVMOp> {};
struct SumOpInterface : VariadicArgumentElementWiseOpInterface<SumOpInterface, SpatSumOp, pim::PimSumOp> {};
struct VSDivOpInterface : VariadicArgumentElementWiseOpInterface<VSDivOpInterface, SpatVSDivOp, pim::PimVSDivOp> {};
struct VMaxOpInterface : VariadicArgumentElementWiseOpInterface<VMaxOpInterface, SpatVMaxOp, pim::PimVMaxOp> {};
// Bufferization model for the apply-filters (convolution-like) operation.
struct ApplyFiltersOpInterface : BufferizableOpInterface::ExternalModel<ApplyFiltersOpInterface, SpatApplyFiltersOp> {
// Only operand 0 ($input) is reported as read.
bool bufferizesToMemoryRead(Operation* op, OpOperand& opOperand, const AnalysisState& state) const {
// Operand 0: $input
// Operand 1: $outBuf
// Operand 2: $accumBuf
return opOperand.getOperandNumber() == 0;
}
// Only operand 2 ($accumBuf) is reported as written.
// NOTE(review): per this model, $outBuf (operand 1) is neither read nor
// written, and bufferize() below allocates fresh out/accum buffers instead of
// using operands 1 and 2 — confirm this matches the op's intended semantics.
bool bufferizesToMemoryWrite(Operation* op, OpOperand& opOperand, const AnalysisState& state) const {
// Operand 0: $input
// Operand 1: $outBuf
// Operand 2: $accumBuf
return opOperand.getOperandNumber() == 2;
}
// No operands are aliased with any other operands.
AliasingValueList getAliasingValues(Operation* op, OpOperand& opOperand, const AnalysisState& state) const {
return {};
}
// Bufferize the operation: resolve the input buffer, allocate the output and
// accumulation buffers, and emit the memref-based pim.apply_filters op.
LogicalResult bufferize(Operation* op, RewriterBase& rewriter, const BufferizationOptions& options, BufferizationState &state) const {
// Get the input tensor buffer.
auto inputBuffer = getBuffer(rewriter, op->getOperand(0), options, state);
if (failed(inputBuffer))
return failure();
// Create a new buffer for the output tensor.
auto outputTensor = createEmptyFromType(op->getResult(0).getType(), op->getLoc(), rewriter);
// Create a new buffer for the accumulation buffer.
// To do this, create a new allocation operation. Size must be axbx1x1,
// where axbxcxd is the size of the output tensor. Since the shape is
// different, we can't immediately use createEmptyFromType, we first need to
// create the shape of the accumulation buffer.
auto accumShape = llvm::to_vector<4>(cast<ShapedType>(op->getResult(0).getType()).getShape());
// Set the last two dimensions to 1.
accumShape[accumShape.size() - 1] = 1;
accumShape[accumShape.size() - 2] = 1;
auto accumType = MemRefType::get(accumShape, cast<ShapedType>(op->getResult(0).getType()).getElementType());
auto accumBuffer = createEmptyFromType(accumType, op->getLoc(), rewriter);
// Bufferize the operation, forwarding the filter attributes unchanged.
auto weightIndices = cast<SpatApplyFiltersOp>(op).getWeightIndicesAttr();
auto xKernelPositions = cast<SpatApplyFiltersOp>(op).getXKernelPositionsAttr();
auto yKernelPositions = cast<SpatApplyFiltersOp>(op).getYKernelPositionsAttr();
Value bufferized = rewriter.create<pim::PimApplyFiltersOp>(op->getLoc(),
outputTensor.getType(),
weightIndices,
xKernelPositions,
yKernelPositions,
*inputBuffer,
outputTensor,
accumBuffer);
// Replace the operation with the bufferized value.
replaceOpWithBufferizedValues(rewriter, op, bufferized);
return success();
}
};
// Attaches all Spatial-dialect bufferization external models defined in this
// file. Must be called on the DialectRegistry before running One-Shot
// Bufferization over Spatial ops.
void registerBufferizableOpInterfaceExternalModels(DialectRegistry& registry) {
registry.addExtension(+[](MLIRContext* ctx, SpatialDialect* dialect) {
SpatWeightedCompute::attachInterface<WComputeOpInterface>(*ctx);
SpatVAddOp::attachInterface<VAddOpInterfaceFromTemplate>(*ctx);
SpatWeightedVMMOp::attachInterface<WVMMOpInterface>(*ctx);
SpatWeightedMVMOp::attachInterface<WMVMOpInterface>(*ctx);
SpatSumOp::attachInterface<SumOpInterface>(*ctx);
SpatVSDivOp::attachInterface<VSDivOpInterface>(*ctx);
SpatVMaxOp::attachInterface<VMaxOpInterface>(*ctx);
SpatChannelReceiveOp::attachInterface<ChannelReceiveOpInterface>(*ctx);
SpatChannelSendOp::attachInterface<ChannelSendOpInterface>(*ctx);
SpatChannelBroadcastReceiveOp::attachInterface<ChannelBroadcastReceiveOpInterface>(*ctx);
SpatChannelBroadcastSendOp::attachInterface<ChannelBroadcastSendOpInterface>(*ctx);
SpatApplyFiltersOp::attachInterface<ApplyFiltersOpInterface>(*ctx);
});
}
// ONNX element-wise ops that bufferize directly to PIM vector ops, reusing the
// variadic element-wise template above.
struct ONNXReluInterface : VariadicArgumentElementWiseOpInterface<ONNXReluInterface, ONNXReluOp, pim::PimVReluOp> {};
struct ONNXExpOpInterface : VariadicArgumentElementWiseOpInterface<ONNXExpOpInterface, ONNXExpOp, pim::PimVExpOp> {};
// Attaches the ONNX-dialect bufferization external models above. Must be
// called on the DialectRegistry before bufferizing the supported ONNX ops.
void registerONNXBufferizableOpInterfaceExternalModels(DialectRegistry& registry) {
registry.addExtension(+[](MLIRContext* ctx, ONNXDialect* dialect) {
ONNXReluOp::attachInterface<ONNXReluInterface>(*ctx);
ONNXExpOp::attachInterface<ONNXExpOpInterface>(*ctx);
});
}
} // namespace spatial
} // namespace onnx_mlir

View File

@@ -0,0 +1,16 @@
#pragma once
#include "mlir/IR/DialectRegistry.h"
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
// Fixed: removed `using namespace mlir;` — a using-directive at header scope
// leaks into every translation unit that includes this header. Types are now
// qualified explicitly instead.
namespace onnx_mlir {
namespace spatial {
/// Registers the Spatial-dialect bufferization external models with the given
/// registry (must run before One-Shot Bufferization of Spatial ops).
void registerBufferizableOpInterfaceExternalModels(mlir::DialectRegistry& registry);
/// Registers the ONNX-dialect bufferization external models (Relu, Exp) with
/// the given registry.
void registerONNXBufferizableOpInterfaceExternalModels(mlir::DialectRegistry& registry);
} // namespace spatial
} // namespace onnx_mlir