add PIM accelerator
This commit is contained in:
119
src/PIM/Conversion/ONNXToSpatial/Utils/AnnotateReplication.cpp
Normal file
119
src/PIM/Conversion/ONNXToSpatial/Utils/AnnotateReplication.cpp
Normal file
@@ -0,0 +1,119 @@
|
||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/AnnotateReplication.hpp"
|
||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||
|
||||
#include <queue>
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
/**
|
||||
* @brief Structure that describes the replication of a convolution operation,
|
||||
* along the image height axis.
|
||||
*/
|
||||
struct ConvReplication {
|
||||
ONNXConvOp convOp; // Convolution operation
|
||||
size_t input_w; // Width of the input image
|
||||
size_t replicationFactor; // Replication factor on the image height axis
|
||||
size_t coresNeededPerReplica; // Number of cores needed for each replica
|
||||
|
||||
friend bool operator<(const ConvReplication& a, const ConvReplication& b) {
|
||||
return a.input_w / a.replicationFactor < b.input_w / b.replicationFactor;
|
||||
}
|
||||
|
||||
ConvReplication(ONNXConvOp convOp, size_t input_w, size_t replicationFactor, size_t coresNeededPerReplica)
|
||||
: convOp(convOp),
|
||||
input_w(input_w),
|
||||
replicationFactor(replicationFactor),
|
||||
coresNeededPerReplica(coresNeededPerReplica) {}
|
||||
};
|
||||
|
||||
/**
 * @brief Annotates every convolution in @p funcOp with a replication factor
 * along the image height axis, greedily distributing the PIM cores that
 * remain once every layer has been assigned its minimum core budget.
 *
 * @param funcOp   Function whose first block is scanned for ONNXConvOp and
 *                 ONNXGemmOp operations.
 * @param rewriter Rewriter used to build the replication integer attribute.
 * @return failure() when the network needs more cores than are available,
 *         success() otherwise.
 */
LogicalResult annotateReplication(mlir::func::FuncOp funcOp, mlir::IRRewriter& rewriter) {

  if (coresCount == -1) {
    // No core budget was given: implicitly leave every replication at 1.
    return success();
  }

  std::priority_queue<ConvReplication> convOpsReplicationQueue;

  // Cores strictly required to map the network once (replication factor 1).
  size_t minimumCores = 0;

  for (auto& op : funcOp.getFunctionBody().begin()->getOperations()) {
    if (auto convOp = dyn_cast<ONNXConvOp>(op)) {
      // Convolution layer: one crossbar is needed per (kernel cell,
      // input tile, output tile) combination.

      Value X = convOp.getX(), W = convOp.getW();
      ShapedType xShape = mlir::cast<ShapedType>(X.getType());
      ShapedType wShape = mlir::cast<ShapedType>(W.getType());

      size_t input_w = GET_IMAGE_WIDTH(xShape);
      size_t krn_h = GET_KERNEL_HEIGHT(wShape);
      size_t krn_w = GET_KERNEL_WIDTH(wShape);

      size_t inputTileCount = ceilIntegerDivide(GET_IMAGE_CHANNEL(xShape), crossbarSize.getValue());
      size_t outputTileCount = ceilIntegerDivide(wShape.getDimSize(0), crossbarSize.getValue());

      auto neededXbars = krn_h * krn_w * inputTileCount * outputTileCount;
      auto neededCores = ceilIntegerDivide(neededXbars, crossbarCountInCore.getValue());

      minimumCores += neededCores;

      // Start every convolution at replication factor 1; extra replicas are
      // granted later from the leftover core budget.
      convOpsReplicationQueue.emplace(convOp, input_w, 1, neededCores);
    }
    else if (auto gemmOp = dyn_cast<ONNXGemmOp>(op)) {
      // Fully connected layer: contributes to the minimum core count only,
      // it is never replicated.
      auto matrixTensorShape = cast<ShapedType>(gemmOp.getB().getType());
      auto inputSize = matrixTensorShape.getDimSize(0);
      auto outputSize = matrixTensorShape.getDimSize(1);
      if (gemmOp.getTransB())
        std::swap(inputSize, outputSize);

      const size_t inputTilesCount = ceilIntegerDivide(inputSize, crossbarSize.getValue());
      const size_t outputTilesCount = ceilIntegerDivide(outputSize, crossbarSize.getValue());

      // Each output tile is computed by `coresPerOutputTile` cores. The
      // entire input is given to each of these cores.
      const size_t coresPerOutputTile = ceilIntegerDivide(inputTilesCount, crossbarCountInCore.getValue());

      auto neededCores = coresPerOutputTile * outputTilesCount;

      minimumCores += neededCores;
    }
  }

  if (static_cast<size_t>(coresCount) < minimumCores) {
    return funcOp->emitError("Not enough cores for this network: ")
        << minimumCores << " cores needed, but only " << static_cast<size_t>(coresCount) << " available.";
  }

  // Cores left over once every layer has its minimum allocation.
  size_t availableCores = static_cast<size_t>(coresCount) - minimumCores;

  // Greedily hand out extra replicas, always to the convolution currently
  // on top of the max-heap (largest per-replica width share).
  while (!convOpsReplicationQueue.empty()) {
    auto convOpReplication = convOpsReplicationQueue.top();
    convOpsReplicationQueue.pop();

    // One extra replica costs exactly `coresNeededPerReplica` cores.
    // BUGFIX: the previous guard compared `availableCores` against
    // coresNeededPerReplica * (replicationFactor + 1), even though
    // `availableCores` is already net of the replicas granted so far; that
    // double-counted the cost (and the strict `>` added an off-by-one),
    // leaving cores unused.
    if (availableCores >= convOpReplication.coresNeededPerReplica) {
      // We can replicate this convolution: pay for one replica, increment
      // replicationFactor and put it back in the queue with its new key.
      availableCores -= convOpReplication.coresNeededPerReplica;
      convOpReplication.replicationFactor++;

      convOpsReplicationQueue.push(convOpReplication);
    }
    else {
      // Cannot replicate this convolution anymore, annotate the operation
      // with the final replication factor.
      convOpReplication.convOp->setAttr(REPLICATION_ATTR_NAME,
          rewriter.getI64IntegerAttr(convOpReplication.replicationFactor));
    }
  }

  return success();
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
Reference in New Issue
Block a user