huge refactor for high RewritePatterns usage and less ad-hoc cpp code

remove Spatial many ops in favor of tensor ops like in pim
2026-05-12 10:35:44 +02:00
parent feaff820e1
commit 909c4acfdd
84 changed files with 4048 additions and 3310 deletions
@@ -0,0 +1,126 @@
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/IRMapping.h"
+
+#include "src/Accelerators/PIM/Common/PimCommon.hpp"
+#include "src/Accelerators/PIM/Compiler/PimBatchEmission.hpp"
+
+using namespace mlir;
+
+namespace onnx_mlir {
+namespace {
+
+/// Returns an owned copy of the core ids attached to `coreBatchOp`.
+///
+/// The ids are read from the dense i32 array attribute named by
+/// `kCoreIdsAttrName`. The attribute is mandatory: a missing (or differently
+/// typed) attribute trips the assertion below.
+static SmallVector<int32_t> getBatchCoreIds(pim::PimCoreBatchOp coreBatchOp) {
+  auto idsAttr = coreBatchOp->getAttrOfType<DenseI32ArrayAttr>(onnx_mlir::kCoreIdsAttrName);
+  assert(idsAttr && "pim.core_batch requires coreIds array attribute");
+  // Bind the attribute storage once, then copy it so the caller owns the ids.
+  ArrayRef<int32_t> storedIds = idsAttr.asArrayRef();
+  return SmallVector<int32_t>(storedIds.begin(), storedIds.end());
+}
+
+/// Selects the entries of `coreIds` that belong to `lane`.
+///
+/// `coreIds` is read as consecutive chunks of `laneCount` entries; the result
+/// holds element `lane` of every complete chunk, in chunk order. Trailing
+/// entries that do not fill a whole chunk are ignored.
+///
+/// \param coreIds   flat list of core ids, chunked by `laneCount`.
+/// \param laneCount number of lanes per chunk; zero yields an empty result.
+/// \param lane      lane to extract; must be smaller than `laneCount`.
+/// \returns one core id per complete chunk.
+static SmallVector<int32_t> getLaneChunkCoreIds(ArrayRef<int32_t> coreIds, size_t laneCount, unsigned lane) {
+  SmallVector<int32_t> laneCoreIds;
+  // Without lanes there is nothing to select; this also avoids dividing by zero.
+  if (laneCount == 0)
+    return laneCoreIds;
+  assert(lane < laneCount && "lane index must be smaller than the lane count");
+
+  const size_t chunkCount = coreIds.size() / laneCount;
+  laneCoreIds.reserve(chunkCount);
+  for (size_t chunkIndex = 0; chunkIndex < chunkCount; ++chunkIndex)
+    laneCoreIds.push_back(coreIds[chunkIndex * laneCount + lane]);
+  return laneCoreIds;
+}
+
+} // namespace
+
+/// Builds a scalar `pim.core` equivalent to one lane of `coreBatchOp` and hands
+/// it to `callback`.
+///
+/// The scalar core is created inside a scratch module owned by this function,
+/// so it is destroyed when the function returns; `callback` must not keep the
+/// op beyond its own invocation. The batch op itself is not modified.
+///
+/// Batched communication and host-to-device copy ops in the body are rewritten
+/// to their scalar counterparts using the core ids that belong to `lane`; every
+/// other op is cloned as-is with its operands remapped.
+///
+/// \param coreBatchOp batched core op to extract a lane from.
+/// \param lane        index of the lane; must be smaller than the lane count.
+/// \param callback    invoked with the scalar core; its result is returned.
+/// \returns failure (with a diagnostic on `coreBatchOp`) when the lane cannot be
+///          extracted, otherwise whatever `callback` returns.
+LogicalResult withScalarCoreFromBatchLane(pim::PimCoreBatchOp coreBatchOp,
+                                          unsigned lane,
+                                          llvm::function_ref<LogicalResult(pim::PimCoreOp)> callback) {
+  // Validate the lane up front: the indexing below divides by the lane count
+  // and uses `lane` to index weights and core ids.
+  size_t laneCount = static_cast<size_t>(coreBatchOp.getLaneCount());
+  if (laneCount == 0)
+    return coreBatchOp.emitOpError("has no lanes to extract a scalar core from");
+  if (lane >= laneCount)
+    return coreBatchOp.emitOpError("lane index ") << lane << " is out of range for " << laneCount << " lanes";
+
+  auto coreIds = getBatchCoreIds(coreBatchOp);
+  if (lane >= coreIds.size())
+    return coreBatchOp.emitOpError("has no core id for lane ") << lane;
+
+  OwningOpRef<ModuleOp> scratchModule = ModuleOp::create(coreBatchOp.getLoc());
+  OpBuilder builder(scratchModule->getContext());
+  builder.setInsertionPointToStart(scratchModule->getBody());
+
+  // Weights are taken as one contiguous group per lane.
+  size_t weightsPerLane = coreBatchOp.getWeights().size() / laneCount;
+  SmallVector<Value> laneWeights;
+  laneWeights.reserve(weightsPerLane);
+  for (size_t weightIndex = 0; weightIndex < weightsPerLane; ++weightIndex)
+    laneWeights.push_back(coreBatchOp.getWeights()[lane * weightsPerLane + weightIndex]);
+
+  auto scalarCore = pim::PimCoreOp::create(
+    builder, coreBatchOp.getLoc(), ValueRange(laneWeights), builder.getI32IntegerAttr(coreIds[lane]));
+  Block* block = builder.createBlock(&scalarCore.getBody(), scalarCore.getBody().end());
+
+  Block& batchBody = coreBatchOp.getBody().front();
+  IRMapping mapper;
+  // A single block argument is replaced by this lane's input. Bodies with any
+  // other argument count get no argument mapping here.
+  if (batchBody.getNumArguments() == 1)
+    mapper.map(batchBody.getArgument(0), coreBatchOp.getInputs()[lane]);
+
+  builder.setInsertionPointToEnd(block);
+  for (Operation& op : batchBody) {
+    if (isa<pim::PimHaltOp>(op)) {
+      pim::PimHaltOp::create(builder, op.getLoc());
+      continue;
+    }
+
+    if (auto sendBatchOp = dyn_cast<pim::PimSendBatchOp>(op)) {
+      pim::PimSendOp::create(builder,
+                             sendBatchOp.getLoc(),
+                             mapper.lookup(sendBatchOp.getInput()),
+                             sendBatchOp.getSizeAttr(),
+                             builder.getI32IntegerAttr(sendBatchOp.getTargetCoreIds()[lane]));
+      continue;
+    }
+
+    if (auto sendTensorBatchOp = dyn_cast<pim::PimSendTensorBatchOp>(op)) {
+      pim::PimSendTensorOp::create(
+        builder,
+        sendTensorBatchOp.getLoc(),
+        mapper.lookup(sendTensorBatchOp.getInput()),
+        builder.getDenseI32ArrayAttr(getLaneChunkCoreIds(sendTensorBatchOp.getTargetCoreIds(), laneCount, lane)));
+      continue;
+    }
+
+    if (auto receiveBatchOp = dyn_cast<pim::PimReceiveBatchOp>(op)) {
+      auto scalarReceive =
+        pim::PimReceiveOp::create(builder,
+                                  receiveBatchOp.getLoc(),
+                                  receiveBatchOp.getOutput().getType(),
+                                  mapper.lookup(receiveBatchOp.getOutputBuffer()),
+                                  receiveBatchOp.getSizeAttr(),
+                                  builder.getI32IntegerAttr(receiveBatchOp.getSourceCoreIds()[lane]));
+      mapper.map(receiveBatchOp.getOutput(), scalarReceive.getOutput());
+      continue;
+    }
+
+    if (auto receiveTensorBatchOp = dyn_cast<pim::PimReceiveTensorBatchOp>(op)) {
+      auto scalarReceive = pim::PimReceiveTensorOp::create(
+        builder,
+        receiveTensorBatchOp.getLoc(),
+        receiveTensorBatchOp.getOutput().getType(),
+        mapper.lookup(receiveTensorBatchOp.getOutputBuffer()),
+        builder.getDenseI32ArrayAttr(getLaneChunkCoreIds(receiveTensorBatchOp.getSourceCoreIds(), laneCount, lane)));
+      mapper.map(receiveTensorBatchOp.getOutput(), scalarReceive.getOutput());
+      continue;
+    }
+
+    if (auto memcpBatchOp = dyn_cast<pim::PimMemCopyHostToDevBatchOp>(op)) {
+      // The host source may have no entry in the mapping; in that case the
+      // original value is used unchanged.
+      Value hostSource = mapper.lookupOrDefault(memcpBatchOp.getHostSource());
+
+      auto scalarCopy = pim::PimMemCopyHostToDevOp::create(builder,
+                                                           memcpBatchOp.getLoc(),
+                                                           memcpBatchOp.getOutput().getType(),
+                                                           mapper.lookup(memcpBatchOp.getDeviceTarget()),
+                                                           hostSource,
+                                                           memcpBatchOp.getDeviceTargetOffsetAttr(),
+                                                           memcpBatchOp.getHostSourceOffsetAttr(),
+                                                           memcpBatchOp.getSizeAttr());
+      mapper.map(memcpBatchOp.getOutput(), scalarCopy.getOutput());
+      continue;
+    }
+
+    // Cloning through the mapper remaps operands and already records the
+    // original-result -> cloned-result mapping, so no manual bookkeeping is
+    // needed afterwards.
+    builder.clone(op, mapper);
+  }
+
+  // Make sure the scalar body ends with a halt even if the batch body had none.
+  if (block->empty() || !isa<pim::PimHaltOp>(block->back()))
+    pim::PimHaltOp::create(builder, coreBatchOp.getLoc());
+  return callback(scalarCore);
+}
+
+} // namespace onnx_mlir