refactorone

2026-05-20 19:06:41 +02:00
parent f56c4159b5
commit a50e77ff38
50 changed files with 3420 additions and 1187 deletions
@@ -1,7 +1,11 @@
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/IRMapping.h"

+#include "llvm/ADT/StringRef.h"
+
 #include "src/Accelerators/PIM/Common/PimCommon.hpp"
 #include "src/Accelerators/PIM/Compiler/PimBatchEmission.hpp"

@@ -24,113 +28,147 @@ static SmallVector<int32_t> getLaneChunkCoreIds(ArrayRef<int32_t> coreIds, size_
  return laneCoreIds;
 }

-static void scalarizeBatchOpsInCore(pim::PimCoreOp scalarCore, size_t laneCount, unsigned lane) {
-  IRRewriter rewriter(scalarCore.getContext());
-  SmallVector<Operation*> batchOps;
-  scalarCore.walk([&](Operation* op) {
-    if (isa<pim::PimSendBatchOp,
-            pim::PimSendTensorBatchOp,
-            pim::PimReceiveBatchOp,
-            pim::PimReceiveTensorBatchOp,
-            pim::PimMemCopyHostToDevBatchOp>(op)) {
-      batchOps.push_back(op);
-    }
-  });
+// Clones the body of `coreBatchOp` for one `lane` at `builder`'s insertion point. Batch send/receive/copy
+// ops are re-created as their scalar counterparts for that lane, every other op is cloned as-is, and
+// PimHaltOp is skipped so that several lane bodies can be emitted back to back into the same block.
+static void cloneScalarizedLaneBody(OpBuilder& builder,
+                                    pim::PimCoreBatchOp coreBatchOp,
+                                    unsigned lane,
+                                    OperationFolder& constantFolder) {
+  Block& oldBlock = coreBatchOp.getBody().front();
+  size_t laneCount = static_cast<size_t>(coreBatchOp.getLaneCount());
+  size_t weightCount = coreBatchOp.getWeights().size();

-  for (Operation* op : batchOps) {
-    rewriter.setInsertionPoint(op);
+  IRMapping mapper;
+  // Bind the batch body's block arguments: index-typed ones to the value getOrCreateHostIndexConstant
+  // returns for `lane`, the others to the weight or input operand at the matching position.
+  for (auto [argIndex, blockArg] : llvm::enumerate(oldBlock.getArguments())) {
+    if (blockArg.getType().isIndex()) {
+      mapper.map(blockArg, getOrCreateHostIndexConstant(coreBatchOp, static_cast<int64_t>(lane), constantFolder));
+      continue;
+    }
+
+    // Non-index arguments are addressed as [weights..., inputs...] starting at position 1, so position 0
+    // would make `argIndex - 1` wrap around.
+    assert(argIndex > 0 && "non-index pim.core_batch block argument cannot be at position 0");
+    if (argIndex <= weightCount) {
+      mapper.map(blockArg, coreBatchOp.getWeights()[argIndex - 1]);
+      continue;
+    }
+
+    size_t inputIndex = argIndex - 1 - weightCount;
+    assert(inputIndex < coreBatchOp.getInputs().size() && "pim.core_batch block input index out of range");
+    mapper.map(blockArg, coreBatchOp.getInputs()[inputIndex]);
+  }
+
+  for (Operation& op : oldBlock) {
+    if (isa<pim::PimHaltOp>(op))
+      continue;

    if (auto sendBatchOp = dyn_cast<pim::PimSendBatchOp>(op)) {
-      pim::PimSendOp::create(rewriter,
-                             sendBatchOp.getLoc(),
-                             sendBatchOp.getInput(),
-                             sendBatchOp.getSizeAttr(),
-                             rewriter.getI32IntegerAttr(sendBatchOp.getTargetCoreIds()[lane]));
-      rewriter.eraseOp(op);
+      Operation* anchorOp = builder.getInsertionBlock()->getParentOp();
+      pim::PimSendOp::create(
+        builder,
+        sendBatchOp.getLoc(),
+        mapper.lookup(sendBatchOp.getInput()),
+        sendBatchOp.getSizeAttr(),
+        getOrCreateHostIndexConstant(anchorOp, sendBatchOp.getTargetCoreIds()[lane], constantFolder));
      continue;
    }

    if (auto sendTensorBatchOp = dyn_cast<pim::PimSendTensorBatchOp>(op)) {
      pim::PimSendTensorOp::create(
-        rewriter,
+        builder,
        sendTensorBatchOp.getLoc(),
-        sendTensorBatchOp.getInput(),
-        rewriter.getDenseI32ArrayAttr(getLaneChunkCoreIds(sendTensorBatchOp.getTargetCoreIds(), laneCount, lane)));
-      rewriter.eraseOp(op);
+        mapper.lookup(sendTensorBatchOp.getInput()),
+        builder.getDenseI32ArrayAttr(getLaneChunkCoreIds(sendTensorBatchOp.getTargetCoreIds(), laneCount, lane)));
      continue;
    }

    if (auto receiveBatchOp = dyn_cast<pim::PimReceiveBatchOp>(op)) {
-      auto scalarReceive =
-        pim::PimReceiveOp::create(rewriter,
-                                  receiveBatchOp.getLoc(),
-                                  receiveBatchOp.getOutput().getType(),
-                                  receiveBatchOp.getOutputBuffer(),
-                                  receiveBatchOp.getSizeAttr(),
-                                  rewriter.getI32IntegerAttr(receiveBatchOp.getSourceCoreIds()[lane]));
-      rewriter.replaceOp(op, scalarReceive->getResults());
+      Operation* anchorOp = builder.getInsertionBlock()->getParentOp();
+      auto scalarReceive = pim::PimReceiveOp::create(
+        builder,
+        receiveBatchOp.getLoc(),
+        receiveBatchOp.getOutput().getType(),
+        mapper.lookup(receiveBatchOp.getOutputBuffer()),
+        receiveBatchOp.getSizeAttr(),
+        getOrCreateHostIndexConstant(anchorOp, receiveBatchOp.getSourceCoreIds()[lane], constantFolder));
+      mapper.map(receiveBatchOp.getOutput(), scalarReceive.getOutput());
      continue;
    }

    if (auto receiveTensorBatchOp = dyn_cast<pim::PimReceiveTensorBatchOp>(op)) {
      auto scalarReceive = pim::PimReceiveTensorOp::create(
-        rewriter,
+        builder,
        receiveTensorBatchOp.getLoc(),
        receiveTensorBatchOp.getOutput().getType(),
-        receiveTensorBatchOp.getOutputBuffer(),
-        rewriter.getDenseI32ArrayAttr(getLaneChunkCoreIds(receiveTensorBatchOp.getSourceCoreIds(), laneCount, lane)));
-      rewriter.replaceOp(op, scalarReceive->getResults());
+        mapper.lookup(receiveTensorBatchOp.getOutputBuffer()),
+        builder.getDenseI32ArrayAttr(getLaneChunkCoreIds(receiveTensorBatchOp.getSourceCoreIds(), laneCount, lane)));
+      mapper.map(receiveTensorBatchOp.getOutput(), scalarReceive.getOutput());
      continue;
    }

-    auto memcpBatchOp = cast<pim::PimMemCopyHostToDevBatchOp>(op);
-    auto scalarCopy = pim::PimMemCopyHostToDevOp::create(rewriter,
-                                                         memcpBatchOp.getLoc(),
-                                                         memcpBatchOp.getOutput().getType(),
-                                                         memcpBatchOp.getDeviceTarget(),
-                                                         memcpBatchOp.getHostSource(),
-                                                         memcpBatchOp.getDeviceTargetOffsetAttr(),
-                                                         memcpBatchOp.getHostSourceOffsetAttr(),
-                                                         memcpBatchOp.getSizeAttr());
-    rewriter.replaceOp(op, scalarCopy->getResults());
+    if (auto memcpBatchOp = dyn_cast<pim::PimMemCopyHostToDevBatchOp>(op)) {
+      // NOTE(review): the offset constants are anchored on coreBatchOp, while send/receive above anchor on
+      // the insertion block's parent op; confirm the difference is intended.
+      auto scalarCopy = pim::PimMemCopyHostToDevOp::create(
+        builder,
+        memcpBatchOp.getLoc(),
+        memcpBatchOp.getOutput().getType(),
+        getOrCreateHostIndexConstant(coreBatchOp, memcpBatchOp.getDeviceTargetOffset(), constantFolder),
+        getOrCreateHostIndexConstant(coreBatchOp, memcpBatchOp.getHostSourceOffset(), constantFolder),
+        mapper.lookup(memcpBatchOp.getDeviceTarget()),
+        mapper.lookup(memcpBatchOp.getHostSource()),
+        memcpBatchOp.getSizeAttr());
+      mapper.map(memcpBatchOp.getOutput(), scalarCopy.getOutput());
+      continue;
+    }
+
+    // OpBuilder::clone already records the original->cloned result mapping in `mapper`.
+    builder.clone(op, mapper);
  }
 }

 } // namespace

-LogicalResult withScalarCoreFromBatchLane(pim::PimCoreBatchOp coreBatchOp,
-                                          unsigned lane,
-                                          llvm::function_ref<LogicalResult(pim::PimCoreOp)> callback) {
+// Builds a pim::PimCoreOp inside a scratch module, fills it with the scalarized bodies of `lanes` (in the
+// given order), makes sure its block ends with a PimHaltOp, and returns the result of `callback` invoked
+// on that core op. `lanes` must be non-empty and all of its lanes must share one core id. The scratch
+// module is held by a local OwningOpRef, so the core op should not be retained after this function returns.
+LogicalResult withScalarCoreFromBatchLanes(pim::PimCoreBatchOp coreBatchOp,
+                                           ArrayRef<unsigned> lanes,
+                                           llvm::function_ref<LogicalResult(pim::PimCoreOp)> callback) {
+  assert(!lanes.empty() && "expected at least one batch lane");
+
  OwningOpRef<ModuleOp> scratchModule = ModuleOp::create(coreBatchOp.getLoc());
  OpBuilder builder(scratchModule->getContext());
+  OperationFolder constantFolder(scratchModule->getContext());
  builder.setInsertionPointToStart(scratchModule->getBody());

-  size_t laneCount = static_cast<size_t>(coreBatchOp.getLaneCount());
-  size_t weightsPerLane = coreBatchOp.getWeights().size() / laneCount;
-  SmallVector<Value> laneWeights;
-  laneWeights.reserve(weightsPerLane);
-  for (size_t weightIndex = 0; weightIndex < weightsPerLane; ++weightIndex)
-    laneWeights.push_back(coreBatchOp.getWeights()[lane * weightsPerLane + weightIndex]);
-
+  SmallVector<Value> weights(coreBatchOp.getWeights().begin(), coreBatchOp.getWeights().end());
  auto coreIds = getBatchCoreIds(coreBatchOp);
-  auto scalarCore = pim::PimCoreOp::create(
-    builder, coreBatchOp.getLoc(), ValueRange(laneWeights), builder.getI32IntegerAttr(coreIds[lane]));
+  int32_t coreId = coreIds[lanes.front()];
+  // Checked inside a single assert so no loop (or unused loop variable) is left behind in NDEBUG builds.
+  assert(llvm::all_of(lanes, [&](unsigned lane) { return coreIds[lane] == coreId; })
+         && "all grouped lanes must target the same core");
+
+  auto scalarCore =
+    pim::PimCoreOp::create(builder, coreBatchOp.getLoc(), ValueRange(weights), builder.getI32IntegerAttr(coreId));
  Block* block = builder.createBlock(&scalarCore.getBody(), scalarCore.getBody().end());
-  IRMapping mapper;
-  if (coreBatchOp.getBody().front().getNumArguments() == 1)
-    mapper.map(coreBatchOp.getBody().front().getArgument(0), coreBatchOp.getInputs()[lane]);
-
  builder.setInsertionPointToEnd(block);
-  for (Operation& op : coreBatchOp.getBody().front()) {
-    Operation* cloned = builder.clone(op, mapper);
-    for (auto [originalResult, clonedResult] : llvm::zip(op.getResults(), cloned->getResults()))
-      mapper.map(originalResult, clonedResult);
-  }
-
+  for (unsigned lane : lanes)
+    cloneScalarizedLaneBody(builder, coreBatchOp, lane, constantFolder);
  if (block->empty() || !isa<pim::PimHaltOp>(block->back()))
    pim::PimHaltOp::create(builder, coreBatchOp.getLoc());
-  scalarizeBatchOpsInCore(scalarCore, laneCount, lane);
  return callback(scalarCore);
 }

+// Single-lane convenience wrapper around withScalarCoreFromBatchLanes.
+LogicalResult withScalarCoreFromBatchLane(pim::PimCoreBatchOp coreBatchOp,
+                                          unsigned lane,
+                                          llvm::function_ref<LogicalResult(pim::PimCoreOp)> callback) {
+  return withScalarCoreFromBatchLanes(coreBatchOp, ArrayRef<unsigned> {lane}, callback);
+}
+
 } // namespace onnx_mlir