remove dead logic

2026-05-19 12:23:01 +02:00
parent e263e05f56
commit a103ba328b
4 changed files with 21 additions and 81 deletions
@@ -9,7 +9,6 @@

 #include <cstddef>
 #include <cstdint>
-#include <functional>
 #include <optional>
 #include <utility>

@@ -54,7 +53,6 @@ public:
    replaceExternalUses();
    if (failed(eraseOldScheduledOps()))
      return failure();
-    moveExternalUsersBeforeReturn();
    return success();
  }

@@ -97,18 +95,6 @@ private:
         | static_cast<uint32_t>(channelInfo.targetCoreId);
  }

-  void collectExternalUsers(Operation* op) {
-    if (!externalUsersToMove.insert(op).second)
-      return;
-    for (Value result : op->getResults()) {
-      for (Operation* user : result.getUsers()) {
-        if (oldComputeOps.contains(user) || isa<func::ReturnOp>(user))
-          continue;
-        collectExternalUsers(user);
-      }
-    }
-  }
-
  void collectScheduledTasks() {
    for (ComputeInstance scheduledInstance : schedule->dominanceOrderCompute) {
      oldComputeOps.insert(scheduledInstance.op);
@@ -151,25 +137,22 @@ private:
        auto& remoteInputs = remoteInputsByTask[task.computeInstance];
        remoteInputs.resize(taskInputs.size());
        for (auto [inputIndex, input] : llvm::enumerate(taskInputs)) {
-          auto producerRef = getProducerValueRef(input);
-          if (producerRef) {
+          if (auto producerRef = getProducerValueRef(input)) {
            auto producerIt = taskByComputeInstance.find(producerRef->instance);
-            if (producerIt != taskByComputeInstance.end()) {
-              if (producerIt->second.cpu != cpu) {
-                ChannelInfo info {
-                  (*nextChannelId)++,
-                  static_cast<int32_t>(producerIt->second.cpu),
-                  static_cast<int32_t>(cpu),
-                };
-                remoteInputs[inputIndex] = info;
-                auto& perResultChannels = remoteSendsByTask[producerRef->instance];
-                if (perResultChannels.empty())
-                  perResultChannels.resize(getComputeInstanceOutputTypes(producerIt->second.computeInstance).size());
-                perResultChannels[producerRef->resultIndex].push_back(
-                  {info, task.computeInstance, inputIndex, task.orderWithinCpu, 0});
-              }
-              continue;
+            if (producerIt->second.cpu != cpu) {
+              ChannelInfo info {
+                (*nextChannelId)++,
+                static_cast<int32_t>(producerIt->second.cpu),
+                static_cast<int32_t>(cpu),
+              };
+              remoteInputs[inputIndex] = info;
+              auto& perResultChannels = remoteSendsByTask[producerRef->instance];
+              if (perResultChannels.empty())
+                perResultChannels.resize(getComputeInstanceOutputTypes(producerIt->second.computeInstance).size());
+              perResultChannels[producerRef->resultIndex].push_back(
+                {info, task.computeInstance, inputIndex, task.orderWithinCpu, 0});
            }
+            continue;
          }
          if (seenExternalInputsByCpu[cpu].insert(input).second)
            cpuExternalInputs[cpu].push_back(input);
@@ -183,8 +166,6 @@ private:
            if (oldComputeOps.contains(useOwner))
              continue;
            hasExternalUser = true;
-            if (!isa<func::ReturnOp>(useOwner))
-              collectExternalUsers(useOwner);
          }
          if (hasExternalUser)
            cpuExternalOutputs[cpu].push_back({task.computeInstance, resultIndex});
@@ -407,7 +388,8 @@ private:
              if (producerIt->second.cpu == cpu) {
                auto producedIt = producedValuesByTask.find(producerRef->instance);
                if (producedIt == producedValuesByTask.end() || producedIt->second.size() <= producerRef->resultIndex) {
-                  task.computeInstance.op->emitOpError("missing local producer value during per-cpu merge materialization")
+                  task.computeInstance.op->emitOpError(
+                    "missing local producer value during per-cpu merge materialization")
                    << " consumerCpu=" << cpu << " producerCpu=" << producerIt->second.cpu
                    << " producerLaneStart=" << producerRef->instance.laneStart
                    << " producerLaneCount=" << producerRef->instance.laneCount;
@@ -586,18 +568,6 @@ private:
    return success();
  }

-  void moveExternalUsersBeforeReturn() {
-    SmallVector<Operation*> orderedUsersToMove;
-    for (Operation& op : func.getBody().front()) {
-      if (&op == returnOp.getOperation())
-        break;
-      if (externalUsersToMove.contains(&op))
-        orderedUsersToMove.push_back(&op);
-    }
-    for (Operation* op : orderedUsersToMove)
-      op->moveBefore(returnOp);
-  }
-
  func::FuncOp func;
  const MergeScheduleResult* schedule = nullptr;
  int64_t* nextChannelId = nullptr;
@@ -610,7 +580,6 @@ private:
  DenseMap<size_t, SmallVector<ScheduledTask>> tasksByCpu;
  SmallVector<size_t> orderedCpus;
  DenseSet<size_t> seenCpus;
-  DenseSet<Operation*> externalUsersToMove;
  DenseMap<ComputeInstance, SmallVector<SmallVector<RemoteSendInfo>>> remoteSendsByTask;
  DenseMap<ComputeInstance, SmallVector<std::optional<ChannelInfo>>> remoteInputsByTask;
  DenseMap<size_t, SmallVector<Value>> cpuExternalInputs;
@@ -13,7 +13,6 @@
 #include "mlir/Support/LLVM.h"

 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
@@ -28,9 +27,7 @@
 #include <cstdint>
 #include <cstdlib>
 #include <fstream>
-#include <functional>
 #include <iterator>
-#include <limits>
 #include <memory>
 #include <optional>
 #include <tuple>
@@ -39,13 +36,11 @@

 #include "MaterializeMergeSchedule.hpp"
 #include "PostMergeCompaction.hpp"
-#include "RegularOpCompaction.hpp"
 #include "Scheduling/ComputeInstanceUtils.hpp"
 #include "Scheduling/MergeSchedulingAnalysis.hpp"
 #include "src/Accelerators/PIM/Common/IR/CompactAsmUtils.hpp"
 #include "src/Accelerators/PIM/Common/PimCommon.hpp"
 #include "src/Accelerators/PIM/Common/Support/ReportUtils.hpp"
-#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
 #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"

 using namespace mlir;
@@ -53,10 +48,8 @@ using namespace mlir;
 namespace onnx_mlir {
 namespace {
 using namespace onnx_mlir::compact_asm;
-using ProducerValueRef = spatial::ProducerValueRef;
 using SpatCompute = spatial::SpatCompute;
 using SpatComputeBatch = spatial::SpatComputeBatch;
-using spatial::getOriginalSpatCompute;
 using spatial::getProducerValueRef;

 bool isMergeProfilingEnabled() { return std::getenv("RAPTOR_PROFILE_MERGE") != nullptr; }
@@ -303,7 +296,7 @@ void emitMotifProfile(func::FuncOp funcOp) {
    }

    for (Value input : compute.getInputs()) {
-      auto parent = getOriginalSpatCompute(input.getDefiningOp());
+      auto parent = input.getDefiningOp<SpatCompute>(); // null-safe: block arguments have no defining op
      if (!parent || parent == compute)
        continue;
      auto parentIt = computeToIndex.find(parent);
@@ -22,7 +22,7 @@ size_t getBatchChunkTargetCount(int32_t laneCount) {
 }

 ComputeInstance getBatchChunkForIndex(SpatComputeBatch batch, size_t chunkIndex) {
-  size_t totalLanes = static_cast<size_t>(batch.getLaneCount());
+  size_t totalLanes = batch.getLaneCount();
  size_t chunkCount = getBatchChunkTargetCount(batch.getLaneCount());
  size_t baseChunkSize = totalLanes / chunkCount;
  size_t largeChunkCount = totalLanes % chunkCount;
@@ -33,7 +33,7 @@ ComputeInstance getBatchChunkForIndex(SpatComputeBatch batch, size_t chunkIndex)
 }

 ComputeInstance getBatchChunkForLane(SpatComputeBatch batch, uint32_t lane) {
-  size_t totalLanes = static_cast<size_t>(batch.getLaneCount());
+  size_t totalLanes = batch.getLaneCount();
  size_t chunkCount = getBatchChunkTargetCount(batch.getLaneCount());
  size_t baseChunkSize = totalLanes / chunkCount;
  size_t largeChunkCount = totalLanes % chunkCount;
@@ -47,32 +47,11 @@ ComputeInstance getBatchChunkForLane(SpatComputeBatch batch, uint32_t lane) {
  return getBatchChunkForIndex(batch, chunkIndex);
 }

-SpatCompute getOriginalSpatCompute(Operation *op) {
-  if (!op)
-    return {};
-
-  while (auto extract = dyn_cast<tensor::ExtractSliceOp>(op)) {
-    op = extract.getSource().getDefiningOp();
-    if (!op)
-      return {};
-  }
-
-  return dyn_cast<SpatCompute>(op);
-}
-
 std::optional<ProducerValueRef> getProducerValueRef(Value value) {
  Operation *op = value.getDefiningOp();
  if (!op)
    return std::nullopt;

-  //TODO Extract Slice is not the only global non compute operation. There are other legal op
-  while (auto extract = dyn_cast<tensor::ExtractSliceOp>(op)) {
-    value = extract.getSource();
-    op = value.getDefiningOp();
-    if (!op)
-      return std::nullopt;
-  }
-
  if (auto compute = dyn_cast<SpatCompute>(op)) {
    return ProducerValueRef {
      ComputeInstance {compute.getOperation(), 0, 1},
@@ -81,9 +60,9 @@ std::optional<ProducerValueRef> getProducerValueRef(Value value) {
  }

  if (auto batch = dyn_cast<SpatComputeBatch>(op)) {
-    uint32_t lane = static_cast<uint32_t>(cast<OpResult>(value).getResultNumber());
+    uint32_t lane = cast<OpResult>(value).getResultNumber();
    ComputeInstance instance = getBatchChunkForLane(batch, lane);
-    size_t resultIndex = static_cast<size_t>(lane - instance.laneStart);
+    size_t resultIndex = lane - instance.laneStart;
    return ProducerValueRef {instance, resultIndex};
  }

@@ -26,7 +26,6 @@ size_t getBatchChunkTargetCount(int32_t laneCount);
 ComputeInstance getBatchChunkForIndex(SpatComputeBatch batch, size_t chunkIndex);
 ComputeInstance getBatchChunkForLane(SpatComputeBatch batch, uint32_t lane);

-SpatCompute getOriginalSpatCompute(mlir::Operation *op);
 std::optional<ProducerValueRef> getProducerValueRef(mlir::Value value);
 std::optional<ComputeInstance> getComputeProducerInstance(mlir::Value value);