From 85e2750d6c197b307cc8c4b43261dcb874f3e21e Mon Sep 17 00:00:00 2001
From: NiccoloN <niccolo.nicolosi@gmail.com>
Date: Tue, 21 Apr 2026 12:33:44 +0200
Subject: [PATCH] faster (and refactored) DCP analysis

---
 README.md                                     |    6 +-
 .../SpatialToPim/SpatialToPimPass.cpp         |    6 +-
 src/PIM/Dialect/Spatial/CMakeLists.txt        |    2 +
 .../DCPGraph/DCPAnalysis.cpp                  |   28 +-
 .../DCPGraph/DCPAnalysis.hpp                  |    9 +-
 .../MergeComputeNodes/DCPGraph/Graph.cpp      | 1489 ++++++++++++-----
 .../MergeComputeNodes/DCPGraph/Graph.hpp      |  123 +-
 .../MergeComputeNodes/DCPGraph/GraphDebug.cpp |  152 ++
 .../MergeComputeNodes/DCPGraph/GraphDebug.hpp |   57 +
 .../DCPGraph/GraphSupport.cpp                 |  105 ++
 .../DCPGraph/GraphSupport.hpp                 |   41 +
 .../MergeComputeNodes/DCPGraph/Task.cpp       |   62 +-
 .../MergeComputeNodes/DCPGraph/Task.hpp       |  111 +-
 .../DCPGraph/UniqueWorklist.hpp               |   57 +-
 .../MergeComputeNodes/DCPGraph/Utils.hpp      |  120 +-
 .../MergeComputeNodesPass.cpp                 |  112 +-
 test/PIM/CMakeLists.txt                       |   11 +-
 test/PIM/DCPTest.cpp                          |  528 ++++++
 test/PIM/LabeledListTest.cpp                  |  162 ++
 test/PIM/TestPIM.cpp                          |  202 ---
 20 files changed, 2525 insertions(+), 858 deletions(-)
 create mode 100644 src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphDebug.cpp
 create mode 100644 src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphDebug.hpp
 create mode 100644 src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphSupport.cpp
 create mode 100644 src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphSupport.hpp
 create mode 100644 test/PIM/DCPTest.cpp
 create mode 100644 test/PIM/LabeledListTest.cpp
 delete mode 100644 test/PIM/TestPIM.cpp

diff --git a/README.md b/README.md
index 3f9ac1b..3d26525 100644
--- a/README.md
+++ b/README.md
@@ -31,8 +31,7 @@ Moreover, if compiling with build type debug, it is also suggested to use
 mold as linker (you will need to install it if you don't have it already)
 to reduce memory usage during linking. You can use it by setting the options:
 ```
--DCMAKE_EXE_LINKER_FLAGS="-fuse-ld=mold" \
--DCMAKE_SHARED_LINKER_FLAGS="-fuse-ld=mold"
+-DLLVM_USE_LINKER=mold
 ```
 
 ### Raptor
@@ -45,7 +44,8 @@ Also in this case, it is suggested to use mold as linker to reduce link time and
 setting the options:
 ```
 -DCMAKE_EXE_LINKER_FLAGS="-fuse-ld=mold" \
--DCMAKE_SHARED_LINKER_FLAGS="-fuse-ld=mold"
+-DCMAKE_SHARED_LINKER_FLAGS="-fuse-ld=mold" \
+-DCMAKE_MODULE_LINKER_FLAGS="-fuse-ld=mold"
 ```
 
 ```
diff --git a/src/PIM/Conversion/SpatialToPim/SpatialToPimPass.cpp b/src/PIM/Conversion/SpatialToPim/SpatialToPimPass.cpp
index 867b207..3621336 100644
--- a/src/PIM/Conversion/SpatialToPim/SpatialToPimPass.cpp
+++ b/src/PIM/Conversion/SpatialToPim/SpatialToPimPass.cpp
@@ -661,9 +661,8 @@ void SpatialToPimPass::annotateChannelCoreIds(func::FuncOp funcOp) {
         broadcastSendOp = op;
         continue;
       }
-      if (auto op = dyn_cast<spatial::SpatChannelBroadcastReceiveOp>(user)) {
+      if (auto op = dyn_cast<spatial::SpatChannelBroadcastReceiveOp>(user))
         continue;
-      }
       llvm_unreachable("Unexpected user of spat.channel_new during Spatial-to-PIM lowering");
     }
 
@@ -719,7 +718,8 @@ void SpatialToPimPass::lowerBroadcastChannelOps(func::FuncOp funcOp, IRRewriter&
     auto sizeAttr = getTensorSizeInBytesAttr(rewriter, receiveOp.getResult());
     auto sourceCoreIdAttr = getSpatialChannelSourceCoreIdAttr(rewriter, receiveOp.getChannel());
     Value receivedValue =
-      PimReceiveOp::create(rewriter, receiveOp.getLoc(), outputBuffer.getType(), outputBuffer, sizeAttr, sourceCoreIdAttr)
+      PimReceiveOp::create(
+        rewriter, receiveOp.getLoc(), outputBuffer.getType(), outputBuffer, sizeAttr, sourceCoreIdAttr)
         .getOutput();
     rewriter.replaceOp(receiveOp, receivedValue);
   }
diff --git a/src/PIM/Dialect/Spatial/CMakeLists.txt b/src/PIM/Dialect/Spatial/CMakeLists.txt
index 286abd7..641f011 100644
--- a/src/PIM/Dialect/Spatial/CMakeLists.txt
+++ b/src/PIM/Dialect/Spatial/CMakeLists.txt
@@ -5,6 +5,8 @@ add_pim_library(SpatialOps
   SpatialOps.cpp
   Transforms/MergeComputeNodes/MergeComputeNodesPass.cpp
   Transforms/MergeComputeNodes/DCPGraph/Graph.cpp
+  Transforms/MergeComputeNodes/DCPGraph/GraphDebug.cpp
+  Transforms/MergeComputeNodes/DCPGraph/GraphSupport.cpp
   Transforms/MergeComputeNodes/DCPGraph/Task.cpp
   Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.cpp
 
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.cpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.cpp
index d5006d7..0c0c5a7 100644
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.cpp
+++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.cpp
@@ -17,7 +17,7 @@ namespace spatial {
 
 using namespace mlir;
 
-SpatWeightedCompute getOriginalSpatWeightCompute(Operation* op) {
+SpatWeightedCompute getOriginalSpatWeightedCompute(Operation* op) {
   if (!op)
     return {};
   while (auto extract = llvm::dyn_cast<tensor::ExtractSliceOp>(op)) {
@@ -30,32 +30,32 @@ SpatWeightedCompute getOriginalSpatWeightCompute(Operation* op) {
   return {};
 }
 
-DCPAnalysisResult DCPAnalysis::runAnalysis() {
-  using EdgesIndex = std::tuple<int64_t, int64_t, int64_t>;
+DCPAnalysisResult DCPAnalysis::run() {
   llvm::SmallVector<SpatWeightedCompute, 10> spatWeightedComputes;
-  llvm::SmallVector<EdgesIndex, 10> edges;
-  for (auto& regions : entryOp->getRegions())
-    for (SpatWeightedCompute spatWeightedCompute : regions.getOps<SpatWeightedCompute>())
+  llvm::SmallVector<IndexedEdge, 10> edges;
+  for (auto& region : entryOp->getRegions())
+    for (SpatWeightedCompute spatWeightedCompute : region.getOps<SpatWeightedCompute>())
       spatWeightedComputes.push_back(spatWeightedCompute);
 
   for (auto [indexEndEdge, spatWeightedCompute] : llvm::enumerate(spatWeightedComputes)) {
     for (Value input : spatWeightedCompute.getInputs()) {
-      if (auto spatWeightedComputeArgOp = getOriginalSpatWeightCompute(input.getDefiningOp())) {
-        auto elemIter = llvm::find(spatWeightedComputes, spatWeightedComputeArgOp);
-        assert(elemIter != spatWeightedComputes.end());
-        auto indexStartEdge = std::distance(spatWeightedComputes.begin(), elemIter);
-        ResultRange outputs = spatWeightedComputeArgOp.getResults();
+      if (auto producerCompute = getOriginalSpatWeightedCompute(input.getDefiningOp())) {
+        auto producerIt = llvm::find(spatWeightedComputes, producerCompute);
+        assert(producerIt != spatWeightedComputes.end());
+        auto indexStartEdge = std::distance(spatWeightedComputes.begin(), producerIt);
+        ResultRange outputs = producerCompute.getResults();
         int64_t totalSize = 0;
         for (auto output : outputs) {
-          ShapedType result = cast<ShapedType>(output.getType());
-          totalSize += getSizeInBytes(result);
+          ShapedType resultType = cast<ShapedType>(output.getType());
+          totalSize += getSizeInBytes(resultType);
         }
         edges.push_back({indexStartEdge, indexEndEdge, totalSize});
       }
     }
   }
   GraphDCP graphDCP(spatWeightedComputes, edges);
-  graphDCP.DCP();
+  graphDCP.setContext(entryOp->getContext());
+  graphDCP.runDcp();
   return graphDCP.getResult();
 }
 
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.hpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.hpp
index f7426cc..472e51f 100644
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.hpp
+++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.hpp
@@ -3,6 +3,7 @@
 #include "mlir/IR/Operation.h"
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
 
 #include <vector>
 
@@ -10,8 +11,8 @@
 
 struct DCPAnalysisResult {
   std::vector<onnx_mlir::spatial::SpatWeightedCompute> dominanceOrderCompute;
-  llvm::DenseMap<onnx_mlir::spatial::SpatWeightedCompute, size_t> computeToCPUMap;
-  llvm::DenseSet<onnx_mlir::spatial::SpatWeightedCompute> isLastComputeOfACpu;
+  llvm::DenseMap<onnx_mlir::spatial::SpatWeightedCompute, size_t> computeToCpuMap;
+  llvm::DenseSet<onnx_mlir::spatial::SpatWeightedCompute> isLastComputeOfCpu;
   llvm::DenseMap<size_t, onnx_mlir::spatial::SpatWeightedCompute> cpuToLastComputeMap;
 };
 
@@ -21,12 +22,12 @@ struct DCPAnalysis {
 private:
   DCPAnalysisResult result;
   mlir::Operation* entryOp;
-  DCPAnalysisResult runAnalysis();
+  DCPAnalysisResult run();
 
 public:
   DCPAnalysis(mlir::Operation* op)
   : entryOp(op) {
-    result = runAnalysis();
+    result = run();
   }
   DCPAnalysisResult& getResult() { return result; }
 };
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.cpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.cpp
index 8a45713..0056071 100644
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.cpp
+++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.cpp
@@ -1,105 +1,152 @@
+//===----------------------------------------------------------------------===//
+// DCP-inspired task scheduler.
+//
+// Input: a DAG of compute tasks. Each task has an execution weight; each edge
+// carries an inter-task transfer cost that only applies when producer and
+// consumer land on different CPUs.
+//
+// Output: an assignment of every task to a CPU and an order within that CPU,
+// aiming to minimise the overall critical-path length (DCPL).
+//
+// Every task keeps two timing estimates:
+//   AEST - earliest start time, driven by parent completions + transfers.
+//   ALST - latest start time that still keeps the task on the critical path.
+// A task is "critical" when its slack (ALST - AEST) is zero.
+//
+// Main loop (runDcp):
+//   1. Build a topological order and seed AEST/ALST from the unscheduled DAG.
+//   2. While there are ready tasks (all dependency parents scheduled):
+//        a. Pick the candidate with tightest slack (earliest AEST breaks ties).
+//        b. selectProcessor() tries every candidate CPU and picks the one that
+//           minimises a composite cost (own slot + smallest unscheduled child).
+//        c. Commit the placement and refresh AEST/ALST.
+//        d. Release any child whose dependency parents are now all scheduled.
+//
+// Heuristic notes: classic DCP assumes identical task costs on every CPU and
+// a single-issue processor model. We diverge - crossbar capacity can make a
+// task infeasible on a CPU, and placement happens incrementally. That makes
+// this a heuristic rather than a faithful DCP implementation.
+//
+// Parallelism: selectProcessor's per-CPU findSlot sweep is read-only, so we
+// run it concurrently across CPUs via mlir::parallelFor. The subsequent
+// sequential evaluation benefits from ordering CPUs by ascending slot.aest,
+// which tightens the bestComposite early-prune bound.
+//===----------------------------------------------------------------------===//
+
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/Threading.h"
+
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 
 #include <algorithm>
-#include <array>
 #include <cassert>
-#include <climits>
-#include <cstddef>
-#include <deque>
-#include <fstream>
-#include <iterator>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
 #include <vector>
 
 #include "DCPAnalysis.hpp"
 #include "Graph.hpp"
+#include "GraphDebug.hpp"
+#include "GraphSupport.hpp"
 #include "Task.hpp"
 #include "UniqueWorklist.hpp"
 #include "Utils.hpp"
 #include "src/Accelerators/PIM/Common/PimCommon.hpp"
+#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
 
-std::optional<DoubleEdge> addEdge(TaskDCP* parent, TaskDCP* child, Weight_t weight) {
-  auto old_child = parent->addChild(child, weight);
-  auto old_parent = child->addParent(parent, weight);
-  assert(old_child.has_value() == old_parent.has_value() && "The edge must be present in both element");
-  if (old_child.has_value()) {
+#ifdef DCP_DEBUG_ENABLED
+namespace {
+// Coarse-grained phase timers printed when DCP_SELECT_PROFILE is set.
+struct SelectTimers {
+  double findSlot = 0.0;
+  double precheck = 0.0;
+  double snapshotInsertUpdate = 0.0;
+  double childSlot = 0.0;
+  double rollbackRestore = 0.0;
+  long iterations = 0;
+  long passedPrecheck = 0;
+  long passedDcpl = 0;
+  long tasksProcessed = 0;
+  void dump(const char* label) const {
+    std::fprintf(stderr,
+                 "[selectProfile:%s] tasks=%ld findSlot=%.2fs precheck=%.2fs snapUpd=%.2fs childSlot=%.2fs rollback=%.2fs iter=%ld precheckPass=%ld dcplPass=%ld\n",
+                 label, tasksProcessed, findSlot, precheck, snapshotInsertUpdate, childSlot,
+                 rollbackRestore, iterations, passedPrecheck, passedDcpl);
+  }
+  ~SelectTimers() {
+    if (std::getenv("DCP_SELECT_PROFILE"))
+      dump("exit");
+  }
+};
+static SelectTimers gSelectTimers;
+} // namespace
+#endif
 
+//===----------------------------------------------------------------------===//
+// Edge manipulation
+//===----------------------------------------------------------------------===//
+
+std::optional<EdgePair> addEdge(TaskDCP* parent, TaskDCP* child, Weight weight, bool isScheduling) {
+  auto oldChild = parent->addChild(child, weight, isScheduling);
+  auto oldParent = child->addParent(parent, weight, isScheduling);
+  assert(oldChild.has_value() == oldParent.has_value() && "The edge must be present in both element");
+  if (oldChild.has_value()) {
     return {
-      {*old_parent, *old_child}
+      {*oldParent, *oldChild}
     };
   }
   return {};
 }
 
-void removeEdge(TaskDCP* parent, TaskDCP* child) {
-  parent->removeChild(child);
-  child->removeParent(parent);
+void removeEdge(TaskDCP* parent, TaskDCP* child, bool isScheduling) {
+  parent->removeChild(child, isScheduling);
+  child->removeParent(parent, isScheduling);
 }
 
-int getTranferCost(TaskDCP* parent, TaskDCP* child) {
-  if (parent->scheduledCPU.has_value() && child->scheduledCPU.has_value()
-      && *parent->scheduledCPU == *child->scheduledCPU) {
+// A dependency edge may appear multiple times (e.g. from separate data inputs);
+// the transfer cost is the maximum across those parallel edges. Cost is zero
+// when both endpoints share a CPU.
+Weight getTransferCost(TaskDCP* parent, TaskDCP* child) {
+  if (parent->scheduledCpu.has_value() && child->scheduledCpu.has_value()
+      && *parent->scheduledCpu == *child->scheduledCpu)
     return 0;
-  }
-  auto child_position =
-    std::find_if(parent->childs.begin(), parent->childs.end(), [&child](Edge_t elem) { return elem.first == child; });
-  assert(child_position != parent->childs.end());
-  return child_position->second;
+  Weight maxTransferCost = 0;
+  bool foundTransferCost = false;
+  for (const auto& edge : parent->children)
+    if (edge.first == child && !edge.isScheduling) {
+      maxTransferCost = std::max(maxTransferCost, edge.second);
+      foundTransferCost = true;
+    }
+  assert(foundTransferCost && "missing transfer cost for dependency edge");
+  return maxTransferCost;
 }
 
+//===----------------------------------------------------------------------===//
+// Indexing and CPU task lists
+//===----------------------------------------------------------------------===//
+
 size_t GraphDCP::getNodeIndex(const TaskDCP* task) const {
   assert(task >= nodes.data() && task < nodes.data() + nodes.size() && "task must belong to graph");
   return static_cast<size_t>(task - nodes.data());
 }
 
-TaskInsertion GraphDCP::insertTaskInCPU(CPU cpu, TaskDCP* task, size_t position) {
-  TaskInsertion ret;
-  task->setCPU(cpu);
-  task->setWeight(task->computeWeight(this, cpu));
-  auto& tasksInCPU = mapCPUTasks[cpu];
-  unsigned int numCPUTask = tasksInCPU.size();
-  assert(position <= numCPUTask && "Inserting in a not valid position");
-  auto insertedPoint = tasksInCPU.insert(std::next(tasksInCPU.begin(), position), task);
-  ret.oldTopologicalPosition = std::next(task->getTopologicalPosition());
-  ret.cpuModified = cpu;
-  ret.taskInserted = task;
-  ret.graph = this;
-
-  if (insertedPoint != tasksInCPU.begin()) {
-    auto precedentPoint = std::prev(insertedPoint, 1);
-    auto oldEdge = addEdge(*precedentPoint, *insertedPoint, 0);
-    ret.beforeNode = oldEdge;
-
-    if (*task < **insertedPoint)
-      topologicalMoveAfter(task, *precedentPoint);
-  }
-
-  if (std::next(insertedPoint) != tasksInCPU.end()) {
-    auto nextPoint = std::next(insertedPoint, 1);
-    auto oldEdge = addEdge(*insertedPoint, *nextPoint, 0);
-    ret.afterNode = oldEdge;
-    if (**insertedPoint < *task)
-      topologicalMoveBefore(task, *nextPoint);
-  }
-  return ret;
+GraphDCP::CpuTaskList& GraphDCP::getOrCreateCpuTasks(CPU cpu) {
+  assert(cpu >= 0 && "cpu id must be non-negative");
+  size_t cpuIndex = static_cast<size_t>(cpu);
+  if (cpuTasks.size() <= cpuIndex)
+    cpuTasks.resize(cpuIndex + 1);
+  return cpuTasks[cpuIndex];
 }
 
-void GraphDCP::removeTaskFromCPU(CPU cpu, TaskDCP* task) {
-  task->resetCPU();
-  task->resetWeight();
-  auto& list = mapCPUTasks[cpu];
-  auto task_position = std::find(list.begin(), list.end(), task);
-  assert(task_position != list.end() && "Removing a not present task");
-  if (task_position != list.begin()) {
-    auto precedent_point = std::prev(task_position, 1);
-    removeEdge(*precedent_point, *task_position);
-  }
-
-  if (std::next(task_position) != list.end()) {
-    auto next_point = std::next(task_position, 1);
-    removeEdge(*task_position, *next_point);
-  }
-  list.erase(task_position);
+const GraphDCP::CpuTaskList* GraphDCP::findCpuTasks(CPU cpu) const {
+  if (cpu < 0)
+    return nullptr;
+  size_t cpuIndex = static_cast<size_t>(cpu);
+  if (cpuIndex >= cpuTasks.size())
+    return nullptr;
+  return &cpuTasks[cpuIndex];
 }
 
 std::vector<TaskDCP*> GraphDCP::getRoots() {
@@ -110,153 +157,495 @@ std::vector<TaskDCP*> GraphDCP::getRoots() {
   return tmp;
 }
 
-void GraphDCP::initAEST() {
-  UniqueWorkList<std::deque<TaskDCP*>> worklists(getRoots());
+// Inserts `task` at `position` on `cpu`, wiring up scheduling edges with the
+// neighbouring tasks and keeping the global topological order consistent.
+// Returns a TaskInsertion recording the CPU, the task and any edges addEdge replaced.
+TaskInsertion GraphDCP::insertTaskInCPU(CPU cpu, TaskDCP* task, size_t position) {
+  TaskInsertion ret;
+  Weight scheduledWeight = task->computeWeightOnCpu(this, cpu);
+  task->setCpu(cpu);
+  task->setWeight(scheduledWeight);
+  reserveTaskCrossbars(cpu, task);
+  auto& tasksInCpu = getOrCreateCpuTasks(cpu);
+  assert(position <= tasksInCpu.size() && "Inserting in a not valid position");
+  auto insertedPoint = tasksInCpu.insert(std::next(tasksInCpu.begin(), position), task);
+  ret.cpuModified = cpu;
+  ret.taskInserted = task;
+  ret.graph = this;
 
+  // If we split an existing neighbour-neighbour scheduling edge, drop it; the
+  // two new edges below recreate the ordering with `task` in between.
+  if (insertedPoint != tasksInCpu.begin() && std::next(insertedPoint) != tasksInCpu.end()) {
+    auto precedentPoint = std::prev(insertedPoint, 1);
+    auto nextPoint = std::next(insertedPoint, 1);
+    removeEdge(*precedentPoint, *nextPoint, true);
+  }
+
+  if (insertedPoint != tasksInCpu.begin()) {
+    auto precedentPoint = std::prev(insertedPoint, 1);
+    auto oldEdge = addEdge(*precedentPoint, *insertedPoint, 0, true);
+    ret.beforeNode = oldEdge;
+
+    if (*task < **precedentPoint)
+      topologicalMoveAfter(task, *precedentPoint, &ret);
+  }
+
+  if (std::next(insertedPoint) != tasksInCpu.end()) {
+    auto nextPoint = std::next(insertedPoint, 1);
+    auto oldEdge = addEdge(*insertedPoint, *nextPoint, 0, true);
+    ret.afterNode = oldEdge;
+    if (**nextPoint < *task)
+      topologicalMoveBefore(task, *nextPoint, &ret);
+  }
+  return ret;
+}
+
+void GraphDCP::removeTaskFromCPU(CPU cpu, TaskDCP* task) {
+  releaseTaskCrossbars(cpu, task);
+  task->resetCpu();
+  task->resetWeight();
+  auto& scheduledTasks = getOrCreateCpuTasks(cpu);
+  auto taskPosition = std::find(scheduledTasks.begin(), scheduledTasks.end(), task);
+  assert(taskPosition != scheduledTasks.end() && "Removing a not present task");
+  TaskDCP* previousTask = nullptr;
+  TaskDCP* nextTask = nullptr;
+  if (taskPosition != scheduledTasks.begin()) {
+    auto previousPoint = std::prev(taskPosition, 1);
+    previousTask = *previousPoint;
+    removeEdge(*previousPoint, *taskPosition, true);
+  }
+
+  if (std::next(taskPosition) != scheduledTasks.end()) {
+    auto nextPoint = std::next(taskPosition, 1);
+    nextTask = *nextPoint;
+    removeEdge(*taskPosition, *nextPoint, true);
+  }
+  if (previousTask != nullptr && nextTask != nullptr)
+    addEdge(previousTask, nextTask, 0, true);
+  scheduledTasks.erase(taskPosition);
+}
+
+//===----------------------------------------------------------------------===//
+// Crossbar capacity bookkeeping
+//===----------------------------------------------------------------------===//
+
+CrossbarUsage GraphDCP::getCpuCrossbarUsage(CPU cpu) const {
+  auto it = cpuCrossbarUsage.find(cpu);
+  if (it == cpuCrossbarUsage.end())
+    return 0;
+  return it->second;
+}
+
+CrossbarUsage GraphDCP::getCpuCrossbarCapacity() const {
+  assert(onnx_mlir::crossbarSize.getValue() > 0 && "crossbar-size must be strictly positive");
+  assert(onnx_mlir::crossbarCountInCore.getValue() > 0 && "crossbar-count must be strictly positive");
+  CrossbarUsage crossbarEdge = static_cast<CrossbarUsage>(onnx_mlir::crossbarSize.getValue());
+  CrossbarUsage crossbarArea = checkedMultiply(crossbarEdge, crossbarEdge);
+  return checkedMultiply(static_cast<CrossbarUsage>(onnx_mlir::crossbarCountInCore.getValue()), crossbarArea);
+}
+
+CrossbarUsage GraphDCP::getTaskCrossbarFootprint(const TaskDCP* task) const {
+  CrossbarUsage crossbarCount = task->getCrossbarUsage();
+  if (crossbarCount == 0)
+    return 0;
+  CrossbarUsage crossbarEdge = static_cast<CrossbarUsage>(onnx_mlir::crossbarSize.getValue());
+  CrossbarUsage crossbarArea = checkedMultiply(crossbarEdge, crossbarEdge);
+  return checkedMultiply(crossbarCount, crossbarArea);
+}
+
+void GraphDCP::reserveTaskCrossbars(CPU cpu, const TaskDCP* task) {
+  cpuCrossbarUsage[cpu] = checkedAdd(getCpuCrossbarUsage(cpu), getTaskCrossbarFootprint(task));
+}
+
+void GraphDCP::releaseTaskCrossbars(CPU cpu, const TaskDCP* task) {
+  CrossbarUsage footprint = getTaskCrossbarFootprint(task);
+  CrossbarUsage currentUsage = getCpuCrossbarUsage(cpu);
+  assert(currentUsage >= footprint && "crossbar usage underflow");
+  cpuCrossbarUsage[cpu] = currentUsage - footprint;
+}
+
+bool GraphDCP::wouldExhaustCrossbarCapacity(CPU cpu, const TaskDCP* task) const {
+  CrossbarUsage footprint = getTaskCrossbarFootprint(task);
+  if (footprint == 0)
+    return false;
+  CrossbarUsage nextUsage = checkedAdd(getCpuCrossbarUsage(cpu), footprint);
+  return nextUsage >= getCpuCrossbarCapacity();
+}
+
+//===----------------------------------------------------------------------===//
+// AEST / ALST computation
+//===----------------------------------------------------------------------===//
+
+// Walks the topological order once and fills AEST from parent completions,
+// while tracking the top two completion times so DCPL updates can avoid a
+// second pass. secondMaxCompletion lets us invalidate `maxCompletionTask`
+// locally when its AEST moves.
+void GraphDCP::initAest() {
   auto& worklist = topologicalOrder;
-  int max_dcpl = 0;
+  Time maxDcpl = 0;
+  Time secondMaxCompletionCandidate = 0;
+  TaskDCP* maxCompletionTaskCandidate = nullptr;
   for (auto& task : worklist) {
-    int max_parent_aest = 0;
-    for (Edge_t parentEdge : task.parents) {
-      max_parent_aest =
-        std::max(parentEdge.first->getAEST() + parentEdge.first->getWeight() + getTranferCost(parentEdge.first, &task),
-                 max_parent_aest);
+    Time maxParentAest = 0;
+    for (Edge parentEdge : task.parents) {
+      maxParentAest = std::max(addOrMax(addOrMax(parentEdge.first->getAest(), parentEdge.first->getWeight()),
+                                        getTransferCost(parentEdge.first, &task)),
+                               maxParentAest);
     }
-    task.setAEST(max_parent_aest);
-    max_dcpl = std::max(max_parent_aest + task.getWeight(), max_dcpl);
-  }
-  DCPL = max_dcpl;
-}
-
-int GraphDCP::computeAEST(TaskDCP* task, CPU cpu) {
-  int max_parent_aest = 0;
-  for (Edge_t parentEdge : task->parents) {
-    int transfer_cost = 0;
-    if (!(parentEdge.first->isScheduled() && cpu == *parentEdge.first->getCPU()))
-      transfer_cost = getTranferCost(parentEdge.first, task);
-    max_parent_aest =
-      std::max(parentEdge.first->getAEST() + parentEdge.first->getWeight() + transfer_cost, max_parent_aest);
-  }
-  return max_parent_aest;
-}
-
-int GraphDCP::computeDCPL(TaskDCP* task, CPU cpu) {
-  int max_aest = 0;
-  for (auto& node : nodes)
-    if (&node != task)
-      max_aest = std::max(node.getAEST() + node.getWeight(), max_aest);
-    else
-      max_aest = std::max(computeAEST(task, cpu) + node.computeWeight(this, cpu), max_aest);
-  return max_aest;
-}
-
-void GraphDCP::initALST() {
-  int dcpl = getDCPL();
-  auto& worklists = topologicalOrder;
-
-  for (TaskDCP& node : llvm::reverse(worklists)) {
-    int min_alst = INT_MAX;
-    if (!node.hasChilds())
-      min_alst = dcpl - node.getWeight();
-    for (Edge_t childEdge : node.childs)
-      min_alst =
-        std::min(min_alst, childEdge.first->getALST() - node.getWeight() - getTranferCost(&node, childEdge.first));
-    node.setALST(min_alst);
-  }
-}
-
-llvm::DenseMap<TaskDCP*, int> GraphDCP::computeALST(TaskDCP* task, CPU cpu) {
-  int dcpl = computeDCPL(task, cpu);
-  llvm::DenseMap<TaskDCP*, int> temp_ALST;
-
-  auto& worklists = topologicalOrder;
-  for (TaskDCP& node : llvm::reverse(worklists)) {
-    int min_alst = INT_MAX;
-    if (!node.hasChilds()) {
-      if (&node != task)
-        min_alst = dcpl - node.getWeight();
-      else
-        min_alst = dcpl - node.computeWeight(this, cpu);
+    task.setAest(maxParentAest);
+    Time completion = addOrMax(maxParentAest, task.getWeight());
+    if (completion >= maxDcpl) {
+      secondMaxCompletionCandidate = maxDcpl;
+      maxDcpl = completion;
+      maxCompletionTaskCandidate = &task;
     }
-
-    for (Edge_t childEdge : node.childs) {
-      int transfer_cost = getTranferCost(&node, childEdge.first);
-      if (&node == task && childEdge.first->isScheduled() && cpu == *childEdge.first->getCPU())
-        transfer_cost = 0;
-      min_alst = std::min(min_alst, temp_ALST[childEdge.first] - node.getWeight() - transfer_cost);
+    else if (completion > secondMaxCompletionCandidate) {
+      secondMaxCompletionCandidate = completion;
     }
-    temp_ALST[&node] = min_alst;
   }
-  return temp_ALST;
+  dcpl = maxDcpl;
+  maxCompletion = maxDcpl;
+  secondMaxCompletion = secondMaxCompletionCandidate;
+  maxCompletionTask = maxCompletionTaskCandidate;
 }
 
-TaskDCP* GraphDCP::findCandidate(std::vector<TaskDCP*> nodes) {
-  auto hasNoCPParentUnsecheduled = [](TaskDCP* node) {
-    return std::all_of(
-      node->parents.begin(), node->parents.end(), [](Edge_t element) { return element.first->isScheduled() == true; });
+// Backward counterpart of initAest: walks the reverse topological order,
+// seeding ALST from scheduleDcpl on leaves (tasks without children).
+void GraphDCP::initAlst() {
+  Time scheduleDcpl = getDcpl();
+  auto& worklist = topologicalOrder;
+
+  for (TaskDCP& node : llvm::reverse(worklist)) {
+    Time minAlst = std::numeric_limits<Time>::max();
+    if (!node.hasChildren())
+      minAlst = subtractOrZero(scheduleDcpl, node.getWeight());
+    for (Edge childEdge : node.children)
+      minAlst = std::min(minAlst,
+                         subtractOrZero(childEdge.first->getAlst(),
+                                        addOrMax(node.getWeight(), getTransferCost(&node, childEdge.first))));
+    node.setAlst(minAlst);
+  }
+}
+
+Time GraphDCP::computeAestOnCpu(TaskDCP* task, CPU cpu) {
+  Time maxParentAest = 0;
+  for (Edge parentEdge : task->parents) {
+    Weight transferCost = 0;
+    if (!(parentEdge.first->isScheduled() && cpu == *parentEdge.first->getCpu()))
+      transferCost = getTransferCost(parentEdge.first, task);
+    maxParentAest = std::max(
+      addOrMax(addOrMax(parentEdge.first->getAest(), parentEdge.first->getWeight()), transferCost), maxParentAest);
+  }
+  return maxParentAest;
+}
+
+// DCPL if `task` were placed on `cpu`: pre-update max (excluding `task` when
+// it currently holds it) vs. the task's new completion on this CPU.
+Time GraphDCP::computeDcplOnCpu(TaskDCP* task, CPU cpu) {
+  Time candidateCompletion = addOrMax(computeAestOnCpu(task, cpu), task->computeWeightOnCpu(this, cpu));
+  Time unchangedMaxCompletion = maxCompletionTask == task ? secondMaxCompletion : maxCompletion;
+  return std::max(unchangedMaxCompletion, candidateCompletion);
+}
+
+Time GraphDCP::computeTaskAlstOnCpu(TaskDCP* task, CPU cpu, Time scheduleDcpl) {
+  Weight weight = task->computeWeightOnCpu(this, cpu);
+  if (!task->hasChildren())
+    return subtractOrZero(scheduleDcpl, weight);
+
+  Time minAlst = std::numeric_limits<Time>::max();
+  for (const Edge& childEdge : task->children) {
+    Weight transferCost = getTransferCost(task, childEdge.first);
+    if (childEdge.first->isScheduled() && cpu == *childEdge.first->getCpu())
+      transferCost = 0;
+    minAlst = std::min(minAlst, subtractOrZero(childEdge.first->getAlst(), addOrMax(weight, transferCost)));
+  }
+  return minAlst;
+}
+
+void GraphDCP::updateAestFromTask(TaskDCP* task) {
+  llvm::DenseSet<TaskDCP*> descendants = dcp_graph::collectReachableTasks(task, false);
+  updateAestFromTaskWithDescendants(task, descendants);
+}
+
+// Recomputes AEST for `task` and its descendants, then refreshes dcpl and the top-two completion cache.
+void GraphDCP::updateAestFromTaskWithDescendants(TaskDCP* task, const llvm::DenseSet<TaskDCP*>& descendants) {
+  Time modifiedMaxCompletion = 0;
+  TaskDCP* modifiedMaxTask = nullptr;
+  Time modifiedSecondMaxCompletion = 0;
+
+  auto considerCompletion = [&](TaskDCP* currentTask) {
+    Time completion = addOrMax(currentTask->getAest(), currentTask->getWeight());
+    if (completion >= modifiedMaxCompletion) {
+      modifiedSecondMaxCompletion = modifiedMaxCompletion;
+      modifiedMaxCompletion = completion;
+      modifiedMaxTask = currentTask;
+    }
+    else if (completion > modifiedSecondMaxCompletion)
+      modifiedSecondMaxCompletion = completion;
   };
 
-  auto findBestNode = [](auto lft, auto rgt) {
-    int lft_difference = (*lft)->getALST() - (*lft)->getAEST();
-    int rgt_difference = (*rgt)->getALST() - (*rgt)->getAEST();
-    if (lft_difference < rgt_difference)
-      return lft;
-    if (rgt_difference < lft_difference)
-      return rgt;
-    if ((*lft)->getAEST() < (*rgt)->getAEST())
-      return lft;
-    return rgt;
-  };
-
-  auto valid_node = std::find_if(nodes.begin(), nodes.end(), hasNoCPParentUnsecheduled);
-  auto best_node = valid_node;
-
-  while (valid_node != nodes.end()) {
-    if (!hasNoCPParentUnsecheduled(*valid_node)) {
-      std::advance(valid_node, 1);
+  for (auto it = task->getTopologicalIterator(); it != topologicalOrder.end(); ++it) {
+    TaskDCP* currentTask = &*it;
+    if (currentTask != task && !descendants.contains(currentTask))
       continue;
+
+    Time maxParentAest = 0;
+    for (const Edge& parentEdge : currentTask->parents) {
+      maxParentAest = std::max(addOrMax(addOrMax(parentEdge.first->getAest(), parentEdge.first->getWeight()),
+                                        getTransferCost(parentEdge.first, currentTask)),
+                               maxParentAest);
     }
-    best_node = findBestNode(valid_node, best_node);
-    std::advance(valid_node, 1);
+    currentTask->setAest(maxParentAest);
+    considerCompletion(currentTask);
+  }
+
+  // AEST only grows for modified tasks, so the new DCPL is either the old one
+  // (from an unmodified task) or the new max among modified tasks.
+  bool oldMaxInvalidated =
+    maxCompletionTask != nullptr && (maxCompletionTask == task || descendants.contains(maxCompletionTask));
+  if (oldMaxInvalidated) {
+    dcpl = modifiedMaxCompletion;
+    maxCompletion = modifiedMaxCompletion;
+    maxCompletionTask = modifiedMaxTask;
+    secondMaxCompletion = modifiedSecondMaxCompletion;
+  }
+  else {
+    if (modifiedMaxCompletion > maxCompletion) {
+      // The runner-up is the larger of the displaced max and the modified runner-up.
+      secondMaxCompletion = std::max(maxCompletion, modifiedSecondMaxCompletion);
+      maxCompletion = modifiedMaxCompletion;
+      maxCompletionTask = modifiedMaxTask;
+    }
+    else if (modifiedMaxCompletion > secondMaxCompletion)
+      secondMaxCompletion = modifiedMaxCompletion;
+    dcpl = maxCompletion;
   }
-  return *best_node;
 }
 
+// Same as the DenseSet overload, but expects descendants already sorted in
+// topological order (avoids an extra traversal of the whole topo list on hot
+// paths). Recomputes AEST for `task`, then for each listed descendant in order.
+void GraphDCP::updateAestFromTaskWithDescendants(TaskDCP* task, llvm::ArrayRef<TaskDCP*> descendantsTopoOrder) {
+  Time modifiedMaxCompletion = 0;
+  TaskDCP* modifiedMaxTask = nullptr;
+  Time modifiedSecondMaxCompletion = 0;
+  // Set during the pass below when the task behind the cached max is recomputed.
+  bool oldMaxInvalidated = false;
+
+  auto considerCompletion = [&](TaskDCP* currentTask) {
+    Time completion = addOrMax(currentTask->getAest(), currentTask->getWeight());
+    if (completion >= modifiedMaxCompletion) {
+      modifiedSecondMaxCompletion = modifiedMaxCompletion;
+      modifiedMaxCompletion = completion;
+      modifiedMaxTask = currentTask;
+    }
+    else if (completion > modifiedSecondMaxCompletion) {
+      modifiedSecondMaxCompletion = completion;
+    }
+  };
+
+  auto recomputeAest = [&](TaskDCP* currentTask) {
+    if (currentTask == maxCompletionTask)
+      oldMaxInvalidated = true;
+    Time maxParentAest = 0;
+    for (const Edge& parentEdge : currentTask->parents) {
+      maxParentAest = std::max(addOrMax(addOrMax(parentEdge.first->getAest(), parentEdge.first->getWeight()),
+                                        getTransferCost(parentEdge.first, currentTask)),
+                               maxParentAest);
+    }
+    currentTask->setAest(maxParentAest);
+    considerCompletion(currentTask);
+  };
+
+  recomputeAest(task);
+  for (TaskDCP* descendant : descendantsTopoOrder)
+    recomputeAest(descendant);
+
+  if (oldMaxInvalidated) {
+    // The pre-update max came from a modified task; its completion has moved
+    // upward, so modifiedMaxCompletion is an upper bound covering it. The
+    // modified set's runner-up is only a local view and may under-estimate the
+    // real one; the conservative use of secondMaxCompletion in
+    // computeDcplOnCpu still yields a correct upper bound on DCPL.
+    dcpl = modifiedMaxCompletion;
+    maxCompletion = modifiedMaxCompletion;
+    maxCompletionTask = modifiedMaxTask;
+    secondMaxCompletion = modifiedSecondMaxCompletion;
+  }
+  else {
+    if (modifiedMaxCompletion > maxCompletion) {
+      secondMaxCompletion = maxCompletion;
+      maxCompletion = modifiedMaxCompletion;
+      maxCompletionTask = modifiedMaxTask;
+    }
+    else if (modifiedMaxCompletion > secondMaxCompletion) {
+      secondMaxCompletion = modifiedMaxCompletion;
+    }
+    dcpl = maxCompletion;
+  }
+}
+
+// Recomputes AEST for `task` and then `descendantsTopoOrder`, aborting as soon
+// as a recomputed completion exceeds `dcplBudget`. On abort the AESTs written so
+// far stay in place (the caller's snapshot restores them) and the cached DCPL
+// statistics are left untouched. Returns true iff every task stayed within budget.
+bool GraphDCP::tryUpdateAestWithinBudget(TaskDCP* task,
+                                         llvm::ArrayRef<TaskDCP*> descendantsTopoOrder,
+                                         Time dcplBudget) {
+  Time modifiedMaxCompletion = 0;
+  TaskDCP* modifiedMaxTask = nullptr;
+  Time modifiedSecondMaxCompletion = 0;
+  bool oldMaxInvalidated = false;  // set when the pass recomputes maxCompletionTask
+
+  auto process = [&](TaskDCP* currentTask) {
+    if (currentTask == maxCompletionTask)
+      oldMaxInvalidated = true;
+    Time maxParentAest = 0;
+    for (const Edge& parentEdge : currentTask->parents) {
+      maxParentAest = std::max(addOrMax(addOrMax(parentEdge.first->getAest(), parentEdge.first->getWeight()),
+                                        getTransferCost(parentEdge.first, currentTask)),
+                               maxParentAest);
+    }
+    currentTask->setAest(maxParentAest);
+    Time completion = addOrMax(maxParentAest, currentTask->getWeight());
+    if (completion > dcplBudget)
+      return false;
+    if (completion >= modifiedMaxCompletion) {
+      modifiedSecondMaxCompletion = modifiedMaxCompletion;
+      modifiedMaxCompletion = completion;
+      modifiedMaxTask = currentTask;
+    }
+    else if (completion > modifiedSecondMaxCompletion) {
+      modifiedSecondMaxCompletion = completion;
+    }
+    return true;
+  };
+
+  if (!process(task))
+    return false;
+  for (TaskDCP* descendant : descendantsTopoOrder)
+    if (!process(descendant))
+      return false;
+
+  if (oldMaxInvalidated) {
+    dcpl = modifiedMaxCompletion;
+    maxCompletion = modifiedMaxCompletion;
+    maxCompletionTask = modifiedMaxTask;
+    secondMaxCompletion = modifiedSecondMaxCompletion;
+  }
+  else {
+    if (modifiedMaxCompletion > maxCompletion) {
+      secondMaxCompletion = maxCompletion;
+      maxCompletion = modifiedMaxCompletion;
+      maxCompletionTask = modifiedMaxTask;
+    }
+    else if (modifiedMaxCompletion > secondMaxCompletion) {
+      secondMaxCompletion = modifiedMaxCompletion;
+    }
+    dcpl = maxCompletion;
+  }
+  return true;
+}
+
+// Recomputes ALST locally for a tentative placement of `task` on `cpu`: only
+// `task` and its ancestors (relations.ancestors) receive a fresh value, while
+// every other task contributes its stored ALST. A task is processed once all
+// of its affected children are done, tracked with a per-task counter.
+llvm::DenseMap<TaskDCP*, Time> GraphDCP::computeAlst(TaskDCP* task, CPU cpu, const CandidateRelations& relations) {
+  const Time scheduleDcpl = computeDcplOnCpu(task, cpu);
+
+  llvm::DenseSet<TaskDCP*> affected = relations.ancestors;
+  affected.insert(task);
+  // Seed the stack with the affected tasks that have no affected child.
+  llvm::DenseMap<TaskDCP*, int> unresolvedChildren;
+  std::vector<TaskDCP*> ready;
+  ready.reserve(affected.size());
+  for (TaskDCP* affectedTask : affected) {
+    int count = 0;
+    for (const Edge& edge : affectedTask->children)
+      count += affected.contains(edge.first) ? 1 : 0;
+    unresolvedChildren[affectedTask] = count;
+    if (count == 0)
+      ready.push_back(affectedTask);
+  }
+
+  llvm::DenseMap<TaskDCP*, Time> result;
+  while (!ready.empty()) {
+    TaskDCP* current = ready.back();
+    ready.pop_back();
+    Time latestStart = std::numeric_limits<Time>::max();
+    if (!current->hasChildren()) {
+      // Leaves are anchored to the schedule DCPL; the candidate uses its weight on `cpu`.
+      if (current == task)
+        latestStart = subtractOrZero(scheduleDcpl, current->computeWeightOnCpu(this, cpu));
+      else
+        latestStart = subtractOrZero(scheduleDcpl, current->getWeight());
+    }
+    for (const Edge& edge : current->children) {
+      TaskDCP* child = edge.first;
+      Weight transferCost = getTransferCost(current, child);
+      if (current == task && child->isScheduled() && cpu == *child->getCpu())
+        transferCost = 0;
+      const Time childAlst = affected.contains(child) ? result[child] : child->getAlst();
+      latestStart = std::min(latestStart, subtractOrZero(childAlst, addOrMax(current->getWeight(), transferCost)));
+    }
+    result[current] = latestStart;
+
+    // Release affected parents whose affected children are now all resolved.
+    for (const Edge& edge : current->parents) {
+      if (!affected.contains(edge.first))
+        continue;
+      const int remaining = --unresolvedChildren[edge.first];
+      assert(remaining >= 0 && "affected child count must stay non-negative");
+      if (remaining == 0)
+        ready.push_back(edge.first);
+    }
+  }
+  return result;
+}
+
+//===----------------------------------------------------------------------===//
+// Topological order maintenance
+//===----------------------------------------------------------------------===//
+
 void GraphDCP::initTopological() {
-  UniqueWorkList<std::vector<TaskDCP*>> worklists(getRoots());
+  topologicalOrder.clear();  // start empty; the order is refilled from the worklist at the end
+  UniqueWorkList<std::vector<TaskDCP*>> worklist(getRoots());  // seeded from getRoots()
   long long flag = getUniqueFlag();
-  for (auto root : worklists)
+  for (auto root : worklist)  // tasks carrying `flag` are the ones already in the worklist
     root->setFlag(flag);
 
   size_t i = 0;
-  while (i != worklists.size()) {
-    for (auto& child : worklists.at(i)->childs) {
+  while (i != worklist.size()) {  // index walk: the worklist grows while it is scanned
+    for (auto& child : worklist.at(i)->children) {
       TaskDCP* childTask = child.first;
-      if (std::all_of(childTask->parents.begin(), childTask->parents.end(), [flag](Edge_t edge) {
+      if (std::all_of(childTask->parents.begin(), childTask->parents.end(), [flag](Edge edge) {
             return edge.first->getFlag() == flag;
           })) {
-        worklists.push_back(childTask);
+        worklist.pushBack(childTask);  // every parent is flagged, so the child may follow them
         childTask->setFlag(flag);
       }
     }
     i++;
   }
 
-  for (auto task : worklists)
+  for (auto task : worklist)  // worklist order becomes the topological order
     topologicalOrder.pushBack(task);
 }
 
-void GraphDCP::topologicalMoveAfter(TaskDCP* task, TaskDCP* pivotPoint) {
-  auto moveChildAfterMe = [this](TaskDCP* origTask) -> void {
-    auto cmp = [](Edge_t lft, Edge_t rgt) { return *rgt.first < *lft.first; };
+// After `task` is moved after `pivotPoint`, any child of `task` (transitively)
+// that ends up ordered before `task` must be bubbled forward too.
+void GraphDCP::topologicalMoveAfter(TaskDCP* task, TaskDCP* pivotPoint, TaskInsertion* insertion) {
+  auto moveChildAfterMe = [this, insertion](TaskDCP* origTask) -> void {
+    auto cmp = [](Edge lhs, Edge rhs) { return *rhs.first < *lhs.first; };
     TaskDCP* insertionPoint = origTask;
     std::vector<TaskDCP*> worklist;
     worklist.push_back(origTask);
     size_t i = 0;
     while (i < worklist.size()) {
       auto task = worklist[i];
-      std::vector<Edge_t>& childEdges = task->childs;
-      // build min heap Complexity 3N
+      std::vector<Edge>& childEdges = task->children;
+      // Heap-sort children lazily and stop at the first one already in order.
       std::make_heap(childEdges.begin(), childEdges.end(), cmp);
       auto lastPoppedIter = childEdges.end();
       bool foundChildInOrder = false;
@@ -265,6 +654,7 @@ void GraphDCP::topologicalMoveAfter(TaskDCP* task, TaskDCP* pivotPoint) {
         lastPoppedIter = std::prev(lastPoppedIter, 1);
         auto currentChild = (*lastPoppedIter).first;
         if (*currentChild < *task) {
+          dcp_graph::recordTopologicalMove(currentChild, insertion);
           topologicalOrder.moveAfter(currentChild, insertionPoint);
           insertionPoint = currentChild;
         }
@@ -278,7 +668,7 @@ void GraphDCP::topologicalMoveAfter(TaskDCP* task, TaskDCP* pivotPoint) {
         lastPoppedIter++;
 
       for (auto it = lastPoppedIter; it != childEdges.end(); ++it)
-        if (it->first->hasChilds())
+        if (it->first->hasChildren())
           worklist.push_back(it->first);
       i++;
     }
@@ -287,23 +677,22 @@ void GraphDCP::topologicalMoveAfter(TaskDCP* task, TaskDCP* pivotPoint) {
   if (!(*task < *pivotPoint))
     return;
 
+  dcp_graph::recordTopologicalMove(task, insertion);
   topologicalOrder.moveAfter(task, pivotPoint);
-  if (task->hasChilds())
+  if (task->hasChildren())
     moveChildAfterMe(task);
 }
 
-void GraphDCP::topologicalMoveBefore(TaskDCP* task, TaskDCP* pivotPoint) {
-
-  auto moveParentBeforeMe = [this](TaskDCP* origTask) -> void {
-    auto cmp = [](Edge_t lft, Edge_t rgt) { return *lft.first < *rgt.first; };
+void GraphDCP::topologicalMoveBefore(TaskDCP* task, TaskDCP* pivotPoint, TaskInsertion* insertion) {
+  auto moveParentBeforeMe = [this, insertion](TaskDCP* origTask) -> void {
+    auto cmp = [](Edge lhs, Edge rhs) { return *lhs.first < *rhs.first; };
     TaskDCP* insertionPoint = origTask;
     std::vector<TaskDCP*> worklist;
     worklist.push_back(origTask);
     size_t i = 0;
     while (i < worklist.size()) {
       auto task = worklist[i];
-      std::vector<Edge_t>& parentEdges = task->parents;
-      // build max heap Complexity 3N
+      std::vector<Edge>& parentEdges = task->parents;
       std::make_heap(parentEdges.begin(), parentEdges.end(), cmp);
       auto lastPoppedIter = parentEdges.end();
       bool foundParentInOrder = false;
@@ -312,6 +701,7 @@ void GraphDCP::topologicalMoveBefore(TaskDCP* task, TaskDCP* pivotPoint) {
         lastPoppedIter = std::prev(lastPoppedIter, 1);
         auto currentParent = (*lastPoppedIter).first;
         if (*currentParent < *task) {
+          dcp_graph::recordTopologicalMove(currentParent, insertion);
           topologicalOrder.moveBefore(currentParent, insertionPoint);
           insertionPoint = currentParent;
         }
@@ -334,239 +724,554 @@ void GraphDCP::topologicalMoveBefore(TaskDCP* task, TaskDCP* pivotPoint) {
   if (!(*pivotPoint < *task))
     return;
 
+  dcp_graph::recordTopologicalMove(task, insertion);
   topologicalOrder.moveBefore(task, pivotPoint);
   if (task->hasParents())
     moveParentBeforeMe(task);
 }
 
-GraphDCP::FindSlot GraphDCP::findSlot(TaskDCP* candidate, CPU cpu, bool push) {
-  int aest_on_cpu = computeAEST(candidate, cpu);
-  auto tmp_ALST = computeALST(candidate, cpu);
-  int final_time = tmp_ALST[candidate] + candidate->computeWeight(this, cpu);
-  std::list<TaskDCP*>& scheduledTasks = mapCPUTasks[cpu];
+//===----------------------------------------------------------------------===//
+// Slot search
+//===----------------------------------------------------------------------===//
 
-  // Search last non ancestor
-  auto after_last_anc = scheduledTasks.end();
-  while (after_last_anc != scheduledTasks.begin()) {
-    if (after_last_anc != scheduledTasks.end() && (*after_last_anc)->hasDescendent(candidate))
+using ScheduledTaskList = std::list<TaskDCP*>;  // ordered task sequence of one CPU
+
+// Legal insertion range for a candidate on one CPU: a slot must lie after the
+// last scheduled ancestor and before the first scheduled descendant.
+struct SlotBounds {
+  ScheduledTaskList::iterator afterLastAncestor;  // one past the last ancestor; begin() if none
+  ScheduledTaskList::iterator firstDescendant;    // first descendant in the list; end() if none
+};
+
+static SlotBounds computeSlotBounds(ScheduledTaskList& scheduledTasks, const GraphDCP::CandidateRelations& relations) {
+  auto firstDescendant = scheduledTasks.begin();  // forward scan; ends at end() if no descendant is in the list
+  while (firstDescendant != scheduledTasks.end()) {
+    if (relations.descendants.contains(*firstDescendant))
       break;
-    after_last_anc = std::prev(after_last_anc, 1);
+    firstDescendant = std::next(firstDescendant, 1);
   }
 
-  if (after_last_anc != scheduledTasks.end() && (*after_last_anc)->hasDescendent(candidate))
-    std::advance(after_last_anc, 1);
-
-  auto first_descendent_index = scheduledTasks.begin();
-  while (first_descendent_index != scheduledTasks.end()) {
-    if (first_descendent_index != scheduledTasks.end() && candidate->hasDescendent(*first_descendent_index))
+  auto afterLastAncestor = scheduledTasks.end();  // backward scan for the last ancestor in the list
+  while (afterLastAncestor != scheduledTasks.begin()) {
+    if (afterLastAncestor != scheduledTasks.end() && relations.ancestors.contains(*afterLastAncestor))
       break;
-    first_descendent_index = std::next(first_descendent_index, 1);
+    afterLastAncestor = std::prev(afterLastAncestor, 1);
   }
 
-  auto iter_index = after_last_anc;
-  auto best_index = scheduledTasks.end();
-  int best_max;
-  assert(std::distance(scheduledTasks.begin(), after_last_anc)
-         <= std::distance(scheduledTasks.begin(), first_descendent_index));
+  if (afterLastAncestor != scheduledTasks.end() && relations.ancestors.contains(*afterLastAncestor))
+    std::advance(afterLastAncestor, 1);  // step past the ancestor: earliest legal insertion point
 
-  bool keep = true;
-  while (keep && iter_index != scheduledTasks.end()) {
-    if (iter_index == first_descendent_index)
-      keep = false;
-    int min = INT_MAX;
-    if (!push)
-      min = std::min(final_time, (*iter_index)->getAEST());
-    else if (tmp_ALST.count(*iter_index) == 1)
-      min = std::min(final_time, tmp_ALST[*iter_index]);
-    else
-      min = std::min(final_time, (*iter_index)->getALST());
-    int max = aest_on_cpu;
-    if (iter_index != scheduledTasks.begin()) {
-      auto prev_iter = std::prev(iter_index);
-      max = std::max(aest_on_cpu, (*prev_iter)->getAEST() + (*prev_iter)->getWeight());
-    }
-    if (min - max >= candidate->computeWeight(this, cpu)) {
-      best_index = iter_index;
-      best_max = max;
-      break;
-    }
-
-    std::advance(iter_index, 1);
-    if (iter_index == scheduledTasks.end())
-      keep = false;
-  }
-
-  if (best_index != scheduledTasks.end())
-    return FindSlot {best_max, (int) std::distance(scheduledTasks.begin(), best_index)};
-
-  if (iter_index == scheduledTasks.end()) {
-    best_max = aest_on_cpu;
-    if (iter_index != scheduledTasks.begin()) {
-      auto prev_iter = std::prev(iter_index);
-      best_max = std::max(aest_on_cpu, (*prev_iter)->getAEST() + (*prev_iter)->getWeight());
-    }
-    return FindSlot {best_max, (int) std::distance(scheduledTasks.begin(), scheduledTasks.end())};
-  }
-  return FindSlot {INT_MAX, 0};
+  return {afterLastAncestor, firstDescendant};
 }
 
-void GraphDCP::selectProcessor(TaskDCP* candidate, bool push) {
-  std::vector<CPU> processors;
-  processors.reserve(lastCPU());
-  for (CPU c = push ? lastCPU() : lastCPU(); c >= 0; c--)
-    processors.push_back(c);
+// Returns the first legal position on `cpu` whose idle window fits the candidate,
+// or aest == numeric_limits<Time>::max() if none. push=false bounds each window by
+// the next task's AEST; push=true by its ALST (recomputed by computeAlst if affected).
+// The append-at-end fallback is only taken when no descendant is scheduled on `cpu`.
+GraphDCP::FindSlot GraphDCP::findSlot(TaskDCP* candidate, CPU cpu, bool push, const CandidateRelations& relations) {
+  Time aestOnCpu = computeAestOnCpu(candidate, cpu);
+  ScheduledTaskList& scheduledTasks = getOrCreateCpuTasks(cpu);
+  SlotBounds bounds = computeSlotBounds(scheduledTasks, relations);
+  auto afterLastAncestor = bounds.afterLastAncestor;
+  auto firstDescendantIndex = bounds.firstDescendant;
 
-  CPU best_process = -1;
-  int best_composite = INT_MAX;
-  FindSlot best_slot;
-
-  while (!processors.empty()) {
-    CPU current_cpu = processors.back();
-    processors.pop_back();
-    auto slot = findSlot(candidate, current_cpu, 0);
-    if (slot.aest == INT_MAX && push)
-      slot = findSlot(candidate, current_cpu, 1);
-    if (slot.aest == INT_MAX)
-      continue;
-    if (std::all_of(candidate->childs.begin(), candidate->childs.end(), [](Edge_t child) {
-          return child.first->isScheduled();
-        })) {
-      if (slot.aest < best_composite) {
-        best_process = current_cpu;
-        best_composite = slot.aest;
-        best_slot = slot;
-      }
-    }
-    else if (candidate->hasChilds()) {
-      auto dcpl = getDCPL();
-      auto taskInsertion = insertTaskInCPU(current_cpu, candidate, slot.index);
-      initAEST();
-      initALST();
-      Edge_t smallest_child {nullptr, 0};
-      for (auto child : candidate->childs) {
-        if (child.first->isScheduled())
-          continue;
-        if (smallest_child.first == nullptr) {
-          smallest_child = child;
-          continue;
-        }
-        if (smallest_child.first->getALST() - smallest_child.first->getAEST()
-            > child.first->getALST() - child.first->getAEST()) {
-          smallest_child = child;
-        }
-      }
-      auto child_slot = findSlot(smallest_child.first, current_cpu, false);
-      auto dcpl_with_child = computeDCPL(smallest_child.first, current_cpu);
-      if (child_slot.aest != INT_MAX and child_slot.aest + slot.aest < best_composite and dcpl_with_child <= dcpl) {
-        best_process = current_cpu;
-        best_composite = slot.aest + child_slot.aest;
-        best_slot = slot;
-      }
-      taskInsertion.rollBack();
-      initAEST();
-      initALST();
-    }
+  if (firstDescendantIndex == scheduledTasks.end()) {
+    Time bestMax = aestOnCpu;
+    if (!scheduledTasks.empty())
+      bestMax = std::max(aestOnCpu, addOrMax(scheduledTasks.back()->getAest(), scheduledTasks.back()->getWeight()));
+    if (scheduledTasks.empty() || afterLastAncestor == scheduledTasks.end())
+      return FindSlot {bestMax, static_cast<int>(scheduledTasks.size())};
   }
-  if (best_process == -1) {
-    best_process = lastCPU();
-    incLastCPU();
-  }
-  if (best_process == lastCPU())
-    incLastCPU();
-  insertTaskInCPU(best_process, candidate, best_slot.index);
-}
 
-void GraphDCP::DCP() {
-  initTopological();
-  initAEST();
-  initALST();
-  to_dot();
-  std::vector<TaskDCP*> worklists;
-  worklists.reserve(nodes.size());
-  for (auto& node : nodes)
-    worklists.push_back(&node);
-  while (!worklists.empty()) {
-    auto candidate = findCandidate(worklists);
-    selectProcessor(candidate, candidate->isCP());
-    initAEST();
-    initALST();
-    fastRemove(worklists, candidate);
-  }
-  to_dot();
-}
-
-void GraphDCP::to_dot() {
-  static int index = 0;
-  std::string outputDir = onnx_mlir::getOutputDir();
-  if (outputDir.empty())
-    return;
-  std::string graphDir = outputDir + "/dcp_graph";
-  onnx_mlir::createDirectory(graphDir);
-  std::fstream file(graphDir + "/graph_" + std::to_string(index++) + ".dot", std::ios::out);
-  file << "digraph G {\n";
-  if (mapCPUTasks.size() != 0) {
-    for (CPU c = 0; c < lastCPU(); c++) {
-      file << "subgraph cluster_" << c << "{\nstyle=filled;\ncolor=lightgrey;\n";
-      for (auto node : mapCPUTasks[c]) {
-        file << node->Id() << " [label=\"";
-        file << "n:" << node->Id() << "\n";
-        file << "aest:" << node->getAEST() << "\n";
-        file << "alst:" << node->getALST() << "\n";
-        file << "weight:" << node->getWeight() << "\"]\n";
-      }
-      file << " }\n";
-    }
+  // When `push` is false the inner loop never consults `tempAlst[*iterIndex]`,
+  // so only the candidate ALST must be recomputed here.
+  llvm::DenseMap<TaskDCP*, Time> tempAlst;
+  Time finalTime;
+  if (!push) {
+    Time candidateDcpl = computeDcplOnCpu(candidate, cpu);
+    finalTime = addOrMax(computeTaskAlstOnCpu(candidate, cpu, candidateDcpl), candidate->computeWeightOnCpu(this, cpu));
   }
   else {
-    for (auto& node : nodes) {
-      file << node.Id() << " [label=\"";
-      file << "n:" << node.Id() << "\n";
-      file << "aest:" << node.getAEST() << "\n";
-      file << "alst:" << node.getALST() << "\n";
-      file << "weight:" << node.getWeight() << "\"]\n";
-    }
+    tempAlst = computeAlst(candidate, cpu, relations);
+    finalTime = addOrMax(tempAlst[candidate], candidate->computeWeightOnCpu(this, cpu));
   }
-  for (auto& node : nodes) {
-    for (auto& child : node.childs) {
-      file << node.Id() << " -> " << child.first->Id();
-      file << " [label=\"" << child.second << "\"]\n";
+
+  auto iterIndex = afterLastAncestor;
+  auto bestIndex = scheduledTasks.end();
+  Time bestMax = 0;
+  assert(std::distance(scheduledTasks.begin(), afterLastAncestor)
+         <= std::distance(scheduledTasks.begin(), firstDescendantIndex));
+
+  bool keep = true;
+  while (keep && iterIndex != scheduledTasks.end()) {
+    if (iterIndex == firstDescendantIndex)
+      keep = false;
+    Time minStart;
+    if (!push)
+      minStart = std::min(finalTime, (*iterIndex)->getAest());
+    else if (tempAlst.count(*iterIndex) == 1)
+      minStart = std::min(finalTime, tempAlst[*iterIndex]);
+    else
+      minStart = std::min(finalTime, (*iterIndex)->getAlst());
+    Time maxStart = aestOnCpu;
+    if (iterIndex != scheduledTasks.begin()) {
+      auto prevIter = std::prev(iterIndex);
+      maxStart = std::max(aestOnCpu, addOrMax((*prevIter)->getAest(), (*prevIter)->getWeight()));
     }
+    if (subtractOrZero(minStart, maxStart) >= candidate->computeWeightOnCpu(this, cpu)) {
+      bestIndex = iterIndex;
+      bestMax = maxStart;
+      break;
+    }
+
+    std::advance(iterIndex, 1);
+    if (iterIndex == scheduledTasks.end())
+      keep = false;
   }
-  file << "}\n";
-  file.flush();
-  file.close();
+
+  if (bestIndex != scheduledTasks.end())
+    return FindSlot {bestMax, static_cast<int>(std::distance(scheduledTasks.begin(), bestIndex))};
+
+  if (iterIndex == scheduledTasks.end() && firstDescendantIndex == scheduledTasks.end()) {
+    bestMax = aestOnCpu;
+    if (iterIndex != scheduledTasks.begin()) {
+      auto prevIter = std::prev(iterIndex);
+      bestMax = std::max(aestOnCpu, addOrMax((*prevIter)->getAest(), (*prevIter)->getWeight()));
+    }
+    return FindSlot {bestMax, static_cast<int>(std::distance(scheduledTasks.begin(), scheduledTasks.end()))};
+  }
+  return FindSlot {std::numeric_limits<Time>::max(), 0};
 }
 
+// Scan modelled on findSlot(push=false) with caller-supplied finalTime and AEST (no ALST
+// recompute). Appends at the end only when no descendant of the candidate is on `cpu`.
+GraphDCP::FindSlot GraphDCP::findSlotWithFixedFinalTime(
+  TaskDCP* candidate, CPU cpu, const CandidateRelations& relations, Time finalTime, Time aestOnCpu) {
+  ScheduledTaskList& scheduledTasks = getOrCreateCpuTasks(cpu);
+  SlotBounds bounds = computeSlotBounds(scheduledTasks, relations);
+  auto afterLastAncestor = bounds.afterLastAncestor;
+  auto firstDescendantIndex = bounds.firstDescendant;
+
+  if (firstDescendantIndex == scheduledTasks.end()) {
+    Time bestMax = aestOnCpu;
+    if (!scheduledTasks.empty())
+      bestMax = std::max(aestOnCpu, addOrMax(scheduledTasks.back()->getAest(), scheduledTasks.back()->getWeight()));
+    if (scheduledTasks.empty() || afterLastAncestor == scheduledTasks.end())
+      return {bestMax, static_cast<int>(scheduledTasks.size())};
+  }
+
+  auto iterIndex = afterLastAncestor;
+  auto bestIndex = scheduledTasks.end();
+  Time bestMax = std::numeric_limits<Time>::max();
+  assert(std::distance(scheduledTasks.begin(), afterLastAncestor)
+         <= std::distance(scheduledTasks.begin(), firstDescendantIndex));
+
+  bool keep = true;
+  while (keep && iterIndex != scheduledTasks.end()) {
+    if (iterIndex == firstDescendantIndex)
+      keep = false;
+    Time minStart = std::min(finalTime, (*iterIndex)->getAest());
+    Time maxStart = aestOnCpu;
+    if (iterIndex != scheduledTasks.begin()) {
+      auto prevIter = std::prev(iterIndex);
+      maxStart = std::max(aestOnCpu, addOrMax((*prevIter)->getAest(), (*prevIter)->getWeight()));
+    }
+    if (subtractOrZero(minStart, maxStart) >= candidate->computeWeightOnCpu(this, cpu)) {
+      bestIndex = iterIndex;
+      bestMax = maxStart;
+      break;
+    }
+
+    std::advance(iterIndex, 1);
+    if (iterIndex == scheduledTasks.end())
+      keep = false;
+  }
+
+  if (bestIndex != scheduledTasks.end())
+    return {bestMax, static_cast<int>(std::distance(scheduledTasks.begin(), bestIndex))};
+
+  if (iterIndex == scheduledTasks.end() && firstDescendantIndex == scheduledTasks.end()) {
+    bestMax = aestOnCpu;
+    if (iterIndex != scheduledTasks.begin()) {
+      auto prevIter = std::prev(iterIndex);
+      bestMax = std::max(aestOnCpu, addOrMax((*prevIter)->getAest(), (*prevIter)->getWeight()));
+    }
+    return {bestMax, static_cast<int>(std::distance(scheduledTasks.begin(), scheduledTasks.end()))};
+  }
+
+  return {std::numeric_limits<Time>::max(), 0};
+}
+
+//===----------------------------------------------------------------------===//
+// Candidate selection and processor assignment
+//===----------------------------------------------------------------------===//
+
+// Picks the next task to schedule from readyNodes: the smallest slack wins and
+// the lower AEST breaks slack ties. When both are equal the entry that appears
+// first in readyNodes is kept. Critical-path tasks (zero slack) therefore
+// float to the front. readyNodes must not be empty.
+TaskDCP* GraphDCP::findCandidate(const std::vector<TaskDCP*>& readyNodes) {
+  assert(!readyNodes.empty() && "expected at least one ready node");
+
+  // True when `challenger` must replace `incumbent` as the preferred task.
+  auto beats = [](TaskDCP* challenger, TaskDCP* incumbent) {
+    const Time challengerSlack = slackOrZero(challenger->getAest(), challenger->getAlst());
+    const Time incumbentSlack = slackOrZero(incumbent->getAest(), incumbent->getAlst());
+    // Slack decides first; AEST is only the tie-breaker.
+    if (challengerSlack != incumbentSlack)
+      return challengerSlack < incumbentSlack;
+    return challenger->getAest() < incumbent->getAest();
+  };
+
+  TaskDCP* preferred = readyNodes.front();
+  for (TaskDCP* contender : readyNodes) {
+    if (beats(contender, preferred))
+      preferred = contender;
+  }
+
+  return preferred;
+}
+
+// Picks the best CPU + slot for `candidate`:
+//   * Phase 1 (parallel, read-only): call findSlot on every candidate CPU.
+//   * Phase 2 (sequential): process CPUs in ascending slot.aest order. For
+//     each, refine the composite cost. If the candidate has unscheduled
+//     children, speculatively insert it, try a within-budget AEST update and
+//     evaluate a slot for the smallest-slack child, then roll back.
+//   * Rescue (sequential): if nothing fit, grow the CPU count if allowed,
+//     otherwise pick the CPU that leads to the smallest DCPL increase.
+void GraphDCP::selectProcessor(TaskDCP* candidate, bool push) {
+  CandidateRelations relations = dcp_graph::computeCandidateRelations(candidate);
+  relations.descendantsTopoOrder.reserve(relations.descendants.size());
+  for (auto it = candidate->getTopologicalIterator(); it != topologicalOrder.end(); ++it) {
+    TaskDCP* current = &*it;
+    if (current != candidate && relations.descendants.contains(current))
+      relations.descendantsTopoOrder.push_back(current);
+  }
+
+  // Build the list of candidate CPUs in ascending index order. Skip CPUs
+  // where the crossbar footprint definitely wouldn't fit (computeWeightOnCpu
+  // would reject them anyway, but this check is cheaper than a full
+  // findSlot).
+  std::vector<CPU> processors;
+  const bool canCreateNewCpu = getLastCpu() < maxCpuCount;
+  const CPU topCpu = canCreateNewCpu ? getLastCpu() : getLastCpu() - 1;
+  processors.reserve(static_cast<size_t>(topCpu + 1));
+  const CrossbarUsage candidateFootprint = getTaskCrossbarFootprint(candidate);
+  const bool candidateHasCrossbar = candidateFootprint != 0;
+  const CrossbarUsage cpuCapacity = candidateHasCrossbar ? getCpuCrossbarCapacity() : 0;
+  for (CPU c = 0; c <= topCpu; c++) {
+    if (candidateHasCrossbar && c != getLastCpu()) {
+      CrossbarUsage nextUsage = checkedAdd(getCpuCrossbarUsage(c), candidateFootprint);
+      if (nextUsage >= cpuCapacity)
+        continue;
+    }
+    processors.push_back(c);
+  }
+
+  if (processors.empty()) {
+    CPU bestCpu = canCreateNewCpu ? getLastCpu() : 0;
+    FindSlot bestSlot = {computeAestOnCpu(candidate, bestCpu), static_cast<int>(getOrCreateCpuTasks(bestCpu).size())};
+    if (canCreateNewCpu)
+      incrementLastCpu();
+    insertTaskInCPU(bestCpu, candidate, bestSlot.index);
+    return;
+  }
+
+  // Phase 1: parallel findSlot sweep (read-only over graph state).
+  // Pre-size cpuTasks so findSlot never needs to resize the outer vector
+  // while threads share it.
+  getOrCreateCpuTasks(topCpu);
+  struct PrecomputedSlot {
+    CPU cpu;
+    FindSlot slot;
+  };
+  std::vector<PrecomputedSlot> precomputed(processors.size());
+  auto sweep = [&](size_t i) {
+    CPU cpu = processors[i];
+    FindSlot slot = findSlot(candidate, cpu, false, relations);
+    if (slot.aest == std::numeric_limits<Time>::max() && push)
+      slot = findSlot(candidate, cpu, true, relations);
+    precomputed[i] = {cpu, slot};
+  };
+  DCP_DEBUG_IF(auto sweepStart = std::chrono::steady_clock::now();)
+  if (context != nullptr)
+    mlir::parallelFor(context, 0, processors.size(), sweep);
+  else
+    for (size_t i = 0; i < processors.size(); ++i)
+      sweep(i);
+  DCP_DEBUG_IF(gSelectTimers.findSlot +=
+                 std::chrono::duration<double>(std::chrono::steady_clock::now() - sweepStart).count();)
+
+#ifdef DCP_DEBUG_ENABLED
+  {
+    static bool reported = false;
+    if (!reported) {
+      reported = true;
+      std::fprintf(stderr,
+                   "[dcp] selectProcessor parallel sweep: context=%p mt=%d procs=%zu pool=%u\n",
+                   (void*) context,
+                   context != nullptr ? (int) context->isMultithreadingEnabled() : -1,
+                   processors.size(),
+                   context != nullptr && context->isMultithreadingEnabled()
+                     ? context->getThreadPool().getMaxConcurrency()
+                     : 0u);
+    }
+  }
+#endif
+
+  CPU bestCpu = -1;
+  Time bestComposite = std::numeric_limits<Time>::max();
+  FindSlot bestSlot;
+  llvm::DenseMap<TaskDCP*, CandidateRelations> childRelationsCache;
+
+  // Phase 2: sequential composite evaluation in ascending CPU index order.
+  // Keeping the same order as the pre-parallel implementation preserves
+  // tie-breaking: whenever two CPUs produce an equal composite cost, the
+  // lower index wins.
+  for (const auto& ps : precomputed) {
+    DCP_DEBUG_IF(++gSelectTimers.iterations;)
+    CPU currentCpu = ps.cpu;
+    FindSlot slot = ps.slot;
+    if (slot.aest == std::numeric_limits<Time>::max())
+      continue;
+    // slot.aest alone is a lower bound on the composite cost (child.aest >= 0)
+    // so any CPU that already meets/exceeds the current best is pruned.
+    if (slot.aest >= bestComposite)
+      continue;
+
+    if (std::all_of(candidate->children.begin(), candidate->children.end(), [](Edge child) {
+          return child.first->isScheduled();
+        })) {
+      bestCpu = currentCpu;
+      bestComposite = slot.aest;
+      bestSlot = slot;
+    }
+    else if (candidate->hasChildren()) {
+      const bool emptyCpu = getOrCreateCpuTasks(currentCpu).empty();
+      auto currentDcpl = getDcpl();
+      // Combined prune: the DCPL budget AND a tighter composite bound. Any
+      // unscheduled child starts no earlier than candidate's completion, so
+      // composite >= 2*slot.aest + weight.
+      DCP_DEBUG_IF(auto t2 = std::chrono::steady_clock::now();)
+      Weight candidateWeight = candidate->computeWeightOnCpu(this, currentCpu);
+      Time candidateCompletion = addOrMax(slot.aest, candidateWeight);
+      bool skip = (!emptyCpu && candidateCompletion > currentDcpl)
+               || addOrMax(slot.aest, candidateCompletion) >= bestComposite;
+      DCP_DEBUG_IF(gSelectTimers.precheck += std::chrono::duration<double>(std::chrono::steady_clock::now() - t2).count();)
+      if (skip)
+        continue;
+      DCP_DEBUG_IF(++gSelectTimers.passedPrecheck;)
+
+      dcp_graph::LocalScheduleSnapshot scheduleSnapshot;
+      TaskInsertion taskInsertion;
+      DCP_DEBUG_IF(auto t3 = std::chrono::steady_clock::now();)
+      if (emptyCpu) {
+        // Empty CPU: insertion adds no scheduling edge, AEST/DCPL cannot move.
+        taskInsertion = insertTaskInCPU(currentCpu, candidate, slot.index);
+      }
+      else {
+        scheduleSnapshot = dcp_graph::captureLocalScheduleState(
+          candidate, relations.descendants, dcpl, maxCompletion, secondMaxCompletion, maxCompletionTask);
+        taskInsertion = insertTaskInCPU(currentCpu, candidate, slot.index);
+        bool withinBudget = tryUpdateAestWithinBudget(
+          candidate, llvm::ArrayRef<TaskDCP*>(relations.descendantsTopoOrder), currentDcpl);
+        if (!withinBudget) {
+          DCP_DEBUG_IF(auto t4 = std::chrono::steady_clock::now();)
+          taskInsertion.rollBack();
+          dcp_graph::restoreLocalScheduleState(
+            scheduleSnapshot, dcpl, maxCompletion, secondMaxCompletion, maxCompletionTask);
+          DCP_DEBUG_IF(auto tNow = std::chrono::steady_clock::now();
+                       gSelectTimers.snapshotInsertUpdate += std::chrono::duration<double>(t4 - t3).count();
+                       gSelectTimers.rollbackRestore += std::chrono::duration<double>(tNow - t4).count();)
+          continue;
+        }
+      }
+      DCP_DEBUG_IF(gSelectTimers.snapshotInsertUpdate +=
+                     std::chrono::duration<double>(std::chrono::steady_clock::now() - t3).count();)
+      DCP_DEBUG_IF(++gSelectTimers.passedDcpl;)
+
+      // Pick the tightest unscheduled child (smallest slack) and measure what
+      // slot it would get on the same CPU; that's the composite contribution.
+      Edge smallestChild {nullptr, 0};
+      for (auto child : candidate->children) {
+        if (child.first->isScheduled())
+          continue;
+        if (smallestChild.first == nullptr) {
+          smallestChild = child;
+          continue;
+        }
+        if (slackOrZero(smallestChild.first->getAest(), smallestChild.first->getAlst())
+            > slackOrZero(child.first->getAest(), child.first->getAlst())) {
+          smallestChild = child;
+        }
+      }
+      DCP_DEBUG_IF(auto t5 = std::chrono::steady_clock::now();)
+      auto dcplWithChild = computeDcplOnCpu(smallestChild.first, currentCpu);
+      auto childRelationsIt = childRelationsCache.find(smallestChild.first);
+      if (childRelationsIt == childRelationsCache.end())
+        childRelationsIt =
+          childRelationsCache.insert({smallestChild.first, dcp_graph::computeCandidateRelations(smallestChild.first)})
+            .first;
+      const CandidateRelations& childRelations = childRelationsIt->second;
+      auto childSlot =
+        dcplWithChild == getDcpl()
+          ? findSlotWithFixedFinalTime(smallestChild.first,
+                                       currentCpu,
+                                       childRelations,
+                                       addOrMax(computeTaskAlstOnCpu(smallestChild.first, currentCpu, dcplWithChild),
+                                                smallestChild.first->computeWeightOnCpu(this, currentCpu)),
+                                       computeAestOnCpu(smallestChild.first, currentCpu))
+          : findSlot(smallestChild.first, currentCpu, false, childRelations);
+      if (childSlot.aest != std::numeric_limits<Time>::max() && addOrMax(childSlot.aest, slot.aest) < bestComposite
+          && dcplWithChild <= currentDcpl) {
+        bestCpu = currentCpu;
+        bestComposite = addOrMax(slot.aest, childSlot.aest);
+        bestSlot = slot;
+      }
+      DCP_DEBUG_IF(auto t6 = std::chrono::steady_clock::now();
+                   gSelectTimers.childSlot += std::chrono::duration<double>(t6 - t5).count();)
+      taskInsertion.rollBack();
+      if (!emptyCpu)
+        dcp_graph::restoreLocalScheduleState(
+          scheduleSnapshot, dcpl, maxCompletion, secondMaxCompletion, maxCompletionTask);
+      DCP_DEBUG_IF(gSelectTimers.rollbackRestore +=
+                     std::chrono::duration<double>(std::chrono::steady_clock::now() - t6).count();)
+    }
+  }
+
+  // Rescue path: no CPU was acceptable. Either open a fresh CPU or fall back
+  // to the CPU that minimises the resulting DCPL.
+  if (bestCpu == -1) {
+    if (getLastCpu() < maxCpuCount) {
+      bestCpu = getLastCpu();
+      bestSlot = {computeAestOnCpu(candidate, bestCpu), 0};
+      incrementLastCpu();
+    }
+    else {
+      Time bestDcpl = std::numeric_limits<Time>::max();
+      Time currentDcpl = getDcpl();
+      for (CPU c = 0; c < getLastCpu(); c++) {
+        auto slot = findSlot(candidate, c, false, relations);
+        if (slot.aest == std::numeric_limits<Time>::max())
+          slot = findSlot(candidate, c, true, relations);
+        if (slot.aest == std::numeric_limits<Time>::max())
+          continue;
+        // Cheap lower bound: post-insertion DCPL is at least max(currentDcpl,
+        // candidate completion on this slot). Skip CPUs already worse than
+        // the best seen.
+        Time lowerBound =
+          std::max(currentDcpl, addOrMax(slot.aest, candidate->computeWeightOnCpu(this, c)));
+        if (lowerBound >= bestDcpl)
+          continue;
+        auto snapshot = dcp_graph::captureLocalScheduleState(
+          candidate, relations.descendants, dcpl, maxCompletion, secondMaxCompletion, maxCompletionTask);
+        auto taskInsertion = insertTaskInCPU(c, candidate, slot.index);
+        updateAestFromTaskWithDescendants(candidate, llvm::ArrayRef<TaskDCP*>(relations.descendantsTopoOrder));
+        Time candidateDcpl = getDcpl();
+        taskInsertion.rollBack();
+        dcp_graph::restoreLocalScheduleState(
+          snapshot, dcpl, maxCompletion, secondMaxCompletion, maxCompletionTask);
+        if (candidateDcpl < bestDcpl) {
+          bestDcpl = candidateDcpl;
+          bestCpu = c;
+          bestSlot = slot;
+        }
+      }
+      if (bestCpu == -1) {
+        bestCpu = 0;
+        bestSlot = {computeAestOnCpu(candidate, bestCpu), static_cast<int>(getOrCreateCpuTasks(bestCpu).size())};
+      }
+    }
+  }
+  if (bestCpu == getLastCpu() && getLastCpu() < maxCpuCount)
+    incrementLastCpu();
+  insertTaskInCPU(bestCpu, candidate, bestSlot.index);
+}
+
+//===----------------------------------------------------------------------===//
+// Main scheduling loop and result extraction
+//===----------------------------------------------------------------------===//
+
+void GraphDCP::runDcp() { // Places ready tasks on CPUs one at a time until the ready list is empty.
+  initTopological();
+  initAest();
+  initAlst();
+  dumpDot();
+  // Seed the ready list: a task is ready when countDependencyParents reports zero for it.
+  dcp_graph::DcpProgressLogger progressLogger(nodes.size());
+  llvm::DenseMap<TaskDCP*, int> unscheduledParents;
+  std::vector<TaskDCP*> readyNodes;
+  readyNodes.reserve(nodes.size());
+  for (auto& node : nodes) {
+    int dependencyParents = dcp_graph::countDependencyParents(&node);
+    unscheduledParents[&node] = dependencyParents;
+    if (dependencyParents == 0)
+      readyNodes.push_back(&node);
+  }
+  progressLogger.printStart(readyNodes.size());
+  // Each iteration picks one candidate, places it, refreshes the timings and releases its children.
+  while (!readyNodes.empty()) {
+    DCP_DEBUG_IF(auto findStart = std::chrono::steady_clock::now();)
+    TaskDCP* candidate = findCandidate(readyNodes);
+    DCP_DEBUG_IF(progressLogger.recordFindDuration(
+      std::chrono::duration<double>(std::chrono::steady_clock::now() - findStart).count());)
+    fastRemove(readyNodes, candidate);
+    // isCriticalPath() is forwarded as selectProcessor's `push` argument.
+    DCP_DEBUG_IF(auto selectStart = std::chrono::steady_clock::now();)
+    selectProcessor(candidate, candidate->isCriticalPath());
+    DCP_DEBUG_IF(
+      double selectSeconds = std::chrono::duration<double>(std::chrono::steady_clock::now() - selectStart).count();
+      progressLogger.recordSelectDuration(selectSeconds);
+      progressLogger.maybePrintSlowCandidate(getNodeIndex(candidate), selectSeconds, readyNodes.size(), getLastCpu());
+    )
+    // Re-run initAest()/initAlst() now that the candidate has been placed.
+    DCP_DEBUG_IF(auto updateStart = std::chrono::steady_clock::now();)
+    initAest();
+    initAlst();
+    DCP_DEBUG_IF(progressLogger.recordUpdateDuration(
+      std::chrono::duration<double>(std::chrono::steady_clock::now() - updateStart).count());)
+    progressLogger.advanceCompleted();
+    progressLogger.printProgress(readyNodes.size(), getLastCpu(), "recompute", false);
+    // A child becomes ready once its count of unscheduled dependency parents drops to zero.
+    for (const auto& childEdge : candidate->children) {
+      if (childEdge.isScheduling || childEdge.first->isScheduled())
+        continue;
+      int& dependencyParents = unscheduledParents[childEdge.first];
+      assert(dependencyParents > 0 && "dependency parent count must stay positive");
+      dependencyParents--;
+      if (dependencyParents == 0)
+        readyNodes.push_back(childEdge.first);
+    }
+    DCP_DEBUG_IF(
+      ++gSelectTimers.tasksProcessed;
+      if (std::getenv("DCP_SELECT_PROFILE") && (gSelectTimers.tasksProcessed % 100 == 0))
+        gSelectTimers.dump("tick");
+    )
+  }
+  progressLogger.printProgress(readyNodes.size(), getLastCpu(), "done", true);
+  dumpDot();
+}
+// Forwards the node list, per-CPU task lists and CPU count to dcp_graph::dumpGraphDot.
+void GraphDCP::dumpDot() { dcp_graph::dumpGraphDot(nodes, cpuTasks, getLastCpu()); }
+
 DCPAnalysisResult GraphDCP::getResult() {
   DCPAnalysisResult ret;
 
-  std::vector<TaskDCP*> roots = getRoots();
-  UniqueWorkList<std::vector<TaskDCP*>> worklists(roots);
-  worklists.reserve(nodes.size());
-  size_t i = 0;
-  while (i != worklists.size()) {
-    bool modified = true;
-    while (modified) {
-      modified = false;
-      for (auto& child : worklists.at(i)->childs) {
-        if (worklists.allElementContained(
-              child.first->parents.begin(), child.first->parents.end(), [](Edge_t edge) { return edge.first; })) {
-          modified |= worklists.push_back(child.first);
-        }
-      }
-    }
-    i++;
-  }
-  ret.dominanceOrderCompute.reserve(worklists.size());
-  for (auto elem : worklists)
+  auto dominanceOrder = dcp_graph::collectDominanceOrder(getRoots(), nodes.size());
+  ret.dominanceOrderCompute.reserve(dominanceOrder.size());
+  for (auto elem : dominanceOrder)
     ret.dominanceOrderCompute.push_back(elem->getSpatWeightedCompute());
 
-  for (auto [cpu, nodes] : mapCPUTasks) {
+  for (CPU cpu = 0; cpu < getLastCpu(); ++cpu) {
+    const CpuTaskList* tasks = findCpuTasks(cpu);
+    if (tasks == nullptr || tasks->empty())
+      continue;
     size_t i = 0;
-    for (auto node : nodes) {
-      ret.computeToCPUMap[node->getSpatWeightedCompute()] = cpu;
-      if (i++ == nodes.size() - 1) {
-        ret.isLastComputeOfACpu.insert(node->getSpatWeightedCompute());
+    for (auto node : *tasks) {
+      ret.computeToCpuMap[node->getSpatWeightedCompute()] = cpu;
+      if (i++ == tasks->size() - 1) {
+        ret.isLastComputeOfCpu.insert(node->getSpatWeightedCompute());
         ret.cpuToLastComputeMap[cpu] = node->getSpatWeightedCompute();
       }
     }
@@ -577,12 +1282,12 @@ DCPAnalysisResult GraphDCP::getResult() {
 
 std::vector<GraphDCP::ScheduledTaskInfo> GraphDCP::getScheduledTasks(CPU cpu) const {
   std::vector<ScheduledTaskInfo> scheduledTasks;
-  auto cpuIt = mapCPUTasks.find(cpu);
-  if (cpuIt == mapCPUTasks.end())
+  const CpuTaskList* tasks = findCpuTasks(cpu);
+  if (tasks == nullptr)
     return scheduledTasks;
 
-  scheduledTasks.reserve(cpuIt->second.size());
-  for (auto* task : cpuIt->second)
-    scheduledTasks.push_back({getNodeIndex(task), task->getAEST(), task->getALST(), task->getWeight()});
+  scheduledTasks.reserve(tasks->size());
+  for (auto* task : *tasks)
+    scheduledTasks.push_back({getNodeIndex(task), task->getAest(), task->getAlst(), task->getWeight()});
   return scheduledTasks;
 }
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.hpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.hpp
index 15d5c51..a66690f 100644
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.hpp
+++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.hpp
@@ -2,6 +2,7 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
 
 #include <list>
 #include <optional>
@@ -12,90 +13,144 @@
 #include "Task.hpp"
 #include "Utils.hpp"
 
-std::optional<DoubleEdge> addEdge(TaskDCP* parent, TaskDCP* child, Weight_t weight);
-void removeEdge(TaskDCP* parent, TaskDCP* child);
-int getTranferCost(TaskDCP* parent, TaskDCP* child);
+namespace mlir {
+class MLIRContext;
+} // namespace mlir
+
+std::optional<EdgePair> addEdge(TaskDCP* parent, TaskDCP* child, Weight weight, bool isScheduling = false);
+void removeEdge(TaskDCP* parent, TaskDCP* child, bool isScheduling = false);
+Weight getTransferCost(TaskDCP* parent, TaskDCP* child);
 
 class GraphDCP {
 public:
+  struct CandidateRelations {
+    llvm::DenseSet<TaskDCP*> ancestors;
+    llvm::DenseSet<TaskDCP*> descendants;
+    // descendants ordered by position in the graph's topological order;
+    // iterating this avoids walking non-descendant tail tasks on hot paths.
+    llvm::SmallVector<TaskDCP*, 32> descendantsTopoOrder;
+  };
+
   struct ScheduledTaskInfo {
     size_t nodeIndex;
-    int aest;
-    int alst;
-    int weight;
+    Time aest;
+    Time alst;
+    Weight weight;
   };
 
 private:
+  using CpuTaskList = std::list<TaskDCP*>;
+
   struct FindSlot {
-    int aest;
+    Time aest;
     int index;
   };
 
   std::vector<TaskDCP> nodes;
   onnx_mlir::LabeledList<TaskDCP> topologicalOrder;
-  std::unordered_map<CPU, std::list<TaskDCP*>> mapCPUTasks;
-  CPU last_cpu = 0;
+  std::vector<CpuTaskList> cpuTasks;
+  std::unordered_map<CPU, CrossbarUsage> cpuCrossbarUsage;
+  CPU lastCpu = 0;
   long long flag = 1;
-  int DCPL;
+  Time dcpl = 0;
+  Time maxCompletion = 0;
+  Time secondMaxCompletion = 0;
+  TaskDCP* maxCompletionTask = nullptr;
+  int maxCpuCount = 1000;
+  mlir::MLIRContext* context = nullptr;
 
   TaskInsertion insertTaskInCPU(CPU cpu, TaskDCP* task, size_t position);
   void removeTaskFromCPU(CPU cpu, TaskDCP* task);
+  CpuTaskList& getOrCreateCpuTasks(CPU cpu);
+  const CpuTaskList* findCpuTasks(CPU cpu) const;
 
   std::vector<TaskDCP*> getRoots();
 
   long long getUniqueFlag() { return flag++; }
 
-  void initAEST();
-  int initDCPL();
-  void initALST();
+  void initAest();
+  void initAlst();
 
-  int computeAEST(TaskDCP* task, CPU cpu);
-  int computeDCPL(TaskDCP* task, CPU cpu);
-  int getDCPL() { return DCPL; }
+  Time computeAestOnCpu(TaskDCP* task, CPU cpu);
+  Time computeDcplOnCpu(TaskDCP* task, CPU cpu);
+  Time getDcpl() const { return dcpl; }
+  Time computeTaskAlstOnCpu(TaskDCP* task, CPU cpu, Time scheduleDcpl);
+  void updateAestFromTask(TaskDCP* task);
+  void updateAestFromTaskWithDescendants(TaskDCP* task, const llvm::DenseSet<TaskDCP*>& descendants);
+  void updateAestFromTaskWithDescendants(TaskDCP* task, llvm::ArrayRef<TaskDCP*> descendantsTopoOrder);
+  // Propagates AEST like the overload above but returns early (before touching
+  // the remaining descendants) as soon as a task's completion exceeds
+  // `dcplBudget`, signalling that the new DCPL would exceed the budget.
+  // Returns true iff the full propagation completed without exceeding the
+  // budget. Uses the caller's snapshot to restore AEST on the aborted tail.
+  bool tryUpdateAestWithinBudget(TaskDCP* task,
+                                 llvm::ArrayRef<TaskDCP*> descendantsTopoOrder,
+                                 Time dcplBudget);
 
   void initTopological();
-  void topologicalMoveAfter(TaskDCP* task, TaskDCP* pivotPoint);
-  void topologicalMoveBefore(TaskDCP* task, TaskDCP* pivotPoint);
+  void topologicalMoveAfter(TaskDCP* task, TaskDCP* pivotPoint, TaskInsertion* insertion = nullptr);
+  void topologicalMoveBefore(TaskDCP* task, TaskDCP* pivotPoint, TaskInsertion* insertion = nullptr);
 
-  llvm::DenseMap<TaskDCP*, int> computeALST(TaskDCP* task, CPU cpu);
+  llvm::DenseMap<TaskDCP*, Time> computeAlst(TaskDCP* task, CPU cpu, const CandidateRelations& relations);
   size_t getNodeIndex(const TaskDCP* task) const;
 
-  TaskDCP* findCandidate(std::vector<TaskDCP*> nodes);
+  TaskDCP* findCandidate(const std::vector<TaskDCP*>& readyNodes);
   void selectProcessor(TaskDCP* candidate, bool push);
-  CPU lastCPU() const { return last_cpu; }
-  void incLastCPU() { last_cpu++; }
-  FindSlot findSlot(TaskDCP* candidate, CPU cpu, bool push);
-  void to_dot();
+  CPU getLastCpu() const { return lastCpu; }
+  void incrementLastCpu() { lastCpu++; }
+  FindSlot findSlot(TaskDCP* candidate, CPU cpu, bool push, const CandidateRelations& relations);
+  FindSlot findSlotWithFixedFinalTime(
+    TaskDCP* candidate, CPU cpu, const CandidateRelations& relations, Time finalTime, Time aestOnCpu);
+  void dumpDot();
 
   friend TaskInsertion;
+  friend class TaskDCP;
+
+  CrossbarUsage getCpuCrossbarUsage(CPU cpu) const;
+  CrossbarUsage getCpuCrossbarCapacity() const;
+  CrossbarUsage getTaskCrossbarFootprint(const TaskDCP* task) const;
+  void reserveTaskCrossbars(CPU cpu, const TaskDCP* task);
+  void releaseTaskCrossbars(CPU cpu, const TaskDCP* task);
+  bool wouldExhaustCrossbarCapacity(CPU cpu, const TaskDCP* task) const;
 
 public:
-  void DCP();
+  void runDcp();
   GraphDCP(llvm::ArrayRef<onnx_mlir::spatial::SpatWeightedCompute> spatWeightedComputes,
-           llvm::ArrayRef<EdgesIndex> edges)
-  : nodes(), mapCPUTasks() {
+           llvm::ArrayRef<IndexedEdge> edges)
+  : nodes(), cpuTasks(), cpuCrossbarUsage() {
     for (auto spatWeightedCompute : spatWeightedComputes)
       nodes.emplace_back(spatWeightedCompute);
     for (auto [start, end, weight] : edges)
       makeEdge(start, end, weight);
   }
 
-  GraphDCP(llvm::ArrayRef<Weight_t> nodeWeights, llvm::ArrayRef<EdgesIndex> edges)
-  : nodes(), mapCPUTasks() {
+  GraphDCP(llvm::ArrayRef<Weight> nodeWeights,
+           llvm::ArrayRef<IndexedEdge> edges,
+           llvm::ArrayRef<CrossbarUsage> nodeCrossbarUsage = {})
+  : nodes(), cpuTasks(), cpuCrossbarUsage() {
+    assert((nodeCrossbarUsage.empty() || nodeCrossbarUsage.size() == nodeWeights.size())
+           && "synthetic crossbar usage must match synthetic node weights");
     nodes.reserve(nodeWeights.size());
     for (auto [index, weight] : llvm::enumerate(nodeWeights))
-      nodes.emplace_back(index, weight);
+      nodes.emplace_back(index, weight, nodeCrossbarUsage.empty() ? 0 : nodeCrossbarUsage[index]);
     for (auto [start, end, weight] : edges)
       makeEdge(start, end, weight);
   }
 
   DCPAnalysisResult getResult();
   std::vector<ScheduledTaskInfo> getScheduledTasks(CPU cpu) const;
-  CPU cpuCount() const { return last_cpu; }
+  CPU cpuCount() const { return lastCpu; }
 
-  void makeEdge(size_t parent_index, size_t child_index, Weight_t weight) {
-    addEdge(&nodes[parent_index], &nodes[child_index], weight);
+  void makeEdge(size_t parentIndex, size_t childIndex, Weight weight) {
+    addEdge(&nodes[parentIndex], &nodes[childIndex], weight);
   }
 
-  size_t taskInCPU(CPU cpu) { return mapCPUTasks[cpu].size(); }
+  size_t taskInCpu(CPU cpu) { return getOrCreateCpuTasks(cpu).size(); }
+
+  void setMaxCpuCount(int value) { maxCpuCount = value; }
+  int getMaxCpuCount() const { return maxCpuCount; }
+
+  // Optional MLIR context used to drive mlir::parallelFor inside runDcp. If
+  // null the scheduler runs single-threaded (tests use this path).
+  void setContext(mlir::MLIRContext* ctx) { context = ctx; }
 };
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphDebug.cpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphDebug.cpp
new file mode 100644
index 0000000..7f3a89d
--- /dev/null
+++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphDebug.cpp
@@ -0,0 +1,152 @@
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <fstream>
+#include <string>
+
+#include "GraphDebug.hpp"
+#include "src/Accelerators/PIM/Common/PimCommon.hpp"
+
+namespace dcp_graph {
+
+#ifdef DCP_DEBUG_ENABLED
+
+DcpProgressLogger::DcpProgressLogger(size_t totalTasks)
+: logProgress(totalTasks >= 200), totalTasks(totalTasks) {
+  // Logging is only active for graphs of at least 200 tasks; both clocks start at the same instant.
+  lastProgressPrint = startTime = std::chrono::steady_clock::now();
+}
+
+std::string DcpProgressLogger::formatDuration(double seconds) {
+  // Negative inputs are treated as zero; the value is rounded to the nearest whole second.
+  const double clamped = seconds < 0 ? 0 : seconds;
+  const long rounded = static_cast<long>(clamped + 0.5);
+  const long hh = rounded / 3600;
+  const long mm = (rounded % 3600) / 60;
+  const long ss = rounded % 60;
+  // Render as h:mm:ss once at least one hour has elapsed, otherwise as m:ss.
+  if (hh <= 0)
+    return llvm::formatv("{0}:{1:02}", mm, ss).str();
+  return llvm::formatv("{0}:{1:02}:{2:02}", hh, mm, ss).str();
+}
+// Accumulators: each call adds to a running total that printProgress later reports.
+void DcpProgressLogger::recordFindDuration(double seconds) { findCandidateSeconds += seconds; }
+void DcpProgressLogger::recordSelectDuration(double seconds) { selectProcessorSeconds += seconds; }
+void DcpProgressLogger::recordUpdateDuration(double seconds) { updateTimingSeconds += seconds; }
+void DcpProgressLogger::advanceCompleted(size_t taskCount) { completedTasks += taskCount; }
+
+void DcpProgressLogger::printStart(size_t readyCount) const {
+  // Emit the start banner only when progress logging is active for this run.
+  if (logProgress)
+    llvm::errs() << llvm::formatv("[DCP] start: tasks={0} ready={1}\n", totalTasks, readyCount);
+}
+
+void DcpProgressLogger::maybePrintSlowCandidate(size_t nodeIndex,
+                                                double elapsedSeconds,
+                                                size_t readyCount,
+                                                CPU cpuCount) const {
+  // Stay quiet unless logging is active and the selection was not faster than one second.
+  const bool slowEnough = !(elapsedSeconds < 1.0);
+  if (logProgress && slowEnough) {
+    // Format the duration up front so the stream expression stays short.
+    const std::string elapsed = formatDuration(elapsedSeconds);
+    llvm::errs() << llvm::formatv(
+      "[DCP] slow candidate node={0} elapsed={1} ready={2} cpus={3}\n", nodeIndex, elapsed, readyCount, cpuCount);
+  }
+}
+
+void DcpProgressLogger::printProgress(size_t readyCount, CPU cpuCount, llvm::StringRef stage, bool force) {
+  if (!logProgress)
+    return;
+  // Unforced reports are throttled to one per second, except once every task has completed.
+  const auto timestamp = std::chrono::steady_clock::now();
+  const bool finished = completedTasks == totalTasks;
+  if (!force && !finished && timestamp - lastProgressPrint < std::chrono::seconds(1))
+    return;
+
+  const double elapsedSeconds = std::chrono::duration<double>(timestamp - startTime).count();
+  const double pace = elapsedSeconds > 0.0 ? static_cast<double>(completedTasks) / elapsedSeconds : 0.0;
+  const double etaSeconds = pace > 0.0 ? static_cast<double>(totalTasks - completedTasks) / pace : 0.0;
+  const double percent = totalTasks == 0 ? 100.0 : (100.0 * static_cast<double>(completedTasks) / totalTasks);
+  llvm::errs() << llvm::formatv("[DCP] {0}/{1} ({2:F1}%) ready={3} cpus={4} stage={5} elapsed={6} eta={7}\n",
+                                completedTasks,
+                                totalTasks,
+                                percent,
+                                readyCount,
+                                cpuCount,
+                                stage,
+                                formatDuration(elapsedSeconds),
+                                finished ? "0:00" : formatDuration(etaSeconds));
+  llvm::errs() << llvm::formatv("        time(find={0}, select={1}, update={2})\n",
+                                formatDuration(findCandidateSeconds),
+                                formatDuration(selectProcessorSeconds),
+                                formatDuration(updateTimingSeconds));
+  lastProgressPrint = timestamp;
+}
+
+#else
+// No-op implementations compiled when DCP_DEBUG_ENABLED is not defined.
+DcpProgressLogger::DcpProgressLogger(size_t) {}
+void DcpProgressLogger::recordFindDuration(double) {}
+void DcpProgressLogger::recordSelectDuration(double) {}
+void DcpProgressLogger::recordUpdateDuration(double) {}
+void DcpProgressLogger::advanceCompleted(size_t) {}
+void DcpProgressLogger::printStart(size_t) const {}
+void DcpProgressLogger::maybePrintSlowCandidate(size_t, double, size_t, CPU) const {}
+void DcpProgressLogger::printProgress(size_t, CPU, llvm::StringRef, bool) {}
+
+#endif
+
+void dumpGraphDot(const std::vector<TaskDCP>& nodes,
+                  const std::vector<std::list<TaskDCP*>>& cpuTasks,
+                  CPU lastCpu) {
+  static int dumpIndex = 0;
+  // Nothing is written when getOutputDir() returns an empty string.
+  const std::string outputDir = onnx_mlir::getOutputDir();
+  if (outputDir.empty())
+    return;
+
+  // Each dump goes to <outputDir>/dcp_graph/graph_<dumpIndex>.dot.
+  const std::string graphDir = outputDir + "/dcp_graph";
+  onnx_mlir::createDirectory(graphDir);
+  std::ofstream dot(graphDir + "/graph_" + std::to_string(dumpIndex++) + ".dot");
+
+  // Emits one node statement whose label lists id, AEST, ALST and weight.
+  auto emitNode = [&dot](const TaskDCP& task) {
+    dot << task.Id() << " [label=\"";
+    dot << "n:" << task.Id() << "\n";
+    dot << "aest:" << task.getAest() << "\n";
+    dot << "alst:" << task.getAlst() << "\n";
+    dot << "weight:" << task.getWeight() << "\"]\n";
+  };
+
+  dot << "digraph G {\n";
+  if (cpuTasks.empty()) {
+    // No CPU lists at all: emit every node without clusters.
+    for (const TaskDCP& task : nodes)
+      emitNode(task);
+  }
+  else {
+    // One cluster per CPU index below lastCpu; an index without a task list yields an empty cluster.
+    for (CPU cpu = 0; cpu < lastCpu; ++cpu) {
+      dot << "subgraph cluster_" << cpu << "{\nstyle=filled;\ncolor=lightgrey;\n";
+      const size_t cpuIndex = static_cast<size_t>(cpu);
+      if (cpuIndex < cpuTasks.size()) {
+        for (const TaskDCP* task : cpuTasks[cpuIndex])
+          emitNode(*task);
+      }
+      dot << " }\n";
+    }
+  }
+
+  // Every child entry becomes an edge labelled with the entry's second member.
+  for (const TaskDCP& parent : nodes) {
+    for (const auto& edge : parent.children)
+      dot << parent.Id() << " -> " << edge.first->Id() << " [label=\"" << edge.second << "\"]\n";
+  }
+
+  // The stream is flushed and closed when `dot` goes out of scope.
+  dot << "}\n";
+}
+
+} // namespace dcp_graph
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphDebug.hpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphDebug.hpp
new file mode 100644
index 0000000..380f9df
--- /dev/null
+++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphDebug.hpp
@@ -0,0 +1,57 @@
+#pragma once
+
+#include "llvm/ADT/StringRef.h"
+
+#include <chrono>
+#include <list>
+#include <vector>
+
+#include "Task.hpp"
+#include "Utils.hpp"
+
+// Enables DCP progress logging and per-phase profiling during development.
+// Comment out the define below to disable it; the logger methods then become
+// no-ops and the DCP_DEBUG_IF helpers compile away.
+#define DCP_DEBUG_ENABLED
+
+#ifdef DCP_DEBUG_ENABLED
+#define DCP_DEBUG_IF(...) __VA_ARGS__
+#else
+#define DCP_DEBUG_IF(...)
+#endif
+
+namespace dcp_graph {
+// Progress and timing reporter for the DCP scheduler; its methods are no-ops unless DCP_DEBUG_ENABLED is defined.
+class DcpProgressLogger {
+public:
+  explicit DcpProgressLogger(size_t totalTasks);
+  // Accumulate the time spent in each scheduling phase and the number of completed tasks.
+  void recordFindDuration(double seconds);
+  void recordSelectDuration(double seconds);
+  void recordUpdateDuration(double seconds);
+  void advanceCompleted(size_t taskCount = 1);
+  // Reporting entry points; the debug implementations write to llvm::errs().
+  void printStart(size_t readyCount) const;
+  void maybePrintSlowCandidate(size_t nodeIndex, double elapsedSeconds, size_t readyCount, CPU cpuCount) const;
+  void printProgress(size_t readyCount, CPU cpuCount, llvm::StringRef stage, bool force);
+  // The formatting helper and all data members exist only when DCP_DEBUG_ENABLED is defined.
+#ifdef DCP_DEBUG_ENABLED
+private:
+  static std::string formatDuration(double seconds);
+
+  bool logProgress = false; // set by the constructor when totalTasks >= 200
+  size_t totalTasks = 0;
+  size_t completedTasks = 0;
+  std::chrono::steady_clock::time_point startTime;
+  std::chrono::steady_clock::time_point lastProgressPrint; // time of the last printed report
+  double findCandidateSeconds = 0.0;
+  double selectProcessorSeconds = 0.0;
+  double updateTimingSeconds = 0.0;
+#endif
+};
+// Writes the task graph as a DOT file under <output dir>/dcp_graph; does nothing if the output dir is empty.
+void dumpGraphDot(const std::vector<TaskDCP>& nodes,
+                  const std::vector<std::list<TaskDCP*>>& cpuTasks,
+                  CPU lastCpu);
+
+} // namespace dcp_graph
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphSupport.cpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphSupport.cpp
new file mode 100644
index 0000000..eefcabe
--- /dev/null
+++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphSupport.cpp
@@ -0,0 +1,105 @@
+#include "llvm/ADT/STLExtras.h"
+
+#include <algorithm>
+#include <vector>
+
+#include "GraphSupport.hpp"
+#include "Task.hpp"
+#include "UniqueWorklist.hpp"
+
+namespace dcp_graph {
+
+llvm::DenseSet<TaskDCP*> collectReachableTasks(TaskDCP* root, bool followParents) {  // transitive ancestors (followParents) or descendants of root
+  llvm::DenseSet<TaskDCP*> reachable;  // doubles as the visited set; root is only included if an edge leads back to it
+  std::vector<TaskDCP*> worklist;  // LIFO stack of tasks whose edges still need expanding
+  worklist.reserve(32);
+
+  auto enqueueEdges = [&](TaskDCP* task) {
+    const auto& edges = followParents ? task->parents : task->children;  // every edge is followed, scheduling ones included
+    for (const auto& edge : edges)
+      if (reachable.insert(edge.first).second)  // .second is true only on first insertion
+        worklist.push_back(edge.first);
+  };
+
+  enqueueEdges(root);
+  while (!worklist.empty()) {
+    TaskDCP* task = worklist.back();
+    worklist.pop_back();
+    enqueueEdges(task);
+  }
+  return reachable;
+}
+
+GraphDCP::CandidateRelations computeCandidateRelations(TaskDCP* candidate) {  // bundles both reachability sets of candidate
+  return {collectReachableTasks(candidate, true), collectReachableTasks(candidate, false)};  // {via parents, via children}, in that order
+}
+
+LocalScheduleSnapshot captureLocalScheduleState(TaskDCP* task,  // saves what restoreLocalScheduleState() writes back
+                                                const llvm::DenseSet<TaskDCP*>& descendants,
+                                                Time dcpl,
+                                                Time maxCompletion,
+                                                Time secondMaxCompletion,
+                                                TaskDCP* maxCompletionTask) {
+  LocalScheduleSnapshot snapshot;
+  snapshot.aestBackup.reserve(descendants.size() + 1);  // +1 for task itself
+  snapshot.aestBackup.emplace_back(task, task->getAest());
+  for (TaskDCP* descendant : descendants)  // only the AEST of task and of the given descendants is backed up
+    snapshot.aestBackup.emplace_back(descendant, descendant->getAest());
+  snapshot.dcpl = dcpl;  // the remaining arguments are copied through unchanged
+  snapshot.maxCompletion = maxCompletion;
+  snapshot.secondMaxCompletion = secondMaxCompletion;
+  snapshot.maxCompletionTask = maxCompletionTask;
+  return snapshot;
+}
+
+void restoreLocalScheduleState(const LocalScheduleSnapshot& snapshot,  // inverse of captureLocalScheduleState()
+                               Time& dcpl,
+                               Time& maxCompletion,
+                               Time& secondMaxCompletion,
+                               TaskDCP*& maxCompletionTask) {
+  for (const auto& [task, aest] : snapshot.aestBackup)  // write every saved AEST back onto its task
+    task->setAest(aest);
+  dcpl = snapshot.dcpl;  // the reference parameters are overwritten with the saved values
+  maxCompletion = snapshot.maxCompletion;
+  secondMaxCompletion = snapshot.secondMaxCompletion;
+  maxCompletionTask = snapshot.maxCompletionTask;
+}
+
+int countDependencyParents(const TaskDCP* task) {  // number of parent edges that are not scheduling edges
+  return static_cast<int>(llvm::count_if(task->parents, [](const Edge& edge) { return !edge.isScheduling; }));
+}
+
+void recordTopologicalMove(TaskDCP* task, TaskInsertion* insertion) {  // records task once, with LabeledList::next(task) at call time
+  if (insertion == nullptr)  // nothing to record into
+    return;
+
+  auto alreadyRecorded =
+    llvm::any_of(insertion->topologicalMoves,  // linear scan over the moves recorded so far
+                 [task](const TaskInsertion::TopologicalMoveRecord& move) { return move.task == task; });
+  if (alreadyRecorded)  // the first record for a task is kept
+    return;
+
+  insertion->topologicalMoves.push_back({task, onnx_mlir::LabeledList<TaskDCP>::next(task)});  // presumably the current list successor (TODO confirm)
+}
+
+std::vector<TaskDCP*> collectDominanceOrder(llvm::ArrayRef<TaskDCP*> roots, size_t nodeCount) {  // roots first, then tasks once all their parents are listed
+  UniqueWorkList<std::vector<TaskDCP*>> worklist(roots);  // seeded with roots, duplicates dropped
+  worklist.reserve(nodeCount);  // nodeCount is used as a capacity hint only
+
+  size_t index = 0;
+  while (index != worklist.size()) {  // the list grows while it is being scanned
+    bool modified = true;
+    while (modified) {  // re-scan: appending one child can complete the parent set of a sibling
+      modified = false;
+      for (const auto& child : worklist.at(index)->children)
+        if (worklist.allElementsContained(  // a child qualifies only when every one of its parents is already listed
+              child.first->parents.begin(), child.first->parents.end(), [](Edge edge) { return edge.first; }))
+          modified |= worklist.pushBack(child.first);  // pushBack returns false when the child is already present
+    }
+    index++;
+  }
+
+  return {worklist.begin(), worklist.end()};
+}
+
+} // namespace dcp_graph
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphSupport.hpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphSupport.hpp
new file mode 100644
index 0000000..9e738f5
--- /dev/null
+++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphSupport.hpp
@@ -0,0 +1,41 @@
+#pragma once
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+
+#include <utility>
+#include <vector>
+
+#include "Graph.hpp"
+
+namespace dcp_graph {
+
+struct LocalScheduleSnapshot {  // values saved by captureLocalScheduleState() for a later restoreLocalScheduleState()
+  llvm::SmallVector<std::pair<TaskDCP*, Time>, 64> aestBackup;  // (task, saved AEST) pairs; 64 entries of inline storage
+  Time dcpl = 0;
+  Time maxCompletion = 0;
+  Time secondMaxCompletion = 0;
+  TaskDCP* maxCompletionTask = nullptr;
+};
+
+llvm::DenseSet<TaskDCP*> collectReachableTasks(TaskDCP* root, bool followParents);  // followParents: walk parent edges, otherwise child edges
+GraphDCP::CandidateRelations computeCandidateRelations(TaskDCP* candidate);
+
+LocalScheduleSnapshot captureLocalScheduleState(TaskDCP* task,  // the by-value inputs are stored in the returned snapshot
+                                                const llvm::DenseSet<TaskDCP*>& descendants,
+                                                Time dcpl,
+                                                Time maxCompletion,
+                                                Time secondMaxCompletion,
+                                                TaskDCP* maxCompletionTask);
+void restoreLocalScheduleState(const LocalScheduleSnapshot& snapshot,  // the reference parameters are overwritten from the snapshot
+                               Time& dcpl,
+                               Time& maxCompletion,
+                               Time& secondMaxCompletion,
+                               TaskDCP*& maxCompletionTask);
+
+int countDependencyParents(const TaskDCP* task);  // counts parent edges with isScheduling == false
+void recordTopologicalMove(TaskDCP* task, TaskInsertion* insertion);  // insertion may be null, in which case nothing is recorded
+std::vector<TaskDCP*> collectDominanceOrder(llvm::ArrayRef<TaskDCP*> roots, size_t nodeCount);  // nodeCount is only a capacity hint
+
+} // namespace dcp_graph
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Task.cpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Task.cpp
index 556596b..21d93b1 100644
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Task.cpp
+++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Task.cpp
@@ -4,57 +4,63 @@
 #include "Task.hpp"
 #include "UniqueWorklist.hpp"
 
-std::optional<Edge_t> TaskDCP::addChild(TaskDCP* child, Weight_t weight) {
-  std::optional<Edge_t> oldEdge = std::nullopt;
-  auto founded_element =
-    std::find_if(childs.begin(), childs.end(), [child](Edge_t element) { return child == element.first; });
-  if (founded_element != childs.end()) {
-    oldEdge = *founded_element;
-    fastRemove(childs, founded_element);
+std::optional<Edge> TaskDCP::addChild(TaskDCP* child, Weight weight, bool isScheduling) {  // returns the edge it replaced, if any
+  std::optional<Edge> oldEdge = std::nullopt;
+  auto foundElement = std::find_if(children.begin(), children.end(), [child, isScheduling](Edge element) {
+    return child == element.first && isScheduling == element.isScheduling;  // same target AND same kind
+  });
+  if (foundElement != children.end()) {
+    oldEdge = *foundElement;  // copy before fastRemove() overwrites the slot
+    fastRemove(children, foundElement);
   }
-  childs.emplace_back(child, weight);
+  children.emplace_back(Edge {child, weight, isScheduling});  // always appended, so a replaced edge moves to the back
   return oldEdge;
 }
 
-std::optional<Edge_t> TaskDCP::addParent(TaskDCP* parent, Weight_t weight) {
-  std::optional<Edge_t> oldEdge = std::nullopt;
-  auto founded_element =
-    std::find_if(parents.begin(), parents.end(), [parent](Edge_t element) { return parent == element.first; });
-  if (founded_element != parents.end()) {
-    oldEdge = *founded_element;
-    fastRemove(parents, founded_element);
+std::optional<Edge> TaskDCP::addParent(TaskDCP* parent, Weight weight, bool isScheduling) {  // returns the edge it replaced, if any
+  std::optional<Edge> oldEdge = std::nullopt;
+  auto foundElement = std::find_if(parents.begin(), parents.end(), [parent, isScheduling](Edge element) {
+    return parent == element.first && isScheduling == element.isScheduling;  // same source AND same kind
+  });
+  if (foundElement != parents.end()) {
+    oldEdge = *foundElement;  // copy before fastRemove() overwrites the slot
+    fastRemove(parents, foundElement);
   }
-  parents.emplace_back(parent, weight);
+  parents.emplace_back(Edge {parent, weight, isScheduling});  // always appended, so a replaced edge moves to the back
   return oldEdge;
 }
 
-bool TaskDCP::hasDescendent(TaskDCP* child) {
+bool TaskDCP::hasDescendant(TaskDCP* child) {  // true when child is reachable through children edges (or is this task itself)
   UniqueWorkList<std::vector<TaskDCP*>> worklist;
   worklist.reserve(32);
-  worklist.push_back(this);
+  worklist.pushBack(this);  // the search starts at this task, so hasDescendant(this) is true
   while (!worklist.empty()) {
     TaskDCP* task = worklist.back();
-    worklist.pop_back();
+    worklist.popBack();  // popped tasks stay in the worklist's seen-set and are never pushed again
     if (task == child)
       return true;
-    for (auto c : task->childs)
-      worklist.push_back(c.first);
+    for (auto edge : task->children)  // both dependency and scheduling edges are followed
+      worklist.pushBack(edge.first);
   }
   return false;
 }
 
-// TODO fare qualcosa di sensato
-int TaskDCP::computeWeight(GraphDCP* graph, CPU cpu) { return origWeight; }
+Weight TaskDCP::computeWeightOnCpu(GraphDCP* graph, CPU cpu) {  // baseWeight, or the maximum Weight when cpu is ruled out
+  if (crossbarUsage != 0 && graph->wouldExhaustCrossbarCapacity(cpu, this))  // tasks without crossbar usage skip the graph query
+    return std::numeric_limits<Weight>::max();  // sentinel; addOrMax()/subtractOrZero() treat this value specially
+  return baseWeight;
+}
 
 void TaskInsertion::rollBack() {
   graph->removeTaskFromCPU(cpuModified, taskInserted);
   if (beforeNode.has_value()) {
-    auto double_edge = *beforeNode;
-    addEdge(double_edge.first.first, double_edge.second.first, double_edge.first.second);
+    auto edgePair = *beforeNode;  // presumably the edge displaced when the task was inserted (TODO confirm)
+    addEdge(edgePair.first.first, edgePair.second.first, edgePair.first.second, edgePair.first.isScheduling);
   }
   if (afterNode.has_value()) {
-    auto double_edge = *afterNode;
-    addEdge(double_edge.first.first, double_edge.second.first, double_edge.first.second);
+    auto edgePair = *afterNode;  // weight and edge kind are both read from the pair's first edge
+    addEdge(edgePair.first.first, edgePair.second.first, edgePair.first.second, edgePair.first.isScheduling);
   }
-  graph->topologicalOrder.moveBefore( taskInserted,&*oldTopologicalPosition );
+  // NOTE(review): disabled, so recorded topologicalMoves are not replayed on rollback — confirm this is intended:
+  //   for (auto it = topologicalMoves.rbegin(); it != topologicalMoves.rend(); ++it) graph->topologicalOrder.moveBefore(it->task, it->nextTask);
 }
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Task.hpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Task.hpp
index 47d0e6b..2290e20 100644
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Task.hpp
+++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Task.hpp
@@ -7,110 +7,117 @@
 #include "Utils.hpp"
 #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
 
-std::optional<DoubleEdge> addEdge(TaskDCP* parent, TaskDCP* child, Weight_t weight);
-void removeEdge(TaskDCP* parent, TaskDCP* child);
-
 class TaskDCP : public onnx_mlir::LabeledListNode<TaskDCP> {
   onnx_mlir::spatial::SpatWeightedCompute spatWeightedCompute;
-  int aest;
-  int alst;
-  std::optional<CPU> scheduledCPU;
-  int weight;
-  int origWeight;
+  Time aest = 0;  // AEST (presumably the DCP "earliest start time"); zero-initialised so `TaskDCP t;` holds no indeterminate value
+  Time alst = 0;  // ALST (presumably the DCP "latest start time")
+  std::optional<CPU> scheduledCpu;  // empty until setCpu() is called
+  Weight weight = 0;  // weight reported by getWeight() while the task is scheduled
+  Weight baseWeight = 0;  // weight reported while unscheduled; resetWeight() copies it back into `weight`
+  CrossbarUsage crossbarUsage = 0;  // 0 makes computeWeightOnCpu() skip the capacity query
   long long flag = 0;
   int64_t syntheticId = -1;
 
-  std::optional<Edge_t> addChild(TaskDCP* child, Weight_t weight);
-  std::optional<Edge_t> addChild(TaskDCP& child, Weight_t weight) { return addChild(&child, weight); }
+  std::optional<Edge> addChild(TaskDCP* child, Weight weight, bool isScheduling);  // private: reached through the friend addEdge()
+  std::optional<Edge> addChild(TaskDCP& child, Weight weight, bool isScheduling) {
+    return addChild(&child, weight, isScheduling);
+  }
 
-  void removeChild(TaskDCP* to_remove) { fastRemove(childs, to_remove); }
-  void removeChild(TaskDCP& to_remove) { fastRemove(childs, &to_remove); }
+  void removeChild(TaskDCP* toRemove, bool isScheduling) { fastRemove(children, toRemove, isScheduling); }
+  void removeChild(TaskDCP& toRemove, bool isScheduling) { fastRemove(children, &toRemove, isScheduling); }
 
-  std::optional<Edge_t> addParent(TaskDCP* parent, Weight_t weight);
-  std::optional<Edge_t> addParent(TaskDCP& parent, Weight_t weight) { return addParent(&parent, weight); }
+  std::optional<Edge> addParent(TaskDCP* parent, Weight weight, bool isScheduling);
+  std::optional<Edge> addParent(TaskDCP& parent, Weight weight, bool isScheduling) {
+    return addParent(&parent, weight, isScheduling);
+  }
 
-  void removeParent(TaskDCP* to_remove) { fastRemove(parents, to_remove); }
-  void removeParent(TaskDCP& to_remove) { fastRemove(parents, &to_remove); }
+  void removeParent(TaskDCP* toRemove, bool isScheduling) { fastRemove(parents, toRemove, isScheduling); }
+  void removeParent(TaskDCP& toRemove, bool isScheduling) { fastRemove(parents, &toRemove, isScheduling); }
 
 public:
-  std::vector<Edge_t> parents;
-  std::vector<Edge_t> childs;
+  std::vector<Edge> parents;  // incoming edges; may hold both a dependency and a scheduling edge from the same task
+  std::vector<Edge> children;  // outgoing edges, same convention
   TaskDCP() = default;
   TaskDCP(onnx_mlir::spatial::SpatWeightedCompute spatWeightedCompute)
   : onnx_mlir::LabeledListNode<TaskDCP>(),
     spatWeightedCompute(spatWeightedCompute),
     aest(0),
     alst(0),
-    scheduledCPU(),
-    weight(getSpatWeightCompute(spatWeightedCompute)),
-    origWeight(weight),
+    scheduledCpu(),
+    weight(getSpatComputeWeight(spatWeightedCompute)),
+    baseWeight(weight),  // `weight` is declared earlier, so it is already initialised here
+    crossbarUsage(getSpatComputeCrossbarUsage(spatWeightedCompute)),
     syntheticId(-1),
     parents(),
-    childs() {}
+    children() {}
 
-  TaskDCP(int64_t id, int weight)
+  TaskDCP(int64_t id, Weight weight, CrossbarUsage crossbarUsage = 0)  // task with a synthetic id and no backing compute op
   : onnx_mlir::LabeledListNode<TaskDCP>(),
     spatWeightedCompute(),
     aest(0),
     alst(0),
-    scheduledCPU(),
+    scheduledCpu(),
     weight(weight),
-    origWeight(weight),
+    baseWeight(weight),
+    crossbarUsage(crossbarUsage),
     flag(0),
     syntheticId(id),
     parents(),
-    childs() {}
+    children() {}
 
   TaskDCP(const TaskDCP& node) = delete;
   TaskDCP(TaskDCP&& node) = default;
 
-  void setCPU(CPU cpu) { scheduledCPU = cpu; }
-  std::optional<CPU> getCPU() const { return scheduledCPU; }
-  void resetCPU() { scheduledCPU = std::nullopt; }
-  int getWeight() const {
+  void setCpu(CPU cpu) { scheduledCpu = cpu; }
+  std::optional<CPU> getCpu() const { return scheduledCpu; }
+  void resetCpu() { scheduledCpu = std::nullopt; }
+  Weight getWeight() const {  // `weight` while scheduled, otherwise baseWeight
     if (isScheduled())
       return weight;
-    return origWeight;
+    return baseWeight;
   }
-  void setWeight(int val) { weight = val; }
-  void resetWeight() { weight = origWeight; }
-  int computeWeight(GraphDCP* graph, CPU cpu);
+  void setWeight(Weight value) { weight = value; }
+  void resetWeight() { weight = baseWeight; }
+  Weight computeWeightOnCpu(GraphDCP* graph, CPU cpu);
+  CrossbarUsage getCrossbarUsage() const { return crossbarUsage; }
 
   bool hasParents() const { return parents.size() != 0; }
-  bool hasChilds() const { return childs.size() != 0; }
+  bool hasChildren() const { return children.size() != 0; }
 
-  int getAEST() const { return aest; }
-  int getALST() const { return alst; }
-  void setAEST(int val) {
-    assert(val >= 0);
-    aest = val;
-  }
-  void setALST(int val) { alst = val; }
-  bool hasDescendent(TaskDCP* child);
+  Time getAest() const { return aest; }
+  Time getAlst() const { return alst; }
+  void setAest(Time value) { aest = value; }  // Time is unsigned, so no non-negativity check is needed
+  void setAlst(Time value) { alst = value; }
+  bool hasDescendant(TaskDCP* child);
   int64_t Id() const {
     if (spatWeightedCompute)
       return reinterpret_cast<int64_t>(spatWeightedCompute.getAsOpaquePointer());
     return syntheticId;
   }
 
-  bool isCP() const { return alst == aest; }
-  bool isScheduled() const { return scheduledCPU.has_value(); }
+  bool isCriticalPath() const { return alst == aest; }  // equal AEST and ALST
+  bool isScheduled() const { return scheduledCpu.has_value(); }
   onnx_mlir::spatial::SpatWeightedCompute getSpatWeightedCompute() const { return spatWeightedCompute; }
 
   void setFlag(long long val) { flag = val; }
   long long getFlag() const { return flag; }
 
-  onnx_mlir::LabeledList<TaskDCP>::Iterator getTopologicalPosition() { return getIterator(); }
+  onnx_mlir::LabeledList<TaskDCP>::Iterator getTopologicalIterator() { return getIterator(); }
 
-  friend std::optional<DoubleEdge> addEdge(TaskDCP* parent, TaskDCP* child, Weight_t weight);
-  friend void removeEdge(TaskDCP* parent, TaskDCP* child);
-  friend int getTranferCost(TaskDCP* parent, TaskDCP* child);
+  friend std::optional<EdgePair> addEdge(TaskDCP* parent, TaskDCP* child, Weight weight, bool isScheduling);
+  friend void removeEdge(TaskDCP* parent, TaskDCP* child, bool isScheduling);
+  friend Weight getTransferCost(TaskDCP* parent, TaskDCP* child);
 };
 
 struct TaskInsertion {
-  std::optional<DoubleEdge> beforeNode;
-  std::optional<DoubleEdge> afterNode;
-  onnx_mlir::LabeledList<TaskDCP>::Iterator oldTopologicalPosition;
+  struct TopologicalMoveRecord {
+    TaskDCP* task;
+    TaskDCP* nextTask;
+  };
+
+  std::optional<EdgePair> beforeNode;
+  std::optional<EdgePair> afterNode;
+  std::vector<TopologicalMoveRecord> topologicalMoves;
   CPU cpuModified;
   TaskDCP* taskInserted;
   GraphDCP* graph;
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/UniqueWorklist.hpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/UniqueWorklist.hpp
index c92611e..9003857 100644
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/UniqueWorklist.hpp
+++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/UniqueWorklist.hpp
@@ -1,58 +1,57 @@
 #pragma once
 
 #include "llvm/ADT/DenseSet.h"
+
 #include <cassert>
 #include <type_traits>
-#include <iostream>
-#include <unordered_set>
 
 template <typename T, typename = void>
-struct has_pop_front : std::false_type {};
+struct HasPopFront : std::false_type {};  // primary template: used when T has no callable pop_front()
 
 template <typename T>
-struct has_pop_front<T, std::void_t<decltype(std::declval<T>().pop_front())>> : std::true_type {};
+struct HasPopFront<T, std::void_t<decltype(std::declval<T>().pop_front())>> : std::true_type {};  // chosen when t.pop_front() is well-formed
 
 template <typename T>
 class UniqueWorkList {
 
-  using V = typename T::value_type;
+  using ValueType = typename T::value_type;
   T storage;
-  llvm::DenseSet<V> set;
+  llvm::DenseSet<ValueType> uniqueElements;
 
 public:
   UniqueWorkList() = default;
 
-  template <typename arg_ty>
-  UniqueWorkList(const arg_ty& from)
+  template <typename RangeT>
+  UniqueWorkList(const RangeT& from)  // seeds the list from any iterable range, keeping first occurrences only
   : storage() {
     for (auto& element : from) {
-      if (!set.contains(element)) {
+      if (uniqueElements.insert(element).second) {  // one hash lookup: .second is true only for unseen elements
         storage.push_back(element);
-        set.insert(element);
+        // (membership was already recorded by insert() in the condition above)
       }
     }
   }
 
   bool empty() const { return storage.empty(); }
-  void reserve(size_t val) { return storage.reserve(val); }
+  void reserve(size_t value) { return storage.reserve(value); }  // requires a storage type that provides reserve()
   size_t size() const { return storage.size(); }
-  V& at(size_t i) { return storage.at(i); }
-  const V& at(size_t i) const { return storage.at(i); }
+  ValueType& at(size_t index) { return storage.at(index); }  // mutable reference: writes through it bypass uniqueElements
+  const ValueType& at(size_t index) const { return storage.at(index); }
 
-  V& front() { return storage.front(); }
-  V& back() { return storage.back(); }
+  ValueType& front() { return storage.front(); }
+  ValueType& back() { return storage.back(); }
 
-  bool push_back(const V& val) {
-    if (!set.contains(val)) {
-      storage.push_back(val);
-      set.insert(val);
+  bool pushBack(const ValueType& value) {  // appends value unless it was ever added before; returns whether it was appended
+    // insert() reports whether the value was new, so a single hash lookup replaces contains() + insert().
+    if (uniqueElements.insert(value).second) {
+      storage.push_back(value);
       return true;
     }
     return false;
   }
 
-  void pop_front() {
-    if constexpr (has_pop_front<T>::value)
+  void popFront() {
+    if constexpr (HasPopFront<T>::value)
       storage.pop_front();
     else
       assert(false && "Underlying storage type does not support pop_front()");
@@ -61,15 +60,15 @@ public:
   auto cbegin() const { return storage.cbegin(); }
   auto cend() const { return storage.cend(); }
 
-  void pop_back() { storage.pop_back(); }
-
+  void popBack() { storage.pop_back(); }  // only storage shrinks: the value stays in uniqueElements and cannot be pushed again
 
   template <typename Iterator, typename Mapper>
-  bool allElementContained(Iterator start, Iterator end, Mapper map) {
-    while (start != end) {
-      if (!set.contains(map(*start)))
+  bool allElementsContained(Iterator begin, Iterator end, Mapper map) const {  // true when map(x) was added for every x in [begin, end)
+    auto it = begin;
+    while (it != end) {  // an empty range yields true
+      if (!uniqueElements.contains(map(*it)))  // tested against the seen-set, which also remembers popped values
         return false;
-      std::advance(start, 1);
+      std::advance(it, 1);
     }
     return true;
   }
@@ -77,4 +76,8 @@ public:
   auto begin() { return storage.begin(); }
 
   auto end() { return storage.end(); }
+
+  auto begin() const { return storage.begin(); }  // const overloads make a const UniqueWorkList iterable
+
+  auto end() const { return storage.end(); }
 };
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Utils.hpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Utils.hpp
index 5864ac5..fc5a010 100644
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Utils.hpp
+++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Utils.hpp
@@ -6,60 +6,106 @@
 
 #include <algorithm>
 #include <cstdint>
+#include <limits>
 #include <list>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
 #include "src/Accelerators/PIM/Common/LabeledList.hpp"
 #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
-#include "src/Support/TypeUtilities.hpp"
-
 
 using CPU = int;
-using Weight_t = int;
+using Weight = unsigned long long;
+using Time = unsigned long long;
+using CrossbarUsage = unsigned long long;
 class TaskDCP;
 class GraphDCP;
-using Edge_t = std::pair<TaskDCP*, Weight_t>;
-using DoubleEdge = std::pair<Edge_t, Edge_t>;
-using EdgesIndex = std::tuple<int64_t, int64_t, int64_t>;
+struct Edge {  // replaces the former std::pair<TaskDCP*, Weight_t>; member names first/second are kept from that pair
+  TaskDCP* first;  // the task at the other end of the edge
+  Weight second;  // edge weight
+  bool isScheduling = false;  // distinguishes scheduling edges from the default (dependency) kind
+};
+using EdgePair = std::pair<Edge, Edge>;
+using IndexedEdge = std::tuple<int64_t, int64_t, int64_t>;
 
+inline void fastRemove(std::vector<Edge>& vector, TaskDCP* toRemove, bool isScheduling) {  // removes the first matching edge; no-op if absent
+  auto position = std::find_if(vector.begin(), vector.end(), [toRemove, isScheduling](Edge edge) {
+    return edge.first == toRemove && edge.isScheduling == isScheduling;  // match on both task and edge kind
+  });
+  if (position != vector.end()) {
+    std::swap(*(vector.end() - 1), *position);  // swap-with-last then pop: O(1) removal, element order is not preserved
+    vector.pop_back();
+  }
+}
+
+inline void fastRemove(std::vector<TaskDCP*>& vector, TaskDCP* toRemove) {  // removes the first occurrence of toRemove; no-op if absent
+  auto position =
+    std::find_if(vector.begin(), vector.end(), [toRemove](TaskDCP* element) { return element == toRemove; });
+  if (position != vector.end()) {
+    std::swap(*(vector.end() - 1), *position);  // swap-with-last then pop: element order is not preserved
+    vector.pop_back();
+  }
+}
+
+template <typename P>
+void fastRemove(std::vector<Edge>& vector, P position) {  // removes the edge at iterator `position`; end() is accepted and ignored
+  if (position != vector.end()) {
+    std::swap(*(vector.end() - 1), *position);  // after this the former last edge sits at `position`
+    vector.pop_back();
+  }
+}
 
 template <typename T>
-void fastRemove(std::vector<std::pair<T*, Weight_t>>& vector, T* to_remove) {
-  auto position =
-    std::find_if(vector.begin(), vector.end(), [to_remove](Edge_t edge) { return edge.first == to_remove; });
-  if (position != vector.end()) {
-    std::swap(*(vector.end() - 1), *position);
-    vector.pop_back();
-  }
+inline T checkedAdd(T lhs, T rhs) {  // lhs + rhs for unsigned T, asserting that the sum does not wrap
+  static_assert(std::is_unsigned_v<T>, "checkedAdd only supports unsigned types");
+  assert(lhs <= std::numeric_limits<T>::max() - rhs && "unsigned addition overflow");  // compiled out under NDEBUG: the sum then wraps silently
+  return lhs + rhs;
 }
 
-inline void fastRemove(std::vector<TaskDCP*>& vector, TaskDCP* to_remove) {
-  auto position =
-    std::find_if(vector.begin(), vector.end(), [to_remove](TaskDCP* element) { return element == to_remove; });
-  if (position != vector.end()) {
-    std::swap(*(vector.end() - 1), *position);
-    vector.pop_back();
-  }
+template <typename T>
+inline T checkedMultiply(T lhs, T rhs) {  // lhs * rhs for unsigned T, asserting that the product does not wrap
+  static_assert(std::is_unsigned_v<T>, "checkedMultiply only supports unsigned types");
+  if (lhs == 0 || rhs == 0)  // also keeps the division below away from a zero divisor
+    return 0;
+  assert(lhs <= std::numeric_limits<T>::max() / rhs && "unsigned multiplication overflow");  // compiled out under NDEBUG
+  return lhs * rhs;
 }
 
-template <typename T, typename P>
-void fastRemove(std::vector<std::pair<T*, Weight_t>>& vector, P position) {
-  if (position != vector.end()) {
-    std::swap(*(vector.end() - 1), *position);
-    vector.pop_back();
-  }
+template <typename T>
+inline T addOrMax(T lhs, T rhs) {  // addition in which the maximum value of T is absorbing
+  static_assert(std::is_unsigned_v<T>, "addOrMax only supports unsigned types");
+  if (lhs == std::numeric_limits<T>::max() || rhs == std::numeric_limits<T>::max())  // max in, max out
+    return std::numeric_limits<T>::max();
+  return checkedAdd(lhs, rhs);  // other sums are still overflow-asserted
 }
 
-// TODO Fare qualcosa di sensato
-inline int64_t getSpatWeightCompute(onnx_mlir::spatial::SpatWeightedCompute spatWeightedCompute) {
-  int64_t tot = 0;
-  for (auto& region : spatWeightedCompute.getBody()) {
-    for (auto& inst : region) {
-      for (auto result : inst.getResults())
-        if (auto element = llvm::dyn_cast<mlir::ShapedType>(result.getType()))
-          tot += onnx_mlir::getSizeInBytes(element);
-    }
-  }
-  return tot;
+template <typename T>
+inline T subtractOrZero(T lhs, T rhs) {  // lhs - rhs clamped at zero, with special handling of the maximum value
+  static_assert(std::is_unsigned_v<T>, "subtractOrZero only supports unsigned types");
+  if (lhs == std::numeric_limits<T>::max())  // a maximal lhs is returned unchanged, whatever rhs is
+    return lhs;
+  if (rhs == std::numeric_limits<T>::max() || lhs <= rhs)  // would be zero or negative: clamp
+    return 0;
+  return lhs - rhs;
+}
+
+inline Time slackOrZero(Time earliestStart, Time latestStart) { return subtractOrZero(latestStart, earliestStart); }  // latest - earliest, clamped at 0
+
+inline Weight getSpatComputeWeight(onnx_mlir::spatial::SpatWeightedCompute spatWeightedCompute) {  // 100 per operation in the op's body
+  constexpr Weight kOperationWeight = 100;  // flat cost charged for every operation, regardless of its kind
+  Weight numOperations = 0;
+  for (auto& block : spatWeightedCompute.getBody())
+    for ([[maybe_unused]] auto& op : block)  // operations are only counted, never inspected
+      numOperations = checkedAdd(numOperations, static_cast<Weight>(1));
+  return checkedMultiply(numOperations, kOperationWeight);
+}
+
+inline CrossbarUsage getSpatComputeCrossbarUsage(onnx_mlir::spatial::SpatWeightedCompute spatWeightedCompute) {  // number of SpatWeightedVMMOp in the body
+  CrossbarUsage crossbarUsage = 0;
+  for (auto& block : spatWeightedCompute.getBody())  // iterating the body yields its blocks
+    for (auto& op : block)  // only operations directly in the block are visited
+      if (llvm::isa<onnx_mlir::spatial::SpatWeightedVMMOp>(op))
+        crossbarUsage = checkedAdd(crossbarUsage, static_cast<CrossbarUsage>(1));  // one unit per weighted VMM operation
+  return crossbarUsage;
 }
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/MergeComputeNodesPass.cpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/MergeComputeNodesPass.cpp
index 075e498..57b551b 100644
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/MergeComputeNodesPass.cpp
+++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/MergeComputeNodesPass.cpp
@@ -5,7 +5,6 @@
 #include "mlir/IR/Region.h"
 #include "mlir/IR/Value.h"
 #include "mlir/IR/ValueRange.h"
-#include "mlir/IR/Verifier.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/LLVM.h"
 
@@ -14,13 +13,12 @@
 #include "llvm/Support/Debug.h"
 
 #include <cstddef>
-#include <cstdint>
 #include <functional>
 #include <iterator>
 #include <memory>
 
-#include "src/Accelerators/PIM/Common/PimCommon.hpp"
 #include "DCPGraph/DCPAnalysis.hpp"
+#include "src/Accelerators/PIM/Common/PimCommon.hpp"
 
 using namespace mlir;
 
@@ -36,10 +34,10 @@ struct ComputeValueResults {
 class LazyInsertComputeResult {
   using InsertPoint = mlir::IRRewriter::InsertPoint;
   ComputeValueResults computeResults;
-  Value channelNewOpVal;
+  Value channelValue;
   bool onlyChannel;
   std::function<void(InsertPoint insertPoint)> channelSendInserter;
-  InsertPoint insertPointSend;
+  InsertPoint sendInsertPoint;
   std::function<std::pair<Value, std::function<void(InsertPoint)>>()> channelNewInserter;
 
 public:
@@ -49,7 +47,7 @@ public:
   : computeResults(computeValueResults),
     onlyChannel(isOnlyChannel),
     channelSendInserter(nullptr),
-    insertPointSend({}),
+    sendInsertPoint({}),
     channelNewInserter(channelNewInserter) {}
 
   struct ChannelOrLocalOp {
@@ -59,23 +57,23 @@ public:
 
   bool onlyChanneled() const { return onlyChannel; }
 
-  ChannelOrLocalOp getAsChannelValueAndInsertSender(SpatWeightedCompute spatWeightedCompute) {
+  ChannelOrLocalOp getAsChannelValueAndInsertSender(SpatWeightedCompute currentCompute) {  // returns {value, isChannel}
 
-    auto [first, second] = channelNewInserter();
-    channelNewOpVal = first;
-    channelSendInserter = second;
-    auto BB = computeResults.innerValue.getParentBlock();
-    if (!BB->empty() && isa<spatial::SpatYieldOp>(BB->back()))
-      insertPointSend = InsertPoint(BB, --BB->end());
+    auto [newChannelValue, senderInserter] = channelNewInserter();
+    channelValue = newChannelValue;
+    channelSendInserter = senderInserter;
+    auto* block = computeResults.innerValue.getParentBlock();  // block that holds the produced value
+    if (!block->empty() && isa<spatial::SpatYieldOp>(block->back()))
+      sendInsertPoint = InsertPoint(block, --block->end());  // insert just before the trailing yield
     else
-      insertPointSend = InsertPoint(BB, BB->end());
-    if (spatWeightedCompute) {
-      for (auto& BB : spatWeightedCompute.getBody())
-        if (&BB == insertPointSend.getBlock())
+      sendInsertPoint = InsertPoint(block, block->end());
+    if (currentCompute) {  // a null compute (the no-argument overload) always takes the channel path
+      for (auto& computeBlock : currentCompute.getBody())  // distinct name: no longer shadows the outer `block`
+        if (&computeBlock == sendInsertPoint.getBlock())  // value lives inside currentCompute: use it directly, no send
           return {computeResults.innerValue, false};
     }
-    channelSendInserter(insertPointSend);
-    return {channelNewOpVal, true};
+    channelSendInserter(sendInsertPoint);
+    return {channelValue, true};
   }
 
   ChannelOrLocalOp getAsChannelValueAndInsertSender() { return getAsChannelValueAndInsertSender({}); }
@@ -86,7 +84,7 @@ struct MergeComputeNodesPass : PassWrapper<MergeComputeNodesPass, OperationPass<
 private:
   DenseMap<SpatWeightedCompute, LazyInsertComputeResult> newComputeNodeResults;
   DenseMap<SpatWeightedCompute, SpatWeightedCompute> oldToNewComputeMap;
-  DenseMap<int64_t, SpatWeightedCompute> cputToNewComputeMap;
+  DenseMap<int64_t, SpatWeightedCompute> cpuToNewComputeMap;
 
 public:
   MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MergeComputeNodesPass)
@@ -101,17 +99,16 @@ public:
 
   void runOnOperation() override {
     DCPAnalysisResult& analysisResult = getAnalysis<spatial::DCPAnalysis>().getResult();
-    auto& lastComputeOfCpu = analysisResult.isLastComputeOfACpu;
+    auto& lastComputeOfCpu = analysisResult.isLastComputeOfCpu;
     auto& cpuToLastComputeMap = analysisResult.cpuToLastComputeMap;
-    IRRewriter rewriter(&getContext());
 
     for (auto currentComputeNode : analysisResult.dominanceOrderCompute) {
-      size_t cpu = analysisResult.computeToCPUMap.at(currentComputeNode);
-      if (!cputToNewComputeMap.contains(cpu)) {
+      size_t cpu = analysisResult.computeToCpuMap.at(currentComputeNode);
+      if (!cpuToNewComputeMap.contains(cpu)) {
         ValueTypeRange<ResultRange> newWeightedComputeType = cpuToLastComputeMap.at(cpu).getResultTypes();
         auto [newWeightedCompute, computeValueResult] = createNewComputeNode(
           currentComputeNode, newWeightedComputeType, lastComputeOfCpu.contains(currentComputeNode));
-        cputToNewComputeMap[cpu] = newWeightedCompute;
+        cpuToNewComputeMap[cpu] = newWeightedCompute;
         newComputeNodeResults.insert(
           std::make_pair(currentComputeNode,
                          createLazyComputeResult(
@@ -119,7 +116,7 @@ public:
       }
       else {
         auto [newWeightedCompute, computeValueResult] = mergeIntoComputeNode(
-          cputToNewComputeMap[cpu], currentComputeNode, lastComputeOfCpu.contains(currentComputeNode));
+          cpuToNewComputeMap[cpu], currentComputeNode, lastComputeOfCpu.contains(currentComputeNode));
         newComputeNodeResults.insert(
           std::make_pair(currentComputeNode,
                          createLazyComputeResult(
@@ -127,10 +124,10 @@ public:
       }
     }
 
-    for (auto computeNodetoRemove : llvm::make_early_inc_range(llvm::reverse(analysisResult.dominanceOrderCompute))) {
-      for (auto users : computeNodetoRemove->getUsers())
+    for (auto computeNodeToRemove : llvm::make_early_inc_range(llvm::reverse(analysisResult.dominanceOrderCompute))) {
+      for (auto users : computeNodeToRemove->getUsers())
         users->dump();
-      computeNodetoRemove.erase();
+      computeNodeToRemove.erase();
     }
     func::FuncOp func = getOperation();
     dumpModule(cast<ModuleOp>(func->getParentOp()), "spatial1_dcp_merged");
@@ -186,9 +183,9 @@ private:
         LazyInsertComputeResult& lazyArgWeight = newComputeNodeResults.at(argWeightCompute);
         auto [channelVal, isChannel] = lazyArgWeight.getAsChannelValueAndInsertSender();
         assert(isChannel == true);
-        spatial::SpatChannelReceiveOp reciveOp =
+        spatial::SpatChannelReceiveOp receiveOp =
           spatial::SpatChannelReceiveOp::create(rewriter, loc, argWeightCompute.getType(0), channelVal);
-        mapper.map(oldBB.getArgument(indexOld - indexOldStart), reciveOp);
+        mapper.map(oldBB.getArgument(indexOld - indexOldStart), receiveOp);
       }
     }
 
@@ -238,8 +235,8 @@ private:
 
     auto& toBB = toCompute.getBody().front();
     auto& fromBB = fromCompute.getBody().front();
-    auto inputeArgMutable = toCompute.getInputsMutable();
-    // Insert reciveOp
+    auto inputArgMutable = toCompute.getInputsMutable();
+    // Insert receiveOp
     rewriter.setInsertionPointToEnd(&toBB);
     for (auto [bbIndex, arg] : llvm::enumerate(fromCompute.getInputs())) {
       if (auto argWeightCompute = llvm::dyn_cast_if_present<SpatWeightedCompute>(arg.getDefiningOp())) {
@@ -248,9 +245,9 @@ private:
         LazyInsertComputeResult::ChannelOrLocalOp channelOrLocal =
           lazyArgWeight.getAsChannelValueAndInsertSender(toCompute);
         if (channelOrLocal.isChannel) {
-          spatial::SpatChannelReceiveOp reciveOp =
+          spatial::SpatChannelReceiveOp receiveOp =
             spatial::SpatChannelReceiveOp::create(rewriter, loc, argWeightCompute.getType(0), channelOrLocal.data);
-          mapper.map(fromBB.getArgument(bbIndex), reciveOp.getResult());
+          mapper.map(fromBB.getArgument(bbIndex), receiveOp.getResult());
         }
         else {
           mapper.map(fromBB.getArgument(bbIndex), channelOrLocal.data);
@@ -262,7 +259,7 @@ private:
         if (founded == toCompute.getInputs().end()) {
           size_t sizeW = toCompute.getWeights().size();
           size_t sizeI = toCompute.getInputs().size();
-          inputeArgMutable.append(arg);
+          inputArgMutable.append(arg);
           assert(sizeW == toCompute.getWeights().size());
           assert(sizeI + 1 == toCompute.getInputs().size());
           assert(sizeW + sizeI + 1 == toCompute.getOperands().size());
@@ -281,6 +278,12 @@ private:
       assert(mapper.contains(oldBBarg));
 
     ComputeValueResults computeValueResults;
+    auto remapWeightIndex = [&](auto weightedOp) {
+      auto oldIndex = weightedOp.getWeightIndex();
+      auto newWeight = mapper.lookup(*std::next(fromCompute.getWeights().begin(), oldIndex));
+      auto newIndex = std::distance(toCompute.getWeights().begin(), llvm::find(toCompute.getWeights(), newWeight));
+      weightedOp.setWeightIndex(newIndex);
+    };
     for (auto& op : fromCompute.getOps()) {
       if (auto yield = dyn_cast<spatial::SpatYieldOp>(&op)) {
         computeValueResults.innerValue = mapper.lookup(yield.getOperand(0));
@@ -289,20 +292,10 @@ private:
       }
       else {
         auto newInst = rewriter.clone(op, mapper);
-        // TODO Refactor in a lambda? same code just different cast, but templated lambda are C++20 and a free function
-        // is a bit too much
-        if (auto vmOp = llvm::dyn_cast<spatial::SpatWeightedMVMOp>(newInst)) {
-          auto oldIndex = vmOp.getWeightIndex();
-          auto newWeight = mapper.lookup(*std::next(fromCompute.getWeights().begin(), oldIndex));
-          auto newIndex = std::distance(toCompute.getWeights().begin(), llvm::find(toCompute.getWeights(), newWeight));
-          vmOp.setWeightIndex(newIndex);
-        }
-        if (auto vmOp = llvm::dyn_cast<spatial::SpatWeightedVMMOp>(newInst)) {
-          auto oldIndex = vmOp.getWeightIndex();
-          auto newWeight = mapper.lookup(*std::next(fromCompute.getWeights().begin(), oldIndex));
-          auto newIndex = std::distance(toCompute.getWeights().begin(), llvm::find(toCompute.getWeights(), newWeight));
-          vmOp.setWeightIndex(newIndex);
-        }
+        if (auto weightedMvmOp = llvm::dyn_cast<spatial::SpatWeightedMVMOp>(newInst))
+          remapWeightIndex(weightedMvmOp);
+        if (auto weightedVmmOp = llvm::dyn_cast<spatial::SpatWeightedVMMOp>(newInst))
+          remapWeightIndex(weightedVmmOp);
       }
     }
 
@@ -323,19 +316,18 @@ private:
     IRRewriter rewriter(context);
 
     rewriter.setInsertionPointToStart(&funcOp.front());
-    auto saveInsertionPointChnNew = rewriter.saveInsertionPoint();
-    auto insertNew = [saveInsertionPointChnNew, context, loc, computeValueResults]() {
+    auto savedChannelInsertPoint = rewriter.saveInsertionPoint();
+    auto insertNew = [savedChannelInsertPoint, context, loc, computeValueResults]() {
       IRRewriter rewriter(context);
-      rewriter.restoreInsertionPoint(saveInsertionPointChnNew);
+      rewriter.restoreInsertionPoint(savedChannelInsertPoint);
       auto channelOp = spatial::SpatChannelNewOp::create(rewriter, loc, spatial::SpatChannelType::get(context));
       auto channelVal = channelOp.getResult();
-      auto insertVal =
-        [&context, loc, computeValueResults, channelVal](mlir::IRRewriter::InsertPoint insertPointChnSend) {
-          IRRewriter rewriter(context);
-          rewriter.restoreInsertionPoint(insertPointChnSend);
-          auto spatSend = spatial::SpatChannelSendOp::create(rewriter, loc, channelVal, computeValueResults.innerValue);
-          return spatSend;
-        };
+      auto insertVal = [&context, loc, computeValueResults, channelVal](mlir::IRRewriter::InsertPoint sendInsertPoint) {
+        IRRewriter rewriter(context);
+        rewriter.restoreInsertionPoint(sendInsertPoint);
+        auto spatSend = spatial::SpatChannelSendOp::create(rewriter, loc, channelVal, computeValueResults.innerValue);
+        return spatSend;
+      };
       std::pair<Value, std::function<void(mlir::IRRewriter::InsertPoint)>> ret {channelVal, insertVal};
       return ret;
     };
diff --git a/test/PIM/CMakeLists.txt b/test/PIM/CMakeLists.txt
index 01a9f11..9fb2355 100644
--- a/test/PIM/CMakeLists.txt
+++ b/test/PIM/CMakeLists.txt
@@ -25,8 +25,15 @@ function(add_pim_unittest test_name)
   set_tests_properties(${test_name} PROPERTIES LABELS pim-unittest)
 endfunction()
 
-add_pim_unittest(TestPIM
-  TestPIM.cpp
+add_pim_unittest(LabeledListTest
+  LabeledListTest.cpp
+
+  LINK_LIBS PRIVATE
+  OMPimCommon
+)
+
+add_pim_unittest(DCPTest
+  DCPTest.cpp
 
   LINK_LIBS PRIVATE
   OMPimCommon
diff --git a/test/PIM/DCPTest.cpp b/test/PIM/DCPTest.cpp
new file mode 100644
index 0000000..b813fb1
--- /dev/null
+++ b/test/PIM/DCPTest.cpp
@@ -0,0 +1,528 @@
+#include <algorithm>
+#include <cassert>
+#include <cstdlib>
+#include <filesystem>
+#include <fstream>
+#include <initializer_list>
+#include <iostream>
+#include <limits>
+#include <optional>
+#include <unordered_map>
+#include <vector>
+
+#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
+#include "src/Accelerators/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.hpp"
+#include "src/Compiler/CompilerOptions.hpp"
+
+namespace {
+
+struct ExpectedScheduledTask { // One expected schedule entry; checkScheduledTasks compares all four fields.
+  size_t nodeIndex;
+  Time aest; // presumably the earliest start time computed by DCP - confirm against Graph.hpp
+  Time alst; // presumably the latest start time computed by DCP - confirm against Graph.hpp
+  Weight weight;
+};
+
+struct ScheduledPlacement { // A scheduled task paired with the CPU whose schedule it was read from.
+  CPU cpu;
+  GraphDCP::ScheduledTaskInfo task;
+};
+
+std::filesystem::path getDcpTestOutputDir() { return std::filesystem::temp_directory_path() / "raptor-test-pim"; }
+
+void configureDcpDotOutput() { // Recreates the scratch directory and points onnx_mlir::outputBaseName into it.
+  const auto outputDir = getDcpTestOutputDir();
+  std::error_code ignoredError; // remove_all is best effort: a failure here is tolerated
+  std::filesystem::remove_all(outputDir, ignoredError);
+  // Throwing overload on purpose: unlike an assert(), the failure is still fatal when NDEBUG is defined.
+  std::filesystem::create_directories(outputDir);
+  onnx_mlir::outputBaseName = (outputDir / "DCPTest.mlir").string();
+}
+
+// Returns the regular ".dot" file in <output dir>/dcp_graph whose file name compares greatest, if there is one.
+std::optional<std::filesystem::path> getLatestDcpDotFile() {
+  std::optional<std::filesystem::path> newest;
+  const auto dotDir = getDcpTestOutputDir() / "dcp_graph";
+  if (std::filesystem::exists(dotDir)) {
+    for (const auto& candidate : std::filesystem::directory_iterator(dotDir)) {
+      const bool isDot = candidate.is_regular_file() && candidate.path().extension() == ".dot";
+      const bool isNewer = !newest || candidate.path().filename() > newest->filename();
+      if (isDot && isNewer)
+        newest = candidate.path();
+    }
+  }
+  return newest;
+}
+
+// Writes the path and the contents of the file found by getLatestDcpDotFile() to stderr, or a notice if none.
+void dumpDcpFailureArtifacts() {
+  const auto dotPath = getLatestDcpDotFile();
+  if (!dotPath.has_value()) {
+    std::cerr << "No DCP dot file was produced.\n";
+    return;
+  }
+  std::cerr << "DCP dot file: " << dotPath->string() << '\n';
+
+  std::ifstream dotStream(dotPath.value());
+  if (dotStream.is_open()) {
+    std::cerr << dotStream.rdbuf();
+    return;
+  }
+  std::cerr << "Failed to open DCP dot file.\n";
+}
+
+void printCpuSchedule(GraphDCP& graph, CPU cpu) { // Writes each task scheduled on `cpu` to stderr, one per line.
+  const auto scheduled = graph.getScheduledTasks(cpu);
+  std::cerr << "CPU " << cpu << " actual schedule:\n";
+  for (const auto& entry : scheduled) {
+    std::cerr << "  " << entry.nodeIndex << ") aest: " << entry.aest;
+    std::cerr << " alst: " << entry.alst << " weight: " << entry.weight << '\n';
+  }
+}
+
+void printGraphSchedule(GraphDCP& graph) { // Prints the schedule of every CPU of `graph`, in ascending CPU order.
+  for (CPU cpu = 0; cpu < graph.cpuCount(); ++cpu)
+    printCpuSchedule(graph, cpu);
+}
+
+bool checkScheduledTasks(GraphDCP& graph, CPU cpu, std::initializer_list<ExpectedScheduledTask> expectedTasks) {
+  // Compares the schedule of `cpu` against `expectedTasks` position by position. Any difference, including a
+  // different number of tasks, prints the actual schedule to stderr and yields false.
+  const auto scheduled = graph.getScheduledTasks(cpu);
+  bool matches = scheduled.size() == expectedTasks.size();
+  auto want = expectedTasks.begin();
+  for (const auto& got : scheduled) {
+    if (!matches)
+      break;
+    matches = !(got.nodeIndex != want->nodeIndex || got.aest != want->aest || got.alst != want->alst
+                || got.weight != want->weight);
+    ++want;
+  }
+
+  if (!matches)
+    printCpuSchedule(graph, cpu);
+  return matches;
+}
+
+std::unordered_map<size_t, ScheduledPlacement> collectScheduledPlacements(GraphDCP& graph) {
+  std::unordered_map<size_t, ScheduledPlacement> scheduledPlacements; // node index -> CPU and task info
+  for (CPU cpu = 0; cpu < graph.cpuCount(); ++cpu)
+    for (const auto& task : graph.getScheduledTasks(cpu)) {
+      if (scheduledPlacements.emplace(task.nodeIndex, ScheduledPlacement {cpu, task}).second)
+        continue;
+      std::cerr << "task " << task.nodeIndex << " scheduled multiple times\n";
+      std::abort(); // explicit instead of assert(): a duplicate must also be fatal in NDEBUG builds
+    }
+  return scheduledPlacements;
+}
+
+// True when exactly `expectedTaskCount` distinct node indices are scheduled; otherwise reports it on stderr.
+bool checkAllTasksScheduled(GraphDCP& graph, size_t expectedTaskCount) {
+  const size_t scheduledCount = collectScheduledPlacements(graph).size();
+  if (scheduledCount == expectedTaskCount)
+    return true;
+  std::cerr << "Expected " << expectedTaskCount << " scheduled tasks, got " << scheduledCount << "\n";
+  printGraphSchedule(graph);
+  return false;
+}
+
+// Verifies, per CPU, that every task has aest <= alst and starts no earlier than the completion
+// (addOrMax(aest, weight)) of the task listed before it. Prints the offending CPU schedule on failure.
+bool checkCpuSchedulesDoNotOverlap(GraphDCP& graph) {
+  for (CPU cpu = 0; cpu < graph.cpuCount(); ++cpu) {
+    std::optional<Time> busyUntil; // completion of the previous task; empty before the first one
+    for (const auto& task : graph.getScheduledTasks(cpu)) {
+      const Time finish = addOrMax(task.aest, task.weight);
+      const bool invertedWindow = task.aest > task.alst;
+      const bool overlapsPrevious = busyUntil.has_value() && task.aest < *busyUntil;
+      // The inverted-window diagnostic takes precedence when both problems are present.
+      if (invertedWindow)
+        std::cerr << "Task " << task.nodeIndex << " on CPU " << cpu << " has aest > alst\n";
+      else if (overlapsPrevious)
+        std::cerr << "CPU " << cpu << " has overlapping tasks\n";
+      if (invertedWindow || overlapsPrevious) {
+        printCpuSchedule(graph, cpu);
+        return false;
+      }
+      busyUntil = finish;
+    }
+  }
+  return true;
+}
+
+// Each child must start after its parent completes, plus the edge transfer cost when their CPUs differ.
+bool checkDependencyConstraints(GraphDCP& graph, llvm::ArrayRef<IndexedEdge> edges) {
+  const auto placements = collectScheduledPlacements(graph);
+  for (auto [from, to, cost] : edges) {
+    const ScheduledPlacement& producer = placements.at(from);
+    const ScheduledPlacement& consumer = placements.at(to);
+    const Time producerDone = addOrMax(producer.task.aest, producer.task.weight);
+    const Time earliestStart =
+      producer.cpu != consumer.cpu ? addOrMax(producerDone, static_cast<Weight>(cost)) : producerDone;
+    if (consumer.task.aest >= earliestStart)
+      continue;
+    std::cerr << "Dependency violation for edge " << from << " -> " << to << '\n';
+    printGraphSchedule(graph);
+    return false;
+  }
+  return true;
+}
+
+Time getMaxCompletion(GraphDCP& graph) { // Largest addOrMax(aest, weight) over all scheduled tasks; 0 if none.
+  Time maxCompletion = 0;
+  for (CPU cpu = 0; cpu < graph.cpuCount(); ++cpu)
+    for (const auto& task : graph.getScheduledTasks(cpu))
+      maxCompletion = std::max(maxCompletion, addOrMax(task.aest, task.weight));
+  return maxCompletion;
+}
+
+int testDCPGraphSingleNode() { // A lone node is expected on exactly one CPU with aest == alst == 0.
+  std::cout << "testDCPGraphSingleNode:" << std::endl;
+  configureDcpDotOutput();
+
+  const std::vector<Weight> nodeWeights = {15};
+  GraphDCP graph(nodeWeights, {});
+  graph.runDcp();
+
+  if (graph.cpuCount() != 1) {
+    std::cerr << "Expected exactly 1 CPU, got " << graph.cpuCount() << "\n";
+    dumpDcpFailureArtifacts();
+    return 1;
+  }
+  if (!checkScheduledTasks(graph,
+                           0,
+                           {
+                             {0, 0, 0, 15}, // {nodeIndex, aest, alst, weight}
+  })) {
+    dumpDcpFailureArtifacts();
+    return 1;
+  }
+  return 0;
+}
+
+int testDCPGraphLinearChain() { // A three-node chain is expected to stay on one CPU and run back to back.
+  std::cout << "testDCPGraphLinearChain:" << std::endl;
+  configureDcpDotOutput();
+
+  const std::vector<Weight> nodeWeights = {10, 20, 5};
+  const std::vector<IndexedEdge> edges = {
+    {0, 1, 7}, // {parent, child, transfer cost}
+    {1, 2, 9},
+  };
+
+  GraphDCP graph(nodeWeights, edges);
+  graph.runDcp();
+
+  if (graph.cpuCount() != 1) {
+    std::cerr << "Expected a linear chain to stay on one CPU, got " << graph.cpuCount() << "\n";
+    dumpDcpFailureArtifacts();
+    return 1;
+  }
+  if (!checkScheduledTasks(graph,
+                           0,
+                           {
+                             {0, 0,  0,  10}, // expected starts are the running sum of weights: 0, 10, 10 + 20
+                             {1, 10, 10, 20},
+                             {2, 30, 30, 5 },
+  })) {
+    dumpDcpFailureArtifacts();
+    return 1;
+  }
+  if (!checkCpuSchedulesDoNotOverlap(graph) || !checkDependencyConstraints(graph, edges)) {
+    dumpDcpFailureArtifacts();
+    return 1;
+  }
+  return 0;
+}
+
+int testDCPGraphFixture() { // Pins the exact four-CPU schedule produced for an 18-node reference graph.
+  std::cout << "testDCPGraphFixture:" << std::endl;
+  configureDcpDotOutput();
+
+  const std::vector<Weight> nodeWeights = {
+    80,
+    40,
+    40,
+    40,
+    40,
+    40,
+    60,
+    30,
+    30,
+    30,
+    30,
+    40,
+    20,
+    20,
+    20,
+    20,
+    10,
+    10,
+  };
+  const std::vector<IndexedEdge> edges = {
+    {0,  1,  3  }, // NOTE(review): edge 0 -> 1 is listed twice (costs 3 and 120) - confirm this is intentional
+    {0,  1,  120},
+    {0,  2,  120},
+    {0,  3,  120},
+    {0,  4,  120},
+    {0,  5,  120},
+    {0,  6,  120},
+    {2,  6,  80 },
+    {2,  7,  80 },
+    {3,  8,  80 },
+    {4,  9,  80 },
+    {5,  10, 80 },
+    {6,  7,  120},
+    {6,  8,  120},
+    {6,  9,  120},
+    {6,  10, 120},
+    {6,  11, 120},
+    {8,  11, 80 },
+    {8,  12, 80 },
+    {9,  13, 80 },
+    {10, 14, 80 },
+    {11, 12, 120},
+    {11, 13, 120},
+    {11, 14, 120},
+    {11, 15, 120},
+    {13, 15, 80 },
+    {13, 16, 80 },
+    {14, 17, 80 },
+    {15, 16, 120},
+    {15, 17, 120},
+  };
+
+  GraphDCP graph(nodeWeights, {}); // edges are added one by one through makeEdge rather than the constructor
+  for (auto [parent, child, weight] : edges)
+    graph.makeEdge(parent, child, weight);
+
+  graph.runDcp();
+  if (graph.cpuCount() != 4) {
+    dumpDcpFailureArtifacts();
+    return 1;
+  }
+  if (!checkScheduledTasks(graph,
+                           3,
+                           {
+                             {1, 200, 400, 40}, // {nodeIndex, aest, alst, weight}
+  })) {
+    dumpDcpFailureArtifacts();
+    return 1;
+  }
+  if (!checkScheduledTasks(graph,
+                           2,
+                           {
+                             {5,  200, 260, 40},
+                             {10, 300, 300, 30},
+  })) {
+    dumpDcpFailureArtifacts();
+    return 1;
+  }
+  if (!checkScheduledTasks(graph,
+                           1,
+                           {
+                             {4, 200, 210, 40},
+                             {7, 300, 410, 30},
+  })) {
+    dumpDcpFailureArtifacts();
+    return 1;
+  }
+  if (!checkScheduledTasks(graph,
+                           0,
+                           {
+                             {0,  0,   0,   80},
+                             {2,  80,  80,  40},
+                             {6,  120, 120, 60},
+                             {3,  180, 200, 40},
+                             {8,  220, 240, 30},
+                             {11, 250, 270, 40},
+                             {12, 290, 310, 20},
+                             {9,  320, 330, 30},
+                             {13, 350, 360, 20},
+                             {15, 370, 380, 20},
+                             {16, 390, 400, 10},
+                             {14, 410, 410, 20},
+                             {17, 430, 430, 10},
+  })) {
+    dumpDcpFailureArtifacts();
+    return 1;
+  }
+  if (!checkAllTasksScheduled(graph, nodeWeights.size()) || !checkCpuSchedulesDoNotOverlap(graph)
+      || !checkDependencyConstraints(graph, edges)) {
+    dumpDcpFailureArtifacts();
+    return 1;
+  }
+  return 0;
+}
+
+int testDCPGraphMaxCPUs() { // One root fanning out to six children, scheduled under a two-CPU cap.
+  std::cout << "testDCPGraphMaxCPUs:" << std::endl;
+  configureDcpDotOutput();
+
+  const std::vector<Weight> nodeWeights = {20, 10, 10, 10, 10, 10, 10};
+  const std::vector<IndexedEdge> edges = {
+    {0, 1, 0}, // {parent, child, transfer cost}; every transfer is free in this test
+    {0, 2, 0},
+    {0, 3, 0},
+    {0, 4, 0},
+    {0, 5, 0},
+    {0, 6, 0},
+  };
+
+  GraphDCP graph(nodeWeights, edges);
+  graph.setMaxCpuCount(2);
+  graph.runDcp();
+
+  if (graph.cpuCount() != 2) {
+    std::cerr << "Expected exactly 2 CPUs with maxCpuCount=2, got " << graph.cpuCount() << "\n";
+    dumpDcpFailureArtifacts();
+    return 1;
+  }
+  if (!checkAllTasksScheduled(graph, nodeWeights.size()) || !checkCpuSchedulesDoNotOverlap(graph)
+      || !checkDependencyConstraints(graph, edges)) {
+    dumpDcpFailureArtifacts();
+    return 1;
+  }
+  if (getMaxCompletion(graph) > 50) { // 50 = 20 (root) + 3 * 10 (six children spread evenly over two CPUs)
+    std::cerr << "Expected makespan <= 50 under maxCpuCount=2, got " << getMaxCompletion(graph) << "\n";
+    dumpDcpFailureArtifacts();
+    return 1;
+  }
+
+  return 0;
+}
+
+int testDCPGraphSingleCpuCap() { // One root with three children, all forced onto a single CPU.
+  std::cout << "testDCPGraphSingleCpuCap:" << std::endl;
+  configureDcpDotOutput();
+
+  const std::vector<Weight> nodeWeights = {20, 10, 10, 10};
+  const std::vector<IndexedEdge> edges = {
+    {0, 1, 0}, // {parent, child, transfer cost}
+    {0, 2, 0},
+    {0, 3, 0},
+  };
+
+  GraphDCP graph(nodeWeights, edges);
+  graph.setMaxCpuCount(1);
+  graph.runDcp();
+
+  if (graph.cpuCount() != 1) {
+    std::cerr << "Expected exactly 1 CPU with maxCpuCount=1, got " << graph.cpuCount() << "\n";
+    dumpDcpFailureArtifacts();
+    return 1;
+  }
+  if (!checkAllTasksScheduled(graph, nodeWeights.size()) || !checkCpuSchedulesDoNotOverlap(graph)
+      || !checkDependencyConstraints(graph, edges)) {
+    dumpDcpFailureArtifacts();
+    return 1;
+  }
+  if (getMaxCompletion(graph) != 50) { // 50 = 20 + 10 + 10 + 10, the sum of all node weights
+    std::cerr << "Expected makespan 50 under maxCpuCount=1, got " << getMaxCompletion(graph) << "\n";
+    dumpDcpFailureArtifacts();
+    return 1;
+  }
+
+  return 0;
+}
+
+int testDCPGraphDiamondDependencies() { // Diamond 0 -> {1, 2} -> 3: the sink must wait for both parents.
+  std::cout << "testDCPGraphDiamondDependencies:" << std::endl;
+  configureDcpDotOutput();
+
+  const std::vector<Weight> nodeWeights = {15, 10, 12, 20};
+  const std::vector<IndexedEdge> edges = {
+    {0, 1, 5}, // {parent, child, transfer cost}
+    {0, 2, 7},
+    {1, 3, 3},
+    {2, 3, 2},
+  };
+
+  GraphDCP graph(nodeWeights, edges);
+  graph.runDcp();
+
+  if (!checkAllTasksScheduled(graph, nodeWeights.size()) || !checkCpuSchedulesDoNotOverlap(graph)
+      || !checkDependencyConstraints(graph, edges)) {
+    dumpDcpFailureArtifacts();
+    return 1;
+  }
+
+  auto scheduledPlacements = collectScheduledPlacements(graph);
+  const auto& sink = scheduledPlacements.at(3).task;
+  if (sink.aest < 27) { // 27 = 15 (node 0) + 12 (node 2), the heavier parent path with no transfer cost added
+    std::cerr << "Expected sink node to start no earlier than the longest parent path, got " << sink.aest << "\n";
+    dumpDcpFailureArtifacts();
+    return 1;
+  }
+
+  return 0;
+}
+
+int testDCPGraphCrossbarExhaustion() {
+  std::cout << "testDCPGraphCrossbarExhaustion:" << std::endl;
+  configureDcpDotOutput();
+
+  // The crossbar options are globals, so they are shrunk under a guard whose destructor puts the saved
+  // values back on every return path, and only after the failure diagnostics below have been printed.
+  struct CrossbarOptionsRestorer {
+    const size_t savedCrossbarSize = onnx_mlir::crossbarSize.getValue();
+    const size_t savedCrossbarCount = onnx_mlir::crossbarCountInCore.getValue();
+    ~CrossbarOptionsRestorer() {
+      onnx_mlir::crossbarSize = savedCrossbarSize;
+      onnx_mlir::crossbarCountInCore = savedCrossbarCount;
+    }
+  } restoreCrossbarOptions;
+  onnx_mlir::crossbarSize = 4;
+  onnx_mlir::crossbarCountInCore = 2;
+
+  const std::vector<Weight> nodeWeights = {10, 10, 10};
+  const std::vector<CrossbarUsage> nodeCrossbarUsage = {1, 1, 1};
+  GraphDCP graph(nodeWeights, {}, nodeCrossbarUsage);
+  graph.setMaxCpuCount(1);
+  graph.runDcp();
+
+  if (graph.cpuCount() != 1) {
+    std::cerr << "Expected exactly 1 CPU with maxCpuCount=1, got " << graph.cpuCount() << "\n";
+    dumpDcpFailureArtifacts();
+    return 1;
+  }
+
+  auto scheduledTasks = graph.getScheduledTasks(0);
+  if (scheduledTasks.size() != 3) {
+    std::cerr << "Expected all three tasks to be scheduled on CPU 0\n";
+    printCpuSchedule(graph, 0);
+    dumpDcpFailureArtifacts();
+    return 1;
+  }
+
+  // Only the first task is expected to keep its weight of 10; the other two must report the Weight maximum.
+  if (scheduledTasks[0].weight != 10 || scheduledTasks[1].weight != std::numeric_limits<Weight>::max()
+      || scheduledTasks[2].weight != std::numeric_limits<Weight>::max()) {
+    std::cerr << "Unexpected effective weights under crossbar exhaustion\n";
+    printCpuSchedule(graph, 0);
+    dumpDcpFailureArtifacts();
+    return 1;
+  }
+
+  return 0;
+}
+
+} // namespace
+
+int main(int, char*[]) { // Runs every DCP test in order; exits with EXIT_FAILURE if any reported a failure.
+  int (*const tests[])() = {
+    testDCPGraphSingleNode,
+    testDCPGraphLinearChain,
+    testDCPGraphFixture,
+    testDCPGraphMaxCPUs,
+    testDCPGraphSingleCpuCap,
+    testDCPGraphDiamondDependencies,
+    testDCPGraphCrossbarExhaustion,
+  };
+  int failures = 0;
+  for (auto runTest : tests)
+    failures += runTest();
+  if (failures == 0)
+    return EXIT_SUCCESS;
+  std::cerr << failures << " test failures\n";
+  return EXIT_FAILURE;
+}
diff --git a/test/PIM/LabeledListTest.cpp b/test/PIM/LabeledListTest.cpp
new file mode 100644
index 0000000..6d9ae99
--- /dev/null
+++ b/test/PIM/LabeledListTest.cpp
@@ -0,0 +1,162 @@
+#undef NDEBUG // every check in this file is an assert(); keep them active even in NDEBUG (release) builds
+#include <cassert>
+#include <cstdlib>
+#include <initializer_list>
+#include <iostream>
+#include <vector>
+#include "src/Accelerators/PIM/Common/LabeledList.hpp"
+
+using onnx_mlir::LabeledList;
+using onnx_mlir::LabeledListNode;
+
+namespace {
+
+struct TestNode : public LabeledListNode<TestNode> { // List element whose only payload is the id the tests check.
+  explicit TestNode(int id)
+  : id(id) {}
+
+  int id;
+};
+
+void assertOrder(LabeledList<TestNode>& list, std::initializer_list<int> expectedOrder) {
+  std::size_t position = 0; // walks expectedOrder in lockstep with the list; both must end together
+  for (auto& node : list) {
+    assert(position < expectedOrder.size());
+    assert(node.id == expectedOrder.begin()[position]);
+    ++position;
+  }
+  assert(position == expectedOrder.size());
+}
+
+// Asserts that the order label of every node is strictly greater than the label of the node before it.
+// An empty or single-element list passes trivially.
+void assertStrictlyIncreasingLabels(LabeledList<TestNode>& list) {
+  TestNode* predecessor = nullptr;
+  for (auto& node : list) {
+    // The first node has nothing to be compared with.
+    if (predecessor != nullptr) {
+      assert(predecessor->getOrderLabel() < node.getOrderLabel());
+    }
+    predecessor = &node;
+  }
+}
+
+int testLabeledListBasicMutation() { // Exercises insert, move, remove and clear on a five-node list.
+  std::cout << "testLabeledListBasicMutation:" << std::endl;
+
+  LabeledList<TestNode> list;
+  TestNode n1(1);
+  TestNode n2(2);
+  TestNode n3(3);
+  TestNode n4(4);
+  TestNode n5(5);
+
+  assert(list.empty()); // an empty list and an unlinked node must answer every query with "nothing"
+  assert(list.front() == nullptr);
+  assert(list.back() == nullptr);
+  assert(!list.contains(&n1));
+  assert(LabeledList<TestNode>::previous(&n1) == nullptr);
+  assert(LabeledList<TestNode>::next(&n1) == nullptr);
+
+  list.pushBack(&n1);
+  list.pushBack(&n3);
+  list.insertAfter(&n1, &n2);
+  list.pushFront(&n4);
+  list.insertBefore(nullptr, &n5); // a null anchor is expected to append: n5 is asserted to be back() below
+
+  assert(list.size() == 5);
+  assert(list.front() == &n4);
+  assert(list.back() == &n5);
+  assert(list.contains(&n2));
+  assertOrder(list, {4, 1, 2, 3, 5});
+  assert(LabeledList<TestNode>::next(&n4) == &n1);
+  assert(LabeledList<TestNode>::previous(&n1) == &n4);
+  assert(LabeledList<TestNode>::next(&n5) == nullptr);
+  assert(list.comesBefore(&n1, &n3));
+  assert(list.getOrderLabel(&n1) < list.getOrderLabel(&n3));
+
+  list.moveBefore(&n5, &n2); // argument order is (node to move, anchor): n5 ends up right before n2
+  assertOrder(list, {4, 1, 5, 2, 3});
+
+  list.moveAfter(&n4, &n3); // n4 ends up right after n3
+  assertOrder(list, {1, 5, 2, 3, 4});
+
+  list.remove(&n2);
+  assert(!n2.isLinked());
+  assert(!list.contains(&n2));
+  assertOrder(list, {1, 5, 3, 4});
+
+  list.clear(); // clearing must also unlink every node that was still in the list
+  assert(list.empty());
+  assert(list.size() == 0);
+  assert(list.front() == nullptr);
+  assert(list.back() == nullptr);
+  assert(!n1.isLinked());
+  assert(!n3.isLinked());
+  assert(!n4.isLinked());
+  assert(!n5.isLinked());
+
+  return 0;
+}
+
+int testLabeledListRelabelingAndNoopMoves() { // Repeated inserts at one spot, then moves that keep the order.
+  std::cout << "testLabeledListRelabelingAndNoopMoves:" << std::endl;
+
+  constexpr int insertedNodeCount = 80;
+  LabeledList<TestNode> list;
+  TestNode head(0);
+  TestNode tail(999);
+  std::vector<TestNode> insertedNodes;
+  insertedNodes.reserve(insertedNodeCount); // reserved up front so emplace_back never reallocates the nodes
+  for (int i = 0; i < insertedNodeCount; ++i)
+    insertedNodes.emplace_back(i + 1);
+
+  list.pushBack(&head);
+  list.pushBack(&tail);
+  for (auto& node : insertedNodes)
+    list.insertAfter(&head, &node); // always right after head, so the ids end up in descending order
+
+  assert(list.size() == insertedNodeCount + 2);
+  assert(list.front() == &head);
+  assert(list.back() == &tail);
+  assert(LabeledList<TestNode>::previous(&head) == nullptr);
+  assert(LabeledList<TestNode>::next(&tail) == nullptr);
+  assertStrictlyIncreasingLabels(list);
+
+  auto* firstInserted = LabeledList<TestNode>::next(&head);
+  auto* secondInserted = LabeledList<TestNode>::next(firstInserted);
+  // NOTE(review): these three moves target positions the nodes appear to hold already; the assertions
+  // below expect the order to be unchanged afterwards.
+  list.moveBefore(firstInserted, secondInserted);
+  list.moveAfter(&head, nullptr); // presumably a null anchor means "front of the list" - confirm in LabeledList.hpp
+  list.moveAfter(&tail, LabeledList<TestNode>::previous(&tail));
+
+  assert(list.front() == &head);
+  assert(list.back() == &tail);
+  assert(firstInserted == &insertedNodes.back());
+  assert(secondInserted == &insertedNodes[insertedNodeCount - 2]);
+  assertStrictlyIncreasingLabels(list);
+
+  int expectedId = insertedNodeCount;
+  auto it = std::next(list.begin());
+  for (; it != list.end() && &*it != &tail; ++it, --expectedId)
+    assert(it->id == expectedId);
+  assert(expectedId == 0);
+  list.clear();
+  return 0;
+}
+
+} // namespace
+
+// Entry point of the LabeledList unit test.
+// Runs both test functions in order and adds up their return values.
+// The process exits with EXIT_FAILURE when that sum is non-zero.
+int main(int, char*[]) {
+  int failures = testLabeledListBasicMutation();
+  failures += testLabeledListRelabelingAndNoopMoves();
+
+  if (failures == 0)
+    return EXIT_SUCCESS;
+
+  std::cerr << failures << " test failures\n";
+  return EXIT_FAILURE;
+}
diff --git a/test/PIM/TestPIM.cpp b/test/PIM/TestPIM.cpp
deleted file mode 100644
index fdf5054..0000000
--- a/test/PIM/TestPIM.cpp
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#include "src/Accelerators/PIM/Common/LabeledList.hpp"
-#include "src/Accelerators/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.hpp"
-
-#include <cassert>
-#include <cstdlib>
-#include <initializer_list>
-#include <iostream>
-#include <vector>
-
-using onnx_mlir::LabeledList;
-using onnx_mlir::LabeledListNode;
-
-namespace {
-
-struct TestNode : public LabeledListNode<TestNode> {
-  explicit TestNode(int id)
-  : id(id) {}
-
-  int id;
-};
-
-void assertOrder(LabeledList<TestNode>& list, std::initializer_list<int> expectedOrder) {
-  auto expectedIt = expectedOrder.begin();
-  for (auto& node : list) {
-    assert(expectedIt != expectedOrder.end());
-    assert(node.id == *expectedIt);
-    ++expectedIt;
-  }
-  assert(expectedIt == expectedOrder.end());
-}
-
-int testLabeledList() {
-  std::cout << "testLabeledList:" << std::endl;
-
-  LabeledList<TestNode> list;
-  TestNode n1(1);
-  TestNode n2(2);
-  TestNode n3(3);
-  TestNode n4(4);
-  TestNode n5(5);
-
-  list.pushBack(&n1);
-  list.pushBack(&n3);
-  list.insertAfter(&n1, &n2);
-  list.pushFront(&n4);
-  list.insertBefore(nullptr, &n5);
-
-  assertOrder(list, {4, 1, 2, 3, 5});
-  assert(LabeledList<TestNode>::next(&n4) == &n1);
-  assert(LabeledList<TestNode>::previous(&n1) == &n4);
-  assert(LabeledList<TestNode>::next(&n5) == nullptr);
-  assert(list.comesBefore(&n1, &n3));
-  assert(list.getOrderLabel(&n1) < list.getOrderLabel(&n3));
-
-  list.moveBefore(&n5, &n2);
-  assertOrder(list, {4, 1, 5, 2, 3});
-
-  list.moveAfter(&n4, &n3);
-  assertOrder(list, {1, 5, 2, 3, 4});
-
-  list.remove(&n2);
-  assert(!n2.isLinked());
-  assertOrder(list, {1, 5, 3, 4});
-
-  list.clear();
-  assert(list.empty());
-  assert(!n1.isLinked());
-  assert(!n3.isLinked());
-  assert(!n4.isLinked());
-  assert(!n5.isLinked());
-
-  return 0;
-}
-
-struct ExpectedScheduledTask {
-  size_t nodeIndex;
-  int aest;
-  int alst;
-  int weight;
-};
-
-void assertScheduledTasks(GraphDCP& graph, CPU cpu, std::initializer_list<ExpectedScheduledTask> expectedTasks) {
-  auto actualTasks = graph.getScheduledTasks(cpu);
-  assert(actualTasks.size() == expectedTasks.size());
-
-  auto expectedIt = expectedTasks.begin();
-  for (const auto& actualTask : actualTasks) {
-    assert(expectedIt != expectedTasks.end());
-    if (actualTask.nodeIndex != expectedIt->nodeIndex || actualTask.aest != expectedIt->aest
-        || actualTask.alst != expectedIt->alst || actualTask.weight != expectedIt->weight) {
-      std::cerr << "CPU " << cpu << " actual schedule:\n";
-      for (const auto& task : actualTasks) {
-        std::cerr << "  " << task.nodeIndex << ") aest: " << task.aest << " alst: " << task.alst
-                  << " weight: " << task.weight << '\n';
-      }
-    }
-    assert(actualTask.nodeIndex == expectedIt->nodeIndex);
-    assert(actualTask.aest == expectedIt->aest);
-    assert(actualTask.alst == expectedIt->alst);
-    assert(actualTask.weight == expectedIt->weight);
-    ++expectedIt;
-  }
-}
-
-int testDCPGraphFixture() {
-  std::cout << "testDCPGraphFixture:" << std::endl;
-
-  const std::vector<Weight_t> nodeWeights = {
-    80, 40, 40, 40, 40, 40, 60, 30, 30, 30,
-    30, 40, 20, 20, 20, 20, 10, 10,
-  };
-
-  GraphDCP graph(nodeWeights, {});
-  graph.makeEdge(0, 1, 3);
-  graph.makeEdge(0, 1, 120);
-  graph.makeEdge(0, 2, 120);
-  graph.makeEdge(0, 3, 120);
-  graph.makeEdge(0, 4, 120);
-  graph.makeEdge(0, 5, 120);
-  graph.makeEdge(0, 6, 120);
-  graph.makeEdge(2, 6, 80);
-  graph.makeEdge(2, 7, 80);
-  graph.makeEdge(3, 8, 80);
-  graph.makeEdge(4, 9, 80);
-  graph.makeEdge(5, 10, 80);
-  graph.makeEdge(6, 7, 120);
-  graph.makeEdge(6, 8, 120);
-  graph.makeEdge(6, 9, 120);
-  graph.makeEdge(6, 10, 120);
-  graph.makeEdge(6, 11, 120);
-  graph.makeEdge(8, 11, 80);
-  graph.makeEdge(8, 12, 80);
-  graph.makeEdge(9, 13, 80);
-  graph.makeEdge(10, 14, 80);
-  graph.makeEdge(11, 12, 120);
-  graph.makeEdge(11, 13, 120);
-  graph.makeEdge(11, 14, 120);
-  graph.makeEdge(11, 15, 120);
-  graph.makeEdge(13, 15, 80);
-  graph.makeEdge(13, 16, 80);
-  graph.makeEdge(14, 17, 80);
-  graph.makeEdge(15, 16, 120);
-  graph.makeEdge(15, 17, 120);
-
-  graph.DCP();
-  for (CPU cpu = 0; cpu < graph.cpuCount(); ++cpu) {
-    auto scheduledTasks = graph.getScheduledTasks(cpu);
-    std::cerr << "CPU " << cpu << " computed schedule:\n";
-    for (const auto& task : scheduledTasks) {
-      std::cerr << "  " << task.nodeIndex << ") aest: " << task.aest << " alst: " << task.alst
-                << " weight: " << task.weight << '\n';
-    }
-  }
-  assert(graph.cpuCount() == 4);
-  assertScheduledTasks(graph, 3, {
-    {1, 200, 370, 40},
-  });
-  assertScheduledTasks(graph, 2, {
-    {5, 200, 260, 40},
-    {10, 300, 300, 30},
-  });
-  assertScheduledTasks(graph, 1, {
-    {4, 200, 210, 40},
-    {7, 300, 380, 30},
-  });
-  assertScheduledTasks(graph, 0, {
-    {0, 0, 0, 80},
-    {2, 80, 80, 40},
-    {6, 120, 120, 60},
-    {3, 180, 200, 40},
-    {8, 220, 240, 30},
-    {11, 250, 270, 40},
-    {12, 290, 310, 20},
-    {9, 320, 330, 30},
-    {13, 350, 360, 20},
-    {15, 370, 380, 20},
-    {16, 390, 400, 10},
-    {14, 410, 410, 20},
-    {17, 430, 430, 10},
-  });
-  return 0;
-}
-
-} // namespace
-
-int main(int argc, char* argv[]) {
-  (void) argc;
-  (void) argv;
-
-  int failures = 0;
-  failures += testLabeledList();
-  failures += testDCPGraphFixture();
-  if (failures != 0) {
-    std::cerr << failures << " test failures\n";
-    return EXIT_FAILURE;
-  }
-  return EXIT_SUCCESS;
-}