diff --git a/src/PIM/Compiler/PimCompilerOptions.cpp b/src/PIM/Compiler/PimCompilerOptions.cpp
index 515bff5..4eee65c 100644
--- a/src/PIM/Compiler/PimCompilerOptions.cpp
+++ b/src/PIM/Compiler/PimCompilerOptions.cpp
@@ -19,7 +19,6 @@ llvm::cl::opt<PimMergeSchedulerType>
   pimMergeScheduler("pim-merge-scheduler",
                     llvm::cl::desc("Scheduler used by the Spatial merge-compute-nodes pass"),
                     llvm::cl::values(clEnumValN(MergeSchedulerPeft, "peft", "Use PEFT scheduling")),
-                    llvm::cl::values(clEnumValN(MergeSchedulerDcp, "dcp", "Use the legacy DCP-inspired scheduler")),
                     llvm::cl::init(MergeSchedulerPeft),
                     llvm::cl::cat(OnnxMlirOptions));
 
@@ -49,12 +48,6 @@ llvm::cl::opt<long> coresCount("core-count",
                                llvm::cl::desc("Number of cores in the chip. Required for PIM compilation."),
                                llvm::cl::init(-1));
 
-llvm::cl::opt<size_t> dcpCriticalWindowSize(
-  "dcp-critical-window-size",
-  llvm::cl::desc("Number of lowest-slack virtual nodes considered by each DCP coarsening iteration. "
-                 "Use 0 to run the legacy full-graph DCP analysis. Only used by the DCP scheduler."),
-  llvm::cl::init(4000));
-
 llvm::cl::opt<bool>
   ignoreConcatError("ignore-concat-error",
                     llvm::cl::desc("Ignore ConcatOp corner case: do not assert and do a simplification"),
diff --git a/src/PIM/Compiler/PimCompilerOptions.hpp b/src/PIM/Compiler/PimCompilerOptions.hpp
index a5f1d16..05e51ab 100644
--- a/src/PIM/Compiler/PimCompilerOptions.hpp
+++ b/src/PIM/Compiler/PimCompilerOptions.hpp
@@ -22,7 +22,6 @@ typedef enum {
 
 typedef enum {
   MergeSchedulerPeft = 0,
-  MergeSchedulerDcp = 1,
 } PimMergeSchedulerType;
 
 extern llvm::cl::OptionCategory OnnxMlirOptions;
@@ -36,7 +35,6 @@ extern llvm::cl::opt<bool> pimEmitJson;
 extern llvm::cl::opt<size_t> crossbarSize;
 extern llvm::cl::opt<size_t> crossbarCountInCore;
 extern llvm::cl::opt<long> coresCount;
-extern llvm::cl::opt<size_t> dcpCriticalWindowSize;
 
 bool hasExplicitPimCoreCount();
 void verifyExplicitPimCoreCount();
diff --git a/src/PIM/Conversion/ONNXToSpatial/CMakeLists.txt b/src/PIM/Conversion/ONNXToSpatial/CMakeLists.txt
index bccab97..776e445 100644
--- a/src/PIM/Conversion/ONNXToSpatial/CMakeLists.txt
+++ b/src/PIM/Conversion/ONNXToSpatial/CMakeLists.txt
@@ -4,7 +4,7 @@ add_public_tablegen_target(ONNXToSpatialIncGen)
 
 add_pim_library(OMONNXToSpatial
   ConversionPatterns.cpp
-  HostFoldability.cpp
+  CompileTime.cpp
   HostLegality.cpp
   PrePatterns.cpp
   PostPatterns.cpp
diff --git a/src/PIM/Conversion/ONNXToSpatial/Common/ShapeTilingUtils.cpp b/src/PIM/Conversion/ONNXToSpatial/Common/ShapeTilingUtils.cpp
index 6f518eb..52a1efb 100644
--- a/src/PIM/Conversion/ONNXToSpatial/Common/ShapeTilingUtils.cpp
+++ b/src/PIM/Conversion/ONNXToSpatial/Common/ShapeTilingUtils.cpp
@@ -6,7 +6,7 @@
 #include "ShapeTilingUtils.hpp"
 #include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
-#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
+#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
 
 using namespace mlir;
 
@@ -44,7 +44,7 @@ SmallVector<Value> sliceTensor(
       RankedTensorType::get(sliceShape, cast<RankedTensorType>(tensorToSlice.getType()).getElementType());
 
     Value slice;
-    if (isHostFoldableValue(tensorToSlice)) {
+    if (isCompileTimeComputable(tensorToSlice)) {
       slice = tensor::ExtractSliceOp::create(rewriter, loc, tensorToSlice, offsets, sizes, strides);
     }
     else {
@@ -113,7 +113,7 @@ Value broadcastToVector(Value scalarToBroadcast, int64_t length, ConversionPatte
     return tensor::SplatOp::create(rewriter, loc, type, elementValue);
   };
 
-  if (isHostFoldableValue(scalarToBroadcast))
+  if (isCompileTimeComputable(scalarToBroadcast))
     return buildBroadcast(scalarToBroadcast);
 
   auto broadcastCompute =
diff --git a/src/PIM/Conversion/ONNXToSpatial/HostFoldability.cpp b/src/PIM/Conversion/ONNXToSpatial/CompileTime.cpp
similarity index 84%
rename from src/PIM/Conversion/ONNXToSpatial/HostFoldability.cpp
rename to src/PIM/Conversion/ONNXToSpatial/CompileTime.cpp
index c42e186..1ae980c 100644
--- a/src/PIM/Conversion/ONNXToSpatial/HostFoldability.cpp
+++ b/src/PIM/Conversion/ONNXToSpatial/CompileTime.cpp
@@ -8,7 +8,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 
-#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
+#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
 #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
 #include "src/Dialect/ONNX/ONNXOps.hpp"
 
@@ -145,7 +145,7 @@ static DenseElementsAttr getDirectDenseConstantAttr(Value value) {
   return nullptr;
 }
 
-static DenseElementsAttr getHostFoldableDenseElementsAttrImpl(Value value, llvm::SmallPtrSetImpl<Operation*>& visited) {
+static DenseElementsAttr getHostConstantDenseElementsAttrImpl(Value value, llvm::SmallPtrSetImpl<Operation*>& visited) {
   auto* definingOp = value.getDefiningOp();
   if (!definingOp || !visited.insert(definingOp).second)
     return nullptr;
@@ -156,7 +156,7 @@ static DenseElementsAttr getHostFoldableDenseElementsAttrImpl(Value value, llvm:
     return denseAttr;
 
   if (auto transposeOp = dyn_cast<ONNXTransposeOp>(definingOp)) {
-    auto inputAttr = getHostFoldableDenseElementsAttrImpl(transposeOp.getData(), visited);
+    auto inputAttr = getHostConstantDenseElementsAttrImpl(transposeOp.getData(), visited);
     if (!inputAttr)
       return nullptr;
 
@@ -169,7 +169,7 @@ static DenseElementsAttr getHostFoldableDenseElementsAttrImpl(Value value, llvm:
   }
 
   if (auto collapseShapeOp = dyn_cast<tensor::CollapseShapeOp>(definingOp)) {
-    auto inputAttr = getHostFoldableDenseElementsAttrImpl(collapseShapeOp.getSrc(), visited);
+    auto inputAttr = getHostConstantDenseElementsAttrImpl(collapseShapeOp.getSrc(), visited);
     if (!inputAttr)
       return nullptr;
     auto reshapedAttr = reshapeDenseElements(inputAttr, cast<RankedTensorType>(collapseShapeOp.getType()));
@@ -177,7 +177,7 @@ static DenseElementsAttr getHostFoldableDenseElementsAttrImpl(Value value, llvm:
   }
 
   if (auto expandShapeOp = dyn_cast<tensor::ExpandShapeOp>(definingOp)) {
-    auto inputAttr = getHostFoldableDenseElementsAttrImpl(expandShapeOp.getSrc(), visited);
+    auto inputAttr = getHostConstantDenseElementsAttrImpl(expandShapeOp.getSrc(), visited);
     if (!inputAttr)
       return nullptr;
     auto reshapedAttr = reshapeDenseElements(inputAttr, cast<RankedTensorType>(expandShapeOp.getType()));
@@ -185,7 +185,7 @@ static DenseElementsAttr getHostFoldableDenseElementsAttrImpl(Value value, llvm:
   }
 
   if (auto extractSliceOp = dyn_cast<tensor::ExtractSliceOp>(definingOp)) {
-    auto inputAttr = getHostFoldableDenseElementsAttrImpl(extractSliceOp.getSource(), visited);
+    auto inputAttr = getHostConstantDenseElementsAttrImpl(extractSliceOp.getSource(), visited);
     if (!inputAttr)
       return nullptr;
     auto slicedAttr = extractSliceDenseElements(inputAttr, extractSliceOp);
@@ -195,62 +195,71 @@ static DenseElementsAttr getHostFoldableDenseElementsAttrImpl(Value value, llvm:
   return nullptr;
 }
 
-static bool isHostFoldableOpImpl(Operation* op, llvm::SmallPtrSetImpl<Operation*>& visited) {
-  if (!op || !visited.insert(op).second)
+static bool isCompileTimeOpImpl(Operation* op, llvm::SmallPtrSetImpl<Operation*>& visited) {
+  if (!op)
     return false;
 
+  if (!visited.insert(op).second)
+    return true;
+
   if (isa<arith::ConstantOp, ONNXConstantOp, ONNXNoneOp>(op))
     return true;
 
   if (auto extractOp = dyn_cast<tensor::ExtractOp>(op))
-    return hasConstantIndices(extractOp) && isHostFoldableValue(extractOp.getTensor());
+    return hasConstantIndices(extractOp) && isCompileTimeOpImpl(extractOp, visited);
 
   if (!isStaticTensorResult(op))
     return false;
 
   if (auto transposeOp = dyn_cast<ONNXTransposeOp>(op))
-    return isHostFoldableValue(transposeOp.getData());
+    return isCompileTimeOpImpl(transposeOp, visited);
 
   if (auto collapseShapeOp = dyn_cast<tensor::CollapseShapeOp>(op))
-    return isHostFoldableValue(collapseShapeOp.getSrc());
+    return isCompileTimeOpImpl(collapseShapeOp,visited);
 
   if (auto expandShapeOp = dyn_cast<tensor::ExpandShapeOp>(op))
-    return isHostFoldableValue(expandShapeOp.getSrc());
+    return isCompileTimeOpImpl(expandShapeOp, visited);
 
   if (auto extractSliceOp = dyn_cast<tensor::ExtractSliceOp>(op))
-    return hasStaticUnitStrides(extractSliceOp) && isHostFoldableValue(extractSliceOp.getSource());
+    return hasStaticUnitStrides(extractSliceOp) && isCompileTimeOpImpl(extractSliceOp, visited);
 
   if (auto splatOp = dyn_cast<tensor::SplatOp>(op))
-    return isHostFoldableValue(splatOp.getInput());
+    return isCompileTimeOpImpl(splatOp, visited);
 
   if (auto extractRowsOp = dyn_cast<spatial::SpatExtractRowsOp>(op))
-    return isHostFoldableValue(extractRowsOp.getInput());
+    return isCompileTimeOpImpl(extractRowsOp, visited);
 
-  if (auto concatOp = dyn_cast<spatial::SpatConcatOp>(op))
-    return llvm::all_of(concatOp.getInputs(), isHostFoldableValue);
+  if (auto concatOp = dyn_cast<spatial::SpatConcatOp>(op)){
+    bool res = true;
+    for(auto operandValue : concatOp.getOperands()){
+      res &= isCompileTimeOpImpl(operandValue.getDefiningOp(), visited);
+      if(!res) break;
+    }
+    return res;
+  }
 
   return false;
 }
 
 } // namespace
 
-bool isHostFoldableValue(Value value) {
+bool isCompileTimeComputable(Value value) {
   auto* definingOp = value.getDefiningOp();
   if (!definingOp)
     return false;
 
   llvm::SmallPtrSet<Operation*, 8> visited;
-  return isHostFoldableOpImpl(definingOp, visited);
+  return isCompileTimeOpImpl(definingOp, visited);
 }
 
-bool isHostFoldableOp(Operation* op) {
+bool isCompileTimeOp(Operation* op) {
   llvm::SmallPtrSet<Operation*, 8> visited;
-  return isHostFoldableOpImpl(op, visited);
+  return isCompileTimeOpImpl(op, visited);
 }
 
-DenseElementsAttr getHostFoldableDenseElementsAttr(Value value) {
+DenseElementsAttr getHostConstDenseElementsAttr(Value value) {
   llvm::SmallPtrSet<Operation*, 8> visited;
-  return getHostFoldableDenseElementsAttrImpl(value, visited);
+  return getHostConstantDenseElementsAttrImpl(value, visited);
 }
 
 } // namespace onnx_mlir
diff --git a/src/PIM/Conversion/ONNXToSpatial/CompileTime.hpp b/src/PIM/Conversion/ONNXToSpatial/CompileTime.hpp
new file mode 100644
index 0000000..f0574b5
--- /dev/null
+++ b/src/PIM/Conversion/ONNXToSpatial/CompileTime.hpp
@@ -0,0 +1,15 @@
+#pragma once
+
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/Value.h"
+
+namespace onnx_mlir {
+
+bool isCompileTimeComputable(mlir::Value value);
+
+bool isCompileTimeOp(mlir::Operation* op);
+
+mlir::DenseElementsAttr getHostConstDenseElementsAttr(mlir::Value value);
+
+} // namespace onnx_mlir
diff --git a/src/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp b/src/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp
deleted file mode 100644
index 3e3437c..0000000
--- a/src/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp
+++ /dev/null
@@ -1,15 +0,0 @@
-#pragma once
-
-#include "mlir/IR/BuiltinAttributes.h"
-#include "mlir/IR/Operation.h"
-#include "mlir/IR/Value.h"
-
-namespace onnx_mlir {
-
-bool isHostFoldableValue(mlir::Value value);
-
-bool isHostFoldableOp(mlir::Operation* op);
-
-mlir::DenseElementsAttr getHostFoldableDenseElementsAttr(mlir::Value value);
-
-} // namespace onnx_mlir
diff --git a/src/PIM/Conversion/ONNXToSpatial/HostLegality.cpp b/src/PIM/Conversion/ONNXToSpatial/HostLegality.cpp
index 252631a..9a9c297 100644
--- a/src/PIM/Conversion/ONNXToSpatial/HostLegality.cpp
+++ b/src/PIM/Conversion/ONNXToSpatial/HostLegality.cpp
@@ -3,7 +3,7 @@
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 
 #include "src/Accelerators/PIM/Common/Support/Diagnostics.hpp"
-#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
+#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostLegality.hpp"
 #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
 
@@ -17,7 +17,7 @@ LogicalResult verifyONNXToSpatialHostLegality(func::FuncOp funcOp) {
   for (Operation& op : funcOp.getFunctionBody().front()) {
     if (isa<func::ReturnOp, spatial::SpatCompute, spatial::SpatComputeBatch>(&op))
       continue;
-    if (isHostFoldableOp(&op))
+    if (isCompileTimeOp(&op))
       continue;
 
     diagnostics.report(&op, [](Operation* illegalOp) {
diff --git a/src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPass.cpp b/src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPass.cpp
index 87da1e9..007f019 100644
--- a/src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPass.cpp
+++ b/src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPass.cpp
@@ -13,7 +13,7 @@
 #include "Common/Common.hpp"
 #include "Common/PimCommon.hpp"
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ConversionPatterns.hpp"
-#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
+#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostLegality.hpp"
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/PostPatterns.hpp"
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/PrePatterns.hpp"
@@ -92,7 +92,7 @@ static void wrapTopLevelRuntimeTransposes(func::FuncOp funcOp) {
 
   for (Operation& op : llvm::make_early_inc_range(entryBlock)) {
     auto transposeOp = dyn_cast<ONNXTransposeOp>(&op);
-    if (!transposeOp || isHostFoldableOp(transposeOp))
+    if (!transposeOp || isCompileTimeOp(transposeOp))
       continue;
 
     // Transpose stays globally legal because constant/view-only cases are
diff --git a/src/PIM/Conversion/ONNXToSpatial/Patterns/Math/Conv.cpp b/src/PIM/Conversion/ONNXToSpatial/Patterns/Math/Conv.cpp
index 0a855c4..c1f9a80 100644
--- a/src/PIM/Conversion/ONNXToSpatial/Patterns/Math/Conv.cpp
+++ b/src/PIM/Conversion/ONNXToSpatial/Patterns/Math/Conv.cpp
@@ -11,7 +11,7 @@
 #include "src/Accelerators/PIM/Common/Support/Diagnostics.hpp"
 #include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
-#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
+#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
 #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
 #include "src/Dialect/ONNX/ONNXOps.hpp"
 
@@ -391,7 +391,7 @@ static Value lowerSingleConvGroup(Value x,
   const int64_t xbarSize = static_cast<int64_t>(crossbarSize.getValue());
   const int64_t wMaxDim = std::max(patchSize, numChannelsOut);
   const int64_t maxParallelPixels = std::max<int64_t>(1, xbarSize / wMaxDim);
-  auto wDenseAttr = getHostFoldableDenseElementsAttr(w);
+  auto wDenseAttr = getHostConstDenseElementsAttr(w);
 
   // Prepare weight matrix W for crossbar storage:
   // W: [numChannelsOut, numChannelsIn, wHeight, wWidth] -> [numChannelsOut, patchSize] -> [patchSize, numChannelsOut]
@@ -412,7 +412,7 @@ static Value lowerSingleConvGroup(Value x,
   DenseElementsAttr biasDenseAttr;
   if (hasB) {
     gemmBias = b;
-    biasDenseAttr = getHostFoldableDenseElementsAttr(b);
+    biasDenseAttr = getHostConstDenseElementsAttr(b);
     biasMatrix = expandBiasIfNeeded(b, rewriter, loc);
   }
   const bool canPackWeightsAsConstants = static_cast<bool>(wDenseAttr);
@@ -717,7 +717,7 @@ LogicalResult ConvToGemm::matchAndRewrite(ONNXConvOp convOp,
   }
 
   Value result;
-  if (llvm::all_of(groupResults, isHostFoldableValue)) {
+  if (llvm::all_of(groupResults, isCompileTimeComputable)) {
     result = createSpatConcat(rewriter, loc, /*axis=*/1, groupResults);
   }
   else {
diff --git a/src/PIM/Conversion/ONNXToSpatial/Patterns/Math/Gemm.cpp b/src/PIM/Conversion/ONNXToSpatial/Patterns/Math/Gemm.cpp
index 20511b3..5284daf 100644
--- a/src/PIM/Conversion/ONNXToSpatial/Patterns/Math/Gemm.cpp
+++ b/src/PIM/Conversion/ONNXToSpatial/Patterns/Math/Gemm.cpp
@@ -11,7 +11,7 @@
 #include "src/Accelerators/PIM/Common/PimCommon.hpp"
 #include "src/Accelerators/PIM/Common/Support/Diagnostics.hpp"
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
-#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
+#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
 #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
 #include "src/Dialect/ONNX/ONNXOps.hpp"
 
@@ -55,7 +55,7 @@ static Value transposeForSpatial(Value value,
                                  ArrayRef<int64_t> permutation,
                                  ConversionPatternRewriter& rewriter,
                                  Location loc) {
-  if (isHostFoldableValue(value))
+  if (isCompileTimeComputable(value))
     return ONNXTransposeOp::create(rewriter, loc, resultType, value, rewriter.getI64ArrayAttr(permutation));
 
   auto computeOp = createSpatCompute<1>(rewriter, loc, TypeRange {resultType}, {}, value, [&](Value input) {
@@ -67,7 +67,7 @@ static Value transposeForSpatial(Value value,
 
 static Value
 expandRankOneBias(Value value, RankedTensorType resultType, ConversionPatternRewriter& rewriter, Location loc) {
-  if (isHostFoldableValue(value))
+  if (isCompileTimeComputable(value))
     return tensor::ExpandShapeOp::create(rewriter,
                                          loc,
                                          resultType,
@@ -121,7 +121,7 @@ static SmallVector<Value> materializeBatchRowSlices(Value matrix,
   auto rowType = RankedTensorType::get({1, matrixType.getDimSize(1)}, matrixType.getElementType());
   SmallVector<Type> resultTypes(static_cast<size_t>(numRows), rowType);
 
-  if (isHostFoldableValue(matrix)) {
+  if (isCompileTimeComputable(matrix)) {
     auto extractRowsOp = spatial::SpatExtractRowsOp::create(rewriter, loc, TypeRange(resultTypes), matrix);
     return SmallVector<Value>(extractRowsOp->result_begin(), extractRowsOp->result_end());
   }
diff --git a/src/PIM/Conversion/ONNXToSpatial/Patterns/Math/MatMul.cpp b/src/PIM/Conversion/ONNXToSpatial/Patterns/Math/MatMul.cpp
index 03a972a..7609b91 100644
--- a/src/PIM/Conversion/ONNXToSpatial/Patterns/Math/MatMul.cpp
+++ b/src/PIM/Conversion/ONNXToSpatial/Patterns/Math/MatMul.cpp
@@ -10,7 +10,7 @@
 
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ConversionPatterns.hpp"
-#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
+#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
 #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
 #include "src/Dialect/ONNX/ONNXOps.hpp"
 
@@ -55,7 +55,7 @@ collapseBatchDims(Value value, int64_t batchSize, int64_t rows, int64_t cols, Pa
     return tensor::CollapseShapeOp::create(rewriter, loc, collapsedType, input, reassociation);
   };
 
-  if (isHostFoldableValue(value))
+  if (isCompileTimeComputable(value))
     return buildCollapsed(value);
 
   auto collapseCompute =
@@ -114,7 +114,7 @@ static Value extractBatchMatrix(Value value,
     });
   };
 
-  if (isHostFoldableValue(value))
+  if (isCompileTimeComputable(value))
     return buildMatrix(value);
 
   auto batchMatrixCompute =
@@ -142,7 +142,7 @@ static Value transposeLastTwoDims(Value value, PatternRewriter& rewriter, Locati
     return ONNXTransposeOp::create(rewriter, loc, transposedType, input, rewriter.getI64ArrayAttr(perm));
   };
 
-  if (isHostFoldableValue(value))
+  if (isCompileTimeComputable(value))
     return buildTranspose(value);
 
   auto transposeCompute =
@@ -182,7 +182,7 @@ static Value concatValues(ValueRange inputs, int64_t axis, PatternRewriter& rewr
   outputShape[axis] = concatDimSize;
   auto resultType = RankedTensorType::get(outputShape, firstType.getElementType(), firstType.getEncoding());
 
-  if (llvm::all_of(inputs, isHostFoldableValue))
+  if (llvm::all_of(inputs, isCompileTimeComputable))
     return createSpatConcat(rewriter, loc, axis, inputs);
 
   auto concatCompute = createSpatCompute(rewriter, loc, TypeRange {resultType}, {}, inputs, [&](ValueRange args) {
@@ -235,7 +235,7 @@ struct MatMulToGemm : OpRewritePattern<ONNXMatMulOp> {
     }
 
     Location loc = matmulOp.getLoc();
-    bool useTransposedForm = isHostFoldableValue(matmulOp.getA()) && !isHostFoldableValue(matmulOp.getB());
+    bool useTransposedForm = isCompileTimeComputable(matmulOp.getA()) && !isCompileTimeComputable(matmulOp.getB());
 
     Value lhs = collapseBatchDims(matmulOp.getA(), lhsBatch, m, k, rewriter, loc);
     Value rhs = collapseBatchDims(matmulOp.getB(), rhsBatch, k, n, rewriter, loc);
diff --git a/src/PIM/Conversion/ONNXToSpatial/Patterns/Math/ReduceMean.cpp b/src/PIM/Conversion/ONNXToSpatial/Patterns/Math/ReduceMean.cpp
index fa96e2e..b2f8381 100644
--- a/src/PIM/Conversion/ONNXToSpatial/Patterns/Math/ReduceMean.cpp
+++ b/src/PIM/Conversion/ONNXToSpatial/Patterns/Math/ReduceMean.cpp
@@ -7,7 +7,7 @@
 
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ConversionPatterns.hpp"
-#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
+#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
 #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
 #include "src/Dialect/ONNX/ONNXOps.hpp"
 
@@ -91,7 +91,7 @@ static Value concatValues(ValueRange inputs, int64_t axis, ConversionPatternRewr
   outputShape[axis] = concatDimSize;
   auto resultType = RankedTensorType::get(outputShape, firstType.getElementType(), firstType.getEncoding());
 
-  if (llvm::all_of(inputs, isHostFoldableValue))
+  if (llvm::all_of(inputs, isCompileTimeComputable))
     return createSpatConcat(rewriter, loc, axis, inputs);
 
   auto concatCompute = createSpatCompute(rewriter, loc, TypeRange {resultType}, {}, inputs, [&](ValueRange args) {
@@ -135,7 +135,7 @@ static Value squeezeReducedAxes(Value keepdimsValue,
   }
 
   auto reassociation = buildCollapseReassociation(reducedAxes);
-  if (isHostFoldableValue(keepdimsValue))
+  if (isCompileTimeComputable(keepdimsValue))
     return tensor::CollapseShapeOp::create(rewriter, loc, resultType, keepdimsValue, reassociation).getResult();
 
   auto squeezeCompute =
diff --git a/src/PIM/Conversion/ONNXToSpatial/Patterns/Tensor/Concat.cpp b/src/PIM/Conversion/ONNXToSpatial/Patterns/Tensor/Concat.cpp
index 4d616e8..86a96a6 100644
--- a/src/PIM/Conversion/ONNXToSpatial/Patterns/Tensor/Concat.cpp
+++ b/src/PIM/Conversion/ONNXToSpatial/Patterns/Tensor/Concat.cpp
@@ -3,7 +3,7 @@
 
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/ComputeRegionBuilder.hpp"
-#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
+#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
 #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
 #include "src/Dialect/ONNX/ONNXOps.hpp"
 
@@ -20,7 +20,7 @@ struct Concat : public OpConversionPattern<ONNXConcatOp> {
     auto inputs = adaptor.getInputs();
     int64_t axis = adaptor.getAxis();
 
-    if (llvm::all_of(inputs, isHostFoldableValue)) {
+    if (llvm::all_of(inputs, isCompileTimeComputable)) {
       rewriter.replaceOp(maxpoolOp, createSpatConcat(rewriter, maxpoolOp.getLoc(), axis, inputs));
       return success();
     }
diff --git a/src/PIM/Conversion/ONNXToSpatial/Patterns/Tensor/Reshape.cpp b/src/PIM/Conversion/ONNXToSpatial/Patterns/Tensor/Reshape.cpp
index f44c524..4c1131f 100644
--- a/src/PIM/Conversion/ONNXToSpatial/Patterns/Tensor/Reshape.cpp
+++ b/src/PIM/Conversion/ONNXToSpatial/Patterns/Tensor/Reshape.cpp
@@ -5,7 +5,7 @@
 
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ConversionPatterns.hpp"
-#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
+#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
 #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
 #include "src/Dialect/ONNX/ONNXOps.hpp"
 
@@ -115,7 +115,7 @@ struct Reshape : OpConversionPattern<ONNXReshapeOp> {
     }
 
     auto replaceWithReshape = [&](auto buildReshape) -> LogicalResult {
-      if (isHostFoldableValue(adaptor.getData())) {
+      if (isCompileTimeComputable(adaptor.getData())) {
         rewriter.replaceOp(reshapeOp, buildReshape(adaptor.getData()));
         return success();
       }
diff --git a/src/PIM/Conversion/ONNXToSpatial/Patterns/Tensor/Split.cpp b/src/PIM/Conversion/ONNXToSpatial/Patterns/Tensor/Split.cpp
index 9a3e6b2..8683481 100644
--- a/src/PIM/Conversion/ONNXToSpatial/Patterns/Tensor/Split.cpp
+++ b/src/PIM/Conversion/ONNXToSpatial/Patterns/Tensor/Split.cpp
@@ -3,7 +3,7 @@
 
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
 #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ConversionPatterns.hpp"
-#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
+#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
 #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
 #include "src/Dialect/ONNX/ONNXOps.hpp"
 
@@ -61,7 +61,7 @@ struct Split : OpConversionPattern<ONNXSplitOp> {
       sliceSizes.push_back(resultType.getShape()[axis]);
     }
 
-    if (isHostFoldableValue(adaptor.getInput())) {
+    if (isCompileTimeComputable(adaptor.getInput())) {
       for (int64_t sliceSize : sliceSizes) {
         outputs.push_back(extractSliceAt(adaptor.getInput(), axis, offset, sliceSize, rewriter, splitOp.getLoc()));
         offset += sliceSize;
diff --git a/src/PIM/Dialect/Spatial/CMakeLists.txt b/src/PIM/Dialect/Spatial/CMakeLists.txt
index cf2b5ba..f843879 100644
--- a/src/PIM/Dialect/Spatial/CMakeLists.txt
+++ b/src/PIM/Dialect/Spatial/CMakeLists.txt
@@ -11,14 +11,8 @@ add_pim_library(SpatialOps
   Transforms/MergeComputeNodes/MaterializeMergeSchedule.cpp
   Transforms/MergeComputeNodes/Scheduling/ComputeGraph.cpp
   Transforms/MergeComputeNodes/Scheduling/ComputeInstanceUtils.cpp
-  Transforms/MergeComputeNodes/Scheduling/DcpScheduler.cpp
   Transforms/MergeComputeNodes/Scheduling/MergeSchedulingAnalysis.cpp
   Transforms/MergeComputeNodes/Scheduling/PeftScheduler.cpp
-  Transforms/MergeComputeNodes/DCPGraph/Graph.cpp
-  Transforms/MergeComputeNodes/DCPGraph/GraphDebug.cpp
-  Transforms/MergeComputeNodes/DCPGraph/GraphSupport.cpp
-  Transforms/MergeComputeNodes/DCPGraph/Task.cpp
-  Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.cpp
 
   EXCLUDE_FROM_OM_LIBS
 
diff --git a/src/PIM/Dialect/Spatial/SpatialOpsVerify.cpp b/src/PIM/Dialect/Spatial/SpatialOpsVerify.cpp
index 2fbf0e2..38646c4 100644
--- a/src/PIM/Dialect/Spatial/SpatialOpsVerify.cpp
+++ b/src/PIM/Dialect/Spatial/SpatialOpsVerify.cpp
@@ -15,7 +15,7 @@
 
 #include "src/Accelerators/PIM/Common/PimCommon.hpp"
 #include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
-#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
+#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
 #include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp"
 #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
 
@@ -120,7 +120,7 @@ static bool isBatchOutputArgument(SpatComputeBatch batchOp, Value value) {
 template <typename ComputeOpTy>
 static LogicalResult verifyStaticWeights(ComputeOpTy computeOp, StringRef kind) {
   for (Value weight : computeOp.getWeights()) {
-    if (!isHostFoldableValue(weight))
+    if (!isCompileTimeComputable(weight))
       return computeOp.emitOpError() << kind << " weights must be statically computed from constants";
   }
   return success();
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.cpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.cpp
deleted file mode 100644
index cfc03d3..0000000
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "../Scheduling/ComputeGraph.hpp"
-#include "../Scheduling/DcpScheduler.hpp"
-#include "DCPAnalysis.hpp"
-#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
-
-namespace onnx_mlir {
-namespace spatial {
-
-DCPAnalysisResult DCPAnalysis::run() {
-  ComputeGraph graph = buildComputeGraph(entryOp);
-  DcpScheduleOptions options;
-  if (coresCount.getValue() > 0)
-    options.processorCount = static_cast<size_t>(coresCount.getValue());
-  options.criticalWindowSize = dcpCriticalWindowSize.getValue();
-  options.allowFallbackForAutoCoreCount = true;
-  return runDcpScheduler(graph, options, entryOp->getContext());
-}
-
-} // namespace spatial
-} // namespace onnx_mlir
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.hpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.hpp
deleted file mode 100644
index eec2d6b..0000000
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.hpp
+++ /dev/null
@@ -1,28 +0,0 @@
-#pragma once
-
-#include "mlir/IR/Operation.h"
-
-#include "../Scheduling/MergeSchedule.hpp"
-
-namespace onnx_mlir {
-namespace spatial {
-using DCPAnalysisResult = MergeScheduleResult;
-
-struct DCPAnalysis {
-private:
-  DCPAnalysisResult result;
-  mlir::Operation* entryOp;
-  DCPAnalysisResult run();
-
-public:
-  DCPAnalysis(mlir::Operation* op)
-  : entryOp(op) {
-    result = run();
-  }
-  DCPAnalysisResult& getResult() { return result; }
-};
-
-} // namespace spatial
-} // namespace onnx_mlir
-
-using DCPAnalysisResult = onnx_mlir::spatial::DCPAnalysisResult;
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.cpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.cpp
deleted file mode 100644
index 3bca16c..0000000
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.cpp
+++ /dev/null
@@ -1,1622 +0,0 @@
-//===----------------------------------------------------------------------===//
-// DCP-inspired task scheduler.
-//
-// Input: a DAG of compute tasks. Each task has an execution weight; each edge
-// carries an inter-task transfer cost that only applies when producer and
-// consumer land on different CPUs.
-//
-// Output: an assignment of every task to a CPU and an order within that CPU,
-// aiming to minimize the overall critical-path length (DCPL).
-//
-// Every task keeps two timing estimates:
-//   AEST - earliest start time, driven by parent completions + transfers.
-//   ALST - latest start time that still keeps the task on the critical path.
-// A task is "critical" when its slack (ALST - AEST) is zero.
-//
-// Main loop (runDcp):
-//   1. Build a topological order and seed AEST/ALST from the unscheduled DAG.
-//   2. While there are ready tasks (all dependency parents scheduled):
-//        a. Pick the candidate with the tightest slack (earliest AEST breaks ties).
-//        b. selectProcessor() tries every candidate CPU and picks the one that
-//           minimizes a composite cost (own slot + the smallest unscheduled child).
-//        c. Commit the placement and refresh AEST/ALST.
-//        d. Release any child whose dependency parents are now all scheduled.
-//
-// Heuristic notes: classic DCP assumes identical task costs on every CPU and
-// a single-issue processor model. We diverge - crossbar capacity can make a
-// task infeasible on a CPU, and placement happens incrementally. That makes
-// this a heuristic rather than a faithful DCP implementation.
-//
-// Parallelism: selectProcessor's per-CPU findSlot sweep is read-only, so we
-// run it concurrently across CPUs via mlir::parallelFor. The subsequent
-// sequential evaluation benefits from ordering CPUs by ascending slot.aest,
-// which tightens the bestComposite early-prune bound.
-//===----------------------------------------------------------------------===//
-
-#include "mlir/IR/MLIRContext.h"
-#include "mlir/IR/Threading.h"
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/ErrorHandling.h"
-
-#include <algorithm>
-#include <cassert>
-#include <chrono>
-#include <cstdint>
-#include <cstdio>
-#include <queue>
-#include <vector>
-
-#include "DCPAnalysis.hpp"
-#include "Graph.hpp"
-#include "GraphDebug.hpp"
-#include "GraphSupport.hpp"
-#include "Task.hpp"
-#include "UniqueWorklist.hpp"
-#include "Utils.hpp"
-#include "src/Accelerators/PIM/Common/PimCommon.hpp"
-#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
-
-#ifdef DCP_DEBUG_ENABLED
-namespace {
-// Coarse-grained phase timers printed when DCP_SELECT_PROFILE is set.
-struct SelectTimers {
-  double findSlot = 0.0;
-  double dedup = 0.0;
-  double precheck = 0.0;
-  double snapshotInsertUpdate = 0.0;
-  double childSlot = 0.0;
-  double rollbackRestore = 0.0;
-  long iterations = 0;
-  long passedPrecheck = 0;
-  long passedDcpl = 0;
-  long tasksProcessed = 0;
-  void dump(const char* label) const {
-    std::fprintf(stderr,
-                 "[selectProfile:%s] tasks=%ld dedup=%.2fs findSlot=%.2fs precheck=%.2fs snapUpd=%.2fs "
-                 "childSlot=%.2fs rollback=%.2fs iter=%ld precheckPass=%ld dcplPass=%ld\n",
-                 label,
-                 tasksProcessed,
-                 dedup,
-                 findSlot,
-                 precheck,
-                 snapshotInsertUpdate,
-                 childSlot,
-                 rollbackRestore,
-                 iterations,
-                 passedPrecheck,
-                 passedDcpl);
-  }
-  ~SelectTimers() {
-    if (std::getenv("DCP_SELECT_PROFILE"))
-      dump("exit");
-  }
-};
-static SelectTimers gSelectTimers;
-} // namespace
-#endif
-
-namespace {
-
-uint64_t mixHash(uint64_t seed, uint64_t value) {
-  seed ^= value + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2);
-  return seed;
-}
-
-uint64_t finishHash(uint64_t seed) {
-  seed ^= seed >> 33;
-  seed *= 0xff51afd7ed558ccdULL;
-  seed ^= seed >> 33;
-  seed *= 0xc4ceb9fe1a85ec53ULL;
-  seed ^= seed >> 33;
-  return seed;
-}
-
-uint64_t hashEdgeSignature(uint64_t neighborHash, Weight weight, uint64_t direction) {
-  uint64_t hash = mixHash(0x84222325cbf29ce4ULL, direction);
-  hash = mixHash(hash, neighborHash);
-  hash = mixHash(hash, static_cast<uint64_t>(weight));
-  return finishHash(hash);
-}
-
-struct CpuAestCache {
-  Time defaultAest = 0;
-  llvm::SmallDenseMap<CPU, Time, 8> colocatedParentAests;
-
-  Time get(CPU cpu) const {
-    auto it = colocatedParentAests.find(cpu);
-    if (it == colocatedParentAests.end())
-      return defaultAest;
-    return it->second;
-  }
-};
-
-struct CpuTimeMax {
-  CPU cpu = -1;
-  Time time = 0;
-};
-
-void updateCpuTimeMax(CpuTimeMax& first, CpuTimeMax& second, CPU cpu, Time time) {
-  if (first.cpu == cpu) {
-    first.time = std::max(first.time, time);
-    return;
-  }
-  if (second.cpu == cpu) {
-    second.time = std::max(second.time, time);
-    if (second.time > first.time)
-      std::swap(first, second);
-    return;
-  }
-  if (time >= first.time) {
-    second = first;
-    first = {cpu, time};
-    return;
-  }
-  if (time > second.time)
-    second = {cpu, time};
-}
-
-CpuAestCache computeCpuAestCache(TaskDCP* task) {
-  CpuAestCache cache;
-  llvm::SmallDenseMap<CPU, Time, 8> transferAestByCpu;
-  llvm::SmallDenseMap<CPU, Time, 8> localAestByCpu;
-  Time unscheduledTransferAest = 0;
-
-  for (const Edge& parentEdge : task->parents) {
-    Time parentFinish = addOrMax(parentEdge.first->getAest(), parentEdge.first->getWeight());
-    Time transferAest = addOrMax(parentFinish, getTransferCost(parentEdge.first, task));
-    if (std::optional<CPU> parentCpu = parentEdge.first->getCpu()) {
-      Time& cpuTransferAest = transferAestByCpu[*parentCpu];
-      cpuTransferAest = std::max(cpuTransferAest, transferAest);
-      Time& cpuLocalAest = localAestByCpu[*parentCpu];
-      cpuLocalAest = std::max(cpuLocalAest, parentFinish);
-      continue;
-    }
-    unscheduledTransferAest = std::max(unscheduledTransferAest, transferAest);
-  }
-
-  CpuTimeMax firstOther {-1, unscheduledTransferAest};
-  CpuTimeMax secondOther {-1, 0};
-  for (const auto& entry : transferAestByCpu)
-    updateCpuTimeMax(firstOther, secondOther, entry.first, entry.second);
-
-  cache.defaultAest = firstOther.time;
-  for (const auto& entry : localAestByCpu) {
-    CPU cpu = entry.first;
-    Time bestNonLocalParentAest = firstOther.cpu == cpu ? secondOther.time : firstOther.time;
-    cache.colocatedParentAests[cpu] = std::max(bestNonLocalParentAest, entry.second);
-  }
-  return cache;
-}
-
-} // namespace
-
-//===----------------------------------------------------------------------===//
-// Edge manipulation
-//===----------------------------------------------------------------------===//
-
-std::optional<EdgePair> addEdge(TaskDCP* parent, TaskDCP* child, Weight weight, bool isScheduling) {
-  auto oldChild = parent->addChild(child, weight, isScheduling);
-  auto oldParent = child->addParent(parent, weight, isScheduling);
-  assert(oldChild.has_value() == oldParent.has_value() && "The edge must be present in both element");
-  if (oldChild.has_value()) {
-    return {
-      {*oldParent, *oldChild}
-    };
-  }
-  return {};
-}
-
-void removeEdge(TaskDCP* parent, TaskDCP* child, bool isScheduling) {
-  parent->removeChild(child, isScheduling);
-  child->removeParent(parent, isScheduling);
-}
-
-// A dependency edge may appear multiple times (e.g. from separate data inputs);
-// the transfer cost is the maximum across those parallel edges. Cost is zero
-// when both endpoints share a CPU.
-Weight getTransferCost(TaskDCP* parent, TaskDCP* child) {
-  if (parent->scheduledCpu.has_value() && child->scheduledCpu.has_value()
-      && *parent->scheduledCpu == *child->scheduledCpu)
-    return 0;
-  Weight maxTransferCost = 0;
-  bool foundTransferCost = false;
-  for (const auto& edge : parent->children)
-    if (edge.first == child && !edge.isScheduling) {
-      maxTransferCost = std::max(maxTransferCost, edge.second);
-      foundTransferCost = true;
-    }
-  assert(foundTransferCost && "missing transfer cost for dependency edge");
-  return maxTransferCost;
-}
-
-//===----------------------------------------------------------------------===//
-// Indexing and CPU task lists
-//===----------------------------------------------------------------------===//
-
-size_t GraphDCP::getNodeIndex(const TaskDCP* task) const {
-  assert(task >= nodes.data() && task < nodes.data() + nodes.size() && "task must belong to graph");
-  return static_cast<size_t>(task - nodes.data());
-}
-
-GraphDCP::CpuTaskList& GraphDCP::getOrCreateCpuTasks(CPU cpu) {
-  assert(cpu >= 0 && "cpu id must be non-negative");
-  size_t cpuIndex = static_cast<size_t>(cpu);
-  if (cpuTasks.size() <= cpuIndex)
-    cpuTasks.resize(cpuIndex + 1);
-  return cpuTasks[cpuIndex];
-}
-
-const GraphDCP::CpuTaskList* GraphDCP::findCpuTasks(CPU cpu) const {
-  if (cpu < 0)
-    return nullptr;
-  size_t cpuIndex = static_cast<size_t>(cpu);
-  if (cpuIndex >= cpuTasks.size())
-    return nullptr;
-  return &cpuTasks[cpuIndex];
-}
-
-std::vector<TaskDCP*> GraphDCP::getRoots() {
-  std::vector<TaskDCP*> tmp;
-  for (auto& task : nodes)
-    if (!task.hasParents())
-      tmp.push_back(&task);
-  return tmp;
-}
-
-void GraphDCP::initTaskStructureHashes() {
-  taskStructureHashes.resize(nodes.size());
-  for (auto [index, task] : llvm::enumerate(nodes)) {
-    uint64_t hash = mixHash(0x7442b1129fd01363ULL, static_cast<uint64_t>(task.getWeight()));
-    hash = mixHash(hash, static_cast<uint64_t>(task.getCrossbarUsage()));
-    taskStructureHashes[index] = finishHash(hash);
-  }
-
-  std::vector<uint64_t> nextHashes(nodes.size());
-  std::vector<uint64_t> edgeHashes;
-  for (int iteration = 0; iteration < 4; ++iteration) {
-    for (auto [index, task] : llvm::enumerate(nodes)) {
-      uint64_t hash = mixHash(0x464dcab27ac82291ULL, taskStructureHashes[index]);
-      edgeHashes.clear();
-      edgeHashes.reserve(task.parents.size() + task.children.size());
-      for (const Edge& parent : task.parents)
-        if (!parent.isScheduling)
-          edgeHashes.push_back(
-            hashEdgeSignature(taskStructureHashes[getNodeIndex(parent.first)], parent.second, /*direction=*/0));
-      for (const Edge& child : task.children)
-        if (!child.isScheduling)
-          edgeHashes.push_back(
-            hashEdgeSignature(taskStructureHashes[getNodeIndex(child.first)], child.second, /*direction=*/1));
-      llvm::sort(edgeHashes);
-      hash = mixHash(hash, static_cast<uint64_t>(edgeHashes.size()));
-      for (uint64_t edgeHash : edgeHashes)
-        hash = mixHash(hash, edgeHash);
-      nextHashes[index] = finishHash(hash);
-    }
-    taskStructureHashes.swap(nextHashes);
-  }
-}
-
-// Compact dedup key for CPU `c` vs `candidate`: mixes candidateAest, crossbar
-// usage, and the incremental cpu structure hash. No heap allocation.
-uint64_t GraphDCP::computeCpuCandidateKey(Time candidateAest, CPU cpu) {
-  uint64_t hash = mixHash(0xd6e8feb86659fd93ULL, static_cast<uint64_t>(candidateAest));
-  hash = mixHash(hash, static_cast<uint64_t>(getCpuCrossbarUsage(cpu)));
-  auto it = cpuStructureHashes.find(cpu);
-  hash = mixHash(hash, it != cpuStructureHashes.end() ? it->second : 0ULL);
-  return finishHash(hash);
-}
-
-// Inserts `task` at `position` on `cpu`, wiring up scheduling edges with the
-// neighbouring tasks and keeping the global topological order consistent.
-TaskInsertion GraphDCP::insertTaskInCPU(CPU cpu, TaskDCP* task, size_t position) {
-  TaskInsertion ret;
-  Weight scheduledWeight = task->computeWeightOnCpu(this, cpu);
-  task->setCpu(cpu);
-  task->setWeight(scheduledWeight);
-  reserveTaskCrossbars(cpu, task);
-  cpuStructureHashes[cpu] ^= taskStructureHashes[getNodeIndex(task)];
-  auto& tasksInCpu = getOrCreateCpuTasks(cpu);
-  unsigned int numCpuTasks = tasksInCpu.size();
-  assert(position <= numCpuTasks && "Inserting in a not valid position");
-  auto insertedPoint = tasksInCpu.insert(std::next(tasksInCpu.begin(), position), task);
-  ret.cpuModified = cpu;
-  ret.taskInserted = task;
-  ret.graph = this;
-
-  // If we split an existing neighbour-neighbour scheduling edge, drop it; the
-  // two new edges below recreate the ordering with `task` in between.
-  if (insertedPoint != tasksInCpu.begin() && std::next(insertedPoint) != tasksInCpu.end()) {
-    auto precedentPoint = std::prev(insertedPoint, 1);
-    auto nextPoint = std::next(insertedPoint, 1);
-    removeEdge(*precedentPoint, *nextPoint, true);
-  }
-
-  if (insertedPoint != tasksInCpu.begin()) {
-    auto precedentPoint = std::prev(insertedPoint, 1);
-    auto oldEdge = addEdge(*precedentPoint, *insertedPoint, 0, true);
-    ret.beforeNode = oldEdge;
-
-    if (*task < **precedentPoint)
-      topologicalMoveAfter(task, *precedentPoint, &ret);
-  }
-
-  if (std::next(insertedPoint) != tasksInCpu.end()) {
-    auto nextPoint = std::next(insertedPoint, 1);
-    auto oldEdge = addEdge(*insertedPoint, *nextPoint, 0, true);
-    ret.afterNode = oldEdge;
-    if (**nextPoint < *task)
-      topologicalMoveBefore(task, *nextPoint, &ret);
-  }
-  return ret;
-}
-
-void GraphDCP::removeTaskFromCPU(CPU cpu, TaskDCP* task) {
-  releaseTaskCrossbars(cpu, task);
-  cpuStructureHashes[cpu] ^= taskStructureHashes[getNodeIndex(task)];
-  task->resetCpu();
-  task->resetWeight();
-  auto& scheduledTasks = getOrCreateCpuTasks(cpu);
-  auto taskPosition = std::find(scheduledTasks.begin(), scheduledTasks.end(), task);
-  assert(taskPosition != scheduledTasks.end() && "Removing a not present task");
-  TaskDCP* previousTask = nullptr;
-  TaskDCP* nextTask = nullptr;
-  if (taskPosition != scheduledTasks.begin()) {
-    auto previousPoint = std::prev(taskPosition, 1);
-    previousTask = *previousPoint;
-    removeEdge(*previousPoint, *taskPosition, true);
-  }
-
-  if (std::next(taskPosition) != scheduledTasks.end()) {
-    auto nextPoint = std::next(taskPosition, 1);
-    nextTask = *nextPoint;
-    removeEdge(*taskPosition, *nextPoint, true);
-  }
-  if (previousTask != nullptr && nextTask != nullptr)
-    addEdge(previousTask, nextTask, 0, true);
-  scheduledTasks.erase(taskPosition);
-}
-
-//===----------------------------------------------------------------------===//
-// Crossbar capacity bookkeeping
-//===----------------------------------------------------------------------===//
-
-CrossbarUsage GraphDCP::getCpuCrossbarUsage(CPU cpu) const {
-  auto it = cpuCrossbarUsage.find(cpu);
-  if (it == cpuCrossbarUsage.end())
-    return 0;
-  return it->second;
-}
-
-CrossbarUsage GraphDCP::getCpuCrossbarCapacity() const {
-  assert(onnx_mlir::crossbarSize.getValue() > 0 && "crossbar-size must be strictly positive");
-  assert(onnx_mlir::crossbarCountInCore.getValue() > 0 && "crossbar-count must be strictly positive");
-  CrossbarUsage crossbarEdge = static_cast<CrossbarUsage>(onnx_mlir::crossbarSize.getValue());
-  CrossbarUsage crossbarArea = checkedMultiply(crossbarEdge, crossbarEdge);
-  return checkedMultiply(static_cast<CrossbarUsage>(onnx_mlir::crossbarCountInCore.getValue()), crossbarArea);
-}
-
-CrossbarUsage GraphDCP::getTaskCrossbarFootprint(const TaskDCP* task) const {
-  CrossbarUsage crossbarCount = task->getCrossbarUsage();
-  if (crossbarCount == 0)
-    return 0;
-  CrossbarUsage crossbarEdge = static_cast<CrossbarUsage>(onnx_mlir::crossbarSize.getValue());
-  CrossbarUsage crossbarArea = checkedMultiply(crossbarEdge, crossbarEdge);
-  return checkedMultiply(crossbarCount, crossbarArea);
-}
-
-void GraphDCP::reserveTaskCrossbars(CPU cpu, const TaskDCP* task) {
-  cpuCrossbarUsage[cpu] = checkedAdd(getCpuCrossbarUsage(cpu), getTaskCrossbarFootprint(task));
-}
-
-void GraphDCP::releaseTaskCrossbars(CPU cpu, const TaskDCP* task) {
-  CrossbarUsage footprint = getTaskCrossbarFootprint(task);
-  CrossbarUsage currentUsage = getCpuCrossbarUsage(cpu);
-  assert(currentUsage >= footprint && "crossbar usage underflow");
-  cpuCrossbarUsage[cpu] = currentUsage - footprint;
-}
-
-bool GraphDCP::wouldExhaustCrossbarCapacity(CPU cpu, const TaskDCP* task) const {
-  CrossbarUsage footprint = getTaskCrossbarFootprint(task);
-  if (footprint == 0)
-    return false;
-  CrossbarUsage nextUsage = checkedAdd(getCpuCrossbarUsage(cpu), footprint);
-  return nextUsage >= getCpuCrossbarCapacity();
-}
-
-size_t GraphDCP::crossbarsUsed() const {
-  CrossbarUsage crossbarEdge = static_cast<CrossbarUsage>(onnx_mlir::crossbarSize.getValue());
-  CrossbarUsage crossbarArea = crossbarEdge * crossbarEdge;
-  if (crossbarArea == 0)
-    return 0;
-  CrossbarUsage totalArea = 0;
-  for (const auto& [cpu, usage] : cpuCrossbarUsage)
-    totalArea = checkedAdd(totalArea, usage);
-  return static_cast<size_t>(totalArea / crossbarArea);
-}
-
-size_t GraphDCP::crossbarsAvailable() const {
-  return static_cast<size_t>(lastCpu) * onnx_mlir::crossbarCountInCore.getValue();
-}
-
-//===----------------------------------------------------------------------===//
-// AEST / ALST computation
-//===----------------------------------------------------------------------===//
-
-// Walks the topological order once and fills AEST from parent completions,
-// while tracking the top two completion times so DCPL updates can avoid a
-// second pass. secondMaxCompletion lets us invalidate `maxCompletionTask`
-// locally when its AEST moves.
-void GraphDCP::initAest() {
-  auto& worklist = topologicalOrder;
-  Time maxDcpl = 0;
-  Time secondMaxCompletionCandidate = 0;
-  TaskDCP* maxCompletionTaskCandidate = nullptr;
-  for (auto& task : worklist) {
-    Time maxParentAest = 0;
-    for (Edge parentEdge : task.parents) {
-      maxParentAest = std::max(addOrMax(addOrMax(parentEdge.first->getAest(), parentEdge.first->getWeight()),
-                                        getTransferCost(parentEdge.first, &task)),
-                               maxParentAest);
-    }
-    task.setAest(maxParentAest);
-    Time completion = addOrMax(maxParentAest, task.getWeight());
-    if (completion >= maxDcpl) {
-      secondMaxCompletionCandidate = maxDcpl;
-      maxDcpl = completion;
-      maxCompletionTaskCandidate = &task;
-    }
-    else if (completion > secondMaxCompletionCandidate) {
-      secondMaxCompletionCandidate = completion;
-    }
-  }
-  dcpl = maxDcpl;
-  maxCompletion = maxDcpl;
-  secondMaxCompletion = secondMaxCompletionCandidate;
-  maxCompletionTask = maxCompletionTaskCandidate;
-}
-
-// Same backward pass as initAest but over the reverse topological order,
-// seeding ALST from scheduleDcpl on leaves.
-void GraphDCP::initAlst() {
-  Time scheduleDcpl = getDcpl();
-  auto& worklist = topologicalOrder;
-
-  for (TaskDCP& node : llvm::reverse(worklist)) {
-    Time minAlst = std::numeric_limits<Time>::max();
-    if (!node.hasChildren())
-      minAlst = subtractOrZero(scheduleDcpl, node.getWeight());
-    for (Edge childEdge : node.children)
-      minAlst = std::min(minAlst,
-                         subtractOrZero(childEdge.first->getAlst(),
-                                        addOrMax(node.getWeight(), getTransferCost(&node, childEdge.first))));
-    node.setAlst(minAlst);
-  }
-}
-
-Time GraphDCP::computeAestOnCpu(TaskDCP* task, CPU cpu) {
-  Time maxParentAest = 0;
-  for (Edge parentEdge : task->parents) {
-    Weight transferCost = 0;
-    if (!(parentEdge.first->isScheduled() && cpu == *parentEdge.first->getCpu()))
-      transferCost = getTransferCost(parentEdge.first, task);
-    maxParentAest = std::max(
-      addOrMax(addOrMax(parentEdge.first->getAest(), parentEdge.first->getWeight()), transferCost), maxParentAest);
-  }
-  return maxParentAest;
-}
-
-// DCPL if `task` were placed on `cpu`: pre-update max (excluding `task` when
-// it currently holds it) vs. the task's new completion on this CPU.
-Time GraphDCP::computeDcplOnCpu(TaskDCP* task, CPU cpu) {
-  Time candidateCompletion = addOrMax(computeAestOnCpu(task, cpu), task->computeWeightOnCpu(this, cpu));
-  Time unchangedMaxCompletion = maxCompletionTask == task ? secondMaxCompletion : maxCompletion;
-  return std::max(unchangedMaxCompletion, candidateCompletion);
-}
-
-Time GraphDCP::computeTaskAlstOnCpu(TaskDCP* task, CPU cpu, Time scheduleDcpl) {
-  Weight weight = task->computeWeightOnCpu(this, cpu);
-  if (!task->hasChildren())
-    return subtractOrZero(scheduleDcpl, weight);
-
-  Time minAlst = std::numeric_limits<Time>::max();
-  for (const Edge& childEdge : task->children) {
-    Weight transferCost = getTransferCost(task, childEdge.first);
-    if (childEdge.first->isScheduled() && cpu == *childEdge.first->getCpu())
-      transferCost = 0;
-    minAlst = std::min(minAlst, subtractOrZero(childEdge.first->getAlst(), addOrMax(weight, transferCost)));
-  }
-  return minAlst;
-}
-
-void GraphDCP::updateAestFromTask(TaskDCP* task) {
-  llvm::DenseSet<TaskDCP*> descendants = dcp_graph::collectReachableTasks(task, false);
-  updateAestFromTaskWithDescendants(task, descendants);
-}
-
-void GraphDCP::updateAestFromTaskWithDescendants(TaskDCP* task, const llvm::DenseSet<TaskDCP*>& descendants) {
-  Time modifiedMaxCompletion = 0;
-  TaskDCP* modifiedMaxTask = nullptr;
-  Time modifiedSecondMaxCompletion = 0;
-
-  auto considerCompletion = [&](TaskDCP* currentTask) {
-    Time completion = addOrMax(currentTask->getAest(), currentTask->getWeight());
-    if (completion >= modifiedMaxCompletion) {
-      modifiedSecondMaxCompletion = modifiedMaxCompletion;
-      modifiedMaxCompletion = completion;
-      modifiedMaxTask = currentTask;
-    }
-    else if (completion > modifiedSecondMaxCompletion) {
-      modifiedSecondMaxCompletion = completion;
-    }
-  };
-
-  for (auto it = task->getTopologicalIterator(); it != topologicalOrder.end(); ++it) {
-    TaskDCP* currentTask = &*it;
-    if (currentTask != task && !descendants.contains(currentTask))
-      continue;
-
-    Time maxParentAest = 0;
-    for (const Edge& parentEdge : currentTask->parents) {
-      maxParentAest = std::max(addOrMax(addOrMax(parentEdge.first->getAest(), parentEdge.first->getWeight()),
-                                        getTransferCost(parentEdge.first, currentTask)),
-                               maxParentAest);
-    }
-    currentTask->setAest(maxParentAest);
-    considerCompletion(currentTask);
-  }
-
-  // AEST only grows for modified tasks, so the new DCPL is either the old one
-  // (from an unmodified task) or the new max among modified tasks.
-  bool oldMaxInvalidated =
-    maxCompletionTask != nullptr && (maxCompletionTask == task || descendants.contains(maxCompletionTask));
-  if (oldMaxInvalidated) {
-    dcpl = modifiedMaxCompletion;
-    maxCompletion = modifiedMaxCompletion;
-    maxCompletionTask = modifiedMaxTask;
-    secondMaxCompletion = modifiedSecondMaxCompletion;
-  }
-  else {
-    if (modifiedMaxCompletion > maxCompletion) {
-      secondMaxCompletion = maxCompletion;
-      maxCompletion = modifiedMaxCompletion;
-      maxCompletionTask = modifiedMaxTask;
-    }
-    else if (modifiedMaxCompletion > secondMaxCompletion) {
-      secondMaxCompletion = modifiedMaxCompletion;
-    }
-    dcpl = maxCompletion;
-  }
-}
-
-// Same as the DenseSet overload, but expects descendants already sorted in
-// topological order (avoids an extra traversal of the whole topo list on hot
-// paths).
-void GraphDCP::updateAestFromTaskWithDescendants(TaskDCP* task, llvm::ArrayRef<TaskDCP*> descendantsTopoOrder) {
-  Time modifiedMaxCompletion = 0;
-  TaskDCP* modifiedMaxTask = nullptr;
-  Time modifiedSecondMaxCompletion = 0;
-
-  auto considerCompletion = [&](TaskDCP* currentTask) {
-    Time completion = addOrMax(currentTask->getAest(), currentTask->getWeight());
-    if (completion >= modifiedMaxCompletion) {
-      modifiedSecondMaxCompletion = modifiedMaxCompletion;
-      modifiedMaxCompletion = completion;
-      modifiedMaxTask = currentTask;
-    }
-    else if (completion > modifiedSecondMaxCompletion) {
-      modifiedSecondMaxCompletion = completion;
-    }
-  };
-
-  auto recomputeAest = [&](TaskDCP* currentTask) {
-    Time maxParentAest = 0;
-    for (const Edge& parentEdge : currentTask->parents) {
-      maxParentAest = std::max(addOrMax(addOrMax(parentEdge.first->getAest(), parentEdge.first->getWeight()),
-                                        getTransferCost(parentEdge.first, currentTask)),
-                               maxParentAest);
-    }
-    currentTask->setAest(maxParentAest);
-    considerCompletion(currentTask);
-  };
-
-  recomputeAest(task);
-  for (TaskDCP* descendant : descendantsTopoOrder)
-    recomputeAest(descendant);
-
-  const bool oldMaxInvalidated =
-    maxCompletionTask != nullptr
-    && (maxCompletionTask == task || llvm::is_contained(descendantsTopoOrder, maxCompletionTask));
-  if (oldMaxInvalidated) {
-    // The pre-update max came from a modified task; its completion has moved
-    // upward, so modifiedMaxCompletion is an upper bound covering it. The
-    // second-best from the modified set is only a local view and may
-    // under-estimate the real runner-up; the conservative use of
-    // secondMaxCompletion in computeDcplOnCpu still yields a correct upper
-    // bound on DCPL.
-    dcpl = modifiedMaxCompletion;
-    maxCompletion = modifiedMaxCompletion;
-    maxCompletionTask = modifiedMaxTask;
-    secondMaxCompletion = modifiedSecondMaxCompletion;
-  }
-  else {
-    if (modifiedMaxCompletion > maxCompletion) {
-      secondMaxCompletion = maxCompletion;
-      maxCompletion = modifiedMaxCompletion;
-      maxCompletionTask = modifiedMaxTask;
-    }
-    else if (modifiedMaxCompletion > secondMaxCompletion) {
-      secondMaxCompletion = modifiedMaxCompletion;
-    }
-    dcpl = maxCompletion;
-  }
-}
-
-// Walks the same topological tail as the update overload but aborts as soon as
-// any modified task's completion exceeds `dcplBudget`. The caller's snapshot
-// restores AEST on the aborted tail. Returns true iff every descendant stayed
-// within budget and the scheduler state now reflects the new DCPL.
-bool GraphDCP::tryUpdateAestWithinBudget(TaskDCP* task,
-                                         llvm::ArrayRef<TaskDCP*> descendantsTopoOrder,
-                                         Time dcplBudget) {
-  Time modifiedMaxCompletion = 0;
-  TaskDCP* modifiedMaxTask = nullptr;
-  Time modifiedSecondMaxCompletion = 0;
-
-  auto process = [&](TaskDCP* currentTask) {
-    Time maxParentAest = 0;
-    for (const Edge& parentEdge : currentTask->parents) {
-      maxParentAest = std::max(addOrMax(addOrMax(parentEdge.first->getAest(), parentEdge.first->getWeight()),
-                                        getTransferCost(parentEdge.first, currentTask)),
-                               maxParentAest);
-    }
-    currentTask->setAest(maxParentAest);
-    Time completion = addOrMax(maxParentAest, currentTask->getWeight());
-    if (completion > dcplBudget)
-      return false;
-    if (completion >= modifiedMaxCompletion) {
-      modifiedSecondMaxCompletion = modifiedMaxCompletion;
-      modifiedMaxCompletion = completion;
-      modifiedMaxTask = currentTask;
-    }
-    else if (completion > modifiedSecondMaxCompletion) {
-      modifiedSecondMaxCompletion = completion;
-    }
-    return true;
-  };
-
-  if (!process(task))
-    return false;
-  for (TaskDCP* descendant : descendantsTopoOrder)
-    if (!process(descendant))
-      return false;
-
-  const bool oldMaxInvalidated =
-    maxCompletionTask != nullptr
-    && (maxCompletionTask == task || llvm::is_contained(descendantsTopoOrder, maxCompletionTask));
-  if (oldMaxInvalidated) {
-    dcpl = modifiedMaxCompletion;
-    maxCompletion = modifiedMaxCompletion;
-    maxCompletionTask = modifiedMaxTask;
-    secondMaxCompletion = modifiedSecondMaxCompletion;
-  }
-  else {
-    if (modifiedMaxCompletion > maxCompletion) {
-      secondMaxCompletion = maxCompletion;
-      maxCompletion = modifiedMaxCompletion;
-      maxCompletionTask = modifiedMaxTask;
-    }
-    else if (modifiedMaxCompletion > secondMaxCompletion) {
-      secondMaxCompletion = modifiedMaxCompletion;
-    }
-    dcpl = maxCompletion;
-  }
-  return true;
-}
-
-// Incrementally refreshes ALST after `task` was placed. The set of nodes whose
-// ALST is structurally affected by the insertion is exactly
-// `relations.ancestors ∪ {task}`: the task's outgoing transfer costs to
-// same-CPU real children become 0, and new scheduling edges create parent
-// relationships between `task` and its same-CPU neighbors. Every other node
-// keeps its relative distance to the sink boundary and only absorbs the
-// signed DCPL delta captured between `oldDcpl` and the now-updated `dcpl`.
-void GraphDCP::updateAlstFromScheduledTask(TaskDCP* task, const CandidateRelations& relations, Time oldDcpl) {
-  Time newDcpl = getDcpl();
-  // If the AEST update saturated dcpl (e.g. rescue placement on a
-  // crossbar-exhausted CPU sets task weight to UINT64_MAX), the shift delta
-  // would be meaningless. Fall back to a full recompute for this step only.
-  if (newDcpl == std::numeric_limits<Time>::max()) {
-    initAlst();
-    return;
-  }
-
-  if (newDcpl != oldDcpl) {
-    const bool increased = newDcpl > oldDcpl;
-    const Time delta = increased ? (newDcpl - oldDcpl) : (oldDcpl - newDcpl);
-    for (TaskDCP& node : topologicalOrder) {
-      if (&node == task || relations.ancestors.contains(&node))
-        continue;
-      Time alst = node.getAlst();
-      node.setAlst(increased ? addOrMax(alst, delta) : subtractOrZero(alst, delta));
-    }
-  }
-
-  auto recomputeAlst = [&](TaskDCP* node) {
-    Time minAlst = std::numeric_limits<Time>::max();
-    if (!node->hasChildren())
-      minAlst = subtractOrZero(newDcpl, node->getWeight());
-    for (const Edge& childEdge : node->children)
-      minAlst = std::min(minAlst,
-                         subtractOrZero(childEdge.first->getAlst(),
-                                        addOrMax(node->getWeight(), getTransferCost(node, childEdge.first))));
-    node->setAlst(minAlst);
-  };
-
-  // Walk the backward cone with a pending-children counter so that every
-  // ancestor is recomputed only after all of its affected children have
-  // been refreshed. This is resilient to staleness in the global
-  // `topologicalOrder` relative to freshly added scheduling edges.
-  llvm::DenseSet<TaskDCP*> affected = relations.ancestors;
-  affected.insert(task);
-  llvm::DenseMap<TaskDCP*, int> pendingAffectedChildren;
-  pendingAffectedChildren.reserve(affected.size());
-  std::vector<TaskDCP*> worklist;
-  worklist.reserve(affected.size());
-  for (TaskDCP* node : affected) {
-    int count = 0;
-    for (const Edge& childEdge : node->children)
-      if (affected.contains(childEdge.first))
-        count++;
-    pendingAffectedChildren[node] = count;
-    if (count == 0)
-      worklist.push_back(node);
-  }
-  while (!worklist.empty()) {
-    TaskDCP* node = worklist.back();
-    worklist.pop_back();
-    recomputeAlst(node);
-    for (const Edge& parentEdge : node->parents) {
-      if (!affected.contains(parentEdge.first))
-        continue;
-      auto it = pendingAffectedChildren.find(parentEdge.first);
-      assert(it != pendingAffectedChildren.end());
-      if (--it->second == 0)
-        worklist.push_back(parentEdge.first);
-    }
-  }
-
-// Opt-in consistency check: verifies the incremental ALST result against a
-// full initAlst() recomputation. Very expensive (O(V+E) per placement) - only
-// enable when investigating suspected drift.
-#ifdef DCP_DEBUG_CHECK_ALST
-  std::vector<Time> afterIncremental(nodes.size());
-  for (size_t i = 0; i < nodes.size(); ++i)
-    afterIncremental[i] = nodes[i].getAlst();
-  initAlst();
-  bool mismatched = false;
-  for (size_t i = 0; i < nodes.size(); ++i) {
-    if (afterIncremental[i] != nodes[i].getAlst()) {
-      if (!mismatched) {
-        llvm::errs() << "[alst-mismatch] placed=" << getNodeIndex(task) << " oldDcpl=" << oldDcpl
-                     << " newDcpl=" << newDcpl << " ancestors={";
-        for (TaskDCP* a : relations.ancestors)
-          llvm::errs() << getNodeIndex(a) << ",";
-        llvm::errs() << "}\n";
-        mismatched = true;
-      }
-      llvm::errs() << "  node=" << i << " incremental=" << afterIncremental[i] << " full=" << nodes[i].getAlst()
-                   << " weight=" << nodes[i].getWeight()
-                   << " cpu=" << (nodes[i].isScheduled() ? (int) *nodes[i].getCpu() : -1) << " children=[";
-      for (const Edge& e : nodes[i].children)
-        llvm::errs() << getNodeIndex(e.first) << (e.isScheduling ? "s" : "")
-                     << "(tc=" << getTransferCost(&nodes[i], e.first) << ",alst=" << e.first->getAlst() << "),";
-      llvm::errs() << "]\n";
-    }
-  }
-#endif
-}
-
-// Computes a localised ALST: only ancestors of the candidate (plus the
-// candidate itself) get recomputed, every other task keeps its current ALST.
-// Processes nodes in reverse dependency order using a pending-children
-// counter.
-llvm::DenseMap<TaskDCP*, Time> GraphDCP::computeAlst(TaskDCP* task, CPU cpu, const CandidateRelations& relations) {
-  Time scheduleDcpl = computeDcplOnCpu(task, cpu);
-  llvm::DenseMap<TaskDCP*, Time> tempAlst;
-
-  llvm::DenseSet<TaskDCP*> affectedTasks = relations.ancestors;
-  affectedTasks.insert(task);
-  llvm::DenseMap<TaskDCP*, int> pendingAffectedChildren;
-  std::vector<TaskDCP*> worklist;
-  worklist.reserve(affectedTasks.size());
-
-  for (TaskDCP* affectedTask : affectedTasks) {
-    int affectedChildren = 0;
-    for (const Edge& childEdge : affectedTask->children)
-      if (affectedTasks.contains(childEdge.first))
-        affectedChildren++;
-    pendingAffectedChildren[affectedTask] = affectedChildren;
-    if (affectedChildren == 0)
-      worklist.push_back(affectedTask);
-  }
-
-  while (!worklist.empty()) {
-    TaskDCP* node = worklist.back();
-    worklist.pop_back();
-    Time minAlst = std::numeric_limits<Time>::max();
-    if (!node->hasChildren()) {
-      if (node != task)
-        minAlst = subtractOrZero(scheduleDcpl, node->getWeight());
-      else
-        minAlst = subtractOrZero(scheduleDcpl, node->computeWeightOnCpu(this, cpu));
-    }
-
-    for (const Edge& childEdge : node->children) {
-      Weight transferCost = getTransferCost(node, childEdge.first);
-      if (node == task && childEdge.first->isScheduled() && cpu == *childEdge.first->getCpu())
-        transferCost = 0;
-      Time childAlst = affectedTasks.contains(childEdge.first) ? tempAlst[childEdge.first] : childEdge.first->getAlst();
-      minAlst = std::min(minAlst, subtractOrZero(childAlst, addOrMax(node->getWeight(), transferCost)));
-    }
-    tempAlst[node] = minAlst;
-
-    for (const Edge& parentEdge : node->parents) {
-      if (!affectedTasks.contains(parentEdge.first))
-        continue;
-      int& remainingChildren = pendingAffectedChildren[parentEdge.first];
-      remainingChildren--;
-      assert(remainingChildren >= 0 && "affected child count must stay non-negative");
-      if (remainingChildren == 0)
-        worklist.push_back(parentEdge.first);
-    }
-  }
-  return tempAlst;
-}
-
-//===----------------------------------------------------------------------===//
-// Topological order maintenance
-//===----------------------------------------------------------------------===//
-
-void GraphDCP::initTopological() {
-  topologicalOrder.clear();
-  UniqueWorkList<std::vector<TaskDCP*>> worklist(getRoots());
-  long long flag = getUniqueFlag();
-  for (auto root : worklist)
-    root->setFlag(flag);
-
-  size_t i = 0;
-  while (i != worklist.size()) {
-    for (auto& child : worklist.at(i)->children) {
-      TaskDCP* childTask = child.first;
-      if (std::all_of(childTask->parents.begin(), childTask->parents.end(), [flag](Edge edge) {
-            return edge.first->getFlag() == flag;
-          })) {
-        worklist.pushBack(childTask);
-        childTask->setFlag(flag);
-      }
-    }
-    i++;
-  }
-
-  for (auto task : worklist)
-    topologicalOrder.pushBack(task);
-}
-
-// After `task` is moved after `pivotPoint`, any child of `task` (transitively)
-// that ends up ordered before `task` must be bubbled forward too.
-void GraphDCP::topologicalMoveAfter(TaskDCP* task, TaskDCP* pivotPoint, TaskInsertion* insertion) {
-  auto moveChildAfterMe = [this, insertion](TaskDCP* origTask) -> void {
-    auto cmp = [](Edge lhs, Edge rhs) { return *rhs.first < *lhs.first; };
-    TaskDCP* insertionPoint = origTask;
-    std::vector<TaskDCP*> worklist;
-    worklist.push_back(origTask);
-    size_t i = 0;
-    while (i < worklist.size()) {
-      auto task = worklist[i];
-      std::vector<Edge>& childEdges = task->children;
-      // Heap-sort children lazily and stop at the first one already in order.
-      std::make_heap(childEdges.begin(), childEdges.end(), cmp);
-      auto lastPoppedIter = childEdges.end();
-      bool foundChildInOrder = false;
-      while (lastPoppedIter != childEdges.begin()) {
-        std::pop_heap(childEdges.begin(), lastPoppedIter, cmp);
-        lastPoppedIter = std::prev(lastPoppedIter, 1);
-        auto currentChild = (*lastPoppedIter).first;
-        if (*currentChild < *task) {
-          dcp_graph::recordTopologicalMove(currentChild, insertion);
-          topologicalOrder.moveAfter(currentChild, insertionPoint);
-          insertionPoint = currentChild;
-        }
-        else {
-          foundChildInOrder = true;
-          break;
-        }
-      }
-
-      if (foundChildInOrder)
-        lastPoppedIter++;
-
-      for (auto it = lastPoppedIter; it != childEdges.end(); ++it)
-        if (it->first->hasChildren())
-          worklist.push_back(it->first);
-      i++;
-    }
-  };
-
-  if (!(*task < *pivotPoint))
-    return;
-
-  dcp_graph::recordTopologicalMove(task, insertion);
-  topologicalOrder.moveAfter(task, pivotPoint);
-  if (task->hasChildren())
-    moveChildAfterMe(task);
-}
-
-void GraphDCP::topologicalMoveBefore(TaskDCP* task, TaskDCP* pivotPoint, TaskInsertion* insertion) {
-  auto moveParentBeforeMe = [this, insertion](TaskDCP* origTask) -> void {
-    auto cmp = [](Edge lhs, Edge rhs) { return *lhs.first < *rhs.first; };
-    TaskDCP* insertionPoint = origTask;
-    std::vector<TaskDCP*> worklist;
-    worklist.push_back(origTask);
-    size_t i = 0;
-    while (i < worklist.size()) {
-      auto task = worklist[i];
-      std::vector<Edge>& parentEdges = task->parents;
-      std::make_heap(parentEdges.begin(), parentEdges.end(), cmp);
-      auto lastPoppedIter = parentEdges.end();
-      bool foundParentInOrder = false;
-      while (lastPoppedIter != parentEdges.begin()) {
-        std::pop_heap(parentEdges.begin(), lastPoppedIter, cmp);
-        lastPoppedIter = std::prev(lastPoppedIter, 1);
-        auto currentParent = (*lastPoppedIter).first;
-        if (*currentParent < *task) {
-          dcp_graph::recordTopologicalMove(currentParent, insertion);
-          topologicalOrder.moveBefore(currentParent, insertionPoint);
-          insertionPoint = currentParent;
-        }
-        else {
-          foundParentInOrder = true;
-          break;
-        }
-      }
-
-      if (foundParentInOrder)
-        lastPoppedIter++;
-
-      for (auto it = lastPoppedIter; it != parentEdges.end(); ++it)
-        if (it->first->hasParents())
-          worklist.push_back(it->first);
-      i++;
-    }
-  };
-
-  if (!(*pivotPoint < *task))
-    return;
-
-  dcp_graph::recordTopologicalMove(task, insertion);
-  topologicalOrder.moveBefore(task, pivotPoint);
-  if (task->hasParents())
-    moveParentBeforeMe(task);
-}
-
-//===----------------------------------------------------------------------===//
-// Slot search
-//===----------------------------------------------------------------------===//
-
-using ScheduledTaskList = std::list<TaskDCP*>;
-
-// A legal slot lives strictly after the last ancestor already on the CPU and
-// strictly before the first descendant. This narrows the linear scan below.
-struct SlotBounds {
-  ScheduledTaskList::iterator afterLastAncestor;
-  ScheduledTaskList::iterator firstDescendant;
-};
-
-static SlotBounds computeSlotBounds(ScheduledTaskList& scheduledTasks, const GraphDCP::CandidateRelations& relations) {
-  auto firstDescendant = scheduledTasks.begin();
-  while (firstDescendant != scheduledTasks.end()) {
-    if (relations.descendants.contains(*firstDescendant))
-      break;
-    firstDescendant = std::next(firstDescendant, 1);
-  }
-
-  auto afterLastAncestor = scheduledTasks.end();
-  while (afterLastAncestor != scheduledTasks.begin()) {
-    if (afterLastAncestor != scheduledTasks.end() && relations.ancestors.contains(*afterLastAncestor))
-      break;
-    afterLastAncestor = std::prev(afterLastAncestor, 1);
-  }
-
-  if (afterLastAncestor != scheduledTasks.end() && relations.ancestors.contains(*afterLastAncestor))
-    std::advance(afterLastAncestor, 1);
-
-  return {afterLastAncestor, firstDescendant};
-}
-
-// Scans legal positions on `cpu` and returns the first slot whose available
-// window fits the candidate's weight. With `push=false` we keep existing tasks
-// in place; with `push=true` we may shift them later, so ALST needs a local
-// recomputation (computeAlst) to reflect tighter constraints.
-GraphDCP::FindSlot GraphDCP::findSlot(TaskDCP* candidate, CPU cpu, bool push, const CandidateRelations& relations) {
-  Time aestOnCpu = computeAestOnCpu(candidate, cpu);
-  ScheduledTaskList& scheduledTasks = getOrCreateCpuTasks(cpu);
-  SlotBounds bounds = computeSlotBounds(scheduledTasks, relations);
-  auto afterLastAncestor = bounds.afterLastAncestor;
-  auto firstDescendantIndex = bounds.firstDescendant;
-
-  if (firstDescendantIndex == scheduledTasks.end()) {
-    Time bestMax = aestOnCpu;
-    if (!scheduledTasks.empty())
-      bestMax = std::max(aestOnCpu, addOrMax(scheduledTasks.back()->getAest(), scheduledTasks.back()->getWeight()));
-    if (scheduledTasks.empty() || afterLastAncestor == scheduledTasks.end())
-      return FindSlot {bestMax, static_cast<int>(scheduledTasks.size())};
-  }
-
-  // When `push` is false the inner loop never consults `tempAlst[*iterIndex]`,
-  // so only the candidate ALST must be recomputed here.
-  llvm::DenseMap<TaskDCP*, Time> tempAlst;
-  Time finalTime;
-  if (!push) {
-    Time candidateDcpl = computeDcplOnCpu(candidate, cpu);
-    finalTime = addOrMax(computeTaskAlstOnCpu(candidate, cpu, candidateDcpl), candidate->computeWeightOnCpu(this, cpu));
-  }
-  else {
-    tempAlst = computeAlst(candidate, cpu, relations);
-    finalTime = addOrMax(tempAlst[candidate], candidate->computeWeightOnCpu(this, cpu));
-  }
-
-  auto iterIndex = afterLastAncestor;
-  auto bestIndex = scheduledTasks.end();
-  Time bestMax = 0;
-  assert(std::distance(scheduledTasks.begin(), afterLastAncestor)
-         <= std::distance(scheduledTasks.begin(), firstDescendantIndex));
-
-  bool keep = true;
-  while (keep && iterIndex != scheduledTasks.end()) {
-    if (iterIndex == firstDescendantIndex)
-      keep = false;
-    Time minStart;
-    if (!push)
-      minStart = std::min(finalTime, (*iterIndex)->getAest());
-    else if (tempAlst.count(*iterIndex) == 1)
-      minStart = std::min(finalTime, tempAlst[*iterIndex]);
-    else
-      minStart = std::min(finalTime, (*iterIndex)->getAlst());
-    Time maxStart = aestOnCpu;
-    if (iterIndex != scheduledTasks.begin()) {
-      auto prevIter = std::prev(iterIndex);
-      maxStart = std::max(aestOnCpu, addOrMax((*prevIter)->getAest(), (*prevIter)->getWeight()));
-    }
-    if (subtractOrZero(minStart, maxStart) >= candidate->computeWeightOnCpu(this, cpu)) {
-      bestIndex = iterIndex;
-      bestMax = maxStart;
-      break;
-    }
-
-    std::advance(iterIndex, 1);
-    if (iterIndex == scheduledTasks.end())
-      keep = false;
-  }
-
-  if (bestIndex != scheduledTasks.end())
-    return FindSlot {bestMax, static_cast<int>(std::distance(scheduledTasks.begin(), bestIndex))};
-
-  if (iterIndex == scheduledTasks.end()) {
-    bestMax = aestOnCpu;
-    if (iterIndex != scheduledTasks.begin()) {
-      auto prevIter = std::prev(iterIndex);
-      bestMax = std::max(aestOnCpu, addOrMax((*prevIter)->getAest(), (*prevIter)->getWeight()));
-    }
-    return FindSlot {bestMax, static_cast<int>(std::distance(scheduledTasks.begin(), scheduledTasks.end()))};
-  }
-  return FindSlot {std::numeric_limits<Time>::max(), 0};
-}
-
-// Same scan as findSlot(push=false) but skips the ALST recompute - caller
-// already knows the candidate's finalTime and AEST on this CPU.
-GraphDCP::FindSlot GraphDCP::findSlotWithFixedFinalTime(
-  TaskDCP* candidate, CPU cpu, const CandidateRelations& relations, Time finalTime, Time aestOnCpu) {
-  ScheduledTaskList& scheduledTasks = getOrCreateCpuTasks(cpu);
-  SlotBounds bounds = computeSlotBounds(scheduledTasks, relations);
-  auto afterLastAncestor = bounds.afterLastAncestor;
-  auto firstDescendantIndex = bounds.firstDescendant;
-
-  if (firstDescendantIndex == scheduledTasks.end()) {
-    Time bestMax = aestOnCpu;
-    if (!scheduledTasks.empty())
-      bestMax = std::max(aestOnCpu, addOrMax(scheduledTasks.back()->getAest(), scheduledTasks.back()->getWeight()));
-    if (scheduledTasks.empty() || afterLastAncestor == scheduledTasks.end())
-      return {bestMax, static_cast<int>(scheduledTasks.size())};
-  }
-
-  auto iterIndex = afterLastAncestor;
-  auto bestIndex = scheduledTasks.end();
-  Time bestMax = std::numeric_limits<Time>::max();
-  assert(std::distance(scheduledTasks.begin(), afterLastAncestor)
-         <= std::distance(scheduledTasks.begin(), firstDescendantIndex));
-
-  bool keep = true;
-  while (keep && iterIndex != scheduledTasks.end()) {
-    if (iterIndex == firstDescendantIndex)
-      keep = false;
-    Time minStart = std::min(finalTime, (*iterIndex)->getAest());
-    Time maxStart = aestOnCpu;
-    if (iterIndex != scheduledTasks.begin()) {
-      auto prevIter = std::prev(iterIndex);
-      maxStart = std::max(aestOnCpu, addOrMax((*prevIter)->getAest(), (*prevIter)->getWeight()));
-    }
-    if (subtractOrZero(minStart, maxStart) >= candidate->computeWeightOnCpu(this, cpu)) {
-      bestIndex = iterIndex;
-      bestMax = maxStart;
-      break;
-    }
-
-    std::advance(iterIndex, 1);
-    if (iterIndex == scheduledTasks.end())
-      keep = false;
-  }
-
-  if (bestIndex != scheduledTasks.end())
-    return {bestMax, static_cast<int>(std::distance(scheduledTasks.begin(), bestIndex))};
-
-  if (iterIndex == scheduledTasks.end()) {
-    bestMax = aestOnCpu;
-    if (iterIndex != scheduledTasks.begin()) {
-      auto prevIter = std::prev(iterIndex);
-      bestMax = std::max(aestOnCpu, addOrMax((*prevIter)->getAest(), (*prevIter)->getWeight()));
-    }
-    return {bestMax, static_cast<int>(std::distance(scheduledTasks.begin(), scheduledTasks.end()))};
-  }
-
-  return {std::numeric_limits<Time>::max(), 0};
-}
-
-//===----------------------------------------------------------------------===//
-// Candidate selection and processor assignment
-//===----------------------------------------------------------------------===//
-
-// Picks the best CPU + slot for `candidate`:
-//   * Phase 1 (parallel, read-only): call findSlot on every candidate CPU.
-//   * Phase 2 (sequential): process CPUs in ascending slot.aest order. For
-//     each, refine the composite cost. If the candidate has unscheduled
-//     children, speculatively insert it, try a within-budget AEST update and
-//     evaluate a slot for the smallest-slack child, then roll back.
-//   * Rescue (sequential): if nothing fit, grow the CPU count if allowed,
-//     otherwise pick the CPU that leads to the smallest DCPL increase.
-GraphDCP::CandidateRelations GraphDCP::selectProcessor(TaskDCP* candidate, bool push) {
-  CandidateRelations relations = dcp_graph::computeCandidateRelations(candidate);
-  relations.descendantsTopoOrder.reserve(relations.descendants.size());
-  for (auto it = candidate->getTopologicalIterator(); it != topologicalOrder.end(); ++it) {
-    TaskDCP* current = &*it;
-    if (current != candidate && relations.descendants.contains(current))
-      relations.descendantsTopoOrder.push_back(current);
-  }
-
-  // Build the list of candidate CPUs in ascending index order. Skip CPUs
-  // where the crossbar footprint definitely wouldn't fit (computeWeightOnCpu
-  // would reject them anyway, but this check is cheaper than a full
-  // findSlot).
-  std::vector<CPU> processors;
-  const bool canCreateNewCpu = getLastCpu() < maxCpuCount;
-  const CPU topCpu = canCreateNewCpu ? getLastCpu() : getLastCpu() - 1;
-  processors.reserve(static_cast<size_t>(topCpu + 1));
-  const CrossbarUsage candidateFootprint = getTaskCrossbarFootprint(candidate);
-  const bool candidateHasCrossbar = candidateFootprint != 0;
-  const CrossbarUsage cpuCapacity = candidateHasCrossbar ? getCpuCrossbarCapacity() : 0;
-  DCP_DEBUG_IF(auto dedupStart = std::chrono::steady_clock::now();)
-  CpuAestCache cpuAests = computeCpuAestCache(candidate);
-  DCP_DEBUG_IF(const bool checkCpuAestCache = std::getenv("DCP_CHECK_CPU_AEST_CACHE") != nullptr;)
-  llvm::SmallDenseSet<uint64_t, 32> seenProcessorKeys;
-  seenProcessorKeys.reserve(static_cast<size_t>(topCpu + 1));
-  for (CPU c = 0; c <= topCpu; c++) {
-    if (candidateHasCrossbar && c != getLastCpu()) {
-      CrossbarUsage nextUsage = checkedAdd(getCpuCrossbarUsage(c), candidateFootprint);
-      if (nextUsage >= cpuCapacity)
-        continue;
-    }
-    Time candidateAest = cpuAests.get(c);
-    DCP_DEBUG_IF(if (checkCpuAestCache) {
-      Time recomputedAest = computeAestOnCpu(candidate, c);
-      if (candidateAest != recomputedAest) {
-        std::fprintf(stderr,
-                     "[DCP_CHECK_CPU_AEST_CACHE] mismatch candidate=%zu cpu=%d cached=%llu recomputed=%llu\n",
-                     getNodeIndex(candidate),
-                     c,
-                     static_cast<unsigned long long>(candidateAest),
-                     static_cast<unsigned long long>(recomputedAest));
-        llvm::report_fatal_error("DCP CPU AEST cache mismatch");
-      }
-    })
-    if (!seenProcessorKeys.insert(computeCpuCandidateKey(candidateAest, c)).second)
-      continue;
-    processors.push_back(c);
-  }
-  DCP_DEBUG_IF(gSelectTimers.dedup +=
-               std::chrono::duration<double>(std::chrono::steady_clock::now() - dedupStart).count();)
-  if (processors.empty()) {
-    // processors.empty() implies !canCreateNewCpu: a fresh CPU always passes
-    // the crossbar filter and would have been added. Reaching here means every
-    // existing CPU is crossbar-exhausted and the task requires crossbar
-    // capacity — the placement is impossible.
-    llvm::report_fatal_error("DCP scheduler: crossbar capacity exhausted on all CPUs; "
-                             "cannot schedule task that requires crossbar allocation");
-  }
-
-  // Phase 1: parallel findSlot sweep (read-only over graph state).
-  // Pre-size cpuTasks so findSlot never needs to resize the outer vector
-  // while threads share it.
-  getOrCreateCpuTasks(topCpu);
-  struct PrecomputedSlot {
-    CPU cpu;
-    FindSlot slot;
-  };
-  std::vector<PrecomputedSlot> precomputed(processors.size());
-  auto sweep = [&](size_t i) {
-    CPU cpu = processors[i];
-    FindSlot slot = findSlot(candidate, cpu, false, relations);
-    if (slot.aest == std::numeric_limits<Time>::max() && push)
-      slot = findSlot(candidate, cpu, true, relations);
-    precomputed[i] = {cpu, slot};
-  };
-  DCP_DEBUG_IF(auto sweepStart = std::chrono::steady_clock::now();)
-  if (context != nullptr)
-    mlir::parallelFor(context, 0, processors.size(), sweep);
-  else
-    for (size_t i = 0; i < processors.size(); ++i)
-      sweep(i);
-  DCP_DEBUG_IF(gSelectTimers.findSlot +=
-               std::chrono::duration<double>(std::chrono::steady_clock::now() - sweepStart).count();)
-
-#ifdef DCP_DEBUG_ENABLED
-  {
-    static bool reported = false;
-    if (!reported) {
-      reported = true;
-      std::fprintf(
-        stderr,
-        "[dcp] selectProcessor parallel sweep: context=%p mt=%d procs=%zu pool=%u\n",
-        (void*) context,
-        context != nullptr ? (int) context->isMultithreadingEnabled() : -1,
-        processors.size(),
-        context != nullptr && context->isMultithreadingEnabled() ? context->getThreadPool().getMaxConcurrency() : 0u);
-    }
-  }
-#endif
-
-  CPU bestCpu = -1;
-  Time bestComposite = std::numeric_limits<Time>::max();
-  FindSlot bestSlot;
-  llvm::DenseMap<TaskDCP*, CandidateRelations> childRelationsCache;
-
-  // Phase 2: sequential composite evaluation in ascending CPU index order.
-  // Keeping the same order as the pre-parallel implementation preserves
-  // tie-breaking: whenever two CPUs produce an equal composite cost, the
-  // lower index wins.
-  for (const auto& ps : precomputed) {
-    DCP_DEBUG_IF(++gSelectTimers.iterations;)
-    CPU currentCpu = ps.cpu;
-    FindSlot slot = ps.slot;
-    if (slot.aest == std::numeric_limits<Time>::max())
-      continue;
-    // slot.aest alone is a lower bound on the composite cost (child.aest >= 0)
-    // so any CPU that already meets/exceeds the current best is pruned.
-    if (slot.aest >= bestComposite)
-      continue;
-
-    if (std::all_of(candidate->children.begin(), candidate->children.end(), [](Edge child) {
-          return child.first->isScheduled();
-        })) {
-      bestCpu = currentCpu;
-      bestComposite = slot.aest;
-      bestSlot = slot;
-    }
-    else if (candidate->hasChildren()) {
-      const bool emptyCpu = getOrCreateCpuTasks(currentCpu).empty();
-      auto currentDcpl = getDcpl();
-      // Combined prune: the DCPL budget AND a tighter composite bound. Any
-      // unscheduled child starts no earlier than candidate's completion, so
-      // composite >= 2*slot.aest + weight.
-      DCP_DEBUG_IF(auto t2 = std::chrono::steady_clock::now();)
-      Weight candidateWeight = candidate->computeWeightOnCpu(this, currentCpu);
-      Time candidateCompletion = addOrMax(slot.aest, candidateWeight);
-      bool skip =
-        (!emptyCpu && candidateCompletion > currentDcpl) || addOrMax(slot.aest, candidateCompletion) >= bestComposite;
-      DCP_DEBUG_IF(gSelectTimers.precheck +=
-                   std::chrono::duration<double>(std::chrono::steady_clock::now() - t2).count();)
-      if (skip)
-        continue;
-      DCP_DEBUG_IF(++gSelectTimers.passedPrecheck;)
-
-      dcp_graph::LocalScheduleSnapshot scheduleSnapshot;
-      TaskInsertion taskInsertion;
-      DCP_DEBUG_IF(auto t3 = std::chrono::steady_clock::now();)
-      if (emptyCpu) {
-        // Empty CPU: insertion adds no scheduling edge, AEST/DCPL cannot move.
-        taskInsertion = insertTaskInCPU(currentCpu, candidate, slot.index);
-      }
-      else {
-        scheduleSnapshot = dcp_graph::captureLocalScheduleState(
-          candidate, relations.descendants, dcpl, maxCompletion, secondMaxCompletion, maxCompletionTask);
-        taskInsertion = insertTaskInCPU(currentCpu, candidate, slot.index);
-        bool withinBudget =
-          tryUpdateAestWithinBudget(candidate, llvm::ArrayRef<TaskDCP*>(relations.descendantsTopoOrder), currentDcpl);
-        if (!withinBudget) {
-          DCP_DEBUG_IF(auto t4 = std::chrono::steady_clock::now();)
-          taskInsertion.rollBack();
-          dcp_graph::restoreLocalScheduleState(
-            scheduleSnapshot, dcpl, maxCompletion, secondMaxCompletion, maxCompletionTask);
-          DCP_DEBUG_IF(auto tNow = std::chrono::steady_clock::now();
-                       gSelectTimers.snapshotInsertUpdate += std::chrono::duration<double>(t4 - t3).count();
-                       gSelectTimers.rollbackRestore += std::chrono::duration<double>(tNow - t4).count();)
-          continue;
-        }
-      }
-      DCP_DEBUG_IF(gSelectTimers.snapshotInsertUpdate +=
-                   std::chrono::duration<double>(std::chrono::steady_clock::now() - t3).count();)
-      DCP_DEBUG_IF(++gSelectTimers.passedDcpl;)
-
-      // Pick the tightest unscheduled child (smallest slack) and measure what
-      // slot it would get on the same CPU; that's the composite contribution.
-      Edge smallestChild {nullptr, 0};
-      for (auto child : candidate->children) {
-        if (child.first->isScheduled())
-          continue;
-        if (smallestChild.first == nullptr) {
-          smallestChild = child;
-          continue;
-        }
-        if (slackOrZero(smallestChild.first->getAest(), smallestChild.first->getAlst())
-            > slackOrZero(child.first->getAest(), child.first->getAlst())) {
-          smallestChild = child;
-        }
-      }
-      DCP_DEBUG_IF(auto t5 = std::chrono::steady_clock::now();)
-      auto dcplWithChild = computeDcplOnCpu(smallestChild.first, currentCpu);
-      auto childRelationsIt = childRelationsCache.find(smallestChild.first);
-      if (childRelationsIt == childRelationsCache.end())
-        childRelationsIt =
-          childRelationsCache.insert({smallestChild.first, dcp_graph::computeCandidateRelations(smallestChild.first)})
-            .first;
-      const CandidateRelations& childRelations = childRelationsIt->second;
-      auto childSlot =
-        dcplWithChild == getDcpl()
-          ? findSlotWithFixedFinalTime(smallestChild.first,
-                                       currentCpu,
-                                       childRelations,
-                                       addOrMax(computeTaskAlstOnCpu(smallestChild.first, currentCpu, dcplWithChild),
-                                                smallestChild.first->computeWeightOnCpu(this, currentCpu)),
-                                       computeAestOnCpu(smallestChild.first, currentCpu))
-          : findSlot(smallestChild.first, currentCpu, false, childRelations);
-      if (childSlot.aest != std::numeric_limits<Time>::max() && addOrMax(childSlot.aest, slot.aest) < bestComposite
-          && dcplWithChild <= currentDcpl) {
-        bestCpu = currentCpu;
-        bestComposite = addOrMax(slot.aest, childSlot.aest);
-        bestSlot = slot;
-      }
-      DCP_DEBUG_IF(auto t6 = std::chrono::steady_clock::now();
-                   gSelectTimers.childSlot += std::chrono::duration<double>(t6 - t5).count();)
-      taskInsertion.rollBack();
-      if (!emptyCpu)
-        dcp_graph::restoreLocalScheduleState(
-          scheduleSnapshot, dcpl, maxCompletion, secondMaxCompletion, maxCompletionTask);
-      DCP_DEBUG_IF(gSelectTimers.rollbackRestore +=
-                   std::chrono::duration<double>(std::chrono::steady_clock::now() - t6).count();)
-    }
-  }
-
-  // Rescue path: no CPU was acceptable. Either open a fresh CPU or fall back
-  // to the CPU that minimises the resulting DCPL.
-  if (bestCpu == -1) {
-    if (getLastCpu() < maxCpuCount) {
-      bestCpu = getLastCpu();
-      bestSlot = {computeAestOnCpu(candidate, bestCpu), 0};
-      incrementLastCpu();
-    }
-    else {
-      Time bestDcpl = std::numeric_limits<Time>::max();
-      Time currentDcpl = getDcpl();
-      for (CPU c : processors) {
-        if (c == getLastCpu())
-          continue;
-        auto slot = findSlot(candidate, c, false, relations);
-        if (slot.aest == std::numeric_limits<Time>::max())
-          slot = findSlot(candidate, c, true, relations);
-        if (slot.aest == std::numeric_limits<Time>::max())
-          continue;
-        // Cheap lower bound: post-insertion DCPL is at least max(currentDcpl,
-        // candidate completion on this slot). Skip CPUs already worse than
-        // the best seen.
-        Time lowerBound = std::max(currentDcpl, addOrMax(slot.aest, candidate->computeWeightOnCpu(this, c)));
-        if (lowerBound >= bestDcpl)
-          continue;
-        auto snapshot = dcp_graph::captureLocalScheduleState(
-          candidate, relations.descendants, dcpl, maxCompletion, secondMaxCompletion, maxCompletionTask);
-        auto taskInsertion = insertTaskInCPU(c, candidate, slot.index);
-        updateAestFromTaskWithDescendants(candidate, llvm::ArrayRef<TaskDCP*>(relations.descendantsTopoOrder));
-        Time candidateDcpl = getDcpl();
-        taskInsertion.rollBack();
-        dcp_graph::restoreLocalScheduleState(snapshot, dcpl, maxCompletion, secondMaxCompletion, maxCompletionTask);
-        if (candidateDcpl < bestDcpl) {
-          bestDcpl = candidateDcpl;
-          bestCpu = c;
-          bestSlot = slot;
-        }
-      }
-      if (bestCpu == -1)
-        llvm::report_fatal_error("DCP scheduler: no valid slot found for task on any eligible CPU — "
-                                 "all slots are blocked by already-placed descendants");
-    }
-  }
-  if (bestCpu == getLastCpu() && getLastCpu() < maxCpuCount)
-    incrementLastCpu();
-  insertTaskInCPU(bestCpu, candidate, bestSlot.index);
-
-  // Incremental AEST/ALST refresh replacing the full initAest/initAlst that
-  // used to run after every placement. Post-insertion relations pick up any
-  // new scheduling-edge ancestors/descendants introduced by the insertion.
-  Time oldDcpl = getDcpl();
-  CandidateRelations postRelations = dcp_graph::computeCandidateRelations(candidate);
-  llvm::SmallVector<TaskDCP*, 32> postDescendantsTopoOrder;
-  postDescendantsTopoOrder.reserve(postRelations.descendants.size());
-  for (auto it = candidate->getTopologicalIterator(); it != topologicalOrder.end(); ++it) {
-    TaskDCP* current = &*it;
-    if (current != candidate && postRelations.descendants.contains(current))
-      postDescendantsTopoOrder.push_back(current);
-  }
-  updateAestFromTaskWithDescendants(candidate, llvm::ArrayRef<TaskDCP*>(postDescendantsTopoOrder));
-  updateAlstFromScheduledTask(candidate, postRelations, oldDcpl);
-  return postRelations;
-}
-
-//===----------------------------------------------------------------------===//
-// Main scheduling loop and result extraction
-//===----------------------------------------------------------------------===//
-
-void GraphDCP::runDcp() {
-  initTopological();
-  initTaskStructureHashes();
-  initAest();
-  initAlst();
-  dumpDot();
-
-  dcp_graph::DcpProgressLogger progressLogger(nodes.size());
-  llvm::DenseMap<TaskDCP*, int> unscheduledParents;
-
-  // Min-heap over ready tasks: tightest slack first, earliest AEST as tiebreak.
-  // Lazy deletion: when AEST/ALST change after a placement, fresh entries are
-  // pushed for the affected tasks. Stale ones are detected on pop by comparing
-  // stored vs current (slack, aest) and re-pushed with the current values.
-  struct ReadyEntry {
-    Time slack;
-    Time aest;
-    int64_t orderKey;
-    TaskDCP* task;
-    bool operator>(const ReadyEntry& other) const {
-      if (slack != other.slack)
-        return slack > other.slack;
-      if (aest != other.aest)
-        return aest > other.aest;
-      return orderKey > other.orderKey;
-    }
-  };
-  std::priority_queue<ReadyEntry, std::vector<ReadyEntry>, std::greater<ReadyEntry>> readyQueue;
-  size_t readyCount = 0;
-
-  auto pushReady = [&](TaskDCP* node) {
-    readyQueue.push({slackOrZero(node->getAest(), node->getAlst()), node->getAest(), node->Id(), node});
-  };
-
-  for (auto& node : nodes) {
-    int dependencyParents = dcp_graph::countDependencyParents(&node);
-    unscheduledParents[&node] = dependencyParents;
-    if (dependencyParents == 0) {
-      pushReady(&node);
-      ++readyCount;
-    }
-  }
-  size_t xbarsCapacity = static_cast<size_t>(maxCpuCount) * onnx_mlir::crossbarCountInCore.getValue();
-  progressLogger.printStart(readyCount, maxCpuCount, xbarsCapacity);
-
-  while (readyCount > 0) {
-    // Pop with lazy deletion: skip stale entries and re-push with current values.
-    TaskDCP* candidate = nullptr;
-    while (!readyQueue.empty()) {
-      auto entry = readyQueue.top();
-      readyQueue.pop();
-      Time curSlack = slackOrZero(entry.task->getAest(), entry.task->getAlst());
-      Time curAest = entry.task->getAest();
-      if (entry.slack == curSlack && entry.aest == curAest) {
-        candidate = entry.task;
-        break;
-      }
-      readyQueue.push({curSlack, curAest, entry.orderKey, entry.task});
-    }
-    assert(candidate != nullptr && "readyCount > 0 but heap exhausted");
-    --readyCount;
-
-    DCP_DEBUG_IF(auto selectStart = std::chrono::steady_clock::now();)
-    CandidateRelations postRelations = selectProcessor(candidate, candidate->isCriticalPath());
-    DCP_DEBUG_IF(
-      double selectSeconds = std::chrono::duration<double>(std::chrono::steady_clock::now() - selectStart).count();
-      progressLogger.recordSelectDuration(selectSeconds);
-      progressLogger.maybePrintSlowCandidate(getNodeIndex(candidate), selectSeconds, readyCount, getLastCpu());)
-
-    // Proactively refresh the heap priority for ready nodes whose AEST or ALST
-    // changed: ancestors had their ALST individually recomputed; descendants had
-    // their AEST bumped. Both may now sort differently than their stale entries.
-    for (TaskDCP* node : postRelations.ancestors)
-      if (!node->isScheduled() && unscheduledParents[node] == 0)
-        pushReady(node);
-    for (TaskDCP* node : postRelations.descendants)
-      if (!node->isScheduled() && unscheduledParents[node] == 0)
-        pushReady(node);
-
-    progressLogger.advanceCompleted();
-    progressLogger.printProgress(readyCount, getLastCpu(), maxCpuCount, crossbarsUsed(), crossbarsAvailable(), false);
-
-    for (const auto& childEdge : candidate->children) {
-      if (childEdge.isScheduling || childEdge.first->isScheduled())
-        continue;
-      int& dependencyParents = unscheduledParents[childEdge.first];
-      assert(dependencyParents > 0 && "dependency parent count must stay positive");
-      --dependencyParents;
-      if (dependencyParents == 0) {
-        pushReady(childEdge.first);
-        ++readyCount;
-      }
-    }
-    DCP_DEBUG_IF(++gSelectTimers.tasksProcessed;
-                 if (std::getenv("DCP_SELECT_PROFILE") && (gSelectTimers.tasksProcessed % 100 == 0))
-                   gSelectTimers.dump("tick");)
-  }
-  progressLogger.printProgress(0, getLastCpu(), maxCpuCount, crossbarsUsed(), crossbarsAvailable(), true);
-  dumpDot();
-}
-
-void GraphDCP::dumpDot() { dcp_graph::dumpGraphDot(nodes, cpuTasks, getLastCpu()); }
-
-DCPAnalysisResult GraphDCP::getResult() {
-  DCPAnalysisResult ret;
-
-  auto dominanceOrder = dcp_graph::collectDominanceOrder(getRoots(), nodes.size());
-  ret.dominanceOrderCompute.reserve(dominanceOrder.size());
-  for (auto elem : dominanceOrder) {
-    auto spatCompute = elem->getSpatCompute();
-    if (spatCompute)
-      ret.dominanceOrderCompute.push_back({spatCompute.getOperation(), 0});
-  }
-
-  for (CPU cpu = 0; cpu < getLastCpu(); ++cpu) {
-    const CpuTaskList* tasks = findCpuTasks(cpu);
-    if (tasks == nullptr || tasks->empty())
-      continue;
-    size_t i = 0;
-    for (auto node : *tasks) {
-      auto spatCompute = node->getSpatCompute();
-      if (!spatCompute)
-        continue;
-      ComputeInstance instance {spatCompute.getOperation(), 0};
-      ret.computeToCpuMap[instance] = cpu;
-      if (i++ == tasks->size() - 1) {
-        ret.isLastComputeOfCpu.insert(instance);
-        ret.cpuToLastComputeMap[cpu] = instance;
-      }
-    }
-  }
-
-  return ret;
-}
-
-std::vector<GraphDCP::ScheduledTaskInfo> GraphDCP::getScheduledTasks(CPU cpu) const {
-  std::vector<ScheduledTaskInfo> scheduledTasks;
-  const CpuTaskList* tasks = findCpuTasks(cpu);
-  if (tasks == nullptr)
-    return scheduledTasks;
-
-  scheduledTasks.reserve(tasks->size());
-  for (auto* task : *tasks)
-    scheduledTasks.push_back({getNodeIndex(task), task->getAest(), task->getAlst(), task->getWeight()});
-  return scheduledTasks;
-}
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.hpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.hpp
deleted file mode 100644
index 32d1976..0000000
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.hpp
+++ /dev/null
@@ -1,178 +0,0 @@
-#pragma once
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-
-#include <cstdint>
-#include <list>
-#include <optional>
-#include <unordered_map>
-#include <vector>
-
-#include "DCPAnalysis.hpp"
-#include "Task.hpp"
-#include "Utils.hpp"
-
-namespace mlir {
-class MLIRContext;
-} // namespace mlir
-
-std::optional<EdgePair> addEdge(TaskDCP* parent, TaskDCP* child, Weight weight, bool isScheduling = false);
-void removeEdge(TaskDCP* parent, TaskDCP* child, bool isScheduling = false);
-Weight getTransferCost(TaskDCP* parent, TaskDCP* child);
-
-class GraphDCP {
-public:
-  struct CandidateRelations {
-    llvm::DenseSet<TaskDCP*> ancestors;
-    llvm::DenseSet<TaskDCP*> descendants;
-    // descendants ordered by position in the graph's topological order;
-    // iterating this avoids walking non-descendant tail tasks on hot paths.
-    llvm::SmallVector<TaskDCP*, 32> descendantsTopoOrder;
-  };
-
-  struct ScheduledTaskInfo {
-    size_t nodeIndex;
-    Time aest;
-    Time alst;
-    Weight weight;
-  };
-
-private:
-  using CpuTaskList = std::list<TaskDCP*>;
-
-  struct FindSlot {
-    Time aest;
-    int index;
-  };
-
-  std::vector<TaskDCP> nodes;
-  onnx_mlir::LabeledList<TaskDCP> topologicalOrder;
-  std::vector<uint64_t> taskStructureHashes;
-  std::vector<CpuTaskList> cpuTasks;
-  std::unordered_map<CPU, CrossbarUsage> cpuCrossbarUsage;
-  llvm::DenseMap<CPU, uint64_t> cpuStructureHashes;
-  CPU lastCpu = 0;
-  long long flag = 1;
-  Time dcpl = 0;
-  Time maxCompletion = 0;
-  Time secondMaxCompletion = 0;
-  TaskDCP* maxCompletionTask = nullptr;
-  int maxCpuCount = 1000;
-  mlir::MLIRContext* context = nullptr;
-
-  TaskInsertion insertTaskInCPU(CPU cpu, TaskDCP* task, size_t position);
-  void removeTaskFromCPU(CPU cpu, TaskDCP* task);
-  CpuTaskList& getOrCreateCpuTasks(CPU cpu);
-  const CpuTaskList* findCpuTasks(CPU cpu) const;
-
-  std::vector<TaskDCP*> getRoots();
-
-  long long getUniqueFlag() { return flag++; }
-
-  void initAest();
-  void initAlst();
-  void initTaskStructureHashes();
-
-  Time computeAestOnCpu(TaskDCP* task, CPU cpu);
-  Time computeDcplOnCpu(TaskDCP* task, CPU cpu);
-  Time getDcpl() const { return dcpl; }
-  Time computeTaskAlstOnCpu(TaskDCP* task, CPU cpu, Time scheduleDcpl);
-  void updateAestFromTask(TaskDCP* task);
-  void updateAestFromTaskWithDescendants(TaskDCP* task, const llvm::DenseSet<TaskDCP*>& descendants);
-  void updateAestFromTaskWithDescendants(TaskDCP* task, llvm::ArrayRef<TaskDCP*> descendantsTopoOrder);
-  // Propagates AEST like the overload above but returns early (before touching
-  // the remaining descendants) as soon as a task's completion exceeds
-  // `dcplBudget`, signalling that the new DCPL would exceed the budget.
-  // Returns true iff the full propagation completed without exceeding the
-  // budget. Uses the caller's snapshot to restore AEST on the aborted tail.
-  bool tryUpdateAestWithinBudget(TaskDCP* task, llvm::ArrayRef<TaskDCP*> descendantsTopoOrder, Time dcplBudget);
-
-  // Incrementally refreshes ALST after `task` has been scheduled. Nodes
-  // outside the backward cone (`relations.ancestors` plus `task`) retain
-  // their relative distance to the sink boundary and only absorb the signed
-  // DCPL delta (`newDcpl - oldDcpl`). `task` itself and its ancestors are
-  // recomputed in reverse topological order so that new same-CPU transfer
-  // costs (now zero) and scheduling-edge children are reflected.
-  void updateAlstFromScheduledTask(TaskDCP* task, const CandidateRelations& relations, Time oldDcpl);
-
-  void initTopological();
-  void topologicalMoveAfter(TaskDCP* task, TaskDCP* pivotPoint, TaskInsertion* insertion = nullptr);
-  void topologicalMoveBefore(TaskDCP* task, TaskDCP* pivotPoint, TaskInsertion* insertion = nullptr);
-
-  llvm::DenseMap<TaskDCP*, Time> computeAlst(TaskDCP* task, CPU cpu, const CandidateRelations& relations);
-  size_t getNodeIndex(const TaskDCP* task) const;
-
-  // Returns a compact dedup key for CPU `c` when evaluating `candidate`:
-  // mixes candidateAest, crossbar usage, and the incremental cpu structure
-  // hash into a single uint64_t. Zero heap allocation.
-  uint64_t computeCpuCandidateKey(Time candidateAest, CPU cpu);
-  CandidateRelations selectProcessor(TaskDCP* candidate, bool push);
-  CPU getLastCpu() const { return lastCpu; }
-  void incrementLastCpu() { lastCpu++; }
-  FindSlot findSlot(TaskDCP* candidate, CPU cpu, bool push, const CandidateRelations& relations);
-  FindSlot findSlotWithFixedFinalTime(
-    TaskDCP* candidate, CPU cpu, const CandidateRelations& relations, Time finalTime, Time aestOnCpu);
-  void dumpDot();
-
-  friend TaskInsertion;
-  friend class TaskDCP;
-
-  CrossbarUsage getCpuCrossbarUsage(CPU cpu) const;
-  CrossbarUsage getCpuCrossbarCapacity() const;
-  CrossbarUsage getTaskCrossbarFootprint(const TaskDCP* task) const;
-  void reserveTaskCrossbars(CPU cpu, const TaskDCP* task);
-  void releaseTaskCrossbars(CPU cpu, const TaskDCP* task);
-  bool wouldExhaustCrossbarCapacity(CPU cpu, const TaskDCP* task) const;
-
-public:
-  void runDcp();
-  GraphDCP(llvm::ArrayRef<onnx_mlir::spatial::SpatCompute> spatComputes, llvm::ArrayRef<IndexedEdge> edges)
-  : nodes(), cpuTasks(), cpuCrossbarUsage() {
-    for (auto spatCompute : spatComputes)
-      nodes.emplace_back(spatCompute);
-    for (auto [start, end, weight] : edges)
-      makeEdge(start, end, weight);
-  }
-
-  GraphDCP(llvm::ArrayRef<Weight> nodeWeights,
-           llvm::ArrayRef<IndexedEdge> edges,
-           llvm::ArrayRef<int64_t> nodeOrderKeys = {},
-           llvm::ArrayRef<CrossbarUsage> nodeCrossbarUsage = {})
-  : nodes(), cpuTasks(), cpuCrossbarUsage() {
-    assert((nodeCrossbarUsage.empty() || nodeCrossbarUsage.size() == nodeWeights.size())
-           && "synthetic crossbar usage must match synthetic node weights");
-    assert((nodeOrderKeys.empty() || nodeOrderKeys.size() == nodeWeights.size())
-           && "synthetic node order keys must match synthetic node weights");
-    nodes.reserve(nodeWeights.size());
-    for (auto [index, weight] : llvm::enumerate(nodeWeights))
-      nodes.emplace_back(nodeOrderKeys.empty() ? static_cast<int64_t>(index) : nodeOrderKeys[index],
-                         weight,
-                         nodeCrossbarUsage.empty() ? 0 : nodeCrossbarUsage[index]);
-    for (auto [start, end, weight] : edges)
-      makeEdge(start, end, weight);
-  }
-
-  DCPAnalysisResult getResult();
-  std::vector<ScheduledTaskInfo> getScheduledTasks(CPU cpu) const;
-  CPU cpuCount() const { return lastCpu; }
-
-  void makeEdge(size_t parentIndex, size_t childIndex, Weight weight) {
-    addEdge(&nodes[parentIndex], &nodes[childIndex], weight);
-  }
-
-  size_t taskInCpu(CPU cpu) { return getOrCreateCpuTasks(cpu).size(); }
-
-  void setMaxCpuCount(int value) { maxCpuCount = value; }
-  int getMaxCpuCount() const { return maxCpuCount; }
-
-  // Total crossbar units allocated across all active CPUs.
-  size_t crossbarsUsed() const;
-  // Maximum crossbar units available across all active CPUs (lastCpu * per-CPU capacity).
-  size_t crossbarsAvailable() const;
-
-  // Optional MLIR context used to drive mlir::parallelFor inside runDcp. If
-  // null the scheduler runs single-threaded (tests use this path).
-  void setContext(mlir::MLIRContext* ctx) { context = ctx; }
-};
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphDebug.cpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphDebug.cpp
deleted file mode 100644
index dbaaa82..0000000
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphDebug.cpp
+++ /dev/null
@@ -1,154 +0,0 @@
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/raw_ostream.h"
-
-#include <fstream>
-#include <string>
-
-#include "GraphDebug.hpp"
-#include "src/Accelerators/PIM/Common/PimCommon.hpp"
-
-namespace dcp_graph {
-
-#ifdef DCP_DEBUG_ENABLED
-
-DcpProgressLogger::DcpProgressLogger(size_t totalTasks)
-: logProgress(totalTasks >= 200),
-  totalTasks(totalTasks),
-  startTime(std::chrono::steady_clock::now()),
-  lastProgressPrint(startTime) {}
-
-std::string DcpProgressLogger::formatDuration(double seconds) {
-  if (seconds < 0)
-    seconds = 0;
-
-  long totalSeconds = static_cast<long>(seconds + 0.5);
-  long hours = totalSeconds / 3600;
-  long minutes = (totalSeconds % 3600) / 60;
-  long secs = totalSeconds % 60;
-  if (hours > 0)
-    return llvm::formatv("{0}:{1:02}:{2:02}", hours, minutes, secs).str();
-  return llvm::formatv("{0}:{1:02}", minutes, secs).str();
-}
-
-void DcpProgressLogger::recordFindDuration(double seconds) { findCandidateSeconds += seconds; }
-void DcpProgressLogger::recordSelectDuration(double seconds) { selectProcessorSeconds += seconds; }
-void DcpProgressLogger::recordUpdateDuration(double seconds) { updateTimingSeconds += seconds; }
-void DcpProgressLogger::advanceCompleted(size_t taskCount) { completedTasks += taskCount; }
-
-void DcpProgressLogger::printStart(size_t readyCount, int maxCpuCount, size_t xbarsCapacity) const {
-  if (!logProgress)
-    return;
-  llvm::errs() << llvm::formatv("[DCP] start   tasks={0}   ready={1}   cpus=0/{2}   crossbars=0/{3}\n",
-                                totalTasks,
-                                readyCount,
-                                maxCpuCount,
-                                xbarsCapacity);
-}
-
-void DcpProgressLogger::maybePrintSlowCandidate(size_t nodeIndex,
-                                                double elapsedSeconds,
-                                                size_t readyCount,
-                                                CPU cpuCount) const {
-  if (!logProgress || elapsedSeconds < 1.0)
-    return;
-
-  llvm::errs() << llvm::formatv("[DCP] slow node={0}   elapsed={1}   ready={2}   cpus={3}\n",
-                                nodeIndex,
-                                formatDuration(elapsedSeconds),
-                                readyCount,
-                                cpuCount);
-}
-
-void DcpProgressLogger::printProgress(
-  size_t readyCount, CPU cpuCount, int maxCpuCount, size_t xbarsUsed, size_t xbarsAvailable, bool force) {
-  if (!logProgress)
-    return;
-
-  auto now = std::chrono::steady_clock::now();
-  if (!force && now - lastProgressPrint < std::chrono::seconds(1) && completedTasks != totalTasks)
-    return;
-
-  double elapsedSeconds = std::chrono::duration<double>(now - startTime).count();
-  double rate = elapsedSeconds > 0.0 ? static_cast<double>(completedTasks) / elapsedSeconds : 0.0;
-  double etaSeconds = rate > 0.0 ? static_cast<double>(totalTasks - completedTasks) / rate : 0.0;
-  double percent = totalTasks == 0 ? 100.0 : (100.0 * static_cast<double>(completedTasks) / totalTasks);
-
-  bool done = completedTasks == totalTasks;
-  llvm::errs() << llvm::formatv("[DCP] {0}/{1} ({2:F0}%)   ready={3}   cpus={4}/{5}   crossbars={6}/{7}   {8}{9}\n",
-                                completedTasks,
-                                totalTasks,
-                                percent,
-                                readyCount,
-                                cpuCount,
-                                maxCpuCount,
-                                xbarsUsed,
-                                xbarsAvailable,
-                                llvm::formatv("elapsed={0}", formatDuration(elapsedSeconds)).str(),
-                                done ? "" : llvm::formatv("   eta={0}", formatDuration(etaSeconds)).str());
-  lastProgressPrint = now;
-}
-
-#else
-
-DcpProgressLogger::DcpProgressLogger(size_t) {}
-void DcpProgressLogger::recordFindDuration(double) {}
-void DcpProgressLogger::recordSelectDuration(double) {}
-void DcpProgressLogger::recordUpdateDuration(double) {}
-void DcpProgressLogger::advanceCompleted(size_t) {}
-void DcpProgressLogger::printStart(size_t, int, size_t) const {}
-void DcpProgressLogger::maybePrintSlowCandidate(size_t, double, size_t, CPU) const {}
-void DcpProgressLogger::printProgress(size_t, CPU, int, size_t, size_t, bool) {}
-
-#endif
-
-void dumpGraphDot(const std::vector<TaskDCP>& nodes, const std::vector<std::list<TaskDCP*>>& cpuTasks, CPU lastCpu) {
-  static int dumpIndex = 0;
-  std::string outputDir = onnx_mlir::getOutputDir();
-  if (outputDir.empty())
-    return;
-
-  std::string graphDir = outputDir + "/dcp_graph";
-  onnx_mlir::createDirectory(graphDir);
-  std::fstream file(graphDir + "/graph_" + std::to_string(dumpIndex++) + ".dot", std::ios::out);
-  file << "digraph G {\n";
-  if (!cpuTasks.empty()) {
-    for (CPU cpu = 0; cpu < lastCpu; cpu++) {
-      file << "subgraph cluster_" << cpu << "{\nstyle=filled;\ncolor=lightgrey;\n";
-      size_t cpuIndex = static_cast<size_t>(cpu);
-      if (cpuIndex >= cpuTasks.size()) {
-        file << " }\n";
-        continue;
-      }
-
-      for (auto node : cpuTasks[cpuIndex]) {
-        file << node->Id() << " [label=\"";
-        file << "n:" << node->Id() << "\n";
-        file << "aest:" << node->getAest() << "\n";
-        file << "alst:" << node->getAlst() << "\n";
-        file << "weight:" << node->getWeight() << "\"]\n";
-      }
-      file << " }\n";
-    }
-  }
-  else {
-    for (const auto& node : nodes) {
-      file << node.Id() << " [label=\"";
-      file << "n:" << node.Id() << "\n";
-      file << "aest:" << node.getAest() << "\n";
-      file << "alst:" << node.getAlst() << "\n";
-      file << "weight:" << node.getWeight() << "\"]\n";
-    }
-  }
-
-  for (const auto& node : nodes)
-    for (const auto& child : node.children) {
-      file << node.Id() << " -> " << child.first->Id();
-      file << " [label=\"" << child.second << "\"]\n";
-    }
-
-  file << "}\n";
-  file.flush();
-  file.close();
-}
-
-} // namespace dcp_graph
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphDebug.hpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphDebug.hpp
deleted file mode 100644
index 8097a0f..0000000
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphDebug.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-#pragma once
-
-#include "llvm/ADT/StringRef.h"
-
-#include <chrono>
-#include <list>
-#include <vector>
-
-#include "Task.hpp"
-#include "Utils.hpp"
-
-// Define DCP_DEBUG_ENABLED locally when debugging DCP progress and per-phase
-// profiling. In normal builds the logger methods are no-ops and helpers compile
-// away.
-#define DCP_DEBUG_ENABLED
-
-#ifdef DCP_DEBUG_ENABLED
-#define DCP_DEBUG_IF(...) __VA_ARGS__
-#else
-#define DCP_DEBUG_IF(...)
-#endif
-
-namespace dcp_graph {
-
-class DcpProgressLogger {
-public:
-  explicit DcpProgressLogger(size_t totalTasks);
-
-  void recordFindDuration(double seconds);
-  void recordSelectDuration(double seconds);
-  void recordUpdateDuration(double seconds);
-  void advanceCompleted(size_t taskCount = 1);
-
-  void printStart(size_t readyCount, int maxCpuCount, size_t xbarsCapacity) const;
-  void maybePrintSlowCandidate(size_t nodeIndex, double elapsedSeconds, size_t readyCount, CPU cpuCount) const;
-  void
-  printProgress(size_t readyCount, CPU cpuCount, int maxCpuCount, size_t xbarsUsed, size_t xbarsAvailable, bool force);
-
-#ifdef DCP_DEBUG_ENABLED
-
-private:
-  static std::string formatDuration(double seconds);
-
-  bool logProgress = false;
-  size_t totalTasks = 0;
-  size_t completedTasks = 0;
-  std::chrono::steady_clock::time_point startTime;
-  std::chrono::steady_clock::time_point lastProgressPrint;
-  double findCandidateSeconds = 0.0;
-  double selectProcessorSeconds = 0.0;
-  double updateTimingSeconds = 0.0;
-#endif
-};
-
-void dumpGraphDot(const std::vector<TaskDCP>& nodes, const std::vector<std::list<TaskDCP*>>& cpuTasks, CPU lastCpu);
-
-} // namespace dcp_graph
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphSupport.cpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphSupport.cpp
deleted file mode 100644
index 8d4c1fb..0000000
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphSupport.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-#include "llvm/ADT/STLExtras.h"
-
-#include <vector>
-
-#include "GraphSupport.hpp"
-#include "Task.hpp"
-#include "UniqueWorklist.hpp"
-
-namespace dcp_graph {
-
-llvm::DenseSet<TaskDCP*> collectReachableTasks(TaskDCP* root, bool followParents) {
-  llvm::DenseSet<TaskDCP*> reachable;
-  std::vector<TaskDCP*> worklist;
-  worklist.reserve(32);
-
-  auto enqueueEdges = [&](TaskDCP* task) {
-    const auto& edges = followParents ? task->parents : task->children;
-    for (const auto& edge : edges)
-      if (reachable.insert(edge.first).second)
-        worklist.push_back(edge.first);
-  };
-
-  enqueueEdges(root);
-  while (!worklist.empty()) {
-    TaskDCP* task = worklist.back();
-    worklist.pop_back();
-    enqueueEdges(task);
-  }
-  return reachable;
-}
-
-GraphDCP::CandidateRelations computeCandidateRelations(TaskDCP* candidate) {
-  return {collectReachableTasks(candidate, true), collectReachableTasks(candidate, false), {}};
-}
-
-LocalScheduleSnapshot captureLocalScheduleState(TaskDCP* task,
-                                                const llvm::DenseSet<TaskDCP*>& descendants,
-                                                Time dcpl,
-                                                Time maxCompletion,
-                                                Time secondMaxCompletion,
-                                                TaskDCP* maxCompletionTask) {
-  LocalScheduleSnapshot snapshot;
-  snapshot.aestBackup.reserve(descendants.size() + 1);
-  snapshot.aestBackup.emplace_back(task, task->getAest());
-  for (TaskDCP* descendant : descendants)
-    snapshot.aestBackup.emplace_back(descendant, descendant->getAest());
-  snapshot.dcpl = dcpl;
-  snapshot.maxCompletion = maxCompletion;
-  snapshot.secondMaxCompletion = secondMaxCompletion;
-  snapshot.maxCompletionTask = maxCompletionTask;
-  return snapshot;
-}
-
-void restoreLocalScheduleState(const LocalScheduleSnapshot& snapshot,
-                               Time& dcpl,
-                               Time& maxCompletion,
-                               Time& secondMaxCompletion,
-                               TaskDCP*& maxCompletionTask) {
-  for (const auto& [task, aest] : snapshot.aestBackup)
-    task->setAest(aest);
-  dcpl = snapshot.dcpl;
-  maxCompletion = snapshot.maxCompletion;
-  secondMaxCompletion = snapshot.secondMaxCompletion;
-  maxCompletionTask = snapshot.maxCompletionTask;
-}
-
-int countDependencyParents(const TaskDCP* task) {
-  return static_cast<int>(llvm::count_if(task->parents, [](const Edge& edge) { return !edge.isScheduling; }));
-}
-
-void recordTopologicalMove(TaskDCP* task, TaskInsertion* insertion) {
-  if (insertion == nullptr)
-    return;
-
-  auto alreadyRecorded =
-    llvm::any_of(insertion->topologicalMoves,
-                 [task](const TaskInsertion::TopologicalMoveRecord& move) { return move.task == task; });
-  if (alreadyRecorded)
-    return;
-
-  insertion->topologicalMoves.push_back({task, onnx_mlir::LabeledList<TaskDCP>::next(task)});
-}
-
-std::vector<TaskDCP*> collectDominanceOrder(llvm::ArrayRef<TaskDCP*> roots, size_t nodeCount) {
-  UniqueWorkList<std::vector<TaskDCP*>> worklist(roots);
-  worklist.reserve(nodeCount);
-
-  size_t index = 0;
-  while (index != worklist.size()) {
-    bool modified = true;
-    while (modified) {
-      modified = false;
-      for (const auto& child : worklist.at(index)->children)
-        if (worklist.allElementsContained(
-              child.first->parents.begin(), child.first->parents.end(), [](Edge edge) { return edge.first; }))
-          modified |= worklist.pushBack(child.first);
-    }
-    index++;
-  }
-
-  return {worklist.begin(), worklist.end()};
-}
-
-} // namespace dcp_graph
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphSupport.hpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphSupport.hpp
deleted file mode 100644
index 9e738f5..0000000
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphSupport.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-#pragma once
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/SmallVector.h"
-
-#include <utility>
-#include <vector>
-
-#include "Graph.hpp"
-
-namespace dcp_graph {
-
-struct LocalScheduleSnapshot {
-  llvm::SmallVector<std::pair<TaskDCP*, Time>, 64> aestBackup;
-  Time dcpl = 0;
-  Time maxCompletion = 0;
-  Time secondMaxCompletion = 0;
-  TaskDCP* maxCompletionTask = nullptr;
-};
-
-llvm::DenseSet<TaskDCP*> collectReachableTasks(TaskDCP* root, bool followParents);
-GraphDCP::CandidateRelations computeCandidateRelations(TaskDCP* candidate);
-
-LocalScheduleSnapshot captureLocalScheduleState(TaskDCP* task,
-                                                const llvm::DenseSet<TaskDCP*>& descendants,
-                                                Time dcpl,
-                                                Time maxCompletion,
-                                                Time secondMaxCompletion,
-                                                TaskDCP* maxCompletionTask);
-void restoreLocalScheduleState(const LocalScheduleSnapshot& snapshot,
-                               Time& dcpl,
-                               Time& maxCompletion,
-                               Time& secondMaxCompletion,
-                               TaskDCP*& maxCompletionTask);
-
-int countDependencyParents(const TaskDCP* task);
-void recordTopologicalMove(TaskDCP* task, TaskInsertion* insertion);
-std::vector<TaskDCP*> collectDominanceOrder(llvm::ArrayRef<TaskDCP*> roots, size_t nodeCount);
-
-} // namespace dcp_graph
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Task.cpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Task.cpp
deleted file mode 100644
index 21d93b1..0000000
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Task.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-#include <optional>
-
-#include "Graph.hpp"
-#include "Task.hpp"
-#include "UniqueWorklist.hpp"
-
-std::optional<Edge> TaskDCP::addChild(TaskDCP* child, Weight weight, bool isScheduling) {
-  std::optional<Edge> oldEdge = std::nullopt;
-  auto foundElement = std::find_if(children.begin(), children.end(), [child, isScheduling](Edge element) {
-    return child == element.first && isScheduling == element.isScheduling;
-  });
-  if (foundElement != children.end()) {
-    oldEdge = *foundElement;
-    fastRemove(children, foundElement);
-  }
-  children.emplace_back(Edge {child, weight, isScheduling});
-  return oldEdge;
-}
-
-std::optional<Edge> TaskDCP::addParent(TaskDCP* parent, Weight weight, bool isScheduling) {
-  std::optional<Edge> oldEdge = std::nullopt;
-  auto foundElement = std::find_if(parents.begin(), parents.end(), [parent, isScheduling](Edge element) {
-    return parent == element.first && isScheduling == element.isScheduling;
-  });
-  if (foundElement != parents.end()) {
-    oldEdge = *foundElement;
-    fastRemove(parents, foundElement);
-  }
-  parents.emplace_back(Edge {parent, weight, isScheduling});
-  return oldEdge;
-}
-
-bool TaskDCP::hasDescendant(TaskDCP* child) {
-  UniqueWorkList<std::vector<TaskDCP*>> worklist;
-  worklist.reserve(32);
-  worklist.pushBack(this);
-  while (!worklist.empty()) {
-    TaskDCP* task = worklist.back();
-    worklist.popBack();
-    if (task == child)
-      return true;
-    for (auto edge : task->children)
-      worklist.pushBack(edge.first);
-  }
-  return false;
-}
-
-Weight TaskDCP::computeWeightOnCpu(GraphDCP* graph, CPU cpu) {
-  if (crossbarUsage != 0 && graph->wouldExhaustCrossbarCapacity(cpu, this))
-    return std::numeric_limits<Weight>::max();
-  return baseWeight;
-}
-
-void TaskInsertion::rollBack() {
-  graph->removeTaskFromCPU(cpuModified, taskInserted);
-  if (beforeNode.has_value()) {
-    auto edgePair = *beforeNode;
-    addEdge(edgePair.first.first, edgePair.second.first, edgePair.first.second, edgePair.first.isScheduling);
-  }
-  if (afterNode.has_value()) {
-    auto edgePair = *afterNode;
-    addEdge(edgePair.first.first, edgePair.second.first, edgePair.first.second, edgePair.first.isScheduling);
-  }
-  // for (auto it = topologicalMoves.rbegin(); it != topologicalMoves.rend(); ++it)
-  //   graph->topologicalOrder.moveBefore(it->task, it->nextTask);
-}
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Task.hpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Task.hpp
deleted file mode 100644
index 1cd6ec5..0000000
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Task.hpp
+++ /dev/null
@@ -1,126 +0,0 @@
-#pragma once
-
-#include <cassert>
-#include <optional>
-#include <vector>
-
-#include "Utils.hpp"
-#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
-
-class TaskDCP : public onnx_mlir::LabeledListNode<TaskDCP> {
-  onnx_mlir::spatial::SpatCompute spatCompute;
-  Time aest;
-  Time alst;
-  std::optional<CPU> scheduledCpu;
-  Weight weight;
-  Weight baseWeight;
-  CrossbarUsage crossbarUsage;
-  long long flag = 0;
-  int64_t syntheticId = -1;
-
-  std::optional<Edge> addChild(TaskDCP* child, Weight weight, bool isScheduling);
-  std::optional<Edge> addChild(TaskDCP& child, Weight weight, bool isScheduling) {
-    return addChild(&child, weight, isScheduling);
-  }
-
-  void removeChild(TaskDCP* toRemove, bool isScheduling) { fastRemove(children, toRemove, isScheduling); }
-  void removeChild(TaskDCP& toRemove, bool isScheduling) { fastRemove(children, &toRemove, isScheduling); }
-
-  std::optional<Edge> addParent(TaskDCP* parent, Weight weight, bool isScheduling);
-  std::optional<Edge> addParent(TaskDCP& parent, Weight weight, bool isScheduling) {
-    return addParent(&parent, weight, isScheduling);
-  }
-
-  void removeParent(TaskDCP* toRemove, bool isScheduling) { fastRemove(parents, toRemove, isScheduling); }
-  void removeParent(TaskDCP& toRemove, bool isScheduling) { fastRemove(parents, &toRemove, isScheduling); }
-
-public:
-  std::vector<Edge> parents;
-  std::vector<Edge> children;
-  TaskDCP() = default;
-  TaskDCP(onnx_mlir::spatial::SpatCompute spatCompute)
-  : onnx_mlir::LabeledListNode<TaskDCP>(),
-    spatCompute(spatCompute),
-    aest(0),
-    alst(0),
-    scheduledCpu(),
-    weight(getSpatComputeWeight(spatCompute)),
-    baseWeight(weight),
-    crossbarUsage(getSpatComputeCrossbarUsage(spatCompute)),
-    syntheticId(-1),
-    parents(),
-    children() {}
-
-  TaskDCP(int64_t id, Weight weight, CrossbarUsage crossbarUsage = 0)
-  : onnx_mlir::LabeledListNode<TaskDCP>(),
-    spatCompute(),
-    aest(0),
-    alst(0),
-    scheduledCpu(),
-    weight(weight),
-    baseWeight(weight),
-    crossbarUsage(crossbarUsage),
-    flag(0),
-    syntheticId(id),
-    parents(),
-    children() {}
-
-  TaskDCP(const TaskDCP& node) = delete;
-  TaskDCP(TaskDCP&& node) = default;
-
-  void setCpu(CPU cpu) { scheduledCpu = cpu; }
-  std::optional<CPU> getCpu() const { return scheduledCpu; }
-  void resetCpu() { scheduledCpu = std::nullopt; }
-  Weight getWeight() const {
-    if (isScheduled())
-      return weight;
-    return baseWeight;
-  }
-  void setWeight(Weight value) { weight = value; }
-  void resetWeight() { weight = baseWeight; }
-  Weight computeWeightOnCpu(GraphDCP* graph, CPU cpu);
-  CrossbarUsage getCrossbarUsage() const { return crossbarUsage; }
-
-  bool hasParents() const { return parents.size() != 0; }
-  bool hasChildren() const { return children.size() != 0; }
-
-  Time getAest() const { return aest; }
-  Time getAlst() const { return alst; }
-  void setAest(Time value) { aest = value; }
-  void setAlst(Time value) { alst = value; }
-  bool hasDescendant(TaskDCP* child);
-  int64_t Id() const {
-    if (spatCompute)
-      return reinterpret_cast<int64_t>(spatCompute.getAsOpaquePointer());
-    return syntheticId;
-  }
-
-  bool isCriticalPath() const { return alst == aest; }
-  bool isScheduled() const { return scheduledCpu.has_value(); }
-  onnx_mlir::spatial::SpatCompute getSpatCompute() const { return spatCompute; }
-
-  void setFlag(long long val) { flag = val; }
-  long long getFlag() const { return flag; }
-
-  onnx_mlir::LabeledList<TaskDCP>::Iterator getTopologicalIterator() { return getIterator(); }
-
-  friend std::optional<EdgePair> addEdge(TaskDCP* parent, TaskDCP* child, Weight weight, bool isScheduling);
-  friend void removeEdge(TaskDCP* parent, TaskDCP* child, bool isScheduling);
-  friend Weight getTransferCost(TaskDCP* parent, TaskDCP* child);
-};
-
-struct TaskInsertion {
-  struct TopologicalMoveRecord {
-    TaskDCP* task;
-    TaskDCP* nextTask;
-  };
-
-  std::optional<EdgePair> beforeNode;
-  std::optional<EdgePair> afterNode;
-  std::vector<TopologicalMoveRecord> topologicalMoves;
-  CPU cpuModified;
-  TaskDCP* taskInserted;
-  GraphDCP* graph;
-
-  void rollBack();
-};
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/UniqueWorklist.hpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/UniqueWorklist.hpp
deleted file mode 100644
index 9003857..0000000
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/UniqueWorklist.hpp
+++ /dev/null
@@ -1,83 +0,0 @@
-#pragma once
-
-#include "llvm/ADT/DenseSet.h"
-
-#include <cassert>
-#include <type_traits>
-
-template <typename T, typename = void>
-struct HasPopFront : std::false_type {};
-
-template <typename T>
-struct HasPopFront<T, std::void_t<decltype(std::declval<T>().pop_front())>> : std::true_type {};
-
-template <typename T>
-class UniqueWorkList {
-
-  using ValueType = typename T::value_type;
-  T storage;
-  llvm::DenseSet<ValueType> uniqueElements;
-
-public:
-  UniqueWorkList() = default;
-
-  template <typename RangeT>
-  UniqueWorkList(const RangeT& from)
-  : storage() {
-    for (auto& element : from) {
-      if (!uniqueElements.contains(element)) {
-        storage.push_back(element);
-        uniqueElements.insert(element);
-      }
-    }
-  }
-
-  bool empty() const { return storage.empty(); }
-  void reserve(size_t value) { return storage.reserve(value); }
-  size_t size() const { return storage.size(); }
-  ValueType& at(size_t index) { return storage.at(index); }
-  const ValueType& at(size_t index) const { return storage.at(index); }
-
-  ValueType& front() { return storage.front(); }
-  ValueType& back() { return storage.back(); }
-
-  bool pushBack(const ValueType& value) {
-    if (!uniqueElements.contains(value)) {
-      storage.push_back(value);
-      uniqueElements.insert(value);
-      return true;
-    }
-    return false;
-  }
-
-  void popFront() {
-    if constexpr (HasPopFront<T>::value)
-      storage.pop_front();
-    else
-      assert(false && "Underlying storage type does not support pop_front()");
-  }
-
-  auto cbegin() const { return storage.cbegin(); }
-  auto cend() const { return storage.cend(); }
-
-  void popBack() { storage.pop_back(); }
-
-  template <typename Iterator, typename Mapper>
-  bool allElementsContained(Iterator begin, Iterator end, Mapper map) const {
-    auto it = begin;
-    while (it != end) {
-      if (!uniqueElements.contains(map(*it)))
-        return false;
-      std::advance(it, 1);
-    }
-    return true;
-  }
-
-  auto begin() { return storage.begin(); }
-
-  auto end() { return storage.end(); }
-
-  auto begin() const { return storage.begin(); }
-
-  auto end() const { return storage.end(); }
-};
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Utils.hpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Utils.hpp
deleted file mode 100644
index 7a71509..0000000
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Utils.hpp
+++ /dev/null
@@ -1,111 +0,0 @@
-#pragma once
-
-#include "mlir/IR/BuiltinTypeInterfaces.h"
-
-#include "llvm/Support/Casting.h"
-
-#include <algorithm>
-#include <cstdint>
-#include <limits>
-#include <list>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-#include "src/Accelerators/PIM/Common/LabeledList.hpp"
-#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
-
-using CPU = int;
-using Weight = unsigned long long;
-using Time = unsigned long long;
-using CrossbarUsage = unsigned long long;
-class TaskDCP;
-class GraphDCP;
-struct Edge {
-  TaskDCP* first;
-  Weight second;
-  bool isScheduling = false;
-};
-using EdgePair = std::pair<Edge, Edge>;
-using IndexedEdge = std::tuple<int64_t, int64_t, int64_t>;
-
-inline void fastRemove(std::vector<Edge>& vector, TaskDCP* toRemove, bool isScheduling) {
-  auto position = std::find_if(vector.begin(), vector.end(), [toRemove, isScheduling](Edge edge) {
-    return edge.first == toRemove && edge.isScheduling == isScheduling;
-  });
-  if (position != vector.end()) {
-    std::swap(*(vector.end() - 1), *position);
-    vector.pop_back();
-  }
-}
-
-inline void fastRemove(std::vector<TaskDCP*>& vector, TaskDCP* toRemove) {
-  auto position =
-    std::find_if(vector.begin(), vector.end(), [toRemove](TaskDCP* element) { return element == toRemove; });
-  if (position != vector.end()) {
-    std::swap(*(vector.end() - 1), *position);
-    vector.pop_back();
-  }
-}
-
-template <typename P>
-void fastRemove(std::vector<Edge>& vector, P position) {
-  if (position != vector.end()) {
-    std::swap(*(vector.end() - 1), *position);
-    vector.pop_back();
-  }
-}
-
-template <typename T>
-inline T checkedAdd(T lhs, T rhs) {
-  static_assert(std::is_unsigned_v<T>, "checkedAdd only supports unsigned types");
-  assert(lhs <= std::numeric_limits<T>::max() - rhs && "unsigned addition overflow");
-  return lhs + rhs;
-}
-
-template <typename T>
-inline T checkedMultiply(T lhs, T rhs) {
-  static_assert(std::is_unsigned_v<T>, "checkedMultiply only supports unsigned types");
-  if (lhs == 0 || rhs == 0)
-    return 0;
-  assert(lhs <= std::numeric_limits<T>::max() / rhs && "unsigned multiplication overflow");
-  return lhs * rhs;
-}
-
-template <typename T>
-inline T addOrMax(T lhs, T rhs) {
-  static_assert(std::is_unsigned_v<T>, "addOrMax only supports unsigned types");
-  if (lhs == std::numeric_limits<T>::max() || rhs == std::numeric_limits<T>::max())
-    return std::numeric_limits<T>::max();
-  return checkedAdd(lhs, rhs);
-}
-
-template <typename T>
-inline T subtractOrZero(T lhs, T rhs) {
-  static_assert(std::is_unsigned_v<T>, "subtractOrZero only supports unsigned types");
-  if (lhs == std::numeric_limits<T>::max())
-    return lhs;
-  if (rhs == std::numeric_limits<T>::max() || lhs <= rhs)
-    return 0;
-  return lhs - rhs;
-}
-
-inline Time slackOrZero(Time earliestStart, Time latestStart) { return subtractOrZero(latestStart, earliestStart); }
-
-inline Weight getSpatComputeWeight(onnx_mlir::spatial::SpatCompute spatCompute) {
-  constexpr Weight kOperationWeight = 100;
-  Weight numOperations = 0;
-  for (auto& block : spatCompute.getBody())
-    for ([[maybe_unused]] auto& op : block)
-      numOperations = checkedAdd(numOperations, static_cast<Weight>(1));
-  return checkedMultiply(numOperations, kOperationWeight);
-}
-
-inline CrossbarUsage getSpatComputeCrossbarUsage(onnx_mlir::spatial::SpatCompute spatCompute) {
-  CrossbarUsage crossbarUsage = 0;
-  for (auto& region : spatCompute.getBody())
-    for (auto& inst : region)
-      if (llvm::isa<onnx_mlir::spatial::SpatVMMOp>(inst))
-        crossbarUsage = checkedAdd(crossbarUsage, static_cast<CrossbarUsage>(1));
-  return crossbarUsage;
-}
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/ComputeGraph.cpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/ComputeGraph.cpp
index 6ca07f3..0a079a7 100644
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/ComputeGraph.cpp
+++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/ComputeGraph.cpp
@@ -1,6 +1,7 @@
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Operation.h"
+#include "mlir/IR/Unit.h"
 #include "mlir/IR/Value.h"
 
 #include "llvm/ADT/STLExtras.h"
@@ -32,15 +33,6 @@ Weight getComputeBodyWeight(Region& body) {
   return checkedMultiply(numOperations, kOperationWeight);
 }
 
-CrossbarUsage getComputeBodyCrossbarUsage(Region& body) {
-  CrossbarUsage crossbarUsage = 0;
-  for (auto& block : body)
-    for (auto& op : block)
-      if (isa<SpatVMMOp>(op))
-        crossbarUsage = checkedAdd(crossbarUsage, static_cast<CrossbarUsage>(1));
-  return crossbarUsage;
-}
-
 bool isUsedAsWeightOnly(Operation* producerOp) {
   if (producerOp->getNumResults() == 0)
     return false;
@@ -133,16 +125,28 @@ std::vector<ComputeGraphEdge> aggregateEdges(llvm::ArrayRef<ComputeGraphEdge> ed
 
 Weight getComputeInstanceWeight(const ComputeInstance& instance) {
   if (auto spatCompute = dyn_cast<SpatCompute>(instance.op))
-    return getSpatComputeWeight(spatCompute);
+    return getComputeBodyWeight(spatCompute.getBody());
   auto batch = cast<SpatComputeBatch>(instance.op);
   return checkedMultiply(getComputeBodyWeight(batch.getBody()), static_cast<Weight>(instance.laneCount));
 }
 
+CrossbarUsage getSpatComputeCrossbarUsage(SpatCompute spatComute){
+  CrossbarUsage ret;
+  ret.insert_range(spatComute.getWeights());
+  return ret;
+}
+
+CrossbarUsage getSpatComputeBatchCrossbarUsage(SpatComputeBatch spatComuteBatch){
+  CrossbarUsage ret;
+  ret.insert_range(spatComuteBatch.getWeights());
+  return ret;
+}
+
 CrossbarUsage getComputeInstanceCrossbarUsage(const ComputeInstance& instance) {
   if (auto spatCompute = dyn_cast<SpatCompute>(instance.op))
     return getSpatComputeCrossbarUsage(spatCompute);
   auto batch = cast<SpatComputeBatch>(instance.op);
-  return checkedMultiply(getComputeBodyCrossbarUsage(batch.getBody()), static_cast<CrossbarUsage>(instance.laneCount));
+  return getSpatComputeBatchCrossbarUsage(batch);
 }
 
 ComputeGraph buildComputeGraph(Operation* entryOp) {
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/ComputeGraph.hpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/ComputeGraph.hpp
index d72e17d..0882a55 100644
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/ComputeGraph.hpp
+++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/ComputeGraph.hpp
@@ -4,6 +4,7 @@
 #include "mlir/IR/Value.h"
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 
 #include <cstddef>
@@ -11,17 +12,19 @@
 #include <utility>
 #include <vector>
 
-#include "../DCPGraph/Utils.hpp"
+#include "Utils.hpp"
 #include "ComputeInstance.hpp"
 #include "ComputeInstanceUtils.hpp"
 
+using CrossbarUsage = llvm::SmallPtrSet<mlir::Value, 6>;
+
 namespace onnx_mlir {
 namespace spatial {
 
 struct ComputeGraphNode {
   ComputeInstance instance;
   Weight weight = 0;
-  CrossbarUsage crossbarUsage = 0;
+  llvm::SmallPtrSet<mlir::Value,6> crossbarUsage;
   size_t originalOrder = 0;
 };
 
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/DcpScheduler.cpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/DcpScheduler.cpp
deleted file mode 100644
index cd7f177..0000000
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/DcpScheduler.cpp
+++ /dev/null
@@ -1,722 +0,0 @@
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/raw_ostream.h"
-
-#include <algorithm>
-#include <cstdlib>
-#include <limits>
-#include <numeric>
-#include <optional>
-#include <queue>
-#include <vector>
-
-#include "../DCPGraph/Graph.hpp"
-#include "DcpScheduler.hpp"
-
-namespace onnx_mlir {
-namespace spatial {
-
-using namespace mlir;
-
-namespace {
-
-bool isDcpCoarsenDebugEnabled() { return std::getenv("DCP_COARSEN_DEBUG") != nullptr; }
-
-struct VirtualNode {
-  llvm::SmallVector<size_t, 4> originalNodeIndices;
-  Weight weight = 0;
-  CrossbarUsage crossbarUsage = 0;
-};
-
-struct VirtualGraph {
-  std::vector<VirtualNode> nodes;
-  std::vector<IndexedEdge> edges;
-};
-
-struct TimingInfo {
-  std::vector<Time> aest;
-  std::vector<Time> alst;
-  std::vector<size_t> topologicalOrder;
-  bool valid = false;
-};
-
-struct WindowScheduleResult {
-  std::vector<std::vector<size_t>> mergeGroups;
-  CPU cpuCount = 0;
-  size_t mergedNodeCount = 0;
-  size_t maxMergeGroupSize = 0;
-};
-
-size_t getSchedulingCpuBudget(const DcpScheduleOptions& options) {
-  if (options.processorCount > 0)
-    return options.processorCount;
-  return std::numeric_limits<size_t>::max();
-}
-
-std::vector<IndexedEdge> aggregateEdges(llvm::ArrayRef<IndexedEdge> edges) {
-  llvm::DenseMap<std::pair<size_t, size_t>, Weight> edgeWeights;
-  for (auto [start, end, weight] : edges) {
-    size_t startIndex = static_cast<size_t>(start);
-    size_t endIndex = static_cast<size_t>(end);
-    if (startIndex == endIndex)
-      continue;
-    auto key = std::make_pair(startIndex, endIndex);
-    Weight edgeWeight = static_cast<Weight>(weight);
-    auto inserted = edgeWeights.try_emplace(key, edgeWeight);
-    if (!inserted.second)
-      inserted.first->second = std::max(inserted.first->second, edgeWeight);
-  }
-
-  std::vector<IndexedEdge> aggregatedEdges;
-  aggregatedEdges.reserve(edgeWeights.size());
-  for (auto [key, weight] : edgeWeights)
-    aggregatedEdges.push_back(
-      {static_cast<int64_t>(key.first), static_cast<int64_t>(key.second), static_cast<int64_t>(weight)});
-  llvm::sort(aggregatedEdges, [](const IndexedEdge& lhs, const IndexedEdge& rhs) {
-    if (std::get<0>(lhs) != std::get<0>(rhs))
-      return std::get<0>(lhs) < std::get<0>(rhs);
-    return std::get<1>(lhs) < std::get<1>(rhs);
-  });
-  return aggregatedEdges;
-}
-
-VirtualGraph buildInitialVirtualGraph(const ComputeGraph& graph) {
-  VirtualGraph virtualGraph;
-  virtualGraph.nodes.reserve(graph.nodes.size());
-  for (auto [index, node] : llvm::enumerate(graph.nodes)) {
-    VirtualNode virtualNode;
-    virtualNode.originalNodeIndices.push_back(index);
-    virtualNode.weight = node.weight;
-    virtualNode.crossbarUsage = node.crossbarUsage;
-    virtualGraph.nodes.push_back(std::move(virtualNode));
-  }
-
-  std::vector<IndexedEdge> edges;
-  edges.reserve(graph.edges.size());
-  for (const ComputeGraphEdge& edge : graph.edges)
-    edges.push_back(
-      {static_cast<int64_t>(edge.source), static_cast<int64_t>(edge.target), static_cast<int64_t>(edge.transferCost)});
-  virtualGraph.edges = aggregateEdges(edges);
-  return virtualGraph;
-}
-
-TimingInfo computeTiming(const VirtualGraph& graph) {
-  TimingInfo timing;
-  size_t nodeCount = graph.nodes.size();
-  timing.aest.assign(nodeCount, 0);
-  timing.alst.assign(nodeCount, 0);
-  timing.topologicalOrder.reserve(nodeCount);
-
-  std::vector<std::vector<std::pair<size_t, Weight>>> parents(nodeCount);
-  std::vector<std::vector<std::pair<size_t, Weight>>> children(nodeCount);
-  std::vector<size_t> incomingEdgeCount(nodeCount, 0);
-
-  for (auto [start, end, weight] : graph.edges) {
-    size_t startIndex = static_cast<size_t>(start);
-    size_t endIndex = static_cast<size_t>(end);
-    Weight edgeWeight = static_cast<Weight>(weight);
-    assert(startIndex < nodeCount && endIndex < nodeCount && "virtual edge endpoint out of range");
-    children[startIndex].push_back({endIndex, edgeWeight});
-    parents[endIndex].push_back({startIndex, edgeWeight});
-    incomingEdgeCount[endIndex]++;
-  }
-
-  auto getVirtualNodeOrderKey = [&](size_t nodeIndex) {
-    const VirtualNode& node = graph.nodes[nodeIndex];
-    if (!node.originalNodeIndices.empty())
-      return node.originalNodeIndices.front();
-    return nodeIndex;
-  };
-  auto readyNodeGreater = [&](size_t lhs, size_t rhs) {
-    size_t lhsKey = getVirtualNodeOrderKey(lhs);
-    size_t rhsKey = getVirtualNodeOrderKey(rhs);
-    if (lhsKey != rhsKey)
-      return lhsKey > rhsKey;
-    return lhs > rhs;
-  };
-  std::priority_queue<size_t, std::vector<size_t>, decltype(readyNodeGreater)> readyNodes(readyNodeGreater);
-  for (size_t i = 0; i < nodeCount; ++i)
-    if (incomingEdgeCount[i] == 0)
-      readyNodes.push(i);
-
-  while (!readyNodes.empty()) {
-    size_t current = readyNodes.top();
-    readyNodes.pop();
-    timing.topologicalOrder.push_back(current);
-    for (auto [child, weight] : children[current]) {
-      (void) weight;
-      assert(incomingEdgeCount[child] > 0 && "incoming edge count underflow");
-      incomingEdgeCount[child]--;
-      if (incomingEdgeCount[child] == 0)
-        readyNodes.push(child);
-    }
-  }
-
-  if (timing.topologicalOrder.size() != nodeCount)
-    return timing;
-
-  Time dcpl = 0;
-  for (size_t nodeIndex : timing.topologicalOrder) {
-    Time maxParentAest = 0;
-    for (auto [parent, transferCost] : parents[nodeIndex]) {
-      maxParentAest =
-        std::max(maxParentAest, addOrMax(addOrMax(timing.aest[parent], graph.nodes[parent].weight), transferCost));
-    }
-    timing.aest[nodeIndex] = maxParentAest;
-    dcpl = std::max(dcpl, addOrMax(maxParentAest, graph.nodes[nodeIndex].weight));
-  }
-
-  for (size_t nodeIndex : llvm::reverse(timing.topologicalOrder)) {
-    Time minAlst = std::numeric_limits<Time>::max();
-    if (children[nodeIndex].empty())
-      minAlst = subtractOrZero(dcpl, graph.nodes[nodeIndex].weight);
-    for (auto [child, transferCost] : children[nodeIndex]) {
-      minAlst =
-        std::min(minAlst, subtractOrZero(timing.alst[child], addOrMax(graph.nodes[nodeIndex].weight, transferCost)));
-    }
-    timing.alst[nodeIndex] = minAlst;
-  }
-
-  timing.valid = true;
-  return timing;
-}
-
-std::vector<std::vector<size_t>> buildUndirectedAdjacency(const VirtualGraph& graph) {
-  std::vector<std::vector<size_t>> adjacency(graph.nodes.size());
-  for (auto [start, end, weight] : graph.edges) {
-    (void) weight;
-    size_t startIndex = static_cast<size_t>(start);
-    size_t endIndex = static_cast<size_t>(end);
-    assert(startIndex < graph.nodes.size() && endIndex < graph.nodes.size() && "virtual edge endpoint out of range");
-    adjacency[startIndex].push_back(endIndex);
-    adjacency[endIndex].push_back(startIndex);
-  }
-  for (auto& neighbours : adjacency) {
-    llvm::sort(neighbours);
-    neighbours.erase(std::unique(neighbours.begin(), neighbours.end()), neighbours.end());
-  }
-  return adjacency;
-}
-
-std::vector<size_t> selectCriticalWindow(const VirtualGraph& graph, const TimingInfo& timing, size_t windowSize) {
-  std::vector<size_t> ranked(timing.aest.size());
-  std::iota(ranked.begin(), ranked.end(), 0);
-  auto isHigherPriority = [&](size_t lhs, size_t rhs) {
-    Time lhsSlack = slackOrZero(timing.aest[lhs], timing.alst[lhs]);
-    Time rhsSlack = slackOrZero(timing.aest[rhs], timing.alst[rhs]);
-    if (lhsSlack != rhsSlack)
-      return lhsSlack < rhsSlack;
-    if (timing.aest[lhs] != timing.aest[rhs])
-      return timing.aest[lhs] < timing.aest[rhs];
-    return lhs < rhs;
-  };
-
-  windowSize = std::min(windowSize, ranked.size());
-  if (windowSize == 0)
-    return {};
-  if (windowSize == ranked.size()) {
-    llvm::sort(ranked, isHigherPriority);
-    return ranked;
-  }
-
-  size_t criticalPoolSize = std::min(ranked.size(), std::max(windowSize, windowSize * 2));
-  if (criticalPoolSize < ranked.size())
-    std::nth_element(
-      ranked.begin(), ranked.begin() + static_cast<std::ptrdiff_t>(criticalPoolSize), ranked.end(), isHigherPriority);
-
-  std::vector<char> inCriticalPool(ranked.size(), false);
-  for (size_t i = 0; i < criticalPoolSize; ++i)
-    inCriticalPool[ranked[i]] = true;
-
-  size_t seed = *std::min_element(ranked.begin(), ranked.end(), isHigherPriority);
-  std::vector<std::vector<size_t>> adjacency = buildUndirectedAdjacency(graph);
-  std::vector<size_t> selected;
-  std::vector<char> inWindow(ranked.size(), false);
-  selected.reserve(windowSize);
-
-  struct FrontierEntry {
-    size_t node;
-  };
-  auto frontierCompare = [&](FrontierEntry lhs, FrontierEntry rhs) { return isHigherPriority(rhs.node, lhs.node); };
-  std::priority_queue<FrontierEntry, std::vector<FrontierEntry>, decltype(frontierCompare)> frontier(frontierCompare);
-
-  auto addToWindow = [&](size_t node, const std::vector<char>& eligible) {
-    if (inWindow[node])
-      return;
-    inWindow[node] = true;
-    selected.push_back(node);
-    for (size_t neighbour : adjacency[node])
-      if (!inWindow[neighbour] && eligible[neighbour])
-        frontier.push({neighbour});
-  };
-
-  addToWindow(seed, inCriticalPool);
-  while (!frontier.empty() && selected.size() < windowSize) {
-    size_t node = frontier.top().node;
-    frontier.pop();
-    if (!inWindow[node])
-      addToWindow(node, inCriticalPool);
-  }
-
-  if (selected.size() < windowSize) {
-    std::vector<char> anyNode(ranked.size(), true);
-    for (size_t node : selected)
-      for (size_t neighbour : adjacency[node])
-        if (!inWindow[neighbour])
-          frontier.push({neighbour});
-    while (!frontier.empty() && selected.size() < windowSize) {
-      size_t node = frontier.top().node;
-      frontier.pop();
-      if (!inWindow[node])
-        addToWindow(node, anyNode);
-    }
-  }
-
-  if (selected.size() < windowSize) {
-    llvm::sort(ranked, isHigherPriority);
-    for (size_t node : ranked) {
-      if (selected.size() == windowSize)
-        break;
-      if (!inWindow[node]) {
-        inWindow[node] = true;
-        selected.push_back(node);
-      }
-    }
-  }
-
-  llvm::sort(selected, isHigherPriority);
-  return selected;
-}
-
-std::vector<IndexedEdge> buildWindowEdges(const VirtualGraph& graph, const std::vector<int64_t>& nodeToWindowIndex) {
-  std::vector<IndexedEdge> windowEdges;
-  windowEdges.reserve(graph.edges.size());
-  for (auto [start, end, weight] : graph.edges) {
-    int64_t mappedStart = nodeToWindowIndex[static_cast<size_t>(start)];
-    int64_t mappedEnd = nodeToWindowIndex[static_cast<size_t>(end)];
-    if (mappedStart == -1 || mappedEnd == -1)
-      continue;
-    windowEdges.push_back({mappedStart, mappedEnd, weight});
-  }
-  return aggregateEdges(windowEdges);
-}
-
-WindowScheduleResult scheduleWindow(const VirtualGraph& graph,
-                                    llvm::ArrayRef<size_t> selectedNodes,
-                                    const DcpScheduleOptions& options,
-                                    mlir::MLIRContext* context) {
-  std::vector<Weight> windowWeights;
-  std::vector<CrossbarUsage> windowCrossbarUsage;
-  std::vector<int64_t> windowNodeOrderKeys;
-  std::vector<int64_t> nodeToWindowIndex(graph.nodes.size(), -1);
-  windowWeights.reserve(selectedNodes.size());
-  windowCrossbarUsage.reserve(selectedNodes.size());
-  windowNodeOrderKeys.reserve(selectedNodes.size());
-
-  for (auto [windowIndex, nodeIndex] : llvm::enumerate(selectedNodes)) {
-    nodeToWindowIndex[nodeIndex] = static_cast<int64_t>(windowIndex);
-    windowWeights.push_back(graph.nodes[nodeIndex].weight);
-    windowCrossbarUsage.push_back(graph.nodes[nodeIndex].crossbarUsage);
-    windowNodeOrderKeys.push_back(static_cast<int64_t>(nodeIndex));
-  }
-
-  GraphDCP windowGraph(
-    windowWeights, buildWindowEdges(graph, nodeToWindowIndex), windowNodeOrderKeys, windowCrossbarUsage);
-  if (options.processorCount > 0)
-    windowGraph.setMaxCpuCount(static_cast<int>(options.processorCount));
-  windowGraph.setContext(context);
-  windowGraph.runDcp();
-
-  WindowScheduleResult result;
-  result.cpuCount = windowGraph.cpuCount();
-  for (CPU cpu = 0; cpu < windowGraph.cpuCount(); ++cpu) {
-    auto scheduledTasks = windowGraph.getScheduledTasks(cpu);
-    if (scheduledTasks.size() < 2)
-      continue;
-
-    result.mergedNodeCount += scheduledTasks.size();
-    result.maxMergeGroupSize = std::max(result.maxMergeGroupSize, scheduledTasks.size());
-    std::vector<size_t> mergeGroup;
-    mergeGroup.reserve(scheduledTasks.size());
-    for (const auto& task : scheduledTasks)
-      mergeGroup.push_back(selectedNodes[task.nodeIndex]);
-    result.mergeGroups.push_back(std::move(mergeGroup));
-  }
-  return result;
-}
-
-bool coarsenGraph(const VirtualGraph& graph,
-                  llvm::ArrayRef<std::vector<size_t>> mergeGroups,
-                  VirtualGraph& coarsenedGraph,
-                  std::vector<size_t>& oldToNewNode) {
-  TimingInfo timing = computeTiming(graph);
-  std::vector<size_t> topologicalRank(graph.nodes.size());
-  std::iota(topologicalRank.begin(), topologicalRank.end(), 0);
-  if (timing.valid)
-    for (auto [rank, nodeIndex] : llvm::enumerate(timing.topologicalOrder))
-      topologicalRank[nodeIndex] = rank;
-
-  std::vector<std::vector<size_t>> orderedMergeGroups;
-  orderedMergeGroups.reserve(mergeGroups.size());
-  for (const auto& mergeGroup : mergeGroups) {
-    orderedMergeGroups.emplace_back(mergeGroup.begin(), mergeGroup.end());
-    std::stable_sort(orderedMergeGroups.back().begin(), orderedMergeGroups.back().end(), [&](size_t lhs, size_t rhs) {
-      if (topologicalRank[lhs] != topologicalRank[rhs])
-        return topologicalRank[lhs] < topologicalRank[rhs];
-      return lhs < rhs;
-    });
-  }
-
-  std::vector<int64_t> nodeToMergeGroup(graph.nodes.size(), -1);
-  for (auto [groupIndex, mergeGroup] : llvm::enumerate(orderedMergeGroups)) {
-    if (mergeGroup.size() < 2)
-      continue;
-    for (size_t nodeIndex : mergeGroup) {
-      assert(nodeIndex < graph.nodes.size() && "merge group node out of range");
-      nodeToMergeGroup[nodeIndex] = static_cast<int64_t>(groupIndex);
-    }
-  }
-
-  std::vector<std::optional<size_t>> mergeGroupToNewNode(orderedMergeGroups.size());
-  std::vector<size_t> newNodeRank;
-  oldToNewNode.assign(graph.nodes.size(), 0);
-  bool mergedAny = false;
-  coarsenedGraph.nodes.clear();
-  coarsenedGraph.edges.clear();
-  coarsenedGraph.nodes.reserve(graph.nodes.size());
-  newNodeRank.reserve(graph.nodes.size());
-
-  for (size_t nodeIndex = 0; nodeIndex < graph.nodes.size(); ++nodeIndex) {
-    int64_t mergeGroupIndex = nodeToMergeGroup[nodeIndex];
-    if (mergeGroupIndex == -1) {
-      oldToNewNode[nodeIndex] = coarsenedGraph.nodes.size();
-      coarsenedGraph.nodes.push_back(graph.nodes[nodeIndex]);
-      newNodeRank.push_back(topologicalRank[nodeIndex]);
-      continue;
-    }
-
-    auto& newNodeIndex = mergeGroupToNewNode[static_cast<size_t>(mergeGroupIndex)];
-    if (newNodeIndex.has_value()) {
-      oldToNewNode[nodeIndex] = *newNodeIndex;
-      continue;
-    }
-
-    VirtualNode mergedNode;
-    for (size_t memberIndex : orderedMergeGroups[static_cast<size_t>(mergeGroupIndex)]) {
-      const VirtualNode& memberNode = graph.nodes[memberIndex];
-      mergedNode.originalNodeIndices.append(memberNode.originalNodeIndices.begin(),
-                                            memberNode.originalNodeIndices.end());
-      mergedNode.weight = addOrMax(mergedNode.weight, memberNode.weight);
-      mergedNode.crossbarUsage = addOrMax(mergedNode.crossbarUsage, memberNode.crossbarUsage);
-    }
-    std::sort(mergedNode.originalNodeIndices.begin(), mergedNode.originalNodeIndices.end());
-
-    mergedAny = true;
-    newNodeIndex = coarsenedGraph.nodes.size();
-    for (size_t memberIndex : orderedMergeGroups[static_cast<size_t>(mergeGroupIndex)])
-      oldToNewNode[memberIndex] = *newNodeIndex;
-    newNodeRank.push_back(topologicalRank[orderedMergeGroups[static_cast<size_t>(mergeGroupIndex)].front()]);
-    coarsenedGraph.nodes.push_back(std::move(mergedNode));
-  }
-
-  if (!mergedAny)
-    return false;
-
-  std::vector<IndexedEdge> remappedEdges;
-  remappedEdges.reserve(graph.edges.size());
-  for (auto [start, end, weight] : graph.edges) {
-    size_t newStart = oldToNewNode[static_cast<size_t>(start)];
-    size_t newEnd = oldToNewNode[static_cast<size_t>(end)];
-    if (newStart == newEnd)
-      continue;
-    if (newNodeRank[newStart] >= newNodeRank[newEnd])
-      continue;
-    remappedEdges.push_back({static_cast<int64_t>(newStart), static_cast<int64_t>(newEnd), weight});
-  }
-  coarsenedGraph.edges = aggregateEdges(remappedEdges);
-
-  return true;
-}
-
-size_t getDcpCoarseningWindowSize(size_t nodeCount, const DcpScheduleOptions& options) {
-  size_t windowSize = std::min(options.criticalWindowSize, nodeCount);
-  CPU maxCpuCount = std::max<CPU>(1, static_cast<CPU>(getSchedulingCpuBudget(options)));
-  if (nodeCount > static_cast<size_t>(maxCpuCount))
-    windowSize = std::max(windowSize, std::min(nodeCount, static_cast<size_t>(maxCpuCount) + 1));
-  return windowSize;
-}
-
-void assignFeasibleAest(const ComputeGraph& graph, MergeScheduleResult& result) {
-  llvm::DenseMap<ComputeInstance, size_t> nodeIndexByInstance;
-  nodeIndexByInstance.reserve(graph.nodes.size());
-  for (auto [nodeIndex, node] : llvm::enumerate(graph.nodes))
-    nodeIndexByInstance[node.instance] = nodeIndex;
-
-  struct ScheduledEdge {
-    size_t target = 0;
-    Time delay = 0;
-  };
-
-  std::vector<std::vector<ScheduledEdge>> scheduledChildren(graph.nodes.size());
-  std::vector<size_t> incomingEdgeCount(graph.nodes.size(), 0);
-  for (const ComputeGraphEdge& edge : graph.edges) {
-    const ComputeInstance sourceInstance = graph.nodes[edge.source].instance;
-    const ComputeInstance targetInstance = graph.nodes[edge.target].instance;
-    const size_t sourceCpu = result.computeToCpuMap.lookup(sourceInstance);
-    const size_t targetCpu = result.computeToCpuMap.lookup(targetInstance);
-
-    Time delay = graph.nodes[edge.source].weight;
-    if (sourceCpu != targetCpu)
-      delay = addOrMax(delay, edge.transferCost);
-
-    scheduledChildren[edge.source].push_back({edge.target, delay});
-    incomingEdgeCount[edge.target]++;
-  }
-
-  llvm::DenseMap<size_t, std::vector<std::pair<size_t, size_t>>> tasksByCpu;
-  for (const ComputeGraphNode& node : graph.nodes) {
-    size_t cpu = result.computeToCpuMap.lookup(node.instance);
-    size_t slot = result.computeToCpuSlotMap.lookup(node.instance);
-    tasksByCpu[cpu].push_back({slot, nodeIndexByInstance.lookup(node.instance)});
-  }
-
-  for (auto& entry : tasksByCpu) {
-    auto& scheduledTasks = entry.second;
-    llvm::sort(scheduledTasks, [](const auto& lhs, const auto& rhs) {
-      if (lhs.first != rhs.first)
-        return lhs.first < rhs.first;
-      return lhs.second < rhs.second;
-    });
-
-    for (size_t i = 1; i < scheduledTasks.size(); ++i) {
-      size_t sourceIndex = scheduledTasks[i - 1].second;
-      size_t targetIndex = scheduledTasks[i].second;
-      scheduledChildren[sourceIndex].push_back({targetIndex, graph.nodes[sourceIndex].weight});
-      incomingEdgeCount[targetIndex]++;
-    }
-  }
-
-  auto readyNodeGreater = [&](size_t lhs, size_t rhs) {
-    if (graph.nodes[lhs].originalOrder != graph.nodes[rhs].originalOrder)
-      return graph.nodes[lhs].originalOrder > graph.nodes[rhs].originalOrder;
-    return lhs > rhs;
-  };
-  std::priority_queue<size_t, std::vector<size_t>, decltype(readyNodeGreater)> readyNodes(readyNodeGreater);
-  for (size_t nodeIndex = 0; nodeIndex < graph.nodes.size(); ++nodeIndex)
-    if (incomingEdgeCount[nodeIndex] == 0)
-      readyNodes.push(nodeIndex);
-
-  std::vector<Time> startTimes(graph.nodes.size(), 0);
-  size_t processedNodeCount = 0;
-  while (!readyNodes.empty()) {
-    size_t sourceIndex = readyNodes.top();
-    readyNodes.pop();
-    processedNodeCount++;
-
-    for (const ScheduledEdge& edge : scheduledChildren[sourceIndex]) {
-      startTimes[edge.target] = std::max(startTimes[edge.target], addOrMax(startTimes[sourceIndex], edge.delay));
-      assert(incomingEdgeCount[edge.target] > 0 && "scheduled incoming edge count underflow");
-      incomingEdgeCount[edge.target]--;
-      if (incomingEdgeCount[edge.target] == 0)
-        readyNodes.push(edge.target);
-    }
-  }
-
-  if (processedNodeCount != graph.nodes.size())
-    llvm::report_fatal_error("merge scheduling: coarsened DCP schedule is cyclic");
-
-  for (auto [nodeIndex, node] : llvm::enumerate(graph.nodes))
-    result.computeToAestMap[node.instance] = startTimes[nodeIndex];
-}
-
-MergeScheduleResult buildResultFromVirtualGraph(const VirtualGraph& graph, const ComputeGraph& originalGraph) {
-  MergeScheduleResult result;
-
-  TimingInfo timing = computeTiming(graph);
-  std::vector<size_t> virtualNodeOrder;
-  if (timing.valid)
-    virtualNodeOrder = std::move(timing.topologicalOrder);
-  else {
-    virtualNodeOrder.resize(graph.nodes.size());
-    std::iota(virtualNodeOrder.begin(), virtualNodeOrder.end(), 0);
-  }
-
-  std::vector<size_t> originalNodeToCpu(originalGraph.nodes.size(), 0);
-  for (auto [cpu, virtualNodeIndex] : llvm::enumerate(virtualNodeOrder)) {
-    const VirtualNode& virtualNode = graph.nodes[virtualNodeIndex];
-    for (size_t originalIndex : virtualNode.originalNodeIndices)
-      originalNodeToCpu[originalIndex] = cpu;
-  }
-
-  result.dominanceOrderCompute.reserve(originalGraph.nodes.size());
-  llvm::DenseMap<size_t, size_t> nextCpuSlot;
-  for (auto [originalIndex, node] : llvm::enumerate(originalGraph.nodes)) {
-    size_t cpu = originalNodeToCpu[originalIndex];
-    result.dominanceOrderCompute.push_back(node.instance);
-    result.computeToCpuMap[node.instance] = cpu;
-    result.computeToCpuSlotMap[node.instance] = nextCpuSlot[cpu]++;
-    result.cpuToLastComputeMap[cpu] = node.instance;
-  }
-  for (const auto& [cpu, lastCompute] : result.cpuToLastComputeMap)
-    result.isLastComputeOfCpu.insert(lastCompute);
-  assignFeasibleAest(originalGraph, result);
-
-  return result;
-}
-
-MergeScheduleResult buildResultFromScheduledGraph(GraphDCP& graphDCP, const ComputeGraph& graph) {
-  MergeScheduleResult result;
-  result.dominanceOrderCompute.reserve(graph.nodes.size());
-  for (const ComputeGraphNode& node : graph.nodes)
-    result.dominanceOrderCompute.push_back(node.instance);
-
-  for (CPU cpu = 0; cpu < graphDCP.cpuCount(); ++cpu) {
-    auto scheduledTasks = graphDCP.getScheduledTasks(cpu);
-    if (scheduledTasks.empty())
-      continue;
-
-    for (auto [slot, task] : llvm::enumerate(scheduledTasks)) {
-      const ComputeInstance instance = graph.nodes[task.nodeIndex].instance;
-      result.computeToCpuMap[instance] = cpu;
-      result.computeToCpuSlotMap[instance] = slot;
-      result.computeToAestMap[instance] = static_cast<uint64_t>(task.aest);
-    }
-
-    const ComputeInstance lastInstance = graph.nodes[scheduledTasks.back().nodeIndex].instance;
-    result.cpuToLastComputeMap[cpu] = lastInstance;
-    result.isLastComputeOfCpu.insert(lastInstance);
-  }
-
-  return result;
-}
-
-MergeScheduleResult
-runLegacyDcp(const ComputeGraph& graph, const DcpScheduleOptions& options, mlir::MLIRContext* context) {
-  llvm::SmallVector<Weight> nodeWeights;
-  llvm::SmallVector<CrossbarUsage> nodeCrossbarUsage;
-  llvm::SmallVector<int64_t> nodeOrderKeys;
-  llvm::SmallVector<IndexedEdge> edges;
-  nodeWeights.reserve(graph.nodes.size());
-  nodeCrossbarUsage.reserve(graph.nodes.size());
-  nodeOrderKeys.reserve(graph.nodes.size());
-  edges.reserve(graph.edges.size());
-
-  for (const ComputeGraphNode& node : graph.nodes) {
-    nodeWeights.push_back(node.weight);
-    nodeCrossbarUsage.push_back(node.crossbarUsage);
-    nodeOrderKeys.push_back(static_cast<int64_t>(node.originalOrder));
-  }
-  for (const ComputeGraphEdge& edge : graph.edges) {
-    edges.push_back(
-      {static_cast<int64_t>(edge.source), static_cast<int64_t>(edge.target), static_cast<int64_t>(edge.transferCost)});
-  }
-
-  GraphDCP graphDCP(nodeWeights, edges, nodeOrderKeys, nodeCrossbarUsage);
-  if (options.processorCount > 0)
-    graphDCP.setMaxCpuCount(static_cast<int>(options.processorCount));
-  graphDCP.setContext(context);
-  graphDCP.runDcp();
-  return buildResultFromScheduledGraph(graphDCP, graph);
-}
-
-bool needsExactScheduledBatches(const ComputeGraph& graph, const DcpScheduleOptions& options) {
-  if (options.processorCount == 0 || !options.allowFallbackForAutoCoreCount)
-    return false;
-  size_t schedulingCpuBudget = getSchedulingCpuBudget(options);
-  return llvm::any_of(graph.nodes, [&](const ComputeGraphNode& node) {
-    auto batch = dyn_cast<SpatComputeBatch>(node.instance.op);
-    return batch && static_cast<size_t>(batch.getLaneCount()) > schedulingCpuBudget;
-  });
-}
-
-} // namespace
-
-MergeScheduleResult
-runDcpScheduler(const ComputeGraph& graph, const DcpScheduleOptions& options, mlir::MLIRContext* context) {
-  if (needsExactScheduledBatches(graph, options))
-    return runLegacyDcp(graph, options, context);
-
-  if (options.criticalWindowSize == 0)
-    return runLegacyDcp(graph, options, context);
-
-  VirtualGraph virtualGraph = buildInitialVirtualGraph(graph);
-  size_t iteration = 0;
-  bool debugCoarsening = isDcpCoarsenDebugEnabled();
-  auto tryCoarsenSelectedNodes = [&](llvm::ArrayRef<size_t> selectedNodes) {
-    size_t oldNodeCount = virtualGraph.nodes.size();
-    WindowScheduleResult windowSchedule = scheduleWindow(virtualGraph, selectedNodes, options, context);
-    if (windowSchedule.mergeGroups.empty()) {
-      if (debugCoarsening && oldNodeCount >= 200)
-        llvm::errs() << llvm::formatv("[DCP-COARSEN] iter={0} old={1} selected={2} windowCpus={3} "
-                                      "groups=0 mergedNodes=0 maxGroup=0 new={1} changed=0\n",
-                                      iteration,
-                                      oldNodeCount,
-                                      selectedNodes.size(),
-                                      windowSchedule.cpuCount);
-      return false;
-    }
-
-    VirtualGraph coarsenedGraph;
-    std::vector<size_t> oldToNewNode;
-    if (!coarsenGraph(virtualGraph, windowSchedule.mergeGroups, coarsenedGraph, oldToNewNode))
-      return false;
-    if (debugCoarsening && (oldNodeCount >= 200 || coarsenedGraph.nodes.size() >= 200))
-      llvm::errs() << llvm::formatv("[DCP-COARSEN] iter={0} old={1} selected={2} windowCpus={3} "
-                                    "groups={4} mergedNodes={5} maxGroup={6} new={7} changed={8}\n",
-                                    iteration,
-                                    oldNodeCount,
-                                    selectedNodes.size(),
-                                    windowSchedule.cpuCount,
-                                    windowSchedule.mergeGroups.size(),
-                                    windowSchedule.mergedNodeCount,
-                                    windowSchedule.maxMergeGroupSize,
-                                    coarsenedGraph.nodes.size(),
-                                    oldNodeCount - coarsenedGraph.nodes.size());
-    virtualGraph = std::move(coarsenedGraph);
-    return true;
-  };
-
-  while (virtualGraph.nodes.size() > 1) {
-    if (virtualGraph.nodes.size() <= getSchedulingCpuBudget(options)) {
-      if (debugCoarsening && virtualGraph.nodes.size() >= 200)
-        llvm::errs() << llvm::formatv(
-          "[DCP-COARSEN] iter={0} old={1} stop=cpu-budget\n", iteration, virtualGraph.nodes.size());
-      break;
-    }
-
-    iteration++;
-    TimingInfo timing = computeTiming(virtualGraph);
-    if (!timing.valid) {
-      if (debugCoarsening && virtualGraph.nodes.size() >= 200)
-        llvm::errs() << llvm::formatv(
-          "[DCP-COARSEN] iter={0} old={1} invalid-timing\n", iteration, virtualGraph.nodes.size());
-      break;
-    }
-
-    llvm::SmallVector<size_t> selectedNodes;
-    auto criticalWindow =
-      selectCriticalWindow(virtualGraph, timing, getDcpCoarseningWindowSize(virtualGraph.nodes.size(), options));
-    selectedNodes.append(criticalWindow.begin(), criticalWindow.end());
-
-    if (selectedNodes.size() < 2) {
-      if (debugCoarsening && virtualGraph.nodes.size() >= 200)
-        llvm::errs() << llvm::formatv("[DCP-COARSEN] iter={0} old={1} selected={2} stop=small-window\n",
-                                      iteration,
-                                      virtualGraph.nodes.size(),
-                                      selectedNodes.size());
-      break;
-    }
-
-    if (tryCoarsenSelectedNodes(selectedNodes))
-      continue;
-    if (debugCoarsening && virtualGraph.nodes.size() >= 200)
-      llvm::errs() << llvm::formatv(
-        "[DCP-COARSEN] iter={0} old={1} stop=no-merge\n", iteration, virtualGraph.nodes.size());
-    break;
-  }
-
-  return buildResultFromVirtualGraph(virtualGraph, graph);
-}
-
-} // namespace spatial
-} // namespace onnx_mlir
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/DcpScheduler.hpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/DcpScheduler.hpp
deleted file mode 100644
index e050dad..0000000
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/DcpScheduler.hpp
+++ /dev/null
@@ -1,21 +0,0 @@
-#pragma once
-
-#include "mlir/IR/MLIRContext.h"
-
-#include "ComputeGraph.hpp"
-#include "MergeSchedule.hpp"
-
-namespace onnx_mlir {
-namespace spatial {
-
-struct DcpScheduleOptions {
-  size_t processorCount = 0;
-  size_t criticalWindowSize = 0;
-  bool allowFallbackForAutoCoreCount = true;
-};
-
-MergeScheduleResult
-runDcpScheduler(const ComputeGraph& graph, const DcpScheduleOptions& options, mlir::MLIRContext* context);
-
-} // namespace spatial
-} // namespace onnx_mlir
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/MergeSchedulingAnalysis.cpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/MergeSchedulingAnalysis.cpp
index d1efd67..644367b 100644
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/MergeSchedulingAnalysis.cpp
+++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/MergeSchedulingAnalysis.cpp
@@ -6,9 +6,7 @@
 #include <limits>
 #include <vector>
 
-#include "../DCPGraph/DCPAnalysis.hpp"
 #include "ComputeGraph.hpp"
-#include "DcpScheduler.hpp"
 #include "MergeSchedulingAnalysis.hpp"
 #include "PeftScheduler.hpp"
 #include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
@@ -21,12 +19,11 @@ namespace {
 MergeSchedulerKind getSchedulerKind() {
   switch (pimMergeScheduler.getValue()) {
   case MergeSchedulerPeft: return MergeSchedulerKind::Peft;
-  case MergeSchedulerDcp:  return MergeSchedulerKind::Dcp;
   }
   llvm_unreachable("unknown merge scheduler kind");
 }
 
-void verifySchedule(const ComputeGraph& graph, const MergeScheduleResult& result, CrossbarUsage crossbarCapacity) {
+void verifySchedule(const ComputeGraph& graph, const MergeScheduleResult& result, unsigned long crossbarCapacity) {
   llvm::DenseMap<size_t, std::vector<std::pair<size_t, size_t>>> tasksByCpu;
   tasksByCpu.reserve(result.cpuToLastComputeMap.size());
 
@@ -51,11 +48,11 @@ void verifySchedule(const ComputeGraph& graph, const MergeScheduleResult& result
       return lhs.second < rhs.second;
     });
 
-    CrossbarUsage usedCrossbars = 0;
+    unsigned int usedCrossbars = 0;
     for (size_t slot = 0; slot < scheduledTasks.size(); ++slot) {
       if (scheduledTasks[slot].first != slot)
         llvm::report_fatal_error("merge scheduling: CPU slots are not contiguous");
-      usedCrossbars = addOrMax(usedCrossbars, graph.nodes[scheduledTasks[slot].second].crossbarUsage);
+      usedCrossbars = addOrMax(usedCrossbars, graph.nodes[scheduledTasks[slot].second].crossbarUsage.size());
       if (usedCrossbars > crossbarCapacity)
         llvm::report_fatal_error("merge scheduling: CPU crossbar capacity exceeded");
     }
@@ -115,18 +112,10 @@ MergeScheduleResult MergeSchedulingAnalysis::run() {
   if (options.kind == MergeSchedulerKind::Peft) {
     schedule = runPeftScheduler(graph,
                                 PeftScheduleOptions {options.processorCount,
-                                                     static_cast<CrossbarUsage>(crossbarCountInCore.getValue()),
+                                                     static_cast<unsigned long>(crossbarCountInCore.getValue()),
                                                      entryOp->getContext()});
   }
-  else {
-    schedule = runDcpScheduler(graph,
-                               DcpScheduleOptions {options.processorCount,
-                                                   dcpCriticalWindowSize.getValue(),
-                                                   options.allowDcpFallbackForAutoCoreCount},
-                               entryOp->getContext());
-  }
-
-  verifySchedule(graph, schedule, static_cast<CrossbarUsage>(crossbarCountInCore.getValue()));
+  verifySchedule(graph, schedule, static_cast<unsigned long>(crossbarCountInCore.getValue()));
   return schedule;
 }
 
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/MergeSchedulingAnalysis.hpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/MergeSchedulingAnalysis.hpp
index 53b7fe5..971544f 100644
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/MergeSchedulingAnalysis.hpp
+++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/MergeSchedulingAnalysis.hpp
@@ -10,14 +10,12 @@ namespace onnx_mlir {
 namespace spatial {
 
 enum class MergeSchedulerKind {
-  Dcp,
   Peft,
 };
 
 struct MergeSchedulingOptions {
   MergeSchedulerKind kind = MergeSchedulerKind::Peft;
   size_t processorCount = 0;
-  bool allowDcpFallbackForAutoCoreCount = true;
 };
 
 class MergeSchedulingAnalysis {
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/PeftScheduler.cpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/PeftScheduler.cpp
index fe0d639..b51896f 100644
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/PeftScheduler.cpp
+++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/PeftScheduler.cpp
@@ -1,6 +1,7 @@
 #include "mlir/IR/Threading.h"
 
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormatVariadic.h"
 
@@ -161,7 +162,7 @@ MergeScheduleResult runPeftScheduler(const ComputeGraph& graph, const PeftSchedu
   }
 
   std::vector<char> scheduled(nodeCount, false);
-  std::vector<CrossbarUsage> processorCrossbars(processorCount, 0);
+  std::vector<CrossbarUsage> processorCrossbars(processorCount, llvm::SmallPtrSet<mlir::Value, 6> {});
   std::vector<ScheduledTask> schedules(nodeCount);
   std::vector<std::vector<size_t>> tasksByProcessor(processorCount);
 
@@ -179,8 +180,14 @@ MergeScheduleResult runPeftScheduler(const ComputeGraph& graph, const PeftSchedu
     bool crossbarRejected = false;
 
     for (size_t processor = 0; processor < processorCount; ++processor) {
-      if (graph.nodes[task].crossbarUsage != 0
-          && addOrMax(processorCrossbars[processor], graph.nodes[task].crossbarUsage) > options.crossbarCapacity) {
+      if (graph.nodes[task].crossbarUsage.size() != 0
+          && !llvm::all_of(graph.nodes[task].crossbarUsage,
+                           [&processorCrossbars, processor](mlir::Value nodeCrossbar) {
+                             return llvm::is_contained(processorCrossbars[processor], nodeCrossbar);
+                           })
+          && addOrMax(processorCrossbars[processor].size(), graph.nodes[task].crossbarUsage.size())
+               > options.crossbarCapacity) {
+
         crossbarRejected = true;
         continue;
       }
@@ -245,7 +252,7 @@ MergeScheduleResult runPeftScheduler(const ComputeGraph& graph, const PeftSchedu
     schedules[task] = {bestProcessor, bestEst, bestEft};
     scheduled[task] = true;
     ++scheduledCount;
-    processorCrossbars[bestProcessor] = addOrMax(processorCrossbars[bestProcessor], graph.nodes[task].crossbarUsage);
+    processorCrossbars[bestProcessor].insert_range(graph.nodes[task].crossbarUsage);
 
     // 3. CRITICAL FIX: Topological Append
     // Because the readyQueue pops in strict topological order, simply pushing to the
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/PeftScheduler.hpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/PeftScheduler.hpp
index b9bd170..0f20c83 100644
--- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/PeftScheduler.hpp
+++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/PeftScheduler.hpp
@@ -10,7 +10,7 @@ namespace spatial {
 
 struct PeftScheduleOptions {
   size_t processorCount = 0;
-  CrossbarUsage crossbarCapacity = 0;
+  unsigned long crossbarCapacity = 0;
   mlir::MLIRContext* context = nullptr;
 };
 
diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/Utils.hpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/Utils.hpp
new file mode 100644
index 0000000..e3b4b28
--- /dev/null
+++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/Scheduling/Utils.hpp
@@ -0,0 +1,58 @@
+#pragma once
+
+#include "mlir/IR/BuiltinTypeInterfaces.h"
+
+#include "llvm/Support/Casting.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <list>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "src/Accelerators/PIM/Common/LabeledList.hpp"
+#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
+
+using CPU = int;
+using Weight = unsigned long long;
+using Time = unsigned long long;
+
+
+template <typename T>
+inline T checkedAdd(T lhs, T rhs) {
+  static_assert(std::is_unsigned_v<T>, "checkedAdd only supports unsigned types");
+  assert(lhs <= std::numeric_limits<T>::max() - rhs && "unsigned addition overflow");
+  return lhs + rhs;
+}
+
+template <typename T>
+inline T checkedMultiply(T lhs, T rhs) {
+  static_assert(std::is_unsigned_v<T>, "checkedMultiply only supports unsigned types");
+  if (lhs == 0 || rhs == 0)
+    return 0;
+  assert(lhs <= std::numeric_limits<T>::max() / rhs && "unsigned multiplication overflow");
+  return lhs * rhs;
+}
+
+template <typename T>
+inline T addOrMax(T lhs, T rhs) {
+  static_assert(std::is_unsigned_v<T>, "addOrMax only supports unsigned types");
+  if (lhs == std::numeric_limits<T>::max() || rhs == std::numeric_limits<T>::max())
+    return std::numeric_limits<T>::max();
+  return checkedAdd(lhs, rhs);
+}
+
+template <typename T>
+inline T subtractOrZero(T lhs, T rhs) {
+  static_assert(std::is_unsigned_v<T>, "subtractOrZero only supports unsigned types");
+  if (lhs == std::numeric_limits<T>::max())
+    return lhs;
+  if (rhs == std::numeric_limits<T>::max() || lhs <= rhs)
+    return 0;
+  return lhs - rhs;
+}
+
+inline Time slackOrZero(Time earliestStart, Time latestStart) { return subtractOrZero(latestStart, earliestStart); }
+
diff --git a/test/PIM/CMakeLists.txt b/test/PIM/CMakeLists.txt
index b7735ec..7e9be67 100644
--- a/test/PIM/CMakeLists.txt
+++ b/test/PIM/CMakeLists.txt
@@ -30,10 +30,3 @@ add_pim_unittest(LabeledListTest
   OMPimCommon
 )
 
-add_pim_unittest(DCPTest
-  DCPTest.cpp
-
-  LINK_LIBS PRIVATE
-  OMPimCommon
-  SpatialOps
-)
diff --git a/test/PIM/DCPTest.cpp b/test/PIM/DCPTest.cpp
deleted file mode 100644
index 1b44123..0000000
--- a/test/PIM/DCPTest.cpp
+++ /dev/null
@@ -1,531 +0,0 @@
-#include <algorithm>
-#include <cassert>
-#include <cstdlib>
-#include <filesystem>
-#include <fstream>
-#include <initializer_list>
-#include <iostream>
-#include <limits>
-#include <optional>
-#include <unordered_map>
-#include <vector>
-
-#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
-#include "src/Accelerators/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.hpp"
-#include "src/Compiler/CompilerOptions.hpp"
-
-namespace {
-
-struct ExpectedScheduledTask {
-  size_t nodeIndex;
-  Time aest;
-  Time alst;
-  Weight weight;
-};
-
-struct ScheduledPlacement {
-  CPU cpu;
-  GraphDCP::ScheduledTaskInfo task;
-};
-
-std::filesystem::path getDcpTestOutputDir() { return std::filesystem::temp_directory_path() / "raptor-test-pim"; }
-
-void configureDcpDotOutput() {
-  auto outputDir = getDcpTestOutputDir();
-  std::error_code errorCode;
-  std::filesystem::remove_all(outputDir, errorCode);
-  std::filesystem::create_directories(outputDir, errorCode);
-  assert(!errorCode);
-  onnx_mlir::outputBaseName = (outputDir / "DCPTest.mlir").string();
-}
-
-std::optional<std::filesystem::path> getLatestDcpDotFile() {
-  auto graphDir = getDcpTestOutputDir() / "dcp_graph";
-  if (!std::filesystem::exists(graphDir))
-    return std::nullopt;
-
-  std::optional<std::filesystem::path> latestDot;
-  for (const auto& entry : std::filesystem::directory_iterator(graphDir)) {
-    if (!entry.is_regular_file() || entry.path().extension() != ".dot")
-      continue;
-    if (!latestDot || entry.path().filename() > latestDot->filename())
-      latestDot = entry.path();
-  }
-  return latestDot;
-}
-
-void dumpDcpFailureArtifacts() {
-  auto latestDot = getLatestDcpDotFile();
-  if (!latestDot) {
-    std::cerr << "No DCP dot file was produced.\n";
-    return;
-  }
-
-  std::cerr << "DCP dot file: " << latestDot->string() << '\n';
-  std::ifstream dotFile(*latestDot);
-  if (!dotFile.is_open()) {
-    std::cerr << "Failed to open DCP dot file.\n";
-    return;
-  }
-
-  std::cerr << dotFile.rdbuf();
-}
-
-void printCpuSchedule(GraphDCP& graph, CPU cpu) {
-  auto actualTasks = graph.getScheduledTasks(cpu);
-  std::cerr << "CPU " << cpu << " actual schedule:\n";
-  for (const auto& task : actualTasks) {
-    std::cerr << "  " << task.nodeIndex << ") aest: " << task.aest << " alst: " << task.alst
-              << " weight: " << task.weight << '\n';
-  }
-}
-
-void printGraphSchedule(GraphDCP& graph) {
-  for (CPU cpu = 0; cpu < graph.cpuCount(); ++cpu)
-    printCpuSchedule(graph, cpu);
-}
-
-bool checkScheduledTasks(GraphDCP& graph, CPU cpu, std::initializer_list<ExpectedScheduledTask> expectedTasks) {
-  auto actualTasks = graph.getScheduledTasks(cpu);
-  if (actualTasks.size() != expectedTasks.size()) {
-    printCpuSchedule(graph, cpu);
-    return false;
-  }
-
-  auto expectedIt = expectedTasks.begin();
-  for (const auto& actualTask : actualTasks) {
-    if (actualTask.nodeIndex != expectedIt->nodeIndex || actualTask.aest != expectedIt->aest
-        || actualTask.alst != expectedIt->alst || actualTask.weight != expectedIt->weight) {
-      printCpuSchedule(graph, cpu);
-      return false;
-    }
-    ++expectedIt;
-  }
-  return true;
-}
-
-std::unordered_map<size_t, ScheduledPlacement> collectScheduledPlacements(GraphDCP& graph) {
-  std::unordered_map<size_t, ScheduledPlacement> scheduledPlacements;
-  for (CPU cpu = 0; cpu < graph.cpuCount(); ++cpu) {
-    for (const auto& task : graph.getScheduledTasks(cpu)) {
-      auto [it, inserted] = scheduledPlacements.emplace(task.nodeIndex, ScheduledPlacement {cpu, task});
-      assert(inserted && "task scheduled multiple times");
-      (void) it;
-    }
-  }
-  return scheduledPlacements;
-}
-
-bool checkAllTasksScheduled(GraphDCP& graph, size_t expectedTaskCount) {
-  auto scheduledPlacements = collectScheduledPlacements(graph);
-  if (scheduledPlacements.size() != expectedTaskCount) {
-    std::cerr << "Expected " << expectedTaskCount << " scheduled tasks, got " << scheduledPlacements.size() << "\n";
-    printGraphSchedule(graph);
-    return false;
-  }
-  return true;
-}
-
-bool checkCpuSchedulesDoNotOverlap(GraphDCP& graph) {
-  for (CPU cpu = 0; cpu < graph.cpuCount(); ++cpu) {
-    auto scheduledTasks = graph.getScheduledTasks(cpu);
-    Time previousCompletion = 0;
-    bool firstTask = true;
-    for (const auto& task : scheduledTasks) {
-      Time completion = addOrMax(task.aest, task.weight);
-      if (task.aest > task.alst) {
-        std::cerr << "Task " << task.nodeIndex << " on CPU " << cpu << " has aest > alst\n";
-        printCpuSchedule(graph, cpu);
-        return false;
-      }
-      if (!firstTask && task.aest < previousCompletion) {
-        std::cerr << "CPU " << cpu << " has overlapping tasks\n";
-        printCpuSchedule(graph, cpu);
-        return false;
-      }
-      previousCompletion = completion;
-      firstTask = false;
-    }
-  }
-  return true;
-}
-
-bool checkDependencyConstraints(GraphDCP& graph, llvm::ArrayRef<IndexedEdge> edges) {
-  auto scheduledPlacements = collectScheduledPlacements(graph);
-  for (auto [parentIndex, childIndex, transferCost] : edges) {
-    const auto& parent = scheduledPlacements.at(parentIndex);
-    const auto& child = scheduledPlacements.at(childIndex);
-    Time requiredStart = addOrMax(parent.task.aest, parent.task.weight);
-    if (parent.cpu != child.cpu)
-      requiredStart = addOrMax(requiredStart, static_cast<Weight>(transferCost));
-    if (child.task.aest < requiredStart) {
-      std::cerr << "Dependency violation for edge " << parentIndex << " -> " << childIndex << '\n';
-      printGraphSchedule(graph);
-      return false;
-    }
-  }
-  return true;
-}
-
-Time getMaxCompletion(GraphDCP& graph) {
-  Time maxCompletion = 0;
-  for (CPU cpu = 0; cpu < graph.cpuCount(); ++cpu)
-    for (const auto& task : graph.getScheduledTasks(cpu))
-      maxCompletion = std::max(maxCompletion, addOrMax(task.aest, task.weight));
-  return maxCompletion;
-}
-
-int testDCPGraphSingleNode() {
-  std::cout << "testDCPGraphSingleNode:" << std::endl;
-  configureDcpDotOutput();
-
-  const std::vector<Weight> nodeWeights = {15};
-  GraphDCP graph(nodeWeights, {});
-  graph.runDcp();
-
-  if (graph.cpuCount() != 1) {
-    std::cerr << "Expected exactly 1 CPU, got " << graph.cpuCount() << "\n";
-    dumpDcpFailureArtifacts();
-    return 1;
-  }
-  if (!checkScheduledTasks(graph,
-                           0,
-                           {
-                             {0, 0, 0, 15},
-  })) {
-    dumpDcpFailureArtifacts();
-    return 1;
-  }
-  return 0;
-}
-
-int testDCPGraphLinearChain() {
-  std::cout << "testDCPGraphLinearChain:" << std::endl;
-  configureDcpDotOutput();
-
-  const std::vector<Weight> nodeWeights = {10, 20, 5};
-  const std::vector<IndexedEdge> edges = {
-    {0, 1, 7},
-    {1, 2, 9},
-  };
-
-  GraphDCP graph(nodeWeights, edges);
-  graph.runDcp();
-
-  if (graph.cpuCount() != 1) {
-    std::cerr << "Expected a linear chain to stay on one CPU, got " << graph.cpuCount() << "\n";
-    dumpDcpFailureArtifacts();
-    return 1;
-  }
-  if (!checkScheduledTasks(graph,
-                           0,
-                           {
-                             {0, 0,  0,  10},
-                             {1, 10, 10, 20},
-                             {2, 30, 30, 5 },
-  })) {
-    dumpDcpFailureArtifacts();
-    return 1;
-  }
-  if (!checkCpuSchedulesDoNotOverlap(graph) || !checkDependencyConstraints(graph, edges)) {
-    dumpDcpFailureArtifacts();
-    return 1;
-  }
-  return 0;
-}
-
-int testDCPGraphFixture() {
-  std::cout << "testDCPGraphFixture:" << std::endl;
-  configureDcpDotOutput();
-
-  const std::vector<Weight> nodeWeights = {
-    80,
-    40,
-    40,
-    40,
-    40,
-    40,
-    60,
-    30,
-    30,
-    30,
-    30,
-    40,
-    20,
-    20,
-    20,
-    20,
-    10,
-    10,
-  };
-  const std::vector<IndexedEdge> edges = {
-    {0,  1,  3  },
-    {0,  1,  120},
-    {0,  2,  120},
-    {0,  3,  120},
-    {0,  4,  120},
-    {0,  5,  120},
-    {0,  6,  120},
-    {2,  6,  80 },
-    {2,  7,  80 },
-    {3,  8,  80 },
-    {4,  9,  80 },
-    {5,  10, 80 },
-    {6,  7,  120},
-    {6,  8,  120},
-    {6,  9,  120},
-    {6,  10, 120},
-    {6,  11, 120},
-    {8,  11, 80 },
-    {8,  12, 80 },
-    {9,  13, 80 },
-    {10, 14, 80 },
-    {11, 12, 120},
-    {11, 13, 120},
-    {11, 14, 120},
-    {11, 15, 120},
-    {13, 15, 80 },
-    {13, 16, 80 },
-    {14, 17, 80 },
-    {15, 16, 120},
-    {15, 17, 120},
-  };
-
-  GraphDCP graph(nodeWeights, {});
-  for (auto [parent, child, weight] : edges)
-    graph.makeEdge(parent, child, weight);
-
-  graph.runDcp();
-  if (graph.cpuCount() != 4) {
-    dumpDcpFailureArtifacts();
-    return 1;
-  }
-  if (!checkScheduledTasks(graph,
-                           3,
-                           {
-                             {1, 200, 400, 40},
-  })) {
-    dumpDcpFailureArtifacts();
-    return 1;
-  }
-  if (!checkScheduledTasks(graph,
-                           2,
-                           {
-                             {5,  200, 260, 40},
-                             {10, 300, 300, 30},
-  })) {
-    dumpDcpFailureArtifacts();
-    return 1;
-  }
-  if (!checkScheduledTasks(graph,
-                           1,
-                           {
-                             {4, 200, 210, 40},
-                             {7, 300, 410, 30},
-  })) {
-    dumpDcpFailureArtifacts();
-    return 1;
-  }
-  if (!checkScheduledTasks(graph,
-                           0,
-                           {
-                             {0,  0,   0,   80},
-                             {2,  80,  80,  40},
-                             {6,  120, 120, 60},
-                             {3,  180, 200, 40},
-                             {8,  220, 240, 30},
-                             {11, 250, 270, 40},
-                             {12, 290, 310, 20},
-                             {9,  320, 330, 30},
-                             {13, 350, 360, 20},
-                             {15, 370, 380, 20},
-                             {16, 390, 400, 10},
-                             {14, 410, 410, 20},
-                             {17, 430, 430, 10},
-  })) {
-    dumpDcpFailureArtifacts();
-    return 1;
-  }
-  if (!checkAllTasksScheduled(graph, nodeWeights.size()) || !checkCpuSchedulesDoNotOverlap(graph)
-      || !checkDependencyConstraints(graph, edges)) {
-    dumpDcpFailureArtifacts();
-    return 1;
-  }
-  return 0;
-}
-
-int testDCPGraphMaxCPUs() {
-  std::cout << "testDCPGraphMaxCPUs:" << std::endl;
-  configureDcpDotOutput();
-
-  const std::vector<Weight> nodeWeights = {20, 10, 10, 10, 10, 10, 10};
-  const std::vector<IndexedEdge> edges = {
-    {0, 1, 0},
-    {0, 2, 0},
-    {0, 3, 0},
-    {0, 4, 0},
-    {0, 5, 0},
-    {0, 6, 0},
-  };
-
-  GraphDCP graph(nodeWeights, edges);
-  graph.setMaxCpuCount(2);
-  graph.runDcp();
-
-  if (graph.cpuCount() != 2) {
-    std::cerr << "Expected exactly 2 CPUs with maxCpuCount=2, got " << graph.cpuCount() << "\n";
-    dumpDcpFailureArtifacts();
-    return 1;
-  }
-  if (!checkAllTasksScheduled(graph, nodeWeights.size()) || !checkCpuSchedulesDoNotOverlap(graph)
-      || !checkDependencyConstraints(graph, edges)) {
-    dumpDcpFailureArtifacts();
-    return 1;
-  }
-  if (getMaxCompletion(graph) > 50) {
-    std::cerr << "Expected makespan <= 50 under maxCpuCount=2, got " << getMaxCompletion(graph) << "\n";
-    dumpDcpFailureArtifacts();
-    return 1;
-  }
-
-  return 0;
-}
-
-int testDCPGraphSingleCpuCap() {
-  std::cout << "testDCPGraphSingleCpuCap:" << std::endl;
-  configureDcpDotOutput();
-
-  const std::vector<Weight> nodeWeights = {20, 10, 10, 10};
-  const std::vector<IndexedEdge> edges = {
-    {0, 1, 0},
-    {0, 2, 0},
-    {0, 3, 0},
-  };
-
-  GraphDCP graph(nodeWeights, edges);
-  graph.setMaxCpuCount(1);
-  graph.runDcp();
-
-  if (graph.cpuCount() != 1) {
-    std::cerr << "Expected exactly 1 CPU with maxCpuCount=1, got " << graph.cpuCount() << "\n";
-    dumpDcpFailureArtifacts();
-    return 1;
-  }
-  if (!checkAllTasksScheduled(graph, nodeWeights.size()) || !checkCpuSchedulesDoNotOverlap(graph)
-      || !checkDependencyConstraints(graph, edges)) {
-    dumpDcpFailureArtifacts();
-    return 1;
-  }
-  if (getMaxCompletion(graph) != 50) {
-    std::cerr << "Expected makespan 50 under maxCpuCount=1, got " << getMaxCompletion(graph) << "\n";
-    dumpDcpFailureArtifacts();
-    return 1;
-  }
-
-  return 0;
-}
-
-int testDCPGraphDiamondDependencies() {
-  std::cout << "testDCPGraphDiamondDependencies:" << std::endl;
-  configureDcpDotOutput();
-
-  const std::vector<Weight> nodeWeights = {15, 10, 12, 20};
-  const std::vector<IndexedEdge> edges = {
-    {0, 1, 5},
-    {0, 2, 7},
-    {1, 3, 3},
-    {2, 3, 2},
-  };
-
-  GraphDCP graph(nodeWeights, edges);
-  graph.runDcp();
-
-  if (!checkAllTasksScheduled(graph, nodeWeights.size()) || !checkCpuSchedulesDoNotOverlap(graph)
-      || !checkDependencyConstraints(graph, edges)) {
-    dumpDcpFailureArtifacts();
-    return 1;
-  }
-
-  auto scheduledPlacements = collectScheduledPlacements(graph);
-  const auto& sink = scheduledPlacements.at(3).task;
-  if (sink.aest < 27) {
-    std::cerr << "Expected sink node to start no earlier than the longest parent path, got " << sink.aest << "\n";
-    dumpDcpFailureArtifacts();
-    return 1;
-  }
-
-  return 0;
-}
-
-// crossbarSize=4, crossbarCount=2 => capacity = 4*4*2 = 32.
-// Each task with crossbarUsage=1 needs footprint = 4*4 = 16, so at most 1 task
-// can fit per CPU (16+16 = 32 >= capacity). The scheduler must open a fresh CPU
-// for each task; all three end up on separate CPUs with their base weight.
-int testDCPGraphCrossbarExhaustion() {
-  std::cout << "testDCPGraphCrossbarExhaustion:" << std::endl;
-  configureDcpDotOutput();
-
-  const size_t savedCrossbarSize = onnx_mlir::crossbarSize.getValue();
-  const size_t savedCrossbarCount = onnx_mlir::crossbarCountInCore.getValue();
-  onnx_mlir::crossbarSize = 4;
-  onnx_mlir::crossbarCountInCore = 2;
-
-  auto restoreCrossbarOptions = [&]() {
-    onnx_mlir::crossbarSize = savedCrossbarSize;
-    onnx_mlir::crossbarCountInCore = savedCrossbarCount;
-  };
-
-  const std::vector<Weight> nodeWeights = {10, 10, 10};
-  const std::vector<CrossbarUsage> nodeCrossbarUsage = {1, 1, 1};
-  GraphDCP graph(nodeWeights, {}, {},nodeCrossbarUsage);
-  graph.setMaxCpuCount(3);
-  graph.runDcp();
-
-  if (graph.cpuCount() != 3) {
-    restoreCrossbarOptions();
-    std::cerr << "Expected 3 CPUs (one per task due to crossbar limit), got " << graph.cpuCount() << "\n";
-    dumpDcpFailureArtifacts();
-    return 1;
-  }
-
-  int failures = 0;
-  for (CPU c = 0; c < 3; c++) {
-    auto scheduledTasks = graph.getScheduledTasks(c);
-    if (scheduledTasks.size() != 1) {
-      std::cerr << "Expected exactly 1 task on CPU " << c << ", got " << scheduledTasks.size() << "\n";
-      printCpuSchedule(graph, c);
-      failures++;
-      continue;
-    }
-    if (scheduledTasks[0].weight != 10) {
-      std::cerr << "Expected weight=10 on CPU " << c << ", got " << scheduledTasks[0].weight << "\n";
-      printCpuSchedule(graph, c);
-      failures++;
-    }
-  }
-
-  restoreCrossbarOptions();
-  if (failures) dumpDcpFailureArtifacts();
-  return failures;
-}
-
-} // namespace
-
-int main(int argc, char* argv[]) {
-  (void) argc;
-  (void) argv;
-
-  int failures = 0;
-  failures += testDCPGraphSingleNode();
-  failures += testDCPGraphLinearChain();
-  failures += testDCPGraphFixture();
-  failures += testDCPGraphMaxCPUs();
-  failures += testDCPGraphSingleCpuCap();
-  failures += testDCPGraphDiamondDependencies();
-  failures += testDCPGraphCrossbarExhaustion();
-  if (failures != 0) {
-    std::cerr << failures << " test failures\n";
-    return EXIT_FAILURE;
-  }
-  return EXIT_SUCCESS;
-}
diff --git a/validation/validate.py b/validation/validate.py
index 9bfb371..79abfd3 100644
--- a/validation/validate.py
+++ b/validation/validate.py
@@ -65,7 +65,7 @@ def main():
     ap.add_argument("--crossbar-count", type=int, default=8)
     ap.add_argument("--core-count", type=int, default=None,
                     help="Core count to pass to Raptor. Required for PIM validation.")
-    ap.add_argument("--pim-merge-scheduler", choices=("peft", "dcp"), default="peft",
+    ap.add_argument("--pim-merge-scheduler", choices=("peft"), default="peft",
                     help="Scheduler used by the Spatial merge-compute-nodes pass.")
     ap.add_argument("--command-timeout-seconds", type=float, default=1000000.0,
                     help="Per-subprocess timeout in seconds for compiler, runner, and simulator commands.")