From 15e8edb9c47e2e701c628de11d00ce9e9c015614 Mon Sep 17 00:00:00 2001 From: NiccoloN Date: Sat, 25 Apr 2026 19:24:09 +0200 Subject: [PATCH] better spat computes merging --- README.md | 154 +++++ src/PIM/Compiler/PimCompilerOptions.cpp | 4 +- .../ONNXToSpatial/ONNXToSpatialPass.cpp | 104 +--- src/PIM/Conversion/SpatialToPim/Common.cpp | 19 +- .../DCPGraph/DCPAnalysis.cpp | 346 ++++++++--- .../MergeComputeNodes/DCPGraph/Graph.cpp | 514 ++++++++++++--- .../MergeComputeNodes/DCPGraph/Graph.hpp | 31 +- .../MergeComputeNodes/DCPGraph/GraphDebug.cpp | 41 +- .../MergeComputeNodes/DCPGraph/GraphDebug.hpp | 5 +- .../MergeComputeNodesPass.cpp | 585 +++++++++++++++++- test/PIM/DCPTest.cpp | 43 +- 11 files changed, 1477 insertions(+), 369 deletions(-) diff --git a/README.md b/README.md index 3d26525..e973656 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,159 @@ # Raptor +Raptor is a domain-specific MLIR compiler for neural networks (ONNX format) +targeting in-memory computing / processing-in-memory (PIM) architectures. +It progressively lowers ONNX-MLIR through a set of MLIR dialects down to +target-specific artifacts (currently JSON code for the `pimsim-nn` simulator). + +## Overview + +PIM architectures perform most of the computation directly in memory. +Raptor's first supported target is `pimsim-nn`, which simulates a chip with: +- a shared host memory, +- a number of cores that do most of the computation directly in their memory + (vector ops, vmm/mvm on ReRAM crossbars), +- no branching instructions (branchless architecture) and no hardware loop + support — any repeated work (e.g. convolutions) must be unrolled into + explicit per-iteration instructions. + +Because of this, the amount of emitted instructions explodes quickly and the +compiler must optimize aggressively at every stage to keep compilation +tractable. + +A second target, `PulPim`, is planned for an accelerator with RISC-V cores +each carrying its own in-memory computing unit and crossbars. 
It will live in +a dedicated dialect (future work). + +### Targets and simulators + +`pimsim-nn` (under `backend-simulators/pim/pimsim-nn`) is used for +**performance** estimates (latency, energy), but does not functionally execute +the JSON code it consumes. To validate the numerical correctness of the JSON +code produced by Raptor (or, for comparison, by the `pimcomp` compiler), we use +a Rust simulator we maintain in-tree at +`backend-simulators/pim/pim-simulator`. + +## Compilation pipeline + +The PIM-related sources live under `src/PIM` and the tests under `test/PIM`. +When working on this codebase, most changes should stay confined to those +trees (you only need to look outside, e.g. at `onnx-mlir` or `llvm`, for +framework-level details). + +High-level lowering flow: + +``` +ONNX-MLIR ──► Spatial ──► Pim (tensor) ──► Pim (bufferized) ──► PIM JSON +``` + +1. **ONNX → Spatial** (`src/PIM/Conversion/ONNXToSpatial`). + Lowers ONNX ops into the `spat` dialect (`src/PIM/Dialect/Spatial`). + Spatial models a high-level spatial in-memory accelerator: vmm/mvm + operations are accelerated by storing a constant RHS matrix into a + crossbar. Crossbars cannot be re-programmed during execution, have a + limited fixed size, and there is a limited number of them per core. + Conversion patterns are split by op family under + `Conversion/ONNXToSpatial/Patterns/{Math,NN,Tensor}` (Conv, Gemm, MatMul, + Elementwise, ReduceMean, Pool, Relu, Sigmoid, Softmax, Concat, Gather, + Reshape, Resize, Split). + +2. **Spatial → Pim** (`src/PIM/Conversion/SpatialToPim`). + Lowers Spatial to the `pim` dialect (`src/PIM/Dialect/Pim`), which + materializes PIM cores (`pim.core`), inter-core communication + (`pim.send` / `pim.receive`), halts, and crossbar-level operations. + +3. **Merge compute nodes** (`src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes`). 
+ A DCP-inspired heuristic (Dynamic Critical Path — introduced by Kwok & + Ahmad, IEEE TPDS 1996; see also the workflow-scheduling variant in + [DCP-eScience2007](https://clouds.cis.unimelb.edu.au/papers/DCP-eScience2007.pdf)) + that coarsens the virtual node graph and decides how to group compute + nodes onto cores. Our implementation is only DCP-*inspired*: it is a + heuristic with different assumptions from the paper (different cost + model, constraints from crossbar capacity / core resources, and a + windowed coarsening loop instead of full-graph reprioritization). The + `dcp-critical-window-size` option controls how many lowest-slack virtual + nodes each coarsening iteration considers (0 = legacy full-graph + analysis). Related sources: `DCPGraph/DCPAnalysis.cpp`, `Graph.cpp/.hpp`, + `MergeComputeNodesPass.cpp`. + +4. **Bufferization** (`src/PIM/Dialect/Pim/Transforms/Bufferization`). + Converts tensor-semantics PIM IR into memref-semantics PIM IR using the + standard MLIR `BufferizableOpInterface` machinery + (`OpBufferizationInterfaces.*`, `PimBufferization.td`). + +5. **PIM code generation** (`src/PIM/Pass/PimCodegen`): + - `HostConstantFolding` — folds host-side constants. + - `MaterializeHostConstantsPass` — materializes the remaining host + constants for emission. + - `VerificationPass` — checks invariants before emission. + - `EmitPimJsonPass` — emits the final PIM JSON consumed by `pimsim-nn` + and `pim-simulator`. + +Supporting pieces: +- `src/PIM/Compiler` — PIM-specific compiler options (crossbar size/count, + core count, DCP window, experimental conv impl, concat error handling, …) + and `PimCodeGen` entry points. +- `src/PIM/Common` — shared utilities (`PimCommon`, `LabeledList`). +- `src/PIM/Pass` — auxiliary passes (`MessagePass`, `CountInstructionPass`) + and the `PIMPasses.h` registry used by `PimAccelerator`. +- `src/PIM/PimAccelerator.{cpp,hpp}` — accelerator entry point: registers + dialects, passes, and plugs Raptor into the ONNX-MLIR driver. 
+ +## Key compiler options + +Pass these on the `onnx-mlir` command line when compiling for PIM: + +- `--maccel=PIM` — select the PIM accelerator. +- `--EmitSpatial` / `--EmitPim` / `--EmitPimBufferized` / `--EmitPimCodegen` + — stop the pipeline at the requested stage (default: `EmitPimCodegen`). +- `--pim-only-codegen` — assume the input is already bufferized PIM IR and + run only the codegen tail. +- `--crossbar-size=` / `--crossbar-count=` — crossbar dimensions and + per-core count. +- `--core-count=` — number of cores (`-1` picks the minimum). +- `--dcp-critical-window-size=` — DCP coarsening window (0 = legacy). +- `--use-experimental-conv-impl` — alternative convolution lowering. +- `--ignore-concat-error` — soft-fail corner case in `ConcatOp`. + +## Validation + +Functional validation lives in `validation/` and drives the Rust +`pim-simulator` to compare Raptor's output against a reference. + +Per-operation validation (from `validation/`): + +``` +validate.py \ + --raptor-path ../cmake-build-release/Release/bin/onnx-mlir \ + --onnx-include-dir ../onnx-mlir/include +``` + +End-to-end network validation (example: first 4 layers of YOLOv11n): + +``` +validate.py \ + --raptor-path ../cmake-build-release/Release/bin/onnx-mlir \ + --onnx-include-dir ../onnx-mlir/include \ + --operations-dir ./networks/yolo11n/depth_04 \ + --crossbar-size 2048 +``` + +Available networks under `validation/networks/`: `vgg16`, `yolo11n`. +Available operations under `validation/operations/`: `add`, `conv`, `div`, +`gather`, `gemm`, `gemv`, `mul`, `pool`, `reduce_mean`, `relu`, `resize`, +`sigmoid`, `softmax`, `split`. + +## Rebuilding + +Release build (fast), run from the repository root: + +``` +cmake --build ./cmake-build-release --target onnx-mlir -j 30 +``` + +A slower debug build is also available — configure it the same way but with +`-DCMAKE_BUILD_TYPE=Debug` (see installation instructions below). 
+ ## Build ### Protobuf diff --git a/src/PIM/Compiler/PimCompilerOptions.cpp b/src/PIM/Compiler/PimCompilerOptions.cpp index 7f4b18c..1b3288a 100644 --- a/src/PIM/Compiler/PimCompilerOptions.cpp +++ b/src/PIM/Compiler/PimCompilerOptions.cpp @@ -41,7 +41,7 @@ llvm::cl::opt crossbarSize("crossbar-size", llvm::cl::desc("Width and heigth of a single crossbar"), llvm::cl::init(2)); llvm::cl::opt - crossbarCountInCore("crossbar-count", llvm::cl::desc("Number of crossbars in each core"), llvm::cl::init(2)); + crossbarCountInCore("crossbar-count", llvm::cl::desc("Number of crossbars in each core"), llvm::cl::init(256)); llvm::cl::opt coresCount("core-count", llvm::cl::desc("Number of cores in the chip. `-1` to use the minimum amount of cores."), @@ -51,7 +51,7 @@ llvm::cl::opt dcpCriticalWindowSize( "dcp-critical-window-size", llvm::cl::desc("Number of lowest-slack virtual nodes considered by each DCP coarsening iteration. " "Use 0 to run the legacy full-graph DCP analysis."), - llvm::cl::init(1024)); + llvm::cl::init(4000)); llvm::cl::opt ignoreConcatError("ignore-concat-error", diff --git a/src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPass.cpp b/src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPass.cpp index 28e7078..6fc2240 100644 --- a/src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPass.cpp +++ b/src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPass.cpp @@ -8,7 +8,6 @@ #include "mlir/Transforms/Passes.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" @@ -24,8 +23,6 @@ #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Patterns.hpp" #include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp" #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp" -#include "src/Accelerators/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.hpp" -#include "src/Accelerators/PIM/Pass/PIMPasses.h" #include "src/Compiler/CompilerOptions.hpp" #include 
"src/Dialect/ONNX/ONNXOps.hpp" @@ -52,7 +49,6 @@ struct ONNXToSpatialPass : PassWrapper(inst)) { auto sources = toRemoveOp.getInputs(); rewriter.setInsertionPointAfter(toRemoveOp); - if (llvm::any_of( - sources, [](auto source) { return isa_and_present(source.getDefiningOp()); })) { + if (llvm::any_of(sources, + [](auto source) { return isa_and_present(source.getDefiningOp()); })) { auto newCompute = spatial::SpatCompute::create(rewriter, loc, inst->getResultTypes(), sources); SmallVector sourceTypes; SmallVector sourceLoc; @@ -294,100 +288,6 @@ void ONNXToSpatialPass::encapsulateGlobalInstruction(func::FuncOp funcOp) { } } -void ONNXToSpatialPass::mergeTriviallyConnectedComputes(func::FuncOp funcOp) { - Location loc = funcOp.getLoc(); - IRRewriter rewriter(&getContext()); - SmallVector trivialComputes; - llvm::SmallSet toErase; - - for (auto compute : funcOp.getOps()) - if (compute->hasOneUse()) { - auto& use = *compute->getUses().begin(); - auto user = dyn_cast(use.getOwner()); - - if (user && user.getInputs().size() == 1 && use.getOperandNumber() >= user.getWeights().size()) - trivialComputes.push_back(compute); - } - - while (!trivialComputes.empty()) { - auto compute = trivialComputes.front(); - - if (compute.use_empty()) { - std::swap(trivialComputes.front(), trivialComputes.back()); - trivialComputes.pop_back(); - continue; - } - auto& computeUse = *compute->getUses().begin(); - auto child = cast(computeUse.getOwner()); - auto usedResult = cast(computeUse.get()).getResultNumber(); - auto childArgIndex = computeUse.getOperandNumber() - child.getWeights().size(); - - rewriter.setInsertionPointAfter(compute.getOperation()); - - auto newCompute = - spatial::SpatCompute::create(rewriter, loc, child.getResultTypes(), compute.getOperands()); - newCompute.getProperties().setOperandSegmentSizes( - {static_cast(compute.getWeights().size()), static_cast(compute.getInputs().size())}); - - IRMapping mapper; - auto weightMutableIter = newCompute.getWeightsMutable(); - for 
(auto weight : child.getWeights()) { - auto founded = llvm::find(newCompute.getWeights(), weight); - if (founded == newCompute.getWeights().end()) { - weightMutableIter.append(weight); - auto last = weightMutableIter.end(); - last = std::prev(last, 1); - mapper.map(weight, last->get()); - } - else { - mapper.map(weight, *founded); - } - } - - compute.getBodyRegion().cloneInto(&newCompute.getBodyRegion(), mapper); - auto newTerminator = newCompute.getBody().front().getTerminator(); - mapper.map(child.getBody().front().getArgument(childArgIndex), newTerminator->getOperand(usedResult)); - newTerminator->erase(); - rewriter.setInsertionPoint(&newCompute.getBody().front(), newCompute.getBody().front().end()); - for (auto& op : child.getBody().front()) { - auto newInst = rewriter.clone(op, mapper); - - if (auto vmOp = llvm::dyn_cast(newInst)) { - auto oldIndex = vmOp.getWeightIndex(); - auto newWeight = mapper.lookup(*std::next(child.getWeights().begin(), oldIndex)); - auto newIndex = std::distance(newCompute.getWeights().begin(), llvm::find(newCompute.getWeights(), newWeight)); - vmOp.setWeightIndex(newIndex); - } - if (auto vmOp = llvm::dyn_cast(newInst)) { - auto oldIndex = vmOp.getWeightIndex(); - auto newWeight = mapper.lookup(*std::next(child.getWeights().begin(), oldIndex)); - auto newIndex = std::distance(newCompute.getWeights().begin(), llvm::find(newCompute.getWeights(), newWeight)); - vmOp.setWeightIndex(newIndex); - } - } - - child.replaceAllUsesWith(newCompute); - toErase.insert(child); - - std::swap(trivialComputes.front(), trivialComputes.back()); - trivialComputes.pop_back(); - toErase.insert(compute); - - if (newCompute->hasOneUse()) { - auto& use = *newCompute->getUses().begin(); - auto user = dyn_cast(use.getOwner()); - if (user && user.getInputs().size() == 1 && use.getOperandNumber() >= user.getWeights().size()) - trivialComputes.push_back(newCompute); - } - } - - for (auto compute : toErase) { - for (Value result : compute->getResults()) - 
result.dropAllUses(); - compute.erase(); - } -} - void ONNXToSpatialPass::annotateWeightsConstants(func::FuncOp funcOp) const { funcOp.walk([&](arith::ConstantOp constantOp) { if (hasOnlySpatialMvmVmmWeightUses(constantOp.getResult())) diff --git a/src/PIM/Conversion/SpatialToPim/Common.cpp b/src/PIM/Conversion/SpatialToPim/Common.cpp index 5d4af51..0f6859f 100644 --- a/src/PIM/Conversion/SpatialToPim/Common.cpp +++ b/src/PIM/Conversion/SpatialToPim/Common.cpp @@ -96,8 +96,8 @@ bool hasSpatialChannelTargetCoreIdAttr(mlir::Value channel) { return channelNewOp && channelNewOp->hasAttr(kChannelTargetCoreIdAttrName); } -mlir::Value createPimReceiveFromSpatialChannel( - PatternRewriter& rewriter, Location loc, mlir::Value output, mlir::Value channel) { +mlir::Value +createPimReceiveFromSpatialChannel(PatternRewriter& rewriter, Location loc, mlir::Value output, mlir::Value channel) { mlir::Value outputBuffer = getBestOutputTensorFromOperandsOrAllocate(rewriter, output.getDefiningOp()); auto sizeAttr = getTensorSizeInBytesAttr(rewriter, output); auto sourceCoreIdAttr = getSpatialChannelSourceCoreIdAttr(rewriter, channel); @@ -127,6 +127,16 @@ SmallVector getOpOperandsSortedByUses(Operation* operation) { return map_to_vector(operandsAndUses, [](auto operandAndUse) { return operandAndUse.first; }); } +bool hasLaterUserInBlock(mlir::Value value, Operation* operation) { + for (Operation* user : value.getUsers()) { + if (user->getBlock() != operation->getBlock()) + return true; + if (operation->isBeforeInBlock(user)) + return true; + } + return false; +} + mlir::Value getBestOutputTensorFromOperandsOrAllocate(PatternRewriter& rewriter, Operation* operation) { assert("Only support operations with a single result" && operation->getNumResults() == 1); mlir::Value result = operation->getResult(0); @@ -134,8 +144,9 @@ mlir::Value getBestOutputTensorFromOperandsOrAllocate(PatternRewriter& rewriter, assert("Only support result ShapedType as result type" && isa(resultType)); 
SmallVector operands = getOpOperandsSortedByUses(operation); - auto validOperands = - make_filter_range(operands, [resultType](mlir::Value operand) { return operand.getType() == resultType; }); + auto validOperands = make_filter_range(operands, [operation, resultType](mlir::Value operand) { + return operand.getType() == resultType && !hasLaterUserInBlock(operand, operation); + }); auto bestOperand = validOperands.begin(); if (bestOperand != validOperands.end()) diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.cpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.cpp index 781b5bc..a9e5e6c 100644 --- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.cpp +++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.cpp @@ -3,15 +3,17 @@ #include "mlir/IR/Value.h" #include "mlir/IR/ValueRange.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_ostream.h" #include #include -#include #include #include -#include +#include #include #include @@ -47,11 +49,13 @@ struct TimingInfo { struct WindowScheduleResult { std::vector> mergeGroups; - bool usedAllAvailableCpus = false; + CPU cpuCount = 0; + size_t mergedNodeCount = 0; + size_t maxMergeGroupSize = 0; }; std::vector aggregateEdges(ArrayRef edges) { - std::map, Weight> edgeWeights; + llvm::DenseMap, Weight> edgeWeights; for (auto [start, end, weight] : edges) { size_t startIndex = static_cast(start); size_t endIndex = static_cast(end); @@ -59,11 +63,9 @@ std::vector aggregateEdges(ArrayRef edges) { continue; auto key = std::make_pair(startIndex, endIndex); Weight edgeWeight = static_cast(weight); - auto it = edgeWeights.find(key); - if (it == edgeWeights.end()) - edgeWeights.insert({key, edgeWeight}); - else - it->second = std::max(it->second, edgeWeight); + auto inserted = 
edgeWeights.try_emplace(key, edgeWeight); + if (!inserted.second) + inserted.first->second = std::max(inserted.first->second, edgeWeight); } std::vector aggregatedEdges; @@ -71,6 +73,11 @@ std::vector aggregateEdges(ArrayRef edges) { for (auto [key, weight] : edgeWeights) aggregatedEdges.push_back( {static_cast(key.first), static_cast(key.second), static_cast(weight)}); + llvm::sort(aggregatedEdges, [](const IndexedEdge& lhs, const IndexedEdge& rhs) { + if (std::get<0>(lhs) != std::get<0>(rhs)) + return std::get<0>(lhs) < std::get<0>(rhs); + return std::get<1>(lhs) < std::get<1>(rhs); + }); return aggregatedEdges; } @@ -157,10 +164,27 @@ TimingInfo computeTiming(const VirtualGraph& graph) { return timing; } -std::vector selectCriticalWindow(const TimingInfo& timing, size_t windowSize) { - std::vector selected(timing.aest.size()); - std::iota(selected.begin(), selected.end(), 0); - std::stable_sort(selected.begin(), selected.end(), [&](size_t lhs, size_t rhs) { +std::vector> buildUndirectedAdjacency(const VirtualGraph& graph) { + std::vector> adjacency(graph.nodes.size()); + for (auto [start, end, weight] : graph.edges) { + (void) weight; + size_t startIndex = static_cast(start); + size_t endIndex = static_cast(end); + assert(startIndex < graph.nodes.size() && endIndex < graph.nodes.size() && "virtual edge endpoint out of range"); + adjacency[startIndex].push_back(endIndex); + adjacency[endIndex].push_back(startIndex); + } + for (auto& neighbours : adjacency) { + llvm::sort(neighbours); + neighbours.erase(std::unique(neighbours.begin(), neighbours.end()), neighbours.end()); + } + return adjacency; +} + +std::vector selectCriticalWindow(const VirtualGraph& graph, const TimingInfo& timing, size_t windowSize) { + std::vector ranked(timing.aest.size()); + std::iota(ranked.begin(), ranked.end(), 0); + auto isHigherPriority = [&](size_t lhs, size_t rhs) { Time lhsSlack = slackOrZero(timing.aest[lhs], timing.alst[lhs]); Time rhsSlack = slackOrZero(timing.aest[rhs], 
timing.alst[rhs]); if (lhsSlack != rhsSlack) @@ -168,19 +192,83 @@ std::vector selectCriticalWindow(const TimingInfo& timing, size_t window if (timing.aest[lhs] != timing.aest[rhs]) return timing.aest[lhs] < timing.aest[rhs]; return lhs < rhs; - }); - selected.resize(std::min(windowSize, selected.size())); - return selected; -} + }; -std::vector getOriginalSignature(const VirtualGraph& graph, ArrayRef selectedNodes) { - std::vector signature; - for (size_t nodeIndex : selectedNodes) { - const VirtualNode& node = graph.nodes[nodeIndex]; - signature.insert(signature.end(), node.originalComputeIndices.begin(), node.originalComputeIndices.end()); + windowSize = std::min(windowSize, ranked.size()); + if (windowSize == 0) + return {}; + if (windowSize == ranked.size()) { + llvm::sort(ranked, isHigherPriority); + return ranked; } - std::sort(signature.begin(), signature.end()); - return signature; + + size_t criticalPoolSize = std::min(ranked.size(), std::max(windowSize, windowSize * 2)); + if (criticalPoolSize < ranked.size()) + std::nth_element( + ranked.begin(), ranked.begin() + static_cast(criticalPoolSize), ranked.end(), isHigherPriority); + + std::vector inCriticalPool(ranked.size(), false); + for (size_t i = 0; i < criticalPoolSize; ++i) + inCriticalPool[ranked[i]] = true; + + size_t seed = *std::min_element(ranked.begin(), ranked.end(), isHigherPriority); + std::vector> adjacency = buildUndirectedAdjacency(graph); + std::vector selected; + std::vector inWindow(ranked.size(), false); + selected.reserve(windowSize); + + struct FrontierEntry { + size_t node; + }; + auto frontierCompare = [&](FrontierEntry lhs, FrontierEntry rhs) { return isHigherPriority(rhs.node, lhs.node); }; + std::priority_queue, decltype(frontierCompare)> frontier(frontierCompare); + + auto addToWindow = [&](size_t node, const std::vector& eligible) { + if (inWindow[node]) + return; + inWindow[node] = true; + selected.push_back(node); + for (size_t neighbour : adjacency[node]) + if 
(!inWindow[neighbour] && eligible[neighbour]) + frontier.push({neighbour}); + }; + + addToWindow(seed, inCriticalPool); + while (!frontier.empty() && selected.size() < windowSize) { + size_t node = frontier.top().node; + frontier.pop(); + if (!inWindow[node]) + addToWindow(node, inCriticalPool); + } + + if (selected.size() < windowSize) { + std::vector anyNode(ranked.size(), true); + for (size_t node : selected) + for (size_t neighbour : adjacency[node]) + if (!inWindow[neighbour]) + frontier.push({neighbour}); + while (!frontier.empty() && selected.size() < windowSize) { + size_t node = frontier.top().node; + frontier.pop(); + if (!inWindow[node]) + addToWindow(node, anyNode); + } + } + + if (selected.size() < windowSize) { + llvm::sort(ranked, isHigherPriority); + for (size_t node : ranked) { + if (selected.size() == windowSize) + break; + if (!inWindow[node]) { + inWindow[node] = true; + selected.push_back(node); + } + } + } + + llvm::sort(selected, isHigherPriority); + return selected; } std::vector buildWindowEdges(const VirtualGraph& graph, const std::vector& nodeToWindowIndex) { @@ -216,25 +304,47 @@ WindowScheduleResult scheduleWindow(const VirtualGraph& graph, ArrayRef windowGraph.runDcp(); WindowScheduleResult result; - result.usedAllAvailableCpus = windowGraph.cpuCount() >= windowGraph.getMaxCpuCount(); + result.cpuCount = windowGraph.cpuCount(); for (CPU cpu = 0; cpu < windowGraph.cpuCount(); ++cpu) { auto scheduledTasks = windowGraph.getScheduledTasks(cpu); if (scheduledTasks.size() < 2) continue; + result.mergedNodeCount += scheduledTasks.size(); + result.maxMergeGroupSize = std::max(result.maxMergeGroupSize, scheduledTasks.size()); std::vector mergeGroup; mergeGroup.reserve(scheduledTasks.size()); for (const auto& task : scheduledTasks) mergeGroup.push_back(selectedNodes[task.nodeIndex]); - std::sort(mergeGroup.begin(), mergeGroup.end()); result.mergeGroups.push_back(std::move(mergeGroup)); } return result; } -bool coarsenGraph(const VirtualGraph& 
graph, ArrayRef> mergeGroups, VirtualGraph& coarsenedGraph) { +bool coarsenGraph(const VirtualGraph& graph, + ArrayRef> mergeGroups, + VirtualGraph& coarsenedGraph, + std::vector& oldToNewNode) { + TimingInfo timing = computeTiming(graph); + std::vector topologicalRank(graph.nodes.size()); + std::iota(topologicalRank.begin(), topologicalRank.end(), 0); + if (timing.valid) + for (auto [rank, nodeIndex] : llvm::enumerate(timing.topologicalOrder)) + topologicalRank[nodeIndex] = rank; + + std::vector> orderedMergeGroups; + orderedMergeGroups.reserve(mergeGroups.size()); + for (const auto& mergeGroup : mergeGroups) { + orderedMergeGroups.emplace_back(mergeGroup.begin(), mergeGroup.end()); + std::stable_sort(orderedMergeGroups.back().begin(), orderedMergeGroups.back().end(), [&](size_t lhs, size_t rhs) { + if (topologicalRank[lhs] != topologicalRank[rhs]) + return topologicalRank[lhs] < topologicalRank[rhs]; + return lhs < rhs; + }); + } + std::vector nodeToMergeGroup(graph.nodes.size(), -1); - for (auto [groupIndex, mergeGroup] : llvm::enumerate(mergeGroups)) { + for (auto [groupIndex, mergeGroup] : llvm::enumerate(orderedMergeGroups)) { if (mergeGroup.size() < 2) continue; for (size_t nodeIndex : mergeGroup) { @@ -243,18 +353,21 @@ bool coarsenGraph(const VirtualGraph& graph, ArrayRef> merge } } - std::vector> mergeGroupToNewNode(mergeGroups.size()); - std::vector oldToNewNode(graph.nodes.size(), 0); + std::vector> mergeGroupToNewNode(orderedMergeGroups.size()); + std::vector newNodeRank; + oldToNewNode.assign(graph.nodes.size(), 0); bool mergedAny = false; coarsenedGraph.nodes.clear(); coarsenedGraph.edges.clear(); coarsenedGraph.nodes.reserve(graph.nodes.size()); + newNodeRank.reserve(graph.nodes.size()); for (size_t nodeIndex = 0; nodeIndex < graph.nodes.size(); ++nodeIndex) { int64_t mergeGroupIndex = nodeToMergeGroup[nodeIndex]; if (mergeGroupIndex == -1) { oldToNewNode[nodeIndex] = coarsenedGraph.nodes.size(); 
coarsenedGraph.nodes.push_back(graph.nodes[nodeIndex]); + newNodeRank.push_back(topologicalRank[nodeIndex]); continue; } @@ -265,7 +378,7 @@ bool coarsenGraph(const VirtualGraph& graph, ArrayRef> merge } VirtualNode mergedNode; - for (size_t memberIndex : mergeGroups[static_cast(mergeGroupIndex)]) { + for (size_t memberIndex : orderedMergeGroups[static_cast(mergeGroupIndex)]) { const VirtualNode& memberNode = graph.nodes[memberIndex]; mergedNode.originalComputeIndices.append(memberNode.originalComputeIndices.begin(), memberNode.originalComputeIndices.end()); @@ -276,8 +389,9 @@ bool coarsenGraph(const VirtualGraph& graph, ArrayRef> merge mergedAny = true; newNodeIndex = coarsenedGraph.nodes.size(); - for (size_t memberIndex : mergeGroups[static_cast(mergeGroupIndex)]) + for (size_t memberIndex : orderedMergeGroups[static_cast(mergeGroupIndex)]) oldToNewNode[memberIndex] = *newNodeIndex; + newNodeRank.push_back(topologicalRank[orderedMergeGroups[static_cast(mergeGroupIndex)].front()]); coarsenedGraph.nodes.push_back(std::move(mergedNode)); } @@ -291,75 +405,61 @@ bool coarsenGraph(const VirtualGraph& graph, ArrayRef> merge size_t newEnd = oldToNewNode[static_cast(end)]; if (newStart == newEnd) continue; + if (newNodeRank[newStart] >= newNodeRank[newEnd]) + continue; remappedEdges.push_back({static_cast(newStart), static_cast(newEnd), weight}); } coarsenedGraph.edges = aggregateEdges(remappedEdges); - return computeTiming(coarsenedGraph).valid; + return true; } -bool coarsenGraphWithFallback(const VirtualGraph& graph, - ArrayRef> mergeGroups, - VirtualGraph& coarsenedGraph) { - if (coarsenGraph(graph, mergeGroups, coarsenedGraph)) - return true; +constexpr CPU kDefaultMaxCpuCount = 1000; - std::vector orderedGroupIndices(mergeGroups.size()); - std::iota(orderedGroupIndices.begin(), orderedGroupIndices.end(), 0); - std::stable_sort(orderedGroupIndices.begin(), orderedGroupIndices.end(), [&](size_t lhs, size_t rhs) { - return mergeGroups[lhs].size() > 
mergeGroups[rhs].size(); - }); - - std::vector> acceptedMergeGroups; - acceptedMergeGroups.reserve(mergeGroups.size()); - for (size_t groupIndex : orderedGroupIndices) { - std::vector> candidateMergeGroups = acceptedMergeGroups; - candidateMergeGroups.push_back(mergeGroups[groupIndex]); - - VirtualGraph candidateGraph; - if (!coarsenGraph(graph, candidateMergeGroups, candidateGraph)) - continue; - - acceptedMergeGroups = std::move(candidateMergeGroups); - coarsenedGraph = std::move(candidateGraph); - } - return !acceptedMergeGroups.empty(); +CPU getVirtualGraphMaxCpuCount() { + if (coresCount.getValue() > 0) + return static_cast(coresCount.getValue()); + return kDefaultMaxCpuCount; } -std::vector computeOriginalTopologicalOrder(size_t computeCount, ArrayRef edges) { - VirtualGraph graph; - graph.nodes.resize(computeCount); - graph.edges = aggregateEdges(edges); - TimingInfo timing = computeTiming(graph); - if (timing.valid) - return timing.topologicalOrder; - - std::vector fallbackOrder(computeCount); - std::iota(fallbackOrder.begin(), fallbackOrder.end(), 0); - return fallbackOrder; +size_t getDcpCoarseningWindowSize(size_t nodeCount) { + size_t windowSize = std::min(dcpCriticalWindowSize.getValue(), nodeCount); + CPU maxCpuCount = std::max(1, getVirtualGraphMaxCpuCount()); + if (nodeCount > static_cast(maxCpuCount)) + windowSize = std::max(windowSize, std::min(nodeCount, static_cast(maxCpuCount) + 1)); + return windowSize; } -DCPAnalysisResult buildResultFromVirtualGraph(const VirtualGraph& graph, - ArrayRef spatComputes, - ArrayRef originalEdges) { +DCPAnalysisResult buildResultFromVirtualGraph(const VirtualGraph& graph, ArrayRef spatComputes) { DCPAnalysisResult result; - std::vector originalToVirtualNode(spatComputes.size(), 0); - for (auto [virtualNodeIndex, virtualNode] : llvm::enumerate(graph.nodes)) - for (size_t originalIndex : virtualNode.originalComputeIndices) - originalToVirtualNode[originalIndex] = virtualNodeIndex; - auto dominanceOrder = 
computeOriginalTopologicalOrder(spatComputes.size(), originalEdges); - result.dominanceOrderCompute.reserve(dominanceOrder.size()); - for (size_t originalIndex : dominanceOrder) { - SpatCompute spatCompute = spatComputes[originalIndex]; - size_t cpu = originalToVirtualNode[originalIndex]; + TimingInfo timing = computeTiming(graph); + std::vector virtualNodeOrder; + if (timing.valid) { + virtualNodeOrder = std::move(timing.topologicalOrder); + } + else { + virtualNodeOrder.resize(graph.nodes.size()); + std::iota(virtualNodeOrder.begin(), virtualNodeOrder.end(), 0); + } + + std::vector originalComputeToCpu(spatComputes.size(), 0); + for (auto [cpu, virtualNodeIndex] : llvm::enumerate(virtualNodeOrder)) { + const VirtualNode& virtualNode = graph.nodes[virtualNodeIndex]; + for (size_t originalIndex : virtualNode.originalComputeIndices) + originalComputeToCpu[originalIndex] = cpu; + } + + result.dominanceOrderCompute.reserve(spatComputes.size()); + for (auto [originalIndex, spatCompute] : llvm::enumerate(spatComputes)) { + size_t cpu = originalComputeToCpu[originalIndex]; result.dominanceOrderCompute.push_back(spatCompute); result.computeToCpuMap[spatCompute] = cpu; result.cpuToLastComputeMap[cpu] = spatCompute; } - - for (auto [cpu, lastCompute] : result.cpuToLastComputeMap) + for (const auto& [cpu, lastCompute] : result.cpuToLastComputeMap) result.isLastComputeOfCpu.insert(lastCompute); + return result; } @@ -409,32 +509,74 @@ DCPAnalysisResult DCPAnalysis::run() { return runLegacyDcp(spatComputes, edges, entryOp->getContext()); VirtualGraph virtualGraph = buildInitialVirtualGraph(spatComputes, edges); - std::set> seenCriticalWindows; - while (virtualGraph.nodes.size() > 1) { - TimingInfo timing = computeTiming(virtualGraph); - if (!timing.valid) - break; - - auto selectedNodes = selectCriticalWindow(timing, dcpCriticalWindowSize.getValue()); - if (selectedNodes.size() < 2) - break; - - if (!seenCriticalWindows.insert(getOriginalSignature(virtualGraph, 
selectedNodes)).second) - break; - + size_t iteration = 0; + auto tryCoarsenSelectedNodes = [&](ArrayRef selectedNodes) { + size_t oldNodeCount = virtualGraph.nodes.size(); WindowScheduleResult windowSchedule = scheduleWindow(virtualGraph, selectedNodes, entryOp->getContext()); - if (windowSchedule.mergeGroups.empty()) - break; + if (windowSchedule.mergeGroups.empty()) { + if (oldNodeCount >= 200) + llvm::errs() << llvm::formatv("[DCP-COARSEN] iter={0} old={1} selected={2} windowCpus={3} " + "groups=0 mergedNodes=0 maxGroup=0 new={1} changed=0\n", + iteration, + oldNodeCount, + selectedNodes.size(), + windowSchedule.cpuCount); + return false; + } VirtualGraph coarsenedGraph; - if (!coarsenGraphWithFallback(virtualGraph, windowSchedule.mergeGroups, coarsenedGraph)) - break; + std::vector oldToNewNode; + if (!coarsenGraph(virtualGraph, windowSchedule.mergeGroups, coarsenedGraph, oldToNewNode)) + return false; + if (oldNodeCount >= 200 || coarsenedGraph.nodes.size() >= 200) + llvm::errs() << llvm::formatv("[DCP-COARSEN] iter={0} old={1} selected={2} windowCpus={3} " + "groups={4} mergedNodes={5} maxGroup={6} new={7} changed={8}\n", + iteration, + oldNodeCount, + selectedNodes.size(), + windowSchedule.cpuCount, + windowSchedule.mergeGroups.size(), + windowSchedule.mergedNodeCount, + windowSchedule.maxMergeGroupSize, + coarsenedGraph.nodes.size(), + oldNodeCount - coarsenedGraph.nodes.size()); virtualGraph = std::move(coarsenedGraph); - if (windowSchedule.usedAllAvailableCpus) + return true; + }; + + while (virtualGraph.nodes.size() > 1) { + iteration++; + TimingInfo timing = computeTiming(virtualGraph); + if (!timing.valid) { + if (virtualGraph.nodes.size() >= 200) + llvm::errs() << llvm::formatv( + "[DCP-COARSEN] iter={0} old={1} invalid-timing\n", iteration, virtualGraph.nodes.size()); break; + } + + SmallVector selectedNodes; + auto criticalWindow = + selectCriticalWindow(virtualGraph, timing, getDcpCoarseningWindowSize(virtualGraph.nodes.size())); + 
selectedNodes.append(criticalWindow.begin(), criticalWindow.end()); + + if (selectedNodes.size() < 2) { + if (virtualGraph.nodes.size() >= 200) + llvm::errs() << llvm::formatv("[DCP-COARSEN] iter={0} old={1} selected={2} stop=small-window\n", + iteration, + virtualGraph.nodes.size(), + selectedNodes.size()); + break; + } + + if (tryCoarsenSelectedNodes(selectedNodes)) + continue; + if (virtualGraph.nodes.size() >= 200) + llvm::errs() << llvm::formatv( + "[DCP-COARSEN] iter={0} old={1} stop=no-merge\n", iteration, virtualGraph.nodes.size()); + break; } - return buildResultFromVirtualGraph(virtualGraph, spatComputes, edges); + return buildResultFromVirtualGraph(virtualGraph, spatComputes); } } // namespace spatial diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.cpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.cpp index b24d3b1..6c48b1d 100644 --- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.cpp +++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.cpp @@ -38,11 +38,14 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Support/ErrorHandling.h" #include #include #include +#include #include +#include #include #include "DCPAnalysis.hpp" @@ -60,6 +63,7 @@ namespace { // Coarse-grained phase timers printed when DCP_SELECT_PROFILE is set. 
struct SelectTimers { double findSlot = 0.0; + double dedup = 0.0; double precheck = 0.0; double snapshotInsertUpdate = 0.0; double childSlot = 0.0; @@ -70,9 +74,19 @@ struct SelectTimers { long tasksProcessed = 0; void dump(const char* label) const { std::fprintf(stderr, - "[selectProfile:%s] tasks=%ld findSlot=%.2fs precheck=%.2fs snapUpd=%.2fs childSlot=%.2fs rollback=%.2fs iter=%ld precheckPass=%ld dcplPass=%ld\n", - label, tasksProcessed, findSlot, precheck, snapshotInsertUpdate, childSlot, - rollbackRestore, iterations, passedPrecheck, passedDcpl); + "[selectProfile:%s] tasks=%ld dedup=%.2fs findSlot=%.2fs precheck=%.2fs snapUpd=%.2fs " + "childSlot=%.2fs rollback=%.2fs iter=%ld precheckPass=%ld dcplPass=%ld\n", + label, + tasksProcessed, + dedup, + findSlot, + precheck, + snapshotInsertUpdate, + childSlot, + rollbackRestore, + iterations, + passedPrecheck, + passedDcpl); } ~SelectTimers() { if (std::getenv("DCP_SELECT_PROFILE")) @@ -83,6 +97,101 @@ static SelectTimers gSelectTimers; } // namespace #endif +namespace { + +uint64_t mixHash(uint64_t seed, uint64_t value) { + seed ^= value + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2); + return seed; +} + +uint64_t finishHash(uint64_t seed) { + seed ^= seed >> 33; + seed *= 0xff51afd7ed558ccdULL; + seed ^= seed >> 33; + seed *= 0xc4ceb9fe1a85ec53ULL; + seed ^= seed >> 33; + return seed; +} + +uint64_t hashEdgeSignature(uint64_t neighborHash, Weight weight, uint64_t direction) { + uint64_t hash = mixHash(0x84222325cbf29ce4ULL, direction); + hash = mixHash(hash, neighborHash); + hash = mixHash(hash, static_cast(weight)); + return finishHash(hash); +} + +struct CpuAestCache { + Time defaultAest = 0; + llvm::SmallDenseMap colocatedParentAests; + + Time get(CPU cpu) const { + auto it = colocatedParentAests.find(cpu); + if (it == colocatedParentAests.end()) + return defaultAest; + return it->second; + } +}; + +struct CpuTimeMax { + CPU cpu = -1; + Time time = 0; +}; + +void updateCpuTimeMax(CpuTimeMax& first, 
CpuTimeMax& second, CPU cpu, Time time) { + if (first.cpu == cpu) { + first.time = std::max(first.time, time); + return; + } + if (second.cpu == cpu) { + second.time = std::max(second.time, time); + if (second.time > first.time) + std::swap(first, second); + return; + } + if (time >= first.time) { + second = first; + first = {cpu, time}; + return; + } + if (time > second.time) + second = {cpu, time}; +} + +CpuAestCache computeCpuAestCache(TaskDCP* task) { + CpuAestCache cache; + llvm::SmallDenseMap transferAestByCpu; + llvm::SmallDenseMap localAestByCpu; + Time unscheduledTransferAest = 0; + + for (const Edge& parentEdge : task->parents) { + Time parentFinish = addOrMax(parentEdge.first->getAest(), parentEdge.first->getWeight()); + Time transferAest = addOrMax(parentFinish, getTransferCost(parentEdge.first, task)); + if (std::optional parentCpu = parentEdge.first->getCpu()) { + Time& cpuTransferAest = transferAestByCpu[*parentCpu]; + cpuTransferAest = std::max(cpuTransferAest, transferAest); + Time& cpuLocalAest = localAestByCpu[*parentCpu]; + cpuLocalAest = std::max(cpuLocalAest, parentFinish); + continue; + } + unscheduledTransferAest = std::max(unscheduledTransferAest, transferAest); + } + + CpuTimeMax firstOther {-1, unscheduledTransferAest}; + CpuTimeMax secondOther {-1, 0}; + for (const auto& entry : transferAestByCpu) + updateCpuTimeMax(firstOther, secondOther, entry.first, entry.second); + + cache.defaultAest = firstOther.time; + for (const auto& entry : localAestByCpu) { + CPU cpu = entry.first; + Time bestNonLocalParentAest = firstOther.cpu == cpu ? 
secondOther.time : firstOther.time; + cache.colocatedParentAests[cpu] = std::max(bestNonLocalParentAest, entry.second); + } + return cache; +} + +} // namespace + //===----------------------------------------------------------------------===// // Edge manipulation //===----------------------------------------------------------------------===// @@ -156,6 +265,49 @@ std::vector GraphDCP::getRoots() { return tmp; } +void GraphDCP::initTaskStructureHashes() { + taskStructureHashes.resize(nodes.size()); + for (auto [index, task] : llvm::enumerate(nodes)) { + uint64_t hash = mixHash(0x7442b1129fd01363ULL, static_cast(task.getWeight())); + hash = mixHash(hash, static_cast(task.getCrossbarUsage())); + taskStructureHashes[index] = finishHash(hash); + } + + std::vector nextHashes(nodes.size()); + std::vector edgeHashes; + for (int iteration = 0; iteration < 4; ++iteration) { + for (auto [index, task] : llvm::enumerate(nodes)) { + uint64_t hash = mixHash(0x464dcab27ac82291ULL, taskStructureHashes[index]); + edgeHashes.clear(); + edgeHashes.reserve(task.parents.size() + task.children.size()); + for (const Edge& parent : task.parents) + if (!parent.isScheduling) + edgeHashes.push_back( + hashEdgeSignature(taskStructureHashes[getNodeIndex(parent.first)], parent.second, /*direction=*/0)); + for (const Edge& child : task.children) + if (!child.isScheduling) + edgeHashes.push_back( + hashEdgeSignature(taskStructureHashes[getNodeIndex(child.first)], child.second, /*direction=*/1)); + llvm::sort(edgeHashes); + hash = mixHash(hash, static_cast(edgeHashes.size())); + for (uint64_t edgeHash : edgeHashes) + hash = mixHash(hash, edgeHash); + nextHashes[index] = finishHash(hash); + } + taskStructureHashes.swap(nextHashes); + } +} + +// Compact dedup key for CPU `c` vs `candidate`: mixes candidateAest, crossbar +// usage, and the incremental cpu structure hash. No heap allocation. 
+uint64_t GraphDCP::computeCpuCandidateKey(Time candidateAest, CPU cpu) { + uint64_t hash = mixHash(0xd6e8feb86659fd93ULL, static_cast(candidateAest)); + hash = mixHash(hash, static_cast(getCpuCrossbarUsage(cpu))); + auto it = cpuStructureHashes.find(cpu); + hash = mixHash(hash, it != cpuStructureHashes.end() ? it->second : 0ULL); + return finishHash(hash); +} + // Inserts `task` at `position` on `cpu`, wiring up scheduling edges with the // neighbouring tasks and keeping the global topological order consistent. TaskInsertion GraphDCP::insertTaskInCPU(CPU cpu, TaskDCP* task, size_t position) { @@ -164,6 +316,7 @@ TaskInsertion GraphDCP::insertTaskInCPU(CPU cpu, TaskDCP* task, size_t position) task->setCpu(cpu); task->setWeight(scheduledWeight); reserveTaskCrossbars(cpu, task); + cpuStructureHashes[cpu] ^= taskStructureHashes[getNodeIndex(task)]; auto& tasksInCpu = getOrCreateCpuTasks(cpu); unsigned int numCpuTasks = tasksInCpu.size(); assert(position <= numCpuTasks && "Inserting in a not valid position"); @@ -201,6 +354,7 @@ TaskInsertion GraphDCP::insertTaskInCPU(CPU cpu, TaskDCP* task, size_t position) void GraphDCP::removeTaskFromCPU(CPU cpu, TaskDCP* task) { releaseTaskCrossbars(cpu, task); + cpuStructureHashes[cpu] ^= taskStructureHashes[getNodeIndex(task)]; task->resetCpu(); task->resetWeight(); auto& scheduledTasks = getOrCreateCpuTasks(cpu); @@ -271,6 +425,21 @@ bool GraphDCP::wouldExhaustCrossbarCapacity(CPU cpu, const TaskDCP* task) const return nextUsage >= getCpuCrossbarCapacity(); } +size_t GraphDCP::crossbarsUsed() const { + CrossbarUsage crossbarEdge = static_cast(onnx_mlir::crossbarSize.getValue()); + CrossbarUsage crossbarArea = crossbarEdge * crossbarEdge; + if (crossbarArea == 0) + return 0; + CrossbarUsage totalArea = 0; + for (const auto& [cpu, usage] : cpuCrossbarUsage) + totalArea = checkedAdd(totalArea, usage); + return static_cast(totalArea / crossbarArea); +} + +size_t GraphDCP::crossbarsAvailable() const { + return static_cast(lastCpu) * 
onnx_mlir::crossbarCountInCore.getValue(); +} + //===----------------------------------------------------------------------===// // AEST / ALST computation //===----------------------------------------------------------------------===// @@ -456,9 +625,9 @@ void GraphDCP::updateAestFromTaskWithDescendants(TaskDCP* task, llvm::ArrayRef::max()) { + initAlst(); + return; + } + + if (newDcpl != oldDcpl) { + const bool increased = newDcpl > oldDcpl; + const Time delta = increased ? (newDcpl - oldDcpl) : (oldDcpl - newDcpl); + for (TaskDCP& node : topologicalOrder) { + if (&node == task || relations.ancestors.contains(&node)) + continue; + Time alst = node.getAlst(); + node.setAlst(increased ? addOrMax(alst, delta) : subtractOrZero(alst, delta)); + } + } + + auto recomputeAlst = [&](TaskDCP* node) { + Time minAlst = std::numeric_limits