From 85e2750d6c197b307cc8c4b43261dcb874f3e21e Mon Sep 17 00:00:00 2001 From: NiccoloN Date: Tue, 21 Apr 2026 12:33:44 +0200 Subject: [PATCH] faster (and refactored) DCP analysis --- README.md | 6 +- .../SpatialToPim/SpatialToPimPass.cpp | 6 +- src/PIM/Dialect/Spatial/CMakeLists.txt | 2 + .../DCPGraph/DCPAnalysis.cpp | 28 +- .../DCPGraph/DCPAnalysis.hpp | 9 +- .../MergeComputeNodes/DCPGraph/Graph.cpp | 1489 ++++++++++++----- .../MergeComputeNodes/DCPGraph/Graph.hpp | 123 +- .../MergeComputeNodes/DCPGraph/GraphDebug.cpp | 152 ++ .../MergeComputeNodes/DCPGraph/GraphDebug.hpp | 57 + .../DCPGraph/GraphSupport.cpp | 105 ++ .../DCPGraph/GraphSupport.hpp | 41 + .../MergeComputeNodes/DCPGraph/Task.cpp | 62 +- .../MergeComputeNodes/DCPGraph/Task.hpp | 111 +- .../DCPGraph/UniqueWorklist.hpp | 57 +- .../MergeComputeNodes/DCPGraph/Utils.hpp | 120 +- .../MergeComputeNodesPass.cpp | 112 +- test/PIM/CMakeLists.txt | 11 +- test/PIM/DCPTest.cpp | 528 ++++++ test/PIM/LabeledListTest.cpp | 162 ++ test/PIM/TestPIM.cpp | 202 --- 20 files changed, 2525 insertions(+), 858 deletions(-) create mode 100644 src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphDebug.cpp create mode 100644 src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphDebug.hpp create mode 100644 src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphSupport.cpp create mode 100644 src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/GraphSupport.hpp create mode 100644 test/PIM/DCPTest.cpp create mode 100644 test/PIM/LabeledListTest.cpp delete mode 100644 test/PIM/TestPIM.cpp diff --git a/README.md b/README.md index 3f9ac1b..3d26525 100644 --- a/README.md +++ b/README.md @@ -31,8 +31,7 @@ Moreover, if compiling with build type debug, it is also suggested to use mold as linker (you will need to install it if you don't have it already) to reduce memory usage during linking. You can use it by setting the options: ``` --DCMAKE_EXE_LINKER_FLAGS="-fuse-ld=mold" \ --DCMAKE_SHARED_LINKER_FLAGS="-fuse-ld=mold" +-DLLVM_USE_LINKER=mold ``` ### Raptor @@ -45,7 +44,8 @@ Also in this case, it is suggested to use mold as linker to reduce link time and setting the options: ``` -DCMAKE_EXE_LINKER_FLAGS="-fuse-ld=mold" \ --DCMAKE_SHARED_LINKER_FLAGS="-fuse-ld=mold" +-DCMAKE_SHARED_LINKER_FLAGS="-fuse-ld=mold" \ +-DCMAKE_MODULE_LINKER_FLAGS="-fuse-ld=mold" ``` ``` diff --git a/src/PIM/Conversion/SpatialToPim/SpatialToPimPass.cpp b/src/PIM/Conversion/SpatialToPim/SpatialToPimPass.cpp index 867b207..3621336 100644 --- a/src/PIM/Conversion/SpatialToPim/SpatialToPimPass.cpp +++ b/src/PIM/Conversion/SpatialToPim/SpatialToPimPass.cpp @@ -661,9 +661,8 @@ void SpatialToPimPass::annotateChannelCoreIds(func::FuncOp funcOp) { broadcastSendOp = op; continue; } - if (auto op = dyn_cast(user)) { + if (auto op = dyn_cast(user)) continue; - } llvm_unreachable("Unexpected user of spat.channel_new during Spatial-to-PIM lowering"); } @@ -719,7 +718,8 @@ void SpatialToPimPass::lowerBroadcastChannelOps(func::FuncOp funcOp, IRRewriter& auto sizeAttr = getTensorSizeInBytesAttr(rewriter, receiveOp.getResult()); auto sourceCoreIdAttr = getSpatialChannelSourceCoreIdAttr(rewriter, receiveOp.getChannel()); Value receivedValue = - PimReceiveOp::create(rewriter, receiveOp.getLoc(), outputBuffer.getType(), outputBuffer, sizeAttr, sourceCoreIdAttr) + PimReceiveOp::create( + rewriter, receiveOp.getLoc(), outputBuffer.getType(), outputBuffer, sizeAttr, sourceCoreIdAttr) .getOutput(); rewriter.replaceOp(receiveOp, receivedValue); } diff --git a/src/PIM/Dialect/Spatial/CMakeLists.txt b/src/PIM/Dialect/Spatial/CMakeLists.txt index 286abd7..641f011 100644 --- a/src/PIM/Dialect/Spatial/CMakeLists.txt +++ b/src/PIM/Dialect/Spatial/CMakeLists.txt @@ -5,6 +5,8 @@ add_pim_library(SpatialOps SpatialOps.cpp Transforms/MergeComputeNodes/MergeComputeNodesPass.cpp Transforms/MergeComputeNodes/DCPGraph/Graph.cpp + Transforms/MergeComputeNodes/DCPGraph/GraphDebug.cpp + Transforms/MergeComputeNodes/DCPGraph/GraphSupport.cpp Transforms/MergeComputeNodes/DCPGraph/Task.cpp Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.cpp diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.cpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.cpp index d5006d7..0c0c5a7 100644 --- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.cpp +++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.cpp @@ -17,7 +17,7 @@ namespace spatial { using namespace mlir; -SpatWeightedCompute getOriginalSpatWeightCompute(Operation* op) { +SpatWeightedCompute getOriginalSpatWeightedCompute(Operation* op) { if (!op) return {}; while (auto extract = llvm::dyn_cast(op)) { @@ -30,32 +30,32 @@ SpatWeightedCompute getOriginalSpatWeightCompute(Operation* op) { return {}; } -DCPAnalysisResult DCPAnalysis::runAnalysis() { - using EdgesIndex = std::tuple; +DCPAnalysisResult DCPAnalysis::run() { llvm::SmallVector spatWeightedComputes; - llvm::SmallVector edges; - for (auto& regions : entryOp->getRegions()) - for (SpatWeightedCompute spatWeightedCompute : regions.getOps()) + llvm::SmallVector edges; + for (auto& region : entryOp->getRegions()) + for (SpatWeightedCompute spatWeightedCompute : region.getOps()) spatWeightedComputes.push_back(spatWeightedCompute); for (auto [indexEndEdge, spatWeightedCompute] : llvm::enumerate(spatWeightedComputes)) { for (Value input : spatWeightedCompute.getInputs()) { - if (auto spatWeightedComputeArgOp = getOriginalSpatWeightCompute(input.getDefiningOp())) { - auto elemIter = llvm::find(spatWeightedComputes, spatWeightedComputeArgOp); - assert(elemIter != spatWeightedComputes.end()); - auto indexStartEdge = std::distance(spatWeightedComputes.begin(), elemIter); - ResultRange outputs = spatWeightedComputeArgOp.getResults(); + if (auto producerCompute = getOriginalSpatWeightedCompute(input.getDefiningOp())) { + auto producerIt = llvm::find(spatWeightedComputes, producerCompute); + assert(producerIt != spatWeightedComputes.end()); + auto indexStartEdge = std::distance(spatWeightedComputes.begin(), producerIt); + ResultRange outputs = producerCompute.getResults(); int64_t totalSize = 0; for (auto output : outputs) { - ShapedType result = cast(output.getType()); - totalSize += getSizeInBytes(result); + ShapedType resultType = cast(output.getType()); + totalSize += getSizeInBytes(resultType); } edges.push_back({indexStartEdge, indexEndEdge, totalSize}); } } } GraphDCP graphDCP(spatWeightedComputes, edges); - graphDCP.DCP(); + graphDCP.setContext(entryOp->getContext()); + graphDCP.runDcp(); return graphDCP.getResult(); } diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.hpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.hpp index f7426cc..472e51f 100644 --- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.hpp +++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.hpp @@ -3,6 +3,7 @@ #include "mlir/IR/Operation.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include @@ -10,8 +11,8 @@ struct DCPAnalysisResult { std::vector dominanceOrderCompute; - llvm::DenseMap computeToCPUMap; - llvm::DenseSet isLastComputeOfACpu; + llvm::DenseMap computeToCpuMap; + llvm::DenseSet isLastComputeOfCpu; llvm::DenseMap cpuToLastComputeMap; }; @@ -21,12 +22,12 @@ struct DCPAnalysis { private: DCPAnalysisResult result; mlir::Operation* entryOp; - DCPAnalysisResult runAnalysis(); + DCPAnalysisResult run(); public: DCPAnalysis(mlir::Operation* op) : entryOp(op) { - result = runAnalysis(); + result = run(); } DCPAnalysisResult& getResult() { return result; } }; diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.cpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.cpp index 8a45713..0056071 100644 --- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.cpp +++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.cpp @@ -1,105 +1,152 @@ +//===----------------------------------------------------------------------===// +// DCP-inspired task scheduler. +// +// Input: a DAG of compute tasks. Each task has an execution weight; each edge +// carries an inter-task transfer cost that only applies when producer and +// consumer land on different CPUs. +// +// Output: an assignment of every task to a CPU and an order within that CPU, +// aiming to minimise the overall critical-path length (DCPL). +// +// Every task keeps two timing estimates: +// AEST - earliest start time, driven by parent completions + transfers. +// ALST - latest start time that still keeps the task on the critical path. +// A task is "critical" when its slack (ALST - AEST) is zero. +// +// Main loop (runDcp): +// 1. Build a topological order and seed AEST/ALST from the unscheduled DAG. +// 2. While there are ready tasks (all dependency parents scheduled): +// a. Pick the candidate with tightest slack (earliest AEST breaks ties). +// b. selectProcessor() tries every candidate CPU and picks the one that +// minimises a composite cost (own slot + smallest unscheduled child). +// c. Commit the placement and refresh AEST/ALST. +// d. Release any child whose dependency parents are now all scheduled. +// +// Heuristic notes: classic DCP assumes identical task costs on every CPU and +// a single-issue processor model. We diverge - crossbar capacity can make a +// task infeasible on a CPU, and placement happens incrementally. That makes +// this a heuristic rather than a faithful DCP implementation. +// +// Parallelism: selectProcessor's per-CPU findSlot sweep is read-only, so we +// run it concurrently across CPUs via mlir::parallelFor. The subsequent +// sequential evaluation benefits from ordering CPUs by ascending slot.aest, +// which tightens the bestComposite early-prune bound. +//===----------------------------------------------------------------------===// + +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Threading.h" + #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include -#include #include -#include -#include -#include -#include -#include +#include +#include +#include #include #include "DCPAnalysis.hpp" #include "Graph.hpp" +#include "GraphDebug.hpp" +#include "GraphSupport.hpp" #include "Task.hpp" #include "UniqueWorklist.hpp" #include "Utils.hpp" #include "src/Accelerators/PIM/Common/PimCommon.hpp" +#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp" -std::optional addEdge(TaskDCP* parent, TaskDCP* child, Weight_t weight) { - auto old_child = parent->addChild(child, weight); - auto old_parent = child->addParent(parent, weight); - assert(old_child.has_value() == old_parent.has_value() && "The edge must be present in both element"); - if (old_child.has_value()) { +#ifdef DCP_DEBUG_ENABLED +namespace { +// Coarse-grained phase timers printed when DCP_SELECT_PROFILE is set. +struct SelectTimers { + double findSlot = 0.0; + double precheck = 0.0; + double snapshotInsertUpdate = 0.0; + double childSlot = 0.0; + double rollbackRestore = 0.0; + long iterations = 0; + long passedPrecheck = 0; + long passedDcpl = 0; + long tasksProcessed = 0; + void dump(const char* label) const { + std::fprintf(stderr, + "[selectProfile:%s] tasks=%ld findSlot=%.2fs precheck=%.2fs snapUpd=%.2fs childSlot=%.2fs rollback=%.2fs iter=%ld precheckPass=%ld dcplPass=%ld\n", + label, tasksProcessed, findSlot, precheck, snapshotInsertUpdate, childSlot, + rollbackRestore, iterations, passedPrecheck, passedDcpl); + } + ~SelectTimers() { + if (std::getenv("DCP_SELECT_PROFILE")) + dump("exit"); + } +}; +static SelectTimers gSelectTimers; +} // namespace +#endif +//===----------------------------------------------------------------------===// +// Edge manipulation +//===----------------------------------------------------------------------===// + +std::optional addEdge(TaskDCP* parent, TaskDCP* child, Weight weight, bool isScheduling) { + auto oldChild = parent->addChild(child, weight, isScheduling); + auto oldParent = child->addParent(parent, weight, isScheduling); + assert(oldChild.has_value() == oldParent.has_value() && "The edge must be present in both element"); + if (oldChild.has_value()) { return { - {*old_parent, *old_child} + {*oldParent, *oldChild} }; } return {}; } -void removeEdge(TaskDCP* parent, TaskDCP* child) { - parent->removeChild(child); - child->removeParent(parent); +void removeEdge(TaskDCP* parent, TaskDCP* child, bool isScheduling) { + parent->removeChild(child, isScheduling); + child->removeParent(parent, isScheduling); } -int getTranferCost(TaskDCP* parent, TaskDCP* child) { - if (parent->scheduledCPU.has_value() && child->scheduledCPU.has_value() - && *parent->scheduledCPU == *child->scheduledCPU) { +// A dependency edge may appear multiple times (e.g. from separate data inputs); +// the transfer cost is the maximum across those parallel edges. Cost is zero +// when both endpoints share a CPU. +Weight getTransferCost(TaskDCP* parent, TaskDCP* child) { + if (parent->scheduledCpu.has_value() && child->scheduledCpu.has_value() + && *parent->scheduledCpu == *child->scheduledCpu) return 0; - } - auto child_position = - std::find_if(parent->childs.begin(), parent->childs.end(), [&child](Edge_t elem) { return elem.first == child; }); - assert(child_position != parent->childs.end()); - return child_position->second; + Weight maxTransferCost = 0; + bool foundTransferCost = false; + for (const auto& edge : parent->children) + if (edge.first == child && !edge.isScheduling) { + maxTransferCost = std::max(maxTransferCost, edge.second); + foundTransferCost = true; + } + assert(foundTransferCost && "missing transfer cost for dependency edge"); + return maxTransferCost; } +//===----------------------------------------------------------------------===// +// Indexing and CPU task lists +//===----------------------------------------------------------------------===// + size_t GraphDCP::getNodeIndex(const TaskDCP* task) const { assert(task >= nodes.data() && task < nodes.data() + nodes.size() && "task must belong to graph"); return static_cast(task - nodes.data()); } -TaskInsertion GraphDCP::insertTaskInCPU(CPU cpu, TaskDCP* task, size_t position) { - TaskInsertion ret; - task->setCPU(cpu); - task->setWeight(task->computeWeight(this, cpu)); - auto& tasksInCPU = mapCPUTasks[cpu]; - unsigned int numCPUTask = tasksInCPU.size(); - assert(position <= numCPUTask && "Inserting in a not valid position"); - auto insertedPoint = tasksInCPU.insert(std::next(tasksInCPU.begin(), position), task); - ret.oldTopologicalPosition = std::next(task->getTopologicalPosition()); - ret.cpuModified = cpu; - ret.taskInserted = task; - ret.graph = this; - - if (insertedPoint != tasksInCPU.begin()) { - auto precedentPoint = std::prev(insertedPoint, 1); - auto oldEdge = addEdge(*precedentPoint, *insertedPoint, 0); - ret.beforeNode = oldEdge; - - if (*task < **insertedPoint) - topologicalMoveAfter(task, *precedentPoint); - } - - if (std::next(insertedPoint) != tasksInCPU.end()) { - auto nextPoint = std::next(insertedPoint, 1); - auto oldEdge = addEdge(*insertedPoint, *nextPoint, 0); - ret.afterNode = oldEdge; - if (**insertedPoint < *task) - topologicalMoveBefore(task, *nextPoint); - } - return ret; +GraphDCP::CpuTaskList& GraphDCP::getOrCreateCpuTasks(CPU cpu) { + assert(cpu >= 0 && "cpu id must be non-negative"); + size_t cpuIndex = static_cast(cpu); + if (cpuTasks.size() <= cpuIndex) + cpuTasks.resize(cpuIndex + 1); + return cpuTasks[cpuIndex]; } -void GraphDCP::removeTaskFromCPU(CPU cpu, TaskDCP* task) { - task->resetCPU(); - task->resetWeight(); - auto& list = mapCPUTasks[cpu]; - auto task_position = std::find(list.begin(), list.end(), task); - assert(task_position != list.end() && "Removing a not present task"); - if (task_position != list.begin()) { - auto precedent_point = std::prev(task_position, 1); - removeEdge(*precedent_point, *task_position); - } - - if (std::next(task_position) != list.end()) { - auto next_point = std::next(task_position, 1); - removeEdge(*task_position, *next_point); - } - list.erase(task_position); +const GraphDCP::CpuTaskList* GraphDCP::findCpuTasks(CPU cpu) const { + if (cpu < 0) + return nullptr; + size_t cpuIndex = static_cast(cpu); + if (cpuIndex >= cpuTasks.size()) + return nullptr; + return &cpuTasks[cpuIndex]; } std::vector GraphDCP::getRoots() { @@ -110,153 +157,495 @@ std::vector GraphDCP::getRoots() { return tmp; } -void GraphDCP::initAEST() { - UniqueWorkList> worklists(getRoots()); +// Inserts `task` at `position` on `cpu`, wiring up scheduling edges with the +// neighbouring tasks and keeping the global topological order consistent. +TaskInsertion GraphDCP::insertTaskInCPU(CPU cpu, TaskDCP* task, size_t position) { + TaskInsertion ret; + Weight scheduledWeight = task->computeWeightOnCpu(this, cpu); + task->setCpu(cpu); + task->setWeight(scheduledWeight); + reserveTaskCrossbars(cpu, task); + auto& tasksInCpu = getOrCreateCpuTasks(cpu); + unsigned int numCpuTasks = tasksInCpu.size(); + assert(position <= numCpuTasks && "Inserting in a not valid position"); + auto insertedPoint = tasksInCpu.insert(std::next(tasksInCpu.begin(), position), task); + ret.cpuModified = cpu; + ret.taskInserted = task; + ret.graph = this; + // If we split an existing neighbour-neighbour scheduling edge, drop it; the + // two new edges below recreate the ordering with `task` in between. + if (insertedPoint != tasksInCpu.begin() && std::next(insertedPoint) != tasksInCpu.end()) { + auto precedentPoint = std::prev(insertedPoint, 1); + auto nextPoint = std::next(insertedPoint, 1); + removeEdge(*precedentPoint, *nextPoint, true); + } + + if (insertedPoint != tasksInCpu.begin()) { + auto precedentPoint = std::prev(insertedPoint, 1); + auto oldEdge = addEdge(*precedentPoint, *insertedPoint, 0, true); + ret.beforeNode = oldEdge; + + if (*task < **precedentPoint) + topologicalMoveAfter(task, *precedentPoint, &ret); + } + + if (std::next(insertedPoint) != tasksInCpu.end()) { + auto nextPoint = std::next(insertedPoint, 1); + auto oldEdge = addEdge(*insertedPoint, *nextPoint, 0, true); + ret.afterNode = oldEdge; + if (**nextPoint < *task) + topologicalMoveBefore(task, *nextPoint, &ret); + } + return ret; +} + +void GraphDCP::removeTaskFromCPU(CPU cpu, TaskDCP* task) { + releaseTaskCrossbars(cpu, task); + task->resetCpu(); + task->resetWeight(); + auto& scheduledTasks = getOrCreateCpuTasks(cpu); + auto taskPosition = std::find(scheduledTasks.begin(), scheduledTasks.end(), task); + assert(taskPosition != scheduledTasks.end() && "Removing a not present task"); + TaskDCP* previousTask = nullptr; + TaskDCP* nextTask = nullptr; + if (taskPosition != scheduledTasks.begin()) { + auto previousPoint = std::prev(taskPosition, 1); + previousTask = *previousPoint; + removeEdge(*previousPoint, *taskPosition, true); + } + + if (std::next(taskPosition) != scheduledTasks.end()) { + auto nextPoint = std::next(taskPosition, 1); + nextTask = *nextPoint; + removeEdge(*taskPosition, *nextPoint, true); + } + if (previousTask != nullptr && nextTask != nullptr) + addEdge(previousTask, nextTask, 0, true); + scheduledTasks.erase(taskPosition); +} + +//===----------------------------------------------------------------------===// +// Crossbar capacity bookkeeping +//===----------------------------------------------------------------------===// + +CrossbarUsage GraphDCP::getCpuCrossbarUsage(CPU cpu) const { + auto it = cpuCrossbarUsage.find(cpu); + if (it == cpuCrossbarUsage.end()) + return 0; + return it->second; +} + +CrossbarUsage GraphDCP::getCpuCrossbarCapacity() const { + assert(onnx_mlir::crossbarSize.getValue() > 0 && "crossbar-size must be strictly positive"); + assert(onnx_mlir::crossbarCountInCore.getValue() > 0 && "crossbar-count must be strictly positive"); + CrossbarUsage crossbarEdge = static_cast(onnx_mlir::crossbarSize.getValue()); + CrossbarUsage crossbarArea = checkedMultiply(crossbarEdge, crossbarEdge); + return checkedMultiply(static_cast(onnx_mlir::crossbarCountInCore.getValue()), crossbarArea); +} + +CrossbarUsage GraphDCP::getTaskCrossbarFootprint(const TaskDCP* task) const { + CrossbarUsage crossbarCount = task->getCrossbarUsage(); + if (crossbarCount == 0) + return 0; + CrossbarUsage crossbarEdge = static_cast(onnx_mlir::crossbarSize.getValue()); + CrossbarUsage crossbarArea = checkedMultiply(crossbarEdge, crossbarEdge); + return checkedMultiply(crossbarCount, crossbarArea); +} + +void GraphDCP::reserveTaskCrossbars(CPU cpu, const TaskDCP* task) { + cpuCrossbarUsage[cpu] = checkedAdd(getCpuCrossbarUsage(cpu), getTaskCrossbarFootprint(task)); +} + +void GraphDCP::releaseTaskCrossbars(CPU cpu, const TaskDCP* task) { + CrossbarUsage footprint = getTaskCrossbarFootprint(task); + CrossbarUsage currentUsage = getCpuCrossbarUsage(cpu); + assert(currentUsage >= footprint && "crossbar usage underflow"); + cpuCrossbarUsage[cpu] = currentUsage - footprint; +} + +bool GraphDCP::wouldExhaustCrossbarCapacity(CPU cpu, const TaskDCP* task) const { + CrossbarUsage footprint = getTaskCrossbarFootprint(task); + if (footprint == 0) + return false; + CrossbarUsage nextUsage = checkedAdd(getCpuCrossbarUsage(cpu), footprint); + return nextUsage >= getCpuCrossbarCapacity(); +} + +//===----------------------------------------------------------------------===// +// AEST / ALST computation +//===----------------------------------------------------------------------===// + +// Walks the topological order once and fills AEST from parent completions, +// while tracking the top two completion times so DCPL updates can avoid a +// second pass. secondMaxCompletion lets us invalidate `maxCompletionTask` +// locally when its AEST moves. +void GraphDCP::initAest() { auto& worklist = topologicalOrder; - int max_dcpl = 0; + Time maxDcpl = 0; + Time secondMaxCompletionCandidate = 0; + TaskDCP* maxCompletionTaskCandidate = nullptr; for (auto& task : worklist) { - int max_parent_aest = 0; - for (Edge_t parentEdge : task.parents) { - max_parent_aest = - std::max(parentEdge.first->getAEST() + parentEdge.first->getWeight() + getTranferCost(parentEdge.first, &task), - max_parent_aest); + Time maxParentAest = 0; + for (Edge parentEdge : task.parents) { + maxParentAest = std::max(addOrMax(addOrMax(parentEdge.first->getAest(), parentEdge.first->getWeight()), + getTransferCost(parentEdge.first, &task)), + maxParentAest); } - task.setAEST(max_parent_aest); - max_dcpl = std::max(max_parent_aest + task.getWeight(), max_dcpl); - } - DCPL = max_dcpl; -} - -int GraphDCP::computeAEST(TaskDCP* task, CPU cpu) { - int max_parent_aest = 0; - for (Edge_t parentEdge : task->parents) { - int transfer_cost = 0; - if (!(parentEdge.first->isScheduled() && cpu == *parentEdge.first->getCPU())) - transfer_cost = getTranferCost(parentEdge.first, task); - max_parent_aest = - std::max(parentEdge.first->getAEST() + parentEdge.first->getWeight() + transfer_cost, max_parent_aest); - } - return max_parent_aest; -} - -int GraphDCP::computeDCPL(TaskDCP* task, CPU cpu) { - int max_aest = 0; - for (auto& node : nodes) - if (&node != task) - max_aest = std::max(node.getAEST() + node.getWeight(), max_aest); - else - max_aest = std::max(computeAEST(task, cpu) + node.computeWeight(this, cpu), max_aest); - return max_aest; -} - -void GraphDCP::initALST() { - int dcpl = getDCPL(); - auto& worklists = topologicalOrder; - - for (TaskDCP& node : llvm::reverse(worklists)) { - int min_alst = INT_MAX; - if (!node.hasChilds()) - min_alst = dcpl - node.getWeight(); - for (Edge_t childEdge : node.childs) - min_alst = - std::min(min_alst, childEdge.first->getALST() - node.getWeight() - getTranferCost(&node, childEdge.first)); - node.setALST(min_alst); - } -} - -llvm::DenseMap GraphDCP::computeALST(TaskDCP* task, CPU cpu) { - int dcpl = computeDCPL(task, cpu); - llvm::DenseMap temp_ALST; - - auto& worklists = topologicalOrder; - for (TaskDCP& node : llvm::reverse(worklists)) { - int min_alst = INT_MAX; - if (!node.hasChilds()) { - if (&node != task) - min_alst = dcpl - node.getWeight(); - else - min_alst = dcpl - node.computeWeight(this, cpu); + task.setAest(maxParentAest); + Time completion = addOrMax(maxParentAest, task.getWeight()); + if (completion >= maxDcpl) { + secondMaxCompletionCandidate = maxDcpl; + maxDcpl = completion; + maxCompletionTaskCandidate = &task; } - - for (Edge_t childEdge : node.childs) { - int transfer_cost = getTranferCost(&node, childEdge.first); - if (&node == task && childEdge.first->isScheduled() && cpu == *childEdge.first->getCPU()) - transfer_cost = 0; - min_alst = std::min(min_alst, temp_ALST[childEdge.first] - node.getWeight() - transfer_cost); + else if (completion > secondMaxCompletionCandidate) { + secondMaxCompletionCandidate = completion; } - temp_ALST[&node] = min_alst; } - return temp_ALST; + dcpl = maxDcpl; + maxCompletion = maxDcpl; + secondMaxCompletion = secondMaxCompletionCandidate; + maxCompletionTask = maxCompletionTaskCandidate; } -TaskDCP* GraphDCP::findCandidate(std::vector nodes) { - auto hasNoCPParentUnsecheduled = [](TaskDCP* node) { - return std::all_of( - node->parents.begin(), node->parents.end(), [](Edge_t element) { return element.first->isScheduled() == true; }); +// Same backward pass as initAest but over the reverse topological order, +// seeding ALST from scheduleDcpl on leaves. +void GraphDCP::initAlst() { + Time scheduleDcpl = getDcpl(); + auto& worklist = topologicalOrder; + + for (TaskDCP& node : llvm::reverse(worklist)) { + Time minAlst = std::numeric_limits