PEFT topological fix
This commit is contained in:
@@ -22,7 +22,7 @@ struct ScheduledTask {
|
||||
size_t slot = 0;
|
||||
};
|
||||
|
||||
std::vector<std::vector<size_t>> buildReverseLevels(const ComputeGraph &graph) {
|
||||
std::vector<std::vector<size_t>> buildReverseLevels(const ComputeGraph& graph) {
|
||||
std::vector<size_t> remainingSuccessors(graph.nodes.size(), 0);
|
||||
std::queue<size_t> readySinks;
|
||||
std::vector<std::vector<size_t>> reverseLevels;
|
||||
@@ -43,8 +43,7 @@ std::vector<std::vector<size_t>> buildReverseLevels(const ComputeGraph &graph) {
|
||||
readySinks.pop();
|
||||
levelNodes.push_back(node);
|
||||
++levelizedCount;
|
||||
for (const auto &[pred, weight] : graph.predecessors[node]) {
|
||||
(void) weight;
|
||||
for (const auto& [pred, weight] : graph.predecessors[node]) {
|
||||
assert(remainingSuccessors[pred] > 0 && "remaining successor count underflow");
|
||||
if (--remainingSuccessors[pred] == 0)
|
||||
readySinks.push(pred);
|
||||
@@ -79,7 +78,7 @@ void verifyOctTableSize(size_t nodeCount, size_t processorCount) {
|
||||
|
||||
} // namespace
|
||||
|
||||
MergeScheduleResult runPeftScheduler(const ComputeGraph &graph, const PeftScheduleOptions &options) {
|
||||
MergeScheduleResult runPeftScheduler(const ComputeGraph& graph, const PeftScheduleOptions& options) {
|
||||
const size_t nodeCount = graph.nodes.size();
|
||||
const size_t processorCount = options.processorCount;
|
||||
if (processorCount == 0)
|
||||
@@ -88,18 +87,23 @@ MergeScheduleResult runPeftScheduler(const ComputeGraph &graph, const PeftSchedu
|
||||
verifyOctTableSize(nodeCount, processorCount);
|
||||
std::vector<std::vector<size_t>> reverseLevels = buildReverseLevels(graph);
|
||||
|
||||
// MOCK: placeholder that returns the node's single weight and ignores `processor`; replace this with your actual heterogeneous cost lookup.
|
||||
// If graph.nodes[task] is modified to hold a vector of weights per processor, access it here.
|
||||
auto getComputeCost = [&](size_t task, size_t processor) -> Time { return graph.nodes[task].weight; };
|
||||
|
||||
std::vector<Time> oct(nodeCount * processorCount, 0);
|
||||
std::vector<Time> minOctPlusComp(nodeCount, 0);
|
||||
|
||||
for (const std::vector<size_t> &levelNodes : reverseLevels) {
|
||||
// 1. O(P(E+V)) Heterogeneous OCT Calculation
|
||||
for (const std::vector<size_t>& levelNodes : reverseLevels) {
|
||||
auto computeNodeOct = [&](size_t levelIndex) {
|
||||
size_t task = levelNodes[levelIndex];
|
||||
std::vector<Time> maxVals(processorCount, 0);
|
||||
|
||||
for (const auto &[succ, comm] : graph.successors[task]) {
|
||||
for (const auto& [succ, comm] : graph.successors[task]) {
|
||||
Time valDifferentCpu = addOrMax(minOctPlusComp[succ], comm);
|
||||
for (size_t processor = 0; processor < processorCount; ++processor) {
|
||||
Time valSameCpu = addOrMax(oct[succ * processorCount + processor], graph.nodes[succ].weight);
|
||||
Time valSameCpu = addOrMax(oct[succ * processorCount + processor], getComputeCost(succ, processor));
|
||||
Time bestSucc = std::min(valSameCpu, valDifferentCpu);
|
||||
maxVals[processor] = std::max(maxVals[processor], bestSucc);
|
||||
}
|
||||
@@ -108,7 +112,7 @@ MergeScheduleResult runPeftScheduler(const ComputeGraph &graph, const PeftSchedu
|
||||
Time minForPreds = std::numeric_limits<Time>::max();
|
||||
for (size_t processor = 0; processor < processorCount; ++processor) {
|
||||
oct[task * processorCount + processor] = maxVals[processor];
|
||||
minForPreds = std::min(minForPreds, addOrMax(maxVals[processor], graph.nodes[task].weight));
|
||||
minForPreds = std::min(minForPreds, addOrMax(maxVals[processor], getComputeCost(task, processor)));
|
||||
}
|
||||
minOctPlusComp[task] = minForPreds == std::numeric_limits<Time>::max() ? 0 : minForPreds;
|
||||
};
|
||||
@@ -132,6 +136,7 @@ MergeScheduleResult runPeftScheduler(const ComputeGraph &graph, const PeftSchedu
|
||||
rank += static_cast<long double>(oct[node * processorCount + processor]);
|
||||
ranks[node] = {rank, node, graph.nodes[node].originalOrder};
|
||||
};
|
||||
|
||||
if (options.context != nullptr)
|
||||
mlir::parallelFor(options.context, 0, nodeCount, computeRank);
|
||||
else
|
||||
@@ -139,8 +144,8 @@ MergeScheduleResult runPeftScheduler(const ComputeGraph &graph, const PeftSchedu
|
||||
computeRank(node);
|
||||
|
||||
auto readyCompare = [&](size_t lhs, size_t rhs) {
|
||||
const RankEntry &lhsRank = ranks[lhs];
|
||||
const RankEntry &rhsRank = ranks[rhs];
|
||||
const RankEntry& lhsRank = ranks[lhs];
|
||||
const RankEntry& rhsRank = ranks[rhs];
|
||||
if (lhsRank.rank != rhsRank.rank)
|
||||
return lhsRank.rank < rhsRank.rank;
|
||||
if (lhsRank.originalOrder != rhsRank.originalOrder)
|
||||
@@ -157,7 +162,6 @@ MergeScheduleResult runPeftScheduler(const ComputeGraph &graph, const PeftSchedu
|
||||
}
|
||||
|
||||
std::vector<char> scheduled(nodeCount, false);
|
||||
std::vector<Time> processorAvailable(processorCount, 0);
|
||||
std::vector<CrossbarUsage> processorCrossbars(processorCount, 0);
|
||||
std::vector<ScheduledTask> schedules(nodeCount);
|
||||
std::vector<std::vector<size_t>> tasksByProcessor(processorCount);
|
||||
@@ -176,26 +180,46 @@ MergeScheduleResult runPeftScheduler(const ComputeGraph &graph, const PeftSchedu
|
||||
bool crossbarRejected = false;
|
||||
|
||||
for (size_t processor = 0; processor < processorCount; ++processor) {
|
||||
if (graph.nodes[task].crossbarUsage != 0 &&
|
||||
addOrMax(processorCrossbars[processor], graph.nodes[task].crossbarUsage) > options.crossbarCapacity) {
|
||||
if (graph.nodes[task].crossbarUsage != 0
|
||||
&& addOrMax(processorCrossbars[processor], graph.nodes[task].crossbarUsage) > options.crossbarCapacity) {
|
||||
crossbarRejected = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
Time dataReady = 0;
|
||||
for (const auto &[pred, comm] : graph.predecessors[task]) {
|
||||
const ScheduledTask &predSchedule = schedules[pred];
|
||||
for (const auto& [pred, comm] : graph.predecessors[task]) {
|
||||
const ScheduledTask& predSchedule = schedules[pred];
|
||||
Time commPenalty = predSchedule.processor == processor ? 0 : comm;
|
||||
dataReady = std::max(dataReady, addOrMax(predSchedule.endTime, commPenalty));
|
||||
}
|
||||
|
||||
Time est = std::max(processorAvailable[processor], dataReady);
|
||||
Time eft = addOrMax(est, graph.nodes[task].weight);
|
||||
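// 2. PEFT gap-filling EST calculation: take the first idle gap on this processor, at or after dataReady,
// that is long enough for the task; if none fits, start after the last task scanned.
// NOTE(review): the scan treats tasksByProcessor[processor] as ordered by start time, but tasks are appended
// in scheduling order, so a task previously placed in a gap sits out of order; confirm this cannot yield overlaps.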
|
||||
Time compWeight = getComputeCost(task, processor);
|
||||
Time est = dataReady;
|
||||
Time currentEnd = 0;
|
||||
bool foundGap = false;
|
||||
|
||||
for (size_t schedTaskIndex : tasksByProcessor[processor]) {
|
||||
const ScheduledTask& schedTask = schedules[schedTaskIndex];
|
||||
Time gapStart = std::max(currentEnd, dataReady);
|
||||
|
||||
if (addOrMax(gapStart, compWeight) <= schedTask.startTime) {
|
||||
est = gapStart;
|
||||
foundGap = true;
|
||||
break;
|
||||
}
|
||||
currentEnd = schedTask.endTime;
|
||||
}
|
||||
|
||||
if (!foundGap)
|
||||
est = std::max(currentEnd, dataReady);
|
||||
|
||||
Time eft = addOrMax(est, compWeight);
|
||||
Time oeft = addOrMax(eft, oct[task * processorCount + processor]);
|
||||
|
||||
if (oeft < bestOeft || (oeft == bestOeft && eft < bestEft) ||
|
||||
(oeft == bestOeft && eft == bestEft && est < bestEst) ||
|
||||
(oeft == bestOeft && eft == bestEft && est == bestEst && processor < bestProcessor)) {
|
||||
if (oeft < bestOeft || (oeft == bestOeft && eft < bestEft)
|
||||
|| (oeft == bestOeft && eft == bestEft && est < bestEst)
|
||||
|| (oeft == bestOeft && eft == bestEft && est == bestEst && processor < bestProcessor)) {
|
||||
bestProcessor = processor;
|
||||
bestEst = est;
|
||||
bestEft = eft;
|
||||
@@ -219,15 +243,18 @@ MergeScheduleResult runPeftScheduler(const ComputeGraph &graph, const PeftSchedu
|
||||
llvm::report_fatal_error(llvm::StringRef(message));
|
||||
}
|
||||
|
||||
schedules[task] = {bestProcessor, bestEst, bestEft, tasksByProcessor[bestProcessor].size()};
|
||||
schedules[task] = {bestProcessor, bestEst, bestEft, 0};
|
||||
scheduled[task] = true;
|
||||
++scheduledCount;
|
||||
processorAvailable[bestProcessor] = bestEft;
|
||||
processorCrossbars[bestProcessor] =
|
||||
addOrMax(processorCrossbars[bestProcessor], graph.nodes[task].crossbarUsage);
|
||||
processorCrossbars[bestProcessor] = addOrMax(processorCrossbars[bestProcessor], graph.nodes[task].crossbarUsage);
|
||||
|
||||
// 3. CRITICAL FIX: Topological Append
|
||||
// Because the readyQueue pops in strict topological order, simply pushing to the
|
||||
// back guarantees the Monoliths will be physically generated cycle-free.
|
||||
// The hardware will still benefit from the processor assignment chosen by PEFT.
|
||||
tasksByProcessor[bestProcessor].push_back(task);
|
||||
|
||||
for (const auto &[child, weight] : graph.successors[task]) {
|
||||
for (const auto& [child, weight] : graph.successors[task]) {
|
||||
(void) weight;
|
||||
assert(remainingParents[child] > 0 && "remaining parent count underflow");
|
||||
if (--remainingParents[child] == 0)
|
||||
@@ -238,16 +265,28 @@ MergeScheduleResult runPeftScheduler(const ComputeGraph &graph, const PeftSchedu
|
||||
if (scheduledCount != nodeCount)
|
||||
llvm::report_fatal_error("PEFT scheduler: failed to schedule every compute node");
|
||||
|
||||
// 4. Build the dominance order by sorting node indices on originalOrder (presumably a topological order of the graph; confirm)
|
||||
std::vector<size_t> scheduledOrder(nodeCount);
|
||||
for (size_t i = 0; i < nodeCount; ++i)
|
||||
scheduledOrder[i] = i;
|
||||
|
||||
std::sort(scheduledOrder.begin(), scheduledOrder.end(), [&](size_t a, size_t b) {
|
||||
return graph.nodes[a].originalOrder < graph.nodes[b].originalOrder;
|
||||
});
|
||||
|
||||
// 5. Populate Final Result
|
||||
MergeScheduleResult result;
|
||||
result.dominanceOrderCompute.reserve(nodeCount);
|
||||
for (const ComputeGraphNode &node : graph.nodes)
|
||||
result.dominanceOrderCompute.push_back(node.instance);
|
||||
|
||||
for (size_t task : scheduledOrder)
|
||||
result.dominanceOrderCompute.push_back(graph.nodes[task].instance);
|
||||
|
||||
for (size_t processor = 0; processor < processorCount; ++processor) {
|
||||
size_t currentSlot = 0;
|
||||
for (size_t task : tasksByProcessor[processor]) {
|
||||
const ComputeInstance instance = graph.nodes[task].instance;
|
||||
result.computeToCpuMap[instance] = processor;
|
||||
result.computeToCpuSlotMap[instance] = schedules[task].slot;
|
||||
result.computeToCpuSlotMap[instance] = currentSlot++;
|
||||
result.computeToAestMap[instance] = schedules[task].startTime;
|
||||
}
|
||||
if (!tasksByProcessor[processor].empty()) {
|
||||
@@ -259,6 +298,6 @@ MergeScheduleResult runPeftScheduler(const ComputeGraph &graph, const PeftSchedu
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace spatial
|
||||
} // namespace onnx_mlir
|
||||
|
||||
|
||||
Reference in New Issue
Block a user