faster (and refactored) DCP analysis
All checks were successful
Validate Operations / validate-operations (push) Successful in 2h16m17s
All checks were successful
Validate Operations / validate-operations (push) Successful in 2h16m17s
This commit is contained in:
@@ -31,8 +31,7 @@ Moreover, if compiling with build type debug, it is also suggested to use
|
|||||||
mold as linker (you will need to install it if you don't have it already)
|
mold as linker (you will need to install it if you don't have it already)
|
||||||
to reduce memory usage during linking. You can use it by setting the options:
|
to reduce memory usage during linking. You can use it by setting the options:
|
||||||
```
|
```
|
||||||
-DCMAKE_EXE_LINKER_FLAGS="-fuse-ld=mold" \
|
-DLLVM_USE_LINKER=mold
|
||||||
-DCMAKE_SHARED_LINKER_FLAGS="-fuse-ld=mold"
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### Raptor
|
### Raptor
|
||||||
@@ -45,7 +44,8 @@ Also in this case, it is suggested to use mold as linker to reduce link time and
|
|||||||
setting the options:
|
setting the options:
|
||||||
```
|
```
|
||||||
-DCMAKE_EXE_LINKER_FLAGS="-fuse-ld=mold" \
|
-DCMAKE_EXE_LINKER_FLAGS="-fuse-ld=mold" \
|
||||||
-DCMAKE_SHARED_LINKER_FLAGS="-fuse-ld=mold"
|
-DCMAKE_SHARED_LINKER_FLAGS="-fuse-ld=mold" \
|
||||||
|
-DCMAKE_MODULE_LINKER_FLAGS="-fuse-ld=mold"
|
||||||
```
|
```
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -661,9 +661,8 @@ void SpatialToPimPass::annotateChannelCoreIds(func::FuncOp funcOp) {
|
|||||||
broadcastSendOp = op;
|
broadcastSendOp = op;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (auto op = dyn_cast<spatial::SpatChannelBroadcastReceiveOp>(user)) {
|
if (auto op = dyn_cast<spatial::SpatChannelBroadcastReceiveOp>(user))
|
||||||
continue;
|
continue;
|
||||||
}
|
|
||||||
llvm_unreachable("Unexpected user of spat.channel_new during Spatial-to-PIM lowering");
|
llvm_unreachable("Unexpected user of spat.channel_new during Spatial-to-PIM lowering");
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -719,7 +718,8 @@ void SpatialToPimPass::lowerBroadcastChannelOps(func::FuncOp funcOp, IRRewriter&
|
|||||||
auto sizeAttr = getTensorSizeInBytesAttr(rewriter, receiveOp.getResult());
|
auto sizeAttr = getTensorSizeInBytesAttr(rewriter, receiveOp.getResult());
|
||||||
auto sourceCoreIdAttr = getSpatialChannelSourceCoreIdAttr(rewriter, receiveOp.getChannel());
|
auto sourceCoreIdAttr = getSpatialChannelSourceCoreIdAttr(rewriter, receiveOp.getChannel());
|
||||||
Value receivedValue =
|
Value receivedValue =
|
||||||
PimReceiveOp::create(rewriter, receiveOp.getLoc(), outputBuffer.getType(), outputBuffer, sizeAttr, sourceCoreIdAttr)
|
PimReceiveOp::create(
|
||||||
|
rewriter, receiveOp.getLoc(), outputBuffer.getType(), outputBuffer, sizeAttr, sourceCoreIdAttr)
|
||||||
.getOutput();
|
.getOutput();
|
||||||
rewriter.replaceOp(receiveOp, receivedValue);
|
rewriter.replaceOp(receiveOp, receivedValue);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,6 +5,8 @@ add_pim_library(SpatialOps
|
|||||||
SpatialOps.cpp
|
SpatialOps.cpp
|
||||||
Transforms/MergeComputeNodes/MergeComputeNodesPass.cpp
|
Transforms/MergeComputeNodes/MergeComputeNodesPass.cpp
|
||||||
Transforms/MergeComputeNodes/DCPGraph/Graph.cpp
|
Transforms/MergeComputeNodes/DCPGraph/Graph.cpp
|
||||||
|
Transforms/MergeComputeNodes/DCPGraph/GraphDebug.cpp
|
||||||
|
Transforms/MergeComputeNodes/DCPGraph/GraphSupport.cpp
|
||||||
Transforms/MergeComputeNodes/DCPGraph/Task.cpp
|
Transforms/MergeComputeNodes/DCPGraph/Task.cpp
|
||||||
Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.cpp
|
Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.cpp
|
||||||
|
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ namespace spatial {
|
|||||||
|
|
||||||
using namespace mlir;
|
using namespace mlir;
|
||||||
|
|
||||||
SpatWeightedCompute getOriginalSpatWeightCompute(Operation* op) {
|
SpatWeightedCompute getOriginalSpatWeightedCompute(Operation* op) {
|
||||||
if (!op)
|
if (!op)
|
||||||
return {};
|
return {};
|
||||||
while (auto extract = llvm::dyn_cast<tensor::ExtractSliceOp>(op)) {
|
while (auto extract = llvm::dyn_cast<tensor::ExtractSliceOp>(op)) {
|
||||||
@@ -30,32 +30,32 @@ SpatWeightedCompute getOriginalSpatWeightCompute(Operation* op) {
|
|||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
DCPAnalysisResult DCPAnalysis::runAnalysis() {
|
DCPAnalysisResult DCPAnalysis::run() {
|
||||||
using EdgesIndex = std::tuple<int64_t, int64_t, int64_t>;
|
|
||||||
llvm::SmallVector<SpatWeightedCompute, 10> spatWeightedComputes;
|
llvm::SmallVector<SpatWeightedCompute, 10> spatWeightedComputes;
|
||||||
llvm::SmallVector<EdgesIndex, 10> edges;
|
llvm::SmallVector<IndexedEdge, 10> edges;
|
||||||
for (auto& regions : entryOp->getRegions())
|
for (auto& region : entryOp->getRegions())
|
||||||
for (SpatWeightedCompute spatWeightedCompute : regions.getOps<SpatWeightedCompute>())
|
for (SpatWeightedCompute spatWeightedCompute : region.getOps<SpatWeightedCompute>())
|
||||||
spatWeightedComputes.push_back(spatWeightedCompute);
|
spatWeightedComputes.push_back(spatWeightedCompute);
|
||||||
|
|
||||||
for (auto [indexEndEdge, spatWeightedCompute] : llvm::enumerate(spatWeightedComputes)) {
|
for (auto [indexEndEdge, spatWeightedCompute] : llvm::enumerate(spatWeightedComputes)) {
|
||||||
for (Value input : spatWeightedCompute.getInputs()) {
|
for (Value input : spatWeightedCompute.getInputs()) {
|
||||||
if (auto spatWeightedComputeArgOp = getOriginalSpatWeightCompute(input.getDefiningOp())) {
|
if (auto producerCompute = getOriginalSpatWeightedCompute(input.getDefiningOp())) {
|
||||||
auto elemIter = llvm::find(spatWeightedComputes, spatWeightedComputeArgOp);
|
auto producerIt = llvm::find(spatWeightedComputes, producerCompute);
|
||||||
assert(elemIter != spatWeightedComputes.end());
|
assert(producerIt != spatWeightedComputes.end());
|
||||||
auto indexStartEdge = std::distance(spatWeightedComputes.begin(), elemIter);
|
auto indexStartEdge = std::distance(spatWeightedComputes.begin(), producerIt);
|
||||||
ResultRange outputs = spatWeightedComputeArgOp.getResults();
|
ResultRange outputs = producerCompute.getResults();
|
||||||
int64_t totalSize = 0;
|
int64_t totalSize = 0;
|
||||||
for (auto output : outputs) {
|
for (auto output : outputs) {
|
||||||
ShapedType result = cast<ShapedType>(output.getType());
|
ShapedType resultType = cast<ShapedType>(output.getType());
|
||||||
totalSize += getSizeInBytes(result);
|
totalSize += getSizeInBytes(resultType);
|
||||||
}
|
}
|
||||||
edges.push_back({indexStartEdge, indexEndEdge, totalSize});
|
edges.push_back({indexStartEdge, indexEndEdge, totalSize});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
GraphDCP graphDCP(spatWeightedComputes, edges);
|
GraphDCP graphDCP(spatWeightedComputes, edges);
|
||||||
graphDCP.DCP();
|
graphDCP.setContext(entryOp->getContext());
|
||||||
|
graphDCP.runDcp();
|
||||||
return graphDCP.getResult();
|
return graphDCP.getResult();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -3,6 +3,7 @@
|
|||||||
#include "mlir/IR/Operation.h"
|
#include "mlir/IR/Operation.h"
|
||||||
|
|
||||||
#include "llvm/ADT/DenseMap.h"
|
#include "llvm/ADT/DenseMap.h"
|
||||||
|
#include "llvm/ADT/DenseSet.h"
|
||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
@@ -10,8 +11,8 @@
|
|||||||
|
|
||||||
struct DCPAnalysisResult {
|
struct DCPAnalysisResult {
|
||||||
std::vector<onnx_mlir::spatial::SpatWeightedCompute> dominanceOrderCompute;
|
std::vector<onnx_mlir::spatial::SpatWeightedCompute> dominanceOrderCompute;
|
||||||
llvm::DenseMap<onnx_mlir::spatial::SpatWeightedCompute, size_t> computeToCPUMap;
|
llvm::DenseMap<onnx_mlir::spatial::SpatWeightedCompute, size_t> computeToCpuMap;
|
||||||
llvm::DenseSet<onnx_mlir::spatial::SpatWeightedCompute> isLastComputeOfACpu;
|
llvm::DenseSet<onnx_mlir::spatial::SpatWeightedCompute> isLastComputeOfCpu;
|
||||||
llvm::DenseMap<size_t, onnx_mlir::spatial::SpatWeightedCompute> cpuToLastComputeMap;
|
llvm::DenseMap<size_t, onnx_mlir::spatial::SpatWeightedCompute> cpuToLastComputeMap;
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -21,12 +22,12 @@ struct DCPAnalysis {
|
|||||||
private:
|
private:
|
||||||
DCPAnalysisResult result;
|
DCPAnalysisResult result;
|
||||||
mlir::Operation* entryOp;
|
mlir::Operation* entryOp;
|
||||||
DCPAnalysisResult runAnalysis();
|
DCPAnalysisResult run();
|
||||||
|
|
||||||
public:
|
public:
|
||||||
DCPAnalysis(mlir::Operation* op)
|
DCPAnalysis(mlir::Operation* op)
|
||||||
: entryOp(op) {
|
: entryOp(op) {
|
||||||
result = runAnalysis();
|
result = run();
|
||||||
}
|
}
|
||||||
DCPAnalysisResult& getResult() { return result; }
|
DCPAnalysisResult& getResult() { return result; }
|
||||||
};
|
};
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
#include "llvm/ADT/ArrayRef.h"
|
#include "llvm/ADT/ArrayRef.h"
|
||||||
#include "llvm/ADT/DenseMap.h"
|
#include "llvm/ADT/DenseMap.h"
|
||||||
|
#include "llvm/ADT/DenseSet.h"
|
||||||
|
|
||||||
#include <list>
|
#include <list>
|
||||||
#include <optional>
|
#include <optional>
|
||||||
@@ -12,90 +13,144 @@
|
|||||||
#include "Task.hpp"
|
#include "Task.hpp"
|
||||||
#include "Utils.hpp"
|
#include "Utils.hpp"
|
||||||
|
|
||||||
std::optional<DoubleEdge> addEdge(TaskDCP* parent, TaskDCP* child, Weight_t weight);
|
namespace mlir {
|
||||||
void removeEdge(TaskDCP* parent, TaskDCP* child);
|
class MLIRContext;
|
||||||
int getTranferCost(TaskDCP* parent, TaskDCP* child);
|
} // namespace mlir
|
||||||
|
|
||||||
|
std::optional<EdgePair> addEdge(TaskDCP* parent, TaskDCP* child, Weight weight, bool isScheduling = false);
|
||||||
|
void removeEdge(TaskDCP* parent, TaskDCP* child, bool isScheduling = false);
|
||||||
|
Weight getTransferCost(TaskDCP* parent, TaskDCP* child);
|
||||||
|
|
||||||
class GraphDCP {
|
class GraphDCP {
|
||||||
public:
|
public:
|
||||||
|
struct CandidateRelations {
|
||||||
|
llvm::DenseSet<TaskDCP*> ancestors;
|
||||||
|
llvm::DenseSet<TaskDCP*> descendants;
|
||||||
|
// descendants ordered by position in the graph's topological order;
|
||||||
|
// iterating this avoids walking non-descendant tail tasks on hot paths.
|
||||||
|
llvm::SmallVector<TaskDCP*, 32> descendantsTopoOrder;
|
||||||
|
};
|
||||||
|
|
||||||
struct ScheduledTaskInfo {
|
struct ScheduledTaskInfo {
|
||||||
size_t nodeIndex;
|
size_t nodeIndex;
|
||||||
int aest;
|
Time aest;
|
||||||
int alst;
|
Time alst;
|
||||||
int weight;
|
Weight weight;
|
||||||
};
|
};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
using CpuTaskList = std::list<TaskDCP*>;
|
||||||
|
|
||||||
struct FindSlot {
|
struct FindSlot {
|
||||||
int aest;
|
Time aest;
|
||||||
int index;
|
int index;
|
||||||
};
|
};
|
||||||
|
|
||||||
std::vector<TaskDCP> nodes;
|
std::vector<TaskDCP> nodes;
|
||||||
onnx_mlir::LabeledList<TaskDCP> topologicalOrder;
|
onnx_mlir::LabeledList<TaskDCP> topologicalOrder;
|
||||||
std::unordered_map<CPU, std::list<TaskDCP*>> mapCPUTasks;
|
std::vector<CpuTaskList> cpuTasks;
|
||||||
CPU last_cpu = 0;
|
std::unordered_map<CPU, CrossbarUsage> cpuCrossbarUsage;
|
||||||
|
CPU lastCpu = 0;
|
||||||
long long flag = 1;
|
long long flag = 1;
|
||||||
int DCPL;
|
Time dcpl = 0;
|
||||||
|
Time maxCompletion = 0;
|
||||||
|
Time secondMaxCompletion = 0;
|
||||||
|
TaskDCP* maxCompletionTask = nullptr;
|
||||||
|
int maxCpuCount = 1000;
|
||||||
|
mlir::MLIRContext* context = nullptr;
|
||||||
|
|
||||||
TaskInsertion insertTaskInCPU(CPU cpu, TaskDCP* task, size_t position);
|
TaskInsertion insertTaskInCPU(CPU cpu, TaskDCP* task, size_t position);
|
||||||
void removeTaskFromCPU(CPU cpu, TaskDCP* task);
|
void removeTaskFromCPU(CPU cpu, TaskDCP* task);
|
||||||
|
CpuTaskList& getOrCreateCpuTasks(CPU cpu);
|
||||||
|
const CpuTaskList* findCpuTasks(CPU cpu) const;
|
||||||
|
|
||||||
std::vector<TaskDCP*> getRoots();
|
std::vector<TaskDCP*> getRoots();
|
||||||
|
|
||||||
long long getUniqueFlag() { return flag++; }
|
long long getUniqueFlag() { return flag++; }
|
||||||
|
|
||||||
void initAEST();
|
void initAest();
|
||||||
int initDCPL();
|
void initAlst();
|
||||||
void initALST();
|
|
||||||
|
|
||||||
int computeAEST(TaskDCP* task, CPU cpu);
|
Time computeAestOnCpu(TaskDCP* task, CPU cpu);
|
||||||
int computeDCPL(TaskDCP* task, CPU cpu);
|
Time computeDcplOnCpu(TaskDCP* task, CPU cpu);
|
||||||
int getDCPL() { return DCPL; }
|
Time getDcpl() const { return dcpl; }
|
||||||
|
Time computeTaskAlstOnCpu(TaskDCP* task, CPU cpu, Time scheduleDcpl);
|
||||||
|
void updateAestFromTask(TaskDCP* task);
|
||||||
|
void updateAestFromTaskWithDescendants(TaskDCP* task, const llvm::DenseSet<TaskDCP*>& descendants);
|
||||||
|
void updateAestFromTaskWithDescendants(TaskDCP* task, llvm::ArrayRef<TaskDCP*> descendantsTopoOrder);
|
||||||
|
// Propagates AEST like the overload above but returns early (before touching
|
||||||
|
// the remaining descendants) as soon as a task's completion exceeds
|
||||||
|
// `dcplBudget`, signalling that the new DCPL would exceed the budget.
|
||||||
|
// Returns true iff the full propagation completed without exceeding the
|
||||||
|
// budget. Uses the caller's snapshot to restore AEST on the aborted tail.
|
||||||
|
bool tryUpdateAestWithinBudget(TaskDCP* task,
|
||||||
|
llvm::ArrayRef<TaskDCP*> descendantsTopoOrder,
|
||||||
|
Time dcplBudget);
|
||||||
|
|
||||||
void initTopological();
|
void initTopological();
|
||||||
void topologicalMoveAfter(TaskDCP* task, TaskDCP* pivotPoint);
|
void topologicalMoveAfter(TaskDCP* task, TaskDCP* pivotPoint, TaskInsertion* insertion = nullptr);
|
||||||
void topologicalMoveBefore(TaskDCP* task, TaskDCP* pivotPoint);
|
void topologicalMoveBefore(TaskDCP* task, TaskDCP* pivotPoint, TaskInsertion* insertion = nullptr);
|
||||||
|
|
||||||
llvm::DenseMap<TaskDCP*, int> computeALST(TaskDCP* task, CPU cpu);
|
llvm::DenseMap<TaskDCP*, Time> computeAlst(TaskDCP* task, CPU cpu, const CandidateRelations& relations);
|
||||||
size_t getNodeIndex(const TaskDCP* task) const;
|
size_t getNodeIndex(const TaskDCP* task) const;
|
||||||
|
|
||||||
TaskDCP* findCandidate(std::vector<TaskDCP*> nodes);
|
TaskDCP* findCandidate(const std::vector<TaskDCP*>& readyNodes);
|
||||||
void selectProcessor(TaskDCP* candidate, bool push);
|
void selectProcessor(TaskDCP* candidate, bool push);
|
||||||
CPU lastCPU() const { return last_cpu; }
|
CPU getLastCpu() const { return lastCpu; }
|
||||||
void incLastCPU() { last_cpu++; }
|
void incrementLastCpu() { lastCpu++; }
|
||||||
FindSlot findSlot(TaskDCP* candidate, CPU cpu, bool push);
|
FindSlot findSlot(TaskDCP* candidate, CPU cpu, bool push, const CandidateRelations& relations);
|
||||||
void to_dot();
|
FindSlot findSlotWithFixedFinalTime(
|
||||||
|
TaskDCP* candidate, CPU cpu, const CandidateRelations& relations, Time finalTime, Time aestOnCpu);
|
||||||
|
void dumpDot();
|
||||||
|
|
||||||
friend TaskInsertion;
|
friend TaskInsertion;
|
||||||
|
friend class TaskDCP;
|
||||||
|
|
||||||
|
CrossbarUsage getCpuCrossbarUsage(CPU cpu) const;
|
||||||
|
CrossbarUsage getCpuCrossbarCapacity() const;
|
||||||
|
CrossbarUsage getTaskCrossbarFootprint(const TaskDCP* task) const;
|
||||||
|
void reserveTaskCrossbars(CPU cpu, const TaskDCP* task);
|
||||||
|
void releaseTaskCrossbars(CPU cpu, const TaskDCP* task);
|
||||||
|
bool wouldExhaustCrossbarCapacity(CPU cpu, const TaskDCP* task) const;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
void DCP();
|
void runDcp();
|
||||||
GraphDCP(llvm::ArrayRef<onnx_mlir::spatial::SpatWeightedCompute> spatWeightedComputes,
|
GraphDCP(llvm::ArrayRef<onnx_mlir::spatial::SpatWeightedCompute> spatWeightedComputes,
|
||||||
llvm::ArrayRef<EdgesIndex> edges)
|
llvm::ArrayRef<IndexedEdge> edges)
|
||||||
: nodes(), mapCPUTasks() {
|
: nodes(), cpuTasks(), cpuCrossbarUsage() {
|
||||||
for (auto spatWeightedCompute : spatWeightedComputes)
|
for (auto spatWeightedCompute : spatWeightedComputes)
|
||||||
nodes.emplace_back(spatWeightedCompute);
|
nodes.emplace_back(spatWeightedCompute);
|
||||||
for (auto [start, end, weight] : edges)
|
for (auto [start, end, weight] : edges)
|
||||||
makeEdge(start, end, weight);
|
makeEdge(start, end, weight);
|
||||||
}
|
}
|
||||||
|
|
||||||
GraphDCP(llvm::ArrayRef<Weight_t> nodeWeights, llvm::ArrayRef<EdgesIndex> edges)
|
GraphDCP(llvm::ArrayRef<Weight> nodeWeights,
|
||||||
: nodes(), mapCPUTasks() {
|
llvm::ArrayRef<IndexedEdge> edges,
|
||||||
|
llvm::ArrayRef<CrossbarUsage> nodeCrossbarUsage = {})
|
||||||
|
: nodes(), cpuTasks(), cpuCrossbarUsage() {
|
||||||
|
assert((nodeCrossbarUsage.empty() || nodeCrossbarUsage.size() == nodeWeights.size())
|
||||||
|
&& "synthetic crossbar usage must match synthetic node weights");
|
||||||
nodes.reserve(nodeWeights.size());
|
nodes.reserve(nodeWeights.size());
|
||||||
for (auto [index, weight] : llvm::enumerate(nodeWeights))
|
for (auto [index, weight] : llvm::enumerate(nodeWeights))
|
||||||
nodes.emplace_back(index, weight);
|
nodes.emplace_back(index, weight, nodeCrossbarUsage.empty() ? 0 : nodeCrossbarUsage[index]);
|
||||||
for (auto [start, end, weight] : edges)
|
for (auto [start, end, weight] : edges)
|
||||||
makeEdge(start, end, weight);
|
makeEdge(start, end, weight);
|
||||||
}
|
}
|
||||||
|
|
||||||
DCPAnalysisResult getResult();
|
DCPAnalysisResult getResult();
|
||||||
std::vector<ScheduledTaskInfo> getScheduledTasks(CPU cpu) const;
|
std::vector<ScheduledTaskInfo> getScheduledTasks(CPU cpu) const;
|
||||||
CPU cpuCount() const { return last_cpu; }
|
CPU cpuCount() const { return lastCpu; }
|
||||||
|
|
||||||
void makeEdge(size_t parent_index, size_t child_index, Weight_t weight) {
|
void makeEdge(size_t parentIndex, size_t childIndex, Weight weight) {
|
||||||
addEdge(&nodes[parent_index], &nodes[child_index], weight);
|
addEdge(&nodes[parentIndex], &nodes[childIndex], weight);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t taskInCPU(CPU cpu) { return mapCPUTasks[cpu].size(); }
|
size_t taskInCpu(CPU cpu) { return getOrCreateCpuTasks(cpu).size(); }
|
||||||
|
|
||||||
|
void setMaxCpuCount(int value) { maxCpuCount = value; }
|
||||||
|
int getMaxCpuCount() const { return maxCpuCount; }
|
||||||
|
|
||||||
|
// Optional MLIR context used to drive mlir::parallelFor inside runDcp. If
|
||||||
|
// null the scheduler runs single-threaded (tests use this path).
|
||||||
|
void setContext(mlir::MLIRContext* ctx) { context = ctx; }
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -0,0 +1,152 @@
|
|||||||
|
#include "llvm/Support/FormatVariadic.h"
|
||||||
|
#include "llvm/Support/raw_ostream.h"
|
||||||
|
|
||||||
|
#include <fstream>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "GraphDebug.hpp"
|
||||||
|
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
|
||||||
|
|
||||||
|
namespace dcp_graph {
|
||||||
|
|
||||||
|
#ifdef DCP_DEBUG_ENABLED
|
||||||
|
|
||||||
|
DcpProgressLogger::DcpProgressLogger(size_t totalTasks)
|
||||||
|
: logProgress(totalTasks >= 200),
|
||||||
|
totalTasks(totalTasks),
|
||||||
|
startTime(std::chrono::steady_clock::now()),
|
||||||
|
lastProgressPrint(startTime) {}
|
||||||
|
|
||||||
|
std::string DcpProgressLogger::formatDuration(double seconds) {
|
||||||
|
if (seconds < 0)
|
||||||
|
seconds = 0;
|
||||||
|
|
||||||
|
long totalSeconds = static_cast<long>(seconds + 0.5);
|
||||||
|
long hours = totalSeconds / 3600;
|
||||||
|
long minutes = (totalSeconds % 3600) / 60;
|
||||||
|
long secs = totalSeconds % 60;
|
||||||
|
if (hours > 0)
|
||||||
|
return llvm::formatv("{0}:{1:02}:{2:02}", hours, minutes, secs).str();
|
||||||
|
return llvm::formatv("{0}:{1:02}", minutes, secs).str();
|
||||||
|
}
|
||||||
|
|
||||||
|
void DcpProgressLogger::recordFindDuration(double seconds) { findCandidateSeconds += seconds; }
|
||||||
|
void DcpProgressLogger::recordSelectDuration(double seconds) { selectProcessorSeconds += seconds; }
|
||||||
|
void DcpProgressLogger::recordUpdateDuration(double seconds) { updateTimingSeconds += seconds; }
|
||||||
|
void DcpProgressLogger::advanceCompleted(size_t taskCount) { completedTasks += taskCount; }
|
||||||
|
|
||||||
|
void DcpProgressLogger::printStart(size_t readyCount) const {
|
||||||
|
if (!logProgress)
|
||||||
|
return;
|
||||||
|
llvm::errs() << llvm::formatv("[DCP] start: tasks={0} ready={1}\n", totalTasks, readyCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
void DcpProgressLogger::maybePrintSlowCandidate(size_t nodeIndex,
|
||||||
|
double elapsedSeconds,
|
||||||
|
size_t readyCount,
|
||||||
|
CPU cpuCount) const {
|
||||||
|
if (!logProgress || elapsedSeconds < 1.0)
|
||||||
|
return;
|
||||||
|
|
||||||
|
llvm::errs() << llvm::formatv("[DCP] slow candidate node={0} elapsed={1} ready={2} cpus={3}\n",
|
||||||
|
nodeIndex,
|
||||||
|
formatDuration(elapsedSeconds),
|
||||||
|
readyCount,
|
||||||
|
cpuCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
void DcpProgressLogger::printProgress(size_t readyCount, CPU cpuCount, llvm::StringRef stage, bool force) {
|
||||||
|
if (!logProgress)
|
||||||
|
return;
|
||||||
|
|
||||||
|
auto now = std::chrono::steady_clock::now();
|
||||||
|
if (!force && now - lastProgressPrint < std::chrono::seconds(1) && completedTasks != totalTasks)
|
||||||
|
return;
|
||||||
|
|
||||||
|
double elapsedSeconds = std::chrono::duration<double>(now - startTime).count();
|
||||||
|
double rate = elapsedSeconds > 0.0 ? static_cast<double>(completedTasks) / elapsedSeconds : 0.0;
|
||||||
|
double etaSeconds = rate > 0.0 ? static_cast<double>(totalTasks - completedTasks) / rate : 0.0;
|
||||||
|
double percent = totalTasks == 0 ? 100.0 : (100.0 * static_cast<double>(completedTasks) / totalTasks);
|
||||||
|
|
||||||
|
llvm::errs() << llvm::formatv("[DCP] {0}/{1} ({2:F1}%) ready={3} cpus={4} stage={5} elapsed={6} eta={7}\n",
|
||||||
|
completedTasks,
|
||||||
|
totalTasks,
|
||||||
|
percent,
|
||||||
|
readyCount,
|
||||||
|
cpuCount,
|
||||||
|
stage,
|
||||||
|
formatDuration(elapsedSeconds),
|
||||||
|
completedTasks == totalTasks ? "0:00" : formatDuration(etaSeconds));
|
||||||
|
llvm::errs() << llvm::formatv(" time(find={0}, select={1}, update={2})\n",
|
||||||
|
formatDuration(findCandidateSeconds),
|
||||||
|
formatDuration(selectProcessorSeconds),
|
||||||
|
formatDuration(updateTimingSeconds));
|
||||||
|
lastProgressPrint = now;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
DcpProgressLogger::DcpProgressLogger(size_t) {}
|
||||||
|
void DcpProgressLogger::recordFindDuration(double) {}
|
||||||
|
void DcpProgressLogger::recordSelectDuration(double) {}
|
||||||
|
void DcpProgressLogger::recordUpdateDuration(double) {}
|
||||||
|
void DcpProgressLogger::advanceCompleted(size_t) {}
|
||||||
|
void DcpProgressLogger::printStart(size_t) const {}
|
||||||
|
void DcpProgressLogger::maybePrintSlowCandidate(size_t, double, size_t, CPU) const {}
|
||||||
|
void DcpProgressLogger::printProgress(size_t, CPU, llvm::StringRef, bool) {}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
void dumpGraphDot(const std::vector<TaskDCP>& nodes,
|
||||||
|
const std::vector<std::list<TaskDCP*>>& cpuTasks,
|
||||||
|
CPU lastCpu) {
|
||||||
|
static int dumpIndex = 0;
|
||||||
|
std::string outputDir = onnx_mlir::getOutputDir();
|
||||||
|
if (outputDir.empty())
|
||||||
|
return;
|
||||||
|
|
||||||
|
std::string graphDir = outputDir + "/dcp_graph";
|
||||||
|
onnx_mlir::createDirectory(graphDir);
|
||||||
|
std::fstream file(graphDir + "/graph_" + std::to_string(dumpIndex++) + ".dot", std::ios::out);
|
||||||
|
file << "digraph G {\n";
|
||||||
|
if (!cpuTasks.empty()) {
|
||||||
|
for (CPU cpu = 0; cpu < lastCpu; cpu++) {
|
||||||
|
file << "subgraph cluster_" << cpu << "{\nstyle=filled;\ncolor=lightgrey;\n";
|
||||||
|
size_t cpuIndex = static_cast<size_t>(cpu);
|
||||||
|
if (cpuIndex >= cpuTasks.size()) {
|
||||||
|
file << " }\n";
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto node : cpuTasks[cpuIndex]) {
|
||||||
|
file << node->Id() << " [label=\"";
|
||||||
|
file << "n:" << node->Id() << "\n";
|
||||||
|
file << "aest:" << node->getAest() << "\n";
|
||||||
|
file << "alst:" << node->getAlst() << "\n";
|
||||||
|
file << "weight:" << node->getWeight() << "\"]\n";
|
||||||
|
}
|
||||||
|
file << " }\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
for (const auto& node : nodes) {
|
||||||
|
file << node.Id() << " [label=\"";
|
||||||
|
file << "n:" << node.Id() << "\n";
|
||||||
|
file << "aest:" << node.getAest() << "\n";
|
||||||
|
file << "alst:" << node.getAlst() << "\n";
|
||||||
|
file << "weight:" << node.getWeight() << "\"]\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const auto& node : nodes)
|
||||||
|
for (const auto& child : node.children) {
|
||||||
|
file << node.Id() << " -> " << child.first->Id();
|
||||||
|
file << " [label=\"" << child.second << "\"]\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
file << "}\n";
|
||||||
|
file.flush();
|
||||||
|
file.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace dcp_graph
|
||||||
@@ -0,0 +1,57 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "llvm/ADT/StringRef.h"
|
||||||
|
|
||||||
|
#include <chrono>
|
||||||
|
#include <list>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "Task.hpp"
|
||||||
|
#include "Utils.hpp"
|
||||||
|
|
||||||
|
// Uncomment to enable DCP progress logging and per-phase profiling during
|
||||||
|
// development. When disabled the logger methods are no-ops and the helpers
|
||||||
|
// compile away.
|
||||||
|
#define DCP_DEBUG_ENABLED
|
||||||
|
|
||||||
|
#ifdef DCP_DEBUG_ENABLED
|
||||||
|
#define DCP_DEBUG_IF(...) __VA_ARGS__
|
||||||
|
#else
|
||||||
|
#define DCP_DEBUG_IF(...)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace dcp_graph {
|
||||||
|
|
||||||
|
class DcpProgressLogger {
|
||||||
|
public:
|
||||||
|
explicit DcpProgressLogger(size_t totalTasks);
|
||||||
|
|
||||||
|
void recordFindDuration(double seconds);
|
||||||
|
void recordSelectDuration(double seconds);
|
||||||
|
void recordUpdateDuration(double seconds);
|
||||||
|
void advanceCompleted(size_t taskCount = 1);
|
||||||
|
|
||||||
|
void printStart(size_t readyCount) const;
|
||||||
|
void maybePrintSlowCandidate(size_t nodeIndex, double elapsedSeconds, size_t readyCount, CPU cpuCount) const;
|
||||||
|
void printProgress(size_t readyCount, CPU cpuCount, llvm::StringRef stage, bool force);
|
||||||
|
|
||||||
|
#ifdef DCP_DEBUG_ENABLED
|
||||||
|
private:
|
||||||
|
static std::string formatDuration(double seconds);
|
||||||
|
|
||||||
|
bool logProgress = false;
|
||||||
|
size_t totalTasks = 0;
|
||||||
|
size_t completedTasks = 0;
|
||||||
|
std::chrono::steady_clock::time_point startTime;
|
||||||
|
std::chrono::steady_clock::time_point lastProgressPrint;
|
||||||
|
double findCandidateSeconds = 0.0;
|
||||||
|
double selectProcessorSeconds = 0.0;
|
||||||
|
double updateTimingSeconds = 0.0;
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
void dumpGraphDot(const std::vector<TaskDCP>& nodes,
|
||||||
|
const std::vector<std::list<TaskDCP*>>& cpuTasks,
|
||||||
|
CPU lastCpu);
|
||||||
|
|
||||||
|
} // namespace dcp_graph
|
||||||
@@ -0,0 +1,105 @@
|
|||||||
|
#include "llvm/ADT/STLExtras.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "GraphSupport.hpp"
|
||||||
|
#include "Task.hpp"
|
||||||
|
#include "UniqueWorklist.hpp"
|
||||||
|
|
||||||
|
namespace dcp_graph {
|
||||||
|
|
||||||
|
llvm::DenseSet<TaskDCP*> collectReachableTasks(TaskDCP* root, bool followParents) {
|
||||||
|
llvm::DenseSet<TaskDCP*> reachable;
|
||||||
|
std::vector<TaskDCP*> worklist;
|
||||||
|
worklist.reserve(32);
|
||||||
|
|
||||||
|
auto enqueueEdges = [&](TaskDCP* task) {
|
||||||
|
const auto& edges = followParents ? task->parents : task->children;
|
||||||
|
for (const auto& edge : edges)
|
||||||
|
if (reachable.insert(edge.first).second)
|
||||||
|
worklist.push_back(edge.first);
|
||||||
|
};
|
||||||
|
|
||||||
|
enqueueEdges(root);
|
||||||
|
while (!worklist.empty()) {
|
||||||
|
TaskDCP* task = worklist.back();
|
||||||
|
worklist.pop_back();
|
||||||
|
enqueueEdges(task);
|
||||||
|
}
|
||||||
|
return reachable;
|
||||||
|
}
|
||||||
|
|
||||||
|
GraphDCP::CandidateRelations computeCandidateRelations(TaskDCP* candidate) {
|
||||||
|
return {collectReachableTasks(candidate, true), collectReachableTasks(candidate, false)};
|
||||||
|
}
|
||||||
|
|
||||||
|
LocalScheduleSnapshot captureLocalScheduleState(TaskDCP* task,
|
||||||
|
const llvm::DenseSet<TaskDCP*>& descendants,
|
||||||
|
Time dcpl,
|
||||||
|
Time maxCompletion,
|
||||||
|
Time secondMaxCompletion,
|
||||||
|
TaskDCP* maxCompletionTask) {
|
||||||
|
LocalScheduleSnapshot snapshot;
|
||||||
|
snapshot.aestBackup.reserve(descendants.size() + 1);
|
||||||
|
snapshot.aestBackup.emplace_back(task, task->getAest());
|
||||||
|
for (TaskDCP* descendant : descendants)
|
||||||
|
snapshot.aestBackup.emplace_back(descendant, descendant->getAest());
|
||||||
|
snapshot.dcpl = dcpl;
|
||||||
|
snapshot.maxCompletion = maxCompletion;
|
||||||
|
snapshot.secondMaxCompletion = secondMaxCompletion;
|
||||||
|
snapshot.maxCompletionTask = maxCompletionTask;
|
||||||
|
return snapshot;
|
||||||
|
}
|
||||||
|
|
||||||
|
void restoreLocalScheduleState(const LocalScheduleSnapshot& snapshot,
|
||||||
|
Time& dcpl,
|
||||||
|
Time& maxCompletion,
|
||||||
|
Time& secondMaxCompletion,
|
||||||
|
TaskDCP*& maxCompletionTask) {
|
||||||
|
for (const auto& [task, aest] : snapshot.aestBackup)
|
||||||
|
task->setAest(aest);
|
||||||
|
dcpl = snapshot.dcpl;
|
||||||
|
maxCompletion = snapshot.maxCompletion;
|
||||||
|
secondMaxCompletion = snapshot.secondMaxCompletion;
|
||||||
|
maxCompletionTask = snapshot.maxCompletionTask;
|
||||||
|
}
|
||||||
|
|
||||||
|
int countDependencyParents(const TaskDCP* task) {
|
||||||
|
return static_cast<int>(llvm::count_if(task->parents, [](const Edge& edge) { return !edge.isScheduling; }));
|
||||||
|
}
|
||||||
|
|
||||||
|
void recordTopologicalMove(TaskDCP* task, TaskInsertion* insertion) {
|
||||||
|
if (insertion == nullptr)
|
||||||
|
return;
|
||||||
|
|
||||||
|
auto alreadyRecorded =
|
||||||
|
llvm::any_of(insertion->topologicalMoves,
|
||||||
|
[task](const TaskInsertion::TopologicalMoveRecord& move) { return move.task == task; });
|
||||||
|
if (alreadyRecorded)
|
||||||
|
return;
|
||||||
|
|
||||||
|
insertion->topologicalMoves.push_back({task, onnx_mlir::LabeledList<TaskDCP>::next(task)});
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<TaskDCP*> collectDominanceOrder(llvm::ArrayRef<TaskDCP*> roots, size_t nodeCount) {
|
||||||
|
UniqueWorkList<std::vector<TaskDCP*>> worklist(roots);
|
||||||
|
worklist.reserve(nodeCount);
|
||||||
|
|
||||||
|
size_t index = 0;
|
||||||
|
while (index != worklist.size()) {
|
||||||
|
bool modified = true;
|
||||||
|
while (modified) {
|
||||||
|
modified = false;
|
||||||
|
for (const auto& child : worklist.at(index)->children)
|
||||||
|
if (worklist.allElementsContained(
|
||||||
|
child.first->parents.begin(), child.first->parents.end(), [](Edge edge) { return edge.first; }))
|
||||||
|
modified |= worklist.pushBack(child.first);
|
||||||
|
}
|
||||||
|
index++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return {worklist.begin(), worklist.end()};
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace dcp_graph
|
||||||
@@ -0,0 +1,41 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "llvm/ADT/ArrayRef.h"
|
||||||
|
#include "llvm/ADT/DenseSet.h"
|
||||||
|
#include "llvm/ADT/SmallVector.h"
|
||||||
|
|
||||||
|
#include <utility>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "Graph.hpp"
|
||||||
|
|
||||||
|
namespace dcp_graph {
|
||||||
|
|
||||||
|
struct LocalScheduleSnapshot {
|
||||||
|
llvm::SmallVector<std::pair<TaskDCP*, Time>, 64> aestBackup;
|
||||||
|
Time dcpl = 0;
|
||||||
|
Time maxCompletion = 0;
|
||||||
|
Time secondMaxCompletion = 0;
|
||||||
|
TaskDCP* maxCompletionTask = nullptr;
|
||||||
|
};
|
||||||
|
|
||||||
|
llvm::DenseSet<TaskDCP*> collectReachableTasks(TaskDCP* root, bool followParents);
|
||||||
|
GraphDCP::CandidateRelations computeCandidateRelations(TaskDCP* candidate);
|
||||||
|
|
||||||
|
LocalScheduleSnapshot captureLocalScheduleState(TaskDCP* task,
|
||||||
|
const llvm::DenseSet<TaskDCP*>& descendants,
|
||||||
|
Time dcpl,
|
||||||
|
Time maxCompletion,
|
||||||
|
Time secondMaxCompletion,
|
||||||
|
TaskDCP* maxCompletionTask);
|
||||||
|
void restoreLocalScheduleState(const LocalScheduleSnapshot& snapshot,
|
||||||
|
Time& dcpl,
|
||||||
|
Time& maxCompletion,
|
||||||
|
Time& secondMaxCompletion,
|
||||||
|
TaskDCP*& maxCompletionTask);
|
||||||
|
|
||||||
|
int countDependencyParents(const TaskDCP* task);
|
||||||
|
void recordTopologicalMove(TaskDCP* task, TaskInsertion* insertion);
|
||||||
|
std::vector<TaskDCP*> collectDominanceOrder(llvm::ArrayRef<TaskDCP*> roots, size_t nodeCount);
|
||||||
|
|
||||||
|
} // namespace dcp_graph
|
||||||
@@ -4,57 +4,63 @@
|
|||||||
#include "Task.hpp"
|
#include "Task.hpp"
|
||||||
#include "UniqueWorklist.hpp"
|
#include "UniqueWorklist.hpp"
|
||||||
|
|
||||||
std::optional<Edge_t> TaskDCP::addChild(TaskDCP* child, Weight_t weight) {
|
std::optional<Edge> TaskDCP::addChild(TaskDCP* child, Weight weight, bool isScheduling) {
|
||||||
std::optional<Edge_t> oldEdge = std::nullopt;
|
std::optional<Edge> oldEdge = std::nullopt;
|
||||||
auto founded_element =
|
auto foundElement = std::find_if(children.begin(), children.end(), [child, isScheduling](Edge element) {
|
||||||
std::find_if(childs.begin(), childs.end(), [child](Edge_t element) { return child == element.first; });
|
return child == element.first && isScheduling == element.isScheduling;
|
||||||
if (founded_element != childs.end()) {
|
});
|
||||||
oldEdge = *founded_element;
|
if (foundElement != children.end()) {
|
||||||
fastRemove(childs, founded_element);
|
oldEdge = *foundElement;
|
||||||
|
fastRemove(children, foundElement);
|
||||||
}
|
}
|
||||||
childs.emplace_back(child, weight);
|
children.emplace_back(Edge {child, weight, isScheduling});
|
||||||
return oldEdge;
|
return oldEdge;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::optional<Edge_t> TaskDCP::addParent(TaskDCP* parent, Weight_t weight) {
|
std::optional<Edge> TaskDCP::addParent(TaskDCP* parent, Weight weight, bool isScheduling) {
|
||||||
std::optional<Edge_t> oldEdge = std::nullopt;
|
std::optional<Edge> oldEdge = std::nullopt;
|
||||||
auto founded_element =
|
auto foundElement = std::find_if(parents.begin(), parents.end(), [parent, isScheduling](Edge element) {
|
||||||
std::find_if(parents.begin(), parents.end(), [parent](Edge_t element) { return parent == element.first; });
|
return parent == element.first && isScheduling == element.isScheduling;
|
||||||
if (founded_element != parents.end()) {
|
});
|
||||||
oldEdge = *founded_element;
|
if (foundElement != parents.end()) {
|
||||||
fastRemove(parents, founded_element);
|
oldEdge = *foundElement;
|
||||||
|
fastRemove(parents, foundElement);
|
||||||
}
|
}
|
||||||
parents.emplace_back(parent, weight);
|
parents.emplace_back(Edge {parent, weight, isScheduling});
|
||||||
return oldEdge;
|
return oldEdge;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool TaskDCP::hasDescendent(TaskDCP* child) {
|
bool TaskDCP::hasDescendant(TaskDCP* child) {
|
||||||
UniqueWorkList<std::vector<TaskDCP*>> worklist;
|
UniqueWorkList<std::vector<TaskDCP*>> worklist;
|
||||||
worklist.reserve(32);
|
worklist.reserve(32);
|
||||||
worklist.push_back(this);
|
worklist.pushBack(this);
|
||||||
while (!worklist.empty()) {
|
while (!worklist.empty()) {
|
||||||
TaskDCP* task = worklist.back();
|
TaskDCP* task = worklist.back();
|
||||||
worklist.pop_back();
|
worklist.popBack();
|
||||||
if (task == child)
|
if (task == child)
|
||||||
return true;
|
return true;
|
||||||
for (auto c : task->childs)
|
for (auto edge : task->children)
|
||||||
worklist.push_back(c.first);
|
worklist.pushBack(edge.first);
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO fare qualcosa di sensato
|
Weight TaskDCP::computeWeightOnCpu(GraphDCP* graph, CPU cpu) {
|
||||||
int TaskDCP::computeWeight(GraphDCP* graph, CPU cpu) { return origWeight; }
|
if (crossbarUsage != 0 && graph->wouldExhaustCrossbarCapacity(cpu, this))
|
||||||
|
return std::numeric_limits<Weight>::max();
|
||||||
|
return baseWeight;
|
||||||
|
}
|
||||||
|
|
||||||
void TaskInsertion::rollBack() {
|
void TaskInsertion::rollBack() {
|
||||||
graph->removeTaskFromCPU(cpuModified, taskInserted);
|
graph->removeTaskFromCPU(cpuModified, taskInserted);
|
||||||
if (beforeNode.has_value()) {
|
if (beforeNode.has_value()) {
|
||||||
auto double_edge = *beforeNode;
|
auto edgePair = *beforeNode;
|
||||||
addEdge(double_edge.first.first, double_edge.second.first, double_edge.first.second);
|
addEdge(edgePair.first.first, edgePair.second.first, edgePair.first.second, edgePair.first.isScheduling);
|
||||||
}
|
}
|
||||||
if (afterNode.has_value()) {
|
if (afterNode.has_value()) {
|
||||||
auto double_edge = *afterNode;
|
auto edgePair = *afterNode;
|
||||||
addEdge(double_edge.first.first, double_edge.second.first, double_edge.first.second);
|
addEdge(edgePair.first.first, edgePair.second.first, edgePair.first.second, edgePair.first.isScheduling);
|
||||||
}
|
}
|
||||||
graph->topologicalOrder.moveBefore( taskInserted,&*oldTopologicalPosition );
|
// for (auto it = topologicalMoves.rbegin(); it != topologicalMoves.rend(); ++it)
|
||||||
|
// graph->topologicalOrder.moveBefore(it->task, it->nextTask);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,110 +7,117 @@
|
|||||||
#include "Utils.hpp"
|
#include "Utils.hpp"
|
||||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||||
|
|
||||||
std::optional<DoubleEdge> addEdge(TaskDCP* parent, TaskDCP* child, Weight_t weight);
|
|
||||||
void removeEdge(TaskDCP* parent, TaskDCP* child);
|
|
||||||
|
|
||||||
class TaskDCP : public onnx_mlir::LabeledListNode<TaskDCP> {
|
class TaskDCP : public onnx_mlir::LabeledListNode<TaskDCP> {
|
||||||
onnx_mlir::spatial::SpatWeightedCompute spatWeightedCompute;
|
onnx_mlir::spatial::SpatWeightedCompute spatWeightedCompute;
|
||||||
int aest;
|
Time aest;
|
||||||
int alst;
|
Time alst;
|
||||||
std::optional<CPU> scheduledCPU;
|
std::optional<CPU> scheduledCpu;
|
||||||
int weight;
|
Weight weight;
|
||||||
int origWeight;
|
Weight baseWeight;
|
||||||
|
CrossbarUsage crossbarUsage;
|
||||||
long long flag = 0;
|
long long flag = 0;
|
||||||
int64_t syntheticId = -1;
|
int64_t syntheticId = -1;
|
||||||
|
|
||||||
std::optional<Edge_t> addChild(TaskDCP* child, Weight_t weight);
|
std::optional<Edge> addChild(TaskDCP* child, Weight weight, bool isScheduling);
|
||||||
std::optional<Edge_t> addChild(TaskDCP& child, Weight_t weight) { return addChild(&child, weight); }
|
std::optional<Edge> addChild(TaskDCP& child, Weight weight, bool isScheduling) {
|
||||||
|
return addChild(&child, weight, isScheduling);
|
||||||
|
}
|
||||||
|
|
||||||
void removeChild(TaskDCP* to_remove) { fastRemove(childs, to_remove); }
|
void removeChild(TaskDCP* toRemove, bool isScheduling) { fastRemove(children, toRemove, isScheduling); }
|
||||||
void removeChild(TaskDCP& to_remove) { fastRemove(childs, &to_remove); }
|
void removeChild(TaskDCP& toRemove, bool isScheduling) { fastRemove(children, &toRemove, isScheduling); }
|
||||||
|
|
||||||
std::optional<Edge_t> addParent(TaskDCP* parent, Weight_t weight);
|
std::optional<Edge> addParent(TaskDCP* parent, Weight weight, bool isScheduling);
|
||||||
std::optional<Edge_t> addParent(TaskDCP& parent, Weight_t weight) { return addParent(&parent, weight); }
|
std::optional<Edge> addParent(TaskDCP& parent, Weight weight, bool isScheduling) {
|
||||||
|
return addParent(&parent, weight, isScheduling);
|
||||||
|
}
|
||||||
|
|
||||||
void removeParent(TaskDCP* to_remove) { fastRemove(parents, to_remove); }
|
void removeParent(TaskDCP* toRemove, bool isScheduling) { fastRemove(parents, toRemove, isScheduling); }
|
||||||
void removeParent(TaskDCP& to_remove) { fastRemove(parents, &to_remove); }
|
void removeParent(TaskDCP& toRemove, bool isScheduling) { fastRemove(parents, &toRemove, isScheduling); }
|
||||||
|
|
||||||
public:
|
public:
|
||||||
std::vector<Edge_t> parents;
|
std::vector<Edge> parents;
|
||||||
std::vector<Edge_t> childs;
|
std::vector<Edge> children;
|
||||||
TaskDCP() = default;
|
TaskDCP() = default;
|
||||||
TaskDCP(onnx_mlir::spatial::SpatWeightedCompute spatWeightedCompute)
|
TaskDCP(onnx_mlir::spatial::SpatWeightedCompute spatWeightedCompute)
|
||||||
: onnx_mlir::LabeledListNode<TaskDCP>(),
|
: onnx_mlir::LabeledListNode<TaskDCP>(),
|
||||||
spatWeightedCompute(spatWeightedCompute),
|
spatWeightedCompute(spatWeightedCompute),
|
||||||
aest(0),
|
aest(0),
|
||||||
alst(0),
|
alst(0),
|
||||||
scheduledCPU(),
|
scheduledCpu(),
|
||||||
weight(getSpatWeightCompute(spatWeightedCompute)),
|
weight(getSpatComputeWeight(spatWeightedCompute)),
|
||||||
origWeight(weight),
|
baseWeight(weight),
|
||||||
|
crossbarUsage(getSpatComputeCrossbarUsage(spatWeightedCompute)),
|
||||||
syntheticId(-1),
|
syntheticId(-1),
|
||||||
parents(),
|
parents(),
|
||||||
childs() {}
|
children() {}
|
||||||
|
|
||||||
TaskDCP(int64_t id, int weight)
|
TaskDCP(int64_t id, Weight weight, CrossbarUsage crossbarUsage = 0)
|
||||||
: onnx_mlir::LabeledListNode<TaskDCP>(),
|
: onnx_mlir::LabeledListNode<TaskDCP>(),
|
||||||
spatWeightedCompute(),
|
spatWeightedCompute(),
|
||||||
aest(0),
|
aest(0),
|
||||||
alst(0),
|
alst(0),
|
||||||
scheduledCPU(),
|
scheduledCpu(),
|
||||||
weight(weight),
|
weight(weight),
|
||||||
origWeight(weight),
|
baseWeight(weight),
|
||||||
|
crossbarUsage(crossbarUsage),
|
||||||
flag(0),
|
flag(0),
|
||||||
syntheticId(id),
|
syntheticId(id),
|
||||||
parents(),
|
parents(),
|
||||||
childs() {}
|
children() {}
|
||||||
|
|
||||||
TaskDCP(const TaskDCP& node) = delete;
|
TaskDCP(const TaskDCP& node) = delete;
|
||||||
TaskDCP(TaskDCP&& node) = default;
|
TaskDCP(TaskDCP&& node) = default;
|
||||||
|
|
||||||
void setCPU(CPU cpu) { scheduledCPU = cpu; }
|
void setCpu(CPU cpu) { scheduledCpu = cpu; }
|
||||||
std::optional<CPU> getCPU() const { return scheduledCPU; }
|
std::optional<CPU> getCpu() const { return scheduledCpu; }
|
||||||
void resetCPU() { scheduledCPU = std::nullopt; }
|
void resetCpu() { scheduledCpu = std::nullopt; }
|
||||||
int getWeight() const {
|
Weight getWeight() const {
|
||||||
if (isScheduled())
|
if (isScheduled())
|
||||||
return weight;
|
return weight;
|
||||||
return origWeight;
|
return baseWeight;
|
||||||
}
|
}
|
||||||
void setWeight(int val) { weight = val; }
|
void setWeight(Weight value) { weight = value; }
|
||||||
void resetWeight() { weight = origWeight; }
|
void resetWeight() { weight = baseWeight; }
|
||||||
int computeWeight(GraphDCP* graph, CPU cpu);
|
Weight computeWeightOnCpu(GraphDCP* graph, CPU cpu);
|
||||||
|
CrossbarUsage getCrossbarUsage() const { return crossbarUsage; }
|
||||||
|
|
||||||
bool hasParents() const { return parents.size() != 0; }
|
bool hasParents() const { return parents.size() != 0; }
|
||||||
bool hasChilds() const { return childs.size() != 0; }
|
bool hasChildren() const { return children.size() != 0; }
|
||||||
|
|
||||||
int getAEST() const { return aest; }
|
Time getAest() const { return aest; }
|
||||||
int getALST() const { return alst; }
|
Time getAlst() const { return alst; }
|
||||||
void setAEST(int val) {
|
void setAest(Time value) { aest = value; }
|
||||||
assert(val >= 0);
|
void setAlst(Time value) { alst = value; }
|
||||||
aest = val;
|
bool hasDescendant(TaskDCP* child);
|
||||||
}
|
|
||||||
void setALST(int val) { alst = val; }
|
|
||||||
bool hasDescendent(TaskDCP* child);
|
|
||||||
int64_t Id() const {
|
int64_t Id() const {
|
||||||
if (spatWeightedCompute)
|
if (spatWeightedCompute)
|
||||||
return reinterpret_cast<int64_t>(spatWeightedCompute.getAsOpaquePointer());
|
return reinterpret_cast<int64_t>(spatWeightedCompute.getAsOpaquePointer());
|
||||||
return syntheticId;
|
return syntheticId;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool isCP() const { return alst == aest; }
|
bool isCriticalPath() const { return alst == aest; }
|
||||||
bool isScheduled() const { return scheduledCPU.has_value(); }
|
bool isScheduled() const { return scheduledCpu.has_value(); }
|
||||||
onnx_mlir::spatial::SpatWeightedCompute getSpatWeightedCompute() const { return spatWeightedCompute; }
|
onnx_mlir::spatial::SpatWeightedCompute getSpatWeightedCompute() const { return spatWeightedCompute; }
|
||||||
|
|
||||||
void setFlag(long long val) { flag = val; }
|
void setFlag(long long val) { flag = val; }
|
||||||
long long getFlag() const { return flag; }
|
long long getFlag() const { return flag; }
|
||||||
|
|
||||||
onnx_mlir::LabeledList<TaskDCP>::Iterator getTopologicalPosition() { return getIterator(); }
|
onnx_mlir::LabeledList<TaskDCP>::Iterator getTopologicalIterator() { return getIterator(); }
|
||||||
|
|
||||||
friend std::optional<DoubleEdge> addEdge(TaskDCP* parent, TaskDCP* child, Weight_t weight);
|
friend std::optional<EdgePair> addEdge(TaskDCP* parent, TaskDCP* child, Weight weight, bool isScheduling);
|
||||||
friend void removeEdge(TaskDCP* parent, TaskDCP* child);
|
friend void removeEdge(TaskDCP* parent, TaskDCP* child, bool isScheduling);
|
||||||
friend int getTranferCost(TaskDCP* parent, TaskDCP* child);
|
friend Weight getTransferCost(TaskDCP* parent, TaskDCP* child);
|
||||||
};
|
};
|
||||||
|
|
||||||
struct TaskInsertion {
|
struct TaskInsertion {
|
||||||
std::optional<DoubleEdge> beforeNode;
|
struct TopologicalMoveRecord {
|
||||||
std::optional<DoubleEdge> afterNode;
|
TaskDCP* task;
|
||||||
onnx_mlir::LabeledList<TaskDCP>::Iterator oldTopologicalPosition;
|
TaskDCP* nextTask;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::optional<EdgePair> beforeNode;
|
||||||
|
std::optional<EdgePair> afterNode;
|
||||||
|
std::vector<TopologicalMoveRecord> topologicalMoves;
|
||||||
CPU cpuModified;
|
CPU cpuModified;
|
||||||
TaskDCP* taskInserted;
|
TaskDCP* taskInserted;
|
||||||
GraphDCP* graph;
|
GraphDCP* graph;
|
||||||
|
|||||||
@@ -1,58 +1,57 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "llvm/ADT/DenseSet.h"
|
#include "llvm/ADT/DenseSet.h"
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <type_traits>
|
#include <type_traits>
|
||||||
#include <iostream>
|
|
||||||
#include <unordered_set>
|
|
||||||
|
|
||||||
template <typename T, typename = void>
|
template <typename T, typename = void>
|
||||||
struct has_pop_front : std::false_type {};
|
struct HasPopFront : std::false_type {};
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
struct has_pop_front<T, std::void_t<decltype(std::declval<T>().pop_front())>> : std::true_type {};
|
struct HasPopFront<T, std::void_t<decltype(std::declval<T>().pop_front())>> : std::true_type {};
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
class UniqueWorkList {
|
class UniqueWorkList {
|
||||||
|
|
||||||
using V = typename T::value_type;
|
using ValueType = typename T::value_type;
|
||||||
T storage;
|
T storage;
|
||||||
llvm::DenseSet<V> set;
|
llvm::DenseSet<ValueType> uniqueElements;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
UniqueWorkList() = default;
|
UniqueWorkList() = default;
|
||||||
|
|
||||||
template <typename arg_ty>
|
template <typename RangeT>
|
||||||
UniqueWorkList(const arg_ty& from)
|
UniqueWorkList(const RangeT& from)
|
||||||
: storage() {
|
: storage() {
|
||||||
for (auto& element : from) {
|
for (auto& element : from) {
|
||||||
if (!set.contains(element)) {
|
if (!uniqueElements.contains(element)) {
|
||||||
storage.push_back(element);
|
storage.push_back(element);
|
||||||
set.insert(element);
|
uniqueElements.insert(element);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool empty() const { return storage.empty(); }
|
bool empty() const { return storage.empty(); }
|
||||||
void reserve(size_t val) { return storage.reserve(val); }
|
void reserve(size_t value) { return storage.reserve(value); }
|
||||||
size_t size() const { return storage.size(); }
|
size_t size() const { return storage.size(); }
|
||||||
V& at(size_t i) { return storage.at(i); }
|
ValueType& at(size_t index) { return storage.at(index); }
|
||||||
const V& at(size_t i) const { return storage.at(i); }
|
const ValueType& at(size_t index) const { return storage.at(index); }
|
||||||
|
|
||||||
V& front() { return storage.front(); }
|
ValueType& front() { return storage.front(); }
|
||||||
V& back() { return storage.back(); }
|
ValueType& back() { return storage.back(); }
|
||||||
|
|
||||||
bool push_back(const V& val) {
|
bool pushBack(const ValueType& value) {
|
||||||
if (!set.contains(val)) {
|
if (!uniqueElements.contains(value)) {
|
||||||
storage.push_back(val);
|
storage.push_back(value);
|
||||||
set.insert(val);
|
uniqueElements.insert(value);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void pop_front() {
|
void popFront() {
|
||||||
if constexpr (has_pop_front<T>::value)
|
if constexpr (HasPopFront<T>::value)
|
||||||
storage.pop_front();
|
storage.pop_front();
|
||||||
else
|
else
|
||||||
assert(false && "Underlying storage type does not support pop_front()");
|
assert(false && "Underlying storage type does not support pop_front()");
|
||||||
@@ -61,15 +60,15 @@ public:
|
|||||||
auto cbegin() const { return storage.cbegin(); }
|
auto cbegin() const { return storage.cbegin(); }
|
||||||
auto cend() const { return storage.cend(); }
|
auto cend() const { return storage.cend(); }
|
||||||
|
|
||||||
void pop_back() { storage.pop_back(); }
|
void popBack() { storage.pop_back(); }
|
||||||
|
|
||||||
|
|
||||||
template <typename Iterator, typename Mapper>
|
template <typename Iterator, typename Mapper>
|
||||||
bool allElementContained(Iterator start, Iterator end, Mapper map) {
|
bool allElementsContained(Iterator begin, Iterator end, Mapper map) const {
|
||||||
while (start != end) {
|
auto it = begin;
|
||||||
if (!set.contains(map(*start)))
|
while (it != end) {
|
||||||
|
if (!uniqueElements.contains(map(*it)))
|
||||||
return false;
|
return false;
|
||||||
std::advance(start, 1);
|
std::advance(it, 1);
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -77,4 +76,8 @@ public:
|
|||||||
auto begin() { return storage.begin(); }
|
auto begin() { return storage.begin(); }
|
||||||
|
|
||||||
auto end() { return storage.end(); }
|
auto end() { return storage.end(); }
|
||||||
|
|
||||||
|
auto begin() const { return storage.begin(); }
|
||||||
|
|
||||||
|
auto end() const { return storage.end(); }
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -6,60 +6,106 @@
|
|||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
|
#include <limits>
|
||||||
#include <list>
|
#include <list>
|
||||||
|
#include <type_traits>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "src/Accelerators/PIM/Common/LabeledList.hpp"
|
#include "src/Accelerators/PIM/Common/LabeledList.hpp"
|
||||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||||
#include "src/Support/TypeUtilities.hpp"
|
|
||||||
|
|
||||||
|
|
||||||
using CPU = int;
|
using CPU = int;
|
||||||
using Weight_t = int;
|
using Weight = unsigned long long;
|
||||||
|
using Time = unsigned long long;
|
||||||
|
using CrossbarUsage = unsigned long long;
|
||||||
class TaskDCP;
|
class TaskDCP;
|
||||||
class GraphDCP;
|
class GraphDCP;
|
||||||
using Edge_t = std::pair<TaskDCP*, Weight_t>;
|
struct Edge {
|
||||||
using DoubleEdge = std::pair<Edge_t, Edge_t>;
|
TaskDCP* first;
|
||||||
using EdgesIndex = std::tuple<int64_t, int64_t, int64_t>;
|
Weight second;
|
||||||
|
bool isScheduling = false;
|
||||||
|
};
|
||||||
|
using EdgePair = std::pair<Edge, Edge>;
|
||||||
|
using IndexedEdge = std::tuple<int64_t, int64_t, int64_t>;
|
||||||
|
|
||||||
|
inline void fastRemove(std::vector<Edge>& vector, TaskDCP* toRemove, bool isScheduling) {
|
||||||
|
auto position = std::find_if(vector.begin(), vector.end(), [toRemove, isScheduling](Edge edge) {
|
||||||
|
return edge.first == toRemove && edge.isScheduling == isScheduling;
|
||||||
|
});
|
||||||
|
if (position != vector.end()) {
|
||||||
|
std::swap(*(vector.end() - 1), *position);
|
||||||
|
vector.pop_back();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void fastRemove(std::vector<TaskDCP*>& vector, TaskDCP* toRemove) {
|
||||||
|
auto position =
|
||||||
|
std::find_if(vector.begin(), vector.end(), [toRemove](TaskDCP* element) { return element == toRemove; });
|
||||||
|
if (position != vector.end()) {
|
||||||
|
std::swap(*(vector.end() - 1), *position);
|
||||||
|
vector.pop_back();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename P>
|
||||||
|
void fastRemove(std::vector<Edge>& vector, P position) {
|
||||||
|
if (position != vector.end()) {
|
||||||
|
std::swap(*(vector.end() - 1), *position);
|
||||||
|
vector.pop_back();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void fastRemove(std::vector<std::pair<T*, Weight_t>>& vector, T* to_remove) {
|
inline T checkedAdd(T lhs, T rhs) {
|
||||||
auto position =
|
static_assert(std::is_unsigned_v<T>, "checkedAdd only supports unsigned types");
|
||||||
std::find_if(vector.begin(), vector.end(), [to_remove](Edge_t edge) { return edge.first == to_remove; });
|
assert(lhs <= std::numeric_limits<T>::max() - rhs && "unsigned addition overflow");
|
||||||
if (position != vector.end()) {
|
return lhs + rhs;
|
||||||
std::swap(*(vector.end() - 1), *position);
|
|
||||||
vector.pop_back();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void fastRemove(std::vector<TaskDCP*>& vector, TaskDCP* to_remove) {
|
template <typename T>
|
||||||
auto position =
|
inline T checkedMultiply(T lhs, T rhs) {
|
||||||
std::find_if(vector.begin(), vector.end(), [to_remove](TaskDCP* element) { return element == to_remove; });
|
static_assert(std::is_unsigned_v<T>, "checkedMultiply only supports unsigned types");
|
||||||
if (position != vector.end()) {
|
if (lhs == 0 || rhs == 0)
|
||||||
std::swap(*(vector.end() - 1), *position);
|
return 0;
|
||||||
vector.pop_back();
|
assert(lhs <= std::numeric_limits<T>::max() / rhs && "unsigned multiplication overflow");
|
||||||
}
|
return lhs * rhs;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T, typename P>
|
template <typename T>
|
||||||
void fastRemove(std::vector<std::pair<T*, Weight_t>>& vector, P position) {
|
inline T addOrMax(T lhs, T rhs) {
|
||||||
if (position != vector.end()) {
|
static_assert(std::is_unsigned_v<T>, "addOrMax only supports unsigned types");
|
||||||
std::swap(*(vector.end() - 1), *position);
|
if (lhs == std::numeric_limits<T>::max() || rhs == std::numeric_limits<T>::max())
|
||||||
vector.pop_back();
|
return std::numeric_limits<T>::max();
|
||||||
}
|
return checkedAdd(lhs, rhs);
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO Fare qualcosa di sensato
|
template <typename T>
|
||||||
inline int64_t getSpatWeightCompute(onnx_mlir::spatial::SpatWeightedCompute spatWeightedCompute) {
|
inline T subtractOrZero(T lhs, T rhs) {
|
||||||
int64_t tot = 0;
|
static_assert(std::is_unsigned_v<T>, "subtractOrZero only supports unsigned types");
|
||||||
for (auto& region : spatWeightedCompute.getBody()) {
|
if (lhs == std::numeric_limits<T>::max())
|
||||||
for (auto& inst : region) {
|
return lhs;
|
||||||
for (auto result : inst.getResults())
|
if (rhs == std::numeric_limits<T>::max() || lhs <= rhs)
|
||||||
if (auto element = llvm::dyn_cast<mlir::ShapedType>(result.getType()))
|
return 0;
|
||||||
tot += onnx_mlir::getSizeInBytes(element);
|
return lhs - rhs;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
return tot;
|
inline Time slackOrZero(Time earliestStart, Time latestStart) { return subtractOrZero(latestStart, earliestStart); }
|
||||||
|
|
||||||
|
inline Weight getSpatComputeWeight(onnx_mlir::spatial::SpatWeightedCompute spatWeightedCompute) {
|
||||||
|
constexpr Weight kOperationWeight = 100;
|
||||||
|
Weight numOperations = 0;
|
||||||
|
for (auto& block : spatWeightedCompute.getBody())
|
||||||
|
for ([[maybe_unused]] auto& op : block)
|
||||||
|
numOperations = checkedAdd(numOperations, static_cast<Weight>(1));
|
||||||
|
return checkedMultiply(numOperations, kOperationWeight);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline CrossbarUsage getSpatComputeCrossbarUsage(onnx_mlir::spatial::SpatWeightedCompute spatWeightedCompute) {
|
||||||
|
CrossbarUsage crossbarUsage = 0;
|
||||||
|
for (auto& region : spatWeightedCompute.getBody())
|
||||||
|
for (auto& inst : region)
|
||||||
|
if (llvm::isa<onnx_mlir::spatial::SpatWeightedVMMOp>(inst))
|
||||||
|
crossbarUsage = checkedAdd(crossbarUsage, static_cast<CrossbarUsage>(1));
|
||||||
|
return crossbarUsage;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,7 +5,6 @@
|
|||||||
#include "mlir/IR/Region.h"
|
#include "mlir/IR/Region.h"
|
||||||
#include "mlir/IR/Value.h"
|
#include "mlir/IR/Value.h"
|
||||||
#include "mlir/IR/ValueRange.h"
|
#include "mlir/IR/ValueRange.h"
|
||||||
#include "mlir/IR/Verifier.h"
|
|
||||||
#include "mlir/Pass/Pass.h"
|
#include "mlir/Pass/Pass.h"
|
||||||
#include "mlir/Support/LLVM.h"
|
#include "mlir/Support/LLVM.h"
|
||||||
|
|
||||||
@@ -14,13 +13,12 @@
|
|||||||
#include "llvm/Support/Debug.h"
|
#include "llvm/Support/Debug.h"
|
||||||
|
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
#include <cstdint>
|
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include <iterator>
|
#include <iterator>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
|
|
||||||
#include "DCPGraph/DCPAnalysis.hpp"
|
#include "DCPGraph/DCPAnalysis.hpp"
|
||||||
|
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
|
||||||
|
|
||||||
using namespace mlir;
|
using namespace mlir;
|
||||||
|
|
||||||
@@ -36,10 +34,10 @@ struct ComputeValueResults {
|
|||||||
class LazyInsertComputeResult {
|
class LazyInsertComputeResult {
|
||||||
using InsertPoint = mlir::IRRewriter::InsertPoint;
|
using InsertPoint = mlir::IRRewriter::InsertPoint;
|
||||||
ComputeValueResults computeResults;
|
ComputeValueResults computeResults;
|
||||||
Value channelNewOpVal;
|
Value channelValue;
|
||||||
bool onlyChannel;
|
bool onlyChannel;
|
||||||
std::function<void(InsertPoint insertPoint)> channelSendInserter;
|
std::function<void(InsertPoint insertPoint)> channelSendInserter;
|
||||||
InsertPoint insertPointSend;
|
InsertPoint sendInsertPoint;
|
||||||
std::function<std::pair<Value, std::function<void(InsertPoint)>>()> channelNewInserter;
|
std::function<std::pair<Value, std::function<void(InsertPoint)>>()> channelNewInserter;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
@@ -49,7 +47,7 @@ public:
|
|||||||
: computeResults(computeValueResults),
|
: computeResults(computeValueResults),
|
||||||
onlyChannel(isOnlyChannel),
|
onlyChannel(isOnlyChannel),
|
||||||
channelSendInserter(nullptr),
|
channelSendInserter(nullptr),
|
||||||
insertPointSend({}),
|
sendInsertPoint({}),
|
||||||
channelNewInserter(channelNewInserter) {}
|
channelNewInserter(channelNewInserter) {}
|
||||||
|
|
||||||
struct ChannelOrLocalOp {
|
struct ChannelOrLocalOp {
|
||||||
@@ -59,23 +57,23 @@ public:
|
|||||||
|
|
||||||
bool onlyChanneled() const { return onlyChannel; }
|
bool onlyChanneled() const { return onlyChannel; }
|
||||||
|
|
||||||
ChannelOrLocalOp getAsChannelValueAndInsertSender(SpatWeightedCompute spatWeightedCompute) {
|
ChannelOrLocalOp getAsChannelValueAndInsertSender(SpatWeightedCompute currentCompute) {
|
||||||
|
|
||||||
auto [first, second] = channelNewInserter();
|
auto [newChannelValue, senderInserter] = channelNewInserter();
|
||||||
channelNewOpVal = first;
|
channelValue = newChannelValue;
|
||||||
channelSendInserter = second;
|
channelSendInserter = senderInserter;
|
||||||
auto BB = computeResults.innerValue.getParentBlock();
|
auto* block = computeResults.innerValue.getParentBlock();
|
||||||
if (!BB->empty() && isa<spatial::SpatYieldOp>(BB->back()))
|
if (!block->empty() && isa<spatial::SpatYieldOp>(block->back()))
|
||||||
insertPointSend = InsertPoint(BB, --BB->end());
|
sendInsertPoint = InsertPoint(block, --block->end());
|
||||||
else
|
else
|
||||||
insertPointSend = InsertPoint(BB, BB->end());
|
sendInsertPoint = InsertPoint(block, block->end());
|
||||||
if (spatWeightedCompute) {
|
if (currentCompute) {
|
||||||
for (auto& BB : spatWeightedCompute.getBody())
|
for (auto& block : currentCompute.getBody())
|
||||||
if (&BB == insertPointSend.getBlock())
|
if (&block == sendInsertPoint.getBlock())
|
||||||
return {computeResults.innerValue, false};
|
return {computeResults.innerValue, false};
|
||||||
}
|
}
|
||||||
channelSendInserter(insertPointSend);
|
channelSendInserter(sendInsertPoint);
|
||||||
return {channelNewOpVal, true};
|
return {channelValue, true};
|
||||||
}
|
}
|
||||||
|
|
||||||
ChannelOrLocalOp getAsChannelValueAndInsertSender() { return getAsChannelValueAndInsertSender({}); }
|
ChannelOrLocalOp getAsChannelValueAndInsertSender() { return getAsChannelValueAndInsertSender({}); }
|
||||||
@@ -86,7 +84,7 @@ struct MergeComputeNodesPass : PassWrapper<MergeComputeNodesPass, OperationPass<
|
|||||||
private:
|
private:
|
||||||
DenseMap<SpatWeightedCompute, LazyInsertComputeResult> newComputeNodeResults;
|
DenseMap<SpatWeightedCompute, LazyInsertComputeResult> newComputeNodeResults;
|
||||||
DenseMap<SpatWeightedCompute, SpatWeightedCompute> oldToNewComputeMap;
|
DenseMap<SpatWeightedCompute, SpatWeightedCompute> oldToNewComputeMap;
|
||||||
DenseMap<int64_t, SpatWeightedCompute> cputToNewComputeMap;
|
DenseMap<int64_t, SpatWeightedCompute> cpuToNewComputeMap;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MergeComputeNodesPass)
|
MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MergeComputeNodesPass)
|
||||||
@@ -101,17 +99,16 @@ public:
|
|||||||
|
|
||||||
void runOnOperation() override {
|
void runOnOperation() override {
|
||||||
DCPAnalysisResult& analysisResult = getAnalysis<spatial::DCPAnalysis>().getResult();
|
DCPAnalysisResult& analysisResult = getAnalysis<spatial::DCPAnalysis>().getResult();
|
||||||
auto& lastComputeOfCpu = analysisResult.isLastComputeOfACpu;
|
auto& lastComputeOfCpu = analysisResult.isLastComputeOfCpu;
|
||||||
auto& cpuToLastComputeMap = analysisResult.cpuToLastComputeMap;
|
auto& cpuToLastComputeMap = analysisResult.cpuToLastComputeMap;
|
||||||
IRRewriter rewriter(&getContext());
|
|
||||||
|
|
||||||
for (auto currentComputeNode : analysisResult.dominanceOrderCompute) {
|
for (auto currentComputeNode : analysisResult.dominanceOrderCompute) {
|
||||||
size_t cpu = analysisResult.computeToCPUMap.at(currentComputeNode);
|
size_t cpu = analysisResult.computeToCpuMap.at(currentComputeNode);
|
||||||
if (!cputToNewComputeMap.contains(cpu)) {
|
if (!cpuToNewComputeMap.contains(cpu)) {
|
||||||
ValueTypeRange<ResultRange> newWeightedComputeType = cpuToLastComputeMap.at(cpu).getResultTypes();
|
ValueTypeRange<ResultRange> newWeightedComputeType = cpuToLastComputeMap.at(cpu).getResultTypes();
|
||||||
auto [newWeightedCompute, computeValueResult] = createNewComputeNode(
|
auto [newWeightedCompute, computeValueResult] = createNewComputeNode(
|
||||||
currentComputeNode, newWeightedComputeType, lastComputeOfCpu.contains(currentComputeNode));
|
currentComputeNode, newWeightedComputeType, lastComputeOfCpu.contains(currentComputeNode));
|
||||||
cputToNewComputeMap[cpu] = newWeightedCompute;
|
cpuToNewComputeMap[cpu] = newWeightedCompute;
|
||||||
newComputeNodeResults.insert(
|
newComputeNodeResults.insert(
|
||||||
std::make_pair(currentComputeNode,
|
std::make_pair(currentComputeNode,
|
||||||
createLazyComputeResult(
|
createLazyComputeResult(
|
||||||
@@ -119,7 +116,7 @@ public:
|
|||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
auto [newWeightedCompute, computeValueResult] = mergeIntoComputeNode(
|
auto [newWeightedCompute, computeValueResult] = mergeIntoComputeNode(
|
||||||
cputToNewComputeMap[cpu], currentComputeNode, lastComputeOfCpu.contains(currentComputeNode));
|
cpuToNewComputeMap[cpu], currentComputeNode, lastComputeOfCpu.contains(currentComputeNode));
|
||||||
newComputeNodeResults.insert(
|
newComputeNodeResults.insert(
|
||||||
std::make_pair(currentComputeNode,
|
std::make_pair(currentComputeNode,
|
||||||
createLazyComputeResult(
|
createLazyComputeResult(
|
||||||
@@ -127,10 +124,10 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto computeNodetoRemove : llvm::make_early_inc_range(llvm::reverse(analysisResult.dominanceOrderCompute))) {
|
for (auto computeNodeToRemove : llvm::make_early_inc_range(llvm::reverse(analysisResult.dominanceOrderCompute))) {
|
||||||
for (auto users : computeNodetoRemove->getUsers())
|
for (auto users : computeNodeToRemove->getUsers())
|
||||||
users->dump();
|
users->dump();
|
||||||
computeNodetoRemove.erase();
|
computeNodeToRemove.erase();
|
||||||
}
|
}
|
||||||
func::FuncOp func = getOperation();
|
func::FuncOp func = getOperation();
|
||||||
dumpModule(cast<ModuleOp>(func->getParentOp()), "spatial1_dcp_merged");
|
dumpModule(cast<ModuleOp>(func->getParentOp()), "spatial1_dcp_merged");
|
||||||
@@ -186,9 +183,9 @@ private:
|
|||||||
LazyInsertComputeResult& lazyArgWeight = newComputeNodeResults.at(argWeightCompute);
|
LazyInsertComputeResult& lazyArgWeight = newComputeNodeResults.at(argWeightCompute);
|
||||||
auto [channelVal, isChannel] = lazyArgWeight.getAsChannelValueAndInsertSender();
|
auto [channelVal, isChannel] = lazyArgWeight.getAsChannelValueAndInsertSender();
|
||||||
assert(isChannel == true);
|
assert(isChannel == true);
|
||||||
spatial::SpatChannelReceiveOp reciveOp =
|
spatial::SpatChannelReceiveOp receiveOp =
|
||||||
spatial::SpatChannelReceiveOp::create(rewriter, loc, argWeightCompute.getType(0), channelVal);
|
spatial::SpatChannelReceiveOp::create(rewriter, loc, argWeightCompute.getType(0), channelVal);
|
||||||
mapper.map(oldBB.getArgument(indexOld - indexOldStart), reciveOp);
|
mapper.map(oldBB.getArgument(indexOld - indexOldStart), receiveOp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -238,8 +235,8 @@ private:
|
|||||||
|
|
||||||
auto& toBB = toCompute.getBody().front();
|
auto& toBB = toCompute.getBody().front();
|
||||||
auto& fromBB = fromCompute.getBody().front();
|
auto& fromBB = fromCompute.getBody().front();
|
||||||
auto inputeArgMutable = toCompute.getInputsMutable();
|
auto inputArgMutable = toCompute.getInputsMutable();
|
||||||
// Insert reciveOp
|
// Insert receiveOp
|
||||||
rewriter.setInsertionPointToEnd(&toBB);
|
rewriter.setInsertionPointToEnd(&toBB);
|
||||||
for (auto [bbIndex, arg] : llvm::enumerate(fromCompute.getInputs())) {
|
for (auto [bbIndex, arg] : llvm::enumerate(fromCompute.getInputs())) {
|
||||||
if (auto argWeightCompute = llvm::dyn_cast_if_present<SpatWeightedCompute>(arg.getDefiningOp())) {
|
if (auto argWeightCompute = llvm::dyn_cast_if_present<SpatWeightedCompute>(arg.getDefiningOp())) {
|
||||||
@@ -248,9 +245,9 @@ private:
|
|||||||
LazyInsertComputeResult::ChannelOrLocalOp channelOrLocal =
|
LazyInsertComputeResult::ChannelOrLocalOp channelOrLocal =
|
||||||
lazyArgWeight.getAsChannelValueAndInsertSender(toCompute);
|
lazyArgWeight.getAsChannelValueAndInsertSender(toCompute);
|
||||||
if (channelOrLocal.isChannel) {
|
if (channelOrLocal.isChannel) {
|
||||||
spatial::SpatChannelReceiveOp reciveOp =
|
spatial::SpatChannelReceiveOp receiveOp =
|
||||||
spatial::SpatChannelReceiveOp::create(rewriter, loc, argWeightCompute.getType(0), channelOrLocal.data);
|
spatial::SpatChannelReceiveOp::create(rewriter, loc, argWeightCompute.getType(0), channelOrLocal.data);
|
||||||
mapper.map(fromBB.getArgument(bbIndex), reciveOp.getResult());
|
mapper.map(fromBB.getArgument(bbIndex), receiveOp.getResult());
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
mapper.map(fromBB.getArgument(bbIndex), channelOrLocal.data);
|
mapper.map(fromBB.getArgument(bbIndex), channelOrLocal.data);
|
||||||
@@ -262,7 +259,7 @@ private:
|
|||||||
if (founded == toCompute.getInputs().end()) {
|
if (founded == toCompute.getInputs().end()) {
|
||||||
size_t sizeW = toCompute.getWeights().size();
|
size_t sizeW = toCompute.getWeights().size();
|
||||||
size_t sizeI = toCompute.getInputs().size();
|
size_t sizeI = toCompute.getInputs().size();
|
||||||
inputeArgMutable.append(arg);
|
inputArgMutable.append(arg);
|
||||||
assert(sizeW == toCompute.getWeights().size());
|
assert(sizeW == toCompute.getWeights().size());
|
||||||
assert(sizeI + 1 == toCompute.getInputs().size());
|
assert(sizeI + 1 == toCompute.getInputs().size());
|
||||||
assert(sizeW + sizeI + 1 == toCompute.getOperands().size());
|
assert(sizeW + sizeI + 1 == toCompute.getOperands().size());
|
||||||
@@ -281,6 +278,12 @@ private:
|
|||||||
assert(mapper.contains(oldBBarg));
|
assert(mapper.contains(oldBBarg));
|
||||||
|
|
||||||
ComputeValueResults computeValueResults;
|
ComputeValueResults computeValueResults;
|
||||||
|
auto remapWeightIndex = [&](auto weightedOp) {
|
||||||
|
auto oldIndex = weightedOp.getWeightIndex();
|
||||||
|
auto newWeight = mapper.lookup(*std::next(fromCompute.getWeights().begin(), oldIndex));
|
||||||
|
auto newIndex = std::distance(toCompute.getWeights().begin(), llvm::find(toCompute.getWeights(), newWeight));
|
||||||
|
weightedOp.setWeightIndex(newIndex);
|
||||||
|
};
|
||||||
for (auto& op : fromCompute.getOps()) {
|
for (auto& op : fromCompute.getOps()) {
|
||||||
if (auto yield = dyn_cast<spatial::SpatYieldOp>(&op)) {
|
if (auto yield = dyn_cast<spatial::SpatYieldOp>(&op)) {
|
||||||
computeValueResults.innerValue = mapper.lookup(yield.getOperand(0));
|
computeValueResults.innerValue = mapper.lookup(yield.getOperand(0));
|
||||||
@@ -289,20 +292,10 @@ private:
|
|||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
auto newInst = rewriter.clone(op, mapper);
|
auto newInst = rewriter.clone(op, mapper);
|
||||||
// TODO Refactor in a lambda? same code just different cast, but templated lambda are C++20 and a free function
|
if (auto weightedMvmOp = llvm::dyn_cast<spatial::SpatWeightedMVMOp>(newInst))
|
||||||
// is a bit too much
|
remapWeightIndex(weightedMvmOp);
|
||||||
if (auto vmOp = llvm::dyn_cast<spatial::SpatWeightedMVMOp>(newInst)) {
|
if (auto weightedVmmOp = llvm::dyn_cast<spatial::SpatWeightedVMMOp>(newInst))
|
||||||
auto oldIndex = vmOp.getWeightIndex();
|
remapWeightIndex(weightedVmmOp);
|
||||||
auto newWeight = mapper.lookup(*std::next(fromCompute.getWeights().begin(), oldIndex));
|
|
||||||
auto newIndex = std::distance(toCompute.getWeights().begin(), llvm::find(toCompute.getWeights(), newWeight));
|
|
||||||
vmOp.setWeightIndex(newIndex);
|
|
||||||
}
|
|
||||||
if (auto vmOp = llvm::dyn_cast<spatial::SpatWeightedVMMOp>(newInst)) {
|
|
||||||
auto oldIndex = vmOp.getWeightIndex();
|
|
||||||
auto newWeight = mapper.lookup(*std::next(fromCompute.getWeights().begin(), oldIndex));
|
|
||||||
auto newIndex = std::distance(toCompute.getWeights().begin(), llvm::find(toCompute.getWeights(), newWeight));
|
|
||||||
vmOp.setWeightIndex(newIndex);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -323,19 +316,18 @@ private:
|
|||||||
IRRewriter rewriter(context);
|
IRRewriter rewriter(context);
|
||||||
|
|
||||||
rewriter.setInsertionPointToStart(&funcOp.front());
|
rewriter.setInsertionPointToStart(&funcOp.front());
|
||||||
auto saveInsertionPointChnNew = rewriter.saveInsertionPoint();
|
auto savedChannelInsertPoint = rewriter.saveInsertionPoint();
|
||||||
auto insertNew = [saveInsertionPointChnNew, context, loc, computeValueResults]() {
|
auto insertNew = [savedChannelInsertPoint, context, loc, computeValueResults]() {
|
||||||
IRRewriter rewriter(context);
|
IRRewriter rewriter(context);
|
||||||
rewriter.restoreInsertionPoint(saveInsertionPointChnNew);
|
rewriter.restoreInsertionPoint(savedChannelInsertPoint);
|
||||||
auto channelOp = spatial::SpatChannelNewOp::create(rewriter, loc, spatial::SpatChannelType::get(context));
|
auto channelOp = spatial::SpatChannelNewOp::create(rewriter, loc, spatial::SpatChannelType::get(context));
|
||||||
auto channelVal = channelOp.getResult();
|
auto channelVal = channelOp.getResult();
|
||||||
auto insertVal =
|
auto insertVal = [&context, loc, computeValueResults, channelVal](mlir::IRRewriter::InsertPoint sendInsertPoint) {
|
||||||
[&context, loc, computeValueResults, channelVal](mlir::IRRewriter::InsertPoint insertPointChnSend) {
|
IRRewriter rewriter(context);
|
||||||
IRRewriter rewriter(context);
|
rewriter.restoreInsertionPoint(sendInsertPoint);
|
||||||
rewriter.restoreInsertionPoint(insertPointChnSend);
|
auto spatSend = spatial::SpatChannelSendOp::create(rewriter, loc, channelVal, computeValueResults.innerValue);
|
||||||
auto spatSend = spatial::SpatChannelSendOp::create(rewriter, loc, channelVal, computeValueResults.innerValue);
|
return spatSend;
|
||||||
return spatSend;
|
};
|
||||||
};
|
|
||||||
std::pair<Value, std::function<void(mlir::IRRewriter::InsertPoint)>> ret {channelVal, insertVal};
|
std::pair<Value, std::function<void(mlir::IRRewriter::InsertPoint)>> ret {channelVal, insertVal};
|
||||||
return ret;
|
return ret;
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -25,8 +25,15 @@ function(add_pim_unittest test_name)
|
|||||||
set_tests_properties(${test_name} PROPERTIES LABELS pim-unittest)
|
set_tests_properties(${test_name} PROPERTIES LABELS pim-unittest)
|
||||||
endfunction()
|
endfunction()
|
||||||
|
|
||||||
add_pim_unittest(TestPIM
|
add_pim_unittest(LabeledListTest
|
||||||
TestPIM.cpp
|
LabeledListTest.cpp
|
||||||
|
|
||||||
|
LINK_LIBS PRIVATE
|
||||||
|
OMPimCommon
|
||||||
|
)
|
||||||
|
|
||||||
|
add_pim_unittest(DCPTest
|
||||||
|
DCPTest.cpp
|
||||||
|
|
||||||
LINK_LIBS PRIVATE
|
LINK_LIBS PRIVATE
|
||||||
OMPimCommon
|
OMPimCommon
|
||||||
|
|||||||
528
test/PIM/DCPTest.cpp
Normal file
528
test/PIM/DCPTest.cpp
Normal file
@@ -0,0 +1,528 @@
|
|||||||
|
#include <algorithm>
|
||||||
|
#include <cassert>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <filesystem>
|
||||||
|
#include <fstream>
|
||||||
|
#include <initializer_list>
|
||||||
|
#include <iostream>
|
||||||
|
#include <limits>
|
||||||
|
#include <optional>
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
||||||
|
#include "src/Accelerators/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.hpp"
|
||||||
|
#include "src/Compiler/CompilerOptions.hpp"
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
struct ExpectedScheduledTask {
|
||||||
|
size_t nodeIndex;
|
||||||
|
Time aest;
|
||||||
|
Time alst;
|
||||||
|
Weight weight;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ScheduledPlacement {
|
||||||
|
CPU cpu;
|
||||||
|
GraphDCP::ScheduledTaskInfo task;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::filesystem::path getDcpTestOutputDir() { return std::filesystem::temp_directory_path() / "raptor-test-pim"; }
|
||||||
|
|
||||||
|
void configureDcpDotOutput() {
|
||||||
|
auto outputDir = getDcpTestOutputDir();
|
||||||
|
std::error_code errorCode;
|
||||||
|
std::filesystem::remove_all(outputDir, errorCode);
|
||||||
|
std::filesystem::create_directories(outputDir, errorCode);
|
||||||
|
assert(!errorCode);
|
||||||
|
onnx_mlir::outputBaseName = (outputDir / "DCPTest.mlir").string();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::optional<std::filesystem::path> getLatestDcpDotFile() {
|
||||||
|
auto graphDir = getDcpTestOutputDir() / "dcp_graph";
|
||||||
|
if (!std::filesystem::exists(graphDir))
|
||||||
|
return std::nullopt;
|
||||||
|
|
||||||
|
std::optional<std::filesystem::path> latestDot;
|
||||||
|
for (const auto& entry : std::filesystem::directory_iterator(graphDir)) {
|
||||||
|
if (!entry.is_regular_file() || entry.path().extension() != ".dot")
|
||||||
|
continue;
|
||||||
|
if (!latestDot || entry.path().filename() > latestDot->filename())
|
||||||
|
latestDot = entry.path();
|
||||||
|
}
|
||||||
|
return latestDot;
|
||||||
|
}
|
||||||
|
|
||||||
|
void dumpDcpFailureArtifacts() {
|
||||||
|
auto latestDot = getLatestDcpDotFile();
|
||||||
|
if (!latestDot) {
|
||||||
|
std::cerr << "No DCP dot file was produced.\n";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cerr << "DCP dot file: " << latestDot->string() << '\n';
|
||||||
|
std::ifstream dotFile(*latestDot);
|
||||||
|
if (!dotFile.is_open()) {
|
||||||
|
std::cerr << "Failed to open DCP dot file.\n";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::cerr << dotFile.rdbuf();
|
||||||
|
}
|
||||||
|
|
||||||
|
void printCpuSchedule(GraphDCP& graph, CPU cpu) {
|
||||||
|
auto actualTasks = graph.getScheduledTasks(cpu);
|
||||||
|
std::cerr << "CPU " << cpu << " actual schedule:\n";
|
||||||
|
for (const auto& task : actualTasks) {
|
||||||
|
std::cerr << " " << task.nodeIndex << ") aest: " << task.aest << " alst: " << task.alst
|
||||||
|
<< " weight: " << task.weight << '\n';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void printGraphSchedule(GraphDCP& graph) {
|
||||||
|
for (CPU cpu = 0; cpu < graph.cpuCount(); ++cpu)
|
||||||
|
printCpuSchedule(graph, cpu);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool checkScheduledTasks(GraphDCP& graph, CPU cpu, std::initializer_list<ExpectedScheduledTask> expectedTasks) {
|
||||||
|
auto actualTasks = graph.getScheduledTasks(cpu);
|
||||||
|
if (actualTasks.size() != expectedTasks.size()) {
|
||||||
|
printCpuSchedule(graph, cpu);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto expectedIt = expectedTasks.begin();
|
||||||
|
for (const auto& actualTask : actualTasks) {
|
||||||
|
if (actualTask.nodeIndex != expectedIt->nodeIndex || actualTask.aest != expectedIt->aest
|
||||||
|
|| actualTask.alst != expectedIt->alst || actualTask.weight != expectedIt->weight) {
|
||||||
|
printCpuSchedule(graph, cpu);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
++expectedIt;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::unordered_map<size_t, ScheduledPlacement> collectScheduledPlacements(GraphDCP& graph) {
|
||||||
|
std::unordered_map<size_t, ScheduledPlacement> scheduledPlacements;
|
||||||
|
for (CPU cpu = 0; cpu < graph.cpuCount(); ++cpu) {
|
||||||
|
for (const auto& task : graph.getScheduledTasks(cpu)) {
|
||||||
|
auto [it, inserted] = scheduledPlacements.emplace(task.nodeIndex, ScheduledPlacement {cpu, task});
|
||||||
|
assert(inserted && "task scheduled multiple times");
|
||||||
|
(void) it;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return scheduledPlacements;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool checkAllTasksScheduled(GraphDCP& graph, size_t expectedTaskCount) {
|
||||||
|
auto scheduledPlacements = collectScheduledPlacements(graph);
|
||||||
|
if (scheduledPlacements.size() != expectedTaskCount) {
|
||||||
|
std::cerr << "Expected " << expectedTaskCount << " scheduled tasks, got " << scheduledPlacements.size() << "\n";
|
||||||
|
printGraphSchedule(graph);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool checkCpuSchedulesDoNotOverlap(GraphDCP& graph) {
|
||||||
|
for (CPU cpu = 0; cpu < graph.cpuCount(); ++cpu) {
|
||||||
|
auto scheduledTasks = graph.getScheduledTasks(cpu);
|
||||||
|
Time previousCompletion = 0;
|
||||||
|
bool firstTask = true;
|
||||||
|
for (const auto& task : scheduledTasks) {
|
||||||
|
Time completion = addOrMax(task.aest, task.weight);
|
||||||
|
if (task.aest > task.alst) {
|
||||||
|
std::cerr << "Task " << task.nodeIndex << " on CPU " << cpu << " has aest > alst\n";
|
||||||
|
printCpuSchedule(graph, cpu);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (!firstTask && task.aest < previousCompletion) {
|
||||||
|
std::cerr << "CPU " << cpu << " has overlapping tasks\n";
|
||||||
|
printCpuSchedule(graph, cpu);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
previousCompletion = completion;
|
||||||
|
firstTask = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool checkDependencyConstraints(GraphDCP& graph, llvm::ArrayRef<IndexedEdge> edges) {
|
||||||
|
auto scheduledPlacements = collectScheduledPlacements(graph);
|
||||||
|
for (auto [parentIndex, childIndex, transferCost] : edges) {
|
||||||
|
const auto& parent = scheduledPlacements.at(parentIndex);
|
||||||
|
const auto& child = scheduledPlacements.at(childIndex);
|
||||||
|
Time requiredStart = addOrMax(parent.task.aest, parent.task.weight);
|
||||||
|
if (parent.cpu != child.cpu)
|
||||||
|
requiredStart = addOrMax(requiredStart, static_cast<Weight>(transferCost));
|
||||||
|
if (child.task.aest < requiredStart) {
|
||||||
|
std::cerr << "Dependency violation for edge " << parentIndex << " -> " << childIndex << '\n';
|
||||||
|
printGraphSchedule(graph);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
Time getMaxCompletion(GraphDCP& graph) {
|
||||||
|
Time maxCompletion = 0;
|
||||||
|
for (CPU cpu = 0; cpu < graph.cpuCount(); ++cpu)
|
||||||
|
for (const auto& task : graph.getScheduledTasks(cpu))
|
||||||
|
maxCompletion = std::max(maxCompletion, addOrMax(task.aest, task.weight));
|
||||||
|
return maxCompletion;
|
||||||
|
}
|
||||||
|
|
||||||
|
int testDCPGraphSingleNode() {
|
||||||
|
std::cout << "testDCPGraphSingleNode:" << std::endl;
|
||||||
|
configureDcpDotOutput();
|
||||||
|
|
||||||
|
const std::vector<Weight> nodeWeights = {15};
|
||||||
|
GraphDCP graph(nodeWeights, {});
|
||||||
|
graph.runDcp();
|
||||||
|
|
||||||
|
if (graph.cpuCount() != 1) {
|
||||||
|
std::cerr << "Expected exactly 1 CPU, got " << graph.cpuCount() << "\n";
|
||||||
|
dumpDcpFailureArtifacts();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (!checkScheduledTasks(graph,
|
||||||
|
0,
|
||||||
|
{
|
||||||
|
{0, 0, 0, 15},
|
||||||
|
})) {
|
||||||
|
dumpDcpFailureArtifacts();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int testDCPGraphLinearChain() {
|
||||||
|
std::cout << "testDCPGraphLinearChain:" << std::endl;
|
||||||
|
configureDcpDotOutput();
|
||||||
|
|
||||||
|
const std::vector<Weight> nodeWeights = {10, 20, 5};
|
||||||
|
const std::vector<IndexedEdge> edges = {
|
||||||
|
{0, 1, 7},
|
||||||
|
{1, 2, 9},
|
||||||
|
};
|
||||||
|
|
||||||
|
GraphDCP graph(nodeWeights, edges);
|
||||||
|
graph.runDcp();
|
||||||
|
|
||||||
|
if (graph.cpuCount() != 1) {
|
||||||
|
std::cerr << "Expected a linear chain to stay on one CPU, got " << graph.cpuCount() << "\n";
|
||||||
|
dumpDcpFailureArtifacts();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (!checkScheduledTasks(graph,
|
||||||
|
0,
|
||||||
|
{
|
||||||
|
{0, 0, 0, 10},
|
||||||
|
{1, 10, 10, 20},
|
||||||
|
{2, 30, 30, 5 },
|
||||||
|
})) {
|
||||||
|
dumpDcpFailureArtifacts();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (!checkCpuSchedulesDoNotOverlap(graph) || !checkDependencyConstraints(graph, edges)) {
|
||||||
|
dumpDcpFailureArtifacts();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int testDCPGraphFixture() {
|
||||||
|
std::cout << "testDCPGraphFixture:" << std::endl;
|
||||||
|
configureDcpDotOutput();
|
||||||
|
|
||||||
|
const std::vector<Weight> nodeWeights = {
|
||||||
|
80,
|
||||||
|
40,
|
||||||
|
40,
|
||||||
|
40,
|
||||||
|
40,
|
||||||
|
40,
|
||||||
|
60,
|
||||||
|
30,
|
||||||
|
30,
|
||||||
|
30,
|
||||||
|
30,
|
||||||
|
40,
|
||||||
|
20,
|
||||||
|
20,
|
||||||
|
20,
|
||||||
|
20,
|
||||||
|
10,
|
||||||
|
10,
|
||||||
|
};
|
||||||
|
const std::vector<IndexedEdge> edges = {
|
||||||
|
{0, 1, 3 },
|
||||||
|
{0, 1, 120},
|
||||||
|
{0, 2, 120},
|
||||||
|
{0, 3, 120},
|
||||||
|
{0, 4, 120},
|
||||||
|
{0, 5, 120},
|
||||||
|
{0, 6, 120},
|
||||||
|
{2, 6, 80 },
|
||||||
|
{2, 7, 80 },
|
||||||
|
{3, 8, 80 },
|
||||||
|
{4, 9, 80 },
|
||||||
|
{5, 10, 80 },
|
||||||
|
{6, 7, 120},
|
||||||
|
{6, 8, 120},
|
||||||
|
{6, 9, 120},
|
||||||
|
{6, 10, 120},
|
||||||
|
{6, 11, 120},
|
||||||
|
{8, 11, 80 },
|
||||||
|
{8, 12, 80 },
|
||||||
|
{9, 13, 80 },
|
||||||
|
{10, 14, 80 },
|
||||||
|
{11, 12, 120},
|
||||||
|
{11, 13, 120},
|
||||||
|
{11, 14, 120},
|
||||||
|
{11, 15, 120},
|
||||||
|
{13, 15, 80 },
|
||||||
|
{13, 16, 80 },
|
||||||
|
{14, 17, 80 },
|
||||||
|
{15, 16, 120},
|
||||||
|
{15, 17, 120},
|
||||||
|
};
|
||||||
|
|
||||||
|
GraphDCP graph(nodeWeights, {});
|
||||||
|
for (auto [parent, child, weight] : edges)
|
||||||
|
graph.makeEdge(parent, child, weight);
|
||||||
|
|
||||||
|
graph.runDcp();
|
||||||
|
if (graph.cpuCount() != 4) {
|
||||||
|
dumpDcpFailureArtifacts();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (!checkScheduledTasks(graph,
|
||||||
|
3,
|
||||||
|
{
|
||||||
|
{1, 200, 400, 40},
|
||||||
|
})) {
|
||||||
|
dumpDcpFailureArtifacts();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (!checkScheduledTasks(graph,
|
||||||
|
2,
|
||||||
|
{
|
||||||
|
{5, 200, 260, 40},
|
||||||
|
{10, 300, 300, 30},
|
||||||
|
})) {
|
||||||
|
dumpDcpFailureArtifacts();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (!checkScheduledTasks(graph,
|
||||||
|
1,
|
||||||
|
{
|
||||||
|
{4, 200, 210, 40},
|
||||||
|
{7, 300, 410, 30},
|
||||||
|
})) {
|
||||||
|
dumpDcpFailureArtifacts();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (!checkScheduledTasks(graph,
|
||||||
|
0,
|
||||||
|
{
|
||||||
|
{0, 0, 0, 80},
|
||||||
|
{2, 80, 80, 40},
|
||||||
|
{6, 120, 120, 60},
|
||||||
|
{3, 180, 200, 40},
|
||||||
|
{8, 220, 240, 30},
|
||||||
|
{11, 250, 270, 40},
|
||||||
|
{12, 290, 310, 20},
|
||||||
|
{9, 320, 330, 30},
|
||||||
|
{13, 350, 360, 20},
|
||||||
|
{15, 370, 380, 20},
|
||||||
|
{16, 390, 400, 10},
|
||||||
|
{14, 410, 410, 20},
|
||||||
|
{17, 430, 430, 10},
|
||||||
|
})) {
|
||||||
|
dumpDcpFailureArtifacts();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (!checkAllTasksScheduled(graph, nodeWeights.size()) || !checkCpuSchedulesDoNotOverlap(graph)
|
||||||
|
|| !checkDependencyConstraints(graph, edges)) {
|
||||||
|
dumpDcpFailureArtifacts();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int testDCPGraphMaxCPUs() {
|
||||||
|
std::cout << "testDCPGraphMaxCPUs:" << std::endl;
|
||||||
|
configureDcpDotOutput();
|
||||||
|
|
||||||
|
const std::vector<Weight> nodeWeights = {20, 10, 10, 10, 10, 10, 10};
|
||||||
|
const std::vector<IndexedEdge> edges = {
|
||||||
|
{0, 1, 0},
|
||||||
|
{0, 2, 0},
|
||||||
|
{0, 3, 0},
|
||||||
|
{0, 4, 0},
|
||||||
|
{0, 5, 0},
|
||||||
|
{0, 6, 0},
|
||||||
|
};
|
||||||
|
|
||||||
|
GraphDCP graph(nodeWeights, edges);
|
||||||
|
graph.setMaxCpuCount(2);
|
||||||
|
graph.runDcp();
|
||||||
|
|
||||||
|
if (graph.cpuCount() != 2) {
|
||||||
|
std::cerr << "Expected exactly 2 CPUs with maxCpuCount=2, got " << graph.cpuCount() << "\n";
|
||||||
|
dumpDcpFailureArtifacts();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (!checkAllTasksScheduled(graph, nodeWeights.size()) || !checkCpuSchedulesDoNotOverlap(graph)
|
||||||
|
|| !checkDependencyConstraints(graph, edges)) {
|
||||||
|
dumpDcpFailureArtifacts();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (getMaxCompletion(graph) > 50) {
|
||||||
|
std::cerr << "Expected makespan <= 50 under maxCpuCount=2, got " << getMaxCompletion(graph) << "\n";
|
||||||
|
dumpDcpFailureArtifacts();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int testDCPGraphSingleCpuCap() {
|
||||||
|
std::cout << "testDCPGraphSingleCpuCap:" << std::endl;
|
||||||
|
configureDcpDotOutput();
|
||||||
|
|
||||||
|
const std::vector<Weight> nodeWeights = {20, 10, 10, 10};
|
||||||
|
const std::vector<IndexedEdge> edges = {
|
||||||
|
{0, 1, 0},
|
||||||
|
{0, 2, 0},
|
||||||
|
{0, 3, 0},
|
||||||
|
};
|
||||||
|
|
||||||
|
GraphDCP graph(nodeWeights, edges);
|
||||||
|
graph.setMaxCpuCount(1);
|
||||||
|
graph.runDcp();
|
||||||
|
|
||||||
|
if (graph.cpuCount() != 1) {
|
||||||
|
std::cerr << "Expected exactly 1 CPU with maxCpuCount=1, got " << graph.cpuCount() << "\n";
|
||||||
|
dumpDcpFailureArtifacts();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (!checkAllTasksScheduled(graph, nodeWeights.size()) || !checkCpuSchedulesDoNotOverlap(graph)
|
||||||
|
|| !checkDependencyConstraints(graph, edges)) {
|
||||||
|
dumpDcpFailureArtifacts();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (getMaxCompletion(graph) != 50) {
|
||||||
|
std::cerr << "Expected makespan 50 under maxCpuCount=1, got " << getMaxCompletion(graph) << "\n";
|
||||||
|
dumpDcpFailureArtifacts();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int testDCPGraphDiamondDependencies() {
|
||||||
|
std::cout << "testDCPGraphDiamondDependencies:" << std::endl;
|
||||||
|
configureDcpDotOutput();
|
||||||
|
|
||||||
|
const std::vector<Weight> nodeWeights = {15, 10, 12, 20};
|
||||||
|
const std::vector<IndexedEdge> edges = {
|
||||||
|
{0, 1, 5},
|
||||||
|
{0, 2, 7},
|
||||||
|
{1, 3, 3},
|
||||||
|
{2, 3, 2},
|
||||||
|
};
|
||||||
|
|
||||||
|
GraphDCP graph(nodeWeights, edges);
|
||||||
|
graph.runDcp();
|
||||||
|
|
||||||
|
if (!checkAllTasksScheduled(graph, nodeWeights.size()) || !checkCpuSchedulesDoNotOverlap(graph)
|
||||||
|
|| !checkDependencyConstraints(graph, edges)) {
|
||||||
|
dumpDcpFailureArtifacts();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto scheduledPlacements = collectScheduledPlacements(graph);
|
||||||
|
const auto& sink = scheduledPlacements.at(3).task;
|
||||||
|
if (sink.aest < 27) {
|
||||||
|
std::cerr << "Expected sink node to start no earlier than the longest parent path, got " << sink.aest << "\n";
|
||||||
|
dumpDcpFailureArtifacts();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int testDCPGraphCrossbarExhaustion() {
|
||||||
|
std::cout << "testDCPGraphCrossbarExhaustion:" << std::endl;
|
||||||
|
configureDcpDotOutput();
|
||||||
|
|
||||||
|
const size_t savedCrossbarSize = onnx_mlir::crossbarSize.getValue();
|
||||||
|
const size_t savedCrossbarCount = onnx_mlir::crossbarCountInCore.getValue();
|
||||||
|
onnx_mlir::crossbarSize = 4;
|
||||||
|
onnx_mlir::crossbarCountInCore = 2;
|
||||||
|
|
||||||
|
auto restoreCrossbarOptions = [&]() {
|
||||||
|
onnx_mlir::crossbarSize = savedCrossbarSize;
|
||||||
|
onnx_mlir::crossbarCountInCore = savedCrossbarCount;
|
||||||
|
};
|
||||||
|
|
||||||
|
const std::vector<Weight> nodeWeights = {10, 10, 10};
|
||||||
|
const std::vector<CrossbarUsage> nodeCrossbarUsage = {1, 1, 1};
|
||||||
|
GraphDCP graph(nodeWeights, {}, nodeCrossbarUsage);
|
||||||
|
graph.setMaxCpuCount(1);
|
||||||
|
graph.runDcp();
|
||||||
|
|
||||||
|
if (graph.cpuCount() != 1) {
|
||||||
|
restoreCrossbarOptions();
|
||||||
|
std::cerr << "Expected exactly 1 CPU with maxCpuCount=1, got " << graph.cpuCount() << "\n";
|
||||||
|
dumpDcpFailureArtifacts();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto scheduledTasks = graph.getScheduledTasks(0);
|
||||||
|
if (scheduledTasks.size() != 3) {
|
||||||
|
restoreCrossbarOptions();
|
||||||
|
std::cerr << "Expected all three tasks to be scheduled on CPU 0\n";
|
||||||
|
printCpuSchedule(graph, 0);
|
||||||
|
dumpDcpFailureArtifacts();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (scheduledTasks[0].weight != 10 || scheduledTasks[1].weight != std::numeric_limits<Weight>::max()
|
||||||
|
|| scheduledTasks[2].weight != std::numeric_limits<Weight>::max()) {
|
||||||
|
restoreCrossbarOptions();
|
||||||
|
std::cerr << "Unexpected effective weights under crossbar exhaustion\n";
|
||||||
|
printCpuSchedule(graph, 0);
|
||||||
|
dumpDcpFailureArtifacts();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
restoreCrossbarOptions();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
int main(int argc, char* argv[]) {
|
||||||
|
(void) argc;
|
||||||
|
(void) argv;
|
||||||
|
|
||||||
|
int failures = 0;
|
||||||
|
failures += testDCPGraphSingleNode();
|
||||||
|
failures += testDCPGraphLinearChain();
|
||||||
|
failures += testDCPGraphFixture();
|
||||||
|
failures += testDCPGraphMaxCPUs();
|
||||||
|
failures += testDCPGraphSingleCpuCap();
|
||||||
|
failures += testDCPGraphDiamondDependencies();
|
||||||
|
failures += testDCPGraphCrossbarExhaustion();
|
||||||
|
if (failures != 0) {
|
||||||
|
std::cerr << failures << " test failures\n";
|
||||||
|
return EXIT_FAILURE;
|
||||||
|
}
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
||||||
162
test/PIM/LabeledListTest.cpp
Normal file
162
test/PIM/LabeledListTest.cpp
Normal file
@@ -0,0 +1,162 @@
|
|||||||
|
#include <cassert>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <initializer_list>
|
||||||
|
#include <iostream>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "src/Accelerators/PIM/Common/LabeledList.hpp"
|
||||||
|
|
||||||
|
using onnx_mlir::LabeledList;
|
||||||
|
using onnx_mlir::LabeledListNode;
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
struct TestNode : public LabeledListNode<TestNode> {
|
||||||
|
explicit TestNode(int id)
|
||||||
|
: id(id) {}
|
||||||
|
|
||||||
|
int id;
|
||||||
|
};
|
||||||
|
|
||||||
|
void assertOrder(LabeledList<TestNode>& list, std::initializer_list<int> expectedOrder) {
|
||||||
|
auto expectedIt = expectedOrder.begin();
|
||||||
|
for (auto& node : list) {
|
||||||
|
assert(expectedIt != expectedOrder.end());
|
||||||
|
assert(node.id == *expectedIt);
|
||||||
|
++expectedIt;
|
||||||
|
}
|
||||||
|
assert(expectedIt == expectedOrder.end());
|
||||||
|
}
|
||||||
|
|
||||||
|
void assertStrictlyIncreasingLabels(LabeledList<TestNode>& list) {
|
||||||
|
auto it = list.begin();
|
||||||
|
if (it == list.end())
|
||||||
|
return;
|
||||||
|
|
||||||
|
auto previousLabel = it->getOrderLabel();
|
||||||
|
++it;
|
||||||
|
for (; it != list.end(); ++it) {
|
||||||
|
assert(previousLabel < it->getOrderLabel());
|
||||||
|
previousLabel = it->getOrderLabel();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Exercises every basic mutation of LabeledList (push, insert, move, remove,
// clear) on a five-node list and checks ordering, adjacency queries, and
// linked-state bookkeeping after each step. Returns 0 on success; failures
// abort via assert.
int testLabeledListBasicMutation() {
  std::cout << "testLabeledListBasicMutation:" << std::endl;

  LabeledList<TestNode> list;
  TestNode n1(1);
  TestNode n2(2);
  TestNode n3(3);
  TestNode n4(4);
  TestNode n5(5);

  // Empty-list invariants: no front/back, membership and the static
  // neighbor queries are all null/false for an unlinked node.
  assert(list.empty());
  assert(list.front() == nullptr);
  assert(list.back() == nullptr);
  assert(!list.contains(&n1));
  assert(LabeledList<TestNode>::previous(&n1) == nullptr);
  assert(LabeledList<TestNode>::next(&n1) == nullptr);

  // Build the list {4, 1, 2, 3, 5} via mixed insertion entry points.
  list.pushBack(&n1);
  list.pushBack(&n3);
  list.insertAfter(&n1, &n2);
  list.pushFront(&n4);
  // insertBefore(nullptr, ...) appends at the tail: n5 is back() below.
  list.insertBefore(nullptr, &n5);

  assert(list.size() == 5);
  assert(list.front() == &n4);
  assert(list.back() == &n5);
  assert(list.contains(&n2));
  assertOrder(list, {4, 1, 2, 3, 5});
  assert(LabeledList<TestNode>::next(&n4) == &n1);
  assert(LabeledList<TestNode>::previous(&n1) == &n4);
  assert(LabeledList<TestNode>::next(&n5) == nullptr);
  assert(list.comesBefore(&n1, &n3));
  // comesBefore is expected to agree with the underlying order labels.
  assert(list.getOrderLabel(&n1) < list.getOrderLabel(&n3));

  // moveBefore(nodeToMove, reference): n5 is relocated in front of n2.
  list.moveBefore(&n5, &n2);
  assertOrder(list, {4, 1, 5, 2, 3});

  // moveAfter(nodeToMove, reference): n4 is relocated behind n3 (the tail).
  list.moveAfter(&n4, &n3);
  assertOrder(list, {1, 5, 2, 3, 4});

  // Removal unlinks the node and shrinks the visible order.
  list.remove(&n2);
  assert(!n2.isLinked());
  assert(!list.contains(&n2));
  assertOrder(list, {1, 5, 3, 4});

  // clear() must unlink every remaining node, not just forget the list head.
  list.clear();
  assert(list.empty());
  assert(list.size() == 0);
  assert(list.front() == nullptr);
  assert(list.back() == nullptr);
  assert(!n1.isLinked());
  assert(!n3.isLinked());
  assert(!n4.isLinked());
  assert(!n5.isLinked());

  return 0;
}
|
||||||
|
|
||||||
|
// Stresses label maintenance: repeatedly inserts 80 nodes at the same
// position (right after `head`), which should force the list to reassign
// order labels — presumably the relabeling path; confirm against
// LabeledList's labeling strategy. Then performs moves that do not change
// the order and verifies the list is undisturbed. Returns 0 on success.
int testLabeledListRelabelingAndNoopMoves() {
  std::cout << "testLabeledListRelabelingAndNoopMoves:" << std::endl;

  constexpr int insertedNodeCount = 80;
  LabeledList<TestNode> list;
  TestNode head(0);
  TestNode tail(999);
  std::vector<TestNode> insertedNodes;
  // reserve() is required: nodes are intrusively linked, so the vector must
  // never reallocate (that would move live list nodes).
  insertedNodes.reserve(insertedNodeCount);
  for (int i = 0; i < insertedNodeCount; ++i)
    insertedNodes.emplace_back(i + 1);

  list.pushBack(&head);
  list.pushBack(&tail);
  // Always inserting after `head` reverses the ids: final order is
  // head, 80, 79, ..., 1, tail (verified by the descending-id loop below).
  for (auto& node : insertedNodes)
    list.insertAfter(&head, &node);

  assert(list.size() == insertedNodeCount + 2);
  assert(list.front() == &head);
  assert(list.back() == &tail);
  assert(LabeledList<TestNode>::previous(&head) == nullptr);
  assert(LabeledList<TestNode>::next(&tail) == nullptr);
  assertStrictlyIncreasingLabels(list);

  // All three moves are structural no-ops:
  //  - firstInserted already immediately precedes secondInserted;
  //  - head is already the front (nullptr reference presumably means
  //    "move to front" — TODO confirm moveAfter's nullptr semantics);
  //  - tail is already right after its predecessor.
  auto* firstInserted = LabeledList<TestNode>::next(&head);
  auto* secondInserted = LabeledList<TestNode>::next(firstInserted);
  list.moveBefore(firstInserted, secondInserted);
  list.moveAfter(&head, nullptr);
  list.moveAfter(&tail, LabeledList<TestNode>::previous(&tail));

  assert(list.front() == &head);
  assert(list.back() == &tail);
  // Last node inserted (id == insertedNodeCount) sits right after head.
  assert(firstInserted == &insertedNodes.back());
  assert(secondInserted == &insertedNodes[insertedNodeCount - 2]);
  assertStrictlyIncreasingLabels(list);

  // Walk the interior (between head and tail) and check the ids count down
  // from insertedNodeCount to 1.
  // NOTE(review): std::next needs <iterator>; this file relies on it being
  // pulled in transitively — confirm the include set.
  int expectedId = insertedNodeCount;
  auto it = std::next(list.begin());
  for (; it != list.end() && &*it != &tail; ++it, --expectedId)
    assert(it->id == expectedId);
  assert(expectedId == 0);
  list.clear();

  return 0;
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
/// Test driver: runs each LabeledList unit test in sequence and reports the
/// accumulated failure count on stderr before exiting non-zero.
int main(int argc, char* argv[]) {
  (void) argc;
  (void) argv;

  // Keep the calls sequenced so the tests' stdout banners stay in order.
  int failureCount = testLabeledListBasicMutation();
  failureCount += testLabeledListRelabelingAndNoopMoves();

  if (failureCount == 0)
    return EXIT_SUCCESS;

  std::cerr << failureCount << " test failures\n";
  return EXIT_FAILURE;
}
|
||||||
@@ -1,202 +0,0 @@
|
|||||||
/*
|
|
||||||
* SPDX-License-Identifier: Apache-2.0
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "src/Accelerators/PIM/Common/LabeledList.hpp"
|
|
||||||
#include "src/Accelerators/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.hpp"
|
|
||||||
|
|
||||||
#include <cassert>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <initializer_list>
|
|
||||||
#include <iostream>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
using onnx_mlir::LabeledList;
|
|
||||||
using onnx_mlir::LabeledListNode;
|
|
||||||
|
|
||||||
namespace {
|
|
||||||
|
|
||||||
/// Minimal intrusively-linked node carrying only an integer id, used by the
/// list-order assertions in this test.
struct TestNode : public LabeledListNode<TestNode> {
  int id; ///< Identity checked against the expected traversal order.

  explicit TestNode(int identity)
      : id(identity) {}
};
|
|
||||||
|
|
||||||
/// Checks that iterating `list` front to back yields node ids equal to
/// `expectedOrder`, with no extra or missing elements.
void assertOrder(LabeledList<TestNode>& list, std::initializer_list<int> expectedOrder) {
  const int* want = expectedOrder.begin();
  for (auto it = list.begin(); it != list.end(); ++it) {
    assert(want != expectedOrder.end());
    assert(it->id == *want);
    ++want;
  }
  assert(want == expectedOrder.end());
}
|
|
||||||
|
|
||||||
// Smoke test of LabeledList: builds a five-node list through the different
// insertion entry points, then exercises moves, removal, and clear while
// checking order, adjacency, and linked-state bookkeeping. Returns 0 on
// success; failures abort via assert.
int testLabeledList() {
  std::cout << "testLabeledList:" << std::endl;

  LabeledList<TestNode> list;
  TestNode n1(1);
  TestNode n2(2);
  TestNode n3(3);
  TestNode n4(4);
  TestNode n5(5);

  // Build {4, 1, 2, 3, 5} via mixed insertion calls.
  list.pushBack(&n1);
  list.pushBack(&n3);
  list.insertAfter(&n1, &n2);
  list.pushFront(&n4);
  // insertBefore(nullptr, ...) appends at the tail: n5 ends up last below.
  list.insertBefore(nullptr, &n5);

  assertOrder(list, {4, 1, 2, 3, 5});
  assert(LabeledList<TestNode>::next(&n4) == &n1);
  assert(LabeledList<TestNode>::previous(&n1) == &n4);
  assert(LabeledList<TestNode>::next(&n5) == nullptr);
  assert(list.comesBefore(&n1, &n3));
  // comesBefore should agree with the underlying order labels.
  assert(list.getOrderLabel(&n1) < list.getOrderLabel(&n3));

  // moveBefore(nodeToMove, reference): n5 is relocated in front of n2.
  list.moveBefore(&n5, &n2);
  assertOrder(list, {4, 1, 5, 2, 3});

  // moveAfter(nodeToMove, reference): n4 is relocated behind n3 (the tail).
  list.moveAfter(&n4, &n3);
  assertOrder(list, {1, 5, 2, 3, 4});

  // Removal unlinks the node and shrinks the visible order.
  list.remove(&n2);
  assert(!n2.isLinked());
  assertOrder(list, {1, 5, 3, 4});

  // clear() must unlink every remaining node, not just forget the head.
  list.clear();
  assert(list.empty());
  assert(!n1.isLinked());
  assert(!n3.isLinked());
  assert(!n4.isLinked());
  assert(!n5.isLinked());

  return 0;
}
|
|
||||||
|
|
||||||
// One expected entry of a per-CPU schedule produced by GraphDCP.
// Field order matters: instances are brace-initialized positionally in the
// assertScheduledTasks call sites, so do not reorder members.
struct ExpectedScheduledTask {
  size_t nodeIndex;  // index of the graph node this scheduled task runs
  int aest;          // presumably absolute earliest start time -- TODO confirm acronym
  int alst;          // presumably absolute latest start time -- TODO confirm acronym
  int weight;        // task weight, matching the node weight given to the graph
};
|
|
||||||
|
|
||||||
// Compares the schedule GraphDCP computed for `cpu` against `expectedTasks`
// entry by entry (same order, same nodeIndex/aest/alst/weight). On the first
// mismatching entry the whole actual schedule is dumped to stderr BEFORE the
// asserts fire, so a failure is diagnosable from the test log.
void assertScheduledTasks(GraphDCP& graph, CPU cpu, std::initializer_list<ExpectedScheduledTask> expectedTasks) {
  auto actualTasks = graph.getScheduledTasks(cpu);
  assert(actualTasks.size() == expectedTasks.size());

  auto expectedIt = expectedTasks.begin();
  for (const auto& actualTask : actualTasks) {
    assert(expectedIt != expectedTasks.end());
    // Pre-check the whole entry so we can print the schedule once before
    // the per-field asserts below abort the process.
    if (actualTask.nodeIndex != expectedIt->nodeIndex || actualTask.aest != expectedIt->aest
        || actualTask.alst != expectedIt->alst || actualTask.weight != expectedIt->weight) {
      std::cerr << "CPU " << cpu << " actual schedule:\n";
      for (const auto& task : actualTasks) {
        std::cerr << " " << task.nodeIndex << ") aest: " << task.aest << " alst: " << task.alst
            << " weight: " << task.weight << '\n';
      }
    }
    assert(actualTask.nodeIndex == expectedIt->nodeIndex);
    assert(actualTask.aest == expectedIt->aest);
    assert(actualTask.alst == expectedIt->alst);
    assert(actualTask.weight == expectedIt->weight);
    ++expectedIt;
  }
}
|
|
||||||
|
|
||||||
// Regression fixture for GraphDCP: builds a fixed 18-node weighted task
// graph, runs the DCP scheduler, and pins the exact per-CPU schedules it is
// expected to produce. Returns 0 on success; mismatches abort via assert.
int testDCPGraphFixture() {
  std::cout << "testDCPGraphFixture:" << std::endl;

  // Computation weight of each of the 18 nodes, indexed by node id.
  const std::vector<Weight_t> nodeWeights = {
      80, 40, 40, 40, 40, 40, 60, 30, 30, 30,
      30, 40, 20, 20, 20, 20, 10, 10,
  };

  GraphDCP graph(nodeWeights, {});
  // Edges: makeEdge(src, dst, weight).
  // NOTE(review): edge (0, 1) is created twice — first with weight 3, then
  // with weight 120. Confirm the weight-3 call is intentional (duplicate-edge
  // handling?) and not a leftover.
  graph.makeEdge(0, 1, 3);
  graph.makeEdge(0, 1, 120);
  graph.makeEdge(0, 2, 120);
  graph.makeEdge(0, 3, 120);
  graph.makeEdge(0, 4, 120);
  graph.makeEdge(0, 5, 120);
  graph.makeEdge(0, 6, 120);
  graph.makeEdge(2, 6, 80);
  graph.makeEdge(2, 7, 80);
  graph.makeEdge(3, 8, 80);
  graph.makeEdge(4, 9, 80);
  graph.makeEdge(5, 10, 80);
  graph.makeEdge(6, 7, 120);
  graph.makeEdge(6, 8, 120);
  graph.makeEdge(6, 9, 120);
  graph.makeEdge(6, 10, 120);
  graph.makeEdge(6, 11, 120);
  graph.makeEdge(8, 11, 80);
  graph.makeEdge(8, 12, 80);
  graph.makeEdge(9, 13, 80);
  graph.makeEdge(10, 14, 80);
  graph.makeEdge(11, 12, 120);
  graph.makeEdge(11, 13, 120);
  graph.makeEdge(11, 14, 120);
  graph.makeEdge(11, 15, 120);
  graph.makeEdge(13, 15, 80);
  graph.makeEdge(13, 16, 80);
  graph.makeEdge(14, 17, 80);
  graph.makeEdge(15, 16, 120);
  graph.makeEdge(15, 17, 120);

  // Run the scheduler, then dump every computed schedule.
  // NOTE(review): this unconditional stderr dump looks like leftover debug
  // output — consider removing it or gating it behind a verbose flag.
  graph.DCP();
  for (CPU cpu = 0; cpu < graph.cpuCount(); ++cpu) {
    auto scheduledTasks = graph.getScheduledTasks(cpu);
    std::cerr << "CPU " << cpu << " computed schedule:\n";
    for (const auto& task : scheduledTasks) {
      std::cerr << " " << task.nodeIndex << ") aest: " << task.aest << " alst: " << task.alst
          << " weight: " << task.weight << '\n';
    }
  }
  // Golden schedules below: {nodeIndex, aest, alst, weight} per task.
  assert(graph.cpuCount() == 4);
  assertScheduledTasks(graph, 3, {
      {1, 200, 370, 40},
  });
  assertScheduledTasks(graph, 2, {
      {5, 200, 260, 40},
      {10, 300, 300, 30},
  });
  assertScheduledTasks(graph, 1, {
      {4, 200, 210, 40},
      {7, 300, 380, 30},
  });
  assertScheduledTasks(graph, 0, {
      {0, 0, 0, 80},
      {2, 80, 80, 40},
      {6, 120, 120, 60},
      {3, 180, 200, 40},
      {8, 220, 240, 30},
      {11, 250, 270, 40},
      {12, 290, 310, 20},
      {9, 320, 330, 30},
      {13, 350, 360, 20},
      {15, 370, 380, 20},
      {16, 390, 400, 10},
      {14, 410, 410, 20},
      {17, 430, 430, 10},
  });
  return 0;
}
|
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
/// Test driver: runs the LabeledList and DCP-graph suites and reports the
/// accumulated failure count on stderr before exiting non-zero.
int main(int argc, char* argv[]) {
  (void) argc;
  (void) argv;

  // Sequenced calls keep the tests' stdout banners in order.
  int failureCount = testLabeledList();
  failureCount += testDCPGraphFixture();

  if (failureCount == 0)
    return EXIT_SUCCESS;

  std::cerr << failureCount << " test failures\n";
  return EXIT_FAILURE;
}
|
|
||||||
Reference in New Issue
Block a user