Bye Bye DCP
This commit is contained in:
@@ -19,7 +19,6 @@ llvm::cl::opt<PimMergeSchedulerType>
|
|||||||
pimMergeScheduler("pim-merge-scheduler",
|
pimMergeScheduler("pim-merge-scheduler",
|
||||||
llvm::cl::desc("Scheduler used by the Spatial merge-compute-nodes pass"),
|
llvm::cl::desc("Scheduler used by the Spatial merge-compute-nodes pass"),
|
||||||
llvm::cl::values(clEnumValN(MergeSchedulerPeft, "peft", "Use PEFT scheduling")),
|
llvm::cl::values(clEnumValN(MergeSchedulerPeft, "peft", "Use PEFT scheduling")),
|
||||||
llvm::cl::values(clEnumValN(MergeSchedulerDcp, "dcp", "Use the legacy DCP-inspired scheduler")),
|
|
||||||
llvm::cl::init(MergeSchedulerPeft),
|
llvm::cl::init(MergeSchedulerPeft),
|
||||||
llvm::cl::cat(OnnxMlirOptions));
|
llvm::cl::cat(OnnxMlirOptions));
|
||||||
|
|
||||||
@@ -49,12 +48,6 @@ llvm::cl::opt<long> coresCount("core-count",
|
|||||||
llvm::cl::desc("Number of cores in the chip. Required for PIM compilation."),
|
llvm::cl::desc("Number of cores in the chip. Required for PIM compilation."),
|
||||||
llvm::cl::init(-1));
|
llvm::cl::init(-1));
|
||||||
|
|
||||||
llvm::cl::opt<size_t> dcpCriticalWindowSize(
|
|
||||||
"dcp-critical-window-size",
|
|
||||||
llvm::cl::desc("Number of lowest-slack virtual nodes considered by each DCP coarsening iteration. "
|
|
||||||
"Use 0 to run the legacy full-graph DCP analysis. Only used by the DCP scheduler."),
|
|
||||||
llvm::cl::init(4000));
|
|
||||||
|
|
||||||
llvm::cl::opt<bool>
|
llvm::cl::opt<bool>
|
||||||
ignoreConcatError("ignore-concat-error",
|
ignoreConcatError("ignore-concat-error",
|
||||||
llvm::cl::desc("Ignore ConcatOp corner case: do not assert and do a simplification"),
|
llvm::cl::desc("Ignore ConcatOp corner case: do not assert and do a simplification"),
|
||||||
|
|||||||
@@ -22,7 +22,6 @@ typedef enum {
|
|||||||
|
|
||||||
typedef enum {
|
typedef enum {
|
||||||
MergeSchedulerPeft = 0,
|
MergeSchedulerPeft = 0,
|
||||||
MergeSchedulerDcp = 1,
|
|
||||||
} PimMergeSchedulerType;
|
} PimMergeSchedulerType;
|
||||||
|
|
||||||
extern llvm::cl::OptionCategory OnnxMlirOptions;
|
extern llvm::cl::OptionCategory OnnxMlirOptions;
|
||||||
@@ -36,7 +35,6 @@ extern llvm::cl::opt<bool> pimEmitJson;
|
|||||||
extern llvm::cl::opt<size_t> crossbarSize;
|
extern llvm::cl::opt<size_t> crossbarSize;
|
||||||
extern llvm::cl::opt<size_t> crossbarCountInCore;
|
extern llvm::cl::opt<size_t> crossbarCountInCore;
|
||||||
extern llvm::cl::opt<long> coresCount;
|
extern llvm::cl::opt<long> coresCount;
|
||||||
extern llvm::cl::opt<size_t> dcpCriticalWindowSize;
|
|
||||||
|
|
||||||
bool hasExplicitPimCoreCount();
|
bool hasExplicitPimCoreCount();
|
||||||
void verifyExplicitPimCoreCount();
|
void verifyExplicitPimCoreCount();
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ add_public_tablegen_target(ONNXToSpatialIncGen)
|
|||||||
|
|
||||||
add_pim_library(OMONNXToSpatial
|
add_pim_library(OMONNXToSpatial
|
||||||
ConversionPatterns.cpp
|
ConversionPatterns.cpp
|
||||||
HostFoldability.cpp
|
CompileTime.cpp
|
||||||
HostLegality.cpp
|
HostLegality.cpp
|
||||||
PrePatterns.cpp
|
PrePatterns.cpp
|
||||||
PostPatterns.cpp
|
PostPatterns.cpp
|
||||||
|
|||||||
@@ -6,7 +6,7 @@
|
|||||||
#include "ShapeTilingUtils.hpp"
|
#include "ShapeTilingUtils.hpp"
|
||||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
|
||||||
|
|
||||||
using namespace mlir;
|
using namespace mlir;
|
||||||
|
|
||||||
@@ -44,7 +44,7 @@ SmallVector<Value> sliceTensor(
|
|||||||
RankedTensorType::get(sliceShape, cast<RankedTensorType>(tensorToSlice.getType()).getElementType());
|
RankedTensorType::get(sliceShape, cast<RankedTensorType>(tensorToSlice.getType()).getElementType());
|
||||||
|
|
||||||
Value slice;
|
Value slice;
|
||||||
if (isHostFoldableValue(tensorToSlice)) {
|
if (isCompileTimeComputable(tensorToSlice)) {
|
||||||
slice = tensor::ExtractSliceOp::create(rewriter, loc, tensorToSlice, offsets, sizes, strides);
|
slice = tensor::ExtractSliceOp::create(rewriter, loc, tensorToSlice, offsets, sizes, strides);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
@@ -113,7 +113,7 @@ Value broadcastToVector(Value scalarToBroadcast, int64_t length, ConversionPatte
|
|||||||
return tensor::SplatOp::create(rewriter, loc, type, elementValue);
|
return tensor::SplatOp::create(rewriter, loc, type, elementValue);
|
||||||
};
|
};
|
||||||
|
|
||||||
if (isHostFoldableValue(scalarToBroadcast))
|
if (isCompileTimeComputable(scalarToBroadcast))
|
||||||
return buildBroadcast(scalarToBroadcast);
|
return buildBroadcast(scalarToBroadcast);
|
||||||
|
|
||||||
auto broadcastCompute =
|
auto broadcastCompute =
|
||||||
|
|||||||
+32
-23
@@ -8,7 +8,7 @@
|
|||||||
#include "llvm/ADT/SmallPtrSet.h"
|
#include "llvm/ADT/SmallPtrSet.h"
|
||||||
#include "llvm/ADT/SmallVector.h"
|
#include "llvm/ADT/SmallVector.h"
|
||||||
|
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
|
||||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||||
|
|
||||||
@@ -145,7 +145,7 @@ static DenseElementsAttr getDirectDenseConstantAttr(Value value) {
|
|||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
static DenseElementsAttr getHostFoldableDenseElementsAttrImpl(Value value, llvm::SmallPtrSetImpl<Operation*>& visited) {
|
static DenseElementsAttr getHostConstantDenseElementsAttrImpl(Value value, llvm::SmallPtrSetImpl<Operation*>& visited) {
|
||||||
auto* definingOp = value.getDefiningOp();
|
auto* definingOp = value.getDefiningOp();
|
||||||
if (!definingOp || !visited.insert(definingOp).second)
|
if (!definingOp || !visited.insert(definingOp).second)
|
||||||
return nullptr;
|
return nullptr;
|
||||||
@@ -156,7 +156,7 @@ static DenseElementsAttr getHostFoldableDenseElementsAttrImpl(Value value, llvm:
|
|||||||
return denseAttr;
|
return denseAttr;
|
||||||
|
|
||||||
if (auto transposeOp = dyn_cast<ONNXTransposeOp>(definingOp)) {
|
if (auto transposeOp = dyn_cast<ONNXTransposeOp>(definingOp)) {
|
||||||
auto inputAttr = getHostFoldableDenseElementsAttrImpl(transposeOp.getData(), visited);
|
auto inputAttr = getHostConstantDenseElementsAttrImpl(transposeOp.getData(), visited);
|
||||||
if (!inputAttr)
|
if (!inputAttr)
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
|
||||||
@@ -169,7 +169,7 @@ static DenseElementsAttr getHostFoldableDenseElementsAttrImpl(Value value, llvm:
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (auto collapseShapeOp = dyn_cast<tensor::CollapseShapeOp>(definingOp)) {
|
if (auto collapseShapeOp = dyn_cast<tensor::CollapseShapeOp>(definingOp)) {
|
||||||
auto inputAttr = getHostFoldableDenseElementsAttrImpl(collapseShapeOp.getSrc(), visited);
|
auto inputAttr = getHostConstantDenseElementsAttrImpl(collapseShapeOp.getSrc(), visited);
|
||||||
if (!inputAttr)
|
if (!inputAttr)
|
||||||
return nullptr;
|
return nullptr;
|
||||||
auto reshapedAttr = reshapeDenseElements(inputAttr, cast<RankedTensorType>(collapseShapeOp.getType()));
|
auto reshapedAttr = reshapeDenseElements(inputAttr, cast<RankedTensorType>(collapseShapeOp.getType()));
|
||||||
@@ -177,7 +177,7 @@ static DenseElementsAttr getHostFoldableDenseElementsAttrImpl(Value value, llvm:
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (auto expandShapeOp = dyn_cast<tensor::ExpandShapeOp>(definingOp)) {
|
if (auto expandShapeOp = dyn_cast<tensor::ExpandShapeOp>(definingOp)) {
|
||||||
auto inputAttr = getHostFoldableDenseElementsAttrImpl(expandShapeOp.getSrc(), visited);
|
auto inputAttr = getHostConstantDenseElementsAttrImpl(expandShapeOp.getSrc(), visited);
|
||||||
if (!inputAttr)
|
if (!inputAttr)
|
||||||
return nullptr;
|
return nullptr;
|
||||||
auto reshapedAttr = reshapeDenseElements(inputAttr, cast<RankedTensorType>(expandShapeOp.getType()));
|
auto reshapedAttr = reshapeDenseElements(inputAttr, cast<RankedTensorType>(expandShapeOp.getType()));
|
||||||
@@ -185,7 +185,7 @@ static DenseElementsAttr getHostFoldableDenseElementsAttrImpl(Value value, llvm:
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (auto extractSliceOp = dyn_cast<tensor::ExtractSliceOp>(definingOp)) {
|
if (auto extractSliceOp = dyn_cast<tensor::ExtractSliceOp>(definingOp)) {
|
||||||
auto inputAttr = getHostFoldableDenseElementsAttrImpl(extractSliceOp.getSource(), visited);
|
auto inputAttr = getHostConstantDenseElementsAttrImpl(extractSliceOp.getSource(), visited);
|
||||||
if (!inputAttr)
|
if (!inputAttr)
|
||||||
return nullptr;
|
return nullptr;
|
||||||
auto slicedAttr = extractSliceDenseElements(inputAttr, extractSliceOp);
|
auto slicedAttr = extractSliceDenseElements(inputAttr, extractSliceOp);
|
||||||
@@ -195,62 +195,71 @@ static DenseElementsAttr getHostFoldableDenseElementsAttrImpl(Value value, llvm:
|
|||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool isHostFoldableOpImpl(Operation* op, llvm::SmallPtrSetImpl<Operation*>& visited) {
|
static bool isCompileTimeOpImpl(Operation* op, llvm::SmallPtrSetImpl<Operation*>& visited) {
|
||||||
if (!op || !visited.insert(op).second)
|
if (!op)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
if (!visited.insert(op).second)
|
||||||
|
return true;
|
||||||
|
|
||||||
if (isa<arith::ConstantOp, ONNXConstantOp, ONNXNoneOp>(op))
|
if (isa<arith::ConstantOp, ONNXConstantOp, ONNXNoneOp>(op))
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
if (auto extractOp = dyn_cast<tensor::ExtractOp>(op))
|
if (auto extractOp = dyn_cast<tensor::ExtractOp>(op))
|
||||||
return hasConstantIndices(extractOp) && isHostFoldableValue(extractOp.getTensor());
|
return hasConstantIndices(extractOp) && isCompileTimeOpImpl(extractOp, visited);
|
||||||
|
|
||||||
if (!isStaticTensorResult(op))
|
if (!isStaticTensorResult(op))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
if (auto transposeOp = dyn_cast<ONNXTransposeOp>(op))
|
if (auto transposeOp = dyn_cast<ONNXTransposeOp>(op))
|
||||||
return isHostFoldableValue(transposeOp.getData());
|
return isCompileTimeOpImpl(transposeOp, visited);
|
||||||
|
|
||||||
if (auto collapseShapeOp = dyn_cast<tensor::CollapseShapeOp>(op))
|
if (auto collapseShapeOp = dyn_cast<tensor::CollapseShapeOp>(op))
|
||||||
return isHostFoldableValue(collapseShapeOp.getSrc());
|
return isCompileTimeOpImpl(collapseShapeOp,visited);
|
||||||
|
|
||||||
if (auto expandShapeOp = dyn_cast<tensor::ExpandShapeOp>(op))
|
if (auto expandShapeOp = dyn_cast<tensor::ExpandShapeOp>(op))
|
||||||
return isHostFoldableValue(expandShapeOp.getSrc());
|
return isCompileTimeOpImpl(expandShapeOp, visited);
|
||||||
|
|
||||||
if (auto extractSliceOp = dyn_cast<tensor::ExtractSliceOp>(op))
|
if (auto extractSliceOp = dyn_cast<tensor::ExtractSliceOp>(op))
|
||||||
return hasStaticUnitStrides(extractSliceOp) && isHostFoldableValue(extractSliceOp.getSource());
|
return hasStaticUnitStrides(extractSliceOp) && isCompileTimeOpImpl(extractSliceOp, visited);
|
||||||
|
|
||||||
if (auto splatOp = dyn_cast<tensor::SplatOp>(op))
|
if (auto splatOp = dyn_cast<tensor::SplatOp>(op))
|
||||||
return isHostFoldableValue(splatOp.getInput());
|
return isCompileTimeOpImpl(splatOp, visited);
|
||||||
|
|
||||||
if (auto extractRowsOp = dyn_cast<spatial::SpatExtractRowsOp>(op))
|
if (auto extractRowsOp = dyn_cast<spatial::SpatExtractRowsOp>(op))
|
||||||
return isHostFoldableValue(extractRowsOp.getInput());
|
return isCompileTimeOpImpl(extractRowsOp, visited);
|
||||||
|
|
||||||
if (auto concatOp = dyn_cast<spatial::SpatConcatOp>(op))
|
if (auto concatOp = dyn_cast<spatial::SpatConcatOp>(op)){
|
||||||
return llvm::all_of(concatOp.getInputs(), isHostFoldableValue);
|
bool res = true;
|
||||||
|
for(auto operandValue : concatOp.getOperands()){
|
||||||
|
res &= isCompileTimeOpImpl(operandValue.getDefiningOp(), visited);
|
||||||
|
if(!res) break;
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
bool isHostFoldableValue(Value value) {
|
bool isCompileTimeComputable(Value value) {
|
||||||
auto* definingOp = value.getDefiningOp();
|
auto* definingOp = value.getDefiningOp();
|
||||||
if (!definingOp)
|
if (!definingOp)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
llvm::SmallPtrSet<Operation*, 8> visited;
|
llvm::SmallPtrSet<Operation*, 8> visited;
|
||||||
return isHostFoldableOpImpl(definingOp, visited);
|
return isCompileTimeOpImpl(definingOp, visited);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool isHostFoldableOp(Operation* op) {
|
bool isCompileTimeOp(Operation* op) {
|
||||||
llvm::SmallPtrSet<Operation*, 8> visited;
|
llvm::SmallPtrSet<Operation*, 8> visited;
|
||||||
return isHostFoldableOpImpl(op, visited);
|
return isCompileTimeOpImpl(op, visited);
|
||||||
}
|
}
|
||||||
|
|
||||||
DenseElementsAttr getHostFoldableDenseElementsAttr(Value value) {
|
DenseElementsAttr getHostConstDenseElementsAttr(Value value) {
|
||||||
llvm::SmallPtrSet<Operation*, 8> visited;
|
llvm::SmallPtrSet<Operation*, 8> visited;
|
||||||
return getHostFoldableDenseElementsAttrImpl(value, visited);
|
return getHostConstantDenseElementsAttrImpl(value, visited);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace onnx_mlir
|
} // namespace onnx_mlir
|
||||||
@@ -0,0 +1,15 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "mlir/IR/BuiltinAttributes.h"
|
||||||
|
#include "mlir/IR/Operation.h"
|
||||||
|
#include "mlir/IR/Value.h"
|
||||||
|
|
||||||
|
namespace onnx_mlir {
|
||||||
|
|
||||||
|
bool isCompileTimeComputable(mlir::Value value);
|
||||||
|
|
||||||
|
bool isCompileTimeOp(mlir::Operation* op);
|
||||||
|
|
||||||
|
mlir::DenseElementsAttr getHostConstDenseElementsAttr(mlir::Value value);
|
||||||
|
|
||||||
|
} // namespace onnx_mlir
|
||||||
@@ -1,15 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "mlir/IR/BuiltinAttributes.h"
|
|
||||||
#include "mlir/IR/Operation.h"
|
|
||||||
#include "mlir/IR/Value.h"
|
|
||||||
|
|
||||||
namespace onnx_mlir {
|
|
||||||
|
|
||||||
bool isHostFoldableValue(mlir::Value value);
|
|
||||||
|
|
||||||
bool isHostFoldableOp(mlir::Operation* op);
|
|
||||||
|
|
||||||
mlir::DenseElementsAttr getHostFoldableDenseElementsAttr(mlir::Value value);
|
|
||||||
|
|
||||||
} // namespace onnx_mlir
|
|
||||||
@@ -3,7 +3,7 @@
|
|||||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||||
|
|
||||||
#include "src/Accelerators/PIM/Common/Support/Diagnostics.hpp"
|
#include "src/Accelerators/PIM/Common/Support/Diagnostics.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostLegality.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostLegality.hpp"
|
||||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||||
|
|
||||||
@@ -17,7 +17,7 @@ LogicalResult verifyONNXToSpatialHostLegality(func::FuncOp funcOp) {
|
|||||||
for (Operation& op : funcOp.getFunctionBody().front()) {
|
for (Operation& op : funcOp.getFunctionBody().front()) {
|
||||||
if (isa<func::ReturnOp, spatial::SpatCompute, spatial::SpatComputeBatch>(&op))
|
if (isa<func::ReturnOp, spatial::SpatCompute, spatial::SpatComputeBatch>(&op))
|
||||||
continue;
|
continue;
|
||||||
if (isHostFoldableOp(&op))
|
if (isCompileTimeOp(&op))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
diagnostics.report(&op, [](Operation* illegalOp) {
|
diagnostics.report(&op, [](Operation* illegalOp) {
|
||||||
|
|||||||
@@ -13,7 +13,7 @@
|
|||||||
#include "Common/Common.hpp"
|
#include "Common/Common.hpp"
|
||||||
#include "Common/PimCommon.hpp"
|
#include "Common/PimCommon.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ConversionPatterns.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ConversionPatterns.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostLegality.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostLegality.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/PostPatterns.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/PostPatterns.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/PrePatterns.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/PrePatterns.hpp"
|
||||||
@@ -92,7 +92,7 @@ static void wrapTopLevelRuntimeTransposes(func::FuncOp funcOp) {
|
|||||||
|
|
||||||
for (Operation& op : llvm::make_early_inc_range(entryBlock)) {
|
for (Operation& op : llvm::make_early_inc_range(entryBlock)) {
|
||||||
auto transposeOp = dyn_cast<ONNXTransposeOp>(&op);
|
auto transposeOp = dyn_cast<ONNXTransposeOp>(&op);
|
||||||
if (!transposeOp || isHostFoldableOp(transposeOp))
|
if (!transposeOp || isCompileTimeOp(transposeOp))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
// Transpose stays globally legal because constant/view-only cases are
|
// Transpose stays globally legal because constant/view-only cases are
|
||||||
|
|||||||
@@ -11,7 +11,7 @@
|
|||||||
#include "src/Accelerators/PIM/Common/Support/Diagnostics.hpp"
|
#include "src/Accelerators/PIM/Common/Support/Diagnostics.hpp"
|
||||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
|
||||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||||
|
|
||||||
@@ -391,7 +391,7 @@ static Value lowerSingleConvGroup(Value x,
|
|||||||
const int64_t xbarSize = static_cast<int64_t>(crossbarSize.getValue());
|
const int64_t xbarSize = static_cast<int64_t>(crossbarSize.getValue());
|
||||||
const int64_t wMaxDim = std::max(patchSize, numChannelsOut);
|
const int64_t wMaxDim = std::max(patchSize, numChannelsOut);
|
||||||
const int64_t maxParallelPixels = std::max<int64_t>(1, xbarSize / wMaxDim);
|
const int64_t maxParallelPixels = std::max<int64_t>(1, xbarSize / wMaxDim);
|
||||||
auto wDenseAttr = getHostFoldableDenseElementsAttr(w);
|
auto wDenseAttr = getHostConstDenseElementsAttr(w);
|
||||||
|
|
||||||
// Prepare weight matrix W for crossbar storage:
|
// Prepare weight matrix W for crossbar storage:
|
||||||
// W: [numChannelsOut, numChannelsIn, wHeight, wWidth] -> [numChannelsOut, patchSize] -> [patchSize, numChannelsOut]
|
// W: [numChannelsOut, numChannelsIn, wHeight, wWidth] -> [numChannelsOut, patchSize] -> [patchSize, numChannelsOut]
|
||||||
@@ -412,7 +412,7 @@ static Value lowerSingleConvGroup(Value x,
|
|||||||
DenseElementsAttr biasDenseAttr;
|
DenseElementsAttr biasDenseAttr;
|
||||||
if (hasB) {
|
if (hasB) {
|
||||||
gemmBias = b;
|
gemmBias = b;
|
||||||
biasDenseAttr = getHostFoldableDenseElementsAttr(b);
|
biasDenseAttr = getHostConstDenseElementsAttr(b);
|
||||||
biasMatrix = expandBiasIfNeeded(b, rewriter, loc);
|
biasMatrix = expandBiasIfNeeded(b, rewriter, loc);
|
||||||
}
|
}
|
||||||
const bool canPackWeightsAsConstants = static_cast<bool>(wDenseAttr);
|
const bool canPackWeightsAsConstants = static_cast<bool>(wDenseAttr);
|
||||||
@@ -717,7 +717,7 @@ LogicalResult ConvToGemm::matchAndRewrite(ONNXConvOp convOp,
|
|||||||
}
|
}
|
||||||
|
|
||||||
Value result;
|
Value result;
|
||||||
if (llvm::all_of(groupResults, isHostFoldableValue)) {
|
if (llvm::all_of(groupResults, isCompileTimeComputable)) {
|
||||||
result = createSpatConcat(rewriter, loc, /*axis=*/1, groupResults);
|
result = createSpatConcat(rewriter, loc, /*axis=*/1, groupResults);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
|||||||
@@ -11,7 +11,7 @@
|
|||||||
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
|
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
|
||||||
#include "src/Accelerators/PIM/Common/Support/Diagnostics.hpp"
|
#include "src/Accelerators/PIM/Common/Support/Diagnostics.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
|
||||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||||
|
|
||||||
@@ -55,7 +55,7 @@ static Value transposeForSpatial(Value value,
|
|||||||
ArrayRef<int64_t> permutation,
|
ArrayRef<int64_t> permutation,
|
||||||
ConversionPatternRewriter& rewriter,
|
ConversionPatternRewriter& rewriter,
|
||||||
Location loc) {
|
Location loc) {
|
||||||
if (isHostFoldableValue(value))
|
if (isCompileTimeComputable(value))
|
||||||
return ONNXTransposeOp::create(rewriter, loc, resultType, value, rewriter.getI64ArrayAttr(permutation));
|
return ONNXTransposeOp::create(rewriter, loc, resultType, value, rewriter.getI64ArrayAttr(permutation));
|
||||||
|
|
||||||
auto computeOp = createSpatCompute<1>(rewriter, loc, TypeRange {resultType}, {}, value, [&](Value input) {
|
auto computeOp = createSpatCompute<1>(rewriter, loc, TypeRange {resultType}, {}, value, [&](Value input) {
|
||||||
@@ -67,7 +67,7 @@ static Value transposeForSpatial(Value value,
|
|||||||
|
|
||||||
static Value
|
static Value
|
||||||
expandRankOneBias(Value value, RankedTensorType resultType, ConversionPatternRewriter& rewriter, Location loc) {
|
expandRankOneBias(Value value, RankedTensorType resultType, ConversionPatternRewriter& rewriter, Location loc) {
|
||||||
if (isHostFoldableValue(value))
|
if (isCompileTimeComputable(value))
|
||||||
return tensor::ExpandShapeOp::create(rewriter,
|
return tensor::ExpandShapeOp::create(rewriter,
|
||||||
loc,
|
loc,
|
||||||
resultType,
|
resultType,
|
||||||
@@ -121,7 +121,7 @@ static SmallVector<Value> materializeBatchRowSlices(Value matrix,
|
|||||||
auto rowType = RankedTensorType::get({1, matrixType.getDimSize(1)}, matrixType.getElementType());
|
auto rowType = RankedTensorType::get({1, matrixType.getDimSize(1)}, matrixType.getElementType());
|
||||||
SmallVector<Type> resultTypes(static_cast<size_t>(numRows), rowType);
|
SmallVector<Type> resultTypes(static_cast<size_t>(numRows), rowType);
|
||||||
|
|
||||||
if (isHostFoldableValue(matrix)) {
|
if (isCompileTimeComputable(matrix)) {
|
||||||
auto extractRowsOp = spatial::SpatExtractRowsOp::create(rewriter, loc, TypeRange(resultTypes), matrix);
|
auto extractRowsOp = spatial::SpatExtractRowsOp::create(rewriter, loc, TypeRange(resultTypes), matrix);
|
||||||
return SmallVector<Value>(extractRowsOp->result_begin(), extractRowsOp->result_end());
|
return SmallVector<Value>(extractRowsOp->result_begin(), extractRowsOp->result_end());
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,7 +10,7 @@
|
|||||||
|
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ConversionPatterns.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ConversionPatterns.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
|
||||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||||
|
|
||||||
@@ -55,7 +55,7 @@ collapseBatchDims(Value value, int64_t batchSize, int64_t rows, int64_t cols, Pa
|
|||||||
return tensor::CollapseShapeOp::create(rewriter, loc, collapsedType, input, reassociation);
|
return tensor::CollapseShapeOp::create(rewriter, loc, collapsedType, input, reassociation);
|
||||||
};
|
};
|
||||||
|
|
||||||
if (isHostFoldableValue(value))
|
if (isCompileTimeComputable(value))
|
||||||
return buildCollapsed(value);
|
return buildCollapsed(value);
|
||||||
|
|
||||||
auto collapseCompute =
|
auto collapseCompute =
|
||||||
@@ -114,7 +114,7 @@ static Value extractBatchMatrix(Value value,
|
|||||||
});
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
if (isHostFoldableValue(value))
|
if (isCompileTimeComputable(value))
|
||||||
return buildMatrix(value);
|
return buildMatrix(value);
|
||||||
|
|
||||||
auto batchMatrixCompute =
|
auto batchMatrixCompute =
|
||||||
@@ -142,7 +142,7 @@ static Value transposeLastTwoDims(Value value, PatternRewriter& rewriter, Locati
|
|||||||
return ONNXTransposeOp::create(rewriter, loc, transposedType, input, rewriter.getI64ArrayAttr(perm));
|
return ONNXTransposeOp::create(rewriter, loc, transposedType, input, rewriter.getI64ArrayAttr(perm));
|
||||||
};
|
};
|
||||||
|
|
||||||
if (isHostFoldableValue(value))
|
if (isCompileTimeComputable(value))
|
||||||
return buildTranspose(value);
|
return buildTranspose(value);
|
||||||
|
|
||||||
auto transposeCompute =
|
auto transposeCompute =
|
||||||
@@ -182,7 +182,7 @@ static Value concatValues(ValueRange inputs, int64_t axis, PatternRewriter& rewr
|
|||||||
outputShape[axis] = concatDimSize;
|
outputShape[axis] = concatDimSize;
|
||||||
auto resultType = RankedTensorType::get(outputShape, firstType.getElementType(), firstType.getEncoding());
|
auto resultType = RankedTensorType::get(outputShape, firstType.getElementType(), firstType.getEncoding());
|
||||||
|
|
||||||
if (llvm::all_of(inputs, isHostFoldableValue))
|
if (llvm::all_of(inputs, isCompileTimeComputable))
|
||||||
return createSpatConcat(rewriter, loc, axis, inputs);
|
return createSpatConcat(rewriter, loc, axis, inputs);
|
||||||
|
|
||||||
auto concatCompute = createSpatCompute(rewriter, loc, TypeRange {resultType}, {}, inputs, [&](ValueRange args) {
|
auto concatCompute = createSpatCompute(rewriter, loc, TypeRange {resultType}, {}, inputs, [&](ValueRange args) {
|
||||||
@@ -235,7 +235,7 @@ struct MatMulToGemm : OpRewritePattern<ONNXMatMulOp> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Location loc = matmulOp.getLoc();
|
Location loc = matmulOp.getLoc();
|
||||||
bool useTransposedForm = isHostFoldableValue(matmulOp.getA()) && !isHostFoldableValue(matmulOp.getB());
|
bool useTransposedForm = isCompileTimeComputable(matmulOp.getA()) && !isCompileTimeComputable(matmulOp.getB());
|
||||||
|
|
||||||
Value lhs = collapseBatchDims(matmulOp.getA(), lhsBatch, m, k, rewriter, loc);
|
Value lhs = collapseBatchDims(matmulOp.getA(), lhsBatch, m, k, rewriter, loc);
|
||||||
Value rhs = collapseBatchDims(matmulOp.getB(), rhsBatch, k, n, rewriter, loc);
|
Value rhs = collapseBatchDims(matmulOp.getB(), rhsBatch, k, n, rewriter, loc);
|
||||||
|
|||||||
@@ -7,7 +7,7 @@
|
|||||||
|
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ConversionPatterns.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ConversionPatterns.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
|
||||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||||
|
|
||||||
@@ -91,7 +91,7 @@ static Value concatValues(ValueRange inputs, int64_t axis, ConversionPatternRewr
|
|||||||
outputShape[axis] = concatDimSize;
|
outputShape[axis] = concatDimSize;
|
||||||
auto resultType = RankedTensorType::get(outputShape, firstType.getElementType(), firstType.getEncoding());
|
auto resultType = RankedTensorType::get(outputShape, firstType.getElementType(), firstType.getEncoding());
|
||||||
|
|
||||||
if (llvm::all_of(inputs, isHostFoldableValue))
|
if (llvm::all_of(inputs, isCompileTimeComputable))
|
||||||
return createSpatConcat(rewriter, loc, axis, inputs);
|
return createSpatConcat(rewriter, loc, axis, inputs);
|
||||||
|
|
||||||
auto concatCompute = createSpatCompute(rewriter, loc, TypeRange {resultType}, {}, inputs, [&](ValueRange args) {
|
auto concatCompute = createSpatCompute(rewriter, loc, TypeRange {resultType}, {}, inputs, [&](ValueRange args) {
|
||||||
@@ -135,7 +135,7 @@ static Value squeezeReducedAxes(Value keepdimsValue,
|
|||||||
}
|
}
|
||||||
|
|
||||||
auto reassociation = buildCollapseReassociation(reducedAxes);
|
auto reassociation = buildCollapseReassociation(reducedAxes);
|
||||||
if (isHostFoldableValue(keepdimsValue))
|
if (isCompileTimeComputable(keepdimsValue))
|
||||||
return tensor::CollapseShapeOp::create(rewriter, loc, resultType, keepdimsValue, reassociation).getResult();
|
return tensor::CollapseShapeOp::create(rewriter, loc, resultType, keepdimsValue, reassociation).getResult();
|
||||||
|
|
||||||
auto squeezeCompute =
|
auto squeezeCompute =
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
|
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/ComputeRegionBuilder.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/ComputeRegionBuilder.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
|
||||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||||
|
|
||||||
@@ -20,7 +20,7 @@ struct Concat : public OpConversionPattern<ONNXConcatOp> {
|
|||||||
auto inputs = adaptor.getInputs();
|
auto inputs = adaptor.getInputs();
|
||||||
int64_t axis = adaptor.getAxis();
|
int64_t axis = adaptor.getAxis();
|
||||||
|
|
||||||
if (llvm::all_of(inputs, isHostFoldableValue)) {
|
if (llvm::all_of(inputs, isCompileTimeComputable)) {
|
||||||
rewriter.replaceOp(maxpoolOp, createSpatConcat(rewriter, maxpoolOp.getLoc(), axis, inputs));
|
rewriter.replaceOp(maxpoolOp, createSpatConcat(rewriter, maxpoolOp.getLoc(), axis, inputs));
|
||||||
return success();
|
return success();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ConversionPatterns.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ConversionPatterns.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
|
||||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||||
|
|
||||||
@@ -115,7 +115,7 @@ struct Reshape : OpConversionPattern<ONNXReshapeOp> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
auto replaceWithReshape = [&](auto buildReshape) -> LogicalResult {
|
auto replaceWithReshape = [&](auto buildReshape) -> LogicalResult {
|
||||||
if (isHostFoldableValue(adaptor.getData())) {
|
if (isCompileTimeComputable(adaptor.getData())) {
|
||||||
rewriter.replaceOp(reshapeOp, buildReshape(adaptor.getData()));
|
rewriter.replaceOp(reshapeOp, buildReshape(adaptor.getData()));
|
||||||
return success();
|
return success();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
|
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ConversionPatterns.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ConversionPatterns.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
|
||||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||||
|
|
||||||
@@ -61,7 +61,7 @@ struct Split : OpConversionPattern<ONNXSplitOp> {
|
|||||||
sliceSizes.push_back(resultType.getShape()[axis]);
|
sliceSizes.push_back(resultType.getShape()[axis]);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isHostFoldableValue(adaptor.getInput())) {
|
if (isCompileTimeComputable(adaptor.getInput())) {
|
||||||
for (int64_t sliceSize : sliceSizes) {
|
for (int64_t sliceSize : sliceSizes) {
|
||||||
outputs.push_back(extractSliceAt(adaptor.getInput(), axis, offset, sliceSize, rewriter, splitOp.getLoc()));
|
outputs.push_back(extractSliceAt(adaptor.getInput(), axis, offset, sliceSize, rewriter, splitOp.getLoc()));
|
||||||
offset += sliceSize;
|
offset += sliceSize;
|
||||||
|
|||||||
@@ -11,14 +11,8 @@ add_pim_library(SpatialOps
|
|||||||
Transforms/MergeComputeNodes/MaterializeMergeSchedule.cpp
|
Transforms/MergeComputeNodes/MaterializeMergeSchedule.cpp
|
||||||
Transforms/MergeComputeNodes/Scheduling/ComputeGraph.cpp
|
Transforms/MergeComputeNodes/Scheduling/ComputeGraph.cpp
|
||||||
Transforms/MergeComputeNodes/Scheduling/ComputeInstanceUtils.cpp
|
Transforms/MergeComputeNodes/Scheduling/ComputeInstanceUtils.cpp
|
||||||
Transforms/MergeComputeNodes/Scheduling/DcpScheduler.cpp
|
|
||||||
Transforms/MergeComputeNodes/Scheduling/MergeSchedulingAnalysis.cpp
|
Transforms/MergeComputeNodes/Scheduling/MergeSchedulingAnalysis.cpp
|
||||||
Transforms/MergeComputeNodes/Scheduling/PeftScheduler.cpp
|
Transforms/MergeComputeNodes/Scheduling/PeftScheduler.cpp
|
||||||
Transforms/MergeComputeNodes/DCPGraph/Graph.cpp
|
|
||||||
Transforms/MergeComputeNodes/DCPGraph/GraphDebug.cpp
|
|
||||||
Transforms/MergeComputeNodes/DCPGraph/GraphSupport.cpp
|
|
||||||
Transforms/MergeComputeNodes/DCPGraph/Task.cpp
|
|
||||||
Transforms/MergeComputeNodes/DCPGraph/DCPAnalysis.cpp
|
|
||||||
|
|
||||||
EXCLUDE_FROM_OM_LIBS
|
EXCLUDE_FROM_OM_LIBS
|
||||||
|
|
||||||
|
|||||||
@@ -15,7 +15,7 @@
|
|||||||
|
|
||||||
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
|
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
|
||||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
||||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
|
||||||
#include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp"
|
#include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp"
|
||||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||||
|
|
||||||
@@ -120,7 +120,7 @@ static bool isBatchOutputArgument(SpatComputeBatch batchOp, Value value) {
|
|||||||
template <typename ComputeOpTy>
|
template <typename ComputeOpTy>
|
||||||
static LogicalResult verifyStaticWeights(ComputeOpTy computeOp, StringRef kind) {
|
static LogicalResult verifyStaticWeights(ComputeOpTy computeOp, StringRef kind) {
|
||||||
for (Value weight : computeOp.getWeights()) {
|
for (Value weight : computeOp.getWeights()) {
|
||||||
if (!isHostFoldableValue(weight))
|
if (!isCompileTimeComputable(weight))
|
||||||
return computeOp.emitOpError() << kind << " weights must be statically computed from constants";
|
return computeOp.emitOpError() << kind << " weights must be statically computed from constants";
|
||||||
}
|
}
|
||||||
return success();
|
return success();
|
||||||
|
|||||||
@@ -1,20 +0,0 @@
|
|||||||
#include "../Scheduling/ComputeGraph.hpp"
|
|
||||||
#include "../Scheduling/DcpScheduler.hpp"
|
|
||||||
#include "DCPAnalysis.hpp"
|
|
||||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
|
||||||
|
|
||||||
namespace onnx_mlir {
|
|
||||||
namespace spatial {
|
|
||||||
|
|
||||||
DCPAnalysisResult DCPAnalysis::run() {
|
|
||||||
ComputeGraph graph = buildComputeGraph(entryOp);
|
|
||||||
DcpScheduleOptions options;
|
|
||||||
if (coresCount.getValue() > 0)
|
|
||||||
options.processorCount = static_cast<size_t>(coresCount.getValue());
|
|
||||||
options.criticalWindowSize = dcpCriticalWindowSize.getValue();
|
|
||||||
options.allowFallbackForAutoCoreCount = true;
|
|
||||||
return runDcpScheduler(graph, options, entryOp->getContext());
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace spatial
|
|
||||||
} // namespace onnx_mlir
|
|
||||||
@@ -1,28 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "mlir/IR/Operation.h"
|
|
||||||
|
|
||||||
#include "../Scheduling/MergeSchedule.hpp"
|
|
||||||
|
|
||||||
namespace onnx_mlir {
|
|
||||||
namespace spatial {
|
|
||||||
using DCPAnalysisResult = MergeScheduleResult;
|
|
||||||
|
|
||||||
struct DCPAnalysis {
|
|
||||||
private:
|
|
||||||
DCPAnalysisResult result;
|
|
||||||
mlir::Operation* entryOp;
|
|
||||||
DCPAnalysisResult run();
|
|
||||||
|
|
||||||
public:
|
|
||||||
DCPAnalysis(mlir::Operation* op)
|
|
||||||
: entryOp(op) {
|
|
||||||
result = run();
|
|
||||||
}
|
|
||||||
DCPAnalysisResult& getResult() { return result; }
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace spatial
|
|
||||||
} // namespace onnx_mlir
|
|
||||||
|
|
||||||
using DCPAnalysisResult = onnx_mlir::spatial::DCPAnalysisResult;
|
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -1,178 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "llvm/ADT/ArrayRef.h"
|
|
||||||
#include "llvm/ADT/DenseMap.h"
|
|
||||||
#include "llvm/ADT/DenseSet.h"
|
|
||||||
|
|
||||||
#include <cstdint>
|
|
||||||
#include <list>
|
|
||||||
#include <optional>
|
|
||||||
#include <unordered_map>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "DCPAnalysis.hpp"
|
|
||||||
#include "Task.hpp"
|
|
||||||
#include "Utils.hpp"
|
|
||||||
|
|
||||||
namespace mlir {
|
|
||||||
class MLIRContext;
|
|
||||||
} // namespace mlir
|
|
||||||
|
|
||||||
std::optional<EdgePair> addEdge(TaskDCP* parent, TaskDCP* child, Weight weight, bool isScheduling = false);
|
|
||||||
void removeEdge(TaskDCP* parent, TaskDCP* child, bool isScheduling = false);
|
|
||||||
Weight getTransferCost(TaskDCP* parent, TaskDCP* child);
|
|
||||||
|
|
||||||
class GraphDCP {
|
|
||||||
public:
|
|
||||||
struct CandidateRelations {
|
|
||||||
llvm::DenseSet<TaskDCP*> ancestors;
|
|
||||||
llvm::DenseSet<TaskDCP*> descendants;
|
|
||||||
// descendants ordered by position in the graph's topological order;
|
|
||||||
// iterating this avoids walking non-descendant tail tasks on hot paths.
|
|
||||||
llvm::SmallVector<TaskDCP*, 32> descendantsTopoOrder;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ScheduledTaskInfo {
|
|
||||||
size_t nodeIndex;
|
|
||||||
Time aest;
|
|
||||||
Time alst;
|
|
||||||
Weight weight;
|
|
||||||
};
|
|
||||||
|
|
||||||
private:
|
|
||||||
using CpuTaskList = std::list<TaskDCP*>;
|
|
||||||
|
|
||||||
struct FindSlot {
|
|
||||||
Time aest;
|
|
||||||
int index;
|
|
||||||
};
|
|
||||||
|
|
||||||
std::vector<TaskDCP> nodes;
|
|
||||||
onnx_mlir::LabeledList<TaskDCP> topologicalOrder;
|
|
||||||
std::vector<uint64_t> taskStructureHashes;
|
|
||||||
std::vector<CpuTaskList> cpuTasks;
|
|
||||||
std::unordered_map<CPU, CrossbarUsage> cpuCrossbarUsage;
|
|
||||||
llvm::DenseMap<CPU, uint64_t> cpuStructureHashes;
|
|
||||||
CPU lastCpu = 0;
|
|
||||||
long long flag = 1;
|
|
||||||
Time dcpl = 0;
|
|
||||||
Time maxCompletion = 0;
|
|
||||||
Time secondMaxCompletion = 0;
|
|
||||||
TaskDCP* maxCompletionTask = nullptr;
|
|
||||||
int maxCpuCount = 1000;
|
|
||||||
mlir::MLIRContext* context = nullptr;
|
|
||||||
|
|
||||||
TaskInsertion insertTaskInCPU(CPU cpu, TaskDCP* task, size_t position);
|
|
||||||
void removeTaskFromCPU(CPU cpu, TaskDCP* task);
|
|
||||||
CpuTaskList& getOrCreateCpuTasks(CPU cpu);
|
|
||||||
const CpuTaskList* findCpuTasks(CPU cpu) const;
|
|
||||||
|
|
||||||
std::vector<TaskDCP*> getRoots();
|
|
||||||
|
|
||||||
long long getUniqueFlag() { return flag++; }
|
|
||||||
|
|
||||||
void initAest();
|
|
||||||
void initAlst();
|
|
||||||
void initTaskStructureHashes();
|
|
||||||
|
|
||||||
Time computeAestOnCpu(TaskDCP* task, CPU cpu);
|
|
||||||
Time computeDcplOnCpu(TaskDCP* task, CPU cpu);
|
|
||||||
Time getDcpl() const { return dcpl; }
|
|
||||||
Time computeTaskAlstOnCpu(TaskDCP* task, CPU cpu, Time scheduleDcpl);
|
|
||||||
void updateAestFromTask(TaskDCP* task);
|
|
||||||
void updateAestFromTaskWithDescendants(TaskDCP* task, const llvm::DenseSet<TaskDCP*>& descendants);
|
|
||||||
void updateAestFromTaskWithDescendants(TaskDCP* task, llvm::ArrayRef<TaskDCP*> descendantsTopoOrder);
|
|
||||||
// Propagates AEST like the overload above but returns early (before touching
|
|
||||||
// the remaining descendants) as soon as a task's completion exceeds
|
|
||||||
// `dcplBudget`, signalling that the new DCPL would exceed the budget.
|
|
||||||
// Returns true iff the full propagation completed without exceeding the
|
|
||||||
// budget. Uses the caller's snapshot to restore AEST on the aborted tail.
|
|
||||||
bool tryUpdateAestWithinBudget(TaskDCP* task, llvm::ArrayRef<TaskDCP*> descendantsTopoOrder, Time dcplBudget);
|
|
||||||
|
|
||||||
// Incrementally refreshes ALST after `task` has been scheduled. Nodes
|
|
||||||
// outside the backward cone (`relations.ancestors` plus `task`) retain
|
|
||||||
// their relative distance to the sink boundary and only absorb the signed
|
|
||||||
// DCPL delta (`newDcpl - oldDcpl`). `task` itself and its ancestors are
|
|
||||||
// recomputed in reverse topological order so that new same-CPU transfer
|
|
||||||
// costs (now zero) and scheduling-edge children are reflected.
|
|
||||||
void updateAlstFromScheduledTask(TaskDCP* task, const CandidateRelations& relations, Time oldDcpl);
|
|
||||||
|
|
||||||
void initTopological();
|
|
||||||
void topologicalMoveAfter(TaskDCP* task, TaskDCP* pivotPoint, TaskInsertion* insertion = nullptr);
|
|
||||||
void topologicalMoveBefore(TaskDCP* task, TaskDCP* pivotPoint, TaskInsertion* insertion = nullptr);
|
|
||||||
|
|
||||||
llvm::DenseMap<TaskDCP*, Time> computeAlst(TaskDCP* task, CPU cpu, const CandidateRelations& relations);
|
|
||||||
size_t getNodeIndex(const TaskDCP* task) const;
|
|
||||||
|
|
||||||
// Returns a compact dedup key for CPU `c` when evaluating `candidate`:
|
|
||||||
// mixes candidateAest, crossbar usage, and the incremental cpu structure
|
|
||||||
// hash into a single uint64_t. Zero heap allocation.
|
|
||||||
uint64_t computeCpuCandidateKey(Time candidateAest, CPU cpu);
|
|
||||||
CandidateRelations selectProcessor(TaskDCP* candidate, bool push);
|
|
||||||
CPU getLastCpu() const { return lastCpu; }
|
|
||||||
void incrementLastCpu() { lastCpu++; }
|
|
||||||
FindSlot findSlot(TaskDCP* candidate, CPU cpu, bool push, const CandidateRelations& relations);
|
|
||||||
FindSlot findSlotWithFixedFinalTime(
|
|
||||||
TaskDCP* candidate, CPU cpu, const CandidateRelations& relations, Time finalTime, Time aestOnCpu);
|
|
||||||
void dumpDot();
|
|
||||||
|
|
||||||
friend TaskInsertion;
|
|
||||||
friend class TaskDCP;
|
|
||||||
|
|
||||||
CrossbarUsage getCpuCrossbarUsage(CPU cpu) const;
|
|
||||||
CrossbarUsage getCpuCrossbarCapacity() const;
|
|
||||||
CrossbarUsage getTaskCrossbarFootprint(const TaskDCP* task) const;
|
|
||||||
void reserveTaskCrossbars(CPU cpu, const TaskDCP* task);
|
|
||||||
void releaseTaskCrossbars(CPU cpu, const TaskDCP* task);
|
|
||||||
bool wouldExhaustCrossbarCapacity(CPU cpu, const TaskDCP* task) const;
|
|
||||||
|
|
||||||
public:
|
|
||||||
void runDcp();
|
|
||||||
GraphDCP(llvm::ArrayRef<onnx_mlir::spatial::SpatCompute> spatComputes, llvm::ArrayRef<IndexedEdge> edges)
|
|
||||||
: nodes(), cpuTasks(), cpuCrossbarUsage() {
|
|
||||||
for (auto spatCompute : spatComputes)
|
|
||||||
nodes.emplace_back(spatCompute);
|
|
||||||
for (auto [start, end, weight] : edges)
|
|
||||||
makeEdge(start, end, weight);
|
|
||||||
}
|
|
||||||
|
|
||||||
GraphDCP(llvm::ArrayRef<Weight> nodeWeights,
|
|
||||||
llvm::ArrayRef<IndexedEdge> edges,
|
|
||||||
llvm::ArrayRef<int64_t> nodeOrderKeys = {},
|
|
||||||
llvm::ArrayRef<CrossbarUsage> nodeCrossbarUsage = {})
|
|
||||||
: nodes(), cpuTasks(), cpuCrossbarUsage() {
|
|
||||||
assert((nodeCrossbarUsage.empty() || nodeCrossbarUsage.size() == nodeWeights.size())
|
|
||||||
&& "synthetic crossbar usage must match synthetic node weights");
|
|
||||||
assert((nodeOrderKeys.empty() || nodeOrderKeys.size() == nodeWeights.size())
|
|
||||||
&& "synthetic node order keys must match synthetic node weights");
|
|
||||||
nodes.reserve(nodeWeights.size());
|
|
||||||
for (auto [index, weight] : llvm::enumerate(nodeWeights))
|
|
||||||
nodes.emplace_back(nodeOrderKeys.empty() ? static_cast<int64_t>(index) : nodeOrderKeys[index],
|
|
||||||
weight,
|
|
||||||
nodeCrossbarUsage.empty() ? 0 : nodeCrossbarUsage[index]);
|
|
||||||
for (auto [start, end, weight] : edges)
|
|
||||||
makeEdge(start, end, weight);
|
|
||||||
}
|
|
||||||
|
|
||||||
DCPAnalysisResult getResult();
|
|
||||||
std::vector<ScheduledTaskInfo> getScheduledTasks(CPU cpu) const;
|
|
||||||
CPU cpuCount() const { return lastCpu; }
|
|
||||||
|
|
||||||
void makeEdge(size_t parentIndex, size_t childIndex, Weight weight) {
|
|
||||||
addEdge(&nodes[parentIndex], &nodes[childIndex], weight);
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t taskInCpu(CPU cpu) { return getOrCreateCpuTasks(cpu).size(); }
|
|
||||||
|
|
||||||
void setMaxCpuCount(int value) { maxCpuCount = value; }
|
|
||||||
int getMaxCpuCount() const { return maxCpuCount; }
|
|
||||||
|
|
||||||
// Total crossbar units allocated across all active CPUs.
|
|
||||||
size_t crossbarsUsed() const;
|
|
||||||
// Maximum crossbar units available across all active CPUs (lastCpu * per-CPU capacity).
|
|
||||||
size_t crossbarsAvailable() const;
|
|
||||||
|
|
||||||
// Optional MLIR context used to drive mlir::parallelFor inside runDcp. If
|
|
||||||
// null the scheduler runs single-threaded (tests use this path).
|
|
||||||
void setContext(mlir::MLIRContext* ctx) { context = ctx; }
|
|
||||||
};
|
|
||||||
@@ -1,154 +0,0 @@
|
|||||||
#include "llvm/Support/FormatVariadic.h"
|
|
||||||
#include "llvm/Support/raw_ostream.h"
|
|
||||||
|
|
||||||
#include <fstream>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "GraphDebug.hpp"
|
|
||||||
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
|
|
||||||
|
|
||||||
namespace dcp_graph {
|
|
||||||
|
|
||||||
#ifdef DCP_DEBUG_ENABLED
|
|
||||||
|
|
||||||
DcpProgressLogger::DcpProgressLogger(size_t totalTasks)
|
|
||||||
: logProgress(totalTasks >= 200),
|
|
||||||
totalTasks(totalTasks),
|
|
||||||
startTime(std::chrono::steady_clock::now()),
|
|
||||||
lastProgressPrint(startTime) {}
|
|
||||||
|
|
||||||
std::string DcpProgressLogger::formatDuration(double seconds) {
|
|
||||||
if (seconds < 0)
|
|
||||||
seconds = 0;
|
|
||||||
|
|
||||||
long totalSeconds = static_cast<long>(seconds + 0.5);
|
|
||||||
long hours = totalSeconds / 3600;
|
|
||||||
long minutes = (totalSeconds % 3600) / 60;
|
|
||||||
long secs = totalSeconds % 60;
|
|
||||||
if (hours > 0)
|
|
||||||
return llvm::formatv("{0}:{1:02}:{2:02}", hours, minutes, secs).str();
|
|
||||||
return llvm::formatv("{0}:{1:02}", minutes, secs).str();
|
|
||||||
}
|
|
||||||
|
|
||||||
void DcpProgressLogger::recordFindDuration(double seconds) { findCandidateSeconds += seconds; }
|
|
||||||
void DcpProgressLogger::recordSelectDuration(double seconds) { selectProcessorSeconds += seconds; }
|
|
||||||
void DcpProgressLogger::recordUpdateDuration(double seconds) { updateTimingSeconds += seconds; }
|
|
||||||
void DcpProgressLogger::advanceCompleted(size_t taskCount) { completedTasks += taskCount; }
|
|
||||||
|
|
||||||
void DcpProgressLogger::printStart(size_t readyCount, int maxCpuCount, size_t xbarsCapacity) const {
|
|
||||||
if (!logProgress)
|
|
||||||
return;
|
|
||||||
llvm::errs() << llvm::formatv("[DCP] start tasks={0} ready={1} cpus=0/{2} crossbars=0/{3}\n",
|
|
||||||
totalTasks,
|
|
||||||
readyCount,
|
|
||||||
maxCpuCount,
|
|
||||||
xbarsCapacity);
|
|
||||||
}
|
|
||||||
|
|
||||||
void DcpProgressLogger::maybePrintSlowCandidate(size_t nodeIndex,
|
|
||||||
double elapsedSeconds,
|
|
||||||
size_t readyCount,
|
|
||||||
CPU cpuCount) const {
|
|
||||||
if (!logProgress || elapsedSeconds < 1.0)
|
|
||||||
return;
|
|
||||||
|
|
||||||
llvm::errs() << llvm::formatv("[DCP] slow node={0} elapsed={1} ready={2} cpus={3}\n",
|
|
||||||
nodeIndex,
|
|
||||||
formatDuration(elapsedSeconds),
|
|
||||||
readyCount,
|
|
||||||
cpuCount);
|
|
||||||
}
|
|
||||||
|
|
||||||
void DcpProgressLogger::printProgress(
|
|
||||||
size_t readyCount, CPU cpuCount, int maxCpuCount, size_t xbarsUsed, size_t xbarsAvailable, bool force) {
|
|
||||||
if (!logProgress)
|
|
||||||
return;
|
|
||||||
|
|
||||||
auto now = std::chrono::steady_clock::now();
|
|
||||||
if (!force && now - lastProgressPrint < std::chrono::seconds(1) && completedTasks != totalTasks)
|
|
||||||
return;
|
|
||||||
|
|
||||||
double elapsedSeconds = std::chrono::duration<double>(now - startTime).count();
|
|
||||||
double rate = elapsedSeconds > 0.0 ? static_cast<double>(completedTasks) / elapsedSeconds : 0.0;
|
|
||||||
double etaSeconds = rate > 0.0 ? static_cast<double>(totalTasks - completedTasks) / rate : 0.0;
|
|
||||||
double percent = totalTasks == 0 ? 100.0 : (100.0 * static_cast<double>(completedTasks) / totalTasks);
|
|
||||||
|
|
||||||
bool done = completedTasks == totalTasks;
|
|
||||||
llvm::errs() << llvm::formatv("[DCP] {0}/{1} ({2:F0}%) ready={3} cpus={4}/{5} crossbars={6}/{7} {8}{9}\n",
|
|
||||||
completedTasks,
|
|
||||||
totalTasks,
|
|
||||||
percent,
|
|
||||||
readyCount,
|
|
||||||
cpuCount,
|
|
||||||
maxCpuCount,
|
|
||||||
xbarsUsed,
|
|
||||||
xbarsAvailable,
|
|
||||||
llvm::formatv("elapsed={0}", formatDuration(elapsedSeconds)).str(),
|
|
||||||
done ? "" : llvm::formatv(" eta={0}", formatDuration(etaSeconds)).str());
|
|
||||||
lastProgressPrint = now;
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
DcpProgressLogger::DcpProgressLogger(size_t) {}
|
|
||||||
void DcpProgressLogger::recordFindDuration(double) {}
|
|
||||||
void DcpProgressLogger::recordSelectDuration(double) {}
|
|
||||||
void DcpProgressLogger::recordUpdateDuration(double) {}
|
|
||||||
void DcpProgressLogger::advanceCompleted(size_t) {}
|
|
||||||
void DcpProgressLogger::printStart(size_t, int, size_t) const {}
|
|
||||||
void DcpProgressLogger::maybePrintSlowCandidate(size_t, double, size_t, CPU) const {}
|
|
||||||
void DcpProgressLogger::printProgress(size_t, CPU, int, size_t, size_t, bool) {}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
void dumpGraphDot(const std::vector<TaskDCP>& nodes, const std::vector<std::list<TaskDCP*>>& cpuTasks, CPU lastCpu) {
|
|
||||||
static int dumpIndex = 0;
|
|
||||||
std::string outputDir = onnx_mlir::getOutputDir();
|
|
||||||
if (outputDir.empty())
|
|
||||||
return;
|
|
||||||
|
|
||||||
std::string graphDir = outputDir + "/dcp_graph";
|
|
||||||
onnx_mlir::createDirectory(graphDir);
|
|
||||||
std::fstream file(graphDir + "/graph_" + std::to_string(dumpIndex++) + ".dot", std::ios::out);
|
|
||||||
file << "digraph G {\n";
|
|
||||||
if (!cpuTasks.empty()) {
|
|
||||||
for (CPU cpu = 0; cpu < lastCpu; cpu++) {
|
|
||||||
file << "subgraph cluster_" << cpu << "{\nstyle=filled;\ncolor=lightgrey;\n";
|
|
||||||
size_t cpuIndex = static_cast<size_t>(cpu);
|
|
||||||
if (cpuIndex >= cpuTasks.size()) {
|
|
||||||
file << " }\n";
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto node : cpuTasks[cpuIndex]) {
|
|
||||||
file << node->Id() << " [label=\"";
|
|
||||||
file << "n:" << node->Id() << "\n";
|
|
||||||
file << "aest:" << node->getAest() << "\n";
|
|
||||||
file << "alst:" << node->getAlst() << "\n";
|
|
||||||
file << "weight:" << node->getWeight() << "\"]\n";
|
|
||||||
}
|
|
||||||
file << " }\n";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
for (const auto& node : nodes) {
|
|
||||||
file << node.Id() << " [label=\"";
|
|
||||||
file << "n:" << node.Id() << "\n";
|
|
||||||
file << "aest:" << node.getAest() << "\n";
|
|
||||||
file << "alst:" << node.getAlst() << "\n";
|
|
||||||
file << "weight:" << node.getWeight() << "\"]\n";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (const auto& node : nodes)
|
|
||||||
for (const auto& child : node.children) {
|
|
||||||
file << node.Id() << " -> " << child.first->Id();
|
|
||||||
file << " [label=\"" << child.second << "\"]\n";
|
|
||||||
}
|
|
||||||
|
|
||||||
file << "}\n";
|
|
||||||
file.flush();
|
|
||||||
file.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace dcp_graph
|
|
||||||
@@ -1,57 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "llvm/ADT/StringRef.h"
|
|
||||||
|
|
||||||
#include <chrono>
|
|
||||||
#include <list>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "Task.hpp"
|
|
||||||
#include "Utils.hpp"
|
|
||||||
|
|
||||||
// Define DCP_DEBUG_ENABLED locally when debugging DCP progress and per-phase
|
|
||||||
// profiling. In normal builds the logger methods are no-ops and helpers compile
|
|
||||||
// away.
|
|
||||||
#define DCP_DEBUG_ENABLED
|
|
||||||
|
|
||||||
#ifdef DCP_DEBUG_ENABLED
|
|
||||||
#define DCP_DEBUG_IF(...) __VA_ARGS__
|
|
||||||
#else
|
|
||||||
#define DCP_DEBUG_IF(...)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
namespace dcp_graph {
|
|
||||||
|
|
||||||
class DcpProgressLogger {
|
|
||||||
public:
|
|
||||||
explicit DcpProgressLogger(size_t totalTasks);
|
|
||||||
|
|
||||||
void recordFindDuration(double seconds);
|
|
||||||
void recordSelectDuration(double seconds);
|
|
||||||
void recordUpdateDuration(double seconds);
|
|
||||||
void advanceCompleted(size_t taskCount = 1);
|
|
||||||
|
|
||||||
void printStart(size_t readyCount, int maxCpuCount, size_t xbarsCapacity) const;
|
|
||||||
void maybePrintSlowCandidate(size_t nodeIndex, double elapsedSeconds, size_t readyCount, CPU cpuCount) const;
|
|
||||||
void
|
|
||||||
printProgress(size_t readyCount, CPU cpuCount, int maxCpuCount, size_t xbarsUsed, size_t xbarsAvailable, bool force);
|
|
||||||
|
|
||||||
#ifdef DCP_DEBUG_ENABLED
|
|
||||||
|
|
||||||
private:
|
|
||||||
static std::string formatDuration(double seconds);
|
|
||||||
|
|
||||||
bool logProgress = false;
|
|
||||||
size_t totalTasks = 0;
|
|
||||||
size_t completedTasks = 0;
|
|
||||||
std::chrono::steady_clock::time_point startTime;
|
|
||||||
std::chrono::steady_clock::time_point lastProgressPrint;
|
|
||||||
double findCandidateSeconds = 0.0;
|
|
||||||
double selectProcessorSeconds = 0.0;
|
|
||||||
double updateTimingSeconds = 0.0;
|
|
||||||
#endif
|
|
||||||
};
|
|
||||||
|
|
||||||
void dumpGraphDot(const std::vector<TaskDCP>& nodes, const std::vector<std::list<TaskDCP*>>& cpuTasks, CPU lastCpu);
|
|
||||||
|
|
||||||
} // namespace dcp_graph
|
|
||||||
@@ -1,104 +0,0 @@
|
|||||||
#include "llvm/ADT/STLExtras.h"
|
|
||||||
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "GraphSupport.hpp"
|
|
||||||
#include "Task.hpp"
|
|
||||||
#include "UniqueWorklist.hpp"
|
|
||||||
|
|
||||||
namespace dcp_graph {
|
|
||||||
|
|
||||||
llvm::DenseSet<TaskDCP*> collectReachableTasks(TaskDCP* root, bool followParents) {
|
|
||||||
llvm::DenseSet<TaskDCP*> reachable;
|
|
||||||
std::vector<TaskDCP*> worklist;
|
|
||||||
worklist.reserve(32);
|
|
||||||
|
|
||||||
auto enqueueEdges = [&](TaskDCP* task) {
|
|
||||||
const auto& edges = followParents ? task->parents : task->children;
|
|
||||||
for (const auto& edge : edges)
|
|
||||||
if (reachable.insert(edge.first).second)
|
|
||||||
worklist.push_back(edge.first);
|
|
||||||
};
|
|
||||||
|
|
||||||
enqueueEdges(root);
|
|
||||||
while (!worklist.empty()) {
|
|
||||||
TaskDCP* task = worklist.back();
|
|
||||||
worklist.pop_back();
|
|
||||||
enqueueEdges(task);
|
|
||||||
}
|
|
||||||
return reachable;
|
|
||||||
}
|
|
||||||
|
|
||||||
GraphDCP::CandidateRelations computeCandidateRelations(TaskDCP* candidate) {
|
|
||||||
return {collectReachableTasks(candidate, true), collectReachableTasks(candidate, false), {}};
|
|
||||||
}
|
|
||||||
|
|
||||||
LocalScheduleSnapshot captureLocalScheduleState(TaskDCP* task,
|
|
||||||
const llvm::DenseSet<TaskDCP*>& descendants,
|
|
||||||
Time dcpl,
|
|
||||||
Time maxCompletion,
|
|
||||||
Time secondMaxCompletion,
|
|
||||||
TaskDCP* maxCompletionTask) {
|
|
||||||
LocalScheduleSnapshot snapshot;
|
|
||||||
snapshot.aestBackup.reserve(descendants.size() + 1);
|
|
||||||
snapshot.aestBackup.emplace_back(task, task->getAest());
|
|
||||||
for (TaskDCP* descendant : descendants)
|
|
||||||
snapshot.aestBackup.emplace_back(descendant, descendant->getAest());
|
|
||||||
snapshot.dcpl = dcpl;
|
|
||||||
snapshot.maxCompletion = maxCompletion;
|
|
||||||
snapshot.secondMaxCompletion = secondMaxCompletion;
|
|
||||||
snapshot.maxCompletionTask = maxCompletionTask;
|
|
||||||
return snapshot;
|
|
||||||
}
|
|
||||||
|
|
||||||
void restoreLocalScheduleState(const LocalScheduleSnapshot& snapshot,
|
|
||||||
Time& dcpl,
|
|
||||||
Time& maxCompletion,
|
|
||||||
Time& secondMaxCompletion,
|
|
||||||
TaskDCP*& maxCompletionTask) {
|
|
||||||
for (const auto& [task, aest] : snapshot.aestBackup)
|
|
||||||
task->setAest(aest);
|
|
||||||
dcpl = snapshot.dcpl;
|
|
||||||
maxCompletion = snapshot.maxCompletion;
|
|
||||||
secondMaxCompletion = snapshot.secondMaxCompletion;
|
|
||||||
maxCompletionTask = snapshot.maxCompletionTask;
|
|
||||||
}
|
|
||||||
|
|
||||||
int countDependencyParents(const TaskDCP* task) {
|
|
||||||
return static_cast<int>(llvm::count_if(task->parents, [](const Edge& edge) { return !edge.isScheduling; }));
|
|
||||||
}
|
|
||||||
|
|
||||||
void recordTopologicalMove(TaskDCP* task, TaskInsertion* insertion) {
|
|
||||||
if (insertion == nullptr)
|
|
||||||
return;
|
|
||||||
|
|
||||||
auto alreadyRecorded =
|
|
||||||
llvm::any_of(insertion->topologicalMoves,
|
|
||||||
[task](const TaskInsertion::TopologicalMoveRecord& move) { return move.task == task; });
|
|
||||||
if (alreadyRecorded)
|
|
||||||
return;
|
|
||||||
|
|
||||||
insertion->topologicalMoves.push_back({task, onnx_mlir::LabeledList<TaskDCP>::next(task)});
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<TaskDCP*> collectDominanceOrder(llvm::ArrayRef<TaskDCP*> roots, size_t nodeCount) {
|
|
||||||
UniqueWorkList<std::vector<TaskDCP*>> worklist(roots);
|
|
||||||
worklist.reserve(nodeCount);
|
|
||||||
|
|
||||||
size_t index = 0;
|
|
||||||
while (index != worklist.size()) {
|
|
||||||
bool modified = true;
|
|
||||||
while (modified) {
|
|
||||||
modified = false;
|
|
||||||
for (const auto& child : worklist.at(index)->children)
|
|
||||||
if (worklist.allElementsContained(
|
|
||||||
child.first->parents.begin(), child.first->parents.end(), [](Edge edge) { return edge.first; }))
|
|
||||||
modified |= worklist.pushBack(child.first);
|
|
||||||
}
|
|
||||||
index++;
|
|
||||||
}
|
|
||||||
|
|
||||||
return {worklist.begin(), worklist.end()};
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace dcp_graph
|
|
||||||
@@ -1,41 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "llvm/ADT/ArrayRef.h"
|
|
||||||
#include "llvm/ADT/DenseSet.h"
|
|
||||||
#include "llvm/ADT/SmallVector.h"
|
|
||||||
|
|
||||||
#include <utility>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "Graph.hpp"
|
|
||||||
|
|
||||||
namespace dcp_graph {
|
|
||||||
|
|
||||||
struct LocalScheduleSnapshot {
|
|
||||||
llvm::SmallVector<std::pair<TaskDCP*, Time>, 64> aestBackup;
|
|
||||||
Time dcpl = 0;
|
|
||||||
Time maxCompletion = 0;
|
|
||||||
Time secondMaxCompletion = 0;
|
|
||||||
TaskDCP* maxCompletionTask = nullptr;
|
|
||||||
};
|
|
||||||
|
|
||||||
llvm::DenseSet<TaskDCP*> collectReachableTasks(TaskDCP* root, bool followParents);
|
|
||||||
GraphDCP::CandidateRelations computeCandidateRelations(TaskDCP* candidate);
|
|
||||||
|
|
||||||
LocalScheduleSnapshot captureLocalScheduleState(TaskDCP* task,
|
|
||||||
const llvm::DenseSet<TaskDCP*>& descendants,
|
|
||||||
Time dcpl,
|
|
||||||
Time maxCompletion,
|
|
||||||
Time secondMaxCompletion,
|
|
||||||
TaskDCP* maxCompletionTask);
|
|
||||||
void restoreLocalScheduleState(const LocalScheduleSnapshot& snapshot,
|
|
||||||
Time& dcpl,
|
|
||||||
Time& maxCompletion,
|
|
||||||
Time& secondMaxCompletion,
|
|
||||||
TaskDCP*& maxCompletionTask);
|
|
||||||
|
|
||||||
int countDependencyParents(const TaskDCP* task);
|
|
||||||
void recordTopologicalMove(TaskDCP* task, TaskInsertion* insertion);
|
|
||||||
std::vector<TaskDCP*> collectDominanceOrder(llvm::ArrayRef<TaskDCP*> roots, size_t nodeCount);
|
|
||||||
|
|
||||||
} // namespace dcp_graph
|
|
||||||
@@ -1,66 +0,0 @@
|
|||||||
#include <optional>
|
|
||||||
|
|
||||||
#include "Graph.hpp"
|
|
||||||
#include "Task.hpp"
|
|
||||||
#include "UniqueWorklist.hpp"
|
|
||||||
|
|
||||||
std::optional<Edge> TaskDCP::addChild(TaskDCP* child, Weight weight, bool isScheduling) {
|
|
||||||
std::optional<Edge> oldEdge = std::nullopt;
|
|
||||||
auto foundElement = std::find_if(children.begin(), children.end(), [child, isScheduling](Edge element) {
|
|
||||||
return child == element.first && isScheduling == element.isScheduling;
|
|
||||||
});
|
|
||||||
if (foundElement != children.end()) {
|
|
||||||
oldEdge = *foundElement;
|
|
||||||
fastRemove(children, foundElement);
|
|
||||||
}
|
|
||||||
children.emplace_back(Edge {child, weight, isScheduling});
|
|
||||||
return oldEdge;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::optional<Edge> TaskDCP::addParent(TaskDCP* parent, Weight weight, bool isScheduling) {
|
|
||||||
std::optional<Edge> oldEdge = std::nullopt;
|
|
||||||
auto foundElement = std::find_if(parents.begin(), parents.end(), [parent, isScheduling](Edge element) {
|
|
||||||
return parent == element.first && isScheduling == element.isScheduling;
|
|
||||||
});
|
|
||||||
if (foundElement != parents.end()) {
|
|
||||||
oldEdge = *foundElement;
|
|
||||||
fastRemove(parents, foundElement);
|
|
||||||
}
|
|
||||||
parents.emplace_back(Edge {parent, weight, isScheduling});
|
|
||||||
return oldEdge;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool TaskDCP::hasDescendant(TaskDCP* child) {
|
|
||||||
UniqueWorkList<std::vector<TaskDCP*>> worklist;
|
|
||||||
worklist.reserve(32);
|
|
||||||
worklist.pushBack(this);
|
|
||||||
while (!worklist.empty()) {
|
|
||||||
TaskDCP* task = worklist.back();
|
|
||||||
worklist.popBack();
|
|
||||||
if (task == child)
|
|
||||||
return true;
|
|
||||||
for (auto edge : task->children)
|
|
||||||
worklist.pushBack(edge.first);
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
Weight TaskDCP::computeWeightOnCpu(GraphDCP* graph, CPU cpu) {
|
|
||||||
if (crossbarUsage != 0 && graph->wouldExhaustCrossbarCapacity(cpu, this))
|
|
||||||
return std::numeric_limits<Weight>::max();
|
|
||||||
return baseWeight;
|
|
||||||
}
|
|
||||||
|
|
||||||
void TaskInsertion::rollBack() {
|
|
||||||
graph->removeTaskFromCPU(cpuModified, taskInserted);
|
|
||||||
if (beforeNode.has_value()) {
|
|
||||||
auto edgePair = *beforeNode;
|
|
||||||
addEdge(edgePair.first.first, edgePair.second.first, edgePair.first.second, edgePair.first.isScheduling);
|
|
||||||
}
|
|
||||||
if (afterNode.has_value()) {
|
|
||||||
auto edgePair = *afterNode;
|
|
||||||
addEdge(edgePair.first.first, edgePair.second.first, edgePair.first.second, edgePair.first.isScheduling);
|
|
||||||
}
|
|
||||||
// for (auto it = topologicalMoves.rbegin(); it != topologicalMoves.rend(); ++it)
|
|
||||||
// graph->topologicalOrder.moveBefore(it->task, it->nextTask);
|
|
||||||
}
|
|
||||||
@@ -1,126 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <cassert>
|
|
||||||
#include <optional>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "Utils.hpp"
|
|
||||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
|
||||||
|
|
||||||
class TaskDCP : public onnx_mlir::LabeledListNode<TaskDCP> {
|
|
||||||
onnx_mlir::spatial::SpatCompute spatCompute;
|
|
||||||
Time aest;
|
|
||||||
Time alst;
|
|
||||||
std::optional<CPU> scheduledCpu;
|
|
||||||
Weight weight;
|
|
||||||
Weight baseWeight;
|
|
||||||
CrossbarUsage crossbarUsage;
|
|
||||||
long long flag = 0;
|
|
||||||
int64_t syntheticId = -1;
|
|
||||||
|
|
||||||
std::optional<Edge> addChild(TaskDCP* child, Weight weight, bool isScheduling);
|
|
||||||
std::optional<Edge> addChild(TaskDCP& child, Weight weight, bool isScheduling) {
|
|
||||||
return addChild(&child, weight, isScheduling);
|
|
||||||
}
|
|
||||||
|
|
||||||
void removeChild(TaskDCP* toRemove, bool isScheduling) { fastRemove(children, toRemove, isScheduling); }
|
|
||||||
void removeChild(TaskDCP& toRemove, bool isScheduling) { fastRemove(children, &toRemove, isScheduling); }
|
|
||||||
|
|
||||||
std::optional<Edge> addParent(TaskDCP* parent, Weight weight, bool isScheduling);
|
|
||||||
std::optional<Edge> addParent(TaskDCP& parent, Weight weight, bool isScheduling) {
|
|
||||||
return addParent(&parent, weight, isScheduling);
|
|
||||||
}
|
|
||||||
|
|
||||||
void removeParent(TaskDCP* toRemove, bool isScheduling) { fastRemove(parents, toRemove, isScheduling); }
|
|
||||||
void removeParent(TaskDCP& toRemove, bool isScheduling) { fastRemove(parents, &toRemove, isScheduling); }
|
|
||||||
|
|
||||||
public:
|
|
||||||
std::vector<Edge> parents;
|
|
||||||
std::vector<Edge> children;
|
|
||||||
TaskDCP() = default;
|
|
||||||
TaskDCP(onnx_mlir::spatial::SpatCompute spatCompute)
|
|
||||||
: onnx_mlir::LabeledListNode<TaskDCP>(),
|
|
||||||
spatCompute(spatCompute),
|
|
||||||
aest(0),
|
|
||||||
alst(0),
|
|
||||||
scheduledCpu(),
|
|
||||||
weight(getSpatComputeWeight(spatCompute)),
|
|
||||||
baseWeight(weight),
|
|
||||||
crossbarUsage(getSpatComputeCrossbarUsage(spatCompute)),
|
|
||||||
syntheticId(-1),
|
|
||||||
parents(),
|
|
||||||
children() {}
|
|
||||||
|
|
||||||
TaskDCP(int64_t id, Weight weight, CrossbarUsage crossbarUsage = 0)
|
|
||||||
: onnx_mlir::LabeledListNode<TaskDCP>(),
|
|
||||||
spatCompute(),
|
|
||||||
aest(0),
|
|
||||||
alst(0),
|
|
||||||
scheduledCpu(),
|
|
||||||
weight(weight),
|
|
||||||
baseWeight(weight),
|
|
||||||
crossbarUsage(crossbarUsage),
|
|
||||||
flag(0),
|
|
||||||
syntheticId(id),
|
|
||||||
parents(),
|
|
||||||
children() {}
|
|
||||||
|
|
||||||
TaskDCP(const TaskDCP& node) = delete;
|
|
||||||
TaskDCP(TaskDCP&& node) = default;
|
|
||||||
|
|
||||||
void setCpu(CPU cpu) { scheduledCpu = cpu; }
|
|
||||||
std::optional<CPU> getCpu() const { return scheduledCpu; }
|
|
||||||
void resetCpu() { scheduledCpu = std::nullopt; }
|
|
||||||
Weight getWeight() const {
|
|
||||||
if (isScheduled())
|
|
||||||
return weight;
|
|
||||||
return baseWeight;
|
|
||||||
}
|
|
||||||
void setWeight(Weight value) { weight = value; }
|
|
||||||
void resetWeight() { weight = baseWeight; }
|
|
||||||
Weight computeWeightOnCpu(GraphDCP* graph, CPU cpu);
|
|
||||||
CrossbarUsage getCrossbarUsage() const { return crossbarUsage; }
|
|
||||||
|
|
||||||
bool hasParents() const { return parents.size() != 0; }
|
|
||||||
bool hasChildren() const { return children.size() != 0; }
|
|
||||||
|
|
||||||
Time getAest() const { return aest; }
|
|
||||||
Time getAlst() const { return alst; }
|
|
||||||
void setAest(Time value) { aest = value; }
|
|
||||||
void setAlst(Time value) { alst = value; }
|
|
||||||
bool hasDescendant(TaskDCP* child);
|
|
||||||
int64_t Id() const {
|
|
||||||
if (spatCompute)
|
|
||||||
return reinterpret_cast<int64_t>(spatCompute.getAsOpaquePointer());
|
|
||||||
return syntheticId;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool isCriticalPath() const { return alst == aest; }
|
|
||||||
bool isScheduled() const { return scheduledCpu.has_value(); }
|
|
||||||
onnx_mlir::spatial::SpatCompute getSpatCompute() const { return spatCompute; }
|
|
||||||
|
|
||||||
void setFlag(long long val) { flag = val; }
|
|
||||||
long long getFlag() const { return flag; }
|
|
||||||
|
|
||||||
onnx_mlir::LabeledList<TaskDCP>::Iterator getTopologicalIterator() { return getIterator(); }
|
|
||||||
|
|
||||||
friend std::optional<EdgePair> addEdge(TaskDCP* parent, TaskDCP* child, Weight weight, bool isScheduling);
|
|
||||||
friend void removeEdge(TaskDCP* parent, TaskDCP* child, bool isScheduling);
|
|
||||||
friend Weight getTransferCost(TaskDCP* parent, TaskDCP* child);
|
|
||||||
};
|
|
||||||
|
|
||||||
struct TaskInsertion {
|
|
||||||
struct TopologicalMoveRecord {
|
|
||||||
TaskDCP* task;
|
|
||||||
TaskDCP* nextTask;
|
|
||||||
};
|
|
||||||
|
|
||||||
std::optional<EdgePair> beforeNode;
|
|
||||||
std::optional<EdgePair> afterNode;
|
|
||||||
std::vector<TopologicalMoveRecord> topologicalMoves;
|
|
||||||
CPU cpuModified;
|
|
||||||
TaskDCP* taskInserted;
|
|
||||||
GraphDCP* graph;
|
|
||||||
|
|
||||||
void rollBack();
|
|
||||||
};
|
|
||||||
@@ -1,83 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "llvm/ADT/DenseSet.h"
|
|
||||||
|
|
||||||
#include <cassert>
|
|
||||||
#include <type_traits>
|
|
||||||
|
|
||||||
template <typename T, typename = void>
|
|
||||||
struct HasPopFront : std::false_type {};
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
struct HasPopFront<T, std::void_t<decltype(std::declval<T>().pop_front())>> : std::true_type {};
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
class UniqueWorkList {
|
|
||||||
|
|
||||||
using ValueType = typename T::value_type;
|
|
||||||
T storage;
|
|
||||||
llvm::DenseSet<ValueType> uniqueElements;
|
|
||||||
|
|
||||||
public:
|
|
||||||
UniqueWorkList() = default;
|
|
||||||
|
|
||||||
template <typename RangeT>
|
|
||||||
UniqueWorkList(const RangeT& from)
|
|
||||||
: storage() {
|
|
||||||
for (auto& element : from) {
|
|
||||||
if (!uniqueElements.contains(element)) {
|
|
||||||
storage.push_back(element);
|
|
||||||
uniqueElements.insert(element);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
bool empty() const { return storage.empty(); }
|
|
||||||
void reserve(size_t value) { return storage.reserve(value); }
|
|
||||||
size_t size() const { return storage.size(); }
|
|
||||||
ValueType& at(size_t index) { return storage.at(index); }
|
|
||||||
const ValueType& at(size_t index) const { return storage.at(index); }
|
|
||||||
|
|
||||||
ValueType& front() { return storage.front(); }
|
|
||||||
ValueType& back() { return storage.back(); }
|
|
||||||
|
|
||||||
bool pushBack(const ValueType& value) {
|
|
||||||
if (!uniqueElements.contains(value)) {
|
|
||||||
storage.push_back(value);
|
|
||||||
uniqueElements.insert(value);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
void popFront() {
|
|
||||||
if constexpr (HasPopFront<T>::value)
|
|
||||||
storage.pop_front();
|
|
||||||
else
|
|
||||||
assert(false && "Underlying storage type does not support pop_front()");
|
|
||||||
}
|
|
||||||
|
|
||||||
auto cbegin() const { return storage.cbegin(); }
|
|
||||||
auto cend() const { return storage.cend(); }
|
|
||||||
|
|
||||||
void popBack() { storage.pop_back(); }
|
|
||||||
|
|
||||||
template <typename Iterator, typename Mapper>
|
|
||||||
bool allElementsContained(Iterator begin, Iterator end, Mapper map) const {
|
|
||||||
auto it = begin;
|
|
||||||
while (it != end) {
|
|
||||||
if (!uniqueElements.contains(map(*it)))
|
|
||||||
return false;
|
|
||||||
std::advance(it, 1);
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto begin() { return storage.begin(); }
|
|
||||||
|
|
||||||
auto end() { return storage.end(); }
|
|
||||||
|
|
||||||
auto begin() const { return storage.begin(); }
|
|
||||||
|
|
||||||
auto end() const { return storage.end(); }
|
|
||||||
};
|
|
||||||
@@ -1,111 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "mlir/IR/BuiltinTypeInterfaces.h"
|
|
||||||
|
|
||||||
#include "llvm/Support/Casting.h"
|
|
||||||
|
|
||||||
#include <algorithm>
|
|
||||||
#include <cstdint>
|
|
||||||
#include <limits>
|
|
||||||
#include <list>
|
|
||||||
#include <type_traits>
|
|
||||||
#include <utility>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "src/Accelerators/PIM/Common/LabeledList.hpp"
|
|
||||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
|
||||||
|
|
||||||
using CPU = int;
|
|
||||||
using Weight = unsigned long long;
|
|
||||||
using Time = unsigned long long;
|
|
||||||
using CrossbarUsage = unsigned long long;
|
|
||||||
class TaskDCP;
|
|
||||||
class GraphDCP;
|
|
||||||
struct Edge {
|
|
||||||
TaskDCP* first;
|
|
||||||
Weight second;
|
|
||||||
bool isScheduling = false;
|
|
||||||
};
|
|
||||||
using EdgePair = std::pair<Edge, Edge>;
|
|
||||||
using IndexedEdge = std::tuple<int64_t, int64_t, int64_t>;
|
|
||||||
|
|
||||||
inline void fastRemove(std::vector<Edge>& vector, TaskDCP* toRemove, bool isScheduling) {
|
|
||||||
auto position = std::find_if(vector.begin(), vector.end(), [toRemove, isScheduling](Edge edge) {
|
|
||||||
return edge.first == toRemove && edge.isScheduling == isScheduling;
|
|
||||||
});
|
|
||||||
if (position != vector.end()) {
|
|
||||||
std::swap(*(vector.end() - 1), *position);
|
|
||||||
vector.pop_back();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void fastRemove(std::vector<TaskDCP*>& vector, TaskDCP* toRemove) {
|
|
||||||
auto position =
|
|
||||||
std::find_if(vector.begin(), vector.end(), [toRemove](TaskDCP* element) { return element == toRemove; });
|
|
||||||
if (position != vector.end()) {
|
|
||||||
std::swap(*(vector.end() - 1), *position);
|
|
||||||
vector.pop_back();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename P>
|
|
||||||
void fastRemove(std::vector<Edge>& vector, P position) {
|
|
||||||
if (position != vector.end()) {
|
|
||||||
std::swap(*(vector.end() - 1), *position);
|
|
||||||
vector.pop_back();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
inline T checkedAdd(T lhs, T rhs) {
|
|
||||||
static_assert(std::is_unsigned_v<T>, "checkedAdd only supports unsigned types");
|
|
||||||
assert(lhs <= std::numeric_limits<T>::max() - rhs && "unsigned addition overflow");
|
|
||||||
return lhs + rhs;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
inline T checkedMultiply(T lhs, T rhs) {
|
|
||||||
static_assert(std::is_unsigned_v<T>, "checkedMultiply only supports unsigned types");
|
|
||||||
if (lhs == 0 || rhs == 0)
|
|
||||||
return 0;
|
|
||||||
assert(lhs <= std::numeric_limits<T>::max() / rhs && "unsigned multiplication overflow");
|
|
||||||
return lhs * rhs;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
inline T addOrMax(T lhs, T rhs) {
|
|
||||||
static_assert(std::is_unsigned_v<T>, "addOrMax only supports unsigned types");
|
|
||||||
if (lhs == std::numeric_limits<T>::max() || rhs == std::numeric_limits<T>::max())
|
|
||||||
return std::numeric_limits<T>::max();
|
|
||||||
return checkedAdd(lhs, rhs);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
inline T subtractOrZero(T lhs, T rhs) {
|
|
||||||
static_assert(std::is_unsigned_v<T>, "subtractOrZero only supports unsigned types");
|
|
||||||
if (lhs == std::numeric_limits<T>::max())
|
|
||||||
return lhs;
|
|
||||||
if (rhs == std::numeric_limits<T>::max() || lhs <= rhs)
|
|
||||||
return 0;
|
|
||||||
return lhs - rhs;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline Time slackOrZero(Time earliestStart, Time latestStart) { return subtractOrZero(latestStart, earliestStart); }
|
|
||||||
|
|
||||||
inline Weight getSpatComputeWeight(onnx_mlir::spatial::SpatCompute spatCompute) {
|
|
||||||
constexpr Weight kOperationWeight = 100;
|
|
||||||
Weight numOperations = 0;
|
|
||||||
for (auto& block : spatCompute.getBody())
|
|
||||||
for ([[maybe_unused]] auto& op : block)
|
|
||||||
numOperations = checkedAdd(numOperations, static_cast<Weight>(1));
|
|
||||||
return checkedMultiply(numOperations, kOperationWeight);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline CrossbarUsage getSpatComputeCrossbarUsage(onnx_mlir::spatial::SpatCompute spatCompute) {
|
|
||||||
CrossbarUsage crossbarUsage = 0;
|
|
||||||
for (auto& region : spatCompute.getBody())
|
|
||||||
for (auto& inst : region)
|
|
||||||
if (llvm::isa<onnx_mlir::spatial::SpatVMMOp>(inst))
|
|
||||||
crossbarUsage = checkedAdd(crossbarUsage, static_cast<CrossbarUsage>(1));
|
|
||||||
return crossbarUsage;
|
|
||||||
}
|
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||||
#include "mlir/IR/BuiltinTypes.h"
|
#include "mlir/IR/BuiltinTypes.h"
|
||||||
#include "mlir/IR/Operation.h"
|
#include "mlir/IR/Operation.h"
|
||||||
|
#include "mlir/IR/Unit.h"
|
||||||
#include "mlir/IR/Value.h"
|
#include "mlir/IR/Value.h"
|
||||||
|
|
||||||
#include "llvm/ADT/STLExtras.h"
|
#include "llvm/ADT/STLExtras.h"
|
||||||
@@ -32,15 +33,6 @@ Weight getComputeBodyWeight(Region& body) {
|
|||||||
return checkedMultiply(numOperations, kOperationWeight);
|
return checkedMultiply(numOperations, kOperationWeight);
|
||||||
}
|
}
|
||||||
|
|
||||||
CrossbarUsage getComputeBodyCrossbarUsage(Region& body) {
|
|
||||||
CrossbarUsage crossbarUsage = 0;
|
|
||||||
for (auto& block : body)
|
|
||||||
for (auto& op : block)
|
|
||||||
if (isa<SpatVMMOp>(op))
|
|
||||||
crossbarUsage = checkedAdd(crossbarUsage, static_cast<CrossbarUsage>(1));
|
|
||||||
return crossbarUsage;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool isUsedAsWeightOnly(Operation* producerOp) {
|
bool isUsedAsWeightOnly(Operation* producerOp) {
|
||||||
if (producerOp->getNumResults() == 0)
|
if (producerOp->getNumResults() == 0)
|
||||||
return false;
|
return false;
|
||||||
@@ -133,16 +125,28 @@ std::vector<ComputeGraphEdge> aggregateEdges(llvm::ArrayRef<ComputeGraphEdge> ed
|
|||||||
|
|
||||||
Weight getComputeInstanceWeight(const ComputeInstance& instance) {
|
Weight getComputeInstanceWeight(const ComputeInstance& instance) {
|
||||||
if (auto spatCompute = dyn_cast<SpatCompute>(instance.op))
|
if (auto spatCompute = dyn_cast<SpatCompute>(instance.op))
|
||||||
return getSpatComputeWeight(spatCompute);
|
return getComputeBodyWeight(spatCompute.getBody());
|
||||||
auto batch = cast<SpatComputeBatch>(instance.op);
|
auto batch = cast<SpatComputeBatch>(instance.op);
|
||||||
return checkedMultiply(getComputeBodyWeight(batch.getBody()), static_cast<Weight>(instance.laneCount));
|
return checkedMultiply(getComputeBodyWeight(batch.getBody()), static_cast<Weight>(instance.laneCount));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
CrossbarUsage getSpatComputeCrossbarUsage(SpatCompute spatComute){
|
||||||
|
CrossbarUsage ret;
|
||||||
|
ret.insert_range(spatComute.getWeights());
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
CrossbarUsage getSpatComputeBatchCrossbarUsage(SpatComputeBatch spatComuteBatch){
|
||||||
|
CrossbarUsage ret;
|
||||||
|
ret.insert_range(spatComuteBatch.getWeights());
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
CrossbarUsage getComputeInstanceCrossbarUsage(const ComputeInstance& instance) {
|
CrossbarUsage getComputeInstanceCrossbarUsage(const ComputeInstance& instance) {
|
||||||
if (auto spatCompute = dyn_cast<SpatCompute>(instance.op))
|
if (auto spatCompute = dyn_cast<SpatCompute>(instance.op))
|
||||||
return getSpatComputeCrossbarUsage(spatCompute);
|
return getSpatComputeCrossbarUsage(spatCompute);
|
||||||
auto batch = cast<SpatComputeBatch>(instance.op);
|
auto batch = cast<SpatComputeBatch>(instance.op);
|
||||||
return checkedMultiply(getComputeBodyCrossbarUsage(batch.getBody()), static_cast<CrossbarUsage>(instance.laneCount));
|
return getSpatComputeBatchCrossbarUsage(batch);
|
||||||
}
|
}
|
||||||
|
|
||||||
ComputeGraph buildComputeGraph(Operation* entryOp) {
|
ComputeGraph buildComputeGraph(Operation* entryOp) {
|
||||||
|
|||||||
@@ -4,6 +4,7 @@
|
|||||||
#include "mlir/IR/Value.h"
|
#include "mlir/IR/Value.h"
|
||||||
|
|
||||||
#include "llvm/ADT/DenseMap.h"
|
#include "llvm/ADT/DenseMap.h"
|
||||||
|
#include "llvm/ADT/SmallSet.h"
|
||||||
#include "llvm/ADT/SmallVector.h"
|
#include "llvm/ADT/SmallVector.h"
|
||||||
|
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
@@ -11,17 +12,19 @@
|
|||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "../DCPGraph/Utils.hpp"
|
#include "Utils.hpp"
|
||||||
#include "ComputeInstance.hpp"
|
#include "ComputeInstance.hpp"
|
||||||
#include "ComputeInstanceUtils.hpp"
|
#include "ComputeInstanceUtils.hpp"
|
||||||
|
|
||||||
|
using CrossbarUsage = llvm::SmallPtrSet<mlir::Value, 6>;
|
||||||
|
|
||||||
namespace onnx_mlir {
|
namespace onnx_mlir {
|
||||||
namespace spatial {
|
namespace spatial {
|
||||||
|
|
||||||
struct ComputeGraphNode {
|
struct ComputeGraphNode {
|
||||||
ComputeInstance instance;
|
ComputeInstance instance;
|
||||||
Weight weight = 0;
|
Weight weight = 0;
|
||||||
CrossbarUsage crossbarUsage = 0;
|
llvm::SmallPtrSet<mlir::Value,6> crossbarUsage;
|
||||||
size_t originalOrder = 0;
|
size_t originalOrder = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -1,722 +0,0 @@
|
|||||||
#include "llvm/ADT/SmallVector.h"
|
|
||||||
#include "llvm/Support/FormatVariadic.h"
|
|
||||||
#include "llvm/Support/raw_ostream.h"
|
|
||||||
|
|
||||||
#include <algorithm>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <limits>
|
|
||||||
#include <numeric>
|
|
||||||
#include <optional>
|
|
||||||
#include <queue>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "../DCPGraph/Graph.hpp"
|
|
||||||
#include "DcpScheduler.hpp"
|
|
||||||
|
|
||||||
namespace onnx_mlir {
|
|
||||||
namespace spatial {
|
|
||||||
|
|
||||||
using namespace mlir;
|
|
||||||
|
|
||||||
namespace {
|
|
||||||
|
|
||||||
bool isDcpCoarsenDebugEnabled() { return std::getenv("DCP_COARSEN_DEBUG") != nullptr; }
|
|
||||||
|
|
||||||
struct VirtualNode {
|
|
||||||
llvm::SmallVector<size_t, 4> originalNodeIndices;
|
|
||||||
Weight weight = 0;
|
|
||||||
CrossbarUsage crossbarUsage = 0;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct VirtualGraph {
|
|
||||||
std::vector<VirtualNode> nodes;
|
|
||||||
std::vector<IndexedEdge> edges;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct TimingInfo {
|
|
||||||
std::vector<Time> aest;
|
|
||||||
std::vector<Time> alst;
|
|
||||||
std::vector<size_t> topologicalOrder;
|
|
||||||
bool valid = false;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct WindowScheduleResult {
|
|
||||||
std::vector<std::vector<size_t>> mergeGroups;
|
|
||||||
CPU cpuCount = 0;
|
|
||||||
size_t mergedNodeCount = 0;
|
|
||||||
size_t maxMergeGroupSize = 0;
|
|
||||||
};
|
|
||||||
|
|
||||||
size_t getSchedulingCpuBudget(const DcpScheduleOptions& options) {
|
|
||||||
if (options.processorCount > 0)
|
|
||||||
return options.processorCount;
|
|
||||||
return std::numeric_limits<size_t>::max();
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<IndexedEdge> aggregateEdges(llvm::ArrayRef<IndexedEdge> edges) {
|
|
||||||
llvm::DenseMap<std::pair<size_t, size_t>, Weight> edgeWeights;
|
|
||||||
for (auto [start, end, weight] : edges) {
|
|
||||||
size_t startIndex = static_cast<size_t>(start);
|
|
||||||
size_t endIndex = static_cast<size_t>(end);
|
|
||||||
if (startIndex == endIndex)
|
|
||||||
continue;
|
|
||||||
auto key = std::make_pair(startIndex, endIndex);
|
|
||||||
Weight edgeWeight = static_cast<Weight>(weight);
|
|
||||||
auto inserted = edgeWeights.try_emplace(key, edgeWeight);
|
|
||||||
if (!inserted.second)
|
|
||||||
inserted.first->second = std::max(inserted.first->second, edgeWeight);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<IndexedEdge> aggregatedEdges;
|
|
||||||
aggregatedEdges.reserve(edgeWeights.size());
|
|
||||||
for (auto [key, weight] : edgeWeights)
|
|
||||||
aggregatedEdges.push_back(
|
|
||||||
{static_cast<int64_t>(key.first), static_cast<int64_t>(key.second), static_cast<int64_t>(weight)});
|
|
||||||
llvm::sort(aggregatedEdges, [](const IndexedEdge& lhs, const IndexedEdge& rhs) {
|
|
||||||
if (std::get<0>(lhs) != std::get<0>(rhs))
|
|
||||||
return std::get<0>(lhs) < std::get<0>(rhs);
|
|
||||||
return std::get<1>(lhs) < std::get<1>(rhs);
|
|
||||||
});
|
|
||||||
return aggregatedEdges;
|
|
||||||
}
|
|
||||||
|
|
||||||
VirtualGraph buildInitialVirtualGraph(const ComputeGraph& graph) {
|
|
||||||
VirtualGraph virtualGraph;
|
|
||||||
virtualGraph.nodes.reserve(graph.nodes.size());
|
|
||||||
for (auto [index, node] : llvm::enumerate(graph.nodes)) {
|
|
||||||
VirtualNode virtualNode;
|
|
||||||
virtualNode.originalNodeIndices.push_back(index);
|
|
||||||
virtualNode.weight = node.weight;
|
|
||||||
virtualNode.crossbarUsage = node.crossbarUsage;
|
|
||||||
virtualGraph.nodes.push_back(std::move(virtualNode));
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<IndexedEdge> edges;
|
|
||||||
edges.reserve(graph.edges.size());
|
|
||||||
for (const ComputeGraphEdge& edge : graph.edges)
|
|
||||||
edges.push_back(
|
|
||||||
{static_cast<int64_t>(edge.source), static_cast<int64_t>(edge.target), static_cast<int64_t>(edge.transferCost)});
|
|
||||||
virtualGraph.edges = aggregateEdges(edges);
|
|
||||||
return virtualGraph;
|
|
||||||
}
|
|
||||||
|
|
||||||
TimingInfo computeTiming(const VirtualGraph& graph) {
|
|
||||||
TimingInfo timing;
|
|
||||||
size_t nodeCount = graph.nodes.size();
|
|
||||||
timing.aest.assign(nodeCount, 0);
|
|
||||||
timing.alst.assign(nodeCount, 0);
|
|
||||||
timing.topologicalOrder.reserve(nodeCount);
|
|
||||||
|
|
||||||
std::vector<std::vector<std::pair<size_t, Weight>>> parents(nodeCount);
|
|
||||||
std::vector<std::vector<std::pair<size_t, Weight>>> children(nodeCount);
|
|
||||||
std::vector<size_t> incomingEdgeCount(nodeCount, 0);
|
|
||||||
|
|
||||||
for (auto [start, end, weight] : graph.edges) {
|
|
||||||
size_t startIndex = static_cast<size_t>(start);
|
|
||||||
size_t endIndex = static_cast<size_t>(end);
|
|
||||||
Weight edgeWeight = static_cast<Weight>(weight);
|
|
||||||
assert(startIndex < nodeCount && endIndex < nodeCount && "virtual edge endpoint out of range");
|
|
||||||
children[startIndex].push_back({endIndex, edgeWeight});
|
|
||||||
parents[endIndex].push_back({startIndex, edgeWeight});
|
|
||||||
incomingEdgeCount[endIndex]++;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto getVirtualNodeOrderKey = [&](size_t nodeIndex) {
|
|
||||||
const VirtualNode& node = graph.nodes[nodeIndex];
|
|
||||||
if (!node.originalNodeIndices.empty())
|
|
||||||
return node.originalNodeIndices.front();
|
|
||||||
return nodeIndex;
|
|
||||||
};
|
|
||||||
auto readyNodeGreater = [&](size_t lhs, size_t rhs) {
|
|
||||||
size_t lhsKey = getVirtualNodeOrderKey(lhs);
|
|
||||||
size_t rhsKey = getVirtualNodeOrderKey(rhs);
|
|
||||||
if (lhsKey != rhsKey)
|
|
||||||
return lhsKey > rhsKey;
|
|
||||||
return lhs > rhs;
|
|
||||||
};
|
|
||||||
std::priority_queue<size_t, std::vector<size_t>, decltype(readyNodeGreater)> readyNodes(readyNodeGreater);
|
|
||||||
for (size_t i = 0; i < nodeCount; ++i)
|
|
||||||
if (incomingEdgeCount[i] == 0)
|
|
||||||
readyNodes.push(i);
|
|
||||||
|
|
||||||
while (!readyNodes.empty()) {
|
|
||||||
size_t current = readyNodes.top();
|
|
||||||
readyNodes.pop();
|
|
||||||
timing.topologicalOrder.push_back(current);
|
|
||||||
for (auto [child, weight] : children[current]) {
|
|
||||||
(void) weight;
|
|
||||||
assert(incomingEdgeCount[child] > 0 && "incoming edge count underflow");
|
|
||||||
incomingEdgeCount[child]--;
|
|
||||||
if (incomingEdgeCount[child] == 0)
|
|
||||||
readyNodes.push(child);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (timing.topologicalOrder.size() != nodeCount)
|
|
||||||
return timing;
|
|
||||||
|
|
||||||
Time dcpl = 0;
|
|
||||||
for (size_t nodeIndex : timing.topologicalOrder) {
|
|
||||||
Time maxParentAest = 0;
|
|
||||||
for (auto [parent, transferCost] : parents[nodeIndex]) {
|
|
||||||
maxParentAest =
|
|
||||||
std::max(maxParentAest, addOrMax(addOrMax(timing.aest[parent], graph.nodes[parent].weight), transferCost));
|
|
||||||
}
|
|
||||||
timing.aest[nodeIndex] = maxParentAest;
|
|
||||||
dcpl = std::max(dcpl, addOrMax(maxParentAest, graph.nodes[nodeIndex].weight));
|
|
||||||
}
|
|
||||||
|
|
||||||
for (size_t nodeIndex : llvm::reverse(timing.topologicalOrder)) {
|
|
||||||
Time minAlst = std::numeric_limits<Time>::max();
|
|
||||||
if (children[nodeIndex].empty())
|
|
||||||
minAlst = subtractOrZero(dcpl, graph.nodes[nodeIndex].weight);
|
|
||||||
for (auto [child, transferCost] : children[nodeIndex]) {
|
|
||||||
minAlst =
|
|
||||||
std::min(minAlst, subtractOrZero(timing.alst[child], addOrMax(graph.nodes[nodeIndex].weight, transferCost)));
|
|
||||||
}
|
|
||||||
timing.alst[nodeIndex] = minAlst;
|
|
||||||
}
|
|
||||||
|
|
||||||
timing.valid = true;
|
|
||||||
return timing;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<std::vector<size_t>> buildUndirectedAdjacency(const VirtualGraph& graph) {
|
|
||||||
std::vector<std::vector<size_t>> adjacency(graph.nodes.size());
|
|
||||||
for (auto [start, end, weight] : graph.edges) {
|
|
||||||
(void) weight;
|
|
||||||
size_t startIndex = static_cast<size_t>(start);
|
|
||||||
size_t endIndex = static_cast<size_t>(end);
|
|
||||||
assert(startIndex < graph.nodes.size() && endIndex < graph.nodes.size() && "virtual edge endpoint out of range");
|
|
||||||
adjacency[startIndex].push_back(endIndex);
|
|
||||||
adjacency[endIndex].push_back(startIndex);
|
|
||||||
}
|
|
||||||
for (auto& neighbours : adjacency) {
|
|
||||||
llvm::sort(neighbours);
|
|
||||||
neighbours.erase(std::unique(neighbours.begin(), neighbours.end()), neighbours.end());
|
|
||||||
}
|
|
||||||
return adjacency;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<size_t> selectCriticalWindow(const VirtualGraph& graph, const TimingInfo& timing, size_t windowSize) {
|
|
||||||
std::vector<size_t> ranked(timing.aest.size());
|
|
||||||
std::iota(ranked.begin(), ranked.end(), 0);
|
|
||||||
auto isHigherPriority = [&](size_t lhs, size_t rhs) {
|
|
||||||
Time lhsSlack = slackOrZero(timing.aest[lhs], timing.alst[lhs]);
|
|
||||||
Time rhsSlack = slackOrZero(timing.aest[rhs], timing.alst[rhs]);
|
|
||||||
if (lhsSlack != rhsSlack)
|
|
||||||
return lhsSlack < rhsSlack;
|
|
||||||
if (timing.aest[lhs] != timing.aest[rhs])
|
|
||||||
return timing.aest[lhs] < timing.aest[rhs];
|
|
||||||
return lhs < rhs;
|
|
||||||
};
|
|
||||||
|
|
||||||
windowSize = std::min(windowSize, ranked.size());
|
|
||||||
if (windowSize == 0)
|
|
||||||
return {};
|
|
||||||
if (windowSize == ranked.size()) {
|
|
||||||
llvm::sort(ranked, isHigherPriority);
|
|
||||||
return ranked;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t criticalPoolSize = std::min(ranked.size(), std::max(windowSize, windowSize * 2));
|
|
||||||
if (criticalPoolSize < ranked.size())
|
|
||||||
std::nth_element(
|
|
||||||
ranked.begin(), ranked.begin() + static_cast<std::ptrdiff_t>(criticalPoolSize), ranked.end(), isHigherPriority);
|
|
||||||
|
|
||||||
std::vector<char> inCriticalPool(ranked.size(), false);
|
|
||||||
for (size_t i = 0; i < criticalPoolSize; ++i)
|
|
||||||
inCriticalPool[ranked[i]] = true;
|
|
||||||
|
|
||||||
size_t seed = *std::min_element(ranked.begin(), ranked.end(), isHigherPriority);
|
|
||||||
std::vector<std::vector<size_t>> adjacency = buildUndirectedAdjacency(graph);
|
|
||||||
std::vector<size_t> selected;
|
|
||||||
std::vector<char> inWindow(ranked.size(), false);
|
|
||||||
selected.reserve(windowSize);
|
|
||||||
|
|
||||||
struct FrontierEntry {
|
|
||||||
size_t node;
|
|
||||||
};
|
|
||||||
auto frontierCompare = [&](FrontierEntry lhs, FrontierEntry rhs) { return isHigherPriority(rhs.node, lhs.node); };
|
|
||||||
std::priority_queue<FrontierEntry, std::vector<FrontierEntry>, decltype(frontierCompare)> frontier(frontierCompare);
|
|
||||||
|
|
||||||
auto addToWindow = [&](size_t node, const std::vector<char>& eligible) {
|
|
||||||
if (inWindow[node])
|
|
||||||
return;
|
|
||||||
inWindow[node] = true;
|
|
||||||
selected.push_back(node);
|
|
||||||
for (size_t neighbour : adjacency[node])
|
|
||||||
if (!inWindow[neighbour] && eligible[neighbour])
|
|
||||||
frontier.push({neighbour});
|
|
||||||
};
|
|
||||||
|
|
||||||
addToWindow(seed, inCriticalPool);
|
|
||||||
while (!frontier.empty() && selected.size() < windowSize) {
|
|
||||||
size_t node = frontier.top().node;
|
|
||||||
frontier.pop();
|
|
||||||
if (!inWindow[node])
|
|
||||||
addToWindow(node, inCriticalPool);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (selected.size() < windowSize) {
|
|
||||||
std::vector<char> anyNode(ranked.size(), true);
|
|
||||||
for (size_t node : selected)
|
|
||||||
for (size_t neighbour : adjacency[node])
|
|
||||||
if (!inWindow[neighbour])
|
|
||||||
frontier.push({neighbour});
|
|
||||||
while (!frontier.empty() && selected.size() < windowSize) {
|
|
||||||
size_t node = frontier.top().node;
|
|
||||||
frontier.pop();
|
|
||||||
if (!inWindow[node])
|
|
||||||
addToWindow(node, anyNode);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (selected.size() < windowSize) {
|
|
||||||
llvm::sort(ranked, isHigherPriority);
|
|
||||||
for (size_t node : ranked) {
|
|
||||||
if (selected.size() == windowSize)
|
|
||||||
break;
|
|
||||||
if (!inWindow[node]) {
|
|
||||||
inWindow[node] = true;
|
|
||||||
selected.push_back(node);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
llvm::sort(selected, isHigherPriority);
|
|
||||||
return selected;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<IndexedEdge> buildWindowEdges(const VirtualGraph& graph, const std::vector<int64_t>& nodeToWindowIndex) {
|
|
||||||
std::vector<IndexedEdge> windowEdges;
|
|
||||||
windowEdges.reserve(graph.edges.size());
|
|
||||||
for (auto [start, end, weight] : graph.edges) {
|
|
||||||
int64_t mappedStart = nodeToWindowIndex[static_cast<size_t>(start)];
|
|
||||||
int64_t mappedEnd = nodeToWindowIndex[static_cast<size_t>(end)];
|
|
||||||
if (mappedStart == -1 || mappedEnd == -1)
|
|
||||||
continue;
|
|
||||||
windowEdges.push_back({mappedStart, mappedEnd, weight});
|
|
||||||
}
|
|
||||||
return aggregateEdges(windowEdges);
|
|
||||||
}
|
|
||||||
|
|
||||||
WindowScheduleResult scheduleWindow(const VirtualGraph& graph,
|
|
||||||
llvm::ArrayRef<size_t> selectedNodes,
|
|
||||||
const DcpScheduleOptions& options,
|
|
||||||
mlir::MLIRContext* context) {
|
|
||||||
std::vector<Weight> windowWeights;
|
|
||||||
std::vector<CrossbarUsage> windowCrossbarUsage;
|
|
||||||
std::vector<int64_t> windowNodeOrderKeys;
|
|
||||||
std::vector<int64_t> nodeToWindowIndex(graph.nodes.size(), -1);
|
|
||||||
windowWeights.reserve(selectedNodes.size());
|
|
||||||
windowCrossbarUsage.reserve(selectedNodes.size());
|
|
||||||
windowNodeOrderKeys.reserve(selectedNodes.size());
|
|
||||||
|
|
||||||
for (auto [windowIndex, nodeIndex] : llvm::enumerate(selectedNodes)) {
|
|
||||||
nodeToWindowIndex[nodeIndex] = static_cast<int64_t>(windowIndex);
|
|
||||||
windowWeights.push_back(graph.nodes[nodeIndex].weight);
|
|
||||||
windowCrossbarUsage.push_back(graph.nodes[nodeIndex].crossbarUsage);
|
|
||||||
windowNodeOrderKeys.push_back(static_cast<int64_t>(nodeIndex));
|
|
||||||
}
|
|
||||||
|
|
||||||
GraphDCP windowGraph(
|
|
||||||
windowWeights, buildWindowEdges(graph, nodeToWindowIndex), windowNodeOrderKeys, windowCrossbarUsage);
|
|
||||||
if (options.processorCount > 0)
|
|
||||||
windowGraph.setMaxCpuCount(static_cast<int>(options.processorCount));
|
|
||||||
windowGraph.setContext(context);
|
|
||||||
windowGraph.runDcp();
|
|
||||||
|
|
||||||
WindowScheduleResult result;
|
|
||||||
result.cpuCount = windowGraph.cpuCount();
|
|
||||||
for (CPU cpu = 0; cpu < windowGraph.cpuCount(); ++cpu) {
|
|
||||||
auto scheduledTasks = windowGraph.getScheduledTasks(cpu);
|
|
||||||
if (scheduledTasks.size() < 2)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
result.mergedNodeCount += scheduledTasks.size();
|
|
||||||
result.maxMergeGroupSize = std::max(result.maxMergeGroupSize, scheduledTasks.size());
|
|
||||||
std::vector<size_t> mergeGroup;
|
|
||||||
mergeGroup.reserve(scheduledTasks.size());
|
|
||||||
for (const auto& task : scheduledTasks)
|
|
||||||
mergeGroup.push_back(selectedNodes[task.nodeIndex]);
|
|
||||||
result.mergeGroups.push_back(std::move(mergeGroup));
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool coarsenGraph(const VirtualGraph& graph,
|
|
||||||
llvm::ArrayRef<std::vector<size_t>> mergeGroups,
|
|
||||||
VirtualGraph& coarsenedGraph,
|
|
||||||
std::vector<size_t>& oldToNewNode) {
|
|
||||||
TimingInfo timing = computeTiming(graph);
|
|
||||||
std::vector<size_t> topologicalRank(graph.nodes.size());
|
|
||||||
std::iota(topologicalRank.begin(), topologicalRank.end(), 0);
|
|
||||||
if (timing.valid)
|
|
||||||
for (auto [rank, nodeIndex] : llvm::enumerate(timing.topologicalOrder))
|
|
||||||
topologicalRank[nodeIndex] = rank;
|
|
||||||
|
|
||||||
std::vector<std::vector<size_t>> orderedMergeGroups;
|
|
||||||
orderedMergeGroups.reserve(mergeGroups.size());
|
|
||||||
for (const auto& mergeGroup : mergeGroups) {
|
|
||||||
orderedMergeGroups.emplace_back(mergeGroup.begin(), mergeGroup.end());
|
|
||||||
std::stable_sort(orderedMergeGroups.back().begin(), orderedMergeGroups.back().end(), [&](size_t lhs, size_t rhs) {
|
|
||||||
if (topologicalRank[lhs] != topologicalRank[rhs])
|
|
||||||
return topologicalRank[lhs] < topologicalRank[rhs];
|
|
||||||
return lhs < rhs;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<int64_t> nodeToMergeGroup(graph.nodes.size(), -1);
|
|
||||||
for (auto [groupIndex, mergeGroup] : llvm::enumerate(orderedMergeGroups)) {
|
|
||||||
if (mergeGroup.size() < 2)
|
|
||||||
continue;
|
|
||||||
for (size_t nodeIndex : mergeGroup) {
|
|
||||||
assert(nodeIndex < graph.nodes.size() && "merge group node out of range");
|
|
||||||
nodeToMergeGroup[nodeIndex] = static_cast<int64_t>(groupIndex);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<std::optional<size_t>> mergeGroupToNewNode(orderedMergeGroups.size());
|
|
||||||
std::vector<size_t> newNodeRank;
|
|
||||||
oldToNewNode.assign(graph.nodes.size(), 0);
|
|
||||||
bool mergedAny = false;
|
|
||||||
coarsenedGraph.nodes.clear();
|
|
||||||
coarsenedGraph.edges.clear();
|
|
||||||
coarsenedGraph.nodes.reserve(graph.nodes.size());
|
|
||||||
newNodeRank.reserve(graph.nodes.size());
|
|
||||||
|
|
||||||
for (size_t nodeIndex = 0; nodeIndex < graph.nodes.size(); ++nodeIndex) {
|
|
||||||
int64_t mergeGroupIndex = nodeToMergeGroup[nodeIndex];
|
|
||||||
if (mergeGroupIndex == -1) {
|
|
||||||
oldToNewNode[nodeIndex] = coarsenedGraph.nodes.size();
|
|
||||||
coarsenedGraph.nodes.push_back(graph.nodes[nodeIndex]);
|
|
||||||
newNodeRank.push_back(topologicalRank[nodeIndex]);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto& newNodeIndex = mergeGroupToNewNode[static_cast<size_t>(mergeGroupIndex)];
|
|
||||||
if (newNodeIndex.has_value()) {
|
|
||||||
oldToNewNode[nodeIndex] = *newNodeIndex;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
VirtualNode mergedNode;
|
|
||||||
for (size_t memberIndex : orderedMergeGroups[static_cast<size_t>(mergeGroupIndex)]) {
|
|
||||||
const VirtualNode& memberNode = graph.nodes[memberIndex];
|
|
||||||
mergedNode.originalNodeIndices.append(memberNode.originalNodeIndices.begin(),
|
|
||||||
memberNode.originalNodeIndices.end());
|
|
||||||
mergedNode.weight = addOrMax(mergedNode.weight, memberNode.weight);
|
|
||||||
mergedNode.crossbarUsage = addOrMax(mergedNode.crossbarUsage, memberNode.crossbarUsage);
|
|
||||||
}
|
|
||||||
std::sort(mergedNode.originalNodeIndices.begin(), mergedNode.originalNodeIndices.end());
|
|
||||||
|
|
||||||
mergedAny = true;
|
|
||||||
newNodeIndex = coarsenedGraph.nodes.size();
|
|
||||||
for (size_t memberIndex : orderedMergeGroups[static_cast<size_t>(mergeGroupIndex)])
|
|
||||||
oldToNewNode[memberIndex] = *newNodeIndex;
|
|
||||||
newNodeRank.push_back(topologicalRank[orderedMergeGroups[static_cast<size_t>(mergeGroupIndex)].front()]);
|
|
||||||
coarsenedGraph.nodes.push_back(std::move(mergedNode));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!mergedAny)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
std::vector<IndexedEdge> remappedEdges;
|
|
||||||
remappedEdges.reserve(graph.edges.size());
|
|
||||||
for (auto [start, end, weight] : graph.edges) {
|
|
||||||
size_t newStart = oldToNewNode[static_cast<size_t>(start)];
|
|
||||||
size_t newEnd = oldToNewNode[static_cast<size_t>(end)];
|
|
||||||
if (newStart == newEnd)
|
|
||||||
continue;
|
|
||||||
if (newNodeRank[newStart] >= newNodeRank[newEnd])
|
|
||||||
continue;
|
|
||||||
remappedEdges.push_back({static_cast<int64_t>(newStart), static_cast<int64_t>(newEnd), weight});
|
|
||||||
}
|
|
||||||
coarsenedGraph.edges = aggregateEdges(remappedEdges);
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t getDcpCoarseningWindowSize(size_t nodeCount, const DcpScheduleOptions& options) {
|
|
||||||
size_t windowSize = std::min(options.criticalWindowSize, nodeCount);
|
|
||||||
CPU maxCpuCount = std::max<CPU>(1, static_cast<CPU>(getSchedulingCpuBudget(options)));
|
|
||||||
if (nodeCount > static_cast<size_t>(maxCpuCount))
|
|
||||||
windowSize = std::max(windowSize, std::min(nodeCount, static_cast<size_t>(maxCpuCount) + 1));
|
|
||||||
return windowSize;
|
|
||||||
}
|
|
||||||
|
|
||||||
void assignFeasibleAest(const ComputeGraph& graph, MergeScheduleResult& result) {
|
|
||||||
llvm::DenseMap<ComputeInstance, size_t> nodeIndexByInstance;
|
|
||||||
nodeIndexByInstance.reserve(graph.nodes.size());
|
|
||||||
for (auto [nodeIndex, node] : llvm::enumerate(graph.nodes))
|
|
||||||
nodeIndexByInstance[node.instance] = nodeIndex;
|
|
||||||
|
|
||||||
struct ScheduledEdge {
|
|
||||||
size_t target = 0;
|
|
||||||
Time delay = 0;
|
|
||||||
};
|
|
||||||
|
|
||||||
std::vector<std::vector<ScheduledEdge>> scheduledChildren(graph.nodes.size());
|
|
||||||
std::vector<size_t> incomingEdgeCount(graph.nodes.size(), 0);
|
|
||||||
for (const ComputeGraphEdge& edge : graph.edges) {
|
|
||||||
const ComputeInstance sourceInstance = graph.nodes[edge.source].instance;
|
|
||||||
const ComputeInstance targetInstance = graph.nodes[edge.target].instance;
|
|
||||||
const size_t sourceCpu = result.computeToCpuMap.lookup(sourceInstance);
|
|
||||||
const size_t targetCpu = result.computeToCpuMap.lookup(targetInstance);
|
|
||||||
|
|
||||||
Time delay = graph.nodes[edge.source].weight;
|
|
||||||
if (sourceCpu != targetCpu)
|
|
||||||
delay = addOrMax(delay, edge.transferCost);
|
|
||||||
|
|
||||||
scheduledChildren[edge.source].push_back({edge.target, delay});
|
|
||||||
incomingEdgeCount[edge.target]++;
|
|
||||||
}
|
|
||||||
|
|
||||||
llvm::DenseMap<size_t, std::vector<std::pair<size_t, size_t>>> tasksByCpu;
|
|
||||||
for (const ComputeGraphNode& node : graph.nodes) {
|
|
||||||
size_t cpu = result.computeToCpuMap.lookup(node.instance);
|
|
||||||
size_t slot = result.computeToCpuSlotMap.lookup(node.instance);
|
|
||||||
tasksByCpu[cpu].push_back({slot, nodeIndexByInstance.lookup(node.instance)});
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto& entry : tasksByCpu) {
|
|
||||||
auto& scheduledTasks = entry.second;
|
|
||||||
llvm::sort(scheduledTasks, [](const auto& lhs, const auto& rhs) {
|
|
||||||
if (lhs.first != rhs.first)
|
|
||||||
return lhs.first < rhs.first;
|
|
||||||
return lhs.second < rhs.second;
|
|
||||||
});
|
|
||||||
|
|
||||||
for (size_t i = 1; i < scheduledTasks.size(); ++i) {
|
|
||||||
size_t sourceIndex = scheduledTasks[i - 1].second;
|
|
||||||
size_t targetIndex = scheduledTasks[i].second;
|
|
||||||
scheduledChildren[sourceIndex].push_back({targetIndex, graph.nodes[sourceIndex].weight});
|
|
||||||
incomingEdgeCount[targetIndex]++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
auto readyNodeGreater = [&](size_t lhs, size_t rhs) {
|
|
||||||
if (graph.nodes[lhs].originalOrder != graph.nodes[rhs].originalOrder)
|
|
||||||
return graph.nodes[lhs].originalOrder > graph.nodes[rhs].originalOrder;
|
|
||||||
return lhs > rhs;
|
|
||||||
};
|
|
||||||
std::priority_queue<size_t, std::vector<size_t>, decltype(readyNodeGreater)> readyNodes(readyNodeGreater);
|
|
||||||
for (size_t nodeIndex = 0; nodeIndex < graph.nodes.size(); ++nodeIndex)
|
|
||||||
if (incomingEdgeCount[nodeIndex] == 0)
|
|
||||||
readyNodes.push(nodeIndex);
|
|
||||||
|
|
||||||
std::vector<Time> startTimes(graph.nodes.size(), 0);
|
|
||||||
size_t processedNodeCount = 0;
|
|
||||||
while (!readyNodes.empty()) {
|
|
||||||
size_t sourceIndex = readyNodes.top();
|
|
||||||
readyNodes.pop();
|
|
||||||
processedNodeCount++;
|
|
||||||
|
|
||||||
for (const ScheduledEdge& edge : scheduledChildren[sourceIndex]) {
|
|
||||||
startTimes[edge.target] = std::max(startTimes[edge.target], addOrMax(startTimes[sourceIndex], edge.delay));
|
|
||||||
assert(incomingEdgeCount[edge.target] > 0 && "scheduled incoming edge count underflow");
|
|
||||||
incomingEdgeCount[edge.target]--;
|
|
||||||
if (incomingEdgeCount[edge.target] == 0)
|
|
||||||
readyNodes.push(edge.target);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (processedNodeCount != graph.nodes.size())
|
|
||||||
llvm::report_fatal_error("merge scheduling: coarsened DCP schedule is cyclic");
|
|
||||||
|
|
||||||
for (auto [nodeIndex, node] : llvm::enumerate(graph.nodes))
|
|
||||||
result.computeToAestMap[node.instance] = startTimes[nodeIndex];
|
|
||||||
}
|
|
||||||
|
|
||||||
MergeScheduleResult buildResultFromVirtualGraph(const VirtualGraph& graph, const ComputeGraph& originalGraph) {
|
|
||||||
MergeScheduleResult result;
|
|
||||||
|
|
||||||
TimingInfo timing = computeTiming(graph);
|
|
||||||
std::vector<size_t> virtualNodeOrder;
|
|
||||||
if (timing.valid)
|
|
||||||
virtualNodeOrder = std::move(timing.topologicalOrder);
|
|
||||||
else {
|
|
||||||
virtualNodeOrder.resize(graph.nodes.size());
|
|
||||||
std::iota(virtualNodeOrder.begin(), virtualNodeOrder.end(), 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<size_t> originalNodeToCpu(originalGraph.nodes.size(), 0);
|
|
||||||
for (auto [cpu, virtualNodeIndex] : llvm::enumerate(virtualNodeOrder)) {
|
|
||||||
const VirtualNode& virtualNode = graph.nodes[virtualNodeIndex];
|
|
||||||
for (size_t originalIndex : virtualNode.originalNodeIndices)
|
|
||||||
originalNodeToCpu[originalIndex] = cpu;
|
|
||||||
}
|
|
||||||
|
|
||||||
result.dominanceOrderCompute.reserve(originalGraph.nodes.size());
|
|
||||||
llvm::DenseMap<size_t, size_t> nextCpuSlot;
|
|
||||||
for (auto [originalIndex, node] : llvm::enumerate(originalGraph.nodes)) {
|
|
||||||
size_t cpu = originalNodeToCpu[originalIndex];
|
|
||||||
result.dominanceOrderCompute.push_back(node.instance);
|
|
||||||
result.computeToCpuMap[node.instance] = cpu;
|
|
||||||
result.computeToCpuSlotMap[node.instance] = nextCpuSlot[cpu]++;
|
|
||||||
result.cpuToLastComputeMap[cpu] = node.instance;
|
|
||||||
}
|
|
||||||
for (const auto& [cpu, lastCompute] : result.cpuToLastComputeMap)
|
|
||||||
result.isLastComputeOfCpu.insert(lastCompute);
|
|
||||||
assignFeasibleAest(originalGraph, result);
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
MergeScheduleResult buildResultFromScheduledGraph(GraphDCP& graphDCP, const ComputeGraph& graph) {
|
|
||||||
MergeScheduleResult result;
|
|
||||||
result.dominanceOrderCompute.reserve(graph.nodes.size());
|
|
||||||
for (const ComputeGraphNode& node : graph.nodes)
|
|
||||||
result.dominanceOrderCompute.push_back(node.instance);
|
|
||||||
|
|
||||||
for (CPU cpu = 0; cpu < graphDCP.cpuCount(); ++cpu) {
|
|
||||||
auto scheduledTasks = graphDCP.getScheduledTasks(cpu);
|
|
||||||
if (scheduledTasks.empty())
|
|
||||||
continue;
|
|
||||||
|
|
||||||
for (auto [slot, task] : llvm::enumerate(scheduledTasks)) {
|
|
||||||
const ComputeInstance instance = graph.nodes[task.nodeIndex].instance;
|
|
||||||
result.computeToCpuMap[instance] = cpu;
|
|
||||||
result.computeToCpuSlotMap[instance] = slot;
|
|
||||||
result.computeToAestMap[instance] = static_cast<uint64_t>(task.aest);
|
|
||||||
}
|
|
||||||
|
|
||||||
const ComputeInstance lastInstance = graph.nodes[scheduledTasks.back().nodeIndex].instance;
|
|
||||||
result.cpuToLastComputeMap[cpu] = lastInstance;
|
|
||||||
result.isLastComputeOfCpu.insert(lastInstance);
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
MergeScheduleResult
|
|
||||||
runLegacyDcp(const ComputeGraph& graph, const DcpScheduleOptions& options, mlir::MLIRContext* context) {
|
|
||||||
llvm::SmallVector<Weight> nodeWeights;
|
|
||||||
llvm::SmallVector<CrossbarUsage> nodeCrossbarUsage;
|
|
||||||
llvm::SmallVector<int64_t> nodeOrderKeys;
|
|
||||||
llvm::SmallVector<IndexedEdge> edges;
|
|
||||||
nodeWeights.reserve(graph.nodes.size());
|
|
||||||
nodeCrossbarUsage.reserve(graph.nodes.size());
|
|
||||||
nodeOrderKeys.reserve(graph.nodes.size());
|
|
||||||
edges.reserve(graph.edges.size());
|
|
||||||
|
|
||||||
for (const ComputeGraphNode& node : graph.nodes) {
|
|
||||||
nodeWeights.push_back(node.weight);
|
|
||||||
nodeCrossbarUsage.push_back(node.crossbarUsage);
|
|
||||||
nodeOrderKeys.push_back(static_cast<int64_t>(node.originalOrder));
|
|
||||||
}
|
|
||||||
for (const ComputeGraphEdge& edge : graph.edges) {
|
|
||||||
edges.push_back(
|
|
||||||
{static_cast<int64_t>(edge.source), static_cast<int64_t>(edge.target), static_cast<int64_t>(edge.transferCost)});
|
|
||||||
}
|
|
||||||
|
|
||||||
GraphDCP graphDCP(nodeWeights, edges, nodeOrderKeys, nodeCrossbarUsage);
|
|
||||||
if (options.processorCount > 0)
|
|
||||||
graphDCP.setMaxCpuCount(static_cast<int>(options.processorCount));
|
|
||||||
graphDCP.setContext(context);
|
|
||||||
graphDCP.runDcp();
|
|
||||||
return buildResultFromScheduledGraph(graphDCP, graph);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool needsExactScheduledBatches(const ComputeGraph& graph, const DcpScheduleOptions& options) {
|
|
||||||
if (options.processorCount == 0 || !options.allowFallbackForAutoCoreCount)
|
|
||||||
return false;
|
|
||||||
size_t schedulingCpuBudget = getSchedulingCpuBudget(options);
|
|
||||||
return llvm::any_of(graph.nodes, [&](const ComputeGraphNode& node) {
|
|
||||||
auto batch = dyn_cast<SpatComputeBatch>(node.instance.op);
|
|
||||||
return batch && static_cast<size_t>(batch.getLaneCount()) > schedulingCpuBudget;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
MergeScheduleResult
|
|
||||||
runDcpScheduler(const ComputeGraph& graph, const DcpScheduleOptions& options, mlir::MLIRContext* context) {
|
|
||||||
if (needsExactScheduledBatches(graph, options))
|
|
||||||
return runLegacyDcp(graph, options, context);
|
|
||||||
|
|
||||||
if (options.criticalWindowSize == 0)
|
|
||||||
return runLegacyDcp(graph, options, context);
|
|
||||||
|
|
||||||
VirtualGraph virtualGraph = buildInitialVirtualGraph(graph);
|
|
||||||
size_t iteration = 0;
|
|
||||||
bool debugCoarsening = isDcpCoarsenDebugEnabled();
|
|
||||||
auto tryCoarsenSelectedNodes = [&](llvm::ArrayRef<size_t> selectedNodes) {
|
|
||||||
size_t oldNodeCount = virtualGraph.nodes.size();
|
|
||||||
WindowScheduleResult windowSchedule = scheduleWindow(virtualGraph, selectedNodes, options, context);
|
|
||||||
if (windowSchedule.mergeGroups.empty()) {
|
|
||||||
if (debugCoarsening && oldNodeCount >= 200)
|
|
||||||
llvm::errs() << llvm::formatv("[DCP-COARSEN] iter={0} old={1} selected={2} windowCpus={3} "
|
|
||||||
"groups=0 mergedNodes=0 maxGroup=0 new={1} changed=0\n",
|
|
||||||
iteration,
|
|
||||||
oldNodeCount,
|
|
||||||
selectedNodes.size(),
|
|
||||||
windowSchedule.cpuCount);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
VirtualGraph coarsenedGraph;
|
|
||||||
std::vector<size_t> oldToNewNode;
|
|
||||||
if (!coarsenGraph(virtualGraph, windowSchedule.mergeGroups, coarsenedGraph, oldToNewNode))
|
|
||||||
return false;
|
|
||||||
if (debugCoarsening && (oldNodeCount >= 200 || coarsenedGraph.nodes.size() >= 200))
|
|
||||||
llvm::errs() << llvm::formatv("[DCP-COARSEN] iter={0} old={1} selected={2} windowCpus={3} "
|
|
||||||
"groups={4} mergedNodes={5} maxGroup={6} new={7} changed={8}\n",
|
|
||||||
iteration,
|
|
||||||
oldNodeCount,
|
|
||||||
selectedNodes.size(),
|
|
||||||
windowSchedule.cpuCount,
|
|
||||||
windowSchedule.mergeGroups.size(),
|
|
||||||
windowSchedule.mergedNodeCount,
|
|
||||||
windowSchedule.maxMergeGroupSize,
|
|
||||||
coarsenedGraph.nodes.size(),
|
|
||||||
oldNodeCount - coarsenedGraph.nodes.size());
|
|
||||||
virtualGraph = std::move(coarsenedGraph);
|
|
||||||
return true;
|
|
||||||
};
|
|
||||||
|
|
||||||
while (virtualGraph.nodes.size() > 1) {
|
|
||||||
if (virtualGraph.nodes.size() <= getSchedulingCpuBudget(options)) {
|
|
||||||
if (debugCoarsening && virtualGraph.nodes.size() >= 200)
|
|
||||||
llvm::errs() << llvm::formatv(
|
|
||||||
"[DCP-COARSEN] iter={0} old={1} stop=cpu-budget\n", iteration, virtualGraph.nodes.size());
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
iteration++;
|
|
||||||
TimingInfo timing = computeTiming(virtualGraph);
|
|
||||||
if (!timing.valid) {
|
|
||||||
if (debugCoarsening && virtualGraph.nodes.size() >= 200)
|
|
||||||
llvm::errs() << llvm::formatv(
|
|
||||||
"[DCP-COARSEN] iter={0} old={1} invalid-timing\n", iteration, virtualGraph.nodes.size());
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
llvm::SmallVector<size_t> selectedNodes;
|
|
||||||
auto criticalWindow =
|
|
||||||
selectCriticalWindow(virtualGraph, timing, getDcpCoarseningWindowSize(virtualGraph.nodes.size(), options));
|
|
||||||
selectedNodes.append(criticalWindow.begin(), criticalWindow.end());
|
|
||||||
|
|
||||||
if (selectedNodes.size() < 2) {
|
|
||||||
if (debugCoarsening && virtualGraph.nodes.size() >= 200)
|
|
||||||
llvm::errs() << llvm::formatv("[DCP-COARSEN] iter={0} old={1} selected={2} stop=small-window\n",
|
|
||||||
iteration,
|
|
||||||
virtualGraph.nodes.size(),
|
|
||||||
selectedNodes.size());
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (tryCoarsenSelectedNodes(selectedNodes))
|
|
||||||
continue;
|
|
||||||
if (debugCoarsening && virtualGraph.nodes.size() >= 200)
|
|
||||||
llvm::errs() << llvm::formatv(
|
|
||||||
"[DCP-COARSEN] iter={0} old={1} stop=no-merge\n", iteration, virtualGraph.nodes.size());
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
return buildResultFromVirtualGraph(virtualGraph, graph);
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace spatial
|
|
||||||
} // namespace onnx_mlir
|
|
||||||
@@ -1,21 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "mlir/IR/MLIRContext.h"
|
|
||||||
|
|
||||||
#include "ComputeGraph.hpp"
|
|
||||||
#include "MergeSchedule.hpp"
|
|
||||||
|
|
||||||
namespace onnx_mlir {
|
|
||||||
namespace spatial {
|
|
||||||
|
|
||||||
struct DcpScheduleOptions {
|
|
||||||
size_t processorCount = 0;
|
|
||||||
size_t criticalWindowSize = 0;
|
|
||||||
bool allowFallbackForAutoCoreCount = true;
|
|
||||||
};
|
|
||||||
|
|
||||||
MergeScheduleResult
|
|
||||||
runDcpScheduler(const ComputeGraph& graph, const DcpScheduleOptions& options, mlir::MLIRContext* context);
|
|
||||||
|
|
||||||
} // namespace spatial
|
|
||||||
} // namespace onnx_mlir
|
|
||||||
+5
-16
@@ -6,9 +6,7 @@
|
|||||||
#include <limits>
|
#include <limits>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "../DCPGraph/DCPAnalysis.hpp"
|
|
||||||
#include "ComputeGraph.hpp"
|
#include "ComputeGraph.hpp"
|
||||||
#include "DcpScheduler.hpp"
|
|
||||||
#include "MergeSchedulingAnalysis.hpp"
|
#include "MergeSchedulingAnalysis.hpp"
|
||||||
#include "PeftScheduler.hpp"
|
#include "PeftScheduler.hpp"
|
||||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
||||||
@@ -21,12 +19,11 @@ namespace {
|
|||||||
MergeSchedulerKind getSchedulerKind() {
|
MergeSchedulerKind getSchedulerKind() {
|
||||||
switch (pimMergeScheduler.getValue()) {
|
switch (pimMergeScheduler.getValue()) {
|
||||||
case MergeSchedulerPeft: return MergeSchedulerKind::Peft;
|
case MergeSchedulerPeft: return MergeSchedulerKind::Peft;
|
||||||
case MergeSchedulerDcp: return MergeSchedulerKind::Dcp;
|
|
||||||
}
|
}
|
||||||
llvm_unreachable("unknown merge scheduler kind");
|
llvm_unreachable("unknown merge scheduler kind");
|
||||||
}
|
}
|
||||||
|
|
||||||
void verifySchedule(const ComputeGraph& graph, const MergeScheduleResult& result, CrossbarUsage crossbarCapacity) {
|
void verifySchedule(const ComputeGraph& graph, const MergeScheduleResult& result, unsigned long crossbarCapacity) {
|
||||||
llvm::DenseMap<size_t, std::vector<std::pair<size_t, size_t>>> tasksByCpu;
|
llvm::DenseMap<size_t, std::vector<std::pair<size_t, size_t>>> tasksByCpu;
|
||||||
tasksByCpu.reserve(result.cpuToLastComputeMap.size());
|
tasksByCpu.reserve(result.cpuToLastComputeMap.size());
|
||||||
|
|
||||||
@@ -51,11 +48,11 @@ void verifySchedule(const ComputeGraph& graph, const MergeScheduleResult& result
|
|||||||
return lhs.second < rhs.second;
|
return lhs.second < rhs.second;
|
||||||
});
|
});
|
||||||
|
|
||||||
CrossbarUsage usedCrossbars = 0;
|
unsigned int usedCrossbars = 0;
|
||||||
for (size_t slot = 0; slot < scheduledTasks.size(); ++slot) {
|
for (size_t slot = 0; slot < scheduledTasks.size(); ++slot) {
|
||||||
if (scheduledTasks[slot].first != slot)
|
if (scheduledTasks[slot].first != slot)
|
||||||
llvm::report_fatal_error("merge scheduling: CPU slots are not contiguous");
|
llvm::report_fatal_error("merge scheduling: CPU slots are not contiguous");
|
||||||
usedCrossbars = addOrMax(usedCrossbars, graph.nodes[scheduledTasks[slot].second].crossbarUsage);
|
usedCrossbars = addOrMax(usedCrossbars, graph.nodes[scheduledTasks[slot].second].crossbarUsage.size());
|
||||||
if (usedCrossbars > crossbarCapacity)
|
if (usedCrossbars > crossbarCapacity)
|
||||||
llvm::report_fatal_error("merge scheduling: CPU crossbar capacity exceeded");
|
llvm::report_fatal_error("merge scheduling: CPU crossbar capacity exceeded");
|
||||||
}
|
}
|
||||||
@@ -115,18 +112,10 @@ MergeScheduleResult MergeSchedulingAnalysis::run() {
|
|||||||
if (options.kind == MergeSchedulerKind::Peft) {
|
if (options.kind == MergeSchedulerKind::Peft) {
|
||||||
schedule = runPeftScheduler(graph,
|
schedule = runPeftScheduler(graph,
|
||||||
PeftScheduleOptions {options.processorCount,
|
PeftScheduleOptions {options.processorCount,
|
||||||
static_cast<CrossbarUsage>(crossbarCountInCore.getValue()),
|
static_cast<unsigned long>(crossbarCountInCore.getValue()),
|
||||||
entryOp->getContext()});
|
entryOp->getContext()});
|
||||||
}
|
}
|
||||||
else {
|
verifySchedule(graph, schedule, static_cast<unsigned long>(crossbarCountInCore.getValue()));
|
||||||
schedule = runDcpScheduler(graph,
|
|
||||||
DcpScheduleOptions {options.processorCount,
|
|
||||||
dcpCriticalWindowSize.getValue(),
|
|
||||||
options.allowDcpFallbackForAutoCoreCount},
|
|
||||||
entryOp->getContext());
|
|
||||||
}
|
|
||||||
|
|
||||||
verifySchedule(graph, schedule, static_cast<CrossbarUsage>(crossbarCountInCore.getValue()));
|
|
||||||
return schedule;
|
return schedule;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
-2
@@ -10,14 +10,12 @@ namespace onnx_mlir {
|
|||||||
namespace spatial {
|
namespace spatial {
|
||||||
|
|
||||||
enum class MergeSchedulerKind {
|
enum class MergeSchedulerKind {
|
||||||
Dcp,
|
|
||||||
Peft,
|
Peft,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct MergeSchedulingOptions {
|
struct MergeSchedulingOptions {
|
||||||
MergeSchedulerKind kind = MergeSchedulerKind::Peft;
|
MergeSchedulerKind kind = MergeSchedulerKind::Peft;
|
||||||
size_t processorCount = 0;
|
size_t processorCount = 0;
|
||||||
bool allowDcpFallbackForAutoCoreCount = true;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
class MergeSchedulingAnalysis {
|
class MergeSchedulingAnalysis {
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
#include "mlir/IR/Threading.h"
|
#include "mlir/IR/Threading.h"
|
||||||
|
|
||||||
#include "llvm/ADT/STLExtras.h"
|
#include "llvm/ADT/STLExtras.h"
|
||||||
|
#include "llvm/ADT/SmallSet.h"
|
||||||
#include "llvm/Support/ErrorHandling.h"
|
#include "llvm/Support/ErrorHandling.h"
|
||||||
#include "llvm/Support/FormatVariadic.h"
|
#include "llvm/Support/FormatVariadic.h"
|
||||||
|
|
||||||
@@ -161,7 +162,7 @@ MergeScheduleResult runPeftScheduler(const ComputeGraph& graph, const PeftSchedu
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector<char> scheduled(nodeCount, false);
|
std::vector<char> scheduled(nodeCount, false);
|
||||||
std::vector<CrossbarUsage> processorCrossbars(processorCount, 0);
|
std::vector<CrossbarUsage> processorCrossbars(processorCount, llvm::SmallPtrSet<mlir::Value, 6> {});
|
||||||
std::vector<ScheduledTask> schedules(nodeCount);
|
std::vector<ScheduledTask> schedules(nodeCount);
|
||||||
std::vector<std::vector<size_t>> tasksByProcessor(processorCount);
|
std::vector<std::vector<size_t>> tasksByProcessor(processorCount);
|
||||||
|
|
||||||
@@ -179,8 +180,14 @@ MergeScheduleResult runPeftScheduler(const ComputeGraph& graph, const PeftSchedu
|
|||||||
bool crossbarRejected = false;
|
bool crossbarRejected = false;
|
||||||
|
|
||||||
for (size_t processor = 0; processor < processorCount; ++processor) {
|
for (size_t processor = 0; processor < processorCount; ++processor) {
|
||||||
if (graph.nodes[task].crossbarUsage != 0
|
if (graph.nodes[task].crossbarUsage.size() != 0
|
||||||
&& addOrMax(processorCrossbars[processor], graph.nodes[task].crossbarUsage) > options.crossbarCapacity) {
|
&& !llvm::all_of(graph.nodes[task].crossbarUsage,
|
||||||
|
[&processorCrossbars, processor](mlir::Value nodeCrossbar) {
|
||||||
|
return llvm::is_contained(processorCrossbars[processor], nodeCrossbar);
|
||||||
|
})
|
||||||
|
&& addOrMax(processorCrossbars[processor].size(), graph.nodes[task].crossbarUsage.size())
|
||||||
|
> options.crossbarCapacity) {
|
||||||
|
|
||||||
crossbarRejected = true;
|
crossbarRejected = true;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -245,7 +252,7 @@ MergeScheduleResult runPeftScheduler(const ComputeGraph& graph, const PeftSchedu
|
|||||||
schedules[task] = {bestProcessor, bestEst, bestEft};
|
schedules[task] = {bestProcessor, bestEst, bestEft};
|
||||||
scheduled[task] = true;
|
scheduled[task] = true;
|
||||||
++scheduledCount;
|
++scheduledCount;
|
||||||
processorCrossbars[bestProcessor] = addOrMax(processorCrossbars[bestProcessor], graph.nodes[task].crossbarUsage);
|
processorCrossbars[bestProcessor].insert_range(graph.nodes[task].crossbarUsage);
|
||||||
|
|
||||||
// 3. CRITICAL FIX: Topological Append
|
// 3. CRITICAL FIX: Topological Append
|
||||||
// Because the readyQueue pops in strict topological order, simply pushing to the
|
// Because the readyQueue pops in strict topological order, simply pushing to the
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ namespace spatial {
|
|||||||
|
|
||||||
struct PeftScheduleOptions {
|
struct PeftScheduleOptions {
|
||||||
size_t processorCount = 0;
|
size_t processorCount = 0;
|
||||||
CrossbarUsage crossbarCapacity = 0;
|
unsigned long crossbarCapacity = 0;
|
||||||
mlir::MLIRContext* context = nullptr;
|
mlir::MLIRContext* context = nullptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,58 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "mlir/IR/BuiltinTypeInterfaces.h"
|
||||||
|
|
||||||
|
#include "llvm/Support/Casting.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cstdint>
|
||||||
|
#include <limits>
|
||||||
|
#include <list>
|
||||||
|
#include <type_traits>
|
||||||
|
#include <utility>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "src/Accelerators/PIM/Common/LabeledList.hpp"
|
||||||
|
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||||
|
|
||||||
|
using CPU = int;
|
||||||
|
using Weight = unsigned long long;
|
||||||
|
using Time = unsigned long long;
|
||||||
|
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
inline T checkedAdd(T lhs, T rhs) {
|
||||||
|
static_assert(std::is_unsigned_v<T>, "checkedAdd only supports unsigned types");
|
||||||
|
assert(lhs <= std::numeric_limits<T>::max() - rhs && "unsigned addition overflow");
|
||||||
|
return lhs + rhs;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
inline T checkedMultiply(T lhs, T rhs) {
|
||||||
|
static_assert(std::is_unsigned_v<T>, "checkedMultiply only supports unsigned types");
|
||||||
|
if (lhs == 0 || rhs == 0)
|
||||||
|
return 0;
|
||||||
|
assert(lhs <= std::numeric_limits<T>::max() / rhs && "unsigned multiplication overflow");
|
||||||
|
return lhs * rhs;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
inline T addOrMax(T lhs, T rhs) {
|
||||||
|
static_assert(std::is_unsigned_v<T>, "addOrMax only supports unsigned types");
|
||||||
|
if (lhs == std::numeric_limits<T>::max() || rhs == std::numeric_limits<T>::max())
|
||||||
|
return std::numeric_limits<T>::max();
|
||||||
|
return checkedAdd(lhs, rhs);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
inline T subtractOrZero(T lhs, T rhs) {
|
||||||
|
static_assert(std::is_unsigned_v<T>, "subtractOrZero only supports unsigned types");
|
||||||
|
if (lhs == std::numeric_limits<T>::max())
|
||||||
|
return lhs;
|
||||||
|
if (rhs == std::numeric_limits<T>::max() || lhs <= rhs)
|
||||||
|
return 0;
|
||||||
|
return lhs - rhs;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline Time slackOrZero(Time earliestStart, Time latestStart) { return subtractOrZero(latestStart, earliestStart); }
|
||||||
|
|
||||||
@@ -30,10 +30,3 @@ add_pim_unittest(LabeledListTest
|
|||||||
OMPimCommon
|
OMPimCommon
|
||||||
)
|
)
|
||||||
|
|
||||||
add_pim_unittest(DCPTest
|
|
||||||
DCPTest.cpp
|
|
||||||
|
|
||||||
LINK_LIBS PRIVATE
|
|
||||||
OMPimCommon
|
|
||||||
SpatialOps
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -1,531 +0,0 @@
|
|||||||
#include <algorithm>
|
|
||||||
#include <cassert>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <filesystem>
|
|
||||||
#include <fstream>
|
|
||||||
#include <initializer_list>
|
|
||||||
#include <iostream>
|
|
||||||
#include <limits>
|
|
||||||
#include <optional>
|
|
||||||
#include <unordered_map>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
|
||||||
#include "src/Accelerators/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/DCPGraph/Graph.hpp"
|
|
||||||
#include "src/Compiler/CompilerOptions.hpp"
|
|
||||||
|
|
||||||
namespace {
|
|
||||||
|
|
||||||
struct ExpectedScheduledTask {
|
|
||||||
size_t nodeIndex;
|
|
||||||
Time aest;
|
|
||||||
Time alst;
|
|
||||||
Weight weight;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ScheduledPlacement {
|
|
||||||
CPU cpu;
|
|
||||||
GraphDCP::ScheduledTaskInfo task;
|
|
||||||
};
|
|
||||||
|
|
||||||
std::filesystem::path getDcpTestOutputDir() { return std::filesystem::temp_directory_path() / "raptor-test-pim"; }
|
|
||||||
|
|
||||||
void configureDcpDotOutput() {
|
|
||||||
auto outputDir = getDcpTestOutputDir();
|
|
||||||
std::error_code errorCode;
|
|
||||||
std::filesystem::remove_all(outputDir, errorCode);
|
|
||||||
std::filesystem::create_directories(outputDir, errorCode);
|
|
||||||
assert(!errorCode);
|
|
||||||
onnx_mlir::outputBaseName = (outputDir / "DCPTest.mlir").string();
|
|
||||||
}
|
|
||||||
|
|
||||||
std::optional<std::filesystem::path> getLatestDcpDotFile() {
|
|
||||||
auto graphDir = getDcpTestOutputDir() / "dcp_graph";
|
|
||||||
if (!std::filesystem::exists(graphDir))
|
|
||||||
return std::nullopt;
|
|
||||||
|
|
||||||
std::optional<std::filesystem::path> latestDot;
|
|
||||||
for (const auto& entry : std::filesystem::directory_iterator(graphDir)) {
|
|
||||||
if (!entry.is_regular_file() || entry.path().extension() != ".dot")
|
|
||||||
continue;
|
|
||||||
if (!latestDot || entry.path().filename() > latestDot->filename())
|
|
||||||
latestDot = entry.path();
|
|
||||||
}
|
|
||||||
return latestDot;
|
|
||||||
}
|
|
||||||
|
|
||||||
void dumpDcpFailureArtifacts() {
|
|
||||||
auto latestDot = getLatestDcpDotFile();
|
|
||||||
if (!latestDot) {
|
|
||||||
std::cerr << "No DCP dot file was produced.\n";
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::cerr << "DCP dot file: " << latestDot->string() << '\n';
|
|
||||||
std::ifstream dotFile(*latestDot);
|
|
||||||
if (!dotFile.is_open()) {
|
|
||||||
std::cerr << "Failed to open DCP dot file.\n";
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::cerr << dotFile.rdbuf();
|
|
||||||
}
|
|
||||||
|
|
||||||
void printCpuSchedule(GraphDCP& graph, CPU cpu) {
|
|
||||||
auto actualTasks = graph.getScheduledTasks(cpu);
|
|
||||||
std::cerr << "CPU " << cpu << " actual schedule:\n";
|
|
||||||
for (const auto& task : actualTasks) {
|
|
||||||
std::cerr << " " << task.nodeIndex << ") aest: " << task.aest << " alst: " << task.alst
|
|
||||||
<< " weight: " << task.weight << '\n';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void printGraphSchedule(GraphDCP& graph) {
|
|
||||||
for (CPU cpu = 0; cpu < graph.cpuCount(); ++cpu)
|
|
||||||
printCpuSchedule(graph, cpu);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool checkScheduledTasks(GraphDCP& graph, CPU cpu, std::initializer_list<ExpectedScheduledTask> expectedTasks) {
|
|
||||||
auto actualTasks = graph.getScheduledTasks(cpu);
|
|
||||||
if (actualTasks.size() != expectedTasks.size()) {
|
|
||||||
printCpuSchedule(graph, cpu);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto expectedIt = expectedTasks.begin();
|
|
||||||
for (const auto& actualTask : actualTasks) {
|
|
||||||
if (actualTask.nodeIndex != expectedIt->nodeIndex || actualTask.aest != expectedIt->aest
|
|
||||||
|| actualTask.alst != expectedIt->alst || actualTask.weight != expectedIt->weight) {
|
|
||||||
printCpuSchedule(graph, cpu);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
++expectedIt;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::unordered_map<size_t, ScheduledPlacement> collectScheduledPlacements(GraphDCP& graph) {
|
|
||||||
std::unordered_map<size_t, ScheduledPlacement> scheduledPlacements;
|
|
||||||
for (CPU cpu = 0; cpu < graph.cpuCount(); ++cpu) {
|
|
||||||
for (const auto& task : graph.getScheduledTasks(cpu)) {
|
|
||||||
auto [it, inserted] = scheduledPlacements.emplace(task.nodeIndex, ScheduledPlacement {cpu, task});
|
|
||||||
assert(inserted && "task scheduled multiple times");
|
|
||||||
(void) it;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return scheduledPlacements;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool checkAllTasksScheduled(GraphDCP& graph, size_t expectedTaskCount) {
|
|
||||||
auto scheduledPlacements = collectScheduledPlacements(graph);
|
|
||||||
if (scheduledPlacements.size() != expectedTaskCount) {
|
|
||||||
std::cerr << "Expected " << expectedTaskCount << " scheduled tasks, got " << scheduledPlacements.size() << "\n";
|
|
||||||
printGraphSchedule(graph);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool checkCpuSchedulesDoNotOverlap(GraphDCP& graph) {
|
|
||||||
for (CPU cpu = 0; cpu < graph.cpuCount(); ++cpu) {
|
|
||||||
auto scheduledTasks = graph.getScheduledTasks(cpu);
|
|
||||||
Time previousCompletion = 0;
|
|
||||||
bool firstTask = true;
|
|
||||||
for (const auto& task : scheduledTasks) {
|
|
||||||
Time completion = addOrMax(task.aest, task.weight);
|
|
||||||
if (task.aest > task.alst) {
|
|
||||||
std::cerr << "Task " << task.nodeIndex << " on CPU " << cpu << " has aest > alst\n";
|
|
||||||
printCpuSchedule(graph, cpu);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (!firstTask && task.aest < previousCompletion) {
|
|
||||||
std::cerr << "CPU " << cpu << " has overlapping tasks\n";
|
|
||||||
printCpuSchedule(graph, cpu);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
previousCompletion = completion;
|
|
||||||
firstTask = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool checkDependencyConstraints(GraphDCP& graph, llvm::ArrayRef<IndexedEdge> edges) {
|
|
||||||
auto scheduledPlacements = collectScheduledPlacements(graph);
|
|
||||||
for (auto [parentIndex, childIndex, transferCost] : edges) {
|
|
||||||
const auto& parent = scheduledPlacements.at(parentIndex);
|
|
||||||
const auto& child = scheduledPlacements.at(childIndex);
|
|
||||||
Time requiredStart = addOrMax(parent.task.aest, parent.task.weight);
|
|
||||||
if (parent.cpu != child.cpu)
|
|
||||||
requiredStart = addOrMax(requiredStart, static_cast<Weight>(transferCost));
|
|
||||||
if (child.task.aest < requiredStart) {
|
|
||||||
std::cerr << "Dependency violation for edge " << parentIndex << " -> " << childIndex << '\n';
|
|
||||||
printGraphSchedule(graph);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
Time getMaxCompletion(GraphDCP& graph) {
|
|
||||||
Time maxCompletion = 0;
|
|
||||||
for (CPU cpu = 0; cpu < graph.cpuCount(); ++cpu)
|
|
||||||
for (const auto& task : graph.getScheduledTasks(cpu))
|
|
||||||
maxCompletion = std::max(maxCompletion, addOrMax(task.aest, task.weight));
|
|
||||||
return maxCompletion;
|
|
||||||
}
|
|
||||||
|
|
||||||
int testDCPGraphSingleNode() {
|
|
||||||
std::cout << "testDCPGraphSingleNode:" << std::endl;
|
|
||||||
configureDcpDotOutput();
|
|
||||||
|
|
||||||
const std::vector<Weight> nodeWeights = {15};
|
|
||||||
GraphDCP graph(nodeWeights, {});
|
|
||||||
graph.runDcp();
|
|
||||||
|
|
||||||
if (graph.cpuCount() != 1) {
|
|
||||||
std::cerr << "Expected exactly 1 CPU, got " << graph.cpuCount() << "\n";
|
|
||||||
dumpDcpFailureArtifacts();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
if (!checkScheduledTasks(graph,
|
|
||||||
0,
|
|
||||||
{
|
|
||||||
{0, 0, 0, 15},
|
|
||||||
})) {
|
|
||||||
dumpDcpFailureArtifacts();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int testDCPGraphLinearChain() {
|
|
||||||
std::cout << "testDCPGraphLinearChain:" << std::endl;
|
|
||||||
configureDcpDotOutput();
|
|
||||||
|
|
||||||
const std::vector<Weight> nodeWeights = {10, 20, 5};
|
|
||||||
const std::vector<IndexedEdge> edges = {
|
|
||||||
{0, 1, 7},
|
|
||||||
{1, 2, 9},
|
|
||||||
};
|
|
||||||
|
|
||||||
GraphDCP graph(nodeWeights, edges);
|
|
||||||
graph.runDcp();
|
|
||||||
|
|
||||||
if (graph.cpuCount() != 1) {
|
|
||||||
std::cerr << "Expected a linear chain to stay on one CPU, got " << graph.cpuCount() << "\n";
|
|
||||||
dumpDcpFailureArtifacts();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
if (!checkScheduledTasks(graph,
|
|
||||||
0,
|
|
||||||
{
|
|
||||||
{0, 0, 0, 10},
|
|
||||||
{1, 10, 10, 20},
|
|
||||||
{2, 30, 30, 5 },
|
|
||||||
})) {
|
|
||||||
dumpDcpFailureArtifacts();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
if (!checkCpuSchedulesDoNotOverlap(graph) || !checkDependencyConstraints(graph, edges)) {
|
|
||||||
dumpDcpFailureArtifacts();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int testDCPGraphFixture() {
|
|
||||||
std::cout << "testDCPGraphFixture:" << std::endl;
|
|
||||||
configureDcpDotOutput();
|
|
||||||
|
|
||||||
const std::vector<Weight> nodeWeights = {
|
|
||||||
80,
|
|
||||||
40,
|
|
||||||
40,
|
|
||||||
40,
|
|
||||||
40,
|
|
||||||
40,
|
|
||||||
60,
|
|
||||||
30,
|
|
||||||
30,
|
|
||||||
30,
|
|
||||||
30,
|
|
||||||
40,
|
|
||||||
20,
|
|
||||||
20,
|
|
||||||
20,
|
|
||||||
20,
|
|
||||||
10,
|
|
||||||
10,
|
|
||||||
};
|
|
||||||
const std::vector<IndexedEdge> edges = {
|
|
||||||
{0, 1, 3 },
|
|
||||||
{0, 1, 120},
|
|
||||||
{0, 2, 120},
|
|
||||||
{0, 3, 120},
|
|
||||||
{0, 4, 120},
|
|
||||||
{0, 5, 120},
|
|
||||||
{0, 6, 120},
|
|
||||||
{2, 6, 80 },
|
|
||||||
{2, 7, 80 },
|
|
||||||
{3, 8, 80 },
|
|
||||||
{4, 9, 80 },
|
|
||||||
{5, 10, 80 },
|
|
||||||
{6, 7, 120},
|
|
||||||
{6, 8, 120},
|
|
||||||
{6, 9, 120},
|
|
||||||
{6, 10, 120},
|
|
||||||
{6, 11, 120},
|
|
||||||
{8, 11, 80 },
|
|
||||||
{8, 12, 80 },
|
|
||||||
{9, 13, 80 },
|
|
||||||
{10, 14, 80 },
|
|
||||||
{11, 12, 120},
|
|
||||||
{11, 13, 120},
|
|
||||||
{11, 14, 120},
|
|
||||||
{11, 15, 120},
|
|
||||||
{13, 15, 80 },
|
|
||||||
{13, 16, 80 },
|
|
||||||
{14, 17, 80 },
|
|
||||||
{15, 16, 120},
|
|
||||||
{15, 17, 120},
|
|
||||||
};
|
|
||||||
|
|
||||||
GraphDCP graph(nodeWeights, {});
|
|
||||||
for (auto [parent, child, weight] : edges)
|
|
||||||
graph.makeEdge(parent, child, weight);
|
|
||||||
|
|
||||||
graph.runDcp();
|
|
||||||
if (graph.cpuCount() != 4) {
|
|
||||||
dumpDcpFailureArtifacts();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
if (!checkScheduledTasks(graph,
|
|
||||||
3,
|
|
||||||
{
|
|
||||||
{1, 200, 400, 40},
|
|
||||||
})) {
|
|
||||||
dumpDcpFailureArtifacts();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
if (!checkScheduledTasks(graph,
|
|
||||||
2,
|
|
||||||
{
|
|
||||||
{5, 200, 260, 40},
|
|
||||||
{10, 300, 300, 30},
|
|
||||||
})) {
|
|
||||||
dumpDcpFailureArtifacts();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
if (!checkScheduledTasks(graph,
|
|
||||||
1,
|
|
||||||
{
|
|
||||||
{4, 200, 210, 40},
|
|
||||||
{7, 300, 410, 30},
|
|
||||||
})) {
|
|
||||||
dumpDcpFailureArtifacts();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
if (!checkScheduledTasks(graph,
|
|
||||||
0,
|
|
||||||
{
|
|
||||||
{0, 0, 0, 80},
|
|
||||||
{2, 80, 80, 40},
|
|
||||||
{6, 120, 120, 60},
|
|
||||||
{3, 180, 200, 40},
|
|
||||||
{8, 220, 240, 30},
|
|
||||||
{11, 250, 270, 40},
|
|
||||||
{12, 290, 310, 20},
|
|
||||||
{9, 320, 330, 30},
|
|
||||||
{13, 350, 360, 20},
|
|
||||||
{15, 370, 380, 20},
|
|
||||||
{16, 390, 400, 10},
|
|
||||||
{14, 410, 410, 20},
|
|
||||||
{17, 430, 430, 10},
|
|
||||||
})) {
|
|
||||||
dumpDcpFailureArtifacts();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
if (!checkAllTasksScheduled(graph, nodeWeights.size()) || !checkCpuSchedulesDoNotOverlap(graph)
|
|
||||||
|| !checkDependencyConstraints(graph, edges)) {
|
|
||||||
dumpDcpFailureArtifacts();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int testDCPGraphMaxCPUs() {
|
|
||||||
std::cout << "testDCPGraphMaxCPUs:" << std::endl;
|
|
||||||
configureDcpDotOutput();
|
|
||||||
|
|
||||||
const std::vector<Weight> nodeWeights = {20, 10, 10, 10, 10, 10, 10};
|
|
||||||
const std::vector<IndexedEdge> edges = {
|
|
||||||
{0, 1, 0},
|
|
||||||
{0, 2, 0},
|
|
||||||
{0, 3, 0},
|
|
||||||
{0, 4, 0},
|
|
||||||
{0, 5, 0},
|
|
||||||
{0, 6, 0},
|
|
||||||
};
|
|
||||||
|
|
||||||
GraphDCP graph(nodeWeights, edges);
|
|
||||||
graph.setMaxCpuCount(2);
|
|
||||||
graph.runDcp();
|
|
||||||
|
|
||||||
if (graph.cpuCount() != 2) {
|
|
||||||
std::cerr << "Expected exactly 2 CPUs with maxCpuCount=2, got " << graph.cpuCount() << "\n";
|
|
||||||
dumpDcpFailureArtifacts();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
if (!checkAllTasksScheduled(graph, nodeWeights.size()) || !checkCpuSchedulesDoNotOverlap(graph)
|
|
||||||
|| !checkDependencyConstraints(graph, edges)) {
|
|
||||||
dumpDcpFailureArtifacts();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
if (getMaxCompletion(graph) > 50) {
|
|
||||||
std::cerr << "Expected makespan <= 50 under maxCpuCount=2, got " << getMaxCompletion(graph) << "\n";
|
|
||||||
dumpDcpFailureArtifacts();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int testDCPGraphSingleCpuCap() {
|
|
||||||
std::cout << "testDCPGraphSingleCpuCap:" << std::endl;
|
|
||||||
configureDcpDotOutput();
|
|
||||||
|
|
||||||
const std::vector<Weight> nodeWeights = {20, 10, 10, 10};
|
|
||||||
const std::vector<IndexedEdge> edges = {
|
|
||||||
{0, 1, 0},
|
|
||||||
{0, 2, 0},
|
|
||||||
{0, 3, 0},
|
|
||||||
};
|
|
||||||
|
|
||||||
GraphDCP graph(nodeWeights, edges);
|
|
||||||
graph.setMaxCpuCount(1);
|
|
||||||
graph.runDcp();
|
|
||||||
|
|
||||||
if (graph.cpuCount() != 1) {
|
|
||||||
std::cerr << "Expected exactly 1 CPU with maxCpuCount=1, got " << graph.cpuCount() << "\n";
|
|
||||||
dumpDcpFailureArtifacts();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
if (!checkAllTasksScheduled(graph, nodeWeights.size()) || !checkCpuSchedulesDoNotOverlap(graph)
|
|
||||||
|| !checkDependencyConstraints(graph, edges)) {
|
|
||||||
dumpDcpFailureArtifacts();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
if (getMaxCompletion(graph) != 50) {
|
|
||||||
std::cerr << "Expected makespan 50 under maxCpuCount=1, got " << getMaxCompletion(graph) << "\n";
|
|
||||||
dumpDcpFailureArtifacts();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int testDCPGraphDiamondDependencies() {
|
|
||||||
std::cout << "testDCPGraphDiamondDependencies:" << std::endl;
|
|
||||||
configureDcpDotOutput();
|
|
||||||
|
|
||||||
const std::vector<Weight> nodeWeights = {15, 10, 12, 20};
|
|
||||||
const std::vector<IndexedEdge> edges = {
|
|
||||||
{0, 1, 5},
|
|
||||||
{0, 2, 7},
|
|
||||||
{1, 3, 3},
|
|
||||||
{2, 3, 2},
|
|
||||||
};
|
|
||||||
|
|
||||||
GraphDCP graph(nodeWeights, edges);
|
|
||||||
graph.runDcp();
|
|
||||||
|
|
||||||
if (!checkAllTasksScheduled(graph, nodeWeights.size()) || !checkCpuSchedulesDoNotOverlap(graph)
|
|
||||||
|| !checkDependencyConstraints(graph, edges)) {
|
|
||||||
dumpDcpFailureArtifacts();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto scheduledPlacements = collectScheduledPlacements(graph);
|
|
||||||
const auto& sink = scheduledPlacements.at(3).task;
|
|
||||||
if (sink.aest < 27) {
|
|
||||||
std::cerr << "Expected sink node to start no earlier than the longest parent path, got " << sink.aest << "\n";
|
|
||||||
dumpDcpFailureArtifacts();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// crossbarSize=4, crossbarCount=2 => capacity = 4*4*2 = 32.
|
|
||||||
// Each task with crossbarUsage=1 needs footprint = 4*4 = 16, so at most 1 task
|
|
||||||
// can fit per CPU (16+16 = 32 >= capacity). The scheduler must open a fresh CPU
|
|
||||||
// for each task; all three end up on separate CPUs with their base weight.
|
|
||||||
int testDCPGraphCrossbarExhaustion() {
|
|
||||||
std::cout << "testDCPGraphCrossbarExhaustion:" << std::endl;
|
|
||||||
configureDcpDotOutput();
|
|
||||||
|
|
||||||
const size_t savedCrossbarSize = onnx_mlir::crossbarSize.getValue();
|
|
||||||
const size_t savedCrossbarCount = onnx_mlir::crossbarCountInCore.getValue();
|
|
||||||
onnx_mlir::crossbarSize = 4;
|
|
||||||
onnx_mlir::crossbarCountInCore = 2;
|
|
||||||
|
|
||||||
auto restoreCrossbarOptions = [&]() {
|
|
||||||
onnx_mlir::crossbarSize = savedCrossbarSize;
|
|
||||||
onnx_mlir::crossbarCountInCore = savedCrossbarCount;
|
|
||||||
};
|
|
||||||
|
|
||||||
const std::vector<Weight> nodeWeights = {10, 10, 10};
|
|
||||||
const std::vector<CrossbarUsage> nodeCrossbarUsage = {1, 1, 1};
|
|
||||||
GraphDCP graph(nodeWeights, {}, {},nodeCrossbarUsage);
|
|
||||||
graph.setMaxCpuCount(3);
|
|
||||||
graph.runDcp();
|
|
||||||
|
|
||||||
if (graph.cpuCount() != 3) {
|
|
||||||
restoreCrossbarOptions();
|
|
||||||
std::cerr << "Expected 3 CPUs (one per task due to crossbar limit), got " << graph.cpuCount() << "\n";
|
|
||||||
dumpDcpFailureArtifacts();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
int failures = 0;
|
|
||||||
for (CPU c = 0; c < 3; c++) {
|
|
||||||
auto scheduledTasks = graph.getScheduledTasks(c);
|
|
||||||
if (scheduledTasks.size() != 1) {
|
|
||||||
std::cerr << "Expected exactly 1 task on CPU " << c << ", got " << scheduledTasks.size() << "\n";
|
|
||||||
printCpuSchedule(graph, c);
|
|
||||||
failures++;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (scheduledTasks[0].weight != 10) {
|
|
||||||
std::cerr << "Expected weight=10 on CPU " << c << ", got " << scheduledTasks[0].weight << "\n";
|
|
||||||
printCpuSchedule(graph, c);
|
|
||||||
failures++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
restoreCrossbarOptions();
|
|
||||||
if (failures) dumpDcpFailureArtifacts();
|
|
||||||
return failures;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
int main(int argc, char* argv[]) {
|
|
||||||
(void) argc;
|
|
||||||
(void) argv;
|
|
||||||
|
|
||||||
int failures = 0;
|
|
||||||
failures += testDCPGraphSingleNode();
|
|
||||||
failures += testDCPGraphLinearChain();
|
|
||||||
failures += testDCPGraphFixture();
|
|
||||||
failures += testDCPGraphMaxCPUs();
|
|
||||||
failures += testDCPGraphSingleCpuCap();
|
|
||||||
failures += testDCPGraphDiamondDependencies();
|
|
||||||
failures += testDCPGraphCrossbarExhaustion();
|
|
||||||
if (failures != 0) {
|
|
||||||
std::cerr << failures << " test failures\n";
|
|
||||||
return EXIT_FAILURE;
|
|
||||||
}
|
|
||||||
return EXIT_SUCCESS;
|
|
||||||
}
|
|
||||||
@@ -65,7 +65,7 @@ def main():
|
|||||||
ap.add_argument("--crossbar-count", type=int, default=8)
|
ap.add_argument("--crossbar-count", type=int, default=8)
|
||||||
ap.add_argument("--core-count", type=int, default=None,
|
ap.add_argument("--core-count", type=int, default=None,
|
||||||
help="Core count to pass to Raptor. Required for PIM validation.")
|
help="Core count to pass to Raptor. Required for PIM validation.")
|
||||||
ap.add_argument("--pim-merge-scheduler", choices=("peft", "dcp"), default="peft",
|
ap.add_argument("--pim-merge-scheduler", choices=("peft"), default="peft",
|
||||||
help="Scheduler used by the Spatial merge-compute-nodes pass.")
|
help="Scheduler used by the Spatial merge-compute-nodes pass.")
|
||||||
ap.add_argument("--command-timeout-seconds", type=float, default=1000000.0,
|
ap.add_argument("--command-timeout-seconds", type=float, default=1000000.0,
|
||||||
help="Per-subprocess timeout in seconds for compiler, runner, and simulator commands.")
|
help="Per-subprocess timeout in seconds for compiler, runner, and simulator commands.")
|
||||||
|
|||||||
Reference in New Issue
Block a user