fix bufferization and weight emission after new gemm patterns
Validate Operations / validate-operations (push) Has been cancelled
Validate Operations / validate-operations (push) Has been cancelled
This commit is contained in:
@@ -814,7 +814,7 @@ static SmallVector<Operation*> collectTopLevelCoreLikeOps(func::FuncOp funcOp) {
|
||||
struct CoreEmissionResult {
|
||||
OnnxMlirCompilerErrorCodes status = CompilerSuccess;
|
||||
MemoryReportRow reportRow;
|
||||
llvm::SmallVector<unsigned, 8> usedWeightIndices;
|
||||
llvm::SmallVector<ResolvedWeightView, 8> usedWeights;
|
||||
};
|
||||
|
||||
template <typename MapTy>
|
||||
@@ -879,7 +879,6 @@ struct CompiledCoreNode {
|
||||
Kind kind = Kind::Op;
|
||||
Operation* op = nullptr;
|
||||
CompiledCoreOpKind opKind = CompiledCoreOpKind::Load;
|
||||
std::optional<unsigned> weightIndex;
|
||||
CompiledIndexExpr lowerBound;
|
||||
CompiledIndexExpr upperBound;
|
||||
CompiledIndexExpr step;
|
||||
@@ -978,12 +977,6 @@ static LogicalResult compileCoreEmissionPlan(Block& block,
|
||||
opNode.kind = CompiledCoreNode::Kind::Op;
|
||||
opNode.op = &op;
|
||||
opNode.opKind = *opKind;
|
||||
if (auto vmmOp = dyn_cast<pim::PimVMMOp>(op)) {
|
||||
auto weightIndex = onnx_mlir::resolveWeightIndex(weightOwner, vmmOp);
|
||||
if (!weightIndex)
|
||||
return failure();
|
||||
opNode.weightIndex = *weightIndex;
|
||||
}
|
||||
plan.push_back(std::move(opNode));
|
||||
}
|
||||
return success();
|
||||
@@ -992,6 +985,9 @@ static LogicalResult compileCoreEmissionPlan(Block& block,
|
||||
static LogicalResult executeCompiledCorePlan(const llvm::SmallVectorImpl<CompiledCoreNode>& plan,
|
||||
PimCodeGen& coreCodeGen,
|
||||
StaticValueKnowledge& knowledge,
|
||||
llvm::function_ref<llvm::FailureOr<unsigned>(pim::PimVMMOp,
|
||||
const StaticValueKnowledge&)>
|
||||
resolveWeightSlot,
|
||||
size_t& processedOperations,
|
||||
std::optional<unsigned> batchLane = std::nullopt,
|
||||
std::optional<unsigned> batchLaneCount = std::nullopt) {
|
||||
@@ -1015,7 +1011,7 @@ static LogicalResult executeCompiledCorePlan(const llvm::SmallVectorImpl<Compile
|
||||
aliasBindings.bind(iterArg, iterValue);
|
||||
|
||||
if (failed(executeCompiledCorePlan(
|
||||
*node.loopBody, coreCodeGen, knowledge, processedOperations, batchLane, batchLaneCount)))
|
||||
*node.loopBody, coreCodeGen, knowledge, resolveWeightSlot, processedOperations, batchLane, batchLaneCount)))
|
||||
return failure();
|
||||
|
||||
auto yieldOp = cast<mlir::scf::YieldOp>(forOp.getRegion().front().getTerminator());
|
||||
@@ -1048,9 +1044,10 @@ static LogicalResult executeCompiledCorePlan(const llvm::SmallVectorImpl<Compile
|
||||
coreCodeGen.codeGenConcatOp(cast<pim::PimConcatOp>(node.op), knowledge);
|
||||
break;
|
||||
case CompiledCoreOpKind::Vmm:
|
||||
assert(node.weightIndex && "compiled VMM op must have cached weight index");
|
||||
coreCodeGen.codeGenMVMLikeOp<pim::PimVMMOp>(
|
||||
*node.weightIndex, cast<pim::PimVMMOp>(node.op), true, knowledge);
|
||||
if (auto weightSlot = resolveWeightSlot(cast<pim::PimVMMOp>(node.op), knowledge); succeeded(weightSlot))
|
||||
coreCodeGen.codeGenMVMLikeOp<pim::PimVMMOp>(*weightSlot, cast<pim::PimVMMOp>(node.op), true, knowledge);
|
||||
else
|
||||
return failure();
|
||||
break;
|
||||
case CompiledCoreOpKind::Transpose:
|
||||
coreCodeGen.codeGenTransposeOp(cast<pim::PimTransposeOp>(node.op), knowledge);
|
||||
@@ -1138,6 +1135,9 @@ static int64_t codeGenCoreOps(Block& block,
|
||||
PimCodeGen& coreCodeGen,
|
||||
const StaticValueKnowledge& initialKnowledge,
|
||||
Operation* weightOwner,
|
||||
llvm::function_ref<llvm::FailureOr<unsigned>(pim::PimVMMOp,
|
||||
const StaticValueKnowledge&)>
|
||||
resolveWeightSlot,
|
||||
std::optional<unsigned> batchLane = std::nullopt,
|
||||
std::optional<unsigned> batchLaneCount = std::nullopt) {
|
||||
llvm::SmallVector<CompiledCoreNode, 32> plan;
|
||||
@@ -1146,7 +1146,8 @@ static int64_t codeGenCoreOps(Block& block,
|
||||
|
||||
size_t processedOperations = 0;
|
||||
StaticValueKnowledge knowledge = initialKnowledge;
|
||||
auto result = executeCompiledCorePlan(plan, coreCodeGen, knowledge, processedOperations, batchLane, batchLaneCount);
|
||||
auto result =
|
||||
executeCompiledCorePlan(plan, coreCodeGen, knowledge, resolveWeightSlot, processedOperations, batchLane, batchLaneCount);
|
||||
return failed(result) ? -1 : static_cast<int64_t>(processedOperations);
|
||||
}
|
||||
|
||||
@@ -1174,9 +1175,6 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimCode(ModuleOp& moduleOp, std::
|
||||
size_t maxCoreId = 0;
|
||||
uint64_t nextBatchReportId = 0;
|
||||
|
||||
// Create Weight Folder
|
||||
auto mapCoreWeightToFileName = createAndPopulateWeightFolder(funcOp, outputDirPath);
|
||||
|
||||
SmallVector<Operation*> coreLikeOps = collectTopLevelCoreLikeOps(funcOp);
|
||||
SmallDenseMap<memref::GlobalOp, MemEntry, 16> materializedHostGlobals =
|
||||
collectMaterializedHostGlobals(moduleOp, funcOp, memory);
|
||||
@@ -1238,11 +1236,8 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimCode(ModuleOp& moduleOp, std::
|
||||
++nextBatchReportId;
|
||||
}
|
||||
|
||||
auto linkCoreWeights = [&](size_t originalCoreId,
|
||||
size_t coreId,
|
||||
ArrayRef<unsigned> usedIndices,
|
||||
ValueRange weights,
|
||||
Operation* weightOwner,
|
||||
auto linkCoreWeights = [&](size_t coreId,
|
||||
ArrayRef<std::string> weightFiles,
|
||||
json::Array& xbarsPerGroup) -> OnnxMlirCompilerErrorCodes {
|
||||
auto coreWeightsDirPath = outputDirPath + "/core_" + std::to_string(coreId);
|
||||
if (auto error = sys::fs::create_directory(coreWeightsDirPath)) {
|
||||
@@ -1250,20 +1245,12 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimCode(ModuleOp& moduleOp, std::
|
||||
return InvalidOutputFileAccess;
|
||||
}
|
||||
|
||||
auto& mapWeightToFile = mapCoreWeightToFileName[originalCoreId];
|
||||
for (unsigned index : usedIndices) {
|
||||
if (index >= weights.size()) {
|
||||
weightOwner->emitWarning("Weight index " + std::to_string(index) + " is out of range");
|
||||
assert(index < weights.size() && "Weight index is out of range");
|
||||
}
|
||||
mlir::Value weight = weights[index];
|
||||
xbarsPerGroup.push_back(index);
|
||||
assert(mapWeightToFile.contains(weight) && "Weight was not materialized into a file!!");
|
||||
auto& fileName = mapWeightToFile[weight];
|
||||
for (auto [slot, fileName] : llvm::enumerate(weightFiles)) {
|
||||
xbarsPerGroup.push_back(static_cast<int64_t>(slot));
|
||||
if (auto error = sys::fs::create_link(outputDirPath + "/weights/" + fileName,
|
||||
coreWeightsDirPath + "/crossbar_" + std::to_string(index) + ".bin")) {
|
||||
coreWeightsDirPath + "/crossbar_" + std::to_string(slot) + ".bin")) {
|
||||
errs() << "Error creating link file: " << (outputDirPath + "/weights/" + fileName) << " to "
|
||||
<< (coreWeightsDirPath + "/crossbar_" + std::to_string(index) + ".bin")
|
||||
<< (coreWeightsDirPath + "/crossbar_" + std::to_string(slot) + ".bin")
|
||||
<< "\nError:" << error.message() << '\n';
|
||||
return InvalidOutputFileAccess;
|
||||
}
|
||||
@@ -1275,6 +1262,22 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimCode(ModuleOp& moduleOp, std::
|
||||
auto emitJob = [&](const CoreEmissionJob& job) -> CoreEmissionResult {
|
||||
CoreEmissionResult result;
|
||||
PimAcceleratorMemory jobMemory(memory.memEntriesMap, false);
|
||||
llvm::SmallVector<ResolvedWeightView, 8> usedWeights;
|
||||
|
||||
auto resolveWeightSlot = [&](pim::PimVMMOp vmmOp,
|
||||
const StaticValueKnowledge& knowledge) -> llvm::FailureOr<unsigned> {
|
||||
auto weightView = onnx_mlir::resolveWeightView(job.coreLikeOp, vmmOp.getWeight(), knowledge);
|
||||
if (failed(weightView)) {
|
||||
vmmOp.emitOpError("requires a statically resolvable dense global weight view during PIM codegen");
|
||||
return failure();
|
||||
}
|
||||
|
||||
if (auto it = llvm::find(usedWeights, *weightView); it != usedWeights.end())
|
||||
return static_cast<unsigned>(std::distance(usedWeights.begin(), it));
|
||||
|
||||
usedWeights.push_back(*weightView);
|
||||
return static_cast<unsigned>(usedWeights.size() - 1);
|
||||
};
|
||||
|
||||
std::error_code errorCode;
|
||||
auto outputCorePath = outputDirPath + "/core_" + std::to_string(job.emittedCoreId) + ".pim";
|
||||
@@ -1307,21 +1310,20 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimCode(ModuleOp& moduleOp, std::
|
||||
auto& deviceMemory = jobMemory.getOrCreateDeviceMem(job.emittedCoreId);
|
||||
deviceMemory.allocateCore(coreOp);
|
||||
|
||||
int64_t processedOperations =
|
||||
codeGenCoreOps(coreOp.getBody().front(), coreCodeGen, StaticValueKnowledge {}, coreOp.getOperation());
|
||||
int64_t processedOperations = codeGenCoreOps(
|
||||
coreOp.getBody().front(), coreCodeGen, StaticValueKnowledge {}, coreOp.getOperation(), resolveWeightSlot);
|
||||
if (processedOperations < 0) {
|
||||
result.status = CompilerFailure;
|
||||
return result;
|
||||
}
|
||||
assert(processedOperations > 0);
|
||||
result.reportRow = deviceMemory.getReportRow();
|
||||
result.usedWeightIndices = getUsedWeightIndices(coreOp);
|
||||
result.usedWeights = std::move(usedWeights);
|
||||
}
|
||||
else {
|
||||
auto coreBatchOp = cast<pim::PimCoreBatchOp>(job.coreLikeOp);
|
||||
aliasMaterializedHostGlobals(coreBatchOp, moduleOp, materializedHostGlobals, jobMemory);
|
||||
auto& deviceMemory = jobMemory.getOrCreateDeviceMem(job.emittedCoreId);
|
||||
result.usedWeightIndices = getUsedWeightIndices(coreBatchOp);
|
||||
|
||||
for (unsigned lane : job.lanes) {
|
||||
StaticValueKnowledge knowledge;
|
||||
@@ -1335,6 +1337,7 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimCode(ModuleOp& moduleOp, std::
|
||||
coreCodeGen,
|
||||
knowledge,
|
||||
coreBatchOp.getOperation(),
|
||||
resolveWeightSlot,
|
||||
lane,
|
||||
static_cast<unsigned>(coreBatchOp.getLaneCount()));
|
||||
if (processedOperations < 0) {
|
||||
@@ -1345,6 +1348,7 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimCode(ModuleOp& moduleOp, std::
|
||||
}
|
||||
|
||||
result.reportRow = deviceMemory.getReportRow();
|
||||
result.usedWeights = std::move(usedWeights);
|
||||
}
|
||||
|
||||
pim_binary::patchInstructionCount(coreBinaryStream, coreCodeGen.getEmittedInstructionCount());
|
||||
@@ -1368,14 +1372,23 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimCode(ModuleOp& moduleOp, std::
|
||||
if (jobResults[jobIndex].status != CompilerSuccess)
|
||||
return jobResults[jobIndex].status;
|
||||
|
||||
llvm::SmallVector<WeightFileRequest, 8> weightRequests;
|
||||
weightRequests.reserve(jobs.size());
|
||||
for (size_t jobIndex = 0; jobIndex < jobs.size(); ++jobIndex) {
|
||||
WeightFileRequest request;
|
||||
request.coreId = jobs[jobIndex].emittedCoreId;
|
||||
request.weights = jobResults[jobIndex].usedWeights;
|
||||
weightRequests.push_back(std::move(request));
|
||||
}
|
||||
auto mapCoreWeightToFileName = createAndPopulateWeightFolder(weightRequests, outputDirPath);
|
||||
|
||||
for (size_t jobIndex = 0; jobIndex < jobs.size(); ++jobIndex) {
|
||||
const CoreEmissionJob& job = jobs[jobIndex];
|
||||
const CoreEmissionResult& result = jobResults[jobIndex];
|
||||
json::Array xbarsPerGroup;
|
||||
|
||||
if (auto coreOp = dyn_cast<pim::PimCoreOp>(job.coreLikeOp)) {
|
||||
if (auto err = linkCoreWeights(
|
||||
job.originalCoreId, job.emittedCoreId, result.usedWeightIndices, coreOp.getWeights(), coreOp.getOperation(), xbarsPerGroup))
|
||||
if (auto err = linkCoreWeights(job.emittedCoreId, mapCoreWeightToFileName[job.emittedCoreId], xbarsPerGroup))
|
||||
return err;
|
||||
xbarsPerArrayGroup["core" + std::to_string(job.emittedCoreId)] = std::move(xbarsPerGroup);
|
||||
memory.recordCoreReport(job.emittedCoreId, result.reportRow);
|
||||
@@ -1391,14 +1404,8 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimCode(ModuleOp& moduleOp, std::
|
||||
for (size_t jobIndex : group) {
|
||||
const CoreEmissionJob& job = jobs[jobIndex];
|
||||
const CoreEmissionResult& result = jobResults[jobIndex];
|
||||
auto coreBatchOp = cast<pim::PimCoreBatchOp>(job.coreLikeOp);
|
||||
json::Array xbarsPerGroup;
|
||||
if (auto err = linkCoreWeights(job.originalCoreId,
|
||||
job.emittedCoreId,
|
||||
result.usedWeightIndices,
|
||||
coreBatchOp.getWeights(),
|
||||
coreBatchOp.getOperation(),
|
||||
xbarsPerGroup))
|
||||
if (auto err = linkCoreWeights(job.emittedCoreId, mapCoreWeightToFileName[job.emittedCoreId], xbarsPerGroup))
|
||||
return err;
|
||||
xbarsPerArrayGroup["core" + std::to_string(job.emittedCoreId)] = std::move(xbarsPerGroup);
|
||||
reportedCoreIds.push_back(static_cast<int32_t>(job.emittedCoreId));
|
||||
|
||||
Reference in New Issue
Block a user