#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/AsmState.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Threading.h" #include "mlir/IR/Value.h" #include "mlir/IR/Verifier.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Format.h" #include "llvm/Support/JSON.h" #include "llvm/Support/raw_ostream.h" #include #include #include #include #include #include #include #include #include "Common/IR/CompactAsmUtils.hpp" #include "Common/PimCommon.hpp" #include "Common/Support/ReportUtils.hpp" #include "Conversion/ONNXToSpatial/Common/Common.hpp" #include "src/Accelerators/PIM/Common/IR/BatchCoreUtils.hpp" #include "src/Accelerators/PIM/Common/IR/WeightUtils.hpp" #include "src/Accelerators/PIM/Compiler/PimArtifactWriter.hpp" #include "src/Accelerators/PIM/Compiler/PimBinaryFormat.hpp" #include "src/Accelerators/PIM/Compiler/PimCodeGen.hpp" #include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp" #include "src/Accelerators/PIM/Compiler/PimWeightEmitter.hpp" #include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp" using namespace llvm; using namespace mlir; using namespace onnx_mlir; using namespace onnx_mlir::compact_asm; namespace { /* Keeps the lane only when it is set and the value has a defining op (of the queried type) that sits inside the queried parent op; every other case yields std::nullopt. */ static std::optional getLaneForMemoryValue(mlir::Value value, std::optional lane) { if (!lane) return std::nullopt; auto allocOp = value.getDefiningOp(); if (!allocOp || !allocOp->getParentOfType()) return std::nullopt; return lane; } /* Follows knowledge.aliases starting at value until a value without an alias entry is reached and returns that value. NOTE(review): no cycle guard is visible, so a cyclic alias map would never terminate - confirm callers cannot create one. */ static mlir::Value resolveCachedAlias(mlir::Value value, const StaticValueKnowledge& knowledge) { auto iter = knowledge.aliases.find(value); while (iter != knowledge.aliases.end()) { value = iter->second; 
iter = knowledge.aliases.find(value); } return value; } /* Builds the map key for value: the value itself paired with the lane as filtered by getLaneForMemoryValue. */ static MemoryValueKey getMemoryValueKey(mlir::Value value, std::optional lane = std::nullopt) { return {value, getLaneForMemoryValue(value, lane)}; } } // namespace /* Queues a MemEntry with address 0 and the byte size of the statically shaped type of value, and returns a pointer to the queued entry. NOTE(review): the pointer refers to storage inside memEntries, which allocateGatheredMemory clears, so it must not be used after that call. */ MemEntry* PimMemory::gatherMemEntry(mlir::Value value, std::optional lane) { auto type = cast(value.getType()); assert("Only static shape is supported" && type.hasStaticShape()); size_t allocSize = getShapedTypeSizeInBytes(type); MemEntry memEntry = {0, allocSize}; return &memEntries.emplace_back(memEntry, getMemoryValueKey(value, lane)).first; } /* Sorts the queued entries by descending size, assigns an address to each, then empties the queue. NOTE(review): the comparator takes both arguments by value, and llvm::sort presumably gives no stable order for equal sizes - confirm address assignment does not need to be reproducible across builds. */ void PimMemory::allocateGatheredMemory() { llvm::sort(memEntries, [](auto a, auto b) -> bool { return a.first.size > b.first.size; }); for (auto& [memEntry, key] : memEntries) allocateMemoryForValue(key, memEntry); memEntries.clear(); } /* Places memEntry at firstAvailableAddress, moves that cursor past the entry and rounds the cursor up to the next multiple of minAlignment, then stores a copy of the entry in both ownedMemEntriesMap and globalMemEntriesMap. */ void PimMemory::allocateMemoryForValue(const MemoryValueKey& key, MemEntry& memEntry) { memEntry.address = firstAvailableAddress; firstAvailableAddress += memEntry.size; // Alignment if (size_t remainder = firstAvailableAddress % minAlignment) firstAvailableAddress += minAlignment - remainder; ownedMemEntriesMap[key] = memEntry; globalMemEntriesMap[key] = memEntry; } /* Allocates host memory: queues every function argument, the first get_global result seen per global (skipping those flagged by hasWeightAlways) and every alloc that lacks the queried parent op, allocates the queue, and finally makes each recorded alias share the entry of its original. Globals whose name starts with arg are aliased to the function argument whose index is parsed from the name after its first four characters. NOTE(review): the result of llvm::to_integer is ignored (index stays 0 on a parse failure) and args[index] is not range checked. */ void PimMemory::allocateHost(ModuleOp moduleOp, func::FuncOp funcOp) { SmallDenseMap globalConstants; SmallVector, 16> globalAliases; SmallVector args; for (mlir::Value arg : funcOp.getArguments()) { gatherMemEntry(arg); args.push_back(arg); } funcOp.walk([&](memref::GetGlobalOp getGlobalOp) { if (!hasWeightAlways(getGlobalOp)) { auto globalMemrefOp = lookupGlobalForGetGlobal(moduleOp, getGlobalOp); if (globalMemrefOp.getName().starts_with("arg")) { StringRef indexStr = globalMemrefOp.getName().substr(4); int index = 0; llvm::to_integer(indexStr, index, 10); globalAliases.push_back({getGlobalOp.getResult(), args[index]}); } auto [iter, inserted] = globalConstants.try_emplace(globalMemrefOp, getGlobalOp.getResult()); if (inserted) gatherMemEntry(getGlobalOp.getResult()); else 
globalAliases.push_back({getGlobalOp.getResult(), iter->second}); } }); funcOp.walk([&](memref::AllocOp allocOp) { if (!allocOp->getParentOfType()) gatherMemEntry(allocOp.getResult()); }); allocateGatheredMemory(); for (auto [alias, original] : globalAliases) globalMemEntriesMap[getMemoryValueKey(alias)] = getMemEntry(getMemoryValueKey(original)); } /* Queues every memref alloc nested under op for the given lane and allocates the queue. */ void PimMemory::allocateCore(Operation* op, std::optional lane) { op->walk([&](memref::AllocOp allocOp) { gatherMemEntry(allocOp, lane); }); allocateGatheredMemory(); } /* Prints the global count and global byte size of row as flat report fields. */ static void printHostMemoryReportRow(raw_ostream& os, const MemoryReportRow& row) { llvm::SmallVector fields = { {"Number of globals", std::to_string(row.numGlobal) }, {"Global memory", formatReportMemory(row.sizeGlobal)} }; printReportFlatFields(os, fields); } /* Prints the alloca count and allocated byte size of entry.row as flat report fields. */ static void printCoreMemoryReportRow(raw_ostream& os, const MemoryReportEntry& entry) { llvm::SmallVector fields = { {"Number of allocas", std::to_string(entry.row.numAlloca) }, {"Allocated memory", formatReportMemory(entry.row.sizeAlloca)} }; printReportFlatFields(os, fields); } /* Prints the per-core alloca figures from entry.row together with the batch totals stored in entry. */ static void printBatchMemoryReportRow(raw_ostream& os, const MemoryReportEntry& entry) { llvm::SmallVector perCoreFields = { {"Number of allocas", std::to_string(entry.row.numAlloca) }, {"Allocated memory", formatReportMemory(entry.row.sizeAlloca)} }; llvm::SmallVector totalFields = { {"Number of allocas", std::to_string(entry.totalAllocaCount) }, {"Batch memory", formatReportMemory(entry.totalAllocaBytes)} }; printReportPerCoreAndTotalFields(os, perCoreFields, totalFields); } /* Returns the field-wise sum of lhs and rhs over the four counters numAlloca, sizeAlloca, numGlobal and sizeGlobal. */ static MemoryReportRow addMemoryReportRows(const MemoryReportRow& lhs, const MemoryReportRow& rhs) { MemoryReportRow result = lhs; result.numAlloca += rhs.numAlloca; result.sizeAlloca += rhs.sizeAlloca; result.numGlobal += rhs.numGlobal; result.sizeGlobal += rhs.sizeGlobal; return result; } /* Tallies ownedMemEntriesMap into the alloca and global counters according to the kind of op that defines each key value; values without a defining op are not counted. */ MemoryReportRow PimMemory::getReportRow() const { MemoryReportRow row; for (auto& [key, memEntry] : ownedMemEntriesMap) { if (auto op = 
key.value.getDefiningOp()) { if (isa(op)) { row.numAlloca++; row.sizeAlloca += memEntry.size; } if (isa(op)) { row.numGlobal++; row.sizeGlobal += memEntry.size; } } } return row; } /* Erases every entry whose key value equals val from both the owned and the global map; the lane part of the key is not compared. */ void PimMemory::remove(mlir::Value val) { for (auto it = ownedMemEntriesMap.begin(); it != ownedMemEntriesMap.end();) if (it->first.value == val) { auto eraseIt = it++; ownedMemEntriesMap.erase(eraseIt); } else ++it; for (auto it = globalMemEntriesMap.begin(); it != globalMemEntriesMap.end();) if (it->first.value == val) { auto eraseIt = it++; globalMemEntriesMap.erase(eraseIt); } else ++it; } /* Returns a copy of the entry stored for key in globalMemEntriesMap; asserts that the entry exists. */ MemEntry PimMemory::getMemEntry(const MemoryValueKey& key) const { auto iter = globalMemEntriesMap.find(key); assert("Missing memEntry for value" && iter != globalMemEntriesMap.end()); return iter->second; } /* Returns the PimMemory registered under id, constructing it from memEntriesMap when id is not present yet. */ PimMemory& PimAcceleratorMemory::getOrCreateDeviceMem(size_t id) { return deviceMem.try_emplace(id, memEntriesMap).first->second; } /* Resolves value through the cached aliases, compiles its contiguous address expression (cached per value in compiledAddressExprs), evaluates it for knowledge and lane, and returns the address of the base entry plus the byte offset. A compile failure, an evaluation failure or a missing entry prints diagnostics to errs and ends in llvm_unreachable. */ size_t PimAcceleratorMemory::getValueAddress(mlir::Value value, const StaticValueKnowledge& knowledge, std::optional lane) const { value = resolveCachedAlias(value, knowledge); auto compiledIt = compiledAddressExprs.find(value); if (compiledIt == compiledAddressExprs.end()) { auto compiledExpr = compileContiguousAddressExpr(value); if (failed(compiledExpr)) { errs() << "Failed to compile contiguous address for value: "; value.print(errs()); errs() << "\n"; llvm_unreachable("Failed to compile contiguous address"); } compiledIt = compiledAddressExprs.try_emplace(value, *compiledExpr).first; } auto resolvedAddress = compiledIt->second.evaluate(knowledge, lane); if (failed(resolvedAddress)) { errs() << "Failed to evaluate contiguous address for value: "; value.print(errs()); errs() << "\n"; if (auto* definingOp = value.getDefiningOp()) { errs() << "Defining op:\n"; definingOp->print(errs()); errs() << "\n"; } llvm_unreachable("Failed to resolve contiguous address"); } MemoryValueKey key = getMemoryValueKey(resolvedAddress->base, lane); auto iter = 
memEntriesMap.find(key); if (iter == memEntriesMap.end()) { errs() << "Missing mem entry for value: "; resolvedAddress->base.print(errs()); errs() << "\n"; if (key.lane) errs() << "Lane: " << *key.lane << "\n"; if (auto* definingOp = resolvedAddress->base.getDefiningOp()) { errs() << "Defining op:\n"; definingOp->print(errs()); errs() << "\n"; } llvm_unreachable("Missing mem entry"); } return iter->second.address + resolvedAddress->byteOffset; } /* Evaluates value as an index under knowledge after alias resolution, caching the compiled expression per value in compiledIndexExprs; returns failure when the expression cannot be compiled, otherwise whatever evaluate returns. */ llvm::FailureOr PimAcceleratorMemory::getIndexValue(mlir::Value value, const StaticValueKnowledge& knowledge) const { value = resolveCachedAlias(value, knowledge); auto compiledIt = compiledIndexExprs.find(value); if (compiledIt == compiledIndexExprs.end()) { auto compiledExpr = compileIndexExpr(value); if (failed(compiledExpr)) return mlir::failure(); compiledIt = compiledIndexExprs.try_emplace(value, *compiledExpr).first; } return compiledIt->second.evaluate(knowledge); } /* Stores the current report row of hostMem in hostReportRow. */ void PimAcceleratorMemory::reportHost() { hostReportRow = hostMem.getReportRow(); } /* Appends a Core report entry for coreId; its totals are the alloca count and size of row. */ void PimAcceleratorMemory::recordCoreReport(size_t coreId, const MemoryReportRow& row) { reportEntries.push_back( {MemoryReportEntry::Kind::Core, coreId, {static_cast(coreId)}, row, row.numAlloca, row.sizeAlloca}); } /* Appends a Batch report entry carrying batchId, a copy of coreIds, the per-core row and the given totals. */ void PimAcceleratorMemory::recordBatchReport(uint64_t batchId, ArrayRef coreIds, const MemoryReportRow& perCoreRow, uint64_t totalAllocaCount, uint64_t totalAllocaBytes) { MemoryReportEntry entry; entry.kind = MemoryReportEntry::Kind::Batch; entry.id = batchId; llvm::append_range(entry.coreIds, coreIds); entry.row = perCoreRow; entry.totalAllocaCount = totalAllocaCount; entry.totalAllocaBytes = totalAllocaBytes; reportEntries.push_back(std::move(entry)); } /* Writes the memory report to fileReport and closes it; returns immediately when the file is not open. Output order: totals block, host row (when recorded), then the report entries sorted by sortReportEntriesByFirstCore, where consecutive entries with equal kind, row and totals are printed as a single run. */ void PimAcceleratorMemory::flushReport() { if (!fileReport.is_open()) return; llvm::raw_os_ostream os(fileReport); uint64_t totalGlobalMemory = hostReportRow.has_value() ? 
hostReportRow->sizeGlobal : 0; uint64_t totalCoresMemory = 0; for (const MemoryReportEntry& entry : reportEntries) totalCoresMemory += entry.totalAllocaBytes; llvm::SmallVector totalFields = { {"Global memory", formatReportMemory(totalGlobalMemory)}, {"Cores memory", formatReportMemory(totalCoresMemory) } }; printReportTotalsBlock(os, totalFields); if (hostReportRow.has_value()) { os << "\nHost:\n"; printHostMemoryReportRow(os, *hostReportRow); } if (!reportEntries.empty()) { if (hostReportRow.has_value()) os << "\n"; sortReportEntriesByFirstCore(reportEntries); for (size_t index = 0; index < reportEntries.size();) { /* Extend the run while the following entries are indistinguishable in the printed figures. */ size_t runEnd = index + 1; while (runEnd < reportEntries.size() && reportEntries[runEnd].kind == reportEntries[index].kind && reportEntries[runEnd].row == reportEntries[index].row && reportEntries[runEnd].totalAllocaCount == reportEntries[index].totalAllocaCount && reportEntries[runEnd].totalAllocaBytes == reportEntries[index].totalAllocaBytes) { ++runEnd; } if (reportEntries[index].kind == MemoryReportEntry::Kind::Batch) { os << "Batch "; for (size_t batchIndex = index; batchIndex < runEnd; ++batchIndex) { if (batchIndex != index) os << ",\n "; os << reportEntries[batchIndex].id << " (cores "; printCompressedIntegerEntries(os, ArrayRef(reportEntries[batchIndex].coreIds)); os << ")"; } } else { llvm::SmallVector coreIds; for (size_t coreIndex = index; coreIndex < runEnd; ++coreIndex) coreIds.push_back(reportEntries[coreIndex].coreIds.front()); os << "Core "; printCompressedIntegerEntries(os, ArrayRef(coreIds)); } os << ":\n"; /* The figures are printed once per run, taken from its first entry. */ if (reportEntries[index].kind == MemoryReportEntry::Kind::Batch) printBatchMemoryReportRow(os, reportEntries[index]); else printCoreMemoryReportRow(os, reportEntries[index]); printReportEntrySeparator(os, runEnd < reportEntries.size()); index = runEnd; } } os.flush(); fileReport.close(); } /* Removes the memory entries of every result of op from the host memory and from each device memory. */ void PimAcceleratorMemory::clean(mlir::Operation* op) { for (auto value : op->getResults()) { hostMem.remove(value); for (auto& device : 
deviceMem) device.second.remove(value); } } /* Maps coreId through emittedCoreIds; asserts that a mapping exists. */ size_t PimCodeGen::remapCoreId(size_t coreId) const { auto it = emittedCoreIds.find(coreId); assert(it != emittedCoreIds.end() && "Missing emitted core id remapping"); return it->second; } /* Writes instruction to coreBinaryStream, increments emittedInstructionCount and, when coreJsonStream is set, appends the JSON form of the instruction followed by a comma. */ void PimCodeGen::emitInstruction(const pim_binary::InstructionRecord& instruction) const { pim_binary::writeInstructionRecord(coreBinaryStream, instruction); ++emittedInstructionCount; if (coreJsonStream) *coreJsonStream << json::Value(pim_binary::makeInstructionJson(instruction)) << ','; } /* Emits an sldi instruction with rd set to registerNumber and r2OrImm set to immediate (both narrowed by static_cast). */ void PimCodeGen::genSetRegisterImmediateUnsigned(size_t registerNumber, size_t immediate) const { pim_binary::InstructionRecord instruction; instruction.opcode = pim_binary::Opcode::sldi; instruction.rd = static_cast(registerNumber); instruction.r2OrImm = static_cast(immediate); emitInstruction(instruction); } /* Loads rdAddress + rdOffset into register 0. */ void PimCodeGen::setupRd(size_t rdAddress, size_t rdOffset) const { genSetRegisterImmediateUnsigned(0, rdAddress + rdOffset); } /* Loads rdAddress + rdOffset into register 0 and rs1Address + rs1Offset into register 1. */ void PimCodeGen::setupRdRs1(size_t rdAddress, size_t rdOffset, size_t rs1Address, size_t rs1Offset) const { genSetRegisterImmediateUnsigned(0, rdAddress + rdOffset); genSetRegisterImmediateUnsigned(1, rs1Address + rs1Offset); } /* Loads the destination, first source and second source addresses (each address plus its offset) into registers 0, 1 and 2. */ void PimCodeGen::setupRdRs1Rs2( size_t rdAddress, size_t rdOffset, size_t rs1Address, size_t rs1Offset, size_t rs2Address, size_t rs2Offset) const { genSetRegisterImmediateUnsigned(0, rdAddress + rdOffset); genSetRegisterImmediateUnsigned(1, rs1Address + rs1Offset); genSetRegisterImmediateUnsigned(2, rs2Address + rs2Offset); } /* Sets up registers 0 and 1 with the destination and source addresses and emits the opcode looked up from opName with size in generic3. sizeFieldName is accepted but not used by the body. */ void PimCodeGen::emitMemCopyOp(StringRef opName, size_t rdAddr, size_t rdOffset, size_t rs1Addr, size_t rs1Offset, size_t size, StringRef sizeFieldName) const { setupRdRs1(rdAddr, rdOffset, rs1Addr, rs1Offset); pim_binary::InstructionRecord instruction; instruction.opcode = pim_binary::opcodeFromString(opName); instruction.rd = 0; instruction.r1 = 1; instruction.generic1 = 0; instruction.generic2 = 0; instruction.generic3 = static_cast(size); (void) sizeFieldName; 
emitInstruction(instruction); } /* Loads bufferAddr into register 0 and emits the opcode looked up from opName with the remapped core id in r2OrImm and size in generic3. */ void PimCodeGen::emitCommunicationOp(StringRef opName, size_t bufferAddr, size_t coreId, size_t size) const { setupRd(bufferAddr, 0); pim_binary::InstructionRecord instruction; instruction.opcode = pim_binary::opcodeFromString(opName); instruction.rd = 0; instruction.r2OrImm = static_cast(remapCoreId(coreId)); instruction.generic1 = 0; instruction.generic2 = 0; instruction.generic3 = static_cast(size); emitInstruction(instruction); } /* Sets up registers 0 and 1 and emits mvmul with groupId in generic2. NOTE(review): r2OrImm is set to the literal 8 and its meaning is not visible in this file section - confirm against the instruction format. */ void PimCodeGen::emitMvmOp(size_t groupId, size_t rdAddr, size_t rdOffset, size_t rs1Addr, size_t rs1Offset) const { setupRdRs1(rdAddr, rdOffset, rs1Addr, rs1Offset); pim_binary::InstructionRecord instruction; instruction.opcode = pim_binary::Opcode::mvmul; instruction.rd = 0; instruction.r1 = 1; instruction.r2OrImm = 8; instruction.generic1 = 0; instruction.generic2 = static_cast(groupId); emitInstruction(instruction); } /* Emits an ld copy from the host source to the device target. Both offsets are resolved through indexOf and asserted to succeed; with asserts disabled a failed offset would be dereferenced unchecked. */ void PimCodeGen::codeGenLoadOp(pim::PimMemCopyHostToDevOp loadOp, const StaticValueKnowledge& knowledge) const { auto deviceTargetOffset = indexOf(loadOp.getDeviceTargetOffset(), knowledge); auto hostSourceOffset = indexOf(loadOp.getHostSourceOffset(), knowledge); assert(succeeded(deviceTargetOffset) && succeeded(hostSourceOffset) && "pim.memcp_hd offsets must be statically resolvable during codegen"); emitMemCopyOp("ld", addressOf(loadOp.getDeviceTarget(), knowledge), *deviceTargetOffset, addressOf(loadOp.getHostSource(), knowledge), *hostSourceOffset, loadOp.getSize()); } /* Emits an st copy from the device source to the host target. Both offsets are resolved through indexOf and asserted to succeed; with asserts disabled a failed offset would be dereferenced unchecked. */ void PimCodeGen::codeGenStoreOp(pim::PimMemCopyDevToHostOp storeOp, const StaticValueKnowledge& knowledge) const { auto hostTargetOffset = indexOf(storeOp.getHostTargetOffset(), knowledge); auto deviceSourceOffset = indexOf(storeOp.getDeviceSourceOffset(), knowledge); assert(succeeded(hostTargetOffset) && succeeded(deviceSourceOffset) && "pim.memcp_dh offsets must be statically resolvable during codegen"); emitMemCopyOp("st", addressOf(storeOp.getHostTarget(), knowledge), *hostTargetOffset, 
addressOf(storeOp.getDeviceSource(), knowledge), *deviceSourceOffset, storeOp.getSize()); } /* Emits an lmv copy using the target offset, source offset and size read directly from the op. */ void PimCodeGen::codeGenLmvOp(pim::PimMemCopyOp lmvOp, const StaticValueKnowledge& knowledge) const { emitMemCopyOp("lmv", addressOf(lmvOp.getTarget(), knowledge), lmvOp.getTargetOffset(), addressOf(lmvOp.getSource(), knowledge), lmvOp.getSourceOffset(), lmvOp.getSize(), "len"); } /* Emits recv into the output buffer; the source core id is resolved through indexOf and asserted to succeed. */ void PimCodeGen::codeGenReceiveOp(pim::PimReceiveOp receiveOp, const StaticValueKnowledge& knowledge) const { auto sourceCoreId = indexOf(receiveOp.getSourceCoreId(), knowledge); assert(succeeded(sourceCoreId) && "pim.receive source core id must be statically resolvable during codegen"); emitCommunicationOp("recv", addressOf(receiveOp.getOutputBuffer(), knowledge), *sourceCoreId, receiveOp.getSize()); } /* Emits send of the input buffer; the target core id is resolved through indexOf and asserted to succeed. */ void PimCodeGen::codeGenSendOp(pim::PimSendOp sendOp, const StaticValueKnowledge& knowledge) const { auto targetCoreId = indexOf(sendOp.getTargetCoreId(), knowledge); assert(succeeded(targetCoreId) && "pim.send target core id must be statically resolvable during codegen"); emitCommunicationOp("send", addressOf(sendOp.getInput(), knowledge), *targetCoreId, sendOp.getSize()); } /* Lowers concat to lmv copies. outerCount is the product of the output dims before axis and innerCount the product of the dims after it; for every input and every outer index one contiguous block of inputConcatDim * innerCount elements is copied to its position along axis in the output. Static shapes are asserted for the output and every input. */ void PimCodeGen::codeGenConcatOp(pim::PimConcatOp concatOp, const StaticValueKnowledge& knowledge) const { auto outputType = cast(concatOp.getOutputBuffer().getType()); assert(outputType.hasStaticShape() && "concat codegen requires static output shape"); int64_t axis = concatOp.getAxis(); ArrayRef outputShape = outputType.getShape(); size_t elementSize = getElementTypeSizeInBytes(outputType.getElementType()); size_t outputAddr = addressOf(concatOp.getOutputBuffer(), knowledge); size_t outerCount = 1; for (int64_t dim = 0; dim < axis; ++dim) outerCount *= static_cast(outputShape[dim]); size_t innerCount = 1; for (size_t dim = static_cast(axis) + 1; dim < outputShape.size(); ++dim) innerCount *= static_cast(outputShape[dim]); size_t outputConcatDim = static_cast(outputShape[axis]); size_t concatOffset = 0; for (mlir::Value input 
: concatOp.getInputs()) { auto inputType = cast(input.getType()); assert(inputType.hasStaticShape() && "concat codegen requires static input shapes"); size_t inputConcatDim = static_cast(inputType.getDimSize(axis)); size_t blockSizeInBytes = inputConcatDim * innerCount * elementSize; size_t inputAddr = addressOf(input, knowledge); for (size_t outerIndex = 0; outerIndex < outerCount; ++outerIndex) { size_t dstOffset = (outerIndex * outputConcatDim + concatOffset) * innerCount * elementSize; size_t srcOffset = outerIndex * inputConcatDim * innerCount * elementSize; emitMemCopyOp("lmv", outputAddr, dstOffset, inputAddr, srcOffset, blockSizeInBytes, "len"); } concatOffset += inputConcatDim; } } /* Emits an mvmul for slot mvmId from the op input to its output buffer, both at offset 0. transposeMatrix is not used by the body (see the TODO inside). */ template void PimCodeGen::codeGenMVMLikeOp(size_t mvmId, MVMTy mvmLikeOp, bool transposeMatrix, const StaticValueKnowledge& knowledge) { emitMvmOp(mvmId, addressOf(mvmLikeOp.getOutputBuffer(), knowledge), 0, addressOf(mvmLikeOp.getInput(), knowledge), 0); // TODO: save weights somewhere (if transposeMatrix=true, transpose the weight matrix) } /* Emits vvadd with registers 0, 1 and 2 holding the output, lhs and rhs addresses; generic3 carries the byte size of the lhs type. */ void PimCodeGen::codeGenVVAddOp(pim::PimVVAddOp vvaddOp, const StaticValueKnowledge& knowledge) const { auto outputBufferAddr = addressOf(vvaddOp.getOutputBuffer(), knowledge); auto lhsAddr = addressOf(vvaddOp.getLhs(), knowledge); auto rhsAddr = addressOf(vvaddOp.getRhs(), knowledge); setupRdRs1Rs2(outputBufferAddr, 0, lhsAddr, 0, rhsAddr, 0); pim_binary::InstructionRecord instruction; instruction.opcode = pim_binary::Opcode::vvadd; instruction.rd = 0; instruction.r1 = 1; instruction.r2OrImm = 2; instruction.generic3 = static_cast(getShapedTypeSizeInBytes(cast(vvaddOp.getLhs().getType()))); emitInstruction(instruction); } /* Emits vvsub with registers 0, 1 and 2 holding the output, lhs and rhs addresses; generic3 carries the byte size of the lhs type. */ void PimCodeGen::codeGenVVSubOp(pim::PimVVSubOp vvsubOp, const StaticValueKnowledge& knowledge) const { auto outputBufferAddr = addressOf(vvsubOp.getOutputBuffer(), knowledge); auto lhsAddr = addressOf(vvsubOp.getLhs(), knowledge); auto rhsAddr = addressOf(vvsubOp.getRhs(), knowledge); setupRdRs1Rs2(outputBufferAddr, 0, 
lhsAddr, 0, rhsAddr, 0); pim_binary::InstructionRecord instruction; instruction.opcode = pim_binary::Opcode::vvsub; instruction.rd = 0; instruction.r1 = 1; instruction.r2OrImm = 2; instruction.generic3 = static_cast(getShapedTypeSizeInBytes(cast(vvsubOp.getLhs().getType()))); emitInstruction(instruction); } /* Emits vvmul with registers 0, 1 and 2 holding the output, lhs and rhs addresses; generic3 carries the byte size of the lhs type. */ void PimCodeGen::codeGenVVMulOp(pim::PimVVMulOp vvmulOp, const StaticValueKnowledge& knowledge) const { auto outputBufferAddr = addressOf(vvmulOp.getOutputBuffer(), knowledge); auto lhsAddr = addressOf(vvmulOp.getLhs(), knowledge); auto rhsAddr = addressOf(vvmulOp.getRhs(), knowledge); setupRdRs1Rs2(outputBufferAddr, 0, lhsAddr, 0, rhsAddr, 0); pim_binary::InstructionRecord instruction; instruction.opcode = pim_binary::Opcode::vvmul; instruction.rd = 0; instruction.r1 = 1; instruction.r2OrImm = 2; instruction.generic3 = static_cast(getShapedTypeSizeInBytes(cast(vvmulOp.getLhs().getType()))); emitInstruction(instruction); } /* Emits vvmax with registers 0, 1 and 2 holding the output, lhs and rhs addresses; generic3 carries the byte size of the lhs type. */ void PimCodeGen::codeGenVVMaxOp(pim::PimVVMaxOp vvmaxOp, const StaticValueKnowledge& knowledge) const { auto outputBufferAddr = addressOf(vvmaxOp.getOutputBuffer(), knowledge); auto lhsAddr = addressOf(vvmaxOp.getLhs(), knowledge); auto rhsAddr = addressOf(vvmaxOp.getRhs(), knowledge); setupRdRs1Rs2(outputBufferAddr, 0, lhsAddr, 0, rhsAddr, 0); pim_binary::InstructionRecord instruction; instruction.opcode = pim_binary::Opcode::vvmax; instruction.rd = 0; instruction.r1 = 1; instruction.r2OrImm = 2; instruction.generic3 = static_cast(getShapedTypeSizeInBytes(cast(vvmaxOp.getLhs().getType()))); emitInstruction(instruction); } /* Emits vvdmul with registers 0, 1 and 2 holding the output, lhs and rhs addresses; generic3 carries the byte size of the lhs type. */ void PimCodeGen::codeGenVVDMulOp(pim::PimVVDMulOp vvdmulOp, const StaticValueKnowledge& knowledge) const { auto outputBufferAddr = addressOf(vvdmulOp.getOutputBuffer(), knowledge); auto lhsAddr = addressOf(vvdmulOp.getLhs(), knowledge); auto rhsAddr = addressOf(vvdmulOp.getRhs(), knowledge); setupRdRs1Rs2(outputBufferAddr, 0, lhsAddr, 0, rhsAddr, 0); pim_binary::InstructionRecord instruction; instruction.opcode = 
pim_binary::Opcode::vvdmul; instruction.rd = 0; instruction.r1 = 1; instruction.r2OrImm = 2; instruction.generic3 = static_cast(getShapedTypeSizeInBytes(cast(vvdmulOp.getLhs().getType()))); emitInstruction(instruction); } /* Emits vavg from the input to the output buffer with r2OrImm and generic1 both set to 1; generic3 carries the byte size of the input type. NOTE(review): the meaning of the two literal 1 values is not visible here. */ void PimCodeGen::codeGenVAvgOp(pim::PimVAvgOp vavgOp, const StaticValueKnowledge& knowledge) const { auto outputBufferAddr = addressOf(vavgOp.getOutputBuffer(), knowledge); auto inputAddr = addressOf(vavgOp.getInput(), knowledge); setupRdRs1(outputBufferAddr, 0, inputAddr, 0); pim_binary::InstructionRecord instruction; instruction.opcode = pim_binary::Opcode::vavg; instruction.rd = 0; instruction.r1 = 1; instruction.r2OrImm = 1; instruction.generic1 = 1; instruction.generic3 = static_cast(getShapedTypeSizeInBytes(cast(vavgOp.getInput().getType()))); emitInstruction(instruction); } /* Emits vrelu from the input to the output buffer; generic3 carries the byte size of the input type. */ void PimCodeGen::codeGenVReluOp(pim::PimVReluOp vreluOp, const StaticValueKnowledge& knowledge) const { auto outputBufferAddr = addressOf(vreluOp.getOutputBuffer(), knowledge); auto inputAddr = addressOf(vreluOp.getInput(), knowledge); setupRdRs1(outputBufferAddr, 0, inputAddr, 0); pim_binary::InstructionRecord instruction; instruction.opcode = pim_binary::Opcode::vrelu; instruction.rd = 0; instruction.r1 = 1; instruction.generic3 = static_cast(getShapedTypeSizeInBytes(cast(vreluOp.getInput().getType()))); emitInstruction(instruction); } /* Emits vtanh from the input to the output buffer; generic3 carries the byte size of the input type. */ void PimCodeGen::codeGenVTanhOp(pim::PimVTanhOp vtanhOp, const StaticValueKnowledge& knowledge) const { auto outputBufferAddr = addressOf(vtanhOp.getOutputBuffer(), knowledge); auto inputAddr = addressOf(vtanhOp.getInput(), knowledge); setupRdRs1(outputBufferAddr, 0, inputAddr, 0); pim_binary::InstructionRecord instruction; instruction.opcode = pim_binary::Opcode::vtanh; instruction.rd = 0; instruction.r1 = 1; instruction.generic3 = static_cast(getShapedTypeSizeInBytes(cast(vtanhOp.getInput().getType()))); emitInstruction(instruction); } /* Emits vsigm from the input to the output buffer; generic3 carries the byte size of the input type. */ void PimCodeGen::codeGenVSigmOp(pim::PimVSigmOp vsigmOp, const StaticValueKnowledge& knowledge) const 
{ auto outputBufferAddr = addressOf(vsigmOp.getOutputBuffer(), knowledge); auto inputAddr = addressOf(vsigmOp.getInput(), knowledge); setupRdRs1(outputBufferAddr, 0, inputAddr, 0); pim_binary::InstructionRecord instruction; instruction.opcode = pim_binary::Opcode::vsigm; instruction.rd = 0; instruction.r1 = 1; instruction.generic3 = static_cast(getShapedTypeSizeInBytes(cast(vsigmOp.getInput().getType()))); emitInstruction(instruction); } /* Emits vsoftmax from the input to the output buffer; generic3 carries the byte size of the input type. */ void PimCodeGen::codeGenVSoftmaxOp(pim::PimVSoftmaxOp vsoftmaxOp, const StaticValueKnowledge& knowledge) const { auto outputBufferAddr = addressOf(vsoftmaxOp.getOutputBuffer(), knowledge); auto inputAddr = addressOf(vsoftmaxOp.getInput(), knowledge); setupRdRs1(outputBufferAddr, 0, inputAddr, 0); pim_binary::InstructionRecord instruction; instruction.opcode = pim_binary::Opcode::vsoftmax; instruction.rd = 0; instruction.r1 = 1; instruction.generic3 = static_cast(getShapedTypeSizeInBytes(cast(vsoftmaxOp.getInput().getType()))); emitInstruction(instruction); } /* Emits nothing; the body is empty. */ void PimCodeGen::codeGetGlobalOp(memref::GetGlobalOp getGlobalOp, const StaticValueKnowledge& knowledge) const {} /* Lowers transpose to lmv copies: one bulk copy of the whole buffer when the permutation leaves the row-major position of every element unchanged, otherwise one lmv per element with the destination index computed from the permuted strides. */ void PimCodeGen::codeGenTransposeOp(pim::PimTransposeOp transposeOp, const StaticValueKnowledge& knowledge) const { auto srcAddr = addressOf(transposeOp.getInput(), knowledge); auto dstAddr = addressOf(transposeOp.getOutputBuffer(), knowledge); auto srcType = cast(transposeOp.getInput().getType()); auto srcShape = srcType.getShape(); size_t rank = srcShape.size(); size_t elementSize = getElementTypeSizeInBytes(srcType.getElementType()); size_t totalElements = srcType.getNumElements(); // Read permutation. Destination dim i corresponds to source dim perm[i]. 
SmallVector perm = map_to_vector(transposeOp.getPermutation().getAsRange(), [](auto attr) -> int64_t { return attr.getInt(); }); // Destination shape: dstShape[i] = srcShape[perm[i]] SmallVector dstShape(rank); for (size_t i = 0; i < rank; i++) dstShape[i] = srcShape[perm[i]]; // Row-major strides for source and destination SmallVector srcStrides(rank, 1); SmallVector dstStrides(rank, 1); for (int64_t i = rank - 2; i >= 0; i--) { srcStrides[i] = srcStrides[i + 1] * srcShape[i + 1]; dstStrides[i] = dstStrides[i + 1] * dstShape[i + 1]; } /* First pass: detect whether every element keeps its flat index, stopping at the first mismatch. */ bool storagePreserving = true; for (size_t srcFlat = 0; srcFlat < totalElements; srcFlat++) { SmallVector srcIdx(rank); size_t remaining = srcFlat; for (size_t d = 0; d < rank; d++) { srcIdx[d] = remaining / srcStrides[d]; remaining %= srcStrides[d]; } size_t dstFlat = 0; for (size_t d = 0; d < rank; d++) dstFlat += srcIdx[perm[d]] * dstStrides[d]; if (dstFlat != srcFlat) { storagePreserving = false; break; } } if (storagePreserving) { emitMemCopyOp("lmv", dstAddr, 0, srcAddr, 0, totalElements * elementSize, "len"); return; } // Emit element-by-element copy with transposed addressing for (size_t srcFlat = 0; srcFlat < totalElements; srcFlat++) { // Decompose flat source index into multi-dimensional index SmallVector srcIdx(rank); size_t remaining = srcFlat; for (size_t d = 0; d < rank; d++) { srcIdx[d] = remaining / srcStrides[d]; remaining %= srcStrides[d]; } // Compute flat destination index: dstIdx[d] = srcIdx[perm[d]] size_t dstFlat = 0; for (size_t d = 0; d < rank; d++) dstFlat += srcIdx[perm[d]] * dstStrides[d]; emitMemCopyOp("lmv", dstAddr, dstFlat * elementSize, srcAddr, srcFlat * elementSize, elementSize, "len"); } } /* Returns the larger of dimension 0 and dimension 1 of matrixShape. NOTE(review): the rank 2 or 4 check is done only through assert, so it disappears when NDEBUG is defined. */ size_t getMatrixSize(ShapedType matrixShape) { if (matrixShape.getRank() != 2 && matrixShape.getRank() != 4) assert(false && "Unsupported matrix shape"); return std::max(matrixShape.getDimSize(0), matrixShape.getDimSize(1)); } /* Formats size with a GB, MB, KB or Bytes suffix using truncating integer division. NOTE(review): the threshold comparisons are strict, so a size of exactly 1024 is printed as 1024 Bytes rather than 1 KB (likewise at the MB and GB thresholds) - confirm this is intended. */ std::string getMemorySizeAsString(size_t size) { if (size > 1024 * 1024 * 1024) return 
std::to_string(size / 1024 / 1024 / 1024) + " GB"; if (size > 1024 * 1024) return std::to_string(size / 1024 / 1024) + " MB"; if (size > 1024) return std::to_string(size / 1024) + " KB"; return std::to_string(size) + " Bytes"; } /* Collects, in block order, the ops of the first block of funcOp that match either of the two queried op types. */ static SmallVector collectTopLevelCoreLikeOps(func::FuncOp funcOp) { SmallVector coreLikeOps; for (Operation& op : funcOp.getBody().front()) if (dyn_cast(&op) || dyn_cast(&op)) coreLikeOps.push_back(&op); return coreLikeOps; } /* Result of emitting one core: a status code (CompilerSuccess by default), a memory report row and the list of used weights. */ struct CoreEmissionResult { OnnxMlirCompilerErrorCodes status = CompilerSuccess; MemoryReportRow reportRow; llvm::SmallVector usedWeights; }; /* Scoped override of map entries: bind records the previous state of a key (its old value, or absence) and then overwrites it; the destructor walks the records in reverse bind order, restoring old values and erasing keys that were absent. The helper holds a reference to the map, so the map must outlive it. */ template class ScopedMapBindings { using KeyTy = typename MapTy::key_type; using ValueTy = typename MapTy::mapped_type; MapTy& map; llvm::SmallVector>, 8> savedEntries; public: explicit ScopedMapBindings(MapTy& map) : map(map) {} void bind(const KeyTy& key, const ValueTy& value) { auto it = map.find(key); if (it == map.end()) savedEntries.emplace_back(key, std::nullopt); else savedEntries.emplace_back(key, it->second); map[key] = value; } ~ScopedMapBindings() { for (auto it = savedEntries.rbegin(); it != savedEntries.rend(); ++it) if (it->second) map[it->first] = *it->second; else map.erase(it->first); } }; /* Kinds of ops that the core emission plan can dispatch to a code generator. */ enum class CompiledCoreOpKind : uint8_t { Load, Store, Lmv, Receive, Send, Concat, Vmm, Transpose, VVAdd, VVSub, VVMul, VVMax, VVDMul, VAvg, VRelu, VTanh, VSigm, VSoftmax, GetGlobal }; /* One node of the emission plan: either a single op tagged with its opKind, or a loop with compiled lower bound, upper bound and step plus an owned nested plan in loopBody. */ struct CompiledCoreNode { enum class Kind : uint8_t { Op, Loop }; Kind kind = Kind::Op; Operation* op = nullptr; CompiledCoreOpKind opKind = CompiledCoreOpKind::Load; CompiledIndexExpr lowerBound; CompiledIndexExpr upperBound; CompiledIndexExpr step; std::unique_ptr> loopBody; }; /* Maps op to its CompiledCoreOpKind through a chain of isa checks; returns failure when none matches. */ static FailureOr classifyCompiledCoreOpKind(Operation& op) { if (isa(op)) return CompiledCoreOpKind::Load; if (isa(op)) return CompiledCoreOpKind::Store; if (isa(op)) return CompiledCoreOpKind::Lmv; if (isa(op)) return CompiledCoreOpKind::Receive; if (isa(op)) return CompiledCoreOpKind::Send; if (isa(op)) return 
CompiledCoreOpKind::Concat; if (isa(op)) return CompiledCoreOpKind::Vmm; if (isa(op)) return CompiledCoreOpKind::Transpose; if (isa(op)) return CompiledCoreOpKind::VVAdd; if (isa(op)) return CompiledCoreOpKind::VVSub; if (isa(op)) return CompiledCoreOpKind::VVMul; if (isa(op)) return CompiledCoreOpKind::VVMax; if (isa(op)) return CompiledCoreOpKind::VVDMul; if (isa(op)) return CompiledCoreOpKind::VAvg; if (isa(op)) return CompiledCoreOpKind::VRelu; if (isa(op)) return CompiledCoreOpKind::VTanh; if (isa(op)) return CompiledCoreOpKind::VSigm; if (isa(op)) return CompiledCoreOpKind::VSoftmax; if (isa(op)) return CompiledCoreOpKind::GetGlobal; return failure(); } /* Appends the emission plan for block to plan. Ops matched by the leading isa check or by isCoreStaticAddressOp are skipped, as are ops of the first dyn_cast type whose result compiles as an index expression. Loops become Loop nodes with compiled bounds and a recursively built body; every other op becomes an Op node. Emits a diagnostic and returns failure for a loop whose bounds cannot be compiled and for an op that cannot be classified. weightOwner is only forwarded to the recursive call. */ static LogicalResult compileCoreEmissionPlan(Block& block, Operation* weightOwner, llvm::SmallVectorImpl& plan) { for (Operation& op : block) { if (isa(op) || isCoreStaticAddressOp(&op)) continue; if (auto loadOp = dyn_cast(op)) { if (succeeded(compileIndexExpr(loadOp.getResult()))) continue; } if (auto forOp = dyn_cast(op)) { auto lowerBound = compileIndexExpr(forOp.getLowerBound()); auto upperBound = compileIndexExpr(forOp.getUpperBound()); auto step = compileIndexExpr(forOp.getStep()); if (failed(lowerBound) || failed(upperBound) || failed(step)) { forOp.emitOpError("requires statically evaluable scf.for bounds for PIM codegen"); return failure(); } CompiledCoreNode loopNode; loopNode.kind = CompiledCoreNode::Kind::Loop; loopNode.op = forOp.getOperation(); loopNode.lowerBound = *lowerBound; loopNode.upperBound = *upperBound; loopNode.step = *step; loopNode.loopBody = std::make_unique>(); if (failed(compileCoreEmissionPlan(forOp.getRegion().front(), weightOwner, *loopNode.loopBody))) return failure(); plan.push_back(std::move(loopNode)); continue; } auto opKind = classifyCompiledCoreOpKind(op); if (failed(opKind)) { InFlightDiagnostic diag = op.emitError() << "unsupported codegen for op '" << op.getName().getStringRef() << "'"; if (auto coreOp = op.getParentOfType()) diag << " inside pim.core " << 
coreOp.getCoreId(); else if (auto coreBatchOp = op.getParentOfType()) diag << " inside pim.core_batch with laneCount " << coreBatchOp.getLaneCount(); return failure(); } CompiledCoreNode opNode; opNode.kind = CompiledCoreNode::Kind::Op; opNode.op = &op; opNode.opKind = *opKind; plan.push_back(std::move(opNode)); } return success(); } /* Executes plan against coreCodeGen. Loop nodes are iterated from the evaluated lower bound to the upper bound with the evaluated step (a failed evaluation or a step that is not positive is an error); during each iteration the induction variable is bound in knowledge.indexValues and the region iter args are aliased to the current carried values, which are refreshed from the yield operands afterwards. Op nodes are dispatched by opKind and processedOperations is incremented once per dispatched op. batchLane and batchLaneCount are only forwarded to the recursive call here. */ static LogicalResult executeCompiledCorePlan( const llvm::SmallVectorImpl& plan, PimCodeGen& coreCodeGen, StaticValueKnowledge& knowledge, llvm::function_ref(pim::PimVMMOp, const StaticValueKnowledge&)> resolveWeightSlot, size_t& processedOperations, std::optional batchLane = std::nullopt, std::optional batchLaneCount = std::nullopt) { for (const CompiledCoreNode& node : plan) { if (node.kind == CompiledCoreNode::Kind::Loop) { auto lowerBound = node.lowerBound.evaluate(knowledge); auto upperBound = node.upperBound.evaluate(knowledge); auto step = node.step.evaluate(knowledge); auto forOp = cast(node.op); if (failed(lowerBound) || failed(upperBound) || failed(step) || *step <= 0) { forOp.emitOpError("requires statically evaluable scf.for bounds for PIM codegen"); return failure(); } llvm::SmallVector iterValues(forOp.getInitArgs().begin(), forOp.getInitArgs().end()); for (int64_t inductionValue = *lowerBound; inductionValue < *upperBound; inductionValue += *step) { /* The scoped bindings are undone at the end of each iteration. */ ScopedMapBindings indexBindings(knowledge.indexValues); ScopedMapBindings aliasBindings(knowledge.aliases); indexBindings.bind(forOp.getInductionVar(), inductionValue); for (auto [iterArg, iterValue] : llvm::zip_equal(forOp.getRegionIterArgs(), iterValues)) aliasBindings.bind(iterArg, iterValue); if (failed(executeCompiledCorePlan(*node.loopBody, coreCodeGen, knowledge, resolveWeightSlot, processedOperations, batchLane, batchLaneCount))) return failure(); auto yieldOp = cast(forOp.getRegion().front().getTerminator()); for (auto [index, yieldedValue] : llvm::enumerate(yieldOp.getOperands())) iterValues[index] = resolveLoopCarriedAlias(yieldedValue, knowledge); } continue; } 
switch (node.opKind) { case CompiledCoreOpKind::Load: coreCodeGen.codeGenLoadOp(cast(node.op), knowledge); break; case CompiledCoreOpKind::Store: coreCodeGen.codeGenStoreOp(cast(node.op), knowledge); break; case CompiledCoreOpKind::Lmv: coreCodeGen.codeGenLmvOp(cast(node.op), knowledge); break; case CompiledCoreOpKind::Receive: coreCodeGen.codeGenReceiveOp(cast(node.op), knowledge); break; case CompiledCoreOpKind::Send: coreCodeGen.codeGenSendOp(cast(node.op), knowledge); break; case CompiledCoreOpKind::Concat: coreCodeGen.codeGenConcatOp(cast(node.op), knowledge); break; case CompiledCoreOpKind::Vmm: if (auto weightSlot = resolveWeightSlot(cast(node.op), knowledge); succeeded(weightSlot)) coreCodeGen.codeGenMVMLikeOp(*weightSlot, cast(node.op), true, knowledge); else return failure(); break; case CompiledCoreOpKind::Transpose: coreCodeGen.codeGenTransposeOp(cast(node.op), knowledge); break; case CompiledCoreOpKind::VVAdd: coreCodeGen.codeGenVVAddOp(cast(node.op), knowledge); break; case CompiledCoreOpKind::VVSub: coreCodeGen.codeGenVVSubOp(cast(node.op), knowledge); break; case CompiledCoreOpKind::VVMul: coreCodeGen.codeGenVVMulOp(cast(node.op), knowledge); break; case CompiledCoreOpKind::VVMax: coreCodeGen.codeGenVVMaxOp(cast(node.op), knowledge); break; case CompiledCoreOpKind::VVDMul: coreCodeGen.codeGenVVDMulOp(cast(node.op), knowledge); break; case CompiledCoreOpKind::VAvg: coreCodeGen.codeGenVAvgOp(cast(node.op), knowledge); break; case CompiledCoreOpKind::VRelu: coreCodeGen.codeGenVReluOp(cast(node.op), knowledge); break; case CompiledCoreOpKind::VTanh: coreCodeGen.codeGenVTanhOp(cast(node.op), knowledge); break; case CompiledCoreOpKind::VSigm: coreCodeGen.codeGenVSigmOp(cast(node.op), knowledge); break; case CompiledCoreOpKind::VSoftmax: coreCodeGen.codeGenVSoftmaxOp(cast(node.op), knowledge); break; case CompiledCoreOpKind::GetGlobal: coreCodeGen.codeGetGlobalOp(cast(node.op), knowledge); break; } processedOperations++; } return success(); } static 
SmallDenseMap collectMaterializedHostGlobals(ModuleOp moduleOp, func::FuncOp funcOp, const PimAcceleratorMemory& memory) { SmallDenseMap materializedHostGlobals; funcOp.walk([&](memref::GetGlobalOp getGlobalOp) { if (hasWeightAlways(getGlobalOp)) return; auto targetGlobal = lookupGlobalForGetGlobal(moduleOp, getGlobalOp); if (!targetGlobal || materializedHostGlobals.contains(targetGlobal)) return; auto it = memory.memEntriesMap.find(getMemoryValueKey(getGlobalOp.getResult())); if (it != memory.memEntriesMap.end()) materializedHostGlobals[targetGlobal] = it->second; }); return materializedHostGlobals; } template static void aliasMaterializedHostGlobals(CoreLikeOpTy coreLikeOp, ModuleOp moduleOp, const SmallDenseMap& materializedHostGlobals, PimAcceleratorMemory& memory) { coreLikeOp.walk([&](memref::GetGlobalOp getGlobalOp) { MemoryValueKey key = getMemoryValueKey(getGlobalOp.getResult()); if (hasWeightAlways(getGlobalOp) || memory.memEntriesMap.contains(key)) return; auto targetGlobal = lookupGlobalForGetGlobal(moduleOp, getGlobalOp); if (!targetGlobal) return; auto it = materializedHostGlobals.find(targetGlobal); if (it != materializedHostGlobals.end()) memory.memEntriesMap[key] = it->second; }); } /// Dispatch all operations in a core region to the appropriate code generator. /// scf.for loops are statically unrolled via walkPimCoreBlock so that addressing is /// fully resolved before the JSON instructions are emitted. /// Returns the number of emitted instructions, or -1 on failure. 
static int64_t codeGenCoreOps(
    Block& block,
    PimCodeGen& coreCodeGen,
    const StaticValueKnowledge& initialKnowledge,
    Operation* weightOwner,
    llvm::function_ref(pim::PimVMMOp, const StaticValueKnowledge&)> resolveWeightSlot,
    std::optional batchLane = std::nullopt,
    std::optional batchLaneCount = std::nullopt) {
  llvm::SmallVector plan;
  if (failed(compileCoreEmissionPlan(block, weightOwner, plan)))
    return -1;

  size_t processedOperations = 0;
  // Work on a copy so the caller's knowledge is not modified by execution.
  StaticValueKnowledge knowledge = initialKnowledge;
  auto result = executeCompiledCorePlan(
      plan, coreCodeGen, knowledge, resolveWeightSlot, processedOperations, batchLane, batchLaneCount);
  return failed(result) ? -1 : static_cast(processedOperations);
}

/// Emits the PIM artifacts for the entry function of `moduleOp` into
/// `outputDirPath`.
///
/// Steps, in order:
///  1. Create the output directory (skipped when the path is empty).
///  2. Allocate and report host memory, then write the memory binary.
///  3. Assign each original core id an emitted id, numbered from 0 in order of
///     first appearance across the top-level core-like ops.
///  4. Build one emission job per core op and, for every batch op, one job per
///     distinct core id among its lanes.
///  5. Run all jobs through mlir::parallelFor; each writes `core_<id>.pim` and,
///     when pimEmitJson is set, `core_<id>.json`.
///  6. Populate the weight folder, link each core's weight files, record the
///     memory reports and write the config JSON.
///
/// \param moduleOp       Module containing the PIM entry function.
/// \param outputDirPath  Directory receiving all generated files.
/// \returns InvalidOutputFileAccess when a directory, file or link cannot be
///          created, CompilerFailure when the entry function is missing or
///          code generation fails, the error of writeMemoryBinary if it
///          reports one, and otherwise the result of writeConfigJson.
OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimCode(ModuleOp& moduleOp, std::string& outputDirPath) {
  if (!outputDirPath.empty()) {
    if (auto error = sys::fs::create_directory(outputDirPath)) {
      errs() << "Error creating output directory: " << outputDirPath << ": " << error.message() << '\n';
      return InvalidOutputFileAccess;
    }
  }

  auto entryFunc = getPimEntryFunc(moduleOp);
  if (failed(entryFunc))
    return CompilerFailure;
  auto funcOp = *entryFunc;

  PimAcceleratorMemory memory;
  memory.hostMem.allocateHost(moduleOp, funcOp);
  memory.reportHost();

  if (auto err = writeMemoryBinary(moduleOp, funcOp, memory, outputDirPath))
    return err;

  json::Object xbarsPerArrayGroup;
  size_t maxCoreId = 0;
  uint64_t nextBatchReportId = 0;

  SmallVector coreLikeOps = collectTopLevelCoreLikeOps(funcOp);
  SmallDenseMap materializedHostGlobals = collectMaterializedHostGlobals(moduleOp, funcOp, memory);

  // First pass: map every original core id to an emitted id, handing out
  // consecutive numbers in order of first appearance.
  llvm::DenseMap emittedCoreIds;
  size_t nextEmittedCoreId = 0;
  for (Operation* op : coreLikeOps) {
    if (auto coreOp = dyn_cast(op)) {
      size_t originalCoreId = static_cast(coreOp.getCoreId());
      if (!emittedCoreIds.contains(originalCoreId))
        emittedCoreIds[originalCoreId] = nextEmittedCoreId++;
      continue;
    }

    auto coreBatchOp = cast(op);
    auto batchCoreIds = getBatchCoreIds(coreBatchOp);
    // assumes batchCoreIds has at least getLaneCount() entries -- TODO confirm
    for (unsigned lane = 0; lane < static_cast(coreBatchOp.getLaneCount()); ++lane) {
      size_t originalCoreId = static_cast(batchCoreIds[lane]);
      if (!emittedCoreIds.contains(originalCoreId))
        emittedCoreIds[originalCoreId] = nextEmittedCoreId++;
    }
  }

  // Second pass: build the emission jobs. batchJobIndices keeps, per batch op,
  // the indices of the jobs created for it so they can be reported together.
  SmallVector jobs;
  SmallVector> batchJobIndices;
  for (Operation* op : coreLikeOps) {
    if (auto coreOp = dyn_cast(op)) {
      size_t originalCoreId = static_cast(coreOp.getCoreId());
      CoreEmissionJob job;
      job.coreLikeOp = coreOp;
      job.originalCoreId = originalCoreId;
      job.emittedCoreId = emittedCoreIds.lookup(originalCoreId);
      jobs.push_back(std::move(job));
      continue;
    }

    auto coreBatchOp = cast(op);
    auto batchCoreIds = getBatchCoreIds(coreBatchOp);
    // Group the lanes of this batch by the core they are assigned to.
    llvm::DenseMap> lanesByCoreId;
    for (unsigned lane = 0; lane < static_cast(coreBatchOp.getLaneCount()); ++lane)
      lanesByCoreId[static_cast(batchCoreIds[lane])].push_back(lane);

    SmallVector jobIndices;
    // Sort the core ids by emitted id so job order does not depend on the
    // iteration order of the map.
    SmallVector orderedOriginalCoreIds = llvm::to_vector(lanesByCoreId.keys());
    llvm::sort(orderedOriginalCoreIds, [&](size_t lhs, size_t rhs) {
      return emittedCoreIds.lookup(lhs) < emittedCoreIds.lookup(rhs);
    });
    for (size_t originalCoreId : orderedOriginalCoreIds) {
      CoreEmissionJob job;
      job.coreLikeOp = coreBatchOp;
      job.originalCoreId = originalCoreId;
      job.emittedCoreId = emittedCoreIds.lookup(originalCoreId);
      job.lanes = lanesByCoreId.lookup(originalCoreId);
      job.batchReportId = nextBatchReportId;
      jobIndices.push_back(jobs.size());
      jobs.push_back(std::move(job));
    }
    batchJobIndices.push_back(std::move(jobIndices));
    ++nextBatchReportId;
  }

  // Creates `core_<coreId>/` and links each weight file of the core into it as
  // `crossbar_<slot>.bin`; the slot indices are appended to `xbarsPerGroup`.
  auto linkCoreWeights = [&](size_t coreId,
                             ArrayRef weightFiles,
                             json::Array& xbarsPerGroup) -> OnnxMlirCompilerErrorCodes {
    auto coreWeightsDirPath = outputDirPath + "/core_" + std::to_string(coreId);
    // An already existing directory is not treated as an error.
    if (auto error = sys::fs::create_directory(coreWeightsDirPath); error && error != std::errc::file_exists) {
      errs() << "Error creating core directory: " << coreWeightsDirPath << ": " << error.message() << '\n';
      return InvalidOutputFileAccess;
    }

    for (auto [slot, fileName] : llvm::enumerate(weightFiles)) {
      xbarsPerGroup.push_back(static_cast(slot));
      std::string sourcePath = outputDirPath + "/weights/" + fileName;
      std::string targetPath = coreWeightsDirPath + "/crossbar_" + std::to_string(slot) + ".bin";
      // Drop any existing target first; the result of remove is ignored.
      sys::fs::remove(targetPath);
      if (auto error = sys::fs::create_link(sourcePath, targetPath)) {
        errs() << "Error creating link file: " << sourcePath << " to " << targetPath << "\nError:" << error.message()
               << '\n';
        return InvalidOutputFileAccess;
      }
    }
    return CompilerSuccess;
  };

  // Generates the code of one job. Each invocation uses its own
  // PimAcceleratorMemory and output streams; the captured state it reads
  // (memory.memEntriesMap, emittedCoreIds, materializedHostGlobals) is not
  // written by this lambda.
  // NOTE(review): the output path depends only on job.emittedCoreId, so two
  // jobs with the same emitted id would target the same file while running in
  // parallel -- confirm that core ids are unique across core-like ops.
  auto emitJob = [&](const CoreEmissionJob& job) -> CoreEmissionResult {
    CoreEmissionResult result;
    // The meaning of the `false` argument is not visible from here.
    PimAcceleratorMemory jobMemory(memory.memEntriesMap, false);
    llvm::SmallVector usedWeights;

    // Returns the position of the weight view in usedWeights, appending the
    // view when it has not been used by this job before.
    auto resolveWeightSlot = [&](pim::PimVMMOp vmmOp,
                                 const StaticValueKnowledge& knowledge) -> llvm::FailureOr {
      auto weightView = onnx_mlir::resolveWeightView(job.coreLikeOp, vmmOp.getWeight(), knowledge);
      if (failed(weightView)) {
        vmmOp.emitOpError("requires a statically resolvable dense global weight view during PIM codegen");
        return failure();
      }

      if (auto it = llvm::find(usedWeights, *weightView); it != usedWeights.end())
        return static_cast(std::distance(usedWeights.begin(), it));
      usedWeights.push_back(*weightView);
      return static_cast(usedWeights.size() - 1);
    };

    std::error_code errorCode;
    auto outputCorePath = outputDirPath + "/core_" + std::to_string(job.emittedCoreId) + ".pim";
    raw_fd_ostream coreBinaryStream(outputCorePath, errorCode, sys::fs::OF_None);
    if (errorCode) {
      errs() << "Error while opening core file `" << outputCorePath << "`: " << errorCode.message() << '\n';
      result.status = InvalidOutputFileAccess;
      return result;
    }

    // The JSON stream stays null unless JSON emission is enabled.
    std::unique_ptr coreJsonStream;
    if (pimEmitJson.getValue()) {
      std::string outputCoreJsonPath = outputDirPath + "/core_" + std::to_string(job.emittedCoreId) + ".json";
      errorCode = std::error_code();
      coreJsonStream = std::make_unique(outputCoreJsonPath, errorCode);
      if (errorCode) {
        errs() << "Error while opening core json file `" << outputCoreJsonPath << "`: " << errorCode.message()
               << '\n';
        result.status = InvalidOutputFileAccess;
        return result;
      }
      *coreJsonStream << '[';
    }

    pim_binary::writeHeader(coreBinaryStream);

    PimCodeGen coreCodeGen(jobMemory, coreBinaryStream, coreJsonStream.get(), emittedCoreIds);

    if (auto coreOp = dyn_cast(job.coreLikeOp)) {
      aliasMaterializedHostGlobals(coreOp, moduleOp, materializedHostGlobals, jobMemory);
      auto& deviceMemory = jobMemory.getOrCreateDeviceMem(job.emittedCoreId);
      deviceMemory.allocateCore(coreOp);

      int64_t processedOperations = codeGenCoreOps(
          coreOp.getBody().front(), coreCodeGen, StaticValueKnowledge {}, coreOp.getOperation(), resolveWeightSlot);
      if (processedOperations < 0) {
        result.status = CompilerFailure;
        return result;
      }
      assert(processedOperations > 0);
      result.reportRow = deviceMemory.getReportRow();
      result.usedWeights = std::move(usedWeights);
    }
    else {
      auto coreBatchOp = cast(job.coreLikeOp);
      aliasMaterializedHostGlobals(coreBatchOp, moduleOp, materializedHostGlobals, jobMemory);
      auto& deviceMemory = jobMemory.getOrCreateDeviceMem(job.emittedCoreId);
      // All lanes of the job go through the same code generator and therefore
      // into the same output streams, one lane after the other.
      for (unsigned lane : job.lanes) {
        // Seed the knowledge with the lane index and map each block input
        // argument to the corresponding batch operand.
        StaticValueKnowledge knowledge;
        knowledge.indexValues[coreBatchOp.getLaneArgument()] = lane;
        for (unsigned i = 0; i < coreBatchOp.getInputs().size(); ++i)
          knowledge.aliases[coreBatchOp.getInputArgument(i)] = coreBatchOp.getInputs()[i];

        deviceMemory.allocateCore(coreBatchOp, lane);
        coreCodeGen.setBatchLane(lane);
        int64_t processedOperations = codeGenCoreOps(coreBatchOp.getBody().front(),
                                                     coreCodeGen,
                                                     knowledge,
                                                     coreBatchOp.getOperation(),
                                                     resolveWeightSlot,
                                                     lane,
                                                     static_cast(coreBatchOp.getLaneCount()));
        if (processedOperations < 0) {
          result.status = CompilerFailure;
          return result;
        }
        assert(processedOperations > 0);
      }
      result.reportRow = deviceMemory.getReportRow();
      result.usedWeights = std::move(usedWeights);
    }

    pim_binary::patchInstructionCount(coreBinaryStream, coreCodeGen.getEmittedInstructionCount());
    coreBinaryStream.close();
    if (coreJsonStream) {
      // Step back one byte so that ']' replaces the last byte written.
      // NOTE(review): presumably that byte is a trailing separator written by
      // the code generator; if nothing was written after '[', this would
      // replace the opening bracket instead -- confirm.
      coreJsonStream->seek(coreJsonStream->tell() - 1);
      *coreJsonStream << ']';
      coreJsonStream->close();
    }

    return result;
  };

  // Each job writes only its own slot of jobResults.
  std::vector jobResults(jobs.size());
  mlir::parallelFor(
      moduleOp.getContext(), 0, jobs.size(), [&](size_t index) { jobResults[index] = emitJob(jobs[index]); });

  // Report the failure of the lowest-indexed failing job.
  for (size_t jobIndex = 0; jobIndex < jobs.size(); ++jobIndex)
    if (jobResults[jobIndex].status != CompilerSuccess)
      return jobResults[jobIndex].status;

  llvm::SmallVector weightRequests;
  weightRequests.reserve(jobs.size());
  for (size_t jobIndex = 0; jobIndex < jobs.size(); ++jobIndex) {
    WeightFileRequest request;
    request.coreId = jobs[jobIndex].emittedCoreId;
    request.weights = jobResults[jobIndex].usedWeights;
    weightRequests.push_back(std::move(request));
  }
  auto mapCoreWeightToFileName = createAndPopulateWeightFolder(weightRequests, outputDirPath);

  // Jobs that come from single core ops are linked and reported one by one;
  // batch jobs are skipped here and handled per batch below.
  for (size_t jobIndex = 0; jobIndex < jobs.size(); ++jobIndex) {
    const CoreEmissionJob& job = jobs[jobIndex];
    const CoreEmissionResult& result = jobResults[jobIndex];
    json::Array xbarsPerGroup;
    if (auto coreOp = dyn_cast(job.coreLikeOp)) {
      if (auto err = linkCoreWeights(job.emittedCoreId, mapCoreWeightToFileName[job.emittedCoreId], xbarsPerGroup))
        return err;
      xbarsPerArrayGroup["core" + std::to_string(job.emittedCoreId)] = std::move(xbarsPerGroup);
      memory.recordCoreReport(job.emittedCoreId, result.reportRow);
      continue;
    }
  }

  // One combined report per batch op: the row of the first job is passed as
  // the per-core row, and the rows of all jobs are accumulated into batchRow.
  for (const SmallVector& group : batchJobIndices) {
    SmallVector reportedCoreIds;
    MemoryReportRow batchRow;
    std::optional batchPerCoreRow;

    for (size_t jobIndex : group) {
      const CoreEmissionJob& job = jobs[jobIndex];
      const CoreEmissionResult& result = jobResults[jobIndex];
      json::Array xbarsPerGroup;
      if (auto err = linkCoreWeights(job.emittedCoreId, mapCoreWeightToFileName[job.emittedCoreId], xbarsPerGroup))
        return err;
      xbarsPerArrayGroup["core" + std::to_string(job.emittedCoreId)] = std::move(xbarsPerGroup);
      reportedCoreIds.push_back(static_cast(job.emittedCoreId));
      if (!batchPerCoreRow)
        batchPerCoreRow = result.reportRow;
      batchRow = addMemoryReportRows(batchRow, result.reportRow);
    }

    // NOTE(review): group.front() requires a non-empty group; a batch op with
    // a lane count of zero would produce an empty one -- confirm that this is
    // ruled out before this point.
    uint64_t batchReportId = jobs[group.front()].batchReportId.value_or(0);
    memory.recordBatchReport(batchReportId,
                             reportedCoreIds,
                             batchPerCoreRow.value_or(MemoryReportRow {}),
                             batchRow.numAlloca,
                             batchRow.sizeAlloca);
  }

  // Emitted ids are handed out consecutively from 0, so the largest one is the
  // count minus one (0 when no core was emitted).
  maxCoreId = nextEmittedCoreId == 0 ? 0 : nextEmittedCoreId - 1;
  memory.flushReport();
  return writeConfigJson(funcOp, memory, maxCoreId, std::move(xbarsPerArrayGroup), outputDirPath);
}