#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/AsmState.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/IRMapping.h" #include "mlir/IR/Value.h" #include "mlir/IR/Verifier.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Format.h" #include "llvm/Support/JSON.h" #include "llvm/Support/raw_ostream.h" #include #include #include #include #include #include #include #include #include "Common/PimCommon.hpp" #include "Conversion/ONNXToSpatial/Common/Common.hpp" #include "src/Accelerators/PIM/Compiler/PimCodeGen.hpp" #include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp" #include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp" using namespace llvm; using namespace mlir; using namespace onnx_mlir; static size_t getValueSizeInBytes(mlir::Value value) { auto type = cast(value.getType()); return type.getNumElements() * type.getElementTypeBitWidth() / 8; } MemEntry* PimMemory::gatherMemEntry(mlir::Value value) { auto type = cast(value.getType()); assert("Only static shape is supported" && type.hasStaticShape()); size_t allocSize = type.getNumElements() * type.getElementType().getIntOrFloatBitWidth() / 8; MemEntry memEntry = {0, allocSize}; return &memEntries.emplace_back(memEntry, value).first; } void PimMemory::allocateGatheredMemory() { llvm::sort(memEntries, [](auto a, auto b) -> bool { return a.first.size > b.first.size; }); for (auto& [memEntry, value] : memEntries) allocateMemoryForValue(value, memEntry); } void PimMemory::allocateMemoryForValue(mlir::Value value, MemEntry& memEntry) { memEntry.address = firstAvailableAddress; firstAvailableAddress += memEntry.size; // Alignment if (size_t remainder = firstAvailableAddress % 
minAlignment) firstAvailableAddress += minAlignment - remainder; globalMemEntriesMap[value] = memEntry; } void PimMemory::allocateHost(ModuleOp moduleOp, func::FuncOp funcOp) { SmallDenseMap globalConstants; SmallVector, 16> globalAliases; SmallVector args; for (mlir::Value arg : funcOp.getArguments()) { gatherMemEntry(arg); args.push_back(arg); } funcOp.walk([&](memref::GetGlobalOp getGlobalOp) { if (!hasWeightAlways(getGlobalOp)) { auto globalMemrefOp = lookupGlobalForGetGlobal(moduleOp, getGlobalOp); if (globalMemrefOp.getName().starts_with("arg")) { StringRef indexStr = globalMemrefOp.getName().substr(4); int index = 0; llvm::to_integer(indexStr, index, 10); globalAliases.push_back({getGlobalOp.getResult(), args[index]}); } auto [iter, inserted] = globalConstants.try_emplace(globalMemrefOp, getGlobalOp.getResult()); if (inserted) gatherMemEntry(getGlobalOp.getResult()); else globalAliases.push_back({getGlobalOp.getResult(), iter->second}); } }); funcOp.walk([&](memref::AllocOp allocOp) { if (!allocOp->getParentOfType()) gatherMemEntry(allocOp.getResult()); }); allocateGatheredMemory(); for (auto [alias, original] : globalAliases) globalMemEntriesMap[alias] = getMemEntry(original); } void PimMemory::allocateCore(Operation* op) { op->walk([&](memref::AllocOp allocOp) { gatherMemEntry(allocOp); }); allocateGatheredMemory(); } std::string formatMemory(uint64_t bytes) { const char* units[] = {"B", "KB", "MB", "GB", "TB", "PB", "EB"}; int i = 0; double size = static_cast(bytes); while (size >= 1024 && i < 6) { size /= 1024; i++; } // Formats to 2 decimal places std::string out; llvm::raw_string_ostream rss(out); rss << llvm::format("%.2f ", size) << units[i]; return rss.str(); } void PimMemory::report(llvm::raw_ostream& file) { // Key: {OpName, Size}, Value: Vector of Addresses // This groups all "memref.alloc" of "1KB" together std::vector orderedList(globalMemEntriesMap.begin(), globalMemEntriesMap.end()); std::sort( orderedList.begin(), orderedList.end(), [](auto 
lft, auto rgt) { return lft.second.address < rgt.second.address; }); auto newEnd = std::unique(orderedList.begin(), orderedList.end(), [](auto lft, auto rgt) { return lft.second.address == rgt.second.address; }); orderedList.erase(newEnd, orderedList.end()); std::sort( orderedList.begin(), orderedList.end(), [](auto lft, auto rgt) { return lft.second.size < rgt.second.size; }); std::map, std::vector> groupedStats; for (auto& [value, memEntry] : orderedList) { std::string opName = "Unknown/BlockArg"; if (auto op = value.getDefiningOp()) opName = op->getName().getStringRef().str(); groupedStats[{opName, memEntry.size}].push_back(memEntry.address); } file << "--- Memory Usage Report ---\n"; uint64_t totalMemory = 0; for (auto const& [key, addresses] : groupedStats) { const std::string& opName = key.first; uint64_t size = key.second; file.indent(4) << "Type: " << opName << " [" << formatMemory(size) << "]\n"; file.indent(6) << "Count: " << addresses.size() << "\n"; file.indent(6) << "Total Memory: " << formatMemory(size * addresses.size()) << "\n"; totalMemory += size * addresses.size(); // Optional: Print address range or first/last address to keep it concise if (!addresses.empty()) { auto [min, max] = std::minmax_element(addresses.begin(), addresses.end()); file.indent(6) << "Range: " << llvm::format_hex(*min, 10) << " -> " << llvm::format_hex(*max, 10) << "\n"; } file << "\n"; file << "Total Core Memory: " << formatMemory(totalMemory) << "\n"; } } void PimMemory::remove(mlir::Value val) { if (auto removeIter = globalMemEntriesMap.find(val); removeIter != globalMemEntriesMap.end()) globalMemEntriesMap.erase(removeIter); } MemEntry PimMemory::getMemEntry(mlir::Value value) const { auto iter = globalMemEntriesMap.find(value); assert("Missing memEntry for value" && iter != globalMemEntriesMap.end()); return iter->second; } PimMemory& PimAcceleratorMemory::getOrCreateDeviceMem(size_t id) { return deviceMem.try_emplace(id, memEntriesMap).first->second; } size_t 
PimAcceleratorMemory::getValueAddress(mlir::Value value, const StaticValueKnowledge& knowledge) const { auto resolvedAddress = resolveContiguousAddress(value, knowledge); if (failed(resolvedAddress)) { errs() << "Failed to resolve contiguous address for value: "; value.print(errs()); errs() << "\n"; if (auto* definingOp = value.getDefiningOp()) { errs() << "Defining op:\n"; definingOp->print(errs()); errs() << "\n"; } llvm_unreachable("Failed to resolve contiguous address"); } auto iter = memEntriesMap.find(resolvedAddress->base); if (iter == memEntriesMap.end()) { errs() << "Missing mem entry for value: "; resolvedAddress->base.print(errs()); errs() << "\n"; if (auto* definingOp = resolvedAddress->base.getDefiningOp()) { errs() << "Defining op:\n"; definingOp->print(errs()); errs() << "\n"; } llvm_unreachable("Missing mem entry"); } return iter->second.address + resolvedAddress->byteOffset; } void PimAcceleratorMemory::reportHost() { llvm::raw_os_ostream os(fileReport); os << "Host Memory\n"; hostMem.report(os); os.flush(); } void PimAcceleratorMemory::reportCore(size_t coreId) { llvm::raw_os_ostream os(fileReport); os << "Core " << coreId << " Memory\n"; deviceMem.at(coreId).report(os); os.flush(); } void PimAcceleratorMemory::clean(mlir::Operation* op) { for (auto value : op->getResults()) { hostMem.remove(value); for (auto& device : deviceMem) device.second.remove(value); } } json::Object PimCodeGen::createEmptyOffset() { json::Object offset; offset["offset_select"] = 0; offset["offset_value"] = 0; return offset; } size_t PimCodeGen::remapCoreId(size_t coreId) const { auto it = emittedCoreIds.find(coreId); assert(it != emittedCoreIds.end() && "Missing emitted core id remapping"); return it->second; } static json::Object createRs1OnlyOffset() { json::Object offset; offset["offset_select"] = 1; offset["offset_value"] = 0; return offset; } void PimCodeGen::emitInstruction(json::Object instruction) const { coreFileStream << json::Value(std::move(instruction)) << 
','; } void PimCodeGen::genSetRegisterImmediateUnsigned(size_t registerNumber, size_t immediate) const { json::Object json; json["op"] = "sldi"; json["rd"] = registerNumber; json["imm"] = immediate; emitInstruction(std::move(json)); } void PimCodeGen::setupRd(size_t rdAddress, size_t rdOffset) const { genSetRegisterImmediateUnsigned(0, rdAddress + rdOffset); } void PimCodeGen::setupRdRs1(size_t rdAddress, size_t rdOffset, size_t rs1Address, size_t rs1Offset) const { genSetRegisterImmediateUnsigned(0, rdAddress + rdOffset); genSetRegisterImmediateUnsigned(1, rs1Address + rs1Offset); } void PimCodeGen::setupRdRs1Rs2( size_t rdAddress, size_t rdOffset, size_t rs1Address, size_t rs1Offset, size_t rs2Address, size_t rs2Offset) const { genSetRegisterImmediateUnsigned(0, rdAddress + rdOffset); genSetRegisterImmediateUnsigned(1, rs1Address + rs1Offset); genSetRegisterImmediateUnsigned(2, rs2Address + rs2Offset); } void PimCodeGen::emitMemCopyOp(StringRef opName, size_t rdAddr, size_t rdOffset, size_t rs1Addr, size_t rs1Offset, size_t size, StringRef sizeFieldName) const { setupRdRs1(rdAddr, rdOffset, rs1Addr, rs1Offset); json::Object json; json["op"] = opName; json["rd"] = 0; json["rs1"] = 1; json[sizeFieldName] = size; json["offset"] = createEmptyOffset(); emitInstruction(std::move(json)); } void PimCodeGen::emitCommunicationOp(StringRef opName, size_t bufferAddr, size_t coreId, size_t size) const { setupRd(bufferAddr, 0); json::Object json; json["op"] = opName; json["rd"] = 0; json["core"] = remapCoreId(coreId); json["size"] = size; json["offset"] = createEmptyOffset(); emitInstruction(std::move(json)); } void PimCodeGen::emitMvmOp(size_t groupId, size_t rdAddr, size_t rdOffset, size_t rs1Addr, size_t rs1Offset) const { setupRdRs1(rdAddr, rdOffset, rs1Addr, rs1Offset); json::Object json; json["op"] = "mvmul"; json["rd"] = 0; json["rs1"] = 1; json["group"] = groupId; json["relu"] = 0; json["mbiw"] = 8; emitInstruction(std::move(json)); } void 
PimCodeGen::codeGenLoadOp(pim::PimMemCopyHostToDevOp loadOp, const StaticValueKnowledge& knowledge) const { emitMemCopyOp("ld", addressOf(loadOp.getDeviceTarget(), knowledge), loadOp.getDeviceTargetOffset(), addressOf(loadOp.getHostSource(), knowledge), loadOp.getHostSourceOffset(), loadOp.getSize()); } void PimCodeGen::codeGenStoreOp(pim::PimMemCopyDevToHostOp storeOp, const StaticValueKnowledge& knowledge) const { emitMemCopyOp("st", addressOf(storeOp.getHostTarget(), knowledge), storeOp.getHostTargetOffset(), addressOf(storeOp.getDeviceSource(), knowledge), storeOp.getDeviceSourceOffset(), storeOp.getSize()); } void PimCodeGen::codeGenLmvOp(pim::PimMemCopyOp lmvOp, const StaticValueKnowledge& knowledge) const { emitMemCopyOp("lmv", addressOf(lmvOp.getTarget(), knowledge), lmvOp.getTargetOffset(), addressOf(lmvOp.getSource(), knowledge), lmvOp.getSourceOffset(), lmvOp.getSize(), "len"); } void PimCodeGen::codeGenReceiveOp(pim::PimReceiveOp receiveOp, const StaticValueKnowledge& knowledge) const { emitCommunicationOp( "recv", addressOf(receiveOp.getOutputBuffer(), knowledge), receiveOp.getSourceCoreId(), receiveOp.getSize()); } void PimCodeGen::codeGenReceiveManyOp(pim::PimReceiveManyOp receiveManyOp, const StaticValueKnowledge& knowledge) const { for (auto [outputBuffer, sourceCoreId] : llvm::zip(receiveManyOp.getOutputBuffers(), receiveManyOp.getSourceCoreIds())) emitCommunicationOp("recv", addressOf(outputBuffer, knowledge), sourceCoreId, getValueSizeInBytes(outputBuffer)); } void PimCodeGen::codeGenSendOp(pim::PimSendOp sendOp, const StaticValueKnowledge& knowledge) const { emitCommunicationOp("send", addressOf(sendOp.getInput(), knowledge), sendOp.getTargetCoreId(), sendOp.getSize()); } void PimCodeGen::codeGenSendManyOp(pim::PimSendManyOp sendManyOp, const StaticValueKnowledge& knowledge) const { for (auto [input, targetCoreId] : llvm::zip(sendManyOp.getInputs(), sendManyOp.getTargetCoreIds())) emitCommunicationOp("send", addressOf(input, knowledge), 
targetCoreId, getValueSizeInBytes(input)); } void PimCodeGen::codeGenExtractRowsOp(pim::PimExtractRowsOp extractRowsOp, const StaticValueKnowledge& knowledge) const { auto inputType = cast(extractRowsOp.getInput().getType()); assert(inputType.hasStaticShape() && inputType.getRank() == 2 && "extract_rows codegen requires static rank-2 input"); size_t elementSize = inputType.getElementTypeBitWidth() / 8; size_t rowSizeInBytes = static_cast(inputType.getDimSize(1)) * elementSize; size_t inputAddr = addressOf(extractRowsOp.getInput(), knowledge); for (auto [rowIndex, outputBuffer] : llvm::enumerate(extractRowsOp.getOutputBuffers())) emitMemCopyOp("lmv", addressOf(outputBuffer, knowledge), 0, inputAddr, rowIndex * rowSizeInBytes, rowSizeInBytes, "len"); } void PimCodeGen::codeGenConcatOp(pim::PimConcatOp concatOp, const StaticValueKnowledge& knowledge) const { auto outputType = cast(concatOp.getOutputBuffer().getType()); assert(outputType.hasStaticShape() && "concat codegen requires static output shape"); int64_t axis = concatOp.getAxis(); ArrayRef outputShape = outputType.getShape(); size_t elementSize = outputType.getElementTypeBitWidth() / 8; size_t outputAddr = addressOf(concatOp.getOutputBuffer(), knowledge); size_t outerCount = 1; for (int64_t dim = 0; dim < axis; ++dim) outerCount *= static_cast(outputShape[dim]); size_t innerCount = 1; for (size_t dim = static_cast(axis) + 1; dim < outputShape.size(); ++dim) innerCount *= static_cast(outputShape[dim]); size_t outputConcatDim = static_cast(outputShape[axis]); size_t concatOffset = 0; for (mlir::Value input : concatOp.getInputs()) { auto inputType = cast(input.getType()); assert(inputType.hasStaticShape() && "concat codegen requires static input shapes"); size_t inputConcatDim = static_cast(inputType.getDimSize(axis)); size_t blockSizeInBytes = inputConcatDim * innerCount * elementSize; size_t inputAddr = addressOf(input, knowledge); for (size_t outerIndex = 0; outerIndex < outerCount; ++outerIndex) { size_t 
dstOffset = (outerIndex * outputConcatDim + concatOffset) * innerCount * elementSize; size_t srcOffset = outerIndex * inputConcatDim * innerCount * elementSize; emitMemCopyOp("lmv", outputAddr, dstOffset, inputAddr, srcOffset, blockSizeInBytes, "len"); } concatOffset += inputConcatDim; } } template void PimCodeGen::codeGenMVMLikeOp(size_t mvmId, MVMTy mvmLikeOp, bool transposeMatrix, const StaticValueKnowledge& knowledge) { emitMvmOp(mvmId, addressOf(mvmLikeOp.getOutputBuffer(), knowledge), 0, addressOf(mvmLikeOp.getInput(), knowledge), 0); // TODO: save weights somewhere (if transposeMatrix=true, transpose the weight matrix) } void PimCodeGen::codeGenVVAddOp(pim::PimVVAddOp vvaddOp, const StaticValueKnowledge& knowledge) const { auto outputBufferAddr = addressOf(vvaddOp.getOutputBuffer(), knowledge); auto lhsAddr = addressOf(vvaddOp.getLhs(), knowledge); auto rhsAddr = addressOf(vvaddOp.getRhs(), knowledge); setupRdRs1Rs2(outputBufferAddr, 0, lhsAddr, 0, rhsAddr, 0); json::Object json; json["op"] = "vvadd"; json["rd"] = 0; json["rs1"] = 1; json["rs2"] = 2; json["offset"] = createEmptyOffset(); json["len"] = getValueSizeInBytes(vvaddOp.getLhs()); emitInstruction(std::move(json)); } void PimCodeGen::codeGenVVSubOp(pim::PimVVSubOp vvsubOp, const StaticValueKnowledge& knowledge) const { auto outputBufferAddr = addressOf(vvsubOp.getOutputBuffer(), knowledge); auto lhsAddr = addressOf(vvsubOp.getLhs(), knowledge); auto rhsAddr = addressOf(vvsubOp.getRhs(), knowledge); setupRdRs1Rs2(outputBufferAddr, 0, lhsAddr, 0, rhsAddr, 0); json::Object json; json["op"] = "vvsub"; json["rd"] = 0; json["rs1"] = 1; json["rs2"] = 2; json["offset"] = createEmptyOffset(); json["len"] = getValueSizeInBytes(vvsubOp.getLhs()); emitInstruction(std::move(json)); } void PimCodeGen::codeGenVVMulOp(pim::PimVVMulOp vvmulOp, const StaticValueKnowledge& knowledge) const { auto outputBufferAddr = addressOf(vvmulOp.getOutputBuffer(), knowledge); auto lhsAddr = addressOf(vvmulOp.getLhs(), knowledge); 
auto rhsAddr = addressOf(vvmulOp.getRhs(), knowledge); setupRdRs1Rs2(outputBufferAddr, 0, lhsAddr, 0, rhsAddr, 0); json::Object json; json["op"] = "vvmul"; json["rd"] = 0; json["rs1"] = 1; json["rs2"] = 2; json["offset"] = createEmptyOffset(); json["len"] = getValueSizeInBytes(vvmulOp.getLhs()); emitInstruction(std::move(json)); } void PimCodeGen::codeGenVVMaxOp(pim::PimVVMaxOp vvmaxOp, const StaticValueKnowledge& knowledge) const { auto outputBufferAddr = addressOf(vvmaxOp.getOutputBuffer(), knowledge); auto lhsAddr = addressOf(vvmaxOp.getLhs(), knowledge); auto rhsAddr = addressOf(vvmaxOp.getRhs(), knowledge); setupRdRs1Rs2(outputBufferAddr, 0, lhsAddr, 0, rhsAddr, 0); json::Object json; json["op"] = "vvmax"; json["rd"] = 0; json["rs1"] = 1; json["rs2"] = 2; json["offset"] = createEmptyOffset(); json["len"] = getValueSizeInBytes(vvmaxOp.getLhs()); emitInstruction(std::move(json)); } void PimCodeGen::codeGenVVDMulOp(pim::PimVVDMulOp vvdmulOp, const StaticValueKnowledge& knowledge) const { auto outputBufferAddr = addressOf(vvdmulOp.getOutputBuffer(), knowledge); auto lhsAddr = addressOf(vvdmulOp.getLhs(), knowledge); auto rhsAddr = addressOf(vvdmulOp.getRhs(), knowledge); setupRdRs1Rs2(outputBufferAddr, 0, lhsAddr, 0, rhsAddr, 0); json::Object json; json["op"] = "vvdmul"; json["rd"] = 0; json["rs1"] = 1; json["rs2"] = 2; json["offset"] = createEmptyOffset(); json["len"] = getValueSizeInBytes(vvdmulOp.getLhs()); emitInstruction(std::move(json)); } void PimCodeGen::codeGenVAvgOp(pim::PimVAvgOp vavgOp, const StaticValueKnowledge& knowledge) const { auto outputBufferAddr = addressOf(vavgOp.getOutputBuffer(), knowledge); auto inputAddr = addressOf(vavgOp.getInput(), knowledge); setupRdRs1(outputBufferAddr, 0, inputAddr, 0); json::Object json; json["op"] = "vavg"; json["rd"] = 0; json["rs1"] = 1; json["rs2"] = 1; json["offset"] = createRs1OnlyOffset(); json["len"] = getValueSizeInBytes(vavgOp.getInput()); emitInstruction(std::move(json)); } void 
PimCodeGen::codeGenVReluOp(pim::PimVReluOp vreluOp, const StaticValueKnowledge& knowledge) const { auto outputBufferAddr = addressOf(vreluOp.getOutputBuffer(), knowledge); auto inputAddr = addressOf(vreluOp.getInput(), knowledge); setupRdRs1(outputBufferAddr, 0, inputAddr, 0); json::Object json; json["op"] = "vrelu"; json["rd"] = 0; json["rs1"] = 1; json["offset"] = createEmptyOffset(); json["len"] = getValueSizeInBytes(vreluOp.getInput()); emitInstruction(std::move(json)); } void PimCodeGen::codeGenVTanhOp(pim::PimVTanhOp vtanhOp, const StaticValueKnowledge& knowledge) const { auto outputBufferAddr = addressOf(vtanhOp.getOutputBuffer(), knowledge); auto inputAddr = addressOf(vtanhOp.getInput(), knowledge); setupRdRs1(outputBufferAddr, 0, inputAddr, 0); json::Object json; json["op"] = "vtanh"; json["rd"] = 0; json["rs1"] = 1; json["offset"] = createEmptyOffset(); json["len"] = getValueSizeInBytes(vtanhOp.getInput()); emitInstruction(std::move(json)); } void PimCodeGen::codeGenVSigmOp(pim::PimVSigmOp vsigmOp, const StaticValueKnowledge& knowledge) const { auto outputBufferAddr = addressOf(vsigmOp.getOutputBuffer(), knowledge); auto inputAddr = addressOf(vsigmOp.getInput(), knowledge); setupRdRs1(outputBufferAddr, 0, inputAddr, 0); json::Object json; json["op"] = "vsigm"; json["rd"] = 0; json["rs1"] = 1; json["offset"] = createEmptyOffset(); json["len"] = getValueSizeInBytes(vsigmOp.getInput()); emitInstruction(std::move(json)); } void PimCodeGen::codeGenVSoftmaxOp(pim::PimVSoftmaxOp vsoftmaxOp, const StaticValueKnowledge& knowledge) const { auto outputBufferAddr = addressOf(vsoftmaxOp.getOutputBuffer(), knowledge); auto inputAddr = addressOf(vsoftmaxOp.getInput(), knowledge); setupRdRs1(outputBufferAddr, 0, inputAddr, 0); json::Object json; json["op"] = "vsoftmax"; json["rd"] = 0; json["rs1"] = 1; json["offset"] = createEmptyOffset(); json["len"] = getValueSizeInBytes(vsoftmaxOp.getInput()); emitInstruction(std::move(json)); } void 
PimCodeGen::codeGetGlobalOp(memref::GetGlobalOp getGlobalOp, const StaticValueKnowledge& knowledge) const {} void PimCodeGen::codeGenTransposeOp(pim::PimTransposeOp transposeOp, const StaticValueKnowledge& knowledge) const { auto srcAddr = addressOf(transposeOp.getInput(), knowledge); auto dstAddr = addressOf(transposeOp.getOutputBuffer(), knowledge); auto srcType = cast(transposeOp.getInput().getType()); auto srcShape = srcType.getShape(); size_t rank = srcShape.size(); size_t elementSize = srcType.getElementTypeBitWidth() / 8; size_t totalElements = srcType.getNumElements(); // Read permutation. Destination dim i corresponds to source dim perm[i]. SmallVector perm = map_to_vector(transposeOp.getPermutation().getAsRange(), [](auto attr) -> int64_t { return attr.getInt(); }); // Destination shape: dstShape[i] = srcShape[perm[i]] SmallVector dstShape(rank); for (size_t i = 0; i < rank; i++) dstShape[i] = srcShape[perm[i]]; // Row-major strides for source and destination SmallVector srcStrides(rank, 1); SmallVector dstStrides(rank, 1); for (int64_t i = rank - 2; i >= 0; i--) { srcStrides[i] = srcStrides[i + 1] * srcShape[i + 1]; dstStrides[i] = dstStrides[i + 1] * dstShape[i + 1]; } // Emit element-by-element copy with transposed addressing for (size_t srcFlat = 0; srcFlat < totalElements; srcFlat++) { // Decompose flat source index into multi-dimensional index SmallVector srcIdx(rank); size_t remaining = srcFlat; for (size_t d = 0; d < rank; d++) { srcIdx[d] = remaining / srcStrides[d]; remaining %= srcStrides[d]; } // Compute flat destination index: dstIdx[d] = srcIdx[perm[d]] size_t dstFlat = 0; for (size_t d = 0; d < rank; d++) dstFlat += srcIdx[perm[d]] * dstStrides[d]; emitMemCopyOp("lmv", dstAddr, dstFlat * elementSize, srcAddr, srcFlat * elementSize, elementSize, "len"); } } size_t getMatrixSize(ShapedType matrixShape) { if (matrixShape.getRank() != 2 && matrixShape.getRank() != 4) assert(false && "Unsupported matrix shape"); return 
std::max(matrixShape.getDimSize(0), matrixShape.getDimSize(1)); } std::string getMemorySizeAsString(size_t size) { if (size > 1024 * 1024 * 1024) return std::to_string(size / 1024 / 1024 / 1024) + " GB"; if (size > 1024 * 1024) return std::to_string(size / 1024 / 1024) + " MB"; if (size > 1024) return std::to_string(size / 1024) + " KB"; return std::to_string(size) + " Bytes"; } static SmallVector getUsedWeightIndices(Block& block) { SmallVector indices; auto addIndex = [&](unsigned weightIndex) { if (!llvm::is_contained(indices, weightIndex)) indices.push_back(weightIndex); }; block.walk([&](pim::PimMVMOp mvmOp) { addIndex(mvmOp.getWeightIndex()); }); block.walk([&](pim::PimVMMOp vmmOp) { addIndex(vmmOp.getWeightIndex()); }); llvm::sort(indices); return indices; } static SmallVector getUsedWeightIndices(pim::PimCoreOp coreOp) { return getUsedWeightIndices(coreOp.getBody().front()); } static SmallVector getBatchCoreIds(pim::PimCoreBatchOp coreBatchOp) { auto coreIdsAttr = coreBatchOp->getAttrOfType(onnx_mlir::kCoreIdsAttrName); assert(coreIdsAttr && "pim.core_batch requires coreIds array attribute"); return SmallVector(coreIdsAttr.asArrayRef().begin(), coreIdsAttr.asArrayRef().end()); } static SmallVector collectTopLevelCoreLikeOps(func::FuncOp funcOp) { SmallVector coreLikeOps; for (Operation& op : funcOp.getBody().front()) if (dyn_cast(&op) || dyn_cast(&op)) coreLikeOps.push_back(&op); return coreLikeOps; } static pim::PimCoreOp materializeScalarCoreFromBatchLane(pim::PimCoreBatchOp coreBatchOp, unsigned lane) { OpBuilder builder(coreBatchOp); builder.setInsertionPointAfter(coreBatchOp); size_t laneCount = static_cast(coreBatchOp.getLaneCount()); size_t weightsPerLane = coreBatchOp.getWeights().size() / laneCount; SmallVector laneWeights; laneWeights.reserve(weightsPerLane); for (size_t weightIndex = 0; weightIndex < weightsPerLane; ++weightIndex) laneWeights.push_back(coreBatchOp.getWeights()[lane * weightsPerLane + weightIndex]); auto coreIds = 
getBatchCoreIds(coreBatchOp); auto scalarCore = pim::PimCoreOp::create( builder, coreBatchOp.getLoc(), ValueRange(laneWeights), builder.getI32IntegerAttr(coreIds[lane])); Block* block = builder.createBlock(&scalarCore.getBody(), scalarCore.getBody().end()); IRMapping mapper; if (coreBatchOp.getBody().front().getNumArguments() == 1) mapper.map(coreBatchOp.getBody().front().getArgument(0), coreBatchOp.getInputs()[lane]); builder.setInsertionPointToEnd(block); for (Operation& op : coreBatchOp.getBody().front()) { if (isa(op)) { pim::PimHaltOp::create(builder, op.getLoc()); continue; } if (auto sendBatchOp = dyn_cast(op)) { pim::PimSendOp::create(builder, sendBatchOp.getLoc(), mapper.lookup(sendBatchOp.getInput()), sendBatchOp.getSizeAttr(), builder.getI32IntegerAttr(sendBatchOp.getTargetCoreIds()[lane])); continue; } if (auto sendManyBatchOp = dyn_cast(op)) { SmallVector laneTargetCoreIds; laneTargetCoreIds.reserve(sendManyBatchOp.getInputs().size()); for (auto valueIndex : llvm::seq(0, sendManyBatchOp.getInputs().size())) laneTargetCoreIds.push_back( sendManyBatchOp.getTargetCoreIds()[valueIndex * laneCount + static_cast(lane)]); SmallVector mappedInputs; mappedInputs.reserve(sendManyBatchOp.getInputs().size()); for (mlir::Value input : sendManyBatchOp.getInputs()) mappedInputs.push_back(mapper.lookup(input)); pim::PimSendManyOp::create(builder, sendManyBatchOp.getLoc(), builder.getDenseI32ArrayAttr(laneTargetCoreIds), ValueRange(mappedInputs)); continue; } if (auto receiveBatchOp = dyn_cast(op)) { auto scalarReceive = pim::PimReceiveOp::create(builder, receiveBatchOp.getLoc(), receiveBatchOp.getOutput().getType(), mapper.lookup(receiveBatchOp.getOutputBuffer()), receiveBatchOp.getSizeAttr(), builder.getI32IntegerAttr(receiveBatchOp.getSourceCoreIds()[lane])); mapper.map(receiveBatchOp.getOutput(), scalarReceive.getOutput()); continue; } if (auto receiveManyBatchOp = dyn_cast(op)) { SmallVector laneSourceCoreIds; 
laneSourceCoreIds.reserve(receiveManyBatchOp.getOutputs().size()); for (auto valueIndex : llvm::seq(0, receiveManyBatchOp.getOutputs().size())) laneSourceCoreIds.push_back( receiveManyBatchOp.getSourceCoreIds()[valueIndex * laneCount + static_cast(lane)]); SmallVector mappedOutputBuffers; mappedOutputBuffers.reserve(receiveManyBatchOp.getOutputBuffers().size()); for (mlir::Value outputBuffer : receiveManyBatchOp.getOutputBuffers()) mappedOutputBuffers.push_back(mapper.lookup(outputBuffer)); auto scalarReceiveMany = pim::PimReceiveManyOp::create(builder, receiveManyBatchOp.getLoc(), receiveManyBatchOp->getResultTypes(), ValueRange(mappedOutputBuffers), builder.getDenseI32ArrayAttr(laneSourceCoreIds)); for (auto [originalOutput, scalarOutput] : llvm::zip(receiveManyBatchOp.getOutputs(), scalarReceiveMany.getOutputs())) mapper.map(originalOutput, scalarOutput); continue; } if (auto memcpBatchOp = dyn_cast(op)) { mlir::Value hostSource = mapper.lookupOrNull(memcpBatchOp.getHostSource()); if (!hostSource) hostSource = memcpBatchOp.getHostSource(); auto scalarCopy = pim::PimMemCopyHostToDevOp::create(builder, memcpBatchOp.getLoc(), memcpBatchOp.getOutput().getType(), mapper.lookup(memcpBatchOp.getDeviceTarget()), hostSource, memcpBatchOp.getDeviceTargetOffsetAttr(), memcpBatchOp.getHostSourceOffsetAttr(), memcpBatchOp.getSizeAttr()); mapper.map(memcpBatchOp.getOutput(), scalarCopy.getOutput()); continue; } Operation* cloned = builder.clone(op, mapper); for (auto [originalResult, clonedResult] : llvm::zip(op.getResults(), cloned->getResults())) mapper.map(originalResult, clonedResult); } if (block->empty() || !isa(block->back())) pim::PimHaltOp::create(builder, coreBatchOp.getLoc()); return scalarCore; } static void aliasMaterializedHostGlobals(ModuleOp moduleOp, func::FuncOp funcOp, pim::PimCoreOp coreOp, PimAcceleratorMemory& memory) { coreOp.walk([&](memref::GetGlobalOp getGlobalOp) { if (hasWeightAlways(getGlobalOp) || 
memory.memEntriesMap.contains(getGlobalOp.getResult())) return; auto targetGlobal = lookupGlobalForGetGlobal(moduleOp, getGlobalOp); if (!targetGlobal) return; mlir::Value aliasedValue; funcOp.walk([&](memref::GetGlobalOp candidate) { if (aliasedValue || candidate == getGlobalOp || !memory.memEntriesMap.contains(candidate.getResult())) return; if (lookupGlobalForGetGlobal(moduleOp, candidate) == targetGlobal) aliasedValue = candidate.getResult(); }); if (aliasedValue) memory.memEntriesMap[getGlobalOp.getResult()] = memory.memEntriesMap[aliasedValue]; }); } /// Write global constant data into a binary memory image at their allocated addresses. static OnnxMlirCompilerErrorCodes writeMemoryBinary(ModuleOp moduleOp, func::FuncOp funcOp, PimAcceleratorMemory& memory, StringRef outputDirPath) { auto memoryFilePath = (outputDirPath + "/memory.bin").str(); std::error_code errorCode; raw_fd_ostream memoryFileStream(memoryFilePath, errorCode, sys::fs::OF_None); if (errorCode) { errs() << "Error while opening memory file " << memoryFilePath << ": " << errorCode.message() << '\n'; return InvalidOutputFileAccess; } std::vector memoryBuffer(memory.hostMem.getFirstAvailableAddress(), 0); SmallPtrSet writtenGlobals; funcOp.walk([&](memref::GetGlobalOp getGlobalOp) { if (hasWeightAlways(getGlobalOp)) return; auto globalOp = lookupGlobalForGetGlobal(moduleOp, getGlobalOp); if (!globalOp) return; if (!writtenGlobals.insert(globalOp.getOperation()).second) return; auto initialValue = globalOp.getInitialValue(); if (!initialValue) return; auto denseAttr = dyn_cast(*initialValue); if (!denseAttr) return; MemEntry memEntry = memory.hostMem.getMemEntry(getGlobalOp.getResult()); ArrayRef rawData = denseAttr.getRawData(); char* dst = memoryBuffer.data() + memEntry.address; if (denseAttr.isSplat()) { size_t elementSize = rawData.size(); assert(elementSize * getGlobalOp.getType().getNumElements() == memEntry.size && "Data size mismatch"); for (size_t offset = 0; offset < memEntry.size; offset 
+= elementSize) std::memcpy(dst + offset, rawData.data(), std::min(elementSize, memEntry.size - offset)); } else { assert(rawData.size() == memEntry.size && "Data size mismatch"); std::memcpy(dst, rawData.data(), rawData.size()); } }); memoryFileStream.write(memoryBuffer.data(), memoryBuffer.size()); memoryFileStream.close(); return CompilerSuccess; } /// Dispatch all operations in a core region to the appropriate code generator. /// scf.for loops are statically unrolled via walkPimCoreBlock so that addressing is /// fully resolved before the JSON instructions are emitted. /// Returns the number of emitted instructions, or -1 on failure. static int64_t codeGenCoreOps(Block& block, PimCodeGen& coreCodeGen) { size_t processedOperations = 0; auto result = walkPimCoreBlock(block, StaticValueKnowledge {}, [&](Operation& op, const StaticValueKnowledge& knowledge) { if (auto loadOp = dyn_cast(op)) coreCodeGen.codeGenLoadOp(loadOp, knowledge); else if (auto storeOp = dyn_cast(op)) coreCodeGen.codeGenStoreOp(storeOp, knowledge); else if (auto lmvOp = dyn_cast(op)) coreCodeGen.codeGenLmvOp(lmvOp, knowledge); else if (auto receiveOp = dyn_cast(op)) coreCodeGen.codeGenReceiveOp(receiveOp, knowledge); else if (auto receiveManyOp = dyn_cast(op)) coreCodeGen.codeGenReceiveManyOp(receiveManyOp, knowledge); else if (auto sendOp = dyn_cast(op)) coreCodeGen.codeGenSendOp(sendOp, knowledge); else if (auto sendManyOp = dyn_cast(op)) coreCodeGen.codeGenSendManyOp(sendManyOp, knowledge); else if (auto extractRowsOp = dyn_cast(op)) coreCodeGen.codeGenExtractRowsOp(extractRowsOp, knowledge); else if (auto concatOp = dyn_cast(op)) coreCodeGen.codeGenConcatOp(concatOp, knowledge); else if (auto vmmOp = dyn_cast(op)) coreCodeGen.codeGenMVMLikeOp(vmmOp.getWeightIndex(), vmmOp, true, knowledge); else if (auto mvmOp = dyn_cast(op)) coreCodeGen.codeGenMVMLikeOp(mvmOp.getWeightIndex(), mvmOp, false, knowledge); else if (auto transposeOp = dyn_cast(op)) coreCodeGen.codeGenTransposeOp(transposeOp, 
knowledge); else if (auto vvaddOp = dyn_cast(op)) coreCodeGen.codeGenVVAddOp(vvaddOp, knowledge); else if (auto vvsubOp = dyn_cast(op)) coreCodeGen.codeGenVVSubOp(vvsubOp, knowledge); else if (auto vvmulOp = dyn_cast(op)) coreCodeGen.codeGenVVMulOp(vvmulOp, knowledge); else if (auto vvmaxOp = dyn_cast(op)) coreCodeGen.codeGenVVMaxOp(vvmaxOp, knowledge); else if (auto vvdmulOp = dyn_cast(op)) coreCodeGen.codeGenVVDMulOp(vvdmulOp, knowledge); else if (auto vavgOp = dyn_cast(op)) coreCodeGen.codeGenVAvgOp(vavgOp, knowledge); else if (auto vreluOp = dyn_cast(op)) coreCodeGen.codeGenVReluOp(vreluOp, knowledge); else if (auto vtanhOp = dyn_cast(op)) coreCodeGen.codeGenVTanhOp(vtanhOp, knowledge); else if (auto vsigmOp = dyn_cast(op)) coreCodeGen.codeGenVSigmOp(vsigmOp, knowledge); else if (auto vsoftmaxOp = dyn_cast(op)) coreCodeGen.codeGenVSoftmaxOp(vsoftmaxOp, knowledge); else if (auto getGlobalOp = dyn_cast(op)) coreCodeGen.codeGetGlobalOp(getGlobalOp, knowledge); else { op.emitError("Unsupported codegen for this operation"); op.dump(); return failure(); } processedOperations++; return success(); }); return failed(result) ? -1 : static_cast(processedOperations); } /// Write crossbar weight matrices as padded binary files for a single core. 
/// Write crossbar weight matrices as padded binary files for a single core.
///
/// Each weight of `coreOp` that resolves to a dense memref.global is written
/// to `<coreWeightsDirPath>/crossbar_<index>.bin`, zero-padded right/bottom up
/// to the full crossbar dimensions. The weight index is appended to
/// `xbarsPerGroup` (one crossbar per array group). Unresolvable weights emit a
/// warning and are skipped. Returns InvalidOutputFileAccess on I/O failure.
///
/// NOTE(review): template argument lists (getDefiningOp<...>, dyn_cast<...>,
/// getValues<...>, reinterpret_cast<...>) appear to have been stripped from
/// this file; restore them against version control.
static OnnxMlirCompilerErrorCodes writeCrossbarWeights(ModuleOp moduleOp,
    pim::PimCoreOp coreOp, StringRef coreWeightsDirPath,
    json::Array& xbarsPerGroup) {
  int64_t xbarSize = crossbarSize.getValue();
  std::error_code errorCode;
  size_t weightIndex = 0;
  for (auto weight : coreOp.getWeights()) {
    // One crossbar per array group: group i contains exactly crossbar i.
    xbarsPerGroup.push_back(weightIndex);
    auto getGlobalOp = weight.getDefiningOp();
    if (!getGlobalOp) {
      coreOp.emitWarning("Weight is not from a memref.get_global at index " +
                         std::to_string(weightIndex));
      weightIndex++;
      continue;
    }
    auto globalOp = lookupGlobalForGetGlobal(moduleOp, getGlobalOp);
    if (!globalOp) {
      coreOp.emitWarning("Could not find memref.global for weight at index " +
                         std::to_string(weightIndex));
      weightIndex++;
      continue;
    }
    auto initialValue = globalOp.getInitialValue();
    if (!initialValue) {
      coreOp.emitWarning("memref.global has no initial value at index " +
                         std::to_string(weightIndex));
      weightIndex++;
      continue;
    }
    auto denseAttr = dyn_cast(*initialValue);
    if (!denseAttr) {
      coreOp.emitWarning("memref.global initial value is not dense at index " +
                         std::to_string(weightIndex));
      weightIndex++;
      continue;
    }
    auto type = denseAttr.getType();
    auto shape = type.getShape();
    assert(isMatrixShape(shape) && "Weight matrix must be 2-dimensional");
    int64_t numRows = shape[0];
    int64_t numCols = shape[1];
    assert(numRows <= xbarSize && numCols <= xbarSize &&
           "Weight dimensions must not exceed crossbar size");
    size_t elementByteWidth = type.getElementType().getIntOrFloatBitWidth() / 8;
    auto weightFilePath = (coreWeightsDirPath + "/crossbar_" +
                           std::to_string(weightIndex) + ".bin").str();
    raw_fd_ostream weightFileStream(weightFilePath, errorCode, sys::fs::OF_None);
    if (errorCode) {
      errs() << "Error while opening weight file `" << weightFilePath
             << "`: " << errorCode.message() << '\n';
      return InvalidOutputFileAccess;
    }
    uint64_t zero = 0;
    // Emit the full xbarSize x xbarSize matrix in row-major order, writing
    // zeros for the padding cells outside the real weight shape.
    // NOTE(review): writing the low `elementByteWidth` bytes of a host word
    // assumes a little-endian host -- confirm if portability matters.
    for (int64_t row = 0; row < xbarSize; row++) {
      for (int64_t col = 0; col < xbarSize; col++) {
        if (row < numRows && col < numCols) {
          int64_t index = row * numCols + col;
          APInt bits = denseAttr.getValues()[index].bitcastToAPInt();
          uint64_t word = bits.getZExtValue();
          weightFileStream.write(reinterpret_cast(&word), elementByteWidth);
        } else {
          weightFileStream.write(reinterpret_cast(&zero), elementByteWidth);
        }
      }
    }
    weightFileStream.close();
    weightIndex++;
  }
  return CompilerSuccess;
}

/// Materialize every used crossbar weight into `<outputDirPath>/weights/`,
/// deduplicating by memref.global so a matrix shared across cores is written
/// to disk only once. Batch cores are temporarily expanded into scalar cores
/// per lane and erased again before returning.
/// Returns, keyed by original core id, a map from weight Value to the file
/// name holding its padded binary data.
llvm::DenseMap> createAndPopulateWeightFolder(
    func::FuncOp funcOp, StringRef outputDirPath) {
  ModuleOp moduleOp = funcOp->getParentOfType();
  auto coreWeightsDirPath = outputDirPath + "/weights";
  auto error = sys::fs::create_directory(coreWeightsDirPath);
  assert(!error && "Error creating weights directory");
  size_t indexFileName = 0;
  int64_t xbarSize = crossbarSize.getValue();
  llvm::DenseMap> mapCoreWeightToFileName;
  llvm::DenseMap mapGlobalOpToFileName;
  SmallVector coreLikeOps = collectTopLevelCoreLikeOps(funcOp);
  for (Operation* op : coreLikeOps) {
    // Expand a batch op into one scalar core per lane; a plain core op is
    // used as-is.
    SmallVector scalarCores;
    if (auto coreOp = dyn_cast(op)) {
      scalarCores.push_back(coreOp);
    } else {
      auto coreBatchOp = cast(op);
      for (unsigned lane = 0;
           lane < static_cast(coreBatchOp.getLaneCount()); ++lane)
        scalarCores.push_back(
            materializeScalarCoreFromBatchLane(coreBatchOp, lane));
    }
    for (pim::PimCoreOp coreOp : scalarCores) {
      size_t coreId = static_cast(coreOp.getCoreId());
      for (unsigned index : getUsedWeightIndices(coreOp)) {
        if (index >= coreOp.getWeights().size()) {
          coreOp.emitWarning("Weight index " + std::to_string(index) +
                             " is out of range");
          assert(index < coreOp.getWeights().size() &&
                 "Weight index is out of range");
        }
        mlir::Value weight = coreOp.getWeights()[index];
        auto getGlobalOp = weight.getDefiningOp();
        if (!getGlobalOp) {
          coreOp.emitWarning(
              "Weight is not from a memref.get_global at index " +
              std::to_string(index));
          // BUGFIX: previously asserted `!getGlobalOp`, i.e. the very
          // condition the guard just proved true -- the assert could never
          // fire and debug builds fell through to use a null op below.
          assert(getGlobalOp && "Weight is not from a memref.get_global");
        }
        auto globalOp = lookupGlobalForGetGlobal(moduleOp, getGlobalOp);
        if (!globalOp) {
          coreOp.emitWarning(
              "Could not find memref.global for weight at index " +
              std::to_string(index));
          // BUGFIX: was assert(!globalOp) -- inverted, never fired.
          assert(globalOp && "Could not find memref.global");
        }
        auto initialValue = globalOp.getInitialValue();
        if (!initialValue) {
          coreOp.emitWarning("memref.global has no initial value at index " +
                             std::to_string(index));
          // BUGFIX: was assert(!initialValue) -- inverted, never fired.
          assert(initialValue && "memref.global has no initial value");
        }
        auto denseAttr = dyn_cast(*initialValue);
        if (!denseAttr) {
          coreOp.emitWarning(
              "memref.global initial value is not dense at index " +
              std::to_string(index));
          // BUGFIX: was assert(!denseAttr) -- inverted, never fired.
          assert(denseAttr && "memref.global initial value is not dense");
        }
        // Deduplicate: a global already written just maps to its file name.
        if (mapGlobalOpToFileName.contains(globalOp)) {
          auto& fileName = mapGlobalOpToFileName[globalOp];
          std::pair weightToFile = {weight, fileName};
          mapCoreWeightToFileName[coreId].insert(weightToFile);
          continue;
        }
        auto type = denseAttr.getType();
        auto shape = type.getShape();
        assert(isMatrixShape(shape) && "Weight matrix must be 2-dimensional");
        int64_t numRows = shape[0];
        int64_t numCols = shape[1];
        assert(numRows <= xbarSize && numCols <= xbarSize &&
               "Weight dimensions must not exceed crossbar size");
        size_t elementByteWidth =
            type.getElementType().getIntOrFloatBitWidth() / 8;
        std::string newFileName =
            "crossbar_" + std::to_string(indexFileName++) + ".bin";
        auto weightFilePath = (coreWeightsDirPath + "/" + newFileName).str();
        std::error_code errorCode;
        raw_fd_ostream weightFileStream(weightFilePath, errorCode,
                                        sys::fs::OF_None);
        if (errorCode) {
          errs() << "Error while opening weight file `" << weightFilePath
                 << "`: " << errorCode.message() << '\n';
          // BUGFIX: was assert(errorCode) inside `if (errorCode)` -- always
          // passed; assert the failure so debug builds abort here.
          assert(!errorCode && "Failed to open weight file");
        }
        uint64_t zero = 0;
        // Same padded row-major layout as writeCrossbarWeights above.
        for (int64_t row = 0; row < xbarSize; row++) {
          for (int64_t col = 0; col < xbarSize; col++) {
            if (row < numRows && col < numCols) {
              int64_t index = row * numCols + col;
              APInt bits = denseAttr.getValues()[index].bitcastToAPInt();
              uint64_t word = bits.getZExtValue();
              weightFileStream.write(reinterpret_cast(&word), elementByteWidth);
            } else {
              weightFileStream.write(reinterpret_cast(&zero), elementByteWidth);
            }
          }
        }
        weightFileStream.close();
        mapGlobalOpToFileName.insert({globalOp, newFileName});
        mapCoreWeightToFileName[coreId].insert({weight, newFileName});
      }
    }
    // Drop the scalar cores materialized from batch lanes; keep the original.
    for (pim::PimCoreOp coreOp : scalarCores)
      if (coreOp.getOperation() != op)
        coreOp.erase();
  }
  return mapCoreWeightToFileName;
}

/// Write the top-level PIM configuration JSON (core count, crossbar config, I/O addresses).
static OnnxMlirCompilerErrorCodes writeConfigJson(func::FuncOp funcOp,
    PimAcceleratorMemory& memory, size_t maxCoreId,
    json::Object xbarsPerArrayGroup, StringRef outputDirPath) {
  json::Object configJson;
  // pimsim-nn indexes cores directly by their numeric core ID, with the host
  // occupying core 0.
  configJson["core_cnt"] = maxCoreId + 1;
  // TODO: Should this be based on the floating point type used in the model?
  // The 2 following values determine the bitwidth of the vectors' elements:
  // bitwidth = adc_count * cell_precision
  // Number of ADC for MVM units
  configJson["adc_count"] = 16;
  // The bit precision of each ADC
  configJson["cell_precision"] = 2;
  // Crossbar configuration
  configJson["xbar_array_count"] = crossbarCountInCore.getValue();
  configJson["xbar_size"] = {crossbarSize.getValue(), crossbarSize.getValue()};
  configJson["array_group_map"] = std::move(xbarsPerArrayGroup);
  // Memory layout of inputs and outputs
  json::Array inputsAddresses;
  for (BlockArgument input : funcOp.getArguments())
    inputsAddresses.push_back(memory.getValueAddress(input));
  configJson["inputs_addresses"] = std::move(inputsAddresses);
  json::Array outputsAddresses;
  for (func::ReturnOp returnOp : funcOp.getOps())
    for (mlir::Value output : returnOp.getOperands())
      outputsAddresses.push_back(memory.getValueAddress(output));
  configJson["outputs_addresses"] = std::move(outputsAddresses);
  auto configPath = (outputDirPath + "/config.json").str();
  std::error_code errorCode;
  raw_fd_ostream jsonOS(configPath, errorCode);
  if (errorCode) {
    errs() << "Error while opening config file: " << errorCode.message() << '\n';
    return InvalidOutputFileAccess;
  }
  jsonOS << json::Value(std::move(configJson)) << '\n';
jsonOS.close(); return CompilerSuccess; } OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std::string& outputDirPath) { if (!outputDirPath.empty()) { if (auto error = sys::fs::create_directory(outputDirPath)) { errs() << "Error creating output directory: " << outputDirPath << ": " << error.message() << '\n'; return InvalidOutputFileAccess; } } auto entryFunc = getPimEntryFunc(moduleOp); if (failed(entryFunc)) return CompilerFailure; auto funcOp = *entryFunc; PimAcceleratorMemory memory; memory.hostMem.allocateHost(moduleOp, funcOp); memory.reportHost(); if (auto err = writeMemoryBinary(moduleOp, funcOp, memory, outputDirPath)) return err; // Write empty host core file std::error_code errorCode; auto outputHostCorePath = outputDirPath + "/core_0.json"; raw_fd_ostream hostFileStream(outputHostCorePath, errorCode); if (errorCode) { errs() << "Error while opening host core file `" << outputHostCorePath << "`: " << errorCode.message() << '\n'; return InvalidOutputFileAccess; } // The host core json contains 2 random instructions, just to make pimsim-nn happy hostFileStream << "[{\"imm\":0,\"op\":\"sldi\",\"rd\":0},{\"imm\":0,\"op\":\"sldi\",\"rd\":0}]"; hostFileStream.close(); // For each core, specify the number of crossbar per array group. // This implementation always assigns one crossbar per group. 
json::Object xbarsPerArrayGroup; size_t maxCoreId = 0; // Create Weight Folder auto mapCoreWeightToFileName = createAndPopulateWeightFolder(funcOp, outputDirPath); SmallVector coreLikeOps = collectTopLevelCoreLikeOps(funcOp); llvm::DenseMap emittedCoreIds; size_t nextEmittedCoreId = 1; for (Operation* op : coreLikeOps) { if (auto coreOp = dyn_cast(op)) { size_t originalCoreId = static_cast(coreOp.getCoreId()); if (!emittedCoreIds.contains(originalCoreId)) emittedCoreIds[originalCoreId] = nextEmittedCoreId++; continue; } auto coreBatchOp = cast(op); auto batchCoreIds = getBatchCoreIds(coreBatchOp); for (unsigned lane = 0; lane < static_cast(coreBatchOp.getLaneCount()); ++lane) { size_t originalCoreId = static_cast(batchCoreIds[lane]); if (!emittedCoreIds.contains(originalCoreId)) emittedCoreIds[originalCoreId] = nextEmittedCoreId++; } } for (Operation* op : coreLikeOps) { SmallVector scalarCores; if (auto coreOp = dyn_cast(op)) { scalarCores.push_back(coreOp); } else { auto coreBatchOp = cast(op); for (unsigned lane = 0; lane < static_cast(coreBatchOp.getLaneCount()); ++lane) scalarCores.push_back(materializeScalarCoreFromBatchLane(coreBatchOp, lane)); } for (pim::PimCoreOp coreOp : scalarCores) { size_t originalCoreId = static_cast(coreOp.getCoreId()); size_t coreId = emittedCoreIds.lookup(originalCoreId); maxCoreId = std::max(maxCoreId, coreId); std::error_code errorCode; auto outputCorePath = outputDirPath + "/core_" + std::to_string(coreId) + ".json"; raw_fd_ostream coreFileStream(outputCorePath, errorCode); if (errorCode) { errs() << "Error while opening core file `" << outputCorePath << "`: " << errorCode.message() << '\n'; return InvalidOutputFileAccess; } coreFileStream << '['; PimCodeGen coreCodeGen(memory, coreFileStream, emittedCoreIds); aliasMaterializedHostGlobals(moduleOp, funcOp, coreOp, memory); memory.getOrCreateDeviceMem(coreId).allocateCore(coreOp); memory.reportCore(coreId); int64_t processedOperations = codeGenCoreOps(coreOp.getBody().front(), 
coreCodeGen); if (processedOperations < 0) return CompilerFailure; assert(processedOperations > 0); coreFileStream.seek(coreFileStream.tell() - 1); coreFileStream << ']'; coreFileStream.close(); auto coreWeightsDirPath = outputDirPath + "/core_" + std::to_string(coreId); if (auto error = sys::fs::create_directory(coreWeightsDirPath)) { errs() << "Error creating core directory: " << coreWeightsDirPath << ": " << error.message() << '\n'; return InvalidOutputFileAccess; } auto& mapWeightToFile = mapCoreWeightToFileName[originalCoreId]; json::Array xbarsPerGroup; for (unsigned index : getUsedWeightIndices(coreOp)) { if (index >= coreOp.getWeights().size()) { coreOp.emitWarning("Weight index " + std::to_string(index) + " is out of range"); assert(index < coreOp.getWeights().size() && "Weight index is out of range"); } mlir::Value weight = coreOp.getWeights()[index]; xbarsPerGroup.push_back(index); assert(mapWeightToFile.contains(weight) && "Weight was not materialized into a file!!"); auto& fileName = mapWeightToFile[weight]; if (auto error = sys::fs::create_link(outputDirPath + "/weights/" + fileName, coreWeightsDirPath + "/crossbar_" + std::to_string(index) + ".bin")) { errs() << "Error creating link file: " << (outputDirPath + "/weights/" + fileName) << " to " << (coreWeightsDirPath + "/crossbar_" + std::to_string(index) + ".bin") << "\nError:" << error.message() << '\n'; return InvalidOutputFileAccess; } } xbarsPerArrayGroup["core" + std::to_string(coreId)] = std::move(xbarsPerGroup); } for (pim::PimCoreOp coreOp : scalarCores) if (coreOp.getOperation() != op) { coreOp.walk([&memory](Operation* op) { memory.clean(op); }); coreOp.erase(); } } return writeConfigJson(funcOp, memory, maxCoreId, std::move(xbarsPerArrayGroup), outputDirPath); }