diff --git a/src/PIM/Compiler/PimCodeGen.cpp b/src/PIM/Compiler/PimCodeGen.cpp index c6606b9..ee112b6 100644 --- a/src/PIM/Compiler/PimCodeGen.cpp +++ b/src/PIM/Compiler/PimCodeGen.cpp @@ -1,22 +1,29 @@ -#include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/AsmState.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/IRMapping.h" #include "mlir/IR/Value.h" +#include "mlir/IR/Verifier.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/Format.h" #include "llvm/Support/JSON.h" #include "llvm/Support/raw_ostream.h" +#include #include #include #include +#include +#include #include #include @@ -59,8 +66,7 @@ void PimMemory::allocateHost(ModuleOp moduleOp, func::FuncOp funcOp) { SmallVector, 16> globalAliases; SmallVector args; - - for (mlir::Value arg : funcOp.getArguments()){ + for (mlir::Value arg : funcOp.getArguments()) { gatherMemEntry(arg); args.push_back(arg); } @@ -68,11 +74,11 @@ void PimMemory::allocateHost(ModuleOp moduleOp, func::FuncOp funcOp) { funcOp.walk([&](memref::GetGlobalOp getGlobalOp) { if (!hasWeightAlways(getGlobalOp)) { auto globalMemrefOp = lookupGlobalForGetGlobal(moduleOp, getGlobalOp); - if (globalMemrefOp.getName().starts_with("arg")){ + if (globalMemrefOp.getName().starts_with("arg")) { StringRef indexStr = globalMemrefOp.getName().substr(4); - int index = 0; - llvm::to_integer(indexStr,index, 10); - globalAliases.push_back({getGlobalOp.getResult(), args[index]}); + int index = 0; + llvm::to_integer(indexStr, index, 10); + globalAliases.push_back({getGlobalOp.getResult(), args[index]}); } auto [iter, inserted] = globalConstants.try_emplace(globalMemrefOp, 
getGlobalOp.getResult());
       if (inserted)
@@ -82,7 +88,6 @@ void PimMemory::allocateHost(ModuleOp moduleOp, func::FuncOp funcOp) {
     }
   });
 
-
   funcOp.walk([&](memref::AllocOp allocOp) {
     if (!allocOp->getParentOfType())
       gatherMemEntry(allocOp.getResult());
@@ -100,6 +105,108 @@ void PimMemory::allocateCore(Operation* op) {
   allocateGatheredMemory();
 }
 
+/// Formats a byte count as a human-readable string: two decimal places
+/// followed by a binary unit (B, KB, ..., EB), stepping by factors of 1024.
+std::string formatMemory(uint64_t bytes) {
+  const char* units[] = {"B", "KB", "MB", "GB", "TB", "PB", "EB"};
+  int i = 0;
+  double size = static_cast<double>(bytes);
+  while (size >= 1024 && i < 6) {
+    size /= 1024;
+    i++;
+  }
+  // Formats to 2 decimal places
+  std::string out;
+  llvm::raw_string_ostream rss(out);
+  rss << llvm::format("%.2f ", size) << units[i];
+  return rss.str();
+}
+
+/// Writes a grouped memory usage summary to `file`.
+///
+/// Entries of globalMemEntriesMap are de-duplicated by address and grouped by
+/// (defining op name, size); values without a defining op are labelled
+/// "Unknown/BlockArg". The grand total is printed once, after every group.
+void PimMemory::report(llvm::raw_ostream& file) {
+  std::vector orderedList(globalMemEntriesMap.begin(), globalMemEntriesMap.end());
+  // Sort by address so std::unique can collapse entries sharing an address.
+  std::sort(orderedList.begin(), orderedList.end(), [](const auto& lft, const auto& rgt) {
+    return lft.second.address < rgt.second.address;
+  });
+  auto newEnd = std::unique(orderedList.begin(), orderedList.end(), [](const auto& lft, const auto& rgt) {
+    return lft.second.address == rgt.second.address;
+  });
+  orderedList.erase(newEnd, orderedList.end());
+
+  // Key: {OpName, Size}, Value: Vector of Addresses
+  // This groups all "memref.alloc" of "1KB" together
+  // NOTE(review): element types below are assumed to be uint64_t - confirm
+  // against MemEntry.
+  std::map<std::pair<std::string, uint64_t>, std::vector<uint64_t>> groupedStats;
+  for (auto& [value, memEntry] : orderedList) {
+    std::string opName = "Unknown/BlockArg";
+    if (auto op = value.getDefiningOp())
+      opName = op->getName().getStringRef().str();
+
+    groupedStats[{opName, memEntry.size}].push_back(memEntry.address);
+  }
+
+  file << "--- Memory Usage Report ---\n";
+
+  uint64_t totalMemory = 0;
+  for (auto const& [key, addresses] : groupedStats) {
+    const std::string& opName = key.first;
+    uint64_t size = key.second;
+
+    file.indent(4) << "Type: " << opName << " [" << formatMemory(size) << "]\n";
+    file.indent(6) << "Count: " << addresses.size() << "\n";
+    file.indent(6) << "Total Memory: " << formatMemory(size * addresses.size()) << "\n";
+    totalMemory += size * addresses.size();
+
+    // Optional: Print address range or first/last address to keep it concise
+    if (!addresses.empty()) {
+      auto [min, max] = std::minmax_element(addresses.begin(), addresses.end());
+      file.indent(6) << "Range: " << llvm::format_hex(*min, 10) << " -> " << llvm::format_hex(*max, 10) << "\n";
+    }
+    file << "\n";
+  }
+  // Emitted once, after the loop, so it reflects the sum over all groups.
+  file << "Total Core Memory: " << formatMemory(totalMemory) << "\n";
+}
+
+// Disabled alternative implementation of report() that prints one record per entry.
+// void PimMemory::report(llvm::raw_ostream& file) {
+//   std::vector orderedList(globalMemEntriesMap.begin(), globalMemEntriesMap.end());
+//   std::sort(
+//     orderedList.begin(), orderedList.end(), [](auto lft, auto rgt) { return lft.second.address < rgt.second.address;
+//     });
+//   auto newEnd = std::unique(orderedList.begin(), orderedList.end(), [](auto lft, auto rgt) {
+//     return (lft.first.getDefiningOp() == rgt.first.getDefiningOp()) && (lft.second.address == rgt.second.address);
+//   });
+//   orderedList.erase(newEnd, orderedList.end());
+//   mlir::OpPrintingFlags flags;
+//   flags.assumeVerified(true);
+//   for (auto& [value, memEntry] : orderedList) {
+//     if (auto op = value.getDefiningOp()) {
+//       file.indent(4) << op << ": ";
+//       op->print(file, flags);
+//       file << "\n";
+//       file.indent(6) << "Address: " << llvm::format_hex(memEntry.address, 10) << "\n";
+//       file.indent(6) << "Memory: " << formatMemory(memEntry.size) << "\n";
+//     }
+//     else {
+//       file.indent(4) << value << "\n";
+//       file.indent(6) << "Address: " << llvm::format_hex(memEntry.address, 10) << "\n";
+//       file.indent(6) << "Memory: " << formatMemory(memEntry.size) << "\n";
+//     }
+//   }
+// }
+
+/// Removes the memory entry recorded for `val`; a value with no entry is a no-op.
+void PimMemory::remove(mlir::Value val) {
+  globalMemEntriesMap.erase(val);
+}
+
 MemEntry PimMemory::getMemEntry(mlir::Value value) 
const {
   auto iter = globalMemEntriesMap.find(value);
   assert("Missing memEntry for value" && iter != globalMemEntriesMap.end());
@@ -140,6 +236,37 @@ size_t PimAcceleratorMemory::getValueAddress(mlir::Value value, const StaticValu
   return iter->second.address + resolvedAddress->byteOffset;
 }
 
+/// Appends the host memory usage report to the report file, if one is open.
+void PimAcceleratorMemory::reportHost() {
+  if (!fileReport.is_open())
+    return;
+  llvm::raw_os_ostream os(fileReport);
+  os << "Host Memory\n";
+  hostMem.report(os);
+  os.flush();
+}
+
+/// Appends the memory usage report of core `coreId` to the report file, if one
+/// is open. The core's memory is looked up with deviceMem.at(coreId).
+void PimAcceleratorMemory::reportCore(size_t coreId) {
+  if (!fileReport.is_open())
+    return;
+  llvm::raw_os_ostream os(fileReport);
+  os << "Core " << coreId << " Memory\n";
+  deviceMem.at(coreId).report(os);
+  os.flush();
+}
+
+/// Removes the memory entries of every result of `op` from the host memory and
+/// from each per-core device memory.
+void PimAcceleratorMemory::clean(mlir::Operation* op) {
+  for (auto value : op->getResults()) {
+    hostMem.remove(value);
+    for (auto& device : deviceMem)
+      device.second.remove(value);
+  }
+}
+
 json::Object PimCodeGen::createEmptyOffset() {
   json::Object offset;
   offset["offset_select"] = 0;
@@ -434,8 +552,7 @@ void PimCodeGen::codeGenVSoftmaxOp(pim::PimVSoftmaxOp vsoftmaxOp, const StaticVa
   emitInstruction(std::move(json));
 }
 
-void PimCodeGen::codeGetGlobalOp(memref::GetGlobalOp getGlobalOp, const StaticValueKnowledge& knowledge) const {
-}
+void PimCodeGen::codeGetGlobalOp(memref::GetGlobalOp getGlobalOp, const StaticValueKnowledge& knowledge) const {}
 
 void PimCodeGen::codeGenTransposeOp(pim::PimTransposeOp transposeOp, const StaticValueKnowledge& knowledge) const {
   auto srcAddr = addressOf(transposeOp.getInput(), knowledge);
@@ -524,10 +641,9 @@ static SmallVector getBatchCoreIds(pim::PimCoreBatchOp coreBatchOp) {
 
 static SmallVector collectTopLevelCoreLikeOps(func::FuncOp funcOp) {
   SmallVector coreLikeOps;
-  for (Operation& op : funcOp.getBody().front()) {
+  for (Operation& op : funcOp.getBody().front())
     if (dyn_cast(&op) || dyn_cast(&op))
       coreLikeOps.push_back(&op);
-  }
   return coreLikeOps;
 }
 
@@ -543,10 +659,8 @@ static pim::PimCoreOp materializeScalarCoreFromBatchLane(pim::PimCoreBatchOp cor
     laneWeights.push_back(coreBatchOp.getWeights()[lane * 
weightsPerLane + weightIndex]); auto coreIds = getBatchCoreIds(coreBatchOp); - auto scalarCore = pim::PimCoreOp::create(builder, - coreBatchOp.getLoc(), - ValueRange(laneWeights), - builder.getI32IntegerAttr(coreIds[lane])); + auto scalarCore = pim::PimCoreOp::create( + builder, coreBatchOp.getLoc(), ValueRange(laneWeights), builder.getI32IntegerAttr(coreIds[lane])); Block* block = builder.createBlock(&scalarCore.getBody(), scalarCore.getBody().end()); IRMapping mapper; if (coreBatchOp.getBody().front().getNumArguments() == 1) @@ -569,12 +683,13 @@ static pim::PimCoreOp materializeScalarCoreFromBatchLane(pim::PimCoreBatchOp cor } if (auto receiveBatchOp = dyn_cast(op)) { - auto scalarReceive = pim::PimReceiveOp::create(builder, - receiveBatchOp.getLoc(), - receiveBatchOp.getOutput().getType(), - mapper.lookup(receiveBatchOp.getOutputBuffer()), - receiveBatchOp.getSizeAttr(), - builder.getI32IntegerAttr(receiveBatchOp.getSourceCoreIds()[lane])); + auto scalarReceive = + pim::PimReceiveOp::create(builder, + receiveBatchOp.getLoc(), + receiveBatchOp.getOutput().getType(), + mapper.lookup(receiveBatchOp.getOutputBuffer()), + receiveBatchOp.getSizeAttr(), + builder.getI32IntegerAttr(receiveBatchOp.getSourceCoreIds()[lane])); mapper.map(receiveBatchOp.getOutput(), scalarReceive.getOutput()); continue; } @@ -606,8 +721,10 @@ static pim::PimCoreOp materializeScalarCoreFromBatchLane(pim::PimCoreBatchOp cor return scalarCore; } -static void aliasMaterializedHostGlobals( - ModuleOp moduleOp, func::FuncOp funcOp, pim::PimCoreOp coreOp, PimAcceleratorMemory& memory) { +static void aliasMaterializedHostGlobals(ModuleOp moduleOp, + func::FuncOp funcOp, + pim::PimCoreOp coreOp, + PimAcceleratorMemory& memory) { coreOp.walk([&](memref::GetGlobalOp getGlobalOp) { if (hasWeightAlways(getGlobalOp) || memory.memEntriesMap.contains(getGlobalOp.getResult())) return; @@ -990,6 +1107,7 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std:: PimAcceleratorMemory 
memory;
   memory.hostMem.allocateHost(moduleOp, funcOp);
+  memory.reportHost();
 
   if (auto err = writeMemoryBinary(moduleOp, funcOp, memory, outputDirPath))
     return err;
@@ -1063,6 +1181,7 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std::
       PimCodeGen coreCodeGen(memory, coreFileStream, emittedCoreIds);
       aliasMaterializedHostGlobals(moduleOp, funcOp, coreOp, memory);
       memory.getOrCreateDeviceMem(coreId).allocateCore(coreOp);
+      memory.reportCore(coreId);
 
       int64_t processedOperations = codeGenCoreOps(coreOp.getBody().front(), coreCodeGen);
       if (processedOperations < 0)
@@ -1093,8 +1212,8 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std::
           if (auto error = sys::fs::create_link(outputDirPath + "/weights/" + fileName,
                                                 coreWeightsDirPath + "/crossbar_" + std::to_string(index) + ".bin")) {
             errs() << "Error creating link file: " << (outputDirPath + "/weights/" + fileName) << " to "
-                   << (coreWeightsDirPath + "/crossbar_" + std::to_string(index) + ".bin") << "\nError:"
-                   << error.message() << '\n';
+                   << (coreWeightsDirPath + "/crossbar_" + std::to_string(index) + ".bin")
+                   << "\nError:" << error.message() << '\n';
             return InvalidOutputFileAccess;
           }
         }
@@ -1103,8 +1222,10 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std::
     }
 
     for (pim::PimCoreOp coreOp : scalarCores)
-      if (coreOp.getOperation() != op)
+      if (coreOp.getOperation() != op) {
+        coreOp.walk([&memory](Operation* nestedOp) { memory.clean(nestedOp); });
         coreOp.erase();
+      }
   }
 
   return writeConfigJson(funcOp, memory, maxCoreId, std::move(xbarsPerArrayGroup), outputDirPath);
diff --git a/src/PIM/Compiler/PimCodeGen.hpp b/src/PIM/Compiler/PimCodeGen.hpp
index 38e2c3f..83ca886 100644
--- a/src/PIM/Compiler/PimCodeGen.hpp
+++ b/src/PIM/Compiler/PimCodeGen.hpp
@@ -1,8 +1,12 @@
 #pragma once
 
-#include "llvm/ADT/DenseMap.h"
+#include "mlir/IR/Operation.h"
 #include "llvm-project/clang/include/clang/Basic/LLVM.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/JSON.h"
+#include "llvm/Support/raw_os_ostream.h"
+
+#include <fstream>
 
 #include "onnx-mlir/Compiler/OMCompilerTypes.h"
 #include "src/Accelerators/PIM/Common/PimCommon.hpp"
@@ -34,6 +38,10 @@ public:
 
   void allocateHost(mlir::ModuleOp moduleOp, mlir::func::FuncOp funcOp);
   void allocateCore(mlir::Operation* op);
+  /// Writes a grouped memory usage summary to `os`.
+  void report(llvm::raw_ostream& os);
+  /// Removes the memory entry recorded for `val`, if any.
+  void remove(mlir::Value val);
 
   size_t getFirstAvailableAddress() const { return firstAvailableAddress; }
   MemEntry getMemEntry(mlir::Value value) const;
@@ -46,14 +54,32 @@ public:
 
 private:
   llvm::SmallDenseMap deviceMem;
+  /// Destination of the memory report; stays closed when no output directory is set.
+  std::fstream fileReport;
 
 public:
   PimAcceleratorMemory()
-      : hostMem(memEntriesMap) {}
+      : hostMem(memEntriesMap) {
+    // The report is optional: with no output directory the stream is left
+    // closed and nothing is written.
+    const std::string outputDir = getOutputDir();
+    if (outputDir.empty())
+      return;
+
+    const std::string reportsDir = outputDir + "/reports/";
+    createDirectory(reportsDir);
+    // reportsDir already ends in '/', so the file name is appended directly.
+    fileReport.open(reportsDir + "memory_report.txt", std::ios::out);
+  }
 
   PimMemory& getOrCreateDeviceMem(size_t id);
 
   size_t getValueAddress(mlir::Value value, const StaticValueKnowledge& knowledge = {}) const;
+  /// Write the host / per-core memory report to the report file.
+  void reportHost();
+  void reportCore(size_t coreId);
+  /// Removes the memory entries of every result of `op`.
+  void clean(mlir::Operation* op);
 };
 
 class PimCodeGen {