From 41de3cb150a57c16d07d585e3f5a6d6d936785b4 Mon Sep 17 00:00:00 2001 From: NiccoloN Date: Tue, 12 May 2026 18:17:00 +0200 Subject: [PATCH] add memory coalescing pass better reports refactor for more code-reuse and pattern usage fixes --- README.md | 6 +- .../json_to_instruction/json_to_executor.rs | 4 +- src/PIM/CMakeLists.txt | 1 + src/PIM/Common/CMakeLists.txt | 1 + src/PIM/Common/Support/ReportUtils.cpp | 63 ++++++ src/PIM/Common/Support/ReportUtils.hpp | 48 ++++ src/PIM/Compiler/CMakeLists.txt | 1 + src/PIM/Compiler/PimBatchEmission.cpp | 136 +++++------ src/PIM/Compiler/PimCodeGen.cpp | 139 +++++++----- src/PIM/Compiler/PimCodeGen.hpp | 22 +- src/PIM/Compiler/PimCompilerUtils.cpp | 1 + .../ONNXToSpatial/ONNXToSpatialPass.cpp | 7 +- .../SpatialToPim/SpatialToPimPass.cpp | 213 +----------------- .../SpatialToPim/TensorPackingPatterns.cpp | 107 +++++++++ .../SpatialToPim/TensorPackingPatterns.hpp | 3 + src/PIM/Dialect/Pim/CMakeLists.txt | 1 + .../StaticMemoryCoalescing/CMakeLists.txt | 14 ++ .../StaticMemoryCoalescing.cpp | 172 ++++++++++++++ .../StaticMemoryCoalescing.hpp | 35 +++ .../StaticMemoryCoalescingPass.cpp | 203 +++++++++++++++++ .../MergeComputeNodesPass.cpp | 97 ++++---- src/PIM/Pass/PIMPasses.h | 2 + .../HostConstantFoldingPass.cpp | 2 +- .../MaterializeHostConstantsPass.cpp | 2 +- src/PIM/Pass/PimCodegen/VerificationPass.cpp | 34 +++ src/PIM/PimAccelerator.cpp | 1 + 26 files changed, 930 insertions(+), 385 deletions(-) create mode 100644 src/PIM/Common/Support/ReportUtils.cpp create mode 100644 src/PIM/Common/Support/ReportUtils.hpp create mode 100644 src/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/CMakeLists.txt create mode 100644 src/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/StaticMemoryCoalescing.cpp create mode 100644 src/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/StaticMemoryCoalescing.hpp create mode 100644 src/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/StaticMemoryCoalescingPass.cpp diff --git a/README.md 
b/README.md index 5443fd4..acae64c 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,11 @@ ONNX-MLIR ──► Spatial ──► Pim (tensor) ──► Pim (bufferized) standard MLIR `BufferizableOpInterface` machinery (`OpBufferizationInterfaces.*`, `PimBufferization.td`). -5. **PIM code generation** (`src/PIM/Pass/PimCodegen`): +5. **Static memory coalescing** (`src/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing`). + Conservatively reuses same-typed local memref allocations inside PIM cores + after bufferization and before code generation. + +6. **PIM code generation** (`src/PIM/Pass/PimCodegen`): - `HostConstantFolding` — folds host-side constants. - `MaterializeHostConstantsPass` — materializes the remaining host constants for emission. diff --git a/backend-simulators/pim/pim-simulator/src/lib/json_to_instruction/json_to_executor.rs b/backend-simulators/pim/pim-simulator/src/lib/json_to_instruction/json_to_executor.rs index 805c074..45fdc2c 100644 --- a/backend-simulators/pim/pim-simulator/src/lib/json_to_instruction/json_to_executor.rs +++ b/backend-simulators/pim/pim-simulator/src/lib/json_to_instruction/json_to_executor.rs @@ -15,9 +15,9 @@ use crate::{ }; -pub fn json_to_executor<'a>( +pub fn json_to_executor<'a, 'b>( config: Value, - mut cores: impl Iterator, + mut cores: impl Iterator, crossbars : Vec> ) -> Executable<'a> { let cell_precision = config.get("cell_precision").unwrap().as_i64().unwrap() as i32; diff --git a/src/PIM/CMakeLists.txt b/src/PIM/CMakeLists.txt index 4531b9e..d0f6fa5 100644 --- a/src/PIM/CMakeLists.txt +++ b/src/PIM/CMakeLists.txt @@ -68,5 +68,6 @@ add_pim_library(OMPIMAccel OMSpatialToPim OMPimCommon OMPimBufferization + OMPimStaticMemoryCoalescing MLIRTensorInferTypeOpInterfaceImpl ) diff --git a/src/PIM/Common/CMakeLists.txt b/src/PIM/Common/CMakeLists.txt index 74f52cf..0dce626 100644 --- a/src/PIM/Common/CMakeLists.txt +++ b/src/PIM/Common/CMakeLists.txt @@ -8,6 +8,7 @@ add_pim_library(OMPimCommon Support/DebugDump.cpp 
Support/Diagnostics.cpp Support/FileSystemUtils.cpp + Support/ReportUtils.cpp EXCLUDE_FROM_OM_LIBS diff --git a/src/PIM/Common/Support/ReportUtils.cpp b/src/PIM/Common/Support/ReportUtils.cpp new file mode 100644 index 0000000..49b0e7e --- /dev/null +++ b/src/PIM/Common/Support/ReportUtils.cpp @@ -0,0 +1,63 @@ +#include "src/Accelerators/PIM/Common/Support/ReportUtils.hpp" + +#include "llvm/Support/Format.h" + +#include "src/Accelerators/PIM/Common/Support/FileSystemUtils.hpp" + +namespace onnx_mlir { + +std::fstream openReportFile(const std::string& name) { + std::string outputDir = getOutputDir(); + if (outputDir.empty()) + return {}; + + std::string reportsDir = outputDir + "/reports"; + createDirectory(reportsDir); + return std::fstream(reportsDir + "/" + name + ".txt", std::ios::out); +} + +std::string formatReportMemory(uint64_t bytes) { + const char* units[] = {"B", "KB", "MB", "GB", "TB", "PB", "EB"}; + int i = 0; + double size = static_cast(bytes); + while (size >= 1024 && i < 6) { + size /= 1024; + i++; + } + + std::string out; + llvm::raw_string_ostream rss(out); + rss << llvm::format("%.2f ", size) << units[i]; + return rss.str(); +} + +void printReportFlatFields(llvm::raw_ostream& os, llvm::ArrayRef fields) { + for (const ReportField& field : fields) + os << "\t" << field.label << ": " << field.value << "\n"; +} + +void printReportFieldBlock(llvm::raw_ostream& os, llvm::StringRef title, llvm::ArrayRef fields) { + os << "\t" << title << ":\n"; + for (const ReportField& field : fields) + os << "\t " << field.label << ": " << field.value << "\n"; +} + +void printReportTotalsBlock(llvm::raw_ostream& os, llvm::ArrayRef fields) { + os << "Totals:\n"; + for (const ReportField& field : fields) + os << "\t" << field.label << ": " << field.value << "\n"; +} + +void printReportPerCoreAndTotalFields(llvm::raw_ostream& os, + llvm::ArrayRef perCoreFields, + llvm::ArrayRef totalFields) { + printReportFieldBlock(os, "Per core", perCoreFields); + 
printReportFieldBlock(os, "Total", totalFields); +} + +void printReportEntrySeparator(llvm::raw_ostream& os, bool hasNextEntry) { + if (hasNextEntry) + os << "\n"; +} + +} // namespace onnx_mlir diff --git a/src/PIM/Common/Support/ReportUtils.hpp b/src/PIM/Common/Support/ReportUtils.hpp new file mode 100644 index 0000000..dde9cb3 --- /dev/null +++ b/src/PIM/Common/Support/ReportUtils.hpp @@ -0,0 +1,48 @@ +#pragma once + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include +#include +#include + +namespace onnx_mlir { + +std::fstream openReportFile(const std::string& name); +std::string formatReportMemory(uint64_t bytes); + +struct ReportField { + std::string label; + std::string value; +}; + +void printReportFlatFields(llvm::raw_ostream& os, llvm::ArrayRef fields); +void printReportFieldBlock(llvm::raw_ostream& os, llvm::StringRef title, llvm::ArrayRef fields); +void printReportTotalsBlock(llvm::raw_ostream& os, llvm::ArrayRef fields); +void printReportPerCoreAndTotalFields(llvm::raw_ostream& os, + llvm::ArrayRef perCoreFields, + llvm::ArrayRef totalFields); +void printReportEntrySeparator(llvm::raw_ostream& os, bool hasNextEntry); + +template +int32_t getFirstReportCoreId(const EntryTy& entry) { + if (entry.coreIds.empty()) + return std::numeric_limits::max(); + return entry.coreIds.front(); +} + +template +void sortReportEntriesByFirstCore(EntryRange& entries) { + llvm::stable_sort(entries, [](const auto& lhs, const auto& rhs) { + int32_t lhsFirstCore = getFirstReportCoreId(lhs); + int32_t rhsFirstCore = getFirstReportCoreId(rhs); + if (lhsFirstCore != rhsFirstCore) + return lhsFirstCore < rhsFirstCore; + return lhs.id < rhs.id; + }); +} + +} // namespace onnx_mlir diff --git a/src/PIM/Compiler/CMakeLists.txt b/src/PIM/Compiler/CMakeLists.txt index 5048f67..a578c12 100644 --- a/src/PIM/Compiler/CMakeLists.txt +++ b/src/PIM/Compiler/CMakeLists.txt @@ -29,6 +29,7 @@ 
add_pim_library(OMPimCompilerUtils OMPimCompilerOptions OMPimCommon OMPimBufferization + OMPimStaticMemoryCoalescing OMPimPasses OMONNXToSpatial OMSpatialToPim diff --git a/src/PIM/Compiler/PimBatchEmission.cpp b/src/PIM/Compiler/PimBatchEmission.cpp index 752e79a..656a57b 100644 --- a/src/PIM/Compiler/PimBatchEmission.cpp +++ b/src/PIM/Compiler/PimBatchEmission.cpp @@ -24,6 +24,78 @@ static SmallVector getLaneChunkCoreIds(ArrayRef coreIds, size_ return laneCoreIds; } +static void scalarizeBatchOpsInCore(pim::PimCoreOp scalarCore, size_t laneCount, unsigned lane) { + IRRewriter rewriter(scalarCore.getContext()); + SmallVector batchOps; + scalarCore.walk([&](Operation* op) { + if (isa(op)) { + batchOps.push_back(op); + } + }); + + for (Operation* op : batchOps) { + rewriter.setInsertionPoint(op); + + if (auto sendBatchOp = dyn_cast(op)) { + pim::PimSendOp::create(rewriter, + sendBatchOp.getLoc(), + sendBatchOp.getInput(), + sendBatchOp.getSizeAttr(), + rewriter.getI32IntegerAttr(sendBatchOp.getTargetCoreIds()[lane])); + rewriter.eraseOp(op); + continue; + } + + if (auto sendTensorBatchOp = dyn_cast(op)) { + pim::PimSendTensorOp::create( + rewriter, + sendTensorBatchOp.getLoc(), + sendTensorBatchOp.getInput(), + rewriter.getDenseI32ArrayAttr(getLaneChunkCoreIds(sendTensorBatchOp.getTargetCoreIds(), laneCount, lane))); + rewriter.eraseOp(op); + continue; + } + + if (auto receiveBatchOp = dyn_cast(op)) { + auto scalarReceive = + pim::PimReceiveOp::create(rewriter, + receiveBatchOp.getLoc(), + receiveBatchOp.getOutput().getType(), + receiveBatchOp.getOutputBuffer(), + receiveBatchOp.getSizeAttr(), + rewriter.getI32IntegerAttr(receiveBatchOp.getSourceCoreIds()[lane])); + rewriter.replaceOp(op, scalarReceive->getResults()); + continue; + } + + if (auto receiveTensorBatchOp = dyn_cast(op)) { + auto scalarReceive = pim::PimReceiveTensorOp::create( + rewriter, + receiveTensorBatchOp.getLoc(), + receiveTensorBatchOp.getOutput().getType(), + 
receiveTensorBatchOp.getOutputBuffer(), + rewriter.getDenseI32ArrayAttr(getLaneChunkCoreIds(receiveTensorBatchOp.getSourceCoreIds(), laneCount, lane))); + rewriter.replaceOp(op, scalarReceive->getResults()); + continue; + } + + auto memcpBatchOp = cast(op); + auto scalarCopy = pim::PimMemCopyHostToDevOp::create(rewriter, + memcpBatchOp.getLoc(), + memcpBatchOp.getOutput().getType(), + memcpBatchOp.getDeviceTarget(), + memcpBatchOp.getHostSource(), + memcpBatchOp.getDeviceTargetOffsetAttr(), + memcpBatchOp.getHostSourceOffsetAttr(), + memcpBatchOp.getSizeAttr()); + rewriter.replaceOp(op, scalarCopy->getResults()); + } +} + } // namespace LogicalResult withScalarCoreFromBatchLane(pim::PimCoreBatchOp coreBatchOp, @@ -50,69 +122,6 @@ LogicalResult withScalarCoreFromBatchLane(pim::PimCoreBatchOp coreBatchOp, builder.setInsertionPointToEnd(block); for (Operation& op : coreBatchOp.getBody().front()) { - if (isa(op)) { - pim::PimHaltOp::create(builder, op.getLoc()); - continue; - } - - if (auto sendBatchOp = dyn_cast(op)) { - pim::PimSendOp::create(builder, - sendBatchOp.getLoc(), - mapper.lookup(sendBatchOp.getInput()), - sendBatchOp.getSizeAttr(), - builder.getI32IntegerAttr(sendBatchOp.getTargetCoreIds()[lane])); - continue; - } - - if (auto sendTensorBatchOp = dyn_cast(op)) { - pim::PimSendTensorOp::create( - builder, - sendTensorBatchOp.getLoc(), - mapper.lookup(sendTensorBatchOp.getInput()), - builder.getDenseI32ArrayAttr(getLaneChunkCoreIds(sendTensorBatchOp.getTargetCoreIds(), laneCount, lane))); - continue; - } - - if (auto receiveBatchOp = dyn_cast(op)) { - auto scalarReceive = - pim::PimReceiveOp::create(builder, - receiveBatchOp.getLoc(), - receiveBatchOp.getOutput().getType(), - mapper.lookup(receiveBatchOp.getOutputBuffer()), - receiveBatchOp.getSizeAttr(), - builder.getI32IntegerAttr(receiveBatchOp.getSourceCoreIds()[lane])); - mapper.map(receiveBatchOp.getOutput(), scalarReceive.getOutput()); - continue; - } - - if (auto receiveTensorBatchOp = dyn_cast(op)) 
{ - auto scalarReceive = pim::PimReceiveTensorOp::create( - builder, - receiveTensorBatchOp.getLoc(), - receiveTensorBatchOp.getOutput().getType(), - mapper.lookup(receiveTensorBatchOp.getOutputBuffer()), - builder.getDenseI32ArrayAttr(getLaneChunkCoreIds(receiveTensorBatchOp.getSourceCoreIds(), laneCount, lane))); - mapper.map(receiveTensorBatchOp.getOutput(), scalarReceive.getOutput()); - continue; - } - - if (auto memcpBatchOp = dyn_cast(op)) { - Value hostSource = mapper.lookupOrNull(memcpBatchOp.getHostSource()); - if (!hostSource) - hostSource = memcpBatchOp.getHostSource(); - - auto scalarCopy = pim::PimMemCopyHostToDevOp::create(builder, - memcpBatchOp.getLoc(), - memcpBatchOp.getOutput().getType(), - mapper.lookup(memcpBatchOp.getDeviceTarget()), - hostSource, - memcpBatchOp.getDeviceTargetOffsetAttr(), - memcpBatchOp.getHostSourceOffsetAttr(), - memcpBatchOp.getSizeAttr()); - mapper.map(memcpBatchOp.getOutput(), scalarCopy.getOutput()); - continue; - } - Operation* cloned = builder.clone(op, mapper); for (auto [originalResult, clonedResult] : llvm::zip(op.getResults(), cloned->getResults())) mapper.map(originalResult, clonedResult); @@ -120,6 +129,7 @@ LogicalResult withScalarCoreFromBatchLane(pim::PimCoreBatchOp coreBatchOp, if (block->empty() || !isa(block->back())) pim::PimHaltOp::create(builder, coreBatchOp.getLoc()); + scalarizeBatchOpsInCore(scalarCore, laneCount, lane); return callback(scalarCore); } diff --git a/src/PIM/Compiler/PimCodeGen.cpp b/src/PIM/Compiler/PimCodeGen.cpp index 7bc6409..2e28e4f 100644 --- a/src/PIM/Compiler/PimCodeGen.cpp +++ b/src/PIM/Compiler/PimCodeGen.cpp @@ -26,6 +26,7 @@ #include "Common/IR/CompactAsmUtils.hpp" #include "Common/PimCommon.hpp" +#include "Common/Support/ReportUtils.hpp" #include "Conversion/ONNXToSpatial/Common/Common.hpp" #include "src/Accelerators/PIM/Compiler/PimArtifactWriter.hpp" #include "src/Accelerators/PIM/Compiler/PimBatchEmission.hpp" @@ -65,6 +66,7 @@ void 
PimMemory::allocateMemoryForValue(mlir::Value value, MemEntry& memEntry) { if (size_t remainder = firstAvailableAddress % minAlignment) firstAvailableAddress += minAlignment - remainder; + ownedMemEntriesMap[value] = memEntry; globalMemEntriesMap[value] = memEntry; } @@ -112,26 +114,28 @@ void PimMemory::allocateCore(Operation* op) { allocateGatheredMemory(); } -std::string formatMemory(uint64_t bytes) { - const char* units[] = {"B", "KB", "MB", "GB", "TB", "PB", "EB"}; - int i = 0; - double size = static_cast(bytes); - while (size >= 1024 && i < 6) { - size /= 1024; - i++; - } - // Formats to 2 decimal places - std::string out; - llvm::raw_string_ostream rss(out); - rss << llvm::format("%.2f ", size) << units[i]; - return rss.str(); +static void printHostMemoryReportRow(raw_ostream& os, const MemoryReportRow& row) { + llvm::SmallVector fields = { + {"Number of globals", std::to_string(row.numGlobal)}, + {"Global memory", formatReportMemory(row.sizeGlobal)}}; + printReportFlatFields(os, fields); } -static void printMemoryReportRow(raw_ostream& os, const MemoryReportRow& row) { - os << "\tNumber of allocas: " << row.numAlloca << "\n"; - os << "\tAllocated memory: " << formatMemory(row.sizeAlloca) << "\n"; - os << "\tNumber of globals: " << row.numGlobal << "\n"; - os << "\tGlobal memory: " << formatMemory(row.sizeGlobal) << "\n"; +static void printCoreMemoryReportRow(raw_ostream& os, const MemoryReportEntry& entry) { + llvm::SmallVector fields = { + {"Number of allocas", std::to_string(entry.row.numAlloca)}, + {"Allocated memory", formatReportMemory(entry.row.sizeAlloca)}}; + printReportFlatFields(os, fields); +} + +static void printBatchMemoryReportRow(raw_ostream& os, const MemoryReportEntry& entry) { + llvm::SmallVector perCoreFields = { + {"Number of allocas", std::to_string(entry.row.numAlloca)}, + {"Allocated memory", formatReportMemory(entry.row.sizeAlloca)}}; + llvm::SmallVector totalFields = { + {"Number of allocas", std::to_string(entry.totalAllocaCount)}, 
+ {"Batch memory", formatReportMemory(entry.totalAllocaBytes)}}; + printReportPerCoreAndTotalFields(os, perCoreFields, totalFields); } static MemoryReportRow addMemoryReportRows(const MemoryReportRow& lhs, const MemoryReportRow& rhs) { @@ -145,7 +149,7 @@ static MemoryReportRow addMemoryReportRows(const MemoryReportRow& lhs, const Mem MemoryReportRow PimMemory::getReportRow() const { MemoryReportRow row; - for (auto& [val, memEntry] : globalMemEntriesMap) { + for (auto& [val, memEntry] : ownedMemEntriesMap) { if (auto op = val.getDefiningOp()) { if (isa(op)) { row.numAlloca++; @@ -162,6 +166,8 @@ MemoryReportRow PimMemory::getReportRow() const { } void PimMemory::remove(mlir::Value val) { + if (auto removeIter = ownedMemEntriesMap.find(val); removeIter != ownedMemEntriesMap.end()) + ownedMemEntriesMap.erase(removeIter); if (auto removeIter = globalMemEntriesMap.find(val); removeIter != globalMemEntriesMap.end()) globalMemEntriesMap.erase(removeIter); } @@ -209,15 +215,26 @@ size_t PimAcceleratorMemory::getValueAddress(mlir::Value value, const StaticValu void PimAcceleratorMemory::reportHost() { hostReportRow = hostMem.getReportRow(); } void PimAcceleratorMemory::recordCoreReport(size_t coreId, const MemoryReportRow& row) { - reportEntries.push_back({MemoryReportEntry::Kind::Core, coreId, {static_cast(coreId)}, row}); + reportEntries.push_back({MemoryReportEntry::Kind::Core, + coreId, + {static_cast(coreId)}, + row, + row.numAlloca, + row.sizeAlloca}); } -void PimAcceleratorMemory::recordBatchReport(uint64_t batchId, ArrayRef coreIds, const MemoryReportRow& row) { +void PimAcceleratorMemory::recordBatchReport(uint64_t batchId, + ArrayRef coreIds, + const MemoryReportRow& perCoreRow, + uint64_t totalAllocaCount, + uint64_t totalAllocaBytes) { MemoryReportEntry entry; entry.kind = MemoryReportEntry::Kind::Batch; entry.id = batchId; llvm::append_range(entry.coreIds, coreIds); - entry.row = row; + entry.row = perCoreRow; + entry.totalAllocaCount = totalAllocaCount; + 
entry.totalAllocaBytes = totalAllocaBytes; reportEntries.push_back(std::move(entry)); } @@ -226,36 +243,32 @@ void PimAcceleratorMemory::flushReport() { return; llvm::raw_os_ostream os(fileReport); + uint64_t totalGlobalMemory = hostReportRow.has_value() ? hostReportRow->sizeGlobal : 0; + uint64_t totalCoresMemory = 0; + for (const MemoryReportEntry& entry : reportEntries) + totalCoresMemory += entry.totalAllocaBytes; + + llvm::SmallVector totalFields = { + {"Global memory", formatReportMemory(totalGlobalMemory)}, + {"Cores memory", formatReportMemory(totalCoresMemory)}}; + printReportTotalsBlock(os, totalFields); + if (hostReportRow.has_value()) { - os << "Host:\n"; - printMemoryReportRow(os, *hostReportRow); + os << "\nHost:\n"; + printHostMemoryReportRow(os, *hostReportRow); } if (!reportEntries.empty()) { if (hostReportRow.has_value()) os << "\n"; - - llvm::stable_sort(reportEntries, [](const MemoryReportEntry& lhs, const MemoryReportEntry& rhs) { - if (lhs.kind != rhs.kind) - return lhs.kind == MemoryReportEntry::Kind::Batch; - - const MemoryReportRow& lhsRow = lhs.row; - const MemoryReportRow& rhsRow = rhs.row; - if (lhsRow.sizeAlloca != rhsRow.sizeAlloca) - return lhsRow.sizeAlloca > rhsRow.sizeAlloca; - if (lhsRow.numAlloca != rhsRow.numAlloca) - return lhsRow.numAlloca > rhsRow.numAlloca; - if (lhsRow.sizeGlobal != rhsRow.sizeGlobal) - return lhsRow.sizeGlobal > rhsRow.sizeGlobal; - if (lhsRow.numGlobal != rhsRow.numGlobal) - return lhsRow.numGlobal > rhsRow.numGlobal; - return lhs.id < rhs.id; - }); + sortReportEntriesByFirstCore(reportEntries); for (size_t index = 0; index < reportEntries.size();) { size_t runEnd = index + 1; while (runEnd < reportEntries.size() && reportEntries[runEnd].kind == reportEntries[index].kind - && reportEntries[runEnd].row == reportEntries[index].row) { + && reportEntries[runEnd].row == reportEntries[index].row + && reportEntries[runEnd].totalAllocaCount == reportEntries[index].totalAllocaCount + && 
reportEntries[runEnd].totalAllocaBytes == reportEntries[index].totalAllocaBytes) { ++runEnd; } @@ -277,9 +290,11 @@ void PimAcceleratorMemory::flushReport() { printCompressedIntegerEntries(os, ArrayRef(coreIds)); } os << ":\n"; - printMemoryReportRow(os, reportEntries[index].row); - if (runEnd < reportEntries.size()) - os << "\n"; + if (reportEntries[index].kind == MemoryReportEntry::Kind::Batch) + printBatchMemoryReportRow(os, reportEntries[index]); + else + printCoreMemoryReportRow(os, reportEntries[index]); + printReportEntrySeparator(os, runEnd < reportEntries.size()); index = runEnd; } @@ -876,7 +891,9 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std:: } for (Operation* op : coreLikeOps) { - auto emitCore = [&](pim::PimCoreOp coreOp, bool temporaryCore) -> OnnxMlirCompilerErrorCodes { + auto emitCore = [&](pim::PimCoreOp coreOp, + bool temporaryCore, + MemoryReportRow* reportRow = nullptr) -> OnnxMlirCompilerErrorCodes { size_t originalCoreId = static_cast(coreOp.getCoreId()); size_t coreId = emittedCoreIds.lookup(originalCoreId); maxCoreId = std::max(maxCoreId, coreId); @@ -892,13 +909,17 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std:: PimCodeGen coreCodeGen(memory, coreFileStream, emittedCoreIds); aliasMaterializedHostGlobals(moduleOp, funcOp, coreOp, memory); - memory.getOrCreateDeviceMem(coreId).allocateCore(coreOp); + auto& deviceMemory = memory.getOrCreateDeviceMem(coreId); + deviceMemory.allocateCore(coreOp); int64_t processedOperations = codeGenCoreOps(coreOp.getBody().front(), coreCodeGen); if (processedOperations < 0) return CompilerFailure; assert(processedOperations > 0); + if (reportRow) + *reportRow = deviceMemory.getReportRow(); + coreFileStream.seek(coreFileStream.tell() - 1); coreFileStream << ']'; coreFileStream.close(); @@ -936,11 +957,10 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std:: }; if (auto coreOp = dyn_cast(op)) { - if (auto err = 
emitCore(coreOp, false)) + MemoryReportRow coreRow; + if (auto err = emitCore(coreOp, false, &coreRow)) return err; - memory.recordCoreReport( - emittedCoreIds.lookup(static_cast(coreOp.getCoreId())), - memory.getOrCreateDeviceMem(emittedCoreIds.lookup(static_cast(coreOp.getCoreId()))).getReportRow()); + memory.recordCoreReport(emittedCoreIds.lookup(static_cast(coreOp.getCoreId())), coreRow); continue; } @@ -949,20 +969,29 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std:: SmallVector reportedCoreIds; reportedCoreIds.reserve(batchCoreIds.size()); MemoryReportRow batchRow; + std::optional batchPerCoreRow; for (unsigned lane = 0; lane < static_cast(coreBatchOp.getLaneCount()); ++lane) { OnnxMlirCompilerErrorCodes laneResult = CompilerSuccess; if (failed(withScalarCoreFromBatchLane(coreBatchOp, lane, [&](pim::PimCoreOp coreOp) { size_t originalCoreId = static_cast(batchCoreIds[lane]); size_t coreId = emittedCoreIds.lookup(originalCoreId); reportedCoreIds.push_back(static_cast(coreId)); - laneResult = emitCore(coreOp, true); - if (laneResult == CompilerSuccess) - batchRow = addMemoryReportRows(batchRow, memory.getOrCreateDeviceMem(coreId).getReportRow()); + MemoryReportRow laneRow; + laneResult = emitCore(coreOp, true, &laneRow); + if (laneResult == CompilerSuccess) { + if (!batchPerCoreRow.has_value()) + batchPerCoreRow = laneRow; + batchRow = addMemoryReportRows(batchRow, laneRow); + } return laneResult == CompilerSuccess ? success() : failure(); }))) return laneResult == CompilerSuccess ? 
CompilerFailure : laneResult; } - memory.recordBatchReport(nextBatchReportId++, reportedCoreIds, batchRow); + memory.recordBatchReport(nextBatchReportId++, + reportedCoreIds, + batchPerCoreRow.value_or(MemoryReportRow {}), + batchRow.numAlloca, + batchRow.sizeAlloca); } memory.flushReport(); diff --git a/src/PIM/Compiler/PimCodeGen.hpp b/src/PIM/Compiler/PimCodeGen.hpp index 42aa656..895f9de 100644 --- a/src/PIM/Compiler/PimCodeGen.hpp +++ b/src/PIM/Compiler/PimCodeGen.hpp @@ -12,6 +12,7 @@ #include "onnx-mlir/Compiler/OMCompilerTypes.h" #include "src/Accelerators/PIM/Common/PimCommon.hpp" +#include "src/Accelerators/PIM/Common/Support/ReportUtils.hpp" #include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp" namespace onnx_mlir { @@ -43,11 +44,14 @@ struct MemoryReportEntry { uint64_t id = 0; llvm::SmallVector coreIds; MemoryReportRow row; + uint64_t totalAllocaCount = 0; + uint64_t totalAllocaBytes = 0; }; class PimMemory { llvm::SmallVector, 32> memEntries; llvm::SmallDenseMap& globalMemEntriesMap; + llvm::SmallDenseMap ownedMemEntriesMap; size_t minAlignment = 4; size_t firstAvailableAddress = 0; @@ -82,24 +86,18 @@ private: public: PimAcceleratorMemory() - : hostMem(memEntriesMap) { - - std::string outputDir = getOutputDir(); - if (outputDir.empty()) - return; - - std::string dialectsDir = outputDir + "/reports/"; - createDirectory(dialectsDir); - std::fstream file(dialectsDir + "/memory_report.txt", std::ios::out); - fileReport = std::move(file); - } + : hostMem(memEntriesMap), fileReport(openReportFile("memory_report")) {} PimMemory& getOrCreateDeviceMem(size_t id); size_t getValueAddress(mlir::Value value, const StaticValueKnowledge& knowledge = {}) const; void reportHost(); void recordCoreReport(size_t coreId, const MemoryReportRow& row); - void recordBatchReport(uint64_t batchId, llvm::ArrayRef coreIds, const MemoryReportRow& row); + void recordBatchReport(uint64_t batchId, + llvm::ArrayRef coreIds, + const MemoryReportRow& perCoreRow, + uint64_t 
totalAllocaCount, + uint64_t totalAllocaBytes); void flushReport(); void clean(mlir::Operation* op); }; diff --git a/src/PIM/Compiler/PimCompilerUtils.cpp b/src/PIM/Compiler/PimCompilerUtils.cpp index f12113d..73ce20c 100644 --- a/src/PIM/Compiler/PimCompilerUtils.cpp +++ b/src/PIM/Compiler/PimCompilerUtils.cpp @@ -41,6 +41,7 @@ void addPassesPim(OwningOpRef& module, if (pimEmissionTarget >= EmitPimBufferized) { pm.addPass(createPimBufferizationPass()); + pm.addPass(createPimStaticMemoryCoalescingPass()); // pm.addPass(createCountInstructionPass()); pm.addPass(createMessagePass("Pim bufferized")); } diff --git a/src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPass.cpp b/src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPass.cpp index 12e8010..efa8ec6 100644 --- a/src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPass.cpp +++ b/src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPass.cpp @@ -94,7 +94,7 @@ void ONNXToSpatialPass::runOnOperation() { RewritePatternSet prePatterns(ctx); populatePrePatterns(prePatterns, ctx); if (failed(applyPatternsGreedily(moduleOp, std::move(prePatterns)))) - llvm::dbgs() << "Failed to apply pre-patterns, continuing...\n"; + moduleOp.emitWarning("failed to apply ONNX-to-Spatial pre-patterns; continuing"); auto entryFunc = getPimEntryFunc(moduleOp); if (failed(entryFunc)) { @@ -148,7 +148,8 @@ void ONNXToSpatialPass::runOnOperation() { computeOpsCount++; if (computeOpsCount > coresCount) { - llvm::dbgs() << "Number of compute ops exceeds the core count\n"; + entryFunc->emitError() << "number of compute ops (" << computeOpsCount << ") exceeds the core count (" + << coresCount << ")"; signalPassFailure(); return; } @@ -157,7 +158,7 @@ void ONNXToSpatialPass::runOnOperation() { PassManager cleanupPM(ctx); cleanupPM.addPass(createCanonicalizerPass()); if (failed(cleanupPM.run(moduleOp))) - llvm::dbgs() << "Failed to run canonicalization cleanup, continuing...\n"; + moduleOp.emitWarning("failed to run ONNX-to-Spatial canonicalization cleanup; continuing"); 
annotateWeightsConstants(*entryFunc); diff --git a/src/PIM/Conversion/SpatialToPim/SpatialToPimPass.cpp b/src/PIM/Conversion/SpatialToPim/SpatialToPimPass.cpp index d5508e5..9885733 100644 --- a/src/PIM/Conversion/SpatialToPim/SpatialToPimPass.cpp +++ b/src/PIM/Conversion/SpatialToPim/SpatialToPimPass.cpp @@ -70,85 +70,6 @@ private: } // namespace -static int32_t translateSpatialCoreIdToPimCoreId(size_t spatialCoreId) { return static_cast(spatialCoreId); } - -static void lowerChannelSend(spatial::SpatChannelSendOp sendOp, IRRewriter& rewriter) { - auto sizeAttr = getTensorSizeInBytesAttr(rewriter, sendOp.getInput()); - auto targetCoreIdAttr = rewriter.getI32IntegerAttr(translateSpatialCoreIdToPimCoreId(sendOp.getTargetCoreId())); - - rewriter.setInsertionPoint(sendOp); - PimSendOp::create(rewriter, sendOp.getLoc(), sendOp.getInput(), sizeAttr, targetCoreIdAttr); - rewriter.eraseOp(sendOp); -} - -static void lowerChannelReceive(spatial::SpatChannelReceiveOp receiveOp, IRRewriter& rewriter) { - if (receiveOp->use_empty()) { - rewriter.eraseOp(receiveOp); - return; - } - - auto outputType = cast(receiveOp.getResult().getType()); - rewriter.setInsertionPoint(receiveOp); - auto outputBuffer = createEmptyTensorFromShaped(rewriter, receiveOp.getLoc(), outputType); - auto sizeAttr = getTensorSizeInBytesAttr(rewriter, receiveOp.getResult()); - auto sourceCoreIdAttr = rewriter.getI32IntegerAttr(translateSpatialCoreIdToPimCoreId(receiveOp.getSourceCoreId())); - - Value received = - PimReceiveOp::create(rewriter, receiveOp.getLoc(), outputBuffer.getType(), outputBuffer, sizeAttr, sourceCoreIdAttr) - .getOutput(); - rewriter.replaceOp(receiveOp, received); -} - -static void lowerChannelSendTensor(spatial::SpatChannelSendTensorOp sendTensorOp, IRRewriter& rewriter) { - SmallVector targetCoreIds; - targetCoreIds.reserve(sendTensorOp.getTargetCoreIds().size()); - for (int32_t targetCoreId : sendTensorOp.getTargetCoreIds()) - 
targetCoreIds.push_back(translateSpatialCoreIdToPimCoreId(targetCoreId)); - - rewriter.setInsertionPoint(sendTensorOp); - PimSendTensorOp::create( - rewriter, sendTensorOp.getLoc(), sendTensorOp.getInput(), rewriter.getDenseI32ArrayAttr(targetCoreIds)); - rewriter.eraseOp(sendTensorOp); -} - -static void lowerChannelReceiveTensor(spatial::SpatChannelReceiveTensorOp receiveTensorOp, IRRewriter& rewriter) { - SmallVector sourceCoreIds; - sourceCoreIds.reserve(receiveTensorOp.getSourceCoreIds().size()); - for (int32_t sourceCoreId : receiveTensorOp.getSourceCoreIds()) - sourceCoreIds.push_back(translateSpatialCoreIdToPimCoreId(sourceCoreId)); - - rewriter.setInsertionPoint(receiveTensorOp); - auto outputType = cast(receiveTensorOp.getOutput().getType()); - Value outputBuffer = createEmptyTensorFromShaped(rewriter, receiveTensorOp.getLoc(), outputType).getResult(); - Value received = PimReceiveTensorOp::create(rewriter, - receiveTensorOp.getLoc(), - receiveTensorOp.getOutput().getType(), - outputBuffer, - rewriter.getDenseI32ArrayAttr(sourceCoreIds)) - .getOutput(); - rewriter.replaceOp(receiveTensorOp, received); -} - -static void lowerExtractRows(spatial::SpatExtractRowsOp extractRowsOp, IRRewriter& rewriter) { - rewriter.setInsertionPoint(extractRowsOp); - auto inputType = cast(extractRowsOp.getInput().getType()); - SmallVector replacements; - replacements.reserve(extractRowsOp.getNumResults()); - for (auto [rowIndex, output] : llvm::enumerate(extractRowsOp.getOutputs())) { - auto outputType = cast(output.getType()); - SmallVector offsets = { - rewriter.getIndexAttr(static_cast(rowIndex) * outputType.getDimSize(0)), rewriter.getIndexAttr(0)}; - SmallVector sizes = {rewriter.getIndexAttr(outputType.getDimSize(0)), - rewriter.getIndexAttr(inputType.getDimSize(1))}; - SmallVector strides = {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)}; - replacements.push_back( - tensor::ExtractSliceOp::create( - rewriter, extractRowsOp.getLoc(), outputType, 
extractRowsOp.getInput(), offsets, sizes, strides) - .getResult()); - } - rewriter.replaceOp(extractRowsOp, replacements); -} - static memref::GlobalOp getOrCreateZeroGlobal(IRRewriter& rewriter, Location loc, RankedTensorType tensorType) { auto moduleOp = rewriter.getBlock()->getParentOp()->getParentOfType(); auto memRefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType()); @@ -216,97 +137,6 @@ static Value padHVectorInputToCrossbarSize(IRRewriter& rewriter, Location loc, V return PimMemCopyOp::create(rewriter, loc, paddedType, zeroed, vector, zeroAttr, zeroAttr, sizeAttr).getOutput(); } -static void compactSpatialTensorGroups(func::FuncOp funcOp, IRRewriter& rewriter) { - SmallVector concatOps; - funcOp.walk([&](spatial::SpatConcatOp concatOp) { concatOps.push_back(concatOp); }); - for (auto concatOp : concatOps) { - if (concatOp.getAxis() != 0 || concatOp.getInputs().empty()) - continue; - - SmallVector packedInputs; - bool changed = false; - rewriter.setInsertionPoint(concatOp); - - for (unsigned index = 0; index < concatOp.getInputs().size();) { - Value input = concatOp.getInputs()[index]; - - if (input.getDefiningOp()) { - unsigned endIndex = index + 1; - while (endIndex < concatOp.getInputs().size() - && concatOp.getInputs()[endIndex].getDefiningOp()) - ++endIndex; - - Value packedInput = createPackedExtractSliceTensor( - concatOp.getInputs().slice(index, endIndex - index), rewriter, concatOp.getLoc()); - if (packedInput) { - packedInputs.push_back(packedInput); - changed = true; - index = endIndex; - continue; - } - } - - auto result = dyn_cast(input); - if (!result) { - packedInputs.push_back(input); - ++index; - continue; - } - - Operation* owner = result.getOwner(); - unsigned startIndex = result.getResultNumber(); - unsigned endIndex = index + 1; - while (endIndex < concatOp.getInputs().size()) { - auto nextResult = dyn_cast(concatOp.getInputs()[endIndex]); - if (!nextResult || nextResult.getOwner() != owner - || 
nextResult.getResultNumber() != startIndex + (endIndex - index)) - break; - ++endIndex; - } - - unsigned count = endIndex - index; - Value packedInput; - if (auto extractRowsOp = dyn_cast(owner)) - packedInput = createPackedExtractRowsSlice(extractRowsOp, startIndex, count, rewriter, concatOp.getLoc()); - - if (packedInput) { - packedInputs.push_back(packedInput); - changed = true; - } - else { - for (unsigned oldIndex = index; oldIndex < endIndex; ++oldIndex) - packedInputs.push_back(concatOp.getInputs()[oldIndex]); - } - - index = endIndex; - } - - if (!changed) - continue; - - auto newConcat = pim::PimConcatOp::create( - rewriter, - concatOp.getLoc(), - concatOp.getOutput().getType(), - concatOp.getAxisAttr(), - ValueRange(packedInputs), - createEmptyTensorFromShaped(rewriter, concatOp.getLoc(), cast(concatOp.getOutput().getType())) - .getResult()); - rewriter.replaceOp(concatOp, newConcat.getOutput()); - } - auto eraseUnusedOps = [&](auto tag) { - using OpTy = decltype(tag); - SmallVector ops; - funcOp.walk([&](OpTy op) { ops.push_back(op); }); - for (auto op : llvm::reverse(ops)) - if (op->use_empty()) - rewriter.eraseOp(op); - }; - eraseUnusedOps(tensor::ConcatOp {}); - eraseUnusedOps(tensor::ExtractSliceOp {}); - eraseUnusedOps(spatial::SpatExtractRowsOp {}); -} - void SpatialToPimPass::runOnOperation() { coreId = 1; ModuleOp moduleOp = getOperation(); @@ -380,7 +210,12 @@ void SpatialToPimPass::runOnOperation() { } } - compactSpatialTensorGroups(funcOp, rewriter); + { + RewritePatternSet patterns(ctx); + populateTensorPackingPatterns(patterns); + walkAndApplyPatterns(funcOp, std::move(patterns)); + eraseUnusedTensorPackingOps(funcOp, rewriter); + } SmallVector receiveOps; for (auto op : funcOp.getOps()) @@ -392,37 +227,8 @@ void SpatialToPimPass::runOnOperation() { markOpToRemove(receiveOp); continue; } - if (receiveOp->use_empty()) { - rewriter.eraseOp(receiveOp); - continue; - } - lowerChannelReceive(receiveOp, rewriter); } - SmallVector receiveTensorOps; 
- for (auto op : funcOp.getOps()) - receiveTensorOps.push_back(op); - for (auto receiveTensorOp : receiveTensorOps) - lowerChannelReceiveTensor(receiveTensorOp, rewriter); - - SmallVector sendOps; - for (auto op : funcOp.getOps()) - sendOps.push_back(op); - for (auto sendOp : sendOps) - lowerChannelSend(sendOp, rewriter); - - SmallVector sendTensorOps; - for (auto op : funcOp.getOps()) - sendTensorOps.push_back(op); - for (auto sendTensorOp : sendTensorOps) - lowerChannelSendTensor(sendTensorOp, rewriter); - - SmallVector extractRowsOps; - for (auto op : funcOp.getOps()) - extractRowsOps.push_back(op); - for (auto extractRowsOp : extractRowsOps) - lowerExtractRows(extractRowsOp, rewriter); - { RewritePatternSet coreBodyPatterns(ctx); populateWithGenerated(coreBodyPatterns); @@ -457,7 +263,12 @@ void SpatialToPimPass::runOnOperation() { return; } - compactSpatialTensorGroups(funcOp, rewriter); + { + RewritePatternSet patterns(ctx); + populateTensorPackingPatterns(patterns); + walkAndApplyPatterns(funcOp, std::move(patterns)); + eraseUnusedTensorPackingOps(funcOp, rewriter); + } { ConversionTarget communicationTarget(*ctx); diff --git a/src/PIM/Conversion/SpatialToPim/TensorPackingPatterns.cpp b/src/PIM/Conversion/SpatialToPim/TensorPackingPatterns.cpp index 2dd3043..651914d 100644 --- a/src/PIM/Conversion/SpatialToPim/TensorPackingPatterns.cpp +++ b/src/PIM/Conversion/SpatialToPim/TensorPackingPatterns.cpp @@ -1,8 +1,96 @@ #include "src/Accelerators/PIM/Conversion/SpatialToPim/TensorPackingPatterns.hpp" +#include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp" + using namespace mlir; namespace onnx_mlir { +namespace { + +struct PackSpatialConcatInputsPattern final : OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(spatial::SpatConcatOp concatOp, PatternRewriter& rewriter) const override { + if (concatOp.getAxis() != 0 || concatOp.getInputs().empty()) + return failure(); + + SmallVector packedInputs; + bool changed = 
false; + + for (unsigned index = 0; index < concatOp.getInputs().size();) { + Value input = concatOp.getInputs()[index]; + + if (input.getDefiningOp()) { + unsigned endIndex = index + 1; + while (endIndex < concatOp.getInputs().size() + && concatOp.getInputs()[endIndex].getDefiningOp()) + ++endIndex; + + Value packedInput = createPackedExtractSliceTensor( + concatOp.getInputs().slice(index, endIndex - index), rewriter, concatOp.getLoc()); + if (packedInput) { + packedInputs.push_back(packedInput); + changed = true; + index = endIndex; + continue; + } + } + + auto result = dyn_cast(input); + if (!result) { + packedInputs.push_back(input); + ++index; + continue; + } + + Operation* owner = result.getOwner(); + unsigned startIndex = result.getResultNumber(); + unsigned endIndex = index + 1; + while (endIndex < concatOp.getInputs().size()) { + auto nextResult = dyn_cast(concatOp.getInputs()[endIndex]); + if (!nextResult || nextResult.getOwner() != owner + || nextResult.getResultNumber() != startIndex + (endIndex - index)) + break; + ++endIndex; + } + + unsigned count = endIndex - index; + Value packedInput; + if (auto extractRowsOp = dyn_cast(owner)) + packedInput = createPackedExtractRowsSlice(extractRowsOp, startIndex, count, rewriter, concatOp.getLoc()); + + if (packedInput) { + packedInputs.push_back(packedInput); + changed = true; + } + else { + for (unsigned oldIndex = index; oldIndex < endIndex; ++oldIndex) + packedInputs.push_back(concatOp.getInputs()[oldIndex]); + } + + index = endIndex; + } + + if (!changed) + return failure(); + + auto outputType = cast(concatOp.getOutput().getType()); + auto newConcat = pim::PimConcatOp::create(rewriter, + concatOp.getLoc(), + concatOp.getOutput().getType(), + concatOp.getAxisAttr(), + ValueRange(packedInputs), + tensor::EmptyOp::create(rewriter, + concatOp.getLoc(), + outputType.getShape(), + outputType.getElementType()) + .getResult()); + rewriter.replaceOp(concatOp, newConcat.getOutput()); + return success(); + } +}; + +} 
// namespace RankedTensorType getPackedTensorType(RankedTensorType elementType, int64_t count) { SmallVector packedShape(elementType.getShape().begin(), elementType.getShape().end()); @@ -146,4 +234,23 @@ Value createPackedExtractSliceTensor(ValueRange values, OpBuilder& builder, Loca return tensor::ExtractSliceOp::create(builder, loc, packedType, firstSliceOp.getSource(), offsets, sizes, strides) .getResult(); } + +void populateTensorPackingPatterns(RewritePatternSet& patterns) { + patterns.add(patterns.getContext()); +} + +void eraseUnusedTensorPackingOps(func::FuncOp funcOp, IRRewriter& rewriter) { + auto eraseUnusedOps = [&](auto tag) { + using OpTy = decltype(tag); + SmallVector ops; + funcOp.walk([&](OpTy op) { ops.push_back(op); }); + for (auto op : llvm::reverse(ops)) + if (op->use_empty()) + rewriter.eraseOp(op); + }; + eraseUnusedOps(tensor::ConcatOp {}); + eraseUnusedOps(tensor::ExtractSliceOp {}); + eraseUnusedOps(spatial::SpatExtractRowsOp {}); +} + } // namespace onnx_mlir diff --git a/src/PIM/Conversion/SpatialToPim/TensorPackingPatterns.hpp b/src/PIM/Conversion/SpatialToPim/TensorPackingPatterns.hpp index f338514..34feea6 100644 --- a/src/PIM/Conversion/SpatialToPim/TensorPackingPatterns.hpp +++ b/src/PIM/Conversion/SpatialToPim/TensorPackingPatterns.hpp @@ -1,6 +1,7 @@ #pragma once #include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/IR/PatternMatch.h" #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp" @@ -19,5 +20,7 @@ mlir::Value createPackedExtractRowsSlice(spatial::SpatExtractRowsOp extractRowsO mlir::OpBuilder& builder, mlir::Location loc); mlir::Value createPackedExtractSliceTensor(mlir::ValueRange values, mlir::OpBuilder& builder, mlir::Location loc); +void populateTensorPackingPatterns(mlir::RewritePatternSet& patterns); +void eraseUnusedTensorPackingOps(mlir::func::FuncOp funcOp, mlir::IRRewriter& rewriter); } // namespace onnx_mlir diff --git a/src/PIM/Dialect/Pim/CMakeLists.txt 
b/src/PIM/Dialect/Pim/CMakeLists.txt index 26e9974..2ad22ca 100644 --- a/src/PIM/Dialect/Pim/CMakeLists.txt +++ b/src/PIM/Dialect/Pim/CMakeLists.txt @@ -2,6 +2,7 @@ add_onnx_mlir_dialect(Pim pim) add_onnx_mlir_dialect_doc(pim Pim.td) add_subdirectory(Transforms/Bufferization) +add_subdirectory(Transforms/StaticMemoryCoalescing) add_pim_library(PimOps PimOps.hpp diff --git a/src/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/CMakeLists.txt b/src/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/CMakeLists.txt new file mode 100644 index 0000000..916b12f --- /dev/null +++ b/src/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/CMakeLists.txt @@ -0,0 +1,14 @@ +add_pim_library(OMPimStaticMemoryCoalescing + StaticMemoryCoalescing.cpp + StaticMemoryCoalescing.hpp + StaticMemoryCoalescingPass.cpp + + EXCLUDE_FROM_OM_LIBS + + INCLUDE_DIRS PUBLIC + ${PIM_PUBLIC_INCLUDE_DIRS} + + LINK_LIBS PUBLIC + OMPimCommon + PimOps +) diff --git a/src/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/StaticMemoryCoalescing.cpp b/src/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/StaticMemoryCoalescing.cpp new file mode 100644 index 0000000..46be194 --- /dev/null +++ b/src/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/StaticMemoryCoalescing.cpp @@ -0,0 +1,172 @@ +#include "src/Accelerators/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/StaticMemoryCoalescing.hpp" + +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Interfaces/DestinationStyleOpInterface.h" + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" + +#include + +using namespace mlir; + +namespace onnx_mlir { +namespace pim { + +namespace { + +static bool isSupportedAliasOp(Operation* op) { + return isa(op); +} + +static bool isCandidateAllocType(MemRefType type) { + return type && type.hasStaticShape() && type.getLayout().isIdentity() && type.getElementTypeBitWidth() > 0; +} + +static uint64_t getTypeSizeBytes(MemRefType 
type) { + return static_cast(type.getNumElements() * type.getElementTypeBitWidth() / 8); +} + +static FailureOr getLastUseInstruction(memref::AllocOp allocOp, + Block& body, + const DenseMap& opOrder) { + uint64_t endInstruction = opOrder.lookup(allocOp); + SmallPtrSet visited; + SmallVector pendingValues; + pendingValues.push_back(allocOp.getResult()); + + while (!pendingValues.empty()) { + Value value = pendingValues.pop_back_val(); + for (Operation* user : value.getUsers()) { + if (user->getBlock() != &body) + return failure(); + if (!visited.insert(user).second) + continue; + + if (isSupportedAliasOp(user)) { + for (Value result : user->getResults()) + pendingValues.push_back(result); + } + + if (auto dpsOp = dyn_cast(user)) { + for (OpResult result : user->getResults()) { + OpOperand* tiedOperand = dpsOp.getTiedOpOperand(result); + if (!tiedOperand || tiedOperand->get() != value) + continue; + pendingValues.push_back(result); + } + } + + auto order = opOrder.find(user); + if (order == opOrder.end()) + return failure(); + endInstruction = std::max(endInstruction, order->second); + } + } + + return endInstruction; +} + +} // namespace + +StaticMemoryCoalescingAnalysis analyzeStaticMemoryCoalescingCandidates(Operation* coreLikeOp) { + StaticMemoryCoalescingAnalysis analysis; + if (!coreLikeOp || coreLikeOp->getNumRegions() != 1 || coreLikeOp->getRegion(0).empty()) + return analysis; + + Block& body = coreLikeOp->getRegion(0).front(); + DenseMap opOrder; + uint64_t nextInstruction = 0; + for (Operation& op : body) + opOrder.try_emplace(&op, nextInstruction++); + + for (Operation& op : body) { + auto allocOp = dyn_cast(&op); + if (!allocOp) + continue; + + auto allocType = dyn_cast(allocOp.getType()); + if (!isCandidateAllocType(allocType)) { + ++analysis.skippedAllocations; + continue; + } + + auto endInstruction = getLastUseInstruction(allocOp, body, opOrder); + if (failed(endInstruction)) { + ++analysis.skippedAllocations; + continue; + } + + 
analysis.candidates.push_back( + StaticAllocationCandidate {allocOp, opOrder.lookup(allocOp), *endInstruction, getTypeSizeBytes(allocType)}); + } + + return analysis; +} + +StaticMemoryCoalescingStats coalesceStaticMemory(Operation* coreLikeOp, RewriterBase& rewriter) { + StaticMemoryCoalescingStats stats; + auto analysis = analyzeStaticMemoryCoalescingCandidates(coreLikeOp); + stats.skippedAllocations = analysis.skippedAllocations; + + llvm::sort(analysis.candidates, [](const StaticAllocationCandidate& lhs, const StaticAllocationCandidate& rhs) { + if (lhs.startInstruction != rhs.startInstruction) + return lhs.startInstruction < rhs.startInstruction; + return lhs.endInstruction < rhs.endInstruction; + }); + + struct ActiveStorage { + memref::AllocOp root; + uint64_t endInstruction = 0; + }; + + SmallVector active; + SmallVector freeList; + + for (StaticAllocationCandidate& candidate : analysis.candidates) { + for (auto it = active.begin(); it != active.end();) { + if (it->endInstruction < candidate.startInstruction) { + freeList.push_back(it->root); + it = active.erase(it); + continue; + } + ++it; + } + + auto bestFit = freeList.end(); + uint64_t bestFitBytes = std::numeric_limits::max(); + auto candidateType = cast(candidate.alloc.getType()); + for (auto it = freeList.begin(); it != freeList.end(); ++it) { + auto freeType = cast((*it).getType()); + if (freeType != candidateType) + continue; + + uint64_t freeBytes = getTypeSizeBytes(freeType); + if (freeBytes < candidate.sizeBytes || freeBytes >= bestFitBytes) + continue; + + bestFit = it; + bestFitBytes = freeBytes; + } + + if (bestFit == freeList.end()) { + active.push_back(ActiveStorage {candidate.alloc, candidate.endInstruction}); + continue; + } + + memref::AllocOp root = *bestFit; + freeList.erase(bestFit); + candidate.alloc.getResult().replaceAllUsesWith(root.getResult()); + rewriter.eraseOp(candidate.alloc); + active.push_back(ActiveStorage {root, candidate.endInstruction}); + ++stats.removedAllocs; + 
stats.savedBytes += candidate.sizeBytes; + } + + return stats; +} + +} // namespace pim +} // namespace onnx_mlir diff --git a/src/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/StaticMemoryCoalescing.hpp b/src/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/StaticMemoryCoalescing.hpp new file mode 100644 index 0000000..62938ee --- /dev/null +++ b/src/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/StaticMemoryCoalescing.hpp @@ -0,0 +1,35 @@ +#pragma once + +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/Operation.h" + +#include "llvm/ADT/SmallVector.h" + +namespace onnx_mlir { +namespace pim { + +struct StaticAllocationCandidate { + mlir::memref::AllocOp alloc; + uint64_t startInstruction = 0; + uint64_t endInstruction = 0; + uint64_t sizeBytes = 0; +}; + +struct StaticMemoryCoalescingAnalysis { + llvm::SmallVector candidates; + uint64_t skippedAllocations = 0; +}; + +struct StaticMemoryCoalescingStats { + uint64_t removedAllocs = 0; + uint64_t savedBytes = 0; + uint64_t skippedAllocations = 0; +}; + +StaticMemoryCoalescingAnalysis analyzeStaticMemoryCoalescingCandidates(mlir::Operation* coreLikeOp); + +StaticMemoryCoalescingStats coalesceStaticMemory(mlir::Operation* coreLikeOp, mlir::RewriterBase& rewriter); + +} // namespace pim +} // namespace onnx_mlir diff --git a/src/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/StaticMemoryCoalescingPass.cpp b/src/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/StaticMemoryCoalescingPass.cpp new file mode 100644 index 0000000..b0d4516 --- /dev/null +++ b/src/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/StaticMemoryCoalescingPass.cpp @@ -0,0 +1,203 @@ +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/raw_os_ostream.h" + +#include + +#include "Common/IR/CompactAsmUtils.hpp" +#include "src/Accelerators/PIM/Common/PimCommon.hpp" +#include 
"src/Accelerators/PIM/Common/Support/DebugDump.hpp" +#include "src/Accelerators/PIM/Common/Support/ReportUtils.hpp" +#include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp" +#include "src/Accelerators/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/StaticMemoryCoalescing.hpp" +#include "src/Accelerators/PIM/Pass/PIMPasses.h" + +using namespace mlir; +using namespace onnx_mlir::compact_asm; + +namespace onnx_mlir { +namespace { + +struct CoalescingReportRow { + uint64_t numCandidates = 0; + uint64_t numSkipped = 0; + uint64_t numRemoved = 0; + uint64_t savedBytes = 0; + + bool operator==(const CoalescingReportRow& other) const { + return numCandidates == other.numCandidates && numSkipped == other.numSkipped && numRemoved == other.numRemoved + && savedBytes == other.savedBytes; + } +}; + +struct CoalescingReportEntry { + enum class Kind { + Core, + Batch + }; + + Kind kind = Kind::Core; + uint64_t id = 0; + llvm::SmallVector coreIds; + CoalescingReportRow row; +}; + +static std::string formatMemory(uint64_t bytes) { + return formatReportMemory(bytes); +} + +static SmallVector getBatchCoreIds(pim::PimCoreBatchOp coreBatchOp) { + auto coreIdsAttr = coreBatchOp->getAttrOfType(onnx_mlir::kCoreIdsAttrName); + assert(coreIdsAttr && "pim.core_batch requires coreIds array attribute"); + return SmallVector(coreIdsAttr.asArrayRef().begin(), coreIdsAttr.asArrayRef().end()); +} + +static void printReportRow(raw_ostream& os, const CoalescingReportRow& row) { + llvm::SmallVector fields = { + {"Number of candidates", std::to_string(row.numCandidates)}, + {"Skipped allocations", std::to_string(row.numSkipped)}, + {"Removed allocations", std::to_string(row.numRemoved)}, + {"Saved memory", formatMemory(row.savedBytes)}}; + printReportFlatFields(os, fields); +} + +static CoalescingReportRow getTotalRow(const CoalescingReportEntry& entry) { + uint64_t factor = std::max(1, entry.coreIds.size()); + return {entry.row.numCandidates * factor, + entry.row.numSkipped * factor, + 
entry.row.numRemoved * factor, + entry.row.savedBytes * factor}; +} + +static void emitReport(ArrayRef entries) { + std::fstream file = openReportFile("static_memory_coalescing_report"); + if (!file.is_open()) + return; + + llvm::raw_os_ostream os(file); + CoalescingReportRow totalRow; + for (const CoalescingReportEntry& entry : entries) { + CoalescingReportRow entryTotal = getTotalRow(entry); + totalRow.numCandidates += entryTotal.numCandidates; + totalRow.numSkipped += entryTotal.numSkipped; + totalRow.numRemoved += entryTotal.numRemoved; + totalRow.savedBytes += entryTotal.savedBytes; + } + + llvm::SmallVector totalFields = {{"Number of candidates", std::to_string(totalRow.numCandidates)}, + {"Skipped allocations", std::to_string(totalRow.numSkipped)}, + {"Removed allocations", std::to_string(totalRow.numRemoved)}, + {"Saved memory", formatMemory(totalRow.savedBytes)}}; + printReportTotalsBlock(os, totalFields); + if (!entries.empty()) + os << "\n"; + + llvm::SmallVector sortedEntries(entries.begin(), entries.end()); + sortReportEntriesByFirstCore(sortedEntries); + + for (size_t index = 0; index < sortedEntries.size();) { + size_t runEnd = index + 1; + while (runEnd < sortedEntries.size() && sortedEntries[runEnd].kind == sortedEntries[index].kind + && sortedEntries[runEnd].row == sortedEntries[index].row) { + ++runEnd; + } + + if (sortedEntries[index].kind == CoalescingReportEntry::Kind::Batch) { + os << "Batch "; + for (size_t batchIndex = index; batchIndex < runEnd; ++batchIndex) { + if (batchIndex != index) + os << ",\n "; + os << sortedEntries[batchIndex].id << " (cores "; + printCompressedIntegerEntries(os, ArrayRef(sortedEntries[batchIndex].coreIds)); + os << ")"; + } + } + else { + llvm::SmallVector coreIds; + for (size_t coreIndex = index; coreIndex < runEnd; ++coreIndex) + coreIds.push_back(sortedEntries[coreIndex].coreIds.front()); + os << "Core "; + printCompressedIntegerEntries(os, ArrayRef(coreIds)); + } + + os << ":\n"; + if 
(sortedEntries[index].kind == CoalescingReportEntry::Kind::Batch) { + llvm::SmallVector perCoreFields = { + {"Number of candidates", std::to_string(sortedEntries[index].row.numCandidates)}, + {"Skipped allocations", std::to_string(sortedEntries[index].row.numSkipped)}, + {"Removed allocations", std::to_string(sortedEntries[index].row.numRemoved)}, + {"Saved memory", formatMemory(sortedEntries[index].row.savedBytes)}}; + CoalescingReportRow totalRow = getTotalRow(sortedEntries[index]); + llvm::SmallVector totalFields = { + {"Number of candidates", std::to_string(totalRow.numCandidates)}, + {"Skipped allocations", std::to_string(totalRow.numSkipped)}, + {"Removed allocations", std::to_string(totalRow.numRemoved)}, + {"Saved memory", formatMemory(totalRow.savedBytes)}}; + printReportPerCoreAndTotalFields(os, perCoreFields, totalFields); + } + else { + printReportRow(os, sortedEntries[index].row); + } + printReportEntrySeparator(os, runEnd < sortedEntries.size()); + index = runEnd; + } + + os.flush(); + file.close(); +} + +struct StaticMemoryCoalescingPass : PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(StaticMemoryCoalescingPass) + + StringRef getArgument() const override { return "pim-static-memory-coalescing"; } + StringRef getDescription() const override { return "Analyze static local PIM memory reuse opportunities"; } + + StaticMemoryCoalescingPass() = default; + StaticMemoryCoalescingPass(const StaticMemoryCoalescingPass& pass) {} + + void runOnOperation() override { + IRRewriter rewriter(&getContext()); + SmallVector reportEntries; + uint64_t nextBatchId = 0; + + getOperation().walk([&](Operation* op) { + if (!isa(op)) + return; + + auto analysis = pim::analyzeStaticMemoryCoalescingCandidates(op); + auto stats = pim::coalesceStaticMemory(op, rewriter); + CoalescingReportRow row { + analysis.candidates.size(), stats.skippedAllocations, stats.removedAllocs, stats.savedBytes}; + + if (auto coreOp = dyn_cast(op)) { + 
reportEntries.push_back({CoalescingReportEntry::Kind::Core, + static_cast(coreOp.getCoreId()), + {static_cast(coreOp.getCoreId())}, + row}); + return; + } + + auto coreIds = getBatchCoreIds(cast(op)); + CoalescingReportEntry entry; + entry.kind = CoalescingReportEntry::Kind::Batch; + entry.id = nextBatchId++; + llvm::append_range(entry.coreIds, coreIds); + entry.row = row; + reportEntries.push_back(std::move(entry)); + }); + + emitReport(reportEntries); + dumpModule(getOperation(), "pim2_coalesced"); + } +}; + +} // namespace + +std::unique_ptr createPimStaticMemoryCoalescingPass() { + return std::make_unique(); +} + +} // namespace onnx_mlir diff --git a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/MergeComputeNodesPass.cpp b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/MergeComputeNodesPass.cpp index eb6d712..9386805 100644 --- a/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/MergeComputeNodesPass.cpp +++ b/src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/MergeComputeNodesPass.cpp @@ -40,6 +40,7 @@ #include "RegularOpCompaction.hpp" #include "src/Accelerators/PIM/Common/IR/CompactAsmUtils.hpp" #include "src/Accelerators/PIM/Common/PimCommon.hpp" +#include "src/Accelerators/PIM/Common/Support/ReportUtils.hpp" #include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp" using namespace mlir; @@ -764,18 +765,13 @@ void emitMotifProfile(func::FuncOp funcOp) { } void generateReport(func::FuncOp funcOp, const std::string& name, size_t usedCpuCount = 0) { - std::string outputDir = getOutputDir(); - if (outputDir.empty()) + std::fstream file = openReportFile(name); + if (!file.is_open()) return; - - std::string reportsDir = outputDir + "/reports"; - createDirectory(reportsDir); - - std::fstream file(reportsDir + "/" + name + ".txt", std::ios::out); llvm::raw_os_ostream os(file); struct ReportRow { - uint64_t opId = 0; + uint64_t id = 0; uint64_t logicalComputeCount = 0; uint64_t weightCount = 0; uint64_t instructionCount = 0; @@ -786,6 +782,9 
@@ void generateReport(func::FuncOp funcOp, const std::string& name, size_t usedCpu uint64_t totalComputeOps = 0; uint64_t totalLogicalComputes = 0; uint64_t totalBatchComputeOps = 0; + uint64_t totalInstructionCount = 0; + uint64_t totalWeightCount = 0; + uint64_t nextBatchId = 0; std::vector collectedData; for (Operation& op : funcOp.getBody().front()) { @@ -793,8 +792,13 @@ void generateReport(func::FuncOp funcOp, const std::string& name, size_t usedCpu uint64_t numInst = 0; for (auto& _ : spatCompute.getRegion().front()) ++numInst; - collectedData.push_back({totalComputeOps++, 1, spatCompute.getWeights().size(), numInst, false, {}}); + SmallVector coreIds; + if (auto coreId = getComputeCoreId(spatCompute)) + coreIds.push_back(*coreId); + collectedData.push_back({totalComputeOps++, 1, spatCompute.getWeights().size(), numInst, false, coreIds}); totalLogicalComputes += 1; + totalInstructionCount += numInst; + totalWeightCount += spatCompute.getWeights().size(); continue; } if (auto batch = dyn_cast(&op)) { @@ -805,44 +809,27 @@ void generateReport(func::FuncOp funcOp, const std::string& name, size_t usedCpu SmallVector coreIds; if (auto coreIdsAttr = batch->getAttrOfType(onnx_mlir::kCoreIdsAttrName)) llvm::append_range(coreIds, coreIdsAttr.asArrayRef()); - collectedData.push_back({totalComputeOps++, logicalCount, batch.getWeights().size(), numInst, true, coreIds}); + collectedData.push_back({nextBatchId++, logicalCount, batch.getWeights().size(), numInst, true, coreIds}); + totalComputeOps += 1; totalLogicalComputes += logicalCount; totalBatchComputeOps += 1; + totalInstructionCount += numInst * logicalCount; + totalWeightCount += batch.getWeights().size(); } } - os << "Used cores: " << usedCpuCount << "\n"; - os << "Number of top-level compute ops: " << totalComputeOps << "\n"; - os << "Number of logical computes: " << totalLogicalComputes << "\n"; - os << "Number of top-level batch compute ops: " << totalBatchComputeOps << "\n"; - os << "\n"; + llvm::SmallVector 
totalFields = {{"Used cores", std::to_string(usedCpuCount)}, + {"Number of top-level compute ops", std::to_string(totalComputeOps)}, + {"Number of logical computes", std::to_string(totalLogicalComputes)}, + {"Number of top-level batch compute ops", + std::to_string(totalBatchComputeOps)}, + {"Number of instructions", std::to_string(totalInstructionCount)}, + {"Number of used crossbars", std::to_string(totalWeightCount)}}; + printReportTotalsBlock(os, totalFields); + if (!collectedData.empty()) + os << "\n"; - std::stable_sort(collectedData.begin(), collectedData.end(), [](const ReportRow& lft, const ReportRow& rgt) { - if (lft.isRebatched != rgt.isRebatched) - return lft.isRebatched > rgt.isRebatched; - - if (lft.instructionCount < rgt.instructionCount) - return false; - else if (rgt.instructionCount < lft.instructionCount) - return true; - - if (lft.weightCount < rgt.weightCount) - return false; - else if (rgt.weightCount < lft.weightCount) - return true; - - if (lft.logicalComputeCount < rgt.logicalComputeCount) - return false; - else if (rgt.logicalComputeCount < lft.logicalComputeCount) - return true; - - if (lft.opId < rgt.opId) - return true; - else if (rgt.opId < lft.opId) - return false; - - return true; - }); + sortReportEntriesByFirstCore(collectedData); for (uint64_t cI = 0; cI < totalComputeOps; ++cI) { uint64_t lastIndex = cI; @@ -863,7 +850,7 @@ void generateReport(func::FuncOp funcOp, const std::string& name, size_t usedCpu for (uint64_t index = cI; index <= lastIndex; ++index) { if (index != cI) os << ",\n "; - os << collectedData[index].opId << " (cores "; + os << collectedData[index].id << " (cores "; if (collectedData[index].coreIds.empty()) os << "unknown"; else @@ -876,14 +863,32 @@ void generateReport(func::FuncOp funcOp, const std::string& name, size_t usedCpu SmallVector opIds; opIds.reserve(lastIndex - cI + 1); for (uint64_t index = cI; index <= lastIndex; ++index) - opIds.push_back(collectedData[index].opId); + 
opIds.push_back(collectedData[index].id); printCompressedIntegerEntries(os, ArrayRef(opIds)); } os << ":\n"; - os << "\tNumber of logical computes: " << current.logicalComputeCount << "\n"; - os << "\tNumber of instructions: " << current.instructionCount << "\n"; - os << "\tNumber of used crossbars: " << current.weightCount << "\n"; + uint64_t perCoreLogicalComputeCount = current.isRebatched ? 1 : current.logicalComputeCount; + uint64_t perCoreInstructionCount = current.instructionCount; + uint64_t perCoreWeightCount = + current.logicalComputeCount == 0 ? 0 : current.weightCount / current.logicalComputeCount; + uint64_t totalEntryInstructionCount = current.instructionCount * current.logicalComputeCount; + + llvm::SmallVector perCoreFields = { + {"Number of logical computes", std::to_string(perCoreLogicalComputeCount)}, + {"Number of instructions", std::to_string(perCoreInstructionCount)}, + {"Number of used crossbars", std::to_string(perCoreWeightCount)}}; + if (current.isRebatched) { + llvm::SmallVector totalEntryFields = { + {"Number of logical computes", std::to_string(current.logicalComputeCount)}, + {"Number of instructions", std::to_string(totalEntryInstructionCount)}, + {"Number of used crossbars", std::to_string(current.weightCount)}}; + printReportPerCoreAndTotalFields(os, perCoreFields, totalEntryFields); + } + else { + printReportFlatFields(os, perCoreFields); + } + printReportEntrySeparator(os, lastIndex + 1 < totalComputeOps); cI = lastIndex; } diff --git a/src/PIM/Pass/PIMPasses.h b/src/PIM/Pass/PIMPasses.h index 4734d52..ceb9379 100644 --- a/src/PIM/Pass/PIMPasses.h +++ b/src/PIM/Pass/PIMPasses.h @@ -15,6 +15,8 @@ std::unique_ptr createSpatialToPimPass(); std::unique_ptr createPimBufferizationPass(); +std::unique_ptr createPimStaticMemoryCoalescingPass(); + std::unique_ptr createMergeComputeNodesPass(); std::unique_ptr createPimHostConstantFoldingPass(); diff --git a/src/PIM/Pass/PimCodegen/HostConstantFolding/HostConstantFoldingPass.cpp 
b/src/PIM/Pass/PimCodegen/HostConstantFolding/HostConstantFoldingPass.cpp index 469c09c..04038b0 100644 --- a/src/PIM/Pass/PimCodegen/HostConstantFolding/HostConstantFoldingPass.cpp +++ b/src/PIM/Pass/PimCodegen/HostConstantFolding/HostConstantFoldingPass.cpp @@ -39,7 +39,7 @@ struct HostConstantFoldingPass : PassWrapper patterns; diff --git a/src/PIM/Pass/PimCodegen/MaterializeHostConstantsPass.cpp b/src/PIM/Pass/PimCodegen/MaterializeHostConstantsPass.cpp index abe483b..3c43ff1 100644 --- a/src/PIM/Pass/PimCodegen/MaterializeHostConstantsPass.cpp +++ b/src/PIM/Pass/PimCodegen/MaterializeHostConstantsPass.cpp @@ -164,7 +164,7 @@ struct MaterializeHostConstantsPass : PassWrapper(op); +} + struct VerificationPass : PassWrapper> { MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(VerificationPass) @@ -214,6 +243,11 @@ private: return walkPimCoreBlock( coreOp.getBody().front(), StaticValueKnowledge {}, [](Operation& op, const StaticValueKnowledge& knowledge) { bool hasFailure = false; + if (!isSupportedCoreInstructionOp(&op)) { + op.emitOpError("unsupported executable op reached PIM codegen verification"); + hasFailure = true; + } + for (auto [operandIndex, operand] : llvm::enumerate(op.getOperands())) { if (!isa(operand.getType())) continue; diff --git a/src/PIM/PimAccelerator.cpp b/src/PIM/PimAccelerator.cpp index 4b204cf..74642cb 100644 --- a/src/PIM/PimAccelerator.cpp +++ b/src/PIM/PimAccelerator.cpp @@ -75,6 +75,7 @@ void PimAccelerator::registerPasses(int optLevel) const { registerPass(createSpatialToGraphvizPass); registerPass(createSpatialToPimPass); registerPass(createPimBufferizationPass); + registerPass(createPimStaticMemoryCoalescingPass); registerPass(createMergeComputeNodesPass); registerPass(createPimHostConstantFoldingPass); registerPass(createPimMaterializeHostConstantsPass);