Better reports refactor: more code reuse and pattern-usage fixes
This commit is contained in:
@@ -81,7 +81,11 @@ ONNX-MLIR ──► Spatial ──► Pim (tensor) ──► Pim (bufferized)
|
||||
standard MLIR `BufferizableOpInterface` machinery
|
||||
(`OpBufferizationInterfaces.*`, `PimBufferization.td`).
|
||||
|
||||
5. **PIM code generation** (`src/PIM/Pass/PimCodegen`):
|
||||
5. **Static memory coalescing** (`src/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing`).
|
||||
Conservatively reuses same-typed local memref allocations inside PIM cores
|
||||
after bufferization and before code generation.
|
||||
|
||||
6. **PIM code generation** (`src/PIM/Pass/PimCodegen`):
|
||||
- `HostConstantFolding` — folds host-side constants.
|
||||
- `MaterializeHostConstantsPass` — materializes the remaining host
|
||||
constants for emission.
|
||||
|
||||
+2
-2
@@ -15,9 +15,9 @@ use crate::{
|
||||
};
|
||||
|
||||
|
||||
pub fn json_to_executor<'a>(
|
||||
pub fn json_to_executor<'a, 'b>(
|
||||
config: Value,
|
||||
mut cores: impl Iterator<Item = &'a Value>,
|
||||
mut cores: impl Iterator<Item = &'b Value>,
|
||||
crossbars : Vec<Vec<&'a Crossbar>>
|
||||
) -> Executable<'a> {
|
||||
let cell_precision = config.get("cell_precision").unwrap().as_i64().unwrap() as i32;
|
||||
|
||||
@@ -68,5 +68,6 @@ add_pim_library(OMPIMAccel
|
||||
OMSpatialToPim
|
||||
OMPimCommon
|
||||
OMPimBufferization
|
||||
OMPimStaticMemoryCoalescing
|
||||
MLIRTensorInferTypeOpInterfaceImpl
|
||||
)
|
||||
|
||||
@@ -8,6 +8,7 @@ add_pim_library(OMPimCommon
|
||||
Support/DebugDump.cpp
|
||||
Support/Diagnostics.cpp
|
||||
Support/FileSystemUtils.cpp
|
||||
Support/ReportUtils.cpp
|
||||
|
||||
EXCLUDE_FROM_OM_LIBS
|
||||
|
||||
|
||||
@@ -0,0 +1,63 @@
|
||||
#include "src/Accelerators/PIM/Common/Support/ReportUtils.hpp"
|
||||
|
||||
#include "llvm/Support/Format.h"
|
||||
|
||||
#include "src/Accelerators/PIM/Common/Support/FileSystemUtils.hpp"
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
std::fstream openReportFile(const std::string& name) {
|
||||
std::string outputDir = getOutputDir();
|
||||
if (outputDir.empty())
|
||||
return {};
|
||||
|
||||
std::string reportsDir = outputDir + "/reports";
|
||||
createDirectory(reportsDir);
|
||||
return std::fstream(reportsDir + "/" + name + ".txt", std::ios::out);
|
||||
}
|
||||
|
||||
std::string formatReportMemory(uint64_t bytes) {
  // Renders a byte count as a human-readable string with two decimal places,
  // e.g. "1.50 MB". Uses 1024-based units up to exabytes.
  const char* units[] = {"B", "KB", "MB", "GB", "TB", "PB", "EB"};
  // Derive the bound from the array instead of a hard-coded 6 so the loop
  // cannot drift out of sync if the unit list ever changes.
  const int lastUnit = static_cast<int>(sizeof(units) / sizeof(units[0])) - 1;
  int unitIndex = 0;
  double size = static_cast<double>(bytes);
  while (size >= 1024 && unitIndex < lastUnit) {
    size /= 1024;
    ++unitIndex;
  }

  std::string out;
  llvm::raw_string_ostream rss(out);
  rss << llvm::format("%.2f ", size) << units[unitIndex];
  return rss.str();
}
|
||||
|
||||
void printReportFlatFields(llvm::raw_ostream& os, llvm::ArrayRef<ReportField> fields) {
  // Emits one "\t<label>: <value>" line per field, no surrounding header.
  for (const ReportField& entry : fields) {
    os << "\t" << entry.label << ": " << entry.value << "\n";
  }
}
|
||||
|
||||
void printReportFieldBlock(llvm::raw_ostream& os, llvm::StringRef title, llvm::ArrayRef<ReportField> fields) {
  // Emits a titled block: "\t<title>:" followed by each field indented one
  // extra space relative to the title line.
  os << "\t" << title << ":\n";
  for (const ReportField& entry : fields) {
    os << "\t " << entry.label << ": " << entry.value << "\n";
  }
}
|
||||
|
||||
void printReportTotalsBlock(llvm::raw_ostream& os, llvm::ArrayRef<ReportField> fields) {
  // "Totals:" header followed by the standard flat field listing. Delegates
  // to printReportFlatFields instead of duplicating its loop, so the field
  // formatting lives in exactly one place.
  os << "Totals:\n";
  printReportFlatFields(os, fields);
}
|
||||
|
||||
void printReportPerCoreAndTotalFields(llvm::raw_ostream& os,
    llvm::ArrayRef<ReportField> perCoreFields,
    llvm::ArrayRef<ReportField> totalFields) {
  // Two titled sections in fixed order: the per-core view, then the aggregate.
  struct Section {
    llvm::StringRef title;
    llvm::ArrayRef<ReportField> fields;
  };
  const Section sections[] = {{"Per core", perCoreFields}, {"Total", totalFields}};
  for (const Section& section : sections)
    printReportFieldBlock(os, section.title, section.fields);
}
|
||||
|
||||
void printReportEntrySeparator(llvm::raw_ostream& os, bool hasNextEntry) {
  // Blank line between consecutive report entries; nothing after the last one.
  if (!hasNextEntry)
    return;
  os << "\n";
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
@@ -0,0 +1,48 @@
|
||||
#pragma once
|
||||
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/ADT/ArrayRef.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <fstream>
|
||||
#include <limits>
|
||||
#include <string>
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
std::fstream openReportFile(const std::string& name);
|
||||
std::string formatReportMemory(uint64_t bytes);
|
||||
|
||||
// One label/value pair in a human-readable report. Both members are already
// formatted text; the print helpers only add layout (tabs, titles, colons).
struct ReportField {
  std::string label;
  std::string value;
};
|
||||
|
||||
void printReportFlatFields(llvm::raw_ostream& os, llvm::ArrayRef<ReportField> fields);
|
||||
void printReportFieldBlock(llvm::raw_ostream& os, llvm::StringRef title, llvm::ArrayRef<ReportField> fields);
|
||||
void printReportTotalsBlock(llvm::raw_ostream& os, llvm::ArrayRef<ReportField> fields);
|
||||
void printReportPerCoreAndTotalFields(llvm::raw_ostream& os,
|
||||
llvm::ArrayRef<ReportField> perCoreFields,
|
||||
llvm::ArrayRef<ReportField> totalFields);
|
||||
void printReportEntrySeparator(llvm::raw_ostream& os, bool hasNextEntry);
|
||||
|
||||
// Sort key for report entries: the first core id the entry reports on.
// Entries with no core ids get the int32 max sentinel so they order last.
// EntryTy must expose a `coreIds` container of int32_t.
template <typename EntryTy>
int32_t getFirstReportCoreId(const EntryTy& entry) {
  return entry.coreIds.empty() ? std::numeric_limits<int32_t>::max()
                               : entry.coreIds.front();
}
|
||||
|
||||
template <typename EntryRange>
|
||||
void sortReportEntriesByFirstCore(EntryRange& entries) {
|
||||
llvm::stable_sort(entries, [](const auto& lhs, const auto& rhs) {
|
||||
int32_t lhsFirstCore = getFirstReportCoreId(lhs);
|
||||
int32_t rhsFirstCore = getFirstReportCoreId(rhs);
|
||||
if (lhsFirstCore != rhsFirstCore)
|
||||
return lhsFirstCore < rhsFirstCore;
|
||||
return lhs.id < rhs.id;
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
@@ -29,6 +29,7 @@ add_pim_library(OMPimCompilerUtils
|
||||
OMPimCompilerOptions
|
||||
OMPimCommon
|
||||
OMPimBufferization
|
||||
OMPimStaticMemoryCoalescing
|
||||
OMPimPasses
|
||||
OMONNXToSpatial
|
||||
OMSpatialToPim
|
||||
|
||||
@@ -24,6 +24,78 @@ static SmallVector<int32_t> getLaneChunkCoreIds(ArrayRef<int32_t> coreIds, size_
|
||||
return laneCoreIds;
|
||||
}
|
||||
|
||||
// Rewrites the batch communication ops inside an already-extracted scalar core
// so each refers only to this lane: scalar send/receive ops pick the lane's
// entry out of the batched core-id array, tensor variants take the lane's
// chunk of core ids (via getLaneChunkCoreIds), and batched host->device
// copies become plain scalar copies with the same offsets and size.
// NOTE(review): assumes each batch op's core-id array has at least `lane + 1`
// entries — confirm against how batches are constructed.
static void scalarizeBatchOpsInCore(pim::PimCoreOp scalarCore, size_t laneCount, unsigned lane) {
  IRRewriter rewriter(scalarCore.getContext());
  // Collect first, then rewrite: erasing/replacing ops during the walk would
  // invalidate it.
  SmallVector<Operation*> batchOps;
  scalarCore.walk([&](Operation* op) {
    if (isa<pim::PimSendBatchOp,
            pim::PimSendTensorBatchOp,
            pim::PimReceiveBatchOp,
            pim::PimReceiveTensorBatchOp,
            pim::PimMemCopyHostToDevBatchOp>(op)) {
      batchOps.push_back(op);
    }
  });

  for (Operation* op : batchOps) {
    // New scalar op is created in place of the batch op it replaces.
    rewriter.setInsertionPoint(op);

    if (auto sendBatchOp = dyn_cast<pim::PimSendBatchOp>(op)) {
      // Same payload and size; target narrowed to this lane's core.
      pim::PimSendOp::create(rewriter,
          sendBatchOp.getLoc(),
          sendBatchOp.getInput(),
          sendBatchOp.getSizeAttr(),
          rewriter.getI32IntegerAttr(sendBatchOp.getTargetCoreIds()[lane]));
      rewriter.eraseOp(op);
      continue;
    }

    if (auto sendTensorBatchOp = dyn_cast<pim::PimSendTensorBatchOp>(op)) {
      // Tensor sends address a chunk of cores per lane, not a single core.
      pim::PimSendTensorOp::create(
          rewriter,
          sendTensorBatchOp.getLoc(),
          sendTensorBatchOp.getInput(),
          rewriter.getDenseI32ArrayAttr(getLaneChunkCoreIds(sendTensorBatchOp.getTargetCoreIds(), laneCount, lane)));
      rewriter.eraseOp(op);
      continue;
    }

    if (auto receiveBatchOp = dyn_cast<pim::PimReceiveBatchOp>(op)) {
      // Receives produce a result, so replace (not erase) to rewire uses.
      auto scalarReceive =
          pim::PimReceiveOp::create(rewriter,
              receiveBatchOp.getLoc(),
              receiveBatchOp.getOutput().getType(),
              receiveBatchOp.getOutputBuffer(),
              receiveBatchOp.getSizeAttr(),
              rewriter.getI32IntegerAttr(receiveBatchOp.getSourceCoreIds()[lane]));
      rewriter.replaceOp(op, scalarReceive->getResults());
      continue;
    }

    if (auto receiveTensorBatchOp = dyn_cast<pim::PimReceiveTensorBatchOp>(op)) {
      auto scalarReceive = pim::PimReceiveTensorOp::create(
          rewriter,
          receiveTensorBatchOp.getLoc(),
          receiveTensorBatchOp.getOutput().getType(),
          receiveTensorBatchOp.getOutputBuffer(),
          rewriter.getDenseI32ArrayAttr(getLaneChunkCoreIds(receiveTensorBatchOp.getSourceCoreIds(), laneCount, lane)));
      rewriter.replaceOp(op, scalarReceive->getResults());
      continue;
    }

    // Only the five batch op kinds were collected above, so the remaining
    // case must be the host->device copy; cast asserts that invariant.
    auto memcpBatchOp = cast<pim::PimMemCopyHostToDevBatchOp>(op);
    auto scalarCopy = pim::PimMemCopyHostToDevOp::create(rewriter,
        memcpBatchOp.getLoc(),
        memcpBatchOp.getOutput().getType(),
        memcpBatchOp.getDeviceTarget(),
        memcpBatchOp.getHostSource(),
        memcpBatchOp.getDeviceTargetOffsetAttr(),
        memcpBatchOp.getHostSourceOffsetAttr(),
        memcpBatchOp.getSizeAttr());
    rewriter.replaceOp(op, scalarCopy->getResults());
  }
}
|
||||
|
||||
} // namespace
|
||||
|
||||
LogicalResult withScalarCoreFromBatchLane(pim::PimCoreBatchOp coreBatchOp,
|
||||
@@ -50,69 +122,6 @@ LogicalResult withScalarCoreFromBatchLane(pim::PimCoreBatchOp coreBatchOp,
|
||||
|
||||
builder.setInsertionPointToEnd(block);
|
||||
for (Operation& op : coreBatchOp.getBody().front()) {
|
||||
if (isa<pim::PimHaltOp>(op)) {
|
||||
pim::PimHaltOp::create(builder, op.getLoc());
|
||||
continue;
|
||||
}
|
||||
|
||||
if (auto sendBatchOp = dyn_cast<pim::PimSendBatchOp>(op)) {
|
||||
pim::PimSendOp::create(builder,
|
||||
sendBatchOp.getLoc(),
|
||||
mapper.lookup(sendBatchOp.getInput()),
|
||||
sendBatchOp.getSizeAttr(),
|
||||
builder.getI32IntegerAttr(sendBatchOp.getTargetCoreIds()[lane]));
|
||||
continue;
|
||||
}
|
||||
|
||||
if (auto sendTensorBatchOp = dyn_cast<pim::PimSendTensorBatchOp>(op)) {
|
||||
pim::PimSendTensorOp::create(
|
||||
builder,
|
||||
sendTensorBatchOp.getLoc(),
|
||||
mapper.lookup(sendTensorBatchOp.getInput()),
|
||||
builder.getDenseI32ArrayAttr(getLaneChunkCoreIds(sendTensorBatchOp.getTargetCoreIds(), laneCount, lane)));
|
||||
continue;
|
||||
}
|
||||
|
||||
if (auto receiveBatchOp = dyn_cast<pim::PimReceiveBatchOp>(op)) {
|
||||
auto scalarReceive =
|
||||
pim::PimReceiveOp::create(builder,
|
||||
receiveBatchOp.getLoc(),
|
||||
receiveBatchOp.getOutput().getType(),
|
||||
mapper.lookup(receiveBatchOp.getOutputBuffer()),
|
||||
receiveBatchOp.getSizeAttr(),
|
||||
builder.getI32IntegerAttr(receiveBatchOp.getSourceCoreIds()[lane]));
|
||||
mapper.map(receiveBatchOp.getOutput(), scalarReceive.getOutput());
|
||||
continue;
|
||||
}
|
||||
|
||||
if (auto receiveTensorBatchOp = dyn_cast<pim::PimReceiveTensorBatchOp>(op)) {
|
||||
auto scalarReceive = pim::PimReceiveTensorOp::create(
|
||||
builder,
|
||||
receiveTensorBatchOp.getLoc(),
|
||||
receiveTensorBatchOp.getOutput().getType(),
|
||||
mapper.lookup(receiveTensorBatchOp.getOutputBuffer()),
|
||||
builder.getDenseI32ArrayAttr(getLaneChunkCoreIds(receiveTensorBatchOp.getSourceCoreIds(), laneCount, lane)));
|
||||
mapper.map(receiveTensorBatchOp.getOutput(), scalarReceive.getOutput());
|
||||
continue;
|
||||
}
|
||||
|
||||
if (auto memcpBatchOp = dyn_cast<pim::PimMemCopyHostToDevBatchOp>(op)) {
|
||||
Value hostSource = mapper.lookupOrNull(memcpBatchOp.getHostSource());
|
||||
if (!hostSource)
|
||||
hostSource = memcpBatchOp.getHostSource();
|
||||
|
||||
auto scalarCopy = pim::PimMemCopyHostToDevOp::create(builder,
|
||||
memcpBatchOp.getLoc(),
|
||||
memcpBatchOp.getOutput().getType(),
|
||||
mapper.lookup(memcpBatchOp.getDeviceTarget()),
|
||||
hostSource,
|
||||
memcpBatchOp.getDeviceTargetOffsetAttr(),
|
||||
memcpBatchOp.getHostSourceOffsetAttr(),
|
||||
memcpBatchOp.getSizeAttr());
|
||||
mapper.map(memcpBatchOp.getOutput(), scalarCopy.getOutput());
|
||||
continue;
|
||||
}
|
||||
|
||||
Operation* cloned = builder.clone(op, mapper);
|
||||
for (auto [originalResult, clonedResult] : llvm::zip(op.getResults(), cloned->getResults()))
|
||||
mapper.map(originalResult, clonedResult);
|
||||
@@ -120,6 +129,7 @@ LogicalResult withScalarCoreFromBatchLane(pim::PimCoreBatchOp coreBatchOp,
|
||||
|
||||
if (block->empty() || !isa<pim::PimHaltOp>(block->back()))
|
||||
pim::PimHaltOp::create(builder, coreBatchOp.getLoc());
|
||||
scalarizeBatchOpsInCore(scalarCore, laneCount, lane);
|
||||
return callback(scalarCore);
|
||||
}
|
||||
|
||||
|
||||
@@ -26,6 +26,7 @@
|
||||
|
||||
#include "Common/IR/CompactAsmUtils.hpp"
|
||||
#include "Common/PimCommon.hpp"
|
||||
#include "Common/Support/ReportUtils.hpp"
|
||||
#include "Conversion/ONNXToSpatial/Common/Common.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimArtifactWriter.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimBatchEmission.hpp"
|
||||
@@ -65,6 +66,7 @@ void PimMemory::allocateMemoryForValue(mlir::Value value, MemEntry& memEntry) {
|
||||
if (size_t remainder = firstAvailableAddress % minAlignment)
|
||||
firstAvailableAddress += minAlignment - remainder;
|
||||
|
||||
ownedMemEntriesMap[value] = memEntry;
|
||||
globalMemEntriesMap[value] = memEntry;
|
||||
}
|
||||
|
||||
@@ -112,26 +114,28 @@ void PimMemory::allocateCore(Operation* op) {
|
||||
allocateGatheredMemory();
|
||||
}
|
||||
|
||||
std::string formatMemory(uint64_t bytes) {
|
||||
const char* units[] = {"B", "KB", "MB", "GB", "TB", "PB", "EB"};
|
||||
int i = 0;
|
||||
double size = static_cast<double>(bytes);
|
||||
while (size >= 1024 && i < 6) {
|
||||
size /= 1024;
|
||||
i++;
|
||||
}
|
||||
// Formats to 2 decimal places
|
||||
std::string out;
|
||||
llvm::raw_string_ostream rss(out);
|
||||
rss << llvm::format("%.2f ", size) << units[i];
|
||||
return rss.str();
|
||||
static void printHostMemoryReportRow(raw_ostream& os, const MemoryReportRow& row) {
|
||||
llvm::SmallVector<ReportField, 2> fields = {
|
||||
{"Number of globals", std::to_string(row.numGlobal)},
|
||||
{"Global memory", formatReportMemory(row.sizeGlobal)}};
|
||||
printReportFlatFields(os, fields);
|
||||
}
|
||||
|
||||
static void printMemoryReportRow(raw_ostream& os, const MemoryReportRow& row) {
|
||||
os << "\tNumber of allocas: " << row.numAlloca << "\n";
|
||||
os << "\tAllocated memory: " << formatMemory(row.sizeAlloca) << "\n";
|
||||
os << "\tNumber of globals: " << row.numGlobal << "\n";
|
||||
os << "\tGlobal memory: " << formatMemory(row.sizeGlobal) << "\n";
|
||||
static void printCoreMemoryReportRow(raw_ostream& os, const MemoryReportEntry& entry) {
|
||||
llvm::SmallVector<ReportField, 2> fields = {
|
||||
{"Number of allocas", std::to_string(entry.row.numAlloca)},
|
||||
{"Allocated memory", formatReportMemory(entry.row.sizeAlloca)}};
|
||||
printReportFlatFields(os, fields);
|
||||
}
|
||||
|
||||
static void printBatchMemoryReportRow(raw_ostream& os, const MemoryReportEntry& entry) {
|
||||
llvm::SmallVector<ReportField, 2> perCoreFields = {
|
||||
{"Number of allocas", std::to_string(entry.row.numAlloca)},
|
||||
{"Allocated memory", formatReportMemory(entry.row.sizeAlloca)}};
|
||||
llvm::SmallVector<ReportField, 2> totalFields = {
|
||||
{"Number of allocas", std::to_string(entry.totalAllocaCount)},
|
||||
{"Batch memory", formatReportMemory(entry.totalAllocaBytes)}};
|
||||
printReportPerCoreAndTotalFields(os, perCoreFields, totalFields);
|
||||
}
|
||||
|
||||
static MemoryReportRow addMemoryReportRows(const MemoryReportRow& lhs, const MemoryReportRow& rhs) {
|
||||
@@ -145,7 +149,7 @@ static MemoryReportRow addMemoryReportRows(const MemoryReportRow& lhs, const Mem
|
||||
|
||||
MemoryReportRow PimMemory::getReportRow() const {
|
||||
MemoryReportRow row;
|
||||
for (auto& [val, memEntry] : globalMemEntriesMap) {
|
||||
for (auto& [val, memEntry] : ownedMemEntriesMap) {
|
||||
if (auto op = val.getDefiningOp()) {
|
||||
if (isa<memref::AllocOp>(op)) {
|
||||
row.numAlloca++;
|
||||
@@ -162,6 +166,8 @@ MemoryReportRow PimMemory::getReportRow() const {
|
||||
}
|
||||
|
||||
void PimMemory::remove(mlir::Value val) {
|
||||
if (auto removeIter = ownedMemEntriesMap.find(val); removeIter != ownedMemEntriesMap.end())
|
||||
ownedMemEntriesMap.erase(removeIter);
|
||||
if (auto removeIter = globalMemEntriesMap.find(val); removeIter != globalMemEntriesMap.end())
|
||||
globalMemEntriesMap.erase(removeIter);
|
||||
}
|
||||
@@ -209,15 +215,26 @@ size_t PimAcceleratorMemory::getValueAddress(mlir::Value value, const StaticValu
|
||||
void PimAcceleratorMemory::reportHost() { hostReportRow = hostMem.getReportRow(); }
|
||||
|
||||
void PimAcceleratorMemory::recordCoreReport(size_t coreId, const MemoryReportRow& row) {
|
||||
reportEntries.push_back({MemoryReportEntry::Kind::Core, coreId, {static_cast<int32_t>(coreId)}, row});
|
||||
reportEntries.push_back({MemoryReportEntry::Kind::Core,
|
||||
coreId,
|
||||
{static_cast<int32_t>(coreId)},
|
||||
row,
|
||||
row.numAlloca,
|
||||
row.sizeAlloca});
|
||||
}
|
||||
|
||||
void PimAcceleratorMemory::recordBatchReport(uint64_t batchId, ArrayRef<int32_t> coreIds, const MemoryReportRow& row) {
|
||||
void PimAcceleratorMemory::recordBatchReport(uint64_t batchId,
|
||||
ArrayRef<int32_t> coreIds,
|
||||
const MemoryReportRow& perCoreRow,
|
||||
uint64_t totalAllocaCount,
|
||||
uint64_t totalAllocaBytes) {
|
||||
MemoryReportEntry entry;
|
||||
entry.kind = MemoryReportEntry::Kind::Batch;
|
||||
entry.id = batchId;
|
||||
llvm::append_range(entry.coreIds, coreIds);
|
||||
entry.row = row;
|
||||
entry.row = perCoreRow;
|
||||
entry.totalAllocaCount = totalAllocaCount;
|
||||
entry.totalAllocaBytes = totalAllocaBytes;
|
||||
reportEntries.push_back(std::move(entry));
|
||||
}
|
||||
|
||||
@@ -226,36 +243,32 @@ void PimAcceleratorMemory::flushReport() {
|
||||
return;
|
||||
|
||||
llvm::raw_os_ostream os(fileReport);
|
||||
uint64_t totalGlobalMemory = hostReportRow.has_value() ? hostReportRow->sizeGlobal : 0;
|
||||
uint64_t totalCoresMemory = 0;
|
||||
for (const MemoryReportEntry& entry : reportEntries)
|
||||
totalCoresMemory += entry.totalAllocaBytes;
|
||||
|
||||
llvm::SmallVector<ReportField, 2> totalFields = {
|
||||
{"Global memory", formatReportMemory(totalGlobalMemory)},
|
||||
{"Cores memory", formatReportMemory(totalCoresMemory)}};
|
||||
printReportTotalsBlock(os, totalFields);
|
||||
|
||||
if (hostReportRow.has_value()) {
|
||||
os << "Host:\n";
|
||||
printMemoryReportRow(os, *hostReportRow);
|
||||
os << "\nHost:\n";
|
||||
printHostMemoryReportRow(os, *hostReportRow);
|
||||
}
|
||||
|
||||
if (!reportEntries.empty()) {
|
||||
if (hostReportRow.has_value())
|
||||
os << "\n";
|
||||
|
||||
llvm::stable_sort(reportEntries, [](const MemoryReportEntry& lhs, const MemoryReportEntry& rhs) {
|
||||
if (lhs.kind != rhs.kind)
|
||||
return lhs.kind == MemoryReportEntry::Kind::Batch;
|
||||
|
||||
const MemoryReportRow& lhsRow = lhs.row;
|
||||
const MemoryReportRow& rhsRow = rhs.row;
|
||||
if (lhsRow.sizeAlloca != rhsRow.sizeAlloca)
|
||||
return lhsRow.sizeAlloca > rhsRow.sizeAlloca;
|
||||
if (lhsRow.numAlloca != rhsRow.numAlloca)
|
||||
return lhsRow.numAlloca > rhsRow.numAlloca;
|
||||
if (lhsRow.sizeGlobal != rhsRow.sizeGlobal)
|
||||
return lhsRow.sizeGlobal > rhsRow.sizeGlobal;
|
||||
if (lhsRow.numGlobal != rhsRow.numGlobal)
|
||||
return lhsRow.numGlobal > rhsRow.numGlobal;
|
||||
return lhs.id < rhs.id;
|
||||
});
|
||||
sortReportEntriesByFirstCore(reportEntries);
|
||||
|
||||
for (size_t index = 0; index < reportEntries.size();) {
|
||||
size_t runEnd = index + 1;
|
||||
while (runEnd < reportEntries.size() && reportEntries[runEnd].kind == reportEntries[index].kind
|
||||
&& reportEntries[runEnd].row == reportEntries[index].row) {
|
||||
&& reportEntries[runEnd].row == reportEntries[index].row
|
||||
&& reportEntries[runEnd].totalAllocaCount == reportEntries[index].totalAllocaCount
|
||||
&& reportEntries[runEnd].totalAllocaBytes == reportEntries[index].totalAllocaBytes) {
|
||||
++runEnd;
|
||||
}
|
||||
|
||||
@@ -277,9 +290,11 @@ void PimAcceleratorMemory::flushReport() {
|
||||
printCompressedIntegerEntries(os, ArrayRef<int32_t>(coreIds));
|
||||
}
|
||||
os << ":\n";
|
||||
printMemoryReportRow(os, reportEntries[index].row);
|
||||
if (runEnd < reportEntries.size())
|
||||
os << "\n";
|
||||
if (reportEntries[index].kind == MemoryReportEntry::Kind::Batch)
|
||||
printBatchMemoryReportRow(os, reportEntries[index]);
|
||||
else
|
||||
printCoreMemoryReportRow(os, reportEntries[index]);
|
||||
printReportEntrySeparator(os, runEnd < reportEntries.size());
|
||||
|
||||
index = runEnd;
|
||||
}
|
||||
@@ -876,7 +891,9 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std::
|
||||
}
|
||||
|
||||
for (Operation* op : coreLikeOps) {
|
||||
auto emitCore = [&](pim::PimCoreOp coreOp, bool temporaryCore) -> OnnxMlirCompilerErrorCodes {
|
||||
auto emitCore = [&](pim::PimCoreOp coreOp,
|
||||
bool temporaryCore,
|
||||
MemoryReportRow* reportRow = nullptr) -> OnnxMlirCompilerErrorCodes {
|
||||
size_t originalCoreId = static_cast<size_t>(coreOp.getCoreId());
|
||||
size_t coreId = emittedCoreIds.lookup(originalCoreId);
|
||||
maxCoreId = std::max(maxCoreId, coreId);
|
||||
@@ -892,13 +909,17 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std::
|
||||
|
||||
PimCodeGen coreCodeGen(memory, coreFileStream, emittedCoreIds);
|
||||
aliasMaterializedHostGlobals(moduleOp, funcOp, coreOp, memory);
|
||||
memory.getOrCreateDeviceMem(coreId).allocateCore(coreOp);
|
||||
auto& deviceMemory = memory.getOrCreateDeviceMem(coreId);
|
||||
deviceMemory.allocateCore(coreOp);
|
||||
|
||||
int64_t processedOperations = codeGenCoreOps(coreOp.getBody().front(), coreCodeGen);
|
||||
if (processedOperations < 0)
|
||||
return CompilerFailure;
|
||||
assert(processedOperations > 0);
|
||||
|
||||
if (reportRow)
|
||||
*reportRow = deviceMemory.getReportRow();
|
||||
|
||||
coreFileStream.seek(coreFileStream.tell() - 1);
|
||||
coreFileStream << ']';
|
||||
coreFileStream.close();
|
||||
@@ -936,11 +957,10 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std::
|
||||
};
|
||||
|
||||
if (auto coreOp = dyn_cast<pim::PimCoreOp>(op)) {
|
||||
if (auto err = emitCore(coreOp, false))
|
||||
MemoryReportRow coreRow;
|
||||
if (auto err = emitCore(coreOp, false, &coreRow))
|
||||
return err;
|
||||
memory.recordCoreReport(
|
||||
emittedCoreIds.lookup(static_cast<size_t>(coreOp.getCoreId())),
|
||||
memory.getOrCreateDeviceMem(emittedCoreIds.lookup(static_cast<size_t>(coreOp.getCoreId()))).getReportRow());
|
||||
memory.recordCoreReport(emittedCoreIds.lookup(static_cast<size_t>(coreOp.getCoreId())), coreRow);
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -949,20 +969,29 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std::
|
||||
SmallVector<int32_t> reportedCoreIds;
|
||||
reportedCoreIds.reserve(batchCoreIds.size());
|
||||
MemoryReportRow batchRow;
|
||||
std::optional<MemoryReportRow> batchPerCoreRow;
|
||||
for (unsigned lane = 0; lane < static_cast<unsigned>(coreBatchOp.getLaneCount()); ++lane) {
|
||||
OnnxMlirCompilerErrorCodes laneResult = CompilerSuccess;
|
||||
if (failed(withScalarCoreFromBatchLane(coreBatchOp, lane, [&](pim::PimCoreOp coreOp) {
|
||||
size_t originalCoreId = static_cast<size_t>(batchCoreIds[lane]);
|
||||
size_t coreId = emittedCoreIds.lookup(originalCoreId);
|
||||
reportedCoreIds.push_back(static_cast<int32_t>(coreId));
|
||||
laneResult = emitCore(coreOp, true);
|
||||
if (laneResult == CompilerSuccess)
|
||||
batchRow = addMemoryReportRows(batchRow, memory.getOrCreateDeviceMem(coreId).getReportRow());
|
||||
MemoryReportRow laneRow;
|
||||
laneResult = emitCore(coreOp, true, &laneRow);
|
||||
if (laneResult == CompilerSuccess) {
|
||||
if (!batchPerCoreRow.has_value())
|
||||
batchPerCoreRow = laneRow;
|
||||
batchRow = addMemoryReportRows(batchRow, laneRow);
|
||||
}
|
||||
return laneResult == CompilerSuccess ? success() : failure();
|
||||
})))
|
||||
return laneResult == CompilerSuccess ? CompilerFailure : laneResult;
|
||||
}
|
||||
memory.recordBatchReport(nextBatchReportId++, reportedCoreIds, batchRow);
|
||||
memory.recordBatchReport(nextBatchReportId++,
|
||||
reportedCoreIds,
|
||||
batchPerCoreRow.value_or(MemoryReportRow {}),
|
||||
batchRow.numAlloca,
|
||||
batchRow.sizeAlloca);
|
||||
}
|
||||
|
||||
memory.flushReport();
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
|
||||
#include "onnx-mlir/Compiler/OMCompilerTypes.h"
|
||||
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Common/Support/ReportUtils.hpp"
|
||||
#include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp"
|
||||
|
||||
namespace onnx_mlir {
|
||||
@@ -43,11 +44,14 @@ struct MemoryReportEntry {
|
||||
uint64_t id = 0;
|
||||
llvm::SmallVector<int32_t, 8> coreIds;
|
||||
MemoryReportRow row;
|
||||
uint64_t totalAllocaCount = 0;
|
||||
uint64_t totalAllocaBytes = 0;
|
||||
};
|
||||
|
||||
class PimMemory {
|
||||
llvm::SmallVector<std::pair<MemEntry, mlir::Value>, 32> memEntries;
|
||||
llvm::SmallDenseMap<mlir::Value, MemEntry, 32>& globalMemEntriesMap;
|
||||
llvm::SmallDenseMap<mlir::Value, MemEntry, 32> ownedMemEntriesMap;
|
||||
|
||||
size_t minAlignment = 4;
|
||||
size_t firstAvailableAddress = 0;
|
||||
@@ -82,24 +86,18 @@ private:
|
||||
|
||||
public:
|
||||
PimAcceleratorMemory()
|
||||
: hostMem(memEntriesMap) {
|
||||
|
||||
std::string outputDir = getOutputDir();
|
||||
if (outputDir.empty())
|
||||
return;
|
||||
|
||||
std::string dialectsDir = outputDir + "/reports/";
|
||||
createDirectory(dialectsDir);
|
||||
std::fstream file(dialectsDir + "/memory_report.txt", std::ios::out);
|
||||
fileReport = std::move(file);
|
||||
}
|
||||
: hostMem(memEntriesMap), fileReport(openReportFile("memory_report")) {}
|
||||
|
||||
PimMemory& getOrCreateDeviceMem(size_t id);
|
||||
|
||||
size_t getValueAddress(mlir::Value value, const StaticValueKnowledge& knowledge = {}) const;
|
||||
void reportHost();
|
||||
void recordCoreReport(size_t coreId, const MemoryReportRow& row);
|
||||
void recordBatchReport(uint64_t batchId, llvm::ArrayRef<int32_t> coreIds, const MemoryReportRow& row);
|
||||
void recordBatchReport(uint64_t batchId,
|
||||
llvm::ArrayRef<int32_t> coreIds,
|
||||
const MemoryReportRow& perCoreRow,
|
||||
uint64_t totalAllocaCount,
|
||||
uint64_t totalAllocaBytes);
|
||||
void flushReport();
|
||||
void clean(mlir::Operation* op);
|
||||
};
|
||||
|
||||
@@ -41,6 +41,7 @@ void addPassesPim(OwningOpRef<ModuleOp>& module,
|
||||
|
||||
if (pimEmissionTarget >= EmitPimBufferized) {
|
||||
pm.addPass(createPimBufferizationPass());
|
||||
pm.addPass(createPimStaticMemoryCoalescingPass());
|
||||
// pm.addPass(createCountInstructionPass());
|
||||
pm.addPass(createMessagePass("Pim bufferized"));
|
||||
}
|
||||
|
||||
@@ -94,7 +94,7 @@ void ONNXToSpatialPass::runOnOperation() {
|
||||
RewritePatternSet prePatterns(ctx);
|
||||
populatePrePatterns(prePatterns, ctx);
|
||||
if (failed(applyPatternsGreedily(moduleOp, std::move(prePatterns))))
|
||||
llvm::dbgs() << "Failed to apply pre-patterns, continuing...\n";
|
||||
moduleOp.emitWarning("failed to apply ONNX-to-Spatial pre-patterns; continuing");
|
||||
|
||||
auto entryFunc = getPimEntryFunc(moduleOp);
|
||||
if (failed(entryFunc)) {
|
||||
@@ -148,7 +148,8 @@ void ONNXToSpatialPass::runOnOperation() {
|
||||
computeOpsCount++;
|
||||
|
||||
if (computeOpsCount > coresCount) {
|
||||
llvm::dbgs() << "Number of compute ops exceeds the core count\n";
|
||||
entryFunc->emitError() << "number of compute ops (" << computeOpsCount << ") exceeds the core count ("
|
||||
<< coresCount << ")";
|
||||
signalPassFailure();
|
||||
return;
|
||||
}
|
||||
@@ -157,7 +158,7 @@ void ONNXToSpatialPass::runOnOperation() {
|
||||
PassManager cleanupPM(ctx);
|
||||
cleanupPM.addPass(createCanonicalizerPass());
|
||||
if (failed(cleanupPM.run(moduleOp)))
|
||||
llvm::dbgs() << "Failed to run canonicalization cleanup, continuing...\n";
|
||||
moduleOp.emitWarning("failed to run ONNX-to-Spatial canonicalization cleanup; continuing");
|
||||
|
||||
annotateWeightsConstants(*entryFunc);
|
||||
|
||||
|
||||
@@ -70,85 +70,6 @@ private:
|
||||
|
||||
} // namespace
|
||||
|
||||
// Spatial core ids map 1:1 onto PIM core ids; only the integer width changes.
static int32_t translateSpatialCoreIdToPimCoreId(size_t spatialCoreId) {
  return static_cast<int32_t>(spatialCoreId);
}
|
||||
|
||||
// Lowers a spatial channel send to a scalar pim.send aimed at a single core.
static void lowerChannelSend(spatial::SpatChannelSendOp sendOp, IRRewriter& rewriter) {
  // Payload size in bytes is derived from the sent tensor's type.
  auto sizeAttr = getTensorSizeInBytesAttr(rewriter, sendOp.getInput());
  auto targetCoreIdAttr = rewriter.getI32IntegerAttr(translateSpatialCoreIdToPimCoreId(sendOp.getTargetCoreId()));

  // Send produces no results, so create-then-erase is sufficient.
  rewriter.setInsertionPoint(sendOp);
  PimSendOp::create(rewriter, sendOp.getLoc(), sendOp.getInput(), sizeAttr, targetCoreIdAttr);
  rewriter.eraseOp(sendOp);
}
|
||||
|
||||
// Lowers a spatial channel receive to a pim.receive that fills a freshly
// created empty tensor. A receive whose result is unused is erased outright.
static void lowerChannelReceive(spatial::SpatChannelReceiveOp receiveOp, IRRewriter& rewriter) {
  if (receiveOp->use_empty()) {
    rewriter.eraseOp(receiveOp);
    return;
  }

  auto outputType = cast<ShapedType>(receiveOp.getResult().getType());
  rewriter.setInsertionPoint(receiveOp);
  // Destination buffer matching the result's shape/element type.
  auto outputBuffer = createEmptyTensorFromShaped(rewriter, receiveOp.getLoc(), outputType);
  auto sizeAttr = getTensorSizeInBytesAttr(rewriter, receiveOp.getResult());
  auto sourceCoreIdAttr = rewriter.getI32IntegerAttr(translateSpatialCoreIdToPimCoreId(receiveOp.getSourceCoreId()));

  // Replace the spatial receive's result with the pim.receive output so all
  // downstream uses are rewired.
  Value received =
      PimReceiveOp::create(rewriter, receiveOp.getLoc(), outputBuffer.getType(), outputBuffer, sizeAttr, sourceCoreIdAttr)
          .getOutput();
  rewriter.replaceOp(receiveOp, received);
}
|
||||
|
||||
// Lowers a spatial multi-target tensor send to pim.send_tensor, translating
// each target core id into PIM core-id numbering.
static void lowerChannelSendTensor(spatial::SpatChannelSendTensorOp sendTensorOp, IRRewriter& rewriter) {
  SmallVector<int32_t> targetCoreIds;
  targetCoreIds.reserve(sendTensorOp.getTargetCoreIds().size());
  for (int32_t targetCoreId : sendTensorOp.getTargetCoreIds())
    targetCoreIds.push_back(translateSpatialCoreIdToPimCoreId(targetCoreId));

  // No results to rewire; create the pim op in place and drop the spatial op.
  rewriter.setInsertionPoint(sendTensorOp);
  PimSendTensorOp::create(
      rewriter, sendTensorOp.getLoc(), sendTensorOp.getInput(), rewriter.getDenseI32ArrayAttr(targetCoreIds));
  rewriter.eraseOp(sendTensorOp);
}
|
||||
|
||||
// Lowers a spatial multi-source tensor receive to pim.receive_tensor writing
// into a freshly created empty tensor, with source core ids translated to
// PIM numbering.
static void lowerChannelReceiveTensor(spatial::SpatChannelReceiveTensorOp receiveTensorOp, IRRewriter& rewriter) {
  SmallVector<int32_t> sourceCoreIds;
  sourceCoreIds.reserve(receiveTensorOp.getSourceCoreIds().size());
  for (int32_t sourceCoreId : receiveTensorOp.getSourceCoreIds())
    sourceCoreIds.push_back(translateSpatialCoreIdToPimCoreId(sourceCoreId));

  rewriter.setInsertionPoint(receiveTensorOp);
  // Destination buffer matching the declared output shape.
  auto outputType = cast<ShapedType>(receiveTensorOp.getOutput().getType());
  Value outputBuffer = createEmptyTensorFromShaped(rewriter, receiveTensorOp.getLoc(), outputType).getResult();
  // Replace the spatial op's result with the pim.receive_tensor output so all
  // downstream uses are rewired.
  Value received = PimReceiveTensorOp::create(rewriter,
      receiveTensorOp.getLoc(),
      receiveTensorOp.getOutput().getType(),
      outputBuffer,
      rewriter.getDenseI32ArrayAttr(sourceCoreIds))
          .getOutput();
  rewriter.replaceOp(receiveTensorOp, received);
}
|
||||
|
||||
// Lowers spatial.extract_rows to one tensor.extract_slice per result: the
// i-th output takes the next contiguous run of rows from the input, with the
// row offset accumulated from the preceding outputs' row counts.
// NOTE(review): offsets assume all outputs share the same dim-0 size (offset
// is rowIndex * this output's dim 0) — confirm against the op's verifier.
static void lowerExtractRows(spatial::SpatExtractRowsOp extractRowsOp, IRRewriter& rewriter) {
  rewriter.setInsertionPoint(extractRowsOp);
  auto inputType = cast<RankedTensorType>(extractRowsOp.getInput().getType());
  SmallVector<Value> replacements;
  replacements.reserve(extractRowsOp.getNumResults());
  for (auto [rowIndex, output] : llvm::enumerate(extractRowsOp.getOutputs())) {
    auto outputType = cast<RankedTensorType>(output.getType());
    // 2-D slice: start row = rowIndex * rows-per-output, full width, stride 1.
    SmallVector<OpFoldResult> offsets = {
        rewriter.getIndexAttr(static_cast<int64_t>(rowIndex) * outputType.getDimSize(0)), rewriter.getIndexAttr(0)};
    SmallVector<OpFoldResult> sizes = {rewriter.getIndexAttr(outputType.getDimSize(0)),
        rewriter.getIndexAttr(inputType.getDimSize(1))};
    SmallVector<OpFoldResult> strides = {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
    replacements.push_back(
        tensor::ExtractSliceOp::create(
            rewriter, extractRowsOp.getLoc(), outputType, extractRowsOp.getInput(), offsets, sizes, strides)
            .getResult());
  }
  // Rewire every result of the spatial op to its corresponding slice.
  rewriter.replaceOp(extractRowsOp, replacements);
}
|
||||
|
||||
static memref::GlobalOp getOrCreateZeroGlobal(IRRewriter& rewriter, Location loc, RankedTensorType tensorType) {
|
||||
auto moduleOp = rewriter.getBlock()->getParentOp()->getParentOfType<ModuleOp>();
|
||||
auto memRefType = MemRefType::get(tensorType.getShape(), tensorType.getElementType());
|
||||
@@ -216,97 +137,6 @@ static Value padHVectorInputToCrossbarSize(IRRewriter& rewriter, Location loc, V
|
||||
return PimMemCopyOp::create(rewriter, loc, paddedType, zeroed, vector, zeroAttr, zeroAttr, sizeAttr).getOutput();
|
||||
}
|
||||
|
||||
// Rewrites axis-0 spatial.concat ops whose inputs contain contiguous runs
// of tensor.extract_slice results, or consecutive results of one
// spatial.extract_rows op, into a pim.concat over fewer, packed inputs.
// Dead helper ops are erased afterwards. The scan is order-sensitive
// (runs are detected by adjacency in the operand list), so the structure
// is deliberately left as-is.
static void compactSpatialTensorGroups(func::FuncOp funcOp, IRRewriter& rewriter) {
  // Collect first: rewriting while walking would invalidate the traversal.
  SmallVector<spatial::SpatConcatOp> concatOps;
  funcOp.walk([&](spatial::SpatConcatOp concatOp) { concatOps.push_back(concatOp); });
  for (auto concatOp : concatOps) {
    // Only row-wise (axis 0) concats with at least one input are handled.
    if (concatOp.getAxis() != 0 || concatOp.getInputs().empty())
      continue;

    SmallVector<Value> packedInputs;
    bool changed = false;
    rewriter.setInsertionPoint(concatOp);

    for (unsigned index = 0; index < concatOp.getInputs().size();) {
      Value input = concatOp.getInputs()[index];

      // Case 1: a run of tensor.extract_slice inputs — try to fuse the
      // whole run into a single packed slice.
      if (input.getDefiningOp<tensor::ExtractSliceOp>()) {
        unsigned endIndex = index + 1;
        while (endIndex < concatOp.getInputs().size()
            && concatOp.getInputs()[endIndex].getDefiningOp<tensor::ExtractSliceOp>())
          ++endIndex;

        Value packedInput = createPackedExtractSliceTensor(
            concatOp.getInputs().slice(index, endIndex - index), rewriter, concatOp.getLoc());
        if (packedInput) {
          packedInputs.push_back(packedInput);
          changed = true;
          index = endIndex;
          continue;
        }
        // Packing failed: fall through and handle the inputs individually.
      }

      // Non-result inputs (e.g. block arguments) are forwarded unchanged.
      auto result = dyn_cast<OpResult>(input);
      if (!result) {
        packedInputs.push_back(input);
        ++index;
        continue;
      }

      // Case 2: a run of consecutive results (startIndex, startIndex+1, ...)
      // of the same producing op.
      Operation* owner = result.getOwner();
      unsigned startIndex = result.getResultNumber();
      unsigned endIndex = index + 1;
      while (endIndex < concatOp.getInputs().size()) {
        auto nextResult = dyn_cast<OpResult>(concatOp.getInputs()[endIndex]);
        if (!nextResult || nextResult.getOwner() != owner
            || nextResult.getResultNumber() != startIndex + (endIndex - index))
          break;
        ++endIndex;
      }

      unsigned count = endIndex - index;
      Value packedInput;
      // Only spatial.extract_rows producers can currently be packed.
      if (auto extractRowsOp = dyn_cast<spatial::SpatExtractRowsOp>(owner))
        packedInput = createPackedExtractRowsSlice(extractRowsOp, startIndex, count, rewriter, concatOp.getLoc());

      if (packedInput) {
        packedInputs.push_back(packedInput);
        changed = true;
      }
      else {
        // Could not pack this run: forward its inputs unchanged.
        for (unsigned oldIndex = index; oldIndex < endIndex; ++oldIndex)
          packedInputs.push_back(concatOp.getInputs()[oldIndex]);
      }

      index = endIndex;
    }

    // Only materialize a replacement when at least one run was packed.
    if (!changed)
      continue;

    auto newConcat = pim::PimConcatOp::create(
        rewriter,
        concatOp.getLoc(),
        concatOp.getOutput().getType(),
        concatOp.getAxisAttr(),
        ValueRange(packedInputs),
        createEmptyTensorFromShaped(rewriter, concatOp.getLoc(), cast<ShapedType>(concatOp.getOutput().getType()))
            .getResult());
    rewriter.replaceOp(concatOp, newConcat.getOutput());
  }
  // Erase helper ops orphaned by the rewrites. Reverse collection order
  // lets producers become dead once their later consumers are removed.
  auto eraseUnusedOps = [&](auto tag) {
    using OpTy = decltype(tag);
    SmallVector<OpTy> ops;
    funcOp.walk([&](OpTy op) { ops.push_back(op); });
    for (auto op : llvm::reverse(ops))
      if (op->use_empty())
        rewriter.eraseOp(op);
  };
  eraseUnusedOps(tensor::ConcatOp {});
  eraseUnusedOps(tensor::ExtractSliceOp {});
  eraseUnusedOps(spatial::SpatExtractRowsOp {});
}
|
||||
|
||||
void SpatialToPimPass::runOnOperation() {
|
||||
coreId = 1;
|
||||
ModuleOp moduleOp = getOperation();
|
||||
@@ -380,7 +210,12 @@ void SpatialToPimPass::runOnOperation() {
|
||||
}
|
||||
}
|
||||
|
||||
compactSpatialTensorGroups(funcOp, rewriter);
|
||||
{
|
||||
RewritePatternSet patterns(ctx);
|
||||
populateTensorPackingPatterns(patterns);
|
||||
walkAndApplyPatterns(funcOp, std::move(patterns));
|
||||
eraseUnusedTensorPackingOps(funcOp, rewriter);
|
||||
}
|
||||
|
||||
SmallVector<spatial::SpatChannelReceiveOp> receiveOps;
|
||||
for (auto op : funcOp.getOps<spatial::SpatChannelReceiveOp>())
|
||||
@@ -392,37 +227,8 @@ void SpatialToPimPass::runOnOperation() {
|
||||
markOpToRemove(receiveOp);
|
||||
continue;
|
||||
}
|
||||
if (receiveOp->use_empty()) {
|
||||
rewriter.eraseOp(receiveOp);
|
||||
continue;
|
||||
}
|
||||
lowerChannelReceive(receiveOp, rewriter);
|
||||
}
|
||||
|
||||
SmallVector<spatial::SpatChannelReceiveTensorOp> receiveTensorOps;
|
||||
for (auto op : funcOp.getOps<spatial::SpatChannelReceiveTensorOp>())
|
||||
receiveTensorOps.push_back(op);
|
||||
for (auto receiveTensorOp : receiveTensorOps)
|
||||
lowerChannelReceiveTensor(receiveTensorOp, rewriter);
|
||||
|
||||
SmallVector<spatial::SpatChannelSendOp> sendOps;
|
||||
for (auto op : funcOp.getOps<spatial::SpatChannelSendOp>())
|
||||
sendOps.push_back(op);
|
||||
for (auto sendOp : sendOps)
|
||||
lowerChannelSend(sendOp, rewriter);
|
||||
|
||||
SmallVector<spatial::SpatChannelSendTensorOp> sendTensorOps;
|
||||
for (auto op : funcOp.getOps<spatial::SpatChannelSendTensorOp>())
|
||||
sendTensorOps.push_back(op);
|
||||
for (auto sendTensorOp : sendTensorOps)
|
||||
lowerChannelSendTensor(sendTensorOp, rewriter);
|
||||
|
||||
SmallVector<spatial::SpatExtractRowsOp> extractRowsOps;
|
||||
for (auto op : funcOp.getOps<spatial::SpatExtractRowsOp>())
|
||||
extractRowsOps.push_back(op);
|
||||
for (auto extractRowsOp : extractRowsOps)
|
||||
lowerExtractRows(extractRowsOp, rewriter);
|
||||
|
||||
{
|
||||
RewritePatternSet coreBodyPatterns(ctx);
|
||||
populateWithGenerated(coreBodyPatterns);
|
||||
@@ -457,7 +263,12 @@ void SpatialToPimPass::runOnOperation() {
|
||||
return;
|
||||
}
|
||||
|
||||
compactSpatialTensorGroups(funcOp, rewriter);
|
||||
{
|
||||
RewritePatternSet patterns(ctx);
|
||||
populateTensorPackingPatterns(patterns);
|
||||
walkAndApplyPatterns(funcOp, std::move(patterns));
|
||||
eraseUnusedTensorPackingOps(funcOp, rewriter);
|
||||
}
|
||||
|
||||
{
|
||||
ConversionTarget communicationTarget(*ctx);
|
||||
|
||||
@@ -1,8 +1,96 @@
|
||||
#include "src/Accelerators/PIM/Conversion/SpatialToPim/TensorPackingPatterns.hpp"
|
||||
|
||||
#include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp"
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
namespace onnx_mlir {
|
||||
namespace {
|
||||
|
||||
// Pattern form of concat-input packing: collapses runs of
// tensor.extract_slice inputs, and runs of consecutive
// spatial.extract_rows results, into single packed inputs, then replaces
// the spatial.concat with a pim.concat. Only axis-0 concats match.
// The run detection is order-sensitive, so the scan is kept as-is.
struct PackSpatialConcatInputsPattern final : OpRewritePattern<spatial::SpatConcatOp> {
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(spatial::SpatConcatOp concatOp, PatternRewriter& rewriter) const override {
    // Only row-wise (axis 0) concats with at least one input are handled.
    if (concatOp.getAxis() != 0 || concatOp.getInputs().empty())
      return failure();

    SmallVector<Value> packedInputs;
    bool changed = false;

    for (unsigned index = 0; index < concatOp.getInputs().size();) {
      Value input = concatOp.getInputs()[index];

      // Case 1: a run of tensor.extract_slice inputs — try to fuse the
      // whole run into a single packed slice.
      if (input.getDefiningOp<tensor::ExtractSliceOp>()) {
        unsigned endIndex = index + 1;
        while (endIndex < concatOp.getInputs().size()
            && concatOp.getInputs()[endIndex].getDefiningOp<tensor::ExtractSliceOp>())
          ++endIndex;

        Value packedInput = createPackedExtractSliceTensor(
            concatOp.getInputs().slice(index, endIndex - index), rewriter, concatOp.getLoc());
        if (packedInput) {
          packedInputs.push_back(packedInput);
          changed = true;
          index = endIndex;
          continue;
        }
        // Packing failed: fall through and handle the inputs individually.
      }

      // Non-result inputs (e.g. block arguments) are forwarded unchanged.
      auto result = dyn_cast<OpResult>(input);
      if (!result) {
        packedInputs.push_back(input);
        ++index;
        continue;
      }

      // Case 2: a run of consecutive results (startIndex, startIndex+1, ...)
      // of the same producing op.
      Operation* owner = result.getOwner();
      unsigned startIndex = result.getResultNumber();
      unsigned endIndex = index + 1;
      while (endIndex < concatOp.getInputs().size()) {
        auto nextResult = dyn_cast<OpResult>(concatOp.getInputs()[endIndex]);
        if (!nextResult || nextResult.getOwner() != owner
            || nextResult.getResultNumber() != startIndex + (endIndex - index))
          break;
        ++endIndex;
      }

      unsigned count = endIndex - index;
      Value packedInput;
      // Only spatial.extract_rows producers can currently be packed.
      if (auto extractRowsOp = dyn_cast<spatial::SpatExtractRowsOp>(owner))
        packedInput = createPackedExtractRowsSlice(extractRowsOp, startIndex, count, rewriter, concatOp.getLoc());

      if (packedInput) {
        packedInputs.push_back(packedInput);
        changed = true;
      }
      else {
        // Could not pack this run: forward its inputs unchanged.
        for (unsigned oldIndex = index; oldIndex < endIndex; ++oldIndex)
          packedInputs.push_back(concatOp.getInputs()[oldIndex]);
      }

      index = endIndex;
    }

    // Match only when at least one run was packed, otherwise the pattern
    // would loop forever replacing the concat with an identical one.
    if (!changed)
      return failure();

    auto outputType = cast<ShapedType>(concatOp.getOutput().getType());
    auto newConcat = pim::PimConcatOp::create(rewriter,
        concatOp.getLoc(),
        concatOp.getOutput().getType(),
        concatOp.getAxisAttr(),
        ValueRange(packedInputs),
        // Destination-style output buffer for the new concat.
        tensor::EmptyOp::create(rewriter,
            concatOp.getLoc(),
            outputType.getShape(),
            outputType.getElementType())
            .getResult());
    rewriter.replaceOp(concatOp, newConcat.getOutput());
    return success();
  }
};
|
||||
|
||||
} // namespace
|
||||
|
||||
RankedTensorType getPackedTensorType(RankedTensorType elementType, int64_t count) {
|
||||
SmallVector<int64_t> packedShape(elementType.getShape().begin(), elementType.getShape().end());
|
||||
@@ -146,4 +234,23 @@ Value createPackedExtractSliceTensor(ValueRange values, OpBuilder& builder, Loca
|
||||
return tensor::ExtractSliceOp::create(builder, loc, packedType, firstSliceOp.getSource(), offsets, sizes, strides)
|
||||
.getResult();
|
||||
}
|
||||
|
||||
// Registers every tensor-packing rewrite pattern on `patterns`.
void populateTensorPackingPatterns(RewritePatternSet& patterns) {
  auto* context = patterns.getContext();
  patterns.add<PackSpatialConcatInputsPattern>(context);
}
|
||||
|
||||
// Deletes tensor-packing helper ops that became dead after pattern
// application. Each op kind is collected first and erased in reverse
// collection order, so a producer becomes dead once its later-collected
// consumers have been removed.
void eraseUnusedTensorPackingOps(func::FuncOp funcOp, IRRewriter& rewriter) {
  auto eraseDeadOpsOfType = [&](auto opTag) {
    using OpType = decltype(opTag);
    SmallVector<OpType> collected;
    funcOp.walk([&](OpType candidate) { collected.push_back(candidate); });
    for (auto candidate : llvm::reverse(collected)) {
      if (candidate->use_empty())
        rewriter.eraseOp(candidate);
    }
  };
  eraseDeadOpsOfType(tensor::ConcatOp {});
  eraseDeadOpsOfType(tensor::ExtractSliceOp {});
  eraseDeadOpsOfType(spatial::SpatExtractRowsOp {});
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/Dialect/Func/IR/FuncOps.h"
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
|
||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||
@@ -19,5 +20,7 @@ mlir::Value createPackedExtractRowsSlice(spatial::SpatExtractRowsOp extractRowsO
|
||||
mlir::OpBuilder& builder,
|
||||
mlir::Location loc);
|
||||
mlir::Value createPackedExtractSliceTensor(mlir::ValueRange values, mlir::OpBuilder& builder, mlir::Location loc);
|
||||
void populateTensorPackingPatterns(mlir::RewritePatternSet& patterns);
|
||||
void eraseUnusedTensorPackingOps(mlir::func::FuncOp funcOp, mlir::IRRewriter& rewriter);
|
||||
|
||||
} // namespace onnx_mlir
|
||||
|
||||
@@ -2,6 +2,7 @@ add_onnx_mlir_dialect(Pim pim)
|
||||
add_onnx_mlir_dialect_doc(pim Pim.td)
|
||||
|
||||
add_subdirectory(Transforms/Bufferization)
|
||||
add_subdirectory(Transforms/StaticMemoryCoalescing)
|
||||
|
||||
add_pim_library(PimOps
|
||||
PimOps.hpp
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
add_pim_library(OMPimStaticMemoryCoalescing
|
||||
StaticMemoryCoalescing.cpp
|
||||
StaticMemoryCoalescing.hpp
|
||||
StaticMemoryCoalescingPass.cpp
|
||||
|
||||
EXCLUDE_FROM_OM_LIBS
|
||||
|
||||
INCLUDE_DIRS PUBLIC
|
||||
${PIM_PUBLIC_INCLUDE_DIRS}
|
||||
|
||||
LINK_LIBS PUBLIC
|
||||
OMPimCommon
|
||||
PimOps
|
||||
)
|
||||
@@ -0,0 +1,172 @@
|
||||
#include "src/Accelerators/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/StaticMemoryCoalescing.hpp"
|
||||
|
||||
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
||||
#include "mlir/Interfaces/DestinationStyleOpInterface.h"
|
||||
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include "llvm/ADT/SmallPtrSet.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
|
||||
#include <limits>
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
namespace onnx_mlir {
|
||||
namespace pim {
|
||||
|
||||
namespace {
|
||||
|
||||
// True when `op` is one of the view-like memref ops this analysis knows
// how to follow; results of such ops alias the operand's buffer.
static bool isSupportedAliasOp(Operation* op) {
  return isa<memref::SubViewOp,
      memref::CastOp,
      memref::CollapseShapeOp,
      memref::ExpandShapeOp>(op);
}
|
||||
|
||||
// A coalescing candidate type must exist, be statically shaped, use the
// identity layout, and have a known element bit width.
static bool isCandidateAllocType(MemRefType type) {
  if (!type)
    return false;
  if (!type.hasStaticShape())
    return false;
  if (!type.getLayout().isIdentity())
    return false;
  return type.getElementTypeBitWidth() > 0;
}
|
||||
|
||||
// Returns the storage footprint of `type` in whole bytes.
// Performs the multiply in unsigned 64-bit space and rounds UP to whole
// bytes: the previous signed multiply-then-divide truncated, so sub-byte
// element types (e.g. i1 memrefs with < 8 elements) reported 0 bytes.
static uint64_t getTypeSizeBytes(MemRefType type) {
  uint64_t totalBits =
      static_cast<uint64_t>(type.getNumElements()) * static_cast<uint64_t>(type.getElementTypeBitWidth());
  return (totalBits + 7) / 8;
}
|
||||
|
||||
// Computes the position (per `opOrder`) of the last use of `allocOp`'s
// buffer within `body`, following view-like alias ops and
// destination-style result ties. Returns failure conservatively when any
// (transitive) user lives outside `body` or is not present in `opOrder`,
// meaning the allocation's lifetime cannot be bounded locally.
static FailureOr<uint64_t> getLastUseInstruction(memref::AllocOp allocOp,
    Block& body,
    const DenseMap<Operation*, uint64_t>& opOrder) {
  uint64_t endInstruction = opOrder.lookup(allocOp);
  // Worklist of SSA values known to alias the allocation's buffer.
  SmallPtrSet<Operation*, 16> visited;
  SmallVector<Value> pendingValues;
  pendingValues.push_back(allocOp.getResult());

  while (!pendingValues.empty()) {
    Value value = pendingValues.pop_back_val();
    for (Operation* user : value.getUsers()) {
      // A user in another block means the lifetime escapes; give up.
      if (user->getBlock() != &body)
        return failure();
      if (!visited.insert(user).second)
        continue;

      // View-like ops forward the buffer through all of their results.
      if (isSupportedAliasOp(user)) {
        for (Value result : user->getResults())
          pendingValues.push_back(result);
      }

      // Destination-style ops: a result tied to this operand aliases the
      // same buffer, so its users extend the lifetime too.
      if (auto dpsOp = dyn_cast<DestinationStyleOpInterface>(user)) {
        for (OpResult result : user->getResults()) {
          OpOperand* tiedOperand = dpsOp.getTiedOpOperand(result);
          if (!tiedOperand || tiedOperand->get() != value)
            continue;
          pendingValues.push_back(result);
        }
      }

      // Users outside the indexed block (e.g. nested in a region) have no
      // order entry; the interval would be unsound, so fail.
      auto order = opOrder.find(user);
      if (order == opOrder.end())
        return failure();
      endInstruction = std::max(endInstruction, order->second);
    }
  }

  return endInstruction;
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Scans the single-block body of `coreLikeOp` for memref.alloc ops whose
// buffers may be coalesced: statically shaped, identity layout, and with
// a lifetime fully contained in the block. Allocations failing any check
// are counted in `skippedAllocations` instead of being candidates.
StaticMemoryCoalescingAnalysis analyzeStaticMemoryCoalescingCandidates(Operation* coreLikeOp) {
  StaticMemoryCoalescingAnalysis analysis;
  // Only single-region, non-empty core-like ops are analyzed.
  if (!coreLikeOp || coreLikeOp->getNumRegions() != 1 || coreLikeOp->getRegion(0).empty())
    return analysis;

  Block& body = coreLikeOp->getRegion(0).front();
  // Assign each top-level op a monotonically increasing position, used as
  // a lightweight linear timeline for liveness intervals.
  DenseMap<Operation*, uint64_t> opOrder;
  uint64_t nextInstruction = 0;
  for (Operation& op : body)
    opOrder.try_emplace(&op, nextInstruction++);

  for (Operation& op : body) {
    auto allocOp = dyn_cast<memref::AllocOp>(&op);
    if (!allocOp)
      continue;

    auto allocType = dyn_cast<MemRefType>(allocOp.getType());
    if (!isCandidateAllocType(allocType)) {
      ++analysis.skippedAllocations;
      continue;
    }

    // Reject allocations whose uses escape the block or are unordered.
    auto endInstruction = getLastUseInstruction(allocOp, body, opOrder);
    if (failed(endInstruction)) {
      ++analysis.skippedAllocations;
      continue;
    }

    // Candidate interval: [alloc position, last transitive use].
    analysis.candidates.push_back(
        StaticAllocationCandidate {allocOp, opOrder.lookup(allocOp), *endInstruction, getTypeSizeBytes(allocType)});
  }

  return analysis;
}
|
||||
|
||||
// Greedy linear-scan memory coalescing: candidates are visited in start
// order; an allocation whose lifetime begins strictly after a same-typed
// earlier allocation's lifetime ended reuses that earlier buffer, and the
// redundant alloc is erased. Returns statistics about what was removed.
StaticMemoryCoalescingStats coalesceStaticMemory(Operation* coreLikeOp, RewriterBase& rewriter) {
  StaticMemoryCoalescingStats stats;
  auto analysis = analyzeStaticMemoryCoalescingCandidates(coreLikeOp);
  stats.skippedAllocations = analysis.skippedAllocations;

  // Linear-scan order: by start position, ties broken by end position.
  llvm::sort(analysis.candidates, [](const StaticAllocationCandidate& lhs, const StaticAllocationCandidate& rhs) {
    if (lhs.startInstruction != rhs.startInstruction)
      return lhs.startInstruction < rhs.startInstruction;
    return lhs.endInstruction < rhs.endInstruction;
  });

  // A live buffer and the last instruction position at which it is used.
  struct ActiveStorage {
    memref::AllocOp root;
    uint64_t endInstruction = 0;
  };

  SmallVector<ActiveStorage> active;
  SmallVector<memref::AllocOp> freeList;

  for (StaticAllocationCandidate& candidate : analysis.candidates) {
    // Retire buffers whose lifetime ended before this candidate starts;
    // they become available for reuse.
    for (auto it = active.begin(); it != active.end();) {
      if (it->endInstruction < candidate.startInstruction) {
        freeList.push_back(it->root);
        it = active.erase(it);
        continue;
      }
      ++it;
    }

    // Best-fit search over the free list. NOTE(review): since only
    // exactly-equal types are accepted below, every acceptable entry has
    // the same size, so "best fit" degenerates to "first matching type" —
    // kept as written in case the type filter is relaxed later.
    auto bestFit = freeList.end();
    uint64_t bestFitBytes = std::numeric_limits<uint64_t>::max();
    auto candidateType = cast<MemRefType>(candidate.alloc.getType());
    for (auto it = freeList.begin(); it != freeList.end(); ++it) {
      auto freeType = cast<MemRefType>((*it).getType());
      if (freeType != candidateType)
        continue;

      uint64_t freeBytes = getTypeSizeBytes(freeType);
      if (freeBytes < candidate.sizeBytes || freeBytes >= bestFitBytes)
        continue;

      bestFit = it;
      bestFitBytes = freeBytes;
    }

    // No reusable buffer: the candidate keeps its own allocation and
    // becomes live itself.
    if (bestFit == freeList.end()) {
      active.push_back(ActiveStorage {candidate.alloc, candidate.endInstruction});
      continue;
    }

    // Reuse: redirect all uses to the earlier buffer, drop the alloc, and
    // extend the reused buffer's live range to this candidate's end.
    memref::AllocOp root = *bestFit;
    freeList.erase(bestFit);
    candidate.alloc.getResult().replaceAllUsesWith(root.getResult());
    rewriter.eraseOp(candidate.alloc);
    active.push_back(ActiveStorage {root, candidate.endInstruction});
    ++stats.removedAllocs;
    stats.savedBytes += candidate.sizeBytes;
  }

  return stats;
}
|
||||
|
||||
} // namespace pim
|
||||
} // namespace onnx_mlir
|
||||
@@ -0,0 +1,35 @@
|
||||
#pragma once
|
||||
|
||||
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
#include "mlir/IR/Operation.h"
|
||||
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
|
||||
namespace onnx_mlir {
|
||||
namespace pim {
|
||||
|
||||
// One local allocation eligible for coalescing, with its liveness
// interval expressed as positions in the enclosing block's linear order.
struct StaticAllocationCandidate {
  mlir::memref::AllocOp alloc;
  // Position of the alloc op itself.
  uint64_t startInstruction = 0;
  // Position of the last (transitive) use of the buffer.
  uint64_t endInstruction = 0;
  // Buffer footprint in bytes.
  uint64_t sizeBytes = 0;
};
|
||||
|
||||
// Result of the candidate scan over one core-like op.
struct StaticMemoryCoalescingAnalysis {
  llvm::SmallVector<StaticAllocationCandidate> candidates;
  // Allocations rejected by the type or lifetime checks.
  uint64_t skippedAllocations = 0;
};
|
||||
|
||||
// Outcome of one coalescing run, used for reporting.
struct StaticMemoryCoalescingStats {
  // Allocations erased because they reuse an earlier buffer.
  uint64_t removedAllocs = 0;
  // Total bytes of the removed allocations.
  uint64_t savedBytes = 0;
  // Allocations the analysis could not consider.
  uint64_t skippedAllocations = 0;
};
|
||||
|
||||
StaticMemoryCoalescingAnalysis analyzeStaticMemoryCoalescingCandidates(mlir::Operation* coreLikeOp);
|
||||
|
||||
StaticMemoryCoalescingStats coalesceStaticMemory(mlir::Operation* coreLikeOp, mlir::RewriterBase& rewriter);
|
||||
|
||||
} // namespace pim
|
||||
} // namespace onnx_mlir
|
||||
@@ -0,0 +1,203 @@
|
||||
#include "mlir/Dialect/Func/IR/FuncOps.h"
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
#include "mlir/Pass/Pass.h"
|
||||
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/Support/raw_os_ostream.h"
|
||||
|
||||
#include <fstream>
|
||||
|
||||
#include "Common/IR/CompactAsmUtils.hpp"
|
||||
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Common/Support/DebugDump.hpp"
|
||||
#include "src/Accelerators/PIM/Common/Support/ReportUtils.hpp"
|
||||
#include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp"
|
||||
#include "src/Accelerators/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/StaticMemoryCoalescing.hpp"
|
||||
#include "src/Accelerators/PIM/Pass/PIMPasses.h"
|
||||
|
||||
using namespace mlir;
|
||||
using namespace onnx_mlir::compact_asm;
|
||||
|
||||
namespace onnx_mlir {
|
||||
namespace {
|
||||
|
||||
// Per-entry statistics printed in the coalescing report. Equality is used
// to group consecutive report entries with identical numbers.
struct CoalescingReportRow {
  uint64_t numCandidates = 0;
  uint64_t numSkipped = 0;
  uint64_t numRemoved = 0;
  uint64_t savedBytes = 0;

  bool operator==(const CoalescingReportRow& other) const {
    return numCandidates == other.numCandidates && numSkipped == other.numSkipped && numRemoved == other.numRemoved
        && savedBytes == other.savedBytes;
  }
};
|
||||
|
||||
// One report line: either a single pim.core op or a pim.core_batch op.
struct CoalescingReportEntry {
  enum class Kind {
    Core,
    Batch
  };

  Kind kind = Kind::Core;
  // Core id for Kind::Core, running batch number for Kind::Batch.
  uint64_t id = 0;
  // For cores: exactly one id. For batches: all member core ids.
  llvm::SmallVector<int32_t, 8> coreIds;
  CoalescingReportRow row;
};
|
||||
|
||||
// Thin local alias for the shared report memory formatter.
static std::string formatMemory(uint64_t bytes) {
  return formatReportMemory(bytes);
}
|
||||
|
||||
// Reads the mandatory core-id list attribute off a pim.core_batch op and
// returns it as an owned vector.
static SmallVector<int32_t> getBatchCoreIds(pim::PimCoreBatchOp coreBatchOp) {
  auto coreIdsAttr = coreBatchOp->getAttrOfType<DenseI32ArrayAttr>(onnx_mlir::kCoreIdsAttrName);
  assert(coreIdsAttr && "pim.core_batch requires coreIds array attribute");
  ArrayRef<int32_t> ids = coreIdsAttr.asArrayRef();
  SmallVector<int32_t> ownedIds;
  ownedIds.append(ids.begin(), ids.end());
  return ownedIds;
}
|
||||
|
||||
// Prints one entry's statistics as a flat field list.
static void printReportRow(raw_ostream& os, const CoalescingReportRow& row) {
  llvm::SmallVector<ReportField, 4> fields;
  fields.push_back({"Number of candidates", std::to_string(row.numCandidates)});
  fields.push_back({"Skipped allocations", std::to_string(row.numSkipped)});
  fields.push_back({"Removed allocations", std::to_string(row.numRemoved)});
  fields.push_back({"Saved memory", formatMemory(row.savedBytes)});
  printReportFlatFields(os, fields);
}
|
||||
|
||||
// Scales an entry's per-core row to totals. Batch entries record the
// stats of one representative core, so totals multiply by the member
// count; the floor of 1 lets plain core entries pass through unchanged.
static CoalescingReportRow getTotalRow(const CoalescingReportEntry& entry) {
  const uint64_t coreCount = entry.coreIds.empty() ? 1 : entry.coreIds.size();
  CoalescingReportRow total;
  total.numCandidates = entry.row.numCandidates * coreCount;
  total.numSkipped = entry.row.numSkipped * coreCount;
  total.numRemoved = entry.row.numRemoved * coreCount;
  total.savedBytes = entry.row.savedBytes * coreCount;
  return total;
}
|
||||
|
||||
// Writes the static-memory-coalescing report: a totals block followed by
// entries grouped into runs of identical rows (so many cores/batches with
// the same numbers print once). The grouping depends on the sorted order
// and on exact row equality, so the control flow is kept as-is.
static void emitReport(ArrayRef<CoalescingReportEntry> entries) {
  std::fstream file = openReportFile("static_memory_coalescing_report");
  if (!file.is_open())
    return;

  llvm::raw_os_ostream os(file);
  // Accumulate grand totals over all entries (batch entries scaled by
  // their core count via getTotalRow).
  CoalescingReportRow totalRow;
  for (const CoalescingReportEntry& entry : entries) {
    CoalescingReportRow entryTotal = getTotalRow(entry);
    totalRow.numCandidates += entryTotal.numCandidates;
    totalRow.numSkipped += entryTotal.numSkipped;
    totalRow.numRemoved += entryTotal.numRemoved;
    totalRow.savedBytes += entryTotal.savedBytes;
  }

  llvm::SmallVector<ReportField, 4> totalFields = {{"Number of candidates", std::to_string(totalRow.numCandidates)},
      {"Skipped allocations", std::to_string(totalRow.numSkipped)},
      {"Removed allocations", std::to_string(totalRow.numRemoved)},
      {"Saved memory", formatMemory(totalRow.savedBytes)}};
  printReportTotalsBlock(os, totalFields);
  if (!entries.empty())
    os << "\n";

  // Sort so entries sharing identical rows become adjacent runs.
  llvm::SmallVector<CoalescingReportEntry, 32> sortedEntries(entries.begin(), entries.end());
  sortReportEntriesByFirstCore(sortedEntries);

  for (size_t index = 0; index < sortedEntries.size();) {
    // A run is a maximal span of same-kind entries with identical rows.
    size_t runEnd = index + 1;
    while (runEnd < sortedEntries.size() && sortedEntries[runEnd].kind == sortedEntries[index].kind
        && sortedEntries[runEnd].row == sortedEntries[index].row) {
      ++runEnd;
    }

    if (sortedEntries[index].kind == CoalescingReportEntry::Kind::Batch) {
      // Batch header: "Batch <id> (cores <compressed list>)" per member.
      os << "Batch ";
      for (size_t batchIndex = index; batchIndex < runEnd; ++batchIndex) {
        if (batchIndex != index)
          os << ",\n      ";
        os << sortedEntries[batchIndex].id << " (cores ";
        printCompressedIntegerEntries(os, ArrayRef<int32_t>(sortedEntries[batchIndex].coreIds));
        os << ")";
      }
    }
    else {
      // Core header: one compressed list of all core ids in the run.
      llvm::SmallVector<int32_t, 8> coreIds;
      for (size_t coreIndex = index; coreIndex < runEnd; ++coreIndex)
        coreIds.push_back(sortedEntries[coreIndex].coreIds.front());
      os << "Core ";
      printCompressedIntegerEntries(os, ArrayRef<int32_t>(coreIds));
    }

    os << ":\n";
    if (sortedEntries[index].kind == CoalescingReportEntry::Kind::Batch) {
      // Batches show per-core stats plus totals scaled by core count.
      llvm::SmallVector<ReportField, 4> perCoreFields = {
          {"Number of candidates", std::to_string(sortedEntries[index].row.numCandidates)},
          {"Skipped allocations", std::to_string(sortedEntries[index].row.numSkipped)},
          {"Removed allocations", std::to_string(sortedEntries[index].row.numRemoved)},
          {"Saved memory", formatMemory(sortedEntries[index].row.savedBytes)}};
      // NOTE(review): this local intentionally shadows the file-total
      // `totalRow` above; it holds this batch's scaled totals only.
      CoalescingReportRow totalRow = getTotalRow(sortedEntries[index]);
      llvm::SmallVector<ReportField, 4> totalFields = {
          {"Number of candidates", std::to_string(totalRow.numCandidates)},
          {"Skipped allocations", std::to_string(totalRow.numSkipped)},
          {"Removed allocations", std::to_string(totalRow.numRemoved)},
          {"Saved memory", formatMemory(totalRow.savedBytes)}};
      printReportPerCoreAndTotalFields(os, perCoreFields, totalFields);
    }
    else {
      printReportRow(os, sortedEntries[index].row);
    }
    printReportEntrySeparator(os, runEnd < sortedEntries.size());
    index = runEnd;
  }

  os.flush();
  file.close();
}
|
||||
|
||||
// Module pass that runs static memory coalescing on every pim.core /
// pim.core_batch op, then emits a human-readable report and a debug dump.
struct StaticMemoryCoalescingPass : PassWrapper<StaticMemoryCoalescingPass, OperationPass<ModuleOp>> {
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(StaticMemoryCoalescingPass)

  StringRef getArgument() const override { return "pim-static-memory-coalescing"; }
  StringRef getDescription() const override { return "Analyze static local PIM memory reuse opportunities"; }

  StaticMemoryCoalescingPass() = default;
  // Forward to the PassWrapper copy constructor so base-class state is
  // copied instead of default-initialized (the previous empty body left
  // the base default-constructed).
  StaticMemoryCoalescingPass(const StaticMemoryCoalescingPass& pass) : PassWrapper(pass) {}

  void runOnOperation() override {
    IRRewriter rewriter(&getContext());
    SmallVector<CoalescingReportEntry, 32> reportEntries;
    uint64_t nextBatchId = 0;

    getOperation().walk([&](Operation* op) {
      if (!isa<pim::PimCoreOp, pim::PimCoreBatchOp>(op))
        return;

      // NOTE: coalesceStaticMemory re-runs the analysis internally; the
      // explicit call here is only needed for the candidate count, which
      // the stats struct does not carry.
      auto analysis = pim::analyzeStaticMemoryCoalescingCandidates(op);
      auto stats = pim::coalesceStaticMemory(op, rewriter);
      CoalescingReportRow row {
          analysis.candidates.size(), stats.skippedAllocations, stats.removedAllocs, stats.savedBytes};

      if (auto coreOp = dyn_cast<pim::PimCoreOp>(op)) {
        reportEntries.push_back({CoalescingReportEntry::Kind::Core,
            static_cast<uint64_t>(coreOp.getCoreId()),
            {static_cast<int32_t>(coreOp.getCoreId())},
            row});
        return;
      }

      // Batch entry: record all member core ids and a running batch id.
      auto coreIds = getBatchCoreIds(cast<pim::PimCoreBatchOp>(op));
      CoalescingReportEntry entry;
      entry.kind = CoalescingReportEntry::Kind::Batch;
      entry.id = nextBatchId++;
      llvm::append_range(entry.coreIds, coreIds);
      entry.row = row;
      reportEntries.push_back(std::move(entry));
    });

    emitReport(reportEntries);
    dumpModule(getOperation(), "pim2_coalesced");
  }
};
|
||||
|
||||
} // namespace
|
||||
|
||||
// Factory for the PIM static memory coalescing pass (pipeline registration).
std::unique_ptr<Pass> createPimStaticMemoryCoalescingPass() {
  return std::make_unique<StaticMemoryCoalescingPass>();
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
@@ -40,6 +40,7 @@
|
||||
#include "RegularOpCompaction.hpp"
|
||||
#include "src/Accelerators/PIM/Common/IR/CompactAsmUtils.hpp"
|
||||
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Common/Support/ReportUtils.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
||||
|
||||
using namespace mlir;
|
||||
@@ -764,18 +765,13 @@ void emitMotifProfile(func::FuncOp funcOp) {
|
||||
}
|
||||
|
||||
void generateReport(func::FuncOp funcOp, const std::string& name, size_t usedCpuCount = 0) {
|
||||
std::string outputDir = getOutputDir();
|
||||
if (outputDir.empty())
|
||||
std::fstream file = openReportFile(name);
|
||||
if (!file.is_open())
|
||||
return;
|
||||
|
||||
std::string reportsDir = outputDir + "/reports";
|
||||
createDirectory(reportsDir);
|
||||
|
||||
std::fstream file(reportsDir + "/" + name + ".txt", std::ios::out);
|
||||
llvm::raw_os_ostream os(file);
|
||||
|
||||
struct ReportRow {
|
||||
uint64_t opId = 0;
|
||||
uint64_t id = 0;
|
||||
uint64_t logicalComputeCount = 0;
|
||||
uint64_t weightCount = 0;
|
||||
uint64_t instructionCount = 0;
|
||||
@@ -786,6 +782,9 @@ void generateReport(func::FuncOp funcOp, const std::string& name, size_t usedCpu
|
||||
uint64_t totalComputeOps = 0;
|
||||
uint64_t totalLogicalComputes = 0;
|
||||
uint64_t totalBatchComputeOps = 0;
|
||||
uint64_t totalInstructionCount = 0;
|
||||
uint64_t totalWeightCount = 0;
|
||||
uint64_t nextBatchId = 0;
|
||||
std::vector<ReportRow> collectedData;
|
||||
|
||||
for (Operation& op : funcOp.getBody().front()) {
|
||||
@@ -793,8 +792,13 @@ void generateReport(func::FuncOp funcOp, const std::string& name, size_t usedCpu
|
||||
uint64_t numInst = 0;
|
||||
for (auto& _ : spatCompute.getRegion().front())
|
||||
++numInst;
|
||||
collectedData.push_back({totalComputeOps++, 1, spatCompute.getWeights().size(), numInst, false, {}});
|
||||
SmallVector<int32_t> coreIds;
|
||||
if (auto coreId = getComputeCoreId(spatCompute))
|
||||
coreIds.push_back(*coreId);
|
||||
collectedData.push_back({totalComputeOps++, 1, spatCompute.getWeights().size(), numInst, false, coreIds});
|
||||
totalLogicalComputes += 1;
|
||||
totalInstructionCount += numInst;
|
||||
totalWeightCount += spatCompute.getWeights().size();
|
||||
continue;
|
||||
}
|
||||
if (auto batch = dyn_cast<SpatComputeBatch>(&op)) {
|
||||
@@ -805,44 +809,27 @@ void generateReport(func::FuncOp funcOp, const std::string& name, size_t usedCpu
|
||||
SmallVector<int32_t> coreIds;
|
||||
if (auto coreIdsAttr = batch->getAttrOfType<DenseI32ArrayAttr>(onnx_mlir::kCoreIdsAttrName))
|
||||
llvm::append_range(coreIds, coreIdsAttr.asArrayRef());
|
||||
collectedData.push_back({totalComputeOps++, logicalCount, batch.getWeights().size(), numInst, true, coreIds});
|
||||
collectedData.push_back({nextBatchId++, logicalCount, batch.getWeights().size(), numInst, true, coreIds});
|
||||
totalComputeOps += 1;
|
||||
totalLogicalComputes += logicalCount;
|
||||
totalBatchComputeOps += 1;
|
||||
totalInstructionCount += numInst * logicalCount;
|
||||
totalWeightCount += batch.getWeights().size();
|
||||
}
|
||||
}
|
||||
|
||||
os << "Used cores: " << usedCpuCount << "\n";
|
||||
os << "Number of top-level compute ops: " << totalComputeOps << "\n";
|
||||
os << "Number of logical computes: " << totalLogicalComputes << "\n";
|
||||
os << "Number of top-level batch compute ops: " << totalBatchComputeOps << "\n";
|
||||
os << "\n";
|
||||
llvm::SmallVector<ReportField, 6> totalFields = {{"Used cores", std::to_string(usedCpuCount)},
|
||||
{"Number of top-level compute ops", std::to_string(totalComputeOps)},
|
||||
{"Number of logical computes", std::to_string(totalLogicalComputes)},
|
||||
{"Number of top-level batch compute ops",
|
||||
std::to_string(totalBatchComputeOps)},
|
||||
{"Number of instructions", std::to_string(totalInstructionCount)},
|
||||
{"Number of used crossbars", std::to_string(totalWeightCount)}};
|
||||
printReportTotalsBlock(os, totalFields);
|
||||
if (!collectedData.empty())
|
||||
os << "\n";
|
||||
|
||||
std::stable_sort(collectedData.begin(), collectedData.end(), [](const ReportRow& lft, const ReportRow& rgt) {
|
||||
if (lft.isRebatched != rgt.isRebatched)
|
||||
return lft.isRebatched > rgt.isRebatched;
|
||||
|
||||
if (lft.instructionCount < rgt.instructionCount)
|
||||
return false;
|
||||
else if (rgt.instructionCount < lft.instructionCount)
|
||||
return true;
|
||||
|
||||
if (lft.weightCount < rgt.weightCount)
|
||||
return false;
|
||||
else if (rgt.weightCount < lft.weightCount)
|
||||
return true;
|
||||
|
||||
if (lft.logicalComputeCount < rgt.logicalComputeCount)
|
||||
return false;
|
||||
else if (rgt.logicalComputeCount < lft.logicalComputeCount)
|
||||
return true;
|
||||
|
||||
if (lft.opId < rgt.opId)
|
||||
return true;
|
||||
else if (rgt.opId < lft.opId)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
});
|
||||
sortReportEntriesByFirstCore(collectedData);
|
||||
|
||||
for (uint64_t cI = 0; cI < totalComputeOps; ++cI) {
|
||||
uint64_t lastIndex = cI;
|
||||
@@ -863,7 +850,7 @@ void generateReport(func::FuncOp funcOp, const std::string& name, size_t usedCpu
|
||||
for (uint64_t index = cI; index <= lastIndex; ++index) {
|
||||
if (index != cI)
|
||||
os << ",\n ";
|
||||
os << collectedData[index].opId << " (cores ";
|
||||
os << collectedData[index].id << " (cores ";
|
||||
if (collectedData[index].coreIds.empty())
|
||||
os << "unknown";
|
||||
else
|
||||
@@ -876,14 +863,32 @@ void generateReport(func::FuncOp funcOp, const std::string& name, size_t usedCpu
|
||||
SmallVector<uint64_t> opIds;
|
||||
opIds.reserve(lastIndex - cI + 1);
|
||||
for (uint64_t index = cI; index <= lastIndex; ++index)
|
||||
opIds.push_back(collectedData[index].opId);
|
||||
opIds.push_back(collectedData[index].id);
|
||||
printCompressedIntegerEntries(os, ArrayRef<uint64_t>(opIds));
|
||||
}
|
||||
|
||||
os << ":\n";
|
||||
os << "\tNumber of logical computes: " << current.logicalComputeCount << "\n";
|
||||
os << "\tNumber of instructions: " << current.instructionCount << "\n";
|
||||
os << "\tNumber of used crossbars: " << current.weightCount << "\n";
|
||||
uint64_t perCoreLogicalComputeCount = current.isRebatched ? 1 : current.logicalComputeCount;
|
||||
uint64_t perCoreInstructionCount = current.instructionCount;
|
||||
uint64_t perCoreWeightCount =
|
||||
current.logicalComputeCount == 0 ? 0 : current.weightCount / current.logicalComputeCount;
|
||||
uint64_t totalEntryInstructionCount = current.instructionCount * current.logicalComputeCount;
|
||||
|
||||
llvm::SmallVector<ReportField, 3> perCoreFields = {
|
||||
{"Number of logical computes", std::to_string(perCoreLogicalComputeCount)},
|
||||
{"Number of instructions", std::to_string(perCoreInstructionCount)},
|
||||
{"Number of used crossbars", std::to_string(perCoreWeightCount)}};
|
||||
if (current.isRebatched) {
|
||||
llvm::SmallVector<ReportField, 3> totalEntryFields = {
|
||||
{"Number of logical computes", std::to_string(current.logicalComputeCount)},
|
||||
{"Number of instructions", std::to_string(totalEntryInstructionCount)},
|
||||
{"Number of used crossbars", std::to_string(current.weightCount)}};
|
||||
printReportPerCoreAndTotalFields(os, perCoreFields, totalEntryFields);
|
||||
}
|
||||
else {
|
||||
printReportFlatFields(os, perCoreFields);
|
||||
}
|
||||
printReportEntrySeparator(os, lastIndex + 1 < totalComputeOps);
|
||||
cI = lastIndex;
|
||||
}
|
||||
|
||||
|
||||
@@ -15,6 +15,8 @@ std::unique_ptr<mlir::Pass> createSpatialToPimPass();
|
||||
|
||||
std::unique_ptr<mlir::Pass> createPimBufferizationPass();
|
||||
|
||||
std::unique_ptr<mlir::Pass> createPimStaticMemoryCoalescingPass();
|
||||
|
||||
std::unique_ptr<mlir::Pass> createMergeComputeNodesPass();
|
||||
|
||||
std::unique_ptr<mlir::Pass> createPimHostConstantFoldingPass();
|
||||
|
||||
@@ -39,7 +39,7 @@ struct HostConstantFoldingPass : PassWrapper<HostConstantFoldingPass, OperationP
|
||||
return;
|
||||
}
|
||||
|
||||
dumpModule(getOperation(), "pim2_folded");
|
||||
dumpModule(getOperation(), "pim3_folded");
|
||||
}
|
||||
|
||||
std::shared_ptr<const FrozenRewritePatternSet> patterns;
|
||||
|
||||
@@ -164,7 +164,7 @@ struct MaterializeHostConstantsPass : PassWrapper<MaterializeHostConstantsPass,
|
||||
return;
|
||||
}
|
||||
|
||||
dumpModule(moduleOp, "pim3_materialized");
|
||||
dumpModule(moduleOp, "pim4_materialized");
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -102,6 +102,35 @@ static bool isExplicitHostOperand(Operation* op, unsigned operandIndex) {
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool isSupportedCoreInstructionOp(Operation* op) {
|
||||
return isa<pim::PimMemCopyHostToDevOp,
|
||||
pim::PimMemCopyHostToDevBatchOp,
|
||||
pim::PimMemCopyDevToHostOp,
|
||||
pim::PimMemCopyOp,
|
||||
pim::PimReceiveOp,
|
||||
pim::PimReceiveBatchOp,
|
||||
pim::PimReceiveTensorOp,
|
||||
pim::PimReceiveTensorBatchOp,
|
||||
pim::PimSendOp,
|
||||
pim::PimSendBatchOp,
|
||||
pim::PimSendTensorOp,
|
||||
pim::PimSendTensorBatchOp,
|
||||
pim::PimConcatOp,
|
||||
pim::PimVMMOp,
|
||||
pim::PimTransposeOp,
|
||||
pim::PimVVAddOp,
|
||||
pim::PimVVSubOp,
|
||||
pim::PimVVMulOp,
|
||||
pim::PimVVMaxOp,
|
||||
pim::PimVVDMulOp,
|
||||
pim::PimVAvgOp,
|
||||
pim::PimVReluOp,
|
||||
pim::PimVTanhOp,
|
||||
pim::PimVSigmOp,
|
||||
pim::PimVSoftmaxOp,
|
||||
memref::GetGlobalOp>(op);
|
||||
}
|
||||
|
||||
struct VerificationPass : PassWrapper<VerificationPass, OperationPass<ModuleOp>> {
|
||||
MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(VerificationPass)
|
||||
|
||||
@@ -214,6 +243,11 @@ private:
|
||||
return walkPimCoreBlock(
|
||||
coreOp.getBody().front(), StaticValueKnowledge {}, [](Operation& op, const StaticValueKnowledge& knowledge) {
|
||||
bool hasFailure = false;
|
||||
if (!isSupportedCoreInstructionOp(&op)) {
|
||||
op.emitOpError("unsupported executable op reached PIM codegen verification");
|
||||
hasFailure = true;
|
||||
}
|
||||
|
||||
for (auto [operandIndex, operand] : llvm::enumerate(op.getOperands())) {
|
||||
if (!isa<BaseMemRefType>(operand.getType()))
|
||||
continue;
|
||||
|
||||
@@ -75,6 +75,7 @@ void PimAccelerator::registerPasses(int optLevel) const {
|
||||
registerPass(createSpatialToGraphvizPass);
|
||||
registerPass(createSpatialToPimPass);
|
||||
registerPass(createPimBufferizationPass);
|
||||
registerPass(createPimStaticMemoryCoalescingPass);
|
||||
registerPass(createMergeComputeNodesPass);
|
||||
registerPass(createPimHostConstantFoldingPass);
|
||||
registerPass(createPimMaterializeHostConstantsPass);
|
||||
|
||||
Reference in New Issue
Block a user