binary pim code for reduced memory usage
Validate Operations / validate-operations (push) Has been cancelled
Validate Operations / validate-operations (push) Has been cancelled
fast pim code emission
This commit is contained in:
+194
-155
@@ -30,6 +30,7 @@
|
||||
#include "Conversion/ONNXToSpatial/Common/Common.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimArtifactWriter.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimBatchEmission.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimBinaryFormat.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimCodeGen.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimWeightEmitter.hpp"
|
||||
@@ -116,25 +117,29 @@ void PimMemory::allocateCore(Operation* op) {
|
||||
|
||||
static void printHostMemoryReportRow(raw_ostream& os, const MemoryReportRow& row) {
|
||||
llvm::SmallVector<ReportField, 2> fields = {
|
||||
{"Number of globals", std::to_string(row.numGlobal)},
|
||||
{"Global memory", formatReportMemory(row.sizeGlobal)}};
|
||||
{"Number of globals", std::to_string(row.numGlobal) },
|
||||
{"Global memory", formatReportMemory(row.sizeGlobal)}
|
||||
};
|
||||
printReportFlatFields(os, fields);
|
||||
}
|
||||
|
||||
static void printCoreMemoryReportRow(raw_ostream& os, const MemoryReportEntry& entry) {
|
||||
llvm::SmallVector<ReportField, 2> fields = {
|
||||
{"Number of allocas", std::to_string(entry.row.numAlloca)},
|
||||
{"Allocated memory", formatReportMemory(entry.row.sizeAlloca)}};
|
||||
{"Number of allocas", std::to_string(entry.row.numAlloca) },
|
||||
{"Allocated memory", formatReportMemory(entry.row.sizeAlloca)}
|
||||
};
|
||||
printReportFlatFields(os, fields);
|
||||
}
|
||||
|
||||
static void printBatchMemoryReportRow(raw_ostream& os, const MemoryReportEntry& entry) {
|
||||
llvm::SmallVector<ReportField, 2> perCoreFields = {
|
||||
{"Number of allocas", std::to_string(entry.row.numAlloca)},
|
||||
{"Allocated memory", formatReportMemory(entry.row.sizeAlloca)}};
|
||||
{"Number of allocas", std::to_string(entry.row.numAlloca) },
|
||||
{"Allocated memory", formatReportMemory(entry.row.sizeAlloca)}
|
||||
};
|
||||
llvm::SmallVector<ReportField, 2> totalFields = {
|
||||
{"Number of allocas", std::to_string(entry.totalAllocaCount)},
|
||||
{"Batch memory", formatReportMemory(entry.totalAllocaBytes)}};
|
||||
{"Number of allocas", std::to_string(entry.totalAllocaCount) },
|
||||
{"Batch memory", formatReportMemory(entry.totalAllocaBytes)}
|
||||
};
|
||||
printReportPerCoreAndTotalFields(os, perCoreFields, totalFields);
|
||||
}
|
||||
|
||||
@@ -215,12 +220,8 @@ size_t PimAcceleratorMemory::getValueAddress(mlir::Value value, const StaticValu
|
||||
void PimAcceleratorMemory::reportHost() { hostReportRow = hostMem.getReportRow(); }
|
||||
|
||||
void PimAcceleratorMemory::recordCoreReport(size_t coreId, const MemoryReportRow& row) {
|
||||
reportEntries.push_back({MemoryReportEntry::Kind::Core,
|
||||
coreId,
|
||||
{static_cast<int32_t>(coreId)},
|
||||
row,
|
||||
row.numAlloca,
|
||||
row.sizeAlloca});
|
||||
reportEntries.push_back(
|
||||
{MemoryReportEntry::Kind::Core, coreId, {static_cast<int32_t>(coreId)}, row, row.numAlloca, row.sizeAlloca});
|
||||
}
|
||||
|
||||
void PimAcceleratorMemory::recordBatchReport(uint64_t batchId,
|
||||
@@ -250,7 +251,8 @@ void PimAcceleratorMemory::flushReport() {
|
||||
|
||||
llvm::SmallVector<ReportField, 2> totalFields = {
|
||||
{"Global memory", formatReportMemory(totalGlobalMemory)},
|
||||
{"Cores memory", formatReportMemory(totalCoresMemory)}};
|
||||
{"Cores memory", formatReportMemory(totalCoresMemory) }
|
||||
};
|
||||
printReportTotalsBlock(os, totalFields);
|
||||
|
||||
if (hostReportRow.has_value()) {
|
||||
@@ -312,36 +314,25 @@ void PimAcceleratorMemory::clean(mlir::Operation* op) {
|
||||
}
|
||||
}
|
||||
|
||||
json::Object PimCodeGen::createEmptyOffset() {
|
||||
json::Object offset;
|
||||
offset["offset_select"] = 0;
|
||||
offset["offset_value"] = 0;
|
||||
return offset;
|
||||
}
|
||||
|
||||
size_t PimCodeGen::remapCoreId(size_t coreId) const {
|
||||
auto it = emittedCoreIds.find(coreId);
|
||||
assert(it != emittedCoreIds.end() && "Missing emitted core id remapping");
|
||||
return it->second;
|
||||
}
|
||||
|
||||
static json::Object createRs1OnlyOffset() {
|
||||
json::Object offset;
|
||||
offset["offset_select"] = 1;
|
||||
offset["offset_value"] = 0;
|
||||
return offset;
|
||||
}
|
||||
|
||||
void PimCodeGen::emitInstruction(json::Object instruction) const {
|
||||
coreFileStream << json::Value(std::move(instruction)) << ',';
|
||||
void PimCodeGen::emitInstruction(const pim_binary::InstructionRecord& instruction) const {
|
||||
pim_binary::writeInstructionRecord(coreBinaryStream, instruction);
|
||||
++emittedInstructionCount;
|
||||
if (coreJsonStream)
|
||||
*coreJsonStream << json::Value(pim_binary::makeInstructionJson(instruction)) << ',';
|
||||
}
|
||||
|
||||
void PimCodeGen::genSetRegisterImmediateUnsigned(size_t registerNumber, size_t immediate) const {
|
||||
json::Object json;
|
||||
json["op"] = "sldi";
|
||||
json["rd"] = registerNumber;
|
||||
json["imm"] = immediate;
|
||||
emitInstruction(std::move(json));
|
||||
pim_binary::InstructionRecord instruction;
|
||||
instruction.opcode = pim_binary::Opcode::sldi;
|
||||
instruction.rd = static_cast<uint8_t>(registerNumber);
|
||||
instruction.r2OrImm = static_cast<int32_t>(immediate);
|
||||
emitInstruction(instruction);
|
||||
}
|
||||
|
||||
void PimCodeGen::setupRd(size_t rdAddress, size_t rdOffset) const {
|
||||
@@ -369,38 +360,41 @@ void PimCodeGen::emitMemCopyOp(StringRef opName,
|
||||
StringRef sizeFieldName) const {
|
||||
setupRdRs1(rdAddr, rdOffset, rs1Addr, rs1Offset);
|
||||
|
||||
json::Object json;
|
||||
json["op"] = opName;
|
||||
json["rd"] = 0;
|
||||
json["rs1"] = 1;
|
||||
json[sizeFieldName] = size;
|
||||
json["offset"] = createEmptyOffset();
|
||||
emitInstruction(std::move(json));
|
||||
pim_binary::InstructionRecord instruction;
|
||||
instruction.opcode = pim_binary::opcodeFromString(opName);
|
||||
instruction.rd = 0;
|
||||
instruction.r1 = 1;
|
||||
instruction.generic1 = 0;
|
||||
instruction.generic2 = 0;
|
||||
instruction.generic3 = static_cast<int32_t>(size);
|
||||
(void)sizeFieldName;
|
||||
emitInstruction(instruction);
|
||||
}
|
||||
|
||||
void PimCodeGen::emitCommunicationOp(StringRef opName, size_t bufferAddr, size_t coreId, size_t size) const {
|
||||
setupRd(bufferAddr, 0);
|
||||
|
||||
json::Object json;
|
||||
json["op"] = opName;
|
||||
json["rd"] = 0;
|
||||
json["core"] = remapCoreId(coreId);
|
||||
json["size"] = size;
|
||||
json["offset"] = createEmptyOffset();
|
||||
emitInstruction(std::move(json));
|
||||
pim_binary::InstructionRecord instruction;
|
||||
instruction.opcode = pim_binary::opcodeFromString(opName);
|
||||
instruction.rd = 0;
|
||||
instruction.r2OrImm = static_cast<int32_t>(remapCoreId(coreId));
|
||||
instruction.generic1 = 0;
|
||||
instruction.generic2 = 0;
|
||||
instruction.generic3 = static_cast<int32_t>(size);
|
||||
emitInstruction(instruction);
|
||||
}
|
||||
|
||||
void PimCodeGen::emitMvmOp(size_t groupId, size_t rdAddr, size_t rdOffset, size_t rs1Addr, size_t rs1Offset) const {
|
||||
setupRdRs1(rdAddr, rdOffset, rs1Addr, rs1Offset);
|
||||
|
||||
json::Object json;
|
||||
json["op"] = "mvmul";
|
||||
json["rd"] = 0;
|
||||
json["rs1"] = 1;
|
||||
json["group"] = groupId;
|
||||
json["relu"] = 0;
|
||||
json["mbiw"] = 8;
|
||||
emitInstruction(std::move(json));
|
||||
pim_binary::InstructionRecord instruction;
|
||||
instruction.opcode = pim_binary::Opcode::mvmul;
|
||||
instruction.rd = 0;
|
||||
instruction.r1 = 1;
|
||||
instruction.r2OrImm = 8;
|
||||
instruction.generic1 = 0;
|
||||
instruction.generic2 = static_cast<int32_t>(groupId);
|
||||
emitInstruction(instruction);
|
||||
}
|
||||
|
||||
void PimCodeGen::codeGenLoadOp(pim::PimMemCopyHostToDevOp loadOp, const StaticValueKnowledge& knowledge) const {
|
||||
@@ -508,14 +502,13 @@ void PimCodeGen::codeGenVVAddOp(pim::PimVVAddOp vvaddOp, const StaticValueKnowle
|
||||
auto rhsAddr = addressOf(vvaddOp.getRhs(), knowledge);
|
||||
setupRdRs1Rs2(outputBufferAddr, 0, lhsAddr, 0, rhsAddr, 0);
|
||||
|
||||
json::Object json;
|
||||
json["op"] = "vvadd";
|
||||
json["rd"] = 0;
|
||||
json["rs1"] = 1;
|
||||
json["rs2"] = 2;
|
||||
json["offset"] = createEmptyOffset();
|
||||
json["len"] = getValueSizeInBytes(vvaddOp.getLhs());
|
||||
emitInstruction(std::move(json));
|
||||
pim_binary::InstructionRecord instruction;
|
||||
instruction.opcode = pim_binary::Opcode::vvadd;
|
||||
instruction.rd = 0;
|
||||
instruction.r1 = 1;
|
||||
instruction.r2OrImm = 2;
|
||||
instruction.generic3 = static_cast<int32_t>(getValueSizeInBytes(vvaddOp.getLhs()));
|
||||
emitInstruction(instruction);
|
||||
}
|
||||
|
||||
void PimCodeGen::codeGenVVSubOp(pim::PimVVSubOp vvsubOp, const StaticValueKnowledge& knowledge) const {
|
||||
@@ -524,14 +517,13 @@ void PimCodeGen::codeGenVVSubOp(pim::PimVVSubOp vvsubOp, const StaticValueKnowle
|
||||
auto rhsAddr = addressOf(vvsubOp.getRhs(), knowledge);
|
||||
setupRdRs1Rs2(outputBufferAddr, 0, lhsAddr, 0, rhsAddr, 0);
|
||||
|
||||
json::Object json;
|
||||
json["op"] = "vvsub";
|
||||
json["rd"] = 0;
|
||||
json["rs1"] = 1;
|
||||
json["rs2"] = 2;
|
||||
json["offset"] = createEmptyOffset();
|
||||
json["len"] = getValueSizeInBytes(vvsubOp.getLhs());
|
||||
emitInstruction(std::move(json));
|
||||
pim_binary::InstructionRecord instruction;
|
||||
instruction.opcode = pim_binary::Opcode::vvsub;
|
||||
instruction.rd = 0;
|
||||
instruction.r1 = 1;
|
||||
instruction.r2OrImm = 2;
|
||||
instruction.generic3 = static_cast<int32_t>(getValueSizeInBytes(vvsubOp.getLhs()));
|
||||
emitInstruction(instruction);
|
||||
}
|
||||
|
||||
void PimCodeGen::codeGenVVMulOp(pim::PimVVMulOp vvmulOp, const StaticValueKnowledge& knowledge) const {
|
||||
@@ -540,14 +532,13 @@ void PimCodeGen::codeGenVVMulOp(pim::PimVVMulOp vvmulOp, const StaticValueKnowle
|
||||
auto rhsAddr = addressOf(vvmulOp.getRhs(), knowledge);
|
||||
setupRdRs1Rs2(outputBufferAddr, 0, lhsAddr, 0, rhsAddr, 0);
|
||||
|
||||
json::Object json;
|
||||
json["op"] = "vvmul";
|
||||
json["rd"] = 0;
|
||||
json["rs1"] = 1;
|
||||
json["rs2"] = 2;
|
||||
json["offset"] = createEmptyOffset();
|
||||
json["len"] = getValueSizeInBytes(vvmulOp.getLhs());
|
||||
emitInstruction(std::move(json));
|
||||
pim_binary::InstructionRecord instruction;
|
||||
instruction.opcode = pim_binary::Opcode::vvmul;
|
||||
instruction.rd = 0;
|
||||
instruction.r1 = 1;
|
||||
instruction.r2OrImm = 2;
|
||||
instruction.generic3 = static_cast<int32_t>(getValueSizeInBytes(vvmulOp.getLhs()));
|
||||
emitInstruction(instruction);
|
||||
}
|
||||
|
||||
void PimCodeGen::codeGenVVMaxOp(pim::PimVVMaxOp vvmaxOp, const StaticValueKnowledge& knowledge) const {
|
||||
@@ -556,14 +547,13 @@ void PimCodeGen::codeGenVVMaxOp(pim::PimVVMaxOp vvmaxOp, const StaticValueKnowle
|
||||
auto rhsAddr = addressOf(vvmaxOp.getRhs(), knowledge);
|
||||
setupRdRs1Rs2(outputBufferAddr, 0, lhsAddr, 0, rhsAddr, 0);
|
||||
|
||||
json::Object json;
|
||||
json["op"] = "vvmax";
|
||||
json["rd"] = 0;
|
||||
json["rs1"] = 1;
|
||||
json["rs2"] = 2;
|
||||
json["offset"] = createEmptyOffset();
|
||||
json["len"] = getValueSizeInBytes(vvmaxOp.getLhs());
|
||||
emitInstruction(std::move(json));
|
||||
pim_binary::InstructionRecord instruction;
|
||||
instruction.opcode = pim_binary::Opcode::vvmax;
|
||||
instruction.rd = 0;
|
||||
instruction.r1 = 1;
|
||||
instruction.r2OrImm = 2;
|
||||
instruction.generic3 = static_cast<int32_t>(getValueSizeInBytes(vvmaxOp.getLhs()));
|
||||
emitInstruction(instruction);
|
||||
}
|
||||
|
||||
void PimCodeGen::codeGenVVDMulOp(pim::PimVVDMulOp vvdmulOp, const StaticValueKnowledge& knowledge) const {
|
||||
@@ -572,14 +562,13 @@ void PimCodeGen::codeGenVVDMulOp(pim::PimVVDMulOp vvdmulOp, const StaticValueKno
|
||||
auto rhsAddr = addressOf(vvdmulOp.getRhs(), knowledge);
|
||||
setupRdRs1Rs2(outputBufferAddr, 0, lhsAddr, 0, rhsAddr, 0);
|
||||
|
||||
json::Object json;
|
||||
json["op"] = "vvdmul";
|
||||
json["rd"] = 0;
|
||||
json["rs1"] = 1;
|
||||
json["rs2"] = 2;
|
||||
json["offset"] = createEmptyOffset();
|
||||
json["len"] = getValueSizeInBytes(vvdmulOp.getLhs());
|
||||
emitInstruction(std::move(json));
|
||||
pim_binary::InstructionRecord instruction;
|
||||
instruction.opcode = pim_binary::Opcode::vvdmul;
|
||||
instruction.rd = 0;
|
||||
instruction.r1 = 1;
|
||||
instruction.r2OrImm = 2;
|
||||
instruction.generic3 = static_cast<int32_t>(getValueSizeInBytes(vvdmulOp.getLhs()));
|
||||
emitInstruction(instruction);
|
||||
}
|
||||
|
||||
void PimCodeGen::codeGenVAvgOp(pim::PimVAvgOp vavgOp, const StaticValueKnowledge& knowledge) const {
|
||||
@@ -587,14 +576,14 @@ void PimCodeGen::codeGenVAvgOp(pim::PimVAvgOp vavgOp, const StaticValueKnowledge
|
||||
auto inputAddr = addressOf(vavgOp.getInput(), knowledge);
|
||||
setupRdRs1(outputBufferAddr, 0, inputAddr, 0);
|
||||
|
||||
json::Object json;
|
||||
json["op"] = "vavg";
|
||||
json["rd"] = 0;
|
||||
json["rs1"] = 1;
|
||||
json["rs2"] = 1;
|
||||
json["offset"] = createRs1OnlyOffset();
|
||||
json["len"] = getValueSizeInBytes(vavgOp.getInput());
|
||||
emitInstruction(std::move(json));
|
||||
pim_binary::InstructionRecord instruction;
|
||||
instruction.opcode = pim_binary::Opcode::vavg;
|
||||
instruction.rd = 0;
|
||||
instruction.r1 = 1;
|
||||
instruction.r2OrImm = 1;
|
||||
instruction.generic1 = 1;
|
||||
instruction.generic3 = static_cast<int32_t>(getValueSizeInBytes(vavgOp.getInput()));
|
||||
emitInstruction(instruction);
|
||||
}
|
||||
|
||||
void PimCodeGen::codeGenVReluOp(pim::PimVReluOp vreluOp, const StaticValueKnowledge& knowledge) const {
|
||||
@@ -602,13 +591,12 @@ void PimCodeGen::codeGenVReluOp(pim::PimVReluOp vreluOp, const StaticValueKnowle
|
||||
auto inputAddr = addressOf(vreluOp.getInput(), knowledge);
|
||||
setupRdRs1(outputBufferAddr, 0, inputAddr, 0);
|
||||
|
||||
json::Object json;
|
||||
json["op"] = "vrelu";
|
||||
json["rd"] = 0;
|
||||
json["rs1"] = 1;
|
||||
json["offset"] = createEmptyOffset();
|
||||
json["len"] = getValueSizeInBytes(vreluOp.getInput());
|
||||
emitInstruction(std::move(json));
|
||||
pim_binary::InstructionRecord instruction;
|
||||
instruction.opcode = pim_binary::Opcode::vrelu;
|
||||
instruction.rd = 0;
|
||||
instruction.r1 = 1;
|
||||
instruction.generic3 = static_cast<int32_t>(getValueSizeInBytes(vreluOp.getInput()));
|
||||
emitInstruction(instruction);
|
||||
}
|
||||
|
||||
void PimCodeGen::codeGenVTanhOp(pim::PimVTanhOp vtanhOp, const StaticValueKnowledge& knowledge) const {
|
||||
@@ -616,13 +604,12 @@ void PimCodeGen::codeGenVTanhOp(pim::PimVTanhOp vtanhOp, const StaticValueKnowle
|
||||
auto inputAddr = addressOf(vtanhOp.getInput(), knowledge);
|
||||
setupRdRs1(outputBufferAddr, 0, inputAddr, 0);
|
||||
|
||||
json::Object json;
|
||||
json["op"] = "vtanh";
|
||||
json["rd"] = 0;
|
||||
json["rs1"] = 1;
|
||||
json["offset"] = createEmptyOffset();
|
||||
json["len"] = getValueSizeInBytes(vtanhOp.getInput());
|
||||
emitInstruction(std::move(json));
|
||||
pim_binary::InstructionRecord instruction;
|
||||
instruction.opcode = pim_binary::Opcode::vtanh;
|
||||
instruction.rd = 0;
|
||||
instruction.r1 = 1;
|
||||
instruction.generic3 = static_cast<int32_t>(getValueSizeInBytes(vtanhOp.getInput()));
|
||||
emitInstruction(instruction);
|
||||
}
|
||||
|
||||
void PimCodeGen::codeGenVSigmOp(pim::PimVSigmOp vsigmOp, const StaticValueKnowledge& knowledge) const {
|
||||
@@ -630,13 +617,12 @@ void PimCodeGen::codeGenVSigmOp(pim::PimVSigmOp vsigmOp, const StaticValueKnowle
|
||||
auto inputAddr = addressOf(vsigmOp.getInput(), knowledge);
|
||||
setupRdRs1(outputBufferAddr, 0, inputAddr, 0);
|
||||
|
||||
json::Object json;
|
||||
json["op"] = "vsigm";
|
||||
json["rd"] = 0;
|
||||
json["rs1"] = 1;
|
||||
json["offset"] = createEmptyOffset();
|
||||
json["len"] = getValueSizeInBytes(vsigmOp.getInput());
|
||||
emitInstruction(std::move(json));
|
||||
pim_binary::InstructionRecord instruction;
|
||||
instruction.opcode = pim_binary::Opcode::vsigm;
|
||||
instruction.rd = 0;
|
||||
instruction.r1 = 1;
|
||||
instruction.generic3 = static_cast<int32_t>(getValueSizeInBytes(vsigmOp.getInput()));
|
||||
emitInstruction(instruction);
|
||||
}
|
||||
|
||||
void PimCodeGen::codeGenVSoftmaxOp(pim::PimVSoftmaxOp vsoftmaxOp, const StaticValueKnowledge& knowledge) const {
|
||||
@@ -644,13 +630,12 @@ void PimCodeGen::codeGenVSoftmaxOp(pim::PimVSoftmaxOp vsoftmaxOp, const StaticVa
|
||||
auto inputAddr = addressOf(vsoftmaxOp.getInput(), knowledge);
|
||||
setupRdRs1(outputBufferAddr, 0, inputAddr, 0);
|
||||
|
||||
json::Object json;
|
||||
json["op"] = "vsoftmax";
|
||||
json["rd"] = 0;
|
||||
json["rs1"] = 1;
|
||||
json["offset"] = createEmptyOffset();
|
||||
json["len"] = getValueSizeInBytes(vsoftmaxOp.getInput());
|
||||
emitInstruction(std::move(json));
|
||||
pim_binary::InstructionRecord instruction;
|
||||
instruction.opcode = pim_binary::Opcode::vsoftmax;
|
||||
instruction.rd = 0;
|
||||
instruction.r1 = 1;
|
||||
instruction.generic3 = static_cast<int32_t>(getValueSizeInBytes(vsoftmaxOp.getInput()));
|
||||
emitInstruction(instruction);
|
||||
}
|
||||
|
||||
void PimCodeGen::codeGetGlobalOp(memref::GetGlobalOp getGlobalOp, const StaticValueKnowledge& knowledge) const {}
|
||||
@@ -682,6 +667,30 @@ void PimCodeGen::codeGenTransposeOp(pim::PimTransposeOp transposeOp, const Stati
|
||||
dstStrides[i] = dstStrides[i + 1] * dstShape[i + 1];
|
||||
}
|
||||
|
||||
bool storagePreserving = true;
|
||||
for (size_t srcFlat = 0; srcFlat < totalElements; srcFlat++) {
|
||||
SmallVector<size_t> srcIdx(rank);
|
||||
size_t remaining = srcFlat;
|
||||
for (size_t d = 0; d < rank; d++) {
|
||||
srcIdx[d] = remaining / srcStrides[d];
|
||||
remaining %= srcStrides[d];
|
||||
}
|
||||
|
||||
size_t dstFlat = 0;
|
||||
for (size_t d = 0; d < rank; d++)
|
||||
dstFlat += srcIdx[perm[d]] * dstStrides[d];
|
||||
|
||||
if (dstFlat != srcFlat) {
|
||||
storagePreserving = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (storagePreserving) {
|
||||
emitMemCopyOp("lmv", dstAddr, 0, srcAddr, 0, totalElements * elementSize, "len");
|
||||
return;
|
||||
}
|
||||
|
||||
// Emit element-by-element copy with transposed addressing
|
||||
for (size_t srcFlat = 0; srcFlat < totalElements; srcFlat++) {
|
||||
// Decompose flat source index into multi-dimensional index
|
||||
@@ -747,9 +756,25 @@ static SmallVector<Operation*> collectTopLevelCoreLikeOps(func::FuncOp funcOp) {
|
||||
return coreLikeOps;
|
||||
}
|
||||
|
||||
static SmallDenseMap<memref::GlobalOp, MemEntry, 16>
|
||||
collectMaterializedHostGlobals(ModuleOp moduleOp, func::FuncOp funcOp, const PimAcceleratorMemory& memory) {
|
||||
SmallDenseMap<memref::GlobalOp, MemEntry, 16> materializedHostGlobals;
|
||||
funcOp.walk([&](memref::GetGlobalOp getGlobalOp) {
|
||||
if (hasWeightAlways(getGlobalOp))
|
||||
return;
|
||||
auto targetGlobal = lookupGlobalForGetGlobal(moduleOp, getGlobalOp);
|
||||
if (!targetGlobal || materializedHostGlobals.contains(targetGlobal))
|
||||
return;
|
||||
auto it = memory.memEntriesMap.find(getGlobalOp.getResult());
|
||||
if (it != memory.memEntriesMap.end())
|
||||
materializedHostGlobals[targetGlobal] = it->second;
|
||||
});
|
||||
return materializedHostGlobals;
|
||||
}
|
||||
|
||||
static void aliasMaterializedHostGlobals(ModuleOp moduleOp,
|
||||
func::FuncOp funcOp,
|
||||
pim::PimCoreOp coreOp,
|
||||
const SmallDenseMap<memref::GlobalOp, MemEntry, 16>& materializedHostGlobals,
|
||||
PimAcceleratorMemory& memory) {
|
||||
coreOp.walk([&](memref::GetGlobalOp getGlobalOp) {
|
||||
if (hasWeightAlways(getGlobalOp) || memory.memEntriesMap.contains(getGlobalOp.getResult()))
|
||||
@@ -759,16 +784,9 @@ static void aliasMaterializedHostGlobals(ModuleOp moduleOp,
|
||||
if (!targetGlobal)
|
||||
return;
|
||||
|
||||
mlir::Value aliasedValue;
|
||||
funcOp.walk([&](memref::GetGlobalOp candidate) {
|
||||
if (aliasedValue || candidate == getGlobalOp || !memory.memEntriesMap.contains(candidate.getResult()))
|
||||
return;
|
||||
if (lookupGlobalForGetGlobal(moduleOp, candidate) == targetGlobal)
|
||||
aliasedValue = candidate.getResult();
|
||||
});
|
||||
|
||||
if (aliasedValue)
|
||||
memory.memEntriesMap[getGlobalOp.getResult()] = memory.memEntriesMap[aliasedValue];
|
||||
auto it = materializedHostGlobals.find(targetGlobal);
|
||||
if (it != materializedHostGlobals.end())
|
||||
memory.memEntriesMap[getGlobalOp.getResult()] = it->second;
|
||||
});
|
||||
}
|
||||
|
||||
@@ -837,7 +855,7 @@ static int64_t codeGenCoreOps(Block& block, PimCodeGen& coreCodeGen) {
|
||||
return failed(result) ? -1 : static_cast<int64_t>(processedOperations);
|
||||
}
|
||||
|
||||
OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std::string& outputDirPath) {
|
||||
OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimCode(ModuleOp& moduleOp, std::string& outputDirPath) {
|
||||
if (!outputDirPath.empty()) {
|
||||
if (auto error = sys::fs::create_directory(outputDirPath)) {
|
||||
errs() << "Error creating output directory: " << outputDirPath << ": " << error.message() << '\n';
|
||||
@@ -857,7 +875,7 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std::
|
||||
if (auto err = writeMemoryBinary(moduleOp, funcOp, memory, outputDirPath))
|
||||
return err;
|
||||
|
||||
if (auto err = writeHostCoreJson(outputDirPath))
|
||||
if (auto err = writeHostCoreArtifacts(outputDirPath))
|
||||
return err;
|
||||
|
||||
// For each core, specify the number of crossbar per array group.
|
||||
@@ -870,6 +888,8 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std::
|
||||
auto mapCoreWeightToFileName = createAndPopulateWeightFolder(funcOp, outputDirPath);
|
||||
|
||||
SmallVector<Operation*> coreLikeOps = collectTopLevelCoreLikeOps(funcOp);
|
||||
SmallDenseMap<memref::GlobalOp, MemEntry, 16> materializedHostGlobals =
|
||||
collectMaterializedHostGlobals(moduleOp, funcOp, memory);
|
||||
llvm::DenseMap<size_t, size_t> emittedCoreIds;
|
||||
size_t nextEmittedCoreId = 1;
|
||||
|
||||
@@ -899,16 +919,30 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std::
|
||||
maxCoreId = std::max(maxCoreId, coreId);
|
||||
|
||||
std::error_code errorCode;
|
||||
auto outputCorePath = outputDirPath + "/core_" + std::to_string(coreId) + ".json";
|
||||
raw_fd_ostream coreFileStream(outputCorePath, errorCode);
|
||||
auto outputCorePath = outputDirPath + "/core_" + std::to_string(coreId) + ".pim";
|
||||
raw_fd_ostream coreBinaryStream(outputCorePath, errorCode, sys::fs::OF_None);
|
||||
if (errorCode) {
|
||||
errs() << "Error while opening core file `" << outputCorePath << "`: " << errorCode.message() << '\n';
|
||||
return InvalidOutputFileAccess;
|
||||
}
|
||||
coreFileStream << '[';
|
||||
|
||||
PimCodeGen coreCodeGen(memory, coreFileStream, emittedCoreIds);
|
||||
aliasMaterializedHostGlobals(moduleOp, funcOp, coreOp, memory);
|
||||
std::unique_ptr<raw_fd_ostream> coreJsonStream;
|
||||
if (pimEmitJson.getValue()) {
|
||||
std::string outputCoreJsonPath = outputDirPath + "/core_" + std::to_string(coreId) + ".json";
|
||||
errorCode = std::error_code();
|
||||
coreJsonStream = std::make_unique<raw_fd_ostream>(outputCoreJsonPath, errorCode);
|
||||
if (errorCode) {
|
||||
errs() << "Error while opening core json file `" << outputCoreJsonPath << "`: " << errorCode.message()
|
||||
<< '\n';
|
||||
return InvalidOutputFileAccess;
|
||||
}
|
||||
*coreJsonStream << '[';
|
||||
}
|
||||
|
||||
pim_binary::writeHeader(coreBinaryStream);
|
||||
|
||||
PimCodeGen coreCodeGen(memory, coreBinaryStream, coreJsonStream.get(), emittedCoreIds);
|
||||
aliasMaterializedHostGlobals(moduleOp, coreOp, materializedHostGlobals, memory);
|
||||
auto& deviceMemory = memory.getOrCreateDeviceMem(coreId);
|
||||
deviceMemory.allocateCore(coreOp);
|
||||
|
||||
@@ -920,9 +954,14 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std::
|
||||
if (reportRow)
|
||||
*reportRow = deviceMemory.getReportRow();
|
||||
|
||||
coreFileStream.seek(coreFileStream.tell() - 1);
|
||||
coreFileStream << ']';
|
||||
coreFileStream.close();
|
||||
pim_binary::patchInstructionCount(coreBinaryStream, coreCodeGen.getEmittedInstructionCount());
|
||||
coreBinaryStream.close();
|
||||
|
||||
if (coreJsonStream) {
|
||||
coreJsonStream->seek(coreJsonStream->tell() - 1);
|
||||
*coreJsonStream << ']';
|
||||
coreJsonStream->close();
|
||||
}
|
||||
|
||||
auto coreWeightsDirPath = outputDirPath + "/core_" + std::to_string(coreId);
|
||||
if (auto error = sys::fs::create_directory(coreWeightsDirPath)) {
|
||||
|
||||
Reference in New Issue
Block a user