1322 lines
53 KiB
C++
1322 lines
53 KiB
C++
#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/AsmState.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/Value.h"
#include "mlir/IR/Verifier.h"

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/JSON.h"
#include "llvm/Support/raw_ostream.h"

#include <absl/types/compare.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <string>
#include <utility>

#include "Common/PimCommon.hpp"
#include "Conversion/ONNXToSpatial/Common/Common.hpp"
#include "src/Accelerators/PIM/Compiler/PimCodeGen.hpp"
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
#include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp"
|
|
|
|
using namespace llvm;
|
|
using namespace mlir;
|
|
using namespace onnx_mlir;
|
|
|
|
/// Size in bytes of a shaped value: element count times element bit width,
/// divided by 8 (elements are assumed to be byte multiples).
static size_t getValueSizeInBytes(mlir::Value value) {
  const auto shapedType = cast<ShapedType>(value.getType());
  const size_t totalBits = shapedType.getNumElements() * shapedType.getElementTypeBitWidth();
  return totalBits / 8;
}
|
|
|
|
/// Record a deferred allocation request for `value`; its address stays 0 until
/// allocateGatheredMemory() assigns one.
///
/// NOTE(review): the returned pointer aliases an element of `memEntries` and is
/// invalidated if that container reallocates on a later gather — callers must
/// not hold it across further gatherMemEntry() calls.
MemEntry* PimMemory::gatherMemEntry(mlir::Value value) {
  auto type = cast<ShapedType>(value.getType());
  assert("Only static shape is supported" && type.hasStaticShape());
  // Whole-value footprint in bytes (requires int/float elements).
  size_t allocSize = type.getNumElements() * type.getElementType().getIntOrFloatBitWidth() / 8;
  // Address 0 is a placeholder until allocation runs.
  MemEntry memEntry = {0, allocSize};
  return &memEntries.emplace_back(memEntry, value).first;
}
|
|
|
|
void PimMemory::allocateGatheredMemory() {
|
|
llvm::sort(memEntries, [](auto a, auto b) -> bool { return a.first.size > b.first.size; });
|
|
for (auto& [memEntry, value] : memEntries)
|
|
allocateMemoryForValue(value, memEntry);
|
|
}
|
|
|
|
/// Place `memEntry` at the current bump-allocator cursor, then advance the
/// cursor past it and round up to `minAlignment` for the next allocation.
void PimMemory::allocateMemoryForValue(mlir::Value value, MemEntry& memEntry) {
  memEntry.address = firstAvailableAddress;
  // Advance past this allocation, rounding the cursor up to the next
  // multiple of minAlignment.
  size_t nextAddress = firstAvailableAddress + memEntry.size;
  const size_t misalignment = nextAddress % minAlignment;
  if (misalignment != 0)
    nextAddress += minAlignment - misalignment;
  firstAvailableAddress = nextAddress;

  globalMemEntriesMap[value] = memEntry;
}
|
|
|
|
/// Plan host-side memory for one function: gather every value that needs a
/// host allocation (function arguments, non-weight globals, top-level allocs),
/// allocate them in one pass, then resolve aliases so duplicate get_globals
/// share a single allocation.
void PimMemory::allocateHost(ModuleOp moduleOp, func::FuncOp funcOp) {
  // First get_global result seen per global op; later uses alias it.
  SmallDenseMap<memref::GlobalOp, mlir::Value, 8> globalConstants;
  // (aliasing value, original value) pairs, resolved after allocation.
  SmallVector<std::pair<mlir::Value, mlir::Value>, 16> globalAliases;
  SmallVector<mlir::Value> args;

  // Every function argument gets its own host allocation.
  for (mlir::Value arg : funcOp.getArguments()) {
    gatherMemEntry(arg);
    args.push_back(arg);
  }

  funcOp.walk([&](memref::GetGlobalOp getGlobalOp) {
    // Weight globals live on-device; only non-weight globals need host memory.
    if (!hasWeightAlways(getGlobalOp)) {
      auto globalMemrefOp = lookupGlobalForGetGlobal(moduleOp, getGlobalOp);
      // Globals named after a function argument alias that argument's buffer.
      if (globalMemrefOp.getName().starts_with("arg")) {
        // NOTE(review): substr(4) skips 4 characters after a 3-char "arg"
        // prefix — assumes names look like "arg_<N>"; confirm with producer.
        StringRef indexStr = globalMemrefOp.getName().substr(4);
        int index = 0;
        // NOTE(review): to_integer's result is ignored; on parse failure
        // index stays 0 and args[0] is silently aliased — verify inputs.
        llvm::to_integer(indexStr, index, 10);
        globalAliases.push_back({getGlobalOp.getResult(), args[index]});
      }
      // Allocate each distinct global once; repeat uses alias the first.
      auto [iter, inserted] = globalConstants.try_emplace(globalMemrefOp, getGlobalOp.getResult());
      if (inserted)
        gatherMemEntry(getGlobalOp.getResult());
      else
        globalAliases.push_back({getGlobalOp.getResult(), iter->second});
    }
  });

  // Host-side allocs: only those not nested inside a pim.core region.
  funcOp.walk([&](memref::AllocOp allocOp) {
    if (!allocOp->getParentOfType<pim::PimCoreOp>())
      gatherMemEntry(allocOp.getResult());
  });

  allocateGatheredMemory();

  // Aliases can only be resolved once the originals have addresses; later
  // entries in globalAliases overwrite earlier mappings for the same value.
  for (auto [alias, original] : globalAliases)
    globalMemEntriesMap[alias] = getMemEntry(original);
}
|
|
|
|
/// Gather every memref.alloc inside `op` and assign it a core-local address.
void PimMemory::allocateCore(Operation* op) {
  op->walk([this](memref::AllocOp allocOp) { gatherMemEntry(allocOp.getResult()); });

  allocateGatheredMemory();
}
|
|
|
|
/// Render a byte count as a human-readable string with two decimal places,
/// e.g. 1536 -> "1.50 KB". Uses 1024-based units up to exabytes.
///
/// Fix/idiom: the previous implementation built an llvm::raw_string_ostream
/// plus llvm::format just to format one number (and used rss.str(), which is
/// deprecated in newer LLVM); a single std::snprintf is simpler and allocates
/// only the returned string.
std::string formatMemory(uint64_t bytes) {
  static constexpr const char* kUnits[] = {"B", "KB", "MB", "GB", "TB", "PB", "EB"};
  double size = static_cast<double>(bytes);
  int unitIndex = 0;
  while (size >= 1024 && unitIndex < 6) {
    size /= 1024;
    unitIndex++;
  }
  // Buffer is ample: "%.2f" of a value < 1024 plus a short unit suffix.
  char buffer[32];
  std::snprintf(buffer, sizeof(buffer), "%.2f %s", size, kUnits[unitIndex]);
  return buffer;
}
|
|
|
|
/// Print a human-readable summary of all allocated entries, grouped by
/// (defining op name, allocation size) — e.g. all 1KB "memref.alloc"s
/// together — followed by the grand total.
///
/// Fix: the "Total Core Memory" line was emitted inside the per-group loop,
/// printing a running total once per group; it now prints once, after all
/// groups. Comparator lambdas also take elements by const reference instead
/// of copying each pair per comparison.
void PimMemory::report(llvm::raw_ostream& file) {
  // Work on a snapshot: dedupe aliased values (same address), then order
  // groups by allocation size.
  std::vector orderedList(globalMemEntriesMap.begin(), globalMemEntriesMap.end());
  std::sort(orderedList.begin(), orderedList.end(),
      [](const auto& lft, const auto& rgt) { return lft.second.address < rgt.second.address; });
  // std::unique requires the address-sorted order established above.
  auto newEnd = std::unique(orderedList.begin(), orderedList.end(), [](const auto& lft, const auto& rgt) {
    return lft.second.address == rgt.second.address;
  });
  orderedList.erase(newEnd, orderedList.end());
  std::sort(orderedList.begin(), orderedList.end(),
      [](const auto& lft, const auto& rgt) { return lft.second.size < rgt.second.size; });

  // Key: {OpName, Size}; Value: all addresses in that group.
  std::map<std::pair<std::string, uint64_t>, std::vector<uint64_t>> groupedStats;
  for (auto& [value, memEntry] : orderedList) {
    std::string opName = "Unknown/BlockArg";
    if (auto op = value.getDefiningOp())
      opName = op->getName().getStringRef().str();

    groupedStats[{opName, memEntry.size}].push_back(memEntry.address);
  }

  file << "--- Memory Usage Report ---\n";

  uint64_t totalMemory = 0;
  for (auto const& [key, addresses] : groupedStats) {
    const std::string& opName = key.first;
    uint64_t size = key.second;

    file.indent(4) << "Type: " << opName << " [" << formatMemory(size) << "]\n";
    file.indent(6) << "Count: " << addresses.size() << "\n";
    file.indent(6) << "Total Memory: " << formatMemory(size * addresses.size()) << "\n";
    totalMemory += size * addresses.size();

    // Address range keeps the report concise for large groups.
    if (!addresses.empty()) {
      auto [min, max] = std::minmax_element(addresses.begin(), addresses.end());
      file.indent(6) << "Range: " << llvm::format_hex(*min, 10) << " -> " << llvm::format_hex(*max, 10) << "\n";
    }
    file << "\n";
  }

  file << "Total Core Memory: " << formatMemory(totalMemory) << "\n";
}
|
|
|
|
|
|
/// Drop the tracked entry for `val`; no-op when it was never allocated.
void PimMemory::remove(mlir::Value val) {
  auto entryIter = globalMemEntriesMap.find(val);
  if (entryIter != globalMemEntriesMap.end())
    globalMemEntriesMap.erase(entryIter);
}
|
|
|
|
/// Look up the allocation recorded for `value`; the value must have been
/// allocated beforehand.
MemEntry PimMemory::getMemEntry(mlir::Value value) const {
  auto entryIter = globalMemEntriesMap.find(value);
  assert(entryIter != globalMemEntriesMap.end() && "Missing memEntry for value");
  return entryIter->second;
}
|
|
|
|
/// Return the per-core allocator for `id`, constructing it (seeded with the
/// shared entries map) the first time the core is seen.
PimMemory& PimAcceleratorMemory::getOrCreateDeviceMem(size_t id) {
  auto insertion = deviceMem.try_emplace(id, memEntriesMap);
  return insertion.first->second;
}
|
|
|
|
/// Resolve `value` to an absolute byte address: the base value's entry address
/// plus the contiguous byte offset derived from `knowledge`. Aborts (after
/// dumping context to stderr) when the value cannot be resolved or its base
/// was never allocated.
///
/// Fix: the two failure paths duplicated the value/defining-op dump verbatim;
/// that is now a shared local lambda.
size_t PimAcceleratorMemory::getValueAddress(mlir::Value value, const StaticValueKnowledge& knowledge) const {
  // Dump a value and, when available, its defining op for debugging.
  auto dumpValueContext = [](mlir::Value debugValue) {
    debugValue.print(errs());
    errs() << "\n";
    if (auto* definingOp = debugValue.getDefiningOp()) {
      errs() << "Defining op:\n";
      definingOp->print(errs());
      errs() << "\n";
    }
  };

  auto resolvedAddress = resolveContiguousAddress(value, knowledge);
  if (failed(resolvedAddress)) {
    errs() << "Failed to resolve contiguous address for value: ";
    dumpValueContext(value);
    llvm_unreachable("Failed to resolve contiguous address");
  }

  auto iter = memEntriesMap.find(resolvedAddress->base);
  if (iter == memEntriesMap.end()) {
    errs() << "Missing mem entry for value: ";
    dumpValueContext(resolvedAddress->base);
    llvm_unreachable("Missing mem entry");
  }

  return iter->second.address + resolvedAddress->byteOffset;
}
|
|
|
|
void PimAcceleratorMemory::reportHost() {
|
|
llvm::raw_os_ostream os(fileReport);
|
|
os << "Host Memory\n";
|
|
hostMem.report(os);
|
|
os.flush();
|
|
}
|
|
|
|
/// Append core `coreId`'s memory usage report to the report file.
/// The core's allocator must already exist.
void PimAcceleratorMemory::reportCore(size_t coreId) {
  llvm::raw_os_ostream reportStream(fileReport);
  reportStream << "Core " << coreId << " Memory\n";
  deviceMem.at(coreId).report(reportStream);
  reportStream.flush();
}
|
|
|
|
/// Forget every allocation tracked for `op`'s results, on the host and on
/// every known core.
void PimAcceleratorMemory::clean(mlir::Operation* op) {
  for (mlir::Value result : op->getResults()) {
    hostMem.remove(result);
    for (auto& [coreId, coreMem] : deviceMem)
      coreMem.remove(result);
  }
}
|
|
|
|
/// Default offset descriptor: no offset source selected, zero offset value.
json::Object PimCodeGen::createEmptyOffset() {
  return json::Object{{"offset_select", 0}, {"offset_value", 0}};
}
|
|
|
|
/// Translate a logical core id to the id used in emitted code; the mapping
/// must have been registered beforehand.
size_t PimCodeGen::remapCoreId(size_t coreId) const {
  auto mapping = emittedCoreIds.find(coreId);
  assert(mapping != emittedCoreIds.end() && "Missing emitted core id remapping");
  return mapping->second;
}
|
|
|
|
/// Offset descriptor selecting rs1 (offset_select = 1) with a zero value.
static json::Object createRs1OnlyOffset() {
  return json::Object{{"offset_select", 1}, {"offset_value", 0}};
}
|
|
|
|
/// Serialize one instruction object to the current core's output stream.
/// Every instruction is followed by a comma, including the last one — the
/// consumer of the stream is expected to tolerate the trailing comma.
void PimCodeGen::emitInstruction(json::Object instruction) const {
  coreFileStream << json::Value(std::move(instruction)) << ',';
}
|
|
|
|
/// Emit an "sldi" instruction loading the unsigned immediate into the given
/// register.
void PimCodeGen::genSetRegisterImmediateUnsigned(size_t registerNumber, size_t immediate) const {
  json::Object instruction;
  instruction["op"] = "sldi";
  instruction["rd"] = registerNumber;
  instruction["imm"] = immediate;
  emitInstruction(std::move(instruction));
}
|
|
|
|
/// Load the destination address (base + offset) into register 0.
void PimCodeGen::setupRd(size_t rdAddress, size_t rdOffset) const {
  genSetRegisterImmediateUnsigned(0, rdAddress + rdOffset);
}
|
|
|
|
/// Load the destination address into register 0 and the first source address
/// into register 1 (each as base + offset).
void PimCodeGen::setupRdRs1(size_t rdAddress, size_t rdOffset, size_t rs1Address, size_t rs1Offset) const {
  const size_t rdTarget = rdAddress + rdOffset;
  const size_t rs1Target = rs1Address + rs1Offset;
  genSetRegisterImmediateUnsigned(0, rdTarget);
  genSetRegisterImmediateUnsigned(1, rs1Target);
}
|
|
|
|
/// Load destination / first source / second source addresses into registers
/// 0, 1 and 2 respectively (each as base + offset).
void PimCodeGen::setupRdRs1Rs2(
    size_t rdAddress, size_t rdOffset, size_t rs1Address, size_t rs1Offset, size_t rs2Address, size_t rs2Offset) const {
  const size_t rdTarget = rdAddress + rdOffset;
  const size_t rs1Target = rs1Address + rs1Offset;
  const size_t rs2Target = rs2Address + rs2Offset;
  genSetRegisterImmediateUnsigned(0, rdTarget);
  genSetRegisterImmediateUnsigned(1, rs1Target);
  genSetRegisterImmediateUnsigned(2, rs2Target);
}
|
|
|
|
/// Emit a memory-copy style instruction ("ld", "st", "lmv"): registers 0/1
/// are loaded with the destination/source addresses first, then the copy op
/// is emitted with the transfer size stored under `sizeFieldName`.
void PimCodeGen::emitMemCopyOp(StringRef opName,
    size_t rdAddr,
    size_t rdOffset,
    size_t rs1Addr,
    size_t rs1Offset,
    size_t size,
    StringRef sizeFieldName) const {
  setupRdRs1(rdAddr, rdOffset, rs1Addr, rs1Offset);

  json::Object instruction;
  instruction["op"] = opName;
  instruction["rd"] = 0;
  instruction["rs1"] = 1;
  instruction[sizeFieldName] = size;
  instruction["offset"] = createEmptyOffset();
  emitInstruction(std::move(instruction));
}
|
|
|
|
/// Emit a core-to-core communication instruction ("send"/"recv"): register 0
/// holds the local buffer address; the peer core id is remapped to its
/// emitted id.
void PimCodeGen::emitCommunicationOp(StringRef opName, size_t bufferAddr, size_t coreId, size_t size) const {
  setupRd(bufferAddr, 0);

  json::Object instruction;
  instruction["op"] = opName;
  instruction["rd"] = 0;
  instruction["core"] = remapCoreId(coreId);
  instruction["size"] = size;
  instruction["offset"] = createEmptyOffset();
  emitInstruction(std::move(instruction));
}
|
|
|
|
/// Emit an "mvmul" instruction: rd = output (reg 0), rs1 = input (reg 1),
/// with the weights selected by `groupId`.
void PimCodeGen::emitMvmOp(size_t groupId, size_t rdAddr, size_t rdOffset, size_t rs1Addr, size_t rs1Offset) const {
  setupRdRs1(rdAddr, rdOffset, rs1Addr, rs1Offset);

  json::Object instruction;
  instruction["op"] = "mvmul";
  instruction["rd"] = 0;
  instruction["rs1"] = 1;
  instruction["group"] = groupId;
  // Fused ReLU disabled; "mbiw" is fixed at 8 — presumably a bit-width field,
  // confirm exact semantics against the ISA spec.
  instruction["relu"] = 0;
  instruction["mbiw"] = 8;
  emitInstruction(std::move(instruction));
}
|
|
|
|
/// Lower a host->device copy to an "ld" instruction.
void PimCodeGen::codeGenLoadOp(pim::PimMemCopyHostToDevOp loadOp, const StaticValueKnowledge& knowledge) const {
  const size_t deviceAddr = addressOf(loadOp.getDeviceTarget(), knowledge);
  const size_t hostAddr = addressOf(loadOp.getHostSource(), knowledge);
  emitMemCopyOp(
      "ld", deviceAddr, loadOp.getDeviceTargetOffset(), hostAddr, loadOp.getHostSourceOffset(), loadOp.getSize());
}
|
|
|
|
/// Lower a device->host copy to an "st" instruction.
void PimCodeGen::codeGenStoreOp(pim::PimMemCopyDevToHostOp storeOp, const StaticValueKnowledge& knowledge) const {
  const size_t hostAddr = addressOf(storeOp.getHostTarget(), knowledge);
  const size_t deviceAddr = addressOf(storeOp.getDeviceSource(), knowledge);
  emitMemCopyOp(
      "st", hostAddr, storeOp.getHostTargetOffset(), deviceAddr, storeOp.getDeviceSourceOffset(), storeOp.getSize());
}
|
|
|
|
/// Lower a device-local copy to an "lmv" instruction; its size field is
/// named "len".
void PimCodeGen::codeGenLmvOp(pim::PimMemCopyOp lmvOp, const StaticValueKnowledge& knowledge) const {
  const size_t targetAddr = addressOf(lmvOp.getTarget(), knowledge);
  const size_t sourceAddr = addressOf(lmvOp.getSource(), knowledge);
  emitMemCopyOp(
      "lmv", targetAddr, lmvOp.getTargetOffset(), sourceAddr, lmvOp.getSourceOffset(), lmvOp.getSize(), "len");
}
|
|
|
|
/// Lower pim.receive to a "recv" instruction from the given source core.
void PimCodeGen::codeGenReceiveOp(pim::PimReceiveOp receiveOp, const StaticValueKnowledge& knowledge) const {
  const size_t bufferAddr = addressOf(receiveOp.getOutputBuffer(), knowledge);
  emitCommunicationOp("recv", bufferAddr, receiveOp.getSourceCoreId(), receiveOp.getSize());
}
|
|
|
|
/// Lower pim.receive_many: one "recv" per (buffer, source core) pair; each
/// transfer size is derived from the buffer's static type.
void PimCodeGen::codeGenReceiveManyOp(pim::PimReceiveManyOp receiveManyOp, const StaticValueKnowledge& knowledge) const {
  for (auto [buffer, sourceCore] : llvm::zip(receiveManyOp.getOutputBuffers(), receiveManyOp.getSourceCoreIds()))
    emitCommunicationOp("recv", addressOf(buffer, knowledge), sourceCore, getValueSizeInBytes(buffer));
}
|
|
|
|
/// Lower pim.send to a "send" instruction to the given target core.
void PimCodeGen::codeGenSendOp(pim::PimSendOp sendOp, const StaticValueKnowledge& knowledge) const {
  const size_t inputAddr = addressOf(sendOp.getInput(), knowledge);
  emitCommunicationOp("send", inputAddr, sendOp.getTargetCoreId(), sendOp.getSize());
}
|
|
|
|
/// Lower pim.send_many: one "send" per (value, target core) pair; each
/// transfer size is derived from the value's static type.
void PimCodeGen::codeGenSendManyOp(pim::PimSendManyOp sendManyOp, const StaticValueKnowledge& knowledge) const {
  for (auto [payload, targetCore] : llvm::zip(sendManyOp.getInputs(), sendManyOp.getTargetCoreIds()))
    emitCommunicationOp("send", addressOf(payload, knowledge), targetCore, getValueSizeInBytes(payload));
}
|
|
|
|
/// Lower pim.extract_rows: copy row i of the rank-2 input into the i-th
/// output buffer, one "lmv" per row.
void PimCodeGen::codeGenExtractRowsOp(pim::PimExtractRowsOp extractRowsOp, const StaticValueKnowledge& knowledge) const {
  auto inputType = cast<ShapedType>(extractRowsOp.getInput().getType());
  assert(inputType.hasStaticShape() && inputType.getRank() == 2 && "extract_rows codegen requires static rank-2 input");

  const size_t elementSize = inputType.getElementTypeBitWidth() / 8;
  const size_t rowSizeInBytes = static_cast<size_t>(inputType.getDimSize(1)) * elementSize;
  const size_t inputAddr = addressOf(extractRowsOp.getInput(), knowledge);

  size_t rowIndex = 0;
  for (mlir::Value outputBuffer : extractRowsOp.getOutputBuffers()) {
    emitMemCopyOp(
        "lmv", addressOf(outputBuffer, knowledge), 0, inputAddr, rowIndex * rowSizeInBytes, rowSizeInBytes, "len");
    ++rowIndex;
  }
}
|
|
|
|
/// Lower pim.concat along `axis` into a sequence of "lmv" block copies.
///
/// The output is viewed as [outerCount, outputShape[axis], innerCount]: all
/// dims before the axis collapse into outerCount, all dims after into
/// innerCount. Each input contributes `inputConcatDim` contiguous axis steps
/// per outer slice, copied to its running `concatOffset` within the axis.
/// Assumes row-major, densely packed buffers on both sides (no layout info
/// is consulted here — confirm if strided memrefs ever reach this path).
void PimCodeGen::codeGenConcatOp(pim::PimConcatOp concatOp, const StaticValueKnowledge& knowledge) const {
  auto outputType = cast<ShapedType>(concatOp.getOutputBuffer().getType());
  assert(outputType.hasStaticShape() && "concat codegen requires static output shape");

  int64_t axis = concatOp.getAxis();
  ArrayRef<int64_t> outputShape = outputType.getShape();
  size_t elementSize = outputType.getElementTypeBitWidth() / 8;
  size_t outputAddr = addressOf(concatOp.getOutputBuffer(), knowledge);

  // Product of all dims before the concat axis.
  size_t outerCount = 1;
  for (int64_t dim = 0; dim < axis; ++dim)
    outerCount *= static_cast<size_t>(outputShape[dim]);

  // Product of all dims after the axis: elements per single axis step.
  size_t innerCount = 1;
  for (size_t dim = static_cast<size_t>(axis) + 1; dim < outputShape.size(); ++dim)
    innerCount *= static_cast<size_t>(outputShape[dim]);

  size_t outputConcatDim = static_cast<size_t>(outputShape[axis]);
  // Running write position along the output's concat axis.
  size_t concatOffset = 0;
  for (mlir::Value input : concatOp.getInputs()) {
    auto inputType = cast<ShapedType>(input.getType());
    assert(inputType.hasStaticShape() && "concat codegen requires static input shapes");

    size_t inputConcatDim = static_cast<size_t>(inputType.getDimSize(axis));
    // One outer slice of this input is contiguous: axis extent * inner elems.
    size_t blockSizeInBytes = inputConcatDim * innerCount * elementSize;
    size_t inputAddr = addressOf(input, knowledge);

    // Copy each outer slice to its interleaved position in the output.
    for (size_t outerIndex = 0; outerIndex < outerCount; ++outerIndex) {
      size_t dstOffset = (outerIndex * outputConcatDim + concatOffset) * innerCount * elementSize;
      size_t srcOffset = outerIndex * inputConcatDim * innerCount * elementSize;
      emitMemCopyOp("lmv", outputAddr, dstOffset, inputAddr, srcOffset, blockSizeInBytes, "len");
    }

    concatOffset += inputConcatDim;
  }
}
|
|
|
|
/// Shared lowering for MVM-like ops: emits a single "mvmul" with `mvmId`
/// selecting the weight group, output and input at offset 0.
///
/// @param transposeMatrix currently has no effect at the instruction level;
///        see the TODO below.
template <typename MVMTy>
void PimCodeGen::codeGenMVMLikeOp(size_t mvmId,
    MVMTy mvmLikeOp,
    bool transposeMatrix,
    const StaticValueKnowledge& knowledge) {
  emitMvmOp(mvmId, addressOf(mvmLikeOp.getOutputBuffer(), knowledge), 0, addressOf(mvmLikeOp.getInput(), knowledge), 0);

  // TODO: save weights somewhere (if transposeMatrix=true, transpose the weight matrix)
}
|
|
|
|
/// Lower element-wise vector add: rd = out, rs1 = lhs, rs2 = rhs; byte
/// length is taken from the lhs operand's static type.
void PimCodeGen::codeGenVVAddOp(pim::PimVVAddOp vvaddOp, const StaticValueKnowledge& knowledge) const {
  const size_t dstAddr = addressOf(vvaddOp.getOutputBuffer(), knowledge);
  const size_t lhsAddr = addressOf(vvaddOp.getLhs(), knowledge);
  const size_t rhsAddr = addressOf(vvaddOp.getRhs(), knowledge);
  setupRdRs1Rs2(dstAddr, 0, lhsAddr, 0, rhsAddr, 0);

  json::Object instruction;
  instruction["op"] = "vvadd";
  instruction["rd"] = 0;
  instruction["rs1"] = 1;
  instruction["rs2"] = 2;
  instruction["offset"] = createEmptyOffset();
  instruction["len"] = getValueSizeInBytes(vvaddOp.getLhs());
  emitInstruction(std::move(instruction));
}
|
|
|
|
/// Lower element-wise vector subtract: rd = out, rs1 = lhs, rs2 = rhs; byte
/// length is taken from the lhs operand's static type.
void PimCodeGen::codeGenVVSubOp(pim::PimVVSubOp vvsubOp, const StaticValueKnowledge& knowledge) const {
  const size_t dstAddr = addressOf(vvsubOp.getOutputBuffer(), knowledge);
  const size_t lhsAddr = addressOf(vvsubOp.getLhs(), knowledge);
  const size_t rhsAddr = addressOf(vvsubOp.getRhs(), knowledge);
  setupRdRs1Rs2(dstAddr, 0, lhsAddr, 0, rhsAddr, 0);

  json::Object instruction;
  instruction["op"] = "vvsub";
  instruction["rd"] = 0;
  instruction["rs1"] = 1;
  instruction["rs2"] = 2;
  instruction["offset"] = createEmptyOffset();
  instruction["len"] = getValueSizeInBytes(vvsubOp.getLhs());
  emitInstruction(std::move(instruction));
}
|
|
|
|
/// Lower element-wise vector multiply: rd = out, rs1 = lhs, rs2 = rhs; byte
/// length is taken from the lhs operand's static type.
void PimCodeGen::codeGenVVMulOp(pim::PimVVMulOp vvmulOp, const StaticValueKnowledge& knowledge) const {
  const size_t dstAddr = addressOf(vvmulOp.getOutputBuffer(), knowledge);
  const size_t lhsAddr = addressOf(vvmulOp.getLhs(), knowledge);
  const size_t rhsAddr = addressOf(vvmulOp.getRhs(), knowledge);
  setupRdRs1Rs2(dstAddr, 0, lhsAddr, 0, rhsAddr, 0);

  json::Object instruction;
  instruction["op"] = "vvmul";
  instruction["rd"] = 0;
  instruction["rs1"] = 1;
  instruction["rs2"] = 2;
  instruction["offset"] = createEmptyOffset();
  instruction["len"] = getValueSizeInBytes(vvmulOp.getLhs());
  emitInstruction(std::move(instruction));
}
|
|
|
|
/// Lower element-wise vector max: rd = out, rs1 = lhs, rs2 = rhs; byte
/// length is taken from the lhs operand's static type.
void PimCodeGen::codeGenVVMaxOp(pim::PimVVMaxOp vvmaxOp, const StaticValueKnowledge& knowledge) const {
  const size_t dstAddr = addressOf(vvmaxOp.getOutputBuffer(), knowledge);
  const size_t lhsAddr = addressOf(vvmaxOp.getLhs(), knowledge);
  const size_t rhsAddr = addressOf(vvmaxOp.getRhs(), knowledge);
  setupRdRs1Rs2(dstAddr, 0, lhsAddr, 0, rhsAddr, 0);

  json::Object instruction;
  instruction["op"] = "vvmax";
  instruction["rd"] = 0;
  instruction["rs1"] = 1;
  instruction["rs2"] = 2;
  instruction["offset"] = createEmptyOffset();
  instruction["len"] = getValueSizeInBytes(vvmaxOp.getLhs());
  emitInstruction(std::move(instruction));
}
|
|
|
|
/// Lower the "vvdmul" element-wise op: rd = out, rs1 = lhs, rs2 = rhs; byte
/// length is taken from the lhs operand's static type.
void PimCodeGen::codeGenVVDMulOp(pim::PimVVDMulOp vvdmulOp, const StaticValueKnowledge& knowledge) const {
  const size_t dstAddr = addressOf(vvdmulOp.getOutputBuffer(), knowledge);
  const size_t lhsAddr = addressOf(vvdmulOp.getLhs(), knowledge);
  const size_t rhsAddr = addressOf(vvdmulOp.getRhs(), knowledge);
  setupRdRs1Rs2(dstAddr, 0, lhsAddr, 0, rhsAddr, 0);

  json::Object instruction;
  instruction["op"] = "vvdmul";
  instruction["rd"] = 0;
  instruction["rs1"] = 1;
  instruction["rs2"] = 2;
  instruction["offset"] = createEmptyOffset();
  instruction["len"] = getValueSizeInBytes(vvdmulOp.getLhs());
  emitInstruction(std::move(instruction));
}
|
|
|
|
/// Lower vector average: rd = out, rs1 = in. Unlike the other unary vector
/// ops, rs2 mirrors rs1 and the offset descriptor selects rs1
/// (offset_select = 1) — encoding kept exactly as before; confirm the reason
/// against the ISA spec.
void PimCodeGen::codeGenVAvgOp(pim::PimVAvgOp vavgOp, const StaticValueKnowledge& knowledge) const {
  const size_t dstAddr = addressOf(vavgOp.getOutputBuffer(), knowledge);
  const size_t srcAddr = addressOf(vavgOp.getInput(), knowledge);
  setupRdRs1(dstAddr, 0, srcAddr, 0);

  json::Object instruction;
  instruction["op"] = "vavg";
  instruction["rd"] = 0;
  instruction["rs1"] = 1;
  instruction["rs2"] = 1;
  instruction["offset"] = createRs1OnlyOffset();
  instruction["len"] = getValueSizeInBytes(vavgOp.getInput());
  emitInstruction(std::move(instruction));
}
|
|
|
|
/// Lower vector ReLU: rd = out, rs1 = in; byte length from the input type.
void PimCodeGen::codeGenVReluOp(pim::PimVReluOp vreluOp, const StaticValueKnowledge& knowledge) const {
  const size_t dstAddr = addressOf(vreluOp.getOutputBuffer(), knowledge);
  const size_t srcAddr = addressOf(vreluOp.getInput(), knowledge);
  setupRdRs1(dstAddr, 0, srcAddr, 0);

  json::Object instruction;
  instruction["op"] = "vrelu";
  instruction["rd"] = 0;
  instruction["rs1"] = 1;
  instruction["offset"] = createEmptyOffset();
  instruction["len"] = getValueSizeInBytes(vreluOp.getInput());
  emitInstruction(std::move(instruction));
}
|
|
|
|
/// Lower vector tanh: rd = out, rs1 = in; byte length from the input type.
void PimCodeGen::codeGenVTanhOp(pim::PimVTanhOp vtanhOp, const StaticValueKnowledge& knowledge) const {
  const size_t dstAddr = addressOf(vtanhOp.getOutputBuffer(), knowledge);
  const size_t srcAddr = addressOf(vtanhOp.getInput(), knowledge);
  setupRdRs1(dstAddr, 0, srcAddr, 0);

  json::Object instruction;
  instruction["op"] = "vtanh";
  instruction["rd"] = 0;
  instruction["rs1"] = 1;
  instruction["offset"] = createEmptyOffset();
  instruction["len"] = getValueSizeInBytes(vtanhOp.getInput());
  emitInstruction(std::move(instruction));
}
|
|
|
|
/// Lower vector sigmoid: rd = out, rs1 = in; byte length from the input type.
void PimCodeGen::codeGenVSigmOp(pim::PimVSigmOp vsigmOp, const StaticValueKnowledge& knowledge) const {
  const size_t dstAddr = addressOf(vsigmOp.getOutputBuffer(), knowledge);
  const size_t srcAddr = addressOf(vsigmOp.getInput(), knowledge);
  setupRdRs1(dstAddr, 0, srcAddr, 0);

  json::Object instruction;
  instruction["op"] = "vsigm";
  instruction["rd"] = 0;
  instruction["rs1"] = 1;
  instruction["offset"] = createEmptyOffset();
  instruction["len"] = getValueSizeInBytes(vsigmOp.getInput());
  emitInstruction(std::move(instruction));
}
|
|
|
|
/// Lower vector softmax: rd = out, rs1 = in; byte length from the input type.
void PimCodeGen::codeGenVSoftmaxOp(pim::PimVSoftmaxOp vsoftmaxOp, const StaticValueKnowledge& knowledge) const {
  const size_t dstAddr = addressOf(vsoftmaxOp.getOutputBuffer(), knowledge);
  const size_t srcAddr = addressOf(vsoftmaxOp.getInput(), knowledge);
  setupRdRs1(dstAddr, 0, srcAddr, 0);

  json::Object instruction;
  instruction["op"] = "vsoftmax";
  instruction["rd"] = 0;
  instruction["rs1"] = 1;
  instruction["offset"] = createEmptyOffset();
  instruction["len"] = getValueSizeInBytes(vsoftmaxOp.getInput());
  emitInstruction(std::move(instruction));
}
|
|
|
|
// Intentionally a no-op: get_global emits no device instructions here; its
// backing storage is resolved through the memory maps at address-lookup time.
void PimCodeGen::codeGetGlobalOp(memref::GetGlobalOp getGlobalOp, const StaticValueKnowledge& knowledge) const {}
|
|
|
|
/// Lower pim.transpose by emitting one element-sized "lmv" copy per element
/// (so numElements instructions total — potentially large for big tensors).
/// Both buffers are addressed as densely packed row-major arrays; no layout
/// attribute is consulted — confirm if strided memrefs can reach this path.
void PimCodeGen::codeGenTransposeOp(pim::PimTransposeOp transposeOp, const StaticValueKnowledge& knowledge) const {
  auto srcAddr = addressOf(transposeOp.getInput(), knowledge);
  auto dstAddr = addressOf(transposeOp.getOutputBuffer(), knowledge);

  auto srcType = cast<ShapedType>(transposeOp.getInput().getType());
  auto srcShape = srcType.getShape();
  size_t rank = srcShape.size();
  size_t elementSize = srcType.getElementTypeBitWidth() / 8;
  size_t totalElements = srcType.getNumElements();

  // Read permutation. Destination dim i corresponds to source dim perm[i].
  SmallVector<int64_t> perm = map_to_vector(transposeOp.getPermutation().getAsRange<IntegerAttr>(),
      [](auto attr) -> int64_t { return attr.getInt(); });

  // Destination shape: dstShape[i] = srcShape[perm[i]]
  SmallVector<int64_t> dstShape(rank);
  for (size_t i = 0; i < rank; i++)
    dstShape[i] = srcShape[perm[i]];

  // Row-major strides for source and destination
  // (innermost dim has stride 1; the rank-2 start index skips it).
  SmallVector<size_t> srcStrides(rank, 1);
  SmallVector<size_t> dstStrides(rank, 1);
  for (int64_t i = rank - 2; i >= 0; i--) {
    srcStrides[i] = srcStrides[i + 1] * srcShape[i + 1];
    dstStrides[i] = dstStrides[i + 1] * dstShape[i + 1];
  }

  // Emit element-by-element copy with transposed addressing
  for (size_t srcFlat = 0; srcFlat < totalElements; srcFlat++) {
    // Decompose flat source index into multi-dimensional index
    SmallVector<size_t> srcIdx(rank);
    size_t remaining = srcFlat;
    for (size_t d = 0; d < rank; d++) {
      srcIdx[d] = remaining / srcStrides[d];
      remaining %= srcStrides[d];
    }

    // Compute flat destination index: dstIdx[d] = srcIdx[perm[d]]
    size_t dstFlat = 0;
    for (size_t d = 0; d < rank; d++)
      dstFlat += srcIdx[perm[d]] * dstStrides[d];

    // One "lmv" moving a single element to its transposed position.
    emitMemCopyOp("lmv", dstAddr, dstFlat * elementSize, srcAddr, srcFlat * elementSize, elementSize, "len");
  }
}
|
|
|
|
/// Return the larger of the first two dimensions of a weight matrix.
/// Only rank-2 and rank-4 shapes are accepted.
///
/// Fix/idiom: the previous `if (...) assert(false && ...)` left a dead branch
/// and (under NDEBUG) silently fell through anyway; a single assert expresses
/// the precondition directly with identical debug-build behavior.
///
/// NOTE(review): for rank-4 shapes only dims 0 and 1 are consulted — confirm
/// this matches the intended tiled-weight layout.
size_t getMatrixSize(ShapedType matrixShape) {
  assert((matrixShape.getRank() == 2 || matrixShape.getRank() == 4) && "Unsupported matrix shape");
  return std::max(matrixShape.getDimSize(0), matrixShape.getDimSize(1));
}
|
|
|
|
/// Render `size` using the largest 1024-based unit whose threshold it
/// strictly exceeds, with truncating integer division (e.g. 1536 -> "1 KB",
/// exactly 1024 -> "1024 Bytes").
std::string getMemorySizeAsString(size_t size) {
  struct Unit {
    size_t threshold;
    const char* suffix;
  };
  static constexpr Unit kUnits[] = {
      {1024ull * 1024 * 1024, " GB"},
      {1024ull * 1024, " MB"},
      {1024ull, " KB"},
  };
  for (const Unit& unit : kUnits)
    if (size > unit.threshold)
      return std::to_string(size / unit.threshold) + unit.suffix;
  return std::to_string(size) + " Bytes";
}
|
|
|
|
/// Collect, sorted ascending and de-duplicated, every weight index referenced
/// by pim.mvm and pim.vmm ops inside `block`.
static SmallVector<unsigned, 8> getUsedWeightIndices(Block& block) {
  SmallVector<unsigned, 8> indices;
  auto recordIndex = [&indices](unsigned weightIndex) {
    // Linear dedup is fine here: the index set is small.
    if (!llvm::is_contained(indices, weightIndex))
      indices.push_back(weightIndex);
  };

  block.walk([&](pim::PimMVMOp mvmOp) { recordIndex(mvmOp.getWeightIndex()); });
  block.walk([&](pim::PimVMMOp vmmOp) { recordIndex(vmmOp.getWeightIndex()); });
  llvm::sort(indices);
  return indices;
}
|
|
|
|
/// Convenience overload: scan the core op's entry body block.
static SmallVector<unsigned, 8> getUsedWeightIndices(pim::PimCoreOp coreOp) {
  return getUsedWeightIndices(coreOp.getBody().front());
}
|
|
|
|
/// Read the per-lane physical core ids attached to a pim.core_batch op; the
/// attribute is mandatory.
static SmallVector<int32_t> getBatchCoreIds(pim::PimCoreBatchOp coreBatchOp) {
  auto coreIdsAttr = coreBatchOp->getAttrOfType<DenseI32ArrayAttr>(onnx_mlir::kCoreIdsAttrName);
  assert(coreIdsAttr && "pim.core_batch requires coreIds array attribute");
  ArrayRef<int32_t> coreIds = coreIdsAttr.asArrayRef();
  return SmallVector<int32_t>(coreIds.begin(), coreIds.end());
}
|
|
|
|
/// Gather the pim.core / pim.core_batch ops in `funcOp`'s entry block, in
/// program order; ops nested deeper are intentionally not visited.
///
/// Idiom: the original used `dyn_cast` purely as a boolean test; multi-type
/// `isa<>` is the LLVM idiom for that check.
static SmallVector<Operation*> collectTopLevelCoreLikeOps(func::FuncOp funcOp) {
  SmallVector<Operation*> coreLikeOps;
  for (Operation& op : funcOp.getBody().front())
    if (isa<pim::PimCoreOp, pim::PimCoreBatchOp>(op))
      coreLikeOps.push_back(&op);
  return coreLikeOps;
}
|
|
|
|
/// Clone lane `lane` of a pim.core_batch into a standalone pim.core op,
/// inserted right after the batch op. Batch communication/copy ops are
/// rewritten to their scalar equivalents using the lane's slice of the
/// per-lane operand/attribute arrays; all other ops are cloned verbatim with
/// values remapped through `mapper`.
static pim::PimCoreOp materializeScalarCoreFromBatchLane(pim::PimCoreBatchOp coreBatchOp, unsigned lane) {
  OpBuilder builder(coreBatchOp);
  builder.setInsertionPointAfter(coreBatchOp);

  // Weights are flattened lane-major: lane i owns
  // weights[i*weightsPerLane .. (i+1)*weightsPerLane).
  size_t laneCount = static_cast<size_t>(coreBatchOp.getLaneCount());
  size_t weightsPerLane = coreBatchOp.getWeights().size() / laneCount;
  SmallVector<mlir::Value> laneWeights;
  laneWeights.reserve(weightsPerLane);
  for (size_t weightIndex = 0; weightIndex < weightsPerLane; ++weightIndex)
    laneWeights.push_back(coreBatchOp.getWeights()[lane * weightsPerLane + weightIndex]);

  auto coreIds = getBatchCoreIds(coreBatchOp);
  auto scalarCore = pim::PimCoreOp::create(
      builder, coreBatchOp.getLoc(), ValueRange(laneWeights), builder.getI32IntegerAttr(coreIds[lane]));
  Block* block = builder.createBlock(&scalarCore.getBody(), scalarCore.getBody().end());
  IRMapping mapper;
  // The batch body's single block argument (when present) stands for this
  // lane's input value.
  if (coreBatchOp.getBody().front().getNumArguments() == 1)
    mapper.map(coreBatchOp.getBody().front().getArgument(0), coreBatchOp.getInputs()[lane]);

  builder.setInsertionPointToEnd(block);
  for (Operation& op : coreBatchOp.getBody().front()) {
    // halt carries no operands or results; re-create it directly.
    if (isa<pim::PimHaltOp>(op)) {
      pim::PimHaltOp::create(builder, op.getLoc());
      continue;
    }

    // send_batch -> send, using this lane's target core id.
    // (mapper.lookup asserts if the input was never mapped.)
    if (auto sendBatchOp = dyn_cast<pim::PimSendBatchOp>(op)) {
      pim::PimSendOp::create(builder,
          sendBatchOp.getLoc(),
          mapper.lookup(sendBatchOp.getInput()),
          sendBatchOp.getSizeAttr(),
          builder.getI32IntegerAttr(sendBatchOp.getTargetCoreIds()[lane]));
      continue;
    }

    // send_many_batch -> send_many: target ids are stored value-major with
    // one entry per (value, lane) pair — pick this lane's column.
    if (auto sendManyBatchOp = dyn_cast<pim::PimSendManyBatchOp>(op)) {
      SmallVector<int32_t> laneTargetCoreIds;
      laneTargetCoreIds.reserve(sendManyBatchOp.getInputs().size());
      for (auto valueIndex : llvm::seq<size_t>(0, sendManyBatchOp.getInputs().size()))
        laneTargetCoreIds.push_back(
            sendManyBatchOp.getTargetCoreIds()[valueIndex * laneCount + static_cast<size_t>(lane)]);

      SmallVector<mlir::Value> mappedInputs;
      mappedInputs.reserve(sendManyBatchOp.getInputs().size());
      for (mlir::Value input : sendManyBatchOp.getInputs())
        mappedInputs.push_back(mapper.lookup(input));

      pim::PimSendManyOp::create(builder,
          sendManyBatchOp.getLoc(),
          builder.getDenseI32ArrayAttr(laneTargetCoreIds),
          ValueRange(mappedInputs));
      continue;
    }

    // receive_batch -> receive, using this lane's source core id; the scalar
    // result replaces the batch result for later uses.
    if (auto receiveBatchOp = dyn_cast<pim::PimReceiveBatchOp>(op)) {
      auto scalarReceive =
          pim::PimReceiveOp::create(builder,
              receiveBatchOp.getLoc(),
              receiveBatchOp.getOutput().getType(),
              mapper.lookup(receiveBatchOp.getOutputBuffer()),
              receiveBatchOp.getSizeAttr(),
              builder.getI32IntegerAttr(receiveBatchOp.getSourceCoreIds()[lane]));
      mapper.map(receiveBatchOp.getOutput(), scalarReceive.getOutput());
      continue;
    }

    // receive_many_batch -> receive_many: source ids laid out value-major,
    // same slicing as send_many_batch above.
    if (auto receiveManyBatchOp = dyn_cast<pim::PimReceiveManyBatchOp>(op)) {
      SmallVector<int32_t> laneSourceCoreIds;
      laneSourceCoreIds.reserve(receiveManyBatchOp.getOutputs().size());
      for (auto valueIndex : llvm::seq<size_t>(0, receiveManyBatchOp.getOutputs().size()))
        laneSourceCoreIds.push_back(
            receiveManyBatchOp.getSourceCoreIds()[valueIndex * laneCount + static_cast<size_t>(lane)]);

      SmallVector<mlir::Value> mappedOutputBuffers;
      mappedOutputBuffers.reserve(receiveManyBatchOp.getOutputBuffers().size());
      for (mlir::Value outputBuffer : receiveManyBatchOp.getOutputBuffers())
        mappedOutputBuffers.push_back(mapper.lookup(outputBuffer));

      auto scalarReceiveMany =
          pim::PimReceiveManyOp::create(builder,
              receiveManyBatchOp.getLoc(),
              receiveManyBatchOp->getResultTypes(),
              ValueRange(mappedOutputBuffers),
              builder.getDenseI32ArrayAttr(laneSourceCoreIds));
      // Remap every batch result to the matching scalar result.
      for (auto [originalOutput, scalarOutput] : llvm::zip(receiveManyBatchOp.getOutputs(), scalarReceiveMany.getOutputs()))
        mapper.map(originalOutput, scalarOutput);
      continue;
    }

    // host->device copy: the host source may live outside the batch body, in
    // which case it has no mapping and is used as-is.
    if (auto memcpBatchOp = dyn_cast<pim::PimMemCopyHostToDevBatchOp>(op)) {
      mlir::Value hostSource = mapper.lookupOrNull(memcpBatchOp.getHostSource());
      if (!hostSource)
        hostSource = memcpBatchOp.getHostSource();

      auto scalarCopy = pim::PimMemCopyHostToDevOp::create(builder,
          memcpBatchOp.getLoc(),
          memcpBatchOp.getOutput().getType(),
          mapper.lookup(memcpBatchOp.getDeviceTarget()),
          hostSource,
          memcpBatchOp.getDeviceTargetOffsetAttr(),
          memcpBatchOp.getHostSourceOffsetAttr(),
          memcpBatchOp.getSizeAttr());
      mapper.map(memcpBatchOp.getOutput(), scalarCopy.getOutput());
      continue;
    }

    // Everything else clones unchanged, with operands remapped; results are
    // registered so later ops in this loop can refer to them.
    Operation* cloned = builder.clone(op, mapper);
    for (auto [originalResult, clonedResult] : llvm::zip(op.getResults(), cloned->getResults()))
      mapper.map(originalResult, clonedResult);
  }

  // Guarantee the scalar core terminates with a halt.
  if (block->empty() || !isa<pim::PimHaltOp>(block->back()))
    pim::PimHaltOp::create(builder, coreBatchOp.getLoc());
  return scalarCore;
}
|
|
|
|
/// After materializing a scalar core, get_globals inside it may lack a host
/// memory entry. For each such get_global, find another get_global anywhere
/// in the function that refers to the same global and already has an entry,
/// and alias the new one to that existing allocation.
static void aliasMaterializedHostGlobals(ModuleOp moduleOp,
    func::FuncOp funcOp,
    pim::PimCoreOp coreOp,
    PimAcceleratorMemory& memory) {
  coreOp.walk([&](memref::GetGlobalOp getGlobalOp) {
    // Skip weights (handled on-device) and values already mapped.
    if (hasWeightAlways(getGlobalOp) || memory.memEntriesMap.contains(getGlobalOp.getResult()))
      return;

    auto targetGlobal = lookupGlobalForGetGlobal(moduleOp, getGlobalOp);
    if (!targetGlobal)
      return;

    // Linear scan: the first already-allocated get_global of the same global
    // wins (the flag makes later visits no-ops; the walk still completes).
    mlir::Value aliasedValue;
    funcOp.walk([&](memref::GetGlobalOp candidate) {
      if (aliasedValue || candidate == getGlobalOp || !memory.memEntriesMap.contains(candidate.getResult()))
        return;
      if (lookupGlobalForGetGlobal(moduleOp, candidate) == targetGlobal)
        aliasedValue = candidate.getResult();
    });

    if (aliasedValue)
      memory.memEntriesMap[getGlobalOp.getResult()] = memory.memEntriesMap[aliasedValue];
  });
}
|
|
|
|
/// Write global constant data into a binary memory image at their allocated addresses.
|
|
static OnnxMlirCompilerErrorCodes
|
|
writeMemoryBinary(ModuleOp moduleOp, func::FuncOp funcOp, PimAcceleratorMemory& memory, StringRef outputDirPath) {
|
|
auto memoryFilePath = (outputDirPath + "/memory.bin").str();
|
|
std::error_code errorCode;
|
|
raw_fd_ostream memoryFileStream(memoryFilePath, errorCode, sys::fs::OF_None);
|
|
if (errorCode) {
|
|
errs() << "Error while opening memory file " << memoryFilePath << ": " << errorCode.message() << '\n';
|
|
return InvalidOutputFileAccess;
|
|
}
|
|
|
|
std::vector<char> memoryBuffer(memory.hostMem.getFirstAvailableAddress(), 0);
|
|
|
|
SmallPtrSet<Operation*, 16> writtenGlobals;
|
|
funcOp.walk([&](memref::GetGlobalOp getGlobalOp) {
|
|
if (hasWeightAlways(getGlobalOp))
|
|
return;
|
|
auto globalOp = lookupGlobalForGetGlobal(moduleOp, getGlobalOp);
|
|
if (!globalOp)
|
|
return;
|
|
if (!writtenGlobals.insert(globalOp.getOperation()).second)
|
|
return;
|
|
auto initialValue = globalOp.getInitialValue();
|
|
if (!initialValue)
|
|
return;
|
|
auto denseAttr = dyn_cast<DenseElementsAttr>(*initialValue);
|
|
if (!denseAttr)
|
|
return;
|
|
|
|
MemEntry memEntry = memory.hostMem.getMemEntry(getGlobalOp.getResult());
|
|
ArrayRef<char> rawData = denseAttr.getRawData();
|
|
char* dst = memoryBuffer.data() + memEntry.address;
|
|
|
|
if (denseAttr.isSplat()) {
|
|
size_t elementSize = rawData.size();
|
|
assert(elementSize * getGlobalOp.getType().getNumElements() == memEntry.size && "Data size mismatch");
|
|
for (size_t offset = 0; offset < memEntry.size; offset += elementSize)
|
|
std::memcpy(dst + offset, rawData.data(), std::min(elementSize, memEntry.size - offset));
|
|
}
|
|
else {
|
|
assert(rawData.size() == memEntry.size && "Data size mismatch");
|
|
std::memcpy(dst, rawData.data(), rawData.size());
|
|
}
|
|
});
|
|
|
|
memoryFileStream.write(memoryBuffer.data(), memoryBuffer.size());
|
|
memoryFileStream.close();
|
|
return CompilerSuccess;
|
|
}
|
|
|
|
/// Dispatch all operations in a core region to the appropriate code generator.
/// scf.for loops are statically unrolled via walkPimCoreBlock so that addressing is
/// fully resolved before the JSON instructions are emitted.
/// Returns the number of emitted instructions, or -1 on failure.
static int64_t codeGenCoreOps(Block& block, PimCodeGen& coreCodeGen) {
  // Counts operations successfully dispatched; only valid if the walk succeeds.
  size_t processedOperations = 0;
  auto result =
      walkPimCoreBlock(block, StaticValueKnowledge {}, [&](Operation& op, const StaticValueKnowledge& knowledge) {
        // One dyn_cast branch per supported PIM op; the final `else` rejects
        // anything this backend cannot lower.
        // Host<->device and on-chip data movement:
        if (auto loadOp = dyn_cast<pim::PimMemCopyHostToDevOp>(op))
          coreCodeGen.codeGenLoadOp(loadOp, knowledge);
        else if (auto storeOp = dyn_cast<pim::PimMemCopyDevToHostOp>(op))
          coreCodeGen.codeGenStoreOp(storeOp, knowledge);
        else if (auto lmvOp = dyn_cast<pim::PimMemCopyOp>(op))
          coreCodeGen.codeGenLmvOp(lmvOp, knowledge);
        // Inter-core communication (single-peer and multi-peer variants):
        else if (auto receiveOp = dyn_cast<pim::PimReceiveOp>(op))
          coreCodeGen.codeGenReceiveOp(receiveOp, knowledge);
        else if (auto receiveManyOp = dyn_cast<pim::PimReceiveManyOp>(op))
          coreCodeGen.codeGenReceiveManyOp(receiveManyOp, knowledge);
        else if (auto sendOp = dyn_cast<pim::PimSendOp>(op))
          coreCodeGen.codeGenSendOp(sendOp, knowledge);
        else if (auto sendManyOp = dyn_cast<pim::PimSendManyOp>(op))
          coreCodeGen.codeGenSendManyOp(sendManyOp, knowledge);
        // Data reshaping:
        else if (auto extractRowsOp = dyn_cast<pim::PimExtractRowsOp>(op))
          coreCodeGen.codeGenExtractRowsOp(extractRowsOp, knowledge);
        else if (auto concatOp = dyn_cast<pim::PimConcatOp>(op))
          coreCodeGen.codeGenConcatOp(concatOp, knowledge);
        // Crossbar matrix products share one lowering; the boolean flag
        // selects the VMM (true) vs MVM (false) variant.
        else if (auto vmmOp = dyn_cast<pim::PimVMMOp>(op))
          coreCodeGen.codeGenMVMLikeOp<pim::PimVMMOp>(vmmOp.getWeightIndex(), vmmOp, true, knowledge);
        else if (auto mvmOp = dyn_cast<pim::PimMVMOp>(op))
          coreCodeGen.codeGenMVMLikeOp<pim::PimMVMOp>(mvmOp.getWeightIndex(), mvmOp, false, knowledge);
        else if (auto transposeOp = dyn_cast<pim::PimTransposeOp>(op))
          coreCodeGen.codeGenTransposeOp(transposeOp, knowledge);
        // Element-wise vector arithmetic:
        else if (auto vvaddOp = dyn_cast<pim::PimVVAddOp>(op))
          coreCodeGen.codeGenVVAddOp(vvaddOp, knowledge);
        else if (auto vvsubOp = dyn_cast<pim::PimVVSubOp>(op))
          coreCodeGen.codeGenVVSubOp(vvsubOp, knowledge);
        else if (auto vvmulOp = dyn_cast<pim::PimVVMulOp>(op))
          coreCodeGen.codeGenVVMulOp(vvmulOp, knowledge);
        else if (auto vvmaxOp = dyn_cast<pim::PimVVMaxOp>(op))
          coreCodeGen.codeGenVVMaxOp(vvmaxOp, knowledge);
        else if (auto vvdmulOp = dyn_cast<pim::PimVVDMulOp>(op))
          coreCodeGen.codeGenVVDMulOp(vvdmulOp, knowledge);
        else if (auto vavgOp = dyn_cast<pim::PimVAvgOp>(op))
          coreCodeGen.codeGenVAvgOp(vavgOp, knowledge);
        // Activation functions:
        else if (auto vreluOp = dyn_cast<pim::PimVReluOp>(op))
          coreCodeGen.codeGenVReluOp(vreluOp, knowledge);
        else if (auto vtanhOp = dyn_cast<pim::PimVTanhOp>(op))
          coreCodeGen.codeGenVTanhOp(vtanhOp, knowledge);
        else if (auto vsigmOp = dyn_cast<pim::PimVSigmOp>(op))
          coreCodeGen.codeGenVSigmOp(vsigmOp, knowledge);
        else if (auto vsoftmaxOp = dyn_cast<pim::PimVSoftmaxOp>(op))
          coreCodeGen.codeGenVSoftmaxOp(vsoftmaxOp, knowledge);
        else if (auto getGlobalOp = dyn_cast<memref::GetGlobalOp>(op))
          coreCodeGen.codeGetGlobalOp(getGlobalOp, knowledge);
        else {
          op.emitError("Unsupported codegen for this operation");
          op.dump();
          return failure();
        }
        processedOperations++;
        return success();
      });
  // -1 signals failure to the caller; otherwise the instruction count.
  return failed(result) ? -1 : static_cast<int64_t>(processedOperations);
}
|
|
|
|
/// Write crossbar weight matrices as padded binary files for a single core.
|
|
static OnnxMlirCompilerErrorCodes writeCrossbarWeights(ModuleOp moduleOp,
|
|
pim::PimCoreOp coreOp,
|
|
StringRef coreWeightsDirPath,
|
|
json::Array& xbarsPerGroup) {
|
|
int64_t xbarSize = crossbarSize.getValue();
|
|
std::error_code errorCode;
|
|
size_t weightIndex = 0;
|
|
|
|
for (auto weight : coreOp.getWeights()) {
|
|
xbarsPerGroup.push_back(weightIndex);
|
|
|
|
auto getGlobalOp = weight.getDefiningOp<memref::GetGlobalOp>();
|
|
if (!getGlobalOp) {
|
|
coreOp.emitWarning("Weight is not from a memref.get_global at index " + std::to_string(weightIndex));
|
|
weightIndex++;
|
|
continue;
|
|
}
|
|
|
|
auto globalOp = lookupGlobalForGetGlobal(moduleOp, getGlobalOp);
|
|
if (!globalOp) {
|
|
coreOp.emitWarning("Could not find memref.global for weight at index " + std::to_string(weightIndex));
|
|
weightIndex++;
|
|
continue;
|
|
}
|
|
|
|
auto initialValue = globalOp.getInitialValue();
|
|
if (!initialValue) {
|
|
coreOp.emitWarning("memref.global has no initial value at index " + std::to_string(weightIndex));
|
|
weightIndex++;
|
|
continue;
|
|
}
|
|
|
|
auto denseAttr = dyn_cast<DenseElementsAttr>(*initialValue);
|
|
if (!denseAttr) {
|
|
coreOp.emitWarning("memref.global initial value is not dense at index " + std::to_string(weightIndex));
|
|
weightIndex++;
|
|
continue;
|
|
}
|
|
|
|
auto type = denseAttr.getType();
|
|
auto shape = type.getShape();
|
|
assert(isMatrixShape(shape) && "Weight matrix must be 2-dimensional");
|
|
int64_t numRows = shape[0];
|
|
int64_t numCols = shape[1];
|
|
assert(numRows <= xbarSize && numCols <= xbarSize && "Weight dimensions must not exceed crossbar size");
|
|
|
|
size_t elementByteWidth = type.getElementType().getIntOrFloatBitWidth() / 8;
|
|
|
|
auto weightFilePath = (coreWeightsDirPath + "/crossbar_" + std::to_string(weightIndex) + ".bin").str();
|
|
raw_fd_ostream weightFileStream(weightFilePath, errorCode, sys::fs::OF_None);
|
|
if (errorCode) {
|
|
errs() << "Error while opening weight file `" << weightFilePath << "`: " << errorCode.message() << '\n';
|
|
return InvalidOutputFileAccess;
|
|
}
|
|
|
|
uint64_t zero = 0;
|
|
for (int64_t row = 0; row < xbarSize; row++) {
|
|
for (int64_t col = 0; col < xbarSize; col++) {
|
|
if (row < numRows && col < numCols) {
|
|
int64_t index = row * numCols + col;
|
|
APInt bits = denseAttr.getValues<APFloat>()[index].bitcastToAPInt();
|
|
uint64_t word = bits.getZExtValue();
|
|
weightFileStream.write(reinterpret_cast<const char*>(&word), elementByteWidth);
|
|
}
|
|
else {
|
|
weightFileStream.write(reinterpret_cast<const char*>(&zero), elementByteWidth);
|
|
}
|
|
}
|
|
}
|
|
|
|
weightFileStream.close();
|
|
weightIndex++;
|
|
}
|
|
|
|
return CompilerSuccess;
|
|
}
|
|
|
|
/// Materialize every crossbar weight used by any core into
/// `<outputDirPath>/weights/` as zero-padded binary files, deduplicating by
/// the underlying memref.global so each distinct weight is written once.
/// Returns, per original core ID, a map from the weight SSA value to the
/// file name holding its data.
llvm::DenseMap<size_t, llvm::DenseMap<mlir::Value, std::string>>
createAndPopulateWeightFolder(func::FuncOp funcOp, StringRef outputDirPath) {
  ModuleOp moduleOp = funcOp->getParentOfType<ModuleOp>();
  auto coreWeightsDirPath = outputDirPath + "/weights";
  auto error = sys::fs::create_directory(coreWeightsDirPath);
  assert(!error && "Error creating weights directory");
  size_t indexFileName = 0;

  int64_t xbarSize = crossbarSize.getValue();
  llvm::DenseMap<size_t, llvm::DenseMap<mlir::Value, std::string>> mapCoreWeightToFileName;
  // Tracks which memref.global was already written, and to which file.
  llvm::DenseMap<memref::GlobalOp, std::string> mapGlobalOpToFileName;

  SmallVector<Operation*> coreLikeOps = collectTopLevelCoreLikeOps(funcOp);

  for (Operation* op : coreLikeOps) {
    // Batch cores are temporarily expanded into one scalar core per lane;
    // the materialized scalar cores are erased at the end of this iteration.
    SmallVector<pim::PimCoreOp> scalarCores;
    if (auto coreOp = dyn_cast<pim::PimCoreOp>(op)) {
      scalarCores.push_back(coreOp);
    }
    else {
      auto coreBatchOp = cast<pim::PimCoreBatchOp>(op);
      for (unsigned lane = 0; lane < static_cast<unsigned>(coreBatchOp.getLaneCount()); ++lane)
        scalarCores.push_back(materializeScalarCoreFromBatchLane(coreBatchOp, lane));
    }

    for (pim::PimCoreOp coreOp : scalarCores) {
      size_t coreId = static_cast<size_t>(coreOp.getCoreId());
      for (unsigned index : getUsedWeightIndices(coreOp)) {
        if (index >= coreOp.getWeights().size()) {
          coreOp.emitWarning("Weight index " + std::to_string(index) + " is out of range");
          assert(index < coreOp.getWeights().size() && "Weight index is out of range");
        }
        mlir::Value weight = coreOp.getWeights()[index];

        // Resolve the weight to its dense constant data. The asserts below
        // previously had inverted conditions (e.g. `assert(!getGlobalOp)`
        // inside `if (!getGlobalOp)`) and could never fire, letting debug
        // builds continue into a null dereference; they now abort on the
        // error path. Release builds still proceed — TODO: decide on a
        // recoverable error path for release.
        auto getGlobalOp = weight.getDefiningOp<memref::GetGlobalOp>();
        if (!getGlobalOp) {
          coreOp.emitWarning("Weight is not from a memref.get_global at index " + std::to_string(index));
          assert(getGlobalOp && "Weight is not from a memref.get_global");
        }

        auto globalOp = lookupGlobalForGetGlobal(moduleOp, getGlobalOp);
        if (!globalOp) {
          coreOp.emitWarning("Could not find memref.global for weight at index " + std::to_string(index));
          assert(globalOp && "Could not find memref.global");
        }

        auto initialValue = globalOp.getInitialValue();
        if (!initialValue) {
          coreOp.emitWarning("memref.global has no initial value at index " + std::to_string(index));
          assert(initialValue && "memref.global has no initial value");
        }

        auto denseAttr = dyn_cast<DenseElementsAttr>(*initialValue);
        if (!denseAttr) {
          coreOp.emitWarning("memref.global initial value is not dense at index " + std::to_string(index));
          assert(denseAttr && "memref.global initial value is not dense");
        }

        // Already materialized for another core/weight? Just record the alias.
        if (mapGlobalOpToFileName.contains(globalOp)) {
          auto& fileName = mapGlobalOpToFileName[globalOp];
          std::pair<mlir::Value, std::string> weightToFile = {weight, fileName};
          mapCoreWeightToFileName[coreId].insert(weightToFile);
          continue;
        }

        auto type = denseAttr.getType();
        auto shape = type.getShape();
        assert(isMatrixShape(shape) && "Weight matrix must be 2-dimensional");
        int64_t numRows = shape[0];
        int64_t numCols = shape[1];
        assert(numRows <= xbarSize && numCols <= xbarSize && "Weight dimensions must not exceed crossbar size");

        size_t elementByteWidth = type.getElementType().getIntOrFloatBitWidth() / 8;

        std::string newFileName = "crossbar_" + std::to_string(indexFileName++) + ".bin";
        auto weightFilePath = (coreWeightsDirPath + "/" + newFileName).str();
        std::error_code errorCode;
        raw_fd_ostream weightFileStream(weightFilePath, errorCode, sys::fs::OF_None);
        if (errorCode) {
          errs() << "Error while opening weight file `" << weightFilePath << "`: " << errorCode.message() << '\n';
          // Previously `assert(errorCode)` — trivially true here and never
          // firing; now aborts (debug) instead of writing to a dead stream.
          assert(!errorCode && "Error while opening weight file");
        }

        // Hoisted out of the per-element loop (loop-invariant). Assumes a
        // floating-point element type — getValues<APFloat> asserts otherwise.
        auto values = denseAttr.getValues<APFloat>();

        // Emit the full xbarSize x xbarSize matrix, zero-padding positions
        // beyond the actual weight shape.
        uint64_t zero = 0;
        for (int64_t row = 0; row < xbarSize; row++) {
          for (int64_t col = 0; col < xbarSize; col++) {
            if (row < numRows && col < numCols) {
              int64_t index = row * numCols + col;
              APInt bits = values[index].bitcastToAPInt();
              uint64_t word = bits.getZExtValue();
              weightFileStream.write(reinterpret_cast<const char*>(&word), elementByteWidth);
            }
            else {
              weightFileStream.write(reinterpret_cast<const char*>(&zero), elementByteWidth);
            }
          }
        }

        weightFileStream.close();
        mapGlobalOpToFileName.insert({globalOp, newFileName});
        mapCoreWeightToFileName[coreId].insert({weight, newFileName});
      }
    }

    // Erase the temporary scalar cores materialized from batch lanes.
    for (pim::PimCoreOp coreOp : scalarCores)
      if (coreOp.getOperation() != op)
        coreOp.erase();
  }
  return mapCoreWeightToFileName;
}
|
|
|
|
/// Write the top-level PIM configuration JSON (core count, crossbar config, I/O addresses).
|
|
static OnnxMlirCompilerErrorCodes writeConfigJson(func::FuncOp funcOp,
|
|
PimAcceleratorMemory& memory,
|
|
size_t maxCoreId,
|
|
json::Object xbarsPerArrayGroup,
|
|
StringRef outputDirPath) {
|
|
json::Object configJson;
|
|
|
|
// pimsim-nn indexes cores directly by their numeric core ID, with the host
|
|
// occupying core 0.
|
|
configJson["core_cnt"] = maxCoreId + 1;
|
|
|
|
// TODO: Should this be based on the floating point type used in the model?
|
|
// The 2 following values determine the bitwidth of the vectors' elements: bitwidth = adc_count * cell_precision
|
|
|
|
// Number of ADC for MVM units
|
|
configJson["adc_count"] = 16;
|
|
// The bit precision of each ADC
|
|
configJson["cell_precision"] = 2;
|
|
|
|
// Crossbar configuration
|
|
configJson["xbar_array_count"] = crossbarCountInCore.getValue();
|
|
configJson["xbar_size"] = {crossbarSize.getValue(), crossbarSize.getValue()};
|
|
configJson["array_group_map"] = std::move(xbarsPerArrayGroup);
|
|
|
|
// Memory layout of inputs and outputs
|
|
json::Array inputsAddresses;
|
|
for (BlockArgument input : funcOp.getArguments())
|
|
inputsAddresses.push_back(memory.getValueAddress(input));
|
|
configJson["inputs_addresses"] = std::move(inputsAddresses);
|
|
|
|
json::Array outputsAddresses;
|
|
for (func::ReturnOp returnOp : funcOp.getOps<func::ReturnOp>())
|
|
for (mlir::Value output : returnOp.getOperands())
|
|
outputsAddresses.push_back(memory.getValueAddress(output));
|
|
configJson["outputs_addresses"] = std::move(outputsAddresses);
|
|
|
|
auto configPath = (outputDirPath + "/config.json").str();
|
|
std::error_code errorCode;
|
|
raw_fd_ostream jsonOS(configPath, errorCode);
|
|
if (errorCode) {
|
|
errs() << "Error while opening config file: " << errorCode.message() << '\n';
|
|
return InvalidOutputFileAccess;
|
|
}
|
|
jsonOS << json::Value(std::move(configJson)) << '\n';
|
|
jsonOS.close();
|
|
|
|
return CompilerSuccess;
|
|
}
|
|
|
|
/// Compile the PIM module into the pimsim-nn on-disk format:
/// memory.bin (host memory image), config.json, one core_<N>.json instruction
/// stream per core (core 0 is a stub for the host), and per-core weight
/// directories linking into a shared, deduplicated weights/ folder.
OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std::string& outputDirPath) {
  if (!outputDirPath.empty()) {
    if (auto error = sys::fs::create_directory(outputDirPath)) {
      errs() << "Error creating output directory: " << outputDirPath << ": " << error.message() << '\n';
      return InvalidOutputFileAccess;
    }
  }

  auto entryFunc = getPimEntryFunc(moduleOp);
  if (failed(entryFunc))
    return CompilerFailure;
  auto funcOp = *entryFunc;

  // Allocate host memory for all values, then dump the constant image.
  PimAcceleratorMemory memory;
  memory.hostMem.allocateHost(moduleOp, funcOp);
  memory.reportHost();

  if (auto err = writeMemoryBinary(moduleOp, funcOp, memory, outputDirPath))
    return err;

  // Write empty host core file
  std::error_code errorCode;
  auto outputHostCorePath = outputDirPath + "/core_0.json";
  raw_fd_ostream hostFileStream(outputHostCorePath, errorCode);
  if (errorCode) {
    errs() << "Error while opening host core file `" << outputHostCorePath << "`: " << errorCode.message() << '\n';
    return InvalidOutputFileAccess;
  }
  // The host core json contains 2 random instructions, just to make pimsim-nn happy
  hostFileStream << "[{\"imm\":0,\"op\":\"sldi\",\"rd\":0},{\"imm\":0,\"op\":\"sldi\",\"rd\":0}]";
  hostFileStream.close();

  // For each core, specify the number of crossbar per array group.
  // This implementation always assigns one crossbar per group.
  json::Object xbarsPerArrayGroup;
  size_t maxCoreId = 0;

  // Create Weight Folder
  auto mapCoreWeightToFileName = createAndPopulateWeightFolder(funcOp, outputDirPath);

  SmallVector<Operation*> coreLikeOps = collectTopLevelCoreLikeOps(funcOp);
  llvm::DenseMap<size_t, size_t> emittedCoreIds;
  size_t nextEmittedCoreId = 1;

  // First pass: assign dense emitted core IDs (starting at 1; the host owns
  // ID 0) in deterministic op order, covering both scalar and batch cores.
  for (Operation* op : coreLikeOps) {
    if (auto coreOp = dyn_cast<pim::PimCoreOp>(op)) {
      size_t originalCoreId = static_cast<size_t>(coreOp.getCoreId());
      if (!emittedCoreIds.contains(originalCoreId))
        emittedCoreIds[originalCoreId] = nextEmittedCoreId++;
      continue;
    }

    auto coreBatchOp = cast<pim::PimCoreBatchOp>(op);
    auto batchCoreIds = getBatchCoreIds(coreBatchOp);
    for (unsigned lane = 0; lane < static_cast<unsigned>(coreBatchOp.getLaneCount()); ++lane) {
      size_t originalCoreId = static_cast<size_t>(batchCoreIds[lane]);
      if (!emittedCoreIds.contains(originalCoreId))
        emittedCoreIds[originalCoreId] = nextEmittedCoreId++;
    }
  }

  // Second pass: emit one instruction stream and one weight directory per core.
  for (Operation* op : coreLikeOps) {
    // Batch cores are temporarily expanded into one scalar core per lane;
    // the materialized cores are erased at the end of this iteration.
    SmallVector<pim::PimCoreOp> scalarCores;
    if (auto coreOp = dyn_cast<pim::PimCoreOp>(op)) {
      scalarCores.push_back(coreOp);
    }
    else {
      auto coreBatchOp = cast<pim::PimCoreBatchOp>(op);
      for (unsigned lane = 0; lane < static_cast<unsigned>(coreBatchOp.getLaneCount()); ++lane)
        scalarCores.push_back(materializeScalarCoreFromBatchLane(coreBatchOp, lane));
    }

    for (pim::PimCoreOp coreOp : scalarCores) {
      size_t originalCoreId = static_cast<size_t>(coreOp.getCoreId());
      size_t coreId = emittedCoreIds.lookup(originalCoreId);
      maxCoreId = std::max(maxCoreId, coreId);

      std::error_code errorCode;
      auto outputCorePath = outputDirPath + "/core_" + std::to_string(coreId) + ".json";
      raw_fd_ostream coreFileStream(outputCorePath, errorCode);
      if (errorCode) {
        errs() << "Error while opening core file `" << outputCorePath << "`: " << errorCode.message() << '\n';
        return InvalidOutputFileAccess;
      }
      coreFileStream << '[';

      PimCodeGen coreCodeGen(memory, coreFileStream, emittedCoreIds);
      aliasMaterializedHostGlobals(moduleOp, funcOp, coreOp, memory);
      memory.getOrCreateDeviceMem(coreId).allocateCore(coreOp);
      memory.reportCore(coreId);

      int64_t processedOperations = codeGenCoreOps(coreOp.getBody().front(), coreCodeGen);
      if (processedOperations < 0)
        return CompilerFailure;
      assert(processedOperations > 0 && "Expected at least one emitted instruction");

      // The code generator leaves a trailing separator after the last emitted
      // instruction; back up one character before closing the array. Guard
      // the seek so that, in release builds (where the assert above is
      // compiled out), an empty core still yields valid JSON ("[]") instead
      // of erasing the opening '['.
      if (processedOperations > 0)
        coreFileStream.seek(coreFileStream.tell() - 1);
      coreFileStream << ']';
      coreFileStream.close();

      auto coreWeightsDirPath = outputDirPath + "/core_" + std::to_string(coreId);
      if (auto error = sys::fs::create_directory(coreWeightsDirPath)) {
        errs() << "Error creating core directory: " << coreWeightsDirPath << ": " << error.message() << '\n';
        return InvalidOutputFileAccess;
      }

      // Link each used weight from the shared weights/ folder into this
      // core's directory under its crossbar index, and record the index in
      // the array-group map.
      auto& mapWeightToFile = mapCoreWeightToFileName[originalCoreId];
      json::Array xbarsPerGroup;
      for (unsigned index : getUsedWeightIndices(coreOp)) {
        if (index >= coreOp.getWeights().size()) {
          coreOp.emitWarning("Weight index " + std::to_string(index) + " is out of range");
          assert(index < coreOp.getWeights().size() && "Weight index is out of range");
        }
        mlir::Value weight = coreOp.getWeights()[index];
        xbarsPerGroup.push_back(index);
        assert(mapWeightToFile.contains(weight) && "Weight was not materialized into a file!!");
        auto& fileName = mapWeightToFile[weight];
        if (auto error = sys::fs::create_link(outputDirPath + "/weights/" + fileName,
                coreWeightsDirPath + "/crossbar_" + std::to_string(index) + ".bin")) {
          errs() << "Error creating link file: " << (outputDirPath + "/weights/" + fileName) << " to "
                 << (coreWeightsDirPath + "/crossbar_" + std::to_string(index) + ".bin")
                 << "\nError:" << error.message() << '\n';
          return InvalidOutputFileAccess;
        }
      }

      xbarsPerArrayGroup["core" + std::to_string(coreId)] = std::move(xbarsPerGroup);
    }

    // Drop the temporary scalar cores materialized from batch lanes, first
    // cleaning any memory-table entries that reference their ops.
    for (pim::PimCoreOp coreOp : scalarCores)
      if (coreOp.getOperation() != op) {
        coreOp.walk([&memory](Operation* op) { memory.clean(op); });
        coreOp.erase();
      }
  }

  return writeConfigJson(funcOp, memory, maxCoreId, std::move(xbarsPerArrayGroup), outputDirPath);
}
|