Speed up PIM VerificationPass.cpp and PIM code emission
Validate Operations / validate-operations (push) Has been cancelled
Validate Operations / validate-operations (push) Has been cancelled
This commit is contained in:
@@ -16,7 +16,6 @@ add_pim_library(OMPimCompilerOptions
|
||||
add_pim_library(OMPimCompilerUtils
|
||||
PimCompilerUtils.cpp
|
||||
PimArtifactWriter.cpp
|
||||
PimBatchEmission.cpp
|
||||
PimCodeGen.cpp
|
||||
PimWeightEmitter.cpp
|
||||
|
||||
|
||||
@@ -48,7 +48,7 @@ writeMemoryBinary(ModuleOp moduleOp, func::FuncOp funcOp, PimAcceleratorMemory&
|
||||
if (!denseAttr)
|
||||
return;
|
||||
|
||||
MemEntry memEntry = memory.hostMem.getMemEntry(getGlobalOp.getResult());
|
||||
MemEntry memEntry = memory.hostMem.getMemEntry({getGlobalOp.getResult(), std::nullopt});
|
||||
ArrayRef<char> rawData = denseAttr.getRawData();
|
||||
char* dst = memoryBuffer.data() + memEntry.address;
|
||||
|
||||
|
||||
@@ -1,193 +0,0 @@
|
||||
#include "mlir/Dialect/Arith/IR/Arith.h"
|
||||
#include "mlir/Dialect/SCF/IR/SCF.h"
|
||||
#include "mlir/IR/Builders.h"
|
||||
#include "mlir/IR/BuiltinOps.h"
|
||||
#include "mlir/IR/IRMapping.h"
|
||||
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
|
||||
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimBatchEmission.hpp"
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
namespace onnx_mlir {
|
||||
namespace {
|
||||
|
||||
/// Returns a copy of the core ids stored on `coreBatchOp` under the
/// `kCoreIdsAttrName` dense i32 array attribute.
///
/// Asserts that the attribute is present on the operation.
static SmallVector<int32_t> getBatchCoreIds(pim::PimCoreBatchOp coreBatchOp) {
  auto idsAttr = coreBatchOp->getAttrOfType<DenseI32ArrayAttr>(onnx_mlir::kCoreIdsAttrName);
  assert(idsAttr && "pim.core_batch requires coreIds array attribute");

  // Copy out of the attribute storage so the caller owns the ids.
  ArrayRef<int32_t> ids = idsAttr.asArrayRef();
  SmallVector<int32_t> ownedIds;
  ownedIds.append(ids.begin(), ids.end());
  return ownedIds;
}
|
||||
|
||||
/// Extracts the core ids belonging to one lane from a lane-interleaved list.
///
/// `coreIds` is read as consecutive chunks of `laneCount` entries; the result
/// holds entry `lane` of every complete chunk, in chunk order. Trailing entries
/// that do not form a complete chunk are ignored.
///
/// \param coreIds   Flat list of core ids, `laneCount` entries per chunk.
/// \param laneCount Number of lanes per chunk; must be non-zero.
/// \param lane      Lane to extract; must be smaller than `laneCount`.
/// \returns One core id per complete chunk (empty when `laneCount` is zero).
static SmallVector<int32_t> getLaneChunkCoreIds(ArrayRef<int32_t> coreIds, size_t laneCount, unsigned lane) {
  assert(laneCount > 0 && "lane count must be non-zero");
  assert(lane < laneCount && "lane index out of range");
  // Keep release builds away from a division by zero.
  if (laneCount == 0)
    return {};

  // Loop-invariant: number of complete chunks in the list.
  const size_t chunkCount = coreIds.size() / laneCount;

  SmallVector<int32_t> laneCoreIds;
  laneCoreIds.reserve(chunkCount);
  for (size_t chunkIndex = 0; chunkIndex < chunkCount; ++chunkIndex)
    laneCoreIds.push_back(coreIds[chunkIndex * laneCount + lane]);
  return laneCoreIds;
}
|
||||
|
||||
/// Returns the value that `value` maps to in `mapper`, cloning its defining
/// operation (and, recursively, the producers of that operation's operands) at
/// the builder's insertion point when no mapping exists yet.
///
/// Values that are block arguments, or that are defined inside `oldBlock`, are
/// expected to be mapped already; the asserts below enforce this.
static Value getOrCloneCapturedValue(OpBuilder& builder, Block& oldBlock, Value value, IRMapping& mapper) {
  // Fast path: the value has been mapped (or cloned) before.
  Value known = mapper.lookupOrNull(value);
  if (known)
    return known;

  // An unmapped block argument cannot be materialized by cloning.
  if (auto argument = dyn_cast<BlockArgument>(value)) {
    assert(argument.getOwner() != &oldBlock && "expected block argument to be mapped before cloning");
    assert(false && "unexpected captured block argument while scalarizing pim.core_batch");
  }

  Operation* producer = value.getDefiningOp();
  assert(producer && "expected captured value to be defined by an operation");
  assert(producer->getBlock() != &oldBlock && "expected in-block value to be mapped before cloning");

  // Make sure every operand of the producer is available before cloning it.
  for (Value input : producer->getOperands())
    (void) getOrCloneCapturedValue(builder, oldBlock, input, mapper);

  Operation* copy = builder.clone(*producer, mapper);
  for (unsigned resultIndex = 0, resultCount = producer->getNumResults(); resultIndex < resultCount; ++resultIndex)
    mapper.map(producer->getResult(resultIndex), copy->getResult(resultIndex));
  return mapper.lookup(value);
}
|
||||
|
||||
/// Clones the body of `coreBatchOp` for a single `lane` at the builder's
/// insertion point, replacing batch communication/copy ops with their scalar
/// counterparts.
///
/// Block arguments of the batch body are mapped as follows:
///  - index-typed arguments map to a host index constant holding `lane`;
///  - arguments at positions 1..weightCount map to the weight arguments of the
///    op that owns the insertion block (cast to `pim::PimCoreOp`);
///  - the remaining arguments map to the inputs of `coreBatchOp`.
///
/// `pim.halt` ops are skipped. Every other op is cloned with the mapping.
///
/// \param builder        Builder positioned inside the body of the scalar core.
/// \param coreBatchOp    Batch op whose first body block is cloned.
/// \param lane           Lane to scalarize; must be below the op's lane count.
/// \param constantFolder Folder passed to `getOrCreateHostIndexConstant`.
static void cloneScalarizedLaneBody(OpBuilder& builder,
                                    pim::PimCoreBatchOp coreBatchOp,
                                    unsigned lane,
                                    OperationFolder& constantFolder) {
  Block& oldBlock = coreBatchOp.getBody().front();
  // Operation owning the block we are inserting into.
  Operation* anchorOp = builder.getInsertionBlock()->getParentOp();
  size_t laneCount = static_cast<size_t>(coreBatchOp.getLaneCount());
  size_t weightCount = coreBatchOp.getWeights().size();
  assert(lane < laneCount && "batch lane index out of range");

  IRMapping mapper;
  for (auto [argIndex, blockArg] : llvm::enumerate(oldBlock.getArguments())) {
    if (blockArg.getType().isIndex()) {
      mapper.map(blockArg, getOrCreateHostIndexConstant(anchorOp, static_cast<int64_t>(lane), constantFolder));
      continue;
    }

    if (argIndex <= weightCount) {
      // `argIndex - 1` below is unsigned; position 0 must have been consumed by the index-typed branch above.
      assert(argIndex > 0 && "expected the leading pim.core_batch block argument to be index-typed");
      auto scalarCoreOp = cast<pim::PimCoreOp>(anchorOp);
      mapper.map(blockArg, scalarCoreOp.getWeightArgument(argIndex - 1));
      continue;
    }

    size_t inputIndex = argIndex - 1 - weightCount;
    assert(inputIndex < coreBatchOp.getInputs().size() && "pim.core_batch block input index out of range");
    mapper.map(blockArg, coreBatchOp.getInputs()[inputIndex]);
  }

  for (Operation& op : oldBlock) {
    // The caller appends the terminator after all lanes have been cloned.
    if (isa<pim::PimHaltOp>(op))
      continue;

    // Materialize values captured from outside the batch body before using them.
    for (Value operand : op.getOperands())
      (void) getOrCloneCapturedValue(builder, oldBlock, operand, mapper);

    if (auto sendBatchOp = dyn_cast<pim::PimSendBatchOp>(op)) {
      // Scalar send targets the core id recorded for this lane.
      pim::PimSendOp::create(
          builder,
          sendBatchOp.getLoc(),
          mapper.lookup(sendBatchOp.getInput()),
          sendBatchOp.getSizeAttr(),
          getOrCreateHostIndexConstant(anchorOp, sendBatchOp.getTargetCoreIds()[lane], constantFolder));
      continue;
    }

    if (auto sendTensorBatchOp = dyn_cast<pim::PimSendTensorBatchOp>(op)) {
      // Tensor sends carry one target per chunk; pick this lane's entry of each chunk.
      pim::PimSendTensorOp::create(
          builder,
          sendTensorBatchOp.getLoc(),
          mapper.lookup(sendTensorBatchOp.getInput()),
          builder.getDenseI32ArrayAttr(getLaneChunkCoreIds(sendTensorBatchOp.getTargetCoreIds(), laneCount, lane)));
      continue;
    }

    if (auto receiveBatchOp = dyn_cast<pim::PimReceiveBatchOp>(op)) {
      auto scalarReceive = pim::PimReceiveOp::create(
          builder,
          receiveBatchOp.getLoc(),
          receiveBatchOp.getOutput().getType(),
          mapper.lookup(receiveBatchOp.getOutputBuffer()),
          receiveBatchOp.getSizeAttr(),
          getOrCreateHostIndexConstant(anchorOp, receiveBatchOp.getSourceCoreIds()[lane], constantFolder));
      mapper.map(receiveBatchOp.getOutput(), scalarReceive.getOutput());
      continue;
    }

    if (auto receiveTensorBatchOp = dyn_cast<pim::PimReceiveTensorBatchOp>(op)) {
      auto scalarReceive = pim::PimReceiveTensorOp::create(
          builder,
          receiveTensorBatchOp.getLoc(),
          receiveTensorBatchOp.getOutput().getType(),
          mapper.lookup(receiveTensorBatchOp.getOutputBuffer()),
          builder.getDenseI32ArrayAttr(getLaneChunkCoreIds(receiveTensorBatchOp.getSourceCoreIds(), laneCount, lane)));
      mapper.map(receiveTensorBatchOp.getOutput(), scalarReceive.getOutput());
      continue;
    }

    if (auto memcpBatchOp = dyn_cast<pim::PimMemCopyHostToDevBatchOp>(op)) {
      auto scalarCopy = pim::PimMemCopyHostToDevOp::create(
          builder,
          memcpBatchOp.getLoc(),
          memcpBatchOp.getOutput().getType(),
          getOrCreateHostIndexConstant(anchorOp, memcpBatchOp.getDeviceTargetOffset(), constantFolder),
          getOrCreateHostIndexConstant(anchorOp, memcpBatchOp.getHostSourceOffset(), constantFolder),
          mapper.lookup(memcpBatchOp.getDeviceTarget()),
          mapper.lookup(memcpBatchOp.getHostSource()),
          memcpBatchOp.getSizeAttr());
      mapper.map(memcpBatchOp.getOutput(), scalarCopy.getOutput());
      continue;
    }

    // Any other op is cloned as-is with operands remapped through `mapper`.
    Operation* cloned = builder.clone(op, mapper);
    for (auto [originalResult, clonedResult] : llvm::zip(op.getResults(), cloned->getResults()))
      mapper.map(originalResult, clonedResult);
  }
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/// Builds a temporary scalar `pim.core` holding the scalarized bodies of the
/// given `lanes` of `coreBatchOp` and hands it to `callback`.
///
/// The scalar core is created inside a scratch module owned by this function,
/// so the op passed to `callback` is destroyed when this function returns.
/// All lanes must carry the same core id. A `pim.halt` is appended when the
/// cloned body does not already end with one.
///
/// \param coreBatchOp Batch op to scalarize.
/// \param lanes       Non-empty list of lane indices, each within the op's core id list.
/// \param callback    Invoked with the temporary scalar core.
/// \returns The callback's result, or failure when `lanes` is empty.
LogicalResult withScalarCoreFromBatchLanes(pim::PimCoreBatchOp coreBatchOp,
                                           ArrayRef<unsigned> lanes,
                                           llvm::function_ref<LogicalResult(pim::PimCoreOp)> callback) {
  assert(!lanes.empty() && "expected at least one batch lane");
  // Release builds must not call front() on an empty list.
  if (lanes.empty())
    return failure();

  OwningOpRef<ModuleOp> scratchModule = ModuleOp::create(coreBatchOp.getLoc());
  OpBuilder builder(scratchModule->getContext());
  OperationFolder constantFolder(scratchModule->getContext());
  builder.setInsertionPointToStart(scratchModule->getBody());

  SmallVector<Value> weights(coreBatchOp.getWeights().begin(), coreBatchOp.getWeights().end());
  auto coreIds = getBatchCoreIds(coreBatchOp);
  assert(llvm::all_of(lanes, [&](unsigned lane) { return lane < coreIds.size(); })
         && "batch lane index out of range");
  int32_t coreId = coreIds[lanes.front()];
  // Single assert expression: leaves no unused loop variable under NDEBUG.
  assert(llvm::all_of(lanes, [&](unsigned lane) { return coreIds[lane] == coreId; })
         && "all grouped lanes must target the same core");

  auto scalarCore =
      pim::PimCoreOp::create(builder, coreBatchOp.getLoc(), ValueRange(weights), builder.getI32IntegerAttr(coreId));

  // The scalar core body takes one block argument per weight.
  SmallVector<Type> weightTypes;
  SmallVector<Location> weightLocs;
  weightTypes.reserve(weights.size());
  weightLocs.reserve(weights.size());
  for (Value weight : weights) {
    weightTypes.push_back(weight.getType());
    weightLocs.push_back(weight.getLoc());
  }
  Block* block =
      builder.createBlock(&scalarCore.getBody(), scalarCore.getBody().end(), TypeRange(weightTypes), weightLocs);
  builder.setInsertionPointToEnd(block);

  for (unsigned lane : lanes)
    cloneScalarizedLaneBody(builder, coreBatchOp, lane, constantFolder);

  // Lane cloning skips pim.halt, so terminate the block here.
  if (block->empty() || !isa<pim::PimHaltOp>(block->back()))
    pim::PimHaltOp::create(builder, coreBatchOp.getLoc());
  return callback(scalarCore);
}
|
||||
|
||||
/// Single-lane convenience wrapper: scalarizes only `lane` of `coreBatchOp`
/// and forwards to withScalarCoreFromBatchLanes, returning its result.
LogicalResult withScalarCoreFromBatchLane(pim::PimCoreBatchOp coreBatchOp,
                                          unsigned lane,
                                          llvm::function_ref<LogicalResult(pim::PimCoreOp)> callback) {
  // One-element lane list; it outlives the call below.
  const unsigned singleLane[] = {lane};
  return withScalarCoreFromBatchLanes(coreBatchOp, ArrayRef<unsigned>(singleLane), callback);
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
@@ -1,16 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "llvm/ADT/STLFunctionalExtras.h"
|
||||
|
||||
#include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp"
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
/// Scalarizes a single `lane` of `coreBatchOp` into a temporary `pim.core`
/// and invokes `callback` on it; returns the callback's result.
mlir::LogicalResult withScalarCoreFromBatchLane(pim::PimCoreBatchOp coreBatchOp,
|
||||
unsigned lane,
|
||||
llvm::function_ref<mlir::LogicalResult(pim::PimCoreOp)> callback);
|
||||
/// Scalarizes the given `lanes` of `coreBatchOp` into one temporary `pim.core`
/// and invokes `callback` on it; returns the callback's result.
/// `lanes` must be non-empty.
mlir::LogicalResult withScalarCoreFromBatchLanes(pim::PimCoreBatchOp coreBatchOp,
|
||||
llvm::ArrayRef<unsigned> lanes,
|
||||
llvm::function_ref<mlir::LogicalResult(pim::PimCoreOp)> callback);
|
||||
|
||||
} // namespace onnx_mlir
|
||||
+691
-251
File diff suppressed because it is too large
Load Diff
@@ -4,13 +4,16 @@
|
||||
|
||||
#include "llvm-project/clang/include/clang/Basic/LLVM.h"
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
#include "llvm/ADT/Hashing.h"
|
||||
#include "llvm/Support/JSON.h"
|
||||
#include "llvm/Support/raw_os_ostream.h"
|
||||
|
||||
#include <fstream>
|
||||
#include <limits>
|
||||
#include <optional>
|
||||
|
||||
#include "onnx-mlir/Compiler/OMCompilerTypes.h"
|
||||
#include "src/Accelerators/PIM/Common/IR/AddressAnalysis.hpp"
|
||||
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Common/Support/ReportUtils.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimBinaryFormat.hpp"
|
||||
@@ -23,6 +26,13 @@ struct MemEntry {
|
||||
size_t size;
|
||||
};
|
||||
|
||||
struct MemoryValueKey {
|
||||
mlir::Value value;
|
||||
std::optional<unsigned> lane;
|
||||
|
||||
bool operator==(const MemoryValueKey& other) const { return value == other.value && lane == other.lane; }
|
||||
};
|
||||
|
||||
struct MemoryReportRow {
|
||||
uint64_t numAlloca = 0;
|
||||
uint64_t sizeAlloca = 0;
|
||||
@@ -50,33 +60,33 @@ struct MemoryReportEntry {
|
||||
};
|
||||
|
||||
class PimMemory {
|
||||
llvm::SmallVector<std::pair<MemEntry, mlir::Value>, 32> memEntries;
|
||||
llvm::SmallDenseMap<mlir::Value, MemEntry, 32>& globalMemEntriesMap;
|
||||
llvm::SmallDenseMap<mlir::Value, MemEntry, 32> ownedMemEntriesMap;
|
||||
llvm::SmallVector<std::pair<MemEntry, MemoryValueKey>, 32> memEntries;
|
||||
llvm::SmallDenseMap<MemoryValueKey, MemEntry, 32>& globalMemEntriesMap;
|
||||
llvm::SmallDenseMap<MemoryValueKey, MemEntry, 32> ownedMemEntriesMap;
|
||||
|
||||
size_t minAlignment = 4;
|
||||
size_t firstAvailableAddress = 0;
|
||||
|
||||
MemEntry* gatherMemEntry(mlir::Value value);
|
||||
MemEntry* gatherMemEntry(mlir::Value value, std::optional<unsigned> lane = std::nullopt);
|
||||
void allocateGatheredMemory();
|
||||
void allocateMemoryForValue(mlir::Value value, MemEntry& memEntry);
|
||||
void allocateMemoryForValue(const MemoryValueKey& key, MemEntry& memEntry);
|
||||
|
||||
public:
|
||||
PimMemory(llvm::SmallDenseMap<mlir::Value, MemEntry, 32>& globalMemEntriesMap)
|
||||
PimMemory(llvm::SmallDenseMap<MemoryValueKey, MemEntry, 32>& globalMemEntriesMap)
|
||||
: globalMemEntriesMap(globalMemEntriesMap) {}
|
||||
|
||||
void allocateHost(mlir::ModuleOp moduleOp, mlir::func::FuncOp funcOp);
|
||||
void allocateCore(mlir::Operation* op);
|
||||
void allocateCore(mlir::Operation* op, std::optional<unsigned> lane = std::nullopt);
|
||||
MemoryReportRow getReportRow() const;
|
||||
void remove(mlir::Value val);
|
||||
|
||||
size_t getFirstAvailableAddress() const { return firstAvailableAddress; }
|
||||
MemEntry getMemEntry(mlir::Value value) const;
|
||||
MemEntry getMemEntry(const MemoryValueKey& key) const;
|
||||
};
|
||||
|
||||
class PimAcceleratorMemory {
|
||||
public:
|
||||
llvm::SmallDenseMap<mlir::Value, MemEntry, 32> memEntriesMap;
|
||||
llvm::SmallDenseMap<MemoryValueKey, MemEntry, 32> memEntriesMap;
|
||||
PimMemory hostMem;
|
||||
|
||||
private:
|
||||
@@ -84,14 +94,21 @@ private:
|
||||
std::fstream fileReport;
|
||||
std::optional<MemoryReportRow> hostReportRow;
|
||||
llvm::SmallVector<MemoryReportEntry, 32> reportEntries;
|
||||
mutable llvm::DenseMap<mlir::Value, CompiledIndexExpr> compiledIndexExprs;
|
||||
mutable llvm::DenseMap<mlir::Value, CompiledAddressExpr> compiledAddressExprs;
|
||||
|
||||
public:
|
||||
PimAcceleratorMemory()
|
||||
: hostMem(memEntriesMap), fileReport(openReportFile("memory_report")) {}
|
||||
PimAcceleratorMemory(const llvm::SmallDenseMap<MemoryValueKey, MemEntry, 32>& initialMemEntries, bool enableReport)
|
||||
: memEntriesMap(initialMemEntries), hostMem(memEntriesMap), fileReport(enableReport ? openReportFile("memory_report") : std::fstream()) {}
|
||||
|
||||
PimMemory& getOrCreateDeviceMem(size_t id);
|
||||
|
||||
size_t getValueAddress(mlir::Value value, const StaticValueKnowledge& knowledge = {}) const;
|
||||
size_t getValueAddress(mlir::Value value,
|
||||
const StaticValueKnowledge& knowledge = {},
|
||||
std::optional<unsigned> lane = std::nullopt) const;
|
||||
llvm::FailureOr<int64_t> getIndexValue(mlir::Value value, const StaticValueKnowledge& knowledge = {}) const;
|
||||
void reportHost();
|
||||
void recordCoreReport(size_t coreId, const MemoryReportRow& row);
|
||||
void recordBatchReport(uint64_t batchId,
|
||||
@@ -103,15 +120,24 @@ public:
|
||||
void clean(mlir::Operation* op);
|
||||
};
|
||||
|
||||
struct CoreEmissionJob {
|
||||
mlir::Operation* coreLikeOp = nullptr;
|
||||
size_t originalCoreId = 0;
|
||||
size_t emittedCoreId = 0;
|
||||
llvm::SmallVector<unsigned, 4> lanes;
|
||||
std::optional<uint64_t> batchReportId;
|
||||
};
|
||||
|
||||
class PimCodeGen {
|
||||
PimAcceleratorMemory& memory;
|
||||
llvm::raw_fd_ostream& coreBinaryStream;
|
||||
llvm::raw_fd_ostream* coreJsonStream;
|
||||
const llvm::DenseMap<size_t, size_t>& emittedCoreIds;
|
||||
std::optional<unsigned> batchLane;
|
||||
mutable uint32_t emittedInstructionCount = 0;
|
||||
|
||||
size_t addressOf(mlir::Value value, const StaticValueKnowledge& knowledge) const {
|
||||
return memory.getValueAddress(value, knowledge);
|
||||
return memory.getValueAddress(value, knowledge, batchLane);
|
||||
}
|
||||
size_t remapCoreId(size_t coreId) const;
|
||||
|
||||
@@ -141,6 +167,10 @@ public:
|
||||
: memory(memory), coreBinaryStream(coreBinary), coreJsonStream(coreJson), emittedCoreIds(emittedCoreIds) {}
|
||||
|
||||
uint32_t getEmittedInstructionCount() const { return emittedInstructionCount; }
|
||||
void setBatchLane(std::optional<unsigned> lane) { batchLane = lane; }
|
||||
llvm::FailureOr<int64_t> indexOf(mlir::Value value, const StaticValueKnowledge& knowledge) const {
|
||||
return memory.getIndexValue(value, knowledge);
|
||||
}
|
||||
|
||||
void codeGenLoadOp(pim::PimMemCopyHostToDevOp loadOp, const StaticValueKnowledge& knowledge) const;
|
||||
void codeGenLoadBatchOp(pim::PimMemCopyHostToDevBatchOp loadOp, const StaticValueKnowledge& knowledge) const;
|
||||
@@ -151,6 +181,14 @@ public:
|
||||
void codeGenReceiveTensorOp(pim::PimReceiveTensorOp receiveTensorOp, const StaticValueKnowledge& knowledge) const;
|
||||
void codeGenSendOp(pim::PimSendOp sendOp, const StaticValueKnowledge& knowledge) const;
|
||||
void codeGenSendTensorOp(pim::PimSendTensorOp sendTensorOp, const StaticValueKnowledge& knowledge) const;
|
||||
void codeGenReceiveBatchOp(pim::PimReceiveBatchOp receiveOp, unsigned lane, const StaticValueKnowledge& knowledge) const;
|
||||
void codeGenReceiveTensorBatchOp(pim::PimReceiveTensorBatchOp receiveOp,
|
||||
llvm::ArrayRef<int32_t> laneCoreIds,
|
||||
const StaticValueKnowledge& knowledge) const;
|
||||
void codeGenSendBatchOp(pim::PimSendBatchOp sendOp, unsigned lane, const StaticValueKnowledge& knowledge) const;
|
||||
void codeGenSendTensorBatchOp(pim::PimSendTensorBatchOp sendOp,
|
||||
llvm::ArrayRef<int32_t> laneCoreIds,
|
||||
const StaticValueKnowledge& knowledge) const;
|
||||
void codeGenConcatOp(pim::PimConcatOp concatOp, const StaticValueKnowledge& knowledge) const;
|
||||
|
||||
template <typename MVMTy>
|
||||
@@ -173,3 +211,24 @@ public:
|
||||
OnnxMlirCompilerErrorCodes compileToPimCode(mlir::ModuleOp& moduleOpRef, std::string& outputDirName);
|
||||
|
||||
} // namespace onnx_mlir
|
||||
|
||||
namespace llvm {
|
||||
|
||||
template <>
|
||||
struct DenseMapInfo<onnx_mlir::MemoryValueKey> {
|
||||
static onnx_mlir::MemoryValueKey getEmptyKey() {
|
||||
return {DenseMapInfo<mlir::Value>::getEmptyKey(), 0};
|
||||
}
|
||||
|
||||
static onnx_mlir::MemoryValueKey getTombstoneKey() {
|
||||
return {DenseMapInfo<mlir::Value>::getTombstoneKey(), 0};
|
||||
}
|
||||
|
||||
static unsigned getHashValue(const onnx_mlir::MemoryValueKey& key) {
|
||||
return hash_combine(key.value, key.lane.value_or(std::numeric_limits<unsigned>::max()));
|
||||
}
|
||||
|
||||
static bool isEqual(const onnx_mlir::MemoryValueKey& lhs, const onnx_mlir::MemoryValueKey& rhs) { return lhs == rhs; }
|
||||
};
|
||||
|
||||
} // namespace llvm
|
||||
|
||||
@@ -3,17 +3,19 @@
|
||||
#include "mlir/IR/BuiltinOps.h"
|
||||
#include "mlir/IR/BuiltinTypes.h"
|
||||
|
||||
#include "llvm/ADT/SmallSet.h"
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/Support/FileSystem.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <type_traits>
|
||||
|
||||
#include "Conversion/ONNXToSpatial/Common/Common.hpp"
|
||||
#include "src/Accelerators/PIM/Common/IR/BatchCoreUtils.hpp"
|
||||
#include "src/Accelerators/PIM/Common/IR/ShapeUtils.hpp"
|
||||
#include "src/Accelerators/PIM/Common/IR/SubviewUtils.hpp"
|
||||
#include "src/Accelerators/PIM/Common/IR/WeightUtils.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimBatchEmission.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimCodeGen.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimWeightEmitter.hpp"
|
||||
@@ -126,30 +128,6 @@ FailureOr<DenseWeightView> resolveDenseWeightView(ModuleOp moduleOp, mlir::Value
|
||||
return view;
|
||||
}
|
||||
|
||||
/// Collects the sorted, de-duplicated indices of the weights referenced by
/// `pim::PimVMMOp` operations found while walking `block`.
///
/// A weight counts only when it equals one of the weight arguments of the
/// `pim::PimCoreOp` that owns `block`; when the parent is not a PimCoreOp the
/// result is empty.
SmallVector<unsigned, 8> getUsedWeightIndices(Block& block) {
  SmallVector<unsigned, 8> usedIndices;
  auto parentCore = dyn_cast<pim::PimCoreOp>(block.getParentOp());

  block.walk([&](pim::PimVMMOp vmmOp) {
    if (!parentCore)
      return;

    mlir::Value usedWeight = vmmOp.getWeight();
    const unsigned weightTotal = parentCore.getWeights().size();
    for (unsigned candidate = 0; candidate != weightTotal; ++candidate) {
      if (parentCore.getWeightArgument(candidate) == usedWeight) {
        // Record the first matching index once, then stop searching.
        if (!llvm::is_contained(usedIndices, candidate))
          usedIndices.push_back(candidate);
        return;
      }
    }
  });

  llvm::sort(usedIndices);
  return usedIndices;
}
|
||||
|
||||
/// Convenience overload: inspects the first block of `coreOp`'s body.
SmallVector<unsigned, 8> getUsedWeightIndices(pim::PimCoreOp coreOp) {
  Block& entryBlock = coreOp.getBody().front();
  return getUsedWeightIndices(entryBlock);
}
|
||||
|
||||
SmallVector<Operation*> collectTopLevelCoreLikeOps(func::FuncOp funcOp) {
|
||||
SmallVector<Operation*> coreLikeOps;
|
||||
for (Operation& op : funcOp.getBody().front())
|
||||
@@ -171,86 +149,117 @@ createAndPopulateWeightFolder(func::FuncOp funcOp, StringRef outputDirPath) {
|
||||
int64_t xbarSize = crossbarSize.getValue();
|
||||
llvm::DenseMap<size_t, llvm::DenseMap<mlir::Value, std::string>> mapCoreWeightToFileName;
|
||||
llvm::DenseMap<memref::GlobalOp, std::string> mapGlobalOpToFileName;
|
||||
llvm::DenseMap<mlir::Value, std::string> mapWeightValueToFileName;
|
||||
|
||||
SmallVector<Operation*> coreLikeOps = collectTopLevelCoreLikeOps(funcOp);
|
||||
|
||||
for (Operation* op : coreLikeOps) {
|
||||
auto processCore = [&](pim::PimCoreOp coreOp) {
|
||||
size_t coreId = static_cast<size_t>(coreOp.getCoreId());
|
||||
for (unsigned index : getUsedWeightIndices(coreOp)) {
|
||||
if (index >= coreOp.getWeights().size()) {
|
||||
coreOp.emitWarning("Weight index " + std::to_string(index) + " is out of range");
|
||||
assert(index < coreOp.getWeights().size() && "Weight index is out of range");
|
||||
}
|
||||
mlir::Value weight = coreOp.getWeights()[index];
|
||||
auto processWeight = [&](Operation* ownerOp,
|
||||
mlir::Value weight,
|
||||
size_t weightIndex,
|
||||
size_t coreId) -> LogicalResult {
|
||||
auto weightView = resolveDenseWeightView(moduleOp, weight);
|
||||
if (failed(weightView)) {
|
||||
ownerOp->emitWarning("Weight is not from a memref.get_global at index " + std::to_string(weightIndex));
|
||||
assert(succeeded(weightView) && "Weight is not from a dense memref.global view");
|
||||
}
|
||||
|
||||
auto weightView = resolveDenseWeightView(moduleOp, weight);
|
||||
if (failed(weightView)) {
|
||||
coreOp.emitWarning("Weight is not from a memref.get_global at index " + std::to_string(index));
|
||||
assert(succeeded(weightView) && "Weight is not from a dense memref.global view");
|
||||
}
|
||||
if (mapCoreWeightToFileName[coreId].contains(weight))
|
||||
return success();
|
||||
|
||||
if (mapCoreWeightToFileName[coreId].contains(weight))
|
||||
continue;
|
||||
if (auto weightFile = mapWeightValueToFileName.find(weight); weightFile != mapWeightValueToFileName.end()) {
|
||||
mapCoreWeightToFileName[coreId].insert({weight, weightFile->second});
|
||||
return success();
|
||||
}
|
||||
|
||||
auto getGlobalOp = weight.getDefiningOp<memref::GetGlobalOp>();
|
||||
auto globalOp = getGlobalOp ? lookupGlobalForGetGlobal(moduleOp, getGlobalOp) : memref::GlobalOp {};
|
||||
if (globalOp && mapGlobalOpToFileName.contains(globalOp)) {
|
||||
auto& fileName = mapGlobalOpToFileName[globalOp];
|
||||
mapCoreWeightToFileName[coreId].insert({weight, fileName});
|
||||
continue;
|
||||
}
|
||||
auto getGlobalOp = weight.getDefiningOp<memref::GetGlobalOp>();
|
||||
auto globalOp = getGlobalOp ? lookupGlobalForGetGlobal(moduleOp, getGlobalOp) : memref::GlobalOp {};
|
||||
if (globalOp && mapGlobalOpToFileName.contains(globalOp)) {
|
||||
auto& fileName = mapGlobalOpToFileName[globalOp];
|
||||
mapWeightValueToFileName[weight] = fileName;
|
||||
mapCoreWeightToFileName[coreId].insert({weight, fileName});
|
||||
return success();
|
||||
}
|
||||
|
||||
DenseElementsAttr denseAttr = weightView->denseAttr;
|
||||
ArrayRef<int64_t> shape = weightView->shape;
|
||||
assert(isMatrixShape(shape) && "Weight matrix must be 2-dimensional");
|
||||
int64_t numRows = shape[0];
|
||||
int64_t numCols = shape[1];
|
||||
assert(numRows <= xbarSize && numCols <= xbarSize && "Weight dimensions must not exceed crossbar size");
|
||||
DenseElementsAttr denseAttr = weightView->denseAttr;
|
||||
ArrayRef<int64_t> shape = weightView->shape;
|
||||
assert(isMatrixShape(shape) && "Weight matrix must be 2-dimensional");
|
||||
int64_t numRows = shape[0];
|
||||
int64_t numCols = shape[1];
|
||||
assert(numRows <= xbarSize && numCols <= xbarSize && "Weight dimensions must not exceed crossbar size");
|
||||
|
||||
size_t elementByteWidth = getElementTypeSizeInBytes(denseAttr.getElementType());
|
||||
size_t elementByteWidth = getElementTypeSizeInBytes(denseAttr.getElementType());
|
||||
|
||||
std::string newFileName = "crossbar_" + std::to_string(indexFileName++) + ".bin";
|
||||
auto weightFilePath = (coreWeightsDirPath + "/" + newFileName).str();
|
||||
std::error_code errorCode;
|
||||
raw_fd_ostream weightFileStream(weightFilePath, errorCode, sys::fs::OF_None);
|
||||
if (errorCode) {
|
||||
errs() << "Error while opening weight file `" << weightFilePath << "`: " << errorCode.message() << '\n';
|
||||
assert(errorCode);
|
||||
}
|
||||
std::string newFileName = "crossbar_" + std::to_string(indexFileName++) + ".bin";
|
||||
auto weightFilePath = (coreWeightsDirPath + "/" + newFileName).str();
|
||||
std::error_code errorCode;
|
||||
raw_fd_ostream weightFileStream(weightFilePath, errorCode, sys::fs::OF_None);
|
||||
if (errorCode) {
|
||||
errs() << "Error while opening weight file `" << weightFilePath << "`: " << errorCode.message() << '\n';
|
||||
assert(errorCode);
|
||||
}
|
||||
|
||||
uint64_t zero = 0;
|
||||
for (int64_t row = 0; row < xbarSize; row++) {
|
||||
for (int64_t col = 0; col < xbarSize; col++) {
|
||||
if (row < numRows && col < numCols) {
|
||||
int64_t elementIndex = weightView->offset + row * weightView->strides[0] + col * weightView->strides[1];
|
||||
APInt bits = denseAttr.getValues<APFloat>()[elementIndex].bitcastToAPInt();
|
||||
uint64_t word = bits.getZExtValue();
|
||||
weightFileStream.write(reinterpret_cast<const char*>(&word), elementByteWidth);
|
||||
}
|
||||
else {
|
||||
weightFileStream.write(reinterpret_cast<const char*>(&zero), elementByteWidth);
|
||||
}
|
||||
uint64_t zero = 0;
|
||||
for (int64_t row = 0; row < xbarSize; row++) {
|
||||
for (int64_t col = 0; col < xbarSize; col++) {
|
||||
if (row < numRows && col < numCols) {
|
||||
int64_t elementIndex = weightView->offset + row * weightView->strides[0] + col * weightView->strides[1];
|
||||
APInt bits = denseAttr.getValues<APFloat>()[elementIndex].bitcastToAPInt();
|
||||
uint64_t word = bits.getZExtValue();
|
||||
weightFileStream.write(reinterpret_cast<const char*>(&word), elementByteWidth);
|
||||
}
|
||||
else {
|
||||
weightFileStream.write(reinterpret_cast<const char*>(&zero), elementByteWidth);
|
||||
}
|
||||
}
|
||||
|
||||
weightFileStream.close();
|
||||
if (globalOp)
|
||||
mapGlobalOpToFileName.insert({globalOp, newFileName});
|
||||
mapCoreWeightToFileName[coreId].insert({weight, newFileName});
|
||||
}
|
||||
|
||||
weightFileStream.close();
|
||||
if (globalOp)
|
||||
mapGlobalOpToFileName.insert({globalOp, newFileName});
|
||||
mapWeightValueToFileName[weight] = newFileName;
|
||||
mapCoreWeightToFileName[coreId].insert({weight, newFileName});
|
||||
return success();
|
||||
};
|
||||
|
||||
auto processCoreLike = [&](auto coreLikeOp) {
|
||||
auto usedIndices = getUsedWeightIndices(coreLikeOp);
|
||||
for (unsigned index : usedIndices) {
|
||||
if (index >= coreLikeOp.getWeights().size()) {
|
||||
coreLikeOp.emitWarning("Weight index " + std::to_string(index) + " is out of range");
|
||||
assert(index < coreLikeOp.getWeights().size() && "Weight index is out of range");
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<std::decay_t<decltype(coreLikeOp)>, pim::PimCoreOp>) {
|
||||
size_t coreId = static_cast<size_t>(coreLikeOp.getCoreId());
|
||||
for (unsigned index : usedIndices)
|
||||
if (failed(processWeight(coreLikeOp, coreLikeOp.getWeights()[index], index, coreId)))
|
||||
return failure();
|
||||
return success();
|
||||
}
|
||||
else {
|
||||
auto batchCoreIds = getBatchCoreIds(coreLikeOp);
|
||||
SmallVector<size_t> orderedCoreIds;
|
||||
llvm::SmallSet<size_t, 8> seenCoreIds;
|
||||
for (int32_t coreId : batchCoreIds)
|
||||
if (seenCoreIds.insert(static_cast<size_t>(coreId)).second)
|
||||
orderedCoreIds.push_back(static_cast<size_t>(coreId));
|
||||
|
||||
for (size_t coreId : orderedCoreIds)
|
||||
for (unsigned index : usedIndices)
|
||||
if (failed(processWeight(coreLikeOp, coreLikeOp.getWeights()[index], index, coreId)))
|
||||
return failure();
|
||||
return success();
|
||||
}
|
||||
};
|
||||
|
||||
if (auto coreOp = dyn_cast<pim::PimCoreOp>(op)) {
|
||||
(void) processCore(coreOp);
|
||||
(void) processCoreLike(coreOp);
|
||||
continue;
|
||||
}
|
||||
|
||||
auto coreBatchOp = cast<pim::PimCoreBatchOp>(op);
|
||||
for (unsigned lane = 0; lane < static_cast<unsigned>(coreBatchOp.getLaneCount()); ++lane)
|
||||
if (failed(withScalarCoreFromBatchLane(coreBatchOp, lane, processCore)))
|
||||
return mapCoreWeightToFileName;
|
||||
(void) processCoreLike(cast<pim::PimCoreBatchOp>(op));
|
||||
}
|
||||
return mapCoreWeightToFileName;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user