Memory Liveness

This commit is contained in:
ilgeco
2026-06-03 18:15:30 +02:00
parent 2a8faf9c6b
commit 20cf40c9ba
15 changed files with 1263 additions and 112 deletions
@@ -35,7 +35,7 @@ FailureOr<Value> materializeContiguousInputMemRef(Value memrefValue, Location lo
}
Value allocateContiguousResultMemRefLike(Value memrefValue, Location loc, RewriterBase& rewriter) {
if (succeeded(resolveContiguousAddress(memrefValue)))
if (succeeded(resolveContiguousAddress(memrefValue)) || succeeded(compileContiguousAddressExpr(memrefValue)))
return memrefValue;
auto shapedType = cast<ShapedType>(memrefValue.getType());
@@ -19,7 +19,7 @@ namespace pim {
namespace {
static bool isSupportedAliasOp(Operation* op) {
// Returns true when `op` is one of the memref ops listed below
// (subview, cast, collapse_shape, expand_shape). The last-use walk in this
// file follows the results of such ops when computing an allocation's lifetime.
static bool isSupportedAliasOp(Operation *op) {
return isa<memref::SubViewOp, memref::CastOp, memref::CollapseShapeOp, memref::ExpandShapeOp>(op);
}
@@ -32,32 +32,51 @@ static uint64_t getTypeSizeBytes(MemRefType type) {
return static_cast<uint64_t>(type.getNumElements() * getElementTypeSizeInBytes(type.getElementType()));
}
static Operation* getTopLevelAncestorInBody(Operation* op, Block& body) {
Operation* current = op;
while (current && current->getBlock() != &body)
// Walks up the parent-op chain starting at `op` (inclusive) and returns the
// first operation whose containing block is `block`.
// Returns nullptr when the chain runs out before such an operation is found,
// i.e. `op` is not nested (directly or transitively) inside `block`.
static Operation *getTopLevelAncestorInBlock(Operation *op, Block &block) {
Operation *current = op;
// Stop either on a null parent or as soon as the current op sits in `block`.
while (current && current->getBlock() != &block)
current = current->getParentOp();
return current;
}
static void analyzeBlock(Block &block, MemoryCoalescingAnalysis &analysis);
static FailureOr<uint64_t>
getLastUseInstruction(memref::AllocOp allocOp, Block& body, const DenseMap<Operation*, uint64_t>& opOrder) {
getLastUseInstruction(memref::AllocOp allocOp, Block &scopeBlock, const DenseMap<Operation *, uint64_t> &opOrder) {
uint64_t endInstruction = opOrder.lookup(allocOp);
SmallPtrSet<Operation*, 16> visited;
SmallPtrSet<Value, 16> visitedValues;
SmallPtrSet<Operation *, 16> visitedUsers;
SmallVector<Value> pendingValues;
pendingValues.push_back(allocOp.getResult());
while (!pendingValues.empty()) {
Value value = pendingValues.pop_back_val();
for (Operation* user : value.getUsers()) {
Operation* orderedUser = getTopLevelAncestorInBody(user, body);
if (!orderedUser)
return failure();
if (!visited.insert(user).second)
if (!visitedValues.insert(value).second)
continue;
for (Operation *user : value.getUsers()) {
if (!visitedUsers.insert(user).second)
continue;
if (isSupportedAliasOp(user))
for (Value result : user->getResults())
pendingValues.push_back(result);
llvm::append_range(pendingValues, user->getResults());
if (auto dpsOp = dyn_cast<DestinationStyleOpInterface>(user)) {
for (OpResult result : user->getResults()) {
OpOperand *tiedOperand = dpsOp.getTiedOpOperand(result);
if (tiedOperand && tiedOperand->get() == value)
pendingValues.push_back(result);
}
}
if (auto forOp = dyn_cast<scf::ForOp>(user)) {
for (auto [index, initArg] : llvm::enumerate(forOp.getInitArgs())) {
if (initArg != value)
continue;
pendingValues.push_back(forOp.getRegionIterArgs()[index]);
pendingValues.push_back(forOp.getResult(index));
}
}
if (auto yieldOp = dyn_cast<scf::YieldOp>(user)) {
auto forOp = dyn_cast<scf::ForOp>(yieldOp->getParentOp());
@@ -68,20 +87,9 @@ getLastUseInstruction(memref::AllocOp allocOp, Block& body, const DenseMap<Opera
pendingValues.push_back(forOp.getResult(index));
}
if (auto forOp = dyn_cast<scf::ForOp>(user)) {
for (auto [index, initArg] : llvm::enumerate(forOp.getInitArgs()))
if (initArg == value)
pendingValues.push_back(forOp.getResult(index));
}
if (auto dpsOp = dyn_cast<DestinationStyleOpInterface>(user)) {
for (OpResult result : user->getResults()) {
OpOperand* tiedOperand = dpsOp.getTiedOpOperand(result);
if (!tiedOperand || tiedOperand->get() != value)
continue;
pendingValues.push_back(result);
}
}
Operation *orderedUser = getTopLevelAncestorInBlock(user, scopeBlock);
if (!orderedUser)
return failure();
auto order = opOrder.find(orderedUser);
if (order == opOrder.end())
@@ -93,101 +101,126 @@ getLastUseInstruction(memref::AllocOp allocOp, Block& body, const DenseMap<Opera
return endInstruction;
}
} // namespace
static void analyzeBlock(Block &block, MemoryCoalescingAnalysis &analysis) {
for (Operation &op : block)
for (Region &region : op.getRegions())
for (Block &nestedBlock : region)
analyzeBlock(nestedBlock, analysis);
MemoryCoalescingAnalysis analyzeMemoryCoalescingCandidates(Operation* coreLikeOp) {
MemoryCoalescingAnalysis analysis;
if (!coreLikeOp || coreLikeOp->getNumRegions() != 1 || coreLikeOp->getRegion(0).empty())
return analysis;
Block& body = coreLikeOp->getRegion(0).front();
DenseMap<Operation*, uint64_t> opOrder;
DenseMap<Operation *, uint64_t> opOrder;
uint64_t nextInstruction = 0;
for (Operation& op : body)
for (Operation &op : block)
opOrder.try_emplace(&op, nextInstruction++);
for (Operation& op : body) {
MemoryCoalescingBlockAnalysis blockAnalysis;
blockAnalysis.block = &block;
for (Operation &op : block) {
auto allocOp = dyn_cast<memref::AllocOp>(&op);
if (!allocOp)
continue;
auto allocType = dyn_cast<MemRefType>(allocOp.getType());
if (!isCandidateAllocType(allocType)) {
++analysis.skippedAllocations;
++blockAnalysis.skippedAllocations;
continue;
}
auto endInstruction = getLastUseInstruction(allocOp, body, opOrder);
auto endInstruction = getLastUseInstruction(allocOp, block, opOrder);
if (failed(endInstruction)) {
++analysis.skippedAllocations;
++blockAnalysis.skippedAllocations;
continue;
}
analysis.candidates.push_back(
AllocationCandidate {allocOp, opOrder.lookup(allocOp), *endInstruction, getTypeSizeBytes(allocType)});
blockAnalysis.candidates.push_back(
AllocationCandidate {allocOp, &block, opOrder.lookup(allocOp), *endInstruction, getTypeSizeBytes(allocType)});
}
analysis.skippedAllocations += blockAnalysis.skippedAllocations;
if (!blockAnalysis.candidates.empty() || blockAnalysis.skippedAllocations != 0)
analysis.blocks.push_back(std::move(blockAnalysis));
}
} // namespace
// Returns the total number of allocation candidates recorded across all
// per-block analyses stored in `blocks`.
uint64_t MemoryCoalescingAnalysis::getCandidateCount() const {
uint64_t total = 0;
for (const MemoryCoalescingBlockAnalysis &block : blocks)
total += block.candidates.size();
return total;
}
// Entry point of the analysis: collects coalescing candidates starting from
// the first block of `coreLikeOp`'s only region by delegating to analyzeBlock.
// Returns an empty analysis when `coreLikeOp` is null, does not have exactly
// one region, or that region contains no blocks.
MemoryCoalescingAnalysis analyzeMemoryCoalescingCandidates(Operation *coreLikeOp) {
MemoryCoalescingAnalysis analysis;
if (!coreLikeOp || coreLikeOp->getNumRegions() != 1 || coreLikeOp->getRegion(0).empty())
return analysis;
analyzeBlock(coreLikeOp->getRegion(0).front(), analysis);
return analysis;
}
MemoryCoalescingStats
coalesceMemory(Operation* coreLikeOp, const MemoryCoalescingAnalysis& analysis, RewriterBase& rewriter) {
coalesceMemory(Operation *coreLikeOp, const MemoryCoalescingAnalysis &analysis, RewriterBase &rewriter) {
(void) coreLikeOp;
MemoryCoalescingStats stats;
stats.skippedAllocations = analysis.skippedAllocations;
auto candidates = analysis.candidates;
llvm::sort(candidates, [](const AllocationCandidate& lhs, const AllocationCandidate& rhs) {
if (lhs.startInstruction != rhs.startInstruction)
return lhs.startInstruction < rhs.startInstruction;
return lhs.endInstruction < rhs.endInstruction;
});
for (const MemoryCoalescingBlockAnalysis &blockAnalysis : analysis.blocks) {
auto candidates = blockAnalysis.candidates;
llvm::sort(candidates, [](const AllocationCandidate &lhs, const AllocationCandidate &rhs) {
if (lhs.startInstruction != rhs.startInstruction)
return lhs.startInstruction < rhs.startInstruction;
return lhs.endInstruction < rhs.endInstruction;
});
struct ActiveStorage {
memref::AllocOp root;
uint64_t endInstruction = 0;
};
struct ActiveStorage {
memref::AllocOp root;
uint64_t endInstruction = 0;
};
SmallVector<ActiveStorage> active;
SmallVector<memref::AllocOp> freeList;
SmallVector<ActiveStorage> active;
SmallVector<memref::AllocOp> freeList;
for (AllocationCandidate& candidate : candidates) {
for (auto it = active.begin(); it != active.end();) {
if (it->endInstruction < candidate.startInstruction) {
freeList.push_back(it->root);
it = active.erase(it);
for (AllocationCandidate &candidate : candidates) {
for (auto it = active.begin(); it != active.end();) {
if (it->endInstruction < candidate.startInstruction) {
freeList.push_back(it->root);
it = active.erase(it);
continue;
}
++it;
}
auto bestFit = freeList.end();
uint64_t bestFitBytes = std::numeric_limits<uint64_t>::max();
auto candidateType = cast<MemRefType>(candidate.alloc.getType());
for (auto it = freeList.begin(); it != freeList.end(); ++it) {
auto freeType = cast<MemRefType>((*it).getType());
if (freeType != candidateType)
continue;
uint64_t freeBytes = getTypeSizeBytes(freeType);
if (freeBytes < candidate.sizeBytes || freeBytes >= bestFitBytes)
continue;
bestFit = it;
bestFitBytes = freeBytes;
}
if (bestFit == freeList.end()) {
active.push_back(ActiveStorage {candidate.alloc, candidate.endInstruction});
continue;
}
++it;
memref::AllocOp root = *bestFit;
freeList.erase(bestFit);
candidate.alloc.getResult().replaceAllUsesWith(root.getResult());
rewriter.eraseOp(candidate.alloc);
active.push_back(ActiveStorage {root, candidate.endInstruction});
++stats.removedAllocs;
stats.savedBytes += candidate.sizeBytes;
}
auto bestFit = freeList.end();
uint64_t bestFitBytes = std::numeric_limits<uint64_t>::max();
auto candidateType = cast<MemRefType>(candidate.alloc.getType());
for (auto it = freeList.begin(); it != freeList.end(); ++it) {
auto freeType = cast<MemRefType>((*it).getType());
if (freeType != candidateType)
continue;
uint64_t freeBytes = getTypeSizeBytes(freeType);
if (freeBytes < candidate.sizeBytes || freeBytes >= bestFitBytes)
continue;
bestFit = it;
bestFitBytes = freeBytes;
}
if (bestFit == freeList.end()) {
active.push_back(ActiveStorage {candidate.alloc, candidate.endInstruction});
continue;
}
memref::AllocOp root = *bestFit;
freeList.erase(bestFit);
candidate.alloc.getResult().replaceAllUsesWith(root.getResult());
rewriter.eraseOp(candidate.alloc);
active.push_back(ActiveStorage {root, candidate.endInstruction});
++stats.removedAllocs;
stats.savedBytes += candidate.sizeBytes;
}
return stats;
@@ -10,16 +10,25 @@ namespace pim {
// One memref.alloc that the analysis considers eligible for storage reuse,
// together with its lifetime expressed in per-block instruction indices.
struct AllocationCandidate {
// The allocation operation itself.
mlir::memref::AllocOp alloc;
// Block the allocation was analyzed in; the indices below are relative to it.
mlir::Block *scopeBlock = nullptr;
// Position of `alloc` in the block's operation order (ops are numbered from 0).
uint64_t startInstruction = 0;
// Last-use position as reported by getLastUseInstruction for this block.
uint64_t endInstruction = 0;
// Size of the allocated memref type: element count times element size in bytes.
uint64_t sizeBytes = 0;
};
struct MemoryCoalescingAnalysis {
// Analysis result for a single block.
struct MemoryCoalescingBlockAnalysis {
// The block these results belong to.
mlir::Block *block = nullptr;
// Allocations in `block` whose type is a candidate type and whose last use
// could be determined.
llvm::SmallVector<AllocationCandidate> candidates;
// Number of allocations in `block` that were rejected, either because of
// their type or because the last-use computation failed.
uint64_t skippedAllocations = 0;
};
// Aggregated analysis result for a whole core-like operation.
struct MemoryCoalescingAnalysis {
// Per-block results; a block is only recorded when it has at least one
// candidate or at least one skipped allocation.
llvm::SmallVector<MemoryCoalescingBlockAnalysis> blocks;
// Sum of the skipped-allocation counts of all analyzed blocks.
uint64_t skippedAllocations = 0;
// Total number of candidates across `blocks`.
uint64_t getCandidateCount() const;
};
struct MemoryCoalescingStats {
uint64_t removedAllocs = 0;
uint64_t savedBytes = 0;
@@ -23,9 +23,9 @@ using namespace onnx_mlir::compact_asm;
namespace onnx_mlir {
namespace {
// This pass assumes bufferization has already normalized executable PIM
// operands. It only reuses compatible local allocations with non-overlapping
// lifetimes; it does not repair memory contiguity.
// This pass is an IR cleanup step after bufferization. It only rewrites
// obviously compatible local allocations with non-overlapping lifetimes inside
// the same block and leaves the final physical memory planning to codegen.
struct CoalescingReportRow {
uint64_t numCandidates = 0;
@@ -174,7 +174,7 @@ struct PimMemoryCoalescingPass : PassWrapper<PimMemoryCoalescingPass, OperationP
auto analysis = pim::analyzeMemoryCoalescingCandidates(op);
auto stats = pim::coalesceMemory(op, analysis, rewriter);
CoalescingReportRow row {
analysis.candidates.size(), stats.skippedAllocations, stats.removedAllocs, stats.savedBytes};
analysis.getCandidateCount(), stats.skippedAllocations, stats.removedAllocs, stats.savedBytes};
if (auto coreOp = dyn_cast<pim::PimCoreOp>(op)) {
auto checkedCoreId =