|
|
|
@@ -19,7 +19,7 @@ namespace pim {
|
|
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
|
|
/// Returns true when `op` is one of the memref operations whose results
/// getLastUseInstruction keeps following as further uses of the same
/// allocation: memref::SubViewOp, memref::CastOp, memref::CollapseShapeOp or
/// memref::ExpandShapeOp.
///
/// @param op Operation to classify; forwarded unchanged to `isa<>`.
/// @return true for the four op kinds listed above, false otherwise.
static bool isSupportedAliasOp(Operation *op) {
  return isa<memref::SubViewOp, memref::CastOp, memref::CollapseShapeOp, memref::ExpandShapeOp>(op);
}
|
|
|
|
|
|
|
|
|
@@ -32,32 +32,51 @@ static uint64_t getTypeSizeBytes(MemRefType type) {
|
|
|
|
|
return static_cast<uint64_t>(type.getNumElements() * getElementTypeSizeInBytes(type.getElementType()));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Starting at `op`, follows getParentOp() until it reaches an operation whose
// getBlock() is the given block, and returns that operation. Returns null when
// the parent chain ends before such an operation is found.
// NOTE(review): the signature, the `current` local and the loop header each
// appear twice below (`...InBody`/`body` and `...InBlock`/`block` spellings).
// This looks like both sides of a diff were kept — confirm which spelling is
// intended before relying on either name.
static Operation* getTopLevelAncestorInBody(Operation* op, Block& body) {
|
|
|
|
|
Operation* current = op;
|
|
|
|
|
while (current && current->getBlock() != &body)
|
|
|
|
|
static Operation *getTopLevelAncestorInBlock(Operation *op, Block &block) {
|
|
|
|
|
Operation *current = op;
|
|
|
|
|
while (current && current->getBlock() != &block)
|
|
|
|
|
current = current->getParentOp();
|
|
|
|
|
return current;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Declared ahead of its definition, which appears later in this file.
static void analyzeBlock(Block &block, MemoryCoalescingAnalysis &analysis);
|
|
|
|
|
|
|
|
|
|
static FailureOr<uint64_t>
|
|
|
|
|
getLastUseInstruction(memref::AllocOp allocOp, Block& body, const DenseMap<Operation*, uint64_t>& opOrder) {
|
|
|
|
|
getLastUseInstruction(memref::AllocOp allocOp, Block &scopeBlock, const DenseMap<Operation *, uint64_t> &opOrder) {
|
|
|
|
|
uint64_t endInstruction = opOrder.lookup(allocOp);
|
|
|
|
|
SmallPtrSet<Operation*, 16> visited;
|
|
|
|
|
SmallPtrSet<Value, 16> visitedValues;
|
|
|
|
|
SmallPtrSet<Operation *, 16> visitedUsers;
|
|
|
|
|
SmallVector<Value> pendingValues;
|
|
|
|
|
pendingValues.push_back(allocOp.getResult());
|
|
|
|
|
|
|
|
|
|
while (!pendingValues.empty()) {
|
|
|
|
|
Value value = pendingValues.pop_back_val();
|
|
|
|
|
for (Operation* user : value.getUsers()) {
|
|
|
|
|
Operation* orderedUser = getTopLevelAncestorInBody(user, body);
|
|
|
|
|
if (!orderedUser)
|
|
|
|
|
return failure();
|
|
|
|
|
if (!visited.insert(user).second)
|
|
|
|
|
if (!visitedValues.insert(value).second)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
for (Operation *user : value.getUsers()) {
|
|
|
|
|
if (!visitedUsers.insert(user).second)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
if (isSupportedAliasOp(user))
|
|
|
|
|
for (Value result : user->getResults())
|
|
|
|
|
pendingValues.push_back(result);
|
|
|
|
|
llvm::append_range(pendingValues, user->getResults());
|
|
|
|
|
|
|
|
|
|
if (auto dpsOp = dyn_cast<DestinationStyleOpInterface>(user)) {
|
|
|
|
|
for (OpResult result : user->getResults()) {
|
|
|
|
|
OpOperand *tiedOperand = dpsOp.getTiedOpOperand(result);
|
|
|
|
|
if (tiedOperand && tiedOperand->get() == value)
|
|
|
|
|
pendingValues.push_back(result);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (auto forOp = dyn_cast<scf::ForOp>(user)) {
|
|
|
|
|
for (auto [index, initArg] : llvm::enumerate(forOp.getInitArgs())) {
|
|
|
|
|
if (initArg != value)
|
|
|
|
|
continue;
|
|
|
|
|
pendingValues.push_back(forOp.getRegionIterArgs()[index]);
|
|
|
|
|
pendingValues.push_back(forOp.getResult(index));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (auto yieldOp = dyn_cast<scf::YieldOp>(user)) {
|
|
|
|
|
auto forOp = dyn_cast<scf::ForOp>(yieldOp->getParentOp());
|
|
|
|
@@ -68,20 +87,9 @@ getLastUseInstruction(memref::AllocOp allocOp, Block& body, const DenseMap<Opera
|
|
|
|
|
pendingValues.push_back(forOp.getResult(index));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (auto forOp = dyn_cast<scf::ForOp>(user)) {
|
|
|
|
|
for (auto [index, initArg] : llvm::enumerate(forOp.getInitArgs()))
|
|
|
|
|
if (initArg == value)
|
|
|
|
|
pendingValues.push_back(forOp.getResult(index));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (auto dpsOp = dyn_cast<DestinationStyleOpInterface>(user)) {
|
|
|
|
|
for (OpResult result : user->getResults()) {
|
|
|
|
|
OpOperand* tiedOperand = dpsOp.getTiedOpOperand(result);
|
|
|
|
|
if (!tiedOperand || tiedOperand->get() != value)
|
|
|
|
|
continue;
|
|
|
|
|
pendingValues.push_back(result);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
Operation *orderedUser = getTopLevelAncestorInBlock(user, scopeBlock);
|
|
|
|
|
if (!orderedUser)
|
|
|
|
|
return failure();
|
|
|
|
|
|
|
|
|
|
auto order = opOrder.find(orderedUser);
|
|
|
|
|
if (order == opOrder.end())
|
|
|
|
@@ -93,101 +101,126 @@ getLastUseInstruction(memref::AllocOp allocOp, Block& body, const DenseMap<Opera
|
|
|
|
|
return endInstruction;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} // namespace
|
|
|
|
|
/// Collects memory-coalescing candidates for `block` and for every block
/// nested below it, appending the results to `analysis`.
///
/// Nested blocks are analyzed first, so their entries are pushed to
/// `analysis.blocks` before the entry for `block` itself.
///
/// @param block    Block whose directly contained memref::AllocOp operations
///                 are examined.
/// @param analysis Accumulator: `skippedAllocations` is incremented and
///                 `blocks` is appended to.
static void analyzeBlock(Block &block, MemoryCoalescingAnalysis &analysis) {
  // Recurse into every block of every region owned by the operations here.
  for (Operation &op : block)
    for (Region &region : op.getRegions())
      for (Block &nestedBlock : region)
        analyzeBlock(nestedBlock, analysis);

  // Number the operations of this block in iteration order; the numbers are
  // the start/end instruction indices stored on each candidate.
  DenseMap<Operation *, uint64_t> opOrder;
  uint64_t nextInstruction = 0;
  for (Operation &op : block)
    opOrder.try_emplace(&op, nextInstruction++);

  MemoryCoalescingBlockAnalysis blockAnalysis;
  blockAnalysis.block = &block;

  for (Operation &op : block) {
    auto allocOp = dyn_cast<memref::AllocOp>(&op);
    if (!allocOp)
      continue;

    // Allocations whose type is rejected by isCandidateAllocType are only
    // counted, never recorded as candidates.
    auto allocType = dyn_cast<MemRefType>(allocOp.getType());
    if (!isCandidateAllocType(allocType)) {
      ++blockAnalysis.skippedAllocations;
      continue;
    }

    // A failed last-use lookup also turns the allocation into a skipped one.
    auto endInstruction = getLastUseInstruction(allocOp, block, opOrder);
    if (failed(endInstruction)) {
      ++blockAnalysis.skippedAllocations;
      continue;
    }

    blockAnalysis.candidates.push_back(
        AllocationCandidate {allocOp, &block, opOrder.lookup(allocOp), *endInstruction, getTypeSizeBytes(allocType)});
  }

  analysis.skippedAllocations += blockAnalysis.skippedAllocations;
  // Blocks with neither candidates nor skipped allocations leave no entry.
  if (!blockAnalysis.candidates.empty() || blockAnalysis.skippedAllocations != 0)
    analysis.blocks.push_back(std::move(blockAnalysis));
}
|
|
|
|
|
|
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
|
|
uint64_t MemoryCoalescingAnalysis::getCandidateCount() const {
|
|
|
|
|
uint64_t total = 0;
|
|
|
|
|
for (const MemoryCoalescingBlockAnalysis &block : blocks)
|
|
|
|
|
total += block.candidates.size();
|
|
|
|
|
return total;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Entry point of the analysis: runs analyzeBlock on the first block of the
/// only region of `coreLikeOp`.
///
/// @param coreLikeOp Operation to analyze; may be null.
/// @return The collected analysis. It is returned default-constructed when
///         `coreLikeOp` is null, does not have exactly one region, or that
///         region is empty.
MemoryCoalescingAnalysis analyzeMemoryCoalescingCandidates(Operation *coreLikeOp) {
  MemoryCoalescingAnalysis result;

  // Checked left to right, so a null op is never dereferenced.
  const bool hasSingleNonEmptyRegion =
      coreLikeOp && coreLikeOp->getNumRegions() == 1 && !coreLikeOp->getRegion(0).empty();
  if (hasSingleNonEmptyRegion)
    analyzeBlock(coreLikeOp->getRegion(0).front(), result);

  return result;
}
|
|
|
|
|
|
|
|
|
|
MemoryCoalescingStats
|
|
|
|
|
coalesceMemory(Operation* coreLikeOp, const MemoryCoalescingAnalysis& analysis, RewriterBase& rewriter) {
|
|
|
|
|
coalesceMemory(Operation *coreLikeOp, const MemoryCoalescingAnalysis &analysis, RewriterBase &rewriter) {
|
|
|
|
|
(void) coreLikeOp;
|
|
|
|
|
|
|
|
|
|
MemoryCoalescingStats stats;
|
|
|
|
|
stats.skippedAllocations = analysis.skippedAllocations;
|
|
|
|
|
|
|
|
|
|
auto candidates = analysis.candidates;
|
|
|
|
|
llvm::sort(candidates, [](const AllocationCandidate& lhs, const AllocationCandidate& rhs) {
|
|
|
|
|
if (lhs.startInstruction != rhs.startInstruction)
|
|
|
|
|
return lhs.startInstruction < rhs.startInstruction;
|
|
|
|
|
return lhs.endInstruction < rhs.endInstruction;
|
|
|
|
|
});
|
|
|
|
|
for (const MemoryCoalescingBlockAnalysis &blockAnalysis : analysis.blocks) {
|
|
|
|
|
auto candidates = blockAnalysis.candidates;
|
|
|
|
|
llvm::sort(candidates, [](const AllocationCandidate &lhs, const AllocationCandidate &rhs) {
|
|
|
|
|
if (lhs.startInstruction != rhs.startInstruction)
|
|
|
|
|
return lhs.startInstruction < rhs.startInstruction;
|
|
|
|
|
return lhs.endInstruction < rhs.endInstruction;
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
struct ActiveStorage {
|
|
|
|
|
memref::AllocOp root;
|
|
|
|
|
uint64_t endInstruction = 0;
|
|
|
|
|
};
|
|
|
|
|
struct ActiveStorage {
|
|
|
|
|
memref::AllocOp root;
|
|
|
|
|
uint64_t endInstruction = 0;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
SmallVector<ActiveStorage> active;
|
|
|
|
|
SmallVector<memref::AllocOp> freeList;
|
|
|
|
|
SmallVector<ActiveStorage> active;
|
|
|
|
|
SmallVector<memref::AllocOp> freeList;
|
|
|
|
|
|
|
|
|
|
for (AllocationCandidate& candidate : candidates) {
|
|
|
|
|
for (auto it = active.begin(); it != active.end();) {
|
|
|
|
|
if (it->endInstruction < candidate.startInstruction) {
|
|
|
|
|
freeList.push_back(it->root);
|
|
|
|
|
it = active.erase(it);
|
|
|
|
|
for (AllocationCandidate &candidate : candidates) {
|
|
|
|
|
for (auto it = active.begin(); it != active.end();) {
|
|
|
|
|
if (it->endInstruction < candidate.startInstruction) {
|
|
|
|
|
freeList.push_back(it->root);
|
|
|
|
|
it = active.erase(it);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
++it;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
auto bestFit = freeList.end();
|
|
|
|
|
uint64_t bestFitBytes = std::numeric_limits<uint64_t>::max();
|
|
|
|
|
auto candidateType = cast<MemRefType>(candidate.alloc.getType());
|
|
|
|
|
for (auto it = freeList.begin(); it != freeList.end(); ++it) {
|
|
|
|
|
auto freeType = cast<MemRefType>((*it).getType());
|
|
|
|
|
if (freeType != candidateType)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
uint64_t freeBytes = getTypeSizeBytes(freeType);
|
|
|
|
|
if (freeBytes < candidate.sizeBytes || freeBytes >= bestFitBytes)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
bestFit = it;
|
|
|
|
|
bestFitBytes = freeBytes;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (bestFit == freeList.end()) {
|
|
|
|
|
active.push_back(ActiveStorage {candidate.alloc, candidate.endInstruction});
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
++it;
|
|
|
|
|
|
|
|
|
|
memref::AllocOp root = *bestFit;
|
|
|
|
|
freeList.erase(bestFit);
|
|
|
|
|
candidate.alloc.getResult().replaceAllUsesWith(root.getResult());
|
|
|
|
|
rewriter.eraseOp(candidate.alloc);
|
|
|
|
|
active.push_back(ActiveStorage {root, candidate.endInstruction});
|
|
|
|
|
++stats.removedAllocs;
|
|
|
|
|
stats.savedBytes += candidate.sizeBytes;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
auto bestFit = freeList.end();
|
|
|
|
|
uint64_t bestFitBytes = std::numeric_limits<uint64_t>::max();
|
|
|
|
|
auto candidateType = cast<MemRefType>(candidate.alloc.getType());
|
|
|
|
|
for (auto it = freeList.begin(); it != freeList.end(); ++it) {
|
|
|
|
|
auto freeType = cast<MemRefType>((*it).getType());
|
|
|
|
|
if (freeType != candidateType)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
uint64_t freeBytes = getTypeSizeBytes(freeType);
|
|
|
|
|
if (freeBytes < candidate.sizeBytes || freeBytes >= bestFitBytes)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
bestFit = it;
|
|
|
|
|
bestFitBytes = freeBytes;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (bestFit == freeList.end()) {
|
|
|
|
|
active.push_back(ActiveStorage {candidate.alloc, candidate.endInstruction});
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
memref::AllocOp root = *bestFit;
|
|
|
|
|
freeList.erase(bestFit);
|
|
|
|
|
candidate.alloc.getResult().replaceAllUsesWith(root.getResult());
|
|
|
|
|
rewriter.eraseOp(candidate.alloc);
|
|
|
|
|
active.push_back(ActiveStorage {root, candidate.endInstruction});
|
|
|
|
|
++stats.removedAllocs;
|
|
|
|
|
stats.savedBytes += candidate.sizeBytes;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return stats;
|
|
|
|
|