|
|
|
@@ -5,8 +5,8 @@
|
|
|
|
|
#include "mlir/Interfaces/DestinationStyleOpInterface.h"
|
|
|
|
|
|
|
|
|
|
#include "llvm/ADT/DenseMap.h"
|
|
|
|
|
#include "llvm/ADT/SmallPtrSet.h"
|
|
|
|
|
#include "llvm/ADT/STLExtras.h"
|
|
|
|
|
#include "llvm/ADT/SmallPtrSet.h"
|
|
|
|
|
#include "llvm/Support/raw_ostream.h"
|
|
|
|
|
|
|
|
|
|
#include <numeric>
|
|
|
|
@@ -42,10 +42,10 @@ static MemoryValueKey getMemoryValueKey(mlir::Value value, std::optional<unsigne
|
|
|
|
|
struct MemoryTouchInterval {
|
|
|
|
|
uint64_t start = 0;
|
|
|
|
|
uint64_t end = 0;
|
|
|
|
|
Operation *startOp = nullptr;
|
|
|
|
|
Operation *endOp = nullptr;
|
|
|
|
|
Operation *firstTouchOp = nullptr;
|
|
|
|
|
Operation *lastTouchOp = nullptr;
|
|
|
|
|
Operation* startOp = nullptr;
|
|
|
|
|
Operation* endOp = nullptr;
|
|
|
|
|
Operation* firstTouchOp = nullptr;
|
|
|
|
|
Operation* lastTouchOp = nullptr;
|
|
|
|
|
uint64_t firstTouchPosition = 0;
|
|
|
|
|
uint64_t lastTouchPosition = 0;
|
|
|
|
|
bool hasRuntimeUse = false;
|
|
|
|
@@ -57,8 +57,8 @@ struct MemoryTouchInterval {
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
struct OperationOrdering {
|
|
|
|
|
llvm::DenseMap<Operation *, uint64_t> position;
|
|
|
|
|
llvm::DenseMap<Operation *, uint64_t> subtreeEnd;
|
|
|
|
|
llvm::DenseMap<Operation*, uint64_t> position;
|
|
|
|
|
llvm::DenseMap<Operation*, uint64_t> subtreeEnd;
|
|
|
|
|
uint64_t nextPosition = 0;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
@@ -70,7 +70,7 @@ static std::string printValueToString(mlir::Value value) {
|
|
|
|
|
return text;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static std::string printOperationToString(Operation *op) {
|
|
|
|
|
static std::string printOperationToString(Operation* op) {
|
|
|
|
|
if (!op)
|
|
|
|
|
return "<none>";
|
|
|
|
|
std::string text;
|
|
|
|
@@ -116,7 +116,7 @@ static std::string summarizeValue(mlir::Value value, size_t maxLen = 72) {
|
|
|
|
|
return abbreviate(collapseWhitespace(printValueToString(value)), maxLen);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static std::string summarizeOperation(Operation *op, size_t maxLen = 96) {
|
|
|
|
|
static std::string summarizeOperation(Operation* op, size_t maxLen = 96) {
|
|
|
|
|
if (!op)
|
|
|
|
|
return "<none>";
|
|
|
|
|
std::string prefix = op->getName().getStringRef().str();
|
|
|
|
@@ -130,34 +130,34 @@ static std::string summarizeLocation(Location loc, size_t maxLen = 88) {
|
|
|
|
|
return abbreviate(collapseWhitespace(printLocationToString(loc)), maxLen);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Numbers `op` and, recursively, every operation nested in its regions.
///
/// `op` receives the next free position before any nested operation is
/// visited, so a parent always has a smaller position than its children.
/// After the nested walk, `ordering.subtreeEnd[op]` holds the largest position
/// assigned anywhere in the subtree rooted at `op`.
///
/// @param op        Operation to number; must not be null (it is dereferenced).
/// @param ordering  Accumulator updated in place.
static void assignOperationOrdering(Operation* op, OperationOrdering& ordering) {
  uint64_t position = ordering.nextPosition++;
  ordering.position[op] = position;
  // An operation without nested operations ends at its own position.
  uint64_t end = position;
  for (Region& region : op->getRegions())
    for (Block& block : region)
      for (Operation& nestedOp : block) {
        assignOperationOrdering(&nestedOp, ordering);
        // The recursive call has just stored subtreeEnd for nestedOp.
        end = std::max(end, ordering.subtreeEnd.lookup(&nestedOp));
      }
  ordering.subtreeEnd[op] = end;
}
|
|
|
|
|
|
|
|
|
|
/// Builds the operation ordering for the body of `coreLikeOp`.
///
/// Only the operations of the first block of the op's single region (and
/// everything nested below them) are numbered; `coreLikeOp` itself receives no
/// position.
///
/// @return An empty ordering when `coreLikeOp` is null, does not have exactly
///         one region, or that region contains no blocks.
static OperationOrdering buildOperationOrdering(Operation* coreLikeOp) {
  OperationOrdering ordering;
  if (!coreLikeOp || coreLikeOp->getNumRegions() != 1 || coreLikeOp->getRegion(0).empty())
    return ordering;

  for (Operation& op : coreLikeOp->getRegion(0).front())
    assignOperationOrdering(&op, ordering);
  return ordering;
}
|
|
|
|
|
|
|
|
|
|
/// Returns true when `op` is one of the memref ops this file treats as an
/// alias producer: subview, cast, collapse_shape, or expand_shape.
static bool isSupportedAliasOp(Operation* op) {
  return isa<memref::SubViewOp, memref::CastOp, memref::CollapseShapeOp, memref::ExpandShapeOp>(op);
}
|
|
|
|
|
|
|
|
|
|
static bool isRuntimeMemoryTouchOp(Operation *op) {
|
|
|
|
|
static bool isRuntimeMemoryTouchOp(Operation* op) {
|
|
|
|
|
return isa<pim::PimMemCopyHostToDevOp,
|
|
|
|
|
pim::PimMemCopyDevToHostOp,
|
|
|
|
|
pim::PimMemCopyOp,
|
|
|
|
@@ -178,27 +178,27 @@ static bool isRuntimeMemoryTouchOp(Operation *op) {
|
|
|
|
|
pim::PimVSoftmaxOp>(op);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Returns true when `op` is an alias op accepted by isSupportedAliasOp, an
/// scf.for / scf.yield / memref.dealloc, or an op accepted by
/// isCoreStaticAddressOp.
static bool isIgnoredLivenessUser(Operation* op) {
  return isSupportedAliasOp(op) || isa<scf::ForOp, scf::YieldOp, memref::DeallocOp>(op) || isCoreStaticAddressOp(op);
}
|
|
|
|
|
|
|
|
|
|
/// Reports whether `value` is defined inside `region`.
///
/// - A block argument counts only when its owning block belongs directly to
///   `region`.
/// - An op result counts when the defining op's parent region is `region` or
///   `region` is an ancestor of it.
///
/// NOTE(review): block arguments of regions nested inside `region` are
/// reported as not within, unlike op results — confirm this asymmetry is
/// intended.
///
/// @return false when `region` is null or `value` matches neither case above.
static bool isWithin(mlir::Value value, Region* region) {
  if (!region)
    return false;
  if (auto blockArg = dyn_cast<BlockArgument>(value))
    return blockArg.getOwner()->getParent() == region;
  if (Operation* definingOp = value.getDefiningOp())
    return definingOp->getParentRegion() == region || region->isAncestor(definingOp->getParentRegion());
  return false;
}
|
|
|
|
|
|
|
|
|
|
/// Returns true when `allocOp` does not sit directly in the first block of
/// `coreLikeOp`'s single region.
///
/// @return false when `coreLikeOp` is null, does not have exactly one region,
///         or that region contains no blocks.
static bool isNestedAllocation(Operation* coreLikeOp, memref::AllocOp allocOp) {
  if (!coreLikeOp || coreLikeOp->getNumRegions() != 1 || coreLikeOp->getRegion(0).empty())
    return false;
  return allocOp->getBlock() != &coreLikeOp->getRegion(0).front();
}
|
|
|
|
|
|
|
|
|
|
static void addFallbackReason(std::string &reason, StringRef newReason) {
|
|
|
|
|
static void addFallbackReason(std::string& reason, StringRef newReason) {
|
|
|
|
|
if (newReason.empty())
|
|
|
|
|
return;
|
|
|
|
|
if (!reason.empty())
|
|
|
|
@@ -206,7 +206,7 @@ static void addFallbackReason(std::string &reason, StringRef newReason) {
|
|
|
|
|
reason += newReason.str();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void appendAliasDescription(llvm::SmallVectorImpl<std::string> &aliases, mlir::Value value) {
|
|
|
|
|
static void appendAliasDescription(llvm::SmallVectorImpl<std::string>& aliases, mlir::Value value) {
|
|
|
|
|
std::string text = printValueToString(value);
|
|
|
|
|
if (!llvm::is_contained(aliases, text))
|
|
|
|
|
aliases.push_back(std::move(text));
|
|
|
|
@@ -215,16 +215,15 @@ static void appendAliasDescription(llvm::SmallVectorImpl<std::string> &aliases,
|
|
|
|
|
struct OrderedTouchRange {
|
|
|
|
|
uint64_t start = 0;
|
|
|
|
|
uint64_t end = 0;
|
|
|
|
|
Operation *startOp = nullptr;
|
|
|
|
|
Operation *endOp = nullptr;
|
|
|
|
|
Operation* startOp = nullptr;
|
|
|
|
|
Operation* endOp = nullptr;
|
|
|
|
|
bool escapedLoop = false;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
static OrderedTouchRange
|
|
|
|
|
getEffectiveTouchRange(mlir::Value definingValue, Operation *user, const OperationOrdering &ordering) {
|
|
|
|
|
OrderedTouchRange range {
|
|
|
|
|
ordering.position.lookup(user), ordering.position.lookup(user), user, user, false};
|
|
|
|
|
for (Operation *current = user; current; current = current->getParentOp()) {
|
|
|
|
|
getEffectiveTouchRange(mlir::Value definingValue, Operation* user, const OperationOrdering& ordering) {
|
|
|
|
|
OrderedTouchRange range {ordering.position.lookup(user), ordering.position.lookup(user), user, user, false};
|
|
|
|
|
for (Operation* current = user; current; current = current->getParentOp()) {
|
|
|
|
|
auto forOp = dyn_cast<scf::ForOp>(current);
|
|
|
|
|
if (!forOp || isWithin(definingValue, &forOp.getRegion()))
|
|
|
|
|
continue;
|
|
|
|
@@ -238,7 +237,7 @@ getEffectiveTouchRange(mlir::Value definingValue, Operation *user, const Operati
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static MemoryTouchInterval
|
|
|
|
|
computeMemoryTouchInterval(memref::AllocOp allocOp, const OperationOrdering &ordering, uint64_t fallbackEnd) {
|
|
|
|
|
computeMemoryTouchInterval(memref::AllocOp allocOp, const OperationOrdering& ordering, uint64_t fallbackEnd) {
|
|
|
|
|
MemoryTouchInterval interval;
|
|
|
|
|
interval.start = ordering.position.lookup(allocOp);
|
|
|
|
|
interval.end = interval.start;
|
|
|
|
@@ -246,7 +245,7 @@ computeMemoryTouchInterval(memref::AllocOp allocOp, const OperationOrdering &ord
|
|
|
|
|
interval.endOp = allocOp;
|
|
|
|
|
|
|
|
|
|
SmallPtrSet<mlir::Value, 16> visitedValues;
|
|
|
|
|
SmallPtrSet<Operation *, 32> visitedUsers;
|
|
|
|
|
SmallPtrSet<Operation*, 32> visitedUsers;
|
|
|
|
|
SmallVector<mlir::Value> pendingValues;
|
|
|
|
|
pendingValues.push_back(allocOp.getResult());
|
|
|
|
|
auto parentLoop = allocOp->getParentOfType<scf::ForOp>();
|
|
|
|
@@ -256,7 +255,7 @@ computeMemoryTouchInterval(memref::AllocOp allocOp, const OperationOrdering &ord
|
|
|
|
|
if (!visitedValues.insert(value).second)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
for (Operation *user : value.getUsers()) {
|
|
|
|
|
for (Operation* user : value.getUsers()) {
|
|
|
|
|
if (!visitedUsers.insert(user).second)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
@@ -269,7 +268,7 @@ computeMemoryTouchInterval(memref::AllocOp allocOp, const OperationOrdering &ord
|
|
|
|
|
|
|
|
|
|
if (auto dpsOp = dyn_cast<DestinationStyleOpInterface>(user)) {
|
|
|
|
|
for (OpResult result : user->getResults()) {
|
|
|
|
|
OpOperand *tiedOperand = dpsOp.getTiedOpOperand(result);
|
|
|
|
|
OpOperand* tiedOperand = dpsOp.getTiedOpOperand(result);
|
|
|
|
|
if (!tiedOperand || tiedOperand->get() != value)
|
|
|
|
|
continue;
|
|
|
|
|
pendingValues.push_back(result);
|
|
|
|
@@ -379,11 +378,11 @@ static FailureOr<size_t> getAllocSizeBytes(memref::AllocOp allocOp) {
|
|
|
|
|
return pim::checkedSize(*checkedBytes, allocOp, "memory allocation byte size");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static bool intervalsOverlap(const LocalAllocInterval &lhs, const LocalAllocInterval &rhs) {
|
|
|
|
|
static bool intervalsOverlap(const LocalAllocInterval& lhs, const LocalAllocInterval& rhs) {
|
|
|
|
|
return !(lhs.end < rhs.start || rhs.end < lhs.start);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static uint64_t getSlotLogicalBytes(const PlannedPhysicalSlot &slot, ArrayRef<LocalAllocInterval> intervals) {
|
|
|
|
|
static uint64_t getSlotLogicalBytes(const PlannedPhysicalSlot& slot, ArrayRef<LocalAllocInterval> intervals) {
|
|
|
|
|
uint64_t slotLogicalBytes = 0;
|
|
|
|
|
for (size_t intervalIndex : slot.intervalIndices)
|
|
|
|
|
slotLogicalBytes += intervals[intervalIndex].size;
|
|
|
|
@@ -392,7 +391,7 @@ static uint64_t getSlotLogicalBytes(const PlannedPhysicalSlot &slot, ArrayRef<Lo
|
|
|
|
|
|
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
|
|
SmallVector<LocalAllocInterval, 0> onnx_mlir::buildLocalAllocIntervals(Operation *coreLikeOp,
|
|
|
|
|
SmallVector<LocalAllocInterval, 0> onnx_mlir::buildLocalAllocIntervals(Operation* coreLikeOp,
|
|
|
|
|
std::optional<unsigned> lane) {
|
|
|
|
|
SmallVector<LocalAllocInterval, 0> intervals;
|
|
|
|
|
OperationOrdering ordering = buildOperationOrdering(coreLikeOp);
|
|
|
|
@@ -442,8 +441,8 @@ SmallVector<PlannedPhysicalSlot, 0> onnx_mlir::planPhysicalSlots(MutableArrayRef
|
|
|
|
|
SmallVector<size_t> intervalOrder(intervals.size());
|
|
|
|
|
std::iota(intervalOrder.begin(), intervalOrder.end(), 0);
|
|
|
|
|
llvm::stable_sort(intervalOrder, [&](size_t lhsIndex, size_t rhsIndex) {
|
|
|
|
|
const LocalAllocInterval &lhs = intervals[lhsIndex];
|
|
|
|
|
const LocalAllocInterval &rhs = intervals[rhsIndex];
|
|
|
|
|
const LocalAllocInterval& lhs = intervals[lhsIndex];
|
|
|
|
|
const LocalAllocInterval& rhs = intervals[rhsIndex];
|
|
|
|
|
if (lhs.size != rhs.size)
|
|
|
|
|
return lhs.size > rhs.size;
|
|
|
|
|
if (lhs.start != rhs.start)
|
|
|
|
@@ -454,16 +453,15 @@ SmallVector<PlannedPhysicalSlot, 0> onnx_mlir::planPhysicalSlots(MutableArrayRef
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
for (size_t intervalIndex : intervalOrder) {
|
|
|
|
|
LocalAllocInterval &interval = intervals[intervalIndex];
|
|
|
|
|
PlannedPhysicalSlot *bestSlot = nullptr;
|
|
|
|
|
auto bestKey = std::tuple<size_t, size_t, size_t, size_t>(
|
|
|
|
|
std::numeric_limits<size_t>::max(),
|
|
|
|
|
std::numeric_limits<size_t>::max(),
|
|
|
|
|
std::numeric_limits<size_t>::max(),
|
|
|
|
|
std::numeric_limits<size_t>::max());
|
|
|
|
|
LocalAllocInterval& interval = intervals[intervalIndex];
|
|
|
|
|
PlannedPhysicalSlot* bestSlot = nullptr;
|
|
|
|
|
auto bestKey = std::tuple<size_t, size_t, size_t, size_t>(std::numeric_limits<size_t>::max(),
|
|
|
|
|
std::numeric_limits<size_t>::max(),
|
|
|
|
|
std::numeric_limits<size_t>::max(),
|
|
|
|
|
std::numeric_limits<size_t>::max());
|
|
|
|
|
|
|
|
|
|
for (size_t slotIndex = 0; slotIndex < slots.size(); ++slotIndex) {
|
|
|
|
|
PlannedPhysicalSlot &slot = slots[slotIndex];
|
|
|
|
|
PlannedPhysicalSlot& slot = slots[slotIndex];
|
|
|
|
|
bool compatible = true;
|
|
|
|
|
for (size_t otherIndex : slot.intervalIndices) {
|
|
|
|
|
if (intervalsOverlap(interval, intervals[otherIndex])) {
|
|
|
|
@@ -476,8 +474,8 @@ SmallVector<PlannedPhysicalSlot, 0> onnx_mlir::planPhysicalSlots(MutableArrayRef
|
|
|
|
|
|
|
|
|
|
size_t resultingSize = std::max(slot.requiredSize, interval.size);
|
|
|
|
|
size_t growth = resultingSize - slot.requiredSize;
|
|
|
|
|
auto candidateKey = std::tuple<size_t, size_t, size_t, size_t>(
|
|
|
|
|
growth, resultingSize, slot.intervalIndices.size(), slot.id);
|
|
|
|
|
auto candidateKey =
|
|
|
|
|
std::tuple<size_t, size_t, size_t, size_t>(growth, resultingSize, slot.intervalIndices.size(), slot.id);
|
|
|
|
|
if (candidateKey < bestKey) {
|
|
|
|
|
bestKey = candidateKey;
|
|
|
|
|
bestSlot = &slot;
|
|
|
|
@@ -503,7 +501,7 @@ SmallVector<PlannedPhysicalSlot, 0> onnx_mlir::planPhysicalSlots(MutableArrayRef
|
|
|
|
|
return slots;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
MemoryPlanArtifacts onnx_mlir::buildMemoryPlanArtifacts(Operation *coreLikeOp,
|
|
|
|
|
MemoryPlanArtifacts onnx_mlir::buildMemoryPlanArtifacts(Operation* coreLikeOp,
|
|
|
|
|
std::optional<unsigned> lane,
|
|
|
|
|
ArrayRef<LocalAllocInterval> intervals,
|
|
|
|
|
ArrayRef<PlannedPhysicalSlot> slots,
|
|
|
|
@@ -522,7 +520,7 @@ MemoryPlanArtifacts onnx_mlir::buildMemoryPlanArtifacts(Operation *coreLikeOp,
|
|
|
|
|
size_t largestPhysicalSlot = 0;
|
|
|
|
|
size_t maximumAssignedAddress = 0;
|
|
|
|
|
|
|
|
|
|
for (const LocalAllocInterval &interval : intervals) {
|
|
|
|
|
for (const LocalAllocInterval& interval : intervals) {
|
|
|
|
|
totalLogicalBytes += interval.size;
|
|
|
|
|
largestLogicalAllocation = std::max(largestLogicalAllocation, interval.size);
|
|
|
|
|
maximumAssignedAddress = std::max(maximumAssignedAddress, interval.assignedAddress + interval.physicalSlotSize);
|
|
|
|
@@ -535,7 +533,7 @@ MemoryPlanArtifacts onnx_mlir::buildMemoryPlanArtifacts(Operation *coreLikeOp,
|
|
|
|
|
if (interval.escapesLoop)
|
|
|
|
|
++loopEscapingIntervals;
|
|
|
|
|
}
|
|
|
|
|
for (const PlannedPhysicalSlot &slot : slots) {
|
|
|
|
|
for (const PlannedPhysicalSlot& slot : slots) {
|
|
|
|
|
totalPhysicalBytes += slot.size;
|
|
|
|
|
largestPhysicalSlot = std::max(largestPhysicalSlot, slot.size);
|
|
|
|
|
if (slot.intervalIndices.size() > 1)
|
|
|
|
@@ -553,7 +551,8 @@ MemoryPlanArtifacts onnx_mlir::buildMemoryPlanArtifacts(Operation *coreLikeOp,
|
|
|
|
|
os << "Lane: " << *lane << "\n";
|
|
|
|
|
os << "Summary:\n";
|
|
|
|
|
os << " logical allocation bytes: " << formatReportMemory(totalLogicalBytes) << " (" << totalLogicalBytes << ")\n";
|
|
|
|
|
os << " physical allocation bytes: " << formatReportMemory(totalPhysicalBytes) << " (" << totalPhysicalBytes << ")\n";
|
|
|
|
|
os << " physical allocation bytes: " << formatReportMemory(totalPhysicalBytes) << " (" << totalPhysicalBytes
|
|
|
|
|
<< ")\n";
|
|
|
|
|
os << " saved bytes: " << formatReportMemory(savedBytes) << " (" << savedBytes << ")\n";
|
|
|
|
|
os << " saved percent: " << format("%.2f%%", savedPercent) << "\n";
|
|
|
|
|
os << " intervals: " << intervals.size() << "\n";
|
|
|
|
@@ -566,7 +565,8 @@ MemoryPlanArtifacts onnx_mlir::buildMemoryPlanArtifacts(Operation *coreLikeOp,
|
|
|
|
|
os << " largest logical allocation: " << largestLogicalAllocation << "\n";
|
|
|
|
|
os << " largest physical slot: " << largestPhysicalSlot << "\n";
|
|
|
|
|
os << " address limit: " << addressLimit << "\n";
|
|
|
|
|
os << " peak physical memory: " << formatReportMemory(maximumAssignedAddress) << " (" << maximumAssignedAddress << ")\n";
|
|
|
|
|
os << " peak physical memory: " << formatReportMemory(maximumAssignedAddress) << " (" << maximumAssignedAddress
|
|
|
|
|
<< ")\n";
|
|
|
|
|
os << " maximum assigned address: " << maximumAssignedAddress << "\n";
|
|
|
|
|
|
|
|
|
|
os << "\nHow To Read:\n";
|
|
|
|
@@ -575,16 +575,15 @@ MemoryPlanArtifacts onnx_mlir::buildMemoryPlanArtifacts(Operation *coreLikeOp,
|
|
|
|
|
os << " Large single-use slots, fallback intervals, and nested single-use allocations are the best places\n";
|
|
|
|
|
os << " to inspect if allocations should be moved, sunk, or made easier to coalesce earlier in the pipeline.\n";
|
|
|
|
|
|
|
|
|
|
SmallVector<const PlannedPhysicalSlot *> reusedSlots;
|
|
|
|
|
SmallVector<const PlannedPhysicalSlot *> singleUseSlots;
|
|
|
|
|
for (const PlannedPhysicalSlot &slot : slots) {
|
|
|
|
|
SmallVector<const PlannedPhysicalSlot*> reusedSlots;
|
|
|
|
|
SmallVector<const PlannedPhysicalSlot*> singleUseSlots;
|
|
|
|
|
for (const PlannedPhysicalSlot& slot : slots)
|
|
|
|
|
if (slot.intervalIndices.size() > 1)
|
|
|
|
|
reusedSlots.push_back(&slot);
|
|
|
|
|
else
|
|
|
|
|
singleUseSlots.push_back(&slot);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
llvm::stable_sort(reusedSlots, [&](const PlannedPhysicalSlot *lhs, const PlannedPhysicalSlot *rhs) {
|
|
|
|
|
llvm::stable_sort(reusedSlots, [&](const PlannedPhysicalSlot* lhs, const PlannedPhysicalSlot* rhs) {
|
|
|
|
|
uint64_t lhsLogicalBytes = getSlotLogicalBytes(*lhs, intervals);
|
|
|
|
|
uint64_t rhsLogicalBytes = getSlotLogicalBytes(*rhs, intervals);
|
|
|
|
|
if (lhs->intervalIndices.size() != rhs->intervalIndices.size())
|
|
|
|
@@ -595,7 +594,7 @@ MemoryPlanArtifacts onnx_mlir::buildMemoryPlanArtifacts(Operation *coreLikeOp,
|
|
|
|
|
return lhs->size > rhs->size;
|
|
|
|
|
return lhs->id < rhs->id;
|
|
|
|
|
});
|
|
|
|
|
llvm::stable_sort(singleUseSlots, [&](const PlannedPhysicalSlot *lhs, const PlannedPhysicalSlot *rhs) {
|
|
|
|
|
llvm::stable_sort(singleUseSlots, [&](const PlannedPhysicalSlot* lhs, const PlannedPhysicalSlot* rhs) {
|
|
|
|
|
if (lhs->size != rhs->size)
|
|
|
|
|
return lhs->size > rhs->size;
|
|
|
|
|
return lhs->id < rhs->id;
|
|
|
|
@@ -607,18 +606,16 @@ MemoryPlanArtifacts onnx_mlir::buildMemoryPlanArtifacts(Operation *coreLikeOp,
|
|
|
|
|
os << "\nBest Reuse:\n";
|
|
|
|
|
if (reusedSlots.empty()) {
|
|
|
|
|
os << " no slots were shared by multiple intervals\n";
|
|
|
|
|
} else {
|
|
|
|
|
for (const PlannedPhysicalSlot *slot : ArrayRef(reusedSlots).take_front(kSummaryReuseLimit)) {
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
for (const PlannedPhysicalSlot* slot : ArrayRef(reusedSlots).take_front(kSummaryReuseLimit)) {
|
|
|
|
|
uint64_t slotLogicalBytes = getSlotLogicalBytes(*slot, intervals);
|
|
|
|
|
os << " slot #" << slot->id
|
|
|
|
|
<< " addr=" << slot->address
|
|
|
|
|
<< " size=" << formatReportMemory(slot->size)
|
|
|
|
|
<< " intervals=" << slot->intervalIndices.size()
|
|
|
|
|
<< " logical_sum=" << formatReportMemory(slotLogicalBytes) << "\n";
|
|
|
|
|
os << " slot #" << slot->id << " addr=" << slot->address << " size=" << formatReportMemory(slot->size)
|
|
|
|
|
<< " intervals=" << slot->intervalIndices.size() << " logical_sum=" << formatReportMemory(slotLogicalBytes)
|
|
|
|
|
<< "\n";
|
|
|
|
|
for (size_t intervalIndex : slot->intervalIndices) {
|
|
|
|
|
const LocalAllocInterval &interval = intervals[intervalIndex];
|
|
|
|
|
os << " #" << interval.id
|
|
|
|
|
<< " [" << interval.start << "," << interval.end << "]"
|
|
|
|
|
const LocalAllocInterval& interval = intervals[intervalIndex];
|
|
|
|
|
os << " #" << interval.id << " [" << interval.start << "," << interval.end << "]"
|
|
|
|
|
<< " logical=" << formatReportMemory(interval.size)
|
|
|
|
|
<< " first=" << summarizeOperation(interval.firstTouchOp, 40)
|
|
|
|
|
<< " last=" << summarizeOperation(interval.lastTouchOp, 40) << "\n";
|
|
|
|
@@ -628,12 +625,11 @@ MemoryPlanArtifacts onnx_mlir::buildMemoryPlanArtifacts(Operation *coreLikeOp,
|
|
|
|
|
|
|
|
|
|
os << "\nTop Offenders:\n";
|
|
|
|
|
bool printedAttention = false;
|
|
|
|
|
for (const PlannedPhysicalSlot *slot : ArrayRef(singleUseSlots).take_front(kSummaryOffenderLimit)) {
|
|
|
|
|
const LocalAllocInterval &interval = intervals[slot->intervalIndices.front()];
|
|
|
|
|
for (const PlannedPhysicalSlot* slot : ArrayRef(singleUseSlots).take_front(kSummaryOffenderLimit)) {
|
|
|
|
|
const LocalAllocInterval& interval = intervals[slot->intervalIndices.front()];
|
|
|
|
|
printedAttention = true;
|
|
|
|
|
os << " slot #" << slot->id << " is single-use"
|
|
|
|
|
<< " size=" << formatReportMemory(slot->size)
|
|
|
|
|
<< " interval=#" << interval.id
|
|
|
|
|
<< " size=" << formatReportMemory(slot->size) << " interval=#" << interval.id
|
|
|
|
|
<< " value=" << summarizeValue(interval.key.value, 56) << "\n";
|
|
|
|
|
os << " first=" << summarizeOperation(interval.firstTouchOp, 40)
|
|
|
|
|
<< " last=" << summarizeOperation(interval.lastTouchOp, 40)
|
|
|
|
@@ -641,28 +637,26 @@ MemoryPlanArtifacts onnx_mlir::buildMemoryPlanArtifacts(Operation *coreLikeOp,
|
|
|
|
|
<< " escapes_loop=" << (interval.escapesLoop ? "yes" : "no") << "\n";
|
|
|
|
|
}
|
|
|
|
|
size_t fallbackPrinted = 0;
|
|
|
|
|
for (const LocalAllocInterval &interval : intervals) {
|
|
|
|
|
for (const LocalAllocInterval& interval : intervals) {
|
|
|
|
|
if (!(interval.startUsedAllocFallback || interval.endUsedFallback) || fallbackPrinted >= kSummaryOffenderLimit)
|
|
|
|
|
continue;
|
|
|
|
|
printedAttention = true;
|
|
|
|
|
++fallbackPrinted;
|
|
|
|
|
os << " fallback interval #" << interval.id
|
|
|
|
|
<< " size=" << formatReportMemory(interval.size)
|
|
|
|
|
os << " fallback interval #" << interval.id << " size=" << formatReportMemory(interval.size)
|
|
|
|
|
<< " value=" << summarizeValue(interval.key.value, 56) << "\n";
|
|
|
|
|
os << " reason: " << (interval.fallbackReason.empty() ? "<none>" : interval.fallbackReason) << "\n";
|
|
|
|
|
}
|
|
|
|
|
size_t nestedPrinted = 0;
|
|
|
|
|
for (const LocalAllocInterval &interval : intervals) {
|
|
|
|
|
for (const LocalAllocInterval& interval : intervals) {
|
|
|
|
|
if (nestedPrinted >= kSummaryOffenderLimit)
|
|
|
|
|
break;
|
|
|
|
|
if (!(interval.insideNestedRegion && slots[interval.slotPlanIndex].intervalIndices.size() == 1))
|
|
|
|
|
continue;
|
|
|
|
|
printedAttention = true;
|
|
|
|
|
++nestedPrinted;
|
|
|
|
|
os << " nested single-use interval #" << interval.id
|
|
|
|
|
<< " slot #" << interval.physicalSlotId
|
|
|
|
|
<< " size=" << formatReportMemory(interval.size)
|
|
|
|
|
<< " value=" << summarizeValue(interval.key.value, 56) << "\n";
|
|
|
|
|
os << " nested single-use interval #" << interval.id << " slot #" << interval.physicalSlotId
|
|
|
|
|
<< " size=" << formatReportMemory(interval.size) << " value=" << summarizeValue(interval.key.value, 56)
|
|
|
|
|
<< "\n";
|
|
|
|
|
os << " hint: move or sink this alloc inside the nested region if the IR allows it.\n";
|
|
|
|
|
}
|
|
|
|
|
if (!printedAttention)
|
|
|
|
@@ -670,18 +664,17 @@ MemoryPlanArtifacts onnx_mlir::buildMemoryPlanArtifacts(Operation *coreLikeOp,
|
|
|
|
|
|
|
|
|
|
if (reportLevel == PimMemoryReportFull) {
|
|
|
|
|
os << "\nSlot Reuse:\n";
|
|
|
|
|
for (const PlannedPhysicalSlot &slot : slots) {
|
|
|
|
|
for (const PlannedPhysicalSlot& slot : slots) {
|
|
|
|
|
uint64_t slotLogicalBytes = getSlotLogicalBytes(slot, intervals);
|
|
|
|
|
os << " slot #" << slot.id << " addr=" << slot.address << " size=" << formatReportMemory(slot.size) << " ("
|
|
|
|
|
<< slot.size << ")"
|
|
|
|
|
<< " intervals=" << slot.intervalIndices.size()
|
|
|
|
|
<< " logical_sum=" << formatReportMemory(slotLogicalBytes) << "\n";
|
|
|
|
|
<< " intervals=" << slot.intervalIndices.size() << " logical_sum=" << formatReportMemory(slotLogicalBytes)
|
|
|
|
|
<< "\n";
|
|
|
|
|
for (size_t intervalIndex : slot.intervalIndices) {
|
|
|
|
|
const LocalAllocInterval &interval = intervals[intervalIndex];
|
|
|
|
|
const LocalAllocInterval& interval = intervals[intervalIndex];
|
|
|
|
|
mlir::Value allocValue = interval.key.value;
|
|
|
|
|
os << " [" << interval.start << "," << interval.end << "]"
|
|
|
|
|
<< " #" << interval.id
|
|
|
|
|
<< " logical=" << formatReportMemory(interval.size)
|
|
|
|
|
<< " #" << interval.id << " logical=" << formatReportMemory(interval.size)
|
|
|
|
|
<< " nested=" << (interval.insideNestedRegion ? "yes" : "no")
|
|
|
|
|
<< " escapes_loop=" << (interval.escapesLoop ? "yes" : "no")
|
|
|
|
|
<< " first=" << summarizeOperation(interval.firstTouchOp, 48)
|
|
|
|
@@ -693,16 +686,14 @@ MemoryPlanArtifacts onnx_mlir::buildMemoryPlanArtifacts(Operation *coreLikeOp,
|
|
|
|
|
|
|
|
|
|
if (reportLevel == PimMemoryReportFull) {
|
|
|
|
|
os << "\nInterval Details:\n";
|
|
|
|
|
for (const LocalAllocInterval &interval : intervals) {
|
|
|
|
|
const PlannedPhysicalSlot &slot = slots[interval.slotPlanIndex];
|
|
|
|
|
for (const LocalAllocInterval& interval : intervals) {
|
|
|
|
|
const PlannedPhysicalSlot& slot = slots[interval.slotPlanIndex];
|
|
|
|
|
mlir::Value allocValue = interval.key.value;
|
|
|
|
|
Operation *definingOp = allocValue.getDefiningOp();
|
|
|
|
|
os << " #" << interval.id
|
|
|
|
|
<< " slot=" << slot.id
|
|
|
|
|
<< " live=[" << interval.start << "," << interval.end << "]"
|
|
|
|
|
Operation* definingOp = allocValue.getDefiningOp();
|
|
|
|
|
os << " #" << interval.id << " slot=" << slot.id << " live=[" << interval.start << "," << interval.end << "]"
|
|
|
|
|
<< " logical=" << formatReportMemory(interval.size)
|
|
|
|
|
<< " slot_size=" << formatReportMemory(interval.physicalSlotSize)
|
|
|
|
|
<< " addr=" << interval.assignedAddress << "\n";
|
|
|
|
|
<< " slot_size=" << formatReportMemory(interval.physicalSlotSize) << " addr=" << interval.assignedAddress
|
|
|
|
|
<< "\n";
|
|
|
|
|
os << " value=" << summarizeValue(allocValue, 88) << "\n";
|
|
|
|
|
os << " type=" << allocValue.getType() << "\n";
|
|
|
|
|
os << " loc="
|
|
|
|
@@ -731,7 +722,7 @@ MemoryPlanArtifacts onnx_mlir::buildMemoryPlanArtifacts(Operation *coreLikeOp,
|
|
|
|
|
os << " fallback_reason=" << interval.fallbackReason << "\n";
|
|
|
|
|
if (!interval.aliasesFollowed.empty()) {
|
|
|
|
|
os << " aliases_followed=" << interval.aliasesFollowed.size() << "\n";
|
|
|
|
|
for (const std::string &alias : interval.aliasesFollowed)
|
|
|
|
|
for (const std::string& alias : interval.aliasesFollowed)
|
|
|
|
|
os << " - " << abbreviate(collapseWhitespace(alias), 108) << "\n";
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|