centralize logic for materializing contiguous memory into bufferization

fix codegen symlinks overwrite
remove deprecated pim memcp_hd_batch op
This commit is contained in:
NiccoloN
2026-05-30 15:54:24 +02:00
parent 2d5b03c08f
commit ff36729140
29 changed files with 642 additions and 822 deletions
@@ -1,11 +1,8 @@
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/IR/Matchers.h"
#include "llvm/ADT/SmallVector.h"
#include <algorithm>
#include <functional>
#include "IndexingUtils.hpp"
@@ -20,35 +17,6 @@ using namespace mlir;
namespace onnx_mlir {
/// Adds two index values, skipping the addition when one side is a constant zero.
///
/// If `lhs` matches a constant integer zero, `rhs` is returned unchanged; if
/// `rhs` does, `lhs` is returned unchanged. Otherwise a new `arith::AddIOp` is
/// created through `rewriter` at `loc` and its result is returned.
static Value addIndexValues(Value lhs, Value rhs, ConversionPatternRewriter& rewriter, Location loc) {
  auto isConstantZero = [](Value operand) {
    APInt constant;
    return matchPattern(operand, m_ConstantInt(&constant)) && constant.isZero();
  };

  if (isConstantZero(lhs))
    return rhs;
  if (isConstantZero(rhs))
    return lhs;

  auto sum = arith::AddIOp::create(rewriter, loc, lhs, rhs);
  return sum.getResult();
}
/// Multiplies `value` by `factor`, folding the trivial constant cases.
///
/// `factor` may be an attribute (cast to `IntegerAttr`, so any other attribute
/// kind trips the cast) or a `Value`:
///  - a `Value` that does not match a constant integer yields a plain
///    `arith::MulIOp` on `value` and that factor;
///  - a constant zero yields the index constant 0;
///  - a constant one yields `value` itself;
///  - any other constant is materialized as an index constant and multiplied
///    with an `arith::MulIOp`.
static Value multiplyIndexValue(Value value, OpFoldResult factor, ConversionPatternRewriter& rewriter, Location loc) {
  APInt constantFactor;
  if (auto factorAttr = dyn_cast<Attribute>(factor)) {
    constantFactor = cast<IntegerAttr>(factorAttr).getValue();
  }
  else {
    Value dynamicFactor = cast<Value>(factor);
    if (!matchPattern(dynamicFactor, m_ConstantInt(&constantFactor)))
      return arith::MulIOp::create(rewriter, loc, value, dynamicFactor).getResult();
  }

  // Index constants are requested against the op that owns the current insertion block.
  auto indexConstant = [&](int64_t constant) {
    return getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), constant);
  };

  if (constantFactor.isZero())
    return indexConstant(0);
  if (constantFactor.isOne())
    return value;

  auto materializedFactor = indexConstant(constantFactor.getSExtValue());
  auto product = arith::MulIOp::create(rewriter, loc, value, materializedFactor);
  return product.getResult();
}
/// Returns true when every extent in `shape` is strictly greater than zero.
/// An empty shape is accepted (there is no extent to reject).
bool hasStaticPositiveShape(ArrayRef<int64_t> shape) {
  for (int64_t extent : shape) {
    if (extent <= 0)
      return false;
  }
  return true;
}
@@ -124,39 +92,6 @@ SmallVector<OpFoldResult> getStaticSizes(PatternRewriter& rewriter, ArrayRef<int
return sizes;
}
/// Decides whether a slice of `source` with shape `resultType` and the given
/// `strides` passes this file's contiguity test.
///
/// The test requires:
///  - `source` to be a ranked tensor, both shapes static, and equal ranks;
///  - every stride to be the constant 1 (attribute or constant-matched value);
///  - walking dimensions from the last to the first, once a dimension is found
///    whose slice size differs from the source extent, every dimension before
///    it (further out) must have slice size 1.
static bool isContiguousTensorSlice(Value source, RankedTensorType resultType, ArrayRef<OpFoldResult> strides) {
  auto sourceType = dyn_cast<RankedTensorType>(source.getType());
  if (!sourceType)
    return false;
  if (!sourceType.hasStaticShape() || !resultType.hasStaticShape())
    return false;
  if (sourceType.getRank() != resultType.getRank())
    return false;

  auto isUnitStride = [](OpFoldResult stride) {
    if (auto strideAttr = dyn_cast<Attribute>(stride))
      return cast<IntegerAttr>(strideAttr).getInt() == 1;
    APInt strideConst;
    return matchPattern(cast<Value>(stride), m_ConstantInt(&strideConst)) && strideConst.isOne();
  };
  if (!llvm::all_of(strides, isUnitStride))
    return false;

  ArrayRef<int64_t> sliceShape = resultType.getShape();
  ArrayRef<int64_t> sourceShape = sourceType.getShape();

  // Skip the trailing dimensions that the slice covers completely.
  int64_t dim = resultType.getRank() - 1;
  while (dim >= 0 && sliceShape[dim] == sourceShape[dim])
    --dim;
  if (dim < 0)
    return true;

  // `dim` is the innermost partially covered dimension; everything outside it
  // must select exactly one index.
  for (--dim; dim >= 0; --dim) {
    if (sliceShape[dim] != 1)
      return false;
  }
  return true;
}
SmallVector<Value> sliceTensor(
const Value& tensorToSlice, size_t axis, int64_t sliceSize, ConversionPatternRewriter& rewriter, Location loc) {
ArrayRef<long> shape = getTensorShape(tensorToSlice);
@@ -222,90 +157,6 @@ sliceVectorPerCrossbarPerCore(const Value& vectorToSlice, ConversionPatternRewri
return slicesPerCore;
}
// Produces a tensor of type `resultType` holding the slice of `source` that
// starts at `offsets` and advances by `strides`; the slice sizes are taken from
// the static shape of `resultType`.
//
// When isContiguousTensorSlice() accepts the slice, or the result has rank 0,
// a single tensor::ExtractSliceOp is emitted. Otherwise the slice is built
// element by element: a nest of scf::ForOp loops (one per result dimension)
// threads a tensor::EmptyOp-initialized accumulator through its iter args, and
// the innermost body extracts a one-element slice from `source` and inserts it
// into the accumulator.
//
// Asserts that `resultType` is static and that `offsets` / `strides` have one
// entry per result dimension. Ops are created through `rewriter` at `loc`; the
// insertion point is moved into each loop body and restored after the loop.
Value materializeContiguousTensorSlice(Value source,
RankedTensorType resultType,
ArrayRef<OpFoldResult> offsets,
ArrayRef<OpFoldResult> strides,
ConversionPatternRewriter& rewriter,
Location loc) {
assert(resultType.hasStaticShape() && "expected static result type");
size_t rank = static_cast<size_t>(resultType.getRank());
assert(offsets.size() == rank && "expected rank-matching offsets");
assert(strides.size() == rank && "expected rank-matching strides");
// Slice sizes are the static result extents, expressed as index attributes.
SmallVector<OpFoldResult> sizes;
sizes.reserve(resultType.getRank());
for (int64_t size : resultType.getShape())
sizes.push_back(rewriter.getIndexAttr(size));
// Fast path: the slice passes the contiguity test, so one extract_slice suffices.
if (isContiguousTensorSlice(source, resultType, strides))
return tensor::ExtractSliceOp::create(rewriter, loc, resultType, source, offsets, sizes, strides).getResult();
// Rank-0 results take the same single-op path (there is no dimension to loop over).
if (resultType.getRank() == 0)
return tensor::ExtractSliceOp::create(rewriter, loc, resultType, source, offsets, sizes, strides).getResult();
// Slow path: start from an empty tensor and fill it one element at a time.
Value init = tensor::EmptyOp::create(rewriter, loc, resultType.getShape(), resultType.getElementType()).getResult();
// One lower bound per dimension, each obtained as the index constant 0.
SmallVector<Value> zeroIndices(resultType.getRank());
for (Value& zeroIndex : zeroIndices)
zeroIndex = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 0);
// Induction variables of the loops currently being built, outermost first.
SmallVector<Value> resultIndices;
resultIndices.reserve(resultType.getRank());
// Recursive generic lambda (it receives itself as `self`): builds the loop for
// dimension `dim` around the loops for the remaining dimensions and returns the
// tensor value produced for `accumulator` at this nesting level.
auto buildLoopNest = [&](auto&& self, unsigned dim, Value accumulator) -> Value {
// Base case: all loops exist, emit the single-element copy.
if (dim == resultType.getRank()) {
// Source index per dimension: offset + resultIndex * stride.
SmallVector<Value> sourceIndices;
sourceIndices.reserve(resultType.getRank());
for (unsigned idx = 0; idx < resultType.getRank(); ++idx) {
Value offsetValue = getOrMaterializeIndexValue(rewriter, offsets[idx]);
Value scaledIndex = multiplyIndexValue(resultIndices[idx], strides[idx], rewriter, loc);
sourceIndices.push_back(addIndexValues(offsetValue, scaledIndex, rewriter, loc));
}
SmallVector<OpFoldResult> sourceOffsets;
SmallVector<OpFoldResult> destinationOffsets;
SmallVector<OpFoldResult> unitSizes;
SmallVector<OpFoldResult> unitStrides;
sourceOffsets.reserve(resultType.getRank());
destinationOffsets.reserve(resultType.getRank());
unitSizes.reserve(resultType.getRank());
unitStrides.reserve(resultType.getRank());
for (Value index : sourceIndices)
sourceOffsets.push_back(index);
for (Value index : resultIndices)
destinationOffsets.push_back(index);
// Every dimension of the copied element has size 1 and stride 1.
for (int64_t idx = 0; idx < resultType.getRank(); ++idx) {
unitSizes.push_back(rewriter.getIndexAttr(1));
unitStrides.push_back(rewriter.getIndexAttr(1));
}
// The element is carried as a tensor with the result's rank and all extents equal to 1.
auto elementTensorType =
RankedTensorType::get(SmallVector<int64_t>(resultType.getRank(), 1), resultType.getElementType());
Value elementSlice =
tensor::ExtractSliceOp::create(rewriter, loc, elementTensorType, source, sourceOffsets, unitSizes, unitStrides)
.getResult();
// Write the element into the accumulator at the current result indices.
return tensor::InsertSliceOp::create(
rewriter, loc, elementSlice, accumulator, destinationOffsets, unitSizes, unitStrides)
.getResult();
}
// Recursive case: loop over dimension `dim` from 0 to its extent with step 1,
// carrying the accumulator as the single iter arg.
Value lower = zeroIndices[dim];
Value upper =
getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), resultType.getDimSize(dim));
Value step = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 1);
auto loop = scf::ForOp::create(rewriter, loc, lower, upper, step, ValueRange {accumulator});
// Build the inner levels inside this loop's body.
rewriter.setInsertionPointToStart(loop.getBody());
resultIndices.push_back(loop.getInductionVar());
Value updated = self(self, dim + 1, loop.getRegionIterArgs().front());
resultIndices.pop_back();
scf::YieldOp::create(rewriter, loc, updated);
// Continue emitting after the loop so the caller's level sees its result.
rewriter.setInsertionPointAfter(loop);
return loop.getResult(0);
};
return buildLoopNest(buildLoopNest, 0, init);
}
Value extractAxisSlice(
PatternRewriter& rewriter, Location loc, Value source, int64_t axis, int64_t offset, int64_t size) {
auto sourceType = cast<RankedTensorType>(source.getType());
@@ -108,13 +108,6 @@ llvm::SmallVector<mlir::Value> sliceVector(const mlir::Value& vectorToSlice,
llvm::DenseMap<CoreId, llvm::SmallVector<mlir::Value>> sliceVectorPerCrossbarPerCore(
const mlir::Value& vectorToSlice, mlir::ConversionPatternRewriter& rewriter, mlir::Location loc);
/// Returns a tensor of type `resultType` containing the slice of `source` that
/// starts at `offsets` and advances by `strides`; slice sizes come from the
/// static shape of `resultType`.
///
/// `resultType` must have a static shape, and `offsets` and `strides` must each
/// have one entry per result dimension (the definition asserts both). Ops are
/// created through `rewriter` at `loc`.
mlir::Value materializeContiguousTensorSlice(mlir::Value source,
mlir::RankedTensorType resultType,
llvm::ArrayRef<mlir::OpFoldResult> offsets,
llvm::ArrayRef<mlir::OpFoldResult> strides,
mlir::ConversionPatternRewriter& rewriter,
mlir::Location loc);
mlir::Value extractAxisSlice(
mlir::PatternRewriter& rewriter, mlir::Location loc, mlir::Value source, int64_t axis, int64_t offset, int64_t size);
@@ -303,9 +303,11 @@ createDynamicGemmBatchRow(Value lane, int64_t numOutCols, ConversionPatternRewri
static Value extractDynamicGemmBColumn(
Value matrix, Value column, RankedTensorType vectorType, ConversionPatternRewriter& rewriter, Location loc) {
SmallVector<OpFoldResult> offsets {rewriter.getIndexAttr(0), column};
SmallVector<OpFoldResult> sizes {rewriter.getIndexAttr(vectorType.getDimSize(1)), rewriter.getIndexAttr(1)};
SmallVector<OpFoldResult> strides {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
auto columnSliceType = RankedTensorType::get({vectorType.getDimSize(1), 1}, vectorType.getElementType());
Value columnSlice = materializeContiguousTensorSlice(matrix, columnSliceType, offsets, strides, rewriter, loc);
Value columnSlice =
tensor::ExtractSliceOp::create(rewriter, loc, columnSliceType, matrix, offsets, sizes, strides).getResult();
SmallVector<ReassociationIndices> collapseReassociation {
ReassociationIndices {0, 1}
};
@@ -23,7 +23,7 @@ using namespace mlir;
namespace onnx_mlir {
namespace {
static Value materializeContiguousTile(ConversionPatternRewriter& rewriter, Location loc, Value tile) {
static Value materializeTileTensor(ConversionPatternRewriter& rewriter, Location loc, Value tile) {
auto tileType = cast<RankedTensorType>(tile.getType());
Value empty = tensor::EmptyOp::create(rewriter, loc, tileType.getShape(), tileType.getElementType());
return insertStaticSlice(rewriter, loc, tile, empty, getZeroOffsets(rewriter, tileType.getRank()));
@@ -319,7 +319,7 @@ struct PoolToSpatialComputeBase : public OpConversionPattern<PoolOp> {
rewriter.getIndexAttr(1), rewriter.getIndexAttr(1), rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
Value windowValue =
tensor::ExtractSliceOp::create(rewriter, loc, tileType, paddedInput, offsets, sizes, strides);
windowValue = materializeContiguousTile(rewriter, loc, windowValue);
windowValue = materializeTileTensor(rewriter, loc, windowValue);
reducedWindow = ReduceOp::create(rewriter, loc, tileType, reducedWindow, windowValue);
}
}
@@ -335,7 +335,7 @@ struct PoolToSpatialComputeBase : public OpConversionPattern<PoolOp> {
rewriter.getIndexAttr(1), rewriter.getIndexAttr(1), rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
Value scaleSlice = tensor::ExtractSliceOp::create(
rewriter, loc, tileType, averageScaleTensor, scaleOffsets, scaleSizes, scaleStrides);
scaleSlice = materializeContiguousTile(rewriter, loc, scaleSlice);
scaleSlice = materializeTileTensor(rewriter, loc, scaleSlice);
reducedWindow = spatial::SpatVMulOp::create(rewriter, loc, tileType, reducedWindow, scaleSlice);
}