Centralize the logic for materializing contiguous memory into bufferization;
fix codegen symlinks overwrite; remove the deprecated pim memcp_hd_batch op.
This commit is contained in:
@@ -1,11 +1,8 @@
|
||||
#include "mlir/Dialect/Arith/IR/Arith.h"
|
||||
#include "mlir/Dialect/SCF/IR/SCF.h"
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/IR/Matchers.h"
|
||||
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
|
||||
#include "IndexingUtils.hpp"
|
||||
@@ -20,35 +17,6 @@ using namespace mlir;
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
/// Returns a value equal to `lhs + rhs`.
///
/// When one operand matches a constant integer zero the other operand is
/// returned unchanged and no op is created; otherwise an arith.addi is built
/// at the rewriter's current insertion point.
static Value addIndexValues(Value lhs, Value rhs, ConversionPatternRewriter& rewriter, Location loc) {
  auto isConstantZero = [](Value candidate) {
    APInt constant;
    return matchPattern(candidate, m_ConstantInt(&constant)) && constant.isZero();
  };

  // `lhs` is tested first, so a zero `lhs` yields `rhs` even if `rhs` is zero too.
  if (isConstantZero(lhs))
    return rhs;
  if (isConstantZero(rhs))
    return lhs;

  return arith::AddIOp::create(rewriter, loc, lhs, rhs).getResult();
}
|
||||
|
||||
/// Returns a value equal to `value * factor`.
///
/// `factor` may be an attribute (cast to IntegerAttr) or an SSA value. A
/// non-constant SSA factor produces a plain arith.muli. A constant factor is
/// simplified: 0 yields an index constant 0, 1 yields `value` itself, and any
/// other constant is re-materialized through getOrCreateIndexConstant before
/// the arith.muli is built.
static Value multiplyIndexValue(Value value, OpFoldResult factor, ConversionPatternRewriter& rewriter, Location loc) {
  APInt constantFactor;
  if (auto factorAttr = dyn_cast<Attribute>(factor)) {
    constantFactor = cast<IntegerAttr>(factorAttr).getValue();
  }
  else {
    Value dynamicFactor = cast<Value>(factor);
    if (!matchPattern(dynamicFactor, m_ConstantInt(&constantFactor)))
      return arith::MulIOp::create(rewriter, loc, value, dynamicFactor).getResult();
  }

  // Multiplying by one is the identity; no op is needed.
  if (constantFactor.isOne())
    return value;

  // Constants are requested relative to the op that owns the insertion block.
  Operation* constantAnchor = rewriter.getInsertionBlock()->getParentOp();
  if (constantFactor.isZero())
    return getOrCreateIndexConstant(rewriter, constantAnchor, 0);

  Value constantValue = getOrCreateIndexConstant(rewriter, constantAnchor, constantFactor.getSExtValue());
  return arith::MulIOp::create(rewriter, loc, value, constantValue).getResult();
}
|
||||
|
||||
/// Returns true when every extent in `shape` is strictly positive.
/// An empty shape yields true.
bool hasStaticPositiveShape(ArrayRef<int64_t> shape) {
  for (int64_t extent : shape) {
    if (extent <= 0)
      return false;
  }
  return true;
}
|
||||
@@ -124,39 +92,6 @@ SmallVector<OpFoldResult> getStaticSizes(PatternRewriter& rewriter, ArrayRef<int
|
||||
return sizes;
|
||||
}
|
||||
|
||||
/// Decides whether a slice of `source` with shape `resultType` and the given
/// `strides` can be treated as contiguous.
///
/// Requirements checked, in order:
///  - `source` is a ranked tensor, both shapes are static, and ranks match;
///  - every stride is the constant 1 (attribute or constant SSA value);
///  - walking the dimensions from innermost to outermost, all dimensions past
///    the first one whose slice size differs from the source extent have
///    slice size 1.
static bool isContiguousTensorSlice(Value source, RankedTensorType resultType, ArrayRef<OpFoldResult> strides) {
  auto sourceType = dyn_cast<RankedTensorType>(source.getType());
  if (!sourceType)
    return false;
  if (!sourceType.hasStaticShape() || !resultType.hasStaticShape())
    return false;
  if (sourceType.getRank() != resultType.getRank())
    return false;

  auto isUnitStride = [](OpFoldResult stride) -> bool {
    if (auto attr = dyn_cast<Attribute>(stride))
      return cast<IntegerAttr>(attr).getInt() == 1;
    APInt constant;
    return matchPattern(cast<Value>(stride), m_ConstantInt(&constant)) && constant.isOne();
  };
  for (OpFoldResult stride : strides) {
    if (!isUnitStride(stride))
      return false;
  }

  ArrayRef<int64_t> sliceShape = resultType.getShape();
  ArrayRef<int64_t> sourceShape = sourceType.getShape();

  // Skip the innermost dimensions that the slice covers completely.
  int64_t dim = resultType.getRank() - 1;
  while (dim >= 0 && sliceShape[dim] == sourceShape[dim])
    --dim;

  // `dim` is now the first differing dimension (or -1 if there is none). That
  // dimension may have any size; every dimension outside it must be 1.
  for (--dim; dim >= 0; --dim) {
    if (sliceShape[dim] != 1)
      return false;
  }
  return true;
}
|
||||
|
||||
SmallVector<Value> sliceTensor(
|
||||
const Value& tensorToSlice, size_t axis, int64_t sliceSize, ConversionPatternRewriter& rewriter, Location loc) {
|
||||
ArrayRef<long> shape = getTensorShape(tensorToSlice);
|
||||
@@ -222,90 +157,6 @@ sliceVectorPerCrossbarPerCore(const Value& vectorToSlice, ConversionPatternRewri
|
||||
return slicesPerCore;
|
||||
}
|
||||
|
||||
/// Produces a tensor of type `resultType` holding the slice of `source` that
/// starts at `offsets` and advances by `strides`; the slice sizes are the
/// static shape of `resultType`.
///
/// If the slice passes isContiguousTensorSlice, or the result is rank 0, a
/// single tensor.extract_slice is emitted. Otherwise the result is built one
/// element at a time: an scf.for nest with one loop per result dimension
/// threads a tensor.empty accumulator through its iter_args, and the innermost
/// body extracts the element at `offset + index * stride` from `source` and
/// inserts it into the accumulator at the loop indices.
///
/// \param source     Tensor to slice.
/// \param resultType Static type of the produced slice.
/// \param offsets    One offset per result dimension.
/// \param strides    One stride per result dimension.
/// \param rewriter   Rewriter used to create ops; on the element-wise path its
///                   insertion point ends up right after the outermost loop.
/// \param loc        Location attached to the created ops.
/// \return The materialized slice value.
Value materializeContiguousTensorSlice(Value source,
                                       RankedTensorType resultType,
                                       ArrayRef<OpFoldResult> offsets,
                                       ArrayRef<OpFoldResult> strides,
                                       ConversionPatternRewriter& rewriter,
                                       Location loc) {
  assert(resultType.hasStaticShape() && "expected static result type");
  size_t rank = static_cast<size_t>(resultType.getRank());
  assert(offsets.size() == rank && "expected rank-matching offsets");
  assert(strides.size() == rank && "expected rank-matching strides");

  SmallVector<OpFoldResult> sizes;
  sizes.reserve(rank);
  for (int64_t dimSize : resultType.getShape())
    sizes.push_back(rewriter.getIndexAttr(dimSize));

  // Fast path: one extract_slice covers contiguous slices and rank-0 results.
  if (isContiguousTensorSlice(source, resultType, strides) || rank == 0)
    return tensor::ExtractSliceOp::create(rewriter, loc, resultType, source, offsets, sizes, strides).getResult();

  Value accumulator =
      tensor::EmptyOp::create(rewriter, loc, resultType.getShape(), resultType.getElementType()).getResult();

  // Lower bounds are requested before any loop exists, one per dimension.
  SmallVector<Value> lowerBounds;
  lowerBounds.reserve(rank);
  for (size_t dim = 0; dim < rank; ++dim)
    lowerBounds.push_back(getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 0));

  // Open the loop nest outermost-first. Each loop carries the accumulator as
  // its single iter_arg, and the insertion point moves into the new body.
  SmallVector<scf::ForOp> loops;
  SmallVector<Value> resultIndices;
  loops.reserve(rank);
  resultIndices.reserve(rank);
  for (size_t dim = 0; dim < rank; ++dim) {
    Value upperBound =
        getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), resultType.getDimSize(dim));
    Value unitStep = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 1);
    auto loop = scf::ForOp::create(rewriter, loc, lowerBounds[dim], upperBound, unitStep, ValueRange {accumulator});
    rewriter.setInsertionPointToStart(loop.getBody());
    resultIndices.push_back(loop.getInductionVar());
    accumulator = loop.getRegionIterArgs().front();
    loops.push_back(loop);
  }

  // Innermost body: source index = offset + loop index * stride, per dimension.
  SmallVector<OpFoldResult> sourceOffsets;
  sourceOffsets.reserve(rank);
  for (size_t dim = 0; dim < rank; ++dim) {
    Value baseOffset = getOrMaterializeIndexValue(rewriter, offsets[dim]);
    Value scaledIndex = multiplyIndexValue(resultIndices[dim], strides[dim], rewriter, loc);
    sourceOffsets.push_back(addIndexValues(baseOffset, scaledIndex, rewriter, loc));
  }

  SmallVector<OpFoldResult> destinationOffsets;
  destinationOffsets.reserve(rank);
  for (Value inductionVar : resultIndices)
    destinationOffsets.push_back(inductionVar);

  OpFoldResult one = rewriter.getIndexAttr(1);
  SmallVector<OpFoldResult> unitSizes(rank, one);
  SmallVector<OpFoldResult> unitStrides(rank, one);

  // Copy a single element, expressed as an all-ones-shaped tensor.
  auto elementTensorType = RankedTensorType::get(SmallVector<int64_t>(rank, 1), resultType.getElementType());
  Value elementSlice =
      tensor::ExtractSliceOp::create(rewriter, loc, elementTensorType, source, sourceOffsets, unitSizes, unitStrides)
          .getResult();
  Value updated =
      tensor::InsertSliceOp::create(rewriter, loc, elementSlice, accumulator, destinationOffsets, unitSizes, unitStrides)
          .getResult();

  // Close the nest innermost-first: yield the updated tensor, step out of the
  // loop, and carry its result to the enclosing level.
  for (size_t depth = loops.size(); depth-- > 0;) {
    scf::ForOp loop = loops[depth];
    scf::YieldOp::create(rewriter, loc, updated);
    rewriter.setInsertionPointAfter(loop);
    updated = loop.getResult(0);
  }
  return updated;
}
|
||||
|
||||
Value extractAxisSlice(
|
||||
PatternRewriter& rewriter, Location loc, Value source, int64_t axis, int64_t offset, int64_t size) {
|
||||
auto sourceType = cast<RankedTensorType>(source.getType());
|
||||
|
||||
@@ -108,13 +108,6 @@ llvm::SmallVector<mlir::Value> sliceVector(const mlir::Value& vectorToSlice,
|
||||
llvm::DenseMap<CoreId, llvm::SmallVector<mlir::Value>> sliceVectorPerCrossbarPerCore(
|
||||
const mlir::Value& vectorToSlice, mlir::ConversionPatternRewriter& rewriter, mlir::Location loc);
|
||||
|
||||
mlir::Value materializeContiguousTensorSlice(mlir::Value source,
|
||||
mlir::RankedTensorType resultType,
|
||||
llvm::ArrayRef<mlir::OpFoldResult> offsets,
|
||||
llvm::ArrayRef<mlir::OpFoldResult> strides,
|
||||
mlir::ConversionPatternRewriter& rewriter,
|
||||
mlir::Location loc);
|
||||
|
||||
mlir::Value extractAxisSlice(
|
||||
mlir::PatternRewriter& rewriter, mlir::Location loc, mlir::Value source, int64_t axis, int64_t offset, int64_t size);
|
||||
|
||||
|
||||
@@ -303,9 +303,11 @@ createDynamicGemmBatchRow(Value lane, int64_t numOutCols, ConversionPatternRewri
|
||||
static Value extractDynamicGemmBColumn(
|
||||
Value matrix, Value column, RankedTensorType vectorType, ConversionPatternRewriter& rewriter, Location loc) {
|
||||
SmallVector<OpFoldResult> offsets {rewriter.getIndexAttr(0), column};
|
||||
SmallVector<OpFoldResult> sizes {rewriter.getIndexAttr(vectorType.getDimSize(1)), rewriter.getIndexAttr(1)};
|
||||
SmallVector<OpFoldResult> strides {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
|
||||
auto columnSliceType = RankedTensorType::get({vectorType.getDimSize(1), 1}, vectorType.getElementType());
|
||||
Value columnSlice = materializeContiguousTensorSlice(matrix, columnSliceType, offsets, strides, rewriter, loc);
|
||||
Value columnSlice =
|
||||
tensor::ExtractSliceOp::create(rewriter, loc, columnSliceType, matrix, offsets, sizes, strides).getResult();
|
||||
SmallVector<ReassociationIndices> collapseReassociation {
|
||||
ReassociationIndices {0, 1}
|
||||
};
|
||||
|
||||
@@ -23,7 +23,7 @@ using namespace mlir;
|
||||
namespace onnx_mlir {
|
||||
namespace {
|
||||
|
||||
static Value materializeContiguousTile(ConversionPatternRewriter& rewriter, Location loc, Value tile) {
|
||||
static Value materializeTileTensor(ConversionPatternRewriter& rewriter, Location loc, Value tile) {
|
||||
auto tileType = cast<RankedTensorType>(tile.getType());
|
||||
Value empty = tensor::EmptyOp::create(rewriter, loc, tileType.getShape(), tileType.getElementType());
|
||||
return insertStaticSlice(rewriter, loc, tile, empty, getZeroOffsets(rewriter, tileType.getRank()));
|
||||
@@ -319,7 +319,7 @@ struct PoolToSpatialComputeBase : public OpConversionPattern<PoolOp> {
|
||||
rewriter.getIndexAttr(1), rewriter.getIndexAttr(1), rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
|
||||
Value windowValue =
|
||||
tensor::ExtractSliceOp::create(rewriter, loc, tileType, paddedInput, offsets, sizes, strides);
|
||||
windowValue = materializeContiguousTile(rewriter, loc, windowValue);
|
||||
windowValue = materializeTileTensor(rewriter, loc, windowValue);
|
||||
reducedWindow = ReduceOp::create(rewriter, loc, tileType, reducedWindow, windowValue);
|
||||
}
|
||||
}
|
||||
@@ -335,7 +335,7 @@ struct PoolToSpatialComputeBase : public OpConversionPattern<PoolOp> {
|
||||
rewriter.getIndexAttr(1), rewriter.getIndexAttr(1), rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
|
||||
Value scaleSlice = tensor::ExtractSliceOp::create(
|
||||
rewriter, loc, tileType, averageScaleTensor, scaleOffsets, scaleSizes, scaleStrides);
|
||||
scaleSlice = materializeContiguousTile(rewriter, loc, scaleSlice);
|
||||
scaleSlice = materializeTileTensor(rewriter, loc, scaleSlice);
|
||||
reducedWindow = spatial::SpatVMulOp::create(rewriter, loc, tileType, reducedWindow, scaleSlice);
|
||||
}
|
||||
|
||||
|
||||
@@ -59,10 +59,7 @@ static Value createHostTargetOffset(IRRewriter& rewriter,
|
||||
ShapedType destinationType,
|
||||
IRMapping& mapper) {
|
||||
int64_t elementBytes = static_cast<int64_t>(getElementTypeSizeInBytes(destinationType.getElementType()));
|
||||
SmallVector<int64_t> strides(destinationType.getRank(), 1);
|
||||
ArrayRef<int64_t> shape = destinationType.getShape();
|
||||
for (int64_t dim = destinationType.getRank() - 2; dim >= 0; --dim)
|
||||
strides[dim] = strides[dim + 1] * shape[dim + 1];
|
||||
SmallVector<int64_t> strides = computeRowMajorStrides(destinationType.getShape());
|
||||
|
||||
Value totalOffset;
|
||||
Location loc = insertSlice.getLoc();
|
||||
@@ -162,14 +159,15 @@ LogicalResult raptor::SpatialToPimPass::lowerComputeBatchOp(spatial::SpatCompute
|
||||
BlockArgument newArg = coreBatchOp.getInputArgument(inputIndex);
|
||||
auto newArgType = cast<ShapedType>(newArg.getType());
|
||||
auto outputBuffer = createEmptyTensorFromShaped(rewriter, loc, newArgType);
|
||||
auto copied = pim::PimMemCopyHostToDevBatchOp::create(rewriter,
|
||||
loc,
|
||||
outputBuffer.getType(),
|
||||
outputBuffer,
|
||||
newArg,
|
||||
rewriter.getI32IntegerAttr(0),
|
||||
rewriter.getI32IntegerAttr(0),
|
||||
getTensorSizeInBytesAttr(rewriter, newArg))
|
||||
Value zeroOffset = getOrCreateIndexConstant(rewriter, coreBatchOp.getOperation(), 0);
|
||||
auto copied = pim::PimMemCopyHostToDevOp::create(rewriter,
|
||||
loc,
|
||||
outputBuffer.getType(),
|
||||
zeroOffset,
|
||||
zeroOffset,
|
||||
outputBuffer,
|
||||
newArg,
|
||||
getTensorSizeInBytesAttr(rewriter, newArg))
|
||||
.getOutput();
|
||||
mapper.map(*oldArg, copied);
|
||||
}
|
||||
@@ -233,14 +231,15 @@ LogicalResult raptor::SpatialToPimPass::lowerComputeBatchOp(spatial::SpatCompute
|
||||
}
|
||||
auto clonedType = cast<ShapedType>(clonedTensor.getType());
|
||||
auto outputBuffer = createEmptyTensorFromShaped(rewriter, loc, clonedType);
|
||||
auto copied = pim::PimMemCopyHostToDevBatchOp::create(rewriter,
|
||||
loc,
|
||||
outputBuffer.getType(),
|
||||
outputBuffer,
|
||||
clonedTensor,
|
||||
rewriter.getI32IntegerAttr(0),
|
||||
rewriter.getI32IntegerAttr(0),
|
||||
getTensorSizeInBytesAttr(rewriter, clonedTensor))
|
||||
Value zeroOffset = getOrCreateIndexConstant(rewriter, coreBatchOp.getOperation(), 0);
|
||||
auto copied = pim::PimMemCopyHostToDevOp::create(rewriter,
|
||||
loc,
|
||||
outputBuffer.getType(),
|
||||
zeroOffset,
|
||||
zeroOffset,
|
||||
outputBuffer,
|
||||
clonedTensor,
|
||||
getTensorSizeInBytesAttr(rewriter, clonedTensor))
|
||||
.getOutput();
|
||||
mapper.map(toTensorOp.getResult(), copied);
|
||||
continue;
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
#include "mlir/IR/ValueRange.h"
|
||||
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
|
||||
#include "Common.hpp"
|
||||
|
||||
@@ -13,48 +11,6 @@ using namespace mlir;
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
/// Computes the linear element offset of the origin of `sliceOp` inside its
/// input tensor, treating the input as row-major (last dimension varies
/// fastest).
///
/// Only the op's static offsets and the static extents of `inputShape` are
/// consulted. A dynamic offset or extent cannot be folded into a compile-time
/// offset, so both are rejected by assertion rather than silently mixed into
/// the arithmetic.
///
/// \param sliceOp    Slice whose static offsets are linearized.
/// \param inputShape Shaped type of the slice's input tensor.
/// \return The offset, counted in elements (not bytes).
size_t getSliceActualOffset(tensor::ExtractSliceOp& sliceOp, ShapedType& inputShape) {
  /*
    EXAMPLE RUN:
      [1, 10, 3, 4] inputShape
      [0,  2, 1, 3] offsets

      acc = 1
      ---
      ret = 3
      acc = 4
      ---
      ret = 3 + 4 * 1 = 7
      acc = 12
      ---
      ret = 7 + 12 * 2 = 31
      acc = 120
      ---
      ret = 31 + 120 * 0 = 31
      acc = 120
  */

  ArrayRef<int64_t> sliceOffsets = sliceOp.getStaticOffsets();
  ArrayRef<int64_t> inputDimSizes = inputShape.getShape();

  assert(sliceOffsets.size() == inputDimSizes.size());

  size_t linearOffset = 0;
  // Number of elements spanned by one step along the current dimension.
  size_t innerElementCount = 1;

  // Walk from the innermost dimension outwards.
  for (size_t dim = sliceOffsets.size(); dim-- > 0;) {
    assert(!ShapedType::isDynamic(sliceOffsets[dim]) && "expected a static slice offset");
    assert(!ShapedType::isDynamic(inputDimSizes[dim]) && "expected a static input dimension");

    linearOffset += innerElementCount * static_cast<size_t>(sliceOffsets[dim]);
    innerElementCount *= static_cast<size_t>(inputDimSizes[dim]);
  }

  return linearOffset;
}
|
||||
|
||||
/// Wraps the byte size of `value`'s shaped type in an i32 integer attribute.
/// The size comes from getShapedTypeSizeInBytes and is narrowed to int32_t.
IntegerAttr getTensorSizeInBytesAttr(Builder& builder, mlir::Value value) {
  auto shapedType = cast<ShapedType>(value.getType());
  auto sizeInBytes = static_cast<int32_t>(getShapedTypeSizeInBytes(shapedType));
  return builder.getI32IntegerAttr(sizeInBytes);
}
|
||||
|
||||
@@ -6,20 +6,6 @@
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
/**
|
||||
* \brief Get the offset of the ExtractSliceOp based on its static offsets and
|
||||
* its static tensor input.
|
||||
*
|
||||
* The static offsets represent the starting position of the slice in each
|
||||
* dimension, while the static tensor input gives its dimension size.
|
||||
*
|
||||
* \param sliceOp The ExtractSliceOp for which the actual offset needs to be
|
||||
* calculated.
|
||||
* \param inputShape The ShapedType of the ExtractSliceOp's input tensor
|
||||
* \return The linearized (row-major) element offset of the slice's origin within the input tensor.
|
||||
*/
|
||||
size_t getSliceActualOffset(mlir::tensor::ExtractSliceOp& sliceOp, mlir::ShapedType& inputShape);
|
||||
|
||||
mlir::IntegerAttr getTensorSizeInBytesAttr(mlir::Builder& builder, mlir::Value value);
|
||||
|
||||
template <class T>
|
||||
|
||||
@@ -83,18 +83,6 @@ static Value createZeroedDeviceHVector(IRRewriter& rewriter,
|
||||
auto zeroValue = memref::GetGlobalOp::create(rewriter, loc, zeroGlobal.getType(), zeroGlobal.getName());
|
||||
auto zeroIndex = getOrCreateIndexConstant(constantFolder, outputBuffer.getOperation(), 0);
|
||||
auto sizeAttr = rewriter.getI32IntegerAttr(static_cast<int32_t>(getShapedTypeSizeInBytes(tensorType)));
|
||||
|
||||
if (outputBuffer->getParentOfType<PimCoreBatchOp>())
|
||||
return PimMemCopyHostToDevBatchOp::create(rewriter,
|
||||
loc,
|
||||
tensorType,
|
||||
outputBuffer,
|
||||
zeroValue,
|
||||
rewriter.getI32IntegerAttr(0),
|
||||
rewriter.getI32IntegerAttr(0),
|
||||
sizeAttr)
|
||||
.getOutput();
|
||||
|
||||
return PimMemCopyHostToDevOp::create(
|
||||
rewriter, loc, tensorType, zeroIndex, zeroIndex, outputBuffer, zeroValue, sizeAttr)
|
||||
.getOutput();
|
||||
|
||||
Reference in New Issue
Block a user