add shared loop creation helpers
Validate Operations / validate-operations (push) Has been cancelled
Validate Operations / validate-operations (push) Has been cancelled
add shared checked arithmetic helpers; refactor pim passes into Pim/Transforms; more robust memory coalescing pass
This commit is contained in:
@@ -8,6 +8,7 @@
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "src/Accelerators/PIM/Common/IR/LoopUtils.hpp"
|
||||
#include "src/Accelerators/PIM/Common/Support/Diagnostics.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
|
||||
@@ -305,58 +306,67 @@ static Value createIm2colRowComputes(Value x,
|
||||
auto cStrideHeight = getOrCreateIndexConstant(rewriter, anchorOp, strideHeight);
|
||||
auto cStrideWidth = getOrCreateIndexConstant(rewriter, anchorOp, strideWidth);
|
||||
|
||||
auto im2colLoop = scf::ForOp::create(rewriter, loc, c0, cNumPatches, c1, ValueRange {im2colInit});
|
||||
rewriter.setInsertionPointToStart(im2colLoop.getBody());
|
||||
auto im2colLoop = buildNormalizedScfFor(
|
||||
rewriter,
|
||||
loc,
|
||||
c0,
|
||||
cNumPatches,
|
||||
c1,
|
||||
ValueRange {im2colInit},
|
||||
[&](OpBuilder&, Location nestedLoc, Value patchIndex, ValueRange iterArgs, SmallVectorImpl<Value>& yielded) {
|
||||
Value im2colAcc = iterArgs.front();
|
||||
Value batchIndex = arith::DivUIOp::create(rewriter, nestedLoc, patchIndex, cNumPatchesPerBatch);
|
||||
Value batchPatchIndex = arith::RemUIOp::create(rewriter, nestedLoc, patchIndex, cNumPatchesPerBatch);
|
||||
Value outHeightIndex = arith::DivUIOp::create(rewriter, nestedLoc, batchPatchIndex, cOutWidth);
|
||||
Value outWidthIndex = arith::RemUIOp::create(rewriter, nestedLoc, batchPatchIndex, cOutWidth);
|
||||
Value inputHeightOffset = arith::MulIOp::create(rewriter, nestedLoc, outHeightIndex, cStrideHeight);
|
||||
Value inputWidthOffset = arith::MulIOp::create(rewriter, nestedLoc, outWidthIndex, cStrideWidth);
|
||||
|
||||
Value patchIndex = im2colLoop.getInductionVar();
|
||||
Value im2colAcc = im2colLoop.getRegionIterArgs().front();
|
||||
SmallVector<OpFoldResult> offsets = {
|
||||
batchIndex, rewriter.getIndexAttr(0), inputHeightOffset, inputWidthOffset};
|
||||
SmallVector<OpFoldResult> sizes = {rewriter.getIndexAttr(1),
|
||||
rewriter.getIndexAttr(numChannelsIn),
|
||||
rewriter.getIndexAttr(wHeight),
|
||||
rewriter.getIndexAttr(wWidth)};
|
||||
SmallVector<OpFoldResult> strides = {rewriter.getIndexAttr(1),
|
||||
rewriter.getIndexAttr(1),
|
||||
rewriter.getIndexAttr(dilationHeight),
|
||||
rewriter.getIndexAttr(dilationWidth)};
|
||||
auto patchType = RankedTensorType::get({1, numChannelsIn, wHeight, wWidth}, elemType);
|
||||
Value patch =
|
||||
tensor::ExtractSliceOp::create(rewriter, nestedLoc, patchType, paddedInput, offsets, sizes, strides);
|
||||
|
||||
Value batchIndex = arith::DivUIOp::create(rewriter, loc, patchIndex, cNumPatchesPerBatch);
|
||||
Value batchPatchIndex = arith::RemUIOp::create(rewriter, loc, patchIndex, cNumPatchesPerBatch);
|
||||
Value outHeightIndex = arith::DivUIOp::create(rewriter, loc, batchPatchIndex, cOutWidth);
|
||||
Value outWidthIndex = arith::RemUIOp::create(rewriter, loc, batchPatchIndex, cOutWidth);
|
||||
Value inputHeightOffset = arith::MulIOp::create(rewriter, loc, outHeightIndex, cStrideHeight);
|
||||
Value inputWidthOffset = arith::MulIOp::create(rewriter, loc, outWidthIndex, cStrideWidth);
|
||||
Value row = tensor::CollapseShapeOp::create(rewriter,
|
||||
nestedLoc,
|
||||
im2colRowType,
|
||||
patch,
|
||||
SmallVector<ReassociationIndices> {
|
||||
{0},
|
||||
{1, 2, 3}
|
||||
});
|
||||
|
||||
SmallVector<OpFoldResult> offsets = {batchIndex, rewriter.getIndexAttr(0), inputHeightOffset, inputWidthOffset};
|
||||
SmallVector<OpFoldResult> sizes = {rewriter.getIndexAttr(1),
|
||||
rewriter.getIndexAttr(numChannelsIn),
|
||||
rewriter.getIndexAttr(wHeight),
|
||||
rewriter.getIndexAttr(wWidth)};
|
||||
SmallVector<OpFoldResult> strides = {rewriter.getIndexAttr(1),
|
||||
rewriter.getIndexAttr(1),
|
||||
rewriter.getIndexAttr(dilationHeight),
|
||||
rewriter.getIndexAttr(dilationWidth)};
|
||||
auto patchType = RankedTensorType::get({1, numChannelsIn, wHeight, wWidth}, elemType);
|
||||
Value patch = tensor::ExtractSliceOp::create(rewriter, loc, patchType, paddedInput, offsets, sizes, strides);
|
||||
|
||||
Value row = tensor::CollapseShapeOp::create(rewriter,
|
||||
loc,
|
||||
im2colRowType,
|
||||
patch,
|
||||
SmallVector<ReassociationIndices> {
|
||||
{0},
|
||||
{1, 2, 3}
|
||||
});
|
||||
|
||||
SmallVector<OpFoldResult> rowOffsets = {patchIndex, rewriter.getIndexAttr(0)};
|
||||
SmallVector<OpFoldResult> rowSizes = {rewriter.getIndexAttr(1), rewriter.getIndexAttr(patchSize)};
|
||||
SmallVector<OpFoldResult> rowStrides = {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
|
||||
Value updatedIm2col =
|
||||
tensor::InsertSliceOp::create(rewriter, loc, row, im2colAcc, rowOffsets, rowSizes, rowStrides);
|
||||
scf::YieldOp::create(rewriter, loc, updatedIm2col);
|
||||
|
||||
rewriter.setInsertionPointAfter(im2colLoop);
|
||||
Value im2col = im2colLoop.getResult(0);
|
||||
SmallVector<OpFoldResult> rowOffsets = {patchIndex, rewriter.getIndexAttr(0)};
|
||||
SmallVector<OpFoldResult> rowSizes = {rewriter.getIndexAttr(1), rewriter.getIndexAttr(patchSize)};
|
||||
SmallVector<OpFoldResult> rowStrides = {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
|
||||
Value updatedIm2col =
|
||||
tensor::InsertSliceOp::create(rewriter, nestedLoc, row, im2colAcc, rowOffsets, rowSizes, rowStrides);
|
||||
yielded.push_back(updatedIm2col);
|
||||
return success();
|
||||
});
|
||||
if (failed(im2colLoop))
|
||||
return failure();
|
||||
Value im2col = im2colLoop->results.front();
|
||||
|
||||
Value gemmInputRows = im2col;
|
||||
if (packFactor != 1)
|
||||
gemmInputRows = packRowsForParallelGemm(im2col, im2colType, packFactor, rewriter, loc);
|
||||
|
||||
spatial::SpatYieldOp::create(rewriter, loc, gemmInputRows);
|
||||
return success();
|
||||
});
|
||||
|
||||
return im2colComputeOp.getResult(0);
|
||||
assert(succeeded(im2colComputeOp) && "Conv im2col compute construction must succeed");
|
||||
return im2colComputeOp->getResult(0);
|
||||
}
|
||||
|
||||
static Value createCollectedConvOutput(ValueRange gemmRows,
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
|
||||
#include "Common/IR/ConstantUtils.hpp"
|
||||
#include "src/Accelerators/PIM/Common/IR/AffineUtils.hpp"
|
||||
#include "src/Accelerators/PIM/Common/IR/LoopUtils.hpp"
|
||||
#include "src/Accelerators/PIM/Common/IR/ShapeUtils.hpp"
|
||||
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Common/Support/Diagnostics.hpp"
|
||||
@@ -247,16 +248,16 @@ static Value createPaddedInputCompute(Value input,
|
||||
return computeOp.getResult(0);
|
||||
}
|
||||
|
||||
static spatial::SpatComputeBatch createVmmBatch(Value a,
|
||||
Value b,
|
||||
RankedTensorType aType,
|
||||
RankedTensorType paddedBType,
|
||||
RankedTensorType partialPiecesType,
|
||||
int64_t numOutRows,
|
||||
int64_t numKSlices,
|
||||
int64_t numOutHSlices,
|
||||
ConversionPatternRewriter& rewriter,
|
||||
Location loc) {
|
||||
static FailureOr<spatial::SpatComputeBatch> createVmmBatch(Value a,
|
||||
Value b,
|
||||
RankedTensorType aType,
|
||||
RankedTensorType paddedBType,
|
||||
RankedTensorType partialPiecesType,
|
||||
int64_t numOutRows,
|
||||
int64_t numKSlices,
|
||||
int64_t numOutHSlices,
|
||||
ConversionPatternRewriter& rewriter,
|
||||
Location loc) {
|
||||
const int64_t laneCount = partialPiecesType.getDimSize(0);
|
||||
auto batchOp = createSpatComputeBatch(
|
||||
rewriter,
|
||||
@@ -294,7 +295,8 @@ static spatial::SpatComputeBatch createVmmBatch(Value a,
|
||||
createParallelInsertSliceIntoBatchOutput(
|
||||
rewriter, loc, piece, args.outputs.front(), pieceOffsets, pieceSizes, unitStrides);
|
||||
});
|
||||
assert(succeeded(batchOp) && "expected Gemm VMM batch construction to succeed");
|
||||
if (failed(batchOp))
|
||||
return failure();
|
||||
return *batchOp;
|
||||
}
|
||||
|
||||
@@ -416,15 +418,15 @@ static Value createBroadcastedBiasScalar(Value bias,
|
||||
return tensor::SplatOp::create(rewriter, loc, scalarType, scalar).getResult();
|
||||
}
|
||||
|
||||
static spatial::SpatComputeBatch createVvdmulBatch(Value a,
|
||||
Value b,
|
||||
RankedTensorType aType,
|
||||
RankedTensorType bType,
|
||||
RankedTensorType scalarPiecesType,
|
||||
RankedTensorType outType,
|
||||
bool bAlreadyTransposed,
|
||||
ConversionPatternRewriter& rewriter,
|
||||
Location loc) {
|
||||
static FailureOr<spatial::SpatComputeBatch> createVvdmulBatch(Value a,
|
||||
Value b,
|
||||
RankedTensorType aType,
|
||||
RankedTensorType bType,
|
||||
RankedTensorType scalarPiecesType,
|
||||
RankedTensorType outType,
|
||||
bool bAlreadyTransposed,
|
||||
ConversionPatternRewriter& rewriter,
|
||||
Location loc) {
|
||||
const int64_t numOutRows = outType.getDimSize(0);
|
||||
const int64_t numOutCols = outType.getDimSize(1);
|
||||
const int64_t reductionSize = aType.getDimSize(1);
|
||||
@@ -454,26 +456,27 @@ static spatial::SpatComputeBatch createVvdmulBatch(Value a,
|
||||
createParallelInsertSliceIntoBatchOutput(
|
||||
rewriter, loc, scalar, args.outputs.front(), outputOffsets, scalarSizes, unitStrides);
|
||||
});
|
||||
assert(succeeded(batchOp) && "expected Gemm VVDMul batch construction to succeed");
|
||||
if (failed(batchOp))
|
||||
return failure();
|
||||
return *batchOp;
|
||||
}
|
||||
|
||||
static spatial::SpatCompute createDynamicGemmOutputCompute(Value scalarPieces,
|
||||
Value bias,
|
||||
RankedTensorType scalarPiecesType,
|
||||
RankedTensorType biasType,
|
||||
RankedTensorType outType,
|
||||
float alpha,
|
||||
float beta,
|
||||
ConversionPatternRewriter& rewriter,
|
||||
Location loc) {
|
||||
static FailureOr<spatial::SpatCompute> createDynamicGemmOutputCompute(Value scalarPieces,
|
||||
Value bias,
|
||||
RankedTensorType scalarPiecesType,
|
||||
RankedTensorType biasType,
|
||||
RankedTensorType outType,
|
||||
float alpha,
|
||||
float beta,
|
||||
ConversionPatternRewriter& rewriter,
|
||||
Location loc) {
|
||||
const int64_t laneCount = scalarPiecesType.getDimSize(0);
|
||||
const int64_t numOutCols = outType.getDimSize(1);
|
||||
SmallVector<Value> inputs {scalarPieces};
|
||||
if (bias)
|
||||
inputs.push_back(bias);
|
||||
|
||||
return createSpatCompute(rewriter, loc, TypeRange {outType}, {}, inputs, [&](ValueRange blockArgs) {
|
||||
return createSpatCompute(rewriter, loc, TypeRange {outType}, {}, inputs, [&](ValueRange blockArgs) -> LogicalResult {
|
||||
Value pieces = blockArgs[0];
|
||||
Value biasArg = bias ? blockArgs[1] : Value();
|
||||
auto scalarType = RankedTensorType::get({1, 1}, outType.getElementType());
|
||||
@@ -481,40 +484,50 @@ static spatial::SpatCompute createDynamicGemmOutputCompute(Value scalarPieces,
|
||||
Value c0 = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 0);
|
||||
Value c1 = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 1);
|
||||
Value cLaneCount = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), laneCount);
|
||||
auto loop = scf::ForOp::create(rewriter, loc, c0, cLaneCount, c1, ValueRange {outputInit});
|
||||
rewriter.setInsertionPointToStart(loop.getBody());
|
||||
auto loop = buildNormalizedScfFor(
|
||||
rewriter,
|
||||
loc,
|
||||
c0,
|
||||
cLaneCount,
|
||||
c1,
|
||||
ValueRange {outputInit},
|
||||
[&](OpBuilder&, Location nestedLoc, Value lane, ValueRange iterArgs, SmallVectorImpl<Value>& yielded) {
|
||||
Value outputAcc = iterArgs.front();
|
||||
Value row = createDynamicGemmBatchRow(lane, numOutCols, rewriter, nestedLoc);
|
||||
Value column =
|
||||
onnx_mlir::affineModConst(rewriter, nestedLoc, lane, numOutCols, rewriter.getInsertionBlock()->getParentOp());
|
||||
SmallVector<OpFoldResult> scalarOffsets {lane, rewriter.getIndexAttr(0)};
|
||||
SmallVector<OpFoldResult> scalarSizes {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
|
||||
SmallVector<OpFoldResult> unitStrides {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
|
||||
Value scalar = tensor::ExtractSliceOp::create(
|
||||
rewriter, nestedLoc, scalarType, pieces, scalarOffsets, scalarSizes, unitStrides)
|
||||
.getResult();
|
||||
if (alpha != 1.0f) {
|
||||
Value alphaTensor = createScalarTensorConstant(scalarType, alpha, rewriter, nestedLoc);
|
||||
scalar = spatial::SpatVMulOp::create(rewriter, nestedLoc, scalarType, scalar, alphaTensor).getResult();
|
||||
}
|
||||
if (biasArg) {
|
||||
Value biasScalar =
|
||||
createBroadcastedBiasScalar(biasArg, biasType, row, column, scalarType, rewriter, nestedLoc);
|
||||
if (beta != 1.0f) {
|
||||
Value betaTensor = createScalarTensorConstant(scalarType, beta, rewriter, nestedLoc);
|
||||
biasScalar =
|
||||
spatial::SpatVMulOp::create(rewriter, nestedLoc, scalarType, biasScalar, betaTensor).getResult();
|
||||
}
|
||||
scalar = spatial::SpatVAddOp::create(rewriter, nestedLoc, scalarType, scalar, biasScalar).getResult();
|
||||
}
|
||||
SmallVector<OpFoldResult> outputOffsets {row, column};
|
||||
Value outputNext =
|
||||
tensor::InsertSliceOp::create(rewriter, nestedLoc, scalar, outputAcc, outputOffsets, scalarSizes, unitStrides)
|
||||
.getResult();
|
||||
yielded.push_back(outputNext);
|
||||
return success();
|
||||
});
|
||||
if (failed(loop))
|
||||
return failure();
|
||||
|
||||
Value lane = loop.getInductionVar();
|
||||
Value outputAcc = loop.getRegionIterArgs().front();
|
||||
Value row = createDynamicGemmBatchRow(lane, numOutCols, rewriter, loc);
|
||||
Value column =
|
||||
onnx_mlir::affineModConst(rewriter, loc, lane, numOutCols, rewriter.getInsertionBlock()->getParentOp());
|
||||
SmallVector<OpFoldResult> scalarOffsets {lane, rewriter.getIndexAttr(0)};
|
||||
SmallVector<OpFoldResult> scalarSizes {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
|
||||
SmallVector<OpFoldResult> unitStrides {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
|
||||
Value scalar =
|
||||
tensor::ExtractSliceOp::create(rewriter, loc, scalarType, pieces, scalarOffsets, scalarSizes, unitStrides)
|
||||
.getResult();
|
||||
if (alpha != 1.0f) {
|
||||
Value alphaTensor = createScalarTensorConstant(scalarType, alpha, rewriter, loc);
|
||||
scalar = spatial::SpatVMulOp::create(rewriter, loc, scalarType, scalar, alphaTensor).getResult();
|
||||
}
|
||||
if (biasArg) {
|
||||
Value biasScalar = createBroadcastedBiasScalar(biasArg, biasType, row, column, scalarType, rewriter, loc);
|
||||
if (beta != 1.0f) {
|
||||
Value betaTensor = createScalarTensorConstant(scalarType, beta, rewriter, loc);
|
||||
biasScalar = spatial::SpatVMulOp::create(rewriter, loc, scalarType, biasScalar, betaTensor).getResult();
|
||||
}
|
||||
scalar = spatial::SpatVAddOp::create(rewriter, loc, scalarType, scalar, biasScalar).getResult();
|
||||
}
|
||||
SmallVector<OpFoldResult> outputOffsets {row, column};
|
||||
Value outputNext =
|
||||
tensor::InsertSliceOp::create(rewriter, loc, scalar, outputAcc, outputOffsets, scalarSizes, unitStrides)
|
||||
.getResult();
|
||||
scf::YieldOp::create(rewriter, loc, outputNext);
|
||||
|
||||
rewriter.setInsertionPointAfter(loop);
|
||||
spatial::SpatYieldOp::create(rewriter, loc, loop.getResult(0));
|
||||
spatial::SpatYieldOp::create(rewriter, loc, loop->results.front());
|
||||
return success();
|
||||
});
|
||||
}
|
||||
|
||||
@@ -579,85 +592,92 @@ static Value reducePartialPiecesForHSlice(Value partialPiecesArg,
|
||||
return activePieces.front();
|
||||
}
|
||||
|
||||
static spatial::SpatCompute createReductionCompute(Value partialPieces,
|
||||
Value bias,
|
||||
RankedTensorType partialPiecesType,
|
||||
RankedTensorType outType,
|
||||
RankedTensorType paddedOutType,
|
||||
int64_t numKSlices,
|
||||
ConversionPatternRewriter& rewriter,
|
||||
Location loc) {
|
||||
static FailureOr<spatial::SpatCompute> createReductionCompute(Value partialPieces,
|
||||
Value bias,
|
||||
RankedTensorType partialPiecesType,
|
||||
RankedTensorType outType,
|
||||
RankedTensorType paddedOutType,
|
||||
int64_t numKSlices,
|
||||
ConversionPatternRewriter& rewriter,
|
||||
Location loc) {
|
||||
SmallVector<Value> inputs {partialPieces};
|
||||
if (bias)
|
||||
inputs.push_back(bias);
|
||||
|
||||
auto computeOp = createSpatCompute(rewriter, loc, TypeRange {outType}, {}, inputs, [&](ValueRange blockArgs) {
|
||||
Value partialPiecesArg = blockArgs[0];
|
||||
Value biasArg = bias ? blockArgs[1] : Value();
|
||||
if (biasArg && cast<RankedTensorType>(biasArg.getType()) != paddedOutType)
|
||||
biasArg = createZeroPaddedTensor(biasArg, paddedOutType, rewriter, loc);
|
||||
auto computeOp =
|
||||
createSpatCompute(rewriter, loc, TypeRange {outType}, {}, inputs, [&](ValueRange blockArgs) -> LogicalResult {
|
||||
Value partialPiecesArg = blockArgs[0];
|
||||
Value biasArg = bias ? blockArgs[1] : Value();
|
||||
if (biasArg && cast<RankedTensorType>(biasArg.getType()) != paddedOutType)
|
||||
biasArg = createZeroPaddedTensor(biasArg, paddedOutType, rewriter, loc);
|
||||
|
||||
const int64_t numOutRows = outType.getDimSize(0);
|
||||
const int64_t numOutHSlices = ceilIntegerDivide(outType.getDimSize(1), crossbarSize.getValue());
|
||||
auto pieceType = RankedTensorType::get({numOutRows, static_cast<int64_t>(crossbarSize.getValue())},
|
||||
partialPiecesType.getElementType());
|
||||
const int64_t numOutRows = outType.getDimSize(0);
|
||||
const int64_t numOutHSlices = ceilIntegerDivide(outType.getDimSize(1), crossbarSize.getValue());
|
||||
auto pieceType = RankedTensorType::get({numOutRows, static_cast<int64_t>(crossbarSize.getValue())},
|
||||
partialPiecesType.getElementType());
|
||||
|
||||
Value outputInit =
|
||||
tensor::EmptyOp::create(rewriter, loc, paddedOutType.getShape(), paddedOutType.getElementType()).getResult();
|
||||
SmallVector<OpFoldResult> unitStrides {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
|
||||
SmallVector<OpFoldResult> pieceSizes {rewriter.getIndexAttr(numOutRows),
|
||||
rewriter.getIndexAttr(crossbarSize.getValue())};
|
||||
Value outputInit =
|
||||
tensor::EmptyOp::create(rewriter, loc, paddedOutType.getShape(), paddedOutType.getElementType()).getResult();
|
||||
SmallVector<OpFoldResult> unitStrides {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
|
||||
SmallVector<OpFoldResult> pieceSizes {rewriter.getIndexAttr(numOutRows),
|
||||
rewriter.getIndexAttr(crossbarSize.getValue())};
|
||||
|
||||
auto buildOutputSlice = [&](Value outputAcc, Value hSlice) -> Value {
|
||||
Value reduced =
|
||||
reducePartialPiecesForHSlice(partialPiecesArg, hSlice, pieceType, numKSlices, numOutRows, rewriter, loc);
|
||||
Value hOffset = onnx_mlir::affineMulConst(
|
||||
rewriter, loc, hSlice, crossbarSize.getValue(), rewriter.getInsertionBlock()->getParentOp());
|
||||
if (biasArg) {
|
||||
SmallVector<OpFoldResult> biasOffsets {rewriter.getIndexAttr(0), hOffset};
|
||||
Value biasSlice =
|
||||
tensor::ExtractSliceOp::create(rewriter, loc, pieceType, biasArg, biasOffsets, pieceSizes, unitStrides)
|
||||
.getResult();
|
||||
reduced = spatial::SpatVAddOp::create(rewriter, loc, pieceType, reduced, biasSlice).getResult();
|
||||
auto buildOutputSlice = [&](Value outputAcc, Value hSlice) -> Value {
|
||||
Value reduced =
|
||||
reducePartialPiecesForHSlice(partialPiecesArg, hSlice, pieceType, numKSlices, numOutRows, rewriter, loc);
|
||||
Value hOffset = onnx_mlir::affineMulConst(
|
||||
rewriter, loc, hSlice, crossbarSize.getValue(), rewriter.getInsertionBlock()->getParentOp());
|
||||
if (biasArg) {
|
||||
SmallVector<OpFoldResult> biasOffsets {rewriter.getIndexAttr(0), hOffset};
|
||||
Value biasSlice =
|
||||
tensor::ExtractSliceOp::create(rewriter, loc, pieceType, biasArg, biasOffsets, pieceSizes, unitStrides)
|
||||
.getResult();
|
||||
reduced = spatial::SpatVAddOp::create(rewriter, loc, pieceType, reduced, biasSlice).getResult();
|
||||
}
|
||||
|
||||
SmallVector<OpFoldResult> outputOffsets {rewriter.getIndexAttr(0), hOffset};
|
||||
return tensor::InsertSliceOp::create(rewriter, loc, reduced, outputAcc, outputOffsets, pieceSizes, unitStrides)
|
||||
.getResult();
|
||||
};
|
||||
|
||||
Value paddedOutput = outputInit;
|
||||
if (numOutHSlices == 1) {
|
||||
Value hSlice = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 0);
|
||||
paddedOutput = buildOutputSlice(outputInit, hSlice);
|
||||
}
|
||||
else {
|
||||
Value c0 = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 0);
|
||||
Value c1 = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 1);
|
||||
Value cOutHSlices =
|
||||
getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), numOutHSlices);
|
||||
auto hLoop = buildNormalizedScfFor(
|
||||
rewriter,
|
||||
loc,
|
||||
c0,
|
||||
cOutHSlices,
|
||||
c1,
|
||||
ValueRange {outputInit},
|
||||
[&](OpBuilder&, Location, Value hSlice, ValueRange iterArgs, SmallVectorImpl<Value>& yielded) {
|
||||
yielded.push_back(buildOutputSlice(iterArgs.front(), hSlice));
|
||||
return success();
|
||||
});
|
||||
if (failed(hLoop))
|
||||
return failure();
|
||||
paddedOutput = hLoop->results.front();
|
||||
}
|
||||
|
||||
SmallVector<OpFoldResult> outputOffsets {rewriter.getIndexAttr(0), hOffset};
|
||||
return tensor::InsertSliceOp::create(rewriter, loc, reduced, outputAcc, outputOffsets, pieceSizes, unitStrides)
|
||||
.getResult();
|
||||
};
|
||||
|
||||
Value paddedOutput = outputInit;
|
||||
if (numOutHSlices == 1) {
|
||||
Value hSlice = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 0);
|
||||
paddedOutput = buildOutputSlice(outputInit, hSlice);
|
||||
}
|
||||
else {
|
||||
Value c0 = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 0);
|
||||
Value c1 = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 1);
|
||||
Value cOutHSlices =
|
||||
getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), numOutHSlices);
|
||||
auto hLoop = scf::ForOp::create(rewriter, loc, c0, cOutHSlices, c1, ValueRange {outputInit});
|
||||
rewriter.setInsertionPointToStart(hLoop.getBody());
|
||||
|
||||
Value hSlice = hLoop.getInductionVar();
|
||||
Value outputAcc = hLoop.getRegionIterArgs().front();
|
||||
scf::YieldOp::create(rewriter, loc, buildOutputSlice(outputAcc, hSlice));
|
||||
|
||||
rewriter.setInsertionPointAfter(hLoop);
|
||||
paddedOutput = hLoop.getResult(0);
|
||||
}
|
||||
|
||||
Value result = paddedOutput;
|
||||
if (paddedOutType != outType) {
|
||||
SmallVector<OpFoldResult> outputOffsets {rewriter.getIndexAttr(0), rewriter.getIndexAttr(0)};
|
||||
SmallVector<OpFoldResult> outputSizes {rewriter.getIndexAttr(outType.getDimSize(0)),
|
||||
rewriter.getIndexAttr(outType.getDimSize(1))};
|
||||
result =
|
||||
tensor::ExtractSliceOp::create(rewriter, loc, outType, paddedOutput, outputOffsets, outputSizes, unitStrides)
|
||||
.getResult();
|
||||
}
|
||||
spatial::SpatYieldOp::create(rewriter, loc, result);
|
||||
});
|
||||
Value result = paddedOutput;
|
||||
if (paddedOutType != outType) {
|
||||
SmallVector<OpFoldResult> outputOffsets {rewriter.getIndexAttr(0), rewriter.getIndexAttr(0)};
|
||||
SmallVector<OpFoldResult> outputSizes {rewriter.getIndexAttr(outType.getDimSize(0)),
|
||||
rewriter.getIndexAttr(outType.getDimSize(1))};
|
||||
result =
|
||||
tensor::ExtractSliceOp::create(rewriter, loc, outType, paddedOutput, outputOffsets, outputSizes, unitStrides)
|
||||
.getResult();
|
||||
}
|
||||
spatial::SpatYieldOp::create(rewriter, loc, result);
|
||||
return success();
|
||||
});
|
||||
|
||||
return computeOp;
|
||||
}
|
||||
@@ -755,9 +775,13 @@ LogicalResult GemmToSpatialComputes::matchAndRewrite(ONNXGemmOp gemmOp,
|
||||
auto scalarPiecesType = RankedTensorType::get({laneCount64, 1}, outType.getElementType());
|
||||
auto batchOp =
|
||||
createVvdmulBatch(a, b, aType, bType, scalarPiecesType, outType, gemmOpAdaptor.getTransB(), rewriter, loc);
|
||||
if (failed(batchOp))
|
||||
return failure();
|
||||
auto outputCompute = createDynamicGemmOutputCompute(
|
||||
batchOp.getResult(0), hasC ? c : Value(), scalarPiecesType, biasType, outType, alpha, beta, rewriter, loc);
|
||||
rewriter.replaceOp(gemmOp, outputCompute.getResults());
|
||||
batchOp->getResult(0), hasC ? c : Value(), scalarPiecesType, biasType, outType, alpha, beta, rewriter, loc);
|
||||
if (failed(outputCompute))
|
||||
return failure();
|
||||
rewriter.replaceOp(gemmOp, outputCompute->getResults());
|
||||
return success();
|
||||
}
|
||||
|
||||
@@ -832,10 +856,14 @@ LogicalResult GemmToSpatialComputes::matchAndRewrite(ONNXGemmOp gemmOp,
|
||||
RankedTensorType::get({laneCount64, static_cast<int64_t>(crossbarSize.getValue())}, outType.getElementType());
|
||||
auto batchOp =
|
||||
createVmmBatch(a, b, aType, paddedBType, partialPiecesType, numOutRows, numKSlices, numOutHSlices, rewriter, loc);
|
||||
if (failed(batchOp))
|
||||
return failure();
|
||||
auto reductionCompute = createReductionCompute(
|
||||
batchOp.getResult(0), bias, partialPiecesType, outType, paddedOutType, numKSlices, rewriter, loc);
|
||||
batchOp->getResult(0), bias, partialPiecesType, outType, paddedOutType, numKSlices, rewriter, loc);
|
||||
if (failed(reductionCompute))
|
||||
return failure();
|
||||
|
||||
rewriter.replaceOp(gemmOp, reductionCompute.getResults());
|
||||
rewriter.replaceOp(gemmOp, reductionCompute->getResults());
|
||||
return success();
|
||||
}
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
|
||||
#include "src/Accelerators/PIM/Common/IR/AffineUtils.hpp"
|
||||
#include "src/Accelerators/PIM/Common/IR/LoopUtils.hpp"
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Patterns.hpp"
|
||||
@@ -281,18 +282,18 @@ static Value getBatchLaneIndex(
|
||||
rewriter, loc, lane, numOutRows * numKSlices * numOutHSlices, rewriter.getInsertionBlock()->getParentOp());
|
||||
}
|
||||
|
||||
static spatial::SpatComputeBatch createBatchedVmmBatch(Value a,
|
||||
Value b,
|
||||
RankedTensorType aType,
|
||||
int64_t aBatchCount,
|
||||
RankedTensorType bType,
|
||||
int64_t bBatchCount,
|
||||
RankedTensorType partialPiecesType,
|
||||
int64_t numOutRows,
|
||||
int64_t numKSlices,
|
||||
int64_t numOutHSlices,
|
||||
PatternRewriter& rewriter,
|
||||
Location loc) {
|
||||
static FailureOr<spatial::SpatComputeBatch> createBatchedVmmBatch(Value a,
|
||||
Value b,
|
||||
RankedTensorType aType,
|
||||
int64_t aBatchCount,
|
||||
RankedTensorType bType,
|
||||
int64_t bBatchCount,
|
||||
RankedTensorType partialPiecesType,
|
||||
int64_t numOutRows,
|
||||
int64_t numKSlices,
|
||||
int64_t numOutHSlices,
|
||||
PatternRewriter& rewriter,
|
||||
Location loc) {
|
||||
const int64_t laneCount = partialPiecesType.getDimSize(0);
|
||||
auto batchOp = createSpatComputeBatch(
|
||||
rewriter,
|
||||
@@ -331,7 +332,8 @@ static spatial::SpatComputeBatch createBatchedVmmBatch(Value a,
|
||||
createParallelInsertSliceIntoBatchOutput(
|
||||
rewriter, loc, piece, args.outputs.front(), pieceOffsets, pieceSizes, getUnitStrides(rewriter, 2));
|
||||
});
|
||||
assert(succeeded(batchOp) && "expected batched MatMul VMM construction to succeed");
|
||||
if (failed(batchOp))
|
||||
return failure();
|
||||
return *batchOp;
|
||||
}
|
||||
|
||||
@@ -422,17 +424,17 @@ static Value extractDynamicBatchedRowVector(Value matrix,
|
||||
});
|
||||
}
|
||||
|
||||
static spatial::SpatComputeBatch createBatchedVvdmulBatch(Value a,
|
||||
int64_t aBatchCount,
|
||||
Value b,
|
||||
int64_t bBatchCount,
|
||||
RankedTensorType aType,
|
||||
RankedTensorType bType,
|
||||
RankedTensorType scalarPiecesType,
|
||||
RankedTensorType outType,
|
||||
bool bAlreadyTransposed,
|
||||
PatternRewriter& rewriter,
|
||||
Location loc) {
|
||||
static FailureOr<spatial::SpatComputeBatch> createBatchedVvdmulBatch(Value a,
|
||||
int64_t aBatchCount,
|
||||
Value b,
|
||||
int64_t bBatchCount,
|
||||
RankedTensorType aType,
|
||||
RankedTensorType bType,
|
||||
RankedTensorType scalarPiecesType,
|
||||
RankedTensorType outType,
|
||||
bool bAlreadyTransposed,
|
||||
PatternRewriter& rewriter,
|
||||
Location loc) {
|
||||
const int64_t numBatches = outType.getDimSize(0);
|
||||
const int64_t numOutRows = outType.getDimSize(1);
|
||||
const int64_t numOutCols = outType.getDimSize(2);
|
||||
@@ -466,64 +468,73 @@ static spatial::SpatComputeBatch createBatchedVvdmulBatch(Value a,
|
||||
createParallelInsertSliceIntoBatchOutput(
|
||||
rewriter, loc, scalar, args.outputs.front(), outputOffsets, scalarSizes, getUnitStrides(rewriter, 2));
|
||||
});
|
||||
assert(succeeded(batchOp) && "expected batched MatMul VVDMul construction to succeed");
|
||||
if (failed(batchOp))
|
||||
return failure();
|
||||
return *batchOp;
|
||||
}
|
||||
|
||||
static Value createBatchedDynamicOutputCompute(Value scalarPieces,
|
||||
RankedTensorType scalarPiecesType,
|
||||
RankedTensorType outType,
|
||||
PatternRewriter& rewriter,
|
||||
Location loc) {
|
||||
static FailureOr<Value> createBatchedDynamicOutputCompute(Value scalarPieces,
|
||||
RankedTensorType scalarPiecesType,
|
||||
RankedTensorType outType,
|
||||
PatternRewriter& rewriter,
|
||||
Location loc) {
|
||||
const int64_t laneCount = scalarPiecesType.getDimSize(0);
|
||||
const int64_t numOutRows = outType.getDimSize(1);
|
||||
const int64_t numOutCols = outType.getDimSize(2);
|
||||
auto scalarType = RankedTensorType::get({1, 1}, outType.getElementType());
|
||||
auto outputScalarType = RankedTensorType::get({1, 1, 1}, outType.getElementType());
|
||||
|
||||
auto computeOp =
|
||||
createSpatCompute<1>(rewriter, loc, TypeRange {outType}, {}, ValueRange {scalarPieces}, [&](Value pieces) {
|
||||
auto computeOp = createSpatCompute<1>(
|
||||
rewriter, loc, TypeRange {outType}, {}, ValueRange {scalarPieces}, [&](Value pieces) -> LogicalResult {
|
||||
Value outputInit =
|
||||
tensor::EmptyOp::create(rewriter, loc, outType.getShape(), outType.getElementType()).getResult();
|
||||
Value c0 = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 0);
|
||||
Value c1 = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 1);
|
||||
Value cLaneCount = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), laneCount);
|
||||
auto loop = scf::ForOp::create(rewriter, loc, c0, cLaneCount, c1, ValueRange {outputInit});
|
||||
rewriter.setInsertionPointToStart(loop.getBody());
|
||||
|
||||
Value lane = loop.getInductionVar();
|
||||
Value outputAcc = loop.getRegionIterArgs().front();
|
||||
Operation* anchorOp = rewriter.getInsertionBlock()->getParentOp();
|
||||
Value batch = affineFloorDivConst(rewriter, loc, lane, numOutRows * numOutCols, anchorOp);
|
||||
Value batchLane = affineModConst(rewriter, loc, lane, numOutRows * numOutCols, anchorOp);
|
||||
Value row = affineFloorDivConst(rewriter, loc, batchLane, numOutCols, anchorOp);
|
||||
Value column = affineModConst(rewriter, loc, batchLane, numOutCols, anchorOp);
|
||||
SmallVector<OpFoldResult> scalarOffsets {lane, rewriter.getIndexAttr(0)};
|
||||
SmallVector<OpFoldResult> scalarSizes {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
|
||||
Value scalar = tensor::ExtractSliceOp::create(
|
||||
rewriter, loc, scalarType, pieces, scalarOffsets, scalarSizes, getUnitStrides(rewriter, 2));
|
||||
Value expanded = tensor::ExpandShapeOp::create(rewriter,
|
||||
loc,
|
||||
outputScalarType,
|
||||
scalar,
|
||||
SmallVector<ReassociationIndices> {
|
||||
{0},
|
||||
{1, 2}
|
||||
});
|
||||
SmallVector<OpFoldResult> outputOffsets {batch, row, column};
|
||||
SmallVector<OpFoldResult> outputSizes {
|
||||
rewriter.getIndexAttr(1), rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
|
||||
scf::YieldOp::create(
|
||||
auto loop = buildNormalizedScfFor(
|
||||
rewriter,
|
||||
loc,
|
||||
tensor::InsertSliceOp::create(
|
||||
rewriter, loc, expanded, outputAcc, outputOffsets, outputSizes, getUnitStrides(rewriter, 3))
|
||||
.getResult());
|
||||
|
||||
rewriter.setInsertionPointAfter(loop);
|
||||
spatial::SpatYieldOp::create(rewriter, loc, loop.getResult(0));
|
||||
c0,
|
||||
cLaneCount,
|
||||
c1,
|
||||
ValueRange {outputInit},
|
||||
[&](OpBuilder&, Location nestedLoc, Value lane, ValueRange iterArgs, SmallVectorImpl<Value>& yielded) {
|
||||
Value outputAcc = iterArgs.front();
|
||||
Operation* anchorOp = rewriter.getInsertionBlock()->getParentOp();
|
||||
Value batch = affineFloorDivConst(rewriter, nestedLoc, lane, numOutRows * numOutCols, anchorOp);
|
||||
Value batchLane = affineModConst(rewriter, nestedLoc, lane, numOutRows * numOutCols, anchorOp);
|
||||
Value row = affineFloorDivConst(rewriter, nestedLoc, batchLane, numOutCols, anchorOp);
|
||||
Value column = affineModConst(rewriter, nestedLoc, batchLane, numOutCols, anchorOp);
|
||||
SmallVector<OpFoldResult> scalarOffsets {lane, rewriter.getIndexAttr(0)};
|
||||
SmallVector<OpFoldResult> scalarSizes {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
|
||||
Value scalar = tensor::ExtractSliceOp::create(
|
||||
rewriter, nestedLoc, scalarType, pieces, scalarOffsets, scalarSizes, getUnitStrides(rewriter, 2));
|
||||
Value expanded = tensor::ExpandShapeOp::create(rewriter,
|
||||
nestedLoc,
|
||||
outputScalarType,
|
||||
scalar,
|
||||
SmallVector<ReassociationIndices> {
|
||||
{0},
|
||||
{1, 2}
|
||||
});
|
||||
SmallVector<OpFoldResult> outputOffsets {batch, row, column};
|
||||
SmallVector<OpFoldResult> outputSizes = {
|
||||
rewriter.getIndexAttr(1), rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
|
||||
Value next =
|
||||
tensor::InsertSliceOp::create(
|
||||
rewriter, nestedLoc, expanded, outputAcc, outputOffsets, outputSizes, getUnitStrides(rewriter, 3))
|
||||
.getResult();
|
||||
yielded.push_back(next);
|
||||
return success();
|
||||
});
|
||||
if (failed(loop))
|
||||
return failure();
|
||||
spatial::SpatYieldOp::create(rewriter, loc, loop->results.front());
|
||||
return success();
|
||||
});
|
||||
return computeOp.getResult(0);
|
||||
if (failed(computeOp))
|
||||
return failure();
|
||||
return computeOp->getResult(0);
|
||||
}
|
||||
|
||||
static Value transposeBatchedOutput(Value value, RankedTensorType outputType, PatternRewriter& rewriter, Location loc) {
|
||||
@@ -587,16 +598,16 @@ static Value reduceBatchedPartialPiecesForHSlice(Value partialPiecesArg,
|
||||
return activePieces.front();
|
||||
}
|
||||
|
||||
static Value createBatchedReductionCompute(Value partialPieces,
|
||||
RankedTensorType partialPiecesType,
|
||||
RankedTensorType outType,
|
||||
RankedTensorType paddedOutType,
|
||||
int64_t numBatches,
|
||||
int64_t numKSlices,
|
||||
PatternRewriter& rewriter,
|
||||
Location loc) {
|
||||
static FailureOr<Value> createBatchedReductionCompute(Value partialPieces,
|
||||
RankedTensorType partialPiecesType,
|
||||
RankedTensorType outType,
|
||||
RankedTensorType paddedOutType,
|
||||
int64_t numBatches,
|
||||
int64_t numKSlices,
|
||||
PatternRewriter& rewriter,
|
||||
Location loc) {
|
||||
auto computeOp = createSpatCompute<1>(
|
||||
rewriter, loc, TypeRange {outType}, {}, ValueRange {partialPieces}, [&](Value partialPiecesArg) {
|
||||
rewriter, loc, TypeRange {outType}, {}, ValueRange {partialPieces}, [&](Value partialPiecesArg) -> LogicalResult {
|
||||
const int64_t numOutRows = outType.getDimSize(1);
|
||||
const int64_t numOutHSlices = ceilIntegerDivide(outType.getDimSize(2), crossbarSize.getValue());
|
||||
auto pieceType = RankedTensorType::get({numOutRows, static_cast<int64_t>(crossbarSize.getValue())},
|
||||
@@ -612,43 +623,55 @@ static Value createBatchedReductionCompute(Value partialPieces,
|
||||
Value cNumOutHSlices =
|
||||
getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), numOutHSlices);
|
||||
|
||||
auto batchLoop = scf::ForOp::create(rewriter, loc, c0, cNumBatches, c1, ValueRange {outputInit});
|
||||
rewriter.setInsertionPointToStart(batchLoop.getBody());
|
||||
Value batch = batchLoop.getInductionVar();
|
||||
Value batchAcc = batchLoop.getRegionIterArgs().front();
|
||||
|
||||
auto hLoop = scf::ForOp::create(rewriter, loc, c0, cNumOutHSlices, c1, ValueRange {batchAcc});
|
||||
rewriter.setInsertionPointToStart(hLoop.getBody());
|
||||
Value hSlice = hLoop.getInductionVar();
|
||||
Value outputAcc = hLoop.getRegionIterArgs().front();
|
||||
|
||||
Value reduced = reduceBatchedPartialPiecesForHSlice(
|
||||
partialPiecesArg, batch, hSlice, pieceType, numKSlices, numOutHSlices, numOutRows, rewriter, loc);
|
||||
Value expandedReduced = tensor::ExpandShapeOp::create(rewriter,
|
||||
loc,
|
||||
outputSliceType,
|
||||
reduced,
|
||||
SmallVector<ReassociationIndices> {
|
||||
{0, 1},
|
||||
{2}
|
||||
});
|
||||
Value hOffset =
|
||||
affineMulConst(rewriter, loc, hSlice, crossbarSize.getValue(), rewriter.getInsertionBlock()->getParentOp());
|
||||
SmallVector<OpFoldResult> outputOffsets {batch, rewriter.getIndexAttr(0), hOffset};
|
||||
SmallVector<OpFoldResult> outputSizes {
|
||||
rewriter.getIndexAttr(1), rewriter.getIndexAttr(numOutRows), rewriter.getIndexAttr(crossbarSize.getValue())};
|
||||
scf::YieldOp::create(
|
||||
auto batchLoop = buildNormalizedScfFor(
|
||||
rewriter,
|
||||
loc,
|
||||
tensor::InsertSliceOp::create(
|
||||
rewriter, loc, expandedReduced, outputAcc, outputOffsets, outputSizes, getUnitStrides(rewriter, 3))
|
||||
.getResult());
|
||||
|
||||
rewriter.setInsertionPointAfter(hLoop);
|
||||
scf::YieldOp::create(rewriter, loc, hLoop.getResult(0));
|
||||
|
||||
rewriter.setInsertionPointAfter(batchLoop);
|
||||
Value paddedOutput = batchLoop.getResult(0);
|
||||
c0,
|
||||
cNumBatches,
|
||||
c1,
|
||||
ValueRange {outputInit},
|
||||
[&](
|
||||
OpBuilder&, Location batchLoc, Value batch, ValueRange batchIterArgs, SmallVectorImpl<Value>& batchYielded) {
|
||||
auto hLoop = buildNormalizedScfFor(
|
||||
rewriter,
|
||||
batchLoc,
|
||||
c0,
|
||||
cNumOutHSlices,
|
||||
c1,
|
||||
ValueRange {batchIterArgs.front()},
|
||||
[&](OpBuilder&, Location hLoc, Value hSlice, ValueRange hIterArgs, SmallVectorImpl<Value>& hYielded) {
|
||||
Value outputAcc = hIterArgs.front();
|
||||
Value reduced = reduceBatchedPartialPiecesForHSlice(
|
||||
partialPiecesArg, batch, hSlice, pieceType, numKSlices, numOutHSlices, numOutRows, rewriter, hLoc);
|
||||
Value expandedReduced = tensor::ExpandShapeOp::create(rewriter,
|
||||
hLoc,
|
||||
outputSliceType,
|
||||
reduced,
|
||||
SmallVector<ReassociationIndices> {
|
||||
{0, 1},
|
||||
{2}
|
||||
});
|
||||
Value hOffset = affineMulConst(
|
||||
rewriter, hLoc, hSlice, crossbarSize.getValue(), rewriter.getInsertionBlock()->getParentOp());
|
||||
SmallVector<OpFoldResult> outputOffsets {batch, rewriter.getIndexAttr(0), hOffset};
|
||||
SmallVector<OpFoldResult> outputSizes {rewriter.getIndexAttr(1),
|
||||
rewriter.getIndexAttr(numOutRows),
|
||||
rewriter.getIndexAttr(crossbarSize.getValue())};
|
||||
Value next =
|
||||
tensor::InsertSliceOp::create(
|
||||
rewriter, hLoc, expandedReduced, outputAcc, outputOffsets, outputSizes, getUnitStrides(rewriter, 3))
|
||||
.getResult();
|
||||
hYielded.push_back(next);
|
||||
return success();
|
||||
});
|
||||
if (failed(hLoop))
|
||||
return failure();
|
||||
batchYielded.push_back(hLoop->results.front());
|
||||
return success();
|
||||
});
|
||||
if (failed(batchLoop))
|
||||
return failure();
|
||||
Value paddedOutput = batchLoop->results.front();
|
||||
Value result = paddedOutput;
|
||||
if (paddedOutType != outType) {
|
||||
SmallVector<OpFoldResult> outputOffsets {
|
||||
@@ -660,8 +683,11 @@ static Value createBatchedReductionCompute(Value partialPieces,
|
||||
rewriter, loc, outType, paddedOutput, outputOffsets, outputSizes, getUnitStrides(rewriter, 3));
|
||||
}
|
||||
spatial::SpatYieldOp::create(rewriter, loc, result);
|
||||
return success();
|
||||
});
|
||||
return computeOp.getResult(0);
|
||||
if (failed(computeOp))
|
||||
return failure();
|
||||
return computeOp->getResult(0);
|
||||
}
|
||||
|
||||
struct MatMulShapeInfo {
|
||||
@@ -841,22 +867,27 @@ struct MatMulBatchedToSpatialComputes : OpRewritePattern<ONNXMatMulOp> {
|
||||
numOutHSlices,
|
||||
rewriter,
|
||||
loc);
|
||||
Value result = createBatchedReductionCompute(batchOp.getResult(0),
|
||||
partialPiecesType,
|
||||
directOutType,
|
||||
paddedOutType,
|
||||
shapeInfo->batch,
|
||||
numKSlices,
|
||||
rewriter,
|
||||
loc);
|
||||
if (failed(batchOp))
|
||||
return failure();
|
||||
auto result = createBatchedReductionCompute(batchOp->getResult(0),
|
||||
partialPiecesType,
|
||||
directOutType,
|
||||
paddedOutType,
|
||||
shapeInfo->batch,
|
||||
numKSlices,
|
||||
rewriter,
|
||||
loc);
|
||||
if (failed(result))
|
||||
return failure();
|
||||
Value finalResult = *result;
|
||||
if (useTransposedForm)
|
||||
result = transposeBatchedOutput(
|
||||
result,
|
||||
finalResult = transposeBatchedOutput(
|
||||
finalResult,
|
||||
RankedTensorType::get({shapeInfo->batch, shapeInfo->m, shapeInfo->n}, shapeInfo->outType.getElementType()),
|
||||
rewriter,
|
||||
loc);
|
||||
result = expandBatchDims(result, shapeInfo->outType, shapeInfo->batchShape.size(), rewriter, loc);
|
||||
rewriter.replaceOp(matmulOp, result);
|
||||
finalResult = expandBatchDims(finalResult, shapeInfo->outType, shapeInfo->batchShape.size(), rewriter, loc);
|
||||
rewriter.replaceOp(matmulOp, finalResult);
|
||||
return success();
|
||||
}
|
||||
}
|
||||
@@ -873,16 +904,21 @@ struct MatMulBatchedToSpatialComputes : OpRewritePattern<ONNXMatMulOp> {
|
||||
false,
|
||||
rewriter,
|
||||
loc);
|
||||
Value result =
|
||||
createBatchedDynamicOutputCompute(batchOp.getResult(0), scalarPiecesType, directOutType, rewriter, loc);
|
||||
if (failed(batchOp))
|
||||
return failure();
|
||||
auto result =
|
||||
createBatchedDynamicOutputCompute(batchOp->getResult(0), scalarPiecesType, directOutType, rewriter, loc);
|
||||
if (failed(result))
|
||||
return failure();
|
||||
Value finalResult = *result;
|
||||
if (useTransposedForm)
|
||||
result = transposeBatchedOutput(
|
||||
result,
|
||||
finalResult = transposeBatchedOutput(
|
||||
finalResult,
|
||||
RankedTensorType::get({shapeInfo->batch, shapeInfo->m, shapeInfo->n}, shapeInfo->outType.getElementType()),
|
||||
rewriter,
|
||||
loc);
|
||||
result = expandBatchDims(result, shapeInfo->outType, shapeInfo->batchShape.size(), rewriter, loc);
|
||||
rewriter.replaceOp(matmulOp, result);
|
||||
finalResult = expandBatchDims(finalResult, shapeInfo->outType, shapeInfo->batchShape.size(), rewriter, loc);
|
||||
rewriter.replaceOp(matmulOp, finalResult);
|
||||
return success();
|
||||
}
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user