add shared loop creation helpers
Validate Operations / validate-operations (push) Has been cancelled

add shared checked arithmetic helpers
refactor pim passes into Pim/Transforms
more robust memory coalescing pass
This commit is contained in:
NiccoloN
2026-06-01 16:49:06 +02:00
parent 356be6ccc2
commit 636310d0cb
55 changed files with 2007 additions and 1103 deletions
@@ -8,6 +8,7 @@
#include <algorithm>
#include "src/Accelerators/PIM/Common/IR/LoopUtils.hpp"
#include "src/Accelerators/PIM/Common/Support/Diagnostics.hpp"
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
@@ -305,58 +306,67 @@ static Value createIm2colRowComputes(Value x,
auto cStrideHeight = getOrCreateIndexConstant(rewriter, anchorOp, strideHeight);
auto cStrideWidth = getOrCreateIndexConstant(rewriter, anchorOp, strideWidth);
auto im2colLoop = scf::ForOp::create(rewriter, loc, c0, cNumPatches, c1, ValueRange {im2colInit});
rewriter.setInsertionPointToStart(im2colLoop.getBody());
auto im2colLoop = buildNormalizedScfFor(
rewriter,
loc,
c0,
cNumPatches,
c1,
ValueRange {im2colInit},
[&](OpBuilder&, Location nestedLoc, Value patchIndex, ValueRange iterArgs, SmallVectorImpl<Value>& yielded) {
Value im2colAcc = iterArgs.front();
Value batchIndex = arith::DivUIOp::create(rewriter, nestedLoc, patchIndex, cNumPatchesPerBatch);
Value batchPatchIndex = arith::RemUIOp::create(rewriter, nestedLoc, patchIndex, cNumPatchesPerBatch);
Value outHeightIndex = arith::DivUIOp::create(rewriter, nestedLoc, batchPatchIndex, cOutWidth);
Value outWidthIndex = arith::RemUIOp::create(rewriter, nestedLoc, batchPatchIndex, cOutWidth);
Value inputHeightOffset = arith::MulIOp::create(rewriter, nestedLoc, outHeightIndex, cStrideHeight);
Value inputWidthOffset = arith::MulIOp::create(rewriter, nestedLoc, outWidthIndex, cStrideWidth);
Value patchIndex = im2colLoop.getInductionVar();
Value im2colAcc = im2colLoop.getRegionIterArgs().front();
SmallVector<OpFoldResult> offsets = {
batchIndex, rewriter.getIndexAttr(0), inputHeightOffset, inputWidthOffset};
SmallVector<OpFoldResult> sizes = {rewriter.getIndexAttr(1),
rewriter.getIndexAttr(numChannelsIn),
rewriter.getIndexAttr(wHeight),
rewriter.getIndexAttr(wWidth)};
SmallVector<OpFoldResult> strides = {rewriter.getIndexAttr(1),
rewriter.getIndexAttr(1),
rewriter.getIndexAttr(dilationHeight),
rewriter.getIndexAttr(dilationWidth)};
auto patchType = RankedTensorType::get({1, numChannelsIn, wHeight, wWidth}, elemType);
Value patch =
tensor::ExtractSliceOp::create(rewriter, nestedLoc, patchType, paddedInput, offsets, sizes, strides);
Value batchIndex = arith::DivUIOp::create(rewriter, loc, patchIndex, cNumPatchesPerBatch);
Value batchPatchIndex = arith::RemUIOp::create(rewriter, loc, patchIndex, cNumPatchesPerBatch);
Value outHeightIndex = arith::DivUIOp::create(rewriter, loc, batchPatchIndex, cOutWidth);
Value outWidthIndex = arith::RemUIOp::create(rewriter, loc, batchPatchIndex, cOutWidth);
Value inputHeightOffset = arith::MulIOp::create(rewriter, loc, outHeightIndex, cStrideHeight);
Value inputWidthOffset = arith::MulIOp::create(rewriter, loc, outWidthIndex, cStrideWidth);
Value row = tensor::CollapseShapeOp::create(rewriter,
nestedLoc,
im2colRowType,
patch,
SmallVector<ReassociationIndices> {
{0},
{1, 2, 3}
});
SmallVector<OpFoldResult> offsets = {batchIndex, rewriter.getIndexAttr(0), inputHeightOffset, inputWidthOffset};
SmallVector<OpFoldResult> sizes = {rewriter.getIndexAttr(1),
rewriter.getIndexAttr(numChannelsIn),
rewriter.getIndexAttr(wHeight),
rewriter.getIndexAttr(wWidth)};
SmallVector<OpFoldResult> strides = {rewriter.getIndexAttr(1),
rewriter.getIndexAttr(1),
rewriter.getIndexAttr(dilationHeight),
rewriter.getIndexAttr(dilationWidth)};
auto patchType = RankedTensorType::get({1, numChannelsIn, wHeight, wWidth}, elemType);
Value patch = tensor::ExtractSliceOp::create(rewriter, loc, patchType, paddedInput, offsets, sizes, strides);
Value row = tensor::CollapseShapeOp::create(rewriter,
loc,
im2colRowType,
patch,
SmallVector<ReassociationIndices> {
{0},
{1, 2, 3}
});
SmallVector<OpFoldResult> rowOffsets = {patchIndex, rewriter.getIndexAttr(0)};
SmallVector<OpFoldResult> rowSizes = {rewriter.getIndexAttr(1), rewriter.getIndexAttr(patchSize)};
SmallVector<OpFoldResult> rowStrides = {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
Value updatedIm2col =
tensor::InsertSliceOp::create(rewriter, loc, row, im2colAcc, rowOffsets, rowSizes, rowStrides);
scf::YieldOp::create(rewriter, loc, updatedIm2col);
rewriter.setInsertionPointAfter(im2colLoop);
Value im2col = im2colLoop.getResult(0);
SmallVector<OpFoldResult> rowOffsets = {patchIndex, rewriter.getIndexAttr(0)};
SmallVector<OpFoldResult> rowSizes = {rewriter.getIndexAttr(1), rewriter.getIndexAttr(patchSize)};
SmallVector<OpFoldResult> rowStrides = {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
Value updatedIm2col =
tensor::InsertSliceOp::create(rewriter, nestedLoc, row, im2colAcc, rowOffsets, rowSizes, rowStrides);
yielded.push_back(updatedIm2col);
return success();
});
if (failed(im2colLoop))
return failure();
Value im2col = im2colLoop->results.front();
Value gemmInputRows = im2col;
if (packFactor != 1)
gemmInputRows = packRowsForParallelGemm(im2col, im2colType, packFactor, rewriter, loc);
spatial::SpatYieldOp::create(rewriter, loc, gemmInputRows);
return success();
});
return im2colComputeOp.getResult(0);
assert(succeeded(im2colComputeOp) && "Conv im2col compute construction must succeed");
return im2colComputeOp->getResult(0);
}
static Value createCollectedConvOutput(ValueRange gemmRows,
@@ -15,6 +15,7 @@
#include "Common/IR/ConstantUtils.hpp"
#include "src/Accelerators/PIM/Common/IR/AffineUtils.hpp"
#include "src/Accelerators/PIM/Common/IR/LoopUtils.hpp"
#include "src/Accelerators/PIM/Common/IR/ShapeUtils.hpp"
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
#include "src/Accelerators/PIM/Common/Support/Diagnostics.hpp"
@@ -247,16 +248,16 @@ static Value createPaddedInputCompute(Value input,
return computeOp.getResult(0);
}
static spatial::SpatComputeBatch createVmmBatch(Value a,
Value b,
RankedTensorType aType,
RankedTensorType paddedBType,
RankedTensorType partialPiecesType,
int64_t numOutRows,
int64_t numKSlices,
int64_t numOutHSlices,
ConversionPatternRewriter& rewriter,
Location loc) {
static FailureOr<spatial::SpatComputeBatch> createVmmBatch(Value a,
Value b,
RankedTensorType aType,
RankedTensorType paddedBType,
RankedTensorType partialPiecesType,
int64_t numOutRows,
int64_t numKSlices,
int64_t numOutHSlices,
ConversionPatternRewriter& rewriter,
Location loc) {
const int64_t laneCount = partialPiecesType.getDimSize(0);
auto batchOp = createSpatComputeBatch(
rewriter,
@@ -294,7 +295,8 @@ static spatial::SpatComputeBatch createVmmBatch(Value a,
createParallelInsertSliceIntoBatchOutput(
rewriter, loc, piece, args.outputs.front(), pieceOffsets, pieceSizes, unitStrides);
});
assert(succeeded(batchOp) && "expected Gemm VMM batch construction to succeed");
if (failed(batchOp))
return failure();
return *batchOp;
}
@@ -416,15 +418,15 @@ static Value createBroadcastedBiasScalar(Value bias,
return tensor::SplatOp::create(rewriter, loc, scalarType, scalar).getResult();
}
static spatial::SpatComputeBatch createVvdmulBatch(Value a,
Value b,
RankedTensorType aType,
RankedTensorType bType,
RankedTensorType scalarPiecesType,
RankedTensorType outType,
bool bAlreadyTransposed,
ConversionPatternRewriter& rewriter,
Location loc) {
static FailureOr<spatial::SpatComputeBatch> createVvdmulBatch(Value a,
Value b,
RankedTensorType aType,
RankedTensorType bType,
RankedTensorType scalarPiecesType,
RankedTensorType outType,
bool bAlreadyTransposed,
ConversionPatternRewriter& rewriter,
Location loc) {
const int64_t numOutRows = outType.getDimSize(0);
const int64_t numOutCols = outType.getDimSize(1);
const int64_t reductionSize = aType.getDimSize(1);
@@ -454,26 +456,27 @@ static spatial::SpatComputeBatch createVvdmulBatch(Value a,
createParallelInsertSliceIntoBatchOutput(
rewriter, loc, scalar, args.outputs.front(), outputOffsets, scalarSizes, unitStrides);
});
assert(succeeded(batchOp) && "expected Gemm VVDMul batch construction to succeed");
if (failed(batchOp))
return failure();
return *batchOp;
}
static spatial::SpatCompute createDynamicGemmOutputCompute(Value scalarPieces,
Value bias,
RankedTensorType scalarPiecesType,
RankedTensorType biasType,
RankedTensorType outType,
float alpha,
float beta,
ConversionPatternRewriter& rewriter,
Location loc) {
static FailureOr<spatial::SpatCompute> createDynamicGemmOutputCompute(Value scalarPieces,
Value bias,
RankedTensorType scalarPiecesType,
RankedTensorType biasType,
RankedTensorType outType,
float alpha,
float beta,
ConversionPatternRewriter& rewriter,
Location loc) {
const int64_t laneCount = scalarPiecesType.getDimSize(0);
const int64_t numOutCols = outType.getDimSize(1);
SmallVector<Value> inputs {scalarPieces};
if (bias)
inputs.push_back(bias);
return createSpatCompute(rewriter, loc, TypeRange {outType}, {}, inputs, [&](ValueRange blockArgs) {
return createSpatCompute(rewriter, loc, TypeRange {outType}, {}, inputs, [&](ValueRange blockArgs) -> LogicalResult {
Value pieces = blockArgs[0];
Value biasArg = bias ? blockArgs[1] : Value();
auto scalarType = RankedTensorType::get({1, 1}, outType.getElementType());
@@ -481,40 +484,50 @@ static spatial::SpatCompute createDynamicGemmOutputCompute(Value scalarPieces,
Value c0 = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 0);
Value c1 = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 1);
Value cLaneCount = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), laneCount);
auto loop = scf::ForOp::create(rewriter, loc, c0, cLaneCount, c1, ValueRange {outputInit});
rewriter.setInsertionPointToStart(loop.getBody());
auto loop = buildNormalizedScfFor(
rewriter,
loc,
c0,
cLaneCount,
c1,
ValueRange {outputInit},
[&](OpBuilder&, Location nestedLoc, Value lane, ValueRange iterArgs, SmallVectorImpl<Value>& yielded) {
Value outputAcc = iterArgs.front();
Value row = createDynamicGemmBatchRow(lane, numOutCols, rewriter, nestedLoc);
Value column =
onnx_mlir::affineModConst(rewriter, nestedLoc, lane, numOutCols, rewriter.getInsertionBlock()->getParentOp());
SmallVector<OpFoldResult> scalarOffsets {lane, rewriter.getIndexAttr(0)};
SmallVector<OpFoldResult> scalarSizes {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
SmallVector<OpFoldResult> unitStrides {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
Value scalar = tensor::ExtractSliceOp::create(
rewriter, nestedLoc, scalarType, pieces, scalarOffsets, scalarSizes, unitStrides)
.getResult();
if (alpha != 1.0f) {
Value alphaTensor = createScalarTensorConstant(scalarType, alpha, rewriter, nestedLoc);
scalar = spatial::SpatVMulOp::create(rewriter, nestedLoc, scalarType, scalar, alphaTensor).getResult();
}
if (biasArg) {
Value biasScalar =
createBroadcastedBiasScalar(biasArg, biasType, row, column, scalarType, rewriter, nestedLoc);
if (beta != 1.0f) {
Value betaTensor = createScalarTensorConstant(scalarType, beta, rewriter, nestedLoc);
biasScalar =
spatial::SpatVMulOp::create(rewriter, nestedLoc, scalarType, biasScalar, betaTensor).getResult();
}
scalar = spatial::SpatVAddOp::create(rewriter, nestedLoc, scalarType, scalar, biasScalar).getResult();
}
SmallVector<OpFoldResult> outputOffsets {row, column};
Value outputNext =
tensor::InsertSliceOp::create(rewriter, nestedLoc, scalar, outputAcc, outputOffsets, scalarSizes, unitStrides)
.getResult();
yielded.push_back(outputNext);
return success();
});
if (failed(loop))
return failure();
Value lane = loop.getInductionVar();
Value outputAcc = loop.getRegionIterArgs().front();
Value row = createDynamicGemmBatchRow(lane, numOutCols, rewriter, loc);
Value column =
onnx_mlir::affineModConst(rewriter, loc, lane, numOutCols, rewriter.getInsertionBlock()->getParentOp());
SmallVector<OpFoldResult> scalarOffsets {lane, rewriter.getIndexAttr(0)};
SmallVector<OpFoldResult> scalarSizes {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
SmallVector<OpFoldResult> unitStrides {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
Value scalar =
tensor::ExtractSliceOp::create(rewriter, loc, scalarType, pieces, scalarOffsets, scalarSizes, unitStrides)
.getResult();
if (alpha != 1.0f) {
Value alphaTensor = createScalarTensorConstant(scalarType, alpha, rewriter, loc);
scalar = spatial::SpatVMulOp::create(rewriter, loc, scalarType, scalar, alphaTensor).getResult();
}
if (biasArg) {
Value biasScalar = createBroadcastedBiasScalar(biasArg, biasType, row, column, scalarType, rewriter, loc);
if (beta != 1.0f) {
Value betaTensor = createScalarTensorConstant(scalarType, beta, rewriter, loc);
biasScalar = spatial::SpatVMulOp::create(rewriter, loc, scalarType, biasScalar, betaTensor).getResult();
}
scalar = spatial::SpatVAddOp::create(rewriter, loc, scalarType, scalar, biasScalar).getResult();
}
SmallVector<OpFoldResult> outputOffsets {row, column};
Value outputNext =
tensor::InsertSliceOp::create(rewriter, loc, scalar, outputAcc, outputOffsets, scalarSizes, unitStrides)
.getResult();
scf::YieldOp::create(rewriter, loc, outputNext);
rewriter.setInsertionPointAfter(loop);
spatial::SpatYieldOp::create(rewriter, loc, loop.getResult(0));
spatial::SpatYieldOp::create(rewriter, loc, loop->results.front());
return success();
});
}
@@ -579,85 +592,92 @@ static Value reducePartialPiecesForHSlice(Value partialPiecesArg,
return activePieces.front();
}
static spatial::SpatCompute createReductionCompute(Value partialPieces,
Value bias,
RankedTensorType partialPiecesType,
RankedTensorType outType,
RankedTensorType paddedOutType,
int64_t numKSlices,
ConversionPatternRewriter& rewriter,
Location loc) {
static FailureOr<spatial::SpatCompute> createReductionCompute(Value partialPieces,
Value bias,
RankedTensorType partialPiecesType,
RankedTensorType outType,
RankedTensorType paddedOutType,
int64_t numKSlices,
ConversionPatternRewriter& rewriter,
Location loc) {
SmallVector<Value> inputs {partialPieces};
if (bias)
inputs.push_back(bias);
auto computeOp = createSpatCompute(rewriter, loc, TypeRange {outType}, {}, inputs, [&](ValueRange blockArgs) {
Value partialPiecesArg = blockArgs[0];
Value biasArg = bias ? blockArgs[1] : Value();
if (biasArg && cast<RankedTensorType>(biasArg.getType()) != paddedOutType)
biasArg = createZeroPaddedTensor(biasArg, paddedOutType, rewriter, loc);
auto computeOp =
createSpatCompute(rewriter, loc, TypeRange {outType}, {}, inputs, [&](ValueRange blockArgs) -> LogicalResult {
Value partialPiecesArg = blockArgs[0];
Value biasArg = bias ? blockArgs[1] : Value();
if (biasArg && cast<RankedTensorType>(biasArg.getType()) != paddedOutType)
biasArg = createZeroPaddedTensor(biasArg, paddedOutType, rewriter, loc);
const int64_t numOutRows = outType.getDimSize(0);
const int64_t numOutHSlices = ceilIntegerDivide(outType.getDimSize(1), crossbarSize.getValue());
auto pieceType = RankedTensorType::get({numOutRows, static_cast<int64_t>(crossbarSize.getValue())},
partialPiecesType.getElementType());
const int64_t numOutRows = outType.getDimSize(0);
const int64_t numOutHSlices = ceilIntegerDivide(outType.getDimSize(1), crossbarSize.getValue());
auto pieceType = RankedTensorType::get({numOutRows, static_cast<int64_t>(crossbarSize.getValue())},
partialPiecesType.getElementType());
Value outputInit =
tensor::EmptyOp::create(rewriter, loc, paddedOutType.getShape(), paddedOutType.getElementType()).getResult();
SmallVector<OpFoldResult> unitStrides {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
SmallVector<OpFoldResult> pieceSizes {rewriter.getIndexAttr(numOutRows),
rewriter.getIndexAttr(crossbarSize.getValue())};
Value outputInit =
tensor::EmptyOp::create(rewriter, loc, paddedOutType.getShape(), paddedOutType.getElementType()).getResult();
SmallVector<OpFoldResult> unitStrides {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
SmallVector<OpFoldResult> pieceSizes {rewriter.getIndexAttr(numOutRows),
rewriter.getIndexAttr(crossbarSize.getValue())};
auto buildOutputSlice = [&](Value outputAcc, Value hSlice) -> Value {
Value reduced =
reducePartialPiecesForHSlice(partialPiecesArg, hSlice, pieceType, numKSlices, numOutRows, rewriter, loc);
Value hOffset = onnx_mlir::affineMulConst(
rewriter, loc, hSlice, crossbarSize.getValue(), rewriter.getInsertionBlock()->getParentOp());
if (biasArg) {
SmallVector<OpFoldResult> biasOffsets {rewriter.getIndexAttr(0), hOffset};
Value biasSlice =
tensor::ExtractSliceOp::create(rewriter, loc, pieceType, biasArg, biasOffsets, pieceSizes, unitStrides)
.getResult();
reduced = spatial::SpatVAddOp::create(rewriter, loc, pieceType, reduced, biasSlice).getResult();
auto buildOutputSlice = [&](Value outputAcc, Value hSlice) -> Value {
Value reduced =
reducePartialPiecesForHSlice(partialPiecesArg, hSlice, pieceType, numKSlices, numOutRows, rewriter, loc);
Value hOffset = onnx_mlir::affineMulConst(
rewriter, loc, hSlice, crossbarSize.getValue(), rewriter.getInsertionBlock()->getParentOp());
if (biasArg) {
SmallVector<OpFoldResult> biasOffsets {rewriter.getIndexAttr(0), hOffset};
Value biasSlice =
tensor::ExtractSliceOp::create(rewriter, loc, pieceType, biasArg, biasOffsets, pieceSizes, unitStrides)
.getResult();
reduced = spatial::SpatVAddOp::create(rewriter, loc, pieceType, reduced, biasSlice).getResult();
}
SmallVector<OpFoldResult> outputOffsets {rewriter.getIndexAttr(0), hOffset};
return tensor::InsertSliceOp::create(rewriter, loc, reduced, outputAcc, outputOffsets, pieceSizes, unitStrides)
.getResult();
};
Value paddedOutput = outputInit;
if (numOutHSlices == 1) {
Value hSlice = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 0);
paddedOutput = buildOutputSlice(outputInit, hSlice);
}
else {
Value c0 = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 0);
Value c1 = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 1);
Value cOutHSlices =
getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), numOutHSlices);
auto hLoop = buildNormalizedScfFor(
rewriter,
loc,
c0,
cOutHSlices,
c1,
ValueRange {outputInit},
[&](OpBuilder&, Location, Value hSlice, ValueRange iterArgs, SmallVectorImpl<Value>& yielded) {
yielded.push_back(buildOutputSlice(iterArgs.front(), hSlice));
return success();
});
if (failed(hLoop))
return failure();
paddedOutput = hLoop->results.front();
}
SmallVector<OpFoldResult> outputOffsets {rewriter.getIndexAttr(0), hOffset};
return tensor::InsertSliceOp::create(rewriter, loc, reduced, outputAcc, outputOffsets, pieceSizes, unitStrides)
.getResult();
};
Value paddedOutput = outputInit;
if (numOutHSlices == 1) {
Value hSlice = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 0);
paddedOutput = buildOutputSlice(outputInit, hSlice);
}
else {
Value c0 = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 0);
Value c1 = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 1);
Value cOutHSlices =
getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), numOutHSlices);
auto hLoop = scf::ForOp::create(rewriter, loc, c0, cOutHSlices, c1, ValueRange {outputInit});
rewriter.setInsertionPointToStart(hLoop.getBody());
Value hSlice = hLoop.getInductionVar();
Value outputAcc = hLoop.getRegionIterArgs().front();
scf::YieldOp::create(rewriter, loc, buildOutputSlice(outputAcc, hSlice));
rewriter.setInsertionPointAfter(hLoop);
paddedOutput = hLoop.getResult(0);
}
Value result = paddedOutput;
if (paddedOutType != outType) {
SmallVector<OpFoldResult> outputOffsets {rewriter.getIndexAttr(0), rewriter.getIndexAttr(0)};
SmallVector<OpFoldResult> outputSizes {rewriter.getIndexAttr(outType.getDimSize(0)),
rewriter.getIndexAttr(outType.getDimSize(1))};
result =
tensor::ExtractSliceOp::create(rewriter, loc, outType, paddedOutput, outputOffsets, outputSizes, unitStrides)
.getResult();
}
spatial::SpatYieldOp::create(rewriter, loc, result);
});
Value result = paddedOutput;
if (paddedOutType != outType) {
SmallVector<OpFoldResult> outputOffsets {rewriter.getIndexAttr(0), rewriter.getIndexAttr(0)};
SmallVector<OpFoldResult> outputSizes {rewriter.getIndexAttr(outType.getDimSize(0)),
rewriter.getIndexAttr(outType.getDimSize(1))};
result =
tensor::ExtractSliceOp::create(rewriter, loc, outType, paddedOutput, outputOffsets, outputSizes, unitStrides)
.getResult();
}
spatial::SpatYieldOp::create(rewriter, loc, result);
return success();
});
return computeOp;
}
@@ -755,9 +775,13 @@ LogicalResult GemmToSpatialComputes::matchAndRewrite(ONNXGemmOp gemmOp,
auto scalarPiecesType = RankedTensorType::get({laneCount64, 1}, outType.getElementType());
auto batchOp =
createVvdmulBatch(a, b, aType, bType, scalarPiecesType, outType, gemmOpAdaptor.getTransB(), rewriter, loc);
if (failed(batchOp))
return failure();
auto outputCompute = createDynamicGemmOutputCompute(
batchOp.getResult(0), hasC ? c : Value(), scalarPiecesType, biasType, outType, alpha, beta, rewriter, loc);
rewriter.replaceOp(gemmOp, outputCompute.getResults());
batchOp->getResult(0), hasC ? c : Value(), scalarPiecesType, biasType, outType, alpha, beta, rewriter, loc);
if (failed(outputCompute))
return failure();
rewriter.replaceOp(gemmOp, outputCompute->getResults());
return success();
}
@@ -832,10 +856,14 @@ LogicalResult GemmToSpatialComputes::matchAndRewrite(ONNXGemmOp gemmOp,
RankedTensorType::get({laneCount64, static_cast<int64_t>(crossbarSize.getValue())}, outType.getElementType());
auto batchOp =
createVmmBatch(a, b, aType, paddedBType, partialPiecesType, numOutRows, numKSlices, numOutHSlices, rewriter, loc);
if (failed(batchOp))
return failure();
auto reductionCompute = createReductionCompute(
batchOp.getResult(0), bias, partialPiecesType, outType, paddedOutType, numKSlices, rewriter, loc);
batchOp->getResult(0), bias, partialPiecesType, outType, paddedOutType, numKSlices, rewriter, loc);
if (failed(reductionCompute))
return failure();
rewriter.replaceOp(gemmOp, reductionCompute.getResults());
rewriter.replaceOp(gemmOp, reductionCompute->getResults());
return success();
}
@@ -8,6 +8,7 @@
#include "llvm/ADT/SmallVector.h"
#include "src/Accelerators/PIM/Common/IR/AffineUtils.hpp"
#include "src/Accelerators/PIM/Common/IR/LoopUtils.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Patterns.hpp"
@@ -281,18 +282,18 @@ static Value getBatchLaneIndex(
rewriter, loc, lane, numOutRows * numKSlices * numOutHSlices, rewriter.getInsertionBlock()->getParentOp());
}
static spatial::SpatComputeBatch createBatchedVmmBatch(Value a,
Value b,
RankedTensorType aType,
int64_t aBatchCount,
RankedTensorType bType,
int64_t bBatchCount,
RankedTensorType partialPiecesType,
int64_t numOutRows,
int64_t numKSlices,
int64_t numOutHSlices,
PatternRewriter& rewriter,
Location loc) {
static FailureOr<spatial::SpatComputeBatch> createBatchedVmmBatch(Value a,
Value b,
RankedTensorType aType,
int64_t aBatchCount,
RankedTensorType bType,
int64_t bBatchCount,
RankedTensorType partialPiecesType,
int64_t numOutRows,
int64_t numKSlices,
int64_t numOutHSlices,
PatternRewriter& rewriter,
Location loc) {
const int64_t laneCount = partialPiecesType.getDimSize(0);
auto batchOp = createSpatComputeBatch(
rewriter,
@@ -331,7 +332,8 @@ static spatial::SpatComputeBatch createBatchedVmmBatch(Value a,
createParallelInsertSliceIntoBatchOutput(
rewriter, loc, piece, args.outputs.front(), pieceOffsets, pieceSizes, getUnitStrides(rewriter, 2));
});
assert(succeeded(batchOp) && "expected batched MatMul VMM construction to succeed");
if (failed(batchOp))
return failure();
return *batchOp;
}
@@ -422,17 +424,17 @@ static Value extractDynamicBatchedRowVector(Value matrix,
});
}
static spatial::SpatComputeBatch createBatchedVvdmulBatch(Value a,
int64_t aBatchCount,
Value b,
int64_t bBatchCount,
RankedTensorType aType,
RankedTensorType bType,
RankedTensorType scalarPiecesType,
RankedTensorType outType,
bool bAlreadyTransposed,
PatternRewriter& rewriter,
Location loc) {
static FailureOr<spatial::SpatComputeBatch> createBatchedVvdmulBatch(Value a,
int64_t aBatchCount,
Value b,
int64_t bBatchCount,
RankedTensorType aType,
RankedTensorType bType,
RankedTensorType scalarPiecesType,
RankedTensorType outType,
bool bAlreadyTransposed,
PatternRewriter& rewriter,
Location loc) {
const int64_t numBatches = outType.getDimSize(0);
const int64_t numOutRows = outType.getDimSize(1);
const int64_t numOutCols = outType.getDimSize(2);
@@ -466,64 +468,73 @@ static spatial::SpatComputeBatch createBatchedVvdmulBatch(Value a,
createParallelInsertSliceIntoBatchOutput(
rewriter, loc, scalar, args.outputs.front(), outputOffsets, scalarSizes, getUnitStrides(rewriter, 2));
});
assert(succeeded(batchOp) && "expected batched MatMul VVDMul construction to succeed");
if (failed(batchOp))
return failure();
return *batchOp;
}
static Value createBatchedDynamicOutputCompute(Value scalarPieces,
RankedTensorType scalarPiecesType,
RankedTensorType outType,
PatternRewriter& rewriter,
Location loc) {
static FailureOr<Value> createBatchedDynamicOutputCompute(Value scalarPieces,
RankedTensorType scalarPiecesType,
RankedTensorType outType,
PatternRewriter& rewriter,
Location loc) {
const int64_t laneCount = scalarPiecesType.getDimSize(0);
const int64_t numOutRows = outType.getDimSize(1);
const int64_t numOutCols = outType.getDimSize(2);
auto scalarType = RankedTensorType::get({1, 1}, outType.getElementType());
auto outputScalarType = RankedTensorType::get({1, 1, 1}, outType.getElementType());
auto computeOp =
createSpatCompute<1>(rewriter, loc, TypeRange {outType}, {}, ValueRange {scalarPieces}, [&](Value pieces) {
auto computeOp = createSpatCompute<1>(
rewriter, loc, TypeRange {outType}, {}, ValueRange {scalarPieces}, [&](Value pieces) -> LogicalResult {
Value outputInit =
tensor::EmptyOp::create(rewriter, loc, outType.getShape(), outType.getElementType()).getResult();
Value c0 = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 0);
Value c1 = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 1);
Value cLaneCount = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), laneCount);
auto loop = scf::ForOp::create(rewriter, loc, c0, cLaneCount, c1, ValueRange {outputInit});
rewriter.setInsertionPointToStart(loop.getBody());
Value lane = loop.getInductionVar();
Value outputAcc = loop.getRegionIterArgs().front();
Operation* anchorOp = rewriter.getInsertionBlock()->getParentOp();
Value batch = affineFloorDivConst(rewriter, loc, lane, numOutRows * numOutCols, anchorOp);
Value batchLane = affineModConst(rewriter, loc, lane, numOutRows * numOutCols, anchorOp);
Value row = affineFloorDivConst(rewriter, loc, batchLane, numOutCols, anchorOp);
Value column = affineModConst(rewriter, loc, batchLane, numOutCols, anchorOp);
SmallVector<OpFoldResult> scalarOffsets {lane, rewriter.getIndexAttr(0)};
SmallVector<OpFoldResult> scalarSizes {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
Value scalar = tensor::ExtractSliceOp::create(
rewriter, loc, scalarType, pieces, scalarOffsets, scalarSizes, getUnitStrides(rewriter, 2));
Value expanded = tensor::ExpandShapeOp::create(rewriter,
loc,
outputScalarType,
scalar,
SmallVector<ReassociationIndices> {
{0},
{1, 2}
});
SmallVector<OpFoldResult> outputOffsets {batch, row, column};
SmallVector<OpFoldResult> outputSizes {
rewriter.getIndexAttr(1), rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
scf::YieldOp::create(
auto loop = buildNormalizedScfFor(
rewriter,
loc,
tensor::InsertSliceOp::create(
rewriter, loc, expanded, outputAcc, outputOffsets, outputSizes, getUnitStrides(rewriter, 3))
.getResult());
rewriter.setInsertionPointAfter(loop);
spatial::SpatYieldOp::create(rewriter, loc, loop.getResult(0));
c0,
cLaneCount,
c1,
ValueRange {outputInit},
[&](OpBuilder&, Location nestedLoc, Value lane, ValueRange iterArgs, SmallVectorImpl<Value>& yielded) {
Value outputAcc = iterArgs.front();
Operation* anchorOp = rewriter.getInsertionBlock()->getParentOp();
Value batch = affineFloorDivConst(rewriter, nestedLoc, lane, numOutRows * numOutCols, anchorOp);
Value batchLane = affineModConst(rewriter, nestedLoc, lane, numOutRows * numOutCols, anchorOp);
Value row = affineFloorDivConst(rewriter, nestedLoc, batchLane, numOutCols, anchorOp);
Value column = affineModConst(rewriter, nestedLoc, batchLane, numOutCols, anchorOp);
SmallVector<OpFoldResult> scalarOffsets {lane, rewriter.getIndexAttr(0)};
SmallVector<OpFoldResult> scalarSizes {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
Value scalar = tensor::ExtractSliceOp::create(
rewriter, nestedLoc, scalarType, pieces, scalarOffsets, scalarSizes, getUnitStrides(rewriter, 2));
Value expanded = tensor::ExpandShapeOp::create(rewriter,
nestedLoc,
outputScalarType,
scalar,
SmallVector<ReassociationIndices> {
{0},
{1, 2}
});
SmallVector<OpFoldResult> outputOffsets {batch, row, column};
SmallVector<OpFoldResult> outputSizes = {
rewriter.getIndexAttr(1), rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
Value next =
tensor::InsertSliceOp::create(
rewriter, nestedLoc, expanded, outputAcc, outputOffsets, outputSizes, getUnitStrides(rewriter, 3))
.getResult();
yielded.push_back(next);
return success();
});
if (failed(loop))
return failure();
spatial::SpatYieldOp::create(rewriter, loc, loop->results.front());
return success();
});
return computeOp.getResult(0);
if (failed(computeOp))
return failure();
return computeOp->getResult(0);
}
static Value transposeBatchedOutput(Value value, RankedTensorType outputType, PatternRewriter& rewriter, Location loc) {
@@ -587,16 +598,16 @@ static Value reduceBatchedPartialPiecesForHSlice(Value partialPiecesArg,
return activePieces.front();
}
static Value createBatchedReductionCompute(Value partialPieces,
RankedTensorType partialPiecesType,
RankedTensorType outType,
RankedTensorType paddedOutType,
int64_t numBatches,
int64_t numKSlices,
PatternRewriter& rewriter,
Location loc) {
static FailureOr<Value> createBatchedReductionCompute(Value partialPieces,
RankedTensorType partialPiecesType,
RankedTensorType outType,
RankedTensorType paddedOutType,
int64_t numBatches,
int64_t numKSlices,
PatternRewriter& rewriter,
Location loc) {
auto computeOp = createSpatCompute<1>(
rewriter, loc, TypeRange {outType}, {}, ValueRange {partialPieces}, [&](Value partialPiecesArg) {
rewriter, loc, TypeRange {outType}, {}, ValueRange {partialPieces}, [&](Value partialPiecesArg) -> LogicalResult {
const int64_t numOutRows = outType.getDimSize(1);
const int64_t numOutHSlices = ceilIntegerDivide(outType.getDimSize(2), crossbarSize.getValue());
auto pieceType = RankedTensorType::get({numOutRows, static_cast<int64_t>(crossbarSize.getValue())},
@@ -612,43 +623,55 @@ static Value createBatchedReductionCompute(Value partialPieces,
Value cNumOutHSlices =
getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), numOutHSlices);
auto batchLoop = scf::ForOp::create(rewriter, loc, c0, cNumBatches, c1, ValueRange {outputInit});
rewriter.setInsertionPointToStart(batchLoop.getBody());
Value batch = batchLoop.getInductionVar();
Value batchAcc = batchLoop.getRegionIterArgs().front();
auto hLoop = scf::ForOp::create(rewriter, loc, c0, cNumOutHSlices, c1, ValueRange {batchAcc});
rewriter.setInsertionPointToStart(hLoop.getBody());
Value hSlice = hLoop.getInductionVar();
Value outputAcc = hLoop.getRegionIterArgs().front();
Value reduced = reduceBatchedPartialPiecesForHSlice(
partialPiecesArg, batch, hSlice, pieceType, numKSlices, numOutHSlices, numOutRows, rewriter, loc);
Value expandedReduced = tensor::ExpandShapeOp::create(rewriter,
loc,
outputSliceType,
reduced,
SmallVector<ReassociationIndices> {
{0, 1},
{2}
});
Value hOffset =
affineMulConst(rewriter, loc, hSlice, crossbarSize.getValue(), rewriter.getInsertionBlock()->getParentOp());
SmallVector<OpFoldResult> outputOffsets {batch, rewriter.getIndexAttr(0), hOffset};
SmallVector<OpFoldResult> outputSizes {
rewriter.getIndexAttr(1), rewriter.getIndexAttr(numOutRows), rewriter.getIndexAttr(crossbarSize.getValue())};
scf::YieldOp::create(
auto batchLoop = buildNormalizedScfFor(
rewriter,
loc,
tensor::InsertSliceOp::create(
rewriter, loc, expandedReduced, outputAcc, outputOffsets, outputSizes, getUnitStrides(rewriter, 3))
.getResult());
rewriter.setInsertionPointAfter(hLoop);
scf::YieldOp::create(rewriter, loc, hLoop.getResult(0));
rewriter.setInsertionPointAfter(batchLoop);
Value paddedOutput = batchLoop.getResult(0);
c0,
cNumBatches,
c1,
ValueRange {outputInit},
[&](
OpBuilder&, Location batchLoc, Value batch, ValueRange batchIterArgs, SmallVectorImpl<Value>& batchYielded) {
auto hLoop = buildNormalizedScfFor(
rewriter,
batchLoc,
c0,
cNumOutHSlices,
c1,
ValueRange {batchIterArgs.front()},
[&](OpBuilder&, Location hLoc, Value hSlice, ValueRange hIterArgs, SmallVectorImpl<Value>& hYielded) {
Value outputAcc = hIterArgs.front();
Value reduced = reduceBatchedPartialPiecesForHSlice(
partialPiecesArg, batch, hSlice, pieceType, numKSlices, numOutHSlices, numOutRows, rewriter, hLoc);
Value expandedReduced = tensor::ExpandShapeOp::create(rewriter,
hLoc,
outputSliceType,
reduced,
SmallVector<ReassociationIndices> {
{0, 1},
{2}
});
Value hOffset = affineMulConst(
rewriter, hLoc, hSlice, crossbarSize.getValue(), rewriter.getInsertionBlock()->getParentOp());
SmallVector<OpFoldResult> outputOffsets {batch, rewriter.getIndexAttr(0), hOffset};
SmallVector<OpFoldResult> outputSizes {rewriter.getIndexAttr(1),
rewriter.getIndexAttr(numOutRows),
rewriter.getIndexAttr(crossbarSize.getValue())};
Value next =
tensor::InsertSliceOp::create(
rewriter, hLoc, expandedReduced, outputAcc, outputOffsets, outputSizes, getUnitStrides(rewriter, 3))
.getResult();
hYielded.push_back(next);
return success();
});
if (failed(hLoop))
return failure();
batchYielded.push_back(hLoop->results.front());
return success();
});
if (failed(batchLoop))
return failure();
Value paddedOutput = batchLoop->results.front();
Value result = paddedOutput;
if (paddedOutType != outType) {
SmallVector<OpFoldResult> outputOffsets {
@@ -660,8 +683,11 @@ static Value createBatchedReductionCompute(Value partialPieces,
rewriter, loc, outType, paddedOutput, outputOffsets, outputSizes, getUnitStrides(rewriter, 3));
}
spatial::SpatYieldOp::create(rewriter, loc, result);
return success();
});
return computeOp.getResult(0);
if (failed(computeOp))
return failure();
return computeOp->getResult(0);
}
struct MatMulShapeInfo {
@@ -841,22 +867,27 @@ struct MatMulBatchedToSpatialComputes : OpRewritePattern<ONNXMatMulOp> {
numOutHSlices,
rewriter,
loc);
Value result = createBatchedReductionCompute(batchOp.getResult(0),
partialPiecesType,
directOutType,
paddedOutType,
shapeInfo->batch,
numKSlices,
rewriter,
loc);
if (failed(batchOp))
return failure();
auto result = createBatchedReductionCompute(batchOp->getResult(0),
partialPiecesType,
directOutType,
paddedOutType,
shapeInfo->batch,
numKSlices,
rewriter,
loc);
if (failed(result))
return failure();
Value finalResult = *result;
if (useTransposedForm)
result = transposeBatchedOutput(
result,
finalResult = transposeBatchedOutput(
finalResult,
RankedTensorType::get({shapeInfo->batch, shapeInfo->m, shapeInfo->n}, shapeInfo->outType.getElementType()),
rewriter,
loc);
result = expandBatchDims(result, shapeInfo->outType, shapeInfo->batchShape.size(), rewriter, loc);
rewriter.replaceOp(matmulOp, result);
finalResult = expandBatchDims(finalResult, shapeInfo->outType, shapeInfo->batchShape.size(), rewriter, loc);
rewriter.replaceOp(matmulOp, finalResult);
return success();
}
}
@@ -873,16 +904,21 @@ struct MatMulBatchedToSpatialComputes : OpRewritePattern<ONNXMatMulOp> {
false,
rewriter,
loc);
Value result =
createBatchedDynamicOutputCompute(batchOp.getResult(0), scalarPiecesType, directOutType, rewriter, loc);
if (failed(batchOp))
return failure();
auto result =
createBatchedDynamicOutputCompute(batchOp->getResult(0), scalarPiecesType, directOutType, rewriter, loc);
if (failed(result))
return failure();
Value finalResult = *result;
if (useTransposedForm)
result = transposeBatchedOutput(
result,
finalResult = transposeBatchedOutput(
finalResult,
RankedTensorType::get({shapeInfo->batch, shapeInfo->m, shapeInfo->n}, shapeInfo->outType.getElementType()),
rewriter,
loc);
result = expandBatchDims(result, shapeInfo->outType, shapeInfo->batchShape.size(), rewriter, loc);
rewriter.replaceOp(matmulOp, result);
finalResult = expandBatchDims(finalResult, shapeInfo->outType, shapeInfo->batchShape.size(), rewriter, loc);
rewriter.replaceOp(matmulOp, finalResult);
return success();
}
};
@@ -12,6 +12,7 @@
#include <optional>
#include <type_traits>
#include "src/Accelerators/PIM/Common/IR/LoopUtils.hpp"
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
@@ -275,86 +276,102 @@ struct PoolToSpatialComputeBase : public OpConversionPattern<PoolOp> {
Value cStrideHeight = getOrCreateIndexConstant(rewriter, anchorOp, strideHeight);
Value cStrideWidth = getOrCreateIndexConstant(rewriter, anchorOp, strideWidth);
auto outputLoop = scf::ForOp::create(rewriter, loc, c0, cOutputPatchCount, c1, ValueRange {pooledOutputInit});
rewriter.setInsertionPointToStart(outputLoop.getBody());
auto outputLoop = buildNormalizedScfFor(
rewriter,
loc,
c0,
cOutputPatchCount,
c1,
ValueRange {pooledOutputInit},
[&](OpBuilder&,
Location nestedLoc,
Value outputPatchIndex,
ValueRange iterArgs,
SmallVectorImpl<Value>& yielded) {
Value pooledOutputAcc = iterArgs.front();
Value batchIndex = arith::DivUIOp::create(rewriter, nestedLoc, outputPatchIndex, cOutputPixelsPerBatch);
Value batchPatchIndex =
arith::RemUIOp::create(rewriter, nestedLoc, outputPatchIndex, cOutputPixelsPerBatch);
Value outHeightIndex = arith::DivUIOp::create(rewriter, nestedLoc, batchPatchIndex, cOutputWidth);
Value outWidthIndex = arith::RemUIOp::create(rewriter, nestedLoc, batchPatchIndex, cOutputWidth);
Value windowBaseH = arith::MulIOp::create(rewriter, nestedLoc, outHeightIndex, cStrideHeight);
Value windowBaseW = arith::MulIOp::create(rewriter, nestedLoc, outWidthIndex, cStrideWidth);
Value outputPatchIndex = outputLoop.getInductionVar();
Value pooledOutputAcc = outputLoop.getRegionIterArgs().front();
Value updatedOutput = pooledOutputAcc;
for (int64_t channelTile = 0; channelTile < channelTileCount; ++channelTile) {
const int64_t tileChannels = std::min<int64_t>(xbarSize, channels - channelTile * xbarSize);
auto tileType = RankedTensorType::get({1, tileChannels, 1, 1}, outType.getElementType());
Value reducedWindow =
createPoolFillTensor(rewriter, nestedLoc, tileType, std::is_same_v<PoolOp, ONNXMaxPoolSingleOutOp>);
Value batchIndex = arith::DivUIOp::create(rewriter, loc, outputPatchIndex, cOutputPixelsPerBatch);
Value batchPatchIndex = arith::RemUIOp::create(rewriter, loc, outputPatchIndex, cOutputPixelsPerBatch);
Value outHeightIndex = arith::DivUIOp::create(rewriter, loc, batchPatchIndex, cOutputWidth);
Value outWidthIndex = arith::RemUIOp::create(rewriter, loc, batchPatchIndex, cOutputWidth);
Value windowBaseH = arith::MulIOp::create(rewriter, loc, outHeightIndex, cStrideHeight);
Value windowBaseW = arith::MulIOp::create(rewriter, loc, outWidthIndex, cStrideWidth);
for (int64_t kernelH = 0; kernelH < kernelHeight; ++kernelH) {
Value paddedInH = windowBaseH;
if (kernelH * dilationHeight != 0) {
Value kernelHOffset = getOrCreateIndexConstant(rewriter, anchorOp, kernelH * dilationHeight);
paddedInH = arith::AddIOp::create(rewriter, nestedLoc, paddedInH, kernelHOffset);
}
Value updatedOutput = pooledOutputAcc;
for (int64_t channelTile = 0; channelTile < channelTileCount; ++channelTile) {
const int64_t tileChannels = std::min<int64_t>(xbarSize, channels - channelTile * xbarSize);
auto tileType = RankedTensorType::get({1, tileChannels, 1, 1}, outType.getElementType());
Value reducedWindow =
createPoolFillTensor(rewriter, loc, tileType, std::is_same_v<PoolOp, ONNXMaxPoolSingleOutOp>);
for (int64_t kernelW = 0; kernelW < kernelWidth; ++kernelW) {
Value paddedInW = windowBaseW;
if (kernelW * dilationWidth != 0) {
Value kernelWOffset = getOrCreateIndexConstant(rewriter, anchorOp, kernelW * dilationWidth);
paddedInW = arith::AddIOp::create(rewriter, nestedLoc, paddedInW, kernelWOffset);
}
for (int64_t kernelH = 0; kernelH < kernelHeight; ++kernelH) {
Value paddedInH = windowBaseH;
if (kernelH * dilationHeight != 0) {
Value kernelHOffset = getOrCreateIndexConstant(rewriter, anchorOp, kernelH * dilationHeight);
paddedInH = arith::AddIOp::create(rewriter, loc, paddedInH, kernelHOffset);
}
for (int64_t kernelW = 0; kernelW < kernelWidth; ++kernelW) {
Value paddedInW = windowBaseW;
if (kernelW * dilationWidth != 0) {
Value kernelWOffset = getOrCreateIndexConstant(rewriter, anchorOp, kernelW * dilationWidth);
paddedInW = arith::AddIOp::create(rewriter, loc, paddedInW, kernelWOffset);
SmallVector<OpFoldResult> offsets = {
batchIndex, rewriter.getIndexAttr(channelTile * xbarSize), paddedInH, paddedInW};
SmallVector<OpFoldResult> sizes = {rewriter.getIndexAttr(1),
rewriter.getIndexAttr(tileChannels),
rewriter.getIndexAttr(1),
rewriter.getIndexAttr(1)};
SmallVector<OpFoldResult> strides = {rewriter.getIndexAttr(1),
rewriter.getIndexAttr(1),
rewriter.getIndexAttr(1),
rewriter.getIndexAttr(1)};
Value windowValue =
tensor::ExtractSliceOp::create(rewriter, nestedLoc, tileType, paddedInput, offsets, sizes, strides);
windowValue = materializeTileTensor(rewriter, nestedLoc, windowValue);
reducedWindow = ReduceOp::create(rewriter, nestedLoc, tileType, reducedWindow, windowValue);
}
}
SmallVector<OpFoldResult> offsets = {
batchIndex, rewriter.getIndexAttr(channelTile * xbarSize), paddedInH, paddedInW};
SmallVector<OpFoldResult> sizes = {rewriter.getIndexAttr(1),
rewriter.getIndexAttr(tileChannels),
rewriter.getIndexAttr(1),
rewriter.getIndexAttr(1)};
SmallVector<OpFoldResult> strides = {
if constexpr (std::is_same_v<PoolOp, ONNXAveragePoolOp>) {
SmallVector<OpFoldResult> scaleOffsets = {rewriter.getIndexAttr(0),
rewriter.getIndexAttr(channelTile * xbarSize),
outHeightIndex,
outWidthIndex};
SmallVector<OpFoldResult> scaleSizes = {rewriter.getIndexAttr(1),
rewriter.getIndexAttr(tileChannels),
rewriter.getIndexAttr(1),
rewriter.getIndexAttr(1)};
SmallVector<OpFoldResult> scaleStrides = {rewriter.getIndexAttr(1),
rewriter.getIndexAttr(1),
rewriter.getIndexAttr(1),
rewriter.getIndexAttr(1)};
Value scaleSlice = tensor::ExtractSliceOp::create(
rewriter, nestedLoc, tileType, averageScaleTensor, scaleOffsets, scaleSizes, scaleStrides);
scaleSlice = materializeTileTensor(rewriter, nestedLoc, scaleSlice);
reducedWindow = spatial::SpatVMulOp::create(rewriter, nestedLoc, tileType, reducedWindow, scaleSlice);
}
SmallVector<OpFoldResult> outputOffsets = {
batchIndex, rewriter.getIndexAttr(channelTile * xbarSize), outHeightIndex, outWidthIndex};
SmallVector<OpFoldResult> outputSizes = {rewriter.getIndexAttr(1),
rewriter.getIndexAttr(tileChannels),
rewriter.getIndexAttr(1),
rewriter.getIndexAttr(1)};
SmallVector<OpFoldResult> outputStrides = {
rewriter.getIndexAttr(1), rewriter.getIndexAttr(1), rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
Value windowValue =
tensor::ExtractSliceOp::create(rewriter, loc, tileType, paddedInput, offsets, sizes, strides);
windowValue = materializeTileTensor(rewriter, loc, windowValue);
reducedWindow = ReduceOp::create(rewriter, loc, tileType, reducedWindow, windowValue);
updatedOutput = tensor::InsertSliceOp::create(
rewriter, nestedLoc, reducedWindow, updatedOutput, outputOffsets, outputSizes, outputStrides);
}
}
yielded.push_back(updatedOutput);
return success();
});
if (failed(outputLoop))
return failure();
if constexpr (std::is_same_v<PoolOp, ONNXAveragePoolOp>) {
SmallVector<OpFoldResult> scaleOffsets = {
rewriter.getIndexAttr(0), rewriter.getIndexAttr(channelTile * xbarSize), outHeightIndex, outWidthIndex};
SmallVector<OpFoldResult> scaleSizes = {rewriter.getIndexAttr(1),
rewriter.getIndexAttr(tileChannels),
rewriter.getIndexAttr(1),
rewriter.getIndexAttr(1)};
SmallVector<OpFoldResult> scaleStrides = {
rewriter.getIndexAttr(1), rewriter.getIndexAttr(1), rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
Value scaleSlice = tensor::ExtractSliceOp::create(
rewriter, loc, tileType, averageScaleTensor, scaleOffsets, scaleSizes, scaleStrides);
scaleSlice = materializeTileTensor(rewriter, loc, scaleSlice);
reducedWindow = spatial::SpatVMulOp::create(rewriter, loc, tileType, reducedWindow, scaleSlice);
}
SmallVector<OpFoldResult> outputOffsets = {
batchIndex, rewriter.getIndexAttr(channelTile * xbarSize), outHeightIndex, outWidthIndex};
SmallVector<OpFoldResult> outputSizes = {rewriter.getIndexAttr(1),
rewriter.getIndexAttr(tileChannels),
rewriter.getIndexAttr(1),
rewriter.getIndexAttr(1)};
SmallVector<OpFoldResult> outputStrides = {
rewriter.getIndexAttr(1), rewriter.getIndexAttr(1), rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
updatedOutput = tensor::InsertSliceOp::create(
rewriter, loc, reducedWindow, updatedOutput, outputOffsets, outputSizes, outputStrides);
}
scf::YieldOp::create(rewriter, loc, updatedOutput);
rewriter.setInsertionPointAfter(outputLoop);
spatial::SpatYieldOp::create(rewriter, loc, outputLoop.getResult(0));
spatial::SpatYieldOp::create(rewriter, loc, outputLoop->results.front());
return success();
});
if (failed(computeOp))
@@ -3,6 +3,7 @@
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Transforms/DialectConversion.h"
#include "src/Accelerators/PIM/Common/IR/LoopUtils.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Patterns.hpp"
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
@@ -42,13 +43,13 @@ static Value buildLoopSoftmaxSlice(Value input,
return tensor::InsertSliceOp::create(rewriter, loc, softmaxSlice, accumulator, offsets, sizes, strides);
}
static Value buildLoopSoftmaxNest(Value input,
Value accumulator,
RankedTensorType inputType,
int64_t axis,
SmallVectorImpl<Value>& outerIndices,
ConversionPatternRewriter& rewriter,
Location loc) {
static FailureOr<Value> buildLoopSoftmaxNest(Value input,
Value accumulator,
RankedTensorType inputType,
int64_t axis,
SmallVectorImpl<Value>& outerIndices,
ConversionPatternRewriter& rewriter,
Location loc) {
if (axis == inputType.getRank() - 1)
return buildLoopSoftmaxSlice(input, accumulator, inputType, outerIndices, rewriter, loc);
@@ -57,38 +58,50 @@ static Value buildLoopSoftmaxNest(Value input,
Value c1 = getOrCreateIndexConstant(rewriter, anchorOp, 1);
Value cUpper = getOrCreateIndexConstant(rewriter, anchorOp, inputType.getDimSize(axis));
auto loop = scf::ForOp::create(rewriter, loc, c0, cUpper, c1, ValueRange {accumulator});
rewriter.setInsertionPointToStart(loop.getBody());
Value loopIndex = loop.getInductionVar();
Value loopAccumulator = loop.getRegionIterArgs().front();
outerIndices.push_back(loopIndex);
Value updatedAccumulator =
buildLoopSoftmaxNest(input, loopAccumulator, inputType, axis + 1, outerIndices, rewriter, loc);
outerIndices.pop_back();
scf::YieldOp::create(rewriter, loc, updatedAccumulator);
rewriter.setInsertionPointAfter(loop);
return loop.getResult(0);
auto loop = buildNormalizedScfFor(
rewriter,
loc,
c0,
cUpper,
c1,
ValueRange {accumulator},
[&](OpBuilder& builder, Location nestedLoc, Value loopIndex, ValueRange iterArgs, SmallVectorImpl<Value>& yielded) {
outerIndices.push_back(loopIndex);
auto updatedAccumulator =
buildLoopSoftmaxNest(input, iterArgs.front(), inputType, axis + 1, outerIndices, rewriter, nestedLoc);
outerIndices.pop_back();
if (failed(updatedAccumulator))
return failure();
yielded.push_back(*updatedAccumulator);
return success();
});
if (failed(loop))
return failure();
return loop->results.front();
}
static Value createLoopSoftmaxCompute(Value input, ConversionPatternRewriter& rewriter, Location loc) {
static FailureOr<Value> createLoopSoftmaxCompute(Value input, ConversionPatternRewriter& rewriter, Location loc) {
auto inputType = cast<RankedTensorType>(input.getType());
constexpr size_t numInputs = 1;
auto computeOp =
createSpatCompute<numInputs>(rewriter, loc, TypeRange {inputType}, {}, ValueRange {input}, [&](Value x) {
auto computeOp = createSpatCompute<numInputs>(
rewriter, loc, TypeRange {inputType}, {}, ValueRange {input}, [&](Value x) -> LogicalResult {
if (inputType.getRank() == 1) {
Value softmax = spatial::SpatSoftmaxOp::create(rewriter, loc, inputType, x).getResult();
spatial::SpatYieldOp::create(rewriter, loc, softmax);
return;
return success();
}
Value outputInit = tensor::EmptyOp::create(rewriter, loc, inputType.getShape(), inputType.getElementType());
SmallVector<Value> outerIndices;
Value result = buildLoopSoftmaxNest(x, outputInit, inputType, /*axis=*/0, outerIndices, rewriter, loc);
spatial::SpatYieldOp::create(rewriter, loc, result);
auto result = buildLoopSoftmaxNest(x, outputInit, inputType, /*axis=*/0, outerIndices, rewriter, loc);
if (failed(result))
return failure();
spatial::SpatYieldOp::create(rewriter, loc, *result);
return success();
});
return computeOp.getResult(0);
if (failed(computeOp))
return failure();
return computeOp->getResult(0);
}
struct SoftmaxToSpatialCompute : OpConversionPattern<ONNXSoftmaxOp> {
@@ -108,7 +121,10 @@ struct SoftmaxToSpatialCompute : OpConversionPattern<ONNXSoftmaxOp> {
Value input = adaptor.getInput();
Value result;
if (*axis == inputType.getRank() - 1) {
result = createLoopSoftmaxCompute(input, rewriter, softmaxOp.getLoc());
auto computed = createLoopSoftmaxCompute(input, rewriter, softmaxOp.getLoc());
if (failed(computed))
return failure();
result = *computed;
}
else {
SmallVector<int64_t> permutation;
@@ -122,8 +138,10 @@ struct SoftmaxToSpatialCompute : OpConversionPattern<ONNXSoftmaxOp> {
auto transposedType = RankedTensorType::get(
permuteShape(inputType.getShape(), permutation), inputType.getElementType(), inputType.getEncoding());
Value transposedInput = transposeMaybeInCompute(input, transposedType, permutation, rewriter, softmaxOp.getLoc());
Value transposedResult = createLoopSoftmaxCompute(transposedInput, rewriter, softmaxOp.getLoc());
result = transposeMaybeInCompute(transposedResult, inputType, inversePermutation, rewriter, softmaxOp.getLoc());
auto transposedResult = createLoopSoftmaxCompute(transposedInput, rewriter, softmaxOp.getLoc());
if (failed(transposedResult))
return failure();
result = transposeMaybeInCompute(*transposedResult, inputType, inversePermutation, rewriter, softmaxOp.getLoc());
}
rewriter.replaceOp(softmaxOp, result);
@@ -9,6 +9,7 @@
#include "llvm/ADT/SmallVector.h"
#include "src/Accelerators/PIM/Common/IR/WeightUtils.hpp"
#include "src/Accelerators/PIM/Common/Support/CheckedArithmetic.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/WeightMaterialization.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Patterns.hpp"
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
@@ -192,13 +193,12 @@ struct PromoteWeightLikeComputeBatchInputsPattern : OpRewritePattern<spatial::Sp
rewriter.setInsertionPointAfter(compute);
auto newCompute =
spatial::SpatComputeBatch::create(rewriter,
compute.getLoc(),
compute.getResultTypes(),
rewriter.getI32IntegerAttr(static_cast<int32_t>(compute.getLaneCount())),
promoted->newWeights,
promoted->newInputs);
auto laneCountAttr = pim::getCheckedI32Attr(
rewriter, compute, static_cast<uint64_t>(compute.getLaneCount()), "promoted compute_batch lane count");
if (failed(laneCountAttr))
return failure();
auto newCompute = spatial::SpatComputeBatch::create(
rewriter, compute.getLoc(), compute.getResultTypes(), *laneCountAttr, promoted->newWeights, promoted->newInputs);
auto laneArg = compute.getLaneArgument();
if (!laneArg)
return rewriter.notifyMatchFailure(compute, "missing compute_batch lane block argument");
@@ -5,6 +5,7 @@
#include "llvm/ADT/STLExtras.h"
#include "src/Accelerators/PIM/Common/IR/LoopUtils.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Patterns.hpp"
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
@@ -26,11 +27,11 @@ static Value buildNearestAsymmetricIndex(
return arith::MinUIOp::create(rewriter, loc, inputIndex, cInputDimLast);
}
static Value buildNearestResizeLoop(Value input,
RankedTensorType inputType,
RankedTensorType resultType,
ConversionPatternRewriter& rewriter,
Location loc) {
static FailureOr<Value> buildNearestResizeLoop(Value input,
RankedTensorType inputType,
RankedTensorType resultType,
ConversionPatternRewriter& rewriter,
Location loc) {
auto elemType = resultType.getElementType();
SmallVector<int64_t> unitShape(resultType.getRank(), 1);
auto unitTensorType = RankedTensorType::get(unitShape, elemType);
@@ -48,54 +49,94 @@ static Value buildNearestResizeLoop(Value input,
Value outputInit = tensor::EmptyOp::create(rewriter, loc, resultType.getShape(), elemType);
auto batchLoop = scf::ForOp::create(rewriter, loc, c0, cOutputN, c1, ValueRange {outputInit});
rewriter.setInsertionPointToStart(batchLoop.getBody());
auto batchLoop = buildNormalizedScfFor(
rewriter,
loc,
c0,
cOutputN,
c1,
ValueRange {outputInit},
[&](OpBuilder&, Location nestedLoc, Value outputN, ValueRange batchIterArgs, SmallVectorImpl<Value>& batchYielded) {
Value outputBatchAcc = batchIterArgs.front();
Value inputN =
buildNearestAsymmetricIndex(outputN, inputType.getDimSize(0), resultType.getDimSize(0), rewriter, nestedLoc);
Value outputN = batchLoop.getInductionVar();
Value outputBatchAcc = batchLoop.getRegionIterArgs().front();
Value inputN = buildNearestAsymmetricIndex(outputN, inputType.getDimSize(0), resultType.getDimSize(0), rewriter, loc);
auto channelLoop = buildNormalizedScfFor(
rewriter,
nestedLoc,
c0,
cOutputC,
c1,
ValueRange {outputBatchAcc},
[&](OpBuilder&,
Location channelLoc,
Value outputC,
ValueRange channelIterArgs,
SmallVectorImpl<Value>& channelYielded) {
Value outputChannelAcc = channelIterArgs.front();
Value inputC = buildNearestAsymmetricIndex(
outputC, inputType.getDimSize(1), resultType.getDimSize(1), rewriter, channelLoc);
auto channelLoop = scf::ForOp::create(rewriter, loc, c0, cOutputC, c1, ValueRange {outputBatchAcc});
rewriter.setInsertionPointToStart(channelLoop.getBody());
auto heightLoop = buildNormalizedScfFor(
rewriter,
channelLoc,
c0,
cOutputH,
c1,
ValueRange {outputChannelAcc},
[&](OpBuilder&,
Location heightLoc,
Value outputH,
ValueRange heightIterArgs,
SmallVectorImpl<Value>& heightYielded) {
Value outputHeightAcc = heightIterArgs.front();
Value inputH = buildNearestAsymmetricIndex(
outputH, inputType.getDimSize(2), resultType.getDimSize(2), rewriter, heightLoc);
Value outputC = channelLoop.getInductionVar();
Value outputChannelAcc = channelLoop.getRegionIterArgs().front();
Value inputC = buildNearestAsymmetricIndex(outputC, inputType.getDimSize(1), resultType.getDimSize(1), rewriter, loc);
auto widthLoop = buildNormalizedScfFor(
rewriter,
heightLoc,
c0,
cOutputW,
c1,
ValueRange {outputHeightAcc},
[&](OpBuilder&,
Location widthLoc,
Value outputW,
ValueRange widthIterArgs,
SmallVectorImpl<Value>& widthYielded) {
Value outputWidthAcc = widthIterArgs.front();
Value inputW = buildNearestAsymmetricIndex(
outputW, inputType.getDimSize(3), resultType.getDimSize(3), rewriter, widthLoc);
auto heightLoop = scf::ForOp::create(rewriter, loc, c0, cOutputH, c1, ValueRange {outputChannelAcc});
rewriter.setInsertionPointToStart(heightLoop.getBody());
SmallVector<OpFoldResult> inputOffsets = {inputN, inputC, inputH, inputW};
Value inputSlice = tensor::ExtractSliceOp::create(
rewriter, widthLoc, unitTensorType, input, inputOffsets, unitSizes, unitStrides);
Value outputH = heightLoop.getInductionVar();
Value outputHeightAcc = heightLoop.getRegionIterArgs().front();
Value inputH = buildNearestAsymmetricIndex(outputH, inputType.getDimSize(2), resultType.getDimSize(2), rewriter, loc);
auto widthLoop = scf::ForOp::create(rewriter, loc, c0, cOutputW, c1, ValueRange {outputHeightAcc});
rewriter.setInsertionPointToStart(widthLoop.getBody());
Value outputW = widthLoop.getInductionVar();
Value outputWidthAcc = widthLoop.getRegionIterArgs().front();
Value inputW = buildNearestAsymmetricIndex(outputW, inputType.getDimSize(3), resultType.getDimSize(3), rewriter, loc);
SmallVector<OpFoldResult> inputOffsets = {inputN, inputC, inputH, inputW};
Value inputSlice =
tensor::ExtractSliceOp::create(rewriter, loc, unitTensorType, input, inputOffsets, unitSizes, unitStrides);
SmallVector<OpFoldResult> outputOffsets = {outputN, outputC, outputH, outputW};
Value updatedOutput =
tensor::InsertSliceOp::create(rewriter, loc, inputSlice, outputWidthAcc, outputOffsets, unitSizes, unitStrides);
scf::YieldOp::create(rewriter, loc, updatedOutput);
rewriter.setInsertionPointAfter(widthLoop);
scf::YieldOp::create(rewriter, loc, widthLoop.getResult(0));
rewriter.setInsertionPointAfter(heightLoop);
scf::YieldOp::create(rewriter, loc, heightLoop.getResult(0));
rewriter.setInsertionPointAfter(channelLoop);
scf::YieldOp::create(rewriter, loc, channelLoop.getResult(0));
rewriter.setInsertionPointAfter(batchLoop);
return batchLoop.getResult(0);
SmallVector<OpFoldResult> outputOffsets = {outputN, outputC, outputH, outputW};
Value updatedOutput = tensor::InsertSliceOp::create(
rewriter, widthLoc, inputSlice, outputWidthAcc, outputOffsets, unitSizes, unitStrides);
widthYielded.push_back(updatedOutput);
return success();
});
if (failed(widthLoop))
return failure();
heightYielded.push_back(widthLoop->results.front());
return success();
});
if (failed(heightLoop))
return failure();
channelYielded.push_back(heightLoop->results.front());
return success();
});
if (failed(channelLoop))
return failure();
batchYielded.push_back(channelLoop->results.front());
return success();
});
if (failed(batchLoop))
return failure();
return batchLoop->results.front();
}
struct Resize : OpConversionPattern<ONNXResizeOp> {
@@ -120,12 +161,17 @@ struct Resize : OpConversionPattern<ONNXResizeOp> {
|| llvm::any_of(resultType.getShape(), [](int64_t dim) { return dim <= 0; }))
return rewriter.notifyMatchFailure(resizeOp, "resize lowering requires positive static dimensions.");
auto computeOp =
createSpatCompute<1>(rewriter, resizeOp.getLoc(), TypeRange {resultType}, {}, adaptor.getX(), [&](Value x) {
Value result = buildNearestResizeLoop(x, inputType, resultType, rewriter, resizeOp.getLoc());
spatial::SpatYieldOp::create(rewriter, resizeOp.getLoc(), result);
auto computeOp = createSpatCompute<1>(
rewriter, resizeOp.getLoc(), TypeRange {resultType}, {}, adaptor.getX(), [&](Value x) -> LogicalResult {
auto result = buildNearestResizeLoop(x, inputType, resultType, rewriter, resizeOp.getLoc());
if (failed(result))
return failure();
spatial::SpatYieldOp::create(rewriter, resizeOp.getLoc(), *result);
return success();
});
rewriter.replaceOp(resizeOp, computeOp.getResults());
if (failed(computeOp))
return failure();
rewriter.replaceOp(resizeOp, computeOp->getResults());
return success();
}
};