add shared loop creation helpers
Validate Operations / validate-operations (push) Has been cancelled

add shared checked arithmetic helpers
refactor pim passes into Pim/Transforms
more robust memory coalescing pass
This commit is contained in:
NiccoloN
2026-06-01 16:49:06 +02:00
parent 356be6ccc2
commit 636310d0cb
55 changed files with 2007 additions and 1103 deletions
@@ -8,6 +8,7 @@
#include "llvm/ADT/SmallVector.h"
#include "src/Accelerators/PIM/Common/IR/AffineUtils.hpp"
#include "src/Accelerators/PIM/Common/IR/LoopUtils.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/CompileTime.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Patterns.hpp"
@@ -281,18 +282,18 @@ static Value getBatchLaneIndex(
rewriter, loc, lane, numOutRows * numKSlices * numOutHSlices, rewriter.getInsertionBlock()->getParentOp());
}
static spatial::SpatComputeBatch createBatchedVmmBatch(Value a,
Value b,
RankedTensorType aType,
int64_t aBatchCount,
RankedTensorType bType,
int64_t bBatchCount,
RankedTensorType partialPiecesType,
int64_t numOutRows,
int64_t numKSlices,
int64_t numOutHSlices,
PatternRewriter& rewriter,
Location loc) {
static FailureOr<spatial::SpatComputeBatch> createBatchedVmmBatch(Value a,
Value b,
RankedTensorType aType,
int64_t aBatchCount,
RankedTensorType bType,
int64_t bBatchCount,
RankedTensorType partialPiecesType,
int64_t numOutRows,
int64_t numKSlices,
int64_t numOutHSlices,
PatternRewriter& rewriter,
Location loc) {
const int64_t laneCount = partialPiecesType.getDimSize(0);
auto batchOp = createSpatComputeBatch(
rewriter,
@@ -331,7 +332,8 @@ static spatial::SpatComputeBatch createBatchedVmmBatch(Value a,
createParallelInsertSliceIntoBatchOutput(
rewriter, loc, piece, args.outputs.front(), pieceOffsets, pieceSizes, getUnitStrides(rewriter, 2));
});
assert(succeeded(batchOp) && "expected batched MatMul VMM construction to succeed");
if (failed(batchOp))
return failure();
return *batchOp;
}
@@ -422,17 +424,17 @@ static Value extractDynamicBatchedRowVector(Value matrix,
});
}
static spatial::SpatComputeBatch createBatchedVvdmulBatch(Value a,
int64_t aBatchCount,
Value b,
int64_t bBatchCount,
RankedTensorType aType,
RankedTensorType bType,
RankedTensorType scalarPiecesType,
RankedTensorType outType,
bool bAlreadyTransposed,
PatternRewriter& rewriter,
Location loc) {
static FailureOr<spatial::SpatComputeBatch> createBatchedVvdmulBatch(Value a,
int64_t aBatchCount,
Value b,
int64_t bBatchCount,
RankedTensorType aType,
RankedTensorType bType,
RankedTensorType scalarPiecesType,
RankedTensorType outType,
bool bAlreadyTransposed,
PatternRewriter& rewriter,
Location loc) {
const int64_t numBatches = outType.getDimSize(0);
const int64_t numOutRows = outType.getDimSize(1);
const int64_t numOutCols = outType.getDimSize(2);
@@ -466,64 +468,73 @@ static spatial::SpatComputeBatch createBatchedVvdmulBatch(Value a,
createParallelInsertSliceIntoBatchOutput(
rewriter, loc, scalar, args.outputs.front(), outputOffsets, scalarSizes, getUnitStrides(rewriter, 2));
});
assert(succeeded(batchOp) && "expected batched MatMul VVDMul construction to succeed");
if (failed(batchOp))
return failure();
return *batchOp;
}
static Value createBatchedDynamicOutputCompute(Value scalarPieces,
RankedTensorType scalarPiecesType,
RankedTensorType outType,
PatternRewriter& rewriter,
Location loc) {
static FailureOr<Value> createBatchedDynamicOutputCompute(Value scalarPieces,
RankedTensorType scalarPiecesType,
RankedTensorType outType,
PatternRewriter& rewriter,
Location loc) {
const int64_t laneCount = scalarPiecesType.getDimSize(0);
const int64_t numOutRows = outType.getDimSize(1);
const int64_t numOutCols = outType.getDimSize(2);
auto scalarType = RankedTensorType::get({1, 1}, outType.getElementType());
auto outputScalarType = RankedTensorType::get({1, 1, 1}, outType.getElementType());
auto computeOp =
createSpatCompute<1>(rewriter, loc, TypeRange {outType}, {}, ValueRange {scalarPieces}, [&](Value pieces) {
auto computeOp = createSpatCompute<1>(
rewriter, loc, TypeRange {outType}, {}, ValueRange {scalarPieces}, [&](Value pieces) -> LogicalResult {
Value outputInit =
tensor::EmptyOp::create(rewriter, loc, outType.getShape(), outType.getElementType()).getResult();
Value c0 = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 0);
Value c1 = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), 1);
Value cLaneCount = getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), laneCount);
auto loop = scf::ForOp::create(rewriter, loc, c0, cLaneCount, c1, ValueRange {outputInit});
rewriter.setInsertionPointToStart(loop.getBody());
Value lane = loop.getInductionVar();
Value outputAcc = loop.getRegionIterArgs().front();
Operation* anchorOp = rewriter.getInsertionBlock()->getParentOp();
Value batch = affineFloorDivConst(rewriter, loc, lane, numOutRows * numOutCols, anchorOp);
Value batchLane = affineModConst(rewriter, loc, lane, numOutRows * numOutCols, anchorOp);
Value row = affineFloorDivConst(rewriter, loc, batchLane, numOutCols, anchorOp);
Value column = affineModConst(rewriter, loc, batchLane, numOutCols, anchorOp);
SmallVector<OpFoldResult> scalarOffsets {lane, rewriter.getIndexAttr(0)};
SmallVector<OpFoldResult> scalarSizes {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
Value scalar = tensor::ExtractSliceOp::create(
rewriter, loc, scalarType, pieces, scalarOffsets, scalarSizes, getUnitStrides(rewriter, 2));
Value expanded = tensor::ExpandShapeOp::create(rewriter,
loc,
outputScalarType,
scalar,
SmallVector<ReassociationIndices> {
{0},
{1, 2}
});
SmallVector<OpFoldResult> outputOffsets {batch, row, column};
SmallVector<OpFoldResult> outputSizes {
rewriter.getIndexAttr(1), rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
scf::YieldOp::create(
auto loop = buildNormalizedScfFor(
rewriter,
loc,
tensor::InsertSliceOp::create(
rewriter, loc, expanded, outputAcc, outputOffsets, outputSizes, getUnitStrides(rewriter, 3))
.getResult());
rewriter.setInsertionPointAfter(loop);
spatial::SpatYieldOp::create(rewriter, loc, loop.getResult(0));
c0,
cLaneCount,
c1,
ValueRange {outputInit},
[&](OpBuilder&, Location nestedLoc, Value lane, ValueRange iterArgs, SmallVectorImpl<Value>& yielded) {
Value outputAcc = iterArgs.front();
Operation* anchorOp = rewriter.getInsertionBlock()->getParentOp();
Value batch = affineFloorDivConst(rewriter, nestedLoc, lane, numOutRows * numOutCols, anchorOp);
Value batchLane = affineModConst(rewriter, nestedLoc, lane, numOutRows * numOutCols, anchorOp);
Value row = affineFloorDivConst(rewriter, nestedLoc, batchLane, numOutCols, anchorOp);
Value column = affineModConst(rewriter, nestedLoc, batchLane, numOutCols, anchorOp);
SmallVector<OpFoldResult> scalarOffsets {lane, rewriter.getIndexAttr(0)};
SmallVector<OpFoldResult> scalarSizes {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
Value scalar = tensor::ExtractSliceOp::create(
rewriter, nestedLoc, scalarType, pieces, scalarOffsets, scalarSizes, getUnitStrides(rewriter, 2));
Value expanded = tensor::ExpandShapeOp::create(rewriter,
nestedLoc,
outputScalarType,
scalar,
SmallVector<ReassociationIndices> {
{0},
{1, 2}
});
SmallVector<OpFoldResult> outputOffsets {batch, row, column};
SmallVector<OpFoldResult> outputSizes = {
rewriter.getIndexAttr(1), rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
Value next =
tensor::InsertSliceOp::create(
rewriter, nestedLoc, expanded, outputAcc, outputOffsets, outputSizes, getUnitStrides(rewriter, 3))
.getResult();
yielded.push_back(next);
return success();
});
if (failed(loop))
return failure();
spatial::SpatYieldOp::create(rewriter, loc, loop->results.front());
return success();
});
return computeOp.getResult(0);
if (failed(computeOp))
return failure();
return computeOp->getResult(0);
}
static Value transposeBatchedOutput(Value value, RankedTensorType outputType, PatternRewriter& rewriter, Location loc) {
@@ -587,16 +598,16 @@ static Value reduceBatchedPartialPiecesForHSlice(Value partialPiecesArg,
return activePieces.front();
}
static Value createBatchedReductionCompute(Value partialPieces,
RankedTensorType partialPiecesType,
RankedTensorType outType,
RankedTensorType paddedOutType,
int64_t numBatches,
int64_t numKSlices,
PatternRewriter& rewriter,
Location loc) {
static FailureOr<Value> createBatchedReductionCompute(Value partialPieces,
RankedTensorType partialPiecesType,
RankedTensorType outType,
RankedTensorType paddedOutType,
int64_t numBatches,
int64_t numKSlices,
PatternRewriter& rewriter,
Location loc) {
auto computeOp = createSpatCompute<1>(
rewriter, loc, TypeRange {outType}, {}, ValueRange {partialPieces}, [&](Value partialPiecesArg) {
rewriter, loc, TypeRange {outType}, {}, ValueRange {partialPieces}, [&](Value partialPiecesArg) -> LogicalResult {
const int64_t numOutRows = outType.getDimSize(1);
const int64_t numOutHSlices = ceilIntegerDivide(outType.getDimSize(2), crossbarSize.getValue());
auto pieceType = RankedTensorType::get({numOutRows, static_cast<int64_t>(crossbarSize.getValue())},
@@ -612,43 +623,55 @@ static Value createBatchedReductionCompute(Value partialPieces,
Value cNumOutHSlices =
getOrCreateIndexConstant(rewriter, rewriter.getInsertionBlock()->getParentOp(), numOutHSlices);
auto batchLoop = scf::ForOp::create(rewriter, loc, c0, cNumBatches, c1, ValueRange {outputInit});
rewriter.setInsertionPointToStart(batchLoop.getBody());
Value batch = batchLoop.getInductionVar();
Value batchAcc = batchLoop.getRegionIterArgs().front();
auto hLoop = scf::ForOp::create(rewriter, loc, c0, cNumOutHSlices, c1, ValueRange {batchAcc});
rewriter.setInsertionPointToStart(hLoop.getBody());
Value hSlice = hLoop.getInductionVar();
Value outputAcc = hLoop.getRegionIterArgs().front();
Value reduced = reduceBatchedPartialPiecesForHSlice(
partialPiecesArg, batch, hSlice, pieceType, numKSlices, numOutHSlices, numOutRows, rewriter, loc);
Value expandedReduced = tensor::ExpandShapeOp::create(rewriter,
loc,
outputSliceType,
reduced,
SmallVector<ReassociationIndices> {
{0, 1},
{2}
});
Value hOffset =
affineMulConst(rewriter, loc, hSlice, crossbarSize.getValue(), rewriter.getInsertionBlock()->getParentOp());
SmallVector<OpFoldResult> outputOffsets {batch, rewriter.getIndexAttr(0), hOffset};
SmallVector<OpFoldResult> outputSizes {
rewriter.getIndexAttr(1), rewriter.getIndexAttr(numOutRows), rewriter.getIndexAttr(crossbarSize.getValue())};
scf::YieldOp::create(
auto batchLoop = buildNormalizedScfFor(
rewriter,
loc,
tensor::InsertSliceOp::create(
rewriter, loc, expandedReduced, outputAcc, outputOffsets, outputSizes, getUnitStrides(rewriter, 3))
.getResult());
rewriter.setInsertionPointAfter(hLoop);
scf::YieldOp::create(rewriter, loc, hLoop.getResult(0));
rewriter.setInsertionPointAfter(batchLoop);
Value paddedOutput = batchLoop.getResult(0);
c0,
cNumBatches,
c1,
ValueRange {outputInit},
[&](
OpBuilder&, Location batchLoc, Value batch, ValueRange batchIterArgs, SmallVectorImpl<Value>& batchYielded) {
auto hLoop = buildNormalizedScfFor(
rewriter,
batchLoc,
c0,
cNumOutHSlices,
c1,
ValueRange {batchIterArgs.front()},
[&](OpBuilder&, Location hLoc, Value hSlice, ValueRange hIterArgs, SmallVectorImpl<Value>& hYielded) {
Value outputAcc = hIterArgs.front();
Value reduced = reduceBatchedPartialPiecesForHSlice(
partialPiecesArg, batch, hSlice, pieceType, numKSlices, numOutHSlices, numOutRows, rewriter, hLoc);
Value expandedReduced = tensor::ExpandShapeOp::create(rewriter,
hLoc,
outputSliceType,
reduced,
SmallVector<ReassociationIndices> {
{0, 1},
{2}
});
Value hOffset = affineMulConst(
rewriter, hLoc, hSlice, crossbarSize.getValue(), rewriter.getInsertionBlock()->getParentOp());
SmallVector<OpFoldResult> outputOffsets {batch, rewriter.getIndexAttr(0), hOffset};
SmallVector<OpFoldResult> outputSizes {rewriter.getIndexAttr(1),
rewriter.getIndexAttr(numOutRows),
rewriter.getIndexAttr(crossbarSize.getValue())};
Value next =
tensor::InsertSliceOp::create(
rewriter, hLoc, expandedReduced, outputAcc, outputOffsets, outputSizes, getUnitStrides(rewriter, 3))
.getResult();
hYielded.push_back(next);
return success();
});
if (failed(hLoop))
return failure();
batchYielded.push_back(hLoop->results.front());
return success();
});
if (failed(batchLoop))
return failure();
Value paddedOutput = batchLoop->results.front();
Value result = paddedOutput;
if (paddedOutType != outType) {
SmallVector<OpFoldResult> outputOffsets {
@@ -660,8 +683,11 @@ static Value createBatchedReductionCompute(Value partialPieces,
rewriter, loc, outType, paddedOutput, outputOffsets, outputSizes, getUnitStrides(rewriter, 3));
}
spatial::SpatYieldOp::create(rewriter, loc, result);
return success();
});
return computeOp.getResult(0);
if (failed(computeOp))
return failure();
return computeOp->getResult(0);
}
struct MatMulShapeInfo {
@@ -841,22 +867,27 @@ struct MatMulBatchedToSpatialComputes : OpRewritePattern<ONNXMatMulOp> {
numOutHSlices,
rewriter,
loc);
Value result = createBatchedReductionCompute(batchOp.getResult(0),
partialPiecesType,
directOutType,
paddedOutType,
shapeInfo->batch,
numKSlices,
rewriter,
loc);
if (failed(batchOp))
return failure();
auto result = createBatchedReductionCompute(batchOp->getResult(0),
partialPiecesType,
directOutType,
paddedOutType,
shapeInfo->batch,
numKSlices,
rewriter,
loc);
if (failed(result))
return failure();
Value finalResult = *result;
if (useTransposedForm)
result = transposeBatchedOutput(
result,
finalResult = transposeBatchedOutput(
finalResult,
RankedTensorType::get({shapeInfo->batch, shapeInfo->m, shapeInfo->n}, shapeInfo->outType.getElementType()),
rewriter,
loc);
result = expandBatchDims(result, shapeInfo->outType, shapeInfo->batchShape.size(), rewriter, loc);
rewriter.replaceOp(matmulOp, result);
finalResult = expandBatchDims(finalResult, shapeInfo->outType, shapeInfo->batchShape.size(), rewriter, loc);
rewriter.replaceOp(matmulOp, finalResult);
return success();
}
}
@@ -873,16 +904,21 @@ struct MatMulBatchedToSpatialComputes : OpRewritePattern<ONNXMatMulOp> {
false,
rewriter,
loc);
Value result =
createBatchedDynamicOutputCompute(batchOp.getResult(0), scalarPiecesType, directOutType, rewriter, loc);
if (failed(batchOp))
return failure();
auto result =
createBatchedDynamicOutputCompute(batchOp->getResult(0), scalarPiecesType, directOutType, rewriter, loc);
if (failed(result))
return failure();
Value finalResult = *result;
if (useTransposedForm)
result = transposeBatchedOutput(
result,
finalResult = transposeBatchedOutput(
finalResult,
RankedTensorType::get({shapeInfo->batch, shapeInfo->m, shapeInfo->n}, shapeInfo->outType.getElementType()),
rewriter,
loc);
result = expandBatchDims(result, shapeInfo->outType, shapeInfo->batchShape.size(), rewriter, loc);
rewriter.replaceOp(matmulOp, result);
finalResult = expandBatchDims(finalResult, shapeInfo->outType, shapeInfo->batchShape.size(), rewriter, loc);
rewriter.replaceOp(matmulOp, finalResult);
return success();
}
};