428 lines
18 KiB
C++
428 lines
18 KiB
C++
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
|
#include "mlir/Dialect/Tosa/IR/TosaOps.h"
|
|
#include "mlir/IR/BuiltinAttributes.h"
|
|
#include "mlir/IR/BuiltinTypeInterfaces.h"
|
|
#include "mlir/IR/BuiltinTypes.h"
|
|
#include "mlir/IR/PatternMatch.h"
|
|
#include "mlir/IR/Value.h"
|
|
#include "mlir/IR/ValueRange.h"
|
|
|
|
#include "llvm/ADT/SmallVector.h"
|
|
#include "llvm/Support/Debug.h"
|
|
#include "llvm/Support/raw_ostream.h"
|
|
|
|
#include <cassert>
|
|
#include <cmath>
|
|
#include <cstddef>
|
|
|
|
#include "src/Accelerators/PIM/Common/PimCommon.hpp"
|
|
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
|
|
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp"
|
|
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
|
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
|
|
|
using namespace mlir;
|
|
|
|
namespace onnx_mlir {
|
|
|
|
/// Reduces `valuesToReduce` down to a single value by repeatedly applying
/// `reduce`.
///
/// Steps, in order:
///   1. If there is exactly one input, it is returned untouched.
///   2. If `preprocess` is set, every input is replaced by `preprocess(input)`,
///      with the rewriter positioned right after that input.
///   3. Inputs that live in the same compute op are combined inside that op,
///      leaving one value per compute op.
///   4. The per-compute values are combined pairwise, round after round: the
///      value of the earlier compute op is sent through a freshly created
///      channel and received (and reduced) in the later compute op.
///   5. If `postprocess` is set, it is applied to the final value.
///
/// \param valuesToReduce Values to combine; must not be empty. Modified in
///        place: after step 3 it holds the per-compute values.
/// \param rewriter Rewriter used to create the new ops; its insertion point is
///        moved by this function.
/// \param reduce Builds the combination of two values at the rewriter's
///        current insertion point.
/// \param preprocess Optional per-input transformation (may be empty).
/// \param postprocess Optional transformation of the final value (may be
///        empty).
/// \returns The single value resulting from the reduction.
Value applyReducePatternNew(SmallVector<Value>& valuesToReduce,
                            ConversionPatternRewriter& rewriter,
                            std::function<Value(const Value&, const Value&)> reduce,
                            std::function<Value(const Value&)> preprocess,
                            std::function<Value(const Value&)> postprocess) {
  assert(!valuesToReduce.empty() && "Cannot reduce an empty list of values.");

  // Simple case: if we have only one input, just return it.
  // NOTE(review): neither `preprocess` nor `postprocess` runs on this path, so
  // a caller relying on `postprocess` gets the raw input back. Confirm this is
  // intended.
  if (valuesToReduce.size() == 1)
    return valuesToReduce[0];

  if (preprocess) {
    for (auto& valToReduce : valuesToReduce) {
      rewriter.setInsertionPointAfterValue(valToReduce);
      valToReduce = preprocess(valToReduce);
    }
  }

  // True when `lhs` is defined before `rhs` in their common block. A block
  // argument has no defining op and precedes every operation of the block.
  auto isDefinedBefore = [](Value lhs, Value rhs) {
    Operation* lhsOp = lhs.getDefiningOp();
    Operation* rhsOp = rhs.getDefiningOp();
    if (!lhsOp)
      return true;
    if (!rhsOp)
      return false;
    return lhsOp->isBeforeInBlock(rhsOp);
  };

  // It is possible that `valuesToReduce` contains several entries for the same
  // computeOp. In this case, we apply the reduction within that compute op.
  //
  // `perComputeValues` keeps the last value of each computeOp in first-seen
  // order, so the pairing below does not depend on pointer hashing;
  // `slotForCompute` maps a computeOp to its index in that list.
  SmallVector<Value> perComputeValues;
  std::unordered_map<Operation*, size_t> slotForCompute;
  for (auto& valToReduce : valuesToReduce) {
    Operation* computeOp = valToReduce.getParentBlock()->getParentOp();
    assert(isa<spatial::SpatWeightedCompute>(computeOp) && "Expected a ComputeOp");

    auto [slotIt, isNewCompute] = slotForCompute.try_emplace(computeOp, perComputeValues.size());
    if (isNewCompute) {
      perComputeValues.push_back(valToReduce);
      continue;
    }

    // We have already seen this computeOp: reduce within-compute, inserting
    // after whichever of the two operands is defined last.
    Value lastWithinComputeValue = perComputeValues[slotIt->second];
    if (isDefinedBefore(valToReduce, lastWithinComputeValue))
      rewriter.setInsertionPointAfterValue(lastWithinComputeValue);
    else
      rewriter.setInsertionPointAfterValue(valToReduce);
    valToReduce = reduce(lastWithinComputeValue, valToReduce);
    perComputeValues[slotIt->second] = valToReduce;
  }

  // Hand the per-compute values back to the caller's list.
  valuesToReduce.assign(perComputeValues.begin(), perComputeValues.end());

  Location loc = valuesToReduce[0].getLoc();
  auto channelType = spatial::SpatChannelType::get(rewriter.getContext());

  // Iteratively reduce the inputs to a single one:
  //  - Take two inputs at a time and reduce them into a single one, so that
  //    the list of pending values becomes half the size.
  //  - Repeat until there is only one input left.
  SmallVector<Value> pending = std::move(perComputeValues);
  while (pending.size() > 1) {
    SmallVector<Value> nextValuesToReduce;
    nextValuesToReduce.reserve((pending.size() + 1) / 2);

    for (size_t i = 0; i + 1 < pending.size(); i += 2) {
      auto firstValue = pending[i];
      auto secondValue = pending[i + 1];

      auto firstCompute = firstValue.getParentBlock()->getParentOp();
      auto secondCompute = secondValue.getParentBlock()->getParentOp();

      assert(isa<spatial::SpatWeightedCompute>(firstCompute));
      assert(isa<spatial::SpatWeightedCompute>(secondCompute));

      // Always send from the earlier compute op to the later one.
      if (secondCompute->isBeforeInBlock(firstCompute)) {
        std::swap(firstValue, secondValue);
        std::swap(firstCompute, secondCompute);
      }

      // 1. Add a channel before the first computeOp
      rewriter.setInsertionPoint(firstCompute);
      auto channel = spatial::SpatChannelNewOp::create(rewriter, loc, channelType);

      // 2. Add a sendOp after the first value
      rewriter.setInsertionPointAfterValue(firstValue);
      spatial::SpatChannelSendOp::create(rewriter, loc, channel, firstValue);

      // 3. Add a receiveOp after the second value
      rewriter.setInsertionPointAfterValue(secondValue);
      auto receivedValue = spatial::SpatChannelReceiveOp::create(rewriter, loc, secondValue.getType(), channel);

      // 4. Apply reduction between second value and received value
      rewriter.setInsertionPointAfterValue(receivedValue);
      Value reduced = reduce(receivedValue, secondValue);

      nextValuesToReduce.push_back(reduced);
    }

    // With an odd number of inputs, the last one is carried over unchanged to
    // the next round.
    if (pending.size() % 2 == 1)
      nextValuesToReduce.push_back(pending.back());

    pending = std::move(nextValuesToReduce);
  }

  assert(pending.size() == 1 && "Internal error: expected a single input at this point.");

  auto finalValue = pending[0];

  if (postprocess) {
    rewriter.setInsertionPointAfterValue(finalValue);
    finalValue = postprocess(finalValue);
  }

  return finalValue;
}
|
|
|
|
/// Tells whether the reduced pooling window of `PoolOp` needs an extra
/// post-processing step. The generic answer is "no"; pooling ops that need one
/// provide an explicit specialization.
template <typename PoolOp>
bool hasPostProcessPoolingWindow() {
  constexpr bool needsPostProcessing = false;
  return needsPostProcessing;
}
|
|
|
|
template <>
|
|
bool hasPostProcessPoolingWindow<ONNXAveragePoolOp>() {
|
|
return true;
|
|
}
|
|
|
|
template <typename PoolOp>
|
|
Value postProcessPoolingWindow(ConversionPatternRewriter& rewriter,
|
|
Location loc,
|
|
PoolOp poolOp,
|
|
Value valueToDivide,
|
|
size_t krn_size,
|
|
size_t tilesSkippedByPadding) {
|
|
return nullptr;
|
|
}
|
|
|
|
template <>
|
|
Value postProcessPoolingWindow<ONNXAveragePoolOp>(ConversionPatternRewriter& rewriter,
|
|
Location loc,
|
|
ONNXAveragePoolOp poolOp,
|
|
Value valueToDivide,
|
|
size_t krn_size,
|
|
size_t tilesSkippedByPadding) {
|
|
bool countIncludePad = poolOp.getCountIncludePad() == 1;
|
|
|
|
size_t divisorNumber = countIncludePad ? krn_size : krn_size - tilesSkippedByPadding;
|
|
|
|
RankedTensorType scalarTensor = RankedTensorType::get({1}, rewriter.getF32Type());
|
|
|
|
// Put a spat.const before the computeOp, and use its value. We do this to be
|
|
// compatible with the current code generation, which assumes constant to be
|
|
// loaded in global memory, which is allocated by adding a spat.const OP
|
|
// directly under func.func (i.e. alongside ComputeOps)
|
|
auto computeOp = cast<spatial::SpatWeightedCompute>(valueToDivide.getDefiningOp()->getParentOp());
|
|
rewriter.setInsertionPoint(computeOp);
|
|
auto divisorValue = spatial::SpatConstantOp::create(rewriter,
|
|
loc,
|
|
scalarTensor,
|
|
rewriter.getI64IntegerAttr(divisorNumber),
|
|
/* should_allocate = */ rewriter.getBoolAttr(true));
|
|
|
|
rewriter.setInsertionPointAfterValue(valueToDivide);
|
|
return spatial::SpatVSDivOp::create(rewriter, loc, valueToDivide.getType(), valueToDivide, divisorValue);
|
|
}
|
|
|
|
template <typename PoolOp, typename PoolOpAdaptor, typename ReduceOp>
|
|
struct PoolingBaseConverter : public OpConversionPattern<PoolOp> {
|
|
PoolingBaseConverter(MLIRContext* ctx)
|
|
: OpConversionPattern<PoolOp>(ctx) {}
|
|
|
|
LogicalResult matchAndRewrite(PoolOp poolOp, PoolOpAdaptor adaptor, ConversionPatternRewriter& rewriter) const final {
|
|
Value X = adaptor.getX();
|
|
ShapedType xShape = mlir::cast<ShapedType>(X.getType());
|
|
Value Y = poolOp.getResult();
|
|
ShapedType yShape = mlir::cast<ShapedType>(Y.getType());
|
|
|
|
size_t stride_x, stride_y, dilation_x, dilation_y, krn_w, krn_h;
|
|
unpackOptionalPairVector(adaptor.getStrides(), stride_x, stride_y);
|
|
unpackOptionalPairVector(adaptor.getDilations(), dilation_x, dilation_y);
|
|
unpackOptionalPairVector(adaptor.getKernelShape(), krn_w, krn_h);
|
|
|
|
if (adaptor.getAutoPad() != "NOTSET")
|
|
return rewriter.notifyMatchFailure(poolOp, "auto_pad != NOTSET is deprecated.");
|
|
|
|
size_t pad_x, pad_y;
|
|
auto padUnpackError = unpackOptionalPadsVector(adaptor.getPads(), pad_x, pad_y);
|
|
if (padUnpackError.has_value())
|
|
return rewriter.notifyMatchFailure(poolOp, padUnpackError.value());
|
|
|
|
Location loc = poolOp.getLoc();
|
|
|
|
size_t input_h = getImageHeight(xShape);
|
|
size_t input_w = getImageWidth(xShape);
|
|
size_t output_h = getImageHeight(yShape);
|
|
size_t output_w = getImageWidth(yShape);
|
|
size_t channelTileCount = ceilIntegerDivide(getImageChannel(xShape), crossbarSize.getValue());
|
|
size_t channelTileRest = getImageChannel(xShape) % crossbarSize;
|
|
|
|
// 1: Tile the input tensor
|
|
// Input tiles need to be indexed by:
|
|
// a. Channel Tile
|
|
// b. Pixel `x` position
|
|
// c. Pixel `y` position
|
|
// For example: inputTiles[channelTile][x][y]
|
|
// Example complete input tensor: tensor<1x3x12x12xf32> (NxCxWxH)
|
|
// Suppose that the input tensor is produced by concatenating the results of
|
|
// many ComputeOps. Get the result tiles from these ComputeOps.
|
|
SmallVector<SmallVector<SmallVector<Value>>> inputTiles(
|
|
channelTileCount, SmallVector<SmallVector<Value>>(input_w, SmallVector<Value>(input_h)));
|
|
|
|
auto resolveErrorOpt =
|
|
resolveImgInputTiles(X, inputTiles, channelTileCount, channelTileRest, input_w, input_h, rewriter);
|
|
if (resolveErrorOpt.has_value())
|
|
return rewriter.notifyMatchFailure(poolOp, *resolveErrorOpt);
|
|
|
|
// TODO: This requires a core for each input tile, which is not ideal. We
|
|
// can do better.
|
|
// If some input tiles come from the func.func operands, load
|
|
// them into a computeOp and yield them
|
|
for (size_t t = 0; t < channelTileCount; t++) {
|
|
for (size_t x = 0; x < input_w; x++) {
|
|
for (size_t y = 0; y < input_h; y++) {
|
|
if (auto extractSliceOp = inputTiles[t][x][y].getDefiningOp<tensor::ExtractSliceOp>()) {
|
|
Location tileLoc = extractSliceOp.getLoc();
|
|
|
|
auto tempComputeOp = spatial::SpatWeightedCompute::create(rewriter,
|
|
tileLoc,
|
|
extractSliceOp.getResultType(),
|
|
/* xbarWeights =*/ValueRange(),
|
|
extractSliceOp.getResult());
|
|
|
|
Block* tempComputeOpBlock = new Block();
|
|
tempComputeOp.getBody().push_back(tempComputeOpBlock);
|
|
auto tempComputeOpBlockArg = tempComputeOpBlock->addArgument(extractSliceOp.getType(), tileLoc);
|
|
|
|
rewriter.setInsertionPointToStart(tempComputeOpBlock);
|
|
spatial::SpatYieldOp::create(rewriter, tileLoc, tempComputeOpBlockArg);
|
|
rewriter.setInsertionPointAfter(tempComputeOp);
|
|
inputTiles[t][x][y] = tempComputeOp.getResult(0);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// 2: Tile the output tensor
|
|
// Output tiles need to be indexed by:
|
|
// a. Channel Tile
|
|
// b. Pixel `x` position
|
|
// c. Pixel `y` position
|
|
// For example: outputTiles[channelTile][x][y]
|
|
// Example complete output tensor: tensor<1x3x6x6xf32> (NxCxWxH)
|
|
SmallVector<SmallVector<SmallVector<Value>>> outputTiles(
|
|
channelTileCount, SmallVector<SmallVector<Value>>(output_w, SmallVector<Value>(output_h, nullptr)));
|
|
|
|
// List of values to pool for each output pixel
|
|
SmallVector<Value> valuesToPool;
|
|
|
|
// Iterate each output tile
|
|
for (size_t outTile = 0; outTile < channelTileCount; outTile++) {
|
|
// Iterate each output pixel
|
|
for (size_t outX = 0; outX < output_w; outX++) {
|
|
for (size_t outY = 0; outY < output_h; outY++) {
|
|
|
|
// Each output pixel tile is computed by pooling a window of input
|
|
// pixel tiles
|
|
valuesToPool.clear();
|
|
size_t tilesSkippedByPadding = 0;
|
|
|
|
auto [start_x, end_x] = kernel_get_start_and_end(outX, input_w, krn_w, stride_x, dilation_x, pad_x);
|
|
auto [start_y, end_y] = kernel_get_start_and_end(outY, input_h, krn_h, stride_y, dilation_y, pad_y);
|
|
|
|
for (size_t inX = start_x; inX < end_x; inX += dilation_x) {
|
|
for (size_t inY = start_y; inY < end_y; inY += dilation_y) {
|
|
if (failed(verifyWithinBoundsAndPaddings(input_w, input_h, inX, inY, pad_x, pad_y))) {
|
|
tilesSkippedByPadding++;
|
|
continue;
|
|
}
|
|
|
|
Value inputTile = inputTiles[outTile][inX][inY];
|
|
|
|
Value valueToPool;
|
|
if (auto computeProducer = inputTile.getDefiningOp<spatial::SpatWeightedCompute>()) {
|
|
|
|
int resultNumber = getResultIndex(computeProducer, inputTile);
|
|
|
|
auto yieldInComputeOp = cast<spatial::SpatYieldOp>(computeProducer.getBody().front().getTerminator());
|
|
valueToPool = yieldInComputeOp.getOperand(resultNumber);
|
|
}
|
|
else if (auto receiveProducer = inputTile.getDefiningOp<spatial::SpatChannelReceiveOp>()) {
|
|
auto sendOpOpt = getOtherEndOfChannel(receiveProducer, true, rewriter);
|
|
if (failed(sendOpOpt)) {
|
|
return rewriter.notifyMatchFailure(poolOp,
|
|
"ChannelReceiveOp does not have a matching "
|
|
"ChannelSendOp.");
|
|
}
|
|
auto sendOp = cast<spatial::SpatChannelSendOp>(*sendOpOpt);
|
|
|
|
valueToPool = sendOp.getData();
|
|
}
|
|
else {
|
|
return rewriter.notifyMatchFailure(poolOp,
|
|
"Input tile for Pooling is not produced by a "
|
|
"WeightedComputeOp nor a receiveOp");
|
|
}
|
|
|
|
valuesToPool.push_back(valueToPool);
|
|
}
|
|
}
|
|
|
|
assert(valuesToPool.size() != 0 && "Pooling computed on zero tiles make no sense.");
|
|
// assert(computeOpsForPooling.size() != 1 &&
|
|
// "Pooling computed on one tiles make no sense??? Or maybe
|
|
// this " "should have been simplified earlier???");
|
|
|
|
std::function<Value(const Value&)> postProcessFn = nullptr;
|
|
if (hasPostProcessPoolingWindow<PoolOp>()) {
|
|
postProcessFn = [&](const Value prevFinalRes) {
|
|
return postProcessPoolingWindow(
|
|
rewriter, loc, poolOp, prevFinalRes, krn_h * krn_w, tilesSkippedByPadding);
|
|
};
|
|
}
|
|
|
|
Value reducedWithinCompute = applyReducePatternNew(
|
|
valuesToPool,
|
|
rewriter,
|
|
[&](const Value lhs, const Value rhs) { return ReduceOp::create(rewriter, loc, lhs.getType(), lhs, rhs); },
|
|
nullptr,
|
|
postProcessFn);
|
|
|
|
// Send this value through a channel, and receive it in the
|
|
// `func.func`. During lowering, we will need to "move it" into the
|
|
// users computeOps
|
|
auto computeOpOfReduced =
|
|
cast<spatial::SpatWeightedCompute>(reducedWithinCompute.getDefiningOp()->getParentOp());
|
|
|
|
// Create a new channel before the computeOp
|
|
rewriter.setInsertionPoint(computeOpOfReduced);
|
|
auto reduceChannel =
|
|
spatial::SpatChannelNewOp::create(rewriter, loc, spatial::SpatChannelType::get(rewriter.getContext()));
|
|
|
|
// Send value through the channel
|
|
rewriter.setInsertionPointAfterValue(reducedWithinCompute);
|
|
spatial::SpatChannelSendOp::create(rewriter, loc, reduceChannel, reducedWithinCompute);
|
|
|
|
// Receive after the computeOp
|
|
rewriter.setInsertionPointAfter(computeOpOfReduced);
|
|
auto receivedValue =
|
|
spatial::SpatChannelReceiveOp::create(rewriter, loc, reducedWithinCompute.getType(), reduceChannel);
|
|
|
|
outputTiles[outTile][outX][outY] = receivedValue;
|
|
}
|
|
}
|
|
}
|
|
|
|
// TODO: outputTiles are not the results of the computeOps! We need to add
|
|
// them!
|
|
|
|
std::unordered_map<Operation*, SmallVector<std::tuple<size_t, size_t, size_t, Value>>> computeOpNeedingResults;
|
|
|
|
// Iterate each output tile
|
|
for (size_t outTile = 0; outTile < channelTileCount; outTile++) {
|
|
// Iterate each output pixel
|
|
for (size_t outX = 0; outX < output_w; outX++) {
|
|
for (size_t outY = 0; outY < output_h; outY++) {
|
|
auto outputTile = outputTiles[outTile][outX][outY];
|
|
auto outputTileProducer = outputTile.getDefiningOp()->getParentOp();
|
|
if (!outputTileProducer) {
|
|
return rewriter.notifyMatchFailure(poolOp,
|
|
"Output tile for Pooling is not produced by a "
|
|
"WeightedComputeOp.");
|
|
}
|
|
|
|
computeOpNeedingResults[outputTileProducer].push_back(std::make_tuple(outTile, outX, outY, outputTile));
|
|
}
|
|
}
|
|
}
|
|
|
|
Value outputImage = createImgConcatOp(outputTiles, rewriter, loc, poolOp.getType());
|
|
|
|
rewriter.replaceOp(poolOp, outputImage);
|
|
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// Registers the pooling conversion patterns into `patterns`: one converter
/// for ONNXMaxPoolSingleOutOp (reducing with SpatVMaxOp) followed by one for
/// ONNXAveragePoolOp (reducing with SpatVAddOp).
void populatePoolingTilingPattern(RewritePatternSet& patterns, MLIRContext* ctx) {
  using MaxPoolConverter =
      PoolingBaseConverter<ONNXMaxPoolSingleOutOp, ONNXMaxPoolSingleOutOpAdaptor, spatial::SpatVMaxOp>;
  using AveragePoolConverter = PoolingBaseConverter<ONNXAveragePoolOp, ONNXAveragePoolOpAdaptor, spatial::SpatVAddOp>;

  patterns.insert<MaxPoolConverter>(ctx);
  patterns.insert<AveragePoolConverter>(ctx);
}
|
|
|
|
} // namespace onnx_mlir
|