add PIM accelerator
This commit is contained in:
327
src/PIM/Conversion/ONNXToSpatial/NN/ExperimentalPooling.cpp
Normal file
327
src/PIM/Conversion/ONNXToSpatial/NN/ExperimentalPooling.cpp
Normal file
@@ -0,0 +1,327 @@
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/Dialect/Tosa/IR/TosaOps.h"
|
||||
#include "mlir/IR/BuiltinAttributes.h"
|
||||
#include "mlir/IR/BuiltinTypeInterfaces.h"
|
||||
#include "mlir/IR/BuiltinTypes.h"
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
#include "mlir/IR/Value.h"
|
||||
#include "mlir/IR/ValueRange.h"
|
||||
#include "src/Accelerators/PIM/Common/PIMCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstddef>
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
/// Tells whether pooling ops of type `PoolOp` need a post-processing step on
/// every reduced pooling window.
///
/// This primary template answers `false`; op types that do need the extra step
/// provide an explicit specialization.
///
/// @tparam PoolOp the pooling op type being lowered.
/// @return always `false` for the primary template.
template <typename PoolOp>
bool hasPostProcessExperimentalPoolingWindow() {
  return false;
}
|
||||
|
||||
template <>
|
||||
bool hasPostProcessExperimentalPoolingWindow<ONNXAveragePoolOp>() {
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename PoolOp>
|
||||
Value postProcessExperimentalPoolingWindow(ConversionPatternRewriter &rewriter,
|
||||
Location loc, PoolOp poolOp, Value valueToDivide, size_t krn_size,
|
||||
size_t tilesSkippedByPadding) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
template <>
|
||||
Value postProcessExperimentalPoolingWindow<ONNXAveragePoolOp>(
|
||||
ConversionPatternRewriter &rewriter, Location loc, ONNXAveragePoolOp poolOp,
|
||||
Value valueToDivide, size_t krn_size, size_t tilesSkippedByPadding) {
|
||||
bool countIncludePad = poolOp.getCountIncludePad() == 1;
|
||||
|
||||
size_t divisorNumber =
|
||||
countIncludePad ? krn_size : krn_size - tilesSkippedByPadding;
|
||||
|
||||
RankedTensorType scalarTensor =
|
||||
RankedTensorType::get({1}, rewriter.getF32Type());
|
||||
|
||||
// Put a spat.const before the computeOp, and use its value. We do this to be
|
||||
// compatible with the current code generation, which assumes constant to be
|
||||
// loaded in global memory, which is allocated by adding a spat.const OP
|
||||
// directly under func.func (i.e. alongside ComputeOps)
|
||||
auto computeOp = cast<spatial::SpatWeightedCompute>(
|
||||
valueToDivide.getDefiningOp()->getParentOp());
|
||||
rewriter.setInsertionPoint(computeOp);
|
||||
auto divisorValue = rewriter.create<spatial::SpatConstantOp>(loc, scalarTensor,
|
||||
rewriter.getI64IntegerAttr(divisorNumber),
|
||||
/* should_allocate = */ rewriter.getBoolAttr(true));
|
||||
|
||||
rewriter.setInsertionPointAfterValue(valueToDivide);
|
||||
return rewriter.create<spatial::SpatVSDivOp>(
|
||||
loc, valueToDivide.getType(), valueToDivide, divisorValue);
|
||||
}
|
||||
|
||||
template <typename ReductionOp>
|
||||
Value reduceInputTiles(
|
||||
SmallVector<Value> &inputTiles, ConversionPatternRewriter &rewriter) {
|
||||
if (inputTiles.size() == 1) {
|
||||
return inputTiles[0];
|
||||
}
|
||||
|
||||
if (inputTiles.size() == 2) {
|
||||
return rewriter.create<spatial::SpatVMaxOp>(inputTiles[0].getLoc(),
|
||||
inputTiles[0].getType(), inputTiles[0], inputTiles[1]);
|
||||
}
|
||||
|
||||
SmallVector<Value> left(
|
||||
inputTiles.begin(), inputTiles.begin() + inputTiles.size() / 2);
|
||||
SmallVector<Value> right(
|
||||
inputTiles.begin() + inputTiles.size() / 2, inputTiles.end());
|
||||
|
||||
Value leftReduced = reduceInputTiles<ReductionOp>(left, rewriter);
|
||||
Value rightReduced = reduceInputTiles<ReductionOp>(right, rewriter);
|
||||
|
||||
return rewriter.create<ReductionOp>(
|
||||
inputTiles[0].getLoc(), leftReduced.getType(), leftReduced, rightReduced);
|
||||
}
|
||||
|
||||
template <typename PoolOp, typename PoolOpAdaptor, typename ReduceOp>
|
||||
struct ExperimentalPoolingBaseConverter : public OpConversionPattern<PoolOp> {
|
||||
ExperimentalPoolingBaseConverter(MLIRContext *ctx)
|
||||
: OpConversionPattern<PoolOp>(ctx) {}
|
||||
|
||||
LogicalResult matchAndRewrite(PoolOp poolOp, PoolOpAdaptor adaptor,
|
||||
ConversionPatternRewriter &rewriter) const final {
|
||||
Value X = adaptor.getX();
|
||||
ShapedType xShape = mlir::cast<ShapedType>(X.getType());
|
||||
Value Y = poolOp.getResult();
|
||||
ShapedType yShape = mlir::cast<ShapedType>(Y.getType());
|
||||
|
||||
size_t stride_x, stride_y, dilation_x, dilation_y, krn_w, krn_h;
|
||||
unpackOptionalPairVector(adaptor.getStrides(), stride_x, stride_y);
|
||||
unpackOptionalPairVector(adaptor.getDilations(), dilation_x, dilation_y);
|
||||
unpackOptionalPairVector(adaptor.getKernelShape(), krn_w, krn_h);
|
||||
|
||||
if (adaptor.getAutoPad() != "NOTSET") {
|
||||
return rewriter.notifyMatchFailure(
|
||||
poolOp, "auto_pad != NOTSET is deprecated.");
|
||||
}
|
||||
|
||||
size_t pad_x, pad_y;
|
||||
auto padUnpackError =
|
||||
unpackOptionalPadsVector(adaptor.getPads(), pad_x, pad_y);
|
||||
if (padUnpackError.has_value()) {
|
||||
return rewriter.notifyMatchFailure(poolOp, padUnpackError.value());
|
||||
}
|
||||
|
||||
Location loc = poolOp.getLoc();
|
||||
|
||||
size_t input_h = GET_IMAGE_HEIGHT(xShape);
|
||||
size_t input_w = GET_IMAGE_WIDTH(xShape);
|
||||
size_t output_h = GET_IMAGE_HEIGHT(yShape);
|
||||
size_t output_w = GET_IMAGE_WIDTH(yShape);
|
||||
|
||||
ldiv_t tileCount = std::div(GET_IMAGE_CHANNEL(xShape), crossbarSize);
|
||||
|
||||
// Assert that the input is a tensor.ConcatOp.
|
||||
auto concat = X.getDefiningOp<tensor::ConcatOp>();
|
||||
if (!concat) {
|
||||
return rewriter.notifyMatchFailure(
|
||||
poolOp, "Expected input to be a tensor.ConcatOp");
|
||||
}
|
||||
|
||||
// Create a [channel_tile][x][y] array to store the input tiles.
|
||||
std::map<long, std::map<long, std::map<long, Value>>> inputTiles;
|
||||
|
||||
// For each argument of the tensor.ConcatOp, resolve the input tiles.
|
||||
for (size_t y = 0; y < input_h; ++y) {
|
||||
for (size_t x = 0; x < input_w; ++x) {
|
||||
for (long it = 0; it < tileCount.quot + (tileCount.rem > 0); ++it) {
|
||||
size_t tilingSize =
|
||||
it == tileCount.quot ? tileCount.rem : crossbarSize;
|
||||
|
||||
SmallVector<OpFoldResult> strides(4, rewriter.getIndexAttr(1));
|
||||
SmallVector<OpFoldResult> offsets = {/* 0 */ rewriter.getIndexAttr(0),
|
||||
/* 1 */ rewriter.getIndexAttr(0),
|
||||
/* 2 */ rewriter.getIndexAttr(x),
|
||||
/* 3 */ rewriter.getIndexAttr(y)};
|
||||
SmallVector<OpFoldResult> sizes = {
|
||||
/* 0 */ rewriter.getIndexAttr(1), // Batch size is always 1.
|
||||
/* 1 */ rewriter.getIndexAttr(tilingSize),
|
||||
/* 2 */ rewriter.getIndexAttr(1),
|
||||
/* 3 */ rewriter.getIndexAttr(1)};
|
||||
|
||||
// Get the concat's operand that we want to slice.
|
||||
Value concatInput = concat.getOperand(it);
|
||||
Value slicedTile = rewriter.create<tensor::ExtractSliceOp>(
|
||||
loc, concatInput, offsets, sizes, strides);
|
||||
|
||||
inputTiles[it][x][y] = slicedTile;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Prepare the shape of the compute's output.
|
||||
ldiv_t itc = tileCount;
|
||||
SmallVector<Type> outputTileTypes;
|
||||
for (size_t y = 0; y < output_h; ++y) {
|
||||
for (size_t x = 0; x < output_w; ++x) {
|
||||
for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
|
||||
SmallVector<int64_t> outputShapeArray{
|
||||
/* 0 */ 1, // Batch size is always 1.
|
||||
/* 1 */
|
||||
cast<RankedTensorType>(inputTiles[it][0][0].getType())
|
||||
.getShape()[1],
|
||||
/* 2 */ 1,
|
||||
/* 3 */ 1};
|
||||
|
||||
auto elementType =
|
||||
dyn_cast<RankedTensorType>(xShape).getElementType();
|
||||
|
||||
outputTileTypes.push_back(
|
||||
RankedTensorType::get(outputShapeArray, elementType));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create a plain value list of the input tiles.
|
||||
SmallVector<Value> inputTilesList;
|
||||
for (size_t y = 0; y < input_h; ++y) {
|
||||
for (size_t x = 0; x < input_w; ++x) {
|
||||
for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
|
||||
inputTilesList.push_back(inputTiles[it][y][x]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create a single compute to calculate the output.
|
||||
auto computeOp = rewriter.create<spatial::SpatWeightedCompute>(
|
||||
loc, outputTileTypes, SmallVector<Value>(), inputTilesList);
|
||||
|
||||
// Create a new block for the compute unit and add the operands.
|
||||
Block *block = rewriter.createBlock(&computeOp.getRegion());
|
||||
|
||||
// Fill the block arguments and keep a reference to them.
|
||||
std::map<size_t, std::map<size_t, std::map<size_t, Value>>> inputTilesArgs;
|
||||
for (size_t y = 0; y < input_h; ++y) {
|
||||
for (size_t x = 0; x < input_w; ++x) {
|
||||
for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
|
||||
auto tileIndex = y * input_w * (itc.quot + (itc.rem > 0)) +
|
||||
x * (itc.quot + (itc.rem > 0)) + it;
|
||||
inputTilesArgs[it][y][x] = block->addArgument(
|
||||
computeOp->getOperand(tileIndex).getType(), loc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Begin writing in the block.
|
||||
rewriter.setInsertionPointToStart(block);
|
||||
|
||||
// Go through all pooling blocks.
|
||||
SmallVector<Value> outputTiles;
|
||||
for (size_t y = 0; y < output_h; ++y) {
|
||||
for (size_t x = 0; x < output_w; ++x) {
|
||||
for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
|
||||
size_t start_x = x * stride_x;
|
||||
size_t start_y = y * stride_y;
|
||||
size_t end_x = std::min(start_x + krn_w, input_w);
|
||||
size_t end_y = std::min(start_y + krn_h, input_h);
|
||||
|
||||
SmallVector<Value> inputTilesToReduce;
|
||||
for (size_t ky = start_y; ky < end_y; ++ky) {
|
||||
for (size_t kx = start_x; kx < end_x; ++kx) {
|
||||
inputTilesToReduce.push_back(inputTilesArgs[it][ky][kx]);
|
||||
}
|
||||
}
|
||||
|
||||
auto reduceResult =
|
||||
reduceInputTiles<ReduceOp>(inputTilesToReduce, rewriter);
|
||||
|
||||
// If the reduce op is add, we need to divide the result by the
|
||||
// number of elements in the pooling window.
|
||||
if (hasPostProcessExperimentalPoolingWindow<PoolOp>()) {
|
||||
// Add a spat.const before the computeOp.
|
||||
rewriter.setInsertionPoint(computeOp);
|
||||
auto divisorValue = rewriter.create<spatial::SpatConstantOp>(loc,
|
||||
RankedTensorType::get({1}, rewriter.getF32Type()),
|
||||
rewriter.getI64IntegerAttr(krn_w * krn_h),
|
||||
rewriter.getBoolAttr(true));
|
||||
|
||||
rewriter.setInsertionPointAfter(reduceResult.getDefiningOp());
|
||||
reduceResult = rewriter.create<spatial::SpatVSDivOp>(
|
||||
loc, reduceResult.getType(), reduceResult, divisorValue);
|
||||
}
|
||||
outputTiles.push_back(reduceResult);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create a YieldOp to return the output tiles.
|
||||
rewriter.create<spatial::SpatYieldOp>(loc, outputTiles);
|
||||
|
||||
// Set the rewrite cursor right after the computeOp.
|
||||
rewriter.setInsertionPointAfter(computeOp);
|
||||
|
||||
std::map<size_t, std::map<size_t, std::map<size_t, Value>>> computeOutput;
|
||||
for (size_t y = 0; y < output_h; ++y) {
|
||||
for (size_t x = 0; x < output_w; ++x) {
|
||||
for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
|
||||
auto tileIndex = y * output_w * (itc.quot + (itc.rem > 0)) +
|
||||
x * (itc.quot + (itc.rem > 0)) + it;
|
||||
computeOutput[it][y][x] = computeOp.getResult(tileIndex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We'll now create spat.img.concat ops to concatenate the output tiles.
|
||||
SmallVector<Value> outputTilesList;
|
||||
for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
|
||||
SmallVector<Value> imgConcatTiles;
|
||||
for (size_t y = 0; y < output_h; ++y) {
|
||||
for (size_t x = 0; x < output_w; ++x) {
|
||||
imgConcatTiles.push_back(computeOutput[it][y][x]);
|
||||
}
|
||||
}
|
||||
|
||||
size_t tilingSize = it == tileCount.quot ? tileCount.rem : crossbarSize;
|
||||
|
||||
SmallVector<int64_t> outputShapeArray{
|
||||
/* 0 */ 1, // Batch size is always 1.
|
||||
/* 1 */ (long)tilingSize,
|
||||
/* 2 */ (long)output_w,
|
||||
/* 3 */ (long)output_h};
|
||||
|
||||
auto elementType = dyn_cast<RankedTensorType>(xShape).getElementType();
|
||||
|
||||
outputTilesList.push_back(rewriter.create<spatial::SpatImgConcatOp>(loc,
|
||||
RankedTensorType::get(outputShapeArray, elementType),
|
||||
imgConcatTiles));
|
||||
}
|
||||
|
||||
// Create a new tensor.ConcatOp to concatenate the output tiles.
|
||||
Value outputTensor =
|
||||
rewriter.create<tensor::ConcatOp>(loc, 1, outputTilesList);
|
||||
|
||||
rewriter.replaceOp(poolOp, outputTensor);
|
||||
|
||||
return success();
|
||||
}
|
||||
};
|
||||
|
||||
/// Registers the experimental pooling lowerings into `patterns`: max pooling
/// reduces its windows with `spatial::SpatVMaxOp`, average pooling with
/// `spatial::SpatVAddOp`.
void populateExperimentalPoolingTilingPattern(
    RewritePatternSet &patterns, MLIRContext *ctx) {
  using MaxPoolConverter =
      ExperimentalPoolingBaseConverter<ONNXMaxPoolSingleOutOp,
          ONNXMaxPoolSingleOutOpAdaptor, spatial::SpatVMaxOp>;
  using AveragePoolConverter =
      ExperimentalPoolingBaseConverter<ONNXAveragePoolOp,
          ONNXAveragePoolOpAdaptor, spatial::SpatVAddOp>;

  patterns.insert<MaxPoolConverter>(ctx);
  patterns.insert<AveragePoolConverter>(ctx);
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
452
src/PIM/Conversion/ONNXToSpatial/NN/Pooling.cpp
Normal file
452
src/PIM/Conversion/ONNXToSpatial/NN/Pooling.cpp
Normal file
@@ -0,0 +1,452 @@
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/Dialect/Tosa/IR/TosaOps.h"
|
||||
#include "mlir/IR/BuiltinAttributes.h"
|
||||
#include "mlir/IR/BuiltinTypeInterfaces.h"
|
||||
#include "mlir/IR/BuiltinTypes.h"
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
#include "mlir/IR/Value.h"
|
||||
#include "mlir/IR/ValueRange.h"
|
||||
#include "src/Accelerators/PIM/Common/PIMCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstddef>
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
// Namespace-scope set of operations.
// NOTE(review): nothing in the visible part of this file reads or writes this
// set; presumably it tracks compute ops that were already replaced — confirm
// whether another translation unit uses it before relying on or removing it.
llvm::SmallPtrSet<Operation *, 16> oldComputeOpsReplaced;
|
||||
|
||||
/// Reduces `valuesToReduce` to a single value, where every value lives inside
/// the body of a `spatial::SpatWeightedCompute`.
///
/// The reduction happens in two stages:
///  1. values living in the same compute op are reduced within that compute;
///  2. the remaining values (one per compute op) are reduced pairwise: the
///     value of the compute op that comes first is sent through a new channel
///     to the other compute op, which receives it and applies `reduce`. This is
///     repeated until a single value is left.
///
/// @param valuesToReduce values to reduce; must not be empty. The vector is
///                       modified: on return (unless it held a single value) it
///                       contains one partially reduced value per compute op.
/// @param rewriter       rewriter used to create the ops; its insertion point
///                       is changed by this function.
/// @param reduce         creates the op reducing two values at the current
///                       insertion point and returns its result.
/// @param preprocess     optional; applied to every value before reducing.
/// @param postprocess    optional; applied to the final reduced value.
/// @return the value holding the complete reduction.
Value applyReducePatternNew(SmallVector<Value> &valuesToReduce,
    ConversionPatternRewriter &rewriter,
    std::function<Value(const Value &, const Value &)> reduce,
    std::function<Value(const Value &)> preprocess,
    std::function<Value(const Value &)> postprocess) {
  assert(!valuesToReduce.empty() && "Expected at least one value to reduce.");

  // Simple case: if we have only one input, just return it.
  // NOTE(review): this early return also skips `preprocess` and `postprocess`;
  // confirm that a single value is really meant to be returned untouched.
  if (valuesToReduce.size() == 1) {
    return valuesToReduce[0];
  }

  if (preprocess) {
    for (Value &valToReduce : valuesToReduce) {
      rewriter.setInsertionPointAfterValue(valToReduce);
      valToReduce = preprocess(valToReduce);
    }
  }

  // It is possible that `valuesToReduce` contains two entries for the same
  // computeOp. In this case, we need to apply the reduction within-compute.
  //
  // Keep, for each computeOp, the last Value of this reduction. Entries are
  // kept in first-seen order (instead of hashing on the op pointer) so that the
  // emitted IR does not change from run to run.
  SmallVector<std::pair<Operation *, Value>> lastValueForCompute;
  for (Value valToReduce : valuesToReduce) {
    Operation *computeOp = valToReduce.getParentBlock()->getParentOp();
    assert(isa<spatial::SpatWeightedCompute>(computeOp) && "Expected a ComputeOp");

    std::pair<Operation *, Value> *entry = nullptr;
    for (auto &candidate : lastValueForCompute) {
      if (candidate.first == computeOp) {
        entry = &candidate;
        break;
      }
    }

    if (!entry) {
      // First value seen for this computeOp.
      lastValueForCompute.push_back({computeOp, valToReduce});
      continue;
    }

    // We have already seen this computeOp: apply the reduction within-compute,
    // right after whichever of the two values is defined last. A block
    // argument has no defining op and is available before every op.
    Value lastWithinComputeValue = entry->second;
    Operation *currentDef = valToReduce.getDefiningOp();
    Operation *lastDef = lastWithinComputeValue.getDefiningOp();
    const bool currentIsDefinedFirst =
        !currentDef || (lastDef && currentDef->isBeforeInBlock(lastDef));
    rewriter.setInsertionPointAfterValue(
        currentIsDefinedFirst ? lastWithinComputeValue : valToReduce);

    entry->second = reduce(lastWithinComputeValue, valToReduce);
  }

  // Now, reconstruct the valuesToReduce list: one value per computeOp.
  valuesToReduce.clear();
  valuesToReduce.reserve(lastValueForCompute.size());
  for (const auto &entry : lastValueForCompute) {
    valuesToReduce.push_back(entry.second);
  }

  Location loc = valuesToReduce[0].getLoc();
  auto channelType = spatial::SpatChannelType::get(rewriter.getContext());

  // Iteratively reduce the inputs to a single one:
  // - Take two inputs at a time, and reduce them into a single one, building
  //   the next list of values which is half the size.
  // - Repeat until there is only one input left.
  SmallVector<Value> pendingValues(
      valuesToReduce.begin(), valuesToReduce.end());
  while (pendingValues.size() > 1) {
    SmallVector<Value> nextValuesToReduce;
    nextValuesToReduce.reserve((pendingValues.size() + 1) / 2);
    for (size_t i = 0; i + 1 < pendingValues.size(); i += 2) {
      Value firstValue = pendingValues[i];
      Value secondValue = pendingValues[i + 1];

      Operation *firstCompute = firstValue.getParentBlock()->getParentOp();
      Operation *secondCompute = secondValue.getParentBlock()->getParentOp();

      assert(isa<spatial::SpatWeightedCompute>(firstCompute));
      assert(isa<spatial::SpatWeightedCompute>(secondCompute));

      // Always send from the compute op that comes first to the later one.
      if (secondCompute->isBeforeInBlock(firstCompute)) {
        std::swap(firstValue, secondValue);
        std::swap(firstCompute, secondCompute);
      }

      // 1. Add a channel before the first computeOp
      rewriter.setInsertionPoint(firstCompute);
      auto channel = rewriter.create<spatial::SpatChannelNewOp>(loc, channelType);

      // 2. Add a sendOp after the first value
      rewriter.setInsertionPointAfterValue(firstValue);
      rewriter.create<spatial::SpatChannelSendOp>(loc, channel, firstValue);

      // 3. Add a receiveOp after the second value
      rewriter.setInsertionPointAfterValue(secondValue);
      auto receivedValue = rewriter.create<spatial::SpatChannelReceiveOp>(
          loc, secondValue.getType(), channel);

      // 4. Apply reduction between second value and received value
      rewriter.setInsertionPointAfterValue(receivedValue);
      Value reduced = reduce(receivedValue, secondValue);

      nextValuesToReduce.push_back(reduced);
    }

    // If we have an odd number of inputs, the last one is carried over to the
    // next round unchanged.
    if (pendingValues.size() % 2 == 1) {
      nextValuesToReduce.push_back(pendingValues.back());
    }

    // Replace the list of pending values with the new one.
    pendingValues = std::move(nextValuesToReduce);
  }

  assert(pendingValues.size() == 1 &&
         "Internal error: expected a single input at this point.");

  Value finalValue = pendingValues[0];

  if (postprocess) {
    rewriter.setInsertionPointAfterValue(finalValue);
    finalValue = postprocess(finalValue);
  }

  return finalValue;
}
|
||||
|
||||
/// Tells whether pooling ops of type `PoolOp` need a post-processing step on
/// every reduced pooling window.
///
/// This primary template answers `false`; op types that do need the extra step
/// provide an explicit specialization.
///
/// @tparam PoolOp the pooling op type being lowered.
/// @return always `false` for the primary template.
template <typename PoolOp>
bool hasPostProcessPoolingWindow() {
  return false;
}
|
||||
|
||||
template <>
|
||||
bool hasPostProcessPoolingWindow<ONNXAveragePoolOp>() {
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename PoolOp>
|
||||
Value postProcessPoolingWindow(ConversionPatternRewriter &rewriter,
|
||||
Location loc, PoolOp poolOp, Value valueToDivide, size_t krn_size,
|
||||
size_t tilesSkippedByPadding) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
template <>
|
||||
Value postProcessPoolingWindow<ONNXAveragePoolOp>(
|
||||
ConversionPatternRewriter &rewriter, Location loc, ONNXAveragePoolOp poolOp,
|
||||
Value valueToDivide, size_t krn_size, size_t tilesSkippedByPadding) {
|
||||
bool countIncludePad = poolOp.getCountIncludePad() == 1;
|
||||
|
||||
size_t divisorNumber =
|
||||
countIncludePad ? krn_size : krn_size - tilesSkippedByPadding;
|
||||
|
||||
RankedTensorType scalarTensor =
|
||||
RankedTensorType::get({1}, rewriter.getF32Type());
|
||||
|
||||
// Put a spat.const before the computeOp, and use its value. We do this to be
|
||||
// compatible with the current code generation, which assumes constant to be
|
||||
// loaded in global memory, which is allocated by adding a spat.const OP
|
||||
// directly under func.func (i.e. alongside ComputeOps)
|
||||
auto computeOp = cast<spatial::SpatWeightedCompute>(
|
||||
valueToDivide.getDefiningOp()->getParentOp());
|
||||
rewriter.setInsertionPoint(computeOp);
|
||||
auto divisorValue = rewriter.create<spatial::SpatConstantOp>(loc, scalarTensor,
|
||||
rewriter.getI64IntegerAttr(divisorNumber),
|
||||
/* should_allocate = */ rewriter.getBoolAttr(true));
|
||||
|
||||
rewriter.setInsertionPointAfterValue(valueToDivide);
|
||||
return rewriter.create<spatial::SpatVSDivOp>(
|
||||
loc, valueToDivide.getType(), valueToDivide, divisorValue);
|
||||
}
|
||||
|
||||
template <typename PoolOp, typename PoolOpAdaptor, typename ReduceOp>
|
||||
struct PoolingBaseConverter : public OpConversionPattern<PoolOp> {
|
||||
PoolingBaseConverter(MLIRContext *ctx) : OpConversionPattern<PoolOp>(ctx) {}
|
||||
|
||||
LogicalResult matchAndRewrite(PoolOp poolOp, PoolOpAdaptor adaptor,
|
||||
ConversionPatternRewriter &rewriter) const final {
|
||||
Value X = adaptor.getX();
|
||||
ShapedType xShape = mlir::cast<ShapedType>(X.getType());
|
||||
Value Y = poolOp.getResult();
|
||||
ShapedType yShape = mlir::cast<ShapedType>(Y.getType());
|
||||
|
||||
size_t stride_x, stride_y, dilation_x, dilation_y, krn_w, krn_h;
|
||||
unpackOptionalPairVector(adaptor.getStrides(), stride_x, stride_y);
|
||||
unpackOptionalPairVector(adaptor.getDilations(), dilation_x, dilation_y);
|
||||
unpackOptionalPairVector(adaptor.getKernelShape(), krn_w, krn_h);
|
||||
|
||||
if (adaptor.getAutoPad() != "NOTSET") {
|
||||
return rewriter.notifyMatchFailure(
|
||||
poolOp, "auto_pad != NOTSET is deprecated.");
|
||||
}
|
||||
|
||||
size_t pad_x, pad_y;
|
||||
auto padUnpackError =
|
||||
unpackOptionalPadsVector(adaptor.getPads(), pad_x, pad_y);
|
||||
if (padUnpackError.has_value()) {
|
||||
return rewriter.notifyMatchFailure(poolOp, padUnpackError.value());
|
||||
}
|
||||
|
||||
Location loc = poolOp.getLoc();
|
||||
|
||||
size_t input_h = GET_IMAGE_HEIGHT(xShape);
|
||||
size_t input_w = GET_IMAGE_WIDTH(xShape);
|
||||
size_t output_h = GET_IMAGE_HEIGHT(yShape);
|
||||
size_t output_w = GET_IMAGE_WIDTH(yShape);
|
||||
size_t channelTileCount =
|
||||
ceilIntegerDivide(GET_IMAGE_CHANNEL(xShape), crossbarSize.getValue());
|
||||
size_t channelTileRest = GET_IMAGE_CHANNEL(xShape) % crossbarSize;
|
||||
|
||||
// 1: Tile the input tensor
|
||||
// Input tiles need to be indexed by:
|
||||
// a. Channel Tile
|
||||
// b. Pixel `x` position
|
||||
// c. Pixel `y` position
|
||||
// For example: inputTiles[channelTile][x][y]
|
||||
// Example complete input tensor: tensor<1x3x12x12xf32> (NxCxWxH)
|
||||
// Suppose that the input tensor is produced by concatenating the results of
|
||||
// many ComputeOps. Get the result tiles from these ComputeOps.
|
||||
SmallVector<SmallVector<SmallVector<Value>>> inputTiles(channelTileCount,
|
||||
SmallVector<SmallVector<Value>>(input_w, SmallVector<Value>(input_h)));
|
||||
|
||||
auto resolveErrorOpt = resolveImgInputTiles(X, inputTiles, channelTileCount,
|
||||
channelTileRest, input_w, input_h, rewriter);
|
||||
if (resolveErrorOpt.has_value()) {
|
||||
return rewriter.notifyMatchFailure(poolOp, *resolveErrorOpt);
|
||||
}
|
||||
|
||||
// TODO: This requires a core for each input tile, which is not ideal. We
|
||||
// can do better.
|
||||
// If some input tiles come from the func.func operands, load
|
||||
// them into a computeOp and yield them
|
||||
for (size_t t = 0; t < channelTileCount; t++) {
|
||||
for (size_t x = 0; x < input_w; x++) {
|
||||
for (size_t y = 0; y < input_h; y++) {
|
||||
if (auto extractSliceOp =
|
||||
inputTiles[t][x][y].getDefiningOp<tensor::ExtractSliceOp>()) {
|
||||
Location tileLoc = extractSliceOp.getLoc();
|
||||
|
||||
auto tempComputeOp = rewriter.create<spatial::SpatWeightedCompute>(
|
||||
tileLoc, extractSliceOp.getResultType(),
|
||||
/* xbarWeights =*/ValueRange(), extractSliceOp.getResult());
|
||||
|
||||
Block *tempComputeOpBlock = new Block();
|
||||
tempComputeOp.getBody().push_back(tempComputeOpBlock);
|
||||
auto tempComputeOpBlockArg = tempComputeOpBlock->addArgument(
|
||||
extractSliceOp.getType(), tileLoc);
|
||||
|
||||
rewriter.setInsertionPointToStart(tempComputeOpBlock);
|
||||
rewriter.create<spatial::SpatYieldOp>(tileLoc, tempComputeOpBlockArg);
|
||||
rewriter.setInsertionPointAfter(tempComputeOp);
|
||||
inputTiles[t][x][y] = tempComputeOp.getResult(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 2: Tile the output tensor
|
||||
// Output tiles need to be indexed by:
|
||||
// a. Channel Tile
|
||||
// b. Pixel `x` position
|
||||
// c. Pixel `y` position
|
||||
// For example: outputTiles[channelTile][x][y]
|
||||
// Example complete output tensor: tensor<1x3x6x6xf32> (NxCxWxH)
|
||||
SmallVector<SmallVector<SmallVector<Value>>> outputTiles(
|
||||
channelTileCount, SmallVector<SmallVector<Value>>(
|
||||
output_w, SmallVector<Value>(output_h, nullptr)));
|
||||
|
||||
// List of values to pool for each output pixel
|
||||
SmallVector<Value> valuesToPool;
|
||||
|
||||
// Iterate each output tile
|
||||
for (size_t outTile = 0; outTile < channelTileCount; outTile++) {
|
||||
// Iterate each output pixel
|
||||
for (size_t outX = 0; outX < output_w; outX++) {
|
||||
for (size_t outY = 0; outY < output_h; outY++) {
|
||||
|
||||
// Each output pixel tile is computed by pooling a window of input
|
||||
// pixel tiles
|
||||
valuesToPool.clear();
|
||||
size_t tilesSkippedByPadding = 0;
|
||||
|
||||
auto [start_x, end_x] = kernel_get_start_and_end(
|
||||
outX, input_w, krn_w, stride_x, dilation_x, pad_x);
|
||||
auto [start_y, end_y] = kernel_get_start_and_end(
|
||||
outY, input_h, krn_h, stride_y, dilation_y, pad_y);
|
||||
|
||||
for (size_t inX = start_x; inX < end_x; inX += dilation_x) {
|
||||
for (size_t inY = start_y; inY < end_y; inY += dilation_y) {
|
||||
if (failed(verifyWithinBoundsAndPaddings(
|
||||
input_w, input_h, inX, inY, pad_x, pad_y))) {
|
||||
tilesSkippedByPadding++;
|
||||
continue;
|
||||
}
|
||||
|
||||
Value inputTile = inputTiles[outTile][inX][inY];
|
||||
|
||||
Value valueToPool;
|
||||
if (auto computeProducer =
|
||||
inputTile.getDefiningOp<spatial::SpatWeightedCompute>()) {
|
||||
|
||||
int resultNumber = getResultIndex(computeProducer, inputTile);
|
||||
|
||||
auto yieldInComputeOp = cast<spatial::SpatYieldOp>(
|
||||
computeProducer.getBody().front().getTerminator());
|
||||
valueToPool = yieldInComputeOp.getOperand(resultNumber);
|
||||
} else if (auto receiveProducer =
|
||||
inputTile
|
||||
.getDefiningOp<spatial::SpatChannelReceiveOp>()) {
|
||||
auto sendOpOpt =
|
||||
getOtherEndOfChannel(receiveProducer, true, rewriter);
|
||||
if (failed(sendOpOpt)) {
|
||||
return rewriter.notifyMatchFailure(poolOp,
|
||||
"ChannelReceiveOp does not have a matching "
|
||||
"ChannelSendOp.");
|
||||
}
|
||||
auto sendOp = cast<spatial::SpatChannelSendOp>(*sendOpOpt);
|
||||
|
||||
valueToPool = sendOp.getData();
|
||||
} else {
|
||||
return rewriter.notifyMatchFailure(poolOp,
|
||||
"Input tile for Pooling is not produced by a "
|
||||
"WeightedComputeOp nor a receiveOp");
|
||||
}
|
||||
|
||||
valuesToPool.push_back(valueToPool);
|
||||
}
|
||||
}
|
||||
|
||||
assert(valuesToPool.size() != 0 &&
|
||||
"Pooling computed on zero tiles make no sense.");
|
||||
// assert(computeOpsForPooling.size() != 1 &&
|
||||
// "Pooling computed on one tiles make no sense??? Or maybe
|
||||
// this " "should have been simplified earlier???");
|
||||
|
||||
std::function<Value(const Value &)> postProcessFn = nullptr;
|
||||
if (hasPostProcessPoolingWindow<PoolOp>()) {
|
||||
postProcessFn = [&](const Value prevFinalRes) {
|
||||
return postProcessPoolingWindow(rewriter, loc, poolOp,
|
||||
prevFinalRes, krn_h * krn_w, tilesSkippedByPadding);
|
||||
};
|
||||
}
|
||||
|
||||
Value reducedWithinCompute = applyReducePatternNew(
|
||||
valuesToPool, rewriter,
|
||||
[&](const Value lhs, const Value rhs) {
|
||||
return rewriter.create<ReduceOp>(loc, lhs.getType(), lhs, rhs);
|
||||
},
|
||||
nullptr, postProcessFn);
|
||||
|
||||
// Send this value through a channel, and receive it in the
|
||||
// `func.func`. During lowering, we will need to "move it" into the
|
||||
// users computeOps
|
||||
auto computeOpOfReduced = cast<spatial::SpatWeightedCompute>(
|
||||
reducedWithinCompute.getDefiningOp()->getParentOp());
|
||||
|
||||
// Create a new channel before the computeOp
|
||||
rewriter.setInsertionPoint(computeOpOfReduced);
|
||||
auto reduceChannel = rewriter.create<spatial::SpatChannelNewOp>(
|
||||
loc, spatial::SpatChannelType::get(rewriter.getContext()));
|
||||
|
||||
// Send value through the channel
|
||||
rewriter.setInsertionPointAfterValue(reducedWithinCompute);
|
||||
rewriter.create<spatial::SpatChannelSendOp>(
|
||||
loc, reduceChannel, reducedWithinCompute);
|
||||
|
||||
// Receive after the computeOp
|
||||
rewriter.setInsertionPointAfter(computeOpOfReduced);
|
||||
auto receivedValue = rewriter.create<spatial::SpatChannelReceiveOp>(
|
||||
loc, reducedWithinCompute.getType(), reduceChannel);
|
||||
|
||||
outputTiles[outTile][outX][outY] = receivedValue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: outputTiles are not the results of the computeOps! We need to add
|
||||
// them!
|
||||
|
||||
std::unordered_map<Operation *,
|
||||
SmallVector<std::tuple<size_t, size_t, size_t, Value>>>
|
||||
computeOpNeedingResults;
|
||||
|
||||
// Iterate each output tile
|
||||
for (size_t outTile = 0; outTile < channelTileCount; outTile++) {
|
||||
// Iterate each output pixel
|
||||
for (size_t outX = 0; outX < output_w; outX++) {
|
||||
for (size_t outY = 0; outY < output_h; outY++) {
|
||||
auto outputTile = outputTiles[outTile][outX][outY];
|
||||
auto outputTileProducer = outputTile.getDefiningOp()->getParentOp();
|
||||
if (!outputTileProducer) {
|
||||
return rewriter.notifyMatchFailure(poolOp,
|
||||
"Output tile for Pooling is not produced by a "
|
||||
"WeightedComputeOp.");
|
||||
}
|
||||
|
||||
computeOpNeedingResults[outputTileProducer].push_back(
|
||||
std::make_tuple(outTile, outX, outY, outputTile));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Value outputImage =
|
||||
createImgConcatOp(outputTiles, rewriter, loc, poolOp.getType());
|
||||
|
||||
rewriter.replaceOp(poolOp, outputImage);
|
||||
|
||||
return success();
|
||||
}
|
||||
};
|
||||
|
||||
/// Registers the pooling lowerings into `patterns`: max pooling reduces its
/// windows with `spatial::SpatVMaxOp`, average pooling with
/// `spatial::SpatVAddOp`.
void populatePoolingTilingPattern(
    RewritePatternSet &patterns, MLIRContext *ctx) {
  using MaxPoolConverter = PoolingBaseConverter<ONNXMaxPoolSingleOutOp,
      ONNXMaxPoolSingleOutOpAdaptor, spatial::SpatVMaxOp>;
  using AveragePoolConverter = PoolingBaseConverter<ONNXAveragePoolOp,
      ONNXAveragePoolOpAdaptor, spatial::SpatVAddOp>;

  patterns.insert<MaxPoolConverter>(ctx);
  patterns.insert<AveragePoolConverter>(ctx);
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
90
src/PIM/Conversion/ONNXToSpatial/NN/ReduceMean.cpp
Normal file
90
src/PIM/Conversion/ONNXToSpatial/NN/ReduceMean.cpp
Normal file
@@ -0,0 +1,90 @@
|
||||
|
||||
|
||||
#include "Conversion/ONNXToSpatial/ONNXToSpatialPatterns.hpp"
|
||||
#include "mlir/Transforms/DialectConversion.h"
|
||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||
|
||||
using namespace mlir;
|
||||
namespace onnx_mlir {
|
||||
|
||||
struct ReduceMeanConversionPattern
|
||||
: public OpConversionPattern<ONNXReduceMeanV13Op> {
|
||||
|
||||
ReduceMeanConversionPattern(MLIRContext *ctx) : OpConversionPattern(ctx) {}
|
||||
|
||||
LogicalResult matchAndRewrite(ONNXReduceMeanV13Op reduceMean,
|
||||
ONNXReduceMeanV13OpAdaptor adaptor,
|
||||
ConversionPatternRewriter &rewriter) const final {
|
||||
|
||||
// Get the input tensor.
|
||||
Value inputTensor = adaptor.getData();
|
||||
auto inputTensorType = cast<RankedTensorType>(inputTensor.getType());
|
||||
|
||||
// This pattern will substitute the ONNXReduceMeanV13Op with a
|
||||
// ONNXAveragePoolOp with the same input tensor and an appropriate kernel
|
||||
// shape and strides.
|
||||
|
||||
// To get the stride and shape of the kernel, we need to read the tensor
|
||||
// shape.
|
||||
int image_height = inputTensorType.getShape()[2];
|
||||
int image_width = inputTensorType.getShape()[3];
|
||||
|
||||
// Define the kernel shape and strides.
|
||||
SmallVector<int64_t> kernelShapeVals = {image_height, image_width};
|
||||
SmallVector<int64_t> stridesVals = {image_height, image_width};
|
||||
SmallVector<int64_t> dilationsVals = {1, 1};
|
||||
|
||||
// Set the pads to 0.
|
||||
SmallVector<int64_t> padsVals = {0, 0, 0, 0};
|
||||
|
||||
// Create the ArrayAttrs
|
||||
auto kernelShape = mlir::ArrayAttr::get(rewriter.getContext(),
|
||||
llvm::to_vector(
|
||||
llvm::map_range(kernelShapeVals, [&](int64_t v) -> mlir::Attribute {
|
||||
return rewriter.getI64IntegerAttr(v);
|
||||
})));
|
||||
|
||||
auto strides = mlir::ArrayAttr::get(rewriter.getContext(),
|
||||
llvm::to_vector(
|
||||
llvm::map_range(stridesVals, [&](int64_t v) -> mlir::Attribute {
|
||||
return rewriter.getI64IntegerAttr(v);
|
||||
})));
|
||||
|
||||
auto dilations = mlir::ArrayAttr::get(rewriter.getContext(),
|
||||
llvm::to_vector(
|
||||
llvm::map_range(dilationsVals, [&](int64_t v) -> mlir::Attribute {
|
||||
return rewriter.getI64IntegerAttr(v);
|
||||
})));
|
||||
|
||||
auto pads = mlir::ArrayAttr::get(rewriter.getContext(),
|
||||
llvm::to_vector(
|
||||
llvm::map_range(padsVals, [&](int64_t v) -> mlir::Attribute {
|
||||
return rewriter.getI64IntegerAttr(v);
|
||||
})));
|
||||
|
||||
// Create the resulting tensor type.
|
||||
auto resultType = RankedTensorType::get(
|
||||
/*shape=*/{inputTensorType.getShape()[0], inputTensorType.getShape()[1],
|
||||
1, 1},
|
||||
/*elementType=*/inputTensorType.getElementType());
|
||||
|
||||
// Create the ONNXAveragePoolOp.
|
||||
auto averagePool = rewriter.create<ONNXAveragePoolOp>(reduceMean.getLoc(),
|
||||
resultType, inputTensor, /*auto_pad=*/"NOTSET",
|
||||
/*ceil_mode=*/0, /*count_include_pad=*/1, dilations,
|
||||
/*kernel_shape=*/kernelShape,
|
||||
/*pads=*/pads, /*strides=*/strides);
|
||||
|
||||
// Replace the ONNXReduceMeanV13Op with the ONNXAveragePoolOp.
|
||||
rewriter.replaceOp(reduceMean, averagePool.getResult());
|
||||
|
||||
return success();
|
||||
}
|
||||
};
|
||||
|
||||
/// Registers the `ReduceMeanConversionPattern` rewrite into `patterns`.
void populateReduceMeanConversionPattern(
    RewritePatternSet &patterns, MLIRContext *ctx) {
  using Pattern = ReduceMeanConversionPattern;
  patterns.insert<Pattern>(ctx);
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
Reference in New Issue
Block a user