add PIM accelerator
This commit is contained in:
34
src/PIM/Conversion/ONNXToSpatial/CMakeLists.txt
Normal file
34
src/PIM/Conversion/ONNXToSpatial/CMakeLists.txt
Normal file
@@ -0,0 +1,34 @@
|
||||
# TableGen the DRR rewrite patterns for the ONNX-to-Spatial conversion.
# ONNXToSpatial.td is processed with -gen-rewriters into ONNXToSpatial.hpp.inc,
# which the C++ sources below include.
set(LLVM_TARGET_DEFINITIONS ONNXToSpatial.td)
mlir_tablegen(ONNXToSpatial.hpp.inc -gen-rewriters "-I${ONNX_MLIR_SRC_ROOT}")
add_public_tablegen_target(ONNXToSpatialIncGen)

# Conversion library lowering ONNX operations to the PIM Spatial dialect.
# "Experimental" sources hold alternative tiling implementations that coexist
# with the default ones.
add_onnx_mlir_library(OMONNXToSpatial
  Math/Gemm.cpp
  Math/Conv.cpp
  Math/ExperimentalConv.cpp
  Math/ExperimentalGemm.cpp
  NN/Pooling.cpp
  NN/ExperimentalPooling.cpp
  NN/ReduceMean.cpp
  Tensor/ONNXConcatToTensorConcat.cpp
  Tensor/RemoveUnusedHelperOps.cpp
  Utils/SpatialReducer.cpp
  Utils/WeightSubdivider.cpp
  Utils/AnnotateReplication.cpp
  ONNXToSpatialPass.hpp
  ONNXToSpatialPass.cpp
  ONNXToSpatialCommon.cpp

  # The generated rewriter include must exist before these sources compile.
  DEPENDS
  ONNXToSpatialIncGen

  LINK_LIBS PUBLIC
  OMCompilerOptions
  OMPimCompilerOptions
  OMONNXOps
  SpatialOps
  OMPIMCommon

  # Accelerator-private include path for PIM headers.
  ACCEL_INCLUDE_DIRS PRIVATE
  ${PIM_INCLUDE_PATH}
  )
|
||||
624
src/PIM/Conversion/ONNXToSpatial/Math/Conv.cpp
Normal file
624
src/PIM/Conversion/ONNXToSpatial/Math/Conv.cpp
Normal file
@@ -0,0 +1,624 @@
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/Dialect/Tosa/IR/TosaOps.h"
|
||||
#include "mlir/IR/Block.h"
|
||||
#include "mlir/IR/BuiltinAttributes.h"
|
||||
#include "mlir/IR/BuiltinTypeInterfaces.h"
|
||||
#include "mlir/IR/BuiltinTypes.h"
|
||||
#include "mlir/IR/IRMapping.h"
|
||||
#include "mlir/IR/Location.h"
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
#include "mlir/IR/Types.h"
|
||||
#include "mlir/IR/Value.h"
|
||||
#include "mlir/Support/LLVM.h"
|
||||
#include "mlir/Transforms/DialectConversion.h"
|
||||
#include "src/Accelerators/PIM/Common/PIMCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/Support/LogicalResult.h"
|
||||
#include <cstddef>
|
||||
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
using namespace mlir;
|
||||
using namespace std;
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
// NOTE:
|
||||
// This might be useful to re-implement this considering for loops.
|
||||
// neededXbars = krn_h * krn_w * inputTileCount * outputTileCount;
|
||||
|
||||
/**
 * @brief A momentary representation of a core, to be used within the tiling of
 * a convolution operation.
 *
 * A Core accumulates crossbar weights, operands, MVM operations (in its own
 * detached `block`) and results while the tiling loop runs. It is later
 * materialized as a single spatial::SpatWeightedCompute operation by
 * `createWComputeOp`. Results handed out earlier via `makeResultRemappable`
 * are shared_ptr<Value> so they can be redirected to the materialized op's
 * results afterwards by `remapResults`.
 */
class Core {
public:
  Core(const size_t coreId, ConversionPatternRewriter &rewriter)
      : coreId(coreId), rewriter(rewriter) {}

  /**
   * @brief Add a MVM operation to the core.
   *
   * The operation is appended to the core's (not yet attached) body block.
   * If an MVM for the same output tile already exists in this core, the new
   * result is immediately accumulated into it with a SpatVAddOp (within-core
   * reduction).
   *
   * @param inputTile The input tile to the MVM operation.
   * @param xbarIndex The index of the crossbar weight to use.
   * @param outputTileId The id of the output tile.
   * @param mvmOutType The result's shape.
   * @return Value The result of the MVM operation (possibly the reduction).
   */
  Value addMVM(
      Value inputTile, size_t xbarIndex, size_t outputTileId, Type mvmOutType) {
    // Use the inputTile as the reference location for the MVM operation.
    Location loc = inputTile.getLoc();

    // Move the insertion point to the end of the block.
    rewriter.setInsertionPointToEnd(block.get());

    // Add the inputTile to the block arguments, and to the operands.
    // `operandMap` deduplicates: each distinct input tile becomes exactly one
    // block argument, no matter how many MVMs consume it.
    Value operand = operandMap.lookupOrNull(inputTile);
    if (not operand) {
      operand = block->addArgument(inputTile.getType(), loc);
      operands.push_back(inputTile);
      operandMap.map(inputTile, operand);
    }

    // TODO: Compute the output type using the matrix, and check if `mvmOutType`
    // is correct.

    // Construct the MVM operation
    Value result = rewriter.create<spatial::SpatWeightedMVMOp>(
        loc, mvmOutType, xbarIndex, operand);

    // Since we are within the same core and no computation can happen in
    // parallel, we can just apply a linear reduction in case we have multiple
    // MVM operations for the same outputTile.
    auto lastMVM = outputTileToMVM.find(outputTileId);

    // If an entry for this outputTile already exists, apply reduction.
    if (lastMVM != outputTileToMVM.end()) {
      // MVM results should have the same type for reduction.
      assert(lastMVM->second.getType() == result.getType());
      result = rewriter.create<spatial::SpatVAddOp>(
          loc, mvmOutType, lastMVM->second, result);
    }

    // Remember the latest value produced for this output tile.
    outputTileToMVM[outputTileId] = result;
    return result;
  }

  /**
   * @brief Mark a result as remappable, and return a shared pointer to it.
   *
   * This function marks a result as remappable, and returns a shared pointer to
   * it. We need to keep track of these values to generate the YieldOp at a
   * later stage. The pointee is later overwritten by `remapResults` to point
   * at the corresponding result of the materialized SpatWeightedCompute.
   *
   * @param result A result to track, for later remapping. Must be produced by
   *               an operation inside this core's body block.
   * @return shared_ptr<Value> A shared pointer to the result.
   */
  shared_ptr<Value> makeResultRemappable(Value result) {
    // Verify that the result is present in the block.
    assert(result.getDefiningOp()->getBlock() == block.get());

    shared_ptr<mlir::Value> remappableResult = make_shared<Value>(result);

    // `results` feeds the YieldOp / result types; `resultsToRemap` is kept in
    // the same order so index i of one matches index i of the other.
    resultsToRemap.push_back(remappableResult);
    results.push_back(result);

    return remappableResult;
  }

  /**
   * @brief Add a remappable operand to the core, to merge partial results
   * inter-core.
   *
   * The actual SSA value is not known yet (it will only exist after the
   * producing core is materialized), so only a block argument is created now;
   * the operand itself is attached later by `addRemappedOperands`.
   *
   * @param operand The (shared, later-remapped) operand to add.
   * @return Value The block argument representing the operand.
   */
  Value addRemappableOperand(std::shared_ptr<Value> operand) {
    // Check that the operand is not already there.
    assert(not operandMap.contains(*operand));

    Value argument = block->addArgument(operand->getType(), operand->getLoc());
    remappableOperands.push_back(operand);
    return argument;
  }

  /**
   * @brief Generate a spatial::SpatWeightedCompute operation from the core.
   *
   * Transfers ownership of the accumulated body block into the new op and
   * terminates it with a SpatYieldOp over the tracked results. After this
   * call the core's `block` is released; only remapping steps should follow.
   *
   * @param loc The location of the operation.
   * @return spatial::SpatWeightedCompute
   */
  spatial::SpatWeightedCompute createWComputeOp(Location loc) {
    // Get the shape of the results.
    SmallVector<Type> resultTypes;
    for (const auto &value : results) {
      resultTypes.push_back(value.getType());
    }

    // Create the WComputeOp, with non-remappable operands only.
    wcomputeOp = rewriter.create<spatial::SpatWeightedCompute>(
        loc, resultTypes, xbarWeights, operands);

    // Add the body to the WComputeOp. Ownership of the block moves from the
    // unique_ptr into the op's region.
    Block *releasedBlock = block.release();
    wcomputeOp.getBody().push_back(releasedBlock);

    // Add the `yieldOp` at the end, with the results.
    rewriter.setInsertionPointToEnd(releasedBlock);
    rewriter.create<spatial::SpatYieldOp>(loc, results);

    return wcomputeOp;
  }

  /**
   * @brief Remap the results to the WComputeOp results.
   *
   * Overwrites every shared Value handed out by `makeResultRemappable` so that
   * holders now see the materialized op's corresponding result.
   */
  void remapResults() {
    // Remap all the results to the WComputeOp results.
    assert(resultsToRemap.size() == wcomputeOp->getNumResults());
    for (size_t i = 0; i < resultsToRemap.size(); i++) {
      *resultsToRemap[i] = wcomputeOp.getResult(i);
    }
  }

  /**
   * @brief Attach the deferred inter-core operands to the materialized op.
   *
   * Must run after every producing core has executed `remapResults`, so the
   * shared Values already point at real SpatWeightedCompute results.
   */
  void addRemappedOperands() {
    // Insert the remappableOperands (which were remapped in
    // `addRemappableOperand` of another Core)
    for (auto remappedValue : remappableOperands) {
      wcomputeOp->insertOperands(wcomputeOp->getNumOperands(), *remappedValue);
    }

    // Update the wcomputeOp operandSegmentSize
    incrementWeightedComputeInputsSegmentSize(
        wcomputeOp, static_cast<int>(remappableOperands.size()));
  }

  /**
   * @brief Register a crossbar weight on this core.
   * @return The crossbar index assigned to the weight.
   */
  size_t addXbarWeight(Value weight) {
    assert(!isXbarsFull());
    xbarWeights.push_back(weight);
    return xbarWeights.size() - 1;
  }

  /// @brief True when every crossbar of the core already holds a weight.
  bool isXbarsFull() {
    assert(xbarWeights.size() <= crossbarCountInCore);
    return xbarWeights.size() == crossbarCountInCore;
  }

  /// @brief True when no operation has been added to the core's body yet.
  bool isCoreEmpty() { return block->empty(); }

  /// @brief Debug helper: print weights, operands, body and results to stdout.
  void dump() {
    // Print the coreId
    llvm::outs() << "Core " << coreId << ":\n";
    // Print the weights
    llvm::outs() << "Xbar Weights:\n";
    for (auto weight : xbarWeights) {
      weight.dump();
    }
    // Print the operands
    llvm::outs() << "Operands:\n";
    for (auto operand : operands) {
      llvm::outs() << operand << "\n";
    }

    // Dump the body block
    for (auto &op : block->getOperations()) {
      op.dump();
    }

    // Print the results
    llvm::outs() << "Results:\n";
    for (auto result : results) {
      llvm::outs() << result << "\n";
    }
  }

  // Identifier assigned at construction; used to order inter-core reductions.
  const size_t coreId;

private:
  ConversionPatternRewriter &rewriter;

  // Should these be set<Value> instead? But I need to keep the order
  vector<Value> operands;
  vector<std::shared_ptr<Value>> remappableOperands;

  vector<Value> results;
  vector<std::shared_ptr<Value>> resultsToRemap;

  // Maps from input tiles to the block operand
  IRMapping operandMap;

  // Map from outputTileId to MVM operation producing it
  unordered_map<size_t, Value> outputTileToMVM;

  vector<Value> xbarWeights;

  // Detached body block, owned until createWComputeOp releases it.
  unique_ptr<mlir::Block> block = make_unique<Block>();

  spatial::SpatWeightedCompute wcomputeOp;
};
|
||||
|
||||
/**
 * @brief Tile an ONNX convolution onto PIM compute cores.
 *
 * Input, weight and output tensors are tiled along their channel dimensions
 * by `crossbarSize`. Every (output tile, input tile, kernel position) weight
 * slice becomes one crossbar of a `Core`; one MVM is scheduled per output
 * pixel contribution. Partial sums are reduced within-core by `Core::addMVM`
 * and across cores with spatial::SpatVAddOp. Each core is finally
 * materialized as a spatial::SpatWeightedCompute, and the per-tile results
 * are concatenated into the output image, which replaces the ONNXConvOp.
 */
struct ONNXConvOpTile : public OpConversionPattern<ONNXConvOp> {
  ONNXConvOpTile(MLIRContext *ctx) : OpConversionPattern(ctx) {}

  /// A partial result (`value`) together with the core that produced it.
  struct Producer_t {
    Value value;
    shared_ptr<Core> core;
  };

  LogicalResult matchAndRewrite(ONNXConvOp conv, ONNXConvOpAdaptor convAdaptor,
      ConversionPatternRewriter &rewriter) const final {
    ShapedType xShape = mlir::cast<ShapedType>(convAdaptor.getX().getType());
    ShapedType wShape = mlir::cast<ShapedType>(convAdaptor.getW().getType());
    // NOTE(review): ONNX Conv's bias operand is optional; if B is absent this
    // cast runs on a null Value — confirm upstream guarantees a bias here.
    ShapedType bShape = mlir::cast<ShapedType>(convAdaptor.getB().getType());
    ShapedType yShape = mlir::cast<ShapedType>(conv.getY().getType());

    size_t stride_x, stride_y, dilation_x, dilation_y, pad_x, pad_y;
    unpackOptionalPairVector(conv.getStrides(), stride_x, stride_y);
    unpackOptionalPairVector(conv.getDilations(), dilation_x, dilation_y);

    auto padUnpackError =
        unpackOptionalPadsVector(convAdaptor.getPads(), pad_x, pad_y);
    if (padUnpackError.has_value()) {
      return rewriter.notifyMatchFailure(conv, padUnpackError.value());
    }

    // TODO: Pad value at beginning and end of each dimension could be
    // different. We should handle this case.

    // TODO: Activation fusion. When the conv has a single user that is a
    // mapping/activation op (except Softmax), inline it into the compute ops
    // and replace that user instead of the conv.

    size_t input_h = GET_IMAGE_HEIGHT(xShape);
    size_t input_w = GET_IMAGE_WIDTH(xShape);
    size_t output_h = GET_IMAGE_HEIGHT(yShape);
    size_t output_w = GET_IMAGE_WIDTH(yShape);
    size_t krn_h = GET_KERNEL_HEIGHT(wShape);
    size_t krn_w = GET_KERNEL_WIDTH(wShape);

    Location loc = conv.getLoc();

    // Channel-dimension tiling: how many crossbar-sized tiles, and the size
    // of the final partial tile (0 when the division is exact).
    size_t inputTileCount =
        ceilIntegerDivide(GET_IMAGE_CHANNEL(xShape), crossbarSize.getValue());
    size_t inputTileRemainder = GET_IMAGE_CHANNEL(xShape) % crossbarSize;
    size_t outputTileCount =
        ceilIntegerDivide(GET_IMAGE_CHANNEL(yShape), crossbarSize.getValue());
    size_t outputTileRemainder = GET_IMAGE_CHANNEL(yShape) % crossbarSize;

    // Tile the input tensor.
    // Input tiles are indexed as inputTiles[channelTile][x][y].
    // Example complete input tensor: tensor<1x3x6x6xf32> (NxCxWxH)
    SmallVector<SmallVector<SmallVector<Value>>> inputTiles(inputTileCount,
        SmallVector<SmallVector<Value>>(input_w, SmallVector<Value>(input_h)));

    // TODO(review): `input_h` is passed twice while `inputTiles` is sized
    // (input_w, input_h); one of the two is probably meant to be `input_w`.
    // Confirm against resolveImgInputTiles' parameter order (only correct
    // today for square inputs).
    auto resolveErrorOpt = resolveImgInputTiles(convAdaptor.getX(), inputTiles,
        inputTileCount, inputTileRemainder, input_h, input_h, rewriter);
    if (resolveErrorOpt.has_value()) {
      return rewriter.notifyMatchFailure(conv, *resolveErrorOpt);
    }

    // Tile the weight tensor.
    // Weight tiles are indexed as weightTiles[filterTile][channelTile][x][y].
    // Example complete weight tensor: tensor<32x3x3x3xf32> (FxCxWxH)
    SmallVector<SmallVector<SmallVector<SmallVector<Value>>>> weightTiles(
        outputTileCount,
        SmallVector<SmallVector<SmallVector<Value>>>(inputTileCount,
            SmallVector<SmallVector<Value>>(krn_w, SmallVector<Value>(krn_h))));

    // Slicing parameters for the weight tensor: unit strides, offsets set per
    // iteration, sizes covering one (outTile x inTile) channel block at one
    // kernel position.
    SmallVector<OpFoldResult> strides(4, rewriter.getIndexAttr(1));
    SmallVector<OpFoldResult> offsets(4, rewriter.getIndexAttr(0));
    SmallVector<OpFoldResult> sizes = {rewriter.getIndexAttr(crossbarSize),
        rewriter.getIndexAttr(crossbarSize), rewriter.getIndexAttr(1),
        rewriter.getIndexAttr(1)};
    for (size_t i = 0; i < outputTileCount; i++) {
      // Shrink the filter-dimension size for a trailing partial tile.
      if (i == outputTileCount - 1 && outputTileRemainder != 0) {
        sizes[0] = rewriter.getIndexAttr(outputTileRemainder);
      }
      // Reset the channel-dimension size (may have shrunk in the inner loop).
      sizes[1] = rewriter.getIndexAttr(crossbarSize);
      offsets[0] = rewriter.getIndexAttr(i * crossbarSize);
      for (size_t j = 0; j < inputTileCount; j++) {
        if (j == inputTileCount - 1 && inputTileRemainder != 0) {
          sizes[1] = rewriter.getIndexAttr(inputTileRemainder);
        }
        for (size_t x = 0; x < krn_w; x++) {
          for (size_t y = 0; y < krn_h; y++) {
            offsets[1] = rewriter.getIndexAttr(j * crossbarSize);
            offsets[2] = rewriter.getIndexAttr(x);
            offsets[3] = rewriter.getIndexAttr(y);
            weightTiles[i][j][x][y] = rewriter.create<tensor::ExtractSliceOp>(
                loc, convAdaptor.getW(), offsets, sizes, strides);
          }
        }
      }
    }

    /* Distribute the computation among many compute cores.
     * Try to compute in-core the computation for each output tile, and reduce
     * over as few cores as possible.
     */

    // Tile the output tensor.
    // Output tiles are indexed as outputTiles[filterTile][x][y]; each entry is
    // a remappable shared Value filled in by the reduction stage below.
    // Example complete output tensor: tensor<1x32x3x3xf32> (NxFxWxH)
    SmallVector<SmallVector<SmallVector<shared_ptr<Value>>>> outputTiles(
        outputTileCount,
        SmallVector<SmallVector<shared_ptr<Value>>>(
            output_w, SmallVector<shared_ptr<Value>>(output_h, nullptr)));

    // Replication factor from the annotation attribute, if present; 1 means
    // no replication.
    size_t replicationFactor;
    if (!conv->hasAttr(REPLICATION_ATTR_NAME)) {
      replicationFactor = 1;
    } else {
      replicationFactor =
          conv->getAttrOfType<IntegerAttr>(REPLICATION_ATTR_NAME).getInt();
    }

    // producers[outTile][out_x][out_y][producerIndex]
    vector<vector<vector<vector<Producer_t>>>> producers =
        vector<vector<vector<vector<Producer_t>>>>(outputTileCount,
            vector<vector<vector<Producer_t>>>(output_w,
                vector<vector<Producer_t>>(output_h, vector<Producer_t>())));

    // Schedule in cores. `curCores` holds one open core per replica; full
    // cores are moved into `cores` and replaced with fresh ones.
    size_t coreId = 0;
    vector<shared_ptr<Core>> curCores(replicationFactor);
    for (size_t i = 0; i < replicationFactor; i++) {
      curCores[i] = make_shared<Core>(coreId++, rewriter);
    }

    vector<shared_ptr<Core>> cores;

    const size_t replicationSliceSize =
        ceilIntegerDivide(input_w, replicationFactor);

    // NOTE(review): `krn_x` runs to krn_h while `krn_y` runs to krn_w, yet
    // they index weightTiles' (krn_w, krn_h) inner dims; this only lines up
    // for square kernels — confirm for non-square ones.
    for (size_t krn_x = 0; krn_x < krn_h; krn_x++) {
      for (size_t krn_y = 0; krn_y < krn_w; krn_y++) {

        RankedTensorType mvmOutType =
            RankedTensorType::get({1, static_cast<long>(crossbarSize), 1, 1},
                bShape.getElementType());

        for (size_t outTile = 0; outTile < outputTileCount; outTile++) {

          // Trailing partial output tile produces fewer channels.
          if (outTile == outputTileCount - 1 && outputTileRemainder != 0) {
            mvmOutType = mvmOutType.clone(
                {1, static_cast<long>(outputTileRemainder), 1, 1});
          }

          for (size_t inTile = 0; inTile < inputTileCount; inTile++) {

            // The same weight slice is loaded on every replica's open core.
            vector<size_t> xbarIndexes(replicationFactor);
            for (size_t i = 0; i < replicationFactor; i++) {
              xbarIndexes[i] = curCores[i]->addXbarWeight(
                  weightTiles[outTile][inTile][krn_x][krn_y]);
            }

            size_t out_x = 0;
            for (size_t in_x = 0; in_x < input_w; in_x += stride_x) {
              size_t out_y = 0;

              // The input width is divided into `replicationFactor` slices;
              // each slice is handled by its own core.
              size_t coreIndex = in_x / replicationSliceSize;
              assert(coreIndex < replicationFactor);

              for (size_t in_y = 0; in_y < input_h; in_y += stride_y) {
                // Adjust the input position for the current kernel tap
                // (kernel centered on the output pixel).
                int actual_in_x = in_x - ((int)krn_w / 2) + krn_x * dilation_x;
                int actual_in_y = in_y - ((int)krn_h / 2) + krn_y * dilation_y;

                // Skip taps that fall outside the (padded) input image.
                if (verifyWithinBoundsAndPaddings(input_w, input_h, actual_in_x,
                        actual_in_y, pad_x, pad_y)
                        .failed()) {
                  out_y++;
                  continue;
                }

                // Linearized output-tile id, unique per (tile, x, y).
                size_t outTileId =
                    outTile * output_w * output_h + out_x * output_h + out_y;
                auto mvm = curCores[coreIndex]->addMVM(
                    inputTiles[inTile][actual_in_x][actual_in_y],
                    xbarIndexes[coreIndex], outTileId, mvmOutType);

                producers[outTile][out_x][out_y].push_back(
                    {mvm, curCores[coreIndex]});

                out_y++;
              }
              out_x++;
            }

            // These crossbars are done; retire any core whose crossbars are
            // fully used and open a fresh one in its slot.
            for (size_t i = 0; i < replicationFactor; i++) {
              if (curCores[i]->isXbarsFull()) {
                cores.emplace_back(std::move(curCores[i]));
                curCores[i] = make_shared<Core>(coreId++, rewriter);
              }
            }
          }
        }
      }
    }

    // Retire the remaining, partially filled cores.
    for (auto &curCore : curCores) {
      if (!curCore->isCoreEmpty()) {
        cores.emplace_back(std::move(curCore));
      }
    }
    curCores.clear();

    // Now, do the reduction of each output pixel tile.
    for (size_t outTile = 0; outTile < outputTileCount; outTile++) {
      for (size_t out_x = 0; out_x < output_w; out_x++) {
        for (size_t out_y = 0; out_y < output_h; out_y++) {
          // Producers on the same core were already reduced within-core by
          // `Core::addMVM`, so only the last producer per core matters.
          std::unordered_map<size_t, Producer_t> withinCoreReducedProducers;
          for (auto producer : producers[outTile][out_x][out_y]) {
            withinCoreReducedProducers[producer.core->coreId] = producer;
          }

          // Base case: a single producing core, nothing to merge.
          if (withinCoreReducedProducers.size() == 1) {
            // TODO: Add the bias and apply mapping (if present)
            auto singleProducer = withinCoreReducedProducers.begin()->second;
            auto reducedValue =
                singleProducer.core->makeResultRemappable(singleProducer.value);
            outputTiles[outTile][out_x][out_y] = reducedValue;
            continue;
          }

          // Inter-core reduction.
          // TODO: This is a linear reduction, not a tree reduction. A tree
          // reduction would expose more parallelism.
          Producer_t lastProducer = withinCoreReducedProducers.begin()->second;

          auto it = withinCoreReducedProducers.begin();
          it++;
          while (it != withinCoreReducedProducers.end()) {
            Producer_t curProducer = it->second;

            shared_ptr<Core> core1;
            shared_ptr<Core> core2;
            Value core1Value;
            Value core2Value;

            auto lastProducerCoreId = lastProducer.core->coreId;
            auto curProducerCoreId = curProducer.core->coreId;

            assert(lastProducerCoreId != curProducerCoreId &&
                   "We should have already applied within-core reduction, how "
                   "could we have same cores here?");

            // Sort the pair by coreId: the lower core exports its result, the
            // higher core receives it and performs the addition.
            if (curProducerCoreId < lastProducerCoreId) {
              core1 = curProducer.core;
              core1Value = curProducer.value;
              core2 = lastProducer.core;
              core2Value = lastProducer.value;
            } else {
              core1 = lastProducer.core;
              core1Value = lastProducer.value;
              core2 = curProducer.core;
              core2Value = curProducer.value;
            }

            auto newCoreRes = core1->makeResultRemappable(core1Value);
            auto secondCoreBlockArg = core2->addRemappableOperand(newCoreRes);

            rewriter.setInsertionPointAfterValue(core2Value);
            Value vaddRes =
                rewriter.create<spatial::SpatVAddOp>(core2Value.getLoc(),
                    core2Value.getType(), core2Value, secondCoreBlockArg);

            lastProducer = {vaddRes, core2};

            it++;
          }

          // TODO: Add the bias and apply mapping (if present)

          // Use the last producer as the final result for this pixel tile.
          auto reducedValue =
              lastProducer.core->makeResultRemappable(lastProducer.value);
          outputTiles[outTile][out_x][out_y] = reducedValue;
        }
      }
    }

    // Materialize each core as a spatial::SpatWeightedCompute, then redirect
    // the shared result Values to the materialized results.
    rewriter.setInsertionPointAfter(conv);
    spatial::SpatWeightedCompute lastWComputeOp;
    for (auto &core : cores) {
      lastWComputeOp = core->createWComputeOp(loc);
      core->remapResults();
      rewriter.setInsertionPointAfter(lastWComputeOp);
    }

    // Only after all results are remapped can inter-core operands be attached.
    for (auto &core : cores) {
      core->addRemappedOperands();
    }

    // Concatenate all output pixel tiles back into the output image.
    rewriter.setInsertionPointAfter(lastWComputeOp);
    SmallVector<Value> tilesToConcat;
    // One entry per (pixel, output tile); the channel extent lives inside each
    // tile value, so it does not multiply the element count.
    tilesToConcat.reserve(output_h * output_w * outputTileCount);
    for (size_t outX = 0; outX < output_h; outX++)
      for (size_t outY = 0; outY < output_w; outY++)
        for (size_t outTile = 0; outTile < outputTileCount; outTile++)
          tilesToConcat.push_back(*outputTiles[outTile][outX][outY]);

    Value outputImage = rewriter.create<spatial::SpatImgConcatOp>(
        loc, conv.getY().getType(), tilesToConcat);

    // The dialect-conversion driver requires the matched root to be replaced
    // (or erased) before returning success; leaving `conv` in place makes the
    // rewrite invalid.
    rewriter.replaceOp(conv, outputImage);

    return success();
  }
};
|
||||
|
||||
/// Populate `patterns` with the convolution tiling conversion
/// (`ONNXConvOpTile`), which lowers ONNXConvOp into Spatial-dialect compute
/// operations.
void populateTilingConvOpPattern(
    RewritePatternSet &patterns, MLIRContext *ctx) {
  patterns.insert<ONNXConvOpTile>(ctx);
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
430
src/PIM/Conversion/ONNXToSpatial/Math/ExperimentalConv.cpp
Normal file
430
src/PIM/Conversion/ONNXToSpatial/Math/ExperimentalConv.cpp
Normal file
@@ -0,0 +1,430 @@
|
||||
#include "Compiler/PimCompilerOptions.hpp"
|
||||
#include "Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
|
||||
#include "Dialect/Spatial/SpatialOps.hpp"
|
||||
#include "mlir/IR/BuiltinTypes.h"
|
||||
#include "mlir/IR/Operation.h"
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
#include "mlir/IR/Types.h"
|
||||
#include "mlir/IR/Value.h"
|
||||
#include "mlir/Support/LLVM.h"
|
||||
#include "mlir/Transforms/DialectConversion.h"
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/WeightSubdivider.hpp"
|
||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <unistd.h>
|
||||
|
||||
using namespace mlir;
|
||||
using namespace std;
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
/**
|
||||
* @brief A pattern to tile the convolution operation into a series of compute
|
||||
* units, each one of which applies filters to a subset of the input
|
||||
* tensor. Results are also reduced and concatenated to form the final
|
||||
* output tensor.
|
||||
*/
|
||||
struct ExperimentalONNXConvOpTile : public OpConversionPattern<ONNXConvOp> {
|
||||
ExperimentalONNXConvOpTile(MLIRContext *ctx) : OpConversionPattern(ctx) {}
|
||||
|
||||
LogicalResult matchAndRewrite(ONNXConvOp conv, ONNXConvOpAdaptor convAdaptor,
|
||||
ConversionPatternRewriter &rewriter) const final {
|
||||
|
||||
// --------------------------------- //
|
||||
// --- READ OPERATION PARAMETERS --- //
|
||||
// --------------------------------- //
|
||||
|
||||
// To get each crossbar's weights, we need to slice the weights tensor.
|
||||
// - Along the input tiles.
|
||||
// - Along the output tiles.
|
||||
// - Along the filter x position.
|
||||
// - Along the filter y position.
|
||||
ShapedType inputType = cast<ShapedType>(convAdaptor.getX().getType());
|
||||
ShapedType outputType = cast<ShapedType>(conv.getY().getType());
|
||||
ShapedType weightsType = cast<ShapedType>(convAdaptor.getW().getType());
|
||||
|
||||
// TODO: Address bigger batches.
|
||||
assert(GET_IMAGE_N(inputType) == 1 && "Batch size must be 1"
|
||||
"for convolution.");
|
||||
|
||||
// TODO: Address replication.
|
||||
assert(coresCount.getValue() == -1 &&
|
||||
"Replication is not yet supported for convolution.");
|
||||
|
||||
// TODO: Address bias addition.
|
||||
|
||||
ldiv_t inputTileCount = div(GET_IMAGE_CHANNEL(inputType), crossbarSize);
|
||||
ldiv_t outputTileCount = div(GET_IMAGE_CHANNEL(outputType), crossbarSize);
|
||||
size_t kernelWidth = GET_KERNEL_WIDTH(weightsType);
|
||||
size_t kernelHeight = GET_KERNEL_HEIGHT(weightsType);
|
||||
|
||||
// Assert that the kernel is square.
|
||||
assert(kernelWidth == kernelHeight && "Only square kernels are supported.");
|
||||
|
||||
// -------------------------------- //
|
||||
// --- SLICE THE WEIGHTS TENSOR --- //
|
||||
// -------------------------------- //
|
||||
|
||||
// The core idea of this stage is classifying the weights by input and
|
||||
// output tile. This is because we want the applyFilters operations to be
|
||||
// tile agnostic, to keep the subsequent lowering stages as simple as
|
||||
// possible. This data structure does this weight classification:
|
||||
// - The outer map is indexed by input tile.
|
||||
// - The inner map is indexed by output tile.
|
||||
// - The SmallVector contains the weights for the filter.
|
||||
map<long, map<long, SmallVector<Value>>> weightsGroups;
|
||||
|
||||
// During all slicing operations within this stage, we'll use the same
|
||||
// strides for all dimensions.
|
||||
SmallVector<OpFoldResult> slicingStrides(4, rewriter.getIndexAttr(1));
|
||||
|
||||
ldiv_t itc = inputTileCount;
|
||||
ldiv_t otc = outputTileCount;
|
||||
|
||||
// - Slicing along the input tiles.
|
||||
// - Slicing along the output tiles.
|
||||
for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
|
||||
long crossbarWidth = it == itc.quot ? itc.rem : crossbarSize;
|
||||
for (long ot = 0; ot < otc.quot + (otc.rem > 0); ++ot) {
|
||||
long crossbarHeight = ot == otc.quot ? otc.rem : crossbarSize;
|
||||
|
||||
// The loop above also sets the crossbar's used width and height,
|
||||
// checking if we're at the last crossbar and if it's incomplete.
|
||||
|
||||
long outputTile = ot;
|
||||
long inputTile = it;
|
||||
|
||||
// Create the slicing sizes.
|
||||
SmallVector<OpFoldResult> slicingSizes{
|
||||
/* 0 */ rewriter.getIndexAttr(crossbarHeight),
|
||||
/* 1 */ rewriter.getIndexAttr(crossbarWidth),
|
||||
/* 2 */ rewriter.getIndexAttr(1),
|
||||
/* 3 */ rewriter.getIndexAttr(1)};
|
||||
|
||||
// - Slicing along the filter x position.
|
||||
// - Slicing along the filter y position.
|
||||
for (size_t filterX = 0; filterX < kernelWidth; ++filterX) {
|
||||
for (size_t filterY = 0; filterY < kernelHeight; ++filterY) {
|
||||
|
||||
// Create the slicing offsets.
|
||||
SmallVector<OpFoldResult> slicingOffsets{
|
||||
/* 0 */ rewriter.getIndexAttr(outputTile * crossbarSize),
|
||||
/* 1 */ rewriter.getIndexAttr(inputTile * crossbarSize),
|
||||
/* 2 */ rewriter.getIndexAttr(filterX),
|
||||
/* 3 */ rewriter.getIndexAttr(filterY)};
|
||||
|
||||
// Create the slice extraction operation.
|
||||
auto extractSliceOp = rewriter.create<tensor::ExtractSliceOp>(
|
||||
conv.getLoc(), convAdaptor.getW(), slicingOffsets, slicingSizes,
|
||||
slicingStrides);
|
||||
|
||||
// Add a note to the extractSliceOp, with the filterX and filterY.
|
||||
weightsGroups[inputTile][outputTile].push_back(extractSliceOp);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Tree reduction for compute reduction should be implemented.
|
||||
|
||||
// -------------------------------- //
|
||||
// --- CREATE ALL COMPUTE UNITS --- //
|
||||
// -------------------------------- //
|
||||
|
||||
// Keep track of input slicing operations to avoid duplication across
|
||||
// all compute units (global slices).
|
||||
map<long, Value> globalSlices;
|
||||
|
||||
// Keep track of all partial compute results.
|
||||
map<long, Value> globalPartialResults;
|
||||
|
||||
// Use a weight subdivider to extract groups of weights for each compute
|
||||
// unit. We'll keep extracting groups until no more weights are left.
|
||||
WeightSubdivider weightSubdivider(weightsGroups);
|
||||
while (!weightSubdivider.isEmpty()) {
|
||||
|
||||
// -------------------------------- //
|
||||
// --- BEGIN A NEW COMPUTE UNIT --- //
|
||||
// -------------------------------- //
|
||||
|
||||
// Get the next group of weights for the compute unit.
|
||||
SmallVector<TaggedWeights> weightsGroups =
|
||||
weightSubdivider.popGroups(crossbarCountInCore.getValue());
|
||||
|
||||
SmallVector<Value> computeWeights;
|
||||
SmallVector<Value> computeOperands;
|
||||
|
||||
// ------------------------------ //
|
||||
// --- SLICE THE INPUT TENSOR --- //
|
||||
// ------------------------------ //
|
||||
|
||||
// Note each tile's index in the compute unit arguments.
|
||||
map<long, size_t> inputTileIndices;
|
||||
map<long, size_t> outputTileIndices;
|
||||
map<long, size_t> reductionTileIndices; // Incoming partial results.
|
||||
|
||||
// Iterate over all weights groups for this compute unit.
|
||||
map<long, Value> localSlices; // WRT the current compute unit.
|
||||
for (auto group : weightsGroups) {
|
||||
for (Value weight : group.weights) {
|
||||
computeWeights.push_back(weight);
|
||||
}
|
||||
|
||||
// There might be multiple weight groups for the same input tile, so if
|
||||
// we've already added the input tile, skip it.
|
||||
if (localSlices.find(group.inputTile) != localSlices.end()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// We might have already sliced the input tensor for some other compute
|
||||
// unit, so if we have, reuse the slicing operation without creating a
|
||||
// new one.
|
||||
if (globalSlices.find(group.inputTile) != globalSlices.end()) {
|
||||
computeOperands.push_back(globalSlices[group.inputTile]);
|
||||
localSlices[group.inputTile] = globalSlices[group.inputTile];
|
||||
continue;
|
||||
}
|
||||
|
||||
// Create the input tensor slicing offsets.
|
||||
SmallVector<OpFoldResult> slicingOffsets{
|
||||
/* 0 */ rewriter.getIndexAttr(0), // No offset along the batch axis.
|
||||
/* 1 */ rewriter.getIndexAttr(group.inputTile * crossbarSize),
|
||||
/* 2 */ rewriter.getIndexAttr(0),
|
||||
/* 3 */ rewriter.getIndexAttr(0)};
|
||||
|
||||
// Create the input tensor slicing sizes.
|
||||
size_t tilingSize = group.inputTile == inputTileCount.quot
|
||||
? inputTileCount.rem
|
||||
: crossbarSize;
|
||||
SmallVector<OpFoldResult> slicingSizes{
|
||||
/* 0 */ rewriter.getIndexAttr(1), // Batch size is always 1.
|
||||
/* 1 */ rewriter.getIndexAttr(tilingSize),
|
||||
/* 2 */ rewriter.getIndexAttr(GET_IMAGE_WIDTH(inputType)),
|
||||
/* 3 */ rewriter.getIndexAttr(GET_IMAGE_HEIGHT(inputType))};
|
||||
|
||||
// Create the slice extraction operation.
|
||||
auto extractSliceOp = rewriter.create<tensor::ExtractSliceOp>(
|
||||
conv.getLoc(), convAdaptor.getX(), slicingOffsets, slicingSizes,
|
||||
slicingStrides);
|
||||
|
||||
computeOperands.push_back(extractSliceOp);
|
||||
|
||||
// Update slicing maps.
|
||||
globalSlices[group.inputTile] = extractSliceOp;
|
||||
localSlices[group.inputTile] = extractSliceOp;
|
||||
|
||||
// Update the input tile index.
|
||||
inputTileIndices[group.inputTile] = computeOperands.size() - 1;
|
||||
}
|
||||
|
||||
// ------------------------------- //
|
||||
// --- PREPARE THE OUTPUT TYPE --- //
|
||||
// ------------------------------- //
|
||||
|
||||
// Fill the compute output's type by looking at the output tiles.
|
||||
SmallVector<Type> computeOutputType;
|
||||
for (TaggedWeights group : weightsGroups) {
|
||||
|
||||
// There might be multiple weight groups for the same output tile, so if
|
||||
// we've already added the output tile, skip it.
|
||||
if (outputTileIndices.find(group.outputTile) !=
|
||||
outputTileIndices.end()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Additionally, after adding the input slices as operands, also add any
|
||||
// compatible partial results from previous compute units.
|
||||
if (globalPartialResults.find(group.outputTile) !=
|
||||
globalPartialResults.end()) {
|
||||
computeOperands.push_back(globalPartialResults[group.outputTile]);
|
||||
reductionTileIndices[group.outputTile] = computeOperands.size() - 1;
|
||||
}
|
||||
|
||||
// Define the output shape for this group.
|
||||
long outputTileSize = group.outputTile == outputTileCount.quot
|
||||
? outputTileCount.rem
|
||||
: crossbarSize;
|
||||
|
||||
// TODO: Address non-same padding.
|
||||
SmallVector<int64_t> outputShapeArray{
|
||||
/* 0 */ 1, // Batch size is always 1.
|
||||
/* 1 */ outputTileSize,
|
||||
/* 2 */ GET_IMAGE_WIDTH(outputType), // Same padding assumed.
|
||||
/* 3 */ GET_IMAGE_HEIGHT(outputType)};
|
||||
|
||||
auto elementType =
|
||||
dyn_cast<RankedTensorType>(conv.getY().getType()).getElementType();
|
||||
|
||||
computeOutputType.push_back(
|
||||
RankedTensorType::get(outputShapeArray, elementType));
|
||||
|
||||
outputTileIndices[group.outputTile] = computeOutputType.size() - 1;
|
||||
}
|
||||
|
||||
// ----------------------------- //
|
||||
// --- FILL THE COMPUTE UNIT --- //
|
||||
// ----------------------------- //
|
||||
|
||||
// Create the compute unit.
|
||||
spatial::SpatWeightedCompute currentCompute =
|
||||
rewriter.create<spatial::SpatWeightedCompute>(conv.getLoc(),
|
||||
computeOutputType, computeWeights, computeOperands);
|
||||
|
||||
// Create a new block for the compute unit and add the operands.
|
||||
Block *block = rewriter.createBlock(¤tCompute.getRegion());
|
||||
rewriter.setInsertionPointToStart(block);
|
||||
for (Value operand : computeOperands) {
|
||||
block->addArgument(operand.getType(), conv->getLoc());
|
||||
}
|
||||
|
||||
// Initialize a map of local partial results.
|
||||
map<long, Value> localPartialResults; // WRT the current compute unit.
|
||||
|
||||
// If we have any reduction tiles, add them to the local partial results.
|
||||
for (auto reductionTileIndex : reductionTileIndices) {
|
||||
localPartialResults[reductionTileIndex.first] =
|
||||
block->getArgument(reductionTileIndex.second);
|
||||
}
|
||||
|
||||
// Add all the applyFilters operations to the block.
|
||||
for (TaggedWeights group : weightsGroups) {
|
||||
|
||||
// Get the outputType for this group.
|
||||
Type outputType =
|
||||
computeOutputType[outputTileIndices[group.outputTile]];
|
||||
|
||||
// Create an apply filters operation.
|
||||
BlockArgument blockArgument =
|
||||
block->getArgument(inputTileIndices[group.inputTile]);
|
||||
|
||||
// The list of weight indices is group.startingCrossbarIndex + 0, 1, 2,
|
||||
// ... As many weights as the size of group.weights.
|
||||
SmallVector<long> weightIndices;
|
||||
for (size_t i = 0; i < group.weights.size(); ++i) {
|
||||
weightIndices.push_back(group.startingCrossbarIndex + i);
|
||||
}
|
||||
|
||||
SmallVector<int64_t> xKerPos;
|
||||
SmallVector<int64_t> yKerPos;
|
||||
for (auto weight : group.weights) {
|
||||
// Assert that the weight is an extract_slice operation.
|
||||
auto extractSliceOp = weight.getDefiningOp<tensor::ExtractSliceOp>();
|
||||
assert(extractSliceOp && "Weight is not an extract_slice operation.");
|
||||
|
||||
// Get the filter x and y positions from the extract_slice operation.
|
||||
auto offsets = extractSliceOp.getStaticOffsets();
|
||||
xKerPos.push_back(offsets[2]);
|
||||
yKerPos.push_back(offsets[3]);
|
||||
}
|
||||
|
||||
ArrayAttr weightIndicesAttr = rewriter.getI64ArrayAttr(weightIndices);
|
||||
ArrayAttr xKerPosAttr = rewriter.getI64ArrayAttr(xKerPos);
|
||||
ArrayAttr yKerPosAttr = rewriter.getI64ArrayAttr(yKerPos);
|
||||
|
||||
Value result =
|
||||
rewriter.create<spatial::SpatApplyFiltersOp>(conv.getLoc(), outputType,
|
||||
weightIndicesAttr, xKerPosAttr, yKerPosAttr, blockArgument);
|
||||
|
||||
// Perform local reduction if necessary.
|
||||
if (localPartialResults.find(group.outputTile) !=
|
||||
localPartialResults.end()) {
|
||||
|
||||
result = rewriter.create<spatial::SpatVAddOp>(conv.getLoc(),
|
||||
result.getType(), localPartialResults[group.outputTile], result);
|
||||
}
|
||||
|
||||
// Update the partial results map.
|
||||
localPartialResults[group.outputTile] = result;
|
||||
}
|
||||
|
||||
// Add a yield operation to the block by concatenating the partial
|
||||
// results.
|
||||
SmallVector<Value> applyFiltersResults;
|
||||
for (size_t i = 0; i < computeOutputType.size(); ++i) {
|
||||
long outputTile;
|
||||
|
||||
// Given an output tile index, find the corresponding output tile.
|
||||
for (auto outputTileIndex : outputTileIndices) {
|
||||
if (outputTileIndex.second == i) {
|
||||
outputTile = outputTileIndex.first;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Get that tile's partial result and add it to the list.
|
||||
applyFiltersResults.push_back(localPartialResults[outputTile]);
|
||||
}
|
||||
|
||||
// Create the yield operation with the given results.
|
||||
rewriter.create<spatial::SpatYieldOp>(conv.getLoc(), applyFiltersResults);
|
||||
|
||||
// Update the global partial results map.
|
||||
for (size_t i = 0; i < applyFiltersResults.size(); ++i) {
|
||||
long outputTile;
|
||||
|
||||
// Given an output tile index, find the corresponding output tile.
|
||||
for (auto outputTileIndex : outputTileIndices) {
|
||||
if (outputTileIndex.second == i) {
|
||||
outputTile = outputTileIndex.first;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
globalPartialResults[outputTile] = currentCompute.getResult(i);
|
||||
}
|
||||
|
||||
// Move the rewrite cursor out of the block.
|
||||
rewriter.setInsertionPointAfter(currentCompute);
|
||||
}
|
||||
|
||||
// ------------------------------ //
|
||||
// --- CONCATENATE THE OUTPUT --- //
|
||||
// ------------------------------ //
|
||||
|
||||
// Turn the values into a SmallVector.
|
||||
SmallVector<Value> outputValues;
|
||||
for (long i = 0; i < outputTileCount.quot + (outputTileCount.rem > 0);
|
||||
++i) {
|
||||
outputValues.push_back(globalPartialResults[i]);
|
||||
}
|
||||
|
||||
// Assert that the number of output values is correct.
|
||||
assert(outputValues.size() > 0 &&
|
||||
"No output values were generated for the convolution.");
|
||||
|
||||
// If the conv's user is a ReLU...
|
||||
if (conv->hasOneUse()) {
|
||||
Operation *user = *conv->getUsers().begin();
|
||||
if (auto relu = dyn_cast<ONNXReluOp>(user)) {
|
||||
// ...then we can just replace the ReLU with the concatenation.
|
||||
rewriter.replaceOp(relu,
|
||||
rewriter.create<tensor::ConcatOp>(conv.getLoc(), 1, outputValues));
|
||||
|
||||
// And erase the convolution.
|
||||
rewriter.eraseOp(conv);
|
||||
return success();
|
||||
}
|
||||
}
|
||||
|
||||
// Return the final output.
|
||||
rewriter.replaceOp(conv,
|
||||
rewriter.create<tensor::ConcatOp>(conv.getLoc(), 1, outputValues));
|
||||
|
||||
return success();
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Populate the tiling pattern for a convolution operation.
|
||||
*
|
||||
* @param patterns The pattern set to populate.
|
||||
* @param ctx The MLIR context.
|
||||
*/
|
||||
void populateExperimentalTilingConvOpPattern(
|
||||
RewritePatternSet &patterns, MLIRContext *ctx) {
|
||||
patterns.insert<ExperimentalONNXConvOpTile>(ctx);
|
||||
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
400
src/PIM/Conversion/ONNXToSpatial/Math/ExperimentalGemm.cpp
Normal file
400
src/PIM/Conversion/ONNXToSpatial/Math/ExperimentalGemm.cpp
Normal file
@@ -0,0 +1,400 @@
|
||||
#include "Compiler/PimCompilerOptions.hpp"
|
||||
#include "Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
|
||||
#include "Conversion/ONNXToSpatial/ONNXToSpatialPatterns.hpp"
|
||||
#include "Conversion/ONNXToSpatial/Utils/WeightSubdivider.hpp"
|
||||
#include "mlir/IR/BuiltinAttributes.h"
|
||||
#include "mlir/Transforms/DialectConversion.h"
|
||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||
#include <cstdlib>
|
||||
|
||||
using namespace mlir;
|
||||
using namespace std;
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
/**
 * @brief Experimental conversion of ONNXGemmOp to PIM spatial compute units.
 *
 * Lowers a GEMM by treating it as a 1x1-kernel convolution (kernelWidth ==
 * kernelHeight == 1 below): the B matrix is tiled into crossbar-sized weight
 * slices, the A vector is tiled into crossbar-sized input slices, and each
 * group of weights is mapped onto a spatial::SpatWeightedCompute unit whose
 * body applies the filters and accumulates partial results across units.
 *
 * NOTE(review): this mirrors ExperimentalONNXConvOpTile in Conv.cpp almost
 * line-for-line, with the image/filter dimensions elided — consider sharing
 * the common tiling machinery. TODO confirm before refactoring.
 */
struct ExperimentalGemmConversionPattern
    : public OpConversionPattern<ONNXGemmOp> {
  ExperimentalGemmConversionPattern(MLIRContext *ctx)
      : OpConversionPattern(ctx) {}

  /// Rewrite one ONNXGemmOp into tiled spatial compute units followed by a
  /// tensor.concat that reassembles the full output row vector.
  LogicalResult matchAndRewrite(ONNXGemmOp gemmOp, ONNXGemmOpAdaptor adaptor,
      ConversionPatternRewriter &rewriter) const final {

    // --------------------------------- //
    // --- READ OPERATION PARAMETERS --- //
    // --------------------------------- //

    // To get each crossbar's weights, we need to slice the weights tensor.
    // - Along the input tiles.
    // - Along the output tiles.
    // - Along the filter x position.
    // - Along the filter y position.
    // (For GEMM the kernel is 1x1, so the filter loops collapse to one pass.)
    ShapedType inputType = cast<ShapedType>(adaptor.getA().getType());
    ShapedType outputType = cast<ShapedType>(gemmOp.getY().getType());
    ShapedType matrixType = cast<ShapedType>(adaptor.getB().getType());

    // TODO: Address bigger batches.
    assert(inputType.getShape()[0] == 1 &&
           "Only batch size of 1 is supported for GEMM.");

    // TODO: Address replication.
    assert(coresCount.getValue() == -1 &&
           "Replication is not yet supported for GEMM.");

    // TODO: Address bias addition.
    // NOTE(review): adaptor.getC() (the GEMM bias) is never read here, so a
    // non-empty C is silently dropped — confirm callers guarantee no bias.

    assert(inputType.getShape()[1] == matrixType.getShape()[0] &&
           "Input tile size must match the matrix's row size.");

    // quot = number of full crossbar-sized tiles, rem = size of the trailing
    // partial tile (0 when the dimension divides evenly).
    ldiv_t inputTileCount = div(inputType.getShape()[1], crossbarSize);
    ldiv_t outputTileCount = div(outputType.getShape()[1], crossbarSize);
    size_t kernelWidth = 1;
    size_t kernelHeight = 1;

    // Assert that the kernel is square.
    assert(kernelWidth == kernelHeight && "Only square kernels are supported.");

    // -------------------------------- //
    // --- SLICE THE WEIGHTS TENSOR --- //
    // -------------------------------- //

    // The core idea of this stage is classifying the weights by input and
    // output tile. This is because we want the applyFilters operations to be
    // tile agnostic, to keep the subsequent lowering stages as simple as
    // possible. This data structure does this weight classification:
    // - The outer map is indexed by input tile.
    // - The inner map is indexed by output tile.
    // - The SmallVector contains the weights for the filter.
    map<long, map<long, SmallVector<Value>>> weightsGroups;

    // During all slicing operations within this stage, we'll use the same
    // strides for all dimensions. (Rank 2 here — the B matrix is 2-D.)
    SmallVector<OpFoldResult> slicingStrides(2, rewriter.getIndexAttr(1));

    ldiv_t itc = inputTileCount;
    ldiv_t otc = outputTileCount;

    // - Slicing along the input tiles.
    // - Slicing along the output tiles.
    // The "+ (rem > 0)" term adds one extra iteration for a partial tile.
    for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
      long crossbarWidth = it == itc.quot ? itc.rem : crossbarSize;
      for (long ot = 0; ot < otc.quot + (otc.rem > 0); ++ot) {
        long crossbarHeight = ot == otc.quot ? otc.rem : crossbarSize;

        // The loop above also sets the crossbar's used width and height,
        // checking if we're at the last crossbar and if it's incomplete.

        long outputTile = ot;
        long inputTile = it;

        // Create the slicing sizes.
        SmallVector<OpFoldResult> slicingSizes{
            /* 0 */ rewriter.getIndexAttr(crossbarHeight),
            /* 1 */ rewriter.getIndexAttr(crossbarWidth),
            /* 2 */ /* rewriter.getIndexAttr(1), */
            /* 3 */ /* rewriter.getIndexAttr(1) */};

        // - Slicing along the filter x position.
        // - Slicing along the filter y position.
        // (Both bounds are 1 for GEMM, so this body runs exactly once.)
        for (size_t filterX = 0; filterX < kernelWidth; ++filterX) {
          for (size_t filterY = 0; filterY < kernelHeight; ++filterY) {

            // Create the slicing offsets.
            SmallVector<OpFoldResult> slicingOffsets{
                /* 0 */ rewriter.getIndexAttr(outputTile * crossbarSize),
                /* 1 */ rewriter.getIndexAttr(inputTile * crossbarSize),
                /* 2 */ /* rewriter.getIndexAttr(filterX), */
                /* 3 */ /* rewriter.getIndexAttr(filterY) */};

            // Create the slice extraction operation.
            auto extractSliceOp = rewriter.create<tensor::ExtractSliceOp>(
                gemmOp.getLoc(), adaptor.getB(), slicingOffsets, slicingSizes,
                slicingStrides);

            // Add a note to the extractSliceOp, with the filterX and filterY.
            weightsGroups[inputTile][outputTile].push_back(extractSliceOp);
          }
        }
      }
    }

    // TODO: Tree reduction for compute reduction should be implemented.

    // -------------------------------- //
    // --- CREATE ALL COMPUTE UNITS --- //
    // -------------------------------- //

    // Keep track of input slicing operations to avoid duplication across
    // all compute units (global slices).
    map<long, Value> globalSlices;

    // Keep track of all partial compute results.
    map<long, Value> globalPartialResults;

    // Use a weight subdivider to extract groups of weights for each compute
    // unit. We'll keep extracting groups until no more weights are left.
    WeightSubdivider weightSubdivider(weightsGroups);
    while (!weightSubdivider.isEmpty()) {

      // -------------------------------- //
      // --- BEGIN A NEW COMPUTE UNIT --- //
      // -------------------------------- //

      // Get the next group of weights for the compute unit.
      // NOTE(review): this inner `weightsGroups` shadows the outer map of the
      // same name above — intentional but easy to misread.
      SmallVector<TaggedWeights> weightsGroups =
          weightSubdivider.popGroups(crossbarCountInCore.getValue());

      SmallVector<Value> computeWeights;
      SmallVector<Value> computeOperands;

      // ------------------------------ //
      // --- SLICE THE INPUT TENSOR --- //
      // ------------------------------ //

      // Note each tile's index in the compute unit arguments.
      map<long, size_t> inputTileIndices;
      map<long, size_t> outputTileIndices;
      map<long, size_t> reductionTileIndices; // Incoming partial results.

      // Iterate over all weights groups for this compute unit.
      map<long, Value> localSlices; // WRT the current compute unit.
      for (auto group : weightsGroups) {
        for (Value weight : group.weights) {
          computeWeights.push_back(weight);
        }

        // There might be multiple weight groups for the same input tile, so if
        // we've already added the input tile, skip it.
        if (localSlices.find(group.inputTile) != localSlices.end()) {
          continue;
        }

        // We might have already sliced the input tensor for some other compute
        // unit, so if we have, reuse the slicing operation without creating a
        // new one.
        if (globalSlices.find(group.inputTile) != globalSlices.end()) {
          computeOperands.push_back(globalSlices[group.inputTile]);
          localSlices[group.inputTile] = globalSlices[group.inputTile];
          continue;
        }

        // Create the input tensor slicing offsets.
        SmallVector<OpFoldResult> slicingOffsets{
            /* 0 */ rewriter.getIndexAttr(0), // No offset along the batch axis.
            /* 1 */ rewriter.getIndexAttr(group.inputTile * crossbarSize),
            /* 2 */ /* rewriter.getIndexAttr(0), */
            /* 3 */ /* rewriter.getIndexAttr(0) */};

        // Create the input tensor slicing sizes.
        // The last (partial) tile only covers the remainder columns.
        size_t tilingSize = group.inputTile == inputTileCount.quot
                                ? inputTileCount.rem
                                : crossbarSize;
        SmallVector<OpFoldResult> slicingSizes{
            /* 0 */ rewriter.getIndexAttr(1), // Batch size is always 1.
            /* 1 */ rewriter.getIndexAttr(tilingSize),
            /* 2 */ /* rewriter.getIndexAttr(GET_IMAGE_WIDTH(inputType)), */
            /* 3 */ /* rewriter.getIndexAttr(GET_IMAGE_HEIGHT(inputType)) */};

        // Create the slice extraction operation.
        auto extractSliceOp =
            rewriter.create<tensor::ExtractSliceOp>(gemmOp.getLoc(),
                adaptor.getA(), slicingOffsets, slicingSizes, slicingStrides);

        computeOperands.push_back(extractSliceOp);

        // Update slicing maps.
        globalSlices[group.inputTile] = extractSliceOp;
        localSlices[group.inputTile] = extractSliceOp;

        // Update the input tile index.
        inputTileIndices[group.inputTile] = computeOperands.size() - 1;
      }

      // ------------------------------- //
      // --- PREPARE THE OUTPUT TYPE --- //
      // ------------------------------- //

      // Fill the compute output's type by looking at the output tiles.
      SmallVector<Type> computeOutputType;
      for (TaggedWeights group : weightsGroups) {

        // There might be multiple weight groups for the same output tile, so if
        // we've already added the output tile, skip it.
        if (outputTileIndices.find(group.outputTile) !=
            outputTileIndices.end()) {
          continue;
        }

        // Additionally, after adding the input slices as operands, also add any
        // compatible partial results from previous compute units.
        if (globalPartialResults.find(group.outputTile) !=
            globalPartialResults.end()) {
          computeOperands.push_back(globalPartialResults[group.outputTile]);
          reductionTileIndices[group.outputTile] = computeOperands.size() - 1;
        }

        // Define the output shape for this group.
        long outputTileSize = group.outputTile == outputTileCount.quot
                                  ? outputTileCount.rem
                                  : crossbarSize;

        // TODO: Address non-same padding.
        SmallVector<int64_t> outputShapeArray{
            /* 0 */ 1, // Batch size is always 1.
            /* 1 */ outputTileSize,
            /* 2 */ /* GET_IMAGE_WIDTH(outputType), */ // Same padding assumed.
            /* 3 */ /* GET_IMAGE_HEIGHT(outputType) */};

        // NOTE(review): dyn_cast result is dereferenced without a null check;
        // cast<> would make the "must be ranked" assumption explicit.
        auto elementType = dyn_cast<RankedTensorType>(gemmOp.getY().getType())
                               .getElementType();

        computeOutputType.push_back(
            RankedTensorType::get(outputShapeArray, elementType));

        outputTileIndices[group.outputTile] = computeOutputType.size() - 1;
      }

      // ----------------------------- //
      // --- FILL THE COMPUTE UNIT --- //
      // ----------------------------- //

      // Create the compute unit.
      spatial::SpatWeightedCompute currentCompute =
          rewriter.create<spatial::SpatWeightedCompute>(gemmOp.getLoc(),
              computeOutputType, computeWeights, computeOperands);

      // Create a new block for the compute unit and add the operands.
      Block *block = rewriter.createBlock(&currentCompute.getRegion());
      rewriter.setInsertionPointToStart(block);
      for (Value operand : computeOperands) {
        block->addArgument(operand.getType(), gemmOp->getLoc());
      }

      // Initialize a map of local partial results.
      map<long, Value> localPartialResults; // WRT the current compute unit.

      // If we have any reduction tiles, add them to the local partial results.
      for (auto reductionTileIndex : reductionTileIndices) {
        localPartialResults[reductionTileIndex.first] =
            block->getArgument(reductionTileIndex.second);
      }

      // Add all the applyFilters operations to the block.
      for (TaggedWeights group : weightsGroups) {

        // Get the outputType for this group.
        Type outputType =
            computeOutputType[outputTileIndices[group.outputTile]];

        // Create an apply filters operation.
        BlockArgument blockArgument =
            block->getArgument(inputTileIndices[group.inputTile]);

        // The list of weight indices is group.startingCrossbarIndex + 0, 1, 2,
        // ... As many weights as the size of group.weights.
        SmallVector<long> weightIndices;
        for (size_t i = 0; i < group.weights.size(); ++i) {
          weightIndices.push_back(group.startingCrossbarIndex + i);
        }

        SmallVector<int64_t> xKerPos;
        SmallVector<int64_t> yKerPos;
        for (auto weight : group.weights) {
          // Assert that the weight is an extract_slice operation.
          auto extractSliceOp = weight.getDefiningOp<tensor::ExtractSliceOp>();
          assert(extractSliceOp && "Weight is not an extract_slice operation.");

          // Get the filter x and y positions from the extract_slice operation.
          // (Always 0/0 for GEMM: the kernel is 1x1, unlike the Conv variant
          // which reads the offsets back from the slice.)
          xKerPos.push_back(0);
          yKerPos.push_back(0);
        }

        ArrayAttr weightIndicesAttr = rewriter.getI64ArrayAttr(weightIndices);
        ArrayAttr xKerPosAttr = rewriter.getI64ArrayAttr(xKerPos);
        ArrayAttr yKerPosAttr = rewriter.getI64ArrayAttr(yKerPos);

        Value result = rewriter.create<spatial::SpatApplyFiltersOp>(gemmOp.getLoc(),
            outputType, weightIndicesAttr, xKerPosAttr, yKerPosAttr,
            blockArgument);

        // Perform local reduction if necessary.
        if (localPartialResults.find(group.outputTile) !=
            localPartialResults.end()) {

          result = rewriter.create<spatial::SpatVAddOp>(gemmOp.getLoc(),
              result.getType(), localPartialResults[group.outputTile], result);
        }

        // Update the partial results map.
        localPartialResults[group.outputTile] = result;
      }

      // Add a yield operation to the block by concatenating the partial
      // results.
      SmallVector<Value> applyFiltersResults;
      for (size_t i = 0; i < computeOutputType.size(); ++i) {
        // NOTE(review): outputTile stays uninitialized if no map entry has
        // index i; the maps are built so a match always exists, but an
        // explicit init (e.g. -1) would remove the UB risk. TODO confirm.
        long outputTile;

        // Given an output tile index, find the corresponding output tile.
        for (auto outputTileIndex : outputTileIndices) {
          if (outputTileIndex.second == i) {
            outputTile = outputTileIndex.first;
            break;
          }
        }

        // Get that tile's partial result and add it to the list.
        applyFiltersResults.push_back(localPartialResults[outputTile]);
      }

      // Create the yield operation with the given results.
      rewriter.create<spatial::SpatYieldOp>(gemmOp.getLoc(), applyFiltersResults);

      // Update the global partial results map.
      for (size_t i = 0; i < applyFiltersResults.size(); ++i) {
        // NOTE(review): same uninitialized-if-unmatched pattern as above.
        long outputTile;

        // Given an output tile index, find the corresponding output tile.
        for (auto outputTileIndex : outputTileIndices) {
          if (outputTileIndex.second == i) {
            outputTile = outputTileIndex.first;
            break;
          }
        }

        globalPartialResults[outputTile] = currentCompute.getResult(i);
      }

      // Move the rewrite cursor out of the block.
      rewriter.setInsertionPointAfter(currentCompute);
    }

    // ------------------------------ //
    // --- CONCATENATE THE OUTPUT --- //
    // ------------------------------ //

    // Turn the values into a SmallVector.
    SmallVector<Value> outputValues;
    for (long i = 0; i < outputTileCount.quot + (outputTileCount.rem > 0);
         ++i) {
      outputValues.push_back(globalPartialResults[i]);
    }

    // Assert that the number of output values is correct.
    assert(outputValues.size() > 0 &&
           "No output values were generated for the GEMM operation.");

    // Return the final output.
    // Concatenate the per-tile results along axis 1 to rebuild the full Y.
    rewriter.replaceOp(gemmOp,
        rewriter.create<tensor::ConcatOp>(gemmOp.getLoc(), 1, outputValues));

    return success();
  }
};
|
||||
|
||||
/**
 * @brief Populate the experimental GEMM conversion pattern.
 *
 * Registers ExperimentalGemmConversionPattern, which lowers ONNXGemmOp to
 * tiled spatial compute units (treating the GEMM as a 1x1-kernel conv).
 * NOTE(review): the name says "GemmToConv", but the registered pattern
 * targets spatial ops directly — consider renaming for clarity.
 *
 * @param patterns The pattern set to populate.
 * @param ctx The MLIR context.
 */
void populateGemmToConvConversionPattern(
    RewritePatternSet &patterns, MLIRContext *ctx) {
  patterns.insert<ExperimentalGemmConversionPattern>(ctx);
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
317
src/PIM/Conversion/ONNXToSpatial/Math/Gemm.cpp
Normal file
317
src/PIM/Conversion/ONNXToSpatial/Math/Gemm.cpp
Normal file
@@ -0,0 +1,317 @@
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/Dialect/Tosa/IR/TosaOps.h"
|
||||
#include "mlir/IR/BuiltinTypes.h"
|
||||
#include "mlir/IR/Location.h"
|
||||
#include "mlir/Support/LLVM.h"
|
||||
#include "mlir/Support/LogicalResult.h"
|
||||
#include "mlir/Transforms/DialectConversion.h"
|
||||
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
|
||||
#include <cassert>
|
||||
|
||||
#include "src/Accelerators/PIM/Common/PIMCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp"
|
||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
const StringRef COMPUTE_HAS_SOFTMAX_DIVISOR_ATTRNAME = "computeWithSoftmaxDivisor";
|
||||
|
||||
struct ONNXGemmOpTile : public OpConversionPattern<ONNXGemmOp> {
|
||||
ONNXGemmOpTile(MLIRContext* ctx)
|
||||
: OpConversionPattern(ctx) {}
|
||||
|
||||
LogicalResult
|
||||
matchAndRewrite(ONNXGemmOp gemmOp, ONNXGemmOpAdaptor adaptor, ConversionPatternRewriter& rewriter) const final {
|
||||
Location gemmLoc = gemmOp.getLoc();
|
||||
Value a = adaptor.getA();
|
||||
Value b = adaptor.getB();
|
||||
Value c = adaptor.getC();
|
||||
Value out = gemmOp.getY();
|
||||
|
||||
float alpha = adaptor.getAlpha().convertToFloat();
|
||||
float beta = adaptor.getBeta().convertToFloat();
|
||||
bool transA = adaptor.getTransA();
|
||||
bool transB = adaptor.getTransB();
|
||||
|
||||
auto aType = cast<RankedTensorType>(a.getType());
|
||||
auto bType = cast<RankedTensorType>(b.getType());
|
||||
auto outType = cast<RankedTensorType>(out.getType());
|
||||
|
||||
RankedTensorType cType = nullptr;
|
||||
bool hasC = !isa<ONNXNoneOp>(c.getDefiningOp());
|
||||
if (hasC) {
|
||||
cType = cast<RankedTensorType>(c.getType());
|
||||
assert("Only support 2 tensor for C" && cType.getRank() == 2);
|
||||
}
|
||||
|
||||
assert("Only support static shapes" && aType.hasStaticShape() && bType.hasStaticShape()
|
||||
&& (!hasC || cType.hasStaticShape()) && outType.hasStaticShape());
|
||||
|
||||
if (transA) {
|
||||
auto aShape = aType.getShape();
|
||||
auto transposedType = aType.cloneWith(ArrayRef({aShape[1], aShape[0]}), aType.getElementType());
|
||||
a = rewriter.create<ONNXTransposeOp>(gemmLoc, transposedType, a, rewriter.getI64ArrayAttr({1, 0}));
|
||||
}
|
||||
if (transB) {
|
||||
auto bShape = bType.getShape();
|
||||
auto transposedType = bType.cloneWith(ArrayRef({bShape[1], bShape[0]}), bType.getElementType());
|
||||
b = rewriter.create<ONNXTransposeOp>(gemmLoc, transposedType, b, rewriter.getI64ArrayAttr({1, 0}));
|
||||
}
|
||||
|
||||
if (alpha != 1.0f) {
|
||||
auto alphaTensorType = RankedTensorType::get({1, 1}, cast<RankedTensorType>(a.getType()).getElementType());
|
||||
auto alphaTensorValue = DenseFPElementsAttr::get(alphaTensorType, {alpha});
|
||||
auto alphaTensor = rewriter.create<arith::ConstantOp>(gemmLoc, alphaTensorType, alphaTensorValue);
|
||||
a = rewriter.create<spatial::SpatVMulOp>(gemmLoc, a.getType(), a, alphaTensor);
|
||||
}
|
||||
if (hasC && beta != 1.0f) {
|
||||
auto betaTensorType = RankedTensorType::get({1, 1}, cast<RankedTensorType>(c.getType()).getElementType());
|
||||
auto betaTensorValue = DenseFPElementsAttr::get(betaTensorType, {beta});
|
||||
auto betaTensor = rewriter.create<arith::ConstantOp>(gemmLoc, betaTensorType, betaTensorValue);
|
||||
c = rewriter.create<spatial::SpatVMulOp>(gemmLoc, c.getType(), c, betaTensor);
|
||||
}
|
||||
|
||||
auto [aNumHSlices, aLastHSliceSize] = ceilIntegerDivideWithRemainder(aType.getDimSize(1), crossbarSize.getValue());
|
||||
auto [bNumHSlices, bLastHSliceSize] = ceilIntegerDivideWithRemainder(bType.getDimSize(1), crossbarSize.getValue());
|
||||
auto bNumVSlices = aNumHSlices;
|
||||
auto bLastVSliceSize = aLastHSliceSize;
|
||||
auto cNumHSlices = bNumHSlices;
|
||||
auto cLastHSliceSize = bLastHSliceSize;
|
||||
auto outNumHSlices = cNumHSlices;
|
||||
auto outLastHSliceSize = cLastHSliceSize;
|
||||
|
||||
const size_t coresPerVSlice = ceilIntegerDivide(bNumVSlices, crossbarCountInCore.getValue());
|
||||
|
||||
DenseMap<CoreId, SmallVector<Value>> aHSlices = sliceVectorPerCrossbarPerCore(a, rewriter, gemmLoc);
|
||||
|
||||
DenseMap<HSliceId, DenseMap<CoreId, SmallVector<Value>>> bTiles =
|
||||
tileMatrix(b, crossbarSize, crossbarSize, rewriter, gemmLoc);
|
||||
|
||||
SmallVector<Value> cHSlices;
|
||||
if (hasC && cType.getDimSize(0) == 1 && cType.getDimSize(1) == 1)
|
||||
c = broadcastToVector(c, bType.getDimSize(1), rewriter, gemmLoc);
|
||||
if (hasC)
|
||||
cHSlices = sliceVector(c, crossbarSize, rewriter, gemmLoc);
|
||||
|
||||
RankedTensorType outHSliceType =
|
||||
RankedTensorType::get({1, static_cast<long>(crossbarSize)}, outType.getElementType());
|
||||
RankedTensorType outLastHSliceType =
|
||||
RankedTensorType::get({1, static_cast<long>(bLastHSliceSize)}, outType.getElementType());
|
||||
|
||||
SmallVector<Value> outHSlices;
|
||||
outHSlices.reserve(outNumHSlices);
|
||||
for (size_t outSliceId = 0; outSliceId < outNumHSlices; outSliceId++) {
|
||||
RankedTensorType currOutHSliceType = outHSliceType;
|
||||
if (outSliceId == outNumHSlices - 1 && outLastHSliceSize != 0)
|
||||
currOutHSliceType = outLastHSliceType;
|
||||
|
||||
SmallVector<Value> partialResults;
|
||||
partialResults.reserve(coresPerVSlice);
|
||||
for (size_t coreId = 0; coreId < coresPerVSlice; coreId++) {
|
||||
SmallVector<Value> weights;
|
||||
weights.reserve(aHSlices[coreId].size());
|
||||
|
||||
for (size_t aSliceId = 0; aSliceId < aHSlices[coreId].size(); aSliceId++)
|
||||
weights.push_back(bTiles[outSliceId][coreId][aSliceId]);
|
||||
|
||||
auto computeOp =
|
||||
rewriter.create<spatial::SpatWeightedCompute>(gemmLoc, currOutHSliceType, weights, aHSlices[coreId]);
|
||||
|
||||
auto* computeBlock = new Block();
|
||||
for (auto aHSlice : aHSlices[coreId])
|
||||
computeBlock->addArgument(aHSlice.getType(), gemmLoc);
|
||||
computeOp.getBody().push_back(computeBlock);
|
||||
rewriter.setInsertionPointToStart(computeBlock);
|
||||
|
||||
auto computeArgs = computeBlock->getArguments();
|
||||
SmallVector<Value> vmmOutputs;
|
||||
vmmOutputs.reserve(computeArgs.size());
|
||||
for (size_t aHSliceId = 0; aHSliceId < aNumHSlices; aHSliceId++)
|
||||
vmmOutputs.push_back(
|
||||
rewriter.create<spatial::SpatWeightedVMMOp>(gemmLoc, currOutHSliceType, aHSliceId, computeArgs[aHSliceId]));
|
||||
assert(!vmmOutputs.empty() && "vmmOutputs must be non-empty");
|
||||
|
||||
Value partialVmmSum = sumTensors(vmmOutputs, rewriter);
|
||||
rewriter.create<spatial::SpatYieldOp>(gemmLoc, partialVmmSum);
|
||||
rewriter.setInsertionPointAfter(computeOp);
|
||||
|
||||
partialResults.push_back(computeOp.getResult(0));
|
||||
}
|
||||
|
||||
if (hasC) {
|
||||
Value cHSlice = cHSlices[outSliceId];
|
||||
partialResults.push_back(cHSlice);
|
||||
}
|
||||
|
||||
auto reduceComputeOp =
|
||||
rewriter.create<spatial::SpatWeightedCompute>(gemmLoc, currOutHSliceType, SmallVector<Value>(), partialResults);
|
||||
|
||||
auto* reduceBlock = new Block();
|
||||
for (auto partialResult : partialResults)
|
||||
reduceBlock->addArgument(partialResult.getType(), gemmLoc);
|
||||
reduceComputeOp.getBody().push_back(reduceBlock);
|
||||
rewriter.setInsertionPointToStart(reduceBlock);
|
||||
|
||||
auto blockArgs = reduceBlock->getArguments();
|
||||
Value outHSlice = sumTensors({blockArgs.begin(), blockArgs.end()}, rewriter);
|
||||
rewriter.create<spatial::SpatYieldOp>(gemmLoc, outHSlice);
|
||||
rewriter.setInsertionPointAfter(reduceComputeOp);
|
||||
|
||||
outHSlices.push_back(reduceComputeOp.getResult(0));
|
||||
}
|
||||
|
||||
rewriter.setInsertionPoint(gemmOp);
|
||||
auto concatOp = rewriter.create<tensor::ConcatOp>(gemmLoc, /*axis=*/1, outHSlices);
|
||||
rewriter.replaceOp(gemmOp, concatOp);
|
||||
return success();
|
||||
}
|
||||
|
||||
private:
|
||||
/**
|
||||
* Resolves the ONNXExpOp from the use chain of the given start value.
|
||||
*
|
||||
* This function traverses the use chain of the start value until it finds an
|
||||
* ONNXExpOp. It returns the value of the ONNXExpOp.
|
||||
*
|
||||
* @param startValue The starting value of the use chain.
|
||||
* @return The value of the ONNXExpOp found in the use chain.
|
||||
*/
|
||||
static Value resolveONNXExpOpFromUseChain(Value startValue) {
|
||||
Value walker = startValue;
|
||||
|
||||
while (!llvm::isa<ONNXExpOp>(walker.getDefiningOp())) {
|
||||
walker = walker.getDefiningOp()->getOperand(0);
|
||||
|
||||
assert(walker && walker.getDefiningOp()
|
||||
&& "Unwinded the whole chain of operations while trying to "
|
||||
"find ONNXExpOp, but did not find it");
|
||||
}
|
||||
|
||||
// Make sure the dividend is actually produced by an ONNXExpOp
|
||||
assert(llvm::isa<ONNXExpOp>(walker.getDefiningOp())
|
||||
&& "Old output tile (softmax reducer) is not produced by an "
|
||||
"ONNXExpOp");
|
||||
|
||||
return walker;
|
||||
}
|
||||
|
||||
// Softmax is a special case, as it requires another reduction after the
// first one. In the cores, `applyReducePattern` already applied
// f(x) = exp(x) to each tile. This means that now we just need to
// reduce-sum these tiles, and then divide each tile by the reduced sum,
// which is propagated back to the cores via a broadcast channel.
//
// Parameters:
//  - outputOpsAndResNums: in/out. On entry, the (compute op, result index)
//    pairs holding the exp() tiles; on return, rebuilt to point at the
//    newly yielded divided tiles.
//  - softmaxChannel: broadcast channel through which the reduced divisor is
//    propagated to every core.
//  - reducer: helper that builds the cross-core reduction tree.
//  - gemmOp: NOTE(review): currently unreferenced in this function body —
//    confirm it is still needed in the signature.
//  - loc: location attached to every op created here.
LogicalResult softmaxReductionApplication(SmallVector<OpAndResNum>& outputOpsAndResNums,
    Value& softmaxChannel,
    ConversionPatternRewriter& rewriter,
    SpatialReducer& reducer,
    ONNXGemmOp& gemmOp,
    Location& loc) const {

  // TODO: Check case with one compute op

  // Cast vector of Value into vector of ComputeOp
  SmallVector<ComputeAndResNum> softmaxOpsToReduce =
      llvm::to_vector(llvm::map_range(outputOpsAndResNums, [&](OpAndResNum computeAndResNum) {
        return std::make_pair(cast<spatial::SpatWeightedCompute>(computeAndResNum.first), computeAndResNum.second);
      }));

  // All intermediate reduction values are 1-element f32 tensors.
  RankedTensorType::Builder tensorTypeBuilder({1}, Float32Type::get(rewriter.getContext()), nullptr);
  const TensorType scalarTensorType = tensorTypeBuilder;

  // Reduce-sum the per-core exp() tiles into a single softmax divisor:
  // each tile is first collapsed to a scalar (SpatSumOp), then the scalars
  // are combined pairwise (SpatVAddOp) across computes.
  reducer.applyReducePattern(
      softmaxOpsToReduce,
      [&](Value a, Value b) { return rewriter.create<spatial::SpatVAddOp>(loc, scalarTensorType, a, b); },
      /* preprocess = */
      [&](Value a) { return rewriter.create<spatial::SpatSumOp>(loc, scalarTensorType, a); },
      [&](Value softmaxDivisor) {
        // Signal that this is the compute with the softmax divisor
        auto computeOp = cast<spatial::SpatWeightedCompute>(softmaxDivisor.getDefiningOp()->getParentOp());
        computeOp->setAttr(COMPUTE_HAS_SOFTMAX_DIVISOR_ATTRNAME, rewriter.getUnitAttr());

        // Broadcast the divisor to all the cores
        rewriter.setInsertionPointAfterValue(softmaxDivisor);
        rewriter.create<spatial::SpatChannelBroadcastSendOp>(loc, softmaxChannel, softmaxDivisor);

        /*
         * softmaxDividend = onnx.exp (...)
         * sum = spat.SumOp(softmaxDividend)
         * [following can be repeated N times, thus walk the use chain]
         * softmaxDivisor = spat.sadd(sum, ...)
         */
        Value softmaxDividend = resolveONNXExpOpFromUseChain(softmaxDivisor.getDefiningOp()->getOperand(0));

        // Make sure the dividend is actually produced by an ONNXExpOp
        // (belt-and-braces: resolveONNXExpOpFromUseChain only returns such
        // values).
        assert(llvm::isa<ONNXExpOp>(softmaxDividend.getDefiningOp())
               && "Dividend of softmax reduction is not an ONNXExpOp");

        // Do not divide here, divide after this
        return softmaxDivisor;
      });

  // In all the cores, insert a ChannelRecvOp and divide the output tile by
  // the reduced denominator.
  outputOpsAndResNums.clear();
  outputOpsAndResNums.reserve(softmaxOpsToReduce.size());
  for (auto& computeToDivideOpAndResNum : softmaxOpsToReduce) {

    auto yieldOp = cast<spatial::SpatYieldOp>(computeToDivideOpAndResNum.first.getBody().front().getTerminator());

    Value divisor;

    // Check if this compute contains the softmax divisor: if so, find the
    // ChannelBroadcastSendOp, otherwise receive the value from the channel
    // using ChannelBroadcastReceiveOp
    if (computeToDivideOpAndResNum.first->hasAttr(COMPUTE_HAS_SOFTMAX_DIVISOR_ATTRNAME)) {

      bool found = false;
      for (auto broadcastOp :
          computeToDivideOpAndResNum.first.getBody().front().getOps<spatial::SpatChannelBroadcastSendOp>()) {
        assert(found == false
               && "More than one ChannelBroadcastSendOp in "
                  "compute? How is this possible?");
        found = true;

        divisor = broadcastOp.getData();
      }

      assert(found
             && "No ChannelBroadcastSendOp in compute where softmax "
                "divisor was specified to be?");
    }
    else {
      rewriter.setInsertionPoint(yieldOp);
      divisor = rewriter.create<spatial::SpatChannelBroadcastReceiveOp>(loc, scalarTensorType, softmaxChannel);
    }

    // Walk the chain of operations until we find the ONNXExpOp: this is
    // needed because some computes may have a different amount of `VAddOp`s
    // due to the tree reduction (e.g. some may have no VAddOp, some may
    // have multiples)
    Value oldOutputTile = resolveONNXExpOpFromUseChain(yieldOp->getOperand(computeToDivideOpAndResNum.second));

    // Divide the exp() tile by the broadcast divisor and yield it as a new
    // (appended) result of the compute; callers are repointed to it below.
    rewriter.setInsertionPoint(yieldOp);
    Value newOutputTile = rewriter.create<spatial::SpatVSDivOp>(loc, oldOutputTile.getType(), oldOutputTile, divisor);
    auto yieldOperandNum = yieldOp->getNumOperands();
    yieldOp->insertOperands(yieldOperandNum, newOutputTile);

    outputOpsAndResNums.push_back({computeToDivideOpAndResNum.first, yieldOperandNum});
  }

  return success();
}
|
||||
};
|
||||
|
||||
/// Registers the Gemm tiling conversion pattern into the given pattern set.
void populateTilingGemmOpPattern(RewritePatternSet& patterns, MLIRContext* ctx) {
  patterns.add<ONNXGemmOpTile>(ctx);
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
327
src/PIM/Conversion/ONNXToSpatial/NN/ExperimentalPooling.cpp
Normal file
327
src/PIM/Conversion/ONNXToSpatial/NN/ExperimentalPooling.cpp
Normal file
@@ -0,0 +1,327 @@
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/Dialect/Tosa/IR/TosaOps.h"
|
||||
#include "mlir/IR/BuiltinAttributes.h"
|
||||
#include "mlir/IR/BuiltinTypeInterfaces.h"
|
||||
#include "mlir/IR/BuiltinTypes.h"
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
#include "mlir/IR/Value.h"
|
||||
#include "mlir/IR/ValueRange.h"
|
||||
#include "src/Accelerators/PIM/Common/PIMCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstddef>
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
template <typename PoolOp>
|
||||
bool hasPostProcessExperimentalPoolingWindow() {
|
||||
return false;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool hasPostProcessExperimentalPoolingWindow<ONNXAveragePoolOp>() {
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Post-processes one reduced pooling window (generic fallback).
///
/// The primary template is a deliberate no-op returning a null Value;
/// pooling ops that need a post-step (see
/// hasPostProcessExperimentalPoolingWindow) provide an explicit
/// specialization instead.
///
/// NOTE(review): this helper appears unused in the visible part of this
/// file — the converter below inlines its own divisor logic. Confirm it is
/// still needed before keeping it.
template <typename PoolOp>
Value postProcessExperimentalPoolingWindow(ConversionPatternRewriter &rewriter,
    Location loc, PoolOp poolOp, Value valueToDivide, size_t krn_size,
    size_t tilesSkippedByPadding) {
  return nullptr;
}
|
||||
|
||||
/// Average-pooling post-step: divides the reduced window sum by the number
/// of contributing elements.
///
/// @param valueToDivide the window's reduced sum; must be defined inside a
///        spat compute op (its defining op's parent is cast to one), so it
///        must not be a block argument.
/// @param krn_size the full kernel element count.
/// @param tilesSkippedByPadding how many window elements fell into padding
///        and were therefore not summed.
template <>
Value postProcessExperimentalPoolingWindow<ONNXAveragePoolOp>(
    ConversionPatternRewriter &rewriter, Location loc, ONNXAveragePoolOp poolOp,
    Value valueToDivide, size_t krn_size, size_t tilesSkippedByPadding) {
  // count_include_pad == 1: divide by the full kernel size; otherwise only
  // by the elements that actually landed inside the image.
  bool countIncludePad = poolOp.getCountIncludePad() == 1;

  size_t divisorNumber =
      countIncludePad ? krn_size : krn_size - tilesSkippedByPadding;

  RankedTensorType scalarTensor =
      RankedTensorType::get({1}, rewriter.getF32Type());

  // Put a spat.const before the computeOp, and use its value. We do this to be
  // compatible with the current code generation, which assumes constant to be
  // loaded in global memory, which is allocated by adding a spat.const OP
  // directly under func.func (i.e. alongside ComputeOps)
  auto computeOp = cast<spatial::SpatWeightedCompute>(
      valueToDivide.getDefiningOp()->getParentOp());
  rewriter.setInsertionPoint(computeOp);
  // NOTE(review): the constant's tensor type is f32 but the payload is an
  // I64IntegerAttr — confirm SpatConstantOp accepts/converts this pairing.
  auto divisorValue = rewriter.create<spatial::SpatConstantOp>(loc, scalarTensor,
      rewriter.getI64IntegerAttr(divisorNumber),
      /* should_allocate = */ rewriter.getBoolAttr(true));

  rewriter.setInsertionPointAfterValue(valueToDivide);
  return rewriter.create<spatial::SpatVSDivOp>(
      loc, valueToDivide.getType(), valueToDivide, divisorValue);
}
|
||||
|
||||
template <typename ReductionOp>
|
||||
Value reduceInputTiles(
|
||||
SmallVector<Value> &inputTiles, ConversionPatternRewriter &rewriter) {
|
||||
if (inputTiles.size() == 1) {
|
||||
return inputTiles[0];
|
||||
}
|
||||
|
||||
if (inputTiles.size() == 2) {
|
||||
return rewriter.create<spatial::SpatVMaxOp>(inputTiles[0].getLoc(),
|
||||
inputTiles[0].getType(), inputTiles[0], inputTiles[1]);
|
||||
}
|
||||
|
||||
SmallVector<Value> left(
|
||||
inputTiles.begin(), inputTiles.begin() + inputTiles.size() / 2);
|
||||
SmallVector<Value> right(
|
||||
inputTiles.begin() + inputTiles.size() / 2, inputTiles.end());
|
||||
|
||||
Value leftReduced = reduceInputTiles<ReductionOp>(left, rewriter);
|
||||
Value rightReduced = reduceInputTiles<ReductionOp>(right, rewriter);
|
||||
|
||||
return rewriter.create<ReductionOp>(
|
||||
inputTiles[0].getLoc(), leftReduced.getType(), leftReduced, rightReduced);
|
||||
}
|
||||
|
||||
template <typename PoolOp, typename PoolOpAdaptor, typename ReduceOp>
|
||||
struct ExperimentalPoolingBaseConverter : public OpConversionPattern<PoolOp> {
|
||||
ExperimentalPoolingBaseConverter(MLIRContext *ctx)
|
||||
: OpConversionPattern<PoolOp>(ctx) {}
|
||||
|
||||
LogicalResult matchAndRewrite(PoolOp poolOp, PoolOpAdaptor adaptor,
|
||||
ConversionPatternRewriter &rewriter) const final {
|
||||
Value X = adaptor.getX();
|
||||
ShapedType xShape = mlir::cast<ShapedType>(X.getType());
|
||||
Value Y = poolOp.getResult();
|
||||
ShapedType yShape = mlir::cast<ShapedType>(Y.getType());
|
||||
|
||||
size_t stride_x, stride_y, dilation_x, dilation_y, krn_w, krn_h;
|
||||
unpackOptionalPairVector(adaptor.getStrides(), stride_x, stride_y);
|
||||
unpackOptionalPairVector(adaptor.getDilations(), dilation_x, dilation_y);
|
||||
unpackOptionalPairVector(adaptor.getKernelShape(), krn_w, krn_h);
|
||||
|
||||
if (adaptor.getAutoPad() != "NOTSET") {
|
||||
return rewriter.notifyMatchFailure(
|
||||
poolOp, "auto_pad != NOTSET is deprecated.");
|
||||
}
|
||||
|
||||
size_t pad_x, pad_y;
|
||||
auto padUnpackError =
|
||||
unpackOptionalPadsVector(adaptor.getPads(), pad_x, pad_y);
|
||||
if (padUnpackError.has_value()) {
|
||||
return rewriter.notifyMatchFailure(poolOp, padUnpackError.value());
|
||||
}
|
||||
|
||||
Location loc = poolOp.getLoc();
|
||||
|
||||
size_t input_h = GET_IMAGE_HEIGHT(xShape);
|
||||
size_t input_w = GET_IMAGE_WIDTH(xShape);
|
||||
size_t output_h = GET_IMAGE_HEIGHT(yShape);
|
||||
size_t output_w = GET_IMAGE_WIDTH(yShape);
|
||||
|
||||
ldiv_t tileCount = std::div(GET_IMAGE_CHANNEL(xShape), crossbarSize);
|
||||
|
||||
// Assert that the input is a tensor.ConcatOp.
|
||||
auto concat = X.getDefiningOp<tensor::ConcatOp>();
|
||||
if (!concat) {
|
||||
return rewriter.notifyMatchFailure(
|
||||
poolOp, "Expected input to be a tensor.ConcatOp");
|
||||
}
|
||||
|
||||
// Create a [channel_tile][x][y] array to store the input tiles.
|
||||
std::map<long, std::map<long, std::map<long, Value>>> inputTiles;
|
||||
|
||||
// For each argument of the tensor.ConcatOp, resolve the input tiles.
|
||||
for (size_t y = 0; y < input_h; ++y) {
|
||||
for (size_t x = 0; x < input_w; ++x) {
|
||||
for (long it = 0; it < tileCount.quot + (tileCount.rem > 0); ++it) {
|
||||
size_t tilingSize =
|
||||
it == tileCount.quot ? tileCount.rem : crossbarSize;
|
||||
|
||||
SmallVector<OpFoldResult> strides(4, rewriter.getIndexAttr(1));
|
||||
SmallVector<OpFoldResult> offsets = {/* 0 */ rewriter.getIndexAttr(0),
|
||||
/* 1 */ rewriter.getIndexAttr(0),
|
||||
/* 2 */ rewriter.getIndexAttr(x),
|
||||
/* 3 */ rewriter.getIndexAttr(y)};
|
||||
SmallVector<OpFoldResult> sizes = {
|
||||
/* 0 */ rewriter.getIndexAttr(1), // Batch size is always 1.
|
||||
/* 1 */ rewriter.getIndexAttr(tilingSize),
|
||||
/* 2 */ rewriter.getIndexAttr(1),
|
||||
/* 3 */ rewriter.getIndexAttr(1)};
|
||||
|
||||
// Get the concat's operand that we want to slice.
|
||||
Value concatInput = concat.getOperand(it);
|
||||
Value slicedTile = rewriter.create<tensor::ExtractSliceOp>(
|
||||
loc, concatInput, offsets, sizes, strides);
|
||||
|
||||
inputTiles[it][x][y] = slicedTile;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Prepare the shape of the compute's output.
|
||||
ldiv_t itc = tileCount;
|
||||
SmallVector<Type> outputTileTypes;
|
||||
for (size_t y = 0; y < output_h; ++y) {
|
||||
for (size_t x = 0; x < output_w; ++x) {
|
||||
for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
|
||||
SmallVector<int64_t> outputShapeArray{
|
||||
/* 0 */ 1, // Batch size is always 1.
|
||||
/* 1 */
|
||||
cast<RankedTensorType>(inputTiles[it][0][0].getType())
|
||||
.getShape()[1],
|
||||
/* 2 */ 1,
|
||||
/* 3 */ 1};
|
||||
|
||||
auto elementType =
|
||||
dyn_cast<RankedTensorType>(xShape).getElementType();
|
||||
|
||||
outputTileTypes.push_back(
|
||||
RankedTensorType::get(outputShapeArray, elementType));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create a plain value list of the input tiles.
|
||||
SmallVector<Value> inputTilesList;
|
||||
for (size_t y = 0; y < input_h; ++y) {
|
||||
for (size_t x = 0; x < input_w; ++x) {
|
||||
for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
|
||||
inputTilesList.push_back(inputTiles[it][y][x]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create a single compute to calculate the output.
|
||||
auto computeOp = rewriter.create<spatial::SpatWeightedCompute>(
|
||||
loc, outputTileTypes, SmallVector<Value>(), inputTilesList);
|
||||
|
||||
// Create a new block for the compute unit and add the operands.
|
||||
Block *block = rewriter.createBlock(&computeOp.getRegion());
|
||||
|
||||
// Fill the block arguments and keep a reference to them.
|
||||
std::map<size_t, std::map<size_t, std::map<size_t, Value>>> inputTilesArgs;
|
||||
for (size_t y = 0; y < input_h; ++y) {
|
||||
for (size_t x = 0; x < input_w; ++x) {
|
||||
for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
|
||||
auto tileIndex = y * input_w * (itc.quot + (itc.rem > 0)) +
|
||||
x * (itc.quot + (itc.rem > 0)) + it;
|
||||
inputTilesArgs[it][y][x] = block->addArgument(
|
||||
computeOp->getOperand(tileIndex).getType(), loc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Begin writing in the block.
|
||||
rewriter.setInsertionPointToStart(block);
|
||||
|
||||
// Go through all pooling blocks.
|
||||
SmallVector<Value> outputTiles;
|
||||
for (size_t y = 0; y < output_h; ++y) {
|
||||
for (size_t x = 0; x < output_w; ++x) {
|
||||
for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
|
||||
size_t start_x = x * stride_x;
|
||||
size_t start_y = y * stride_y;
|
||||
size_t end_x = std::min(start_x + krn_w, input_w);
|
||||
size_t end_y = std::min(start_y + krn_h, input_h);
|
||||
|
||||
SmallVector<Value> inputTilesToReduce;
|
||||
for (size_t ky = start_y; ky < end_y; ++ky) {
|
||||
for (size_t kx = start_x; kx < end_x; ++kx) {
|
||||
inputTilesToReduce.push_back(inputTilesArgs[it][ky][kx]);
|
||||
}
|
||||
}
|
||||
|
||||
auto reduceResult =
|
||||
reduceInputTiles<ReduceOp>(inputTilesToReduce, rewriter);
|
||||
|
||||
// If the reduce op is add, we need to divide the result by the
|
||||
// number of elements in the pooling window.
|
||||
if (hasPostProcessExperimentalPoolingWindow<PoolOp>()) {
|
||||
// Add a spat.const before the computeOp.
|
||||
rewriter.setInsertionPoint(computeOp);
|
||||
auto divisorValue = rewriter.create<spatial::SpatConstantOp>(loc,
|
||||
RankedTensorType::get({1}, rewriter.getF32Type()),
|
||||
rewriter.getI64IntegerAttr(krn_w * krn_h),
|
||||
rewriter.getBoolAttr(true));
|
||||
|
||||
rewriter.setInsertionPointAfter(reduceResult.getDefiningOp());
|
||||
reduceResult = rewriter.create<spatial::SpatVSDivOp>(
|
||||
loc, reduceResult.getType(), reduceResult, divisorValue);
|
||||
}
|
||||
outputTiles.push_back(reduceResult);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create a YieldOp to return the output tiles.
|
||||
rewriter.create<spatial::SpatYieldOp>(loc, outputTiles);
|
||||
|
||||
// Set the rewrite cursor right after the computeOp.
|
||||
rewriter.setInsertionPointAfter(computeOp);
|
||||
|
||||
std::map<size_t, std::map<size_t, std::map<size_t, Value>>> computeOutput;
|
||||
for (size_t y = 0; y < output_h; ++y) {
|
||||
for (size_t x = 0; x < output_w; ++x) {
|
||||
for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
|
||||
auto tileIndex = y * output_w * (itc.quot + (itc.rem > 0)) +
|
||||
x * (itc.quot + (itc.rem > 0)) + it;
|
||||
computeOutput[it][y][x] = computeOp.getResult(tileIndex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We'll now create spat.img.concat ops to concatenate the output tiles.
|
||||
SmallVector<Value> outputTilesList;
|
||||
for (long it = 0; it < itc.quot + (itc.rem > 0); ++it) {
|
||||
SmallVector<Value> imgConcatTiles;
|
||||
for (size_t y = 0; y < output_h; ++y) {
|
||||
for (size_t x = 0; x < output_w; ++x) {
|
||||
imgConcatTiles.push_back(computeOutput[it][y][x]);
|
||||
}
|
||||
}
|
||||
|
||||
size_t tilingSize = it == tileCount.quot ? tileCount.rem : crossbarSize;
|
||||
|
||||
SmallVector<int64_t> outputShapeArray{
|
||||
/* 0 */ 1, // Batch size is always 1.
|
||||
/* 1 */ (long)tilingSize,
|
||||
/* 2 */ (long)output_w,
|
||||
/* 3 */ (long)output_h};
|
||||
|
||||
auto elementType = dyn_cast<RankedTensorType>(xShape).getElementType();
|
||||
|
||||
outputTilesList.push_back(rewriter.create<spatial::SpatImgConcatOp>(loc,
|
||||
RankedTensorType::get(outputShapeArray, elementType),
|
||||
imgConcatTiles));
|
||||
}
|
||||
|
||||
// Create a new tensor.ConcatOp to concatenate the output tiles.
|
||||
Value outputTensor =
|
||||
rewriter.create<tensor::ConcatOp>(loc, 1, outputTilesList);
|
||||
|
||||
rewriter.replaceOp(poolOp, outputTensor);
|
||||
|
||||
return success();
|
||||
}
|
||||
};
|
||||
|
||||
/// Registers the experimental pooling tiling conversions: max pooling
/// reduces each window with SpatVMaxOp, average pooling with SpatVAddOp.
void populateExperimentalPoolingTilingPattern(
    RewritePatternSet &patterns, MLIRContext *ctx) {
  using MaxPoolConverter = ExperimentalPoolingBaseConverter<
      ONNXMaxPoolSingleOutOp, ONNXMaxPoolSingleOutOpAdaptor,
      spatial::SpatVMaxOp>;
  using AvgPoolConverter = ExperimentalPoolingBaseConverter<ONNXAveragePoolOp,
      ONNXAveragePoolOpAdaptor, spatial::SpatVAddOp>;
  patterns.insert<MaxPoolConverter, AvgPoolConverter>(ctx);
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
452
src/PIM/Conversion/ONNXToSpatial/NN/Pooling.cpp
Normal file
452
src/PIM/Conversion/ONNXToSpatial/NN/Pooling.cpp
Normal file
@@ -0,0 +1,452 @@
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/Dialect/Tosa/IR/TosaOps.h"
|
||||
#include "mlir/IR/BuiltinAttributes.h"
|
||||
#include "mlir/IR/BuiltinTypeInterfaces.h"
|
||||
#include "mlir/IR/BuiltinTypes.h"
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
#include "mlir/IR/Value.h"
|
||||
#include "mlir/IR/ValueRange.h"
|
||||
#include "src/Accelerators/PIM/Common/PIMCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstddef>
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
// Presumably tracks compute ops already replaced during this conversion —
// TODO(review) confirm: it is not referenced in the visible part of this
// file. NOTE(review): mutable file-level state with external linkage;
// consider `static` or an anonymous namespace to avoid cross-TU symbol
// clashes, and note it persists across pass invocations.
llvm::SmallPtrSet<Operation *, 16> oldComputeOpsReplaced;
|
||||
|
||||
/// Reduces the given values (each produced inside a spatial compute op) to
/// a single value using the binary `reduce` callback.
///
/// Values living in the same compute op are first folded together locally;
/// the per-compute partial results are then combined pairwise across
/// computes by sending partials through spat channels, forming a binary
/// reduction tree. An optional `preprocess` is applied to every input
/// before reduction, and an optional `postprocess` to the final result
/// (both are skipped when there is only one input, matching the original
/// early-return behavior).
///
/// On return, `valuesToReduce` holds one partial result per distinct
/// compute op, in first-seen order.
///
/// Fixes over the previous revision:
///  - per-compute partials were rebuilt by iterating a std::unordered_map,
///    making the emitted IR order nondeterministic; insertion order is now
///    preserved explicitly;
///  - choosing the within-compute insertion point dereferenced a null
///    getDefiningOp() when an input was a block argument (an issue the old
///    commented-out code hinted at); block arguments are now handled;
///  - reserve() used size/2 instead of (size+1)/2 for odd worklists.
///
/// @return the root of the reduction tree (post-processed if requested).
Value applyReducePatternNew(SmallVector<Value> &valuesToReduce,
    ConversionPatternRewriter &rewriter,
    std::function<Value(const Value &, const Value &)> reduce,
    std::function<Value(const Value &)> preprocess,
    std::function<Value(const Value &)> postprocess) {
  // Simple case: if we have only one input, just return it.
  if (valuesToReduce.size() == 1) {
    return valuesToReduce[0];
  }

  if (preprocess) {
    for (auto &valToReduce : valuesToReduce) {
      rewriter.setInsertionPointAfterValue(valToReduce);
      valToReduce = preprocess(valToReduce);
    }
  }

  // `valuesToReduce` may contain several entries produced by the same
  // compute op: fold those together inside that compute first.
  // `slotForCompute` maps each compute op to its slot in
  // `perComputePartials`, which preserves first-seen order so the result is
  // deterministic (unlike iterating an unordered_map).
  std::unordered_map<Operation *, size_t> slotForCompute;
  SmallVector<Value> perComputePartials;
  for (Value valToReduce : valuesToReduce) {
    // Works for both op results and block arguments.
    Operation *computeOp = valToReduce.getParentBlock()->getParentOp();
    assert(isa<spatial::SpatWeightedCompute>(computeOp) && "Expected a ComputeOp");

    auto [slotIt, inserted] =
        slotForCompute.try_emplace(computeOp, perComputePartials.size());
    if (inserted) {
      perComputePartials.push_back(valToReduce);
      continue;
    }

    // Already saw this compute: apply the reduction within-compute. Insert
    // after whichever value becomes available last; a value without a
    // defining op is a block argument, available from the block start.
    Value &partial = perComputePartials[slotIt->second];
    Operation *partialDef = partial.getDefiningOp();
    Operation *currDef = valToReduce.getDefiningOp();
    if (!currDef || (partialDef && currDef->isBeforeInBlock(partialDef)))
      rewriter.setInsertionPointAfterValue(partial);
    else
      rewriter.setInsertionPointAfterValue(valToReduce);
    partial = reduce(partial, valToReduce);
  }

  // Expose the per-compute partials to the caller (one per compute op).
  valuesToReduce.assign(perComputePartials.begin(), perComputePartials.end());

  Location loc = valuesToReduce[0].getLoc();
  auto channelType = spatial::SpatChannelType::get(rewriter.getContext());

  // Tree reduction across computes:
  // - take two partials at a time and combine them into one, halving the
  //   worklist each round;
  // - repeat until a single value remains.
  SmallVector<Value> worklist(valuesToReduce.begin(), valuesToReduce.end());
  while (worklist.size() > 1) {
    SmallVector<Value> nextWorklist;
    nextWorklist.reserve((worklist.size() + 1) / 2);
    for (size_t i = 0; i + 1 < worklist.size(); i += 2) {
      auto firstValue = worklist[i];
      auto secondValue = worklist[i + 1];

      auto firstCompute = firstValue.getParentBlock()->getParentOp();
      auto secondCompute = secondValue.getParentBlock()->getParentOp();

      assert(isa<spatial::SpatWeightedCompute>(firstCompute));
      assert(isa<spatial::SpatWeightedCompute>(secondCompute));

      // Normalize so that `firstCompute` comes earlier in the block: the
      // send happens in the earlier compute, the receive in the later one.
      if (secondCompute->isBeforeInBlock(firstCompute)) {
        std::swap(firstValue, secondValue);
        std::swap(firstCompute, secondCompute);
      }

      // 1. Add a channel before the first computeOp.
      rewriter.setInsertionPoint(firstCompute);
      auto channel = rewriter.create<spatial::SpatChannelNewOp>(loc, channelType);

      // 2. Add a sendOp after the first value.
      rewriter.setInsertionPointAfterValue(firstValue);
      rewriter.create<spatial::SpatChannelSendOp>(loc, channel, firstValue);

      // 3. Add a receiveOp after the second value.
      rewriter.setInsertionPointAfterValue(secondValue);
      auto receivedValue = rewriter.create<spatial::SpatChannelReceiveOp>(
          loc, secondValue.getType(), channel);

      // 4. Apply the reduction between received and second value.
      rewriter.setInsertionPointAfterValue(receivedValue);
      nextWorklist.push_back(reduce(receivedValue, secondValue));
    }

    // Odd worklist: the unpaired last value advances to the next round.
    if (worklist.size() % 2 == 1) {
      nextWorklist.push_back(worklist.back());
    }

    worklist = std::move(nextWorklist);
  }

  assert(worklist.size() == 1 &&
         "Internal error: expected a single input at this point.");

  Value finalValue = worklist[0];

  if (postprocess) {
    rewriter.setInsertionPointAfterValue(finalValue);
    finalValue = postprocess(finalValue);
  }

  return finalValue;
}
|
||||
|
||||
template <typename PoolOp>
|
||||
bool hasPostProcessPoolingWindow() {
|
||||
return false;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool hasPostProcessPoolingWindow<ONNXAveragePoolOp>() {
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Post-processes one reduced pooling window (generic fallback).
///
/// The primary template is a deliberate no-op returning a null Value;
/// pooling ops that need a post-step (see hasPostProcessPoolingWindow)
/// provide an explicit specialization instead.
template <typename PoolOp>
Value postProcessPoolingWindow(ConversionPatternRewriter &rewriter,
    Location loc, PoolOp poolOp, Value valueToDivide, size_t krn_size,
    size_t tilesSkippedByPadding) {
  return nullptr;
}
|
||||
|
||||
/// Average-pooling post-step: divides the reduced window sum by the number
/// of contributing elements.
///
/// @param valueToDivide the window's reduced sum; must be defined inside a
///        spat compute op (its defining op's parent is cast to one), so it
///        must not be a block argument.
/// @param krn_size the full kernel element count.
/// @param tilesSkippedByPadding how many window elements fell into padding
///        and were therefore not summed.
template <>
Value postProcessPoolingWindow<ONNXAveragePoolOp>(
    ConversionPatternRewriter &rewriter, Location loc, ONNXAveragePoolOp poolOp,
    Value valueToDivide, size_t krn_size, size_t tilesSkippedByPadding) {
  // count_include_pad == 1: divide by the full kernel size; otherwise only
  // by the elements that actually landed inside the image.
  bool countIncludePad = poolOp.getCountIncludePad() == 1;

  size_t divisorNumber =
      countIncludePad ? krn_size : krn_size - tilesSkippedByPadding;

  RankedTensorType scalarTensor =
      RankedTensorType::get({1}, rewriter.getF32Type());

  // Put a spat.const before the computeOp, and use its value. We do this to be
  // compatible with the current code generation, which assumes constant to be
  // loaded in global memory, which is allocated by adding a spat.const OP
  // directly under func.func (i.e. alongside ComputeOps)
  auto computeOp = cast<spatial::SpatWeightedCompute>(
      valueToDivide.getDefiningOp()->getParentOp());
  rewriter.setInsertionPoint(computeOp);
  // NOTE(review): the constant's tensor type is f32 but the payload is an
  // I64IntegerAttr — confirm SpatConstantOp accepts/converts this pairing.
  auto divisorValue = rewriter.create<spatial::SpatConstantOp>(loc, scalarTensor,
      rewriter.getI64IntegerAttr(divisorNumber),
      /* should_allocate = */ rewriter.getBoolAttr(true));

  rewriter.setInsertionPointAfterValue(valueToDivide);
  return rewriter.create<spatial::SpatVSDivOp>(
      loc, valueToDivide.getType(), valueToDivide, divisorValue);
}
|
||||
|
||||
/// Generic tiled lowering of an ONNX pooling op (max or average) onto the
/// spatial dialect.
///
/// Template parameters:
///   PoolOp        - the ONNX pooling op being converted.
///   PoolOpAdaptor - its generated adaptor type.
///   ReduceOp      - the elementwise spatial op used to combine two window
///                   elements (spat.vmax for max pool, spat.vadd for avg pool).
///
/// Strategy: the NCHW input is tiled per channel-group and per pixel; each
/// output pixel tile is produced by reducing its input window inside a
/// computeOp and shipping the result out through a channel.
template <typename PoolOp, typename PoolOpAdaptor, typename ReduceOp>
struct PoolingBaseConverter : public OpConversionPattern<PoolOp> {
  PoolingBaseConverter(MLIRContext *ctx) : OpConversionPattern<PoolOp>(ctx) {}

  LogicalResult matchAndRewrite(PoolOp poolOp, PoolOpAdaptor adaptor,
      ConversionPatternRewriter &rewriter) const final {
    Value X = adaptor.getX();
    ShapedType xShape = mlir::cast<ShapedType>(X.getType());
    Value Y = poolOp.getResult();
    ShapedType yShape = mlir::cast<ShapedType>(Y.getType());

    // Unpack the (optional) pooling hyper-parameters; absent attributes
    // default to 1 per unpackOptionalPairVector.
    size_t stride_x, stride_y, dilation_x, dilation_y, krn_w, krn_h;
    unpackOptionalPairVector(adaptor.getStrides(), stride_x, stride_y);
    unpackOptionalPairVector(adaptor.getDilations(), dilation_x, dilation_y);
    unpackOptionalPairVector(adaptor.getKernelShape(), krn_w, krn_h);

    if (adaptor.getAutoPad() != "NOTSET") {
      return rewriter.notifyMatchFailure(
          poolOp, "auto_pad != NOTSET is deprecated.");
    }

    size_t pad_x, pad_y;
    auto padUnpackError =
        unpackOptionalPadsVector(adaptor.getPads(), pad_x, pad_y);
    if (padUnpackError.has_value()) {
      return rewriter.notifyMatchFailure(poolOp, padUnpackError.value());
    }

    Location loc = poolOp.getLoc();

    size_t input_h = GET_IMAGE_HEIGHT(xShape);
    size_t input_w = GET_IMAGE_WIDTH(xShape);
    size_t output_h = GET_IMAGE_HEIGHT(yShape);
    size_t output_w = GET_IMAGE_WIDTH(yShape);
    // Channels are split into crossbar-sized groups; the last group may be
    // narrower (channelTileRest) when C is not a multiple of crossbarSize.
    size_t channelTileCount =
        ceilIntegerDivide(GET_IMAGE_CHANNEL(xShape), crossbarSize.getValue());
    size_t channelTileRest = GET_IMAGE_CHANNEL(xShape) % crossbarSize;

    // 1: Tile the input tensor
    // Input tiles need to be indexed by:
    // a. Channel Tile
    // b. Pixel `x` position
    // c. Pixel `y` position
    // For example: inputTiles[channelTile][x][y]
    // Example complete input tensor: tensor<1x3x12x12xf32> (NxCxWxH)
    // Suppose that the input tensor is produced by concatenating the results of
    // many ComputeOps. Get the result tiles from these ComputeOps.
    SmallVector<SmallVector<SmallVector<Value>>> inputTiles(channelTileCount,
        SmallVector<SmallVector<Value>>(input_w, SmallVector<Value>(input_h)));

    auto resolveErrorOpt = resolveImgInputTiles(X, inputTiles, channelTileCount,
        channelTileRest, input_w, input_h, rewriter);
    if (resolveErrorOpt.has_value()) {
      return rewriter.notifyMatchFailure(poolOp, *resolveErrorOpt);
    }

    // TODO: This requires a core for each input tile, which is not ideal. We
    // can do better.
    // If some input tiles come from the func.func operands, load
    // them into a computeOp and yield them
    for (size_t t = 0; t < channelTileCount; t++) {
      for (size_t x = 0; x < input_w; x++) {
        for (size_t y = 0; y < input_h; y++) {
          if (auto extractSliceOp =
                  inputTiles[t][x][y].getDefiningOp<tensor::ExtractSliceOp>()) {
            Location tileLoc = extractSliceOp.getLoc();

            // Wrap the raw slice in a pass-through computeOp whose body just
            // yields its block argument, so downstream code can uniformly
            // treat every input tile as a computeOp result.
            auto tempComputeOp = rewriter.create<spatial::SpatWeightedCompute>(
                tileLoc, extractSliceOp.getResultType(),
                /* xbarWeights =*/ValueRange(), extractSliceOp.getResult());

            Block *tempComputeOpBlock = new Block();
            tempComputeOp.getBody().push_back(tempComputeOpBlock);
            auto tempComputeOpBlockArg = tempComputeOpBlock->addArgument(
                extractSliceOp.getType(), tileLoc);

            rewriter.setInsertionPointToStart(tempComputeOpBlock);
            rewriter.create<spatial::SpatYieldOp>(tileLoc, tempComputeOpBlockArg);
            rewriter.setInsertionPointAfter(tempComputeOp);
            inputTiles[t][x][y] = tempComputeOp.getResult(0);
          }
        }
      }
    }

    // 2: Tile the output tensor
    // Output tiles need to be indexed by:
    // a. Channel Tile
    // b. Pixel `x` position
    // c. Pixel `y` position
    // For example: outputTiles[channelTile][x][y]
    // Example complete output tensor: tensor<1x3x6x6xf32> (NxCxWxH)
    SmallVector<SmallVector<SmallVector<Value>>> outputTiles(
        channelTileCount, SmallVector<SmallVector<Value>>(
                              output_w, SmallVector<Value>(output_h, nullptr)));

    // List of values to pool for each output pixel
    SmallVector<Value> valuesToPool;

    // Iterate each output tile
    for (size_t outTile = 0; outTile < channelTileCount; outTile++) {
      // Iterate each output pixel
      for (size_t outX = 0; outX < output_w; outX++) {
        for (size_t outY = 0; outY < output_h; outY++) {

          // Each output pixel tile is computed by pooling a window of input
          // pixel tiles
          valuesToPool.clear();
          size_t tilesSkippedByPadding = 0;

          auto [start_x, end_x] = kernel_get_start_and_end(
              outX, input_w, krn_w, stride_x, dilation_x, pad_x);
          auto [start_y, end_y] = kernel_get_start_and_end(
              outY, input_h, krn_h, stride_y, dilation_y, pad_y);

          for (size_t inX = start_x; inX < end_x; inX += dilation_x) {
            for (size_t inY = start_y; inY < end_y; inY += dilation_y) {
              // Window positions landing on padding contribute nothing; they
              // are only counted so AveragePool can adjust its divisor.
              if (failed(verifyWithinBoundsAndPaddings(
                      input_w, input_h, inX, inY, pad_x, pad_y))) {
                tilesSkippedByPadding++;
                continue;
              }

              Value inputTile = inputTiles[outTile][inX][inY];

              // Pull the value "inside" the producing computeOp (the operand
              // of its yield) or, for channel receives, the value that was
              // sent — pooling happens on in-compute values, not results.
              Value valueToPool;
              if (auto computeProducer =
                      inputTile.getDefiningOp<spatial::SpatWeightedCompute>()) {

                int resultNumber = getResultIndex(computeProducer, inputTile);

                auto yieldInComputeOp = cast<spatial::SpatYieldOp>(
                    computeProducer.getBody().front().getTerminator());
                valueToPool = yieldInComputeOp.getOperand(resultNumber);
              } else if (auto receiveProducer =
                             inputTile
                                 .getDefiningOp<spatial::SpatChannelReceiveOp>()) {
                auto sendOpOpt =
                    getOtherEndOfChannel(receiveProducer, true, rewriter);
                if (failed(sendOpOpt)) {
                  return rewriter.notifyMatchFailure(poolOp,
                      "ChannelReceiveOp does not have a matching "
                      "ChannelSendOp.");
                }
                auto sendOp = cast<spatial::SpatChannelSendOp>(*sendOpOpt);

                valueToPool = sendOp.getData();
              } else {
                return rewriter.notifyMatchFailure(poolOp,
                    "Input tile for Pooling is not produced by a "
                    "WeightedComputeOp nor a receiveOp");
              }

              valuesToPool.push_back(valueToPool);
            }
          }

          assert(valuesToPool.size() != 0 &&
                 "Pooling computed on zero tiles make no sense.");
          // assert(computeOpsForPooling.size() != 1 &&
          //        "Pooling computed on one tiles make no sense??? Or maybe
          //        this " "should have been simplified earlier???");

          // AveragePool attaches a divide step to the final reduced value;
          // MaxPool has no post-processing.
          std::function<Value(const Value &)> postProcessFn = nullptr;
          if (hasPostProcessPoolingWindow<PoolOp>()) {
            postProcessFn = [&](const Value prevFinalRes) {
              return postProcessPoolingWindow(rewriter, loc, poolOp,
                  prevFinalRes, krn_h * krn_w, tilesSkippedByPadding);
            };
          }

          Value reducedWithinCompute = applyReducePatternNew(
              valuesToPool, rewriter,
              [&](const Value lhs, const Value rhs) {
                return rewriter.create<ReduceOp>(loc, lhs.getType(), lhs, rhs);
              },
              nullptr, postProcessFn);

          // Send this value through a channel, and receive it in the
          // `func.func`. During lowering, we will need to "move it" into the
          // users computeOps
          auto computeOpOfReduced = cast<spatial::SpatWeightedCompute>(
              reducedWithinCompute.getDefiningOp()->getParentOp());

          // Create a new channel before the computeOp
          rewriter.setInsertionPoint(computeOpOfReduced);
          auto reduceChannel = rewriter.create<spatial::SpatChannelNewOp>(
              loc, spatial::SpatChannelType::get(rewriter.getContext()));

          // Send value through the channel
          rewriter.setInsertionPointAfterValue(reducedWithinCompute);
          rewriter.create<spatial::SpatChannelSendOp>(
              loc, reduceChannel, reducedWithinCompute);

          // Receive after the computeOp
          rewriter.setInsertionPointAfter(computeOpOfReduced);
          auto receivedValue = rewriter.create<spatial::SpatChannelReceiveOp>(
              loc, reducedWithinCompute.getType(), reduceChannel);

          outputTiles[outTile][outX][outY] = receivedValue;
        }
      }
    }

    // TODO: outputTiles are not the results of the computeOps! We need to add
    // them!

    // NOTE(review): computeOpNeedingResults is populated below but never read
    // before the function returns — it looks like unfinished work for the TODO
    // above. Confirm whether it can be removed or must still be wired up.
    std::unordered_map<Operation *,
        SmallVector<std::tuple<size_t, size_t, size_t, Value>>>
        computeOpNeedingResults;

    // Iterate each output tile
    for (size_t outTile = 0; outTile < channelTileCount; outTile++) {
      // Iterate each output pixel
      for (size_t outX = 0; outX < output_w; outX++) {
        for (size_t outY = 0; outY < output_h; outY++) {
          auto outputTile = outputTiles[outTile][outX][outY];
          auto outputTileProducer = outputTile.getDefiningOp()->getParentOp();
          if (!outputTileProducer) {
            return rewriter.notifyMatchFailure(poolOp,
                "Output tile for Pooling is not produced by a "
                "WeightedComputeOp.");
          }

          computeOpNeedingResults[outputTileProducer].push_back(
              std::make_tuple(outTile, outX, outY, outputTile));
        }
      }
    }

    // Stitch all output pixel tiles back into the full NCHW result image.
    Value outputImage =
        createImgConcatOp(outputTiles, rewriter, loc, poolOp.getType());

    rewriter.replaceOp(poolOp, outputImage);

    return success();
  }
};
|
||||
|
||||
/// Register the tiled pooling lowerings: MaxPool windows reduce with
/// spat.vmax, AveragePool windows reduce with spat.vadd (followed by the
/// AveragePool-specific divide post-processing).
void populatePoolingTilingPattern(
    RewritePatternSet &patterns, MLIRContext *ctx) {
  using MaxPoolConverter = PoolingBaseConverter<ONNXMaxPoolSingleOutOp,
      ONNXMaxPoolSingleOutOpAdaptor, spatial::SpatVMaxOp>;
  using AveragePoolConverter = PoolingBaseConverter<ONNXAveragePoolOp,
      ONNXAveragePoolOpAdaptor, spatial::SpatVAddOp>;
  patterns.insert<MaxPoolConverter, AveragePoolConverter>(ctx);
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
90
src/PIM/Conversion/ONNXToSpatial/NN/ReduceMean.cpp
Normal file
90
src/PIM/Conversion/ONNXToSpatial/NN/ReduceMean.cpp
Normal file
@@ -0,0 +1,90 @@
|
||||
|
||||
|
||||
#include "Conversion/ONNXToSpatial/ONNXToSpatialPatterns.hpp"
|
||||
#include "mlir/Transforms/DialectConversion.h"
|
||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||
|
||||
using namespace mlir;
|
||||
namespace onnx_mlir {
|
||||
|
||||
struct ReduceMeanConversionPattern
|
||||
: public OpConversionPattern<ONNXReduceMeanV13Op> {
|
||||
|
||||
ReduceMeanConversionPattern(MLIRContext *ctx) : OpConversionPattern(ctx) {}
|
||||
|
||||
LogicalResult matchAndRewrite(ONNXReduceMeanV13Op reduceMean,
|
||||
ONNXReduceMeanV13OpAdaptor adaptor,
|
||||
ConversionPatternRewriter &rewriter) const final {
|
||||
|
||||
// Get the input tensor.
|
||||
Value inputTensor = adaptor.getData();
|
||||
auto inputTensorType = cast<RankedTensorType>(inputTensor.getType());
|
||||
|
||||
// This pattern will substitute the ONNXReduceMeanV13Op with a
|
||||
// ONNXAveragePoolOp with the same input tensor and an appropriate kernel
|
||||
// shape and strides.
|
||||
|
||||
// To get the stride and shape of the kernel, we need to read the tensor
|
||||
// shape.
|
||||
int image_height = inputTensorType.getShape()[2];
|
||||
int image_width = inputTensorType.getShape()[3];
|
||||
|
||||
// Define the kernel shape and strides.
|
||||
SmallVector<int64_t> kernelShapeVals = {image_height, image_width};
|
||||
SmallVector<int64_t> stridesVals = {image_height, image_width};
|
||||
SmallVector<int64_t> dilationsVals = {1, 1};
|
||||
|
||||
// Set the pads to 0.
|
||||
SmallVector<int64_t> padsVals = {0, 0, 0, 0};
|
||||
|
||||
// Create the ArrayAttrs
|
||||
auto kernelShape = mlir::ArrayAttr::get(rewriter.getContext(),
|
||||
llvm::to_vector(
|
||||
llvm::map_range(kernelShapeVals, [&](int64_t v) -> mlir::Attribute {
|
||||
return rewriter.getI64IntegerAttr(v);
|
||||
})));
|
||||
|
||||
auto strides = mlir::ArrayAttr::get(rewriter.getContext(),
|
||||
llvm::to_vector(
|
||||
llvm::map_range(stridesVals, [&](int64_t v) -> mlir::Attribute {
|
||||
return rewriter.getI64IntegerAttr(v);
|
||||
})));
|
||||
|
||||
auto dilations = mlir::ArrayAttr::get(rewriter.getContext(),
|
||||
llvm::to_vector(
|
||||
llvm::map_range(dilationsVals, [&](int64_t v) -> mlir::Attribute {
|
||||
return rewriter.getI64IntegerAttr(v);
|
||||
})));
|
||||
|
||||
auto pads = mlir::ArrayAttr::get(rewriter.getContext(),
|
||||
llvm::to_vector(
|
||||
llvm::map_range(padsVals, [&](int64_t v) -> mlir::Attribute {
|
||||
return rewriter.getI64IntegerAttr(v);
|
||||
})));
|
||||
|
||||
// Create the resulting tensor type.
|
||||
auto resultType = RankedTensorType::get(
|
||||
/*shape=*/{inputTensorType.getShape()[0], inputTensorType.getShape()[1],
|
||||
1, 1},
|
||||
/*elementType=*/inputTensorType.getElementType());
|
||||
|
||||
// Create the ONNXAveragePoolOp.
|
||||
auto averagePool = rewriter.create<ONNXAveragePoolOp>(reduceMean.getLoc(),
|
||||
resultType, inputTensor, /*auto_pad=*/"NOTSET",
|
||||
/*ceil_mode=*/0, /*count_include_pad=*/1, dilations,
|
||||
/*kernel_shape=*/kernelShape,
|
||||
/*pads=*/pads, /*strides=*/strides);
|
||||
|
||||
// Replace the ONNXReduceMeanV13Op with the ONNXAveragePoolOp.
|
||||
rewriter.replaceOp(reduceMean, averagePool.getResult());
|
||||
|
||||
return success();
|
||||
}
|
||||
};
|
||||
|
||||
/// Register the ReduceMean -> global AveragePool rewrite.
void populateReduceMeanConversionPattern(
    RewritePatternSet &patterns, MLIRContext *ctx) {
  patterns.add<ReduceMeanConversionPattern>(ctx);
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
79
src/PIM/Conversion/ONNXToSpatial/ONNXToSpatial.td
Normal file
79
src/PIM/Conversion/ONNXToSpatial/ONNXToSpatial.td
Normal file
@@ -0,0 +1,79 @@
|
||||
#ifndef ONNX_TO_SPATIAL
|
||||
#define ONNX_TO_SPATIAL
|
||||
|
||||
#ifndef OP_BASE
|
||||
include "mlir/Dialect/Tensor/IR/TensorOps.td"
|
||||
include "mlir/Dialect/Arith/IR/ArithOps.td"
|
||||
include "src/Dialect/ONNX/ONNX.td"
|
||||
include "src/Accelerators/PIM/Dialect/Spatial/Spatial.td"
|
||||
#endif // OP_BASE
|
||||
|
||||
// Rewrite an onnx.Constant carrying a dense `value` attribute into a plain
// arith.constant holding that same attribute.
// NOTE(review): the other value_* attribute slots are matched but unused; a
// constant expressed through one of them would leave $value empty — confirm
// such constants cannot reach this pattern.
def onnxToArithConstantOp : Pat<
  (ONNXConstantOp $sparse_value, $value, $value_float, $value_floats, $value_int, $value_ints, $value_string, $value_strings),
  (Arith_ConstantOp $value)
>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// ONNXMatMulOp to ONNXGemmOp patterns
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// Fold Add(MatMul(A, B), C) into a single Gemm(A, B, C) with
// alpha = beta = 1 and no transposes.
def matMulAddToGemmPattern : Pat<
  (ONNXAddOp (ONNXMatMulOp:$matmulres $A, $B), $C),
  (ONNXGemmOp $A, $B, $C,
    /* alpha = */ (NativeCodeCall<"$_builder.getF32FloatAttr(1)">),
    /* beta = */ (NativeCodeCall<"$_builder.getF32FloatAttr(1)">),
    /* transA = */ (NativeCodeCall<"IntegerAttr::get($_builder.getIntegerType(64, true), 0)">),
    /* transB = */ (NativeCodeCall<"IntegerAttr::get($_builder.getIntegerType(64, true), 0)">)
  )
>;
|
||||
|
||||
// Rewrite a bare MatMul(A, B) into Gemm(A, B, C) with beta = 0, supplying
// an empty tensor of the MatMul result shape as the (ignored) C operand.
// The bound $matmulres result value is passed to the NativeCodeCall as an
// argument so DRR substitutes it as $0; a bare `matmulres` identifier is
// not substituted in the generated rewriter, and a trailing ';' inside a
// NativeCodeCall breaks the generated expression.
def matMulToGemmPattern : Pat<
  (ONNXMatMulOp:$matmulres $A, $B),
  (
    ONNXGemmOp $A, $B,
    /* C = */ (NativeCodeCall<"$_builder.create<tensor::EmptyOp>($_loc, cast<ShapedType>($0.getType()).getShape(), cast<ShapedType>($0.getType()).getElementType())"> $matmulres),
    /* alpha = */ (NativeCodeCall<"$_builder.getF32FloatAttr(1)">),
    /* beta = */ (NativeCodeCall<"$_builder.getF32FloatAttr(0)">),
    /* transA = */ (NativeCodeCall<"IntegerAttr::get($_builder.getIntegerType(64, true), 0)">),
    /* transB = */ (NativeCodeCall<"IntegerAttr::get($_builder.getIntegerType(64, true), 0)">)
  )
>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// ONNXConvOp + ONNXAddOp to ONNXConvOp pattern
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// This pattern is used to fuse an ONNXConvOp and an ONNXAddOp into a single
|
||||
// ONNXConvOp with a bias.
|
||||
// Fuse Add(addend, Conv(...)) into a single Conv using the Add operand as
// the conv bias.
// NOTE(review): the matched conv's existing $bias is discarded — if the conv
// already carries a real bias this silently drops it; consider constraining
// $bias to be none. Also, ONNX Conv bias must be 1-D of size M while the Add
// operand is typically a full tensor — confirm the shapes line up here.
def convAddToConvWithBiasPatternLeft : Pat<
  (ONNXAddOp $add_operand, (ONNXConvOp:$convres $x, $w, $bias, $auto_pad, $dilations, $group, $kernel_shape, $pad, $strides)),
  (ONNXConvOp $x, $w, $add_operand, $auto_pad, $dilations, $group, $kernel_shape, $pad, $strides)
>;
|
||||
|
||||
// Mirror of convAddToConvWithBiasPatternLeft for Add(Conv(...), addend).
// NOTE(review): same caveats — the conv's existing $bias is discarded and the
// Add operand is assumed to be usable as a conv bias; confirm both.
def convAddToConvWithBiasPatternRight : Pat<
  (ONNXAddOp (ONNXConvOp:$convres $x, $w, $bias, $auto_pad, $dilations, $group, $kernel_shape, $pad, $strides), $add_operand),
  (ONNXConvOp $x, $w, $add_operand, $auto_pad, $dilations, $group, $kernel_shape, $pad, $strides)
>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Operation to ignore (i.e. remove)
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// Identity NativeCodeCall: replaces the matched op with one of its own
// operands.
def replaceWithOperationOfValue : NativeCodeCall<"$0">;

// Drop LRN entirely, forwarding its input unchanged.
// NOTE(review): this changes the numerics of the model — confirm the
// accuracy impact of skipping LRN is accepted for this accelerator.
def removeLRNPattern : Pat<
  (ONNXLRNOp $A, $_, $_, $_, $_),
  (replaceWithOperationOfValue $A)
>;
|
||||
|
||||
// Constraint: both values have identical, fully static shapes (checked by
// the C++ helper onnx_mlir::haveSameStaticShape).
def HaveSameStaticShape: Constraint<
  CPred<"onnx_mlir::haveSameStaticShape($0, $1)">,
  "Two tensors have the same static shape">;

// A Flatten whose result shape equals its input shape is a no-op: forward
// the input value instead.
def removeFlattenSameShapePattern : Pat<
  (ONNXFlattenOp:$flattenOp $A, $axis),
  (replaceWithOperationOfValue $A),
  [(HaveSameStaticShape $flattenOp, $A)]
>;
|
||||
|
||||
#endif // ONNX_TO_SPATIAL
|
||||
499
src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.cpp
Normal file
499
src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.cpp
Normal file
@@ -0,0 +1,499 @@
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/Dialect/Tosa/IR/TosaOps.h"
|
||||
#include "mlir/IR/BuiltinAttributes.h"
|
||||
#include "mlir/IR/BuiltinTypes.h"
|
||||
#include "mlir/IR/Location.h"
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
#include "mlir/IR/Value.h"
|
||||
#include "mlir/Transforms/DialectConversion.h"
|
||||
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/ADT/Twine.h"
|
||||
#include "llvm/Support/Casting.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <optional>
|
||||
#include <utility>
|
||||
|
||||
#include "ONNXToSpatialCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
/// Split `tensorToSlice` along `axis` into consecutive tensor.extract_slice
/// chunks of `sliceSize` elements each; the trailing chunk shrinks to the
/// remainder when the dimension is not evenly divisible. Chunks are returned
/// in ascending offset order.
SmallVector<Value> sliceTensor(
    const Value& tensorToSlice, size_t axis, int64_t sliceSize, ConversionPatternRewriter& rewriter, Location loc) {
  ArrayRef<long> dims = getTensorShape(tensorToSlice);
  assert(axis < dims.size() && "Invalid axis");

  const size_t rank = dims.size();
  SmallVector<OpFoldResult> sliceStrides(rank, rewriter.getIndexAttr(1));
  SmallVector<OpFoldResult> sliceOffsets(rank, rewriter.getIndexAttr(0));

  // Each slice spans the full extent of every dimension except `axis`.
  SmallVector<OpFoldResult> sliceSizes;
  sliceSizes.reserve(rank);
  for (long dim : dims)
    sliceSizes.push_back(rewriter.getIndexAttr(dim));
  sliceSizes[axis] = rewriter.getIndexAttr(sliceSize);

  auto [sliceCount, tailSize] =
      ceilIntegerDivideWithRemainder(dims[axis], sliceSize);

  SmallVector<Value> result;
  result.reserve(sliceCount);
  for (int64_t sliceIdx = 0; sliceIdx < sliceCount; sliceIdx++) {
    sliceOffsets[axis] = rewriter.getIndexAttr(sliceIdx * sliceSize);
    // The final slice covers only whatever remains of the dimension.
    if (sliceIdx == sliceCount - 1 && tailSize != 0)
      sliceSizes[axis] = rewriter.getIndexAttr(tailSize);

    result.push_back(rewriter.create<tensor::ExtractSliceOp>(
        loc, tensorToSlice, sliceOffsets, sliceSizes, sliceStrides));
  }

  return result;
}
|
||||
|
||||
/// Split a vector-shaped tensor (1xN or Nx1) into chunks of `sliceSize`
/// along its non-unit dimension.
SmallVector<Value>
sliceVector(const Value& vectorToSlice, int64_t sliceSize, ConversionPatternRewriter& rewriter, Location loc) {
  ArrayRef<long> dims = getTensorShape(vectorToSlice);
  assert(isVectorShape(dims) && "Not a vector");
  // Slice whichever dimension actually holds the elements.
  const size_t sliceAxis = (dims[0] == 1) ? 1 : 0;
  return sliceTensor(vectorToSlice, sliceAxis, sliceSize, rewriter, loc);
}
|
||||
|
||||
/// Slice a vector into crossbar-sized chunks and bucket them per core:
/// consecutive chunks fill one core (crossbarCountInCore chunks each)
/// before moving on to the next.
DenseMap<CoreId, SmallVector<Value>>
sliceVectorPerCrossbarPerCore(const Value& vectorToSlice, ConversionPatternRewriter& rewriter, Location loc) {
  DenseMap<CoreId, SmallVector<Value>> slicesPerCore;
  size_t chunkId = 0;
  for (Value chunk : sliceVector(vectorToSlice, crossbarSize, rewriter, loc)) {
    slicesPerCore[chunkId / crossbarCountInCore].push_back(chunk);
    ++chunkId;
  }
  return slicesPerCore;
}
|
||||
|
||||
/// Tile a 2-D matrix: first cut it into hSliceSize-wide horizontal bands
/// (axis 1), then cut each band into vSliceSize-tall tiles (axis 0). Tiles
/// are grouped by band, and within a band consecutive vertical tiles fill
/// one core (crossbarCountInCore tiles each) before the next core.
DenseMap<HSliceId, DenseMap<CoreId, SmallVector<Value>>> tileMatrix(
    Value& matrixToTile, int64_t hSliceSize, int64_t vSliceSize, ConversionPatternRewriter& rewriter, Location& loc) {
  assert(isMatrixShape(getTensorShape(matrixToTile)) && "Not a matrix");

  DenseMap<HSliceId, DenseMap<CoreId, SmallVector<Value>>> tiling;

  SmallVector<Value> bands = sliceTensor(matrixToTile, 1, hSliceSize, rewriter, loc);
  for (size_t bandId = 0; bandId < bands.size(); bandId++) {
    SmallVector<Value> bandTiles = sliceTensor(bands[bandId], 0, vSliceSize, rewriter, loc);
    for (size_t tileId = 0; tileId < bandTiles.size(); tileId++)
      tiling[bandId][tileId / crossbarCountInCore].push_back(bandTiles[tileId]);
  }
  return tiling;
}
|
||||
|
||||
/// Broadcast a scalar-shaped tensor into a 1 x `length` row vector: extract
/// the single element at the all-zero index and splat it.
tensor::SplatOp
broadcastToVector(Value scalarToBroadcast, int64_t length, ConversionPatternRewriter& rewriter, Location loc) {
  auto scalarType = cast<RankedTensorType>(scalarToBroadcast.getType());

  // Result keeps the element type but takes shape {1, length}.
  int64_t resultDims[2] = {1, length};
  Type resultType = scalarType.cloneWith(ArrayRef(resultDims), scalarType.getElementType());

  // Read the one element the scalar tensor holds.
  auto zeroIdx = rewriter.create<arith::ConstantIndexOp>(loc, 0).getResult();
  SmallVector<Value> zeroIndices(scalarType.getRank(), zeroIdx);
  auto element = rewriter.create<tensor::ExtractOp>(loc, scalarToBroadcast, zeroIndices).getResult();

  return rewriter.create<tensor::SplatOp>(loc, resultType, element);
}
|
||||
|
||||
/// Sum a list of equally-typed tensor values into one value using a balanced
/// pairwise tree of spat.vadd ops (log depth instead of a sequential chain).
/// A single-element list is returned unchanged.
/// NOTE(review): an empty `tensors` list falls through to the final assert —
/// confirm callers never pass an empty list.
Value sumTensors(ArrayRef<Value> tensors, ConversionPatternRewriter& rewriter) {
  if (tensors.size() == 1)
    return tensors[0];

  // Ping-pong between two buffers: `currTensors` holds this round's
  // operands, `nextTensors` collects the pairwise sums.
  SmallVector<Value> tensors1 = {tensors.begin(), tensors.end()};
  SmallVector<Value> tensors2;
  tensors2.reserve(tensors.size() / 2);

  auto* currTensors = &tensors1;
  auto* nextTensors = &tensors2;
  while (currTensors->size() > 1) {
    for (size_t i = 0; i < currTensors->size() - 1; i += 2) {
      Value a = (*currTensors)[i];
      Value b = (*currTensors)[i + 1];
      // Insert each add right after its second operand so the new op is
      // dominated by both inputs wherever they were defined.
      rewriter.setInsertionPointAfterValue(b);
      auto addedValue = rewriter.create<spatial::SpatVAddOp>(a.getLoc(), a.getType(), a, b);
      nextTensors->push_back(addedValue);
    }
    // An odd leftover operand is carried into the next round untouched.
    if (currTensors->size() % 2 == 1)
      nextTensors->push_back(currTensors->back());
    std::swap(currTensors, nextTensors);
    nextTensors->clear();
  }
  assert(currTensors->size() == 1 && "Expected a single input at this point.");
  return (*currTensors)[0];
}
|
||||
|
||||
/// Create the elementwise ("map") ONNX op selected by `mapOp`, applied to
/// `input` and producing a result of the same type.
///
/// Fix over the previous version: `MapOperations::None` used to assert and
/// then FALL THROUGH into the Softmax case when asserts are compiled out
/// (NDEBUG), and the function could fall off its end without returning a
/// Value (undefined behavior). Every case now either returns or breaks to a
/// single guarded failure exit.
Value createMapOperation(PatternRewriter& rewriter, MapOperations mapOp, const Value& input) {
  switch (mapOp) {
  case MapOperations::ONNXSoftmaxOp:
    return rewriter.create<ONNXSoftmaxOp>(input.getLoc(), input.getType(), input);
  case MapOperations::ONNXReluOp:
    return rewriter.create<ONNXReluOp>(input.getLoc(), input.getType(), input);
  case MapOperations::ONNXLeakyReluOp:
    return rewriter.create<ONNXLeakyReluOp>(input.getLoc(), input.getType(), input);
  case MapOperations::ONNXExpOp:
    return rewriter.create<ONNXExpOp>(input.getLoc(), input.getType(), input);
  case MapOperations::None:
    break;
  }
  assert(false && "Invalid map operation during map operation creation.");
  // Defined (null) result when asserts are disabled instead of UB.
  return nullptr;
}
|
||||
|
||||
/// Read a two-element integer ArrayAttr into (value1, value2). When the
/// attribute is absent, both default to 1 — the ONNX default for strides,
/// dilations and kernel-shape pairs.
void unpackOptionalPairVector(std::optional<mlir::ArrayAttr> valuesArray, size_t& value1, size_t& value2) {
  if (!valuesArray.has_value()) {
    value1 = value2 = 1;
    return;
  }
  ArrayRef<Attribute> pair = valuesArray->getValue();
  value1 = mlir::cast<IntegerAttr>(pair[0]).getInt();
  value2 = mlir::cast<IntegerAttr>(pair[1]).getInt();
}
|
||||
|
||||
std::optional<llvm::Twine>
|
||||
unpackOptionalPadsVector(std::optional<mlir::ArrayAttr> valuesArray, size_t& pad_x, size_t& pad_y) {
|
||||
if (valuesArray.has_value()) {
|
||||
auto pads = mlir::ArrayAttr(*valuesArray);
|
||||
if (pads.size() != 4)
|
||||
return "pads must have 4 elements.";
|
||||
|
||||
pad_x = cast<IntegerAttr>(pads[2]).getInt();
|
||||
pad_y = cast<IntegerAttr>(pads[3]).getInt();
|
||||
}
|
||||
else {
|
||||
// Default padding is 0 unless specified otherwise.
|
||||
// https://onnx.ai/onnx/operators/onnx__Conv.html
|
||||
pad_x = pad_y = 0;
|
||||
}
|
||||
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
/// Cut an NCHW image tensor into per-pixel, per-channel-group slices and
/// store them into tiles[channelGroup][x][y]. Every slice is
/// 1 x tileSize x 1 x 1, except slices of the last channel group, which
/// shrink to the remainder channels when C is not a multiple of tileSize.
void tileImageTensorByChannel(Value imageTensor,
    SmallVector<SmallVector<SmallVector<Value>>>& tiles,
    size_t tileSize,
    ConversionPatternRewriter& rewriter) {
  ShapedType imgType = mlir::cast<ShapedType>(imageTensor.getType());

  const size_t height = GET_IMAGE_HEIGHT(imgType);
  const size_t width = GET_IMAGE_WIDTH(imgType);
  const size_t groupCount = ceilIntegerDivide(GET_IMAGE_CHANNEL(imgType), tileSize);
  const size_t groupRest = GET_IMAGE_CHANNEL(imgType) % tileSize;

  Location loc = imageTensor.getLoc();

  SmallVector<OpFoldResult> strides(4, rewriter.getIndexAttr(1));
  SmallVector<OpFoldResult> offsets(4, rewriter.getIndexAttr(0));
  SmallVector<OpFoldResult> sizes = {rewriter.getIndexAttr(1),
      rewriter.getIndexAttr(tileSize), rewriter.getIndexAttr(1),
      rewriter.getIndexAttr(1)};

  for (size_t group = 0; group < groupCount; group++) {
    // The final group covers only the leftover channels.
    if (group == groupCount - 1 && groupRest != 0)
      sizes[1] = rewriter.getIndexAttr(groupRest);
    offsets[1] = rewriter.getIndexAttr(group * tileSize);
    for (size_t x = 0; x < width; x++) {
      offsets[2] = rewriter.getIndexAttr(x);
      for (size_t y = 0; y < height; y++) {
        offsets[3] = rewriter.getIndexAttr(y);
        tiles[group][x][y] = rewriter.create<tensor::ExtractSliceOp>(
            loc, imageTensor, offsets, sizes, strides);
      }
    }
  }
}
|
||||
|
||||
/// Flatten the 3-D tile grid (outputTiles[channelTile][x][y]) into the flat
/// operand list expected by spat.img_concat and create that op with the
/// given full-image result type.
Value createImgConcatOp(SmallVector<SmallVector<SmallVector<Value>>>& outputTiles,
    ConversionPatternRewriter& rewriter,
    Location& loc,
    Type outputType) {
  // Populate the outputTiles for the concat in the given order:
  // 1. Start top left pixel
  // 2. Continue on its right pixel till the end of the row
  // 3. Restart on the next row
  size_t outputTileCount = outputTiles.size();
  size_t output_w = outputTiles[0].size();
  size_t output_h = outputTiles[0][0].size();
  SmallVector<Value> tilesToConcat;
  tilesToConcat.reserve(output_h * output_w * outputTileCount * crossbarSize);
  // NOTE(review): outX runs to output_h and outY to output_w although the
  // grid is indexed [tile][w][h] elsewhere — this is only safe for square
  // outputs; confirm the intended w/h iteration order.
  for (size_t outX = 0; outX < output_h; outX++)
    for (size_t outY = 0; outY < output_w; outY++)
      for (size_t outTile = 0; outTile < outputTileCount; outTile++)
        tilesToConcat.push_back(outputTiles[outTile][outX][outY]);

  return rewriter.create<spatial::SpatImgConcatOp>(loc, outputType, tilesToConcat);
}
|
||||
|
||||
/// Decide whether the pixel coordinate (inX, inY) lies inside the real
/// image. Coordinates that fall into the surrounding padding region yield
/// failure(); the asserts catch coordinates outside even the padded extent,
/// which indicates a caller bug.
LogicalResult
verifyWithinBoundsAndPaddings(size_t input_w, size_t input_h, int inX, int inY, size_t pad_x, size_t pad_y) {

  // Negative coordinates are legal only while they stay inside the padding.
  if (inX < 0) {
    assert((size_t) (-inX) <= pad_x && "verifyWithinBoundsAndPaddings: Negative x value out of padding");
    return failure();
  }
  if (inY < 0) {
    assert((size_t) (-inY) <= pad_y && "verifyWithinBoundsAndPaddings: Negative y value out of padding");
    return failure();
  }

  // Coordinates past the image edge must still fall in the padded extent.
  const bool pastWidth = (size_t) inX >= input_w;
  const bool pastHeight = (size_t) inY >= input_h;
  if (pastWidth || pastHeight) {
    assert((size_t) inX < input_w + pad_x && "verifyWithinBoundsAndPaddings: Positive x out of bounds");
    assert((size_t) inY < input_h + pad_y && "verifyWithinBoundsAndPaddings: Positive y out of bounds");
    return failure();
  }

  return success();
}
|
||||
|
||||
/// Build a tensor.extract_slice selecting the (x, y) pixel of channel group
/// `t` from an NCHW image value. The slice is 1 x crossbarSize x 1 x 1,
/// except for the last channel group which narrows to `channelTileRest`
/// channels when the channel count is not a multiple of crossbarSize.
/// (input_w / input_h are unused here but kept for a uniform tiling-helper
/// signature.)
Value createExtractSliceImg(Value valToSlice,
    size_t x,
    size_t y,
    size_t t,
    size_t channelTileCount,
    size_t channelTileRest,
    size_t input_w,
    size_t input_h,
    PatternRewriter& rewriter) {
  const bool lastPartialGroup = (t == channelTileCount - 1) && channelTileRest != 0;
  const int64_t channelSpan =
      lastPartialGroup ? (int64_t) channelTileRest : (int64_t) crossbarSize;

  SmallVector<OpFoldResult> strides(4, rewriter.getIndexAttr(1));
  SmallVector<OpFoldResult> offsets = {rewriter.getIndexAttr(0),
      rewriter.getIndexAttr(t * crossbarSize), rewriter.getIndexAttr(x),
      rewriter.getIndexAttr(y)};
  SmallVector<OpFoldResult> sizes = {rewriter.getIndexAttr(1),
      rewriter.getIndexAttr(channelSpan), rewriter.getIndexAttr(1),
      rewriter.getIndexAttr(1)};

  return rewriter.create<tensor::ExtractSliceOp>(
      valToSlice.getLoc(), valToSlice, offsets, sizes, strides);
}
|
||||
|
||||
/// Resolve the value that produces the (x, y) pixel of channel group `t`
/// within the (possibly concatenated / remapped) image value `v`.
///
/// Walks through the producer of `v`:
///  - block arguments (no defining op): materialize a tensor.extract_slice;
///  - spat.weighted_compute / spat.channel_receive results: returned as-is
///    (the caller resolves them further); x/y/t must already be zero;
///  - spat.img_concat: recurse into the exact input tile;
///  - tensor.concat on the channel axis: locate the input covering the
///    requested channel offset and recurse into it.
///
/// Fix over the previous version: the function could fall off its end after
/// `assert(false)` when asserts are compiled out (NDEBUG), which is
/// undefined behavior for a Value-returning function; it now returns a null
/// Value on the unsupported path.
Value indexImgValue(Value v,
    size_t x,
    size_t y,
    size_t t,
    size_t channelTileCount,
    size_t channelTileRest,
    size_t input_w,
    size_t input_h,
    ConversionPatternRewriter& rewriter) {

  // Follow any pending conversion-time remapping of the value first.
  auto newV = rewriter.getRemappedValue(v);
  if (newV)
    v = newV;

  // Block arguments (e.g. func.func operands) have no producer: slice the
  // requested tile directly out of the tensor.
  if (!v.getDefiningOp())
    return createExtractSliceImg(v, x, y, t, channelTileCount, channelTileRest, input_w, input_h, rewriter);

  if (auto computeOp = v.getDefiningOp<spatial::SpatWeightedCompute>()) {
    // We found the computeOp that produces the tile we want, just return this
    // value.
    // TODO: Should we assert that x,y,t are zero?
    assert(x == 0 && y == 0 && t == 0 && "indexImgValue: WeightedComputeOp tile indeces should be zero");
    return v;
  }

  if (auto receiveOp = v.getDefiningOp<spatial::SpatChannelReceiveOp>()) {
    // This is a receiveOp, just return its value which will be resolved later
    assert(x == 0 && y == 0 && t == 0 && "indexImgValue: receiveOp tile indeces should be zero");
    return v;
  }

  if (auto imgConcatOp = v.getDefiningOp<spatial::SpatImgConcatOp>()) {
    auto imgConcatInput = imgConcatOp.getInputTile(x, y, t);
    // TODO: Is this correct?
    // Above we already index exactly the tile we want, so `x=y=t=0` in
    // recursive call
    return indexImgValue(imgConcatInput, 0, 0, 0, channelTileCount, channelTileRest, input_w, input_h, rewriter);
  }

  if (auto tensorConcatOp = v.getDefiningOp<tensor::ConcatOp>()) {
    // This can be recursive.
    // First, get the input tensors of the tensor.concatOp
    // Then, find the input tensor that contains the tile we want
    // Finally, recursive call asking for the tile
    auto concatAxis = tensorConcatOp.getDim();
    assert(concatAxis != 0 && "Expecting to concat on channel/x/y axis");
    assert(concatAxis == 1 && "TODO: Make sure this works and makes sense for other axis.");
    // Absolute NCHW index of the requested tile's first element.
    SmallVector<size_t, 4> indexDims = {1, t * crossbarSize, x, y};

    // Find the input tensor that contains the tile we want
    size_t currentTile = 0;
    for (auto concatInput : tensorConcatOp.getInputs()) {
      auto concatInputShape = cast<ShapedType>(concatInput.getType());
      assert(concatInputShape.getRank() == 4 && "Expecting an image tensor");
      auto concatInputSizeOnAxis = concatInputShape.getDimSize(concatAxis);

      if (currentTile + concatInputSizeOnAxis > indexDims[concatAxis]) {
        // This input tensor contains the tile we want
        indexDims[concatAxis] -= currentTile;
        if (indexDims[1] % crossbarSize != 0) {
          assert(ignoreConcatError
              && "TODO: Handle non-tile aligned tensor, or set "
                 "--ignore-concat-error=true");
        }
        return indexImgValue(concatInput,
            indexDims[2],
            indexDims[3],
            indexDims[1] / crossbarSize,
            channelTileCount,
            channelTileRest,
            input_w,
            input_h,
            rewriter);
      }
      currentTile += concatInputSizeOnAxis;
    }

    assert(false
        && "Could not find the input tensor that contains the tile "
           "within tensor.ConcatOp");
  }

  v.dump();

  assert(false && "indexImgValue: unsupported operation");
  // Defined (null) result when asserts are disabled instead of UB.
  return nullptr;
}
|
||||
|
||||
// Slices `wholeInputTensor` into single-pixel, crossbarSize-channel tiles via
// tensor.extract_slice and stores each one in inputTiles[channelTile][x][y].
// Used when the input is a block argument (no producer op to trace through).
void resolveInputTensorTilesBlockArg(Value wholeInputTensor,
    SmallVector<SmallVector<SmallVector<Value>>>& inputTiles,
    size_t channelTileCount,
    size_t channelTileRest,
    size_t input_w,
    size_t input_h,
    PatternRewriter& rewriter) {
  Location loc = wholeInputTensor.getLoc();
  const OpFoldResult one = rewriter.getIndexAttr(1);
  const OpFoldResult zero = rewriter.getIndexAttr(0);

  SmallVector<OpFoldResult> strides(4, one);
  SmallVector<OpFoldResult> offsets(4, zero);
  // Each slice covers one (x, y) pixel across (up to) crossbarSize channels.
  SmallVector<OpFoldResult> sizes = {one, rewriter.getIndexAttr(crossbarSize), one, one};

  for (size_t tile = 0; tile < channelTileCount; ++tile) {
    // The final channel tile may be narrower when the channel count is not a
    // multiple of crossbarSize (channelTileRest == 0 means tiles fit exactly).
    if (tile == channelTileCount - 1 && channelTileRest != 0)
      sizes[1] = rewriter.getIndexAttr(channelTileRest);
    // Channel offset only depends on the tile index, so set it once per tile.
    offsets[1] = rewriter.getIndexAttr(tile * crossbarSize);
    for (size_t col = 0; col < input_w; ++col) {
      offsets[2] = rewriter.getIndexAttr(col);
      for (size_t row = 0; row < input_h; ++row) {
        offsets[3] = rewriter.getIndexAttr(row);
        inputTiles[tile][col][row] =
            rewriter.create<tensor::ExtractSliceOp>(loc, wholeInputTensor, offsets, sizes, strides);
      }
    }
  }
}
|
||||
|
||||
// Resolves every (channelTile, x, y) tile of `wholeInputTensor` by delegating
// to indexImgValue, which traces through the tensor's producer chain
// (extract_slice, compute ops, concats, ...). Results land in
// inputTiles[channelTile][x][y].
std::optional<Twine> resolveImgInputTiles(Value wholeInputTensor,
    SmallVector<SmallVector<SmallVector<Value>>>& inputTiles,
    size_t channelTileCount,
    size_t channelTileRest,
    size_t input_w,
    size_t input_h,
    ConversionPatternRewriter& rewriter) {
  for (size_t tile = 0; tile < channelTileCount; ++tile)
    for (size_t col = 0; col < input_w; ++col)
      for (size_t row = 0; row < input_h; ++row)
        inputTiles[tile][col][row] = indexImgValue(
            wholeInputTensor, col, row, tile, channelTileCount, channelTileRest, input_w, input_h, rewriter);

  // This resolver has no failure mode; the optional return matches sibling
  // resolvers that can report an error message.
  return std::nullopt;
}
|
||||
|
||||
// Resolves the per-core input tiles of a flatten-like reshape (4D image ->
// 2D vector). Each flattened element i is traced back to the single pixel of
// the 4D input that produced it, reshaped into a 1xC vector, and appended to
// inputTiles[i / crossbarCountInCore].
//
// @param inputTiles Output: per-core lists of 1xC vector tiles.
// @param inputTilesCount Total number of flattened elements to resolve.
// @param lastInputTileDimension Size of the last (partial) channel tile,
//        forwarded to indexImgValue.
// @param inputShape Shape of the 4D reshape input.
// @param outputShape Shape of the 2D reshape output.
// @param reshapeInput The 4D value feeding the reshape.
// @return failure (as a match failure) for unsupported ranks, success otherwise.
LogicalResult handleFlattenLikeOp(SmallVector<SmallVector<Value>>& inputTiles,
    const size_t inputTilesCount,
    const size_t lastInputTileDimension,
    TensorType inputShape,
    TensorType outputShape,
    Value reshapeInput,
    ConversionPatternRewriter& rewriter) {
  // Only support reshape between an image and a vector (i.e. flatten)
  if (inputShape.getRank() != 4 || outputShape.getRank() != 2) {
    return rewriter.notifyMatchFailure(reshapeInput.getDefiningOp(),
        "resolveVecInputTiles only supports reshapes from 4D to 2D tensors");
  }

  /*
   * From a 4D tensor <N, C, W, H> to a 2D tensor <N, C*H*W>
   */
  // NOTE(review): here H is taken from dim 2 and W from dim 3, which is the
  // opposite of the GET_IMAGE_WIDTH (dim 2) / GET_IMAGE_HEIGHT (dim 3) macros
  // in ONNXToSpatialCommon.hpp — confirm which labeling is intended.
  auto N = inputShape.getDimSize(0);
  auto C = inputShape.getDimSize(1);
  auto H = inputShape.getDimSize(2);
  auto W = inputShape.getDimSize(3);
  assert(N == 1 && "Only support N = 1 for image tensors");

  for (size_t i = 0; i < inputTilesCount; i++) {
    // Invert row-major flattening: recover the (c, w, h) coordinate of
    // flattened element i.
    auto c = (i / (H * W)) % C;
    // TODO: Is this correct? Or should I invert h and w?
    auto w = (i / H) % W;
    auto h = i % H;

    // Trace the single pixel (c, w, h) back through the producer chain.
    Value curTile = indexImgValue(reshapeInput, w, h, c, inputTilesCount, lastInputTileDimension, W, H, rewriter);

    // Assert the shape of the tile, and reshape it
    auto curTileShape = cast<TensorType>(curTile.getType());
    assert(curTileShape.getRank() == 4 && "We just reshaped an image tensor, why rank != 4?");
    assert(curTileShape.getDimSize(0) == 1 && "We just reshaped an image tensor with N = 1, why is it now != 1?");
    assert(curTileShape.getDimSize(2) == 1 && "We should have just looked up a single pixel why W != 1?");
    assert(curTileShape.getDimSize(3) == 1 && "We should have just looked up a single pixel why H != 1?");

    // Reshape this pixel tensor into a vector, for compatibility with the
    // rest
    // tosa.reshape takes the target shape as a 1-D i64 constant tensor.
    SmallVector<int64_t> newShapeVals = {curTileShape.getDimSize(0), curTileShape.getDimSize(1)};
    auto shapeType = RankedTensorType::get({static_cast<int64_t>(newShapeVals.size())}, rewriter.getI64Type());
    Value shapeTensor =
        rewriter.create<arith::ConstantOp>(reshapeInput.getLoc(), DenseIntElementsAttr::get(shapeType, newShapeVals));
    auto reshapedType = RankedTensorType::get(newShapeVals, curTileShape.getElementType());
    auto reshapedCurTile = tosa::ReshapeOp::create(rewriter, reshapeInput.getLoc(), reshapedType, curTile, shapeTensor);

    // Consecutive flattened elements are assigned to the same core until its
    // crossbars are filled.
    size_t coreIndex = i / crossbarCountInCore;
    inputTiles[coreIndex].push_back(reshapedCurTile);
  }

  return success();
}
|
||||
|
||||
/**
 * Computes the half-open [start, end) range of input positions touched by the
 * kernel application producing output element `out_pos`, clipped to the valid
 * (unpadded) input region.
 *
 * @param out_pos Position of the output element.
 * @param input_width Width of the input image.
 * @param krn_width Width of the kernel.
 * @param stride Convolution stride.
 * @param dilation Kernel dilation.
 * @param pad Padding on this axis.
 * @return {start, end} positions of the kernel application.
 */
std::pair<size_t, size_t> kernel_get_start_and_end(
    int64_t out_pos, int64_t input_width, int64_t krn_width, int64_t stride, int64_t dilation, int64_t pad) {
  // First in-bounds position hit by a dilated kernel tap: the smallest
  // multiple of `dilation` that is >= pad, shifted back by pad. Computed with
  // exact integer ceil-division instead of the previous float-based
  // std::ceil, which narrowed to float and could round wrongly for large
  // pad/dilation values.
  int64_t firstValid = ((pad + dilation - 1) / dilation) * dilation - pad;
  int64_t start = std::max(firstValid, out_pos * stride - pad);
  // One past the last input position covered by this kernel application.
  int64_t end = std::min(input_width, out_pos * stride + (krn_width - 1) * dilation + 1 - pad);

  assert(start >= 0 && "Start position must be non-negative.");
  assert(end >= 0 && "End position must be non-negative.");
  return std::make_pair(start, end);
}
|
||||
|
||||
// Bumps the `inputs` entry of the WeightedCompute op's operandSegmentSizes by
// `increment`. Required whenever operands are appended to the op
// programmatically, so the segment bookkeeping stays consistent.
void incrementWeightedComputeInputsSegmentSize(spatial::SpatWeightedCompute wcomputeOp, int increment) {
  auto segmentsAttrName = wcomputeOp.getOperandSegmentSizesAttrName();
  auto oldSizes = wcomputeOp->getAttrOfType<DenseI32ArrayAttr>(segmentsAttrName);

  // Only the second segment (the `inputs` operand group) grows; the first is
  // carried over unchanged.
  wcomputeOp->setAttr(segmentsAttrName,
      DenseI32ArrayAttr::get(wcomputeOp->getContext(), {oldSizes[0], oldSizes[1] + increment}));
}
|
||||
|
||||
// Returns the index of the result of `op` that is exactly `v`. Asserts (and
// returns -1 in release builds) when `v` is not produced by `op`.
int getResultIndex(Operation* op, Value v) {
  for (OpResult result : op->getResults())
    if (result == v)
      return result.getResultNumber();

  assert(false && "Value not found in given operation's results.");
  return -1;
}
|
||||
|
||||
}; // namespace onnx_mlir
|
||||
262
src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp
Normal file
262
src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp
Normal file
@@ -0,0 +1,262 @@
|
||||
#pragma once
|
||||
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/IR/BuiltinTypes.h"
|
||||
#include "mlir/IR/Operation.h"
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
#include "mlir/Support/LLVM.h"
|
||||
#include "mlir/Transforms/DialectConversion.h"
|
||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||
|
||||
#include "llvm/Support/LogicalResult.h"
|
||||
|
||||
#define DEFINE_MAP_OP(opname) opname,
|
||||
|
||||
#define GET_IMAGE_WIDTH(shapedType) shapedType.getDimSize(2)
|
||||
#define GET_IMAGE_HEIGHT(shapedType) shapedType.getDimSize(3)
|
||||
#define GET_IMAGE_CHANNEL(shapedType) shapedType.getDimSize(1)
|
||||
#define GET_IMAGE_N(shapedType) shapedType.getDimSize(0)
|
||||
#define GET_KERNEL_WIDTH(shapedType) shapedType.getDimSize(2)
|
||||
#define GET_KERNEL_HEIGHT(shapedType) shapedType.getDimSize(3)
|
||||
#define GET_FILTER_COUNT(shapedType) shapedType.getDimSize(0)
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
const StringRef REPLICATION_ATTR_NAME = "replication_factor";
|
||||
|
||||
using HSliceId = size_t;
|
||||
using CoreId = size_t;
|
||||
|
||||
// Elementwise activation/map operations supported by createMapOperation.
// `None` indicates that no map operation is applied.
enum class MapOperations {
  None,
  ONNXSoftmaxOp,
  ONNXReluOp,
  ONNXLeakyReluOp,
  ONNXExpOp
};
|
||||
|
||||
/**
 * Integer ceiling division: the smallest integer >= a / b.
 *
 * Implemented as quotient plus a correction for a nonzero remainder. The
 * previous `1 + (a - 1) / b` form returned 1 for a == 0 (and underflowed for
 * unsigned a == 0); this form yields the correct 0 and is also exact for
 * negative operands.
 */
template <class A, class B, class C = std::common_type_t<A, B>>
constexpr C ceilIntegerDivide(A a, B b) {
  static_assert(std::is_integral_v<A>, "A must be an integer type");
  static_assert(std::is_integral_v<B>, "B must be an integer type");
  C ac = static_cast<C>(a);
  C bc = static_cast<C>(b);
  // Round up only when the truncated quotient lost a fractional part in the
  // positive direction (same-signed operands with a remainder).
  return ac / bc + static_cast<C>((ac % bc != 0) && ((ac < 0) == (bc < 0)));
}
|
||||
|
||||
/**
 * Returns {ceil(a / b), a % b}.
 *
 * The quotient is computed with exact integer arithmetic so that a == 0
 * yields {0, 0}; the previously used `1 + (a - 1) / b` ceiling was wrong for
 * a == 0 and underflowed for unsigned zero.
 */
template <class A, class B, class C = std::common_type_t<A, B>>
constexpr std::pair<C, C> ceilIntegerDivideWithRemainder(A a, B b) {
  static_assert(std::is_integral_v<A>, "A must be an integer type");
  static_assert(std::is_integral_v<B>, "B must be an integer type");
  C ac = static_cast<C>(a);
  C bc = static_cast<C>(b);
  C rem = ac % bc;
  // Round the truncated quotient up when a fractional part was lost in the
  // positive direction (same-signed operands with a nonzero remainder).
  C ceilQuot = ac / bc + static_cast<C>(rem != 0 && ((ac < 0) == (bc < 0)));
  return {ceilQuot, rem};
}
|
||||
|
||||
template <class T>
|
||||
bool isVectorShape(const ArrayRef<T> shape) {
|
||||
return shape.size() == 2 && (shape[0] == 1 || shape[1] == 1);
|
||||
}
|
||||
|
||||
// True iff `shape` has exactly two dimensions (any sizes).
template <class T>
bool isMatrixShape(const ArrayRef<T> shape) {
  return shape.size() == 2;
}
|
||||
|
||||
template <class T>
|
||||
bool isHVectorShape(const ArrayRef<T> shape) {
|
||||
return shape.size() == 2 && shape[0] == 1;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
bool isVVectorShape(const ArrayRef<T> shape) {
|
||||
return shape.size() == 2 && shape[1] == 1;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
T getVectorLength(const ArrayRef<T> shape) {
|
||||
assert(isVectorShape(shape));
|
||||
return shape[0] != 1 ? shape[0] : shape[1];
|
||||
}
|
||||
|
||||
// Shape of a value whose type is a RankedTensorType (asserts via cast otherwise).
inline auto getTensorShape(const Value tensor) { return cast<RankedTensorType>(tensor.getType()).getShape(); }
|
||||
|
||||
SmallVector<Value> sliceTensor(
|
||||
const Value& tensorToSlice, size_t axis, int64_t sliceSize, ConversionPatternRewriter& rewriter, Location loc);
|
||||
|
||||
SmallVector<Value>
|
||||
sliceVector(const Value& vectorToSlice, int64_t sliceSize, ConversionPatternRewriter& rewriter, Location loc);
|
||||
|
||||
DenseMap<CoreId, SmallVector<Value>>
|
||||
sliceVectorPerCrossbarPerCore(const Value& vectorToSlice, ConversionPatternRewriter& rewriter, Location loc);
|
||||
|
||||
DenseMap<HSliceId, DenseMap<CoreId, SmallVector<Value>>> tileMatrix(
|
||||
Value& matrixToTile, int64_t hSliceSize, int64_t vSliceSize, ConversionPatternRewriter& rewriter, Location& loc);
|
||||
|
||||
tensor::SplatOp
|
||||
broadcastToVector(Value scalarToBroadcast, int64_t length, ConversionPatternRewriter& rewriter, Location loc);
|
||||
|
||||
Value sumTensors(ArrayRef<Value> tensors, ConversionPatternRewriter& rewriter);
|
||||
|
||||
Value createMapOperation(PatternRewriter& rewriter, MapOperations mapOp, const Value& input);
|
||||
|
||||
/**
|
||||
* Unpacks an optional pair vector into two size_t values.
|
||||
*
|
||||
* @param valuesArray The optional `mlir::ArrayAttr` containing the pair of
|
||||
* values.
|
||||
* @param value1 The reference to the first `size_t` variable to store the
|
||||
* unpacked value.
|
||||
* @param value2 The reference to the second `size_t` variable to store the
|
||||
* unpacked value.
|
||||
*/
|
||||
void unpackOptionalPairVector(std::optional<mlir::ArrayAttr> valuesArray, size_t& value1, size_t& value2);
|
||||
|
||||
/**
|
||||
* Unpacks the optional pads vector.
|
||||
*
|
||||
* @param valuesArray The optional array attribute containing the values.
|
||||
* @param pad_x The output variable to store the value of pad_x.
|
||||
* @param pad_y The output variable to store the value of pad_y.
|
||||
* @param rewriter The rewriter to notify failure
|
||||
*
|
||||
* @return llvm::Optional<llvm::Twine> The error message if the pads are invalid
|
||||
*/
|
||||
std::optional<Twine> unpackOptionalPadsVector(std::optional<mlir::ArrayAttr> valuesArray, size_t& pad_x, size_t& pad_y);
|
||||
|
||||
/**
|
||||
* Tiles the image tensor by channel.
|
||||
*
|
||||
* This function takes an image tensor and tiles it into smaller tiles based on
|
||||
* the channel dimension. The size of each tile is specified by the tileSize
|
||||
* parameter.
|
||||
*
|
||||
* @param imageTensor The input image tensor (NxCxWxH) to be tiled.
|
||||
* @param tiles The output tiles vector to store the tiled image tensors.
|
||||
* @param tileSize The size of each tile.
|
||||
* @param rewriter The ConversionPatternRewriter used for creating operations.
|
||||
*/
|
||||
void tileImageTensorByChannel(Value imageTensor,
|
||||
SmallVector<SmallVector<SmallVector<Value>>>& tiles,
|
||||
size_t tileSize,
|
||||
ConversionPatternRewriter& rewriter);
|
||||
|
||||
/**
|
||||
* Creates an ImgConcatOp based on the given tiles.
|
||||
*
|
||||
* This function takes a 3-dimensional vector `outputTiles` representing the
|
||||
* tiles to concatenate. The tiles are indexed by [tile][x][y].
|
||||
*
|
||||
* @param outputTiles The tiles to concatenate.
|
||||
* @param rewriter The ConversionPatternRewriter used for creating the
|
||||
* ImgConcatOp.
|
||||
* @param loc The location of the operation.
|
||||
* @param outputType The type of the output tensor.
|
||||
*
|
||||
* @return The created ImgConcatOp.
|
||||
*/
|
||||
Value createImgConcatOp(SmallVector<SmallVector<SmallVector<Value>>>& outputTiles,
|
||||
ConversionPatternRewriter& rewriter,
|
||||
Location& loc,
|
||||
Type outputType);
|
||||
|
||||
/**
|
||||
* @brief Verifies if the given input coordinates and padding values are within
|
||||
* the bounds of the input tensor.
|
||||
*
|
||||
* @param input_w The width of the input tensor.
|
||||
* @param input_h The height of the input tensor.
|
||||
* @param inX The X-coordinate of the input.
|
||||
* @param inY The Y-coordinate of the input.
|
||||
* @param pad_x The padding value in the X-direction.
|
||||
* @param pad_y The padding value in the Y-direction.
|
||||
* @return LogicalResult Returns success if the coordinates and padding are
|
||||
* within bounds, failure otherwise.
|
||||
*/
|
||||
LogicalResult
|
||||
verifyWithinBoundsAndPaddings(size_t input_w, size_t input_h, int inX, int inY, size_t pad_x, size_t pad_y);
|
||||
|
||||
/**
|
||||
* Resolves the tiling of the input tensor into smaller tiles.
|
||||
*
|
||||
* This function takes a whole input tensor and tiles it into smaller tiles
|
||||
* using the provided parameters. The resulting tiles are stored in the
|
||||
* `inputTiles` vector.
|
||||
* Input tiles need to be indexed by:
|
||||
* a. Channel Tile
|
||||
* b. Pixel `x` position
|
||||
* c. Pixel `y` position
|
||||
* For example: inputTiles[channelTile][x][y]
|
||||
*
|
||||
* @param wholeInputTensor The whole input tensor to be tiled.
|
||||
* @param inputTiles A vector of vectors of vectors of Values representing the
|
||||
* tiles of the input tensor. The outermost vector represents
|
||||
* the channels, the middle vector represents the rows, and
|
||||
* the innermost vector represents the columns of the tiles.
|
||||
* @param channelTileCount The number of tiles for the `channel` axis.
|
||||
* @param channelTileRest The size of the last channelTile. Set as 0 if tiles
|
||||
* fit exactly
|
||||
* @param input_w The width of the input tensor.
|
||||
* @param input_h The height of the input tensor.
|
||||
* @param rewriter The ConversionPatternRewriter used for creating operations.
|
||||
*
|
||||
* @return std::optional<llvm::Twine> An error message if the input tensor could
|
||||
* not be resolved into tiles.
|
||||
*/
|
||||
std::optional<Twine> resolveImgInputTiles(Value wholeInputTensor,
|
||||
SmallVector<SmallVector<SmallVector<Value>>>& inputTiles,
|
||||
size_t channelTileCount,
|
||||
size_t channelTileRest,
|
||||
size_t input_w,
|
||||
size_t input_h,
|
||||
mlir::ConversionPatternRewriter& rewriter);
|
||||
|
||||
/**
|
||||
* Computes the boundaries of an image kernel application.
|
||||
*
|
||||
* @param out_pos The position of the output element.
|
||||
* @param input_width The width of the input image.
|
||||
* @param krn_width The width of the kernel.
|
||||
* @param stride The stride value.
|
||||
* @param dilation The dilation value.
|
||||
* @param pad The padding value.
|
||||
* @return A pair of size_t values representing the start and end positions of
|
||||
* the kernel application.
|
||||
*/
|
||||
std::pair<size_t, size_t> kernel_get_start_and_end(
|
||||
int64_t out_pos, int64_t input_width, int64_t krn_width, int64_t stride, int64_t dilation, int64_t pad);
|
||||
|
||||
/**
|
||||
* @brief Increment the `operandSegmentSizes` in the WeightedCompute operation
|
||||
* for the `inputs` operand.
|
||||
*
|
||||
* This function increments the size of the `inputs` operand segment in the
|
||||
* `operandSegmentSizes` of the given WeightedCompute operation by the specified
|
||||
* increment. This is necessary when new operands are programmatically added to
|
||||
* the WeightedCompute operation.
|
||||
*
|
||||
* @param wcomputeOp The WeightedCompute operation whose `operandSegmentSizes`
|
||||
* is to be incremented.
|
||||
* @param increment The value by which to increment the `inputs` operand segment
|
||||
* size.
|
||||
*/
|
||||
void incrementWeightedComputeInputsSegmentSize(spatial::SpatWeightedCompute wcomputeOp, int increment);
|
||||
|
||||
/**
|
||||
* @brief Finds the result index of the given operation that produces the
|
||||
* specified value.
|
||||
*
|
||||
* This function takes an operation and a value, and returns the index of the
|
||||
* result of the operation that corresponds to the given value.
|
||||
*
|
||||
* @param op Operation whose result index is to be found.
|
||||
* @param v The value for which the result index is to be determined.
|
||||
* @return The index of the result of the operation that produces the specified
|
||||
* value.
|
||||
*/
|
||||
int getResultIndex(Operation* op, Value v);
|
||||
|
||||
}; // namespace onnx_mlir
|
||||
131
src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPass.cpp
Normal file
131
src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPass.cpp
Normal file
@@ -0,0 +1,131 @@
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/Dialect/Tosa/IR/TosaOps.h"
|
||||
#include "mlir/Pass/Pass.h"
|
||||
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
|
||||
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/raw_os_ostream.h"
|
||||
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
|
||||
#include "Conversion/ONNXToSpatial/Utils/AnnotateReplication.hpp"
|
||||
#include "ONNXToSpatialPass.hpp"
|
||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPatterns.hpp"
|
||||
#include "src/Accelerators/PIM/Dialect/PIM/PimOps.hpp"
|
||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||
#include "src/Accelerators/PIM/Pass/PimPasses.hpp"
|
||||
#include "src/Compiler/CompilerOptions.hpp"
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
namespace spatial {
|
||||
|
||||
// Drives the full ONNX -> Spatial lowering pipeline:
//   1. greedy pre-lowering fusions (Conv+Add, MatMul(+Add) -> Gemm, ...)
//   2. replication annotation (core-budget analysis)
//   3. dialect conversion of the compute ops to Spatial
//   4. core-count feasibility check
//   5. dead "helper op" cleanup and weight-constant annotation
//   6. debug dump of the resulting module to <output>/dialects/spatial.mlir
void ONNXToSpatialPass::runOnOperation() {
  llvm::dbgs() << "Running ONNXToSpatialLoweringPass\n";

  ModuleOp module = getOperation();
  MLIRContext* ctx = &getContext();

  // Stage 1: normalize/fuse ONNX ops before lowering. These run greedily and
  // a failure here is non-fatal (lowering may still succeed).
  RewritePatternSet mergeActivationPatterns(ctx);
  mergeActivationPatterns.add<onnxToArithConstantOp>(ctx);
  mergeActivationPatterns.add<convAddToConvWithBiasPatternLeft>(ctx);
  mergeActivationPatterns.add<convAddToConvWithBiasPatternRight>(ctx);
  mergeActivationPatterns.add<matMulAddToGemmPattern>(ctx);
  mergeActivationPatterns.add<matMulToGemmPattern>(ctx);
  mergeActivationPatterns.add<removeFlattenSameShapePattern>(ctx);

  if (failed(applyPatternsAndFoldGreedily(module, std::move(mergeActivationPatterns))))
    llvm::dbgs() << "Failed to merge activation patterns, continuing...\n";

  // Stage 2: annotate replication factors on the (single) entry function.
  // NOTE(review): dereferencing begin() assumes the module contains at least
  // one func.func — confirm this invariant holds for all input models.
  IRRewriter rewriter(module);
  func::FuncOp funcOp = *module.getOps<func::FuncOp>().begin();
  if (annotateReplication(funcOp, rewriter).failed()) {
    llvm::dbgs() << "Failed during annotation for replication analysis\n";
    signalPassFailure();
    return;
  }

  // Stage 3: dialect conversion. The compute-heavy ONNX ops listed below must
  // be lowered; everything else in the listed dialects may remain.
  ConversionTarget target(*ctx);
  target.addLegalDialect<ONNXDialect, SpatialDialect, tensor::TensorDialect, arith::ArithDialect, tosa::TosaDialect>();
  target.addIllegalOp<ONNXMatMulOp>();
  target.addIllegalOp<ONNXGemmOp>();
  target.addIllegalOp<ONNXConvOp>();
  target.addIllegalOp<ONNXLRNOp>();
  target.addIllegalOp<ONNXMaxPoolSingleOutOp>();
  target.addIllegalOp<ONNXAveragePoolOp>();
  target.addIllegalOp<ONNXConcatOp>();
  target.addIllegalOp<ONNXSoftmaxOp>();
  target.addIllegalOp<ONNXReduceMeanV13Op>();

  RewritePatternSet patterns(ctx);
  patterns.add<removeLRNPattern>(ctx);

  // Select between the experimental and the default tiling implementations
  // (flag-controlled; the experimental path also maps Gemm through Conv).
  if (useExperimentalConvImpl) {
    populateExperimentalTilingConvOpPattern(patterns, ctx);
    populateExperimentalPoolingTilingPattern(patterns, ctx);
    populateGemmToConvConversionPattern(patterns, ctx);
  }
  else {
    populateTilingConvOpPattern(patterns, ctx);
    populatePoolingTilingPattern(patterns, ctx);
    populateTilingGemmOpPattern(patterns, ctx);
  }

  populateONNXConcatToTensorConcatPattern(patterns, ctx);
  populateReduceMeanConversionPattern(patterns, ctx);

  if (failed(applyPartialConversion(module, target, std::move(patterns)))) {
    signalPassFailure();
    return;
  }

  // Stage 4: count the number of compute ops and check they do not exceed the
  // core count (coresCount == -1 disables the check).
  if (coresCount != -1) {
    int computeOpsCount = 0;
    for (auto& op : funcOp.getFunctionBody().front().getOperations())
      if (isa<spatial::SpatWeightedCompute>(op))
        computeOpsCount++;

    if (computeOpsCount > coresCount) {
      llvm::dbgs() << "Number of compute ops exceeds the core count\n";
      signalPassFailure();
      return;
    }
  }

  // Stage 5: remove trailing "helper ops" i.e. concat,img_concat,reshape
  // whose results went dead during lowering; failure is non-fatal.
  RewritePatternSet removeUnusedHelperOpsPatterns(ctx);
  populateRemoveUnusedHelperOpsPatterns(removeUnusedHelperOpsPatterns, ctx);

  if (failed(applyPatternsAndFoldGreedily(module, std::move(removeUnusedHelperOpsPatterns))))
    llvm::dbgs() << "Failed to remove unused helper ops, continuing...\n";

  annotateWeightsConstants(funcOp);

  // Stage 6: dump to file for debug.
  // NOTE(review): if outputBaseName contains no '/', find_last_of returns
  // npos and the whole name is kept before appending "/dialects" — confirm
  // this is the intended behavior for bare output names.
  std::string outputDir = outputBaseName.substr(0, outputBaseName.find_last_of('/')).append("/dialects");
  std::filesystem::create_directory(outputDir);
  std::fstream file(outputDir + "/spatial.mlir", std::ios::out);
  llvm::raw_os_ostream os(file);
  os << *module;
  os.flush();
  file.close();
}
|
||||
|
||||
// Tags every arith.constant whose users are all SpatWeightedCompute ops with
// the "weightAlways" unit attribute, marking it as pure weight data.
// Note: a constant with no users at all also satisfies all_of and gets tagged.
void ONNXToSpatialPass::annotateWeightsConstants(func::FuncOp funcOp) const {
  MLIRContext* ctx = funcOp.getContext();
  funcOp.walk([&](arith::ConstantOp constantOp) {
    const auto isWeightUser = [](Operation* user) { return isa<SpatWeightedCompute>(user); };
    if (llvm::all_of(constantOp->getUsers(), isWeightUser))
      constantOp->setAttr("weightAlways", UnitAttr::get(ctx));
  });
}
|
||||
|
||||
} // namespace spatial
|
||||
|
||||
} // namespace onnx_mlir
|
||||
34
src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPass.hpp
Normal file
34
src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPass.hpp
Normal file
@@ -0,0 +1,34 @@
|
||||
#pragma once
|
||||
|
||||
#include "mlir/Pass/Pass.h"
|
||||
|
||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
using namespace mlir;
|
||||
extern bool haveSameStaticShape(Value lhs, Value rhs);
|
||||
|
||||
namespace spatial {
|
||||
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatial.hpp.inc"
|
||||
|
||||
// Module pass lowering ONNX ops to the Spatial dialect for the PIM
// accelerator. Registered as `convert-onnx-to-spatial`.
struct ONNXToSpatialPass : PassWrapper<ONNXToSpatialPass, OperationPass<ModuleOp>> {
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ONNXToSpatialPass)
  StringRef getArgument() const override { return "convert-onnx-to-spatial"; }
  StringRef getDescription() const override { return "Lower ONNX ops to Spatial ops."; }

  ONNXToSpatialPass() = default;
  // Copy constructor needed for pass cloning; the pass carries no state worth
  // copying, so the body is intentionally empty.
  ONNXToSpatialPass(const ONNXToSpatialPass& pass) {}

  void runOnOperation() override;

private:
  // Tags constants used exclusively by SpatWeightedCompute ops as weights
  // (definition in ONNXToSpatialPass.cpp).
  void annotateWeightsConstants(func::FuncOp funcOp) const;
};
|
||||
|
||||
} // namespace spatial
|
||||
|
||||
// Factory for the ONNX -> Spatial lowering pass, used by pass registration.
std::unique_ptr<Pass> createONNXToSpatialPass() {
  auto pass = std::make_unique<spatial::ONNXToSpatialPass>();
  return pass;
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
40
src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPatterns.hpp
Normal file
40
src/PIM/Conversion/ONNXToSpatial/ONNXToSpatialPatterns.hpp
Normal file
@@ -0,0 +1,40 @@
|
||||
#pragma once
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
void populateLoweringONNXMatMulOpToSpatialPattern(
|
||||
mlir::RewritePatternSet &patterns, mlir::MLIRContext *ctx);
|
||||
|
||||
void populateTilingGemmOpPattern(
|
||||
mlir::RewritePatternSet &patterns, mlir::MLIRContext *ctx);
|
||||
void populateTilingConvOpPattern(
|
||||
mlir::RewritePatternSet &patterns, mlir::MLIRContext *ctx);
|
||||
|
||||
void populatePoolingTilingPattern(
|
||||
mlir::RewritePatternSet &patterns, mlir::MLIRContext *ctx);
|
||||
|
||||
void populateDistributeReducePattern(
|
||||
mlir::RewritePatternSet &patterns, mlir::MLIRContext *ctx);
|
||||
|
||||
void populateFoldComputePattern(
|
||||
mlir::RewritePatternSet &patterns, mlir::MLIRContext *ctx);
|
||||
|
||||
void populateONNXConcatToTensorConcatPattern(
|
||||
mlir::RewritePatternSet &patterns, mlir::MLIRContext *ctx);
|
||||
|
||||
void populateRemoveUnusedHelperOpsPatterns(
|
||||
mlir::RewritePatternSet &patterns, mlir::MLIRContext *ctx);
|
||||
|
||||
void populateReduceMeanConversionPattern(
|
||||
mlir::RewritePatternSet &patterns, mlir::MLIRContext *ctx);
|
||||
|
||||
// Experimental patterns.
|
||||
void populateExperimentalTilingConvOpPattern(
|
||||
mlir::RewritePatternSet &patterns, mlir::MLIRContext *ctx);
|
||||
void populateGemmToConvConversionPattern(
|
||||
mlir::RewritePatternSet &patterns, mlir::MLIRContext *ctx);
|
||||
void populateExperimentalPoolingTilingPattern(
|
||||
mlir::RewritePatternSet &patterns, mlir::MLIRContext *ctx);
|
||||
|
||||
} // namespace onnx_mlir
|
||||
@@ -0,0 +1,31 @@
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
/// Lowers onnx.Concat directly to tensor.concat, preserving the concat axis.
struct ONNXConcatToTensorConcat : public OpConversionPattern<ONNXConcatOp> {
  ONNXConcatToTensorConcat(MLIRContext *ctx) : OpConversionPattern(ctx) {}

  // Fixed a copy-paste naming error: the matched op is a Concat, not a
  // MaxPool, so the parameter is named accordingly.
  LogicalResult matchAndRewrite(ONNXConcatOp concatOp,
      ONNXConcatOpAdaptor adaptor,
      ConversionPatternRewriter &rewriter) const final {
    auto inputs = adaptor.getInputs();
    int64_t axis = adaptor.getAxis();
    // NOTE(review): ONNX allows negative axes; this assumes `axis` was
    // normalized to a non-negative dim upstream — confirm before relying on it.

    rewriter.replaceOpWithNewOp<tensor::ConcatOp>(concatOp, axis, inputs);

    return success();
  }
};
|
||||
|
||||
// Registers the onnx.Concat -> tensor.concat lowering pattern.
void populateONNXConcatToTensorConcatPattern(
    RewritePatternSet &patterns, MLIRContext *ctx) {
  patterns.add<ONNXConcatToTensorConcat>(ctx);
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
@@ -0,0 +1,34 @@
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
template <typename OpTy, typename OpAdaptorTy>
|
||||
struct RemoveUnusedHelperOps : public OpRewritePattern<OpTy> {
|
||||
RemoveUnusedHelperOps(MLIRContext* ctx)
|
||||
: OpRewritePattern<OpTy>(ctx) {}
|
||||
|
||||
void initialize() { this->setHasBoundedRewriteRecursion(); }
|
||||
|
||||
LogicalResult matchAndRewrite(OpTy op, PatternRewriter& rewriter) const final {
|
||||
if (op.getResult().use_empty()) {
|
||||
rewriter.eraseOp(op);
|
||||
return success();
|
||||
}
|
||||
|
||||
return failure();
|
||||
}
|
||||
};
|
||||
|
||||
// Registers the dead-helper-op cleanup pattern for each helper op kind.
void populateRemoveUnusedHelperOpsPatterns(RewritePatternSet& patterns, MLIRContext* ctx) {
  patterns.add<
      RemoveUnusedHelperOps<tensor::ConcatOp, tensor::ConcatOpAdaptor>,
      RemoveUnusedHelperOps<spatial::SpatImgConcatOp, spatial::SpatImgConcatOpAdaptor>,
      RemoveUnusedHelperOps<ONNXReshapeOp, ONNXReshapeOpAdaptor>>(ctx);
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
119
src/PIM/Conversion/ONNXToSpatial/Utils/AnnotateReplication.cpp
Normal file
119
src/PIM/Conversion/ONNXToSpatial/Utils/AnnotateReplication.cpp
Normal file
@@ -0,0 +1,119 @@
|
||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/AnnotateReplication.hpp"
|
||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||
|
||||
#include <queue>
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
/**
|
||||
* @brief Structure that describes the replication of a convolution operation,
|
||||
* along the image height axis.
|
||||
*/
|
||||
/**
 * @brief Structure that describes the replication of a convolution operation,
 * along the image height axis.
 */
struct ConvReplication {
  ONNXConvOp convOp; // Convolution operation
  size_t input_w; // Width of the input image
  // NOTE(review): the struct doc says replication is on the *height* axis,
  // yet the ordering key below divides input_w (the width) — confirm which
  // axis is actually replicated.
  size_t replicationFactor; // Replication factor on the image height axis
  size_t coresNeededPerReplica; // Number of cores needed for each replica

  // Orders entries by per-replica width (integer division), so a
  // priority_queue surfaces the conv with the largest input_w/replication
  // ratio first.
  friend bool operator<(const ConvReplication& a, const ConvReplication& b) {
    return a.input_w / a.replicationFactor < b.input_w / b.replicationFactor;
  }

  ConvReplication(ONNXConvOp convOp, size_t input_w, size_t replicationFactor, size_t coresNeededPerReplica)
      : convOp(convOp),
        input_w(input_w),
        replicationFactor(replicationFactor),
        coresNeededPerReplica(coresNeededPerReplica) {}
};
|
||||
|
||||
/**
 * @brief Computes and annotates replication factors for convolutions.
 *
 * Pass 1: for every ONNXConvOp and ONNXGemmOp directly in the function body,
 * compute the number of cores one replica needs and accumulate the total into
 * `minimumCores`. Convolutions are additionally queued (see
 * ConvReplication::operator< for the priority).
 * Pass 2: hand out the leftover core budget one replica at a time to the
 * convolution at the top of the priority queue; when a convolution can no
 * longer be replicated it is annotated with REPLICATION_ATTR_NAME.
 * Gemm layers only contribute to the core count and are never annotated
 * (their replication is implicitly 1).
 *
 * @param funcOp   Function whose top-level ops are inspected and annotated.
 * @param rewriter Used only to build the integer attribute.
 * @return failure() when `coresCount` cannot hold even one replica per layer.
 */
LogicalResult annotateReplication(mlir::func::FuncOp funcOp, mlir::IRRewriter& rewriter) {

  if (coresCount == -1) {
    // No core budget configured: no need for annotation, implicitly set
    // replication to 1.
    return success();
  }

  std::priority_queue<struct ConvReplication> convOpsReplicationQueue;

  // Cores required to place exactly one replica of every layer.
  size_t minimumCores = 0;

  // Only ops in the entry block are considered; nested regions are not walked.
  for (auto& op : funcOp.getFunctionBody().begin()->getOperations()) {
    if (auto convOp = dyn_cast<ONNXConvOp>(op)) {
      // Convolution layer

      Value X = convOp.getX(), W = convOp.getW();
      ShapedType xShape = mlir::cast<ShapedType>(X.getType());
      ShapedType wShape = mlir::cast<ShapedType>(W.getType());

      size_t input_w = GET_IMAGE_WIDTH(xShape);
      size_t krn_h = GET_KERNEL_HEIGHT(wShape);
      size_t krn_w = GET_KERNEL_WIDTH(wShape);

      // Channels are tiled to the crossbar size on both the input and the
      // output (filter-count) side.
      size_t inputTileCount = ceilIntegerDivide(GET_IMAGE_CHANNEL(xShape), crossbarSize.getValue());
      size_t outputTileCount = ceilIntegerDivide(wShape.getDimSize(0), crossbarSize.getValue());

      // One crossbar per (kernel position, input tile, output tile) triple.
      auto neededXbars = krn_h * krn_w * inputTileCount * outputTileCount;
      auto neededCores = ceilIntegerDivide(neededXbars, crossbarCountInCore.getValue());

      minimumCores += neededCores;

      // Start every convolution at replication factor 1.
      convOpsReplicationQueue.emplace(convOp, input_w, 1, neededCores);
    }
    else if (auto gemmOp = dyn_cast<ONNXGemmOp>(op)) {
      // Fully connected layer
      auto matrixTensorShape = cast<ShapedType>(gemmOp.getB().getType());
      auto inputSize = matrixTensorShape.getDimSize(0);
      auto outputSize = matrixTensorShape.getDimSize(1);
      if (gemmOp.getTransB())
        std::swap(inputSize, outputSize);

      const size_t inputTilesCount = ceilIntegerDivide(inputSize, crossbarSize.getValue());
      const size_t outputTilesCount = ceilIntegerDivide(outputSize, crossbarSize.getValue());

      // Each output tile is computed by `coresPerOutputTile` cores. The
      // entire input is given to each of these cores.
      const size_t coresPerOutputTile = ceilIntegerDivide(inputTilesCount, crossbarCountInCore.getValue());

      auto neededCores = coresPerOutputTile * outputTilesCount;

      minimumCores += neededCores;
    }
  }

  if (static_cast<size_t>(coresCount) < minimumCores) {
    return funcOp->emitError("Not enough cores for this network: ")
        << minimumCores << " cores needed, but only " << static_cast<size_t>(coresCount) << " available.";
  }

  // Budget left after placing one replica of every layer.
  size_t availableCores = static_cast<size_t>(coresCount) - minimumCores;

  // Consume all the elements in the queue
  while (!convOpsReplicationQueue.empty()) {
    auto convOpReplication = convOpsReplicationQueue.top();
    convOpsReplicationQueue.pop();

    // Check if we can replicate this convolution (e.g. we have enough cores).
    // NOTE(review): the condition budgets coresNeededPerReplica *
    // (replicationFactor + 1), yet only coresNeededPerReplica is subtracted
    // below — this looks like a deliberately conservative headroom check that
    // grows with the factor; confirm it is not meant to be
    // `availableCores >= coresNeededPerReplica`.
    if (availableCores > convOpReplication.coresNeededPerReplica * (convOpReplication.replicationFactor + 1)) {
      // We can replicate this convolution: increment replicationFactor and put
      // back in queue
      availableCores -= convOpReplication.coresNeededPerReplica;
      convOpReplication.replicationFactor++;

      convOpsReplicationQueue.push(convOpReplication);
    }
    else {
      // Cannot replicate this convolution anymore, annotate the operation
      // with the replication factor
      convOpReplication.convOp->setAttr(REPLICATION_ATTR_NAME,
          rewriter.getI64IntegerAttr(convOpReplication.replicationFactor));
    }
  }

  return success();
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
@@ -0,0 +1,11 @@
|
||||
#pragma once
|
||||
|
||||
#include "mlir/Dialect/Func/IR/FuncOps.h"
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
/// Annotates each top-level ONNXConvOp in `funcOp` with a replication-factor
/// attribute, distributing the configured core budget across layers.
/// Returns failure when the budget cannot hold one replica of every layer;
/// succeeds without annotating when no core count is configured.
mlir::LogicalResult annotateReplication(
    mlir::func::FuncOp funcOp, mlir::IRRewriter &rewriter);
|
||||
|
||||
} // namespace onnx_mlir
|
||||
382
src/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.cpp
Normal file
382
src/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.cpp
Normal file
@@ -0,0 +1,382 @@
|
||||
|
||||
#include "SpatialReducer.hpp"
|
||||
#include "mlir/IR/BuiltinAttributes.h"
|
||||
#include "mlir/IR/Value.h"
|
||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include <cassert>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
|
||||
#define GET_COMP(computeOpAndResNum) std::get<0>(computeOpAndResNum)
|
||||
#define GET_RES_NUM(computeOpAndResNum) std::get<1>(computeOpAndResNum)
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
// ComputeOps that were erased and replaced by clones; use-replacement skips
// them (see updateResultsOfCompute).
// NOTE(review): this is a *static* member, so the set is shared by all
// SpatialReducer instances and persists across pass runs — confirm the
// cross-instance sharing is intended.
llvm::SmallPtrSet<Operation *, 16>
    onnx_mlir::SpatialReducer::oldComputeOpsReplaced;
|
||||
|
||||
/// Runs `processFun` on the yielded value identified by `computeOpAndResNum`
/// and, when it produces a new value, appends it to the compute's yield.
/// Returns the result number callers should use from now on.
ResNum SpatialReducer::applyResultProcessing(
    ComputeAndResNum computeOpAndResNum,
    std::function<Value(const Value &)> processFun,
    ConversionPatternRewriter &rewriter) {
  assert(processFun);

  auto [computeOp, resultNum] = computeOpAndResNum;

  auto yieldOp =
      cast<spatial::SpatYieldOp>(computeOp.getBody().front().getTerminator());

  Value original = yieldOp->getOperand(resultNum);
  rewriter.setInsertionPointAfterValue(original);
  Value processed = processFun(original);

  // A process function may legitimately hand back the very same SSA value
  // (e.g. softmax only broadcasts the value through a channel as a side
  // effect). Nothing new needs to be yielded in that case.
  if (processed == original)
    return resultNum;

  // Yield the processed value as an extra operand; its slot becomes the new
  // result number.
  yieldOp->insertOperands(yieldOp->getNumOperands(), processed);
  return yieldOp.getNumOperands() - 1;
}
|
||||
|
||||
/**
 * @brief Reduces the results of several SpatWeightedCompute ops into one.
 *
 * Optionally applies `preprocess` to every input result, merges duplicate
 * entries pointing into the same computeOp, then pairwise-combines the
 * remaining results with `reduce` in a balanced (log-depth) fashion, chaining
 * each pair's result into the later computeOp of the pair. Optionally applies
 * `postprocess` to the final value.
 *
 * Cross-compute rewiring is not applied immediately: it is recorded in
 * `reducerChanges` and materialized later by finalizeReduceUpdates(), because
 * changing a computeOp's result count requires cloning it, which would
 * invalidate <computeOp, resultNum> handles already given out.
 *
 * @param computeOpsAndResNum Non-empty list of <computeOp, yield-result-index>
 *                            handles; modified in place. (Assumes at least one
 *                            entry — an empty list would index out of bounds.)
 * @param reduce      Binary combiner applied pairwise (required).
 * @param preprocess  Optional per-input transformation.
 * @param postprocess Optional transformation of the final value.
 * @return Handle <op, resultNum> of the final reduced value; resolve it with
 *         resolveValueFromOpAndResNum() after finalizeReduceUpdates().
 */
OpAndResNum SpatialReducer::applyReducePattern(
    SmallVector<ComputeAndResNum> &computeOpsAndResNum,
    std::function<Value(const Value &, const Value &)> reduce,
    std::function<Value(const Value &)> preprocess,
    std::function<Value(const Value &)> postprocess) {

  if (preprocess) {
    for (auto &computeOpAndResNum : computeOpsAndResNum) {
      GET_RES_NUM(computeOpAndResNum) =
          applyResultProcessing(computeOpAndResNum, preprocess, rewriter);
    }
  }

  // It is possible that `computeOpsAndResNum` contains two entries for the
  // same computeOp. In this case, we need to apply the reduction
  // within-compute first, so each computeOp contributes exactly one value.

  // Keep a map between a computeOp and the last Value for this reduction
  std::unordered_map<Operation *, Value> lastValueForCompute;
  for (auto &computeOpAndResNum : computeOpsAndResNum) {
    auto computeOp = GET_COMP(computeOpAndResNum);
    auto yieldOp =
        cast<spatial::SpatYieldOp>(computeOp.getBody().front().getTerminator());
    Value valueWithinCompute =
        yieldOp->getOperand(GET_RES_NUM(computeOpAndResNum));

    auto it = lastValueForCompute.find(computeOp.getOperation());

    if (it != lastValueForCompute.end()) {
      // If we have already seen this computeOp, apply the reduction
      // within-compute
      Value lastWithinComputeValue = it->second;

      assert(valueWithinCompute.getDefiningOp() &&
             lastWithinComputeValue.getDefiningOp());

      // Insert the combining op after whichever operand is defined later, so
      // both operands dominate it.
      if (valueWithinCompute.getDefiningOp()->isBeforeInBlock(
              lastWithinComputeValue.getDefiningOp())) {
        rewriter.setInsertionPointAfterValue(lastWithinComputeValue);
      } else {
        rewriter.setInsertionPointAfterValue(valueWithinCompute);
      }
      valueWithinCompute = reduce(lastWithinComputeValue, valueWithinCompute);
      // (Redundant with the assignment below, kept as-is.)
      lastValueForCompute[computeOp.getOperation()] = valueWithinCompute;
    }

    lastValueForCompute[computeOp.getOperation()] = valueWithinCompute;
  }

  // Now, reconstruct from the map the computeOpsAndResNum list
  computeOpsAndResNum.clear();
  computeOpsAndResNum.reserve(lastValueForCompute.size());
  for (auto &entry : lastValueForCompute) {
    auto computeOp = cast<spatial::SpatWeightedCompute>(entry.first);
    auto valueWithinCompute = entry.second;

    // We check if `valueWithinCompute` is already used by the yieldOp, in that
    // case no need to add it
    auto yieldOp =
        cast<spatial::SpatYieldOp>(computeOp.getBody().front().getTerminator());
    bool yieldOpUseFound = false;
    for (auto &use : valueWithinCompute.getUses()) {
      if (use.getOwner() == yieldOp.getOperation()) {
        // If the value is already used by the yieldOp, we can just use it
        computeOpsAndResNum.push_back({computeOp, use.getOperandNumber()});
        yieldOpUseFound = true;
        break;
      }
    }
    if (yieldOpUseFound) {
      continue;
    }

    // If this result is not used within a yieldOp, then add it
    auto resultNum = yieldOp->getNumOperands();
    yieldOp->insertOperands(resultNum, valueWithinCompute);

    computeOpsAndResNum.push_back({computeOp, resultNum});
  }

  Location loc = GET_COMP(computeOpsAndResNum[0])->getLoc();

  // Recursive algorithm to reduce the inputs to a single one:
  // - Take two inputs at a time, and reduce them into a single one, updating
  //   the computeOpsAndResNum list which becomes half the size.
  // - Repeat until there is only one input left.
  llvm::OwningArrayRef<ComputeAndResNum> computeOpsRef(computeOpsAndResNum);
  while (computeOpsRef.size() > 1) {
    SmallVector<ComputeAndResNum> nextComputeOps;
    nextComputeOps.reserve(computeOpsRef.size() / 2);
    for (size_t i = 0; i < computeOpsRef.size() - 1; i += 2) {
      auto [firstCompute, firstResultNum] = computeOpsRef[i];
      auto [secondCompute, secondResultNum] = computeOpsRef[i + 1];

      // Normalize the pair so `secondCompute` is the later op in the block:
      // the combined value always lives in the later compute.
      if (secondCompute->isBeforeInBlock(firstCompute)) {
        std::swap(firstCompute, secondCompute);
        std::swap(firstResultNum, secondResultNum);
      }

      // We do not immediately alter the computeOps results/operands, instead we
      // do it in a delayed manner, to avoid invalidating the references to the
      // computeOps (which must be replaced by a cloned ComputeOp when changing
      // the number of results)
      // See below `reducerChanges.push_back` and `finalizeReduceUpdates`

      auto yieldOpFirstCompute = cast<spatial::SpatYieldOp>(
          firstCompute.getBody().front().getTerminator());

      // Add a new operand to the block of the second computeOp: a block
      // argument standing in for the first compute's (future) result.
      Block &secondBlock = secondCompute.getBody().front();
      Value formerRes1 = secondBlock.addArgument(
          yieldOpFirstCompute->getOperand(firstResultNum).getType(), loc);

      // Operand slot the new value will occupy on the (finalized) second
      // computeOp: weights come first, then the inputs mirroring the block
      // arguments.
      auto secondComputeWeightsNum =
          secondCompute->getAttrOfType<DenseI32ArrayAttr>(
              secondCompute.getOperandSegmentSizesAttrName())[0];
      auto secondComputeOperandNum =
          secondComputeWeightsNum + secondBlock.getNumArguments() - 1;

      // Take the "former-result" from the second computeOp
      spatial::SpatYieldOp secondYield =
          cast<spatial::SpatYieldOp>(secondBlock.getTerminator());
      Value formerRes2 = secondYield.getOperand(secondResultNum);

      // Apply reduction operation
      rewriter.setInsertionPoint(secondYield);
      Value reduced = reduce(formerRes2, formerRes1);

      // Unfortunately, it is not possible to update the result in place,
      // because we may have already referenced it by <computeOp, resultNum>
      // outside of this function, thus replacing it would invalidate the
      // reference. Therefore, we need to append a new result to the yieldOp,
      // and then at a later stage update the computeOp accordingly.

      // Add `reduced` to the second yieldOp
      auto secondYieldOperandNum = secondYield.getNumOperands();
      secondYield->insertOperands(secondYieldOperandNum, reduced);
      secondResultNum = secondYieldOperandNum;

      // Record the deferred rewiring: result `firstResultNum` of
      // `firstCompute` must feed operand `secondComputeOperandNum` of
      // `secondCompute` once results are finalized.
      reducerChanges.push_back({firstCompute.getOperation(), firstResultNum,
          secondCompute.getOperation(), secondComputeOperandNum});
      nextComputeOps.push_back(std::make_pair(secondCompute, secondResultNum));
    }

    // If we have an odd number of inputs, we need to add the last one to the
    // newInputs list.
    if (computeOpsRef.size() % 2 == 1) {
      nextComputeOps.push_back(computeOpsRef.back());
    }

    // Replace the inputOps list with the new one.
    computeOpsRef =
        llvm::OwningArrayRef<ComputeAndResNum>(std::move(nextComputeOps));
  }

  assert(computeOpsRef.size() == 1 &&
         "Internal error: expected a single input at this point.");

  auto finalComputeAndResNum = computeOpsRef[0];

  // The last compute (the one which never becomes a `firstCompute`) is not
  // tracked by reducerChanges as `fromOp`, so force the update of its results
  // when finalizing.
  computeOpNeedingResUpdate.push_back(GET_COMP(finalComputeAndResNum));

  if (postprocess) {
    GET_RES_NUM(finalComputeAndResNum) =
        applyResultProcessing(finalComputeAndResNum, postprocess, rewriter);
  }

  return std::make_pair(GET_COMP(finalComputeAndResNum).getOperation(),
      GET_RES_NUM(finalComputeAndResNum));
}
|
||||
|
||||
/**
 * @brief Materializes all rewirings recorded by applyReducePattern.
 *
 * Phase 1: every computeOp that yields more values than it has results is
 * replaced by a clone with the full result list (see updateResultsOfCompute);
 * `opToReplacedCompute` tracks old -> new.
 * Phase 2: every recorded SpatialReducerChange connects a (remapped) producer
 * result to a new operand on the (remapped) consumer, growing the consumer's
 * inputs segment.
 *
 * Must be called exactly once; the destructor calls it automatically if the
 * user did not.
 */
void SpatialReducer::finalizeReduceUpdates() {
  assert(reducesFinalized == false && "Cannot finalize two times.");

  reducesFinalized = true;

  // First, add the results to the computeOps
  for (auto &reduceChange : reducerChanges) {
    updateResultsOfCompute(reduceChange.fromOp);
  }

  // Also update the final computeOps of each reduction, which never appear as
  // a `fromOp` in reducerChanges.
  for (auto &c : computeOpNeedingResUpdate) {
    updateResultsOfCompute(c.getOperation());
  }

  // Second, wire each producer result into its consumer operand slot.
  for (auto &reducerChange : this->reducerChanges) {
    auto fromOp = reducerChange.fromOp;
    auto toOp = reducerChange.toOp;
    auto fromOpResNum = reducerChange.fromOpResNum;
    auto toOpOperandNum = reducerChange.toOpOperandNum;

    // Producers were all processed in phase 1, so the map must hit.
    auto fromComputeOp = opToReplacedCompute[fromOp];
    assert(fromComputeOp && "fromOp should have been mapped before!");

    // toComputeOp could be the existing pointer, or we have to remap it with
    // `opToReplacedCompute`
    auto toComputeOp = opToReplacedCompute[toOp];
    if (!toComputeOp) {
      toComputeOp = cast<spatial::SpatWeightedCompute>(toOp);
    }

    assert(toComputeOp != fromComputeOp &&
           "Oops should have caught this earlier!");

    // Operand slots were computed assuming appends happen in recording order.
    assert(toComputeOp->getNumOperands() == toOpOperandNum &&
           "toOpOperandNum should be the last operand of toComputeOp, are the "
           "operations in the right order?");

    // Add the new operand to `toComputeOp`
    auto fromResult = fromComputeOp.getResult(fromOpResNum);
    toComputeOp->insertOperands(toOpOperandNum, fromResult);
    incrementWeightedComputeInputsSegmentSize(toComputeOp, 1);
  }
}
|
||||
|
||||
/// Translates an <op, resultNum> handle into the concrete SSA Value it
/// denotes, following the old -> cloned computeOp remapping when the original
/// op was replaced during finalization.
Value SpatialReducer::resolveValueFromOpAndResNum(OpAndResNum &opAndResNum) {
  assert(reducesFinalized &&
         "Cannot create resolve values before finalizing the reduce updates.");

  // Prefer the cloned compute if the handle's op was replaced; otherwise the
  // stored pointer is still the live op.
  Operation *liveOp = opAndResNum.first;
  auto remapped = opToReplacedCompute.find(liveOp);
  if (remapped != opToReplacedCompute.end()) {
    liveOp = remapped->second;
  }

  return cast<spatial::SpatWeightedCompute>(liveOp).getResult(opAndResNum.second);
}
|
||||
|
||||
/**
 * @brief Makes a computeOp's result list match its yield operands.
 *
 * MLIR ops cannot grow results in place, so when the yield has gained
 * operands the op is cloned with the full result-type list, the region body
 * is moved over, old result uses are redirected to the clone, and the old op
 * is erased. `opToReplacedCompute` records old -> new; ops whose yield already
 * matches are mapped to themselves. Idempotent per op.
 *
 * @param computeOp A spatial::SpatWeightedCompute (cast enforced below).
 */
void SpatialReducer::updateResultsOfCompute(Operation *computeOp) {
  if (opToReplacedCompute.find(computeOp) != opToReplacedCompute.end()) {
    // If we have already replaced the fromOp, we do not need to do it again
    return;
  }
  auto oldComputeOp = cast<spatial::SpatWeightedCompute>(computeOp);

  auto oldComputeOpNum = oldComputeOp->getNumOperands();

  auto yieldOp =
      cast<spatial::SpatYieldOp>(oldComputeOp.getBody().front().getTerminator());

  if (yieldOp.getNumOperands() == oldComputeOp->getNumResults()) {
    // No result was added, just add itself to the map
    opToReplacedCompute[oldComputeOp.getOperation()] = oldComputeOp;
    return;
  }

  // Add the results by inspecting its YieldOp
  auto newResultTypes = yieldOp.getOperandTypes();

  // Create a new ComputeOp with the new result type, but same operands
  rewriter.setInsertionPoint(oldComputeOp);
  auto newComputeOp =
      rewriter.create<spatial::SpatWeightedCompute>(oldComputeOp->getLoc(),
          newResultTypes, oldComputeOp.getWeights(), oldComputeOp.getInputs());

  // Move (not copy) the whole region body into the clone.
  newComputeOp.getBody().takeBody(oldComputeOp.getBody());

  auto newComputeOpNum = newComputeOp->getNumOperands();

  assert(oldComputeOpNum == newComputeOpNum);

  // Since we replaced the old ComputeOp with a new one, we need to replace
  // all its results' uses
  for (size_t i = 0; i < oldComputeOp.getNumResults(); i++) {
    Value oldResult = oldComputeOp.getResult(i);
    Value newResult = newComputeOp.getResult(i);

    // Replace the uses, except the uses of the compute ops which got deleted
    // previously
    rewriter.replaceAllUsesExcept(oldResult, newResult, oldComputeOpsReplaced);
  }

  // Finally, erase the old computeOp and update the map
  opToReplacedCompute[oldComputeOp.getOperation()] = newComputeOp;
  oldComputeOpsReplaced.insert(oldComputeOp.getOperation());
  rewriter.setInsertionPoint(oldComputeOp);
  rewriter.eraseOp(oldComputeOp);
}
|
||||
|
||||
/// Resolves a 3D grid of <op, resultNum> handles (indexed
/// [channelTile][x][y]) into concrete Values and delegates the concatenation
/// to the free-function createImgConcatOp helper.
Value SpatialReducer::createImgConcatOp(
    SmallVector<SmallVector<SmallVector<OpAndResNum>>> &outputTiles,
    Location &loc, Type outputType) {

  assert(reducesFinalized &&
         "Cannot create ImgConcatOp before finalizing the reduce updates.");

  const size_t channelTiles = outputTiles.size();
  const size_t gridWidth = outputTiles[0].size();
  const size_t gridHeight = outputTiles[0][0].size();

  // Same [channelTile][x][y] layout, but holding the resolved SSA values.
  SmallVector<SmallVector<SmallVector<Value>>> resolvedTiles(channelTiles,
      SmallVector<SmallVector<Value>>(gridWidth, SmallVector<Value>(gridHeight)));

  for (size_t c = 0; c < channelTiles; ++c) {
    for (size_t col = 0; col < gridWidth; ++col) {
      for (size_t row = 0; row < gridHeight; ++row) {
        resolvedTiles[c][col][row] =
            resolveValueFromOpAndResNum(outputTiles[c][col][row]);
      }
    }
  }

  return ::onnx_mlir::createImgConcatOp(
      resolvedTiles, rewriter, loc, outputType);
}
|
||||
|
||||
/// Convenience wrapper around applyReducePattern: pairwise vector addition as
/// the reduction, optionally followed (on the final value only) by a bias add
/// and a map operation.
OpAndResNum SpatialReducer::applyAddMapReduction(
    SmallVector<ComputeAndResNum> &computeOps,
    ConversionPatternRewriter &rewriter, Value biasTile, MapOperations mapOp) {

  // Postprocessing stays null (skipped) unless a map operation is requested.
  std::function<Value(const Value &)> postprocessing = nullptr;
  if (mapOp != MapOperations::None) {
    postprocessing = [&](const Value finalValue) {
      Value mapInput = finalValue;
      // The bias (when present) is added right before the map operation.
      if (biasTile) {
        mapInput = rewriter.create<spatial::SpatVAddOp>(
            finalValue.getLoc(), finalValue.getType(), finalValue, biasTile);
      }
      return createMapOperation(rewriter, mapOp, mapInput);
    };
  }

  // Pairwise reduction is a plain vector addition.
  auto vectorAdd = [&](Value lhs, Value rhs) {
    return rewriter.create<spatial::SpatVAddOp>(
        lhs.getLoc(), lhs.getType(), lhs, rhs);
  };

  return this->applyReducePattern(
      computeOps, vectorAdd, /* preprocess = */ nullptr, postprocessing);
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
83
src/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp
Normal file
83
src/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp
Normal file
@@ -0,0 +1,83 @@
|
||||
#pragma once
|
||||
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||
#include "llvm/ADT/SmallPtrSet.h"
|
||||
#include "llvm/Support/Casting.h"
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
using ResNum = unsigned int;
|
||||
|
||||
using ComputeAndResNum = std::pair<spatial::SpatWeightedCompute, ResNum>;
|
||||
|
||||
/// One deferred rewiring recorded by SpatialReducer::applyReducePattern:
/// result `fromOpResNum` of `fromOp` must be fed into operand slot
/// `toOpOperandNum` of `toOp` once result lists are finalized
/// (see SpatialReducer::finalizeReduceUpdates).
struct SpatialReducerChange {
  Operation *fromOp;           // producer computeOp
  unsigned int fromOpResNum;   // result index on the producer
  Operation *toOp;             // consumer computeOp
  unsigned int toOpOperandNum; // operand slot on the consumer
};
|
||||
|
||||
using OpAndResNum = std::pair<Operation *, ResNum>;
|
||||
|
||||
/**
 * @brief Reduces results of multiple SpatWeightedCompute ops into one value.
 *
 * Usage: call applyReducePattern / applyAddMapReduction any number of times,
 * then finalizeReduceUpdates() exactly once, then resolve the returned
 * <op, resultNum> handles with resolveValueFromOpAndResNum() (or
 * createImgConcatOp()). Rewirings are deferred because growing a computeOp's
 * result list requires cloning it.
 */
class SpatialReducer {

public:
  SpatialReducer(ConversionPatternRewriter &rewriter) : rewriter(rewriter) {}

  /// Pairwise-combines the listed results with `reduce` (log-depth), with
  /// optional per-input `preprocess` and final-value `postprocess`.
  /// Returns a handle valid only after finalizeReduceUpdates().
  OpAndResNum applyReducePattern(
      SmallVector<ComputeAndResNum> &computeOpsAndResNum,
      std::function<Value(const Value &, const Value &)> reduce,
      std::function<Value(const Value &)> preprocess,
      std::function<Value(const Value &)> postprocess);

  /// applyReducePattern specialization: vector-add reduction, then an
  /// optional bias add + map operation on the final value.
  OpAndResNum applyAddMapReduction(SmallVector<ComputeAndResNum> &computeOps,
      ConversionPatternRewriter &rewriter, Value biasTile, MapOperations mapOp);

  /// Materializes all deferred rewirings; must run exactly once.
  void finalizeReduceUpdates();

  // NOTE(review): the destructor performs IR surgery via the stored rewriter;
  // confirm the rewriter is guaranteed to outlive the reducer and that
  // relying on destruction order here is intended.
  ~SpatialReducer() {
    if (!reducesFinalized) {
      finalizeReduceUpdates();
    }
  }

  /// Resolves a [channelTile][x][y] grid of handles and concatenates the
  /// tiles into a single value of `outputType`.
  Value createImgConcatOp(
      llvm::SmallVector<llvm::SmallVector<llvm::SmallVector<OpAndResNum>>>
          &outputTiles,
      Location &loc, Type outputType);

  /// Translates a handle into the SSA Value it denotes (post-finalization).
  Value resolveValueFromOpAndResNum(OpAndResNum &opAndResNum);

private:
  /// Applies `processFun` to the yielded value and returns the (possibly new)
  /// result number to use.
  [[nodiscard("computeOp result number gets updated")]] ResNum
  applyResultProcessing(ComputeAndResNum computeOpAndResNum,
      std::function<Value(const Value &)> processFun,
      ConversionPatternRewriter &rewriter);

  /**
   * @brief Update the results of a ComputeOp.
   *
   * This function updates the results of a ComputeOp by taking a look at the
   * operands of its yieldOp.
   * If the ComputeOp was replaced, it updates `opToReplacedCompute` with the
   * replaced ComputeOp.
   *
   * @param computeOp The ComputeOp to update the results of.
   */
  void updateResultsOfCompute(Operation *computeOp);

  ConversionPatternRewriter &rewriter;
  // Set by finalizeReduceUpdates(); guards against double finalization.
  bool reducesFinalized = false;

  // List of changes to be applied after the reduction is finalized
  SmallVector<SpatialReducerChange, 4> reducerChanges;
  // List of computeOps that need to be replaced with new results
  SmallVector<spatial::SpatWeightedCompute> computeOpNeedingResUpdate;

  // Old computeOp -> clone (or itself when unchanged).
  std::unordered_map<Operation *, spatial::SpatWeightedCompute> opToReplacedCompute;

  // Erased computeOps whose uses must not be rewritten again.
  // NOTE(review): static — shared across all reducer instances; confirm.
  static llvm::SmallPtrSet<Operation *, 16> oldComputeOpsReplaced;
};
|
||||
|
||||
} // namespace onnx_mlir
|
||||
53
src/PIM/Conversion/ONNXToSpatial/Utils/WeightSubdivider.cpp
Normal file
53
src/PIM/Conversion/ONNXToSpatial/Utils/WeightSubdivider.cpp
Normal file
@@ -0,0 +1,53 @@
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/WeightSubdivider.hpp"
|
||||
#include <cassert>
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
// Takes ownership of the full weight map, indexed
// [inputTile][outputTile] -> weight values; groups are later consumed from it
// by popGroups().
WeightSubdivider::WeightSubdivider(
    map<long, map<long, SmallVector<Value>>> weights)
    : weights(std::move(weights)) {}
|
||||
|
||||
// True once every weight group has been consumed.
bool WeightSubdivider::isEmpty() const { return weights.empty(); }
|
||||
|
||||
// Pops up to `amount` weights from the first (inputTile, outputTile) group,
// tagging them with the crossbar index at which they start in the compute
// unit currently being filled. Fully drained groups are removed from the map.
TaggedWeights WeightSubdivider::popGroup(size_t amount) {
  assert(!weights.empty() && "No weights to extract.");

  // Always consume from the first input tile's first output tile.
  auto outerIt = weights.begin();
  auto innerIt = outerIt->second.begin();

  const long inputTile = outerIt->first;
  const long outputTile = innerIt->first;
  SmallVector<Value> &pending = innerIt->second;

  const size_t taken = std::min(amount, pending.size());

  // The group starts where the previous one ended; then account for the
  // crossbars it occupies.
  const size_t startingIndex = crossbarsUsed;
  crossbarsUsed += taken;

  SmallVector<Value> extracted(pending.begin(), pending.begin() + taken);

  if (taken == pending.size()) {
    // Group fully drained: drop its entry, and the whole input-tile entry if
    // no output tiles remain under it.
    outerIt->second.erase(innerIt);
    if (outerIt->second.empty()) {
      weights.erase(outerIt);
    }
  } else {
    // Partially consumed: keep the remainder for a later pop.
    pending.erase(pending.begin(), pending.begin() + taken);
  }

  return {inputTile, outputTile, startingIndex, extracted};
}
|
||||
|
||||
// Pops groups until `n` weights have been gathered (to fill one compute
// unit) or the weight map is exhausted.
SmallVector<TaggedWeights> WeightSubdivider::popGroups(size_t n) {
  // Each call fills a fresh compute unit, so crossbar accounting restarts.
  crossbarsUsed = 0;

  SmallVector<TaggedWeights> groups;
  for (size_t slotsLeft = n; slotsLeft > 0 && !weights.empty();) {
    TaggedWeights group = popGroup(slotsLeft);
    slotsLeft -= group.weights.size();
    groups.push_back(group);
  }

  return groups;
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
46
src/PIM/Conversion/ONNXToSpatial/Utils/WeightSubdivider.hpp
Normal file
46
src/PIM/Conversion/ONNXToSpatial/Utils/WeightSubdivider.hpp
Normal file
@@ -0,0 +1,46 @@
|
||||
#pragma once
|
||||
|
||||
#include "mlir/IR/Value.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include <map>
|
||||
|
||||
using namespace mlir;
|
||||
using namespace std;
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
/**
|
||||
* @brief A helper struct to store a group of weights.
|
||||
*
|
||||
*/
|
||||
struct TaggedWeights {
|
||||
long inputTile;
|
||||
long outputTile;
|
||||
size_t startingCrossbarIndex;
|
||||
SmallVector<Value> weights;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief A helper class to subdivide weights into groups.
|
||||
*
|
||||
* Weights are stored as a map of maps of SmallVectors. The outer map is indexed
|
||||
* by input tile, the inner map is indexed by output tile, and the SmallVector
|
||||
* contains the weights for the filter. This class allows us to extract groups
|
||||
* of weights from the map until we've extracted a certain number of elements,
|
||||
* namely as many as we need to fill a compute unit.
|
||||
*/
|
||||
class WeightSubdivider {
|
||||
private:
|
||||
map<long, map<long, SmallVector<Value>>> weights;
|
||||
size_t crossbarsUsed = 0;
|
||||
|
||||
TaggedWeights popGroup(size_t amount);
|
||||
|
||||
public:
|
||||
WeightSubdivider(map<long, map<long, SmallVector<Value>>> weights);
|
||||
|
||||
bool isEmpty() const;
|
||||
SmallVector<TaggedWeights> popGroups(size_t n);
|
||||
};
|
||||
|
||||
} // namespace onnx_mlir
|
||||
Reference in New Issue
Block a user