add PIM accelerator
This commit is contained in:
119
src/PIM/Conversion/ONNXToSpatial/Utils/AnnotateReplication.cpp
Normal file
119
src/PIM/Conversion/ONNXToSpatial/Utils/AnnotateReplication.cpp
Normal file
@@ -0,0 +1,119 @@
|
||||
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/AnnotateReplication.hpp"
|
||||
#include "src/Dialect/ONNX/ONNXOps.hpp"
|
||||
|
||||
#include <queue>
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
/**
|
||||
* @brief Structure that describes the replication of a convolution operation,
|
||||
* along the image height axis.
|
||||
*/
|
||||
struct ConvReplication {
|
||||
ONNXConvOp convOp; // Convolution operation
|
||||
size_t input_w; // Width of the input image
|
||||
size_t replicationFactor; // Replication factor on the image height axis
|
||||
size_t coresNeededPerReplica; // Number of cores needed for each replica
|
||||
|
||||
friend bool operator<(const ConvReplication& a, const ConvReplication& b) {
|
||||
return a.input_w / a.replicationFactor < b.input_w / b.replicationFactor;
|
||||
}
|
||||
|
||||
ConvReplication(ONNXConvOp convOp, size_t input_w, size_t replicationFactor, size_t coresNeededPerReplica)
|
||||
: convOp(convOp),
|
||||
input_w(input_w),
|
||||
replicationFactor(replicationFactor),
|
||||
coresNeededPerReplica(coresNeededPerReplica) {}
|
||||
};
|
||||
|
||||
/**
 * @brief Annotate every ONNXConvOp found directly in the first block of
 * `funcOp` with a replication factor (attribute `REPLICATION_ATTR_NAME`).
 *
 * The function first sums the cores required by one replica of each
 * convolution and by each Gemm. The cores left over are then handed out, one
 * replica at a time, to the convolution that currently has the largest
 * `input_w / replicationFactor`.
 *
 * @param funcOp   Function whose first block is scanned.
 * @param rewriter Only used to build the integer attribute.
 * @return failure (with a diagnostic on `funcOp`) when `coresCount` is lower
 *         than the minimum number of cores required, success otherwise.
 */
LogicalResult annotateReplication(mlir::func::FuncOp funcOp, mlir::IRRewriter& rewriter) {
  // No need for annotation, replication is implicitly 1.
  if (coresCount == -1)
    return success();

  const size_t totalCores = static_cast<size_t>(coresCount);

  std::priority_queue<ConvReplication> pendingConvs;
  size_t minimumCores = 0;

  for (auto& op : funcOp.getFunctionBody().begin()->getOperations()) {
    if (auto convOp = dyn_cast<ONNXConvOp>(op)) {
      // Convolution layer
      ShapedType xShape = mlir::cast<ShapedType>(convOp.getX().getType());
      ShapedType wShape = mlir::cast<ShapedType>(convOp.getW().getType());

      size_t input_w = GET_IMAGE_WIDTH(xShape);
      size_t krn_h = GET_KERNEL_HEIGHT(wShape);
      size_t krn_w = GET_KERNEL_WIDTH(wShape);

      size_t inputTileCount = ceilIntegerDivide(GET_IMAGE_CHANNEL(xShape), crossbarSize.getValue());
      size_t outputTileCount = ceilIntegerDivide(wShape.getDimSize(0), crossbarSize.getValue());

      // One crossbar per kernel position and per (input tile, output tile) pair.
      auto neededXbars = krn_h * krn_w * inputTileCount * outputTileCount;
      auto neededCores = ceilIntegerDivide(neededXbars, crossbarCountInCore.getValue());

      minimumCores += neededCores;
      pendingConvs.emplace(convOp, input_w, 1, neededCores);
      continue;
    }

    auto gemmOp = dyn_cast<ONNXGemmOp>(op);
    if (!gemmOp)
      continue;

    // Fully connected layer: B is (input x output), or the opposite when
    // transB is set.
    auto matrixTensorShape = cast<ShapedType>(gemmOp.getB().getType());
    auto inputSize = matrixTensorShape.getDimSize(gemmOp.getTransB() ? 1 : 0);
    auto outputSize = matrixTensorShape.getDimSize(gemmOp.getTransB() ? 0 : 1);

    const size_t inputTilesCount = ceilIntegerDivide(inputSize, crossbarSize.getValue());
    const size_t outputTilesCount = ceilIntegerDivide(outputSize, crossbarSize.getValue());

    // Each output tile is computed by `coresPerOutputTile` cores, each of
    // which receives the entire input.
    const size_t coresPerOutputTile = ceilIntegerDivide(inputTilesCount, crossbarCountInCore.getValue());

    auto neededCores = coresPerOutputTile * outputTilesCount;
    minimumCores += neededCores;
  }

  if (totalCores < minimumCores) {
    return funcOp->emitError("Not enough cores for this network: ")
           << minimumCores << " cores needed, but only " << totalCores << " available.";
  }

  // Cores that remain once every layer has its single mandatory replica.
  size_t availableCores = totalCores - minimumCores;

  // Drain the queue: an entry is either re-queued with one more replica or
  // finalized by annotating its operation.
  while (!pendingConvs.empty()) {
    ConvReplication candidate = pendingConvs.top();
    pendingConvs.pop();

    // NOTE(review): the guard compares the spare cores against the cost of
    // (replicationFactor + 1) replicas although only one extra replica is
    // charged below - confirm that this headroom is intentional.
    const bool canReplicate =
        availableCores > candidate.coresNeededPerReplica * (candidate.replicationFactor + 1);

    if (!canReplicate) {
      // Cannot replicate this convolution anymore: record the final factor.
      candidate.convOp->setAttr(REPLICATION_ATTR_NAME,
                                rewriter.getI64IntegerAttr(candidate.replicationFactor));
      continue;
    }

    // Grant one more replica and put the entry back in the queue.
    availableCores -= candidate.coresNeededPerReplica;
    candidate.replicationFactor++;
    pendingConvs.push(candidate);
  }

  return success();
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
@@ -0,0 +1,11 @@
|
||||
#pragma once
|
||||
|
||||
#include "mlir/Dialect/Func/IR/FuncOps.h"
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
/// Annotates the convolutions of `funcOp` with a replication factor.
/// See AnnotateReplication.cpp for the definition.
mlir::LogicalResult annotateReplication(mlir::func::FuncOp funcOp, mlir::IRRewriter &rewriter);
|
||||
|
||||
} // namespace onnx_mlir
|
||||
382
src/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.cpp
Normal file
382
src/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.cpp
Normal file
@@ -0,0 +1,382 @@
|
||||
|
||||
#include "SpatialReducer.hpp"
|
||||
#include "mlir/IR/BuiltinAttributes.h"
|
||||
#include "mlir/IR/Value.h"
|
||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include <cassert>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
|
||||
#define GET_COMP(computeOpAndResNum) std::get<0>(computeOpAndResNum)
|
||||
#define GET_RES_NUM(computeOpAndResNum) std::get<1>(computeOpAndResNum)
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
// Storage for the static set declared in SpatialReducer; it is shared by every
// SpatialReducer instance.
llvm::SmallPtrSet<Operation *, 16> SpatialReducer::oldComputeOpsReplaced;
|
||||
|
||||
/**
 * Run `processFun` on one yielded value of a compute op and expose the
 * outcome as a yield operand.
 *
 * @param computeOpAndResNum Compute op and index of the yield operand to
 *                           process.
 * @param processFun         Non-null callback; invoked with the insertion
 *                           point set right after the value being processed.
 * @param rewriter           Rewriter whose insertion point is moved.
 * @return The index of the yield operand that holds the processed value: the
 *         original index when `processFun` returns its argument unchanged,
 *         otherwise the index of a newly appended yield operand.
 */
ResNum SpatialReducer::applyResultProcessing(
    ComputeAndResNum computeOpAndResNum,
    std::function<Value(const Value &)> processFun,
    ConversionPatternRewriter &rewriter) {
  assert(processFun);

  auto [computeOp, resultNum] = computeOpAndResNum;

  auto yieldOp =
      cast<spatial::SpatYieldOp>(computeOp.getBody().front().getTerminator());

  Value original = yieldOp->getOperand(resultNum);
  rewriter.setInsertionPointAfterValue(original);
  Value processed = processFun(original);

  // The callback may hand back the very same value after doing something else
  // with it (e.g. broadcasting it through a channel); nothing to append then.
  if (processed == original)
    return resultNum;

  // Append the processed value; its index is the operand count before the
  // insertion.
  const unsigned appendedIndex = yieldOp->getNumOperands();
  yieldOp->insertOperands(appendedIndex, processed);
  return appendedIndex;
}
|
||||
|
||||
/**
 * Reduce the results of several compute ops into a single result.
 *
 * Steps:
 *  1. Optionally run `preprocess` on every referenced result.
 *  2. Merge entries that refer to the same compute op by applying `reduce`
 *     inside that op.
 *  3. Reduce the remaining compute ops pairwise: the earlier op of each pair
 *     feeds the later one through a new block argument. The matching
 *     operand/result updates are only recorded in `reducerChanges` and applied
 *     later by `finalizeReduceUpdates`.
 *  4. Optionally run `postprocess` on the final result.
 *
 * @param computeOpsAndResNum Non-empty list of (compute op, yield operand
 *        index). It is overwritten with the de-duplicated list from step 2.
 * @param reduce      Binary reduction; called with the rewriter insertion
 *                    point already set.
 * @param preprocess  Optional (may be null) per-input processing.
 * @param postprocess Optional (may be null) processing of the final value.
 * @return The operation holding the final value and its yield operand index.
 */
OpAndResNum SpatialReducer::applyReducePattern(
    SmallVector<ComputeAndResNum> &computeOpsAndResNum,
    std::function<Value(const Value &, const Value &)> reduce,
    std::function<Value(const Value &)> preprocess,
    std::function<Value(const Value &)> postprocess) {

  assert(!computeOpsAndResNum.empty() &&
         "Cannot reduce an empty list of compute results.");

  if (preprocess) {
    for (auto &computeOpAndResNum : computeOpsAndResNum) {
      computeOpAndResNum.second =
          applyResultProcessing(computeOpAndResNum, preprocess, rewriter);
    }
  }

  // It is possible that `computeOpsAndResNum` contains two entries for the same
  // computeOp. In this case, we need to apply the reduction within-compute.

  // Map between a computeOp and the last Value for this reduction. The map is
  // only used for lookups: `computeOrder` remembers the order in which the
  // computeOps were first seen, so that the reduction order (and thus the
  // generated IR) does not depend on pointer hashing.
  std::unordered_map<Operation *, Value> lastValueForCompute;
  SmallVector<Operation *> computeOrder;
  for (auto &computeOpAndResNum : computeOpsAndResNum) {
    auto computeOp = computeOpAndResNum.first;
    auto yieldOp =
        cast<spatial::SpatYieldOp>(computeOp.getBody().front().getTerminator());
    Value valueWithinCompute = yieldOp->getOperand(computeOpAndResNum.second);

    auto it = lastValueForCompute.find(computeOp.getOperation());
    if (it == lastValueForCompute.end()) {
      // First time we see this computeOp: just remember its value.
      computeOrder.push_back(computeOp.getOperation());
      lastValueForCompute.emplace(computeOp.getOperation(), valueWithinCompute);
      continue;
    }

    // We have already seen this computeOp, apply the reduction within-compute
    Value lastWithinComputeValue = it->second;

    assert(valueWithinCompute.getDefiningOp() &&
           lastWithinComputeValue.getDefiningOp());

    // Insert after whichever of the two values is defined last, so that both
    // dominate the reduction.
    if (valueWithinCompute.getDefiningOp()->isBeforeInBlock(
            lastWithinComputeValue.getDefiningOp())) {
      rewriter.setInsertionPointAfterValue(lastWithinComputeValue);
    } else {
      rewriter.setInsertionPointAfterValue(valueWithinCompute);
    }
    it->second = reduce(lastWithinComputeValue, valueWithinCompute);
  }

  // Now, reconstruct the computeOpsAndResNum list in first-seen order
  computeOpsAndResNum.clear();
  computeOpsAndResNum.reserve(computeOrder.size());
  for (Operation *op : computeOrder) {
    auto computeOp = cast<spatial::SpatWeightedCompute>(op);
    Value valueWithinCompute = lastValueForCompute.find(op)->second;

    // We check if `valueWithinCompute` is already used by the yieldOp, in that
    // case no need to add it
    auto yieldOp =
        cast<spatial::SpatYieldOp>(computeOp.getBody().front().getTerminator());
    bool yieldOpUseFound = false;
    for (auto &use : valueWithinCompute.getUses()) {
      if (use.getOwner() == yieldOp.getOperation()) {
        // If the value is already used by the yieldOp, we can just use it
        computeOpsAndResNum.push_back({computeOp, use.getOperandNumber()});
        yieldOpUseFound = true;
        break;
      }
    }
    if (yieldOpUseFound) {
      continue;
    }

    // If this result is not used within a yieldOp, then add it
    auto resultNum = yieldOp->getNumOperands();
    yieldOp->insertOperands(resultNum, valueWithinCompute);

    computeOpsAndResNum.push_back({computeOp, resultNum});
  }

  Location loc = computeOpsAndResNum[0].first->getLoc();

  // Iterative pairwise reduction of the inputs to a single one:
  // - Take two inputs at a time, and reduce them into a single one, producing
  //   a list which is half the size.
  // - Repeat until there is only one input left.
  // The caller's list is left untouched from here on, so work on a copy.
  SmallVector<ComputeAndResNum> currentComputeOps(
      computeOpsAndResNum.begin(), computeOpsAndResNum.end());
  while (currentComputeOps.size() > 1) {
    SmallVector<ComputeAndResNum> nextComputeOps;
    nextComputeOps.reserve(currentComputeOps.size() / 2 + 1);
    for (size_t i = 0; i + 1 < currentComputeOps.size(); i += 2) {
      auto [firstCompute, firstResultNum] = currentComputeOps[i];
      auto [secondCompute, secondResultNum] = currentComputeOps[i + 1];

      // The result must flow from the earlier op to the later one.
      if (secondCompute->isBeforeInBlock(firstCompute)) {
        std::swap(firstCompute, secondCompute);
        std::swap(firstResultNum, secondResultNum);
      }

      // We do not immediately alter the computeOps results/operands, instead we
      // do it in a delayed manner, to avoid invalidating the references to the
      // computeOps (which must be replaced by a cloned ComputeOp when changing
      // the number of results)
      // See below `reducerChanges.push_back` and `finalizeReduceUpdates`

      auto yieldOpFirstCompute = cast<spatial::SpatYieldOp>(
          firstCompute.getBody().front().getTerminator());

      // Add a new block argument to the second computeOp; it stands for the
      // result of the first computeOp.
      Block &secondBlock = secondCompute.getBody().front();
      Value formerRes1 = secondBlock.addArgument(
          yieldOpFirstCompute->getOperand(firstResultNum).getType(), loc);

      // Operand index that the new input will have on the second computeOp:
      // weights come first (segment 0), followed by one input per block
      // argument.
      auto secondComputeWeightsNum =
          secondCompute->getAttrOfType<DenseI32ArrayAttr>(
              secondCompute.getOperandSegmentSizesAttrName())[0];
      auto secondComputeOperandNum =
          secondComputeWeightsNum + secondBlock.getNumArguments() - 1;

      // Take the "former-result" from the second computeOp
      spatial::SpatYieldOp secondYield =
          cast<spatial::SpatYieldOp>(secondBlock.getTerminator());
      Value formerRes2 = secondYield.getOperand(secondResultNum);

      // Apply reduction operation
      rewriter.setInsertionPoint(secondYield);
      Value reduced = reduce(formerRes2, formerRes1);

      // Unfortunately, it is not possible to update the result in place,
      // because we may have already referenced it by <computeOp, resultNum>
      // outside of this function, thus replacing it would invalidate the
      // reference. Therefore, we need to append a new result to the yieldOp,
      // and then at a later stage update the computeOp accordingly.

      // Add `reduced` to the second yieldOp
      auto secondYieldOperandNum = secondYield.getNumOperands();
      secondYield->insertOperands(secondYieldOperandNum, reduced);
      secondResultNum = secondYieldOperandNum;

      // The last operation (the one which never becomes a `firstCompute`) is
      // not tracked by reducerChanges as `fromOp`; it is registered in
      // `computeOpNeedingResUpdate` after the loop.
      reducerChanges.push_back({firstCompute.getOperation(), firstResultNum,
          secondCompute.getOperation(), secondComputeOperandNum});
      nextComputeOps.push_back(std::make_pair(secondCompute, secondResultNum));
    }

    // If we have an odd number of inputs, carry the last one over unchanged.
    if (currentComputeOps.size() % 2 == 1) {
      nextComputeOps.push_back(currentComputeOps.back());
    }

    currentComputeOps = std::move(nextComputeOps);
  }

  assert(currentComputeOps.size() == 1 &&
         "Internal error: expected a single input at this point.");

  auto finalComputeAndResNum = currentComputeOps[0];

  // Force the update of the results of this computeOp, when finalizing
  computeOpNeedingResUpdate.push_back(finalComputeAndResNum.first);

  if (postprocess) {
    finalComputeAndResNum.second =
        applyResultProcessing(finalComputeAndResNum, postprocess, rewriter);
  }

  return std::make_pair(finalComputeAndResNum.first.getOperation(),
      finalComputeAndResNum.second);
}
|
||||
|
||||
void SpatialReducer::finalizeReduceUpdates() {
|
||||
assert(reducesFinalized == false && "Cannot finalize two times.");
|
||||
|
||||
reducesFinalized = true;
|
||||
|
||||
// First, add the results to the computeOps
|
||||
for (auto &reduceChange : reducerChanges) {
|
||||
updateResultsOfCompute(reduceChange.fromOp);
|
||||
}
|
||||
|
||||
for (auto &c : computeOpNeedingResUpdate) {
|
||||
updateResultsOfCompute(c.getOperation());
|
||||
}
|
||||
|
||||
for (auto &reducerChange : this->reducerChanges) {
|
||||
auto fromOp = reducerChange.fromOp;
|
||||
auto toOp = reducerChange.toOp;
|
||||
auto fromOpResNum = reducerChange.fromOpResNum;
|
||||
auto toOpOperandNum = reducerChange.toOpOperandNum;
|
||||
|
||||
auto fromComputeOp = opToReplacedCompute[fromOp];
|
||||
assert(fromComputeOp && "fromOp should have been mapped before!");
|
||||
|
||||
// toComputeOp could be the existing pointer, or we have to remap it with
|
||||
// `opToReplacedCompute`
|
||||
auto toComputeOp = opToReplacedCompute[toOp];
|
||||
if (!toComputeOp) {
|
||||
toComputeOp = cast<spatial::SpatWeightedCompute>(toOp);
|
||||
}
|
||||
|
||||
assert(toComputeOp != fromComputeOp &&
|
||||
"Oops should have caught this earlier!");
|
||||
|
||||
assert(toComputeOp->getNumOperands() == toOpOperandNum &&
|
||||
"toOpOperandNum should be the last operand of toComputeOp, are the "
|
||||
"operations in the right order?");
|
||||
|
||||
// Add the new operand to `toComputeOp`
|
||||
auto fromResult = fromComputeOp.getResult(fromOpResNum);
|
||||
toComputeOp->insertOperands(toOpOperandNum, fromResult);
|
||||
incrementWeightedComputeInputsSegmentSize(toComputeOp, 1);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Turn an (operation, result index) reference into the actual result Value.
 *
 * If the operation has an entry in `opToReplacedCompute`, the mapped compute
 * op is used instead of the referenced one. Only valid after
 * `finalizeReduceUpdates` has run.
 */
Value SpatialReducer::resolveValueFromOpAndResNum(OpAndResNum &opAndResNum) {
  assert(reducesFinalized &&
         "Cannot create resolve values before finalizing the reduce updates.");

  Operation *referencedOp = opAndResNum.first;
  const ResNum resultIndex = opAndResNum.second;

  // Follow the replacement map when the op is registered in it.
  auto replacement = opToReplacedCompute.find(referencedOp);
  Operation *resolvedOp = replacement == opToReplacedCompute.end()
                              ? referencedOp
                              : replacement->second.getOperation();

  return cast<spatial::SpatWeightedCompute>(resolvedOp).getResult(resultIndex);
}
|
||||
|
||||
/**
 * Make the results of a compute op match the operands of its yield.
 *
 * When the yield has as many operands as the op has results, the op is simply
 * registered in `opToReplacedCompute` as its own replacement. Otherwise a new
 * compute op with the yield's operand types as result types is created right
 * before the old one, takes over its body, inherits the uses of the existing
 * results and the old op is erased.
 *
 * Calling this again for an op that already has an entry in
 * `opToReplacedCompute` is a no-op.
 */
void SpatialReducer::updateResultsOfCompute(Operation *computeOp) {
  // Already handled (either replaced or registered as unchanged).
  if (opToReplacedCompute.count(computeOp) != 0)
    return;

  auto staleCompute = cast<spatial::SpatWeightedCompute>(computeOp);
  auto staleOperandCount = staleCompute->getNumOperands();

  auto yieldOp =
      cast<spatial::SpatYieldOp>(staleCompute.getBody().front().getTerminator());

  // No result was added to the yield: the op maps to itself.
  if (yieldOp.getNumOperands() == staleCompute->getNumResults()) {
    opToReplacedCompute[staleCompute.getOperation()] = staleCompute;
    return;
  }

  // The new result types are dictated by the yield operands.
  auto resultTypes = yieldOp.getOperandTypes();

  // Create the replacement op: new result types, same weights and inputs.
  rewriter.setInsertionPoint(staleCompute);
  auto freshCompute = rewriter.create<spatial::SpatWeightedCompute>(
      staleCompute->getLoc(), resultTypes, staleCompute.getWeights(),
      staleCompute.getInputs());

  // Move the region over rather than cloning it.
  freshCompute.getBody().takeBody(staleCompute.getBody());

  auto freshOperandCount = freshCompute->getNumOperands();
  assert(staleOperandCount == freshOperandCount);

  // Redirect the uses of each pre-existing result to the replacement op,
  // leaving alone the uses owned by compute ops that were replaced earlier.
  for (size_t idx = 0; idx < staleCompute.getNumResults(); ++idx) {
    Value staleResult = staleCompute.getResult(idx);
    Value freshResult = freshCompute.getResult(idx);
    rewriter.replaceAllUsesExcept(staleResult, freshResult, oldComputeOpsReplaced);
  }

  // Record the replacement, then drop the old op.
  opToReplacedCompute[staleCompute.getOperation()] = freshCompute;
  oldComputeOpsReplaced.insert(staleCompute.getOperation());
  rewriter.setInsertionPoint(staleCompute);
  rewriter.eraseOp(staleCompute);
}
|
||||
|
||||
/**
 * Resolve every tile reference to its Value and forward the result to the
 * free function `onnx_mlir::createImgConcatOp`.
 *
 * @param outputTiles Tile references indexed as [channelTile][x][y]. The
 *        extents are read from `outputTiles[0]` and `outputTiles[0][0]`, so
 *        the container is expected to be non-empty and rectangular.
 * @param loc         Location forwarded to the free function.
 * @param outputType  Type forwarded to the free function.
 */
Value SpatialReducer::createImgConcatOp(
    SmallVector<SmallVector<SmallVector<OpAndResNum>>> &outputTiles,
    Location &loc, Type outputType) {

  assert(reducesFinalized &&
         "Cannot create ImgConcatOp before finalizing the reduce updates.");

  // outputTiles are indexed like this: [channelTile][x][y]
  const auto tilesCount = outputTiles.size();
  const auto width = outputTiles[0].size();
  const auto height = outputTiles[0][0].size();

  SmallVector<SmallVector<SmallVector<Value>>> resolvedTiles;
  resolvedTiles.reserve(tilesCount);

  for (size_t t = 0; t < tilesCount; t++) {
    SmallVector<SmallVector<Value>> plane;
    plane.reserve(width);
    for (size_t x = 0; x < width; x++) {
      SmallVector<Value> column;
      column.reserve(height);
      for (size_t y = 0; y < height; y++) {
        column.push_back(resolveValueFromOpAndResNum(outputTiles[t][x][y]));
      }
      plane.push_back(column);
    }
    resolvedTiles.push_back(plane);
  }

  return ::onnx_mlir::createImgConcatOp(
      resolvedTiles, rewriter, loc, outputType);
}
|
||||
|
||||
/**
 * Sum the given compute results with `SpatVAddOp` and, when a map operation
 * is requested, post-process the sum with (optional) bias addition followed
 * by the map operation.
 *
 * NOTE(review): the bias is only added as part of the post-processing, i.e.
 * when `mapOp != MapOperations::None`. With `MapOperations::None` a non-null
 * `biasTile` is not used here - confirm that callers add it elsewhere.
 *
 * @param computeOps Results to reduce (see `applyReducePattern`).
 * @param rewriter   Rewriter used to create the add operations and passed to
 *                   `createMapOperation`.
 * @param biasTile   Optional bias (may be null).
 * @param mapOp      Map operation applied to the final value.
 */
OpAndResNum SpatialReducer::applyAddMapReduction(
    SmallVector<ComputeAndResNum> &computeOps,
    ConversionPatternRewriter &rewriter, Value biasTile, MapOperations mapOp) {

  // Element-wise addition used as the reduction step.
  auto addValues = [&](Value a, Value b) {
    return rewriter.create<spatial::SpatVAddOp>(a.getLoc(), a.getType(), a, b);
  };

  // Left null unless a map operation has to be applied.
  std::function<Value(const Value &)> postprocessing = nullptr;
  if (mapOp != MapOperations::None) {
    postprocessing = [&](const Value a) {
      Value mapOperand = a;
      if (biasTile) {
        mapOperand = rewriter.create<spatial::SpatVAddOp>(
            a.getLoc(), a.getType(), a, biasTile);
      }
      return createMapOperation(rewriter, mapOp, mapOperand);
    };
  }

  return this->applyReducePattern(
      computeOps, addValues, /* preprocess = */ nullptr, postprocessing);
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
83
src/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp
Normal file
83
src/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp
Normal file
@@ -0,0 +1,83 @@
|
||||
#pragma once
|
||||
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
|
||||
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
||||
#include "llvm/ADT/SmallPtrSet.h"
|
||||
#include "llvm/Support/Casting.h"
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
using ResNum = unsigned int;
|
||||
|
||||
using ComputeAndResNum = std::pair<spatial::SpatWeightedCompute, ResNum>;
|
||||
|
||||
struct SpatialReducerChange {
|
||||
Operation *fromOp;
|
||||
unsigned int fromOpResNum;
|
||||
Operation *toOp;
|
||||
unsigned int toOpOperandNum;
|
||||
};
|
||||
|
||||
using OpAndResNum = std::pair<Operation *, ResNum>;
|
||||
|
||||
class SpatialReducer {
|
||||
|
||||
public:
|
||||
SpatialReducer(ConversionPatternRewriter &rewriter) : rewriter(rewriter) {}
|
||||
|
||||
OpAndResNum applyReducePattern(
|
||||
SmallVector<ComputeAndResNum> &computeOpsAndResNum,
|
||||
std::function<Value(const Value &, const Value &)> reduce,
|
||||
std::function<Value(const Value &)> preprocess,
|
||||
std::function<Value(const Value &)> postprocess);
|
||||
|
||||
OpAndResNum applyAddMapReduction(SmallVector<ComputeAndResNum> &computeOps,
|
||||
ConversionPatternRewriter &rewriter, Value biasTile, MapOperations mapOp);
|
||||
|
||||
void finalizeReduceUpdates();
|
||||
|
||||
~SpatialReducer() {
|
||||
if (!reducesFinalized) {
|
||||
finalizeReduceUpdates();
|
||||
}
|
||||
}
|
||||
|
||||
Value createImgConcatOp(
|
||||
llvm::SmallVector<llvm::SmallVector<llvm::SmallVector<OpAndResNum>>>
|
||||
&outputTiles,
|
||||
Location &loc, Type outputType);
|
||||
|
||||
Value resolveValueFromOpAndResNum(OpAndResNum &opAndResNum);
|
||||
|
||||
private:
|
||||
[[nodiscard("computeOp result number gets updated")]] ResNum
|
||||
applyResultProcessing(ComputeAndResNum computeOpAndResNum,
|
||||
std::function<Value(const Value &)> processFun,
|
||||
ConversionPatternRewriter &rewriter);
|
||||
|
||||
/**
|
||||
* @brief Update the results of a ComputeOp.
|
||||
*
|
||||
* This function updates the results of a ComputeOp by taking a look at the
|
||||
operands of its yieldOp.
|
||||
* If the ComputeOp was replaced, it updates `opToReplacedCompute` with the
|
||||
replaced ComputeOp.
|
||||
*
|
||||
* @param computeOp The ComputeOp to update the results of.
|
||||
*/
|
||||
void updateResultsOfCompute(Operation *computeOp);
|
||||
|
||||
ConversionPatternRewriter &rewriter;
|
||||
bool reducesFinalized = false;
|
||||
|
||||
// List of changes to be applied after the reduction is finalized
|
||||
SmallVector<SpatialReducerChange, 4> reducerChanges;
|
||||
// List of computeOps that need to be replaced with new results
|
||||
SmallVector<spatial::SpatWeightedCompute> computeOpNeedingResUpdate;
|
||||
|
||||
std::unordered_map<Operation *, spatial::SpatWeightedCompute> opToReplacedCompute;
|
||||
|
||||
static llvm::SmallPtrSet<Operation *, 16> oldComputeOpsReplaced;
|
||||
};
|
||||
|
||||
} // namespace onnx_mlir
|
||||
53
src/PIM/Conversion/ONNXToSpatial/Utils/WeightSubdivider.cpp
Normal file
53
src/PIM/Conversion/ONNXToSpatial/Utils/WeightSubdivider.cpp
Normal file
@@ -0,0 +1,53 @@
|
||||
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/WeightSubdivider.hpp"
|
||||
#include <cassert>
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
/// Takes ownership of the weights, indexed as [inputTile][outputTile].
WeightSubdivider::WeightSubdivider(map<long, map<long, SmallVector<Value>>> weights)
    : weights(std::move(weights)) {
}
|
||||
|
||||
/// True once every input tile has been removed from the map.
bool WeightSubdivider::isEmpty() const {
  return weights.size() == 0;
}
|
||||
|
||||
/**
 * Extract up to `amount` weights from the first (input tile, output tile)
 * entry of the map.
 *
 * The extracted weights are removed from the entry; an entry that becomes
 * empty is erased, and so is an input tile left without output tiles.
 * `crossbarsUsed` is advanced by the number of extracted weights and its
 * previous value is reported as `startingCrossbarIndex`.
 *
 * The weight map must not be empty.
 */
TaggedWeights WeightSubdivider::popGroup(size_t amount) {
  assert(!weights.empty() && "No weights to extract.");

  auto inputEntry = weights.begin();
  auto outputEntry = inputEntry->second.begin();

  const long inputTile = inputEntry->first;
  const long outputTile = outputEntry->first;
  SmallVector<Value> &pending = outputEntry->second;

  const size_t taken = std::min(amount, pending.size());
  const size_t startingIndex = crossbarsUsed;
  crossbarsUsed += taken;

  SmallVector<Value> extracted(pending.begin(), pending.begin() + taken);

  if (taken < pending.size()) {
    // Part of the entry remains: only drop what was extracted.
    pending.erase(pending.begin(), pending.begin() + taken);
  } else {
    // Entry exhausted: remove it and, if needed, its now-empty input tile.
    inputEntry->second.erase(outputEntry);
    if (inputEntry->second.empty()) {
      weights.erase(inputEntry);
    }
  }

  return {inputTile, outputTile, startingIndex, extracted};
}
|
||||
|
||||
/**
 * Extract groups of weights until `n` weights have been collected or no
 * weights are left.
 *
 * The crossbar counter is reset first, so the `startingCrossbarIndex` of the
 * returned groups starts from 0 on every call.
 */
SmallVector<TaggedWeights> WeightSubdivider::popGroups(size_t n) {
  crossbarsUsed = 0;

  SmallVector<TaggedWeights> groups;
  for (size_t budget = n; budget > 0 && !weights.empty();) {
    TaggedWeights group = popGroup(budget);
    budget -= group.weights.size();
    groups.push_back(group);
  }

  return groups;
}
|
||||
|
||||
} // namespace onnx_mlir
|
||||
46
src/PIM/Conversion/ONNXToSpatial/Utils/WeightSubdivider.hpp
Normal file
46
src/PIM/Conversion/ONNXToSpatial/Utils/WeightSubdivider.hpp
Normal file
@@ -0,0 +1,46 @@
|
||||
#pragma once
|
||||
|
||||
#include "mlir/IR/Value.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include <map>
|
||||
|
||||
using namespace mlir;
|
||||
using namespace std;
|
||||
|
||||
namespace onnx_mlir {
|
||||
|
||||
/**
|
||||
* @brief A helper struct to store a group of weights.
|
||||
*
|
||||
*/
|
||||
struct TaggedWeights {
|
||||
long inputTile;
|
||||
long outputTile;
|
||||
size_t startingCrossbarIndex;
|
||||
SmallVector<Value> weights;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief A helper class to subdivide weights into groups.
|
||||
*
|
||||
* Weights are stored as a map of maps of SmallVectors. The outer map is indexed
|
||||
* by input tile, the inner map is indexed by output tile, and the SmallVector
|
||||
* contains the weights for the filter. This class allows us to extract groups
|
||||
* of weights from the map until we've extracted a certain number of elements,
|
||||
* namely as many as we need to fill a compute unit.
|
||||
*/
|
||||
class WeightSubdivider {
|
||||
private:
|
||||
map<long, map<long, SmallVector<Value>>> weights;
|
||||
size_t crossbarsUsed = 0;
|
||||
|
||||
TaggedWeights popGroup(size_t amount);
|
||||
|
||||
public:
|
||||
WeightSubdivider(map<long, map<long, SmallVector<Value>>> weights);
|
||||
|
||||
bool isEmpty() const;
|
||||
SmallVector<TaggedWeights> popGroups(size_t n);
|
||||
};
|
||||
|
||||
} // namespace onnx_mlir
|
||||
Reference in New Issue
Block a user