// Implementation of onnx_mlir::SpatialReducer (declared in SpatialReducer.hpp).
#include "mlir/IR/BuiltinAttributes.h"
|
|
#include "mlir/IR/Value.h"
|
|
|
|
#include "llvm/Support/raw_ostream.h"
|
|
|
|
#include <cassert>
|
|
#include <unordered_map>
|
|
#include <utility>
|
|
|
|
#include "SpatialReducer.hpp"
|
|
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
|
|
|
|
// Element 0 of a <compute op, result number> pair/tuple: the compute op.
// Expands to std::get, so the result is usable as an lvalue.
#define GET_COMP(computeOpAndResNum) std::get<0>(computeOpAndResNum)
|
|
// Element 1 of a <compute op, result number> pair/tuple: the index of the
// value among the operands of the compute op's yield terminator.
// Expands to std::get, so it can be assigned to (see applyReducePattern).
#define GET_RES_NUM(computeOpAndResNum) std::get<1>(computeOpAndResNum)
|
|
|
|
namespace onnx_mlir {
|
|
|
|
// Definition of the static set of compute ops that updateResultsOfCompute has
// already replaced and erased. It is passed as the exception set when
// redirecting result uses, so uses owned by those old ops are left untouched.
// NOTE(review): being static, the set is shared by every SpatialReducer
// instance and is never cleared in this file - confirm that is intended.
llvm::SmallPtrSet<mlir::Operation*, 16> onnx_mlir::SpatialReducer::oldComputeOpsReplaced;
|
|
|
|
/// Runs `processFun` on one value yielded by a compute op and makes the
/// outcome available as a yield operand.
///
/// The rewriter's insertion point is moved right after the selected yielded
/// value before `processFun` is invoked (and is not restored afterwards).
///
/// @param computeOpAndResNum Compute op and index of the yield operand to process.
/// @param processFun Callback producing the processed value; must be callable.
/// @param rewriter Rewriter whose insertion point is positioned for the callback.
/// @return The yield-operand index of the processed value: the original index
///         when the callback hands back the very same value, otherwise the
///         index of the operand newly appended to the yield.
ResNum SpatialReducer::applyResultProcessing(ComputeAndResNum computeOpAndResNum,
                                             std::function<mlir::Value(const mlir::Value&)> processFun,
                                             mlir::ConversionPatternRewriter& rewriter) {
  assert(processFun);

  auto [computeOp, resultNum] = computeOpAndResNum;

  auto terminator = mlir::cast<spatial::SpatYieldOp>(computeOp.getBody().front().getTerminator());
  mlir::Value yielded = terminator->getOperand(resultNum);

  rewriter.setInsertionPointAfterValue(yielded);
  mlir::Value processed = processFun(yielded);

  // The callback may return its input unchanged while only emitting side
  // operations (e.g. softmax broadcasting the value through a channel); the
  // existing yield operand is then still the right one to reference.
  if (processed == yielded)
    return resultNum;

  // Otherwise expose the processed value as an additional yield operand; its
  // index is the operand count before the insertion.
  const unsigned appendedIndex = terminator->getNumOperands();
  terminator->insertOperands(appendedIndex, processed);
  return appendedIndex;
}
|
|
|
|
/// Reduces a list of <compute op, yield-operand index> entries to a single one.
///
/// Steps:
///  1. If `preprocess` is set, it is applied to every entry (through
///     `applyResultProcessing`) and the entry's result number is updated.
///  2. Entries referring to the same compute op are combined inside that
///     compute op with `reduce`, leaving one value per compute op.
///  3. The remaining entries are combined pairwise, round after round: the
///     value of the earlier compute op becomes a new block argument of the
///     later one, `reduce` is emitted right before the later op's yield, and
///     the reduced value is appended to that yield.
///  4. If `postprocess` is set, it is applied to the final entry.
///
/// The compute ops themselves are not rebuilt here: the needed result/operand
/// changes are recorded in `reducerChanges` and `computeOpNeedingResUpdate`,
/// which `finalizeReduceUpdates` consumes.
///
/// @param computeOpsAndResNum Entries to reduce; must not be empty. On return
///        it holds one entry per distinct compute op (the state after step 2),
///        in the order in which the compute ops first appeared in the input.
/// @param reduce Builds the combination of two values at the rewriter's
///        current insertion point.
/// @param preprocess Optional processing applied to every entry before reducing.
/// @param postprocess Optional processing applied to the final value.
/// @return The operation and yield-operand index holding the final value.
OpAndResNum
SpatialReducer::applyReducePattern(llvm::SmallVector<ComputeAndResNum>& computeOpsAndResNum,
                                   std::function<mlir::Value(const mlir::Value&, const mlir::Value&)> reduce,
                                   std::function<mlir::Value(const mlir::Value&)> preprocess,
                                   std::function<mlir::Value(const mlir::Value&)> postprocess) {
  assert(!computeOpsAndResNum.empty() && "Cannot reduce an empty list of compute results.");

  if (preprocess)
    for (auto& computeOpAndResNum : computeOpsAndResNum)
      GET_RES_NUM(computeOpAndResNum) = applyResultProcessing(computeOpAndResNum, preprocess, rewriter);

  // It is possible that `computeOpsAndResNum` contains two entries for the same
  // computeOp. In this case, we need to apply the reduction within-compute.

  // Map between a computeOp and the last Value for this reduction. The map is
  // only used for lookups: `computeOrder` remembers the order in which the
  // computeOps were first seen, so that the list rebuilt below (and therefore
  // the generated IR) does not depend on the hash order of pointer keys.
  std::unordered_map<mlir::Operation*, mlir::Value> lastValueForCompute;
  llvm::SmallVector<mlir::Operation*> computeOrder;
  for (auto& computeOpAndResNum : computeOpsAndResNum) {
    auto computeOp = GET_COMP(computeOpAndResNum);
    auto yieldOp = mlir::cast<spatial::SpatYieldOp>(computeOp.getBody().front().getTerminator());
    mlir::Value valueWithinCompute = yieldOp->getOperand(GET_RES_NUM(computeOpAndResNum));

    auto it = lastValueForCompute.find(computeOp.getOperation());
    if (it == lastValueForCompute.end()) {
      // First time we see this computeOp: just remember its value
      computeOrder.push_back(computeOp.getOperation());
      lastValueForCompute.emplace(computeOp.getOperation(), valueWithinCompute);
      continue;
    }

    // We have already seen this computeOp, apply the reduction within-compute
    mlir::Value lastWithinComputeValue = it->second;

    assert(valueWithinCompute.getDefiningOp() && lastWithinComputeValue.getDefiningOp());

    // Insert after whichever of the two values is defined last, so that both
    // dominate the reduction
    if (valueWithinCompute.getDefiningOp()->isBeforeInBlock(lastWithinComputeValue.getDefiningOp()))
      rewriter.setInsertionPointAfterValue(lastWithinComputeValue);
    else
      rewriter.setInsertionPointAfterValue(valueWithinCompute);

    it->second = reduce(lastWithinComputeValue, valueWithinCompute);
  }

  // Now, reconstruct the computeOpsAndResNum list: one entry per computeOp
  computeOpsAndResNum.clear();
  computeOpsAndResNum.reserve(computeOrder.size());
  for (mlir::Operation* op : computeOrder) {
    auto computeOp = mlir::cast<spatial::SpatWeightedCompute>(op);
    mlir::Value valueWithinCompute = lastValueForCompute.find(op)->second;

    // We check if `valueWithinCompute` is already used by the yieldOp, in that
    // case no need to add it
    auto yieldOp = mlir::cast<spatial::SpatYieldOp>(computeOp.getBody().front().getTerminator());
    bool yieldOpUseFound = false;
    for (auto& use : valueWithinCompute.getUses()) {
      if (use.getOwner() == yieldOp.getOperation()) {
        // If the value is already used by the yieldOp, we can just use it
        computeOpsAndResNum.push_back({computeOp, use.getOperandNumber()});
        yieldOpUseFound = true;
        break;
      }
    }
    if (yieldOpUseFound)
      continue;

    // If this result is not used within a yieldOp, then add it
    auto resultNum = yieldOp->getNumOperands();
    yieldOp->insertOperands(resultNum, valueWithinCompute);

    computeOpsAndResNum.push_back({computeOp, resultNum});
  }

  mlir::Location loc = GET_COMP(computeOpsAndResNum[0])->getLoc();

  // Iterative algorithm to reduce the inputs to a single one:
  //  - Take two inputs at a time, and reduce them into a single one, producing
  //    a new list which is about half the size.
  //  - Repeat until there is only one input left.
  // We work on a copy, the caller's list keeps one entry per computeOp.
  llvm::SmallVector<ComputeAndResNum> pendingComputeOps(computeOpsAndResNum.begin(), computeOpsAndResNum.end());
  while (pendingComputeOps.size() > 1) {
    llvm::SmallVector<ComputeAndResNum> nextComputeOps;
    nextComputeOps.reserve((pendingComputeOps.size() + 1) / 2);
    for (size_t i = 0; i + 1 < pendingComputeOps.size(); i += 2) {
      auto [firstCompute, firstResultNum] = pendingComputeOps[i];
      auto [secondCompute, secondResultNum] = pendingComputeOps[i + 1];

      // The value always flows from the earlier computeOp to the later one
      if (secondCompute->isBeforeInBlock(firstCompute)) {
        std::swap(firstCompute, secondCompute);
        std::swap(firstResultNum, secondResultNum);
      }

      // We do not immediately alter the computeOps results/operands, instead we
      // do it in a delayed manner, to avoid invalidating the references to the
      // computeOps (which must be replaced by a cloned ComputeOp when changing
      // the number of results)
      // See below `reducerChanges.push_back` and `finalizeReduceUpdates`

      auto yieldOpFirstCompute = mlir::cast<spatial::SpatYieldOp>(firstCompute.getBody().front().getTerminator());

      // Add a new argument to the block of the second computeOp
      mlir::Block& secondBlock = secondCompute.getBody().front();
      mlir::Value formerRes1 = secondBlock.addArgument(yieldOpFirstCompute->getOperand(firstResultNum).getType(), loc);

      // Index the matching operand will have on the second computeOp: the
      // weights come first, then one input per block argument
      auto secondComputeWeightsNum =
          secondCompute->getAttrOfType<mlir::DenseI32ArrayAttr>(secondCompute.getOperandSegmentSizesAttrName())[0];
      auto secondComputeOperandNum = secondComputeWeightsNum + secondBlock.getNumArguments() - 1;

      // Take the "former-result" from the second computeOp
      spatial::SpatYieldOp secondYield = mlir::cast<spatial::SpatYieldOp>(secondBlock.getTerminator());
      mlir::Value formerRes2 = secondYield.getOperand(secondResultNum);

      // Apply reduction operation
      rewriter.setInsertionPoint(secondYield);
      mlir::Value reduced = reduce(formerRes2, formerRes1);

      // Unfortunately, it is not possible to update the result in place,
      // because we may have already referenced it by <computeOp, resultNum>
      // outside of this function, thus replacing it would invalidate the
      // reference. Therefore, we need to append a new result to the yieldOp,
      // and then at a later stage update the computeOp accordingly.

      // Add `reduced` to the second yieldOp
      auto secondYieldOperandNum = secondYield.getNumOperands();
      secondYield->insertOperands(secondYieldOperandNum, reduced);
      secondResultNum = secondYieldOperandNum;

      // Record the delayed update. The last operation (the one which never
      // becomes a `firstCompute`) is not tracked by reducerChanges as
      // `fromOp`: it is registered in `computeOpNeedingResUpdate` below.
      reducerChanges.push_back(
          {firstCompute.getOperation(), firstResultNum, secondCompute.getOperation(), secondComputeOperandNum});
      nextComputeOps.push_back(std::make_pair(secondCompute, secondResultNum));
    }

    // If we have an odd number of inputs, the last one is carried over to the
    // next round unchanged.
    if (pendingComputeOps.size() % 2 == 1)
      nextComputeOps.push_back(pendingComputeOps.back());

    // Replace the inputs list with the new one.
    pendingComputeOps = std::move(nextComputeOps);
  }

  assert(pendingComputeOps.size() == 1 && "Internal error: expected a single input at this point.");

  auto finalComputeAndResNum = pendingComputeOps[0];

  // Force the update of the results of this computeOp, when finalizing
  computeOpNeedingResUpdate.push_back(GET_COMP(finalComputeAndResNum));

  if (postprocess)
    GET_RES_NUM(finalComputeAndResNum) = applyResultProcessing(finalComputeAndResNum, postprocess, rewriter);

  return std::make_pair(GET_COMP(finalComputeAndResNum).getOperation(), GET_RES_NUM(finalComputeAndResNum));
}
|
|
|
|
void SpatialReducer::finalizeReduceUpdates() {
|
|
assert(reducesFinalized == false && "Cannot finalize two times.");
|
|
|
|
reducesFinalized = true;
|
|
|
|
// First, add the results to the computeOps
|
|
for (auto& reduceChange : reducerChanges)
|
|
updateResultsOfCompute(reduceChange.fromOp);
|
|
|
|
for (auto& c : computeOpNeedingResUpdate)
|
|
updateResultsOfCompute(c.getOperation());
|
|
|
|
for (auto& reducerChange : this->reducerChanges) {
|
|
auto fromOp = reducerChange.fromOp;
|
|
auto toOp = reducerChange.toOp;
|
|
auto fromOpResNum = reducerChange.fromOpResNum;
|
|
auto toOpOperandNum = reducerChange.toOpOperandNum;
|
|
|
|
auto fromComputeOp = opToReplacedCompute[fromOp];
|
|
assert(fromComputeOp && "fromOp should have been mapped before!");
|
|
|
|
// toComputeOp could be the existing pointer, or we have to remap it with
|
|
// `opToReplacedCompute`
|
|
auto toComputeOp = opToReplacedCompute[toOp];
|
|
if (!toComputeOp)
|
|
toComputeOp = mlir::cast<spatial::SpatWeightedCompute>(toOp);
|
|
|
|
assert(toComputeOp != fromComputeOp && "Oops should have caught this earlier!");
|
|
|
|
assert(toComputeOp->getNumOperands() == toOpOperandNum
|
|
&& "toOpOperandNum should be the last operand of toComputeOp, are the "
|
|
"operations in the right order?");
|
|
|
|
// Add the new operand to `toComputeOp`
|
|
auto fromResult = fromComputeOp.getResult(fromOpResNum);
|
|
toComputeOp->insertOperands(toOpOperandNum, fromResult);
|
|
incrementWeightedComputeInputsSegmentSize(toComputeOp, 1);
|
|
}
|
|
}
|
|
|
|
/// Turns an <operation, result number> reference into the actual result value.
///
/// If the operation has an entry in `opToReplacedCompute`, the mapped compute
/// op is used; otherwise the referenced operation itself is used. Only valid
/// once `finalizeReduceUpdates` has run.
///
/// @param opAndResNum Operation and index of the wanted result.
/// @return Result `opAndResNum.second` of the resolved compute op.
mlir::Value SpatialReducer::resolveValueFromOpAndResNum(OpAndResNum& opAndResNum) {
  assert(reducesFinalized && "Cannot create resolve values before finalizing the reduce updates.");

  // Start from the referenced operation and switch to its replacement when one
  // has been registered.
  mlir::Operation* resolvedOp = opAndResNum.first;
  const auto replacement = opToReplacedCompute.find(opAndResNum.first);
  if (replacement != opToReplacedCompute.end())
    resolvedOp = replacement->second;

  return mlir::cast<spatial::SpatWeightedCompute>(resolvedOp).getResult(opAndResNum.second);
}
|
|
|
|
/// Makes the results of a compute op match the operands of its yield.
///
/// Ops already present in `opToReplacedCompute` are skipped. If the yield has
/// as many operands as the op has results, the op is mapped to itself.
/// Otherwise a new compute op is created just before the old one, with the
/// yield's operand types as result types and the same weights and inputs. The
/// new op takes over the body, the uses of the old results are redirected to
/// it, and the old op is erased.
///
/// @param computeOp The compute operation to bring up to date.
void SpatialReducer::updateResultsOfCompute(mlir::Operation* computeOp) {
  // Nothing to do when this operation has been handled before
  if (opToReplacedCompute.find(computeOp) != opToReplacedCompute.end())
    return;

  auto staleCompute = mlir::cast<spatial::SpatWeightedCompute>(computeOp);
  auto operandCountBefore = staleCompute->getNumOperands();
  auto terminator = mlir::cast<spatial::SpatYieldOp>(staleCompute.getBody().front().getTerminator());

  // Results already match the yield: record the op as its own replacement
  if (terminator.getNumOperands() == staleCompute->getNumResults()) {
    opToReplacedCompute[staleCompute.getOperation()] = staleCompute;
    return;
  }

  // The yield operands dictate the result types of the rebuilt op
  auto rebuiltResultTypes = terminator.getOperandTypes();

  // Rebuild the op in front of the stale one: new results, unchanged operands
  rewriter.setInsertionPoint(staleCompute);
  auto rebuiltCompute = rewriter.create<spatial::SpatWeightedCompute>(
      staleCompute->getLoc(), rebuiltResultTypes, staleCompute.getWeights(), staleCompute.getInputs());

  // Move the region over instead of cloning it
  rebuiltCompute.getBody().takeBody(staleCompute.getBody());

  auto operandCountAfter = rebuiltCompute->getNumOperands();
  assert(operandCountBefore == operandCountAfter);

  // Point the users of every pre-existing result to the rebuilt op, leaving
  // alone the uses owned by compute ops that were replaced earlier
  for (size_t resultIdx = 0; resultIdx < staleCompute.getNumResults(); ++resultIdx) {
    mlir::Value staleResult = staleCompute.getResult(resultIdx);
    mlir::Value rebuiltResult = rebuiltCompute.getResult(resultIdx);
    rewriter.replaceAllUsesExcept(staleResult, rebuiltResult, oldComputeOpsReplaced);
  }

  // Bookkeeping first, then drop the stale op
  opToReplacedCompute[staleCompute.getOperation()] = rebuiltCompute;
  oldComputeOpsReplaced.insert(staleCompute.getOperation());
  rewriter.setInsertionPoint(staleCompute);
  rewriter.eraseOp(staleCompute);
}
|
|
|
|
/// Resolves every output tile reference to its value and forwards the values
/// to the free function `::onnx_mlir::createImgConcatOp`.
///
/// Only valid once `finalizeReduceUpdates` has run. The grid extents are read
/// from `outputTiles[0]` and `outputTiles[0][0]`, so the grid must be non-empty
/// and every tile/column is accessed with those same extents.
///
/// @param outputTiles References indexed as [channelTile][x][y].
/// @param loc Location forwarded to the concat builder.
/// @param outputType Type forwarded to the concat builder.
/// @return The value returned by `::onnx_mlir::createImgConcatOp`.
mlir::Value
SpatialReducer::createImgConcatOp(llvm::SmallVector<llvm::SmallVector<llvm::SmallVector<OpAndResNum>>>& outputTiles,
                                  mlir::Location& loc,
                                  mlir::Type outputType) {
  assert(reducesFinalized && "Cannot create ImgConcatOp before finalizing the reduce updates.");

  using ValueColumn = llvm::SmallVector<mlir::Value>;
  using ValuePlane = llvm::SmallVector<ValueColumn>;

  // Extents of the [channelTile][x][y] grid, taken from its first elements
  const auto numChannelTiles = outputTiles.size();
  const auto numX = outputTiles[0].size();
  const auto numY = outputTiles[0][0].size();

  // Pre-size the grid of values, then fill it position by position
  llvm::SmallVector<ValuePlane> resolvedTiles(numChannelTiles, ValuePlane(numX, ValueColumn(numY)));

  for (size_t tile = 0; tile < numChannelTiles; ++tile) {
    for (size_t col = 0; col < numX; ++col) {
      for (size_t row = 0; row < numY; ++row) {
        resolvedTiles[tile][col][row] = resolveValueFromOpAndResNum(outputTiles[tile][col][row]);
      }
    }
  }

  return ::onnx_mlir::createImgConcatOp(resolvedTiles, rewriter, loc, outputType);
}
|
|
|
|
/// Sums the given compute results with `applyReducePattern`, using
/// `spatial::SpatVAddOp` as the reduction, and optionally post-processes the
/// sum with a bias addition followed by a map operation.
///
/// @param computeOps Entries to reduce (forwarded to `applyReducePattern`).
/// @param rewriter Rewriter used to create the add and map operations.
/// @param biasTile Optional bias; when non-null it is added to the reduced
///        value before the map operation.
/// @param mapOp Map operation applied to the final value, or
///        `MapOperations::None` for no post-processing.
/// @return The operation and result number holding the final value.
OpAndResNum SpatialReducer::applyAddMapReduction(llvm::SmallVector<ComputeAndResNum>& computeOps,
                                                 mlir::ConversionPatternRewriter& rewriter,
                                                 mlir::Value biasTile,
                                                 MapOperations mapOp) {
  // Pairwise reduction: element-wise addition typed after the left operand
  auto addPartials = [&](mlir::Value lhs, mlir::Value rhs) {
    return rewriter.create<spatial::SpatVAddOp>(lhs.getLoc(), lhs.getType(), lhs, rhs);
  };

  // NOTE(review): without a map operation no post-processing runs at all, so
  // a non-null `biasTile` is ignored in that case - confirm this is intended.
  if (mapOp == MapOperations::None)
    return this->applyReducePattern(computeOps, addPartials, /* preprocess = */ nullptr, /* postprocess = */ nullptr);

  // Post-processing of the final sum: optional bias addition, then the map
  std::function<mlir::Value(const mlir::Value&)> biasThenMap = [&](const mlir::Value reduced) {
    mlir::Value mapInput = reduced;
    if (biasTile)
      mapInput = rewriter.create<spatial::SpatVAddOp>(reduced.getLoc(), reduced.getType(), reduced, biasTile);
    return createMapOperation(rewriter, mapOp, mapInput);
  };

  return this->applyReducePattern(computeOps, addPartials, /* preprocess = */ nullptr, biasThenMap);
}
|
|
|
|
} // namespace onnx_mlir
|