//===----------------------------------------------------------------------===//
// Pooling tiling patterns: lower ONNX pooling ops (MaxPool / AveragePool)
// onto tiled spatial ComputeOps connected through spat.channel send/receive
// pairs.
//
// NOTE(review): this file was recovered from a copy in which every
// angle-bracketed span (template argument lists, <...> includes) had been
// stripped. The reconstructed template arguments are marked where they are
// inferred rather than certain.
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Tosa/IR/TosaOps.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypeInterfaces.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/Value.h"
#include "mlir/IR/ValueRange.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

// NOTE(review): the three system includes were stripped in the mangled copy;
// these are the headers the code below actually uses — confirm against the
// original file.
#include <cstddef>
#include <functional>
#include <unordered_map>

#include "src/Accelerators/PIM/Common/PimCommon.hpp"
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/SpatialReducer.hpp"
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
#include "src/Dialect/ONNX/ONNXOps.hpp"

using namespace mlir;

namespace onnx_mlir {

/// Reduce `valuesToReduce` to a single Value by repeatedly applying the
/// binary `reduce` callback.
///
/// Values produced inside the same ComputeOp are folded together in place
/// first; the remaining values (one per ComputeOp) are then combined pairwise
/// in a logarithmic tree, moving data between ComputeOps through
/// spat.channel send/receive pairs.
///
/// \param valuesToReduce values to combine; modified in place while reducing.
/// \param rewriter       rewriter used to create all new operations.
/// \param reduce         binary reduction; builds and returns the reduced op.
/// \param preprocess     optional per-value transform applied before reducing.
/// \param postprocess    optional transform applied to the final value.
/// \return the single reduced Value.
Value applyReducePatternNew(SmallVector<Value> &valuesToReduce,
    ConversionPatternRewriter &rewriter,
    std::function<Value(Value, Value)> reduce,
    std::function<Value(Value)> preprocess,
    std::function<Value(Value)> postprocess) {
  // Simple case: if we have only one input, just return it.
  if (valuesToReduce.size() == 1)
    return valuesToReduce[0];

  if (preprocess) {
    for (auto &valToReduce : valuesToReduce) {
      rewriter.setInsertionPointAfterValue(valToReduce);
      valToReduce = preprocess(valToReduce);
    }
  }

  // It is possible that `valuesToReduce` contains two entries for the same
  // computeOp. In this case, we need to apply the reduction within-compute.
  // Keep a map between a computeOp and the last Value for this reduction.
  // NOTE: getParentBlock()->getParentOp() works both for op results and for
  // block arguments.
  std::unordered_map<Operation *, Value> lastValueForCompute;
  for (auto &valToReduce : valuesToReduce) {
    Operation *computeOp = valToReduce.getParentBlock()->getParentOp();
    assert(isa<spatial::SpatWeightedCompute>(computeOp) &&
           "Expected a ComputeOp");
    auto it = lastValueForCompute.find(computeOp);
    if (it != lastValueForCompute.end()) {
      // Already saw this computeOp: fold the two values together inside it.
      // Insert after whichever value is defined later so that both operands
      // dominate the reduction.
      Value lastWithinComputeValue = it->second;
      if (valToReduce.getDefiningOp()->isBeforeInBlock(
              lastWithinComputeValue.getDefiningOp()))
        rewriter.setInsertionPointAfterValue(lastWithinComputeValue);
      else
        rewriter.setInsertionPointAfterValue(valToReduce);
      valToReduce = reduce(lastWithinComputeValue, valToReduce);
    }
    lastValueForCompute[computeOp] = valToReduce;
  }

  // Now, reconstruct from the map the valuesToReduce list.
  valuesToReduce.clear();
  valuesToReduce.reserve(lastValueForCompute.size());
  for (auto &entry : lastValueForCompute)
    valuesToReduce.push_back(entry.second);

  Location loc = valuesToReduce[0].getLoc();
  auto channelType = spatial::SpatChannelType::get(rewriter.getContext());

  // Recursive algorithm to reduce the inputs to a single one:
  // - Take two inputs at a time, and reduce them into a single one, updating
  //   the valuesToReduce list which becomes half the size.
  // - Repeat until there is only one input left.
  llvm::OwningArrayRef<Value> valuesToReduceRef(valuesToReduce);
  while (valuesToReduceRef.size() > 1) {
    SmallVector<Value> nextValuesToReduce;
    // Round up: an odd trailing element is carried over unchanged.
    nextValuesToReduce.reserve((valuesToReduceRef.size() + 1) / 2);
    for (size_t i = 0; i + 1 < valuesToReduceRef.size(); i += 2) {
      auto firstValue = valuesToReduceRef[i];
      auto secondValue = valuesToReduceRef[i + 1];
      auto firstCompute = firstValue.getParentBlock()->getParentOp();
      auto secondCompute = secondValue.getParentBlock()->getParentOp();
      assert(isa<spatial::SpatWeightedCompute>(firstCompute));
      assert(isa<spatial::SpatWeightedCompute>(secondCompute));
      // Always send from the textually-earlier compute to the later one.
      if (secondCompute->isBeforeInBlock(firstCompute)) {
        std::swap(firstValue, secondValue);
        std::swap(firstCompute, secondCompute);
      }
      // 1. Add a channel before the first computeOp.
      rewriter.setInsertionPoint(firstCompute);
      auto channel =
          spatial::SpatChannelNewOp::create(rewriter, loc, channelType);
      // 2. Add a sendOp after the first value.
      rewriter.setInsertionPointAfterValue(firstValue);
      spatial::SpatChannelSendOp::create(rewriter, loc, channel, firstValue);
      // 3. Add a receiveOp after the second value.
      rewriter.setInsertionPointAfterValue(secondValue);
      auto receivedValue = spatial::SpatChannelReceiveOp::create(
          rewriter, loc, secondValue.getType(), channel);
      // 4. Apply reduction between second value and received value.
      rewriter.setInsertionPointAfterValue(receivedValue);
      Value reduced = reduce(receivedValue, secondValue);
      nextValuesToReduce.push_back(reduced);
    }
    // If we have an odd number of inputs, we need to add the last one to the
    // newInputs list.
    if (valuesToReduceRef.size() % 2 == 1)
      nextValuesToReduce.push_back(valuesToReduceRef.back());
    // Replace the inputOps list with the new one.
    valuesToReduceRef =
        llvm::OwningArrayRef<Value>(std::move(nextValuesToReduce));
  }
  assert(valuesToReduceRef.size() == 1 &&
         "Internal error: expected a single input at this point.");

  auto finalValue = valuesToReduceRef[0];
  if (postprocess) {
    rewriter.setInsertionPointAfterValue(finalValue);
    finalValue = postprocess(finalValue);
  }
  return finalValue;
}

/// Whether `PoolOp` requires a post-processing step after the reduction tree.
/// Only AveragePool does (division by the window size).
template <typename PoolOp>
bool hasPostProcessPoolingWindow() {
  return false;
}
template <>
bool hasPostProcessPoolingWindow<ONNXAveragePoolOp>() {
  return true;
}

/// Post-process the reduced pooling-window value. Default: nothing to do.
template <typename PoolOp>
Value postProcessPoolingWindow(ConversionPatternRewriter &rewriter,
    Location loc, PoolOp poolOp, Value valueToDivide, size_t krn_size,
    size_t tilesSkippedByPadding) {
  return nullptr;
}

/// AveragePool post-processing: divide the accumulated window sum by the
/// divisor — the full kernel size when `count_include_pad` is set, otherwise
/// the kernel size minus the padding positions that were skipped.
template <>
Value postProcessPoolingWindow<ONNXAveragePoolOp>(
    ConversionPatternRewriter &rewriter, Location loc, ONNXAveragePoolOp poolOp,
    Value valueToDivide, size_t krn_size, size_t tilesSkippedByPadding) {
  bool countIncludePad = poolOp.getCountIncludePad() == 1;
  size_t divisorNumber =
      countIncludePad ? krn_size : krn_size - tilesSkippedByPadding;
  RankedTensorType scalarTensor =
      RankedTensorType::get({1}, rewriter.getF32Type());
  // Put a spat.const before the computeOp, and use its value. We do this to be
  // compatible with the current code generation, which assumes constants to be
  // loaded in global memory, which is allocated by adding a spat.const op
  // directly under func.func (i.e. alongside ComputeOps).
  auto computeOp = cast<spatial::SpatWeightedCompute>(
      valueToDivide.getDefiningOp()->getParentOp());
  rewriter.setInsertionPoint(computeOp);
  auto divisorValue = spatial::SpatConstantOp::create(rewriter, loc,
      scalarTensor, rewriter.getI64IntegerAttr(divisorNumber),
      /* should_allocate = */ rewriter.getBoolAttr(true));
  rewriter.setInsertionPointAfterValue(valueToDivide);
  return spatial::SpatVSDivOp::create(rewriter, loc, valueToDivide.getType(),
      valueToDivide, divisorValue);
}

/// Conversion pattern lowering an ONNX pooling op to tiled spatial compute.
///
/// \tparam PoolOp        the ONNX pooling op being lowered.
/// \tparam PoolOpAdaptor the op's conversion adaptor type.
/// \tparam ReduceOp      the spatial element-wise op that combines two window
///                       tiles (e.g. vector-max for MaxPool, vector-add for
///                       AveragePool).
template <typename PoolOp, typename PoolOpAdaptor, typename ReduceOp>
struct PoolingBaseConverter : public OpConversionPattern<PoolOp> {
  PoolingBaseConverter(MLIRContext *ctx) : OpConversionPattern<PoolOp>(ctx) {}

  LogicalResult matchAndRewrite(PoolOp poolOp, PoolOpAdaptor adaptor,
      ConversionPatternRewriter &rewriter) const final {
    Value X = adaptor.getX();
    ShapedType xShape = mlir::cast<ShapedType>(X.getType());
    Value Y = poolOp.getResult();
    ShapedType yShape = mlir::cast<ShapedType>(Y.getType());

    // Unpack the pooling hyper-parameters (2-D pooling only).
    size_t stride_x, stride_y, dilation_x, dilation_y, krn_w, krn_h;
    unpackOptionalPairVector(adaptor.getStrides(), stride_x, stride_y);
    unpackOptionalPairVector(adaptor.getDilations(), dilation_x, dilation_y);
    unpackOptionalPairVector(adaptor.getKernelShape(), krn_w, krn_h);

    if (adaptor.getAutoPad() != "NOTSET")
      return rewriter.notifyMatchFailure(
          poolOp, "auto_pad != NOTSET is deprecated.");

    size_t pad_x, pad_y;
    auto padUnpackError =
        unpackOptionalPadsVector(adaptor.getPads(), pad_x, pad_y);
    if (padUnpackError.has_value())
      return rewriter.notifyMatchFailure(poolOp, padUnpackError.value());

    Location loc = poolOp.getLoc();
    size_t input_h = getImageHeight(xShape);
    size_t input_w = getImageWidth(xShape);
    size_t output_h = getImageHeight(yShape);
    size_t output_w = getImageWidth(yShape);
    // Channels are split into crossbar-sized tiles; the last tile may be
    // partial (channelTileRest).
    size_t channelTileCount =
        ceilIntegerDivide(getImageChannel(xShape), crossbarSize.getValue());
    size_t channelTileRest = getImageChannel(xShape) % crossbarSize;

    // 1: Tile the input tensor
    // Input tiles need to be indexed by:
    //   a. Channel Tile
    //   b. Pixel `x` position
    //   c. Pixel `y` position
    // For example: inputTiles[channelTile][x][y]
    // Example complete input tensor: tensor<1x3x12x12xf32> (NxCxWxH)
    // Suppose that the input tensor is produced by concatenating the results
    // of many ComputeOps. Get the result tiles from these ComputeOps.
    SmallVector<SmallVector<SmallVector<Value>>> inputTiles(channelTileCount,
        SmallVector<SmallVector<Value>>(input_w, SmallVector<Value>(input_h)));
    auto resolveErrorOpt = resolveImgInputTiles(X, inputTiles, channelTileCount,
        channelTileRest, input_w, input_h, rewriter);
    if (resolveErrorOpt.has_value())
      return rewriter.notifyMatchFailure(poolOp, *resolveErrorOpt);

    // TODO: This requires a core for each input tile, which is not ideal. We
    // can do better.
    // If some input tiles come from the func.func operands, load them into a
    // computeOp and yield them.
    for (size_t t = 0; t < channelTileCount; t++) {
      for (size_t x = 0; x < input_w; x++) {
        for (size_t y = 0; y < input_h; y++) {
          if (auto extractSliceOp =
                  inputTiles[t][x][y].getDefiningOp<tensor::ExtractSliceOp>()) {
            Location tileLoc = extractSliceOp.getLoc();
            // Wrap the raw slice in a pass-through ComputeOp that just yields
            // its argument, so downstream logic sees a uniform producer.
            auto tempComputeOp = spatial::SpatWeightedCompute::create(rewriter,
                tileLoc, extractSliceOp.getResultType(),
                /* xbarWeights =*/ValueRange(), extractSliceOp.getResult());
            Block *tempComputeOpBlock = new Block();
            tempComputeOp.getBody().push_back(tempComputeOpBlock);
            auto tempComputeOpBlockArg = tempComputeOpBlock->addArgument(
                extractSliceOp.getType(), tileLoc);
            rewriter.setInsertionPointToStart(tempComputeOpBlock);
            spatial::SpatYieldOp::create(
                rewriter, tileLoc, tempComputeOpBlockArg);
            rewriter.setInsertionPointAfter(tempComputeOp);
            inputTiles[t][x][y] = tempComputeOp.getResult(0);
          }
        }
      }
    }

    // 2: Tile the output tensor
    // Output tiles need to be indexed by:
    //   a. Channel Tile
    //   b. Pixel `x` position
    //   c. Pixel `y` position
    // For example: outputTiles[channelTile][x][y]
    // Example complete output tensor: tensor<1x3x6x6xf32> (NxCxWxH)
    SmallVector<SmallVector<SmallVector<Value>>> outputTiles(channelTileCount,
        SmallVector<SmallVector<Value>>(
            output_w, SmallVector<Value>(output_h, nullptr)));

    // List of values to pool for each output pixel.
    SmallVector<Value> valuesToPool;
    // Iterate each output tile.
    for (size_t outTile = 0; outTile < channelTileCount; outTile++) {
      // Iterate each output pixel.
      for (size_t outX = 0; outX < output_w; outX++) {
        for (size_t outY = 0; outY < output_h; outY++) {
          // Each output pixel tile is computed by pooling a window of input
          // pixel tiles.
          valuesToPool.clear();
          size_t tilesSkippedByPadding = 0;
          auto [start_x, end_x] = kernel_get_start_and_end(
              outX, input_w, krn_w, stride_x, dilation_x, pad_x);
          auto [start_y, end_y] = kernel_get_start_and_end(
              outY, input_h, krn_h, stride_y, dilation_y, pad_y);
          for (size_t inX = start_x; inX < end_x; inX += dilation_x) {
            for (size_t inY = start_y; inY < end_y; inY += dilation_y) {
              if (failed(verifyWithinBoundsAndPaddings(
                      input_w, input_h, inX, inY, pad_x, pad_y))) {
                // Window positions that fall on padding are counted so
                // AveragePool can exclude them from the divisor.
                tilesSkippedByPadding++;
                continue;
              }
              Value inputTile = inputTiles[outTile][inX][inY];
              Value valueToPool;
              if (auto computeProducer =
                      inputTile.getDefiningOp<spatial::SpatWeightedCompute>()) {
                // Pool on the value yielded inside the ComputeOp, not on the
                // ComputeOp result itself.
                int resultNumber = getResultIndex(computeProducer, inputTile);
                auto yieldInComputeOp = cast<spatial::SpatYieldOp>(
                    computeProducer.getBody().front().getTerminator());
                valueToPool = yieldInComputeOp.getOperand(resultNumber);
              } else if (auto receiveProducer =
                             inputTile.getDefiningOp<
                                 spatial::SpatChannelReceiveOp>()) {
                // Follow the channel back to the sender and pool its payload.
                auto sendOpOpt =
                    getOtherEndOfChannel(receiveProducer, true, rewriter);
                if (failed(sendOpOpt)) {
                  return rewriter.notifyMatchFailure(poolOp,
                      "ChannelReceiveOp does not have a matching "
                      "ChannelSendOp.");
                }
                auto sendOp = cast<spatial::SpatChannelSendOp>(*sendOpOpt);
                valueToPool = sendOp.getData();
              } else {
                return rewriter.notifyMatchFailure(poolOp,
                    "Input tile for Pooling is not produced by a "
                    "WeightedComputeOp nor a receiveOp");
              }
              valuesToPool.push_back(valueToPool);
            }
          }
          assert(valuesToPool.size() != 0 &&
                 "Pooling computed on zero tiles make no sense.");

          // Optional post-processing (AveragePool division) applied to the
          // final reduced value.
          std::function<Value(Value)> postProcessFn = nullptr;
          if (hasPostProcessPoolingWindow<PoolOp>()) {
            postProcessFn = [&](const Value prevFinalRes) {
              return postProcessPoolingWindow<PoolOp>(rewriter, loc, poolOp,
                  prevFinalRes, krn_h * krn_w, tilesSkippedByPadding);
            };
          }
          Value reducedWithinCompute = applyReducePatternNew(
              valuesToPool, rewriter,
              [&](const Value lhs, const Value rhs) {
                return ReduceOp::create(rewriter, loc, lhs.getType(), lhs, rhs);
              },
              nullptr, postProcessFn);

          // Send this value through a channel, and receive it in the
          // `func.func`. During lowering, we will need to "move it" into the
          // users computeOps.
          auto computeOpOfReduced = cast<spatial::SpatWeightedCompute>(
              reducedWithinCompute.getDefiningOp()->getParentOp());
          // Create a new channel before the computeOp.
          rewriter.setInsertionPoint(computeOpOfReduced);
          auto reduceChannel = spatial::SpatChannelNewOp::create(rewriter, loc,
              spatial::SpatChannelType::get(rewriter.getContext()));
          // Send value through the channel.
          rewriter.setInsertionPointAfterValue(reducedWithinCompute);
          spatial::SpatChannelSendOp::create(
              rewriter, loc, reduceChannel, reducedWithinCompute);
          // Receive after the computeOp.
          rewriter.setInsertionPointAfter(computeOpOfReduced);
          auto receivedValue = spatial::SpatChannelReceiveOp::create(rewriter,
              loc, reducedWithinCompute.getType(), reduceChannel);
          outputTiles[outTile][outX][outY] = receivedValue;
        }
      }
    }

    // TODO: outputTiles are not the results of the computeOps! We need to add
    // them! (The map below is currently only populated for validation; the
    // follow-up rewrite that attaches results is not implemented yet.)
    std::unordered_map<Operation *,
        SmallVector<std::tuple<size_t, size_t, size_t, Value>>>
        computeOpNeedingResults;
    // Iterate each output tile.
    for (size_t outTile = 0; outTile < channelTileCount; outTile++) {
      // Iterate each output pixel.
      for (size_t outX = 0; outX < output_w; outX++) {
        for (size_t outY = 0; outY < output_h; outY++) {
          auto outputTile = outputTiles[outTile][outX][outY];
          // Guard the defining op before dereferencing it (block arguments
          // have no defining op).
          Operation *definingOp = outputTile.getDefiningOp();
          Operation *outputTileProducer =
              definingOp ? definingOp->getParentOp() : nullptr;
          if (!outputTileProducer) {
            return rewriter.notifyMatchFailure(poolOp,
                "Output tile for Pooling is not produced by a "
                "WeightedComputeOp.");
          }
          computeOpNeedingResults[outputTileProducer].push_back(
              std::make_tuple(outTile, outX, outY, outputTile));
        }
      }
    }

    Value outputImage =
        createImgConcatOp(outputTiles, rewriter, loc, poolOp.getType());
    rewriter.replaceOp(poolOp, outputImage);
    return success();
  }
};

void populatePoolingTilingPattern(
    RewritePatternSet &patterns, MLIRContext *ctx) {
  // NOTE(review): the concrete template arguments below were reconstructed
  // (the originals were stripped). MaxPool reduces with the element-wise max
  // op; AveragePool reduces with the element-wise add op and divides in
  // postProcessPoolingWindow. Confirm the exact op names against the spatial
  // dialect.
  patterns.insert<PoolingBaseConverter<ONNXMaxPoolSingleOutOp,
      ONNXMaxPoolSingleOutOpAdaptor, spatial::SpatVVMaxOp>>(ctx);
  patterns.insert<PoolingBaseConverter<ONNXAveragePoolOp,
      ONNXAveragePoolOpAdaptor, spatial::SpatVVAddOp>>(ctx);
}

} // namespace onnx_mlir