// Raptor/src/PIM/Conversion/ONNXToSpatial/Patterns/NN/Softmax.cpp

#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Transforms/DialectConversion.h"

#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Common/Common.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ConversionPatterns.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/HostFoldability.hpp"
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
#include "src/Dialect/ONNX/ONNXOps.hpp"

using namespace mlir;

namespace onnx_mlir {
namespace {

/// Maps a possibly negative `axis` onto its non-negative counterpart by adding
/// `rank` to negative values. Non-negative values are returned untouched; no
/// range check is performed, so the caller must validate the result.
static int64_t normalizeAxis(int64_t axis, int64_t rank) {
  if (axis < 0)
    return axis + rank;
  return axis;
}

/// Builds the shape obtained by reordering `shape` with `permutation`:
/// entry `i` of the result is `shape[permutation[i]]`. The result has exactly
/// `permutation.size()` entries.
static SmallVector<int64_t> permuteShape(ArrayRef<int64_t> shape, ArrayRef<int64_t> permutation) {
  const size_t resultRank = permutation.size();
  SmallVector<int64_t> reordered(resultRank);
  for (size_t position = 0; position < resultRank; ++position)
    reordered[position] = shape[permutation[position]];
  return reordered;
}

/// Wraps a single `spatial::SpatSoftmaxOp` in a one-input compute op created
/// through `createSpatCompute` and returns that compute op's first result.
/// The softmax result type (and the compute result type) equals the type of
/// `input`, which must be a ranked tensor.
static Value createSoftmaxCompute(Value input, ConversionPatternRewriter& rewriter, Location loc) {
  auto tensorType = cast<RankedTensorType>(input.getType());

  // Body of the compute op: apply softmax to the block argument and yield it.
  auto emitBody = [&](Value blockArg) {
    Value normalized = spatial::SpatSoftmaxOp::create(rewriter, loc, tensorType, blockArg).getResult();
    spatial::SpatYieldOp::create(rewriter, loc, normalized);
  };

  return createSpatCompute<1>(rewriter, loc, TypeRange {tensorType}, {}, ValueRange {input}, emitBody).getResult(0);
}

/// Concatenates `inputs` along `axis` and returns the concatenated value.
///
/// The result type reuses the shape, element type and encoding of the first
/// input, with the extent along `axis` replaced by the sum of the inputs'
/// extents along that axis. When every input satisfies `isHostFoldableValue`
/// the concat is emitted directly; otherwise it is emitted inside a compute op
/// that takes all inputs as operands and yields the concatenation.
///
/// `inputs` must be non-empty and hold ranked tensors.
static Value concatValues(ValueRange inputs, int64_t axis, ConversionPatternRewriter& rewriter, Location loc) {
  auto referenceType = cast<RankedTensorType>(inputs.front().getType());

  // Accumulate the extent of the concatenated dimension over all operands.
  int64_t totalAlongAxis = 0;
  for (Value operand : inputs) {
    auto operandType = cast<RankedTensorType>(operand.getType());
    totalAlongAxis += operandType.getDimSize(axis);
  }

  SmallVector<int64_t> concatShape = llvm::to_vector(referenceType.getShape());
  concatShape[axis] = totalAlongAxis;
  auto concatType =
    RankedTensorType::get(concatShape, referenceType.getElementType(), referenceType.getEncoding());

  const bool allHostFoldable = llvm::all_of(inputs, isHostFoldableValue);
  if (!allHostFoldable) {
    // At least one operand is not host-foldable: emit the concat inside a compute op.
    auto wrapper = createSpatCompute(rewriter, loc, TypeRange {concatType}, {}, inputs, [&](ValueRange args) {
      spatial::SpatYieldOp::create(rewriter, loc, createSpatConcat(rewriter, loc, axis, args));
    });
    return wrapper.getResult(0);
  }

  return createSpatConcat(rewriter, loc, axis, inputs);
}

/// Recursively decomposes a softmax over `softmaxAxis` of `input`.
///
/// Starting at `axis`, every axis other than `softmaxAxis` is split with
/// `sliceTensor` (slice size 1), the recursion continues on each slice with the
/// next axis, and the per-slice results are concatenated back along the axis
/// that was split. Once `axis` reaches the tensor rank, the remaining value is
/// passed to `createSoftmaxCompute`.
///
/// Callers start the recursion with `axis == 0`.
static Value
buildSoftmax(Value input, int64_t softmaxAxis, int64_t axis, ConversionPatternRewriter& rewriter, Location loc) {
  const int64_t rank = cast<RankedTensorType>(input.getType()).getRank();

  // The softmax axis itself is never split: step over it.
  int64_t splitAxis = axis;
  if (splitAxis != rank && splitAxis == softmaxAxis)
    ++splitAxis;

  // All axes have been visited: emit the actual softmax.
  if (splitAxis == rank)
    return createSoftmaxCompute(input, rewriter, loc);

  SmallVector<Value> pieces = sliceTensor(input, splitAxis, /*sliceSize=*/1, rewriter, loc);
  SmallVector<Value> processedPieces;
  processedPieces.reserve(pieces.size());
  for (Value piece : pieces) {
    Value processed = buildSoftmax(piece, softmaxAxis, splitAxis + 1, rewriter, loc);
    processedPieces.push_back(processed);
  }

  return concatValues(processedPieces, splitAxis, rewriter, loc);
}

/// Conversion pattern that rewrites `ONNXSoftmaxOp` in terms of Spatial ops.
///
/// The pattern only applies to inputs that are ranked tensors with a static
/// shape and to an axis that, after normalization, lies in `[0, rank)`.
/// When the softmax axis is the last one, the lowering is produced directly by
/// `buildSoftmax`. Otherwise the softmax axis is first moved to the last
/// position with an `ONNXTransposeOp` wrapped in a compute op, `buildSoftmax`
/// runs on the transposed value, and a second wrapped transpose restores the
/// original layout.
struct SoftmaxToSpatialCompute : OpConversionPattern<ONNXSoftmaxOp> {
  using OpConversionPattern::OpConversionPattern;

  LogicalResult matchAndRewrite(ONNXSoftmaxOp softmaxOp,
                                ONNXSoftmaxOpAdaptor adaptor,
                                ConversionPatternRewriter& rewriter) const override {
    Value operand = adaptor.getInput();
    auto operandType = dyn_cast<RankedTensorType>(operand.getType());
    if (!operandType)
      return failure();
    if (!operandType.hasStaticShape())
      return failure();

    const int64_t rank = operandType.getRank();
    const int64_t softmaxAxis = normalizeAxis(softmaxOp.getAxis(), rank);
    const bool axisInRange = softmaxAxis >= 0 && softmaxAxis < rank;
    if (!axisInRange)
      return failure();

    Location loc = softmaxOp.getLoc();
    const int64_t lastAxis = rank - 1;

    // Fast path: the softmax axis is already innermost, no transposes needed.
    if (softmaxAxis == lastAxis) {
      rewriter.replaceOp(softmaxOp, buildSoftmax(operand, softmaxAxis, /*axis=*/0, rewriter, loc));
      return success();
    }

    // `toLast` keeps all other dims in order and appends the softmax axis;
    // `fromLast` is its inverse (fromLast[oldDim] == position of oldDim in toLast).
    SmallVector<int64_t> toLast;
    toLast.reserve(rank);
    SmallVector<int64_t> fromLast(rank);
    for (int64_t dim = 0; dim < rank; ++dim) {
      if (dim == softmaxAxis)
        continue;
      fromLast[dim] = static_cast<int64_t>(toLast.size());
      toLast.push_back(dim);
    }
    fromLast[softmaxAxis] = static_cast<int64_t>(toLast.size());
    toLast.push_back(softmaxAxis);

    // Emits a one-input compute op whose body transposes its argument with
    // `order` and yields the result of type `resultType`.
    auto emitTranspose = [&](Value source, RankedTensorType resultType, ArrayRef<int64_t> order) -> Value {
      auto wrapper = createSpatCompute<1>(rewriter, loc, TypeRange {resultType}, {}, source, [&](Value blockArg) {
        Value transposed =
          ONNXTransposeOp::create(rewriter, loc, resultType, blockArg, rewriter.getI64ArrayAttr(order));
        spatial::SpatYieldOp::create(rewriter, loc, transposed);
      });
      return wrapper.getResult(0);
    };

    auto movedType = RankedTensorType::get(
      permuteShape(operandType.getShape(), toLast), operandType.getElementType(), operandType.getEncoding());

    Value moved = emitTranspose(operand, movedType, toLast);
    Value normalized = buildSoftmax(moved, /*softmaxAxis=*/lastAxis, /*axis=*/0, rewriter, loc);
    Value restored = emitTranspose(normalized, operandType, fromLast);

    rewriter.replaceOp(softmaxOp, restored);
    return success();
  }
};

} // namespace

/// Registers the ONNX Softmax lowering pattern (`SoftmaxToSpatialCompute`)
/// into `patterns`, constructing it with the MLIR context `ctx`.
void populateSoftmaxPatterns(RewritePatternSet& patterns, MLIRContext* ctx) {
  patterns.add<SoftmaxToSpatialCompute>(ctx);
}

} // namespace onnx_mlir