// Raptor/src/PIM/Conversion/ONNXToSpatial/Common.hpp

#pragma once

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/IR/Block.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/ValueRange.h"
#include "mlir/Transforms/DialectConversion.h"

#include <cassert>
#include <type_traits>
#include <utility>

#include "llvm/ADT/SmallPtrSet.h"

#include "src/Accelerators/PIM/Common/PimCommon.hpp"
#include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
#include "src/Dialect/ONNX/ONNXOps.hpp"

namespace onnx_mlir {

/// Returns the size of dimension 2 of `shapedType`, which this header names the image width.
/// NOTE(review): ONNX's NCHW layout places height at index 2 and width at index 3 — confirm
/// that this width/height naming is what callers expect.
template <class ShapedType>
inline auto getImageWidth(const ShapedType& shapedType) {
  constexpr int kImageWidthDim = 2;
  return shapedType.getDimSize(kImageWidthDim);
}

/// Returns the size of dimension 3 of `shapedType`, which this header names the image height.
/// NOTE(review): ONNX's NCHW layout places height at index 2 and width at index 3 — confirm
/// that this width/height naming is what callers expect.
template <class ShapedType>
inline auto getImageHeight(const ShapedType& shapedType) {
  constexpr int kImageHeightDim = 3;
  return shapedType.getDimSize(kImageHeightDim);
}

/// Returns the size of dimension 1 of `shapedType`, which this header names the image channel count.
template <class ShapedType>
inline auto getImageChannel(const ShapedType& shapedType) {
  constexpr int kImageChannelDim = 1;
  return shapedType.getDimSize(kImageChannelDim);
}

/// Returns the size of dimension 0 of `shapedType`, which this header names the image "N".
/// Presumably the batch size of an N-leading image tensor — confirm against callers.
template <class ShapedType>
inline auto getImageN(const ShapedType& shapedType) {
  constexpr int kImageBatchDim = 0;
  return shapedType.getDimSize(kImageBatchDim);
}

/// Returns the size of dimension 2 of `shapedType`, which this header names the kernel width.
/// NOTE(review): ONNX Conv weights are laid out as (M, C/group, kH, kW), i.e. height at index 2 —
/// confirm that this width/height naming is what callers expect.
template <class ShapedType>
inline auto getKernelWidth(const ShapedType& shapedType) {
  constexpr int kKernelWidthDim = 2;
  return shapedType.getDimSize(kKernelWidthDim);
}

/// Returns the size of dimension 3 of `shapedType`, which this header names the kernel height.
/// NOTE(review): ONNX Conv weights are laid out as (M, C/group, kH, kW), i.e. width at index 3 —
/// confirm that this width/height naming is what callers expect.
template <class ShapedType>
inline auto getKernelHeight(const ShapedType& shapedType) {
  constexpr int kKernelHeightDim = 3;
  return shapedType.getDimSize(kKernelHeightDim);
}

/// Returns the size of dimension 0 of `shapedType`, which this header names the filter count.
template <class ShapedType>
inline auto getFilterCount(const ShapedType& shapedType) {
  constexpr int kFilterCountDim = 0;
  return shapedType.getDimSize(kFilterCountDim);
}

// Key type of the outer map returned by tileMatrix; presumably identifies one horizontal slice
// of the tiled matrix (verify against the implementation).
using HSliceId = size_t;
// Key type of the per-core maps returned by sliceVectorPerCrossbarPerCore and tileMatrix;
// presumably identifies one core (verify against the implementation).
using CoreId = size_t;

/// Integer division of `a` by `b`, rounded towards positive infinity.
///
/// Both operands are first converted to `C` (by default their common type), and the result is
/// returned as `C`. Unlike the shortcut `1 + (a - 1) / b`, this form is also correct when `a` is
/// zero (result 0, for signed and unsigned types alike), when an operand is negative, and when
/// `a` is the minimum value of a signed type (no `a - 1` overflow).
///
/// Note that with a mixed signed/unsigned pair the common type is usually unsigned, so a
/// negative operand is converted to a large positive value before dividing.
///
/// @param a dividend
/// @param b divisor; must be non-zero (division by zero is undefined behaviour)
/// @return ceil(a / b) computed in type `C`
template <class A, class B, class C = std::common_type_t<A, B>>
constexpr C ceilIntegerDivide(A a, B b) {
  static_assert(std::is_integral_v<A>, "A must be an integer type");
  static_assert(std::is_integral_v<B>, "B must be an integer type");
  const C dividend = static_cast<C>(a);
  const C divisor = static_cast<C>(b);
  const C quotient = dividend / divisor;
  const C remainder = dividend % divisor;
  // C++ division truncates towards zero. That already equals the ceiling when the division is
  // exact or when the true quotient is negative; the remainder has the dividend's sign, so the
  // true quotient is positive exactly when remainder and divisor share a sign.
  const bool roundUp = remainder != 0 && ((remainder > 0) == (divisor > 0));
  return roundUp ? quotient + 1 : quotient;
}

/// Returns the pair {ceilIntegerDivide(a, b), a % b}, with both operands first converted to `C`
/// (by default their common type).
///
/// The second element is the plain `%` remainder of the converted operands, not a remainder
/// relative to the rounded-up quotient.
///
/// @param a dividend
/// @param b divisor
template <class A, class B, class C = std::common_type_t<A, B>>
constexpr std::pair<C, C> ceilIntegerDivideWithRemainder(A a, B b) {
  static_assert(std::is_integral_v<A>, "A must be an integer type");
  static_assert(std::is_integral_v<B>, "B must be an integer type");
  const C dividend = static_cast<C>(a);
  const C divisor = static_cast<C>(b);
  const C roundedUpQuotient = ceilIntegerDivide(dividend, divisor);
  const C leftover = dividend % divisor;
  return std::pair<C, C>(roundedUpQuotient, leftover);
}

/// True when `shape` has exactly two dimensions and at least one of them equals 1.
template <class T>
bool isVectorShape(mlir::ArrayRef<T> shape) {
  if (shape.size() != 2)
    return false;
  return shape[0] == 1 || shape[1] == 1;
}

/// True when `shape` has exactly two dimensions; the dimension sizes are not inspected.
template <class T>
bool isMatrixShape(mlir::ArrayRef<T> shape) {
  const bool hasRankTwo = shape.size() == 2;
  return hasRankTwo;
}

/// True when `shape` has exactly two dimensions and the first one equals 1 (a 1 x N shape).
template <class T>
bool isHVectorShape(mlir::ArrayRef<T> shape) {
  if (shape.size() != 2)
    return false;
  return shape[0] == 1;
}

/// True when `shape` has exactly two dimensions and the second one equals 1 (an N x 1 shape).
template <class T>
bool isVVectorShape(mlir::ArrayRef<T> shape) {
  if (shape.size() != 2)
    return false;
  return shape[1] == 1;
}

/// Returns the length of a two-dimensional vector shape: the first dimension when it differs
/// from 1, otherwise the second dimension.
///
/// `shape` must satisfy isVectorShape; this is only checked through `assert`.
template <class T>
T getVectorLength(mlir::ArrayRef<T> shape) {
  assert(isVectorShape(shape));
  if (shape[0] != 1)
    return shape[0];
  return shape[1];
}

/// Returns the shape of `tensor`, whose type must be an mlir::RankedTensorType
/// (the type is converted with `mlir::cast`, not `dyn_cast`).
inline auto getTensorShape(mlir::Value tensor) {
  auto rankedTensorType = mlir::cast<mlir::RankedTensorType>(tensor.getType());
  return rankedTensorType.getShape();
}

/// Decides whether `value` can be traced back to an operation accepted by `hasWeightAlways`.
///
/// Only values whose type is a ranked tensor with a two-dimensional shape are considered.
/// Starting from `value`, the chain of defining operations is followed through
/// tensor.extract_slice, tensor.expand_shape, tensor.collapse_shape and ONNX Transpose,
/// each time continuing with that operation's source operand.
///
/// @return true as soon as a defining operation satisfies `hasWeightAlways`; false when the
///         type check fails, when the chain reaches a value without a defining operation,
///         when it hits any other kind of operation, or when an operation is seen twice.
inline bool isWeightLikeComputeOperand(mlir::Value value) {
  auto tensorType = mlir::dyn_cast<mlir::RankedTensorType>(value.getType());
  if (!tensorType)
    return false;
  if (!isMatrixShape(tensorType.getShape()))
    return false;

  // Guards against walking through the same operation more than once.
  llvm::SmallPtrSet<mlir::Operation*, 8> seenOps;

  for (mlir::Operation* producer = value.getDefiningOp(); producer; producer = value.getDefiningOp()) {
    const bool firstVisit = seenOps.insert(producer).second;
    if (!firstVisit)
      return false;
    if (hasWeightAlways(producer))
      return true;

    // Look through shape/view-like producers and keep following their source operand.
    if (auto sliceOp = mlir::dyn_cast<mlir::tensor::ExtractSliceOp>(producer))
      value = sliceOp.getSource();
    else if (auto expandOp = mlir::dyn_cast<mlir::tensor::ExpandShapeOp>(producer))
      value = expandOp.getSrc();
    else if (auto collapseOp = mlir::dyn_cast<mlir::tensor::CollapseShapeOp>(producer))
      value = collapseOp.getSrc();
    else if (auto onnxTransposeOp = mlir::dyn_cast<mlir::ONNXTransposeOp>(producer))
      value = onnxTransposeOp.getData();
    else
      return false;
  }

  return false;
}

namespace detail {

/// Wraps all arguments of `block` in a single mlir::ValueRange.
inline mlir::ValueRange getBlockArgs(mlir::Block* block) {
  mlir::ValueRange blockArgs(block->getArguments());
  return blockArgs;
}

/// Calls `fn` with the arguments of `block` at positions `Indices...`, each passed as its own
/// parameter, and forwards whatever `fn` returns.
template <typename Callable, size_t... Indices>
decltype(auto) invokeWithBlockArgs(Callable&& fn, mlir::Block* block, std::index_sequence<Indices...>) {
  return std::forward<Callable>(fn)(block->getArgument(Indices)...);
}

/// Calls `fn` with the elements of `values` at positions `Indices...`, each passed as its own
/// parameter, and forwards whatever `fn` returns.
template <typename Callable, size_t... Indices>
decltype(auto) invokeWithValues(Callable&& fn, mlir::ArrayRef<mlir::Value> values, std::index_sequence<Indices...>) {
  return std::forward<Callable>(fn)(values[Indices]...);
}

/// Maps any index to mlir::Value; used to expand an index pack into a pack of Value types.
template <size_t>
using ValueArg = mlir::Value;

/// Primary template; only the std::index_sequence specialization below is defined.
template <typename Callable, typename IndexSeq>
struct InvokeWithBlockArgsResult;

/// Result type of invoking `Callable` with one mlir::Value per index in the sequence.
template <typename Callable, size_t... Indices>
struct InvokeWithBlockArgsResult<Callable, std::index_sequence<Indices...>> {
  using type = std::invoke_result_t<Callable, ValueArg<Indices>...>;
};

/// Shorthand for `InvokeWithBlockArgsResult<Callable, IndexSeq>::type`.
template <typename Callable, typename IndexSeq>
using InvokeWithBlockArgsResultT = typename InvokeWithBlockArgsResult<Callable, IndexSeq>::type;

/// Result type of invoking `Callable` with a single mlir::ValueRange.
template <typename Callable>
using InvokeWithValueRangeResultT = std::invoke_result_t<Callable, mlir::ValueRange>;

} // namespace detail

template <size_t NumInputs, typename RewriterT, typename BodyFn>
auto createSpatCompute(RewriterT& rewriter,
                       mlir::Location loc,
                       mlir::TypeRange resultTypes,
                       mlir::ValueRange weights,
                       mlir::ValueRange inputs,
                       BodyFn&& body) {
  assert(inputs.size() == NumInputs && "NumInputs must match the number of input values");
  auto computeOp = spatial::SpatCompute::create(rewriter, loc, resultTypes, weights, inputs);

  auto* block = new mlir::Block();
  for (mlir::Value input : inputs)
    block->addArgument(input.getType(), loc);

  computeOp.getBody().push_back(block);
  rewriter.setInsertionPointToStart(block);

  using BodyResult = detail::InvokeWithBlockArgsResultT<std::decay_t<BodyFn>, std::make_index_sequence<NumInputs>>;
  if constexpr (std::is_same_v<BodyResult, mlir::LogicalResult>) {
    auto bodyResult =
      detail::invokeWithBlockArgs(std::forward<BodyFn>(body), block, std::make_index_sequence<NumInputs> {});
    if (mlir::failed(bodyResult)) {
      rewriter.setInsertionPointAfter(computeOp);
      rewriter.eraseOp(computeOp);
      return mlir::FailureOr<spatial::SpatCompute>(mlir::failure());
    }
    rewriter.setInsertionPointAfter(computeOp);
    return mlir::FailureOr<spatial::SpatCompute>(computeOp);
  }
  else {
    static_assert(std::is_same_v<BodyResult, void>, "createSpatCompute body must return void or mlir::LogicalResult");
    detail::invokeWithBlockArgs(std::forward<BodyFn>(body), block, std::make_index_sequence<NumInputs> {});

    rewriter.setInsertionPointAfter(computeOp);
    return computeOp;
  }
}

template <typename RewriterT, typename BodyFn>
auto createSpatCompute(RewriterT& rewriter,
                       mlir::Location loc,
                       mlir::TypeRange resultTypes,
                       mlir::ValueRange weights,
                       mlir::ValueRange inputs,
                       BodyFn&& body) {
  auto computeOp = spatial::SpatCompute::create(rewriter, loc, resultTypes, weights, inputs);

  auto* block = new mlir::Block();
  for (mlir::Value input : inputs)
    block->addArgument(input.getType(), loc);

  computeOp.getBody().push_back(block);
  rewriter.setInsertionPointToStart(block);

  using BodyResult = detail::InvokeWithValueRangeResultT<std::decay_t<BodyFn>>;
  if constexpr (std::is_same_v<BodyResult, mlir::LogicalResult>) {
    auto bodyResult = std::forward<BodyFn>(body)(detail::getBlockArgs(block));
    if (mlir::failed(bodyResult)) {
      rewriter.setInsertionPointAfter(computeOp);
      rewriter.eraseOp(computeOp);
      return mlir::FailureOr<spatial::SpatCompute>(mlir::failure());
    }
    rewriter.setInsertionPointAfter(computeOp);
    return mlir::FailureOr<spatial::SpatCompute>(computeOp);
  }
  else {
    static_assert(std::is_same_v<BodyResult, void>, "createSpatCompute body must return void or mlir::LogicalResult");
    std::forward<BodyFn>(body)(detail::getBlockArgs(block));

    rewriter.setInsertionPointAfter(computeOp);
    return computeOp;
  }
}

// The functions below are only declared here; their definitions are not part of this header,
// so the descriptions are inferred from names and signatures and should be confirmed against
// the implementation.

/// Presumably splits `tensorToSlice` along dimension `axis` into pieces of extent `sliceSize`
/// and returns the resulting values in order — TODO confirm (including how a non-divisible
/// extent is handled). Any IR needed is expected to be built through `rewriter` at `loc`.
llvm::SmallVector<mlir::Value> sliceTensor(const mlir::Value& tensorToSlice,
                                           size_t axis,
                                           int64_t sliceSize,
                                           mlir::ConversionPatternRewriter& rewriter,
                                           mlir::Location loc);

/// Presumably the vector counterpart of sliceTensor: splits `vectorToSlice` into pieces of
/// length `sliceSize` — TODO confirm which axis is sliced.
llvm::SmallVector<mlir::Value> sliceVector(const mlir::Value& vectorToSlice,
                                           int64_t sliceSize,
                                           mlir::ConversionPatternRewriter& rewriter,
                                           mlir::Location loc);

/// Returns a map from CoreId to a list of values; presumably the slices of `vectorToSlice`
/// grouped by the core they are assigned to, one slice per crossbar — TODO confirm where the
/// slice size and core assignment come from (no size parameter is passed here).
llvm::DenseMap<CoreId, llvm::SmallVector<mlir::Value>> sliceVectorPerCrossbarPerCore(
  const mlir::Value& vectorToSlice, mlir::ConversionPatternRewriter& rewriter, mlir::Location loc);

/// Returns a two-level map HSliceId -> CoreId -> list of values; presumably the tiles of
/// `matrixToTile` obtained with tile extents `hSliceSize` and `vSliceSize` — TODO confirm.
/// NOTE(review): `matrixToTile` and `loc` are taken by non-const reference, unlike the sibling
/// declarations — confirm whether either is actually modified.
llvm::DenseMap<HSliceId, llvm::DenseMap<CoreId, llvm::SmallVector<mlir::Value>>>
tileMatrix(mlir::Value& matrixToTile,
           int64_t hSliceSize,
           int64_t vSliceSize,
           mlir::ConversionPatternRewriter& rewriter,
           mlir::Location& loc);

/// Returns a tensor.splat op; presumably one that replicates `scalarToBroadcast` into a vector
/// of `length` elements — TODO confirm the resulting shape.
mlir::tensor::SplatOp broadcastToVector(mlir::Value scalarToBroadcast,
                                        int64_t length,
                                        mlir::ConversionPatternRewriter& rewriter,
                                        mlir::Location loc);

/// Presumably builds the element-wise sum of all `tensors` and returns the resulting value —
/// TODO confirm the behaviour for an empty list and which location the created ops use
/// (no mlir::Location is passed).
mlir::Value sumTensors(mlir::ArrayRef<mlir::Value> tensors, mlir::ConversionPatternRewriter& rewriter);

}; // namespace onnx_mlir