#include <queue>

#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/AnnotateReplication.hpp"
#include "src/Dialect/ONNX/ONNXOps.hpp"

using namespace mlir;

namespace onnx_mlir {
/**
|
|
* @brief Structure that describes the replication of a convolution operation,
|
|
* along the image height axis.
|
|
*/
|
|
struct ConvReplication {
|
|
ONNXConvOp convOp; // Convolution operation
|
|
size_t input_w; // Width of the input image
|
|
size_t replicationFactor; // Replication factor on the image height axis
|
|
size_t coresNeededPerReplica; // Number of cores needed for each replica
|
|
|
|
friend bool operator<(const ConvReplication& a, const ConvReplication& b) {
|
|
return a.input_w / a.replicationFactor < b.input_w / b.replicationFactor;
|
|
}
|
|
|
|
ConvReplication(ONNXConvOp convOp, size_t input_w, size_t replicationFactor, size_t coresNeededPerReplica)
|
|
: convOp(convOp),
|
|
input_w(input_w),
|
|
replicationFactor(replicationFactor),
|
|
coresNeededPerReplica(coresNeededPerReplica) {}
|
|
};
/// Annotates every convolution in @p funcOp with a replication factor along
/// the image height axis, greedily distributing the cores left over once the
/// minimum allocation for the whole network has been satisfied.
///
/// @param funcOp   Function whose first-block operations are scanned.
/// @param rewriter Rewriter used to build the integer attribute.
/// @return failure() when the network needs more cores than are available,
///         success() otherwise. When coresCount == -1 nothing is annotated
///         and every replication factor is implicitly 1.
LogicalResult annotateReplication(mlir::func::FuncOp funcOp, mlir::IRRewriter& rewriter) {
  // A core count of -1 means "unlimited": skip annotation entirely and let
  // every convolution keep its implicit replication factor of 1.
  if (coresCount == -1)
    return success();

  // Max-heap keyed on per-replica input width: the widest replica is always
  // the next candidate for further replication.
  std::priority_queue<ConvReplication> replicationCandidates;

  // Cores required to place every layer exactly once.
  size_t baselineCores = 0;

  for (auto& op : funcOp.getFunctionBody().begin()->getOperations()) {
    if (auto convOp = dyn_cast<ONNXConvOp>(op)) {
      // Convolution layer.
      auto inputType = mlir::cast<ShapedType>(convOp.getX().getType());
      auto weightType = mlir::cast<ShapedType>(convOp.getW().getType());

      const size_t imageWidth = getImageWidth(inputType);
      const size_t kernelHeight = getKernelHeight(weightType);
      const size_t kernelWidth = getKernelWidth(weightType);

      // Channels are tiled over the crossbar dimension on both the input and
      // the output side.
      const size_t inputTiles = ceilIntegerDivide(getImageChannel(inputType), crossbarSize.getValue());
      const size_t outputTiles = ceilIntegerDivide(weightType.getDimSize(0), crossbarSize.getValue());

      auto crossbarsRequired = kernelHeight * kernelWidth * inputTiles * outputTiles;
      auto coresRequired = ceilIntegerDivide(crossbarsRequired, crossbarCountInCore.getValue());

      baselineCores += coresRequired;

      // Every convolution starts with a replication factor of 1.
      replicationCandidates.emplace(convOp, imageWidth, 1, coresRequired);
    }
    else if (auto gemmOp = dyn_cast<ONNXGemmOp>(op)) {
      // Fully connected layer.
      auto weightType = cast<ShapedType>(gemmOp.getB().getType());
      auto inputSize = weightType.getDimSize(0);
      auto outputSize = weightType.getDimSize(1);
      if (gemmOp.getTransB())
        std::swap(inputSize, outputSize);

      const size_t inputTiles = ceilIntegerDivide(inputSize, crossbarSize.getValue());
      const size_t outputTiles = ceilIntegerDivide(outputSize, crossbarSize.getValue());

      // Each output tile is computed by `coresPerOutputTile` cores, and each
      // of those cores receives the entire input.
      const size_t coresPerOutputTile = ceilIntegerDivide(inputTiles, crossbarCountInCore.getValue());

      baselineCores += coresPerOutputTile * outputTiles;
    }
  }

  if (static_cast<size_t>(coresCount) < baselineCores) {
    return funcOp->emitError("Not enough cores for this network: ")
           << baselineCores << " cores needed, but only " << static_cast<size_t>(coresCount) << " available.";
  }

  // Budget of spare cores available for replication on top of the baseline.
  size_t spareCores = static_cast<size_t>(coresCount) - baselineCores;

  // Drain the heap: while the budget allows it, grant one more replica to the
  // convolution with the widest per-replica share; once a candidate can no
  // longer grow, annotate it with its final replication factor.
  while (!replicationCandidates.empty()) {
    ConvReplication candidate = replicationCandidates.top();
    replicationCandidates.pop();

    // NOTE(review): the affordability check compares the budget against
    // coresNeededPerReplica * (replicationFactor + 1) while only
    // coresNeededPerReplica is deducted below — presumably a deliberately
    // conservative heuristic; confirm against the allocation model.
    if (spareCores > candidate.coresNeededPerReplica * (candidate.replicationFactor + 1)) {
      // One more replica fits: charge its cores and re-enter the candidate.
      spareCores -= candidate.coresNeededPerReplica;
      candidate.replicationFactor++;

      replicationCandidates.push(candidate);
    }
    else {
      // The budget cannot sustain another replica: record the final factor
      // on the operation.
      candidate.convOp->setAttr(REPLICATION_ATTR_NAME,
          rewriter.getI64IntegerAttr(candidate.replicationFactor));
    }
  }

  return success();
}
} // namespace onnx_mlir