#include #include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp" #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp" #include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/AnnotateReplication.hpp" #include "src/Dialect/ONNX/ONNXOps.hpp" using namespace mlir; namespace onnx_mlir { /** * @brief Structure that describes the replication of a convolution operation, * along the image height axis. */ struct ConvReplication { ONNXConvOp convOp; // Convolution operation size_t input_w; // Width of the input image size_t replicationFactor; // Replication factor on the image height axis size_t coresNeededPerReplica; // Number of cores needed for each replica friend bool operator<(const ConvReplication& a, const ConvReplication& b) { return a.input_w / a.replicationFactor < b.input_w / b.replicationFactor; } ConvReplication(ONNXConvOp convOp, size_t input_w, size_t replicationFactor, size_t coresNeededPerReplica) : convOp(convOp), input_w(input_w), replicationFactor(replicationFactor), coresNeededPerReplica(coresNeededPerReplica) {} }; LogicalResult annotateReplication(mlir::func::FuncOp funcOp, mlir::IRRewriter& rewriter) { if (coresCount == -1) { // No need for annotation, implicitly set replication to 1 return success(); } std::priority_queue convOpsReplicationQueue; size_t minimumCores = 0; for (auto& op : funcOp.getFunctionBody().begin()->getOperations()) { if (auto convOp = dyn_cast(op)) { // Convolution layer Value X = convOp.getX(), W = convOp.getW(); ShapedType xShape = mlir::cast(X.getType()); ShapedType wShape = mlir::cast(W.getType()); size_t input_w = getImageWidth(xShape); size_t krn_h = getKernelHeight(wShape); size_t krn_w = getKernelWidth(wShape); size_t inputTileCount = ceilIntegerDivide(getImageChannel(xShape), crossbarSize.getValue()); size_t outputTileCount = ceilIntegerDivide(wShape.getDimSize(0), crossbarSize.getValue()); auto neededXbars = krn_h * krn_w * inputTileCount * outputTileCount; auto neededCores = ceilIntegerDivide(neededXbars, crossbarCountInCore.getValue()); minimumCores += neededCores; convOpsReplicationQueue.emplace(convOp, input_w, 1, neededCores); } else if (auto gemmOp = dyn_cast(op)) { // Fully connected layer auto matrixTensorShape = cast(gemmOp.getB().getType()); auto inputSize = matrixTensorShape.getDimSize(0); auto outputSize = matrixTensorShape.getDimSize(1); if (gemmOp.getTransB()) std::swap(inputSize, outputSize); const size_t inputTilesCount = ceilIntegerDivide(inputSize, crossbarSize.getValue()); const size_t outputTilesCount = ceilIntegerDivide(outputSize, crossbarSize.getValue()); // Each output tile is computed by `coresPerOutputTile` cores. The // entire input is given to each of these cores. const size_t coresPerOutputTile = ceilIntegerDivide(inputTilesCount, crossbarCountInCore.getValue()); auto neededCores = coresPerOutputTile * outputTilesCount; minimumCores += neededCores; } } if (static_cast(coresCount) < minimumCores) { return funcOp->emitError("Not enough cores for this network: ") << minimumCores << " cores needed, but only " << static_cast(coresCount) << " available."; } size_t availableCores = static_cast(coresCount) - minimumCores; // Consume all the elements in the queue while (!convOpsReplicationQueue.empty()) { auto convOpReplication = convOpsReplicationQueue.top(); convOpsReplicationQueue.pop(); // Check if we can replicate this convolution (e.g. we have enough cores) if (availableCores > convOpReplication.coresNeededPerReplica * (convOpReplication.replicationFactor + 1)) { // We can replicate this convolution: increment replicationFactor and put // back in queue availableCores -= convOpReplication.coresNeededPerReplica; convOpReplication.replicationFactor++; convOpsReplicationQueue.push(convOpReplication); } else { // Cannot replicate this convolution anymore, annotate the operation // with the replication factor convOpReplication.convOp->setAttr(REPLICATION_ATTR_NAME, rewriter.getI64IntegerAttr(convOpReplication.replicationFactor)); } } return success(); } } // namespace onnx_mlir