#include <queue>

#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
#include "src/Accelerators/PIM/Conversion/ONNXToSpatial/Utils/AnnotateReplication.hpp"
#include "src/Dialect/ONNX/ONNXOps.hpp"

using namespace mlir;

namespace onnx_mlir {
/**
|
|
* @brief Structure that describes the replication of a convolution operation,
|
|
* along the image height axis.
|
|
*/
|
|
struct ConvReplication {
|
|
ONNXConvOp convOp; // Convolution operation
|
|
size_t input_w; // Width of the input image
|
|
size_t replicationFactor; // Replication factor on the image height axis
|
|
size_t coresNeededPerReplica; // Number of cores needed for each replica
|
|
|
|
friend bool operator<(const ConvReplication& a, const ConvReplication& b) {
|
|
return a.input_w / a.replicationFactor < b.input_w / b.replicationFactor;
|
|
}
|
|
|
|
ConvReplication(ONNXConvOp convOp, size_t input_w, size_t replicationFactor, size_t coresNeededPerReplica)
|
|
: convOp(convOp),
|
|
input_w(input_w),
|
|
replicationFactor(replicationFactor),
|
|
coresNeededPerReplica(coresNeededPerReplica) {}
|
|
};
/// Annotates every convolution in @p funcOp with a replication factor along
/// the image height axis, greedily distributing the cores left over once the
/// minimum allocation for the whole network has been satisfied.
///
/// @param funcOp   Function whose first-block operations are scanned.
/// @param rewriter Rewriter used to build the integer attribute.
/// @return failure() when the network needs more cores than are available,
///         success() otherwise. When coresCount == -1 nothing is annotated
///         and every replication factor is implicitly 1.
LogicalResult annotateReplication(mlir::func::FuncOp funcOp, mlir::IRRewriter& rewriter) {
  // A core count of -1 means "unlimited": skip annotation entirely and let
  // every convolution keep its implicit replication factor of 1.
  if (coresCount == -1)
    return success();

  // Max-heap keyed on per-replica input width: the widest replica is always
  // the next candidate for further replication.
  std::priority_queue<ConvReplication> replicationCandidates;

  // Cores required to place every layer exactly once.
  size_t baselineCores = 0;

  for (auto& op : funcOp.getFunctionBody().begin()->getOperations()) {
    if (auto convOp = dyn_cast<ONNXConvOp>(op)) {
      // Convolution layer.
      auto inputType = mlir::cast<ShapedType>(convOp.getX().getType());
      auto weightType = mlir::cast<ShapedType>(convOp.getW().getType());

      const size_t imageWidth = getImageWidth(inputType);
      const size_t kernelHeight = getKernelHeight(weightType);
      const size_t kernelWidth = getKernelWidth(weightType);

      // Channels are tiled over the crossbar dimension on both the input and
      // the output side.
      const size_t inputTiles = ceilIntegerDivide(getImageChannel(inputType), crossbarSize.getValue());
      const size_t outputTiles = ceilIntegerDivide(weightType.getDimSize(0), crossbarSize.getValue());

      auto crossbarsRequired = kernelHeight * kernelWidth * inputTiles * outputTiles;
      auto coresRequired = ceilIntegerDivide(crossbarsRequired, crossbarCountInCore.getValue());

      baselineCores += coresRequired;

      // Every convolution starts with a replication factor of 1.
      replicationCandidates.emplace(convOp, imageWidth, 1, coresRequired);
    }
    else if (auto gemmOp = dyn_cast<ONNXGemmOp>(op)) {
      // Fully connected layer.
      auto weightType = cast<ShapedType>(gemmOp.getB().getType());
      auto inputSize = weightType.getDimSize(0);
      auto outputSize = weightType.getDimSize(1);
      if (gemmOp.getTransB())
        std::swap(inputSize, outputSize);

      const size_t inputTiles = ceilIntegerDivide(inputSize, crossbarSize.getValue());
      const size_t outputTiles = ceilIntegerDivide(outputSize, crossbarSize.getValue());

      // Each output tile is computed by `coresPerOutputTile` cores, and each
      // of those cores receives the entire input.
      const size_t coresPerOutputTile = ceilIntegerDivide(inputTiles, crossbarCountInCore.getValue());

      baselineCores += coresPerOutputTile * outputTiles;
    }
  }

  if (static_cast<size_t>(coresCount) < baselineCores) {
    return funcOp->emitError("Not enough cores for this network: ")
           << baselineCores << " cores needed, but only " << static_cast<size_t>(coresCount) << " available.";
  }

  // Budget of spare cores available for replication on top of the baseline.
  size_t spareCores = static_cast<size_t>(coresCount) - baselineCores;

  // Drain the heap: while the budget allows it, grant one more replica to the
  // convolution with the widest per-replica share; once a candidate can no
  // longer grow, annotate it with its final replication factor.
  while (!replicationCandidates.empty()) {
    ConvReplication candidate = replicationCandidates.top();
    replicationCandidates.pop();

    // NOTE(review): the affordability check compares the budget against
    // coresNeededPerReplica * (replicationFactor + 1) while only
    // coresNeededPerReplica is deducted below — presumably a deliberately
    // conservative heuristic; confirm against the allocation model.
    if (spareCores > candidate.coresNeededPerReplica * (candidate.replicationFactor + 1)) {
      // One more replica fits: charge its cores and re-enter the candidate.
      spareCores -= candidate.coresNeededPerReplica;
      candidate.replicationFactor++;

      replicationCandidates.push(candidate);
    }
    else {
      // The budget cannot sustain another replica: record the final factor
      // on the operation.
      candidate.convOp->setAttr(REPLICATION_ATTR_NAME,
          rewriter.getI64IntegerAttr(candidate.replicationFactor));
    }
  }

  return success();
}
} // namespace onnx_mlir