Dynamic gemm/conv

2026-05-28 18:00:14 +02:00
parent cbf7b235f1
commit 1ab489fe0a
17 changed files with 704 additions and 69 deletions
@@ -111,6 +111,32 @@ static Value buildPackedWeight(DenseElementsAttr wDenseAttr,
  return arith::ConstantOp::create(rewriter, loc, packedWeightType, packedAttr);
 }

+static Value createConvWeightMatrix(Value w,
+                                    RankedTensorType wFlatType,
+                                    RankedTensorType wTransType,
+                                    ConversionPatternRewriter& rewriter,
+                                    Location loc) {
+  auto buildWeightMatrix = [&](Value weight) -> Value {
+    Value wFlat = tensor::CollapseShapeOp::create(rewriter,
+                                                  loc,
+                                                  wFlatType,
+                                                  weight,
+                                                  SmallVector<ReassociationIndices> {
+                                                    {0},
+                                                    {1, 2, 3}
+    });
+    return ONNXTransposeOp::create(rewriter, loc, wTransType, wFlat, rewriter.getI64ArrayAttr({1, 0})).getResult();
+  };
+
+  if (isCompileTimeComputable(w))
+    return buildWeightMatrix(w);
+
+  auto computeOp = createSpatCompute<1>(rewriter, loc, TypeRange {wTransType}, {}, ValueRange {w}, [&](Value weight) {
+    spatial::SpatYieldOp::create(rewriter, loc, buildWeightMatrix(weight));
+  });
+  return computeOp.getResult(0);
+}
+
 static Value buildPackedBias(bool hasBias,
                             Value gemmBias,
                             Value biasMatrix,
@@ -395,15 +421,7 @@ static Value lowerSingleConvGroup(Value x,

  // Prepare weight matrix W for crossbar storage:
  // W: [numChannelsOut, numChannelsIn, wHeight, wWidth] -> [numChannelsOut, patchSize] -> [patchSize, numChannelsOut]
-  Value wFlat = tensor::CollapseShapeOp::create(rewriter,
-                                                loc,
-                                                wFlatType,
-                                                w,
-                                                SmallVector<ReassociationIndices> {
-                                                  {0},
-                                                  {1, 2, 3}
-  });
-  Value wTrans = ONNXTransposeOp::create(rewriter, loc, wTransType, wFlat, rewriter.getI64ArrayAttr({1, 0}));
+  Value wTrans = createConvWeightMatrix(w, wFlatType, wTransType, rewriter, loc);

  // Pass bias through directly; Gemm handles rank-1 C canonicalization.
  bool hasB = !isa<ONNXNoneOp>(b.getDefiningOp());
@@ -73,38 +73,11 @@ static Value createIndexConstant(ConversionPatternRewriter& rewriter, int64_t va
  return getOrCreateHostIndexConstant(anchorOp, value, rewriter);
 }

-static std::optional<int64_t> getConstantIndexValue(Value value) {
-  if (auto constantIndex = value.getDefiningOp<arith::ConstantIndexOp>())
-    return constantIndex.value();
-
-  APInt constantValue;
-  if (matchPattern(value, m_ConstantInt(&constantValue)))
-    return constantValue.getSExtValue();
-
-  return std::nullopt;
-}
-
 static Value
 createAffineApply(ConversionPatternRewriter& rewriter, Location loc, AffineExpr expr, ValueRange operands) {
  AffineMap map = AffineMap::get(/*dimCount=*/operands.size(), /*symbolCount=*/0, expr);
-
-  SmallVector<Attribute> operandConstants;
-  operandConstants.reserve(operands.size());
-  for (Value operand : operands) {
-    std::optional<int64_t> constantValue = getConstantIndexValue(operand);
-    if (!constantValue)
-      return affine::AffineApplyOp::create(rewriter, loc, map, operands).getResult();
-    operandConstants.push_back(rewriter.getIndexAttr(*constantValue));
-  }
-
-  SmallVector<Attribute> foldedResults;
-  if (succeeded(map.constantFold(operandConstants, foldedResults))) {
-    auto constantResult = dyn_cast<IntegerAttr>(foldedResults.front());
-    if (constantResult)
-      return createIndexConstant(rewriter, constantResult.getInt());
-  }
-
-  return affine::AffineApplyOp::create(rewriter, loc, map, operands).getResult();
+  Operation* anchorOp = rewriter.getInsertionBlock()->getParentOp();
+  return createAffineApplyOrFoldedConstant(rewriter, loc, map, operands, anchorOp);
 }

 static Value
@@ -379,6 +352,233 @@ static spatial::SpatComputeBatch createVmmBatch(Value a,
  return batchOp;
 }

+static Value createDynamicGemmBatchRow(
+  Value lane, int64_t numOutCols, ConversionPatternRewriter& rewriter, Location loc) {
+  if (numOutCols == 1)
+    return lane;
+
+  MLIRContext* context = rewriter.getContext();
+  AffineExpr d0 = getAffineDimExpr(0, context);
+  return createAffineApply(rewriter, loc, d0.floorDiv(numOutCols), ValueRange {lane});
+}
+
+static Value createDynamicGemmBatchColumn(
+  Value lane, int64_t numOutCols, ConversionPatternRewriter& rewriter, Location loc) {
+  return modIndexByConstant(lane, numOutCols, rewriter, loc);
+}
+
+static Value
+extractDynamicGemmBColumn(Value matrix, Value column, RankedTensorType vectorType, ConversionPatternRewriter& rewriter, Location loc) {
+  SmallVector<OpFoldResult> offsets {rewriter.getIndexAttr(0), column};
+  SmallVector<OpFoldResult> strides {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
+  auto columnSliceType = RankedTensorType::get({vectorType.getDimSize(1), 1}, vectorType.getElementType());
+  Value columnSlice = materializeContiguousTensorSlice(matrix, columnSliceType, offsets, strides, rewriter, loc);
+  SmallVector<ReassociationIndices> collapseReassociation {ReassociationIndices {0, 1}};
+  auto collapsedType = RankedTensorType::get({vectorType.getDimSize(1)}, vectorType.getElementType());
+  Value collapsed =
+    tensor::CollapseShapeOp::create(rewriter, loc, collapsedType, columnSlice, collapseReassociation).getResult();
+  SmallVector<ReassociationIndices> expandReassociation {ReassociationIndices {0, 1}};
+  return tensor::ExpandShapeOp::create(rewriter, loc, vectorType, collapsed, expandReassociation).getResult();
+}
+
+static Value extractTransposedBRow(
+  Value transposedB, Value row, RankedTensorType vectorType, ConversionPatternRewriter& rewriter, Location loc) {
+  SmallVector<OpFoldResult> offsets {row, rewriter.getIndexAttr(0)};
+  SmallVector<OpFoldResult> sizes {rewriter.getIndexAttr(1), rewriter.getIndexAttr(vectorType.getDimSize(1))};
+  SmallVector<OpFoldResult> strides {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
+  return tensor::ExtractSliceOp::create(rewriter, loc, vectorType, transposedB, offsets, sizes, strides).getResult();
+}
+
+static Value extractDynamicGemmRowVector(
+  Value matrix, Value row, RankedTensorType vectorType, ConversionPatternRewriter& rewriter, Location loc) {
+  SmallVector<OpFoldResult> offsets {row, rewriter.getIndexAttr(0)};
+  SmallVector<OpFoldResult> sizes {rewriter.getIndexAttr(1), rewriter.getIndexAttr(vectorType.getDimSize(1))};
+  SmallVector<OpFoldResult> strides {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
+  return tensor::ExtractSliceOp::create(rewriter, loc, vectorType, matrix, offsets, sizes, strides).getResult();
+}
+
+static FailureOr<RankedTensorType> verifyDynamicGemmBiasType(RankedTensorType cType, RankedTensorType outType) {
+  if (!cType.hasStaticShape() || cType.getRank() > 2)
+    return failure();
+
+  if (cType.getRank() == 0)
+    return cType;
+
+  int64_t numOutRows = outType.getDimSize(0);
+  int64_t numOutCols = outType.getDimSize(1);
+  if (cType.getRank() == 1) {
+    int64_t cols = cType.getDimSize(0);
+    if (cols == 1 || cols == numOutCols)
+      return cType;
+    return failure();
+  }
+
+  int64_t rows = cType.getDimSize(0);
+  int64_t cols = cType.getDimSize(1);
+  if ((rows == 1 || rows == numOutRows) && (cols == 1 || cols == numOutCols))
+    return cType;
+  return failure();
+}
+
+static bool hasGemmBias(Value c) {
+  Operation* definingOp = c.getDefiningOp();
+  return !definingOp || !isa<ONNXNoneOp>(definingOp);
+}
+
+static Value createScalarTensorConstant(RankedTensorType scalarType,
+                                        float value,
+                                        ConversionPatternRewriter& rewriter,
+                                        Location loc) {
+  auto elementType = scalarType.getElementType();
+  auto scalarAttr = rewriter.getFloatAttr(elementType, value);
+  auto denseAttr = DenseElementsAttr::get(scalarType, scalarAttr);
+  return arith::ConstantOp::create(rewriter, loc, scalarType, denseAttr).getResult();
+}
+
+static Value createBroadcastedBiasScalar(Value bias,
+                                         RankedTensorType biasType,
+                                         Value row,
+                                         Value column,
+                                         RankedTensorType scalarType,
+                                         ConversionPatternRewriter& rewriter,
+                                         Location loc) {
+  SmallVector<OpFoldResult> unitStrides(biasType.getRank(), rewriter.getIndexAttr(1));
+  if (biasType.getRank() == 1) {
+    SmallVector<OpFoldResult> offsets {
+      biasType.getDimSize(0) == 1 ? OpFoldResult(rewriter.getIndexAttr(0)) : OpFoldResult(column)};
+    SmallVector<OpFoldResult> sizes {rewriter.getIndexAttr(1)};
+    auto vectorType = RankedTensorType::get({1}, scalarType.getElementType());
+    Value vector = tensor::ExtractSliceOp::create(rewriter, loc, vectorType, bias, offsets, sizes, unitStrides)
+                     .getResult();
+    SmallVector<ReassociationIndices> reassociation {ReassociationIndices {0, 1}};
+    return tensor::ExpandShapeOp::create(rewriter, loc, scalarType, vector, reassociation).getResult();
+  }
+
+  if (biasType.getRank() == 2) {
+    SmallVector<OpFoldResult> offsets {
+      biasType.getDimSize(0) == 1 ? OpFoldResult(rewriter.getIndexAttr(0)) : OpFoldResult(row),
+      biasType.getDimSize(1) == 1 ? OpFoldResult(rewriter.getIndexAttr(0)) : OpFoldResult(column)};
+    SmallVector<OpFoldResult> sizes {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
+    return tensor::ExtractSliceOp::create(rewriter, loc, scalarType, bias, offsets, sizes, unitStrides).getResult();
+  }
+
+  Value scalar = tensor::ExtractOp::create(rewriter, loc, bias, ValueRange {}).getResult();
+  return tensor::SplatOp::create(rewriter, loc, scalarType, scalar).getResult();
+}
+
+static spatial::SpatComputeBatch createVvdmulBatch(Value a,
+                                                   Value b,
+                                                   RankedTensorType aType,
+                                                   RankedTensorType bType,
+                                                   RankedTensorType scalarPiecesType,
+                                                   RankedTensorType outType,
+                                                   bool bAlreadyTransposed,
+                                                   ConversionPatternRewriter& rewriter,
+                                                   Location loc) {
+  const int64_t numOutRows = outType.getDimSize(0);
+  const int64_t numOutCols = outType.getDimSize(1);
+  const int64_t reductionSize = aType.getDimSize(1);
+  const int64_t laneCount = numOutRows * numOutCols;
+  auto batchOp = spatial::SpatComputeBatch::create(rewriter,
+                                                   loc,
+                                                   TypeRange {scalarPiecesType},
+                                                   rewriter.getI32IntegerAttr(static_cast<int32_t>(laneCount)),
+                                                   ValueRange {},
+                                                   ValueRange {a, b});
+
+  SmallVector<Type> blockArgTypes {rewriter.getIndexType(), aType, bType, scalarPiecesType};
+  SmallVector<Location> blockArgLocs(blockArgTypes.size(), loc);
+  Block* body =
+    rewriter.createBlock(&batchOp.getBody(), batchOp.getBody().end(), TypeRange(blockArgTypes), blockArgLocs);
+  rewriter.setInsertionPointToEnd(body);
+
+  auto lane = batchOp.getLaneArgument();
+  auto inputA = batchOp.getInputArgument(0);
+  auto inputB = batchOp.getInputArgument(1);
+  auto output = batchOp.getOutputArgument(0);
+  assert(lane && inputA && inputB && output && "malformed dynamic Gemm compute_batch body");
+
+  Value row = createDynamicGemmBatchRow(*lane, numOutCols, rewriter, loc);
+  Value column = createDynamicGemmBatchColumn(*lane, numOutCols, rewriter, loc);
+
+  auto vectorType = RankedTensorType::get({1, reductionSize}, aType.getElementType());
+  auto scalarType = RankedTensorType::get({1, 1}, outType.getElementType());
+  Value aVector = extractDynamicGemmRowVector(*inputA, row, vectorType, rewriter, loc);
+  Value bVector = bAlreadyTransposed
+                    ? extractTransposedBRow(*inputB, column, vectorType, rewriter, loc)
+                    : extractDynamicGemmBColumn(*inputB, column, vectorType, rewriter, loc);
+  Value scalar = spatial::SpatVVDMulOp::create(rewriter, loc, scalarType, aVector, bVector).getResult();
+
+  auto inParallelOp = spatial::SpatInParallelOp::create(rewriter, loc);
+  rewriter.setInsertionPointToStart(&inParallelOp.getRegion().front());
+  SmallVector<OpFoldResult> outputOffsets {*lane, rewriter.getIndexAttr(0)};
+  SmallVector<OpFoldResult> scalarSizes {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
+  SmallVector<OpFoldResult> unitStrides {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
+  tensor::ParallelInsertSliceOp::create(rewriter, loc, scalar, *output, outputOffsets, scalarSizes, unitStrides);
+
+  rewriter.setInsertionPointAfter(batchOp);
+  return batchOp;
+}
+
+static spatial::SpatCompute createDynamicGemmOutputCompute(Value scalarPieces,
+                                                           Value bias,
+                                                           RankedTensorType scalarPiecesType,
+                                                           RankedTensorType biasType,
+                                                           RankedTensorType outType,
+                                                           float alpha,
+                                                           float beta,
+                                                           ConversionPatternRewriter& rewriter,
+                                                           Location loc) {
+  const int64_t laneCount = scalarPiecesType.getDimSize(0);
+  const int64_t numOutCols = outType.getDimSize(1);
+  SmallVector<Value> inputs {scalarPieces};
+  if (bias)
+    inputs.push_back(bias);
+
+  return createSpatCompute(rewriter, loc, TypeRange {outType}, {}, inputs, [&](ValueRange blockArgs) {
+    Value pieces = blockArgs[0];
+    Value biasArg = bias ? blockArgs[1] : Value();
+    auto scalarType = RankedTensorType::get({1, 1}, outType.getElementType());
+    Value outputInit = tensor::EmptyOp::create(rewriter, loc, outType.getShape(), outType.getElementType()).getResult();
+    Value c0 = createIndexConstant(rewriter, 0);
+    Value c1 = createIndexConstant(rewriter, 1);
+    Value cLaneCount = createIndexConstant(rewriter, laneCount);
+    auto loop = scf::ForOp::create(rewriter, loc, c0, cLaneCount, c1, ValueRange {outputInit});
+    rewriter.setInsertionPointToStart(loop.getBody());
+
+    Value lane = loop.getInductionVar();
+    Value outputAcc = loop.getRegionIterArgs().front();
+    Value row = createDynamicGemmBatchRow(lane, numOutCols, rewriter, loc);
+    Value column = createDynamicGemmBatchColumn(lane, numOutCols, rewriter, loc);
+    SmallVector<OpFoldResult> scalarOffsets {lane, rewriter.getIndexAttr(0)};
+    SmallVector<OpFoldResult> scalarSizes {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
+    SmallVector<OpFoldResult> unitStrides {rewriter.getIndexAttr(1), rewriter.getIndexAttr(1)};
+    Value scalar =
+      tensor::ExtractSliceOp::create(rewriter, loc, scalarType, pieces, scalarOffsets, scalarSizes, unitStrides)
+        .getResult();
+    if (alpha != 1.0f) {
+      Value alphaTensor = createScalarTensorConstant(scalarType, alpha, rewriter, loc);
+      scalar = spatial::SpatVMulOp::create(rewriter, loc, scalarType, scalar, alphaTensor).getResult();
+    }
+    if (biasArg) {
+      Value biasScalar = createBroadcastedBiasScalar(biasArg, biasType, row, column, scalarType, rewriter, loc);
+      if (beta != 1.0f) {
+        Value betaTensor = createScalarTensorConstant(scalarType, beta, rewriter, loc);
+        biasScalar = spatial::SpatVMulOp::create(rewriter, loc, scalarType, biasScalar, betaTensor).getResult();
+      }
+      scalar = spatial::SpatVAddOp::create(rewriter, loc, scalarType, scalar, biasScalar).getResult();
+    }
+    SmallVector<OpFoldResult> outputOffsets {row, column};
+    Value outputNext =
+      tensor::InsertSliceOp::create(rewriter, loc, scalar, outputAcc, outputOffsets, scalarSizes, unitStrides)
+        .getResult();
+    scf::YieldOp::create(rewriter, loc, outputNext);
+
+    rewriter.setInsertionPointAfter(loop);
+    spatial::SpatYieldOp::create(rewriter, loc, loop.getResult(0));
+  });
+}
+
 static Value createPartialGroupOffset(Value hSlice,
                                      int64_t kSlice,
                                      int64_t numKSlices,
@@ -570,9 +770,50 @@ LogicalResult GemmToSpatialComputes::matchAndRewrite(ONNXGemmOp gemmOp,
    return failure();
  }

+  const int64_t numOutRows = outType.getDimSize(0);
+  const int64_t numOutCols = outType.getDimSize(1);
+  const int64_t reductionSize = aType.getDimSize(1);
+
  if (!isCompileTimeComputable(b)) {
-    gemmOp.emitOpError("requires Gemm input B to be statically computed from constants");
-    return failure();
+    bool hasC = hasGemmBias(c);
+    float alpha = gemmOpAdaptor.getAlpha().convertToFloat();
+    float beta = gemmOpAdaptor.getBeta().convertToFloat();
+    RankedTensorType biasType;
+    if (hasC) {
+      auto cType = dyn_cast<RankedTensorType>(c.getType());
+      if (!cType || !cType.hasStaticShape()) {
+        pim::emitUnsupportedStaticShapeDiagnostic(gemmOp, "Gemm bias");
+        return failure();
+      }
+      auto verifiedBiasType = verifyDynamicGemmBiasType(cType, outType);
+      if (failed(verifiedBiasType)) {
+        gemmOp.emitOpError("requires Gemm bias C to be broadcastable to the output shape");
+        return failure();
+      }
+      biasType = *verifiedBiasType;
+    }
+
+    const int64_t expectedBRows = gemmOpAdaptor.getTransB() ? numOutCols : reductionSize;
+    const int64_t expectedBCols = gemmOpAdaptor.getTransB() ? reductionSize : numOutCols;
+    if (aType.getDimSize(0) != numOutRows || bType.getDimSize(0) != expectedBRows
+        || bType.getDimSize(1) != expectedBCols) {
+      gemmOp.emitOpError("has inconsistent A, B, and output shapes");
+      return failure();
+    }
+
+    const int64_t laneCount64 = numOutRows * numOutCols;
+    if (laneCount64 > std::numeric_limits<int32_t>::max()) {
+      gemmOp.emitOpError("requires Gemm dynamic batch lane count to fit in i32");
+      return failure();
+    }
+
+    auto scalarPiecesType = RankedTensorType::get({laneCount64, 1}, outType.getElementType());
+    auto batchOp = createVvdmulBatch(
+      a, b, aType, bType, scalarPiecesType, outType, gemmOpAdaptor.getTransB(), rewriter, loc);
+    auto outputCompute = createDynamicGemmOutputCompute(
+      batchOp.getResult(0), hasC ? c : Value(), scalarPiecesType, biasType, outType, alpha, beta, rewriter, loc);
+    rewriter.replaceOp(gemmOp, outputCompute.getResults());
+    return success();
  }

  auto scaledB = materializeScaledConstantTensor(b, gemmOpAdaptor.getAlpha().convertToFloat(), rewriter, loc);
@@ -590,9 +831,6 @@ LogicalResult GemmToSpatialComputes::matchAndRewrite(ONNXGemmOp gemmOp,
    bType = cast<RankedTensorType>(b.getType());
  }

-  const int64_t numOutRows = outType.getDimSize(0);
-  const int64_t numOutCols = outType.getDimSize(1);
-  const int64_t reductionSize = aType.getDimSize(1);
  if (aType.getDimSize(0) != numOutRows || bType.getDimSize(0) != reductionSize || bType.getDimSize(1) != numOutCols) {
    gemmOp.emitOpError("has inconsistent A, B, and output shapes after transpose handling");
    return failure();
@@ -615,7 +853,7 @@ LogicalResult GemmToSpatialComputes::matchAndRewrite(ONNXGemmOp gemmOp,
  aType = paddedAType;

  Value bias;
-  bool hasC = !isa<ONNXNoneOp>(c.getDefiningOp());
+  bool hasC = hasGemmBias(c);
  auto paddedOutType = RankedTensorType::get({numOutRows, paddedOutCols}, outType.getElementType());
  if (hasC) {
    auto cType = dyn_cast<RankedTensorType>(c.getType());