fix bufferization and weight emission after new gemm patterns

2026-05-27 16:15:10 +02:00
parent 013ae0ac2a
commit 1a5d7d2a3f
10 changed files with 349 additions and 317 deletions
@@ -814,7 +814,7 @@ static SmallVector<Operation*> collectTopLevelCoreLikeOps(func::FuncOp funcOp) {
 struct CoreEmissionResult {
  OnnxMlirCompilerErrorCodes status = CompilerSuccess;
  MemoryReportRow reportRow;
-  llvm::SmallVector<unsigned, 8> usedWeightIndices;
+  llvm::SmallVector<ResolvedWeightView, 8> usedWeights; // Weight views used by the job; position in the vector is the slot.
 };

 template <typename MapTy>
@@ -879,7 +879,6 @@ struct CompiledCoreNode {
  Kind kind = Kind::Op;
  Operation* op = nullptr;
  CompiledCoreOpKind opKind = CompiledCoreOpKind::Load;
-  std::optional<unsigned> weightIndex;
  CompiledIndexExpr lowerBound;
  CompiledIndexExpr upperBound;
  CompiledIndexExpr step;
@@ -978,12 +977,6 @@ static LogicalResult compileCoreEmissionPlan(Block& block,
    opNode.kind = CompiledCoreNode::Kind::Op;
    opNode.op = &op;
    opNode.opKind = *opKind;
-    if (auto vmmOp = dyn_cast<pim::PimVMMOp>(op)) {
-      auto weightIndex = onnx_mlir::resolveWeightIndex(weightOwner, vmmOp);
-      if (!weightIndex)
-        return failure();
-      opNode.weightIndex = *weightIndex;
-    }
    plan.push_back(std::move(opNode));
  }
  return success();
@@ -992,6 +985,9 @@ static LogicalResult compileCoreEmissionPlan(Block& block,
 static LogicalResult executeCompiledCorePlan(const llvm::SmallVectorImpl<CompiledCoreNode>& plan,
                                             PimCodeGen& coreCodeGen,
                                             StaticValueKnowledge& knowledge,
+                                             llvm::function_ref<llvm::FailureOr<unsigned>(pim::PimVMMOp,
+                                                                                          const StaticValueKnowledge&)>
+                                               resolveWeightSlot,
                                             size_t& processedOperations,
                                             std::optional<unsigned> batchLane = std::nullopt,
                                             std::optional<unsigned> batchLaneCount = std::nullopt) {
@@ -1015,7 +1011,7 @@ static LogicalResult executeCompiledCorePlan(const llvm::SmallVectorImpl<Compile
          aliasBindings.bind(iterArg, iterValue);

        if (failed(executeCompiledCorePlan(
-              *node.loopBody, coreCodeGen, knowledge, processedOperations, batchLane, batchLaneCount)))
+              *node.loopBody, coreCodeGen, knowledge, resolveWeightSlot, processedOperations, batchLane, batchLaneCount)))
          return failure();

        auto yieldOp = cast<mlir::scf::YieldOp>(forOp.getRegion().front().getTerminator());
@@ -1048,9 +1044,10 @@ static LogicalResult executeCompiledCorePlan(const llvm::SmallVectorImpl<Compile
      coreCodeGen.codeGenConcatOp(cast<pim::PimConcatOp>(node.op), knowledge);
      break;
    case CompiledCoreOpKind::Vmm:
-      assert(node.weightIndex && "compiled VMM op must have cached weight index");
-      coreCodeGen.codeGenMVMLikeOp<pim::PimVMMOp>(
-        *node.weightIndex, cast<pim::PimVMMOp>(node.op), true, knowledge);
+      if (auto weightSlot = resolveWeightSlot(cast<pim::PimVMMOp>(node.op), knowledge); succeeded(weightSlot))
+        coreCodeGen.codeGenMVMLikeOp<pim::PimVMMOp>(*weightSlot, cast<pim::PimVMMOp>(node.op), true, knowledge);
+      else
+        return failure();
      break;
    case CompiledCoreOpKind::Transpose:
      coreCodeGen.codeGenTransposeOp(cast<pim::PimTransposeOp>(node.op), knowledge);
@@ -1138,6 +1135,9 @@ static int64_t codeGenCoreOps(Block& block,
                              PimCodeGen& coreCodeGen,
                              const StaticValueKnowledge& initialKnowledge,
                              Operation* weightOwner,
+                              llvm::function_ref<llvm::FailureOr<unsigned>(pim::PimVMMOp,
+                                                                           const StaticValueKnowledge&)>
+                                resolveWeightSlot,
                              std::optional<unsigned> batchLane = std::nullopt,
                              std::optional<unsigned> batchLaneCount = std::nullopt) {
  llvm::SmallVector<CompiledCoreNode, 32> plan;
@@ -1146,7 +1146,8 @@ static int64_t codeGenCoreOps(Block& block,

  size_t processedOperations = 0;
  StaticValueKnowledge knowledge = initialKnowledge;
-  auto result = executeCompiledCorePlan(plan, coreCodeGen, knowledge, processedOperations, batchLane, batchLaneCount);
+  auto result =
+    executeCompiledCorePlan(plan, coreCodeGen, knowledge, resolveWeightSlot, processedOperations, batchLane, batchLaneCount);
  return failed(result) ? -1 : static_cast<int64_t>(processedOperations);
 }

@@ -1174,9 +1175,6 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimCode(ModuleOp& moduleOp, std::
  size_t maxCoreId = 0;
  uint64_t nextBatchReportId = 0;

-  // Create Weight Folder
-  auto mapCoreWeightToFileName = createAndPopulateWeightFolder(funcOp, outputDirPath);
-
  SmallVector<Operation*> coreLikeOps = collectTopLevelCoreLikeOps(funcOp);
  SmallDenseMap<memref::GlobalOp, MemEntry, 16> materializedHostGlobals =
    collectMaterializedHostGlobals(moduleOp, funcOp, memory);
@@ -1238,11 +1236,8 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimCode(ModuleOp& moduleOp, std::
    ++nextBatchReportId;
  }

-  auto linkCoreWeights = [&](size_t originalCoreId,
-                             size_t coreId,
-                             ArrayRef<unsigned> usedIndices,
-                             ValueRange weights,
-                             Operation* weightOwner,
+  auto linkCoreWeights = [&](size_t coreId,
+                             ArrayRef<std::string> weightFiles,
                             json::Array& xbarsPerGroup) -> OnnxMlirCompilerErrorCodes {
    auto coreWeightsDirPath = outputDirPath + "/core_" + std::to_string(coreId);
    if (auto error = sys::fs::create_directory(coreWeightsDirPath)) {
@@ -1250,20 +1245,12 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimCode(ModuleOp& moduleOp, std::
      return InvalidOutputFileAccess;
    }

-    auto& mapWeightToFile = mapCoreWeightToFileName[originalCoreId];
-    for (unsigned index : usedIndices) {
-      if (index >= weights.size()) {
-        weightOwner->emitWarning("Weight index " + std::to_string(index) + " is out of range");
-        assert(index < weights.size() && "Weight index is out of range");
-      }
-      mlir::Value weight = weights[index];
-      xbarsPerGroup.push_back(index);
-      assert(mapWeightToFile.contains(weight) && "Weight was not materialized into a file!!");
-      auto& fileName = mapWeightToFile[weight];
+    for (auto [slot, fileName] : llvm::enumerate(weightFiles)) {
+      xbarsPerGroup.push_back(static_cast<int64_t>(slot));
      if (auto error = sys::fs::create_link(outputDirPath + "/weights/" + fileName,
-                                            coreWeightsDirPath + "/crossbar_" + std::to_string(index) + ".bin")) {
+                                            coreWeightsDirPath + "/crossbar_" + std::to_string(slot) + ".bin")) {
        errs() << "Error creating link file: " << (outputDirPath + "/weights/" + fileName) << " to "
-               << (coreWeightsDirPath + "/crossbar_" + std::to_string(index) + ".bin")
+               << (coreWeightsDirPath + "/crossbar_" + std::to_string(slot) + ".bin")
               << "\nError:" << error.message() << '\n';
        return InvalidOutputFileAccess;
      }
@@ -1275,6 +1262,22 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimCode(ModuleOp& moduleOp, std::
  auto emitJob = [&](const CoreEmissionJob& job) -> CoreEmissionResult {
    CoreEmissionResult result;
    PimAcceleratorMemory jobMemory(memory.memEntriesMap, false);
+    llvm::SmallVector<ResolvedWeightView, 8> usedWeights;
+
+    auto resolveWeightSlot = [&](pim::PimVMMOp vmmOp, // Maps the op's weight to a slot index in usedWeights.
+                                 const StaticValueKnowledge& knowledge) -> llvm::FailureOr<unsigned> {
+      auto weightView = onnx_mlir::resolveWeightView(job.coreLikeOp, vmmOp.getWeight(), knowledge);
+      if (failed(weightView)) { // Resolution failed: report the error on the op and propagate failure.
+        vmmOp.emitOpError("requires a statically resolvable dense global weight view during PIM codegen");
+        return failure();
+      }

+      if (auto it = llvm::find(usedWeights, *weightView); it != usedWeights.end()) // Seen before: reuse its slot.
+        return static_cast<unsigned>(std::distance(usedWeights.begin(), it));

+      usedWeights.push_back(*weightView); // First use: append, so the new slot is the last index.
+      return static_cast<unsigned>(usedWeights.size() - 1);
+    };

    std::error_code errorCode;
    auto outputCorePath = outputDirPath + "/core_" + std::to_string(job.emittedCoreId) + ".pim";
@@ -1307,21 +1310,20 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimCode(ModuleOp& moduleOp, std::
      auto& deviceMemory = jobMemory.getOrCreateDeviceMem(job.emittedCoreId);
      deviceMemory.allocateCore(coreOp);

-      int64_t processedOperations =
-        codeGenCoreOps(coreOp.getBody().front(), coreCodeGen, StaticValueKnowledge {}, coreOp.getOperation());
+      int64_t processedOperations = codeGenCoreOps(
+        coreOp.getBody().front(), coreCodeGen, StaticValueKnowledge {}, coreOp.getOperation(), resolveWeightSlot);
      if (processedOperations < 0) {
        result.status = CompilerFailure;
        return result;
      }
      assert(processedOperations > 0);
      result.reportRow = deviceMemory.getReportRow();
-      result.usedWeightIndices = getUsedWeightIndices(coreOp);
+      result.usedWeights = std::move(usedWeights);
    }
    else {
      auto coreBatchOp = cast<pim::PimCoreBatchOp>(job.coreLikeOp);
      aliasMaterializedHostGlobals(coreBatchOp, moduleOp, materializedHostGlobals, jobMemory);
      auto& deviceMemory = jobMemory.getOrCreateDeviceMem(job.emittedCoreId);
-      result.usedWeightIndices = getUsedWeightIndices(coreBatchOp);

      for (unsigned lane : job.lanes) {
        StaticValueKnowledge knowledge;
@@ -1335,6 +1337,7 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimCode(ModuleOp& moduleOp, std::
                                                     coreCodeGen,
                                                     knowledge,
                                                     coreBatchOp.getOperation(),
+                                                     resolveWeightSlot,
                                                     lane,
                                                     static_cast<unsigned>(coreBatchOp.getLaneCount()));
        if (processedOperations < 0) {
@@ -1345,6 +1348,7 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimCode(ModuleOp& moduleOp, std::
      }

      result.reportRow = deviceMemory.getReportRow();
+      result.usedWeights = std::move(usedWeights);
    }

    pim_binary::patchInstructionCount(coreBinaryStream, coreCodeGen.getEmittedInstructionCount());
@@ -1368,14 +1372,23 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimCode(ModuleOp& moduleOp, std::
    if (jobResults[jobIndex].status != CompilerSuccess)
      return jobResults[jobIndex].status;

+  llvm::SmallVector<WeightFileRequest, 8> weightRequests;
+  weightRequests.reserve(jobs.size());
+  for (size_t jobIndex = 0; jobIndex < jobs.size(); ++jobIndex) {
+    WeightFileRequest request;
+    request.coreId = jobs[jobIndex].emittedCoreId;
+    request.weights = jobResults[jobIndex].usedWeights;
+    weightRequests.push_back(std::move(request));
+  }
+  auto mapCoreWeightToFileName = createAndPopulateWeightFolder(weightRequests, outputDirPath);
+
  for (size_t jobIndex = 0; jobIndex < jobs.size(); ++jobIndex) {
    const CoreEmissionJob& job = jobs[jobIndex];
    const CoreEmissionResult& result = jobResults[jobIndex];
    json::Array xbarsPerGroup;

    if (auto coreOp = dyn_cast<pim::PimCoreOp>(job.coreLikeOp)) {
-      if (auto err = linkCoreWeights(
-            job.originalCoreId, job.emittedCoreId, result.usedWeightIndices, coreOp.getWeights(), coreOp.getOperation(), xbarsPerGroup))
+      if (auto err = linkCoreWeights(job.emittedCoreId, mapCoreWeightToFileName[job.emittedCoreId], xbarsPerGroup))
        return err;
      xbarsPerArrayGroup["core" + std::to_string(job.emittedCoreId)] = std::move(xbarsPerGroup);
      memory.recordCoreReport(job.emittedCoreId, result.reportRow);
@@ -1391,14 +1404,8 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimCode(ModuleOp& moduleOp, std::
    for (size_t jobIndex : group) {
      const CoreEmissionJob& job = jobs[jobIndex];
      const CoreEmissionResult& result = jobResults[jobIndex];
-      auto coreBatchOp = cast<pim::PimCoreBatchOp>(job.coreLikeOp);
      json::Array xbarsPerGroup;
-      if (auto err = linkCoreWeights(job.originalCoreId,
-                                     job.emittedCoreId,
-                                     result.usedWeightIndices,
-                                     coreBatchOp.getWeights(),
-                                     coreBatchOp.getOperation(),
-                                     xbarsPerGroup))
+      if (auto err = linkCoreWeights(job.emittedCoreId, mapCoreWeightToFileName[job.emittedCoreId], xbarsPerGroup))
        return err;
      xbarsPerArrayGroup["core" + std::to_string(job.emittedCoreId)] = std::move(xbarsPerGroup);
      reportedCoreIds.push_back(static_cast<int32_t>(job.emittedCoreId));