automatic code formatting

use seed in validate.py for deterministic tests
fix wrong send/receive reordering in post-dcp-merge instruction compaction
2026-05-13 21:51:19 +02:00 · 2026-05-13 21:49:36 +02:00 · 2026-05-13 21:48:49 +02:00
16 changed files with 296 additions and 144 deletions
@@ -1,7 +1,6 @@
-#include "src/Accelerators/PIM/Common/IR/SubviewUtils.hpp"
-
 #include "mlir/IR/BuiltinTypeInterfaces.h"

+#include "src/Accelerators/PIM/Common/IR/SubviewUtils.hpp"
 #include "src/Accelerators/PIM/Common/PimCommon.hpp"

 using namespace mlir;
@@ -1,8 +1,7 @@
-#include "src/Accelerators/PIM/Common/Support/ReportUtils.hpp"
-
 #include "llvm/Support/Format.h"

 #include "src/Accelerators/PIM/Common/Support/FileSystemUtils.hpp"
+#include "src/Accelerators/PIM/Common/Support/ReportUtils.hpp"

 namespace onnx_mlir {

@@ -1,10 +1,9 @@
 #pragma once

-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/raw_ostream.h"

-#include <cstdint>
 #include <fstream>
 #include <limits>
 #include <string>
@@ -70,9 +70,7 @@ inline void writeUint32LE(llvm::raw_ostream& os, uint32_t value) {
  os.write(bytes.data(), bytes.size());
 }

-inline void writeInt32LE(llvm::raw_ostream& os, int32_t value) {
-  writeUint32LE(os, static_cast<uint32_t>(value));
-}
+inline void writeInt32LE(llvm::raw_ostream& os, int32_t value) { writeUint32LE(os, static_cast<uint32_t>(value)); }

 inline void writeHeader(llvm::raw_ostream& os) {
  os.write(kMagic, sizeof(kMagic));
@@ -186,39 +184,39 @@ inline Opcode opcodeFromString(llvm::StringRef opName) {

 inline llvm::StringRef opcodeToString(Opcode opcode) {
  switch (opcode) {
-  case Opcode::nop: return "nop";
-  case Opcode::sldi: return "sldi";
-  case Opcode::sld: return "sld";
-  case Opcode::sadd: return "sadd";
-  case Opcode::ssub: return "ssub";
-  case Opcode::smul: return "smul";
-  case Opcode::saddi: return "saddi";
-  case Opcode::smuli: return "smuli";
-  case Opcode::setbw: return "setbw";
-  case Opcode::mvmul: return "mvmul";
-  case Opcode::vvadd: return "vvadd";
-  case Opcode::vvsub: return "vvsub";
-  case Opcode::vvmul: return "vvmul";
-  case Opcode::vvdmul: return "vvdmul";
-  case Opcode::vvmax: return "vvmax";
-  case Opcode::vvsll: return "vvsll";
-  case Opcode::vvsra: return "vvsra";
-  case Opcode::vavg: return "vavg";
-  case Opcode::vrelu: return "vrelu";
-  case Opcode::vtanh: return "vtanh";
-  case Opcode::vsigm: return "vsigm";
+  case Opcode::nop:      return "nop";
+  case Opcode::sldi:     return "sldi";
+  case Opcode::sld:      return "sld";
+  case Opcode::sadd:     return "sadd";
+  case Opcode::ssub:     return "ssub";
+  case Opcode::smul:     return "smul";
+  case Opcode::saddi:    return "saddi";
+  case Opcode::smuli:    return "smuli";
+  case Opcode::setbw:    return "setbw";
+  case Opcode::mvmul:    return "mvmul";
+  case Opcode::vvadd:    return "vvadd";
+  case Opcode::vvsub:    return "vvsub";
+  case Opcode::vvmul:    return "vvmul";
+  case Opcode::vvdmul:   return "vvdmul";
+  case Opcode::vvmax:    return "vvmax";
+  case Opcode::vvsll:    return "vvsll";
+  case Opcode::vvsra:    return "vvsra";
+  case Opcode::vavg:     return "vavg";
+  case Opcode::vrelu:    return "vrelu";
+  case Opcode::vtanh:    return "vtanh";
+  case Opcode::vsigm:    return "vsigm";
  case Opcode::vsoftmax: return "vsoftmax";
-  case Opcode::vmv: return "vmv";
-  case Opcode::vrsu: return "vrsu";
-  case Opcode::vrsl: return "vrsl";
-  case Opcode::ld: return "ld";
-  case Opcode::st: return "st";
-  case Opcode::lldi: return "lldi";
-  case Opcode::lmv: return "lmv";
-  case Opcode::send: return "send";
-  case Opcode::recv: return "recv";
-  case Opcode::wait: return "wait";
-  case Opcode::sync: return "sync";
+  case Opcode::vmv:      return "vmv";
+  case Opcode::vrsu:     return "vrsu";
+  case Opcode::vrsl:     return "vrsl";
+  case Opcode::ld:       return "ld";
+  case Opcode::st:       return "st";
+  case Opcode::lldi:     return "lldi";
+  case Opcode::lmv:      return "lmv";
+  case Opcode::send:     return "send";
+  case Opcode::recv:     return "recv";
+  case Opcode::wait:     return "wait";
+  case Opcode::sync:     return "sync";
  }
  llvm_unreachable("Unsupported PIM binary opcode");
 }
@@ -235,9 +233,7 @@ inline InstructionRecord makeInstructionRecord(const llvm::json::Object& instruc
  case Opcode::sldi:
  case Opcode::saddi:
  case Opcode::smuli:
-  case Opcode::lldi:
-    record.r2OrImm = getOptionalInt(instruction, "imm");
-    break;
+  case Opcode::lldi:  record.r2OrImm = getOptionalInt(instruction, "imm"); break;
  case Opcode::mvmul:
    record.r2OrImm = getOptionalInt(instruction, "mbiw");
    record.generic1 = getOptionalInt(instruction, "relu");
@@ -252,9 +248,7 @@ inline InstructionRecord makeInstructionRecord(const llvm::json::Object& instruc
    record.r2OrImm = getOptionalInt(instruction, "core");
    record.generic3 = getOptionalInt(instruction, "size");
    break;
-  default:
-    record.r2OrImm = getOptionalInt(instruction, "rs2");
-    break;
+  default: record.r2OrImm = getOptionalInt(instruction, "rs2"); break;
  }

  if (record.opcode != Opcode::mvmul && record.opcode != Opcode::setbw) {
@@ -371,8 +365,7 @@ inline llvm::json::Object makeInstructionJson(const InstructionRecord& record) {
    break;
  case Opcode::wait:
  case Opcode::sync:
-  case Opcode::nop:
-    break;
+  case Opcode::nop:  break;
  }

  return instruction;
@@ -367,7 +367,7 @@ void PimCodeGen::emitMemCopyOp(StringRef opName,
  instruction.generic1 = 0;
  instruction.generic2 = 0;
  instruction.generic3 = static_cast<int32_t>(size);
-  (void)sizeFieldName;
+  (void) sizeFieldName;
  emitInstruction(instruction);
 }

@@ -1,5 +1,4 @@
 #include "src/Accelerators/PIM/Conversion/SpatialToPim/TensorPackingPatterns.hpp"
-
 #include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp"

 using namespace mlir;
@@ -75,16 +74,14 @@ struct PackSpatialConcatInputsPattern final : OpRewritePattern<spatial::SpatConc
      return failure();

    auto outputType = cast<ShapedType>(concatOp.getOutput().getType());
-    auto newConcat = pim::PimConcatOp::create(rewriter,
-                                             concatOp.getLoc(),
-                                             concatOp.getOutput().getType(),
-                                             concatOp.getAxisAttr(),
-                                             ValueRange(packedInputs),
-                                             tensor::EmptyOp::create(rewriter,
-                                                                     concatOp.getLoc(),
-                                                                     outputType.getShape(),
-                                                                     outputType.getElementType())
-                                               .getResult());
+    auto newConcat = pim::PimConcatOp::create(
+      rewriter,
+      concatOp.getLoc(),
+      concatOp.getOutput().getType(),
+      concatOp.getAxisAttr(),
+      ValueRange(packedInputs),
+      tensor::EmptyOp::create(rewriter, concatOp.getLoc(), outputType.getShape(), outputType.getElementType())
+        .getResult());
    rewriter.replaceOp(concatOp, newConcat.getOutput());
    return success();
  }
@@ -1,7 +1,7 @@
 #pragma once

-#include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/IR/PatternMatch.h"

 #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
@@ -1,15 +1,15 @@
-#include "src/Accelerators/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/StaticMemoryCoalescing.hpp"
-
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Interfaces/DestinationStyleOpInterface.h"

 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/STLExtras.h"

 #include <limits>

+#include "src/Accelerators/PIM/Dialect/Pim/Transforms/StaticMemoryCoalescing/StaticMemoryCoalescing.hpp"
+
 using namespace mlir;

 namespace onnx_mlir {
@@ -29,9 +29,8 @@ static uint64_t getTypeSizeBytes(MemRefType type) {
  return static_cast<uint64_t>(type.getNumElements() * type.getElementTypeBitWidth() / 8);
 }

-static FailureOr<uint64_t> getLastUseInstruction(memref::AllocOp allocOp,
-                                                 Block& body,
-                                                 const DenseMap<Operation*, uint64_t>& opOrder) {
+static FailureOr<uint64_t>
+getLastUseInstruction(memref::AllocOp allocOp, Block& body, const DenseMap<Operation*, uint64_t>& opOrder) {
  uint64_t endInstruction = opOrder.lookup(allocOp);
  SmallPtrSet<Operation*, 16> visited;
  SmallVector<Value> pendingValues;
@@ -45,10 +44,9 @@ static FailureOr<uint64_t> getLastUseInstruction(memref::AllocOp allocOp,
      if (!visited.insert(user).second)
        continue;

-      if (isSupportedAliasOp(user)) {
+      if (isSupportedAliasOp(user))
        for (Value result : user->getResults())
          pendingValues.push_back(result);
-      }

      if (auto dpsOp = dyn_cast<DestinationStyleOpInterface>(user)) {
        for (OpResult result : user->getResults()) {
@@ -2,7 +2,6 @@

 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/IR/PatternMatch.h"
-#include "mlir/IR/Operation.h"

 #include "llvm/ADT/SmallVector.h"

@@ -45,9 +45,7 @@ struct CoalescingReportEntry {
  CoalescingReportRow row;
 };

-static std::string formatMemory(uint64_t bytes) {
-  return formatReportMemory(bytes);
-}
+static std::string formatMemory(uint64_t bytes) { return formatReportMemory(bytes); }

 static SmallVector<int32_t> getBatchCoreIds(pim::PimCoreBatchOp coreBatchOp) {
  auto coreIdsAttr = coreBatchOp->getAttrOfType<DenseI32ArrayAttr>(onnx_mlir::kCoreIdsAttrName);
@@ -58,9 +56,10 @@ static SmallVector<int32_t> getBatchCoreIds(pim::PimCoreBatchOp coreBatchOp) {
 static void printReportRow(raw_ostream& os, const CoalescingReportRow& row) {
  llvm::SmallVector<ReportField, 4> fields = {
    {"Number of candidates", std::to_string(row.numCandidates)},
-    {"Skipped allocations", std::to_string(row.numSkipped)},
-    {"Removed allocations", std::to_string(row.numRemoved)},
-    {"Saved memory", formatMemory(row.savedBytes)}};
+    {"Skipped allocations",  std::to_string(row.numSkipped)   },
+    {"Removed allocations",  std::to_string(row.numRemoved)   },
+    {"Saved memory",         formatMemory(row.savedBytes)     }
+  };
  printReportFlatFields(os, fields);
 }

@@ -87,10 +86,12 @@ static void emitReport(ArrayRef<CoalescingReportEntry> entries) {
    totalRow.savedBytes += entryTotal.savedBytes;
  }

-  llvm::SmallVector<ReportField, 4> totalFields = {{"Number of candidates", std::to_string(totalRow.numCandidates)},
-                                                   {"Skipped allocations", std::to_string(totalRow.numSkipped)},
-                                                   {"Removed allocations", std::to_string(totalRow.numRemoved)},
-                                                   {"Saved memory", formatMemory(totalRow.savedBytes)}};
+  llvm::SmallVector<ReportField, 4> totalFields = {
+    {"Number of candidates", std::to_string(totalRow.numCandidates)},
+    {"Skipped allocations",  std::to_string(totalRow.numSkipped)   },
+    {"Removed allocations",  std::to_string(totalRow.numRemoved)   },
+    {"Saved memory",         formatMemory(totalRow.savedBytes)     }
+  };
  printReportTotalsBlock(os, totalFields);
  if (!entries.empty())
    os << "\n";
@@ -127,15 +128,17 @@ static void emitReport(ArrayRef<CoalescingReportEntry> entries) {
    if (sortedEntries[index].kind == CoalescingReportEntry::Kind::Batch) {
      llvm::SmallVector<ReportField, 4> perCoreFields = {
        {"Number of candidates", std::to_string(sortedEntries[index].row.numCandidates)},
-        {"Skipped allocations", std::to_string(sortedEntries[index].row.numSkipped)},
-        {"Removed allocations", std::to_string(sortedEntries[index].row.numRemoved)},
-        {"Saved memory", formatMemory(sortedEntries[index].row.savedBytes)}};
+        {"Skipped allocations",  std::to_string(sortedEntries[index].row.numSkipped)   },
+        {"Removed allocations",  std::to_string(sortedEntries[index].row.numRemoved)   },
+        {"Saved memory",         formatMemory(sortedEntries[index].row.savedBytes)     }
+      };
      CoalescingReportRow totalRow = getTotalRow(sortedEntries[index]);
      llvm::SmallVector<ReportField, 4> totalFields = {
        {"Number of candidates", std::to_string(totalRow.numCandidates)},
-        {"Skipped allocations", std::to_string(totalRow.numSkipped)},
-        {"Removed allocations", std::to_string(totalRow.numRemoved)},
-        {"Saved memory", formatMemory(totalRow.savedBytes)}};
+        {"Skipped allocations",  std::to_string(totalRow.numSkipped)   },
+        {"Removed allocations",  std::to_string(totalRow.numRemoved)   },
+        {"Saved memory",         formatMemory(totalRow.savedBytes)     }
+      };
      printReportPerCoreAndTotalFields(os, perCoreFields, totalFields);
    }
    else {
@@ -196,8 +199,6 @@ struct StaticMemoryCoalescingPass : PassWrapper<StaticMemoryCoalescingPass, Oper

 } // namespace

-std::unique_ptr<Pass> createPimStaticMemoryCoalescingPass() {
-  return std::make_unique<StaticMemoryCoalescingPass>();
-}
+std::unique_ptr<Pass> createPimStaticMemoryCoalescingPass() { return std::make_unique<StaticMemoryCoalescingPass>(); }

 } // namespace onnx_mlir
@@ -818,13 +818,14 @@ void generateReport(func::FuncOp funcOp, const std::string& name, size_t usedCpu
    }
  }

-  llvm::SmallVector<ReportField, 6> totalFields = {{"Used cores", std::to_string(usedCpuCount)},
-                                                   {"Number of top-level compute ops", std::to_string(totalComputeOps)},
-                                                   {"Number of logical computes", std::to_string(totalLogicalComputes)},
-                                                   {"Number of top-level batch compute ops",
-                                                    std::to_string(totalBatchComputeOps)},
-                                                   {"Number of instructions", std::to_string(totalInstructionCount)},
-                                                   {"Number of used crossbars", std::to_string(totalWeightCount)}};
+  llvm::SmallVector<ReportField, 6> totalFields = {
+    {"Used cores",                            std::to_string(usedCpuCount)         },
+    {"Number of top-level compute ops",       std::to_string(totalComputeOps)      },
+    {"Number of logical computes",            std::to_string(totalLogicalComputes) },
+    {"Number of top-level batch compute ops", std::to_string(totalBatchComputeOps) },
+    {"Number of instructions",                std::to_string(totalInstructionCount)},
+    {"Number of used crossbars",              std::to_string(totalWeightCount)     }
+  };
  printReportTotalsBlock(os, totalFields);
  if (!collectedData.empty())
    os << "\n";
@@ -876,13 +877,15 @@ void generateReport(func::FuncOp funcOp, const std::string& name, size_t usedCpu

    llvm::SmallVector<ReportField, 3> perCoreFields = {
      {"Number of logical computes", std::to_string(perCoreLogicalComputeCount)},
-      {"Number of instructions", std::to_string(perCoreInstructionCount)},
-      {"Number of used crossbars", std::to_string(perCoreWeightCount)}};
+      {"Number of instructions",     std::to_string(perCoreInstructionCount)   },
+      {"Number of used crossbars",   std::to_string(perCoreWeightCount)        }
+    };
    if (current.isRebatched) {
      llvm::SmallVector<ReportField, 3> totalEntryFields = {
        {"Number of logical computes", std::to_string(current.logicalComputeCount)},
-        {"Number of instructions", std::to_string(totalEntryInstructionCount)},
-        {"Number of used crossbars", std::to_string(current.weightCount)}};
+        {"Number of instructions",     std::to_string(totalEntryInstructionCount) },
+        {"Number of used crossbars",   std::to_string(current.weightCount)        }
+      };
      printReportPerCoreAndTotalFields(os, perCoreFields, totalEntryFields);
    }
    else {
@@ -1003,6 +1006,23 @@ public:
      DenseMap<Value, Value> externalInputMap;
      DenseMap<Value, size_t> weightToIndex;
    };
+    struct RemoteSendInfo {
+      ChannelInfo channelInfo;
+      ComputeInstance consumer;
+      size_t inputIndex = 0;
+      size_t consumerOrder = 0;
+      size_t sourceOrder = 0;
+    };
+    struct RemoteReceiveEntry {
+      ChannelInfo channelInfo;
+      ComputeInstance consumer;
+      size_t inputIndex = 0;
+      size_t sourceOrder = 0;
+    };
+    auto getRemoteSendPairKey = [](const ChannelInfo& channelInfo) {
+      return (static_cast<uint64_t>(static_cast<uint32_t>(channelInfo.sourceCoreId)) << 32)
+           | static_cast<uint32_t>(channelInfo.targetCoreId);
+    };

    auto getTaskInputs = [&](const ScheduledTask& task) {
      SmallVector<Value> inputs;
@@ -1143,7 +1163,7 @@ public:
      }
    };

-    DenseMap<ComputeInstance, SmallVector<SmallVector<ChannelInfo>>> remoteSendsByTask;
+    DenseMap<ComputeInstance, SmallVector<SmallVector<RemoteSendInfo>>> remoteSendsByTask;
    DenseMap<ComputeInstance, SmallVector<std::optional<ChannelInfo>>> remoteInputsByTask;
    DenseMap<size_t, SmallVector<Value>> cpuExternalInputs;
    DenseMap<size_t, SmallVector<Value>> cpuWeights;
@@ -1176,7 +1196,7 @@ public:
                auto& perResultChannels = remoteSendsByTask[producerRef->instance];
                if (perResultChannels.empty())
                  perResultChannels.resize(getTaskOutputTypes(producerIt->second).size());
-                perResultChannels[producerRef->resultIndex].push_back(info);
+                perResultChannels[producerRef->resultIndex].push_back({info, task.key, inputIndex, task.order, 0});
              }
              continue;
            }
@@ -1201,6 +1221,79 @@ public:
      }
    }

+    DenseSet<uint64_t> pairsNeedingReceiveReorder;
+    for (size_t cpu : orderedCpus) {
+      DenseMap<uint64_t, size_t> nextSourceOrderByPair;
+      DenseMap<uint64_t, size_t> lastConsumerOrderByPair;
+      for (const ScheduledTask& task : tasksByCpu[cpu]) {
+        auto sendsIt = remoteSendsByTask.find(task.key);
+        if (sendsIt == remoteSendsByTask.end())
+          continue;
+        for (auto& sendInfos : sendsIt->second) {
+          for (RemoteSendInfo& sendInfo : sendInfos) {
+            uint64_t pairKey = getRemoteSendPairKey(sendInfo.channelInfo);
+            sendInfo.sourceOrder = nextSourceOrderByPair[pairKey]++;
+            auto [it, inserted] = lastConsumerOrderByPair.try_emplace(pairKey, sendInfo.consumerOrder);
+            if (!inserted) {
+              if (sendInfo.consumerOrder < it->second)
+                pairsNeedingReceiveReorder.insert(pairKey);
+              it->second = sendInfo.consumerOrder;
+            }
+          }
+        }
+      }
+    }
+
+    DenseMap<uint64_t, SmallVector<RemoteSendInfo*>> reorderedSendsByPair;
+    for (auto& taskSends : remoteSendsByTask) {
+      for (auto& sendInfos : taskSends.second) {
+        for (RemoteSendInfo& sendInfo : sendInfos) {
+          uint64_t pairKey = getRemoteSendPairKey(sendInfo.channelInfo);
+          if (pairsNeedingReceiveReorder.contains(pairKey))
+            reorderedSendsByPair[pairKey].push_back(&sendInfo);
+        }
+      }
+    }
+    for (auto& pairSends : reorderedSendsByPair) {
+      llvm::stable_sort(pairSends.second, [](const RemoteSendInfo* lhs, const RemoteSendInfo* rhs) {
+        if (lhs->sourceOrder != rhs->sourceOrder)
+          return lhs->sourceOrder < rhs->sourceOrder;
+        return lhs->channelInfo.channelId < rhs->channelInfo.channelId;
+      });
+      for (RemoteSendInfo* sendInfo : pairSends.second) {
+        int64_t channelId = nextChannelId++;
+        sendInfo->channelInfo.channelId = channelId;
+        auto remoteInputsIt = remoteInputsByTask.find(sendInfo->consumer);
+        assert(remoteInputsIt != remoteInputsByTask.end() && "missing remote input for reordered send");
+        assert(sendInfo->inputIndex < remoteInputsIt->second.size() && "remote input index out of range");
+        assert(remoteInputsIt->second[sendInfo->inputIndex] && "missing reordered remote input channel");
+        remoteInputsIt->second[sendInfo->inputIndex]->channelId = channelId;
+      }
+    }
+
+    DenseMap<size_t, DenseMap<uint64_t, SmallVector<RemoteReceiveEntry>>> receiveQueuesByCpu;
+    for (auto& taskSends : remoteSendsByTask) {
+      for (const auto& sendInfos : taskSends.second) {
+        for (const RemoteSendInfo& sendInfo : sendInfos) {
+          uint64_t pairKey = getRemoteSendPairKey(sendInfo.channelInfo);
+          if (!pairsNeedingReceiveReorder.contains(pairKey))
+            continue;
+          size_t targetCpu = static_cast<size_t>(sendInfo.channelInfo.targetCoreId - 1);
+          receiveQueuesByCpu[targetCpu][pairKey].push_back(
+            {sendInfo.channelInfo, sendInfo.consumer, sendInfo.inputIndex, sendInfo.sourceOrder});
+        }
+      }
+    }
+    for (auto& cpuQueues : receiveQueuesByCpu) {
+      for (auto& pairQueue : cpuQueues.second) {
+        llvm::stable_sort(pairQueue.second, [](const RemoteReceiveEntry& lhs, const RemoteReceiveEntry& rhs) {
+          if (lhs.sourceOrder != rhs.sourceOrder)
+            return lhs.sourceOrder < rhs.sourceOrder;
+          return lhs.channelInfo.channelId < rhs.channelInfo.channelId;
+        });
+      }
+    }
+
    auto returnOp = cast<func::ReturnOp>(func.getBody().front().getTerminator());
    IRRewriter rewriter(&getContext());
    DenseMap<size_t, CpuProgram> cpuPrograms;
@@ -1255,6 +1348,59 @@ public:
      CpuProgram& program = cpuPrograms[cpu];
      IRRewriter cpuRewriter(&getContext());
      cpuRewriter.setInsertionPointToEnd(program.block);
+      DenseMap<uint64_t, size_t> receiveQueueIndices;
+      DenseMap<ComputeInstance, SmallVector<Value>> preReceivedInputsByTask;
+
+      auto lookupPreReceivedInput = [&](ComputeInstance consumer, size_t inputIndex) -> std::optional<Value> {
+        auto inputsIt = preReceivedInputsByTask.find(consumer);
+        if (inputsIt == preReceivedInputsByTask.end() || inputsIt->second.size() <= inputIndex)
+          return std::nullopt;
+        Value value = inputsIt->second[inputIndex];
+        if (!value)
+          return std::nullopt;
+        return value;
+      };
+
+      auto receiveThroughInput = [&](const ChannelInfo& requestedChannelInfo,
+                                     ComputeInstance requestedConsumer,
+                                     size_t requestedInputIndex) -> std::optional<Value> {
+        uint64_t pairKey = getRemoteSendPairKey(requestedChannelInfo);
+        auto cpuQueuesIt = receiveQueuesByCpu.find(cpu);
+        if (cpuQueuesIt == receiveQueuesByCpu.end())
+          return std::nullopt;
+        auto queueIt = cpuQueuesIt->second.find(pairKey);
+        if (queueIt == cpuQueuesIt->second.end())
+          return std::nullopt;
+
+        auto& queue = queueIt->second;
+        size_t& queueIndex = receiveQueueIndices[pairKey];
+        while (queueIndex < queue.size()) {
+          const RemoteReceiveEntry& entry = queue[queueIndex++];
+          auto consumerTaskIt = taskByKey.find(entry.consumer);
+          if (consumerTaskIt == taskByKey.end())
+            return std::nullopt;
+          SmallVector<Value> consumerInputs = getTaskInputs(consumerTaskIt->second);
+          if (consumerInputs.size() <= entry.inputIndex)
+            return std::nullopt;
+          Type inputType = consumerInputs[entry.inputIndex].getType();
+          auto receive =
+            spatial::SpatChannelReceiveOp::create(cpuRewriter,
+                                                  loc,
+                                                  inputType,
+                                                  cpuRewriter.getI64IntegerAttr(entry.channelInfo.channelId),
+                                                  cpuRewriter.getI32IntegerAttr(entry.channelInfo.sourceCoreId),
+                                                  cpuRewriter.getI32IntegerAttr(entry.channelInfo.targetCoreId));
+
+          auto& receivedInputs = preReceivedInputsByTask[entry.consumer];
+          if (receivedInputs.size() <= entry.inputIndex)
+            receivedInputs.resize(entry.inputIndex + 1);
+          receivedInputs[entry.inputIndex] = receive.getResult();
+
+          if (entry.consumer == requestedConsumer && entry.inputIndex == requestedInputIndex)
+            return receive.getResult();
+        }
+        return std::nullopt;
+      };

      for (const ScheduledTask& task : tasksByCpu[cpu]) {
        SmallVector<Value> taskInputs = getTaskInputs(task);
@@ -1284,6 +1430,24 @@ public:
                continue;
              }
              const ChannelInfo& channelInfo = *remoteInputsIt->second[inputIndex];
+              uint64_t pairKey = getRemoteSendPairKey(channelInfo);
+              if (pairsNeedingReceiveReorder.contains(pairKey)) {
+                if (std::optional<Value> preReceived = lookupPreReceivedInput(task.key, inputIndex)) {
+                  resolvedInputs.push_back(*preReceived);
+                  continue;
+                }
+                std::optional<Value> received = receiveThroughInput(channelInfo, task.key, inputIndex);
+                if (!received) {
+                  task.sourceOp->emitOpError("failed to materialize reordered remote receive")
+                    << " consumerCpu=" << cpu << " consumerSlot=" << task.slot
+                    << " sourceCoreId=" << channelInfo.sourceCoreId << " targetCoreId=" << channelInfo.targetCoreId
+                    << " channelId=" << channelInfo.channelId;
+                  signalPassFailure();
+                  return;
+                }
+                resolvedInputs.push_back(*received);
+                continue;
+              }
              auto receive =
                spatial::SpatChannelReceiveOp::create(cpuRewriter,
                                                      loc,
@@ -1367,13 +1531,14 @@ public:
            if (sendInfos.empty())
              continue;
            Value producedValue = taskYieldValues[resultIndex];
-            for (const ChannelInfo& sendInfo : sendInfos)
+            for (const RemoteSendInfo& sendInfo : sendInfos) {
              spatial::SpatChannelSendOp::create(cpuRewriter,
                                                 loc,
-                                                 cpuRewriter.getI64IntegerAttr(sendInfo.channelId),
-                                                 cpuRewriter.getI32IntegerAttr(sendInfo.sourceCoreId),
-                                                 cpuRewriter.getI32IntegerAttr(sendInfo.targetCoreId),
+                                                 cpuRewriter.getI64IntegerAttr(sendInfo.channelInfo.channelId),
+                                                 cpuRewriter.getI32IntegerAttr(sendInfo.channelInfo.sourceCoreId),
+                                                 cpuRewriter.getI32IntegerAttr(sendInfo.channelInfo.targetCoreId),
                                                 producedValue);
+            }
          }
        }
      }
@@ -1666,23 +1831,21 @@ private:
    IRRewriter rewriter(context);

    rewriter.setInsertionPointAfter(producerOp);
-    auto savedSendInsertPoint = rewriter.saveInsertionPoint();
-    auto insertNew = [this, savedSendInsertPoint, context, loc, computeValueResults, producerCpu](size_t resultIndex,
-                                                                                                  size_t targetCpu) {
+    auto insertNew = [this, context, loc, computeValueResults, producerCpu](size_t resultIndex, size_t targetCpu) {
      auto channelId = nextChannelId++;
      LazyInsertComputeResult::ChannelInfo channelInfo {
        channelId, getPhysicalCoreId(producerCpu), getPhysicalCoreId(targetCpu)};
-      auto insertVal = [&context, loc, computeValueResults, channelInfo, resultIndex, savedSendInsertPoint](
-                         mlir::IRRewriter::InsertPoint) {
-        IRRewriter rewriter(context);
-        rewriter.restoreInsertionPoint(savedSendInsertPoint);
-        spatial::SpatChannelSendOp::create(rewriter,
-                                           loc,
-                                           rewriter.getI64IntegerAttr(channelInfo.channelId),
-                                           rewriter.getI32IntegerAttr(channelInfo.sourceCoreId),
-                                           rewriter.getI32IntegerAttr(channelInfo.targetCoreId),
-                                           computeValueResults.getOuter(resultIndex));
-      };
+      auto insertVal =
+        [&context, loc, computeValueResults, channelInfo, resultIndex](mlir::IRRewriter::InsertPoint insertPoint) {
+          IRRewriter rewriter(context);
+          rewriter.restoreInsertionPoint(insertPoint);
+          spatial::SpatChannelSendOp::create(rewriter,
+                                             loc,
+                                             rewriter.getI64IntegerAttr(channelInfo.channelId),
+                                             rewriter.getI32IntegerAttr(channelInfo.sourceCoreId),
+                                             rewriter.getI32IntegerAttr(channelInfo.targetCoreId),
+                                             computeValueResults.getOuter(resultIndex));
+        };
      std::pair<LazyInsertComputeResult::ChannelInfo, std::function<void(mlir::IRRewriter::InsertPoint)>> ret {
        channelInfo, insertVal};
      return ret;
@@ -10,8 +10,6 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"

-#include <tuple>
-
 #include "RegularOpCompaction.hpp"
 #include "src/Accelerators/PIM/Conversion/SpatialToPim/TensorPackingPatterns.hpp"
 #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
@@ -340,7 +338,18 @@ void compactScalarChannelRuns(func::FuncOp funcOp, int64_t& nextChannelId) {
          ++runIt;
        }

-        if (run.size() > 1) {
+        bool hasRepeatedEndpoint = false;
+        for (size_t lhs = 0; lhs < run.size() && !hasRepeatedEndpoint; ++lhs) {
+          for (size_t rhs = lhs + 1; rhs < run.size(); ++rhs) {
+            if (run[lhs].getSourceCoreId() == run[rhs].getSourceCoreId()
+                && run[lhs].getTargetCoreId() == run[rhs].getTargetCoreId()) {
+              hasRepeatedEndpoint = true;
+              break;
+            }
+          }
+        }
+
+        if (run.size() > 1 && !hasRepeatedEndpoint) {
          struct ReceiveEntry {
            spatial::SpatChannelReceiveOp op;
            size_t originalIndex = 0;
@@ -352,10 +361,6 @@ void compactScalarChannelRuns(func::FuncOp funcOp, int64_t& nextChannelId) {
          sortedEntries.reserve(run.size());
          for (auto [originalIndex, op] : llvm::enumerate(run))
            sortedEntries.push_back({op, originalIndex, op.getSourceCoreId(), op.getTargetCoreId(), op.getChannelId()});
-          llvm::stable_sort(sortedEntries, [](const ReceiveEntry& lhs, const ReceiveEntry& rhs) {
-            return std::tuple(lhs.sourceCoreId, lhs.targetCoreId, lhs.channelId)
-                 < std::tuple(rhs.sourceCoreId, rhs.targetCoreId, rhs.channelId);
-          });

          SmallVector<int64_t> channelIds;
          SmallVector<int32_t> sourceCoreIds;
@@ -436,10 +441,6 @@ void compactScalarChannelRuns(func::FuncOp funcOp, int64_t& nextChannelId) {
          sortedEntries.reserve(run.size());
          for (auto op : run)
            sortedEntries.push_back({op, op.getSourceCoreId(), op.getTargetCoreId(), op.getChannelId()});
-          llvm::stable_sort(sortedEntries, [](const SendEntry& lhs, const SendEntry& rhs) {
-            return std::tuple(lhs.sourceCoreId, lhs.targetCoreId, lhs.channelId)
-                 < std::tuple(rhs.sourceCoreId, rhs.targetCoreId, rhs.channelId);
-          });

          SmallVector<int64_t> channelIds;
          SmallVector<int32_t> sourceCoreIds;
@@ -66,8 +66,10 @@ static Value buildSubviewChunk(const StaticSubviewInfo& info,
  return memref::SubViewOp::create(rewriter, loc, info.source, chunkOffsets, chunkSizes, chunkStrides);
 }

-static SmallVector<Value>
-delinearizeIndexValue(Value linearIndex, ArrayRef<int64_t> shape, ArrayRef<int64_t> strides, PatternRewriter& rewriter) {
+static SmallVector<Value> delinearizeIndexValue(Value linearIndex,
+                                                ArrayRef<int64_t> shape,
+                                                ArrayRef<int64_t> strides,
+                                                PatternRewriter& rewriter) {
  SmallVector<Value> indices;
  indices.reserve(shape.size());

@@ -112,7 +114,8 @@ static Value buildDynamicSubviewChunk(const StaticSubviewInfo& info,
      assert(info.strides[dim] == 1 && "loop-based subview rewrite requires unit strides");
      chunkOffsets.push_back(addDynamicOffset(info.offsets[dim], outerIndices[dim], rewriter));
      chunkSizes.push_back(rewriter.getIndexAttr(1));
-    } else {
+    }
+    else {
      chunkOffsets.push_back(info.offsets[dim]);
      chunkSizes.push_back(rewriter.getIndexAttr(info.sizes.back()));
    }
@@ -122,11 +125,8 @@ static Value buildDynamicSubviewChunk(const StaticSubviewInfo& info,
  return memref::SubViewOp::create(rewriter, loc, info.source, chunkOffsets, chunkSizes, chunkStrides);
 }

-static Value buildContiguousChunk(Value source,
-                                  ArrayRef<int64_t> copyShape,
-                                  ArrayRef<Value> outerIndices,
-                                  Location loc,
-                                  PatternRewriter& rewriter) {
+static Value buildContiguousChunk(
+  Value source, ArrayRef<int64_t> copyShape, ArrayRef<Value> outerIndices, Location loc, PatternRewriter& rewriter) {
  SmallVector<OpFoldResult> chunkOffsets;
  SmallVector<OpFoldResult> chunkSizes;
  SmallVector<OpFoldResult> chunkStrides;
@@ -203,7 +203,8 @@ static LogicalResult rewriteSubviewCopyLikeOp(CopyOp copyOp,
    rewriter.setInsertionPointToStart(loop.getBody());

    SmallVector<Value> outerIndices =
-      outerShape.empty() ? SmallVector<Value> {} : delinearizeIndexValue(loop.getInductionVar(), outerShape, outerStrides, rewriter);
+      outerShape.empty() ? SmallVector<Value> {}
+                         : delinearizeIndexValue(loop.getInductionVar(), outerShape, outerStrides, rewriter);
    Value chunkDst = splitDst ? buildDynamicSubviewChunk(*dstSubview, outerIndices, copyOp.getLoc(), rewriter)
                              : buildContiguousChunk(dst, copyShape, outerIndices, copyOp.getLoc(), rewriter);
    Value chunkSrc = splitSrc ? buildDynamicSubviewChunk(*srcSubview, outerIndices, copyOp.getLoc(), rewriter)
@@ -6,10 +6,10 @@

 #include "llvm/ADT/STLExtras.h"

+#include "src/Accelerators/PIM/Common/IR/SubviewUtils.hpp"
 #include "src/Accelerators/PIM/Common/PimCommon.hpp"
 #include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp"
 #include "src/Accelerators/PIM/Dialect/Spatial/SpatialOps.hpp"
-#include "src/Accelerators/PIM/Common/IR/SubviewUtils.hpp"

 using namespace mlir;

@@ -60,6 +60,7 @@ def main():
    ap.add_argument("--simulator-dir", default=None,
                    help="Path to pim-simulator crate root (default: auto-detected relative to script).")
    ap.add_argument("--threshold", type=float, default=1e-3, help="Max allowed diff per output element.")
+    ap.add_argument("--seed", type=int, default=0, help="RNG seed for generated validation inputs.")
    ap.add_argument("--crossbar-size", type=int, default=64)
    ap.add_argument("--crossbar-count", type=int, default=8)
    ap.add_argument("--core-count", type=int, default=None,
@@ -117,6 +118,7 @@ def main():
                onnx_path, a.raptor_path, a.onnx_include_dir, simulator_dir,
                crossbar_size=a.crossbar_size, crossbar_count=a.crossbar_count, core_count=a.core_count,
                threshold=a.threshold,
+                seed=a.seed,
                reporter=reporter,
                model_index=index,
                model_total=len(onnx_files),
@@ -268,7 +268,7 @@ def validate_outputs(sim_arrays, runner_out_dir, outputs_descriptor, threshold=1

 def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
                     simulator_dir, crossbar_size=64, crossbar_count=8, core_count=None, threshold=1e-3,
-                     reporter=None, model_index=1, model_total=1, verbose=False):
+                     seed=0, reporter=None, model_index=1, model_total=1, verbose=False):
    network_onnx_path = Path(network_onnx_path).resolve()
    raptor_path = Path(raptor_path).resolve()
    onnx_include_dir = Path(onnx_include_dir).resolve()
@@ -306,7 +306,7 @@ def validate_network(network_onnx_path, raptor_path, onnx_include_dir,

        print_stage(reporter, model_index, model_total, network_onnx_path.name, "Generate Inputs")
        inputs_descriptor, outputs_descriptor = onnx_io(network_onnx_path)
-        inputs_list, _inputs_dict = gen_random_inputs(inputs_descriptor)
+        inputs_list, _inputs_dict = gen_random_inputs(inputs_descriptor, seed=seed)
        flags, _files = save_inputs_to_files(network_onnx_path, inputs_list, out_dir=workspace_dir / "inputs")
        print_info(reporter, f"Saved {len(inputs_list)} input file(s) to {workspace_dir / 'inputs'}")
        reporter.advance()
Author	SHA1	Message	Date
NiccoloN	8d95c604a6	automatic code formatting Validate Operations / validate-operations (push) Has been cancelled Details	2026-05-13 21:51:19 +02:00
NiccoloN	55eda487dc	use seed in validate.py for deterministic tests	2026-05-13 21:49:36 +02:00
NiccoloN	061139aefb	fix wrong send/receive reordering in post dcp merge instructions compaction	2026-05-13 21:48:49 +02:00