faster pim VerificationPass.cpp and pim code emission

2026-05-25 15:24:12 +02:00
parent 4855a2e105
commit e8a08f6dd0
18 changed files with 1610 additions and 573 deletions
@@ -4,13 +4,16 @@

 #include "llvm-project/clang/include/clang/Basic/LLVM.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Hashing.h"
 #include "llvm/Support/JSON.h"
 #include "llvm/Support/raw_os_ostream.h"

 #include <fstream>
+#include <limits>
 #include <optional>

 #include "onnx-mlir/Compiler/OMCompilerTypes.h"
+#include "src/Accelerators/PIM/Common/IR/AddressAnalysis.hpp"
 #include "src/Accelerators/PIM/Common/PimCommon.hpp"
 #include "src/Accelerators/PIM/Common/Support/ReportUtils.hpp"
 #include "src/Accelerators/PIM/Compiler/PimBinaryFormat.hpp"
@@ -23,6 +26,13 @@ struct MemEntry {
  size_t size;
 };

+struct MemoryValueKey { // Map key pairing an mlir::Value with an optional lane; the MemEntry maps in this header are keyed by it.
+  mlir::Value value; // the value half of the key
+  std::optional<unsigned> lane; // disengaged when no lane is attached (the lane parameters in this header default to std::nullopt)
+
+  bool operator==(const MemoryValueKey& other) const { return value == other.value && lane == other.lane; } // equal only when both value and lane match; a disengaged lane never equals an engaged one
+};
+
 struct MemoryReportRow {
  uint64_t numAlloca = 0;
  uint64_t sizeAlloca = 0;
@@ -50,33 +60,33 @@ struct MemoryReportEntry {
 };

 class PimMemory { // Holds MemEntry records keyed by MemoryValueKey; PimAcceleratorMemory owns one as hostMem and returns others from getOrCreateDeviceMem.
-  llvm::SmallVector<std::pair<MemEntry, mlir::Value>, 32> memEntries;
-  llvm::SmallDenseMap<mlir::Value, MemEntry, 32>& globalMemEntriesMap;
-  llvm::SmallDenseMap<mlir::Value, MemEntry, 32> ownedMemEntriesMap;
+  llvm::SmallVector<std::pair<MemEntry, MemoryValueKey>, 32> memEntries; // (entry, key) pairs; presumably the entries gathered before allocation - confirm in the .cpp
+  llvm::SmallDenseMap<MemoryValueKey, MemEntry, 32>& globalMemEntriesMap; // reference to a map supplied by the constructor's caller; not owned here
+  llvm::SmallDenseMap<MemoryValueKey, MemEntry, 32> ownedMemEntriesMap; // map owned by this instance

  size_t minAlignment = 4;
  size_t firstAvailableAddress = 0;

-  MemEntry* gatherMemEntry(mlir::Value value);
+  MemEntry* gatherMemEntry(mlir::Value value, std::optional<unsigned> lane = std::nullopt); // lane defaults to std::nullopt, so one-argument calls keep compiling
  void allocateGatheredMemory();
-  void allocateMemoryForValue(mlir::Value value, MemEntry& memEntry);
+  void allocateMemoryForValue(const MemoryValueKey& key, MemEntry& memEntry); // now takes the full (value, lane) key instead of a bare mlir::Value

 public:
-  PimMemory(llvm::SmallDenseMap<mlir::Value, MemEntry, 32>& globalMemEntriesMap)
+  PimMemory(llvm::SmallDenseMap<MemoryValueKey, MemEntry, 32>& globalMemEntriesMap) // stores a reference only: the referenced map must outlive this object
  : globalMemEntriesMap(globalMemEntriesMap) {}

  void allocateHost(mlir::ModuleOp moduleOp, mlir::func::FuncOp funcOp);
-  void allocateCore(mlir::Operation* op);
+  void allocateCore(mlir::Operation* op, std::optional<unsigned> lane = std::nullopt); // lane defaults to std::nullopt, so one-argument calls keep compiling
  MemoryReportRow getReportRow() const;
  void remove(mlir::Value val); // NOTE(review): still takes a bare mlir::Value while lookups use MemoryValueKey - confirm how lanes are handled on removal

  size_t getFirstAvailableAddress() const { return firstAvailableAddress; }
-  MemEntry getMemEntry(mlir::Value value) const;
+  MemEntry getMemEntry(const MemoryValueKey& key) const; // returns the MemEntry by value; behaviour for a missing key is defined in the .cpp (not visible here)
 };

 class PimAcceleratorMemory {
 public:
-  llvm::SmallDenseMap<mlir::Value, MemEntry, 32> memEntriesMap;
+  llvm::SmallDenseMap<MemoryValueKey, MemEntry, 32> memEntriesMap;
  PimMemory hostMem;

 private:
@@ -84,14 +94,21 @@ private:
  std::fstream fileReport;
  std::optional<MemoryReportRow> hostReportRow;
  llvm::SmallVector<MemoryReportEntry, 32> reportEntries;
+  mutable llvm::DenseMap<mlir::Value, CompiledIndexExpr> compiledIndexExprs;
+  mutable llvm::DenseMap<mlir::Value, CompiledAddressExpr> compiledAddressExprs;

 public:
  PimAcceleratorMemory()
  : hostMem(memEntriesMap), fileReport(openReportFile("memory_report")) {}
+  PimAcceleratorMemory(const llvm::SmallDenseMap<MemoryValueKey, MemEntry, 32>& initialMemEntries, bool enableReport) // copies initialMemEntries into memEntriesMap; the report file is opened only when enableReport is true
+  : memEntriesMap(initialMemEntries), hostMem(memEntriesMap), fileReport(enableReport ? openReportFile("memory_report") : std::fstream()) {} // memEntriesMap is declared before hostMem, so hostMem binds to an already-constructed map; otherwise fileReport is a default-constructed std::fstream with no file open

  PimMemory& getOrCreateDeviceMem(size_t id);

-  size_t getValueAddress(mlir::Value value, const StaticValueKnowledge& knowledge = {}) const;
+  size_t getValueAddress(mlir::Value value,
+                         const StaticValueKnowledge& knowledge = {},
+                         std::optional<unsigned> lane = std::nullopt) const;
+  llvm::FailureOr<int64_t> getIndexValue(mlir::Value value, const StaticValueKnowledge& knowledge = {}) const;
  void reportHost();
  void recordCoreReport(size_t coreId, const MemoryReportRow& row);
  void recordBatchReport(uint64_t batchId,
@@ -103,15 +120,24 @@ public:
  void clean(mlir::Operation* op);
 };

+struct CoreEmissionJob { // Plain aggregate with defaulted fields. NOTE(review): the field meanings below are inferred from names only - confirm against the .cpp.
+  mlir::Operation* coreLikeOp = nullptr; // null until set; presumably the core or core-batch op to emit
+  size_t originalCoreId = 0; // presumably the core id as it appears in the IR
+  size_t emittedCoreId = 0; // presumably the id after remapping (PimCodeGen keeps an emittedCoreIds map)
+  llvm::SmallVector<unsigned, 4> lanes; // starts empty
+  std::optional<uint64_t> batchReportId; // disengaged by default
+};
+
 class PimCodeGen {
  PimAcceleratorMemory& memory;
  llvm::raw_fd_ostream& coreBinaryStream;
  llvm::raw_fd_ostream* coreJsonStream;
  const llvm::DenseMap<size_t, size_t>& emittedCoreIds;
+  std::optional<unsigned> batchLane;
  mutable uint32_t emittedInstructionCount = 0;

  size_t addressOf(mlir::Value value, const StaticValueKnowledge& knowledge) const {
-    return memory.getValueAddress(value, knowledge);
+    return memory.getValueAddress(value, knowledge, batchLane); // forwards the current batchLane (std::nullopt until setBatchLane is called) as the lane argument
  }
  size_t remapCoreId(size_t coreId) const;

@@ -141,6 +167,10 @@ public:
  : memory(memory), coreBinaryStream(coreBinary), coreJsonStream(coreJson), emittedCoreIds(emittedCoreIds) {}

  uint32_t getEmittedInstructionCount() const { return emittedInstructionCount; }
+  void setBatchLane(std::optional<unsigned> lane) { batchLane = lane; } // sets the lane that addressOf passes to getValueAddress; pass std::nullopt to clear it
+  llvm::FailureOr<int64_t> indexOf(mlir::Value value, const StaticValueKnowledge& knowledge) const { // thin wrapper over PimAcceleratorMemory::getIndexValue
+    return memory.getIndexValue(value, knowledge); // result is returned unchanged; the batch lane is not passed here
+  }

  void codeGenLoadOp(pim::PimMemCopyHostToDevOp loadOp, const StaticValueKnowledge& knowledge) const;
  void codeGenLoadBatchOp(pim::PimMemCopyHostToDevBatchOp loadOp, const StaticValueKnowledge& knowledge) const;
@@ -151,6 +181,14 @@ public:
  void codeGenReceiveTensorOp(pim::PimReceiveTensorOp receiveTensorOp, const StaticValueKnowledge& knowledge) const;
  void codeGenSendOp(pim::PimSendOp sendOp, const StaticValueKnowledge& knowledge) const;
  void codeGenSendTensorOp(pim::PimSendTensorOp sendTensorOp, const StaticValueKnowledge& knowledge) const;
+  void codeGenReceiveBatchOp(pim::PimReceiveBatchOp receiveOp, unsigned lane, const StaticValueKnowledge& knowledge) const;
+  void codeGenReceiveTensorBatchOp(pim::PimReceiveTensorBatchOp receiveOp,
+                                   llvm::ArrayRef<int32_t> laneCoreIds,
+                                   const StaticValueKnowledge& knowledge) const;
+  void codeGenSendBatchOp(pim::PimSendBatchOp sendOp, unsigned lane, const StaticValueKnowledge& knowledge) const;
+  void codeGenSendTensorBatchOp(pim::PimSendTensorBatchOp sendOp,
+                                llvm::ArrayRef<int32_t> laneCoreIds,
+                                const StaticValueKnowledge& knowledge) const;
  void codeGenConcatOp(pim::PimConcatOp concatOp, const StaticValueKnowledge& knowledge) const;

  template <typename MVMTy>
@@ -173,3 +211,24 @@ public:
 OnnxMlirCompilerErrorCodes compileToPimCode(mlir::ModuleOp& moduleOpRef, std::string& outputDirName);

 } // namespace onnx_mlir
+
+namespace llvm { // NOTE(review): this explicit specialization is declared after the classes in this header that already name SmallDenseMap<MemoryValueKey, ...>; C++ requires it to be declared before the first use that would implicitly instantiate DenseMapInfo<MemoryValueKey> - confirm nothing earlier does so.
+
+template <>
+struct DenseMapInfo<onnx_mlir::MemoryValueKey> { // Key traits that let MemoryValueKey be used as a DenseMap / SmallDenseMap key.
+  static onnx_mlir::MemoryValueKey getEmptyKey() {
+    return {DenseMapInfo<mlir::Value>::getEmptyKey(), 0}; // mlir::Value's empty-key sentinel, with lane fixed to 0
+  }
+
+  static onnx_mlir::MemoryValueKey getTombstoneKey() {
+    return {DenseMapInfo<mlir::Value>::getTombstoneKey(), 0}; // mlir::Value's tombstone sentinel, with lane fixed to 0
+  }
+
+  static unsigned getHashValue(const onnx_mlir::MemoryValueKey& key) {
+    return hash_combine(key.value, key.lane.value_or(std::numeric_limits<unsigned>::max())); // a disengaged lane hashes as the maximum unsigned value; isEqual still tells it apart from an engaged lane holding that number
+  }
+
+  static bool isEqual(const onnx_mlir::MemoryValueKey& lhs, const onnx_mlir::MemoryValueKey& rhs) { return lhs == rhs; } // delegates to MemoryValueKey::operator==
+};
+
+} // namespace llvm