huge refactor for heavier use of RewritePatterns and less ad-hoc C++ code

remove many Spatial ops in favor of tensor ops, like in pim
2026-05-12 10:35:44 +02:00
parent feaff820e1
commit 909c4acfdd
84 changed files with 4048 additions and 3310 deletions
@@ -15,7 +15,10 @@ add_pim_library(OMPimCompilerOptions

 add_pim_library(OMPimCompilerUtils
  PimCompilerUtils.cpp
+  PimArtifactWriter.cpp
+  PimBatchEmission.cpp
  PimCodeGen.cpp
+  PimWeightEmitter.cpp

  EXCLUDE_FROM_OM_LIBS

@@ -0,0 +1,123 @@
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+#include <vector>
+
+#include "src/Accelerators/PIM/Common/IR/WeightUtils.hpp"
+#include "src/Accelerators/PIM/Compiler/PimArtifactWriter.hpp"
+#include "src/Accelerators/PIM/Compiler/PimCodeGen.hpp"
+#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
+
+using namespace llvm;
+using namespace mlir;
+
+namespace onnx_mlir {
+
+OnnxMlirCompilerErrorCodes writeHostCoreJson(StringRef outputDirPath) {
+  // Emits `<outputDirPath>/core_0.json`, the program file of the host core.
+  const std::string hostCorePath = (outputDirPath + "/core_0.json").str();
+  std::error_code openError;
+  raw_fd_ostream hostCoreStream(hostCorePath, openError);
+  if (openError) {
+    errs() << "Error while opening host core file `" << hostCorePath << "`: " << openError.message() << '\n';
+    return InvalidOutputFileAccess;
+  }
+  // The content is a fixed placeholder: two no-op-like `sldi` instructions that only exist to satisfy pimsim-nn.
+  hostCoreStream << "[{\"imm\":0,\"op\":\"sldi\",\"rd\":0},{\"imm\":0,\"op\":\"sldi\",\"rd\":0}]";
+  hostCoreStream.close();
+  return CompilerSuccess;
+}
+
+OnnxMlirCompilerErrorCodes
+writeMemoryBinary(ModuleOp moduleOp, func::FuncOp funcOp, PimAcceleratorMemory& memory, StringRef outputDirPath) {
+  // Fills a zeroed host memory image with the dense initial data of funcOp's globals and writes memory.bin.
+  const std::string imagePath = (outputDirPath + "/memory.bin").str();
+  std::error_code openError;
+  raw_fd_ostream imageStream(imagePath, openError, sys::fs::OF_None);
+  if (openError) {
+    errs() << "Error while opening memory file " << imagePath << ": " << openError.message() << '\n';
+    return InvalidOutputFileAccess;
+  }
+
+  // Zero-filled buffer sized by the host memory's first available address.
+  std::vector<char> image(memory.hostMem.getFirstAvailableAddress(), 0);
+  SmallPtrSet<Operation*, 16> emittedGlobals;
+
+  auto copyGlobalIntoImage = [&](memref::GetGlobalOp getGlobalOp) {
+    // References flagged by hasWeightAlways are not written into this image.
+    if (hasWeightAlways(getGlobalOp))
+      return;
+    // Every resolvable global is handled once, at the first reference the walk reaches.
+    auto globalOp = lookupGlobalForGetGlobal(moduleOp, getGlobalOp);
+    if (!globalOp || !emittedGlobals.insert(globalOp.getOperation()).second)
+      return;
+    // Only globals with a dense-elements initial value carry bytes to copy.
+    auto initialValue = globalOp.getInitialValue();
+    auto denseAttr = initialValue ? dyn_cast<DenseElementsAttr>(*initialValue) : DenseElementsAttr();
+    if (!denseAttr)
+      return;
+    const MemEntry entry = memory.hostMem.getMemEntry(getGlobalOp.getResult());
+    const ArrayRef<char> bytes = denseAttr.getRawData();
+    char* const target = image.data() + entry.address;
+    if (!denseAttr.isSplat()) {
+      assert(bytes.size() == entry.size && "Data size mismatch");
+      std::memcpy(target, bytes.data(), bytes.size());
+      return;
+    }
+    // Splat: the raw data is treated as one element and repeated until the entry is full.
+    const size_t elementSize = bytes.size();
+    assert(elementSize * getGlobalOp.getType().getNumElements() == entry.size && "Data size mismatch");
+    for (size_t offset = 0; offset < entry.size; offset += elementSize)
+      std::memcpy(target + offset, bytes.data(), std::min(elementSize, entry.size - offset));
+  };
+  funcOp.walk(copyGlobalIntoImage);
+
+  imageStream.write(image.data(), image.size());
+  imageStream.close();
+  return CompilerSuccess;
+}
+
+OnnxMlirCompilerErrorCodes writeConfigJson(func::FuncOp funcOp,
+                                           PimAcceleratorMemory& memory,
+                                           size_t maxCoreId,
+                                           json::Object xbarsPerArrayGroup,
+                                           StringRef outputDirPath) {
+  // Writes `<outputDirPath>/config.json`: core count, crossbar setup and the addresses of the function I/O.
+  json::Object configJson;
+  configJson["core_cnt"] = maxCoreId + 1; // pimsim-nn indexes cores by numeric id, with the host as core 0
+  configJson["adc_count"] = 16;           // number of ADCs for MVM units
+  configJson["cell_precision"] = 2;       // bit precision of each ADC
+  configJson["xbar_array_count"] = crossbarCountInCore.getValue();
+  configJson["xbar_size"] = {crossbarSize.getValue(), crossbarSize.getValue()};
+  configJson["array_group_map"] = std::move(xbarsPerArrayGroup);
+  // Inputs are the function arguments; outputs are the operands of funcOp's func.return ops.
+  json::Array inputsAddresses;
+  for (BlockArgument input : funcOp.getArguments())
+    inputsAddresses.push_back(memory.getValueAddress(input));
+  configJson["inputs_addresses"] = std::move(inputsAddresses);
+
+  json::Array outputsAddresses;
+  for (func::ReturnOp returnOp : funcOp.getOps<func::ReturnOp>())
+    for (mlir::Value output : returnOp.getOperands())
+      outputsAddresses.push_back(memory.getValueAddress(output));
+  configJson["outputs_addresses"] = std::move(outputsAddresses);
+
+  auto configPath = (outputDirPath + "/config.json").str();
+  std::error_code errorCode;
+  raw_fd_ostream jsonOS(configPath, errorCode);
+  if (errorCode) {
+    // Name the file in the diagnostic, as the other artifact writers do.
+    errs() << "Error while opening config file `" << configPath << "`: " << errorCode.message() << '\n';
+    return InvalidOutputFileAccess;
+  }
+  jsonOS << json::Value(std::move(configJson)) << '\n';
+  jsonOS.close();
+  return CompilerSuccess;
+}
+
+} // namespace onnx_mlir
@@ -0,0 +1,26 @@
+#pragma once
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/BuiltinOps.h"
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/JSON.h"
+
+#include "onnx-mlir/Compiler/OMCompilerTypes.h"
+
+namespace onnx_mlir {
+
+class PimAcceleratorMemory;
+
+OnnxMlirCompilerErrorCodes writeHostCoreJson(llvm::StringRef outputDirPath);
+OnnxMlirCompilerErrorCodes writeMemoryBinary(mlir::ModuleOp moduleOp,
+                                             mlir::func::FuncOp funcOp,
+                                             PimAcceleratorMemory& memory,
+                                             llvm::StringRef outputDirPath);
+OnnxMlirCompilerErrorCodes writeConfigJson(mlir::func::FuncOp funcOp,
+                                           PimAcceleratorMemory& memory,
+                                           size_t maxCoreId,
+                                           llvm::json::Object xbarsPerArrayGroup,
+                                           llvm::StringRef outputDirPath);
+
+} // namespace onnx_mlir
@@ -0,0 +1,126 @@
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/IRMapping.h"
+
+#include "src/Accelerators/PIM/Common/PimCommon.hpp"
+#include "src/Accelerators/PIM/Compiler/PimBatchEmission.hpp"
+
+using namespace mlir;
+
+namespace onnx_mlir {
+namespace {
+
+static SmallVector<int32_t> getBatchCoreIds(pim::PimCoreBatchOp coreBatchOp) {
+  // The ids are read from the op's dense i32 array attribute and returned as an owned copy.
+  auto idsAttr = coreBatchOp->getAttrOfType<DenseI32ArrayAttr>(onnx_mlir::kCoreIdsAttrName);
+  assert(idsAttr && "pim.core_batch requires coreIds array attribute"); return SmallVector<int32_t>(idsAttr.asArrayRef());
+}
+
+static SmallVector<int32_t> getLaneChunkCoreIds(ArrayRef<int32_t> coreIds, size_t laneCount, unsigned lane) {
+  // Keeps entry `lane` out of every complete group of laneCount consecutive ids.
+  SmallVector<int32_t> selected(coreIds.size() / laneCount);
+  for (size_t chunk = 0; chunk != selected.size(); ++chunk)
+    selected[chunk] = coreIds[chunk * laneCount + lane];
+  return selected;
+}
+
+} // namespace
+/// Materializes one lane of `coreBatchOp` as a scalar pim.core inside a scratch module and hands it to `callback`.
+LogicalResult withScalarCoreFromBatchLane(pim::PimCoreBatchOp coreBatchOp,
+                                          unsigned lane,
+                                          llvm::function_ref<LogicalResult(pim::PimCoreOp)> callback) {
+  OwningOpRef<ModuleOp> scratchModule = ModuleOp::create(coreBatchOp.getLoc());
+  OpBuilder builder(scratchModule->getContext());
+  builder.setInsertionPointToStart(scratchModule->getBody());
+  // Lane `lane` takes the contiguous slice of weightsPerLane entries starting at lane * weightsPerLane.
+  size_t laneCount = static_cast<size_t>(coreBatchOp.getLaneCount());
+  size_t weightsPerLane = coreBatchOp.getWeights().size() / laneCount;
+  SmallVector<Value> laneWeights;
+  laneWeights.reserve(weightsPerLane);
+  for (size_t weightIndex = 0; weightIndex < weightsPerLane; ++weightIndex)
+    laneWeights.push_back(coreBatchOp.getWeights()[lane * weightsPerLane + weightIndex]);
+  // Create the scalar core from the lane's weights and core id; a lone body argument maps to the lane's input.
+  auto coreIds = getBatchCoreIds(coreBatchOp);
+  auto scalarCore = pim::PimCoreOp::create(
+    builder, coreBatchOp.getLoc(), ValueRange(laneWeights), builder.getI32IntegerAttr(coreIds[lane]));
+  Block* block = builder.createBlock(&scalarCore.getBody(), scalarCore.getBody().end());
+  IRMapping mapper;
+  if (coreBatchOp.getBody().front().getNumArguments() == 1)
+    mapper.map(coreBatchOp.getBody().front().getArgument(0), coreBatchOp.getInputs()[lane]);
+  // Copy the batch body op by op, swapping each batch-only op for its scalar form with per-lane attributes.
+  builder.setInsertionPointToEnd(block);
+  for (Operation& op : coreBatchOp.getBody().front()) {
+    if (isa<pim::PimHaltOp>(op)) {
+      pim::PimHaltOp::create(builder, op.getLoc());
+      continue;
+    }
+    // PimSendBatchOp -> PimSendOp aimed at targetCoreIds[lane].
+    if (auto sendBatchOp = dyn_cast<pim::PimSendBatchOp>(op)) {
+      pim::PimSendOp::create(builder,
+                             sendBatchOp.getLoc(),
+                             mapper.lookup(sendBatchOp.getInput()),
+                             sendBatchOp.getSizeAttr(),
+                             builder.getI32IntegerAttr(sendBatchOp.getTargetCoreIds()[lane]));
+      continue;
+    }
+    // PimSendTensorBatchOp -> PimSendTensorOp with the lane's target id picked from each chunk of laneCount ids.
+    if (auto sendTensorBatchOp = dyn_cast<pim::PimSendTensorBatchOp>(op)) {
+      pim::PimSendTensorOp::create(
+        builder,
+        sendTensorBatchOp.getLoc(),
+        mapper.lookup(sendTensorBatchOp.getInput()),
+        builder.getDenseI32ArrayAttr(getLaneChunkCoreIds(sendTensorBatchOp.getTargetCoreIds(), laneCount, lane)));
+      continue;
+    }
+    // PimReceiveBatchOp -> PimReceiveOp from sourceCoreIds[lane]; later uses of the batch result are remapped.
+    if (auto receiveBatchOp = dyn_cast<pim::PimReceiveBatchOp>(op)) {
+      auto scalarReceive =
+        pim::PimReceiveOp::create(builder,
+                                  receiveBatchOp.getLoc(),
+                                  receiveBatchOp.getOutput().getType(),
+                                  mapper.lookup(receiveBatchOp.getOutputBuffer()),
+                                  receiveBatchOp.getSizeAttr(),
+                                  builder.getI32IntegerAttr(receiveBatchOp.getSourceCoreIds()[lane]));
+      mapper.map(receiveBatchOp.getOutput(), scalarReceive.getOutput());
+      continue;
+    }
+    // PimReceiveTensorBatchOp -> PimReceiveTensorOp with the lane's source id picked from each chunk.
+    if (auto receiveTensorBatchOp = dyn_cast<pim::PimReceiveTensorBatchOp>(op)) {
+      auto scalarReceive = pim::PimReceiveTensorOp::create(
+        builder,
+        receiveTensorBatchOp.getLoc(),
+        receiveTensorBatchOp.getOutput().getType(),
+        mapper.lookup(receiveTensorBatchOp.getOutputBuffer()),
+        builder.getDenseI32ArrayAttr(getLaneChunkCoreIds(receiveTensorBatchOp.getSourceCoreIds(), laneCount, lane)));
+      mapper.map(receiveTensorBatchOp.getOutput(), scalarReceive.getOutput());
+      continue;
+    }
+    // PimMemCopyHostToDevBatchOp -> PimMemCopyHostToDevOp, same offsets/size; host source kept unless remapped.
+    if (auto memcpBatchOp = dyn_cast<pim::PimMemCopyHostToDevBatchOp>(op)) {
+      Value hostSource = mapper.lookupOrNull(memcpBatchOp.getHostSource());
+      if (!hostSource)
+        hostSource = memcpBatchOp.getHostSource();
+
+      auto scalarCopy = pim::PimMemCopyHostToDevOp::create(builder,
+                                                           memcpBatchOp.getLoc(),
+                                                           memcpBatchOp.getOutput().getType(),
+                                                           mapper.lookup(memcpBatchOp.getDeviceTarget()),
+                                                           hostSource,
+                                                           memcpBatchOp.getDeviceTargetOffsetAttr(),
+                                                           memcpBatchOp.getHostSourceOffsetAttr(),
+                                                           memcpBatchOp.getSizeAttr());
+      mapper.map(memcpBatchOp.getOutput(), scalarCopy.getOutput());
+      continue;
+    }
+    // Any other op is cloned through the mapper; its results are recorded so later ops use the clones.
+    Operation* cloned = builder.clone(op, mapper);
+    for (auto [originalResult, clonedResult] : llvm::zip(op.getResults(), cloned->getResults()))
+      mapper.map(originalResult, clonedResult);
+  }
+  // Make sure the scalar body ends with a halt, then run the callback on the finished core.
+  if (block->empty() || !isa<pim::PimHaltOp>(block->back()))
+    pim::PimHaltOp::create(builder, coreBatchOp.getLoc());
+  return callback(scalarCore);
+}
+
+} // namespace onnx_mlir
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "llvm/ADT/STLFunctionalExtras.h"
+
+#include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp"
+
+namespace onnx_mlir {
+
+mlir::LogicalResult withScalarCoreFromBatchLane(pim::PimCoreBatchOp coreBatchOp,
+                                                unsigned lane,
+                                                llvm::function_ref<mlir::LogicalResult(pim::PimCoreOp)> callback);
+
+} // namespace onnx_mlir
@@ -5,12 +5,10 @@
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinTypes.h"
-#include "mlir/IR/IRMapping.h"
 #include "mlir/IR/Value.h"
 #include "mlir/IR/Verifier.h"

 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FileSystem.h"
@@ -21,7 +19,6 @@
 #include <absl/types/compare.h>
 #include <algorithm>
 #include <cassert>
-#include <cmath>
 #include <cstdint>
 #include <fstream>
 #include <string>
@@ -29,8 +26,11 @@

 #include "Common/PimCommon.hpp"
 #include "Conversion/ONNXToSpatial/Common/Common.hpp"
+#include "src/Accelerators/PIM/Compiler/PimArtifactWriter.hpp"
+#include "src/Accelerators/PIM/Compiler/PimBatchEmission.hpp"
 #include "src/Accelerators/PIM/Compiler/PimCodeGen.hpp"
 #include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
+#include "src/Accelerators/PIM/Compiler/PimWeightEmitter.hpp"
 #include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp"

 using namespace llvm;
@@ -42,79 +42,6 @@ static size_t getValueSizeInBytes(mlir::Value value) {
  return type.getNumElements() * type.getElementTypeBitWidth() / 8;
 }

-struct DenseWeightView {
-  DenseElementsAttr denseAttr;
-  SmallVector<int64_t> shape;
-  SmallVector<int64_t> strides;
-  int64_t offset = 0;
-};
-
-static SmallVector<int64_t> computeRowMajorStridesForShape(ArrayRef<int64_t> shape) {
-  SmallVector<int64_t> strides(shape.size(), 1);
-  for (int64_t index = static_cast<int64_t>(shape.size()) - 2; index >= 0; --index)
-    strides[index] = strides[index + 1] * shape[index + 1];
-  return strides;
-}
-
-static bool allStaticSubviewParts(memref::SubViewOp subview) {
-  return llvm::all_of(subview.getStaticOffsets(), [](int64_t value) { return !ShapedType::isDynamic(value); })
-      && llvm::all_of(subview.getStaticSizes(), [](int64_t value) { return !ShapedType::isDynamic(value); })
-      && llvm::all_of(subview.getStaticStrides(), [](int64_t value) { return !ShapedType::isDynamic(value); });
-}
-
-static FailureOr<DenseWeightView> resolveDenseWeightView(ModuleOp moduleOp, mlir::Value weight) {
-  SmallVector<memref::SubViewOp> subviews;
-  mlir::Value current = weight;
-  memref::GetGlobalOp getGlobalOp;
-
-  while (true) {
-    Operation* defOp = current.getDefiningOp();
-    if (!defOp)
-      return failure();
-    if ((getGlobalOp = dyn_cast<memref::GetGlobalOp>(defOp)))
-      break;
-    if (auto subview = dyn_cast<memref::SubViewOp>(defOp)) {
-      if (!allStaticSubviewParts(subview))
-        return failure();
-      subviews.push_back(subview);
-      current = subview.getSource();
-      continue;
-    }
-    if (auto cast = dyn_cast<memref::CastOp>(defOp)) {
-      current = cast.getSource();
-      continue;
-    }
-    return failure();
-  }
-
-  auto globalOp = lookupGlobalForGetGlobal(moduleOp, getGlobalOp);
-  if (!globalOp || !globalOp.getInitialValue())
-    return failure();
-
-  auto denseAttr = dyn_cast<DenseElementsAttr>(*globalOp.getInitialValue());
-  if (!denseAttr)
-    return failure();
-
-  DenseWeightView view;
-  view.denseAttr = denseAttr;
-  view.shape.assign(denseAttr.getType().getShape().begin(), denseAttr.getType().getShape().end());
-  view.strides = computeRowMajorStridesForShape(view.shape);
-
-  for (memref::SubViewOp subview : llvm::reverse(subviews)) {
-    SmallVector<int64_t> nextStrides;
-    nextStrides.reserve(subview.getStaticStrides().size());
-    for (auto [offset, stride, sourceStride] :
-         llvm::zip_equal(subview.getStaticOffsets(), subview.getStaticStrides(), view.strides)) {
-      view.offset += offset * sourceStride;
-      nextStrides.push_back(stride * sourceStride);
-    }
-    view.shape.assign(subview.getStaticSizes().begin(), subview.getStaticSizes().end());
-    view.strides = std::move(nextStrides);
-  }
-
-  return view;
-}
-
 MemEntry* PimMemory::gatherMemEntry(mlir::Value value) {
  auto type = cast<ShapedType>(value.getType());
  assert("Only static shape is supported" && type.hasStaticShape());
@@ -745,80 +672,6 @@ static SmallVector<Operation*> collectTopLevelCoreLikeOps(func::FuncOp funcOp) {
  return coreLikeOps;
 }

-static pim::PimCoreOp materializeScalarCoreFromBatchLane(pim::PimCoreBatchOp coreBatchOp, unsigned lane) {
-  OpBuilder builder(coreBatchOp);
-  builder.setInsertionPointAfter(coreBatchOp);
-
-  size_t laneCount = static_cast<size_t>(coreBatchOp.getLaneCount());
-  size_t weightsPerLane = coreBatchOp.getWeights().size() / laneCount;
-  SmallVector<mlir::Value> laneWeights;
-  laneWeights.reserve(weightsPerLane);
-  for (size_t weightIndex = 0; weightIndex < weightsPerLane; ++weightIndex)
-    laneWeights.push_back(coreBatchOp.getWeights()[lane * weightsPerLane + weightIndex]);
-
-  auto coreIds = getBatchCoreIds(coreBatchOp);
-  auto scalarCore = pim::PimCoreOp::create(
-    builder, coreBatchOp.getLoc(), ValueRange(laneWeights), builder.getI32IntegerAttr(coreIds[lane]));
-  Block* block = builder.createBlock(&scalarCore.getBody(), scalarCore.getBody().end());
-  IRMapping mapper;
-  if (coreBatchOp.getBody().front().getNumArguments() == 1)
-    mapper.map(coreBatchOp.getBody().front().getArgument(0), coreBatchOp.getInputs()[lane]);
-
-  builder.setInsertionPointToEnd(block);
-  for (Operation& op : coreBatchOp.getBody().front()) {
-    if (isa<pim::PimHaltOp>(op)) {
-      pim::PimHaltOp::create(builder, op.getLoc());
-      continue;
-    }
-
-    if (auto sendBatchOp = dyn_cast<pim::PimSendBatchOp>(op)) {
-      pim::PimSendOp::create(builder,
-                             sendBatchOp.getLoc(),
-                             mapper.lookup(sendBatchOp.getInput()),
-                             sendBatchOp.getSizeAttr(),
-                             builder.getI32IntegerAttr(sendBatchOp.getTargetCoreIds()[lane]));
-      continue;
-    }
-
-    if (auto receiveBatchOp = dyn_cast<pim::PimReceiveBatchOp>(op)) {
-      auto scalarReceive =
-        pim::PimReceiveOp::create(builder,
-                                  receiveBatchOp.getLoc(),
-                                  receiveBatchOp.getOutput().getType(),
-                                  mapper.lookup(receiveBatchOp.getOutputBuffer()),
-                                  receiveBatchOp.getSizeAttr(),
-                                  builder.getI32IntegerAttr(receiveBatchOp.getSourceCoreIds()[lane]));
-      mapper.map(receiveBatchOp.getOutput(), scalarReceive.getOutput());
-      continue;
-    }
-
-    if (auto memcpBatchOp = dyn_cast<pim::PimMemCopyHostToDevBatchOp>(op)) {
-      mlir::Value hostSource = mapper.lookupOrNull(memcpBatchOp.getHostSource());
-      if (!hostSource)
-        hostSource = memcpBatchOp.getHostSource();
-
-      auto scalarCopy = pim::PimMemCopyHostToDevOp::create(builder,
-                                                           memcpBatchOp.getLoc(),
-                                                           memcpBatchOp.getOutput().getType(),
-                                                           mapper.lookup(memcpBatchOp.getDeviceTarget()),
-                                                           hostSource,
-                                                           memcpBatchOp.getDeviceTargetOffsetAttr(),
-                                                           memcpBatchOp.getHostSourceOffsetAttr(),
-                                                           memcpBatchOp.getSizeAttr());
-      mapper.map(memcpBatchOp.getOutput(), scalarCopy.getOutput());
-      continue;
-    }
-
-    Operation* cloned = builder.clone(op, mapper);
-    for (auto [originalResult, clonedResult] : llvm::zip(op.getResults(), cloned->getResults()))
-      mapper.map(originalResult, clonedResult);
-  }
-
-  if (block->empty() || !isa<pim::PimHaltOp>(block->back()))
-    pim::PimHaltOp::create(builder, coreBatchOp.getLoc());
-  return scalarCore;
-}
-
 static void aliasMaterializedHostGlobals(ModuleOp moduleOp,
                                         func::FuncOp funcOp,
                                         pim::PimCoreOp coreOp,
@@ -844,56 +697,6 @@ static void aliasMaterializedHostGlobals(ModuleOp moduleOp,
  });
 }

-/// Write global constant data into a binary memory image at their allocated addresses.
-static OnnxMlirCompilerErrorCodes
-writeMemoryBinary(ModuleOp moduleOp, func::FuncOp funcOp, PimAcceleratorMemory& memory, StringRef outputDirPath) {
-  auto memoryFilePath = (outputDirPath + "/memory.bin").str();
-  std::error_code errorCode;
-  raw_fd_ostream memoryFileStream(memoryFilePath, errorCode, sys::fs::OF_None);
-  if (errorCode) {
-    errs() << "Error while opening memory file " << memoryFilePath << ": " << errorCode.message() << '\n';
-    return InvalidOutputFileAccess;
-  }
-
-  std::vector<char> memoryBuffer(memory.hostMem.getFirstAvailableAddress(), 0);
-
-  SmallPtrSet<Operation*, 16> writtenGlobals;
-  funcOp.walk([&](memref::GetGlobalOp getGlobalOp) {
-    if (hasWeightAlways(getGlobalOp))
-      return;
-    auto globalOp = lookupGlobalForGetGlobal(moduleOp, getGlobalOp);
-    if (!globalOp)
-      return;
-    if (!writtenGlobals.insert(globalOp.getOperation()).second)
-      return;
-    auto initialValue = globalOp.getInitialValue();
-    if (!initialValue)
-      return;
-    auto denseAttr = dyn_cast<DenseElementsAttr>(*initialValue);
-    if (!denseAttr)
-      return;
-
-    MemEntry memEntry = memory.hostMem.getMemEntry(getGlobalOp.getResult());
-    ArrayRef<char> rawData = denseAttr.getRawData();
-    char* dst = memoryBuffer.data() + memEntry.address;
-
-    if (denseAttr.isSplat()) {
-      size_t elementSize = rawData.size();
-      assert(elementSize * getGlobalOp.getType().getNumElements() == memEntry.size && "Data size mismatch");
-      for (size_t offset = 0; offset < memEntry.size; offset += elementSize)
-        std::memcpy(dst + offset, rawData.data(), std::min(elementSize, memEntry.size - offset));
-    }
-    else {
-      assert(rawData.size() == memEntry.size && "Data size mismatch");
-      std::memcpy(dst, rawData.data(), rawData.size());
-    }
-  });
-
-  memoryFileStream.write(memoryBuffer.data(), memoryBuffer.size());
-  memoryFileStream.close();
-  return CompilerSuccess;
-}
-
 /// Dispatch all operations in a core region to the appropriate code generator.
 /// scf.for loops are statically unrolled via walkPimCoreBlock so that addressing is
 /// fully resolved before the JSON instructions are emitted.
@@ -948,7 +751,6 @@ static int64_t codeGenCoreOps(Block& block, PimCodeGen& coreCodeGen) {
        coreCodeGen.codeGetGlobalOp(getGlobalOp, knowledge);
      else {
        op.emitError("Unsupported codegen for this operation");
-        op.dump();
        return failure();
      }
      processedOperations++;
@@ -957,154 +759,6 @@ static int64_t codeGenCoreOps(Block& block, PimCodeGen& coreCodeGen) {
  return failed(result) ? -1 : static_cast<int64_t>(processedOperations);
 }

-llvm::DenseMap<size_t, llvm::DenseMap<mlir::Value, std::string>>
-createAndPopulateWeightFolder(func::FuncOp funcOp, StringRef outputDirPath) {
-  ModuleOp moduleOp = funcOp->getParentOfType<ModuleOp>();
-  auto coreWeightsDirPath = outputDirPath + "/weights";
-  auto error = sys::fs::create_directory(coreWeightsDirPath);
-  assert(!error && "Error creating weights directory");
-  size_t indexFileName = 0;
-
-  int64_t xbarSize = crossbarSize.getValue();
-  llvm::DenseMap<size_t, llvm::DenseMap<mlir::Value, std::string>> mapCoreWeightToFileName;
-  llvm::DenseMap<memref::GlobalOp, std::string> mapGlobalOpToFileName;
-
-  SmallVector<Operation*> coreLikeOps = collectTopLevelCoreLikeOps(funcOp);
-
-  for (Operation* op : coreLikeOps) {
-    SmallVector<pim::PimCoreOp> scalarCores;
-    if (auto coreOp = dyn_cast<pim::PimCoreOp>(op)) {
-      scalarCores.push_back(coreOp);
-    }
-    else {
-      auto coreBatchOp = cast<pim::PimCoreBatchOp>(op);
-      for (unsigned lane = 0; lane < static_cast<unsigned>(coreBatchOp.getLaneCount()); ++lane)
-        scalarCores.push_back(materializeScalarCoreFromBatchLane(coreBatchOp, lane));
-    }
-
-    for (pim::PimCoreOp coreOp : scalarCores) {
-      size_t coreId = static_cast<size_t>(coreOp.getCoreId());
-      for (unsigned index : getUsedWeightIndices(coreOp)) {
-        if (index >= coreOp.getWeights().size()) {
-          coreOp.emitWarning("Weight index " + std::to_string(index) + " is out of range");
-          assert(index < coreOp.getWeights().size() && "Weight index is out of range");
-        }
-        mlir::Value weight = coreOp.getWeights()[index];
-
-        auto weightView = resolveDenseWeightView(moduleOp, weight);
-        if (failed(weightView)) {
-          coreOp.emitWarning("Weight is not from a memref.get_global at index " + std::to_string(index));
-          assert(succeeded(weightView) && "Weight is not from a dense memref.global view");
-        }
-
-        if (mapCoreWeightToFileName[coreId].contains(weight))
-          continue;
-
-        auto getGlobalOp = weight.getDefiningOp<memref::GetGlobalOp>();
-        auto globalOp = getGlobalOp ? lookupGlobalForGetGlobal(moduleOp, getGlobalOp) : memref::GlobalOp {};
-        if (globalOp && mapGlobalOpToFileName.contains(globalOp)) {
-          auto& fileName = mapGlobalOpToFileName[globalOp];
-          mapCoreWeightToFileName[coreId].insert({weight, fileName});
-          continue;
-        }
-
-        DenseElementsAttr denseAttr = weightView->denseAttr;
-        ArrayRef<int64_t> shape = weightView->shape;
-        assert(isMatrixShape(shape) && "Weight matrix must be 2-dimensional");
-        int64_t numRows = shape[0];
-        int64_t numCols = shape[1];
-        assert(numRows <= xbarSize && numCols <= xbarSize && "Weight dimensions must not exceed crossbar size");
-
-        size_t elementByteWidth = denseAttr.getElementType().getIntOrFloatBitWidth() / 8;
-
-        std::string newFileName = "crossbar_" + std::to_string(indexFileName++) + ".bin";
-        auto weightFilePath = (coreWeightsDirPath + "/" + newFileName).str();
-        std::error_code errorCode;
-        raw_fd_ostream weightFileStream(weightFilePath, errorCode, sys::fs::OF_None);
-        if (errorCode) {
-          errs() << "Error while opening weight file `" << weightFilePath << "`: " << errorCode.message() << '\n';
-          assert(errorCode);
-        }
-
-        uint64_t zero = 0;
-        for (int64_t row = 0; row < xbarSize; row++) {
-          for (int64_t col = 0; col < xbarSize; col++) {
-            if (row < numRows && col < numCols) {
-              int64_t elementIndex = weightView->offset + row * weightView->strides[0] + col * weightView->strides[1];
-              APInt bits = denseAttr.getValues<APFloat>()[elementIndex].bitcastToAPInt();
-              uint64_t word = bits.getZExtValue();
-              weightFileStream.write(reinterpret_cast<const char*>(&word), elementByteWidth);
-            }
-            else {
-              weightFileStream.write(reinterpret_cast<const char*>(&zero), elementByteWidth);
-            }
-          }
-        }
-
-        weightFileStream.close();
-        if (globalOp)
-          mapGlobalOpToFileName.insert({globalOp, newFileName});
-        mapCoreWeightToFileName[coreId].insert({weight, newFileName});
-      }
-    }
-
-    for (pim::PimCoreOp coreOp : scalarCores)
-      if (coreOp.getOperation() != op)
-        coreOp.erase();
-  }
-  return mapCoreWeightToFileName;
-}
-
-/// Write the top-level PIM configuration JSON (core count, crossbar config, I/O addresses).
-static OnnxMlirCompilerErrorCodes writeConfigJson(func::FuncOp funcOp,
-                                                  PimAcceleratorMemory& memory,
-                                                  size_t maxCoreId,
-                                                  json::Object xbarsPerArrayGroup,
-                                                  StringRef outputDirPath) {
-  json::Object configJson;
-
-  // pimsim-nn indexes cores directly by their numeric core ID, with the host
-  // occupying core 0.
-  configJson["core_cnt"] = maxCoreId + 1;
-
-  // TODO: Should this be based on the floating point type used in the model?
-  // The 2 following values determine the bitwidth of the vectors' elements: bitwidth = adc_count * cell_precision
-
-  // Number of ADC for MVM units
-  configJson["adc_count"] = 16;
-  // The bit precision of each ADC
-  configJson["cell_precision"] = 2;
-
-  // Crossbar configuration
-  configJson["xbar_array_count"] = crossbarCountInCore.getValue();
-  configJson["xbar_size"] = {crossbarSize.getValue(), crossbarSize.getValue()};
-  configJson["array_group_map"] = std::move(xbarsPerArrayGroup);
-
-  // Memory layout of inputs and outputs
-  json::Array inputsAddresses;
-  for (BlockArgument input : funcOp.getArguments())
-    inputsAddresses.push_back(memory.getValueAddress(input));
-  configJson["inputs_addresses"] = std::move(inputsAddresses);
-
-  json::Array outputsAddresses;
-  for (func::ReturnOp returnOp : funcOp.getOps<func::ReturnOp>())
-    for (mlir::Value output : returnOp.getOperands())
-      outputsAddresses.push_back(memory.getValueAddress(output));
-  configJson["outputs_addresses"] = std::move(outputsAddresses);
-
-  auto configPath = (outputDirPath + "/config.json").str();
-  std::error_code errorCode;
-  raw_fd_ostream jsonOS(configPath, errorCode);
-  if (errorCode) {
-    errs() << "Error while opening config file: " << errorCode.message() << '\n';
-    return InvalidOutputFileAccess;
-  }
-  jsonOS << json::Value(std::move(configJson)) << '\n';
-  jsonOS.close();
-
-  return CompilerSuccess;
-}
-
 OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std::string& outputDirPath) {
  if (!outputDirPath.empty()) {
    if (auto error = sys::fs::create_directory(outputDirPath)) {
@@ -1125,17 +779,8 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std::
  if (auto err = writeMemoryBinary(moduleOp, funcOp, memory, outputDirPath))
    return err;

-  // Write empty host core file
-  std::error_code errorCode;
-  auto outputHostCorePath = outputDirPath + "/core_0.json";
-  raw_fd_ostream hostFileStream(outputHostCorePath, errorCode);
-  if (errorCode) {
-    errs() << "Error while opening host core file `" << outputHostCorePath << "`: " << errorCode.message() << '\n';
-    return InvalidOutputFileAccess;
-  }
-  // The host core json contains 2 random instructions, just to make pimsim-nn happy
-  hostFileStream << "[{\"imm\":0,\"op\":\"sldi\",\"rd\":0},{\"imm\":0,\"op\":\"sldi\",\"rd\":0}]";
-  hostFileStream.close();
+  if (auto err = writeHostCoreJson(outputDirPath))
+    return err;

  // For each core, specify the number of crossbar per array group.
  // This implementation always assigns one crossbar per group.
@@ -1167,17 +812,7 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std::
  }

  for (Operation* op : coreLikeOps) {
-    SmallVector<pim::PimCoreOp> scalarCores;
-    if (auto coreOp = dyn_cast<pim::PimCoreOp>(op)) {
-      scalarCores.push_back(coreOp);
-    }
-    else {
-      auto coreBatchOp = cast<pim::PimCoreBatchOp>(op);
-      for (unsigned lane = 0; lane < static_cast<unsigned>(coreBatchOp.getLaneCount()); ++lane)
-        scalarCores.push_back(materializeScalarCoreFromBatchLane(coreBatchOp, lane));
-    }
-
-    for (pim::PimCoreOp coreOp : scalarCores) {
+    auto emitCore = [&](pim::PimCoreOp coreOp, bool temporaryCore) -> OnnxMlirCompilerErrorCodes {
      size_t originalCoreId = static_cast<size_t>(coreOp.getCoreId());
      size_t coreId = emittedCoreIds.lookup(originalCoreId);
      maxCoreId = std::max(maxCoreId, coreId);
@@ -1232,13 +867,26 @@ OnnxMlirCompilerErrorCodes onnx_mlir::compileToPimJson(ModuleOp& moduleOp, std::
      }

      xbarsPerArrayGroup["core" + std::to_string(coreId)] = std::move(xbarsPerGroup);
+      if (temporaryCore)
+        coreOp.walk([&memory](Operation* op) { memory.clean(op); });
+      return CompilerSuccess;
+    };
+
+    if (auto coreOp = dyn_cast<pim::PimCoreOp>(op)) {
+      if (auto err = emitCore(coreOp, false))
+        return err;
+      continue;
    }

-    for (pim::PimCoreOp coreOp : scalarCores)
-      if (coreOp.getOperation() != op) {
-        coreOp.walk([&memory](Operation* op) { memory.clean(op); });
-        coreOp.erase();
-      }
+    auto coreBatchOp = cast<pim::PimCoreBatchOp>(op);
+    for (unsigned lane = 0; lane < static_cast<unsigned>(coreBatchOp.getLaneCount()); ++lane) {
+      OnnxMlirCompilerErrorCodes laneResult = CompilerSuccess;
+      if (failed(withScalarCoreFromBatchLane(coreBatchOp, lane, [&](pim::PimCoreOp coreOp) {
+            laneResult = emitCore(coreOp, true);
+            return laneResult == CompilerSuccess ? success() : failure();
+          })))
+        return laneResult == CompilerSuccess ? CompilerFailure : laneResult;
+    }
  }

  return writeConfigJson(funcOp, memory, maxCoreId, std::move(xbarsPerArrayGroup), outputDirPath);
@@ -0,0 +1,221 @@
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <cassert>
+
+#include "Conversion/ONNXToSpatial/Common/Common.hpp"
+#include "src/Accelerators/PIM/Common/IR/WeightUtils.hpp"
+#include "src/Accelerators/PIM/Compiler/PimBatchEmission.hpp"
+#include "src/Accelerators/PIM/Compiler/PimCodeGen.hpp"
+#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
+#include "src/Accelerators/PIM/Compiler/PimWeightEmitter.hpp"
+#include "src/Accelerators/PIM/Dialect/Pim/PimOps.hpp"
+
+using namespace llvm;
+using namespace mlir;
+
+namespace onnx_mlir {
+namespace {
+
+/// Strided window into the dense initializer of a memref.global, as produced
+/// by resolveDenseWeightView.
+struct DenseWeightView {
+  /// Dense initial value of the global the view ultimately reads from.
+  DenseElementsAttr denseAttr;
+  /// Extent of the view along each dimension.
+  SmallVector<int64_t> shape;
+  /// Distance, in elements of the initializer, between consecutive entries along each dimension.
+  SmallVector<int64_t> strides;
+  /// Element index inside the initializer at which the view starts.
+  int64_t offset = 0;
+};
+
+/// Returns the row-major (innermost dimension contiguous) element strides for
+/// `shape`. The last stride is 1; an empty shape yields an empty result.
+SmallVector<int64_t> computeRowMajorStridesForShape(ArrayRef<int64_t> shape) {
+  const size_t rank = shape.size();
+  SmallVector<int64_t> strides(rank, 1);
+  // Walk from the innermost dimension outwards, accumulating the extents seen so far.
+  for (size_t dim = rank; dim-- > 1;)
+    strides[dim - 1] = strides[dim] * shape[dim];
+  return strides;
+}
+
+/// Returns true when every static offset, size and stride recorded on
+/// `subview` is a concrete value, i.e. none of them is the dynamic marker.
+bool allStaticSubviewParts(memref::SubViewOp subview) {
+  auto isDynamicEntry = [](int64_t entry) { return ShapedType::isDynamic(entry); };
+  if (llvm::any_of(subview.getStaticOffsets(), isDynamicEntry))
+    return false;
+  if (llvm::any_of(subview.getStaticSizes(), isDynamicEntry))
+    return false;
+  return llvm::none_of(subview.getStaticStrides(), isDynamicEntry);
+}
+
+/// Follows the defining ops of `weight` through memref.subview and
+/// memref.cast until a memref.get_global is reached, then folds the collected
+/// subviews into one DenseWeightView over the global's dense initializer.
+///
+/// Fails when a value in the chain has no defining op, when the chain contains
+/// any other op, when a subview has a dynamic offset/size/stride, when the
+/// global cannot be looked up or has no initial value, or when that initial
+/// value is not a DenseElementsAttr.
+FailureOr<DenseWeightView> resolveDenseWeightView(ModuleOp moduleOp, mlir::Value weight) {
+  SmallVector<memref::SubViewOp> subviewChain;
+  memref::GetGlobalOp rootGetGlobal;
+
+  // Climb the def chain; the loop ends as soon as the get_global root is found.
+  for (mlir::Value cursor = weight; !rootGetGlobal;) {
+    Operation* producer = cursor.getDefiningOp();
+    if (!producer)
+      return failure();
+
+    if (auto getGlobal = dyn_cast<memref::GetGlobalOp>(producer)) {
+      rootGetGlobal = getGlobal;
+    }
+    else if (auto subviewOp = dyn_cast<memref::SubViewOp>(producer)) {
+      if (!allStaticSubviewParts(subviewOp))
+        return failure();
+      subviewChain.push_back(subviewOp);
+      cursor = subviewOp.getSource();
+    }
+    else if (auto castOp = dyn_cast<memref::CastOp>(producer)) {
+      cursor = castOp.getSource();
+    }
+    else {
+      return failure();
+    }
+  }
+
+  auto globalOp = lookupGlobalForGetGlobal(moduleOp, rootGetGlobal);
+  if (!globalOp)
+    return failure();
+  auto initialValue = globalOp.getInitialValue();
+  if (!initialValue)
+    return failure();
+
+  auto denseAttr = dyn_cast<DenseElementsAttr>(*initialValue);
+  if (!denseAttr)
+    return failure();
+
+  // Start from the whole global laid out row-major.
+  DenseWeightView view;
+  view.denseAttr = denseAttr;
+  ArrayRef<int64_t> globalShape = denseAttr.getType().getShape();
+  view.shape.assign(globalShape.begin(), globalShape.end());
+  view.strides = computeRowMajorStridesForShape(view.shape);
+
+  // Subviews were collected from the weight towards the global, so apply them
+  // in the opposite order: the one closest to the global first.
+  for (memref::SubViewOp subviewOp : llvm::reverse(subviewChain)) {
+    ArrayRef<int64_t> staticOffsets = subviewOp.getStaticOffsets();
+    ArrayRef<int64_t> staticStrides = subviewOp.getStaticStrides();
+    ArrayRef<int64_t> staticSizes = subviewOp.getStaticSizes();
+
+    SmallVector<int64_t> composedStrides;
+    composedStrides.reserve(staticStrides.size());
+    for (auto [dimOffset, dimStride, parentStride] : llvm::zip_equal(staticOffsets, staticStrides, view.strides)) {
+      view.offset += dimOffset * parentStride;
+      composedStrides.push_back(dimStride * parentStride);
+    }
+
+    view.shape.assign(staticSizes.begin(), staticSizes.end());
+    view.strides = std::move(composedStrides);
+  }
+
+  return view;
+}
+
+/// Collects the weight indices referenced by every pim MVM / VMM op nested in
+/// `block`, without duplicates and sorted in ascending order.
+SmallVector<unsigned, 8> getUsedWeightIndices(Block& block) {
+  SmallVector<unsigned, 8> usedIndices;
+  auto recordIndex = [&usedIndices](unsigned candidate) {
+    if (llvm::is_contained(usedIndices, candidate))
+      return;
+    usedIndices.push_back(candidate);
+  };
+
+  // One traversal handles both op kinds; the final sort makes visit order irrelevant.
+  block.walk([&](Operation* nested) {
+    if (auto mvmOp = dyn_cast<pim::PimMVMOp>(nested))
+      recordIndex(mvmOp.getWeightIndex());
+    else if (auto vmmOp = dyn_cast<pim::PimVMMOp>(nested))
+      recordIndex(vmmOp.getWeightIndex());
+  });
+
+  llvm::sort(usedIndices);
+  return usedIndices;
+}
+
+/// Convenience overload: inspects the first block of the core's body.
+SmallVector<unsigned, 8> getUsedWeightIndices(pim::PimCoreOp coreOp) {
+  Block& entryBlock = coreOp.getBody().front();
+  return getUsedWeightIndices(entryBlock);
+}
+
+/// Returns, in program order, the pim core and core-batch ops that sit
+/// directly in the first block of `funcOp` (nested ops are not visited).
+SmallVector<Operation*> collectTopLevelCoreLikeOps(func::FuncOp funcOp) {
+  SmallVector<Operation*> topLevelCores;
+  Block& entryBlock = funcOp.getBody().front();
+  for (Operation& candidate : entryBlock) {
+    if (!isa<pim::PimCoreOp, pim::PimCoreBatchOp>(&candidate))
+      continue;
+    topLevelCores.push_back(&candidate);
+  }
+  return topLevelCores;
+}
+
+} // namespace
+
+/// Creates `<outputDirPath>/weights` and writes one `crossbar_<N>.bin` file for
+/// each weight referenced by an MVM/VMM op of every top-level core in `funcOp`
+/// (core batches are processed lane by lane through withScalarCoreFromBatchLane).
+///
+/// Every file holds a crossbarSize x crossbarSize row-major matrix: the weight
+/// occupies the top-left corner and the remaining cells are zero. A weight that
+/// is the direct result of a memref.get_global is written once per global and
+/// the file name is reused by every core that references it.
+///
+/// \returns for each core id, the map from weight value to the name of its
+///          file inside the weights directory. When the directory or a file
+///          cannot be created, or a weight cannot be resolved, a diagnostic is
+///          emitted and the map collected so far is returned.
+llvm::DenseMap<size_t, llvm::DenseMap<mlir::Value, std::string>>
+createAndPopulateWeightFolder(func::FuncOp funcOp, StringRef outputDirPath) {
+  llvm::DenseMap<size_t, llvm::DenseMap<mlir::Value, std::string>> mapCoreWeightToFileName;
+  llvm::DenseMap<memref::GlobalOp, std::string> mapGlobalOpToFileName;
+
+  ModuleOp moduleOp = funcOp->getParentOfType<ModuleOp>();
+  // Materialize the path as a string: a Twine must not be kept in a local variable.
+  const std::string coreWeightsDirPath = (outputDirPath + "/weights").str();
+  if (auto error = sys::fs::create_directory(coreWeightsDirPath)) {
+    errs() << "Error while creating weights directory `" << coreWeightsDirPath << "`: " << error.message() << '\n';
+    assert(!error && "Error creating weights directory");
+    return mapCoreWeightToFileName;
+  }
+  size_t indexFileName = 0;
+
+  int64_t xbarSize = crossbarSize.getValue();
+
+  // Emits the weight files of one scalar core. Every failure path returns
+  // failure() so that builds without assertions never touch invalid data.
+  auto processCore = [&](pim::PimCoreOp coreOp) {
+    size_t coreId = static_cast<size_t>(coreOp.getCoreId());
+    for (unsigned index : getUsedWeightIndices(coreOp)) {
+      if (index >= coreOp.getWeights().size()) {
+        coreOp.emitError("Weight index " + std::to_string(index) + " is out of range");
+        assert(index < coreOp.getWeights().size() && "Weight index is out of range");
+        return failure();
+      }
+      mlir::Value weight = coreOp.getWeights()[index];
+
+      auto weightView = resolveDenseWeightView(moduleOp, weight);
+      if (failed(weightView)) {
+        coreOp.emitError("Weight is not from a memref.get_global at index " + std::to_string(index));
+        assert(succeeded(weightView) && "Weight is not from a dense memref.global view");
+        return failure();
+      }
+
+      // The outer map is not modified again during this iteration, so the reference stays valid.
+      auto& weightToFileName = mapCoreWeightToFileName[coreId];
+      if (weightToFileName.contains(weight))
+        continue;
+
+      // Only weights that are a direct get_global result are shared across cores.
+      auto getGlobalOp = weight.getDefiningOp<memref::GetGlobalOp>();
+      auto globalOp = getGlobalOp ? lookupGlobalForGetGlobal(moduleOp, getGlobalOp) : memref::GlobalOp {};
+      if (globalOp) {
+        auto knownFile = mapGlobalOpToFileName.find(globalOp);
+        if (knownFile != mapGlobalOpToFileName.end()) {
+          weightToFileName.insert({weight, knownFile->second});
+          continue;
+        }
+      }
+
+      DenseElementsAttr denseAttr = weightView->denseAttr;
+      ArrayRef<int64_t> shape = weightView->shape;
+      assert(isMatrixShape(shape) && "Weight matrix must be 2-dimensional");
+      int64_t numRows = shape[0];
+      int64_t numCols = shape[1];
+      assert(numRows <= xbarSize && numCols <= xbarSize && "Weight dimensions must not exceed crossbar size");
+
+      // NOTE(review): element types narrower than 8 bits give a width of 0 bytes — confirm they cannot occur.
+      size_t elementByteWidth = denseAttr.getElementType().getIntOrFloatBitWidth() / 8;
+
+      std::string newFileName = "crossbar_" + std::to_string(indexFileName++) + ".bin";
+      std::string weightFilePath = coreWeightsDirPath + "/" + newFileName;
+      std::error_code errorCode;
+      raw_fd_ostream weightFileStream(weightFilePath, errorCode, sys::fs::OF_None);
+      if (errorCode) {
+        errs() << "Error while opening weight file `" << weightFilePath << "`: " << errorCode.message() << '\n';
+        assert(!errorCode && "Error while opening weight file");
+        return failure();
+      }
+
+      // NOTE(review): getValues<APFloat> presumably requires a floating-point element type — TODO confirm
+      // integer weights never reach this point.
+      auto weightValues = denseAttr.getValues<APFloat>();
+      for (int64_t row = 0; row < xbarSize; row++) {
+        for (int64_t col = 0; col < xbarSize; col++) {
+          // Cells outside the weight are padded with zeros.
+          uint64_t word = 0;
+          if (row < numRows && col < numCols) {
+            int64_t elementIndex = weightView->offset + row * weightView->strides[0] + col * weightView->strides[1];
+            word = weightValues[elementIndex].bitcastToAPInt().getZExtValue();
+          }
+          // NOTE(review): emits the first elementByteWidth bytes of `word` in host byte order — looks like
+          // this assumes a little-endian host; confirm.
+          weightFileStream.write(reinterpret_cast<const char*>(&word), elementByteWidth);
+        }
+      }
+
+      weightFileStream.close();
+      if (globalOp)
+        mapGlobalOpToFileName.insert({globalOp, newFileName});
+      weightToFileName.insert({weight, newFileName});
+    }
+    return success();
+  };
+
+  for (Operation* op : collectTopLevelCoreLikeOps(funcOp)) {
+    if (auto coreOp = dyn_cast<pim::PimCoreOp>(op)) {
+      // Stop at the first failure, exactly like the batch path below.
+      if (failed(processCore(coreOp)))
+        return mapCoreWeightToFileName;
+      continue;
+    }
+
+    auto coreBatchOp = cast<pim::PimCoreBatchOp>(op);
+    for (unsigned lane = 0; lane < static_cast<unsigned>(coreBatchOp.getLaneCount()); ++lane)
+      if (failed(withScalarCoreFromBatchLane(coreBatchOp, lane, processCore)))
+        return mapCoreWeightToFileName;
+  }
+  return mapCoreWeightToFileName;
+}
+
+} // namespace onnx_mlir
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Value.h"
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringRef.h"
+
+#include <string>
+
+namespace onnx_mlir {
+
+/// Creates the `weights` directory under `outputDirPath` and writes the
+/// `crossbar_<N>.bin` files for the weights used by the pim cores of `funcOp`.
+///
+/// \returns for each core id, the map from weight value to the name of the
+///          file (relative to the weights directory) that holds it.
+llvm::DenseMap<size_t, llvm::DenseMap<mlir::Value, std::string>>
+createAndPopulateWeightFolder(mlir::func::FuncOp funcOp, llvm::StringRef outputDirPath);
+
+} // namespace onnx_mlir