add PIM accelerator

This commit is contained in:
NiccoloN
2026-02-24 15:09:18 +01:00
parent b24a0df8d7
commit a6e928bdd7
67 changed files with 9109 additions and 1 deletions

View File

@@ -0,0 +1,44 @@
# OMLibs aggregates every ONNX-MLIR library registered so far; the PIM
# accelerator libraries link against all of them.
get_property(OMLibs GLOBAL PROPERTY ONNX_MLIR_LIBS)

# Command-line options shared by the PIM accelerator components.
add_onnx_mlir_library(OMPimCompilerOptions
  PimCompilerOptions.cpp

  EXCLUDE_FROM_OM_LIBS

  INCLUDE_DIRS PRIVATE
  ${PIM_SRC_ROOT}
  ${PIM_BIN_ROOT}
  ${PIM_ONNX_MLIR_SRC_ROOT}
  ${PIM_ONNX_MLIR_BIN_ROOT}

  LINK_LIBS PUBLIC
  ${OMLibs}
  OMCompilerOptions

  ACCEL_INCLUDE_DIRS PRIVATE
  ${PIM_ONNX_MLIR_SRC_ROOT}
  ${PIM_ONNX_MLIR_BIN_ROOT}
  )
# Pass-pipeline helpers and the PIM JSON code generator.
add_onnx_mlir_library(OMPimCompilerUtils
  PimCompilerUtils.cpp
  PimCodeGen.cpp

  EXCLUDE_FROM_OM_LIBS

  INCLUDE_DIRS PRIVATE
  ${PIM_SRC_ROOT}
  ${PIM_BIN_ROOT}
  ${PIM_ONNX_MLIR_SRC_ROOT}
  ${PIM_ONNX_MLIR_BIN_ROOT}

  LINK_LIBS PUBLIC
  ${OMLibs}
  OMCompilerUtils
  OMPimCompilerOptions
  OMCompilerPasses

  ACCEL_INCLUDE_DIRS PRIVATE
  ${PIM_ONNX_MLIR_SRC_ROOT}
  ${PIM_ONNX_MLIR_BIN_ROOT}
  )

View File

@@ -0,0 +1,704 @@
#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
#include "mlir/Dialect/Bufferization/Transforms/BufferViewFlowAnalysis.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/BuiltinTypes.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/JSON.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <string>
#include <system_error>
#include <vector>
#include "Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
#include "Conversion/SpatialToPIM/SpatialToPIMCommon.hpp"
#include "Dialect/Spatial/SpatialOps.hpp"
#include "src/Accelerators/PIM/Compiler/PimCodeGen.hpp"
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
#include "src/Accelerators/PIM/Compiler/PimCompilerUtils.hpp"
#include "src/Accelerators/PIM/Dialect/PIM/PimOps.hpp"
#include "src/Accelerators/PIM/Pass/PimPasses.hpp"
#include "src/Compiler/CompilerPasses.hpp"
#include "src/Compiler/CompilerUtils.hpp"
namespace onnx_mlir {
// Records a to-be-allocated entry for `value`, sized from its statically
// shaped type, and returns a pointer to the stored entry. The address field
// stays 0 until allocateMemoryForValue assigns it later.
// NOTE(review): the returned pointer aliases an element of `memEntries`
// (a SmallVector); any later emplace_back may reallocate the vector and
// leave this pointer dangling — callers must not hold it across gathers.
MemEntry* PimMemory::gatherMemEntry(Value value) {
  auto type = cast<ShapedType>(value.getType());
  assert("Only static shape is supported" && type.hasStaticShape());
  // Size in bytes; assumes element bit widths are multiples of 8.
  size_t allocSize = type.getNumElements() * type.getElementType().getIntOrFloatBitWidth() / 8;
  MemEntry memEntry = {0, allocSize};
  return &memEntries.emplace_back(memEntry, value).first;
}
// Places `memEntry` at the current watermark, advances the watermark past it
// (rounded up to minAlignment), and publishes the placement for `value`.
void PimMemory::allocateMemoryForValue(Value value, MemEntry& memEntry) {
  memEntry.address = firstAvailableAddress;
  size_t nextFree = firstAvailableAddress + memEntry.size;
  // Round the watermark up to the next minAlignment boundary so the
  // following allocation starts aligned.
  size_t misalignment = nextFree % minAlignment;
  if (misalignment != 0)
    nextFree += minAlignment - misalignment;
  firstAvailableAddress = nextFree;
  globalMemEntriesMap[value] = memEntry;
}
// Gathers and allocates host memory for the function: one entry per distinct
// global constant (deduplicated across multiple get_global uses), one per
// function argument, plus every local memref.alloc (via allocateCore).
//
// FIX: the previous version copied the first use's MemEntry *at gather time*
// (before any address was assigned), so every duplicate get_global was
// published with address 0; it also dereferenced a MemEntry* into a
// SmallVector that later gathers could reallocate. We now remember duplicate
// pairs and alias them to the first use's entry only after allocation.
void PimMemory::allocateHost(ModuleOp moduleOp, func::FuncOp funcOp) {
  // More than one SSA value may refer to the same global constant: gather a
  // single entry per memref.global, and remember the other uses so they can
  // share the same placement.
  llvm::SmallDenseMap<memref::GlobalOp, Value, 8> globalConstants;
  llvm::SmallVector<std::pair<Value, Value>, 8> duplicateUses; // (dup, first)
  funcOp.walk([&](memref::GetGlobalOp getGlobalOp) {
    // Weights pinned in crossbars ("weightAlways") get no host allocation.
    if (getGlobalOp->hasAttr("weightAlways")) {
      auto globalMemrefOp = moduleOp.lookupSymbol<memref::GlobalOp>(getGlobalOp.getName());
      auto iter = globalConstants.find(globalMemrefOp);
      if (iter == globalConstants.end()) {
        globalConstants[globalMemrefOp] = getGlobalOp.getResult();
        gatherMemEntry(getGlobalOp);
      }
      else
        duplicateUses.emplace_back(getGlobalOp.getResult(), iter->second);
    }
  });
  for (Value arg : funcOp.getArguments())
    gatherMemEntry(arg);
  allocateCore(funcOp);
  // Addresses are now final: alias each duplicate use to the placement of the
  // first use of the same global.
  for (auto& [duplicate, first] : duplicateUses)
    globalMemEntriesMap[duplicate] = getMemEntry(first);
}
// Gathers every memref.alloc under `op`, then assigns addresses to all
// pending entries, largest first (a simple bin-packing heuristic that keeps
// big buffers at low, well-aligned addresses).
void PimMemory::allocateCore(Operation* op) {
  op->walk([&](memref::AllocOp allocOp) { gatherMemEntry(allocOp); });
  // Sort by descending size. Take the pairs by const reference: the previous
  // by-value comparator copied both (MemEntry, Value) pairs on every
  // comparison.
  llvm::sort(memEntries,
      [](const auto& a, const auto& b) -> bool { return a.first.size > b.first.size; });
  for (auto& [memEntry, value] : memEntries)
    allocateMemoryForValue(value, memEntry);
}
// Returns the published placement of `value`; asserts if it was never
// allocated through this memory space.
MemEntry PimMemory::getMemEntry(Value value) const {
  const auto entryIt = globalMemEntriesMap.find(value);
  assert(entryIt != globalMemEntriesMap.end() && "Missing memEntry for value");
  return entryIt->second;
}
// Returns the device-memory allocator for core `id`, creating it on first use.
// NOTE(review): this returns BY VALUE. The copy shares the placement map (held
// by reference) so addresses still get published, but the copy carries its own
// allocation watermark — mutations such as allocateCore are not remembered by
// the stored allocator, and a second call for the same id restarts at 0.
// Confirm whether a `PimMemory&` return was intended (requires a matching
// change in PimCodeGen.hpp).
PimMemory PimAcceleratorMemory::getOrCreateDeviceMem(size_t id) {
  return deviceMem.try_emplace(id, memEntriesMap).first->second;
}
// Resolves `value` to its allocated byte address by walking back through
// view-like producers until an allocated root is reached:
//  - destination-style ops: follow the operand tied to this result (the
//    result reuses the destination buffer);
//  - memref.subview: follow the source, provided the slice is contiguous
//    (asserted), so the subview shares its source's base address.
// The root must be present in memEntriesMap (`at` asserts otherwise).
size_t PimAcceleratorMemory::getValueAddress(Value value) const {
  while (true) {
    auto definingOp = value.getDefiningOp();
    // Block arguments have no defining op: treat them as roots.
    if (!definingOp)
      break;
    if (auto dpsDefiningOp = dyn_cast<DestinationStyleOpInterface>(definingOp)) {
      OpOperand* tiedOperand = dpsDefiningOp.getTiedOpOperand(cast<OpResult>(value));
      if (!tiedOperand)
        break;
      value = tiedOperand->get();
    }
    else if (auto subviewDefiningOp = dyn_cast<memref::SubViewOp>(definingOp)) {
      auto source = subviewDefiningOp.getSource();
      auto srcShape = source.getType().getShape();
      auto subviewOffsets = subviewDefiningOp.getStaticOffsets();
      auto subviewSizes = subviewDefiningOp.getStaticSizes();
      auto subviewStrides = subviewDefiningOp.getStaticStrides();
      // NOTE(review): the subview's own offset is not added to the returned
      // address — presumably callers pass offsets explicitly; confirm.
      assert(isMemoryContiguous(srcShape, subviewOffsets, subviewSizes, subviewStrides));
      value = source;
    }
    else
      break;
  }
  return memEntriesMap.at(value).address;
}
// Builds an `sldi` (set-register-to-immediate) instruction object.
llvm::json::Object PimCodeGen::createSetImmediate(size_t targetRegister, size_t immediate) {
  return llvm::json::Object{
      {"op", "sldi"}, {"rd", targetRegister}, {"imm", immediate}};
}
// Builds a disabled offset descriptor, shared by several instruction
// encodings that carry an "offset" field.
llvm::json::Object PimCodeGen::createEmptyOffset() {
  return llvm::json::Object{{"offset_select", 0}, {"offset_value", 0}};
}
// Emits one register-setup instruction followed by the JSON array separator.
void PimCodeGen::genSetRegisterImmediateUnsigned(size_t registerNumber, size_t immediate) {
  coreFileStream << llvm::json::Value(createSetImmediate(registerNumber, immediate)) << ',';
}
// Loads the destination operand address into register 0.
void PimCodeGen::createRd(size_t rdAddress, size_t rdOffset) {
  genSetRegisterImmediateUnsigned(/*registerNumber=*/0, rdAddress + rdOffset);
}
// Loads the destination into register 0 and the first source into register 1.
void PimCodeGen::createRdRs1(size_t rdAddress, size_t rdOffset, size_t rs1Address, size_t rs1Offset) {
  createRd(rdAddress, rdOffset);
  genSetRegisterImmediateUnsigned(/*registerNumber=*/1, rs1Address + rs1Offset);
}
// Loads the destination into register 0 and the two sources into
// registers 1 and 2.
void PimCodeGen::createRdRs1Rs2(
    size_t rdAddress, size_t rdOffset, size_t rs1Address, size_t rs1Offset, size_t rs2Address, size_t rs2Offset) {
  createRdRs1(rdAddress, rdOffset, rs1Address, rs1Offset);
  genSetRegisterImmediateUnsigned(/*registerNumber=*/2, rs2Address + rs2Offset);
}
// Emits a host-to-device copy: rd (register 0) points at the device
// destination, rs1 (register 1) at the host source, followed by an `ld`
// instruction that moves `size` bytes.
void PimCodeGen::codeGenLoadOp(pim::PimMemCopyHostToDevOp loadOp) {
  auto deviceDst = loadOp.getDeviceDst();
  auto hostSrc = loadOp.getHostSrc();
  auto deviceDstOffset = loadOp.getDeviceDstOffset();
  auto hostSrcOffset = loadOp.getHostSrcOffset();
  auto size = loadOp.getSize();
  auto deviceDstAlloc = memory.getValueAddress(deviceDst);
  auto hostSrcAlloc = memory.getValueAddress(hostSrc);
  // rd (reg 0) = device destination, rs1 (reg 1) = host source.
  createRdRs1(deviceDstAlloc, deviceDstOffset, hostSrcAlloc, hostSrcOffset);
  llvm::json::Object loadOpJson;
  loadOpJson["op"] = "ld";
  loadOpJson["rd"] = 0;
  loadOpJson["rs1"] = 1;
  loadOpJson["size"] = size;
  loadOpJson["offset"] = createEmptyOffset();
  coreFileStream << llvm::json::Value(std::move(loadOpJson)) << ',';
}
// Emits a device-to-host copy: rd (register 0) points at the host
// destination, rs1 (register 1) at the device source, followed by an `st`
// instruction that moves `size` bytes.
void PimCodeGen::codeGenStoreOp(pim::PimMemCopyDevToHostOp storeOp) {
  auto hostDst = storeOp.getHostDst();
  auto deviceSrc = storeOp.getDeviceSrc();
  auto hostDstOffset = storeOp.getHostDstOffset();
  auto deviceSrcOffset = storeOp.getDeviceSrcOffset();
  auto size = storeOp.getSize();
  auto deviceSrcAlloc = memory.getValueAddress(deviceSrc);
  auto hostDstAlloc = memory.getValueAddress(hostDst);
  // rd (reg 0) = host destination, rs1 (reg 1) = device source.
  createRdRs1(hostDstAlloc, hostDstOffset, deviceSrcAlloc, deviceSrcOffset);
  llvm::json::Object storeOpJson;
  storeOpJson["op"] = "st";
  storeOpJson["rd"] = 0;
  storeOpJson["rs1"] = 1;
  storeOpJson["size"] = size;
  storeOpJson["offset"] = createEmptyOffset();
  coreFileStream << llvm::json::Value(std::move(storeOpJson)) << ',';
}
// Emits a matrix-vector multiply (`mvmul`) against crossbar group `mvmId`.
// Shared by PimMVMOp and PimVMMOp; `transposeMatrix` records whether the
// weight matrix must be transposed when it is eventually written out (see the
// TODO below — the flag is not consumed yet).
// Defined in this .cpp only; both instantiations live in this TU.
template <typename MVMTy>
void PimCodeGen::codeGenMVMLikeOp(size_t mvmId, MVMTy mvmLikeOp, bool transposeMatrix) {
  auto outBufAlloc = memory.getValueAddress(mvmLikeOp.getOutBuf());
  auto vectorAlloc = memory.getValueAddress(mvmLikeOp.getVectorInput());
  // rd (reg 0) = output buffer, rs1 (reg 1) = input vector.
  createRdRs1(outBufAlloc, 0, vectorAlloc, 0);
  llvm::json::Object mvmOpJson;
  mvmOpJson["op"] = "mvmul";
  mvmOpJson["rd"] = 0;
  mvmOpJson["rs1"] = 1;
  mvmOpJson["group"] = mvmId;
  mvmOpJson["relu"] = 0; // fused ReLU disabled
  mvmOpJson["mbiw"] = 8; // NOTE(review): hard-coded matrix bit width — confirm
  coreFileStream << llvm::json::Value(std::move(mvmOpJson)) << ',';
  // TODO: save weights somewhere (if transposeMatrix=true, then transpose the
  // weight matrix)
}
// Emits the instruction stream for pim.apply_filters: for every output pixel,
// runs one `mvmul` per kernel position (each mapped to a crossbar group) and
// accumulates the partial results into the output buffer with `vvadd`.
//
// FIX: the "is this the first weight matrix?" test used to compare the
// Attribute against weightIndices[0]. MLIR attributes are uniqued, so a later
// kernel position carrying the SAME weight id compared equal to the first one
// and overwrote the output buffer instead of accumulating. The test is now
// positional (weightIndex == 0), which matches the stated intent.
void PimCodeGen::codeGenApplyFiltersOp(pim::PimApplyFiltersOp applyFiltersOp) {
  auto outBuff = memory.getValueAddress(applyFiltersOp.getOutBuf());
  auto inBuff = memory.getValueAddress(applyFiltersOp.getInput());
  auto accumBuff = memory.getValueAddress(applyFiltersOp.getAccumBuf());
  // Get weight indices from the operation attribute.
  auto weightIndices = applyFiltersOp.getWeightIndices();
  // Get shape of the input tensor.
  auto inputType = cast<MemRefType>(applyFiltersOp.getInput().getType());
  auto outputType = cast<MemRefType>(applyFiltersOp.getOutBuf().getType());
  auto in_shape = inputType.getShape();
  auto out_shape = outputType.getShape();
  // Extract the relevant dimensions (NCHW-style layout — TODO confirm).
  size_t in_channels = in_shape[1];   // Number of input channels.
  size_t out_channels = out_shape[1]; // Number of output channels.
  size_t dim2 = in_shape.size() > 2 ? in_shape[2] : 1; // Image width.
  size_t dim3 = in_shape.size() > 3 ? in_shape[3] : 1; // Image height.
  // Iterate through pixels.
  for (size_t out_y = 0; out_y < dim3; out_y++) {
    for (size_t out_x = 0; out_x < dim2; out_x++) {
      // For each crossbar (kernel position), perform the MVMUL operation.
      for (size_t weightIndex = 0; weightIndex < weightIndices.size(); weightIndex++) {
        Attribute weight = weightIndices[weightIndex];
        const bool isFirstWeight = weightIndex == 0;
        // --------------------------------------
        // --- STEP 1: Perform MVMUL operation ---
        // --------------------------------------
        // Get the weight matrix ID (crossbar group) for this position.
        auto weightId = cast<IntegerAttr>(weight).getInt();
        size_t xKer = cast<IntegerAttr>(applyFiltersOp.getXKernelPositions()[weightIndex]).getInt();
        size_t yKer = cast<IntegerAttr>(applyFiltersOp.getYKernelPositions()[weightIndex]).getInt();
        // Skip kernel positions that fall outside the image.
        if (out_x + xKer >= dim2 || out_y + yKer >= dim3)
          continue;
        // Byte offsets into the output/input tensors.
        // NOTE(review): 32 looks like a per-element slot size — confirm.
        size_t output_offset = (out_y * dim2 + out_x) * 32 * out_channels;
        size_t input_offset = ((out_y + yKer) * dim2 + (out_x + xKer)) * 32 * in_channels;
        // The first weight writes straight into the output buffer; later
        // weights write their partial result into the accumulator (rs1 is the
        // input tensor, rd the destination).
        if (!isFirstWeight) {
          createRdRs1(accumBuff, 0, inBuff, input_offset);
        }
        else {
          createRdRs1(outBuff, output_offset, inBuff, input_offset);
        }
        // Create the MVMUL JSON object
        llvm::json::Object mvmOpJson;
        mvmOpJson["op"] = "mvmul";
        mvmOpJson["rd"] = 0;
        mvmOpJson["rs1"] = 1;
        mvmOpJson["group"] = weightId;
        mvmOpJson["relu"] = 0;
        mvmOpJson["mbiw"] = 8;
        // Write the JSON to the output stream
        coreFileStream << llvm::json::Value(std::move(mvmOpJson)) << ',';
        // --------------------------------------
        // --- STEP 2: Perform VADD operation ---
        // --------------------------------------
        // The first weight needs no accumulation.
        if (isFirstWeight)
          continue;
        // out[output_offset] += accum[0]: sum the accumulator into the output
        // buffer and store the result back into the output buffer.
        createRdRs1Rs2(outBuff, output_offset, accumBuff, 0, outBuff, output_offset);
        llvm::json::Object vaddOpJson;
        vaddOpJson["op"] = "vvadd";
        vaddOpJson["rd"] = 0;
        vaddOpJson["rs1"] = 1;
        vaddOpJson["rs2"] = 2;
        vaddOpJson["offset"] = createEmptyOffset();
        coreFileStream << llvm::json::Value(std::move(vaddOpJson)) << ',';
      }
    }
  }
}
// Emits an element-wise vector add (`vvadd`): out = a + b, all operands at
// offset 0, with an explicit byte length derived from the output type.
void PimCodeGen::codeGenVAddOp(pim::PimVAddOp vaddOp) {
  auto outBufAlloc = memory.getValueAddress(vaddOp.getOutBuf());
  auto rs1BufferOp = memory.getValueAddress(vaddOp.getA());
  auto rs2BufferOp = memory.getValueAddress(vaddOp.getB());
  createRdRs1Rs2(outBufAlloc, 0, rs1BufferOp, 0, rs2BufferOp, 0);
  // Total element count of the output buffer: use the ShapedType API instead
  // of multiplying the dimensions by hand.
  auto outputType = cast<MemRefType>(vaddOp.getOutBuf().getType());
  size_t totalElements = outputType.getNumElements();
  // Element size in bytes (assumes bit widths that are multiples of 8).
  auto elementSize = vaddOp.getOutRes().getType().getElementTypeBitWidth() / 8;
  llvm::json::Object vaddOpJson; // renamed from the misleading `mvmOpJson`
  vaddOpJson["op"] = "vvadd";
  vaddOpJson["rd"] = 0;
  vaddOpJson["rs1"] = 1;
  vaddOpJson["rs2"] = 2;
  vaddOpJson["offset"] = createEmptyOffset();
  vaddOpJson["len"] = totalElements * elementSize;
  coreFileStream << llvm::json::Value(std::move(vaddOpJson)) << ',';
}
// Emits an element-wise vector max (`vvmax`): out = max(a, b), all operands
// at offset 0.
void PimCodeGen::codeGenVMaxOp(pim::PimVMaxOp vmaxOp) {
  const auto dstAddr = memory.getValueAddress(vmaxOp.getOutBuf());
  const auto lhsAddr = memory.getValueAddress(vmaxOp.getA());
  const auto rhsAddr = memory.getValueAddress(vmaxOp.getB());
  createRdRs1Rs2(dstAddr, 0, lhsAddr, 0, rhsAddr, 0);
  llvm::json::Object vmaxOpJson;
  vmaxOpJson["op"] = "vvmax";
  vmaxOpJson["rd"] = 0;
  vmaxOpJson["rs1"] = 1;
  vmaxOpJson["rs2"] = 2;
  vmaxOpJson["offset"] = createEmptyOffset();
  // NOTE(review): unlike vvadd, no "len" field is emitted here — confirm the
  // simulator derives the length elsewhere.
  coreFileStream << llvm::json::Value(std::move(vmaxOpJson)) << ',';
}
// Emits an element-wise ReLU (`vrelu`): out = relu(a), operands at offset 0.
void PimCodeGen::codeGenVReluOp(pim::PimVReluOp vreluOp) {
  const auto dstAddr = memory.getValueAddress(vreluOp.getOutBuf());
  const auto srcAddr = memory.getValueAddress(vreluOp.getA());
  createRdRs1(dstAddr, 0, srcAddr, 0);
  llvm::json::Object vreluOpJson;
  vreluOpJson["op"] = "vrelu";
  vreluOpJson["rd"] = 0;
  vreluOpJson["rs1"] = 1;
  vreluOpJson["offset"] = createEmptyOffset();
  coreFileStream << llvm::json::Value(std::move(vreluOpJson)) << ',';
}
// Emits a `recv` instruction: receive `size` bytes from the source core into
// the destination buffer (rd, register 0).
void PimCodeGen::codeGenReceiveOp(pim::PimReceiveOp receiveOp) {
  createRd(memory.getValueAddress(receiveOp.getDst()), /* dest_offset = */ 0);
  llvm::json::Object recvOpJson;
  recvOpJson["op"] = "recv";
  recvOpJson["rd"] = 0;
  recvOpJson["core"] = receiveOp.getSrcCoreId();
  recvOpJson["size"] = receiveOp.getSize();
  recvOpJson["offset"] = createEmptyOffset();
  coreFileStream << llvm::json::Value(std::move(recvOpJson)) << ',';
}
// Emits a `send` instruction: push `size` bytes from the source buffer to the
// target core. The address register is technically a source (rs1), but the
// encoding names it "rd".
void PimCodeGen::codeGenSendOp(pim::PimSendOp sendOp) {
  createRd(memory.getValueAddress(sendOp.getSrc()), /* dest_offset = */ 0);
  llvm::json::Object sendOpJson;
  sendOpJson["op"] = "send";
  sendOpJson["rd"] = 0;
  sendOpJson["core"] = sendOp.getTargetCoreId();
  sendOpJson["size"] = sendOp.getSize();
  sendOpJson["offset"] = createEmptyOffset();
  coreFileStream << llvm::json::Value(std::move(sendOpJson)) << ',';
}
// Returns the larger of the two leading dimensions of `matrixShape`, i.e. the
// crossbar edge length needed to host the matrix.
// NOTE(review): rank-4 shapes are accepted but only dims 0/1 are consulted —
// confirm this is intended.
size_t getMatrixSize(ShapedType matrixShape) {
  // Single assert keeps the predicate in the failure message; the previous
  // `if (...) assert(false)` form lost the condition and left a dead branch
  // under NDEBUG.
  assert((matrixShape.getRank() == 2 || matrixShape.getRank() == 4) &&
         "Unsupported matrix shape");
  return std::max(matrixShape.getDimSize(0), matrixShape.getDimSize(1));
}
// Formats a byte count with the largest fitting binary unit (GB/MB/KB/Bytes),
// truncating the quotient. Boundary fix: thresholds now use >=, so exactly
// 1024 prints "1 KB" instead of "1024 Bytes" (same at the MB/GB boundaries).
std::string getMemorySizeAsString(size_t size) {
  if (size >= 1024 * 1024 * 1024)
    return std::to_string(size / 1024 / 1024 / 1024) + " GB";
  if (size >= 1024 * 1024)
    return std::to_string(size / 1024 / 1024) + " MB";
  if (size >= 1024)
    return std::to_string(size / 1024) + " KB";
  return std::to_string(size) + " Bytes";
}
// Drives PIM code generation for `moduleOpRef` into `outputDirPath`:
//   1. allocates host memory and writes the initial image (memory.bin);
//   2. emits one JSON instruction array per pim.core op (core_<id>.json);
//   3. dumps each core's crossbar weights as zero-padded binaries;
//   4. writes the global simulator configuration (config.json).
// Returns CompilerSuccess, or an error code on any file-system failure.
int compileModuleToPIMJSON(const OwningOpRef<ModuleOp>& moduleOpRef, std::string& outputDirPath) {
  ModuleOp moduleOp = moduleOpRef.get();
  // Anything short of full codegen: just print the IR and succeed.
  if (pimEmissionTarget != EmitPimCodegen) {
    moduleOp.dump();
    return CompilerSuccess;
  }
  if (!outputDirPath.empty()) {
    if (auto error = llvm::sys::fs::create_directory(outputDirPath)) {
      llvm::errs() << "Error creating output directory: " << outputDirPath << ": " << error.message() << '\n';
      return InvalidOutputFileAccess;
    }
  }
  // For each core, specify the number of crossbar per array group
  // This implementation always assigns one crossbar per group
  llvm::json::Object xbarsPerArrayGroup;
  // Codegen operates on the first (assumed only) function of the module.
  auto funcOps = moduleOp.getOps<func::FuncOp>();
  assert(!funcOps.empty() && "No function found in the module");
  auto funcOp = *funcOps.begin();
  PimAcceleratorMemory memory;
  memory.hostMem.allocateHost(moduleOp, funcOp);
  // Write memory binary file
  auto memoryFilePath = outputDirPath + "/memory.bin";
  std::error_code errorCode;
  llvm::raw_fd_ostream memoryFileStream(memoryFilePath, errorCode, llvm::sys::fs::OF_None);
  if (errorCode) {
    llvm::errs() << "Error while opening memory file " << memoryFilePath << ": " << errorCode.message() << '\n';
    return InvalidOutputFileAccess;
  }
  // Zero-initialized buffer covering the whole allocated host address space.
  std::vector<char> memoryBuffer(memory.hostMem.getFirstAvailableAddress(), 0);
  // Write global values at their allocated addresses
  funcOp.walk([&](memref::GetGlobalOp getGlobalOp) {
    // Weights pinned in crossbars ("weightAlways") are not materialized here.
    if (getGlobalOp->hasAttr("weightAlways"))
      return;
    auto globalOp = moduleOp.lookupSymbol<memref::GlobalOp>(getGlobalOp.getName());
    if (!globalOp)
      return;
    auto initialValue = globalOp.getInitialValue();
    if (!initialValue)
      return;
    auto denseAttr = dyn_cast<DenseElementsAttr>(*initialValue);
    if (!denseAttr)
      return;
    auto memEntry = memory.hostMem.getMemEntry(getGlobalOp.getResult());
    auto rawData = denseAttr.getRawData();
    // Clamp to the entry size so an oversized initializer cannot overflow.
    std::memcpy(memoryBuffer.data() + memEntry.address, rawData.data(), std::min(rawData.size(), memEntry.size));
  });
  memoryFileStream.write(memoryBuffer.data(), memoryBuffer.size());
  memoryFileStream.close();
  size_t coreCount = 0;
  for (auto coreOp : funcOp.getOps<pim::PimCoreOp>()) {
    auto coreId = coreOp.getCoreId();
    coreCount++;
    std::error_code errorCode;
    auto outputCorePath = outputDirPath + "/core_" + std::to_string(coreId) + ".json";
    llvm::raw_fd_ostream coreFileStream(outputCorePath, errorCode);
    if (errorCode) {
      llvm::errs() << "Error while opening core file `" << outputCorePath << "`: " << errorCode.message() << '\n';
      return InvalidOutputFileAccess;
    }
    // The instruction stream is a JSON array; each codeGen* helper appends
    // one or more objects, each followed by a trailing comma.
    coreFileStream << '[';
    // NOTE(review): config keys use "core<id>" while file/dir names use
    // "core_<id>" — confirm the consumer expects this asymmetry.
    auto coreNameString = "core" + std::to_string(coreId);
    PimCodeGen coreCodeGen(memory, coreFileStream);
    // Assign device-side addresses for this core's local buffers.
    // NOTE(review): getOrCreateDeviceMem returns a copy, so the per-core
    // watermark is not persisted — see PimAcceleratorMemory.
    memory.getOrCreateDeviceMem(coreId).allocateCore(coreOp);
    size_t processedOperations = 0;
    for (auto& op : coreOp.getBody().front()) {
      // Allocation and halt ops carry no instruction payload.
      if (isa<memref::AllocOp>(op))
        continue;
      if (isa<pim::PimHaltOp>(op))
        continue;
      if (auto loadOp = dyn_cast<pim::PimMemCopyHostToDevOp>(op)) {
        coreCodeGen.codeGenLoadOp(loadOp);
      }
      else if (auto storeOp = dyn_cast<pim::PimMemCopyDevToHostOp>(op)) {
        coreCodeGen.codeGenStoreOp(storeOp);
      }
      else if (auto vmmOp = dyn_cast<pim::PimVMMOp>(op)) {
        // Vector-matrix multiply: the weight matrix must be transposed.
        coreCodeGen.codeGenMVMLikeOp<pim::PimVMMOp>(vmmOp.getWeightIndex(), vmmOp, true);
      }
      else if (auto mvmOp = dyn_cast<pim::PimMVMOp>(op)) {
        coreCodeGen.codeGenMVMLikeOp<pim::PimMVMOp>(mvmOp.getWeightIndex(), mvmOp, false);
      }
      else if (auto applyFiltersOp = dyn_cast<pim::PimApplyFiltersOp>(op)) {
        coreCodeGen.codeGenApplyFiltersOp(applyFiltersOp);
      }
      else if (auto vaddOp = dyn_cast<pim::PimVAddOp>(op)) {
        coreCodeGen.codeGenVAddOp(vaddOp);
      }
      else if (auto vmaxOp = dyn_cast<pim::PimVMaxOp>(op)) {
        coreCodeGen.codeGenVMaxOp(vmaxOp);
      }
      else if (auto vreluOp = dyn_cast<pim::PimVReluOp>(op)) {
        coreCodeGen.codeGenVReluOp(vreluOp);
      }
      else if (auto receiveOp = dyn_cast<pim::PimReceiveOp>(op)) {
        coreCodeGen.codeGenReceiveOp(receiveOp);
      }
      else if (auto sendOp = dyn_cast<pim::PimSendOp>(op)) {
        coreCodeGen.codeGenSendOp(sendOp);
      }
      else if (auto sumOp = dyn_cast<pim::PimSumOp>(op)) {
        // TODO: Implement somehow?
        op.emitWarning("Sum operation is not supported");
        continue;
      }
      else if (auto vsDivOp = dyn_cast<pim::PimVSDivOp>(op)) {
        // TODO: Implement somehow?
        op.emitWarning("VSDiv operation is not supported");
        continue;
      }
      else if (auto vexpOp = dyn_cast<pim::PimVExpOp>(op)) {
        // TODO: Implement somehow?
        op.emitWarning("VExp operation is not supported");
        continue;
      }
      else if (isa<memref::SubViewOp>(op)) {
        // Subviews are resolved to base addresses during codegen; no
        // instruction is emitted for them.
        continue;
      }
      else {
        op.emitError("Unsupported codegen for this operation");
        op.dump();
        return CompilerFailure;
      }
      processedOperations++;
    }
    // The trailing-comma removal below requires at least one emitted object.
    assert(processedOperations > 0);
    // Remove trailing comma
    coreFileStream.seek(coreFileStream.tell() - 1);
    coreFileStream << ']';
    coreFileStream.close();
    // Create output directory for this core's crossbar weights
    auto coreWeightsDirPath = outputDirPath + "/core_" + std::to_string(coreId);
    if (auto error = llvm::sys::fs::create_directory(coreWeightsDirPath)) {
      llvm::errs() << "Error creating core directory: " << coreWeightsDirPath << ": " << error.message() << '\n';
      return InvalidOutputFileAccess;
    }
    int64_t xbarSize = crossbarSize.getValue();
    size_t weightIndex = 0;
    llvm::json::Array xbarsPerGroup;
    for (auto weight : coreOp.getWeights()) {
      // NOTE(review): this pushes the running weight index, while the comment
      // at the top says each group holds one crossbar — confirm the simulator
      // expects group ids (not counts) here.
      xbarsPerGroup.push_back(weightIndex);
      // Weights that cannot be resolved to a dense global initializer are
      // skipped with a warning (no .bin file is written for them).
      auto getGlobalOp = weight.getDefiningOp<memref::GetGlobalOp>();
      if (!getGlobalOp) {
        coreOp.emitWarning("Weight is not from a memref.get_global at index " + std::to_string(weightIndex));
        weightIndex++;
        continue;
      }
      auto globalOp = SymbolTable::lookupNearestSymbolFrom<memref::GlobalOp>(moduleOp, getGlobalOp.getNameAttr());
      if (!globalOp) {
        coreOp.emitWarning("Could not find memref.global for weight at index " + std::to_string(weightIndex));
        weightIndex++;
        continue;
      }
      auto initialValue = globalOp.getInitialValue();
      if (!initialValue) {
        coreOp.emitWarning("memref.global has no initial value at index " + std::to_string(weightIndex));
        weightIndex++;
        continue;
      }
      auto denseAttr = dyn_cast<DenseElementsAttr>(*initialValue);
      if (!denseAttr) {
        coreOp.emitWarning("memref.global initial value is not dense at index " + std::to_string(weightIndex));
        weightIndex++;
        continue;
      }
      auto type = denseAttr.getType();
      auto shape = type.getShape();
      assert(isMatrixShape(shape) && "Weight matrix must be 2-dimensional");
      int64_t numRows = shape[0];
      int64_t numCols = shape[1];
      assert(numRows <= xbarSize && numCols <= xbarSize && "Weight dimensions must not exceed crossbar size");
      auto elementType = type.getElementType();
      size_t elementByteWidth = elementType.getIntOrFloatBitWidth() / 8;
      // Write crossbar weights as binary, padded to crossbarSize x crossbarSize
      auto weightFilePath = coreWeightsDirPath + "/crossbar_" + std::to_string(weightIndex) + ".bin";
      llvm::raw_fd_ostream weightFileStream(weightFilePath, errorCode, llvm::sys::fs::OF_None);
      if (errorCode) {
        llvm::errs() << "Error while opening weight file `" << weightFilePath << "`: " << errorCode.message() << '\n';
        return InvalidOutputFileAccess;
      }
      uint64_t zero = 0;
      // Row-major dump: real matrix values inside [numRows)x[numCols), zero
      // padding elsewhere. Element bytes are written in host endianness.
      for (int64_t row = 0; row < xbarSize; row++) {
        for (int64_t col = 0; col < xbarSize; col++) {
          if (row < numRows && col < numCols) {
            int64_t index = row * numCols + col;
            APInt bits = denseAttr.getValues<APFloat>()[index].bitcastToAPInt();
            uint64_t word = bits.getZExtValue();
            weightFileStream.write(reinterpret_cast<const char*>(&word), elementByteWidth);
          }
          else {
            weightFileStream.write(reinterpret_cast<const char*>(&zero), elementByteWidth);
          }
        }
      }
      weightFileStream.close();
      weightIndex++;
    }
    xbarsPerArrayGroup[coreNameString] = std::move(xbarsPerGroup);
  }
  // Step 3: Write configuration to JSON
  llvm::json::Object configJson;
  configJson["core_cnt"] = coreCount;
  // TODO: Should this be based on the floating point type used in the model?
  //// The 2 following values determine the bitwidth of the vectors' elements:
  //// bitwidth = adc_count * cell_precision
  // Number of ADC for MVM units
  configJson["adc_count"] = 16;
  // Bit precision of each ADC
  configJson["cell_precision"] = 2;
  //// Crossbar configuration
  configJson["xbar_array_count"] = crossbarCountInCore.getValue();
  configJson["xbar_size"] = {crossbarSize.getValue(), crossbarSize.getValue()};
  // Store the crossbar sizes
  configJson["array_group_map"] = std::move(xbarsPerArrayGroup);
  // Store the memory layout of inputs and outputs
  llvm::json::Array inputsAddresses;
  for (BlockArgument input : funcOp.getArguments())
    inputsAddresses.push_back(memory.getValueAddress(input));
  configJson["inputs_addresses"] = std::move(inputsAddresses);
  llvm::json::Array outputsAddresses;
  for (func::ReturnOp returnOp : funcOp.getOps<func::ReturnOp>())
    for (Value output : returnOp.getOperands())
      outputsAddresses.push_back(memory.getValueAddress(output));
  configJson["outputs_addresses"] = std::move(outputsAddresses);
  // Step 4: Write config JSON
  std::string openOutputErrorMsg;
  auto configPath = outputDirPath + "/config.json";
  std::error_code EC;
  llvm::raw_fd_ostream jsonOS(configPath, EC);
  if (EC) {
    llvm::errs() << "Error while opening config file: " << EC.message() << '\n';
    return InvalidOutputFileAccess;
  }
  jsonOS << llvm::json::Value(std::move(configJson)) << '\n';
  jsonOS.close();
  showCompilePhase("Code generated into " + configPath);
  return CompilerSuccess;
}
} // namespace onnx_mlir

View File

@@ -0,0 +1,97 @@
#pragma once
#include "llvm/Support/JSON.h"
#include "Common/ValueMap.hpp"
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
#include "src/Accelerators/PIM/Compiler/PimCompilerUtils.hpp"
#include "src/Accelerators/PIM/Dialect/PIM/PimOps.hpp"
#include "src/Accelerators/PIM/Pass/PimPasses.hpp"
#include "src/Compiler/CompilerPasses.hpp"
namespace onnx_mlir {
// Placement of one buffer in a flat PIM address space.
struct MemEntry {
  size_t address; // byte offset from the start of the memory space
  size_t size;    // allocation size in bytes
};
// Bump allocator that assigns flat byte addresses to MLIR Values inside one
// memory space (host DRAM or one core's device memory). Placements are
// published into a map shared with the owning PimAcceleratorMemory.
class PimMemory {
  // (entry, value) pairs gathered first, then sorted by size and allocated.
  SmallVector<std::pair<MemEntry, Value>, 32> memEntries;
  // Shared Value -> placement map, owned by PimAcceleratorMemory.
  llvm::SmallDenseMap<Value, MemEntry, 32>& globalMemEntriesMap;
  size_t maxSize = 0; // 0 for unbounded memory
  // NOTE(review): maxSize and startAddress are not read by the visible
  // allocation code — confirm intended use.
  size_t startAddress = 0;
  size_t minAlignment = 4;          // allocations end on this byte boundary
  size_t firstAvailableAddress = 0; // current allocation watermark (bytes)
  // Records a pending entry for `value`; see the .cpp for lifetime caveats
  // on the returned pointer.
  MemEntry* gatherMemEntry(Value value);
  // Assigns an address to `memEntry` and publishes it for `value`.
  void allocateMemoryForValue(Value value, MemEntry& memEntry);
public:
  PimMemory(llvm::SmallDenseMap<Value, MemEntry, 32>& globalMemEntriesMap)
      : globalMemEntriesMap(globalMemEntriesMap) {}
  // Gathers and allocates globals, function arguments, and local allocs.
  void allocateHost(ModuleOp moduleOp, func::FuncOp funcOp);
  // Gathers and allocates every memref.alloc nested under `op`.
  void allocateCore(Operation* op);
  size_t getFirstAvailableAddress() const { return firstAvailableAddress; }
  // Returns the placement of `value`; asserts if it was never allocated.
  MemEntry getMemEntry(Value value) const;
};
// Aggregates the host allocator plus one device allocator per PIM core, all
// publishing into a single shared Value -> MemEntry placement map.
class PimAcceleratorMemory {
public:
  // Shared placement map used by the host and all device allocators.
  llvm::SmallDenseMap<Value, MemEntry, 32> memEntriesMap;
  PimMemory hostMem;
private:
  // Per-core device allocators, keyed by core id.
  llvm::SmallDenseMap<size_t, PimMemory> deviceMem;
public:
  PimAcceleratorMemory()
      : hostMem(memEntriesMap) {}
  // NOTE(review): returns the per-core allocator BY VALUE; mutations made
  // through the returned copy (e.g. allocateCore) do not update the stored
  // allocator's watermark, so a second call for the same id restarts at 0.
  // Confirm whether PimMemory& was intended.
  PimMemory getOrCreateDeviceMem(size_t id);
  // Resolves `value` (walking DPS result ties and contiguous subviews back
  // to an allocated root) to its byte address.
  size_t getValueAddress(Value value) const;
};
// Translates bufferized PIM dialect ops into the simulator's JSON instruction
// stream for one core. Each codeGen* method appends one or more JSON objects
// (each followed by a ',') to `coreFileStream`; operand addresses are
// resolved through `memory` and loaded into registers 0/1/2 via `sldi`
// set-immediate instructions before the operation itself is emitted.
class PimCodeGen {
  PimAcceleratorMemory& memory;         // address resolution for all operands
  llvm::raw_fd_ostream& coreFileStream; // open core_<id>.json output
public:
  PimCodeGen(PimAcceleratorMemory& memory, llvm::raw_fd_ostream& coreJson)
      : memory(memory), coreFileStream(coreJson) {}
  // Builds an `sldi` (set-register-immediate) instruction object.
  llvm::json::Object createSetImmediate(size_t targetRegister, size_t immediate);
  // Builds a disabled "offset" descriptor used by several encodings.
  llvm::json::Object createEmptyOffset();
  // Emits one `sldi` instruction to the stream.
  void genSetRegisterImmediateUnsigned(size_t registerNumber, size_t immediate);
  // Register-setup helpers: rd -> reg 0, rs1 -> reg 1, rs2 -> reg 2.
  void createRd(size_t rdAddress, size_t rdOffset);
  void createRdRs1(size_t rdAddress, size_t rdOffset, size_t rs1Address, size_t rs1Offset);
  void createRdRs1Rs2(
      size_t rdAddress, size_t rdOffset, size_t rs1Address, size_t rs1Offset, size_t rs2Address, size_t rs2Offset);
  // Per-op emitters (one JSON instruction group each).
  void codeGenLoadOp(pim::PimMemCopyHostToDevOp loadOp);
  void codeGenStoreOp(pim::PimMemCopyDevToHostOp storeOp);
  // Shared mvmul emitter for PimMVMOp/PimVMMOp (defined in the .cpp).
  template <typename MVMTy>
  void codeGenMVMLikeOp(size_t mvmId, MVMTy mvmLikeOp, bool transposeMatrix);
  void codeGenReceiveOp(pim::PimReceiveOp receiveOp);
  void codeGenSendOp(pim::PimSendOp sendOp);
  void codeGenVAddOp(pim::PimVAddOp vaddOp);
  void codeGenVMaxOp(pim::PimVMaxOp vmaxOp);
  void codeGenVReluOp(pim::PimVReluOp vreluOp);
  void codeGenApplyFiltersOp(pim::PimApplyFiltersOp applyFiltersOp);
};
} // namespace onnx_mlir

View File

@@ -0,0 +1,56 @@
/*
* SPDX-License-Identifier: Apache-2.0
*/
//===------------------------- PimCompilerOptions.cpp --------------------===//
//
// Copyright 2022 The IBM Research Authors.
//
// =============================================================================
//
// Compiler Options for PIM
//
//===----------------------------------------------------------------------===//
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
#define DEBUG_TYPE "PimCompilerOptions"
namespace onnx_mlir {
// Emission-stage selector. The enum values are positional (the option has no
// flag name), so a stage is picked directly by its clEnumVal keyword.
llvm::cl::opt<PimEmissionTargetType> pimEmissionTarget(
    llvm::cl::desc("[Optional] Choose PIM-related target to emit "
                   "(once selected it will cancel the other targets):"),
    llvm::cl::values(clEnumVal(EmitSpatial, "Lower model to spatial IR")),
    llvm::cl::values(clEnumVal(EmitPim, "Lower model to PIM IR")),
    llvm::cl::values(
        clEnumVal(EmitPimBufferized, "Lower model to PIM IR and bufferize it")),
    llvm::cl::values(clEnumVal(EmitPimCodegen, "Lower model to PIM IR and "
                                               "generate code for PIM")),
    llvm::cl::init(EmitPimCodegen), llvm::cl::cat(OnnxMlirOptions));
// Skip all lowering; assume the input is already bufferized PIM IR.
llvm::cl::opt<bool> pimOnlyCodegen("pim-only-codegen",
    llvm::cl::desc("Only generate code for PIM (assume input is already in "
                   "bufferized PIM IR)"),
    llvm::cl::init(false), llvm::cl::cat(OnnxMlirOptions));
llvm::cl::opt<bool> useExperimentalConvImpl("use-experimental-conv-impl",
    llvm::cl::desc("Use experimental implementation for convolution"),
    llvm::cl::init(false), llvm::cl::cat(OnnxMlirOptions));
// Fixed help-text typo ("heigth" -> "height") and registered the remaining
// options under OnnxMlirOptions, consistent with the options above (they
// previously landed in the generic option category).
llvm::cl::opt<size_t> crossbarSize("crossbar-size",
    llvm::cl::desc("Width and height of a single crossbar"), llvm::cl::init(2),
    llvm::cl::cat(OnnxMlirOptions));
llvm::cl::opt<size_t> crossbarCountInCore("crossbar-count",
    llvm::cl::desc("Number of crossbars in each core"), llvm::cl::init(2),
    llvm::cl::cat(OnnxMlirOptions));
llvm::cl::opt<long> coresCount("core-count",
    llvm::cl::desc("Number of cores in the chip. `-1` to use the minimum "
                   "amount of cores."),
    llvm::cl::init(-1), llvm::cl::cat(OnnxMlirOptions));
// Semantics documented in PimCompilerOptions.hpp next to the declaration.
llvm::cl::opt<bool> ignoreConcatError("ignore-concat-error",
    llvm::cl::desc(
        "Ignore ConcatOp corner case: do not assert and do a simplification"),
    llvm::cl::init(false), llvm::cl::cat(OnnxMlirOptions));
// NOTE(review): the header declares `extern llvm::cl::opt<bool>
// exportCrossbarWeights;` but no definition appears in this file — confirm it
// is defined elsewhere, or any ODR-use will fail to link.
} // namespace onnx_mlir

View File

@@ -0,0 +1,42 @@
#pragma once
#include "llvm/Support/CommandLine.h"
// Hook macros consumed by the onnx-mlir accelerator option framework
// (presumably expanded into shared enums/CL lists; empty for PIM — confirm).
#define INSTRUMENTSTAGE_ENUM_PIM
#define INSTRUMENTSTAGE_CL_ENUM_PIM
#define PROFILEIR_CL_ENUM_PIM
#define OPTREPORT_ENUM_PIM
#define OPTREPORT_CL_ENUM_PIM
namespace onnx_mlir {
// How far down the PIM pipeline compilation proceeds; values are ordered so
// later stages compare greater than earlier ones (the pass setup relies on
// `>=` comparisons).
typedef enum {
  EmitSpatial = 0,       // Lower to the Spatial dialect only.
  EmitPim = 1,           // Lower further to the PIM dialect.
  EmitPimBufferized = 2, // Additionally bufferize the PIM IR.
  EmitPimCodegen = 3     // Full pipeline: emit PIM code artifacts.
} PimEmissionTargetType;
extern llvm::cl::OptionCategory OnnxMlirOptions;
extern llvm::cl::opt<onnx_mlir::PimEmissionTargetType> pimEmissionTarget;
extern llvm::cl::opt<bool> pimOnlyCodegen;
extern llvm::cl::opt<bool> useExperimentalConvImpl;
// NOTE(review): no definition of exportCrossbarWeights is visible in the
// accompanying .cpp — confirm it is defined elsewhere to avoid a link error.
extern llvm::cl::opt<bool> exportCrossbarWeights;
extern llvm::cl::opt<size_t> crossbarSize;
extern llvm::cl::opt<size_t> crossbarCountInCore;
extern llvm::cl::opt<long> coresCount;
// When false (default), resolving a concat tile that is produced by two
// separate operands of the concat is a hard error (assert). When true, the
// case is simplified: only the tile from the first operand is taken.
extern llvm::cl::opt<bool> ignoreConcatError;
} // namespace onnx_mlir

View File

@@ -0,0 +1,56 @@
#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
#include "mlir/Dialect/Bufferization/Transforms/BufferViewFlowAnalysis.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/Transforms/Passes.h"
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
#include "src/Accelerators/PIM/Compiler/PimCompilerUtils.hpp"
#include "src/Accelerators/PIM/Dialect/PIM/PimOps.hpp"
#include "src/Accelerators/PIM/Pass/PimPasses.hpp"
#include "src/Compiler/CompilerPasses.hpp"
#include "llvm/Support/JSON.h"
#include <cassert>
#include <cstddef>
#define DEBUG_TYPE "PimCompilerUtils"
using namespace mlir;
using namespace onnx_mlir;
namespace onnx_mlir {
// Populates `pm` with the PIM lowering pipeline, gated by the global
// `pimEmissionTarget` option: ONNX -> Spatial -> PIM -> bufferized PIM.
// `module` and `outputNameNoExt` are not used by the current implementation
// (kept for signature parity with the other accelerators' addPasses hooks).
void addPassesPim(OwningOpRef<ModuleOp>& module,
    PassManager& pm,
    EmissionTargetType& emissionTarget,
    std::string outputNameNoExt) {
  if (pimOnlyCodegen) {
    // Skip all the lowering passes and directly generate code for PIM.
    return;
  }
  // Standard ONNX-to-MLIR lowering first (non-CPU target).
  if (emissionTarget >= EmitONNXIR)
    addONNXToMLIRPasses(pm, /*target CPU*/ false);
  // Each stage below is cumulative: a later pimEmissionTarget implies all
  // earlier conversions.
  if (pimEmissionTarget >= EmitSpatial) {
    pm.addPass(createONNXToSpatialPass());
    // pm.addPass(createCountInstructionPass());
    pm.addPass(createMessagePass("ONNX lowered to SPATIAL"));
  }
  if (pimEmissionTarget >= EmitPim) {
    pm.addPass(createSpatialToPIMPass());
    // pm.addPass(createCountInstructionPass());
    pm.addPass(createMessagePass("SPATIAL lowered to PIM"));
  }
  if (pimEmissionTarget >= EmitPimBufferized) {
    pm.addPass(createBufferizePimPass());
    // pm.addPass(createCountInstructionPass());
    pm.addPass(createMessagePass("PIM bufferized"));
  }
}
} // namespace onnx_mlir

View File

@@ -0,0 +1,19 @@
#pragma once
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Pass/PassRegistry.h"
#include "onnx-mlir/Compiler/OMCompilerTypes.h"
namespace onnx_mlir {
// Populates `pm` with the PIM lowering pipeline (ONNX -> Spatial -> PIM ->
// bufferized PIM), gated by the global pimEmissionTarget option.
void addPassesPim(mlir::OwningOpRef<mlir::ModuleOp>& module,
    mlir::PassManager& pm,
    EmissionTargetType& emissionTarget,
    std::string outputNameNoExt);
// Emits the PIM codegen artifacts (memory.bin, per-core instruction JSON,
// crossbar weight binaries, config.json) into `outputDirName`.
// Returns CompilerSuccess or a compiler error code.
int compileModuleToPIMJSON(const mlir::OwningOpRef<mlir::ModuleOp>& moduleOpRef,
    std::string& outputDirName);
} // namespace onnx_mlir