// Raptor/src/PIM/Compiler/PimCodeGen.cpp

#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
#include "mlir/Dialect/Bufferization/Transforms/BufferViewFlowAnalysis.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/BuiltinTypes.h"

#include "llvm/ADT/SmallSet.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/JSON.h"
#include "llvm/Support/raw_ostream.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>

#include "Conversion/ONNXToSpatial/ONNXToSpatialCommon.hpp"
#include "Conversion/SpatialToPIM/SpatialToPIMCommon.hpp"
#include "Dialect/Spatial/SpatialOps.hpp"
#include "src/Accelerators/PIM/Compiler/PimCodeGen.hpp"
#include "src/Accelerators/PIM/Compiler/PimCompilerOptions.hpp"
#include "src/Accelerators/PIM/Compiler/PimCompilerUtils.hpp"
#include "src/Accelerators/PIM/Dialect/PIM/PimOps.hpp"
#include "src/Accelerators/PIM/Pass/PimPasses.hpp"
#include "src/Compiler/CompilerPasses.hpp"
#include "src/Compiler/CompilerUtils.hpp"

namespace onnx_mlir {

/// Registers a not-yet-placed memory entry for `value` and returns a pointer to it.
///
/// The entry size in bytes is numElements * elementBitWidth / 8 (integer division) and its
/// address is initialised to 0; a real address is only assigned by allocateMemoryForValue.
/// `value` must have a ShapedType with a static shape.
///
/// NOTE(review): the returned pointer refers to an element stored inside `memEntries`. If that
/// container reallocates when it grows, the pointer presumably dangles after a later call —
/// confirm before keeping it around.
MemEntry* PimMemory::gatherMemEntry(Value value) {
  auto shapedType = cast<ShapedType>(value.getType());
  assert("Only static shape is supported" && shapedType.hasStaticShape());

  size_t byteSize = shapedType.getNumElements() * shapedType.getElementType().getIntOrFloatBitWidth() / 8;
  MemEntry pendingEntry = {0, byteSize};

  auto& storedPair = memEntries.emplace_back(pendingEntry, value);
  return &storedPair.first;
}

/// Places `memEntry` at the current allocation cursor and publishes it for `value`.
///
/// After the entry is placed, the cursor is advanced by the entry size and then rounded up to
/// the next multiple of `minAlignment`, so the following entry starts aligned.
void PimMemory::allocateMemoryForValue(Value value, MemEntry& memEntry) {
  memEntry.address = firstAvailableAddress;

  // Move past the entry, then pad up to the alignment boundary if needed.
  firstAvailableAddress += memEntry.size;
  const size_t misalignment = firstAvailableAddress % minAlignment;
  if (misalignment != 0)
    firstAvailableAddress += minAlignment - misalignment;

  globalMemEntriesMap[value] = memEntry;
}

/// Allocates host memory for every value of `funcOp` that needs an address.
///
/// Entries are gathered for:
///   - the first memref.get_global result seen for each global (get_global ops carrying the
///     "weightAlways" attribute are skipped),
///   - every function argument,
///   - every memref.alloc found by allocateCore.
///
/// Several SSA values may refer to the same global constant. Only the first one gets its own
/// memory entry, otherwise the constant would be allocated multiple times. The remaining
/// values are recorded as aliases and receive a copy of the first value's entry *after*
/// allocateCore has assigned addresses. Copying the entry before allocation would publish an
/// address of 0 for the aliases.
void PimMemory::allocateHost(ModuleOp moduleOp, func::FuncOp funcOp) {
  // First SSA value seen for each global; that value owns the memory entry.
  llvm::SmallDenseMap<memref::GlobalOp, Value, 8> firstValueOfGlobal;
  // (alias value, owning value) pairs resolved once addresses are known.
  llvm::SmallVector<std::pair<Value, Value>, 8> aliases;

  funcOp.walk([&](memref::GetGlobalOp getGlobalOp) {
    if (getGlobalOp->hasAttr("weightAlways"))
      return;

    auto globalMemrefOp = moduleOp.lookupSymbol<memref::GlobalOp>(getGlobalOp.getName());
    Value result = getGlobalOp.getResult();
    auto [iter, inserted] = firstValueOfGlobal.try_emplace(globalMemrefOp, result);
    if (inserted)
      gatherMemEntry(result);
    else
      aliases.emplace_back(result, iter->second);
  });

  for (Value arg : funcOp.getArguments())
    gatherMemEntry(arg);

  allocateCore(funcOp);

  // Addresses are final now: make every alias share the entry of the value that owns it.
  for (const auto& [alias, owner] : aliases) {
    auto ownerIter = globalMemEntriesMap.find(owner);
    assert(ownerIter != globalMemEntriesMap.end() && "Missing memEntry for first use of a global");
    // Copy before inserting: inserting into the map may invalidate `ownerIter`.
    MemEntry sharedEntry = ownerIter->second;
    globalMemEntriesMap[alias] = sharedEntry;
  }
}

/// Gathers an entry for every memref.alloc nested in `op`, then assigns addresses to all
/// gathered entries, visiting them from the largest size to the smallest.
void PimMemory::allocateCore(Operation* op) {
  op->walk([this](memref::AllocOp allocOp) { gatherMemEntry(allocOp); });

  // Largest entries are placed first.
  auto largerFirst = [](const auto& lhs, const auto& rhs) -> bool { return lhs.first.size > rhs.first.size; };
  llvm::sort(memEntries, largerFirst);

  for (auto& entryAndValue : memEntries)
    allocateMemoryForValue(entryAndValue.second, entryAndValue.first);
}

/// Returns a copy of the memory entry published for `value`.
/// Asserts that an entry exists.
MemEntry PimMemory::getMemEntry(Value value) const {
  auto entryIter = globalMemEntriesMap.find(value);
  assert("Missing memEntry for value" && entryIter != globalMemEntriesMap.end());
  return entryIter->second;
}

/// Looks up the device memory registered under `id`, creating it from `memEntriesMap` when it
/// does not exist yet.
///
/// NOTE(review): the result is returned by value, i.e. as a copy of the stored PimMemory.
/// State changed through the returned object is presumably not reflected in `deviceMem`
/// unless PimMemory shares it by reference — confirm against the class definition.
PimMemory PimAcceleratorMemory::getOrCreateDeviceMem(size_t id) {
  auto lookupResult = deviceMem.try_emplace(id, memEntriesMap);
  auto& storedMemory = lookupResult.first->second;
  return storedMemory;
}

/// Resolves `value` to the address of the buffer that backs it.
///
/// The value is traced backwards through its defining ops:
///   - a result of a destination-style op is replaced by its tied operand,
///   - a memref.subview result is replaced by the subview source (asserting contiguity).
/// Tracing stops at a block argument, at an untied result, or at any other op. The address is
/// then read from `memEntriesMap`.
///
/// NOTE(review): for subviews only the source is followed; the subview's own offsets are not
/// added to the returned address here. Presumably callers account for them separately —
/// confirm.
size_t PimAcceleratorMemory::getValueAddress(Value value) const {
  while (Operation* producer = value.getDefiningOp()) {
    if (auto dpsProducer = dyn_cast<DestinationStyleOpInterface>(producer)) {
      OpOperand* tiedInit = dpsProducer.getTiedOpOperand(cast<OpResult>(value));
      if (!tiedInit)
        break;
      value = tiedInit->get();
      continue;
    }

    auto subviewProducer = dyn_cast<memref::SubViewOp>(producer);
    if (!subviewProducer)
      break;

    auto viewSource = subviewProducer.getSource();
    auto sourceShape = viewSource.getType().getShape();
    auto viewOffsets = subviewProducer.getStaticOffsets();
    auto viewSizes = subviewProducer.getStaticSizes();
    auto viewStrides = subviewProducer.getStaticStrides();
    assert(isMemoryContiguous(sourceShape, viewOffsets, viewSizes, viewStrides));
    value = viewSource;
  }

  return memEntriesMap.at(value).address;
}

/// Builds the JSON object of an "sldi" instruction that loads `immediate` into register
/// `targetRegister`.
llvm::json::Object PimCodeGen::createSetImmediate(size_t targetRegister, size_t immediate) {
  return llvm::json::Object {
    {"op",  "sldi"        },
    {"rd",  targetRegister},
    {"imm", immediate     }
  };
}

/// Builds the JSON "offset" object with both `offset_select` and `offset_value` set to 0.
llvm::json::Object PimCodeGen::createEmptyOffset() {
  return llvm::json::Object {
    {"offset_select", 0},
    {"offset_value",  0}
  };
}

void PimCodeGen::genSetRegisterImmediateUnsigned(size_t registerNumber, size_t immediate) {
  llvm::json::Object setRegisterJson = createSetImmediate(registerNumber, immediate);
  coreFileStream << llvm::json::Value(std::move(setRegisterJson)) << ',';
}

/// Loads the destination address (`rdAddress + rdOffset`) into register 0.
void PimCodeGen::createRd(size_t rdAddress, size_t rdOffset) {
  constexpr size_t rdRegister = 0;
  const size_t effectiveAddress = rdAddress + rdOffset;
  genSetRegisterImmediateUnsigned(rdRegister, effectiveAddress);
}

void PimCodeGen::createRdRs1(size_t rdAddress, size_t rdOffset, size_t rs1Address, size_t rs1Offset) {
  // rd on register 0
  genSetRegisterImmediateUnsigned(0, rdAddress + rdOffset);
  // rs1 on register 1
  genSetRegisterImmediateUnsigned(1, rs1Address + rs1Offset);
}

void PimCodeGen::createRdRs1Rs2(
  size_t rdAddress, size_t rdOffset, size_t rs1Address, size_t rs1Offset, size_t rs2Address, size_t rs2Offset) {
  // rd on register 0
  genSetRegisterImmediateUnsigned(0, rdAddress + rdOffset);
  // rs1 on register 1
  genSetRegisterImmediateUnsigned(1, rs1Address + rs1Offset);
  // rs2 on register 2
  genSetRegisterImmediateUnsigned(2, rs2Address + rs2Offset);
}

/// Emits the instructions for a host-to-device copy: the register setup followed by an "ld"
/// instruction.
void PimCodeGen::codeGenLoadOp(pim::PimMemCopyHostToDevOp loadOp) {
  // Base addresses of the two buffers involved in the copy.
  auto dstBaseAddress = memory.getValueAddress(loadOp.getDeviceDst());
  auto srcBaseAddress = memory.getValueAddress(loadOp.getHostSrc());

  // Register 0 <- device destination, register 1 <- host source.
  createRdRs1(dstBaseAddress, loadOp.getDeviceDstOffset(), srcBaseAddress, loadOp.getHostSrcOffset());

  llvm::json::Object instruction;
  instruction["op"] = "ld";
  instruction["rd"] = 0;
  instruction["rs1"] = 1;
  instruction["size"] = loadOp.getSize();
  instruction["offset"] = createEmptyOffset();

  coreFileStream << llvm::json::Value(std::move(instruction)) << ',';
}

/// Emits the instructions for a device-to-host copy: the register setup followed by an "st"
/// instruction.
void PimCodeGen::codeGenStoreOp(pim::PimMemCopyDevToHostOp storeOp) {
  // Base addresses of the two buffers involved in the copy.
  auto srcBaseAddress = memory.getValueAddress(storeOp.getDeviceSrc());
  auto dstBaseAddress = memory.getValueAddress(storeOp.getHostDst());

  // Register 0 <- host destination, register 1 <- device source.
  createRdRs1(dstBaseAddress, storeOp.getHostDstOffset(), srcBaseAddress, storeOp.getDeviceSrcOffset());

  llvm::json::Object instruction;
  instruction["op"] = "st";
  instruction["rd"] = 0;
  instruction["rs1"] = 1;
  instruction["size"] = storeOp.getSize();
  instruction["offset"] = createEmptyOffset();

  coreFileStream << llvm::json::Value(std::move(instruction)) << ',';
}

template <typename MVMTy>
void PimCodeGen::codeGenMVMLikeOp(size_t mvmId, MVMTy mvmLikeOp, bool transposeMatrix) {
  auto outBufAlloc = memory.getValueAddress(mvmLikeOp.getOutBuf());
  auto vectorAlloc = memory.getValueAddress(mvmLikeOp.getVectorInput());

  createRdRs1(outBufAlloc, 0, vectorAlloc, 0);

  llvm::json::Object mvmOpJson;
  mvmOpJson["op"] = "mvmul";
  mvmOpJson["rd"] = 0;
  mvmOpJson["rs1"] = 1;
  mvmOpJson["group"] = mvmId;
  mvmOpJson["relu"] = 0;
  mvmOpJson["mbiw"] = 8;

  coreFileStream << llvm::json::Value(std::move(mvmOpJson)) << ',';

  // TODO: save weights somewhere (if transposeMatrix=true, then transpose the
  // weight matrix)
}

// Emits the instruction sequence for an apply-filters op.
//
// For every (out_y, out_x) position and every entry of the op's weight indices, an "mvmul"
// is emitted that reads the input at the position shifted by the entry's kernel offsets.
// The entry comparing equal to weightIndices[0] writes straight into the output buffer; every
// other entry writes into the accumulator buffer and is then added to the output buffer with
// a "vvadd". Positions whose shifted coordinates fall outside the image are skipped.
void PimCodeGen::codeGenApplyFiltersOp(pim::PimApplyFiltersOp applyFiltersOp) {

  // Base addresses of the output, input and accumulator buffers.
  auto outBuff = memory.getValueAddress(applyFiltersOp.getOutBuf());
  auto inBuff = memory.getValueAddress(applyFiltersOp.getInput());
  auto accumBuff = memory.getValueAddress(applyFiltersOp.getAccumBuf());

  // Get weight indices from the operation attribute.
  auto weightIndices = applyFiltersOp.getWeightIndices();

  // Get shape of the input and output tensors.
  auto inputType = cast<MemRefType>(applyFiltersOp.getInput().getType());
  auto outputType = cast<MemRefType>(applyFiltersOp.getOutBuf().getType());
  auto in_shape = inputType.getShape();
  auto out_shape = outputType.getShape();

  // Extract the relevant dimensions.
  size_t in_channels = in_shape[1];                    // Number of input channels.
  size_t out_channels = out_shape[1];                  // Number of output channels.

  // NOTE(review): dim2 is treated as the image width and dim3 as the image height below. In an
  // NCHW layout dimension 2 would be the height — confirm the layout this op expects.
  size_t dim2 = in_shape.size() > 2 ? in_shape[2] : 1; // Image width (1 if the dimension is absent).
  size_t dim3 = in_shape.size() > 3 ? in_shape[3] : 1; // Image height (1 if the dimension is absent).

  // Iterate through pixels.
  for (size_t out_y = 0; out_y < dim3; out_y++) {
    for (size_t out_x = 0; out_x < dim2; out_x++) {

      // For each crossbar, perform the MVMUL operation.
      size_t weightIndex = 0;
      for (Attribute weight : weightIndices) {

        // ---------------------------------------
        // --- STEP 1: Perform MVMUL operation ---
        // ---------------------------------------

        // Get the weight matrix ID for this position.
        auto weightId = cast<IntegerAttr>(weight).getInt();

        // Kernel offsets associated with this weight entry.
        size_t xKer = cast<IntegerAttr>(applyFiltersOp.getXKernelPositions()[weightIndex]).getInt();
        size_t yKer = cast<IntegerAttr>(applyFiltersOp.getYKernelPositions()[weightIndex]).getInt();

        // Advance before the bounds check so that a skipped entry still consumes its index.
        weightIndex++;

        // Skip kernel positions that would read outside the image.
        // NOTE(review): if the entry equal to weightIndices[0] is skipped here, the later entries
        // for this pixel still accumulate into the output buffer, which was then not written by
        // this loop for that pixel — confirm this is intended.
        if (out_x + xKer >= dim2 || out_y + yKer >= dim3)
          continue;

        // Calculate the offset for the input (and output) tensor.
        // NOTE(review): the factor 32 is presumably the per-channel stride of one pixel — confirm.
        size_t output_offset = (out_y * dim2 + out_x) * 32 * out_channels;
        size_t input_offset = ((out_y + yKer) * dim2 + (out_x + xKer)) * 32 * in_channels;

        // Read from the input tensor and store the partial result in the
        // accumulator buffer, if this is not the first weight matrix.

        // Note that rs1 is the input tensor, and rd is the output tensor.
        // TODO: This order of arguments is confusing, check if the correct
        // order is being used in the MVMUL operation. The order below is
        // correct.
        // NOTE(review): "first" is decided by attribute equality with weightIndices[0], not by
        // position, so a later entry with the same value is also treated as the first — confirm.
        if (weightIndices[0] != weight) {
          createRdRs1(accumBuff, 0, inBuff, input_offset);
        }
        else {
          // Otherwise store directly in the output buffer.
          createRdRs1(outBuff, output_offset, inBuff, input_offset);
        }

        // Create the MVMUL JSON object
        llvm::json::Object mvmOpJson;
        mvmOpJson["op"] = "mvmul";
        mvmOpJson["rd"] = 0;
        mvmOpJson["rs1"] = 1;
        mvmOpJson["group"] = weightId;
        mvmOpJson["relu"] = 0;
        mvmOpJson["mbiw"] = 8;

        // Write the JSON to the output stream
        coreFileStream << llvm::json::Value(std::move(mvmOpJson)) << ',';

        // --------------------------------------
        // --- STEP 2: Perform VADD operation ---
        // --------------------------------------

        // If this is the first weight matrix, we don't need to perform a VADD.
        if (weightIndices[0] == weight)
          continue;

        // We now need to sum the value in the accumulator buffer with the value
        // in the output buffer, and store the result in the output buffer.
        createRdRs1Rs2(outBuff, output_offset, accumBuff, 0, outBuff, output_offset);

        // Unlike codeGenVAddOp, this "vvadd" is emitted without a "len" field.
        llvm::json::Object vaddOpJson;
        vaddOpJson["op"] = "vvadd";
        vaddOpJson["rd"] = 0;
        vaddOpJson["rs1"] = 1;
        vaddOpJson["rs2"] = 2;
        vaddOpJson["offset"] = createEmptyOffset();

        coreFileStream << llvm::json::Value(std::move(vaddOpJson)) << ',';
      }
    }
  }
}

/// Emits a "vvadd" instruction adding operands A and B into the output buffer.
/// The instruction's "len" is the output buffer's element count multiplied by the byte width
/// of the result element type.
void PimCodeGen::codeGenVAddOp(pim::PimVAddOp vaddOp) {
  auto resultAddress = memory.getValueAddress(vaddOp.getOutBuf());
  auto lhsAddress = memory.getValueAddress(vaddOp.getA());
  auto rhsAddress = memory.getValueAddress(vaddOp.getB());

  // Register 0 <- output, register 1 <- A, register 2 <- B (no offsets).
  createRdRs1Rs2(resultAddress, 0, lhsAddress, 0, rhsAddress, 0);

  // Total number of elements = product of all output dimensions.
  auto resultShape = cast<MemRefType>(vaddOp.getOutBuf().getType()).getShape();
  size_t elementCount = 1;
  for (auto dimLength : resultShape)
    elementCount *= dimLength;

  auto bytesPerElement = vaddOp.getOutRes().getType().getElementTypeBitWidth() / 8;

  llvm::json::Object instruction;
  instruction["op"] = "vvadd";
  instruction["rd"] = 0;
  instruction["rs1"] = 1;
  instruction["rs2"] = 2;
  instruction["offset"] = createEmptyOffset();
  instruction["len"] = elementCount * bytesPerElement;

  coreFileStream << llvm::json::Value(std::move(instruction)) << ',';
}

/// Emits a "vvmax" instruction taking operands A and B and writing to the output buffer.
void PimCodeGen::codeGenVMaxOp(pim::PimVMaxOp vmaxOp) {
  auto resultAddress = memory.getValueAddress(vmaxOp.getOutBuf());
  auto lhsAddress = memory.getValueAddress(vmaxOp.getA());
  auto rhsAddress = memory.getValueAddress(vmaxOp.getB());

  // Register 0 <- output, register 1 <- A, register 2 <- B (no offsets).
  createRdRs1Rs2(resultAddress, 0, lhsAddress, 0, rhsAddress, 0);

  llvm::json::Object instruction;
  instruction["op"] = "vvmax";
  instruction["rd"] = 0;
  instruction["rs1"] = 1;
  instruction["rs2"] = 2;
  instruction["offset"] = createEmptyOffset();

  coreFileStream << llvm::json::Value(std::move(instruction)) << ',';
}

/// Emits a "vrelu" instruction taking operand A and writing to the output buffer.
void PimCodeGen::codeGenVReluOp(pim::PimVReluOp vreluOp) {
  auto resultAddress = memory.getValueAddress(vreluOp.getOutBuf());
  auto inputAddress = memory.getValueAddress(vreluOp.getA());

  // Register 0 <- output, register 1 <- A (no offsets).
  createRdRs1(resultAddress, 0, inputAddress, 0);

  llvm::json::Object instruction;
  instruction["op"] = "vrelu";
  instruction["rd"] = 0;
  instruction["rs1"] = 1;
  instruction["offset"] = createEmptyOffset();

  coreFileStream << llvm::json::Value(std::move(instruction)) << ',';
}

/// Emits a "recv" instruction that receives `getSize()` from core `getSrcCoreId()` into the
/// op's destination buffer.
void PimCodeGen::codeGenReceiveOp(pim::PimReceiveOp receiveOp) {
  auto destinationAddress = memory.getValueAddress(receiveOp.getDst());

  // Register 0 <- destination buffer (no offset).
  createRd(destinationAddress, 0);

  llvm::json::Object instruction;
  instruction["op"] = "recv";
  instruction["rd"] = 0;
  instruction["core"] = receiveOp.getSrcCoreId();
  instruction["size"] = receiveOp.getSize();
  instruction["offset"] = createEmptyOffset();

  coreFileStream << llvm::json::Value(std::move(instruction)) << ',';
}

/// Emits a "send" instruction that sends `getSize()` from the op's source buffer to core
/// `getTargetCoreId()`.
void PimCodeGen::codeGenSendOp(pim::PimSendOp sendOp) {
  auto sourceAddress = memory.getValueAddress(sendOp.getSrc());

  // The source address goes through the "rd" helper and register 0, even though it is
  // logically a source operand; only the register number matters to the instruction.
  createRd(sourceAddress, 0);

  llvm::json::Object instruction;
  instruction["op"] = "send";
  instruction["rd"] = 0;
  instruction["core"] = sendOp.getTargetCoreId();
  instruction["size"] = sendOp.getSize();
  instruction["offset"] = createEmptyOffset();

  coreFileStream << llvm::json::Value(std::move(instruction)) << ',';
}

/// Returns the larger of the first two dimensions of `matrixShape`.
/// Only rank-2 and rank-4 shapes are supported (checked by an assertion).
size_t getMatrixSize(ShapedType matrixShape) {
  const auto rank = matrixShape.getRank();
  assert((rank == 2 || rank == 4) && "Unsupported matrix shape");
  (void) rank; // Only read by the assertion above.

  const auto firstDim = matrixShape.getDimSize(0);
  const auto secondDim = matrixShape.getDimSize(1);
  return firstDim < secondDim ? secondDim : firstDim;
}

/// Formats a byte count as a short human-readable string.
///
/// The largest binary unit (GB, MB, KB) that fits at least once is used and the value is
/// truncated to a whole number of that unit; smaller sizes are printed in "Bytes".
/// A size that is exactly one unit is reported in that unit (1024 -> "1 KB"), not as 1024 of
/// the next smaller one.
std::string getMemorySizeAsString(size_t size) {
  constexpr size_t kib = 1024;
  constexpr size_t mib = kib * 1024;
  constexpr size_t gib = mib * 1024;

  if (size >= gib)
    return std::to_string(size / gib) + " GB";
  if (size >= mib)
    return std::to_string(size / mib) + " MB";
  if (size >= kib)
    return std::to_string(size / kib) + " KB";
  return std::to_string(size) + " Bytes";
}

int compileModuleToPIMJSON(const OwningOpRef<ModuleOp>& moduleOpRef, std::string& outputDirPath) {
  ModuleOp moduleOp = moduleOpRef.get();

  if (pimEmissionTarget != EmitPimCodegen) {
    moduleOp.dump();
    return CompilerSuccess;
  }

  if (!outputDirPath.empty()) {
    if (auto error = llvm::sys::fs::create_directory(outputDirPath)) {
      llvm::errs() << "Error creating output directory: " << outputDirPath << ": " << error.message() << '\n';
      return InvalidOutputFileAccess;
    }
  }

  // For each core, specify the number of crossbar per array group
  // This implementation always assigns one crossbar per group
  llvm::json::Object xbarsPerArrayGroup;

  auto funcOps = moduleOp.getOps<func::FuncOp>();
  assert(!funcOps.empty() && "No function found in the module");
  auto funcOp = *funcOps.begin();

  PimAcceleratorMemory memory;
  memory.hostMem.allocateHost(moduleOp, funcOp);

  // Write memory binary file
  auto memoryFilePath = outputDirPath + "/memory.bin";
  std::error_code errorCode;
  llvm::raw_fd_ostream memoryFileStream(memoryFilePath, errorCode, llvm::sys::fs::OF_None);
  if (errorCode) {
    llvm::errs() << "Error while opening memory file " << memoryFilePath << ": " << errorCode.message() << '\n';
    return InvalidOutputFileAccess;
  }
  // Zero-initialized buffer
  std::vector<char> memoryBuffer(memory.hostMem.getFirstAvailableAddress(), 0);
  // Write global values at their allocated addresses
  funcOp.walk([&](memref::GetGlobalOp getGlobalOp) {
    if (getGlobalOp->hasAttr("weightAlways"))
      return;
    auto globalOp = moduleOp.lookupSymbol<memref::GlobalOp>(getGlobalOp.getName());
    if (!globalOp)
      return;
    auto initialValue = globalOp.getInitialValue();
    if (!initialValue)
      return;
    auto denseAttr = dyn_cast<DenseElementsAttr>(*initialValue);
    if (!denseAttr)
      return;
    auto memEntry = memory.hostMem.getMemEntry(getGlobalOp.getResult());
    auto rawData = denseAttr.getRawData();
    std::memcpy(memoryBuffer.data() + memEntry.address, rawData.data(), std::min(rawData.size(), memEntry.size));
  });
  memoryFileStream.write(memoryBuffer.data(), memoryBuffer.size());
  memoryFileStream.close();

  size_t coreCount = 0;
  for (auto coreOp : funcOp.getOps<pim::PimCoreOp>()) {
    auto coreId = coreOp.getCoreId();
    coreCount++;

    std::error_code errorCode;
    auto outputCorePath = outputDirPath + "/core_" + std::to_string(coreId) + ".json";
    llvm::raw_fd_ostream coreFileStream(outputCorePath, errorCode);
    if (errorCode) {
      llvm::errs() << "Error while opening core file `" << outputCorePath << "`: " << errorCode.message() << '\n';
      return InvalidOutputFileAccess;
    }

    coreFileStream << '[';
    auto coreNameString = "core" + std::to_string(coreId);

    PimCodeGen coreCodeGen(memory, coreFileStream);
    memory.getOrCreateDeviceMem(coreId).allocateCore(coreOp);

    size_t processedOperations = 0;
    for (auto& op : coreOp.getBody().front()) {
      if (isa<memref::AllocOp>(op))
        continue;
      if (isa<pim::PimHaltOp>(op))
        continue;
      if (auto loadOp = dyn_cast<pim::PimMemCopyHostToDevOp>(op)) {
        coreCodeGen.codeGenLoadOp(loadOp);
      }
      else if (auto storeOp = dyn_cast<pim::PimMemCopyDevToHostOp>(op)) {
        coreCodeGen.codeGenStoreOp(storeOp);
      }
      else if (auto vmmOp = dyn_cast<pim::PimVMMOp>(op)) {
        coreCodeGen.codeGenMVMLikeOp<pim::PimVMMOp>(vmmOp.getWeightIndex(), vmmOp, true);
      }
      else if (auto mvmOp = dyn_cast<pim::PimMVMOp>(op)) {
        coreCodeGen.codeGenMVMLikeOp<pim::PimMVMOp>(mvmOp.getWeightIndex(), mvmOp, false);
      }
      else if (auto applyFiltersOp = dyn_cast<pim::PimApplyFiltersOp>(op)) {
        coreCodeGen.codeGenApplyFiltersOp(applyFiltersOp);
      }
      else if (auto vaddOp = dyn_cast<pim::PimVAddOp>(op)) {
        coreCodeGen.codeGenVAddOp(vaddOp);
      }
      else if (auto vmaxOp = dyn_cast<pim::PimVMaxOp>(op)) {
        coreCodeGen.codeGenVMaxOp(vmaxOp);
      }
      else if (auto vreluOp = dyn_cast<pim::PimVReluOp>(op)) {
        coreCodeGen.codeGenVReluOp(vreluOp);
      }
      else if (auto receiveOp = dyn_cast<pim::PimReceiveOp>(op)) {
        coreCodeGen.codeGenReceiveOp(receiveOp);
      }
      else if (auto sendOp = dyn_cast<pim::PimSendOp>(op)) {
        coreCodeGen.codeGenSendOp(sendOp);
      }
      else if (auto sumOp = dyn_cast<pim::PimSumOp>(op)) {
        // TODO: Implement somehow?
        op.emitWarning("Sum operation is not supported");
        continue;
      }
      else if (auto vsDivOp = dyn_cast<pim::PimVSDivOp>(op)) {
        // TODO: Implement somehow?
        op.emitWarning("VSDiv operation is not supported");
        continue;
      }
      else if (auto vexpOp = dyn_cast<pim::PimVExpOp>(op)) {
        // TODO: Implement somehow?
        op.emitWarning("VExp operation is not supported");
        continue;
      }
      else if (isa<memref::SubViewOp>(op)) {
        continue;
      }
      else {
        op.emitError("Unsupported codegen for this operation");
        op.dump();
        return CompilerFailure;
      }
      processedOperations++;
    }
    assert(processedOperations > 0);
    // Remove trailing comma
    coreFileStream.seek(coreFileStream.tell() - 1);
    coreFileStream << ']';
    coreFileStream.close();

    // Create output directory for this core's crossbar weights
    auto coreWeightsDirPath = outputDirPath + "/core_" + std::to_string(coreId);
    if (auto error = llvm::sys::fs::create_directory(coreWeightsDirPath)) {
      llvm::errs() << "Error creating core directory: " << coreWeightsDirPath << ": " << error.message() << '\n';
      return InvalidOutputFileAccess;
    }

    int64_t xbarSize = crossbarSize.getValue();
    size_t weightIndex = 0;
    llvm::json::Array xbarsPerGroup;
    for (auto weight : coreOp.getWeights()) {
      xbarsPerGroup.push_back(weightIndex);
      auto getGlobalOp = weight.getDefiningOp<memref::GetGlobalOp>();
      if (!getGlobalOp) {
        coreOp.emitWarning("Weight is not from a memref.get_global at index " + std::to_string(weightIndex));
        weightIndex++;
        continue;
      }

      auto globalOp = SymbolTable::lookupNearestSymbolFrom<memref::GlobalOp>(moduleOp, getGlobalOp.getNameAttr());
      if (!globalOp) {
        coreOp.emitWarning("Could not find memref.global for weight at index " + std::to_string(weightIndex));
        weightIndex++;
        continue;
      }

      auto initialValue = globalOp.getInitialValue();
      if (!initialValue) {
        coreOp.emitWarning("memref.global has no initial value at index " + std::to_string(weightIndex));
        weightIndex++;
        continue;
      }

      auto denseAttr = dyn_cast<DenseElementsAttr>(*initialValue);
      if (!denseAttr) {
        coreOp.emitWarning("memref.global initial value is not dense at index " + std::to_string(weightIndex));
        weightIndex++;
        continue;
      }

      auto type = denseAttr.getType();
      auto shape = type.getShape();
      assert(isMatrixShape(shape) && "Weight matrix must be 2-dimensional");
      int64_t numRows = shape[0];
      int64_t numCols = shape[1];
      assert(numRows <= xbarSize && numCols <= xbarSize && "Weight dimensions must not exceed crossbar size");

      auto elementType = type.getElementType();
      size_t elementByteWidth = elementType.getIntOrFloatBitWidth() / 8;

      // Write crossbar weights as binary, padded to crossbarSize x crossbarSize
      auto weightFilePath = coreWeightsDirPath + "/crossbar_" + std::to_string(weightIndex) + ".bin";
      llvm::raw_fd_ostream weightFileStream(weightFilePath, errorCode, llvm::sys::fs::OF_None);
      if (errorCode) {
        llvm::errs() << "Error while opening weight file `" << weightFilePath << "`: " << errorCode.message() << '\n';
        return InvalidOutputFileAccess;
      }

      uint64_t zero = 0;
      for (int64_t row = 0; row < xbarSize; row++) {
        for (int64_t col = 0; col < xbarSize; col++) {
          if (row < numRows && col < numCols) {
            int64_t index = row * numCols + col;
            APInt bits = denseAttr.getValues<APFloat>()[index].bitcastToAPInt();
            uint64_t word = bits.getZExtValue();
            weightFileStream.write(reinterpret_cast<const char*>(&word), elementByteWidth);
          }
          else {
            weightFileStream.write(reinterpret_cast<const char*>(&zero), elementByteWidth);
          }
        }
      }

      weightFileStream.close();
      weightIndex++;
    }
    xbarsPerArrayGroup[coreNameString] = std::move(xbarsPerGroup);
  }

  // Step 3: Write configuration to JSON
  llvm::json::Object configJson;
  configJson["core_cnt"] = coreCount;

  // TODO: Should this be based on the floating point type used in the model?
  //// The 2 following values determine the bitwidth of the vectors' elements:
  //// bitwidth = adc_count * cell_precision
  // Number of ADC for MVM units
  configJson["adc_count"] = 16;
  // Bit precision of each ADC
  configJson["cell_precision"] = 2;

  //// Crossbar configuration
  configJson["xbar_array_count"] = crossbarCountInCore.getValue();
  configJson["xbar_size"] = {crossbarSize.getValue(), crossbarSize.getValue()};

  // Store the crossbar sizes
  configJson["array_group_map"] = std::move(xbarsPerArrayGroup);

  // Store the memory layout of inputs and outputs
  llvm::json::Array inputsAddresses;
  for (BlockArgument input : funcOp.getArguments())
    inputsAddresses.push_back(memory.getValueAddress(input));
  configJson["inputs_addresses"] = std::move(inputsAddresses);
  llvm::json::Array outputsAddresses;
  for (func::ReturnOp returnOp : funcOp.getOps<func::ReturnOp>())
    for (Value output : returnOp.getOperands())
      outputsAddresses.push_back(memory.getValueAddress(output));
  configJson["outputs_addresses"] = std::move(outputsAddresses);

  // Step 4: Write config JSON
  std::string openOutputErrorMsg;
  auto configPath = outputDirPath + "/config.json";
  std::error_code EC;
  llvm::raw_fd_ostream jsonOS(configPath, EC);
  if (EC) {
    llvm::errs() << "Error while opening config file: " << EC.message() << '\n';
    return InvalidOutputFileAccess;
  }
  jsonOS << llvm::json::Value(std::move(configJson)) << '\n';
  jsonOS.close();

  showCompilePhase("Code generated into " + configPath);

  return CompilerSuccess;
}

} // namespace onnx_mlir