Raptor/validation/tools/compare_raptor_pimcomp.py

#!/usr/bin/env python3
from __future__ import annotations

import argparse
import gzip
import importlib.util
import json
import mmap
import os
import re
import shlex
import shutil
import subprocess
import sys
import time
import types
from collections import Counter
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any

import numpy as np
import onnx


# Repository root: two directory levels above the folder that contains this file.
REPO = Path(__file__).resolve().parents[2]
VALIDATION_DIR = REPO / "validation"
# Put the validation directory first on sys.path so the project-local imports
# that follow can be resolved from it.
sys.path.insert(0, str(VALIDATION_DIR))

from gen_network_runner import gen_network_runner  # noqa: E402
from onnx_utils import _ONNX_TO_NP, gen_random_inputs, onnx_io, save_inputs_to_files, write_inputs_to_memory_bin  # noqa: E402
from validate_one import build_dump_ranges, parse_pim_simulator_outputs  # noqa: E402
from raptor import compile_with_raptor  # noqa: E402


@dataclass
class StepRecord:
    """Outcome of one logged pipeline step."""

    name: str  # human-readable step label
    duration_sec: float  # elapsed time measured with time.perf_counter()
    command: str  # shell-quoted command line associated with the step
    status: str = "passed"  # this file also records "failed" and "timeout"
    returncode: int | None = None  # process exit status, when one was recorded
    error: str | None = None  # short failure description
    output_tail: str | None = None  # trailing part of the captured output


@dataclass
class CompareResult:
    """Result of comparing simulator outputs against reference outputs."""

    passed: bool  # True only when every compared output was within tolerance
    max_diffs: dict[str, float]  # output name -> largest absolute difference
    status: str = "done"  # "done", or "skipped"/"failed" when no comparison ran
    error: str | None = None  # reason text for a skipped or failed validation


def load_pimcomp_exporter():
    """Load PIMCOMP's ``export_to_pim_simulator.py`` as a standalone module.

    The script is loaded by file path under the module name
    ``pimcomp_exporter`` and executed.

    Returns:
        The executed module object.

    Raises:
        ImportError: if no import spec/loader can be created for the script.
    """
    path = REPO / "third_party/PIMCOMP-NN/verification/export_to_pim_simulator.py"
    spec = importlib.util.spec_from_file_location("pimcomp_exporter", path)
    # Validate the spec before it is used; an assert would be skipped under -O
    # and used to run only after module_from_spec had already touched the spec.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot create an import spec for {path}")
    module = importlib.util.module_from_spec(spec)
    # Register an empty stand-in for cv2 unless a cv2 module is already loaded.
    # NOTE(review): presumably the exporter imports cv2 at module level but the
    # functions used by this script do not need it - confirm.
    sys.modules.setdefault("cv2", types.ModuleType("cv2"))
    spec.loader.exec_module(module)
    return module


def load_mesh_builder():
    """Load ``generate_mesh_config.py`` from the validation configs as a module.

    The script is loaded by file path under the module name ``mesh_builder``
    and executed.

    Returns:
        The executed module object.

    Raises:
        ImportError: if no import spec/loader can be created for the script.
    """
    path = REPO / "validation/pimsim-configs/generate_mesh_config.py"
    spec = importlib.util.spec_from_file_location("mesh_builder", path)
    # Validate the spec before it is used; an assert would be skipped under -O
    # and used to run only after module_from_spec had already touched the spec.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot create an import spec for {path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def shell_join(cmd: list[str]) -> str:
    """Render an argument vector as a single shell-quoted command line.

    Every element is converted with ``str`` first, so path-like arguments are
    accepted as well.
    """
    words = [str(part) for part in cmd]
    return shlex.join(words)


def print_step(name: str, cmd: list[str] | None = None, cwd: Path | None = None):
    """Print a step banner, plus the working directory and command if given.

    When ``cmd`` is ``None`` only the ``[name]`` banner is printed. When no
    ``cwd`` is supplied, the repository root is shown.
    """
    print(f"\n[{name}]")
    if cmd is None:
        return
    shown_dir = cwd or REPO
    print(f"  cwd: {shown_dir}")
    print(f"  $ {shell_join(cmd)}")


def output_tail(output: str | bytes | None, limit: int = 4000) -> str:
    """Return at most the last ``limit`` characters of captured output.

    Args:
        output: Captured text, raw bytes (decoded with replacement of invalid
            sequences), or ``None``.
        limit: Maximum number of trailing characters to keep.

    Returns:
        The trailing slice as ``str``; an empty string when ``output`` is
        ``None`` or ``limit`` is not positive.
    """
    if output is None:
        return ""
    if isinstance(output, bytes):
        output = output.decode(errors="replace")
    # output[-0:] is the whole string and a negative limit would slice from the
    # front, so a non-positive limit is handled explicitly.
    if limit <= 0:
        return ""
    return output[-limit:]


def exception_message(exc: BaseException) -> str:
    """Build a readable one-or-more-line description of an exception.

    ``subprocess.CalledProcessError`` and ``subprocess.TimeoutExpired`` get a
    message naming the command, followed by the tail of the captured output
    when there is any. Every other exception is rendered as
    ``"<TypeName>: <str(exc)>"``.
    """
    subprocess_errors = (subprocess.CalledProcessError, subprocess.TimeoutExpired)
    if not isinstance(exc, subprocess_errors):
        return f"{type(exc).__name__}: {exc}"

    # Only list commands are shell-quoted; any other form is shown via str().
    if isinstance(exc.cmd, list):
        rendered = shell_join([str(arg) for arg in exc.cmd])
    else:
        rendered = str(exc.cmd)
    captured = output_tail(exc.output)

    if isinstance(exc, subprocess.CalledProcessError):
        text = f"command failed with exit code {exc.returncode}: {rendered}"
    else:
        text = f"command timed out after {exc.timeout} seconds: {rendered}"
    if captured:
        text += f"\n--- output tail ---\n{captured}"
    return text


def print_failure(name: str, exc: BaseException | str) -> None:
    """Print a failure banner followed by at most 20 lines of the message.

    ``exc`` may be a ready-made message string or an exception, which is
    converted with ``exception_message``.
    """
    if isinstance(exc, str):
        text = exc
    else:
        text = exception_message(exc)
    print(f"\n[{name} FAILED]")
    shown_rows = text.splitlines()[:20]
    for row in shown_rows:
        print(f"  {row}")


def run_logged(
    name: str,
    cmd: list[str],
    *,
    cwd: Path,
    timeout_sec: float,
    steps: list[StepRecord],
) -> str:
    """Run a command, append a ``StepRecord`` describing it, and return its output.

    stdout and stderr are captured together as text.

    Args:
        name: Step label used for the banner and the record.
        cmd: Argument vector to execute.
        cwd: Working directory for the child process.
        timeout_sec: Timeout passed to ``subprocess.run``.
        steps: List that receives exactly one ``StepRecord`` for this call.

    Returns:
        The combined stdout/stderr text of a successful run.

    Raises:
        subprocess.TimeoutExpired: the command exceeded ``timeout_sec``.
        subprocess.CalledProcessError: the command exited non-zero; ``output``
            carries the output tail.
        OSError: the command could not be started (e.g. missing executable).
    """
    print_step(name, cmd, cwd)
    start = time.perf_counter()
    command = shell_join(cmd)
    try:
        proc = subprocess.run(
            cmd,
            cwd=cwd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            timeout=timeout_sec,
        )
    except subprocess.TimeoutExpired as exc:
        duration = time.perf_counter() - start
        tail = output_tail(exc.output)
        steps.append(
            StepRecord(
                name=name,
                duration_sec=duration,
                command=command,
                status="timeout",
                error=f"Timed out after {timeout_sec} seconds",
                output_tail=tail or None,
            )
        )
        raise
    except OSError as exc:
        # Launch failures used to propagate without leaving any record, so the
        # failing step was missing from the collected steps.
        steps.append(
            StepRecord(
                name=name,
                duration_sec=time.perf_counter() - start,
                command=command,
                status="failed",
                error=f"{type(exc).__name__}: {exc}",
            )
        )
        raise

    duration = time.perf_counter() - start
    if proc.returncode != 0:
        tail = output_tail(proc.stdout)
        steps.append(
            StepRecord(
                name=name,
                duration_sec=duration,
                command=command,
                status="failed",
                returncode=proc.returncode,
                error=f"Exited with status {proc.returncode}",
                output_tail=tail or None,
            )
        )
        raise subprocess.CalledProcessError(proc.returncode, cmd, output=tail)

    steps.append(StepRecord(name=name, duration_sec=duration, command=command))
    return proc.stdout


def remove_tree(path: Path) -> None:
    """Delete ``path`` whether it is a file, a symlink, or a directory tree.

    Symlinks are unlinked rather than followed. A missing path is a no-op.
    Directories are emptied recursively; the listing is repeated until the
    directory reports no children, then the directory itself is removed.
    """
    if not path.exists() and not path.is_symlink():
        return
    if path.is_symlink() or path.is_file():
        path.unlink()
        return
    remaining = list(path.iterdir())
    while remaining:
        for entry in remaining:
            remove_tree(entry)
        # List again in case entries appeared during the previous pass.
        remaining = list(path.iterdir())
    path.rmdir()


def load_model_inputs(model_path: Path, seed: int):
    """Assemble input arrays for every graph input of an ONNX model.

    Graph inputs that are backed by an initializer take the initializer's value
    (cast to the NumPy dtype mapped from the input's element type); all other
    inputs receive seeded random data from ``gen_random_inputs``.

    Returns:
        ``(inputs_desc, outputs_desc, arrays_in_order, runtime_only)`` where
        ``arrays_in_order`` follows ``inputs_desc`` and ``runtime_only`` holds
        only the arrays of inputs that are not initializers, in the same order.
    """
    model = onnx.load(model_path)
    initializer_values = {}
    for init in model.graph.initializer:
        initializer_values[init.name] = onnx.numpy_helper.to_array(init)

    inputs_desc, outputs_desc = onnx_io(model_path)
    runtime_desc = [desc for desc in inputs_desc if desc[1] not in initializer_values]
    runtime_arrays, _ = gen_random_inputs(runtime_desc, seed=seed)
    runtime_by_name = {desc[1]: arr for desc, arr in zip(runtime_desc, runtime_arrays)}

    arrays_in_order = []
    runtime_only = []
    for _, name, elem_type, _ in inputs_desc:
        if name in initializer_values:
            target_dtype = _ONNX_TO_NP[elem_type]
            arrays_in_order.append(initializer_values[name].astype(target_dtype, copy=False))
            continue
        generated = runtime_by_name[name]
        arrays_in_order.append(generated)
        runtime_only.append(generated)
    return inputs_desc, outputs_desc, arrays_in_order, runtime_only


def compare_simulator_outputs(
    output_bin: Path,
    outputs_desc: list[tuple[int, str, int, list[int]]],
    reference_dir: Path,
    *,
    threshold: float,
    rtol: float,
) -> CompareResult:
    """Compare simulator outputs against reference CSV files.

    For each output the reference is read from
    ``output<idx>_<sanitized name>.csv`` in ``reference_dir`` and reshaped to
    the described shape. An output passes when every element satisfies
    ``|sim - ref| <= threshold + rtol * |ref|`` (evaluated in float64).

    Returns:
        A ``CompareResult`` with the overall verdict and the maximum absolute
        difference per output name.
    """
    simulated = parse_pim_simulator_outputs(output_bin, outputs_desc)
    worst_by_name: dict[str, float] = {}
    all_within = True
    for produced, (index, out_name, _, dims) in zip(simulated, outputs_desc):
        ref_file = reference_dir / f"output{index}_{sanitize_output_name(out_name)}.csv"
        expected = np.loadtxt(ref_file, delimiter=",", dtype=np.float32).reshape(dims)
        expected64 = expected.astype(np.float64)
        delta = np.abs(produced.astype(np.float64) - expected64)
        tolerance = threshold + rtol * np.abs(expected64)
        worst_by_name[out_name] = float(np.max(delta))
        if np.any(~(delta <= tolerance)):
            all_within = False
    return CompareResult(passed=all_within, max_diffs=worst_by_name)


def sanitize_output_name(name: str) -> str:
    """Turn an output name into a file-name-safe token.

    Only the first 255 characters are considered. Alphanumeric characters and
    ``_``, ``.``, ``-`` are kept; every other character becomes ``_``.
    """
    kept_punctuation = "_.-"
    cleaned = []
    for ch in name[:255]:
        if ch.isalnum() or ch in kept_punctuation:
            cleaned.append(ch)
        else:
            cleaned.append("_")
    return "".join(cleaned)


def load_effective_hardware(args: argparse.Namespace) -> dict[str, int]:
    """Merge hardware parameters from PIMCOMP's ``config.json`` with CLI overrides.

    Any truthy value of ``args.mesh_rows``, ``args.mesh_cols``,
    ``args.crossbar_count``, ``args.crossbar_size`` or ``args.core_count``
    wins over the value from the config file. Without an explicit core count,
    ``mesh_rows * mesh_cols`` is used.

    Raises:
        ValueError: if the crossbar size in the config file is not square.
    """
    with open(args.pimcomp_dir / "config.json", "r", encoding="utf-8") as f:
        config = json.load(f)
    chip = config["chip_config"]
    rows, cols = chip["network_config"]["layout"]
    matrix_cfg = chip["core_config"]["matrix_config"]
    xbar_h, xbar_w = matrix_cfg["xbar_size"]

    hardware = {}
    hardware["mesh_rows"] = args.mesh_rows or rows
    hardware["mesh_cols"] = args.mesh_cols or cols
    hardware["crossbar_count"] = args.crossbar_count or matrix_cfg["xbar_array_count"]
    hardware["crossbar_size"] = args.crossbar_size or xbar_h
    if xbar_h != xbar_w:
        raise ValueError(f"Only square crossbars are supported, got {xbar_h}x{xbar_w}")
    mesh_cores = hardware["mesh_rows"] * hardware["mesh_cols"]
    hardware["core_count"] = args.core_count or mesh_cores
    return hardware


def write_pimsim_config(args: argparse.Namespace, out_dir: Path, hardware: dict[str, int]) -> Path:
    """Write a pimsim-nn config and its mesh network description into ``out_dir``.

    The example ``latency_config.json`` shipped with pimsim-nn serves as the
    template. Crossbar geometry, mesh layout, core count, simulation mode
    (1 for ``"latency"``, otherwise 0) and simulation time are overwritten from
    ``hardware`` and ``args``. The network file is produced by the mesh
    builder's ``build_network``.

    Returns:
        Path of the written ``<pimsim_mode>_config.json``.
    """
    mesh_builder = load_mesh_builder()
    template = REPO / "backend-simulators/pim/pimsim-nn/example/config/latency_config.json"
    with open(template, "r", encoding="utf-8") as f:
        config = json.load(f)

    chip = config["chip_config"]
    matrix = chip["core_config"]["matrix_config"]
    matrix["xbar_array_count"] = hardware["crossbar_count"]
    matrix["xbar_size"] = [hardware["crossbar_size"], hardware["crossbar_size"]]

    network = chip["network_config"]
    network["layout"] = [hardware["mesh_rows"], hardware["mesh_cols"]]
    network_file_name = f"network_mesh_{hardware['core_count']}.json"
    network["net_config_file_path"] = network_file_name
    chip["core_cnt"] = hardware["core_count"]

    sim = config["sim_config"]
    sim["sim_mode"] = 1 if args.pimsim_mode == "latency" else 0
    sim["sim_time"] = args.pimsim_time_ms

    out_dir.mkdir(parents=True, exist_ok=True)
    config_path = out_dir / f"{args.pimsim_mode}_config.json"
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)
        f.write("\n")
    with open(out_dir / network_file_name, "w", encoding="utf-8") as f:
        mesh = mesh_builder.build_network(
            hardware["core_count"],
            (hardware["mesh_rows"], hardware["mesh_cols"]),
        )
        json.dump(mesh, f, separators=(",", ":"))
        f.write("\n")
    return config_path


def compile_reference(
    args: argparse.Namespace,
    model_path: Path,
    work_dir: Path,
    steps: list[StepRecord],
) -> tuple[Path, Path, Path]:
    """Build the native reference: ONNX IR, shared library and runner binary.

    Steps, each logged through ``run_logged``: emit ONNX IR with the Raptor
    binary, compile the model natively with ``-O3``, generate ``runner.c``,
    then configure and build the runner with CMake.

    Returns:
        ``(network_mlir, network_so, runner_binary)`` paths.
    """
    raptor_dir = work_dir / "reference"
    runner_dir = work_dir / "runner"
    build_dir = runner_dir / "build"
    for directory in (raptor_dir, build_dir):
        directory.mkdir(parents=True, exist_ok=True)
    onnx_ir_base = raptor_dir / model_path.stem
    runner_base = runner_dir / model_path.stem

    def run(label: str, command: list[str], where: Path) -> None:
        run_logged(label, command, cwd=where, timeout_sec=args.timeout_seconds, steps=steps)

    run(
        "Reference Emit ONNX IR",
        [str(args.raptor_path), str(model_path), "-o", str(onnx_ir_base), "--EmitONNXIR"],
        REPO,
    )
    run(
        "Reference Native Compile",
        [str(args.raptor_path), "-O3", str(model_path), "-o", str(runner_base)],
        REPO,
    )
    # NOTE(review): with_suffix replaces the text after the last dot, so a model
    # stem that itself contains a dot yields a different base name - confirm
    # that such model names are not used.
    network_so = runner_base.with_suffix(".so")
    network_mlir = onnx_ir_base.with_suffix(".onnx.mlir")

    print_step("Generate Runner Source")
    gen_network_runner(model_path, network_so, args.onnx_include_dir, out=runner_dir / "runner.c", verbose=False)

    run(
        "Configure Runner",
        ["cmake", str(runner_dir), "-DCMAKE_BUILD_TYPE=Release", "-DCMAKE_C_FLAGS_RELEASE=-O3"],
        build_dir,
    )
    run("Build Runner", ["cmake", "--build", ".", "-j"], build_dir)
    return network_mlir, network_so, build_dir / "runner"


def generate_reference_outputs(
    runner_path: Path,
    runner_build_dir: Path,
    model_path: Path,
    arrays_in_order: list[np.ndarray],
    steps: list[StepRecord],
    args: argparse.Namespace,
    out_dir: Path,
) -> Path:
    """Run the reference runner and have it save its outputs as CSV files.

    Inputs are written under ``out_dir/inputs`` via ``save_inputs_to_files``;
    the flags it returns are passed to the runner together with
    ``--save-csv-dir``.

    Returns:
        The ``out_dir/reference_outputs`` directory.
    """
    inputs_dir = out_dir / "inputs"
    reference_dir = out_dir / "reference_outputs"
    for directory in (inputs_dir, reference_dir):
        directory.mkdir(parents=True, exist_ok=True)

    input_flags, _ = save_inputs_to_files(model_path, arrays_in_order, inputs_dir)
    command = [str(runner_path)]
    command.extend(input_flags)
    command.extend(["--save-csv-dir", str(reference_dir)])
    run_logged(
        "Run Reference",
        command,
        cwd=runner_build_dir,
        timeout_sec=args.timeout_seconds,
        steps=steps,
    )
    return reference_dir


def compile_raptor_target(
    model_mlir: Path,
    out_dir: Path,
    hardware: dict[str, int],
    args: argparse.Namespace,
    steps: list[StepRecord],
) -> tuple[Path, dict[str, float]]:
    """Compile the model for the PIM target with Raptor and record the step.

    The compilation itself is delegated to ``compile_with_raptor``; the command
    line assembled here is only used for the log and the ``StepRecord``.

    Args:
        model_mlir: Input model passed to the compiler.
        out_dir: Output directory (created if missing); also used as ``cwd``.
        hardware: Needs ``crossbar_size``, ``crossbar_count`` and ``core_count``.
        args: Provides ``raptor_path``, ``raptor_extra_arg``,
            ``verbose_raptor_compile`` and ``timeout_seconds``.
        steps: Receives one ``StepRecord`` (passed or failed).

    Returns:
        ``(out_dir / "pim", timings)`` where ``timings`` is whatever
        ``compile_with_raptor`` returned.

    Raises:
        Exception: anything raised by ``compile_with_raptor`` is re-raised
            after a failed step has been recorded.
    """
    step_name = "Compile Raptor PIM"
    out_dir.mkdir(parents=True, exist_ok=True)
    output_base = out_dir / "model"
    raptor_extra_args = ["--pim-emit-json", *args.raptor_extra_arg]
    cmd = [
        str(args.raptor_path),
        str(model_mlir),
        "-o",
        str(output_base),
        "--maccel=PIM",
        "--EmitPimCodegen",
        f"--crossbar-size={hardware['crossbar_size']}",
        f"--crossbar-count={hardware['crossbar_count']}",
        f"--core-count={hardware['core_count']}",
        *raptor_extra_args,
    ]
    # The compile below is invoked with cwd=out_dir, so log that directory
    # (the banner used to show the repository root instead).
    print_step(step_name, cmd, out_dir)
    start = time.perf_counter()
    command = shell_join(cmd)
    try:
        timings = compile_with_raptor(
            model_mlir,
            args.raptor_path,
            output_base,
            hardware["crossbar_size"],
            hardware["crossbar_count"],
            core_count=hardware["core_count"],
            raptor_extra_args=raptor_extra_args,
            cwd=out_dir,
            verbose=args.verbose_raptor_compile,
            timeout_sec=args.timeout_seconds,
        )
    except Exception as exc:
        # Record the failure for the report, then let the caller handle it.
        steps.append(
            StepRecord(
                name=step_name,
                duration_sec=time.perf_counter() - start,
                command=command,
                status="failed",
                error=exception_message(exc),
            )
        )
        raise

    steps.append(
        StepRecord(
            name=step_name,
            duration_sec=time.perf_counter() - start,
            command=command,
        )
    )
    return out_dir / "pim", timings


def run_rust_validation(
    label: str,
    pim_dir: Path,
    config_path: Path,
    outputs_desc: list[tuple[int, str, int, list[int]]],
    reference_dir: Path,
    steps: list[StepRecord],
    args: argparse.Namespace,
) -> CompareResult:
    """Run the ``pim-simulator`` cargo binary and compare its output dump.

    The simulator is run via ``cargo run --release`` from
    ``args.pim_simulator_dir``. Its dump goes to
    ``<pim_dir parent>/semantic_validation/out.bin`` and is compared with the
    reference CSVs using ``args.threshold`` and ``args.rtol``.
    """
    simulation_dir = pim_dir.parent / "semantic_validation"
    output_bin = simulation_dir / "out.bin"
    dump_ranges = build_dump_ranges(config_path, outputs_desc)

    cargo_invocation = [
        "cargo",
        "run",
        "--no-default-features",
        "--release",
        "--package",
        "pim-simulator",
        "--bin",
        "pim-simulator",
        "--",
    ]
    simulator_args = ["-f", str(pim_dir), "-o", str(output_bin), "-d", dump_ranges]

    simulation_dir.mkdir(parents=True, exist_ok=True)
    run_logged(
        label,
        cargo_invocation + simulator_args,
        cwd=args.pim_simulator_dir,
        timeout_sec=args.timeout_seconds,
        steps=steps,
    )
    return compare_simulator_outputs(
        output_bin,
        outputs_desc,
        reference_dir,
        threshold=args.threshold,
        rtol=args.rtol,
    )


def copy_pimcomp_outputs(args: argparse.Namespace, out_dir: Path):
    """Copy the three PIMCOMP result files from ``<pimcomp_dir>/output`` to ``out_dir``.

    ``out_dir`` is created when missing; files are copied with ``shutil.copy2``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    source_dir = args.pimcomp_dir / "output"
    artifacts = ("SimulationInfo.gz", "VerificationInfo.json", "MappingResult.txt")
    for artifact in artifacts:
        shutil.copy2(source_dir / artifact, out_dir / artifact)


def compile_pimcomp(
    args: argparse.Namespace,
    model_path: Path,
    out_dir: Path,
    steps: list[StepRecord],
) -> tuple[Path, Path]:
    """Run the PIMCOMP frontend and backend, then collect their outputs.

    The frontend converts the model into ``models/JSON/compare_<stem>.json``
    inside the PIMCOMP checkout. The backend binary then runs from the
    ``build`` directory with that model name. Result files are finally copied
    into ``out_dir``.

    Returns:
        ``(VerificationInfo.json, SimulationInfo.gz)`` paths inside ``out_dir``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    model_name = f"compare_{model_path.stem}"
    pimcomp_root = args.pimcomp_dir
    frontend_json = pimcomp_root / "models/JSON" / f"{model_name}.json"

    stages = [
        (
            "PIMCOMP Frontend",
            ["python3", "frontend.py", "--model_path", str(model_path), "--save_path", str(frontend_json)],
            pimcomp_root / "frontend",
        ),
        (
            "PIMCOMP Backend",
            [str(pimcomp_root / "build" / "PIMCOMP-NN"), f"-m={model_name}", "-p=batch", "-v=YES", "-s=YES"],
            pimcomp_root / "build",
        ),
    ]
    for label, command, workdir in stages:
        run_logged(
            label,
            command,
            cwd=workdir,
            timeout_sec=args.timeout_seconds,
            steps=steps,
        )

    copy_pimcomp_outputs(args, out_dir)
    return out_dir / "VerificationInfo.json", out_dir / "SimulationInfo.gz"


def export_pimcomp_for_pimsim_nn(simulation_info: Path, output_dir: Path) -> Path:
    """Split a PIMCOMP ``SimulationInfo.gz`` into per-core JSON files plus a config.

    Only cores with a non-empty instruction list are exported, and their
    indices must form the contiguous range ``0..N-1``. Any existing
    ``output_dir`` is removed first.

    Returns:
        ``output_dir``.

    Raises:
        ValueError: if no core has instructions or the core numbering has gaps.
    """
    if output_dir.exists():
        remove_tree(output_dir)
    with gzip.open(simulation_info, "rt", encoding="utf-8") as f:
        sim_info = json.load(f)

    output_dir.mkdir(parents=True, exist_ok=True)
    sim_config = sim_info["config"]

    active_cores = []
    for key, value in sim_info.items():
        if key.startswith("core") and isinstance(value, list) and value:
            active_cores.append(int(key[4:]))
    active_cores.sort()
    if not active_cores:
        raise ValueError("PIMCOMP SimulationInfo.gz does not contain any non-empty core instruction streams")
    if active_cores != list(range(active_cores[-1] + 1)):
        raise ValueError(f"PIMCOMP core numbering is not contiguous: {active_cores}")

    def dump_compact(target: Path, payload) -> None:
        # Compact separators plus a trailing newline, as for every exported file.
        with open(target, "w", encoding="utf-8") as f:
            json.dump(payload, f, separators=(",", ":"))
            f.write("\n")

    config = {
        "core_cnt": len(active_cores),
        "xbar_size": sim_config["xbar_size"],
        "xbar_array_count": sim_config["xbar_array_count"],
        "cell_precision": sim_config["cell_precision"],
        "adc_count": sim_config["adc_count"],
        "array_group_map": {},
    }
    config["array_group_map"].update(
        {f"core{idx}": sim_config["array_group_map"].get(f"core{idx}", []) for idx in active_cores}
    )
    dump_compact(output_dir / "config.json", config)

    for idx in active_cores:
        dump_compact(output_dir / f"core_{idx}.json", sim_info[f"core{idx}"])
    return output_dir


def flatten_pimcomp_input(array: np.ndarray) -> np.ndarray:
    """Return the input as a flat float32 vector.

    4-D arrays have their axes permuted with ``(0, 2, 3, 1)`` before
    flattening; arrays of any other rank are flattened as they are.
    """
    as_float32 = array.astype(np.float32, copy=False)
    if as_float32.ndim != 4:
        return as_float32.reshape(-1)
    return as_float32.transpose((0, 2, 3, 1)).reshape(-1)


def export_pimcomp_for_rust(
    model_path: Path,
    verification_info: Path,
    simulation_info: Path,
    runtime_inputs: list[np.ndarray],
    output_dir: Path,
) -> Path:
    """Translate PIMCOMP compiler output into a simulator input directory.

    Writes into ``output_dir`` (removed first if it exists):

    * ``config.json`` - core/crossbar parameters plus input/output addresses,
    * ``memory.bin`` - a flat memory image holding the input tensor, biases and
      constant blocks for ``lldi`` instructions, followed by the output region,
    * ``weights/crossbar_<n>.bin`` - one float32 tile per crossbar, symlinked
      into ``core_<i>/crossbar_<k>.bin``,
    * ``core_<i>.json`` - the translated instruction stream of each core.

    Args:
        model_path: ONNX model passed to the exporter's ``load_model_info``.
        verification_info: PIMCOMP ``VerificationInfo.json``.
        simulation_info: PIMCOMP ``SimulationInfo.gz`` (gzip-compressed JSON).
        runtime_inputs: Must contain exactly one tensor.
        output_dir: Destination directory.

    Returns:
        ``output_dir``.

    Raises:
        ValueError: if ``runtime_inputs`` does not hold exactly one tensor.
        RuntimeError: if simulation and verification instruction streams do not
            line up, or an unsupported op / LD stage is encountered.
    """
    if len(runtime_inputs) != 1:
        raise ValueError("PIMCOMP export currently requires exactly one runtime input tensor")
    if output_dir.exists():
        remove_tree(output_dir)
    exporter = load_pimcomp_exporter()
    with open(verification_info, "r", encoding="utf-8") as f:
        final_info = json.load(f)
    with gzip.open(simulation_info, "rt", encoding="utf-8") as f:
        sim_info = json.load(f)

    onnx_model, weights, gemm_weights, output_to_weight, output_to_bias = exporter.load_model_info(
        model_path, final_info
    )
    input_tensor = flatten_pimcomp_input(runtime_inputs[0])
    node_list = final_info["node_list"]
    max_output = exporter.max_output_element_num(node_list)
    local_group_map = exporter.map_local_groups(final_info, sim_info)

    output_dir.mkdir(parents=True, exist_ok=True)
    weights_dir = output_dir / "weights"
    weights_dir.mkdir(parents=True, exist_ok=True)

    # ---- Memory layout: input at address 0, then biases, then lldi constants.
    # NOTE(review): exporter.byte_offset presumably converts an element count
    # into a byte offset - confirm in the exporter module.
    input_addr = 0
    cursor = exporter.byte_offset(len(input_tensor))
    bias_addrs: dict[str, int] = {}
    for node_name, bias_name in output_to_bias.items():
        bias = weights[bias_name].astype(np.float32).flatten()
        bias_addrs[node_name] = cursor
        cursor += exporter.byte_offset(len(bias))

    # One constant block per distinct (immediate value, length) pair used by any
    # lldi instruction; identical pairs share the same address.
    lldi_addrs: dict[tuple[bytes, int], int] = {}
    for core_idx in range(sim_info["config"]["core_cnt"]):
        for inst in sim_info.get(f"core{core_idx}", []) or []:
            if inst["op"] != "lldi":
                continue
            key = (exporter.float32_bytes(inst["imm"]), inst["len"])
            if key not in lldi_addrs:
                lldi_addrs[key] = cursor
                cursor += exporter.byte_offset(inst["len"])

    # The output region starts at the next multiple of 256 and reserves one slot
    # of max_output elements per node.
    output_base = (cursor + 255) & ~255
    memory_size = output_base + exporter.byte_offset(max_output * len(node_list))
    memory = bytearray(memory_size)
    memory[input_addr : input_addr + input_tensor.nbytes] = input_tensor.tobytes()
    for node_name, bias_name in output_to_bias.items():
        bias = weights[bias_name].astype(np.float32).flatten()
        start = bias_addrs[node_name]
        memory[start : start + bias.nbytes] = bias.tobytes()
    for (value_bytes, element_num), start in lldi_addrs.items():
        value = np.frombuffer(value_bytes, dtype=np.float32)[0]
        blob = np.full(element_num, value, dtype=np.float32)
        memory[start : start + blob.nbytes] = blob.tobytes()

    config = {
        "core_cnt": sim_info["config"]["core_cnt"],
        "xbar_size": sim_info["config"]["xbar_size"],
        "xbar_array_count": sim_info["config"]["xbar_array_count"],
        "cell_precision": sim_info["config"]["cell_precision"],
        "adc_count": sim_info["config"]["adc_count"],
        "array_group_map": {},
        "inputs_addresses": [input_addr],
        "outputs_addresses": [],
    }
    # Each graph output is looked up by name in node_list; its address is the
    # start of that node's slot in the output region.
    output_name_to_node = {node["name"]: node for node in node_list}
    for graph_output in onnx_model.graph.output:
        node = output_name_to_node[graph_output.name]
        config["outputs_addresses"].append(output_base + exporter.byte_offset(node["new_node_index"] * max_output))

    ag_info = final_info["AG_info"]
    weight_counter = 0
    xbar_size = int(sim_info["config"]["xbar_size"][0])
    for core_idx in range(config["core_cnt"]):
        core_name = f"core{core_idx}"
        core_dir = output_dir / f"core_{core_idx}"
        core_dir.mkdir(parents=True, exist_ok=True)
        local_to_global = local_group_map.get(core_idx, {})
        ag_counts = sim_info["config"]["array_group_map"].get(core_name, [])
        # group_prefix[g] is the running sum of the counts before group g, i.e.
        # the index of the first physical crossbar belonging to local group g.
        group_prefix = []
        total_crossbars = 0
        for count in ag_counts:
            group_prefix.append(total_crossbars)
            total_crossbars += count
        config["array_group_map"][core_name] = list(range(total_crossbars))

        # ---- Weight tiles: one file per crossbar, symlinked into the core dir.
        for local_group, global_ag in sorted(local_to_global.items()):
            info = ag_info[global_ag]
            weight_name = output_to_weight[info["node_name"]]
            matrix = gemm_weights[weight_name]
            # height_end / width_end are treated as inclusive bounds (+ 1 below).
            row_slice = slice(info["height_start"], info["height_end"] + 1)
            first_physical = group_prefix[local_group]
            for crossbar_idx, crossbar in enumerate(info["crossbar"]):
                col_slice = slice(crossbar["width_start"], crossbar["width_end"] + 1)
                # Tile always has xbar_size rows; rows beyond the slice stay zero.
                tile = np.zeros((xbar_size, col_slice.stop - col_slice.start), dtype=np.float32)
                tile_rows = matrix[row_slice, col_slice].astype(np.float32)
                tile[: tile_rows.shape[0], :] = tile_rows
                weight_path = weights_dir / f"crossbar_{weight_counter}.bin"
                weight_path.write_bytes(tile.tobytes(order="C"))
                os.symlink(weight_path.resolve(), core_dir / f"crossbar_{first_physical + crossbar_idx}.bin")
                weight_counter += 1

        # ---- Instruction translation for this core.
        instructions = []
        # Register -> index (in `instructions`) of the most recent translated
        # sldi from the simulation stream that wrote it; used for back-patching.
        last_sldi_by_rd: dict[int, int] = {}
        ver_ops = exporter.filtered_verification_ops(final_info, core_idx)
        ver_index = 0
        for sim_inst in sim_info.get(core_name, []) or []:
            op = sim_inst["op"]
            # setbw and sldi have no verification counterpart and do not advance
            # ver_index.
            if op == "setbw":
                instructions.append(sim_inst)
                continue
            if op == "sldi":
                translated = {"op": "sldi", "rd": sim_inst["rd"], "imm": exporter.byte_offset(sim_inst["imm"])}
                instructions.append(translated)
                last_sldi_by_rd[sim_inst["rd"]] = len(instructions) - 1
                continue
            # Every other op is paired, in order, with one verification op.
            if ver_index >= len(ver_ops):
                raise RuntimeError(f"core{core_idx}: simulation op {op} has no matching verification op")
            ver_inst = ver_ops[ver_index]
            ver_index += 1
            ver_op = ver_inst["operation"].lower()
            if ver_op != op:
                raise RuntimeError(
                    f"core{core_idx}: simulation/verification op mismatch {op} vs {ver_op} at {ver_index - 1}"
                )
            if op == "ld":
                # Resolve the global source address from the verification info
                # and patch it into the sldi that last set the address register.
                if ver_inst["stage"] == "INPUT":
                    src = input_addr + exporter.byte_offset(ver_inst["source_offset"])
                elif ver_inst["stage"] == "BIAS":
                    src = bias_addrs[node_list[ver_inst["node_index"]]["name"]] + exporter.byte_offset(ver_inst["source_offset"])
                else:
                    raise RuntimeError(f"Unsupported LD stage {ver_inst['stage']}")
                instructions[last_sldi_by_rd[sim_inst["rs1"]]]["imm"] = src
                translated = dict(sim_inst)
                translated["size"] = exporter.byte_offset(sim_inst["size"])
                instructions.append(translated)
            elif op == "st":
                # Destination is the node's output slot plus the store offset;
                # patched into the sldi that last set the rd register.
                dst = output_base + exporter.byte_offset(
                    ver_inst["node_index"] * max_output + ver_inst["destination_offset"]
                )
                instructions[last_sldi_by_rd[sim_inst["rd"]]]["imm"] = dst
                translated = dict(sim_inst)
                translated["size"] = exporter.byte_offset(sim_inst["size"])
                instructions.append(translated)
            elif op == "lldi":
                # Replaced by: sldi <temp register> = constant block address,
                # then ld from that address into the original destination.
                # NOTE(review): the temp register (0 or 1) is overwritten without
                # being recorded in last_sldi_by_rd - confirm that no later
                # instruction relies on its previous value.
                key = (exporter.float32_bytes(sim_inst["imm"]), sim_inst["len"])
                src = lldi_addrs[key]
                temp_rd = 1 if sim_inst["rd"] == 0 else 0
                instructions.append({"op": "sldi", "rd": temp_rd, "imm": src})
                instructions.append(
                    {
                        "op": "ld",
                        "rd": sim_inst["rd"],
                        "rs1": temp_rd,
                        "size": exporter.byte_offset(sim_inst["len"]),
                        "offset": sim_inst["offset"],
                    }
                )
            elif op in ("lmv", "vvadd", "vvmul", "vvmax", "vrelu"):
                translated = dict(sim_inst)
                translated["len"] = exporter.byte_offset(sim_inst["len"])
                instructions.append(translated)
            elif op in ("send", "recv"):
                translated = dict(sim_inst)
                translated["size"] = exporter.byte_offset(sim_inst["size"])
                instructions.append(translated)
            elif op == "mvmul":
                # One group-level mvmul becomes one mvmul per physical crossbar.
                # Before each, rd is set to the destination advanced by the
                # widths of the preceding crossbars and rs1 is reset to the
                # source. These inserted sldi are not recorded in last_sldi_by_rd.
                local_group = sim_inst["group"]
                global_ag = local_to_global[local_group]
                first_physical = group_prefix[local_group]
                widths = [
                    crossbar["width_end"] - crossbar["width_start"] + 1
                    for crossbar in ag_info[global_ag]["crossbar"]
                ]
                dst = instructions[last_sldi_by_rd[sim_inst["rd"]]]["imm"]
                src = instructions[last_sldi_by_rd[sim_inst["rs1"]]]["imm"]
                out_offset = 0
                for idx, width in enumerate(widths):
                    instructions.append({"op": "sldi", "rd": sim_inst["rd"], "imm": dst + exporter.byte_offset(out_offset)})
                    instructions.append({"op": "sldi", "rd": sim_inst["rs1"], "imm": src})
                    translated = dict(sim_inst)
                    translated["group"] = first_physical + idx
                    instructions.append(translated)
                    out_offset += width
            else:
                raise RuntimeError(f"Unsupported PIMCOMP op {op}")

        with open(output_dir / f"core_{core_idx}.json", "w", encoding="utf-8") as f:
            json.dump(instructions, f, separators=(",", ":"))
            f.write("\n")

    with open(output_dir / "config.json", "w", encoding="utf-8") as f:
        json.dump(config, f, separators=(",", ":"))
        f.write("\n")
    (output_dir / "memory.bin").write_bytes(memory)
    return output_dir


def parse_pimsim_nn_report(output: str) -> dict[str, float | int | str]:
    """Extract the numeric fields of a pimsim-nn report from its text output.

    Args:
        output: Captured simulator output.

    Returns:
        A dict that always contains ``"raw_output"`` and, for every field found
        in the text, one of ``output_count`` (int), ``throughput``,
        ``average_latency_ms``, ``latency_ms``, ``average_power_mw``,
        ``average_energy_pj`` (floats). Fields that are absent are omitted.
    """
    number = r"([0-9.eE+-]+)"
    # Each key maps to candidate patterns tried in order; the first hit wins.
    patterns: dict[str, tuple[str, ...]] = {
        "output_count": (r"output count:\s+([0-9]+)\s+samples",),
        # Accept exponent notation like every other float field does.
        "throughput": (rf"throughput:\s+{number}\s+samples/s",),
        "average_latency_ms": (rf"average latency:\s+{number}\s+ms",),
        # Prefer a "latency:" that is not the tail of "average latency:", so the
        # average line cannot shadow a separate latency line. The unanchored
        # pattern is kept as a fallback for reports with only the average line.
        "latency_ms": (
            rf"(?<!average\s)latency:\s+{number}\s+ms",
            rf"latency:\s+{number}\s+ms",
        ),
        "average_power_mw": (rf"average power:\s+{number}\s+mW",),
        "average_energy_pj": (rf"average energy:\s+{number}\s+pJ/it",),
    }
    result: dict[str, float | int | str] = {"raw_output": output}
    for key, candidates in patterns.items():
        match = None
        for pattern in candidates:
            match = re.search(pattern, output)
            if match:
                break
        if match:
            value = match.group(1)
            result[key] = int(value) if key == "output_count" else float(value)
    return result


def run_pimsim_nn(
    label: str,
    inst_path: Path,
    config_path: Path,
    single_file: bool,
    steps: list[StepRecord],
    args: argparse.Namespace,
) -> dict[str, Any]:
    """Run pimsim-nn's ``ChipTest`` binary and parse the report it prints.

    The binary is taken from ``args.pimsim_nn_build_dir`` and executed there.
    It runs with ten times the regular step timeout. ``single_file`` is passed
    on as the literal ``"true"`` or ``"false"``.
    """
    build_dir = args.pimsim_nn_build_dir
    single_file_flag = "true" if single_file else "false"
    command = [
        str(build_dir / "ChipTest"),
        str(inst_path),
        str(config_path),
        single_file_flag,
    ]
    report_text = run_logged(
        label,
        command,
        cwd=build_dir,
        timeout_sec=args.timeout_seconds * 10.0,
        steps=steps,
    )
    return parse_pimsim_nn_report(report_text)


def parse_raptor_instructions(pim_dir: Path) -> dict[str, Any]:
    """Count instruction opcodes in the ``core_<n>.json`` files of ``pim_dir``.

    The files are scanned as raw bytes for ``"op":"<name>"`` occurrences rather
    than parsed as JSON, so only compactly serialized files (no whitespace
    around the colon) are counted.

    Args:
        pim_dir: Directory holding ``core_<n>.json`` files.

    Returns:
        A summary dict with ``active_cores``, ``total_instructions``,
        ``op_counts`` and three top-10 rankings of cores (by total, send and
        recv counts).
    """
    op_re = re.compile(br'"op":"([^"]+)"')
    counts = Counter()
    per_core = []
    for path in sorted(pim_dir.glob("core_*.json"), key=lambda p: int(p.stem.split("_")[1])):
        if path.stat().st_size == 0:
            # mmap refuses to map a zero-length file; an empty file simply
            # contributes no instructions.
            core_counts = Counter()
        else:
            # Context managers guarantee the mapping and the file are released
            # even if scanning fails.
            with path.open("rb") as f, mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
                core_counts = Counter(m.group(1).decode() for m in op_re.finditer(mm))
        total = sum(core_counts.values())
        counts.update(core_counts)
        per_core.append(
            {
                "core": path.stem,
                "total": total,
                "send": core_counts.get("send", 0),
                "recv": core_counts.get("recv", 0),
                "mvmul": core_counts.get("mvmul", 0),
            }
        )
    return {
        "active_cores": sum(1 for entry in per_core if entry["total"]),
        "total_instructions": int(sum(counts.values())),
        "op_counts": dict(counts),
        "top_cores_by_total": sorted(per_core, key=lambda entry: entry["total"], reverse=True)[:10],
        "top_cores_by_send": sorted(per_core, key=lambda entry: entry["send"], reverse=True)[:10],
        "top_cores_by_recv": sorted(per_core, key=lambda entry: entry["recv"], reverse=True)[:10],
    }


def parse_pimcomp_instructions(simulation_info: Path) -> dict[str, Any]:
    """Count instruction opcodes per core in a PIMCOMP ``SimulationInfo.gz``.

    Top-level keys starting with ``core`` are treated as per-core instruction
    lists and visited in numeric order. The opcode of an instruction is its
    ``operation`` field, else its ``op`` field, else ``"unknown"``, lower-cased.

    Args:
        simulation_info: Path to the gzip-compressed JSON file.

    Returns:
        A summary dict with ``active_cores``, ``total_instructions``,
        ``op_counts`` and three top-10 rankings of cores (by total, send and
        recv counts).
    """
    with gzip.open(simulation_info, "rt", encoding="utf-8") as f:
        data = json.load(f)
    core_keys = sorted(
        (name for name in data if name.startswith("core")),
        key=lambda name: int(name[4:]),
    )
    per_core = []
    counts = Counter()
    for key in core_keys:
        # A core entry may be null; treat it as an empty stream, as the export
        # code in this file does with ``... or []``.
        insts = data[key] or []
        core_counts = Counter((inst.get("operation") or inst.get("op") or "unknown").lower() for inst in insts)
        counts.update(core_counts)
        per_core.append(
            {
                "core": key,
                "total": int(sum(core_counts.values())),
                "send": core_counts.get("send", 0),
                "recv": core_counts.get("recv", 0),
                "mvmul": core_counts.get("mvmul", 0),
            }
        )
    return {
        "active_cores": sum(1 for entry in per_core if entry["total"]),
        "total_instructions": int(sum(counts.values())),
        "op_counts": dict(counts),
        "top_cores_by_total": sorted(per_core, key=lambda entry: entry["total"], reverse=True)[:10],
        "top_cores_by_send": sorted(per_core, key=lambda entry: entry["send"], reverse=True)[:10],
        "top_cores_by_recv": sorted(per_core, key=lambda entry: entry["recv"], reverse=True)[:10],
    }


def format_op_table(counts: dict[str, int], total: int) -> list[str]:
    """Render the body rows of an op-distribution Markdown table.

    Rows are ordered by descending count. Each row shows the op name, its
    count and its share of ``total`` as a percentage with two decimals. When
    ``total`` is not positive a single placeholder row is returned instead.
    """
    if total > 0:
        ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
        return [f"| `{name}` | {amount} | {100.0 * amount / total:.2f}% |" for name, amount in ranked]
    return ["| n/a | 0 | n/a |"]


def validation_status(result: CompareResult) -> str:
    """Return the label shown in the report for a validation result.

    A result whose status is ``"done"`` maps to ``PASS`` or ``FAIL`` depending
    on ``result.passed``; any other status is reported upper-cased.
    """
    if result.status != "done":
        return result.status.upper()
    if result.passed:
        return "PASS"
    return "FAIL"


def skipped_validation(reason: str) -> CompareResult:
    """Build the result recorded for a validation that was not attempted.

    The result is not passed, carries no output differences, has status
    ``"skipped"`` and stores ``reason`` in its ``error`` field.
    """
    outcome = CompareResult(passed=False, max_diffs={}, status="skipped", error=reason)
    return outcome


def failed_validation(error: BaseException | str) -> CompareResult:
    """Build the result recorded for a validation that failed.

    ``error`` may be a ready-made message or an exception; exceptions are
    turned into text with ``exception_message``. The result is not passed,
    carries no output differences and has status ``"failed"``.
    """
    if isinstance(error, str):
        message = error
    else:
        message = exception_message(error)
    return CompareResult(passed=False, max_diffs={}, status="failed", error=message)


def skipped_perf(reason: str) -> dict[str, Any]:
    """Return the performance record used when a pimsim-nn run was skipped."""
    return dict(skipped=True, reason=reason)


def failed_perf(error: BaseException | str) -> dict[str, Any]:
    """Return the performance record used when a pimsim-nn run failed.

    ``error`` may be a ready-made message or an exception; exceptions are
    turned into text with ``exception_message``.
    """
    if not isinstance(error, str):
        error = exception_message(error)
    return {"error": error}


def perf_status(perf: dict[str, Any]) -> str:
    """Return the status label for a performance record.

    A truthy ``"skipped"`` entry wins over a truthy ``"error"`` entry; a
    record with neither is reported as ``DONE``.
    """
    for marker, label in (("skipped", "SKIPPED"), ("error", "FAILED")):
        if perf.get(marker):
            return label
    return "DONE"


def perf_value(perf: dict[str, Any], key: str) -> Any:
    """Return ``perf[key]``, or the placeholder ``"n/a"`` when the key is absent."""
    return perf.get(key, "n/a")


def empty_instruction_summary(reason: str | None = None, error: str | None = None) -> dict[str, Any]:
    """Return an instruction summary with zero counts and empty rankings.

    When ``reason`` is given the summary is additionally marked as skipped and
    stores the reason; when ``error`` is given it stores the error text.
    """
    summary: dict[str, Any] = {"active_cores": 0, "total_instructions": 0, "op_counts": {}}
    # Each ranking gets its own fresh list object.
    for ranking in ("top_cores_by_total", "top_cores_by_send", "top_cores_by_recv"):
        summary[ranking] = []
    if reason is not None:
        summary.update(skipped=True, reason=reason)
    if error is not None:
        summary["error"] = error
    return summary


def optional_path(path: Path | None) -> str | None:
    """Return ``path`` as a string, passing ``None`` through unchanged."""
    if path is None:
        return None
    return str(path)


def record_failure(failures: list[dict[str, str]], stage: str, exc: BaseException | str) -> None:
    """Append a failure entry for ``stage`` to ``failures`` and print it.

    ``exc`` may be a ready-made message or an exception; exceptions are turned
    into text with ``exception_message``.
    """
    message = exception_message(exc) if not isinstance(exc, str) else exc
    entry = {"stage": stage, "error": message}
    failures.append(entry)
    print_failure(stage, message)


def try_stage(
    failures: list[dict[str, str]],
    stage: str,
    func,
    *args,
    **kwargs,
):
    """Call ``func(*args, **kwargs)`` and return its result.

    If the call raises an ``Exception`` the failure is recorded under
    ``stage`` and ``None`` is returned instead, so callers treat ``None`` as
    "the stage did not produce a result".
    """
    try:
        outcome = func(*args, **kwargs)
    except Exception as exc:
        record_failure(failures, stage, exc)
        return None
    return outcome


def try_stage_success(
    failures: list[dict[str, str]],
    stage: str,
    func,
    *args,
    **kwargs,
) -> bool:
    """Call ``func(*args, **kwargs)`` and report whether it completed.

    The return value of ``func`` is discarded. If the call raises an
    ``Exception`` the failure is recorded under ``stage`` and ``False`` is
    returned; otherwise ``True`` is returned.
    """
    try:
        func(*args, **kwargs)
    except Exception as exc:
        record_failure(failures, stage, exc)
        return False
    else:
        return True


def _escape_table_cell(text: str) -> str:
    """Make ``text`` safe to embed in a single Markdown table cell."""
    # A raw "|" would end the cell early and a raw newline would end the row.
    return text.replace("|", "\\|").replace("\n", "<br>")


def write_report(
    report_path: Path,
    *,
    model_path: Path,
    hardware: dict[str, int],
    steps: list[StepRecord],
    failures: list[dict[str, str]],
    raptor_validation: CompareResult,
    pimcomp_validation: CompareResult,
    raptor_perf: dict[str, Any],
    pimcomp_perf: dict[str, Any],
    raptor_instr: dict[str, Any],
    pimcomp_instr: dict[str, Any],
    raptor_pass_timings: dict[str, float],
    pimsim_mode: str,
) -> None:
    """Write the Markdown comparison report to ``report_path``.

    The report contains, in order: a header with the model and hardware, an
    optional failures section, the semantic validation results with per-output
    max differences, the pimsim-nn performance table (its columns depend on
    ``pimsim_mode``), the instruction summary with per-compiler op
    distributions, the step timings with details of non-passed steps, and the
    Raptor pass timings when available.

    Free-form text placed in table cells (failure stage/error and step names)
    is escaped so that ``|`` characters and newlines cannot break the table.
    """
    lines = [
        "# Raptor vs PIMCOMP Comparison Report",
        "",
        f"- Model: `{model_path}`",
        f"- Hardware: `{hardware.get('core_count', 'n/a')} cores`, `{hardware.get('crossbar_count', 'n/a')} xbars/core`, `{hardware.get('crossbar_size', 'n/a')}x{hardware.get('crossbar_size', 'n/a')}` crossbars, mesh `{hardware.get('mesh_rows', 'n/a')}x{hardware.get('mesh_cols', 'n/a')}`",
        "",
    ]

    # Failures section: shown when anything was recorded or any step did not pass.
    if failures or any(step.status != "passed" for step in steps):
        lines.extend(
            [
                "## Failures / Skipped Work",
                "",
                "The script did not abort. The failed stage was recorded and any dependent stage was skipped when its inputs were not available.",
                "",
            ]
        )
        if failures:
            lines.extend(["| Stage | Error |", "|---|---|"])
            for failure in failures:
                stage = _escape_table_cell(failure["stage"])
                error = _escape_table_cell(failure["error"])
                lines.append(f"| {stage} | {error} |")
            lines.append("")

    lines.extend(
        [
            "## Semantic Validation",
            "",
            f"- Raptor via `pim-simulator`: `{validation_status(raptor_validation)}`",
            f"- PIMCOMP via exported `pim-simulator`: `{validation_status(pimcomp_validation)}`",
        ]
    )
    # Only the first line of an error is shown so the note stays on one bullet.
    if raptor_validation.error:
        lines.append(f"- Raptor validation note: `{raptor_validation.error.splitlines()[0]}`")
    if pimcomp_validation.error:
        lines.append(f"- PIMCOMP validation note: `{pimcomp_validation.error.splitlines()[0]}`")

    lines.extend(["", "### Max Output Differences", ""])
    # Union of output names so an output reported by only one side still gets a row (nan for the other).
    diff_names = sorted(set(raptor_validation.max_diffs) | set(pimcomp_validation.max_diffs))
    if diff_names:
        lines.extend(["| Output | Raptor max diff | PIMCOMP max diff |", "|---|---:|---:|"])
        for name in diff_names:
            lines.append(
                f"| `{name}` | {raptor_validation.max_diffs.get(name, float('nan')):.6e} | "
                f"{pimcomp_validation.max_diffs.get(name, float('nan')):.6e} |"
            )
    else:
        lines.append("No output differences are available because validation did not run or failed before comparison.")

    lines.extend(
        [
            "",
            "## pimsim-nn Performance",
            "",
            f"- Mode: `{pimsim_mode}`",
            "",
        ]
    )
    if pimsim_mode == "throughput":
        lines.extend(
            [
                "| Compiler | Status | Throughput (samples/s) | Avg latency (ms) | Avg power (mW) | Avg energy (pJ/it) | Output count |",
                "|---|---|---:|---:|---:|---:|---:|",
                f"| Raptor | {perf_status(raptor_perf)} | {perf_value(raptor_perf, 'throughput')} | {perf_value(raptor_perf, 'average_latency_ms')} | "
                f"{perf_value(raptor_perf, 'average_power_mw')} | {perf_value(raptor_perf, 'average_energy_pj')} | {perf_value(raptor_perf, 'output_count')} |",
                f"| PIMCOMP | {perf_status(pimcomp_perf)} | {perf_value(pimcomp_perf, 'throughput')} | {perf_value(pimcomp_perf, 'average_latency_ms')} | "
                f"{perf_value(pimcomp_perf, 'average_power_mw')} | {perf_value(pimcomp_perf, 'average_energy_pj')} | {perf_value(pimcomp_perf, 'output_count')} |",
                "",
            ]
        )
    else:
        lines.extend(
            [
                "| Compiler | Status | Latency (ms) | Avg power (mW) | Avg energy (pJ) |",
                "|---|---|---:|---:|---:|",
                f"| Raptor | {perf_status(raptor_perf)} | {perf_value(raptor_perf, 'latency_ms')} | "
                f"{perf_value(raptor_perf, 'average_power_mw')} | {perf_value(raptor_perf, 'average_energy_pj')} |",
                f"| PIMCOMP | {perf_status(pimcomp_perf)} | {perf_value(pimcomp_perf, 'latency_ms')} | "
                f"{perf_value(pimcomp_perf, 'average_power_mw')} | {perf_value(pimcomp_perf, 'average_energy_pj')} |",
                "",
            ]
        )
    if raptor_perf.get("reason") or raptor_perf.get("error"):
        lines.append(f"- Raptor pimsim-nn note: `{(raptor_perf.get('reason') or raptor_perf.get('error')).splitlines()[0]}`")
    if pimcomp_perf.get("reason") or pimcomp_perf.get("error"):
        lines.append(f"- PIMCOMP pimsim-nn note: `{(pimcomp_perf.get('reason') or pimcomp_perf.get('error')).splitlines()[0]}`")
    # Keep exactly one blank line before the next heading whether or not notes were added.
    if lines[-1] != "":
        lines.append("")

    lines.extend(
        [
            "## Instruction Summary",
            "",
            "| Compiler | Status | Active cores | Total instructions | Sends | Receives | MVMUL |",
            "|---|---|---:|---:|---:|---:|---:|",
            f"| Raptor | {'FAILED' if raptor_instr.get('error') else 'SKIPPED' if raptor_instr.get('skipped') else 'DONE'} | {raptor_instr.get('active_cores', 0)} | {raptor_instr.get('total_instructions', 0)} | {raptor_instr.get('op_counts', {}).get('send', 0)} | {raptor_instr.get('op_counts', {}).get('recv', 0)} | {raptor_instr.get('op_counts', {}).get('mvmul', 0)} |",
            f"| PIMCOMP | {'FAILED' if pimcomp_instr.get('error') else 'SKIPPED' if pimcomp_instr.get('skipped') else 'DONE'} | {pimcomp_instr.get('active_cores', 0)} | {pimcomp_instr.get('total_instructions', 0)} | {pimcomp_instr.get('op_counts', {}).get('send', 0)} | {pimcomp_instr.get('op_counts', {}).get('recv', 0)} | {pimcomp_instr.get('op_counts', {}).get('mvmul', 0)} |",
            "",
            "### Raptor Op Distribution",
            "",
            "| Op | Count | Share |",
            "|---|---:|---:|",
            *format_op_table(raptor_instr.get("op_counts", {}), raptor_instr.get("total_instructions", 0)),
            "",
            "### PIMCOMP Op Distribution",
            "",
            "| Op | Count | Share |",
            "|---|---:|---:|",
            *format_op_table(pimcomp_instr.get("op_counts", {}), pimcomp_instr.get("total_instructions", 0)),
            "",
            "## Step Timings",
            "",
            "| Step | Status | Duration (s) | Return code |",
            "|---|---|---:|---:|",
        ]
    )
    for step in steps:
        lines.append(
            f"| {_escape_table_cell(step.name)} | {step.status.upper()} | {step.duration_sec:.3f} | "
            f"{step.returncode if step.returncode is not None else ''} |"
        )
    failed_steps = [step for step in steps if step.status != "passed"]
    if failed_steps:
        lines.extend(["", "### Failed Step Details", ""])
        for step in failed_steps:
            lines.extend(
                [
                    f"#### {step.name}",
                    "",
                    f"- Command: `{step.command}`",
                    f"- Error: `{step.error or 'n/a'}`",
                ]
            )
            if step.output_tail:
                lines.extend(["", "```text", step.output_tail, "```"])
            lines.append("")

    if raptor_pass_timings:
        lines.extend(["", "## Raptor Pass Timings", "", "| Pass | Duration (s) |", "|---|---:|"])
        for name, duration in raptor_pass_timings.items():
            lines.append(f"| {name} | {duration:.4f} |")
    report_path.write_text("\n".join(lines) + "\n", encoding="utf-8")


def main() -> None:
    """Run the Raptor-vs-PIMCOMP comparison and write Markdown and JSON reports.

    Each stage is invoked through ``try_stage`` / ``try_stage_success``, which
    record an exception instead of letting it abort the run. A stage whose
    inputs were not produced by an earlier stage is skipped, and the skip is
    recorded either in ``failures`` or in the affected result placeholder.
    Both reports are always written to ``--out-dir``.

    Raises:
        SystemExit: with status 1 when ``--fail-on-error`` is given and any
            failure was recorded or any step did not pass.
    """
    # Command-line interface.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True, type=Path)
    parser.add_argument("--out-dir", required=True, type=Path)
    parser.add_argument("--raptor-path", default=REPO / "build_release/Release/bin/onnx-mlir", type=Path)
    parser.add_argument("--onnx-include-dir", default=REPO / "onnx-mlir/include", type=Path)
    parser.add_argument("--pimcomp-dir", default=REPO / "third_party/PIMCOMP-NN", type=Path)
    parser.add_argument("--pim-simulator-dir", default=REPO / "backend-simulators/pim/pim-simulator", type=Path)
    parser.add_argument("--pimsim-nn-build-dir", default=REPO / "backend-simulators/pim/pimsim-nn/build", type=Path)
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--threshold", type=float, default=1e-3)
    parser.add_argument("--rtol", type=float, default=1e-5)
    parser.add_argument("--timeout-seconds", type=float, default=3600.0)
    parser.add_argument("--core-count", type=int)
    parser.add_argument("--crossbar-count", type=int)
    parser.add_argument("--crossbar-size", type=int)
    parser.add_argument("--mesh-rows", type=int)
    parser.add_argument("--mesh-cols", type=int)
    parser.add_argument("--pimsim-time-ms", type=int, default=1000)
    parser.add_argument("--pimsim-mode", choices=["latency", "throughput"], default="latency")
    parser.add_argument("--skip-pimsim-nn", action="store_true")
    parser.add_argument("--verbose-raptor-compile", action="store_true")
    parser.add_argument("--raptor-extra-arg", action="append", default=[])
    parser.add_argument(
        "--fail-on-error",
        action="store_true",
        help="Return a non-zero process status after writing the reports if any compilation/run stage failed.",
    )
    args = parser.parse_args()

    model_path = args.model.resolve()
    out_dir = args.out_dir.resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    # Shared bookkeeping: recorded failures/skips and the step records filled in by the stages.
    failures: list[dict[str, str]] = []
    steps: list[StepRecord] = []
    # Placeholder hardware; a core_count of 0 is what the checks below treat as
    # "hardware configuration is not available".
    hardware: dict[str, int] = {
        "mesh_rows": 0,
        "mesh_cols": 0,
        "crossbar_count": 0,
        "crossbar_size": 0,
        "core_count": 0,
    }
    inputs_desc: list[tuple[int, str, int, list[int]]] = []
    outputs_desc: list[tuple[int, str, int, list[int]]] = []
    arrays_in_order: list[np.ndarray] = []
    runtime_inputs: list[np.ndarray] = []

    # Artifacts produced by the stages; None means "not produced".
    network_mlir: Path | None = None
    runner_path: Path | None = None
    reference_dir: Path | None = None
    raptor_pim_dir: Path | None = None
    raptor_pass_timings: dict[str, float] = {}
    verification_info: Path | None = None
    simulation_info: Path | None = None
    pimcomp_export_dir: Path | None = None
    pimsim_config: Path | None = None

    # Default results, replaced once the corresponding stage is reached.
    raptor_validation = skipped_validation("Raptor validation did not run")
    pimcomp_validation = skipped_validation("PIMCOMP validation did not run")
    raptor_perf: dict[str, Any] = skipped_perf("pimsim-nn Raptor did not run")
    pimcomp_perf: dict[str, Any] = skipped_perf("pimsim-nn PIMCOMP did not run")
    raptor_instr: dict[str, Any] = empty_instruction_summary("Raptor instruction parsing did not run")
    pimcomp_instr: dict[str, Any] = empty_instruction_summary("PIMCOMP instruction parsing did not run")

    # Stage: hardware configuration and model inputs.
    loaded_hardware = try_stage(failures, "Load hardware configuration", load_effective_hardware, args)
    if loaded_hardware is not None:
        hardware = loaded_hardware

    model_io = try_stage(failures, "Load model inputs", load_model_inputs, model_path, args.seed)
    if model_io is not None:
        inputs_desc, outputs_desc, arrays_in_order, runtime_inputs = model_io

    # Paths checked below so that artifacts left by a partially failed
    # reference compile can still be reused.
    expected_network_mlir = out_dir / "reference" / f"{model_path.stem}.onnx.mlir"
    expected_runner_path = out_dir / "runner" / "build" / "runner"

    # Stage: compile the reference.
    reference_compile = try_stage(
        failures,
        "Compile reference",
        compile_reference,
        args,
        model_path,
        out_dir,
        steps,
    )
    if reference_compile is not None:
        network_mlir, _, runner_path = reference_compile
    else:
        if expected_network_mlir.exists():
            network_mlir = expected_network_mlir
            print(f"\n[Continue] Reusing partial ONNX MLIR: {network_mlir}")
        if expected_runner_path.exists():
            runner_path = expected_runner_path
            print(f"\n[Continue] Reusing partial runner: {runner_path}")

    # Stage: run the reference runner to obtain reference outputs.
    if runner_path is not None and runner_path.exists() and model_io is not None:
        generated_reference = try_stage(
            failures,
            "Run reference",
            generate_reference_outputs,
            runner_path,
            runner_path.parent,
            model_path,
            arrays_in_order,
            steps,
            args,
            out_dir,
        )
        if generated_reference is not None:
            reference_dir = generated_reference
    else:
        record_failure(
            failures,
            "Skip reference outputs",
            "Reference outputs were skipped because the native runner or model inputs are not available.",
        )

    # Stage: compile the model for PIM with Raptor.
    if network_mlir is not None and network_mlir.exists() and hardware["core_count"] > 0:
        compiled_raptor = try_stage(
            failures,
            "Compile Raptor PIM",
            compile_raptor_target,
            network_mlir,
            out_dir / "raptor",
            hardware,
            args,
            steps,
        )
        if compiled_raptor is not None:
            raptor_pim_dir, raptor_pass_timings = compiled_raptor
    else:
        record_failure(
            failures,
            "Skip Raptor PIM compile",
            "Raptor PIM compile was skipped because the ONNX MLIR or hardware configuration is not available.",
        )

    # Stage: write the inputs into the Raptor memory image and validate against the reference.
    if raptor_pim_dir is not None:
        wrote_inputs = try_stage_success(
            failures,
            "Write Raptor inputs",
            write_inputs_to_memory_bin,
            raptor_pim_dir / "memory.bin",
            raptor_pim_dir / "config.json",
            runtime_inputs,
        )
        if wrote_inputs and reference_dir is not None and outputs_desc:
            validation = try_stage(
                failures,
                "Rust Validation Raptor",
                run_rust_validation,
                "Rust Validation Raptor",
                raptor_pim_dir,
                raptor_pim_dir / "config.json",
                outputs_desc,
                reference_dir,
                steps,
                args,
            )
            raptor_validation = validation if validation is not None else failed_validation("Raptor validation failed")
        # The remaining branches pick the skip reason; a missing reference is
        # reported first, a failed input write last.
        elif reference_dir is None:
            raptor_validation = skipped_validation("Reference outputs are not available")
        elif not outputs_desc:
            raptor_validation = skipped_validation("Output descriptors are not available")
        else:
            raptor_validation = skipped_validation("Raptor input materialization failed")
    else:
        raptor_validation = skipped_validation("Raptor PIM compilation did not produce a PIM directory")

    # Stage: compile with PIMCOMP (uses the original model file, not the ONNX MLIR).
    compiled_pimcomp = try_stage(
        failures,
        "Compile PIMCOMP",
        compile_pimcomp,
        args,
        model_path,
        out_dir / "pimcomp",
        steps,
    )
    if compiled_pimcomp is not None:
        verification_info, simulation_info = compiled_pimcomp

    # Stage: export the PIMCOMP result so the same validation can be run on it.
    if verification_info is not None and simulation_info is not None and model_io is not None:
        exported = try_stage(
            failures,
            "Export PIMCOMP for Rust",
            export_pimcomp_for_rust,
            model_path,
            verification_info,
            simulation_info,
            runtime_inputs,
            out_dir / "pimcomp_exported",
        )
        if exported is not None:
            pimcomp_export_dir = exported
    elif verification_info is None or simulation_info is None:
        record_failure(
            failures,
            "Skip PIMCOMP Rust export",
            "PIMCOMP Rust export was skipped because PIMCOMP did not produce VerificationInfo.json and SimulationInfo.gz.",
        )
    else:
        record_failure(
            failures,
            "Skip PIMCOMP Rust export",
            "PIMCOMP Rust export was skipped because model inputs are not available.",
        )

    # Stage: validate the exported PIMCOMP program against the reference.
    if pimcomp_export_dir is not None and reference_dir is not None and outputs_desc:
        validation = try_stage(
            failures,
            "Rust Validation PIMCOMP",
            run_rust_validation,
            "Rust Validation PIMCOMP",
            pimcomp_export_dir,
            pimcomp_export_dir / "config.json",
            outputs_desc,
            reference_dir,
            steps,
            args,
        )
        pimcomp_validation = validation if validation is not None else failed_validation("PIMCOMP validation failed")
    elif pimcomp_export_dir is None:
        pimcomp_validation = skipped_validation("PIMCOMP Rust export is not available")
    elif reference_dir is None:
        pimcomp_validation = skipped_validation("Reference outputs are not available")
    else:
        pimcomp_validation = skipped_validation("Output descriptors are not available")

    # Stage: write the pimsim-nn configuration (attempted even when --skip-pimsim-nn is set).
    if hardware["core_count"] > 0:
        written_config = try_stage(
            failures,
            "Write pimsim-nn config",
            write_pimsim_config,
            args,
            out_dir / "pimsim_config",
            hardware,
        )
        if written_config is not None:
            pimsim_config = written_config
    else:
        record_failure(
            failures,
            "Skip pimsim-nn config",
            "pimsim-nn config was skipped because the hardware configuration is not available.",
        )

    # Stage: pimsim-nn performance runs for both compilers.
    if args.skip_pimsim_nn:
        raptor_perf = skipped_perf("Skipped by --skip-pimsim-nn")
        pimcomp_perf = skipped_perf("Skipped by --skip-pimsim-nn")
    elif pimsim_config is None:
        raptor_perf = skipped_perf("pimsim-nn config is not available")
        pimcomp_perf = skipped_perf("pimsim-nn config is not available")
    else:
        # NOTE(review): the positional False is passed to run_pimsim_nn for both
        # compilers; its meaning is defined by run_pimsim_nn -- confirm there.
        if raptor_pim_dir is not None:
            perf = try_stage(
                failures,
                "pimsim-nn Raptor",
                run_pimsim_nn,
                "pimsim-nn Raptor",
                raptor_pim_dir,
                pimsim_config,
                False,
                steps,
                args,
            )
            raptor_perf = perf if perf is not None else failed_perf("pimsim-nn Raptor failed")
        else:
            raptor_perf = skipped_perf("Raptor PIM directory is not available")

        if simulation_info is not None:
            # PIMCOMP output needs a separate export step before pimsim-nn can run it.
            pimcomp_pimsim_dir = try_stage(
                failures,
                "Export PIMCOMP for pimsim-nn",
                export_pimcomp_for_pimsim_nn,
                simulation_info,
                out_dir / "pimcomp_pimsim_nn",
            )
            if pimcomp_pimsim_dir is not None:
                perf = try_stage(
                    failures,
                    "pimsim-nn PIMCOMP",
                    run_pimsim_nn,
                    "pimsim-nn PIMCOMP",
                    pimcomp_pimsim_dir,
                    pimsim_config,
                    False,
                    steps,
                    args,
                )
                pimcomp_perf = perf if perf is not None else failed_perf("pimsim-nn PIMCOMP failed")
            else:
                pimcomp_perf = failed_perf("PIMCOMP pimsim-nn export failed")
        else:
            pimcomp_perf = skipped_perf("PIMCOMP SimulationInfo.gz is not available")

    # Stage: instruction statistics for both compilers.
    if raptor_pim_dir is not None and raptor_pim_dir.exists():
        parsed = try_stage(failures, "Parse Raptor instructions", parse_raptor_instructions, raptor_pim_dir)
        raptor_instr = parsed if parsed is not None else empty_instruction_summary(error="Failed to parse Raptor instructions")
    else:
        raptor_instr = empty_instruction_summary("Raptor PIM directory is not available")

    if simulation_info is not None and simulation_info.exists():
        parsed = try_stage(failures, "Parse PIMCOMP instructions", parse_pimcomp_instructions, simulation_info)
        pimcomp_instr = parsed if parsed is not None else empty_instruction_summary(error="Failed to parse PIMCOMP instructions")
    else:
        pimcomp_instr = empty_instruction_summary("PIMCOMP SimulationInfo.gz is not available")

    # Markdown report.
    report_path = out_dir / "comparison_report.md"
    write_report(
        report_path,
        model_path=model_path,
        hardware=hardware,
        steps=steps,
        failures=failures,
        raptor_validation=raptor_validation,
        pimcomp_validation=pimcomp_validation,
        raptor_perf=raptor_perf,
        pimcomp_perf=pimcomp_perf,
        raptor_instr=raptor_instr,
        pimcomp_instr=pimcomp_instr,
        raptor_pass_timings=raptor_pass_timings,
        pimsim_mode=args.pimsim_mode,
    )

    # JSON report with the same data plus the paths of the produced artifacts.
    json_report = {
        "model": str(model_path),
        "hardware": hardware,
        "pimsim_mode": args.pimsim_mode,
        "failures": failures,
        "steps": [asdict(step) for step in steps],
        "raptor_validation": asdict(raptor_validation),
        "pimcomp_validation": asdict(pimcomp_validation),
        "raptor_performance": raptor_perf,
        "pimcomp_performance": pimcomp_perf,
        "raptor_instruction_summary": raptor_instr,
        "pimcomp_instruction_summary": pimcomp_instr,
        "raptor_pass_timings": raptor_pass_timings,
        "paths": {
            "reference_outputs": optional_path(reference_dir),
            "raptor_pim": optional_path(raptor_pim_dir),
            "pimcomp_simulation_info": optional_path(simulation_info),
            "pimcomp_exported_pim": optional_path(pimcomp_export_dir),
            "pimsim_config": optional_path(pimsim_config),
            "report_markdown": str(report_path),
        },
    }
    json_path = out_dir / "comparison_report.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(json_report, f, indent=2)
        f.write("\n")

    print(f"\n[Done]")
    print(f"  Report: {report_path}")
    print(f"  JSON:   {json_path}")
    if failures or any(step.status != "passed" for step in steps):
        print(f"  Completed with {len(failures)} recorded failure/skipped stage(s).")

    # The exit status only reflects problems when the caller opted in.
    if args.fail_on_error and (failures or any(step.status != "passed" for step in steps)):
        raise SystemExit(1)


# Run the comparison only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()