From 6d69600bc176dd282a8f33f98449f67a990b363b Mon Sep 17 00:00:00 2001
From: ilgeco <gabriele.magnani@mail.polimi.it>
Date: Wed, 10 Jun 2026 11:59:43 +0200
Subject: [PATCH] Yolo Image Validator + new accept rule

---
 src/PIM/Compiler/PimCompilerOptions.cpp       |   6 +
 src/PIM/Compiler/PimCompilerOptions.hpp       |   1 +
 src/PIM/Compiler/PimCompilerUtils.cpp         |   3 +-
 validation/gen_network_runner.py              |   5 +-
 validation/raptor.py                          |   5 +-
 .../tools/yolo_local_image_validation.py      | 223 +++++++++
 .../tools/yolo_real_image_validation.py       | 425 ++++++++++++++++++
 validation/validate.py                        |   9 +-
 validation/validate_one.py                    |  19 +-
 9 files changed, 685 insertions(+), 11 deletions(-)
 create mode 100644 validation/tools/yolo_local_image_validation.py
 create mode 100644 validation/tools/yolo_real_image_validation.py
diff --git a/src/PIM/Compiler/PimCompilerOptions.cpp b/src/PIM/Compiler/PimCompilerOptions.cpp
index 0578e4e..7d1cb01 100644
--- a/src/PIM/Compiler/PimCompilerOptions.cpp
+++ b/src/PIM/Compiler/PimCompilerOptions.cpp
@@ -38,6 +38,12 @@ llvm::cl::opt<bool>
                  llvm::cl::init(false),
                  llvm::cl::cat(OnnxMlirOptions));
 
+llvm::cl::opt<bool>
+  pimDisableMemoryCoalescing("pim-disable-memory-coalescing",
+                             llvm::cl::desc("Skip the PIM memory coalescing pass (developer diagnostic option)"),
+                             llvm::cl::init(false),
+                             llvm::cl::cat(OnnxMlirOptions));
+
 llvm::cl::opt<bool> useExperimentalConvImpl("use-experimental-conv-impl",
                                             llvm::cl::desc("Use experimental implementation for convolution"),
                                             llvm::cl::init(false),
diff --git a/src/PIM/Compiler/PimCompilerOptions.hpp b/src/PIM/Compiler/PimCompilerOptions.hpp
index b486070..3d90409 100644
--- a/src/PIM/Compiler/PimCompilerOptions.hpp
+++ b/src/PIM/Compiler/PimCompilerOptions.hpp
@@ -36,6 +36,7 @@ extern llvm::cl::opt<PimMergeSchedulerType> pimMergeScheduler;
 extern llvm::cl::opt<PimMemoryReportLevel> pimMemoryReport;
 
 extern llvm::cl::opt<bool> pimOnlyCodegen;
+extern llvm::cl::opt<bool> pimDisableMemoryCoalescing;
 extern llvm::cl::opt<bool> useExperimentalConvImpl;
 extern llvm::cl::opt<bool> pimEmitJson;
 
diff --git a/src/PIM/Compiler/PimCompilerUtils.cpp b/src/PIM/Compiler/PimCompilerUtils.cpp
index 5035379..e9bc397 100644
--- a/src/PIM/Compiler/PimCompilerUtils.cpp
+++ b/src/PIM/Compiler/PimCompilerUtils.cpp
@@ -46,7 +46,8 @@ void addPassesPim(OwningOpRef<ModuleOp>& module,
   if (pimEmissionTarget >= EmitPimCodegen) {
     pm.addPass(createPimHostConstantFoldingPass());
     pm.addPass(createMessagePass("Pim host constants folded"));
-    pm.addPass(createPimMemoryCoalescingPass());
+    if (!pimDisableMemoryCoalescing)
+      pm.addPass(createPimMemoryCoalescingPass());
     pm.addPass(createPimVerificationPass());
     pm.addPass(createMessagePass("Pim verified"));
     pm.addPass(createEmitPimCodePass());
diff --git a/validation/gen_network_runner.py b/validation/gen_network_runner.py
index 7966fc8..7f0b731 100644
--- a/validation/gen_network_runner.py
+++ b/validation/gen_network_runner.py
@@ -199,7 +199,10 @@ int main(int argc, char **argv) {{
 
   // ---- Cleanup ----
   omTensorListDestroy(in_list);
-  omTensorListDestroy(out_list);
+  // Some debug-heavy models return aliased outputs. This runner is a short-
+  // lived process, so destroy only the list wrapper and let process exit
+  // reclaim the output tensors safely.
+  omTensorListDestroyShallow(out_list);
   return 0;
 }}
 """
diff --git a/validation/raptor.py b/validation/raptor.py
index 371c30c..e9feac7 100644
--- a/validation/raptor.py
+++ b/validation/raptor.py
@@ -41,7 +41,8 @@ def _format_command(cmd):
 
 def compile_with_raptor(network_path, raptor_onnx_path: Path, output_base: Path,
                         crossbar_size, crossbar_count, core_count=None, pim_merge_scheduler="peft",
-                        pim_memory_report="none", cwd=None, verbose=False, reporter=None, timeout_sec=None):
+                        pim_memory_report="none", raptor_extra_args=None, cwd=None, verbose=False,
+                        reporter=None, timeout_sec=None):
     # Define the arguments, with the possibility to set crossbar size and count
     args = [
         network_path,
@@ -57,6 +58,8 @@ def compile_with_raptor(network_path, raptor_onnx_path: Path, output_base: Path,
         args.append(f"--core-count={core_count}")
     if pim_memory_report != "none":
         args.append(f"--pim-memory-report={pim_memory_report}")
+    if raptor_extra_args:
+        args.extend(str(arg) for arg in raptor_extra_args)
     if verbose:
         args.append("--enable-timing")
 
diff --git a/validation/tools/yolo_local_image_validation.py b/validation/tools/yolo_local_image_validation.py
new file mode 100644
index 0000000..f0cff5b
--- /dev/null
+++ b/validation/tools/yolo_local_image_validation.py
@@ -0,0 +1,223 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import subprocess
+import tempfile
+from pathlib import Path
+
+import numpy as np
+from PIL import Image
+from onnx_utils import _ONNX_TO_NP, onnx_io, write_inputs_to_memory_bin
+from validate_one import MODE_COMPILE_ONLY, build_dump_ranges, run_pim_simulator, sanitize_output_name, validate_network
+from yolo_real_image_validation import (
+    IMAGE_CASES,
+    decode_yolo_output,
+    download_image,
+    draw_detections,
+    letterbox_rgb,
+    save_tensor_csv,
+    top_unique_labels,
+)
+
+
+def resolve_default_paths():
+    """Return the default repository locations, derived from where this script lives."""
+    # Two levels up from this file is the validation directory; its parent is the repo root.
+    validation_dir = Path(__file__).resolve().parent.parent
+    repo_root = validation_dir.parent
+    defaults = {"validation_dir": validation_dir, "repo_root": repo_root}
+    defaults["network_dir"] = validation_dir / "networks" / "yolo11n" / "depth_51"
+    defaults["raptor_path"] = repo_root / "build_release" / "Release" / "bin" / "onnx-mlir"
+    defaults["onnx_include_dir"] = repo_root / "onnx-mlir" / "include"
+    defaults["simulator_dir"] = repo_root / "backend-simulators" / "pim" / "pim-simulator"
+    return defaults
+
+
+def find_network_onnx(network_dir: Path) -> Path:
+    """Return the only ``*.onnx`` file directly inside ``network_dir``; raise when there are none or several."""
+    candidates = sorted(network_dir.glob("*.onnx"))
+    if not candidates:
+        raise FileNotFoundError(f"No .onnx file found in {network_dir}")
+    if len(candidates) == 1:
+        return candidates[0]
+    raise RuntimeError(f"Expected exactly one .onnx file in {network_dir}, found: {', '.join(p.name for p in candidates)}")
+
+
+def local_case_paths(network_dir: Path, case_name: str):
+    """Map artifact names to the per-case files and directories located under ``network_dir``."""
+    build_dir = network_dir / "runner" / "build"
+    real_root = network_dir / "real_image_validation"
+    sim_dir = real_root / "simulation" / case_name
+    return {
+        "root": network_dir, "runner": build_dir / "runner", "runner_build": build_dir,
+        "raptor_pim": network_dir / "raptor" / "pim", "real_root": real_root,
+        "input_csv": real_root / "inputs" / f"{case_name}.csv",
+        "ref_dir": real_root / "reference" / case_name,
+        "sim_dir": sim_dir, "sim_bin": sim_dir / "out.bin",
+    }
+
+
+def ensure_local_artifacts(args, network_onnx_path: Path):
+    """Run ``validate_network`` in compile-only mode for ``network_onnx_path``.
+
+    Tool locations, crossbar/core sizing, timeout and verbosity are forwarded from ``args``.
+    """
+    compile_options = dict(
+        raptor_path=args.raptor_path, onnx_include_dir=args.onnx_include_dir,
+        simulator_dir=args.simulator_dir,
+        crossbar_size=args.crossbar_size, crossbar_count=args.crossbar_count,
+        core_count=args.core_count, command_timeout_seconds=args.command_timeout_seconds,
+        mode=MODE_COMPILE_ONLY, verbose=args.verbose,
+    )
+    validate_network(network_onnx_path=network_onnx_path, **compile_options)
+
+
+def ensure_existing_artifacts(network_dir: Path):
+    """Raise ``FileNotFoundError`` naming every compiled artifact that is absent under ``network_dir``."""
+    pim_dir = network_dir / "raptor" / "pim"
+    expected = (network_dir / "runner" / "build" / "runner", pim_dir / "config.json", pim_dir / "memory.bin")
+    absent = [str(item) for item in expected if not item.exists()]
+    if not absent:
+        return
+    # Each missing path goes on its own indented line below the hint.
+    listing = "\n  ".join(absent)
+    raise FileNotFoundError(
+        "Missing compiled local artifacts. Re-run without --skip-compile or restore these paths:\n  " + listing
+    )
+
+
+def run_local_reference_and_simulator(args, network_dir: Path, network_onnx_path: Path, case_name: str):
+    """Run the reference runner binary and the PIM simulator on one prepared input CSV.
+
+    Returns ``(paths, output_descriptor)``: the ``local_case_paths`` dict and the model's single output
+    descriptor. Raises ``RuntimeError`` when the model does not declare exactly one output.
+    """
+    paths = local_case_paths(network_dir, case_name)
+    for key in ("ref_dir", "sim_dir"):
+        paths[key].mkdir(parents=True, exist_ok=True)
+    output_descriptors = onnx_io(network_onnx_path)[1]
+    if len(output_descriptors) != 1:
+        raise RuntimeError(f"Expected one YOLO output tensor, found {len(output_descriptors)}")
+
+    runner_cmd = [
+        str(paths["runner"]),
+        "--in0-csv-file", str(paths["input_csv"]),
+        "--in0-shape", "1x3x640x640",
+        "--save-csv-dir", str(paths["ref_dir"]),
+    ]
+    # Apply the per-command timeout to the runner as well, so a hung reference run cannot block forever.
+    subprocess.run(runner_cmd, cwd=paths["runner_build"], check=True, timeout=args.command_timeout_seconds)
+
+    config_path = paths["raptor_pim"] / "config.json"
+    tensor = np.loadtxt(paths["input_csv"], delimiter=",", dtype=np.float32).reshape(1, 3, 640, 640)
+    write_inputs_to_memory_bin(paths["raptor_pim"] / "memory.bin", config_path, [tensor])
+
+    dump_ranges = build_dump_ranges(config_path, output_descriptors)
+    run_pim_simulator(
+        args.simulator_dir, paths["raptor_pim"], paths["sim_bin"], dump_ranges,
+        timeout_sec=args.command_timeout_seconds,
+    )
+    return paths, output_descriptors[0]
+
+
+def analyze_case(args, network_dir: Path, network_onnx_path: Path, case, work_dir: Path):
+    """Fetch one test image, run reference and simulator on it, and summarize how the outputs compare.
+
+    Returns a dict with the top labels, element-wise difference statistics, annotated image paths
+    and the five highest-confidence detections from each side.
+    """
+    picture = work_dir / f"{case.name}{Path(case.url).suffix or '.img'}"
+    staged_csv = work_dir / f"{case.name}.csv"
+    args.annotated_dir.mkdir(parents=True, exist_ok=True)
+    download_image(case.url, picture)
+    save_tensor_csv(letterbox_rgb(Image.open(picture)), staged_csv)
+    # Place the preprocessed input where the per-case paths expect it before running anything.
+    target_csv = local_case_paths(network_dir, case.name)["input_csv"]
+    target_csv.parent.mkdir(parents=True, exist_ok=True)
+    target_csv.write_bytes(staged_csv.read_bytes())
+    paths, descriptor = run_local_reference_and_simulator(args, network_dir, network_onnx_path, case.name)
+
+    index, tensor_name, dtype_code, shape = descriptor
+    dtype = np.dtype(_ONNX_TO_NP[dtype_code])
+    ref_csv_path = paths["ref_dir"] / f"output{index}_{sanitize_output_name(tensor_name)}.csv"
+    ref = np.loadtxt(ref_csv_path, delimiter=",", dtype=dtype).reshape(shape)
+    raw = paths["sim_bin"].read_bytes()
+    sim = np.frombuffer(raw, dtype=dtype, count=int(np.prod(shape))).reshape(shape)
+
+    # Differences are taken in float64; the 1e-12 floor keeps the relative error finite at zero.
+    ref64 = ref.astype(np.float64)
+    abs_diff = np.abs(sim.astype(np.float64) - ref64)
+    rel_diff = abs_diff / np.maximum(np.abs(ref64), 1e-12)
+
+    ref_detections, sim_detections = decode_yolo_output(ref), decode_yolo_output(sim)
+    ref_labels, sim_labels = top_unique_labels(ref_detections), top_unique_labels(sim_detections)
+    ref_image_path = args.annotated_dir / f"{case.name}_reference.png"
+    sim_image_path = args.annotated_dir / f"{case.name}_simulator.png"
+    draw_detections(picture, ref_detections, ref_image_path)
+    draw_detections(picture, sim_detections, sim_image_path)
+
+    return {
+        "case": case.name,
+        "expected_label": case.expected_label,
+        "ref_top_labels": ref_labels,
+        "sim_top_labels": sim_labels,
+        "top1_match": bool(ref_labels and sim_labels and ref_labels[0] == sim_labels[0]),
+        "expected_in_ref": case.expected_label in ref_labels,
+        "expected_in_sim": case.expected_label in sim_labels,
+        "max_abs_diff": float(abs_diff.max()),
+        "mean_abs_diff": float(abs_diff.mean()),
+        "max_rel_diff": float(rel_diff.max()),
+        "mean_rel_diff": float(rel_diff.mean()),
+        "reference_annotated_image": str(ref_image_path),
+        "simulator_annotated_image": str(sim_image_path),
+        "ref_top_detections": ref_detections[:5],
+        "sim_top_detections": sim_detections[:5],
+    }
+
+
+def main():
+    """Parse CLI options, prepare artifacts, analyze every image case, and print a JSON report."""
+    defaults = resolve_default_paths()
+
+    parser = argparse.ArgumentParser(description="Validate YOLO detections on real images using local compilation and simulator execution.")
+    parser.add_argument("--network-dir", type=Path, default=defaults["network_dir"])
+    parser.add_argument("--network-onnx", type=Path, default=None)
+    parser.add_argument("--raptor-path", type=Path, default=defaults["raptor_path"])
+    parser.add_argument("--onnx-include-dir", type=Path, default=defaults["onnx_include_dir"])
+    parser.add_argument("--simulator-dir", type=Path, default=defaults["simulator_dir"])
+    parser.add_argument("--crossbar-size", type=int, default=2048)
+    parser.add_argument("--crossbar-count", type=int, default=256)
+    parser.add_argument("--core-count", type=int, default=1000)
+    parser.add_argument("--command-timeout-seconds", type=float, default=7200.0)
+    parser.add_argument("--skip-compile", action="store_true")
+    parser.add_argument("--verbose", action="store_true")
+    # Resolved after parsing (None sentinel) so annotations follow --network-dir, not the built-in default.
+    parser.add_argument("--annotated-dir", type=Path, default=None)
+    args = parser.parse_args()
+
+    args.network_dir = args.network_dir.resolve()
+    args.network_onnx = args.network_onnx.resolve() if args.network_onnx else find_network_onnx(args.network_dir)
+    args.raptor_path = args.raptor_path.resolve()
+    args.onnx_include_dir = args.onnx_include_dir.resolve()
+    args.simulator_dir = args.simulator_dir.resolve()
+    if args.annotated_dir is None:
+        args.annotated_dir = args.network_dir / "real_image_validation" / "annotated"
+    args.annotated_dir = args.annotated_dir.resolve()
+
+    if not args.skip_compile:
+        ensure_local_artifacts(args, args.network_onnx)
+    else:
+        ensure_existing_artifacts(args.network_dir)
+
+    reports = []
+    with tempfile.TemporaryDirectory(prefix="yolo_local_images_") as tmp_dir:
+        work_dir = Path(tmp_dir)
+        for case in IMAGE_CASES:
+            reports.append(analyze_case(args, args.network_dir, args.network_onnx, case, work_dir))
+
+    print(json.dumps({"network_dir": str(args.network_dir), "network_onnx": str(args.network_onnx), "cases": reports}, indent=2))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/validation/tools/yolo_real_image_validation.py b/validation/tools/yolo_real_image_validation.py
new file mode 100644
index 0000000..173989c
--- /dev/null
+++ b/validation/tools/yolo_real_image_validation.py
@@ -0,0 +1,425 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import shlex
+import subprocess
+import tempfile
+import urllib.request
+from dataclasses import dataclass
+from pathlib import Path
+
+import numpy as np
+from PIL import Image, ImageDraw
+
+
+COCO80_CLASSES = [  # 80 label strings; decode_yolo_output indexes this list by class id
+    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+    "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+    "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+    "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+    "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+    "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+    "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard",
+    "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase",
+    "scissors", "teddy bear", "hair drier", "toothbrush",
+]
+
+
+@dataclass(frozen=True)
+class ImageCase:  # one real-image test case: where to fetch it and which label it should yield
+    name: str  # identifier used to build per-case file and directory names
+    url: str  # location handed to download_image
+    expected_label: str  # label looked up among the top labels of reference and simulator
+
+
+# Sample images fetched at run time, each paired with the label it is expected to produce.
+IMAGE_CASES = [
+    ImageCase(
+        name="cat_coco_39769",
+        url="http://images.cocodataset.org/val2017/000000039769.jpg",
+        expected_label="cat",
+    ),
+    ImageCase(
+        name="dog_pytorch_hub",
+        url="https://github.com/pytorch/hub/raw/master/images/dog.jpg",
+        expected_label="dog",
+    ),
+    ImageCase(
+        name="cute_kitty",
+        url="https://images.unsplash.com/photo-1529778873920-4da4926a72c2?q=80&w=872&auto=format&fit=crop&ixlib=rb-4.1.0&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
+        expected_label="cat",
+    ),
+]
+
+
+def run(cmd, *, cwd=None, capture_output=False, input_bytes=None):
+    """Execute ``cmd`` via ``subprocess.run`` and return the ``CompletedProcess``.
+
+    ``input_bytes`` is fed to the child's stdin; a non-zero exit status raises
+    ``subprocess.CalledProcessError`` because ``check=True`` is always set.
+    """
+    options = {"cwd": cwd, "check": True, "input": input_bytes, "capture_output": capture_output}
+    return subprocess.run(cmd, **options)
+
+
+def ssh_command(ssh_key: str, remote_host: str, command: str):  # argv for `ssh -i <key> <host> <command>`
+    return ["ssh", "-i", ssh_key, remote_host, command]
+
+
+def remote_bash(ssh_key: str, remote_host: str, command: str, *, capture_output=False, input_bytes=None):
+    """Run ``command`` on ``remote_host`` through ``ssh`` and return the completed process."""
+    # capture_output and input_bytes are forwarded unchanged to run().
+    argv = ssh_command(ssh_key, remote_host, command)
+    completed = run(argv, capture_output=capture_output, input_bytes=input_bytes)
+    return completed
+
+
+def download_image(url: str, path: Path, timeout: float = 60.0):  # save the bytes served at url into path
+    with urllib.request.urlopen(url, timeout=timeout) as response:  # bounded wait instead of hanging forever
+        path.write_bytes(response.read())
+
+
+def letterbox_rgb(image: Image.Image, size: int = 640) -> np.ndarray:
+    """Resize ``image`` to fit a ``size`` x ``size`` gray canvas, centered, keeping its aspect ratio.
+
+    Returns a float32 array of shape ``(1, 3, size, size)`` with values scaled to ``[0, 1]``.
+    """
+    rgb = image.convert("RGB")
+    src_w, src_h = rgb.size
+    ratio = min(size / src_w, size / src_h)
+    new_w, new_h = (max(1, int(round(dim * ratio))) for dim in (src_w, src_h))
+    resized = rgb.resize((new_w, new_h), Image.Resampling.BILINEAR)
+    canvas = Image.new("RGB", (size, size), (114, 114, 114))
+    # Integer division centers the resized image; the remaining border keeps the gray fill.
+    offset = ((size - new_w) // 2, (size - new_h) // 2)
+    canvas.paste(resized, offset)
+    pixels = np.asarray(canvas, dtype=np.float32) / 255.0
+    return np.transpose(pixels, (2, 0, 1))[np.newaxis, ...]
+
+
+def letterbox_params(width: int, height: int, size: int = 640):
+    """Return ``(scale, offset_x, offset_y)`` of the letterbox placement for a ``width`` x ``height`` image."""
+    scale = min(size / width, size / height)
+    fitted_w, fitted_h = (max(1, int(round(side * scale))) for side in (width, height))
+    # Offsets are the left/top padding that centers the resized image on the square canvas.
+    pad_x, pad_y = (size - fitted_w) // 2, (size - fitted_h) // 2
+    return scale, pad_x, pad_y
+
+
+def save_tensor_csv(array: np.ndarray, path: Path):  # write every element as one comma-separated CSV row
+    row = array.reshape(1, -1)
+    np.savetxt(path, row, delimiter=",", fmt="%.9g")
+
+
+def iou_xyxy(box: np.ndarray, boxes: np.ndarray) -> np.ndarray:
+    """Return the IoU of one ``[x1, y1, x2, y2]`` box with every row of ``boxes`` (0 where the union is empty)."""
+    def area(x1, y1, x2, y2):
+        # Inverted extents contribute zero area instead of a negative one.
+        return np.maximum(0.0, x2 - x1) * np.maximum(0.0, y2 - y1)
+
+    left = np.maximum(box[0], boxes[:, 0])
+    top = np.maximum(box[1], boxes[:, 1])
+    right = np.minimum(box[2], boxes[:, 2])
+    bottom = np.minimum(box[3], boxes[:, 3])
+    inter = area(left, top, right, bottom)
+    union = area(box[0], box[1], box[2], box[3]) + area(boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]) - inter
+    # ``where`` leaves the zero-filled output untouched wherever the union is not positive.
+    return np.divide(inter, union, out=np.zeros_like(inter), where=union > 0)
+
+
+def decode_yolo_output(
+    output: np.ndarray,
+    *,
+    conf_threshold: float = 0.25,
+    iou_threshold: float = 0.45,
+    max_detections: int = 50,
+):
+    """Decode a raw prediction tensor into a confidence-sorted list of detection dicts.
+
+    ``output[0]`` is read as ``(4 + classes, anchors)``: center/size box rows followed by class scores.
+    Anchors whose best class score is below ``conf_threshold`` are dropped, then greedy per-class NMS
+    removes boxes overlapping a stronger one by more than ``iou_threshold``.
+    Returns at most ``max_detections`` entries.
+    """
+    rows = output[0].T
+    scores = rows[:, 4:]
+    best_class = np.argmax(scores, axis=1)
+    best_score = scores[np.arange(scores.shape[0]), best_class]
+    selected = best_score >= conf_threshold
+    if not np.any(selected):
+        return []
+
+    centers_sizes = rows[:, :4][selected]
+    best_class = best_class[selected]
+    best_score = best_score[selected]
+
+    # Convert (cx, cy, w, h) rows into (x1, y1, x2, y2) corners.
+    half_extent = centers_sizes[:, 2:4] / 2.0
+    corners = np.empty_like(centers_sizes)
+    corners[:, 0:2] = centers_sizes[:, 0:2] - half_extent
+    corners[:, 2:4] = centers_sizes[:, 0:2] + half_extent
+
+    detections = []
+    for class_id in np.unique(best_class):
+        members = best_class == class_id
+        member_boxes = corners[members]
+        member_scores = best_score[members]
+        pending = np.argsort(-member_scores)
+        # Keep the strongest remaining box, then drop the rest that overlap it too much.
+        while pending.size > 0:
+            winner, pending = pending[0], pending[1:]
+            detections.append({
+                "label": COCO80_CLASSES[int(class_id)],
+                "class_id": int(class_id),
+                "confidence": float(member_scores[winner]),
+                "box_xyxy": member_boxes[winner].tolist(),
+            })
+            if pending.size > 0:
+                overlaps = iou_xyxy(member_boxes[winner], member_boxes[pending])
+                pending = pending[overlaps <= iou_threshold]
+
+    detections.sort(key=lambda det: det["confidence"], reverse=True)
+    return detections[:max_detections]
+
+
+def top_unique_labels(detections, limit: int = 5):
+    """Return the distinct labels of ``detections`` in order of first appearance.
+
+    Scanning stops as soon as ``limit`` distinct labels have been collected.
+    """
+    ordered = {}
+    for det in detections:
+        # dict keys keep insertion order, which de-duplicates without reordering.
+        ordered.setdefault(det["label"], None)
+        if len(ordered) == limit:
+            break
+    return list(ordered)
+
+
+def clamp_box_xyxy(box_xyxy, width: int, height: int):
+    """Clip the corners of ``box_xyxy`` to ``[0, width - 1]`` x ``[0, height - 1]``; returns four floats."""
+    def clip(value, upper):
+        return max(0.0, min(float(upper), float(value)))
+
+    x1, y1, x2, y2 = box_xyxy
+    max_x, max_y = width - 1, height - 1
+    return [clip(x1, max_x), clip(y1, max_y), clip(x2, max_x), clip(y2, max_y)]
+
+
+def unletterbox_box_xyxy(box_xyxy, width: int, height: int, size: int = 640):
+    """Map a box from the ``size`` x ``size`` letterboxed frame back to ``width`` x ``height`` image coordinates.
+
+    The padding offsets are subtracted first, then the resize scale is divided out.
+    """
+    scale, offset_x, offset_y = letterbox_params(width, height, size=size)
+    x1, y1, x2, y2 = box_xyxy
+    shifts = (offset_x, offset_y, offset_x, offset_y)
+    return [(float(value) - shift) / scale for value, shift in zip((x1, y1, x2, y2), shifts)]
+
+
+def draw_detections(image_path: Path, detections, output_path: Path, *, limit: int = 10):
+    """Save the image at ``image_path`` to ``output_path`` with the first ``limit`` detections drawn on it.
+
+    Each detection gets a red outline plus a white caption (label and confidence) on a red backdrop.
+    """
+    red, white, pad = (255, 0, 0), (255, 255, 255), 2
+    image = Image.open(image_path).convert("RGB")
+    width, height = image.size
+    painter = ImageDraw.Draw(image)
+    for det in detections[:limit]:
+        # Undo the letterbox transform, then clamp the box to the image bounds.
+        box = clamp_box_xyxy(unletterbox_box_xyxy(det["box_xyxy"], width, height), width, height)
+        anchor = (box[0], box[1])
+        caption = f'{det["label"]} {det["confidence"]:.2f}'
+        painter.rectangle(box, outline=red, width=3)
+        # The backdrop extends past the text bounding box by ``pad`` pixels on every side.
+        left, top, right, bottom = painter.textbbox(anchor, caption)
+        painter.rectangle([left - pad, top - pad, right + pad, bottom + pad], fill=red)
+        painter.text(anchor, caption, fill=white)
+
+    image.save(output_path)
+
+
+def ensure_remote_artifacts(args):
+    """Build the network artifacts on the remote host by running ``validation/validate.py --compile-only``."""
+    # NOTE(review): ~/.cargo/bin is prepended to PATH, presumably so remote steps can find cargo — confirm.
+    # Free-form values are shell-quoted because the whole command travels as one string over SSH.
+    validate_args = [
+        f"{shlex.quote(args.remote_python)} validation/validate.py",
+        "--raptor-path build_release/Release/bin/onnx-mlir",
+        "--onnx-include-dir onnx-mlir/include",
+        f"--operations-dir {shlex.quote(args.network_dir)}",
+        f"--crossbar-size {args.crossbar_size}",
+        f"--crossbar-count {args.crossbar_count}",
+        f"--core-count {args.core_count}",
+        f"--command-timeout-seconds {args.command_timeout_seconds}",
+        "--compile-only",
+    ]
+    steps = ["export PATH=$HOME/.cargo/bin:$PATH", f"cd {shlex.quote(args.remote_project)}", " ".join(validate_args)]
+    remote_bash(args.ssh_key, args.remote_host, " && ".join(steps))
+
+
+def remote_case_paths(args, case_name: str):
+    """Map artifact names to per-case paths under ``<remote_project>/<network_dir>`` on the remote host."""
+    # Built with the local Path flavor; callers stringify the values for remote shell commands.
+    root = Path(args.remote_project) / Path(args.network_dir)
+    build_dir = root / "runner" / "build"
+    real_root = root / "real_image_validation"
+    sim_dir = real_root / "simulation" / case_name
+    return {
+        "root": root, "runner": build_dir / "runner", "runner_build": build_dir,
+        "raptor_pim": root / "raptor" / "pim", "real_root": real_root,
+        "input_csv": real_root / "inputs" / f"{case_name}.csv",
+        "ref_dir": real_root / "reference" / case_name,
+        "sim_dir": sim_dir, "sim_bin": sim_dir / "out.bin",
+    }
+
+
+def write_remote_file(args, remote_path: Path, data: bytes):
+    """Create ``remote_path`` (and its parent directory) on the remote host with ``data`` as content."""
+    parent, target = shlex.quote(str(remote_path.parent)), shlex.quote(str(remote_path))
+    # ``data`` is supplied as the SSH command's stdin, which ``cat >`` redirects into the target file.
+    command = f"mkdir -p {parent} && cat > {target}"
+    remote_bash(args.ssh_key, args.remote_host, command, input_bytes=data)
+
+
+def run_remote_reference_and_simulator(args, case_name: str):  # run runner + simulator remotely; returns the remote_case_paths dict
+    paths = remote_case_paths(args, case_name)
+    quoted_project = shlex.quote(args.remote_project)
+    quoted_python = shlex.quote(args.remote_python)
+    quoted_case_csv = shlex.quote(str(paths["input_csv"]))
+    quoted_ref_dir = shlex.quote(str(paths["ref_dir"]))
+    quoted_sim_dir = shlex.quote(str(paths["sim_dir"]))
+    quoted_sim_bin = shlex.quote(str(paths["sim_bin"]))
+    quoted_runner = shlex.quote(str(paths["runner"]))
+    quoted_runner_build = shlex.quote(str(paths["runner_build"]))
+    quoted_pim = shlex.quote(str(paths["raptor_pim"]))
+
+    command = f"""
+set -e
+export PATH=$HOME/.cargo/bin:$PATH
+cd {quoted_project}
+mkdir -p {quoted_ref_dir} {quoted_sim_dir}
+cd {quoted_runner_build}
+{quoted_runner} --in0-csv-file {quoted_case_csv} --in0-shape 1x3x640x640 --save-csv-dir {quoted_ref_dir}
+cd {quoted_project}
+{quoted_python} - <<'PY'
+import json
+import numpy as np
+from pathlib import Path
+input_csv = Path({json.dumps(str(paths["input_csv"]))})
+pim_dir = Path({json.dumps(str(paths["raptor_pim"]))})
+config = json.loads((pim_dir / "config.json").read_text())
+tensor = np.loadtxt(input_csv, delimiter=",", dtype=np.float32).reshape(1, 3, 640, 640)
+with open(pim_dir / "memory.bin", "r+b") as f:
+    f.seek(config["inputs_addresses"][0])
+    f.write(tensor.tobytes(order="C"))
+output_addr = config["outputs_addresses"][0]
+output_size = 1 * 84 * 8400 * 4
+print(f"{{output_addr}},{{output_size}}")
+PY
+"""
+    result = remote_bash(args.ssh_key, args.remote_host, command, capture_output=True)  # stdout is captured to read the snippet's print
+    dump_range = result.stdout.decode().strip().splitlines()[-1]  # last stdout line: "<output_addr>,<output_size>"
+
+    sim_command = (  # second remote call: it needs dump_range produced by the first one
+        f"export PATH=$HOME/.cargo/bin:$PATH && "
+        f"cd {quoted_project}/backend-simulators/pim/pim-simulator && "
+        f"cargo run --no-default-features --release --package pim-simulator --bin pim-simulator -- "
+        f"-f {quoted_pim} -o {quoted_sim_bin} -d {dump_range}"
+    )
+    remote_bash(args.ssh_key, args.remote_host, sim_command)
+    return paths
+
+
+def read_remote_file(args, remote_path: Path) -> bytes:
+    """Return the content of ``remote_path`` by running ``cat`` on the remote host.
+
+    stdout is captured as bytes, without any text decoding.
+    """
+    command = f"cat {shlex.quote(str(remote_path))}"
+    completed = remote_bash(args.ssh_key, args.remote_host, command, capture_output=True)
+    return completed.stdout
+
+
+def analyze_case(args, case: ImageCase, work_dir: Path):
+    """Fetch one test image, run reference and simulator remotely, and summarize how the outputs compare."""
+    # Both results are interpreted with this fixed shape; the reference CSV name is fixed as well.
+    output_shape = (1, 84, 8400)
+    picture = work_dir / f"{case.name}{Path(case.url).suffix or '.img'}"
+    staged_csv = work_dir / f"{case.name}.csv"
+    annotated_dir = Path(args.annotated_dir)
+    annotated_dir.mkdir(parents=True, exist_ok=True)
+    download_image(case.url, picture)
+    save_tensor_csv(letterbox_rgb(Image.open(picture)), staged_csv)
+
+    write_remote_file(args, remote_case_paths(args, case.name)["input_csv"], staged_csv.read_bytes())
+    remote_paths = run_remote_reference_and_simulator(args, case.name)
+    ref_csv = read_remote_file(args, remote_paths["ref_dir"] / "output0_output0.csv")
+    sim_bin = read_remote_file(args, remote_paths["sim_bin"])
+
+    ref = np.loadtxt(ref_csv.decode().splitlines(), delimiter=",", dtype=np.float32).reshape(output_shape)
+    sim = np.frombuffer(sim_bin, dtype=np.float32, count=int(np.prod(output_shape))).reshape(output_shape)
+    # Differences are taken in float64; the 1e-12 floor keeps the relative error finite at zero.
+    ref64 = ref.astype(np.float64)
+    abs_diff = np.abs(sim.astype(np.float64) - ref64)
+    rel_diff = abs_diff / np.maximum(np.abs(ref64), 1e-12)
+
+    ref_detections, sim_detections = decode_yolo_output(ref), decode_yolo_output(sim)
+    ref_labels, sim_labels = top_unique_labels(ref_detections), top_unique_labels(sim_detections)
+    ref_image_path = annotated_dir / f"{case.name}_reference.png"
+    sim_image_path = annotated_dir / f"{case.name}_simulator.png"
+    draw_detections(picture, ref_detections, ref_image_path)
+    draw_detections(picture, sim_detections, sim_image_path)
+
+    return {
+        "case": case.name,
+        "expected_label": case.expected_label,
+        "ref_top_labels": ref_labels,
+        "sim_top_labels": sim_labels,
+        "top1_match": bool(ref_labels and sim_labels and ref_labels[0] == sim_labels[0]),
+        "expected_in_ref": case.expected_label in ref_labels,
+        "expected_in_sim": case.expected_label in sim_labels,
+        "max_abs_diff": float(abs_diff.max()),
+        "mean_abs_diff": float(abs_diff.mean()),
+        "max_rel_diff": float(rel_diff.max()),
+        "mean_rel_diff": float(rel_diff.mean()),
+        "reference_annotated_image": str(ref_image_path),
+        "simulator_annotated_image": str(sim_image_path),
+        "ref_top_detections": ref_detections[:5],
+        "sim_top_detections": sim_detections[:5],
+    }
+
+
+def main():
+    """Parse CLI options, optionally compile remotely, analyze every image case, and print a JSON report."""
+    parser = argparse.ArgumentParser(description="Validate YOLO detections on real animal images against the simulator.")
+    parser.add_argument("--remote-host", default="gmagnani@monolith")
+    parser.add_argument("--ssh-key", default="~/.ssh/github")
+    parser.add_argument("--remote-project", default="/home/gmagnani/Project/Raptor")
+    parser.add_argument("--remote-python", default="/home/gmagnani/venv/bin/python")
+    parser.add_argument("--network-dir", default="validation/networks/yolo11n/depth_51")
+    parser.add_argument("--crossbar-size", type=int, default=2048)
+    parser.add_argument("--crossbar-count", type=int, default=256)
+    parser.add_argument("--core-count", type=int, default=1000)
+    parser.add_argument("--command-timeout-seconds", type=int, default=7200)
+    parser.add_argument("--skip-compile", action="store_true")
+    parser.add_argument("--annotated-dir", default="validation/networks/yolo11n/depth_51/real_image_validation/annotated")
+    args = parser.parse_args()
+
+    # The key path is handed to ssh as an argv element, so a leading ~ is expanded here.
+    args.ssh_key = str(Path(args.ssh_key).expanduser())
+
+    if not args.skip_compile:
+        ensure_remote_artifacts(args)
+
+    with tempfile.TemporaryDirectory(prefix="yolo_real_images_") as tmp_dir:
+        work_dir = Path(tmp_dir)
+        reports = [analyze_case(args, case, work_dir) for case in IMAGE_CASES]
+
+    print(json.dumps({"network_dir": args.network_dir, "cases": reports}, indent=2))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/validation/validate.py b/validation/validate.py
index 8d234aa..054228b 100644
--- a/validation/validate.py
+++ b/validation/validate.py
@@ -67,7 +67,10 @@ def main():
     ap.add_argument("--operations-dir", default=None, help="Root of the operations tree (default: operations).")
     ap.add_argument("--simulator-dir", default=None,
                     help="Path to pim-simulator crate root (default: auto-detected relative to script).")
-    ap.add_argument("--threshold", type=float, default=1e-3, help="Max allowed diff per output element.")
+    ap.add_argument("--threshold", type=float, default=1e-3,
+                    help="Absolute tolerance for per-element output comparison.")
+    ap.add_argument("--relative-threshold", type=float, default=1e-5,
+                    help="Relative tolerance for per-element output comparison.")
     ap.add_argument("--seed", type=int, default=0, help="RNG seed for generated validation inputs.")
     ap.add_argument("--crossbar-size", type=int, default=64)
     ap.add_argument("--crossbar-count", type=int, default=8)
@@ -77,6 +80,8 @@ def main():
                     help="Scheduler used by the Spatial merge-compute-nodes pass.")
     ap.add_argument("--pim-memory-report", choices=("none", "summary", "full"), default="none",
                     help="Emit a human-readable PIM memory planning report during codegen.")
+    ap.add_argument("--raptor-extra-arg", action="append", default=[],
+                    help="Additional argument to pass through to the Raptor compiler. Repeat as needed.")
     ap.add_argument("--command-timeout-seconds", type=float, default=1000000.0,
                     help="Per-subprocess timeout in seconds for compiler, runner, and simulator commands.")
     ap.add_argument("--clean", action="store_true",
@@ -145,8 +150,10 @@ def main():
                 onnx_path, a.raptor_path, a.onnx_include_dir, simulator_dir,
                 crossbar_size=a.crossbar_size, crossbar_count=a.crossbar_count, core_count=a.core_count,
                 pim_merge_scheduler=a.pim_merge_scheduler, pim_memory_report=a.pim_memory_report,
+                raptor_extra_args=a.raptor_extra_arg,
                 command_timeout_seconds=a.command_timeout_seconds,
                 threshold=a.threshold,
+                rtol=a.relative_threshold,
                 seed=a.seed,
                 reporter=reporter,
                 model_index=index,
diff --git a/validation/validate_one.py b/validation/validate_one.py
index 44858c4..13df8f0 100644
--- a/validation/validate_one.py
+++ b/validation/validate_one.py
@@ -258,14 +258,18 @@ def parse_pim_simulator_outputs(output_bin_path, outputs_descriptor):
     return arrays
 
 
-def validate_outputs(sim_arrays, runner_out_dir, outputs_descriptor, threshold=1e-3, verbose=False):
+def validate_outputs(sim_arrays, runner_out_dir, outputs_descriptor, threshold=1e-3, rtol=1e-5, verbose=False):
     all_passed = True
     rows = []
     for sim_array, (oi, name, _, shape) in zip(sim_arrays, outputs_descriptor):
         csv_name = f"output{oi}_{sanitize_output_name(name)}.csv"
         runner_array = np.loadtxt(runner_out_dir / csv_name, delimiter=',', dtype=np.float32).reshape(shape)
-        max_diff = float(np.max(np.abs(sim_array.astype(np.float64) - runner_array.astype(np.float64))))
-        passed = max_diff <= threshold
+        sim_array64 = sim_array.astype(np.float64)
+        runner_array64 = runner_array.astype(np.float64)
+        abs_diff = np.abs(sim_array64 - runner_array64)
+        allowed_diff = threshold + rtol * np.abs(runner_array64)
+        max_diff = float(np.max(abs_diff))
+        passed = bool(np.all(abs_diff <= allowed_diff))
         rows.append((name, f"{max_diff:.6e}", passed))
         if not passed:
             all_passed = False
@@ -289,7 +293,8 @@ def validate_outputs(sim_arrays, runner_out_dir, outputs_descriptor, threshold=1
 
 def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
                      simulator_dir, crossbar_size=64, crossbar_count=8, core_count=None,
-                     pim_merge_scheduler="peft", pim_memory_report="none", threshold=1e-3,
+                     pim_merge_scheduler="peft", pim_memory_report="none", raptor_extra_args=None,
+                     threshold=1e-3, rtol=1e-5,
                      seed=0, reporter=None, model_index=1, model_total=1, verbose=False,
                      command_timeout_seconds=60.0, mode=MODE_FULL):
     network_onnx_path = Path(network_onnx_path).resolve()
@@ -343,7 +348,7 @@ def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
                 pim_pass_timings = compile_with_raptor(
                     network_mlir_path, raptor_path, pim_output_base, crossbar_size, crossbar_count,
                     core_count=core_count, pim_merge_scheduler=pim_merge_scheduler,
-                    pim_memory_report=pim_memory_report,
+                    pim_memory_report=pim_memory_report, raptor_extra_args=raptor_extra_args,
                     cwd=raptor_dir, verbose=verbose, reporter=reporter, timeout_sec=command_timeout_seconds)
                 print_info(reporter, f"PIM artifacts saved to {raptor_dir / 'pim'}")
                 reporter.advance()
@@ -383,7 +388,7 @@ def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
             pim_pass_timings = compile_with_raptor(
                 network_mlir_path, raptor_path, pim_output_base, crossbar_size, crossbar_count,
                 core_count=core_count, pim_merge_scheduler=pim_merge_scheduler,
-                pim_memory_report=pim_memory_report,
+                pim_memory_report=pim_memory_report, raptor_extra_args=raptor_extra_args,
                 cwd=raptor_dir, verbose=verbose, reporter=reporter, timeout_sec=command_timeout_seconds)
             print_info(reporter, f"PIM artifacts saved to {raptor_dir / 'pim'}")
             reporter.advance()
@@ -403,7 +408,7 @@ def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
         print_stage(reporter, model_index, model_total, network_onnx_path.name, "Compare Outputs")
         sim_arrays = parse_pim_simulator_outputs(output_bin_path, outputs_descriptor)
         reporter.suspend()
-        passed = validate_outputs(sim_arrays, out_dir, outputs_descriptor, threshold, verbose=verbose)
+        passed = validate_outputs(sim_arrays, out_dir, outputs_descriptor, threshold, rtol=rtol, verbose=verbose)
         reporter.resume()
         reporter.advance()
         reporter.record_result(passed)