E' ancora tutto rotto

2026-06-25 16:24:14 +02:00
parent 62dd40ee89
commit be0bcc9dcc
10 changed files with 20197 additions and 2863 deletions
@@ -0,0 +1,134 @@
+# the name by which the project can be referenced within Serena
+project_name: raptor
+
+# list of languages for which language servers are started; choose from:
+#   al                  angular             ansible             bash                clojure
+#   cpp                 cpp_ccls            crystal             csharp              csharp_omnisharp
+#   dart                elixir              elm                 erlang              fortran
+#   fsharp              go                  groovy              haskell             haxe
+#   hlsl                html                java                json                julia
+#   kotlin              lean4               lua                 luau                markdown
+#   matlab              msl                 nix                 ocaml               pascal
+#   perl                php                 php_phpactor        powershell          python
+#   python_jedi         python_ty           r                   rego                ruby
+#   ruby_solargraph     rust                scala               scss                solidity
+#   svelte              swift               systemverilog       terraform           toml
+#   typescript          typescript_vts      vue                 yaml                zig
+#   (This list may be outdated. For the current list, see values of Language enum here:
+#   https://github.com/oraios/serena/blob/main/src/solidlsp/ls_config.py
+#   For some languages, there are alternative language servers, e.g. csharp_omnisharp, ruby_solargraph.)
+# Note:
+#   - For C, use cpp
+#   - For JavaScript, use typescript
+#   - For Angular projects, use angular (subsumes typescript+html; requires `npm install` in the project root)
+#   - For Svelte projects, use svelte (subsumes typescript/javascript for .svelte projects; requires npm)
+#   - For SCSS / Sass / plain CSS, use scss (some-sass-language-server handles all three)
+#   - For Free Pascal/Lazarus, use pascal
+# Special requirements:
+#   Some languages require additional setup/installations.
+#   See here for details: https://oraios.github.io/serena/01-about/020_programming-languages.html#language-servers
+# When using multiple languages, the first language server that supports a given file will be used for that file.
+# The first language is the default language and the respective language server will be used as a fallback.
+# Note that when using the JetBrains backend, language servers are not used and this list is correspondingly ignored.
+languages:
+- cpp
+- rust
+- python
+
+# the encoding used by text files in the project
+# For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings
+encoding: utf-8
+
+# list of additional paths to ignore in this project.
+# Same syntax as gitignore, so you can use * and **.
+# Note: global ignored_paths from serena_config.yml are also applied additively.
+ignored_paths:
+
+# list of mode names that are to be activated by default, overriding the setting in the global configuration.
+# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
+# If the setting is undefined/empty, the default_modes from the global configuration (serena_config.yml) apply.
+# Otherwise, this overrides the setting from the global configuration (serena_config.yml).
+# Therefore, you can set this to [] if you do not want the default modes defined in the global config to apply
+# for this project.
+# This setting can, in turn, be overridden by CLI parameters (--mode).
+# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
+default_modes:
+
+# list of mode names to be activated additionally for this project, e.g. ["query-projects"]
+# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
+# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
+added_modes:
+
+# list of tool names to exclude.
+# This extends the existing exclusions (e.g. from the global configuration)
+# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
+excluded_tools: []
+
+# list of tools to include that would otherwise be disabled (particularly optional tools that are disabled by default).
+# This extends the existing inclusions (e.g. from the global configuration).
+# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
+included_optional_tools: []
+
+# fixed set of tools to use as the base tool set (if non-empty), replacing Serena's default set of tools.
+# This cannot be combined with non-empty excluded_tools or included_optional_tools.
+# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
+fixed_tools: []
+
+# time budget (seconds) per tool call for the retrieval of additional symbol information
+# such as docstrings or parameter information.
+# This overrides the corresponding setting in the global configuration; see the documentation there.
+# If null or missing, use the setting from the global configuration.
+symbol_info_budget:
+
+# The language backend to use for this project.
+# If not set, the global setting from serena_config.yml is used.
+# Valid values: LSP, JetBrains
+# Note: the backend is fixed at startup. If a project with a different backend
+# is activated post-init, an error will be returned.
+language_backend:
+
+# line ending convention to use when writing source files.
+# Possible values: unset (use global setting), "lf", "crlf", or "native" (platform default)
+# This does not affect Serena's own files (e.g. memories and configuration files), which always use native line endings.
+line_ending:
+
+# list of regex patterns which, when matched, mark a memory entry as read-only.
+# Extends the list from the global configuration, merging the two lists.
+read_only_memory_patterns: []
+
+# list of regex patterns for memories to completely ignore.
+# Matching memories will not appear in list_memories or activate_project output
+# and cannot be accessed via read_memory or write_memory.
+# To access ignored memory files, use the read_file tool on the raw file path.
+# Extends the list from the global configuration, merging the two lists.
+# Example: ["_archive/.*", "_episodes/.*"]
+ignored_memory_patterns: []
+
+# advanced configuration option for setting language-server-specific options.
+# Maps the language key to the options.
+# Have a look at the docstring of the constructors of the LS implementations within solidlsp (e.g., for C# or PHP) to see which options are available.
+# No documentation on options means no options are available.
+ls_specific_settings: {}
+
+# list of additional workspace folder paths for cross-package reference support (e.g. in monorepos).
+# Paths can be absolute or relative to the project root.
+# Each folder is registered as an LSP workspace folder, enabling language servers to discover
+# symbols and references across package boundaries.
+# Currently supported for: TypeScript.
+# Example:
+#   additional_workspace_folders:
+#     - ../sibling-package
+#     - ../shared-lib
+additional_workspace_folders: []
+
+# whether the project is in read-only mode
+# If set to true, all editing tools will be disabled and attempts to use them will result in an error
+# Added on 2025-04-18
+read_only: false
+
+# whether to use project's .gitignore files to ignore files
+ignore_all_files_in_gitignore: true
+
+# initial prompt for the project. It will always be given to the LLM upon activating the project
+# (contrary to the memories, which are loaded on demand).
+initial_prompt: ''
@@ -97,11 +97,17 @@ static spatial::SpatReconciliatorOp insertRowStripReconciliator(IRRewriter& rewr
                                              value.getLoc(),
                                              outputType,
                                              value,
+                                              ValueRange {},
                                              rewriter.getStringAttr(kLogicalLayout),
                                              rewriter.getStringAttr(kRowStripLayout),
                                              rewriter.getDenseI64ArrayAttr(offsets),
                                              rewriter.getDenseI64ArrayAttr(sizes),
-                                              rewriter.getStringAttr(kRowStripIndexMap));
+                                              rewriter.getStringAttr(kRowStripIndexMap),
+                                              nullptr,
+                                              nullptr,
+                                              nullptr,
+                                              nullptr,
+                                              nullptr);
 }

 static void materializeDenseUses(IRRewriter& rewriter,
@@ -233,15 +233,21 @@ def SpatReluPlanOp : SpatOp<"relu_plan", []> {
 }

 def SpatReconciliatorOp : SpatOp<"reconciliator", []> {
-  let summary = "Passive logical-to-physical layout selection record";
+  let summary = "Logical-to-physical layout record or explicit fragment assembly";

  let arguments = (ins
    SpatTensor:$input,
+    Variadic<SpatTensor>:$fragments,
    StrAttr:$logicalLayout,
    StrAttr:$physicalLayout,
    DenseI64ArrayAttr:$fragmentOffsets,
    DenseI64ArrayAttr:$fragmentSizes,
-    StrAttr:$indexMap
+    StrAttr:$indexMap,
+    OptionalAttr<StrAttr>:$mode,
+    OptionalAttr<DenseI64ArrayAttr>:$fragmentOperandIndices,
+    OptionalAttr<DenseI64ArrayAttr>:$fragmentStrides,
+    OptionalAttr<StrAttr>:$conflictPolicy,
+    OptionalAttr<StrAttr>:$coveragePolicy
  );

  let results = (outs
@@ -383,7 +383,7 @@ LogicalResult SpatConcatOp::verify() {
 static bool isKnownLogicalLayout(StringRef layout) { return layout == "nchw"; }

 static bool isKnownPhysicalLayout(StringRef layout) {
-  return layout == "dense_nchw" || layout == "nchw_row_strip";
+  return layout == "dense_nchw" || layout == "nchw_row_strip" || layout == "fragmented";
 }

 static LogicalResult verifyPlanTensorTypes(Operation* op, Value input, Value output, StringRef kind) {
@@ -437,7 +437,9 @@ LogicalResult SpatReluPlanOp::verify() {
 }

 LogicalResult SpatReconciliatorOp::verify() {
-  if (failed(verifyPlanTensorTypes(getOperation(), getInput(), getOutput(), "spat.reconciliator")))
+  auto modeAttr = getModeAttr();
+  bool isFragmentAssembly = modeAttr && modeAttr.getValue() == "fragment_assembly";
+  if (!isFragmentAssembly && failed(verifyPlanTensorTypes(getOperation(), getInput(), getOutput(), "spat.reconciliator")))
    return failure();
  if (!isKnownLogicalLayout(getLogicalLayout()))
    return emitError("requires a known logical layout");
@@ -452,23 +454,154 @@ LogicalResult SpatReconciliatorOp::verify() {
  auto sizes = getFragmentSizes();
  if (offsets.size() != sizes.size())
    return emitError("fragment offset and size arrays must have the same length");
+  int64_t rank = logicalType.getRank();
  if (offsets.empty())
    return success();
-
-  int64_t rank = logicalType.getRank();
  if (rank <= 0 || offsets.size() % rank != 0)
    return emitError("fragment metadata must be a whole number of rank-sized fragments");

-  ArrayRef<int64_t> shape = logicalType.getShape();
-  for (int64_t index = 0; index < static_cast<int64_t>(offsets.size()); ++index) {
-    int64_t dim = index % rank;
-    int64_t offset = offsets[index];
-    int64_t size = sizes[index];
-    if (offset < 0 || size < 0)
-      return emitError("fragment offsets and sizes must be non-negative");
-    int64_t logicalDim = shape[dim];
-    if (!ShapedType::isDynamic(logicalDim) && offset + size > logicalDim)
-      return emitError("fragment bounds must stay within the logical tensor shape");
+  auto verifyBoundsOnly = [&](ArrayRef<int64_t> strideValues) -> LogicalResult {
+    ArrayRef<int64_t> shape = logicalType.getShape();
+    for (int64_t index = 0; index < static_cast<int64_t>(offsets.size()); ++index) {
+      int64_t dim = index % rank;
+      int64_t offset = offsets[index];
+      int64_t size = sizes[index];
+      int64_t stride = strideValues.empty() ? 1 : strideValues[index];
+      if (offset < 0 || size < 0 || stride < 0)
+        return emitError("fragment offsets, sizes, and strides must be non-negative");
+      int64_t logicalDim = shape[dim];
+      if (!ShapedType::isDynamic(logicalDim) && offset + size > logicalDim)
+        return emitError("fragment bounds must stay within the logical tensor shape");
+      if (stride != 1)
+        return emitError("fragment assembly currently requires unit strides");
+    }
+    return success();
+  };
+
+  if (!isFragmentAssembly) {
+    if (failed(verifyBoundsOnly({})))
+      return failure();
+    if (!getFragments().empty())
+      return emitError("legacy reconciliator does not accept extra fragment operands");
+    if (getFragmentStridesAttr() || getConflictPolicyAttr() || getCoveragePolicyAttr())
+      return emitError("legacy reconciliator does not accept fragment assembly attributes");
+    return success();
+  }
+
+  auto stridesAttr = getFragmentStridesAttr();
+  auto operandIndicesAttr = getFragmentOperandIndicesAttr();
+  if (!operandIndicesAttr)
+    return emitError("fragment assembly reconciliator requires fragment operand indices");
+  if (!stridesAttr)
+    return emitError("fragment assembly reconciliator requires fragment strides");
+  ArrayRef<int64_t> operandIndices = operandIndicesAttr.asArrayRef();
+  ArrayRef<int64_t> strides = stridesAttr.asArrayRef();
+  if (strides.size() != offsets.size())
+    return emitError("fragment stride and offset arrays must have the same length");
+  if (!getConflictPolicyAttr() || !getCoveragePolicyAttr())
+    return emitError("fragment assembly reconciliator requires conflict and coverage policies");
+  if (getConflictPolicy() != "disjoint")
+    return emitError("fragment assembly reconciliator currently supports only conflict_policy=\"disjoint\"");
+  if (getCoveragePolicy() != "complete" && getCoveragePolicy() != "partial")
+    return emitError("fragment assembly reconciliator coverage_policy must be \"complete\" or \"partial\"");
+
+  SmallVector<Value> operands;
+  operands.push_back(getInput());
+  llvm::append_range(operands, getFragments());
+  int64_t operandCount = static_cast<int64_t>(operands.size());
+  int64_t fragmentCount = static_cast<int64_t>(operandIndices.size());
+  if (operandCount == 0)
+    return emitError("fragment assembly reconciliator requires at least one operand");
+  if (static_cast<int64_t>(offsets.size()) != fragmentCount * rank)
+    return emitError("fragment assembly metadata count must match operand count * result rank");
+  if (failed(verifyBoundsOnly(strides)))
+    return failure();
+
+  SmallVector<std::pair<SmallVector<int64_t, 4>, SmallVector<int64_t, 4>>, 8> slices;
+  slices.reserve(static_cast<size_t>(fragmentCount));
+  SmallVector<SmallVector<SmallVector<int64_t, 4>, 4>, 8> sizesByOperand(static_cast<size_t>(operandCount));
+  for (int64_t fragmentIndex = 0; fragmentIndex < fragmentCount; ++fragmentIndex) {
+    int64_t operandIndex = operandIndices[fragmentIndex];
+    if (operandIndex < 0 || operandIndex >= operandCount)
+      return emitError("fragment assembly operand index is out of range");
+
+    auto operandType = dyn_cast<RankedTensorType>(operands[operandIndex].getType());
+    if (!operandType || !operandType.hasStaticShape())
+      return emitError("fragment assembly reconciliator requires static ranked tensor operands");
+    if (operandType.getRank() != rank)
+      return emitError("fragment assembly reconciliator requires operand/result rank match");
+
+    SmallVector<int64_t, 4> fragmentOffsets;
+    SmallVector<int64_t, 4> fragmentSizes;
+    fragmentOffsets.reserve(rank);
+    fragmentSizes.reserve(rank);
+    for (int64_t dim = 0; dim < rank; ++dim) {
+      int64_t flatIndex = fragmentIndex * rank + dim;
+      fragmentOffsets.push_back(offsets[flatIndex]);
+      fragmentSizes.push_back(sizes[flatIndex]);
+    }
+
+    sizesByOperand[static_cast<size_t>(operandIndex)].push_back(fragmentSizes);
+
+    for (const auto& [existingOffsets, existingSizes] : slices) {
+      bool overlaps = true;
+      for (int64_t dim = 0; dim < rank; ++dim) {
+        int64_t begin = fragmentOffsets[dim];
+        int64_t end = begin + fragmentSizes[dim];
+        int64_t existingBegin = existingOffsets[dim];
+        int64_t existingEnd = existingBegin + existingSizes[dim];
+        if (end <= existingBegin || existingEnd <= begin) {
+          overlaps = false;
+          break;
+        }
+      }
+      if (overlaps)
+        return emitError("fragment assembly reconciliator requires disjoint static slices");
+    }
+    slices.push_back({std::move(fragmentOffsets), std::move(fragmentSizes)});
+  }
+
+  for (int64_t operandIndex = 0; operandIndex < operandCount; ++operandIndex) {
+    if (sizesByOperand[static_cast<size_t>(operandIndex)].empty())
+      return emitError("fragment assembly reconciliator requires every operand to contribute at least one fragment");
+
+    auto operandType = cast<RankedTensorType>(operands[operandIndex].getType());
+    ArrayRef<int64_t> operandShape = operandType.getShape();
+    auto& fragmentShapes = sizesByOperand[static_cast<size_t>(operandIndex)];
+    if (fragmentShapes.size() == 1) {
+      if (!llvm::equal(operandShape, fragmentShapes.front()))
+        return emitError("single-fragment reconciliator operand shape must match declared fragment size");
+      continue;
+    }
+
+    ArrayRef<int64_t> fragmentShape = fragmentShapes.front();
+    for (ArrayRef<int64_t> otherShape : fragmentShapes)
+      if (!llvm::equal(fragmentShape, otherShape))
+        return emitError("packed reconciliator operand requires equal fragment sizes per operand");
+    if (llvm::equal(operandShape, fragmentShape))
+      continue;
+    if (!llvm::equal(operandShape.drop_front(), fragmentShape.drop_front()))
+      return emitError("packed reconciliator operand must match fragment shape on non-packed dimensions");
+    if (operandShape.front() != static_cast<int64_t>(fragmentShapes.size()) * fragmentShape.front())
+      return emitError("packed reconciliator operand first dimension must equal fragment_count * fragment_size");
+  }
+
+  if (getCoveragePolicy() == "complete") {
+    int64_t covered = 0;
+    int64_t logicalElements = 1;
+    for (int64_t dimSize : logicalType.getShape()) {
+      if (ShapedType::isDynamic(dimSize))
+        return emitError("fragment assembly complete coverage requires static result shape");
+      logicalElements *= dimSize;
+    }
+    for (const auto& [ignoredOffsets, fragmentSizes] : slices) {
+      int64_t fragmentElements = 1;
+      for (int64_t dimSize : fragmentSizes)
+        fragmentElements *= dimSize;
+      covered += fragmentElements;
+    }
+    if (covered != logicalElements)
+      return emitError("fragment assembly complete coverage must cover the whole result exactly");
  }

  return success();
@@ -0,0 +1,128 @@
+--- src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/MaterializeMergeSchedule.cpp	2026-06-24 18:51:29.043731129 +0000
+++ src/PIM/Dialect/Spatial/Transforms/MergeComputeNodes/MaterializeMergeSchedule.cpp	2026-06-24 18:51:29.026726895 +0000
+@@ -4112,104 +4112,8 @@
+                                                                 Value originalOutput,
+                                                                 Location loc);
+ 
+-FailureOr<SmallVector<OpFoldResult, 4>> rematerializeProjectionIndexListForBatchHostOutput(
+-  MaterializerState& state,
+-  MaterializedClass& sourceClass,
+-  ArrayRef<OpFoldResult> values,
+-  IRMapping& mapper,
+-  Location loc) {
+-  SmallVector<OpFoldResult, 4> localized;
+-  localized.reserve(values.size());
+-  for (OpFoldResult value : values) {
+-    FailureOr<OpFoldResult> remapped =
+-      rematerializeIndexOpFoldResultInClass(state, sourceClass, value, loc, &mapper);
+-    if (failed(remapped))
+-      return failure();
+-    localized.push_back(*remapped);
+-  }
+-  return localized;
+-}
+-
+-LogicalResult createProjectionAwareBatchHostInsert(MaterializerState& state,
+-                                                  MaterializedClass& sourceClass,
+-                                                  Value originalOutput,
+-                                                  Value payload,
+-                                                  Value destination,
+-                                                  ArrayRef<ProducerKey> keys,
+-                                                  Location loc) {
+-  auto originalResult = dyn_cast<OpResult>(originalOutput);
+-  if (!originalResult)
+-    return failure();
+-
+-  auto sourceBatch = dyn_cast_or_null<SpatComputeBatch>(originalResult.getOwner());
+-  if (!sourceBatch || sourceBatch.getNumResults() == 0)
+-    return failure();
+-
+-  FailureOr<tensor::ParallelInsertSliceOp> projection =
+-    getBatchResultProjectionInsert(sourceBatch, originalResult.getResultNumber());
+-  if (failed(projection))
+-    return failure();
+-
+-  auto sourceLaneArg = sourceBatch.getLaneArgument();
+-  if (!sourceLaneArg)
+-    return failure();
+-
+-  auto materializedBatch = dyn_cast<SpatScheduledComputeBatch>(sourceClass.op);
+-  if (!materializedBatch)
+-    return failure();
+-
+-  auto materializedLaneArg = materializedBatch.getLaneArgument();
+-  if (!materializedLaneArg)
+-    return failure();
+-
+-  if (keys.size() != sourceClass.cpus.size())
+-    return failure();
+-
+-  SmallVector<int64_t, 8> logicalLanes;
+-  logicalLanes.reserve(keys.size());
+-  for (ProducerKey key : keys) {
+-    if (key.instance.op != sourceBatch.getOperation() || key.resultIndex != originalResult.getResultNumber())
+-      return failure();
+-    logicalLanes.push_back(key.instance.laneStart);
+-  }
+-
+-  IRMapping mapper;
+-  Value logicalLane = createIndexedIndexValue(state,
+-                                             sourceClass.op,
+-                                             ArrayRef<int64_t>(logicalLanes),
+-                                             *materializedLaneArg,
+-                                             loc,
+-                                             static_cast<int64_t>(sourceClass.cpus.size()),
+-                                             /*allowExhaustiveTiledSearch=*/false);
+-  mapper.map(*sourceLaneArg, logicalLane);
+-
+-  FailureOr<SmallVector<OpFoldResult, 4>> offsets =
+-    rematerializeProjectionIndexListForBatchHostOutput(
+-      state, sourceClass, projection->getMixedOffsets(), mapper, loc);
+-  if (failed(offsets))
+-    return failure();
+-  FailureOr<SmallVector<OpFoldResult, 4>> sizes =
+-    rematerializeProjectionIndexListForBatchHostOutput(
+-      state, sourceClass, projection->getMixedSizes(), mapper, loc);
+-  if (failed(sizes))
+-    return failure();
+-  FailureOr<SmallVector<OpFoldResult, 4>> strides =
+-    rematerializeProjectionIndexListForBatchHostOutput(
+-      state, sourceClass, projection->getMixedStrides(), mapper, loc);
+-  if (failed(strides))
+-    return failure();
+-
+-  tensor::ParallelInsertSliceOp::create(
+-    state.rewriter, loc, payload, destination, *offsets, *sizes, *strides);
+-  return success();
+-}
+-
+ LogicalResult
+-setHostOutputValue(MaterializerState& state,
+-                   MaterializedClass& sourceClass,
+-                   Value originalOutput,
+-                   Value payload,
+-                   ArrayRef<ProducerKey> keys = {}) {
+setHostOutputValue(MaterializerState& state, MaterializedClass& sourceClass, Value originalOutput, Value payload) {
+   auto resultIt = sourceClass.hostOutputToResultIndex.find(originalOutput);
+   if (resultIt == sourceClass.hostOutputToResultIndex.end())
+     return sourceClass.op->emitError("missing host result slot for materialized output")
+@@ -4253,10 +4157,6 @@
+     return batch.emitOpError("expected compute_batch output block argument while materializing batch output");
+ 
+   state.rewriter.setInsertionPointToStart(&inParallelOp.getRegion().front());
+-  if (succeeded(createProjectionAwareBatchHostInsert(
+-        state, sourceClass, originalOutput, payload, *outputArg, keys, payload.getLoc())))
+-    return success();
+-
+   createDim0ParallelInsertSlice(state, payload.getLoc(), payload, *outputArg, *laneArg);
+   return success();
+ }
+@@ -4276,7 +4176,7 @@
+ 
+   MaterializedClass& ownerClass = state.classes[ownerIt->second];
+   if (sourceClass.id == ownerClass.id)
+-    return setHostOutputValue(state, ownerClass, originalOutput, payload, keys);
+    return setHostOutputValue(state, ownerClass, originalOutput, payload);
+ 
+   // Keep the old deadlock-free communication discipline: only scalar-to-scalar
+   // host-owner forwarding is introduced here. Batch host publication remains on
@@ -0,0 +1,295 @@
+#!/usr/bin/env python3.13
+
+import argparse
+import math
+import subprocess
+import sys
+from pathlib import Path
+
+import numpy as np
+from PIL import Image, ImageDraw
+
+# Resolve the directory layout relative to this script's own location.
+SCRIPT_DIR = Path(__file__).resolve().parent
+VALIDATION_DIR = SCRIPT_DIR.parent
+REPO_ROOT = VALIDATION_DIR.parent
+# Make VALIDATION_DIR importable; presumably the local modules imported
+# right after this block live there (verify against the repository layout).
+if str(VALIDATION_DIR) not in sys.path:
+    sys.path.insert(0, str(VALIDATION_DIR))
+
+from onnx_utils import _ONNX_TO_NP, onnx_io, write_inputs_to_memory_bin
+from validate_one import (
+    MODE_COMPILE_ONLY,
+    build_dump_ranges,
+    parse_pim_simulator_outputs,
+    run_pim_simulator,
+    sanitize_output_name,
+    validate_network,
+)
+from yolo_real_image_validation import save_tensor_csv
+
+# Per-channel normalization constants: preprocess_classification_image subtracts
+# the mean from, and divides by the std of, RGB pixels already scaled to [0, 1].
+IMAGENET_MEAN = np.asarray([0.485, 0.456, 0.406], dtype=np.float32)
+IMAGENET_STD = np.asarray([0.229, 0.224, 0.225], dtype=np.float32)
+# Model files resolve_model_path falls back to when no explicit model path is given.
+DEFAULT_VGG_MODEL = VALIDATION_DIR / "networks" / "vgg16" / "depth_35" / "vgg16_depth_35.onnx"
+DEFAULT_RESNET_MODEL = VALIDATION_DIR / "networks" / "resnet" / "resnet18_torchvision.onnx"
+
+
+def resolve_default_paths():
+    return {
+        "raptor_path": REPO_ROOT / "build_release" / "Release" / "bin" / "onnx-mlir",
+        "onnx_include_dir": REPO_ROOT / "onnx-mlir" / "include",
+        "simulator_dir": REPO_ROOT / "backend-simulators" / "pim" / "pim-simulator",
+    }
+
+
+def resolve_model_path(network: str | None, model: Path | None) -> Path:
+    if model is not None:
+        return model.resolve()
+    if network == "resnet":
+        return DEFAULT_RESNET_MODEL.resolve()
+    if network == "vgg":
+        return DEFAULT_VGG_MODEL.resolve()
+    raise SystemExit("Pass --model or select a default with --network {resnet,vgg}.")
+
+
+def ensure_local_artifacts(args, model_path: Path):
+    validate_network(
+        network_onnx_path=model_path,
+        raptor_path=args.raptor_path,
+        onnx_include_dir=args.onnx_include_dir,
+        simulator_dir=args.simulator_dir,
+        crossbar_size=args.crossbar_size,
+        crossbar_count=args.crossbar_count,
+        core_count=args.core_count,
+        command_timeout_seconds=args.command_timeout_seconds,
+        mode=MODE_COMPILE_ONLY,
+        verbose=args.verbose,
+    )
+
+
+def ensure_existing_artifacts(model_dir: Path):
+    required_paths = [
+        model_dir / "runner" / "build" / "runner",
+        model_dir / "raptor" / "pim" / "config.json",
+        model_dir / "raptor" / "pim" / "memory.bin",
+    ]
+    missing = [str(path) for path in required_paths if not path.exists()]
+    if missing:
+        raise FileNotFoundError(
+            "Missing compiled local artifacts. Re-run without --skip-compile or restore these paths:\n  "
+            + "\n  ".join(missing)
+        )
+
+
+def preprocess_classification_image(image_path: Path) -> tuple[Image.Image, np.ndarray]:
+    image = Image.open(image_path).convert("RGB")
+    width, height = image.size
+    scale = 256.0 / min(width, height)
+    resized_size = (
+        max(1, int(round(width * scale))),
+        max(1, int(round(height * scale))),
+    )
+    resized = image.resize(resized_size, Image.Resampling.BILINEAR)
+
+    left = (resized.width - 224) // 2
+    top = (resized.height - 224) // 2
+    cropped = resized.crop((left, top, left + 224, top + 224))
+
+    array = np.asarray(cropped, dtype=np.float32) / 255.0
+    array = (array - IMAGENET_MEAN) / IMAGENET_STD
+    chw = np.transpose(array, (2, 0, 1))
+    tensor = np.expand_dims(chw.astype(np.float32, copy=False), axis=0)
+    return image, tensor
+
+
+def load_labels(labels_path: Path | None) -> list[str] | None:
+    if labels_path is None:
+        return None
+    labels = [line.strip() for line in labels_path.read_text().splitlines()]
+    return labels or None
+
+
+def softmax(values: np.ndarray) -> np.ndarray:
+    shifted = values - np.max(values)
+    exp = np.exp(shifted)
+    denom = exp.sum()
+    if not math.isfinite(float(denom)) or denom <= 0.0:
+        raise RuntimeError("Softmax received non-finite output scores.")
+    return exp / denom
+
+
+def decode_classification_output(output: np.ndarray, labels: list[str] | None, top_k: int):
+    scores = np.asarray(output, dtype=np.float64).reshape(-1)
+    probabilities = softmax(scores)
+    limit = min(top_k, probabilities.size)
+    top_indices = np.argsort(probabilities)[-limit:][::-1]
+    results = []
+    for index in top_indices:
+        label = None
+        if labels is not None and 0 <= int(index) < len(labels):
+            label = labels[int(index)]
+        results.append(
+            {
+                "index": int(index),
+                "label": label,
+                "probability": float(probabilities[int(index)]),
+            }
+        )
+    return results
+
+
+def render_result_line(result) -> str:
+    name = result["label"] if result["label"] else f'class {result["index"]}'
+    return f'{name}: {result["probability"] * 100.0:.2f}%'
+
+
+def draw_classification_panel(image: Image.Image, results, output_path: Path):
+    annotated = image.copy()
+    draw = ImageDraw.Draw(annotated)
+    lines = [render_result_line(result) for result in results]
+    if not lines:
+        lines = ["No predictions"]
+
+    padding = 10
+    line_gap = 4
+    max_width = 0
+    line_heights = []
+    for line in lines:
+        left, top, right, bottom = draw.textbbox((0, 0), line)
+        max_width = max(max_width, right - left)
+        line_heights.append(bottom - top)
+
+    panel_height = padding * 2 + sum(line_heights) + line_gap * (len(lines) - 1)
+    panel_width = padding * 2 + max_width
+    origin_x = 12
+    origin_y = 12
+    draw.rounded_rectangle(
+        (origin_x, origin_y, origin_x + panel_width, origin_y + panel_height),
+        radius=10,
+        fill=(0, 0, 0),
+    )
+
+    y = origin_y + padding
+    for line, line_height in zip(lines, line_heights):
+        draw.text((origin_x + padding, y), line, fill=(255, 255, 255))
+        y += line_height + line_gap
+
+    annotated.save(output_path)
+
+
+def run_reference_and_simulator(args, model_path: Path, tensor: np.ndarray):
+    """Run the reference runner and the PIM simulator on one input tensor.
+
+    Working directories ``classification_demo/{simulation,reference,inputs}``
+    are created next to the model file.
+
+    Args:
+        args: Parsed options; ``simulator_dir`` and ``command_timeout_seconds``
+            are read here.
+        model_path: ONNX model file; the runner and PIM artifacts are looked up
+            relative to its parent directory.
+        tensor: Input tensor whose shape must equal the model's input shape.
+
+    Returns:
+        A tuple ``(reference_output, simulator_output)``.
+
+    Raises:
+        RuntimeError: If the model does not have exactly one input and one
+            output, or if the tensor shape does not match the model input.
+        subprocess.CalledProcessError: If the runner exits with a non-zero
+            status (``check=True``).
+    """
+    model_dir = model_path.parent
+    runner_build_dir = model_dir / "runner" / "build"
+    runner_path = runner_build_dir / "runner"
+    pim_dir = model_dir / "raptor" / "pim"
+    simulation_dir = model_dir / "classification_demo" / "simulation"
+    reference_dir = model_dir / "classification_demo" / "reference"
+    inputs_dir = model_dir / "classification_demo" / "inputs"
+
+    simulation_dir.mkdir(parents=True, exist_ok=True)
+    reference_dir.mkdir(parents=True, exist_ok=True)
+    inputs_dir.mkdir(parents=True, exist_ok=True)
+
+    # Only single-input / single-output models are supported here.
+    input_descriptors, output_descriptors = onnx_io(model_path)
+    if len(input_descriptors) != 1:
+        raise RuntimeError(f"Expected one classification input tensor, found {len(input_descriptors)}")
+    if len(output_descriptors) != 1:
+        raise RuntimeError(f"Expected one classification output tensor, found {len(output_descriptors)}")
+
+    input_index, _input_name, _input_dtype, input_shape = input_descriptors[0]
+    if list(tensor.shape) != list(input_shape):
+        raise RuntimeError(f"Preprocessed tensor shape {list(tensor.shape)} does not match model input {input_shape}")
+
+    input_csv = inputs_dir / "in0.csv"
+    save_tensor_csv(tensor, input_csv)
+
+    # Reference run: the runner reads the CSV input and is told to save its
+    # outputs as CSV files into reference_dir.
+    runner_cmd = [
+        str(runner_path),
+        f"--in{input_index}-csv-file",
+        str(input_csv),
+        f"--in{input_index}-shape",
+        "x".join(str(dim) for dim in tensor.shape),
+        "--save-csv-dir",
+        str(reference_dir),
+    ]
+    subprocess.run(runner_cmd, cwd=runner_build_dir, check=True)
+
+    # Simulator run: presumably the input is patched into memory.bin and the
+    # dump ranges select the output region -- confirm against the helpers.
+    write_inputs_to_memory_bin(pim_dir / "memory.bin", pim_dir / "config.json", [tensor])
+    dump_ranges = build_dump_ranges(pim_dir / "config.json", output_descriptors)
+    output_bin_path = simulation_dir / "out.bin"
+    run_pim_simulator(
+        args.simulator_dir,
+        pim_dir,
+        output_bin_path,
+        dump_ranges,
+        timeout_sec=args.command_timeout_seconds,
+    )
+
+    # Load the reference CSV with the model's output dtype and shape, and take
+    # the first (only) parsed simulator output.
+    output_index, output_name, output_dtype_code, output_shape = output_descriptors[0]
+    output_dtype = np.dtype(_ONNX_TO_NP[output_dtype_code])
+    reference_csv = reference_dir / f"output{output_index}_{sanitize_output_name(output_name)}.csv"
+    reference_output = np.loadtxt(reference_csv, delimiter=",", dtype=output_dtype).reshape(output_shape)
+    simulator_output = parse_pim_simulator_outputs(output_bin_path, output_descriptors)[0]
+    return reference_output, simulator_output
+
+
+def print_topk(title: str, results):
+    """Print ``title`` followed by one ranked line per classification result.
+
+    Each result is a mapping with ``"label"``, ``"index"`` and
+    ``"probability"`` keys. Ranks start at 1, the probability is shown as a
+    percentage with two decimals, and a falsy label is replaced by
+    ``class <index>``.
+    """
+    print(title)
+    for position, entry in enumerate(results, 1):
+        name = entry["label"] or f'class {entry["index"]}'
+        percent = entry["probability"] * 100.0
+        print(f'  {position}. {name} ({percent:.2f}%) [index={entry["index"]}]')
+
+
+def main():
+    """Command-line entry point of the classification demo.
+
+    Parses the arguments, resolves all paths, prepares the model artifacts
+    (compiling them unless ``--skip-compile`` is given), runs the image
+    through the reference runner and the simulator, prints both top-k lists
+    and the maximum absolute score difference, and saves the input image
+    annotated with the simulator's predictions.
+    """
+    defaults = resolve_default_paths()
+
+    parser = argparse.ArgumentParser(description="Run a VGG or ResNet ONNX model through the Raptor simulator and annotate the image with top classification results.")
+    add = parser.add_argument
+    # Model and data selection.
+    add("--model", type=Path, default=None)
+    add("--network", choices=("resnet", "vgg"), default=None)
+    add("--image", type=Path, required=True)
+    add("--labels", type=Path, default=None)
+    add("--output", type=Path, required=True)
+    # Toolchain locations.
+    add("--raptor-path", type=Path, default=defaults["raptor_path"])
+    add("--onnx-include-dir", type=Path, default=defaults["onnx_include_dir"])
+    add("--simulator-dir", type=Path, default=defaults["simulator_dir"])
+    # Hardware configuration and run options.
+    add("--crossbar-size", type=int, default=2048)
+    add("--crossbar-count", type=int, default=256)
+    add("--core-count", type=int, default=1000)
+    add("--top-k", type=int, default=5)
+    add("--command-timeout-seconds", type=float, default=7200.0)
+    add("--skip-compile", action="store_true")
+    add("--verbose", action="store_true")
+    args = parser.parse_args()
+
+    # Normalize every path argument to an absolute path.
+    args.model = resolve_model_path(args.network, args.model)
+    for name in ("image", "output"):
+        setattr(args, name, getattr(args, name).resolve())
+    args.labels = None if not args.labels else args.labels.resolve()
+    for name in ("raptor_path", "onnx_include_dir", "simulator_dir"):
+        setattr(args, name, getattr(args, name).resolve())
+
+    if args.skip_compile:
+        ensure_existing_artifacts(args.model.parent)
+    else:
+        ensure_local_artifacts(args, args.model)
+
+    original_image, tensor = preprocess_classification_image(args.image)
+    labels = load_labels(args.labels)
+    reference_output, simulator_output = run_reference_and_simulator(args, args.model, tensor)
+    reference_results = decode_classification_output(reference_output, labels, args.top_k)
+    simulator_results = decode_classification_output(simulator_output, labels, args.top_k)
+
+    print_topk("Reference top-k:", reference_results)
+    print_topk("Simulator top-k:", simulator_results)
+
+    # Compare the raw scores element-wise in float64 on flattened arrays.
+    flat_reference = np.asarray(reference_output, dtype=np.float64).reshape(-1)
+    flat_simulator = np.asarray(simulator_output, dtype=np.float64).reshape(-1)
+    largest_gap = float(np.abs(flat_reference - flat_simulator).max())
+    print(f"Max absolute score diff: {largest_gap:.6e}")
+
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+    draw_classification_panel(original_image, simulator_results, args.output)
+    print(f"Annotated image saved to {args.output}")
+
+# Run the demo only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()