add peft scheduling
Validate Operations / validate-operations (push) Has been cancelled

better deadlock report by pim simulator
This commit is contained in:
NiccoloN
2026-05-18 12:09:27 +02:00
parent de0a2f4561
commit f1602c0550
26 changed files with 1215 additions and 113 deletions
+4 -1
View File
@@ -41,7 +41,8 @@ def _format_command(cmd):
def compile_with_raptor(network_path, raptor_onnx_path: Path, output_base: Path,
crossbar_size, crossbar_count, core_count=None, cwd=None, verbose=False, reporter=None):
crossbar_size, crossbar_count, core_count=None, pim_merge_scheduler="peft",
cwd=None, verbose=False, reporter=None, timeout_sec=None):
# Define the arguments, with the possibility to set crossbar size and count
args = [
network_path,
@@ -51,6 +52,7 @@ def compile_with_raptor(network_path, raptor_onnx_path: Path, output_base: Path,
"--EmitPimCodegen",
f"--crossbar-size={crossbar_size}",
f"--crossbar-count={crossbar_count}",
f"--pim-merge-scheduler={pim_merge_scheduler}",
]
if core_count is not None:
args.append(f"--core-count={core_count}")
@@ -69,6 +71,7 @@ def compile_with_raptor(network_path, raptor_onnx_path: Path, output_base: Path,
cwd=cwd,
reporter=reporter,
capture_output=True,
timeout_sec=timeout_sec,
)
if reporter is None:
print(Fore.GREEN + "Raptor execution successful" + Style.RESET_ALL)
+19 -6
View File
@@ -3,6 +3,7 @@ import os
import pty
import selectors
import subprocess
import time
MAX_ERROR_OUTPUT_BYTES = 8192
@@ -16,16 +17,26 @@ def _read_chunk(fd, treat_eio_as_eof=False):
raise
def _stream_output(fd, process, reporter, treat_eio_as_eof=False, stream_output=True):
def _stream_output(fd, process, reporter, treat_eio_as_eof=False, stream_output=True, timeout_sec=None):
selector = selectors.DefaultSelector()
recent_output = bytearray()
captured_output = bytearray()
deadline = None if timeout_sec is None else time.monotonic() + timeout_sec
try:
selector.register(fd, selectors.EVENT_READ)
while selector.get_map():
for key, _ in selector.select():
select_timeout = None
if deadline is not None:
remaining = deadline - time.monotonic()
if remaining <= 0:
process.kill()
process.wait()
raise subprocess.TimeoutExpired(process.args, timeout_sec, output=bytes(captured_output))
select_timeout = min(1.0, remaining)
for key, _ in selector.select(select_timeout):
data = _read_chunk(key.fileobj, treat_eio_as_eof=treat_eio_as_eof)
if not data:
selector.unregister(key.fileobj)
@@ -53,7 +64,7 @@ def _stream_output(fd, process, reporter, treat_eio_as_eof=False, stream_output=
return bytes(captured_output)
def run_command_with_reporter(cmd, cwd=None, reporter=None, capture_output=False):
def run_command_with_reporter(cmd, cwd=None, reporter=None, capture_output=False, timeout_sec=None):
if reporter is None:
if capture_output:
completed = subprocess.run(
@@ -62,9 +73,10 @@ def run_command_with_reporter(cmd, cwd=None, reporter=None, capture_output=False
check=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=timeout_sec,
)
return completed.stdout.decode("utf-8", errors="replace")
subprocess.run(cmd, cwd=cwd, check=True)
subprocess.run(cmd, cwd=cwd, check=True, timeout=timeout_sec)
return None
stream_output = bool(getattr(reporter, "verbose", False))
@@ -74,6 +86,7 @@ def run_command_with_reporter(cmd, cwd=None, reporter=None, capture_output=False
cwd=cwd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=timeout_sec,
)
if completed.returncode != 0:
raise subprocess.CalledProcessError(completed.returncode, completed.args, output=completed.stdout)
@@ -89,7 +102,7 @@ def run_command_with_reporter(cmd, cwd=None, reporter=None, capture_output=False
stderr=subprocess.STDOUT,
)
assert process.stdout is not None
output = _stream_output(process.stdout.fileno(), process, reporter)
output = _stream_output(process.stdout.fileno(), process, reporter, timeout_sec=timeout_sec)
return output.decode("utf-8", errors="replace") if capture_output else None
try:
@@ -102,5 +115,5 @@ def run_command_with_reporter(cmd, cwd=None, reporter=None, capture_output=False
finally:
os.close(slave_fd)
output = _stream_output(master_fd, process, reporter, treat_eio_as_eof=True)
output = _stream_output(master_fd, process, reporter, treat_eio_as_eof=True, timeout_sec=timeout_sec)
return output.decode("utf-8", errors="replace") if capture_output else None
+9 -1
View File
@@ -64,7 +64,11 @@ def main():
ap.add_argument("--crossbar-size", type=int, default=64)
ap.add_argument("--crossbar-count", type=int, default=8)
ap.add_argument("--core-count", type=int, default=None,
help="Core count to pass to Raptor. If omitted, Raptor uses its default.")
help="Core count to pass to Raptor. Required for PIM validation.")
ap.add_argument("--pim-merge-scheduler", choices=("peft", "dcp"), default="peft",
help="Scheduler used by the Spatial merge-compute-nodes pass.")
ap.add_argument("--command-timeout-seconds", type=float, default=60.0,
help="Per-subprocess timeout in seconds for compiler, runner, and simulator commands.")
ap.add_argument("--clean", action="store_true",
help="Remove generated validation artifacts under each model workspace and exit.")
ap.add_argument("--verbose", action="store_true",
@@ -98,6 +102,8 @@ def main():
missing_args.append("--raptor-path")
if not a.onnx_include_dir:
missing_args.append("--onnx-include-dir")
if a.core_count is None:
missing_args.append("--core-count")
if missing_args:
ap.error("the following arguments are required unless --clean is used: " + ", ".join(missing_args))
@@ -117,6 +123,8 @@ def main():
result = validate_network(
onnx_path, a.raptor_path, a.onnx_include_dir, simulator_dir,
crossbar_size=a.crossbar_size, crossbar_count=a.crossbar_count, core_count=a.core_count,
pim_merge_scheduler=a.pim_merge_scheduler,
command_timeout_seconds=a.command_timeout_seconds,
threshold=a.threshold,
seed=a.seed,
reporter=reporter,
+24 -16
View File
@@ -142,8 +142,8 @@ class ProgressReporter:
self.rendered_width = 0
# NOTE(review): this span is a flattened diff hunk, not runnable code. The
# first def/body pair appears to be the pre-change text and the second pair
# the post-change text; indentation was lost in extraction.
#
# run_command forwards cmd, cwd and reporter to run_command_with_reporter.
# There is no return statement, so the helper's result is discarded and the
# function returns None.
def run_command(cmd, cwd=None, reporter=None):
run_command_with_reporter(cmd, cwd=cwd, reporter=reporter)
# Post-change form: adds timeout_sec (default None) and passes it through to
# run_command_with_reporter unchanged.
def run_command(cmd, cwd=None, reporter=None, timeout_sec=None):
run_command_with_reporter(cmd, cwd=cwd, reporter=reporter, timeout_sec=timeout_sec)
def clean_workspace_artifacts(workspace_dir, model_stem):
@@ -186,21 +186,22 @@ def print_info(reporter, message):
reporter.log(f" {message}")
# NOTE(review): flattened diff hunk. The two def lines appear to be the old
# and new signatures, and the first pair of run_command calls appears to be
# the old text superseded by the timeout-aware calls that follow them.
#
# compile_onnx_network runs the raptor executable twice on the same ONNX
# file: once with --EmitONNXIR writing to raptor_dir/<stem>, and once with no
# emit flag writing to runner_dir/<stem>. It returns the tuple
# (network_so_path, network_mlir_path). Both paths are derived by suffix
# only; nothing here checks that raptor actually produced them.
# Assumes network_onnx_path, raptor_dir and runner_dir are pathlib.Path
# objects (uses .stem, "/" and .with_suffix) -- TODO confirm for the dirs.
def compile_onnx_network(network_onnx_path, raptor_path, raptor_dir, runner_dir, reporter=None):
def compile_onnx_network(network_onnx_path, raptor_path, raptor_dir, runner_dir, reporter=None, timeout_sec=None):
stem = network_onnx_path.stem
onnx_ir_base = raptor_dir / stem
runner_base = runner_dir / stem
run_command([raptor_path, network_onnx_path, "-o", onnx_ir_base, "--EmitONNXIR"], reporter=reporter)
run_command([raptor_path, network_onnx_path, "-o", runner_base], reporter=reporter)
# timeout_sec is applied to each raptor invocation separately, not to the
# pair as a whole.
run_command([raptor_path, network_onnx_path, "-o", onnx_ir_base, "--EmitONNXIR"],
reporter=reporter, timeout_sec=timeout_sec)
run_command([raptor_path, network_onnx_path, "-o", runner_base], reporter=reporter, timeout_sec=timeout_sec)
# NOTE(review): with_suffix replaces an existing final suffix, so a model
# stem that itself contains a dot (e.g. "net.v2") would lose its last
# component here -- confirm model names never contain dots.
network_so_path = runner_base.with_suffix(".so")
network_mlir_path = onnx_ir_base.with_suffix(".onnx.mlir")
# Remove the <stem>.tmp file next to the ONNX IR output if one exists;
# missing_ok=True makes its absence a no-op.
onnx_ir_base.with_suffix(".tmp").unlink(missing_ok=True)
return network_so_path, network_mlir_path
# NOTE(review): flattened diff hunk. The first def and its two run_command
# calls appear to be the pre-change text; the second def and its two calls
# are the post-change text. The trailing return is shared by both versions.
#
# build_onnx_runner configures a CMake project located at source_dir and then
# builds it, running both commands with build_dir as the working directory.
# It returns build_dir / "runner" without checking that the file exists.
def build_onnx_runner(source_dir, build_dir, reporter=None):
run_command(["cmake", source_dir], cwd=build_dir, reporter=reporter)
run_command(["cmake", "--build", ".", "-j"], cwd=build_dir, reporter=reporter)
# Post-change form: timeout_sec (default None) is passed to the configure
# step and to the build step individually, so each step gets the full budget.
def build_onnx_runner(source_dir, build_dir, reporter=None, timeout_sec=None):
run_command(["cmake", source_dir], cwd=build_dir, reporter=reporter, timeout_sec=timeout_sec)
run_command(["cmake", "--build", ".", "-j"], cwd=build_dir, reporter=reporter, timeout_sec=timeout_sec)
return build_dir / "runner"
@@ -214,13 +215,14 @@ def build_dump_ranges(config_path, outputs_descriptor):
return ",".join(ranges)
# NOTE(review): flattened diff hunk. The two def lines appear to be the old
# and new signatures; the only body change is the added timeout_sec keyword.
#
# run_pim_simulator launches the pim-simulator binary through `cargo run`
# with simulator_dir as the working directory. Everything after "--" is
# passed to the simulator itself: pim_dir via -f, output_bin_path via -o and
# dump_ranges via -d. pim_dir and output_bin_path are converted with str();
# dump_ranges is passed as given.
# NOTE(review): `cargo run` builds the package first when needed, so
# timeout_sec presumably has to cover compilation as well as the simulation
# on a cold build -- confirm the default budget is large enough.
def run_pim_simulator(simulator_dir, pim_dir, output_bin_path, dump_ranges, reporter=None):
def run_pim_simulator(simulator_dir, pim_dir, output_bin_path, dump_ranges, reporter=None, timeout_sec=None):
run_command(
["cargo", "run", "--no-default-features", "--release", "--package", "pim-simulator", "--bin", "pim-simulator",
"--",
"-f", str(pim_dir), "-o", str(output_bin_path), "-d", dump_ranges],
cwd=simulator_dir,
reporter=reporter,
timeout_sec=timeout_sec,
)
@@ -267,8 +269,10 @@ def validate_outputs(sim_arrays, runner_out_dir, outputs_descriptor, threshold=1
def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
simulator_dir, crossbar_size=64, crossbar_count=8, core_count=None, threshold=1e-3,
seed=0, reporter=None, model_index=1, model_total=1, verbose=False):
simulator_dir, crossbar_size=64, crossbar_count=8, core_count=None,
pim_merge_scheduler="peft", threshold=1e-3,
seed=0, reporter=None, model_index=1, model_total=1, verbose=False,
command_timeout_seconds=60.0):
network_onnx_path = Path(network_onnx_path).resolve()
raptor_path = Path(raptor_path).resolve()
onnx_include_dir = Path(onnx_include_dir).resolve()
@@ -292,7 +296,8 @@ def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
try:
print_stage(reporter, model_index, model_total, network_onnx_path.name, "Compile ONNX")
network_so_path, network_mlir_path = compile_onnx_network(
network_onnx_path, raptor_path, raptor_dir, runner_dir, reporter=reporter)
network_onnx_path, raptor_path, raptor_dir, runner_dir, reporter=reporter,
timeout_sec=command_timeout_seconds)
print_info(reporter, f"MLIR saved to {network_mlir_path}")
print_info(reporter, f"Shared library saved to {network_so_path}")
reporter.advance()
@@ -300,7 +305,8 @@ def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
print_stage(reporter, model_index, model_total, network_onnx_path.name, "Build Runner")
gen_network_runner(network_onnx_path, network_so_path, onnx_include_dir, out=runner_dir / "runner.c",
verbose=False)
runner_path = build_onnx_runner(runner_dir, runner_build_dir, reporter=reporter)
runner_path = build_onnx_runner(runner_dir, runner_build_dir, reporter=reporter,
timeout_sec=command_timeout_seconds)
print_info(reporter, f"Runner built at {runner_path}")
reporter.advance()
@@ -316,14 +322,15 @@ def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
Path.mkdir(out_dir, exist_ok=True)
run_cmd = [runner_path, *flags]
run_cmd += ["--save-csv-dir", f"{out_dir}"]
run_command(run_cmd, cwd=runner_build_dir, reporter=reporter)
run_command(run_cmd, cwd=runner_build_dir, reporter=reporter, timeout_sec=command_timeout_seconds)
print_info(reporter, f"Reference outputs saved to {out_dir}")
reporter.advance()
print_stage(reporter, model_index, model_total, network_onnx_path.name, "Compile PIM")
pim_pass_timings = compile_with_raptor(
network_mlir_path, raptor_path, raptor_dir / network_onnx_path.stem, crossbar_size, crossbar_count,
core_count=core_count, cwd=raptor_dir, verbose=verbose, reporter=reporter)
core_count=core_count, pim_merge_scheduler=pim_merge_scheduler,
cwd=raptor_dir, verbose=verbose, reporter=reporter, timeout_sec=command_timeout_seconds)
print_info(reporter, f"PIM artifacts saved to {raptor_dir / 'pim'}")
reporter.advance()
@@ -334,7 +341,8 @@ def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
Path.mkdir(simulation_dir, exist_ok=True)
dump_ranges = build_dump_ranges(pim_dir / "config.json", outputs_descriptor)
output_bin_path = simulation_dir / "out.bin"
run_pim_simulator(simulator_dir, pim_dir, output_bin_path, dump_ranges, reporter=reporter)
run_pim_simulator(simulator_dir, pim_dir, output_bin_path, dump_ranges, reporter=reporter,
timeout_sec=command_timeout_seconds)
print_info(reporter, f"Simulator output saved to {output_bin_path}")
reporter.advance()