add peft scheduling
Validate Operations / validate-operations (push) Has been cancelled

Better deadlock reporting by the PIM simulator
This commit is contained in:
NiccoloN
2026-05-18 12:09:27 +02:00
parent de0a2f4561
commit f1602c0550
26 changed files with 1215 additions and 113 deletions
+24 -16
View File
@@ -142,8 +142,8 @@ class ProgressReporter:
self.rendered_width = 0
def run_command(cmd, cwd=None, reporter=None):
run_command_with_reporter(cmd, cwd=cwd, reporter=reporter)
def run_command(cmd, cwd=None, reporter=None, timeout_sec=None):
run_command_with_reporter(cmd, cwd=cwd, reporter=reporter, timeout_sec=timeout_sec)
def clean_workspace_artifacts(workspace_dir, model_stem):
@@ -186,21 +186,22 @@ def print_info(reporter, message):
reporter.log(f" {message}")
def compile_onnx_network(network_onnx_path, raptor_path, raptor_dir, runner_dir, reporter=None):
def compile_onnx_network(network_onnx_path, raptor_path, raptor_dir, runner_dir, reporter=None, timeout_sec=None):
stem = network_onnx_path.stem
onnx_ir_base = raptor_dir / stem
runner_base = runner_dir / stem
run_command([raptor_path, network_onnx_path, "-o", onnx_ir_base, "--EmitONNXIR"], reporter=reporter)
run_command([raptor_path, network_onnx_path, "-o", runner_base], reporter=reporter)
run_command([raptor_path, network_onnx_path, "-o", onnx_ir_base, "--EmitONNXIR"],
reporter=reporter, timeout_sec=timeout_sec)
run_command([raptor_path, network_onnx_path, "-o", runner_base], reporter=reporter, timeout_sec=timeout_sec)
network_so_path = runner_base.with_suffix(".so")
network_mlir_path = onnx_ir_base.with_suffix(".onnx.mlir")
onnx_ir_base.with_suffix(".tmp").unlink(missing_ok=True)
return network_so_path, network_mlir_path
def build_onnx_runner(source_dir, build_dir, reporter=None):
run_command(["cmake", source_dir], cwd=build_dir, reporter=reporter)
run_command(["cmake", "--build", ".", "-j"], cwd=build_dir, reporter=reporter)
def build_onnx_runner(source_dir, build_dir, reporter=None, timeout_sec=None):
run_command(["cmake", source_dir], cwd=build_dir, reporter=reporter, timeout_sec=timeout_sec)
run_command(["cmake", "--build", ".", "-j"], cwd=build_dir, reporter=reporter, timeout_sec=timeout_sec)
return build_dir / "runner"
@@ -214,13 +215,14 @@ def build_dump_ranges(config_path, outputs_descriptor):
return ",".join(ranges)
def run_pim_simulator(simulator_dir, pim_dir, output_bin_path, dump_ranges, reporter=None):
def run_pim_simulator(simulator_dir, pim_dir, output_bin_path, dump_ranges, reporter=None, timeout_sec=None):
run_command(
["cargo", "run", "--no-default-features", "--release", "--package", "pim-simulator", "--bin", "pim-simulator",
"--",
"-f", str(pim_dir), "-o", str(output_bin_path), "-d", dump_ranges],
cwd=simulator_dir,
reporter=reporter,
timeout_sec=timeout_sec,
)
@@ -267,8 +269,10 @@ def validate_outputs(sim_arrays, runner_out_dir, outputs_descriptor, threshold=1
def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
simulator_dir, crossbar_size=64, crossbar_count=8, core_count=None, threshold=1e-3,
seed=0, reporter=None, model_index=1, model_total=1, verbose=False):
simulator_dir, crossbar_size=64, crossbar_count=8, core_count=None,
pim_merge_scheduler="peft", threshold=1e-3,
seed=0, reporter=None, model_index=1, model_total=1, verbose=False,
command_timeout_seconds=60.0):
network_onnx_path = Path(network_onnx_path).resolve()
raptor_path = Path(raptor_path).resolve()
onnx_include_dir = Path(onnx_include_dir).resolve()
@@ -292,7 +296,8 @@ def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
try:
print_stage(reporter, model_index, model_total, network_onnx_path.name, "Compile ONNX")
network_so_path, network_mlir_path = compile_onnx_network(
network_onnx_path, raptor_path, raptor_dir, runner_dir, reporter=reporter)
network_onnx_path, raptor_path, raptor_dir, runner_dir, reporter=reporter,
timeout_sec=command_timeout_seconds)
print_info(reporter, f"MLIR saved to {network_mlir_path}")
print_info(reporter, f"Shared library saved to {network_so_path}")
reporter.advance()
@@ -300,7 +305,8 @@ def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
print_stage(reporter, model_index, model_total, network_onnx_path.name, "Build Runner")
gen_network_runner(network_onnx_path, network_so_path, onnx_include_dir, out=runner_dir / "runner.c",
verbose=False)
runner_path = build_onnx_runner(runner_dir, runner_build_dir, reporter=reporter)
runner_path = build_onnx_runner(runner_dir, runner_build_dir, reporter=reporter,
timeout_sec=command_timeout_seconds)
print_info(reporter, f"Runner built at {runner_path}")
reporter.advance()
@@ -316,14 +322,15 @@ def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
Path.mkdir(out_dir, exist_ok=True)
run_cmd = [runner_path, *flags]
run_cmd += ["--save-csv-dir", f"{out_dir}"]
run_command(run_cmd, cwd=runner_build_dir, reporter=reporter)
run_command(run_cmd, cwd=runner_build_dir, reporter=reporter, timeout_sec=command_timeout_seconds)
print_info(reporter, f"Reference outputs saved to {out_dir}")
reporter.advance()
print_stage(reporter, model_index, model_total, network_onnx_path.name, "Compile PIM")
pim_pass_timings = compile_with_raptor(
network_mlir_path, raptor_path, raptor_dir / network_onnx_path.stem, crossbar_size, crossbar_count,
core_count=core_count, cwd=raptor_dir, verbose=verbose, reporter=reporter)
core_count=core_count, pim_merge_scheduler=pim_merge_scheduler,
cwd=raptor_dir, verbose=verbose, reporter=reporter, timeout_sec=command_timeout_seconds)
print_info(reporter, f"PIM artifacts saved to {raptor_dir / 'pim'}")
reporter.advance()
@@ -334,7 +341,8 @@ def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
Path.mkdir(simulation_dir, exist_ok=True)
dump_ranges = build_dump_ranges(pim_dir / "config.json", outputs_descriptor)
output_bin_path = simulation_dir / "out.bin"
run_pim_simulator(simulator_dir, pim_dir, output_bin_path, dump_ranges, reporter=reporter)
run_pim_simulator(simulator_dir, pim_dir, output_bin_path, dump_ranges, reporter=reporter,
timeout_sec=command_timeout_seconds)
print_info(reporter, f"Simulator output saved to {output_bin_path}")
reporter.advance()