Better deadlock reporting by the PIM simulator
This commit is contained in:
@@ -41,7 +41,8 @@ def _format_command(cmd):
|
||||
|
||||
|
||||
def compile_with_raptor(network_path, raptor_onnx_path: Path, output_base: Path,
|
||||
crossbar_size, crossbar_count, core_count=None, cwd=None, verbose=False, reporter=None):
|
||||
crossbar_size, crossbar_count, core_count=None, pim_merge_scheduler="peft",
|
||||
cwd=None, verbose=False, reporter=None, timeout_sec=None):
|
||||
# Define the arguments, with the possibility to set crossbar size and count
|
||||
args = [
|
||||
network_path,
|
||||
@@ -51,6 +52,7 @@ def compile_with_raptor(network_path, raptor_onnx_path: Path, output_base: Path,
|
||||
"--EmitPimCodegen",
|
||||
f"--crossbar-size={crossbar_size}",
|
||||
f"--crossbar-count={crossbar_count}",
|
||||
f"--pim-merge-scheduler={pim_merge_scheduler}",
|
||||
]
|
||||
if core_count is not None:
|
||||
args.append(f"--core-count={core_count}")
|
||||
@@ -69,6 +71,7 @@ def compile_with_raptor(network_path, raptor_onnx_path: Path, output_base: Path,
|
||||
cwd=cwd,
|
||||
reporter=reporter,
|
||||
capture_output=True,
|
||||
timeout_sec=timeout_sec,
|
||||
)
|
||||
if reporter is None:
|
||||
print(Fore.GREEN + "Raptor execution successful" + Style.RESET_ALL)
|
||||
|
||||
@@ -3,6 +3,7 @@ import os
|
||||
import pty
|
||||
import selectors
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
MAX_ERROR_OUTPUT_BYTES = 8192
|
||||
|
||||
@@ -16,16 +17,26 @@ def _read_chunk(fd, treat_eio_as_eof=False):
|
||||
raise
|
||||
|
||||
|
||||
def _stream_output(fd, process, reporter, treat_eio_as_eof=False, stream_output=True):
|
||||
def _stream_output(fd, process, reporter, treat_eio_as_eof=False, stream_output=True, timeout_sec=None):
|
||||
selector = selectors.DefaultSelector()
|
||||
recent_output = bytearray()
|
||||
captured_output = bytearray()
|
||||
deadline = None if timeout_sec is None else time.monotonic() + timeout_sec
|
||||
|
||||
try:
|
||||
selector.register(fd, selectors.EVENT_READ)
|
||||
|
||||
while selector.get_map():
|
||||
for key, _ in selector.select():
|
||||
select_timeout = None
|
||||
if deadline is not None:
|
||||
remaining = deadline - time.monotonic()
|
||||
if remaining <= 0:
|
||||
process.kill()
|
||||
process.wait()
|
||||
raise subprocess.TimeoutExpired(process.args, timeout_sec, output=bytes(captured_output))
|
||||
select_timeout = min(1.0, remaining)
|
||||
|
||||
for key, _ in selector.select(select_timeout):
|
||||
data = _read_chunk(key.fileobj, treat_eio_as_eof=treat_eio_as_eof)
|
||||
if not data:
|
||||
selector.unregister(key.fileobj)
|
||||
@@ -53,7 +64,7 @@ def _stream_output(fd, process, reporter, treat_eio_as_eof=False, stream_output=
|
||||
return bytes(captured_output)
|
||||
|
||||
|
||||
def run_command_with_reporter(cmd, cwd=None, reporter=None, capture_output=False):
|
||||
def run_command_with_reporter(cmd, cwd=None, reporter=None, capture_output=False, timeout_sec=None):
|
||||
if reporter is None:
|
||||
if capture_output:
|
||||
completed = subprocess.run(
|
||||
@@ -62,9 +73,10 @@ def run_command_with_reporter(cmd, cwd=None, reporter=None, capture_output=False
|
||||
check=True,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
timeout=timeout_sec,
|
||||
)
|
||||
return completed.stdout.decode("utf-8", errors="replace")
|
||||
subprocess.run(cmd, cwd=cwd, check=True)
|
||||
subprocess.run(cmd, cwd=cwd, check=True, timeout=timeout_sec)
|
||||
return None
|
||||
|
||||
stream_output = bool(getattr(reporter, "verbose", False))
|
||||
@@ -74,6 +86,7 @@ def run_command_with_reporter(cmd, cwd=None, reporter=None, capture_output=False
|
||||
cwd=cwd,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
timeout=timeout_sec,
|
||||
)
|
||||
if completed.returncode != 0:
|
||||
raise subprocess.CalledProcessError(completed.returncode, completed.args, output=completed.stdout)
|
||||
@@ -89,7 +102,7 @@ def run_command_with_reporter(cmd, cwd=None, reporter=None, capture_output=False
|
||||
stderr=subprocess.STDOUT,
|
||||
)
|
||||
assert process.stdout is not None
|
||||
output = _stream_output(process.stdout.fileno(), process, reporter)
|
||||
output = _stream_output(process.stdout.fileno(), process, reporter, timeout_sec=timeout_sec)
|
||||
return output.decode("utf-8", errors="replace") if capture_output else None
|
||||
|
||||
try:
|
||||
@@ -102,5 +115,5 @@ def run_command_with_reporter(cmd, cwd=None, reporter=None, capture_output=False
|
||||
finally:
|
||||
os.close(slave_fd)
|
||||
|
||||
output = _stream_output(master_fd, process, reporter, treat_eio_as_eof=True)
|
||||
output = _stream_output(master_fd, process, reporter, treat_eio_as_eof=True, timeout_sec=timeout_sec)
|
||||
return output.decode("utf-8", errors="replace") if capture_output else None
|
||||
|
||||
@@ -64,7 +64,11 @@ def main():
|
||||
ap.add_argument("--crossbar-size", type=int, default=64)
|
||||
ap.add_argument("--crossbar-count", type=int, default=8)
|
||||
ap.add_argument("--core-count", type=int, default=None,
|
||||
help="Core count to pass to Raptor. If omitted, Raptor uses its default.")
|
||||
help="Core count to pass to Raptor. Required for PIM validation.")
|
||||
ap.add_argument("--pim-merge-scheduler", choices=("peft", "dcp"), default="peft",
|
||||
help="Scheduler used by the Spatial merge-compute-nodes pass.")
|
||||
ap.add_argument("--command-timeout-seconds", type=float, default=60.0,
|
||||
help="Per-subprocess timeout in seconds for compiler, runner, and simulator commands.")
|
||||
ap.add_argument("--clean", action="store_true",
|
||||
help="Remove generated validation artifacts under each model workspace and exit.")
|
||||
ap.add_argument("--verbose", action="store_true",
|
||||
@@ -98,6 +102,8 @@ def main():
|
||||
missing_args.append("--raptor-path")
|
||||
if not a.onnx_include_dir:
|
||||
missing_args.append("--onnx-include-dir")
|
||||
if a.core_count is None:
|
||||
missing_args.append("--core-count")
|
||||
if missing_args:
|
||||
ap.error("the following arguments are required unless --clean is used: " + ", ".join(missing_args))
|
||||
|
||||
@@ -117,6 +123,8 @@ def main():
|
||||
result = validate_network(
|
||||
onnx_path, a.raptor_path, a.onnx_include_dir, simulator_dir,
|
||||
crossbar_size=a.crossbar_size, crossbar_count=a.crossbar_count, core_count=a.core_count,
|
||||
pim_merge_scheduler=a.pim_merge_scheduler,
|
||||
command_timeout_seconds=a.command_timeout_seconds,
|
||||
threshold=a.threshold,
|
||||
seed=a.seed,
|
||||
reporter=reporter,
|
||||
|
||||
+24
-16
@@ -142,8 +142,8 @@ class ProgressReporter:
|
||||
self.rendered_width = 0
|
||||
|
||||
|
||||
def run_command(cmd, cwd=None, reporter=None):
|
||||
run_command_with_reporter(cmd, cwd=cwd, reporter=reporter)
|
||||
def run_command(cmd, cwd=None, reporter=None, timeout_sec=None):
|
||||
run_command_with_reporter(cmd, cwd=cwd, reporter=reporter, timeout_sec=timeout_sec)
|
||||
|
||||
|
||||
def clean_workspace_artifacts(workspace_dir, model_stem):
|
||||
@@ -186,21 +186,22 @@ def print_info(reporter, message):
|
||||
reporter.log(f" {message}")
|
||||
|
||||
|
||||
def compile_onnx_network(network_onnx_path, raptor_path, raptor_dir, runner_dir, reporter=None):
|
||||
def compile_onnx_network(network_onnx_path, raptor_path, raptor_dir, runner_dir, reporter=None, timeout_sec=None):
|
||||
stem = network_onnx_path.stem
|
||||
onnx_ir_base = raptor_dir / stem
|
||||
runner_base = runner_dir / stem
|
||||
run_command([raptor_path, network_onnx_path, "-o", onnx_ir_base, "--EmitONNXIR"], reporter=reporter)
|
||||
run_command([raptor_path, network_onnx_path, "-o", runner_base], reporter=reporter)
|
||||
run_command([raptor_path, network_onnx_path, "-o", onnx_ir_base, "--EmitONNXIR"],
|
||||
reporter=reporter, timeout_sec=timeout_sec)
|
||||
run_command([raptor_path, network_onnx_path, "-o", runner_base], reporter=reporter, timeout_sec=timeout_sec)
|
||||
network_so_path = runner_base.with_suffix(".so")
|
||||
network_mlir_path = onnx_ir_base.with_suffix(".onnx.mlir")
|
||||
onnx_ir_base.with_suffix(".tmp").unlink(missing_ok=True)
|
||||
return network_so_path, network_mlir_path
|
||||
|
||||
|
||||
def build_onnx_runner(source_dir, build_dir, reporter=None):
|
||||
run_command(["cmake", source_dir], cwd=build_dir, reporter=reporter)
|
||||
run_command(["cmake", "--build", ".", "-j"], cwd=build_dir, reporter=reporter)
|
||||
def build_onnx_runner(source_dir, build_dir, reporter=None, timeout_sec=None):
|
||||
run_command(["cmake", source_dir], cwd=build_dir, reporter=reporter, timeout_sec=timeout_sec)
|
||||
run_command(["cmake", "--build", ".", "-j"], cwd=build_dir, reporter=reporter, timeout_sec=timeout_sec)
|
||||
return build_dir / "runner"
|
||||
|
||||
|
||||
@@ -214,13 +215,14 @@ def build_dump_ranges(config_path, outputs_descriptor):
|
||||
return ",".join(ranges)
|
||||
|
||||
|
||||
def run_pim_simulator(simulator_dir, pim_dir, output_bin_path, dump_ranges, reporter=None):
|
||||
def run_pim_simulator(simulator_dir, pim_dir, output_bin_path, dump_ranges, reporter=None, timeout_sec=None):
|
||||
run_command(
|
||||
["cargo", "run", "--no-default-features", "--release", "--package", "pim-simulator", "--bin", "pim-simulator",
|
||||
"--",
|
||||
"-f", str(pim_dir), "-o", str(output_bin_path), "-d", dump_ranges],
|
||||
cwd=simulator_dir,
|
||||
reporter=reporter,
|
||||
timeout_sec=timeout_sec,
|
||||
)
|
||||
|
||||
|
||||
@@ -267,8 +269,10 @@ def validate_outputs(sim_arrays, runner_out_dir, outputs_descriptor, threshold=1
|
||||
|
||||
|
||||
def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
|
||||
simulator_dir, crossbar_size=64, crossbar_count=8, core_count=None, threshold=1e-3,
|
||||
seed=0, reporter=None, model_index=1, model_total=1, verbose=False):
|
||||
simulator_dir, crossbar_size=64, crossbar_count=8, core_count=None,
|
||||
pim_merge_scheduler="peft", threshold=1e-3,
|
||||
seed=0, reporter=None, model_index=1, model_total=1, verbose=False,
|
||||
command_timeout_seconds=60.0):
|
||||
network_onnx_path = Path(network_onnx_path).resolve()
|
||||
raptor_path = Path(raptor_path).resolve()
|
||||
onnx_include_dir = Path(onnx_include_dir).resolve()
|
||||
@@ -292,7 +296,8 @@ def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
|
||||
try:
|
||||
print_stage(reporter, model_index, model_total, network_onnx_path.name, "Compile ONNX")
|
||||
network_so_path, network_mlir_path = compile_onnx_network(
|
||||
network_onnx_path, raptor_path, raptor_dir, runner_dir, reporter=reporter)
|
||||
network_onnx_path, raptor_path, raptor_dir, runner_dir, reporter=reporter,
|
||||
timeout_sec=command_timeout_seconds)
|
||||
print_info(reporter, f"MLIR saved to {network_mlir_path}")
|
||||
print_info(reporter, f"Shared library saved to {network_so_path}")
|
||||
reporter.advance()
|
||||
@@ -300,7 +305,8 @@ def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
|
||||
print_stage(reporter, model_index, model_total, network_onnx_path.name, "Build Runner")
|
||||
gen_network_runner(network_onnx_path, network_so_path, onnx_include_dir, out=runner_dir / "runner.c",
|
||||
verbose=False)
|
||||
runner_path = build_onnx_runner(runner_dir, runner_build_dir, reporter=reporter)
|
||||
runner_path = build_onnx_runner(runner_dir, runner_build_dir, reporter=reporter,
|
||||
timeout_sec=command_timeout_seconds)
|
||||
print_info(reporter, f"Runner built at {runner_path}")
|
||||
reporter.advance()
|
||||
|
||||
@@ -316,14 +322,15 @@ def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
|
||||
Path.mkdir(out_dir, exist_ok=True)
|
||||
run_cmd = [runner_path, *flags]
|
||||
run_cmd += ["--save-csv-dir", f"{out_dir}"]
|
||||
run_command(run_cmd, cwd=runner_build_dir, reporter=reporter)
|
||||
run_command(run_cmd, cwd=runner_build_dir, reporter=reporter, timeout_sec=command_timeout_seconds)
|
||||
print_info(reporter, f"Reference outputs saved to {out_dir}")
|
||||
reporter.advance()
|
||||
|
||||
print_stage(reporter, model_index, model_total, network_onnx_path.name, "Compile PIM")
|
||||
pim_pass_timings = compile_with_raptor(
|
||||
network_mlir_path, raptor_path, raptor_dir / network_onnx_path.stem, crossbar_size, crossbar_count,
|
||||
core_count=core_count, cwd=raptor_dir, verbose=verbose, reporter=reporter)
|
||||
core_count=core_count, pim_merge_scheduler=pim_merge_scheduler,
|
||||
cwd=raptor_dir, verbose=verbose, reporter=reporter, timeout_sec=command_timeout_seconds)
|
||||
print_info(reporter, f"PIM artifacts saved to {raptor_dir / 'pim'}")
|
||||
reporter.advance()
|
||||
|
||||
@@ -334,7 +341,8 @@ def validate_network(network_onnx_path, raptor_path, onnx_include_dir,
|
||||
Path.mkdir(simulation_dir, exist_ok=True)
|
||||
dump_ranges = build_dump_ranges(pim_dir / "config.json", outputs_descriptor)
|
||||
output_bin_path = simulation_dir / "out.bin"
|
||||
run_pim_simulator(simulator_dir, pim_dir, output_bin_path, dump_ranges, reporter=reporter)
|
||||
run_pim_simulator(simulator_dir, pim_dir, output_bin_path, dump_ranges, reporter=reporter,
|
||||
timeout_sec=command_timeout_seconds)
|
||||
print_info(reporter, f"Simulator output saved to {output_bin_path}")
|
||||
reporter.advance()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user