Skip to content

Commit b0eca32

Browse files
committed
debug
1 parent 0b31278 commit b0eca32

File tree

1 file changed

+58
-2
lines changed

1 file changed

+58
-2
lines changed

toolchain/mfc/test/test.py

Lines changed: 58 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -322,17 +322,73 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
322322
raise MFCException(f"Test {case}: {msg}")
323323

324324
if ARG("test_all"):
325-
case.delete_output()
325+
# Don't delete output here - we need restart_data from the simulation above
326326
# Check timeout before launching the (potentially long) post-process run
327327
if timeout_flag.is_set():
328328
raise TestTimeoutError("Test case exceeded 1 hour timeout")
329-
cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices)
329+
# Run only POST_PROCESS since PRE_PROCESS and SIMULATION already ran successfully above
330+
cmd = case.run([POST_PROCESS], gpus=devices)
330331
out_filepath = os.path.join(case.get_dirpath(), "out_post.txt")
331332
common.file_write(out_filepath, cmd.stdout)
332333

333334
# Check return code from post-process run
334335
if cmd.returncode != 0:
335336
cons.print(cmd.stdout)
337+
338+
# Extra debug for multi-rank restart/post-process issues
339+
if getattr(case, "ppn", 1) >= 2:
340+
case_dir = case.get_dirpath()
341+
restart_dir = os.path.join(case_dir, "restart_data")
342+
343+
cons.print("[bold yellow]Multi-rank debug (ppn >= 2): inspecting restart_data and post_process.inp[/bold yellow]")
344+
cons.print(f"[bold yellow] Case directory:[/bold yellow] {case_dir}")
345+
cons.print(f"[bold yellow] restart_data directory:[/bold yellow] {restart_dir}")
346+
347+
# List restart_data contents
348+
if os.path.isdir(restart_dir):
349+
try:
350+
entries = sorted(os.listdir(restart_dir))
351+
except OSError as exc:
352+
cons.print(f"[bold yellow] Could not list restart_data contents: {exc}[/bold yellow]")
353+
else:
354+
cons.print(f"[bold yellow] restart_data entries ({len(entries)} total, showing up to 20):[/bold yellow]")
355+
for name in entries[:20]:
356+
cons.print(f" - {name}")
357+
else:
358+
cons.print("[bold yellow] restart_data directory does not exist[/bold yellow]")
359+
360+
# Dump key case parameters relevant to restart/post-process
361+
params = getattr(case, "params", {})
362+
def _param(name: str):
363+
return params.get(name, "<unset>")
364+
365+
cons.print("[bold yellow] Selected case parameters relevant to restart:[/bold yellow]")
366+
for key in (
367+
"t_step_start",
368+
"t_step_stop",
369+
"t_step_save",
370+
"n_start",
371+
"t_save",
372+
"parallel_io",
373+
"file_per_process",
374+
):
375+
cons.print(f" {key} = {_param(key)}")
376+
377+
# Show the beginning of post_process.inp if present
378+
ppi_path = os.path.join(case_dir, "post_process.inp")
379+
if os.path.exists(ppi_path):
380+
cons.print(f"[bold yellow] First lines of post_process.inp ({ppi_path}):[/bold yellow]")
381+
try:
382+
with open(ppi_path, "r", encoding="utf-8", errors="replace") as f:
383+
for i, line in enumerate(f):
384+
if i >= 40:
385+
break
386+
cons.print(" " + line.rstrip())
387+
except OSError as exc:
388+
cons.print(f"[bold yellow] Could not read post_process.inp: {exc}[/bold yellow]")
389+
else:
390+
cons.print("[bold yellow] post_process.inp not found in case directory[/bold yellow]")
391+
336392
raise MFCException(
337393
f"Test {case}: Failed to execute MFC (post-process). "
338394
f"See log at: {out_filepath}"

0 commit comments

Comments
 (0)