From 831a8caa2a41beb8cf41fbae756e7b78d6fd3d8b Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 5 Dec 2025 10:10:54 -0500 Subject: [PATCH 1/2] fix gpu templates for real though --- toolchain/mfc/run/run.py | 13 +++++++++++-- toolchain/templates/bridges2.mako | 4 ++-- toolchain/templates/carpenter-cray.mako | 2 +- toolchain/templates/carpenter.mako | 2 +- toolchain/templates/default.mako | 2 +- toolchain/templates/delta.mako | 4 ++-- toolchain/templates/deltaai.mako | 4 ++-- toolchain/templates/frontier.mako | 8 ++++---- toolchain/templates/hipergator.mako | 4 ++-- toolchain/templates/nautilus.mako | 4 ++-- toolchain/templates/oscar.mako | 4 ++-- toolchain/templates/phoenix-bench.mako | 4 ++-- toolchain/templates/phoenix.mako | 4 ++-- toolchain/templates/santis.mako | 4 ++-- toolchain/templates/summit.mako | 14 +++++++------- 15 files changed, 43 insertions(+), 34 deletions(-) diff --git a/toolchain/mfc/run/run.py b/toolchain/mfc/run/run.py index 99d5b3d6c1..82d8804899 100644 --- a/toolchain/mfc/run/run.py +++ b/toolchain/mfc/run/run.py @@ -7,7 +7,7 @@ from ..build import get_targets, build, REQUIRED_TARGETS, SIMULATION from ..printer import cons -from ..state import ARG, ARGS, CFG +from ..state import ARG, ARGS, CFG, gpuConfigOptions from ..common import MFCException, isspace, file_read, does_command_exist from ..common import MFC_TEMPLATE_DIR, file_write, system, MFC_ROOT_DIR from ..common import format_list_to_string, file_dump_yaml @@ -99,6 +99,12 @@ def __generate_job_script(targets, case: input.MFCInputFile): 'HIP_VISIBLE_DEVICES': gpu_ids }) + # Compute GPU mode booleans for templates + gpu_mode = ARG('gpu') + gpu_enabled = (gpu_mode != gpuConfigOptions.NONE.value) + gpu_acc = (gpu_mode == gpuConfigOptions.ACC.value) + gpu_mp = (gpu_mode == gpuConfigOptions.MP.value) + content = __get_template().render( **{**ARGS(), 'targets': targets}, ARG=ARG, @@ -107,7 +113,10 @@ def __generate_job_script(targets, case: input.MFCInputFile): MFC_ROOT_DIR=MFC_ROOT_DIR, SIMULATION=SIMULATION, qsystem=queues.get_system(), - profiler=shlex.join(__profiler_prepend()) + profiler=shlex.join(__profiler_prepend()), + gpu_enabled=gpu_enabled, + gpu_acc=gpu_acc, + gpu_mp=gpu_mp ) file_write(__job_script_filepath(), content) diff --git a/toolchain/templates/bridges2.mako b/toolchain/templates/bridges2.mako index 4536551943..e08fc7e5a4 100644 --- a/toolchain/templates/bridges2.mako +++ b/toolchain/templates/bridges2.mako @@ -14,7 +14,7 @@ % if account: #SBATCH --account="${account}" % endif -% if gpu: +% if gpu_enabled: #SBATCH --gpu-bind=verbose,closest #SBATCH --gres=gpu:v100-16:${tasks_per_node} % endif @@ -31,7 +31,7 @@ ${helpers.template_prologue()} ok ":) Loading modules:\n" cd "${MFC_ROOT_DIR}" -. ./mfc.sh load -c b -m ${'g' if gpu else 'c'} +. ./mfc.sh load -c b -m ${'g' if gpu_enabled else 'c'} cd - > /dev/null echo diff --git a/toolchain/templates/carpenter-cray.mako b/toolchain/templates/carpenter-cray.mako index bfaad0b427..6fcc990a5f 100644 --- a/toolchain/templates/carpenter-cray.mako +++ b/toolchain/templates/carpenter-cray.mako @@ -25,7 +25,7 @@ ${helpers.template_prologue()} ok ":) Loading modules:\n" cd "${MFC_ROOT_DIR}" -. ./mfc.sh load -c cc -m ${'g' if gpu else 'c'} +. ./mfc.sh load -c cc -m ${'g' if gpu_enabled else 'c'} cd - > /dev/null echo diff --git a/toolchain/templates/carpenter.mako b/toolchain/templates/carpenter.mako index a652a887f7..516f9a3eb1 100644 --- a/toolchain/templates/carpenter.mako +++ b/toolchain/templates/carpenter.mako @@ -25,7 +25,7 @@ ${helpers.template_prologue()} ok ":) Loading modules:\n" cd "${MFC_ROOT_DIR}" -. ./mfc.sh load -c c -m ${'g' if gpu else 'c'} +. ./mfc.sh load -c c -m ${'g' if gpu_enabled else 'c'} cd - > /dev/null echo diff --git a/toolchain/templates/default.mako b/toolchain/templates/default.mako index b1cdaf81e0..2d05d3ab3c 100644 --- a/toolchain/templates/default.mako +++ b/toolchain/templates/default.mako @@ -48,7 +48,7 @@ if engine == 'batch': (set -x; ${profiler} \ jsrun --nrs ${tasks_per_node*nodes} \ --cpu_per_rs 1 \ - --gpu_per_rs ${1 if gpu else 0} \ + --gpu_per_rs ${1 if gpu_enabled else 0} \ --tasks_per_rs 1 \ "${target.get_install_binpath(case)}") elif [ "$binary" == "srun" ]; then diff --git a/toolchain/templates/delta.mako b/toolchain/templates/delta.mako index 694f22c457..9dc185fca4 100644 --- a/toolchain/templates/delta.mako +++ b/toolchain/templates/delta.mako @@ -14,7 +14,7 @@ % if account: #SBATCH --account="${account}" % endif -% if gpu: +% if gpu_enabled: #SBATCH --gpus-per-node=${tasks_per_node} #SBATCH --mem=208G #SBATCH --gpu-bind=closest @@ -32,7 +32,7 @@ ${helpers.template_prologue()} ok ":) Loading modules:\n" cd "${MFC_ROOT_DIR}" -. ./mfc.sh load -c d -m ${'g' if gpu else 'c'} +. ./mfc.sh load -c d -m ${'g' if gpu_enabled else 'c'} cd - > /dev/null echo diff --git a/toolchain/templates/deltaai.mako b/toolchain/templates/deltaai.mako index 8492aa7a06..7c7f6365c9 100644 --- a/toolchain/templates/deltaai.mako +++ b/toolchain/templates/deltaai.mako @@ -14,7 +14,7 @@ % if account: #SBATCH --account="${account}" % endif -% if gpu: +% if gpu_enabled: #SBATCH --gpus-per-node=${tasks_per_node} #SBATCH --mem=208G #SBATCH --gpu-bind=closest @@ -32,7 +32,7 @@ ${helpers.template_prologue()} ok ":) Loading modules:\n" cd "${MFC_ROOT_DIR}" -. ./mfc.sh load -c dai -m ${'g' if gpu else 'c'} +. ./mfc.sh load -c dai -m ${'g' if gpu_enabled else 'c'} cd - > /dev/null echo diff --git a/toolchain/templates/frontier.mako b/toolchain/templates/frontier.mako index 7a27d8588b..f4a530d69e 100644 --- a/toolchain/templates/frontier.mako +++ b/toolchain/templates/frontier.mako @@ -10,7 +10,7 @@ #SBATCH --time=${walltime} #SBATCH --cpus-per-task=7 #SBATCH -C nvme -% if gpu != 'no': +% if gpu_enabled: #SBATCH --gpus-per-task=1 #SBATCH --gpu-bind=closest % endif @@ -34,12 +34,12 @@ ${helpers.template_prologue()} ok ":) Loading modules:\n" cd "${MFC_ROOT_DIR}" % if engine == 'batch': -. ./mfc.sh load -c f -m ${'g' if gpu != 'no' else 'c'} +. ./mfc.sh load -c f -m ${'g' if gpu_enabled else 'c'} % endif cd - > /dev/null echo -% if gpu != 'no': +% if gpu_enabled: export MPICH_GPU_SUPPORT_ENABLED=1 % else: export MPICH_GPU_SUPPORT_ENABLED=0 @@ -66,7 +66,7 @@ ulimit -s unlimited % if engine == 'interactive': --unbuffered --nodes ${nodes} --ntasks-per-node ${tasks_per_node} \ --cpus-per-task 7 \ - % if gpu != 'no': + % if gpu_enabled: --gpus-per-task 1 --gpu-bind closest \ % endif ${profiler} "${target.get_install_binpath(case)}") diff --git a/toolchain/templates/hipergator.mako b/toolchain/templates/hipergator.mako index 3de36fea5b..89a40308e9 100644 --- a/toolchain/templates/hipergator.mako +++ b/toolchain/templates/hipergator.mako @@ -9,7 +9,7 @@ #SBATCH --output="${name}.out" #SBATCH --time=${walltime} #SBATCH --cpus-per-task=7 -% if gpu: +% if gpu_enabled: #SBATCH --gpus-per-task=1 #SBATCH --gpu-bind=closest % endif @@ -35,7 +35,7 @@ ${helpers.template_prologue()} ok ":) Loading modules:\n" cd "${MFC_ROOT_DIR}" % if engine == 'batch': -. ./mfc.sh load -c h -m ${'g' if gpu else 'c'} +. ./mfc.sh load -c h -m ${'g' if gpu_enabled else 'c'} % endif cd - > /dev/null echo diff --git a/toolchain/templates/nautilus.mako b/toolchain/templates/nautilus.mako index 5dfbdfdd84..86de171af1 100644 --- a/toolchain/templates/nautilus.mako +++ b/toolchain/templates/nautilus.mako @@ -14,7 +14,7 @@ % if account: #SBATCH --account="${account}" % endif -% if gpu: +% if gpu_enabled: #SBATCH --gpu-bind=verbose,closest #SBATCH --gres=gpu:v100-16:${tasks_per_node} % endif @@ -31,7 +31,7 @@ ${helpers.template_prologue()} ok ":) Loading modules:\n" cd "${MFC_ROOT_DIR}" -. ./mfc.sh load -c n -m ${'g' if gpu else 'c'} +. ./mfc.sh load -c n -m ${'g' if gpu_enabled else 'c'} cd - > /dev/null echo diff --git a/toolchain/templates/oscar.mako b/toolchain/templates/oscar.mako index 158a217cc1..af54ca7187 100644 --- a/toolchain/templates/oscar.mako +++ b/toolchain/templates/oscar.mako @@ -14,7 +14,7 @@ % if account: #SBATCH --account="${account}" % endif -% if gpu: +% if gpu_enabled: #SBATCH --gpus-per-node=${tasks_per_node} #SBATCH --mem=64G #SBATCH --gpu-bind=closest @@ -32,7 +32,7 @@ ${helpers.template_prologue()} ok ":) Loading modules:\n" cd "${MFC_ROOTDIR}" -. ./mfc.sh load -c o -m ${'g' if gpu else 'c'} +. ./mfc.sh load -c o -m ${'g' if gpu_enabled else 'c'} cd - > /dev/null echo diff --git a/toolchain/templates/phoenix-bench.mako b/toolchain/templates/phoenix-bench.mako index e699da3e4c..2e69b003aa 100644 --- a/toolchain/templates/phoenix-bench.mako +++ b/toolchain/templates/phoenix-bench.mako @@ -17,7 +17,7 @@ % if quality_of_service: #SBATCH --qos=${quality_of_service} % endif -% if gpu: +% if gpu_enabled: #SBATCH --gres=gpu:V100:${tasks_per_node} #SBATCH --mem-per-gpu=16G\ % endif @@ -31,7 +31,7 @@ ${helpers.template_prologue()} ok ":) Loading modules:\n" cd "${MFC_ROOT_DIR}" -. ./mfc.sh load -c p -m ${'g' if gpu else 'c'} +. ./mfc.sh load -c p -m ${'g' if gpu_enabled else 'c'} cd - > /dev/null echo diff --git a/toolchain/templates/phoenix.mako b/toolchain/templates/phoenix.mako index e5e139905f..c2ad4aab38 100644 --- a/toolchain/templates/phoenix.mako +++ b/toolchain/templates/phoenix.mako @@ -17,7 +17,7 @@ % if quality_of_service: #SBATCH --qos=${quality_of_service} % endif -% if gpu: +% if gpu_enabled: #SBATCH --gres=gpu:V100:${tasks_per_node} #SBATCH --mem-per-gpu=16G\ % endif @@ -31,7 +31,7 @@ ${helpers.template_prologue()} ok ":) Loading modules:\n" cd "${MFC_ROOT_DIR}" -. ./mfc.sh load -c p -m ${'g' if gpu else 'c'} +. ./mfc.sh load -c p -m ${'g' if gpu_enabled else 'c'} cd - > /dev/null echo diff --git a/toolchain/templates/santis.mako b/toolchain/templates/santis.mako index cb4b330625..413f5eb608 100644 --- a/toolchain/templates/santis.mako +++ b/toolchain/templates/santis.mako @@ -59,7 +59,7 @@ ${helpers.template_prologue()} ok ":) Loading modules:\n" cd "${MFC_ROOT_DIR}" % if engine == 'batch': -. ./mfc.sh load -c san -m ${'g' if gpu else 'c'} +. ./mfc.sh load -c san -m ${'g' if gpu_enabled else 'c'} % endif cd - > /dev/null echo @@ -74,7 +74,7 @@ echo --ntasks=${nodes*tasks_per_node} \ --cpus-per-task 72 \ --cpu-bind=none \ - % if gpu: + % if gpu_enabled: --gpus-per-task 1 \ % endif --wait 200 --bcast=/tmp/${target.name} \ diff --git a/toolchain/templates/summit.mako b/toolchain/templates/summit.mako index 1ca902b2b1..8b94964c35 100644 --- a/toolchain/templates/summit.mako +++ b/toolchain/templates/summit.mako @@ -16,7 +16,7 @@ ${helpers.template_prologue()} ok ":) Loading modules:\n" cd "${MFC_ROOT_DIR}" -. ./mfc.sh load -c s -m ${'g' if gpu else 'c'} +. ./mfc.sh load -c s -m ${'g' if gpu_enabled else 'c'} cd - > /dev/null echo @@ -27,12 +27,12 @@ echo (set -x; ${rofiler} "${target.get_install_binpath(case)}") % else: (set -x; ${profiler} \ - jsrun \ - ${'--smpiargs="-gpu"' if gpu else ''} \ - --nrs ${tasks_per_node*nodes} \ - --cpu_per_rs 1 \ - --gpu_per_rs ${1 if gpu else 0} \ - --tasks_per_rs 1 \ + jsrun \ + ${'--smpiargs="-gpu"' if gpu_enabled else ''} \ + --nrs ${tasks_per_node*nodes} \ + --cpu_per_rs 1 \ + --gpu_per_rs ${1 if gpu_enabled else 0} \ + --tasks_per_rs 1 \ "${target.get_install_binpath(case)}") % endif From 5d6f9ae8482e955d6b9863f244d3e8bc1f8a29ab Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 5 Dec 2025 10:36:15 -0500 Subject: [PATCH 2/2] fix ups on formatting and some defaults --- toolchain/mfc/args.py | 2 +- toolchain/mfc/run/run.py | 14 +++++++--- toolchain/templates/frontier.mako | 2 +- toolchain/templates/phoenix.mako | 2 +- toolchain/templates/summit.mako | 44 ------------------------------- 5 files changed, 14 insertions(+), 50 deletions(-) delete mode 100644 toolchain/templates/summit.mako diff --git a/toolchain/mfc/args.py b/toolchain/mfc/args.py index 169ee076a4..50682d9f4e 100644 --- a/toolchain/mfc/args.py +++ b/toolchain/mfc/args.py @@ -59,7 +59,7 @@ def add_common_arguments(p: argparse.ArgumentParser, mask = None): if "m" not in mask: for f in dataclasses.fields(config): if f.name == 'gpu': - p.add_argument(f"--{f.name}", action="store", nargs='?', const= gpuConfigOptions.ACC.value,default=gpuConfigOptions.ACC.value, dest=f.name, choices=[e.value for e in gpuConfigOptions], help=f"Turn the {f.name} option to OpenACC or OpenMP.") + p.add_argument(f"--{f.name}", action="store", nargs='?', const= gpuConfigOptions.ACC.value,default=gpuConfigOptions.NONE.value, dest=f.name, choices=[e.value for e in gpuConfigOptions], help=f"Turn the {f.name} option to OpenACC or OpenMP.") p.add_argument(f"--no-{f.name}", action="store_const", const = gpuConfigOptions.NONE.value, dest=f.name, help=f"Turn the {f.name} option OFF.") continue p.add_argument( f"--{f.name}", action="store_true", help=f"Turn the {f.name} option ON.") diff --git a/toolchain/mfc/run/run.py b/toolchain/mfc/run/run.py index 82d8804899..fc2366e497 100644 --- a/toolchain/mfc/run/run.py +++ b/toolchain/mfc/run/run.py @@ -101,9 +101,17 @@ def __generate_job_script(targets, case: input.MFCInputFile): # Compute GPU mode booleans for templates gpu_mode = ARG('gpu') - gpu_enabled = (gpu_mode != gpuConfigOptions.NONE.value) - gpu_acc = (gpu_mode == gpuConfigOptions.ACC.value) - gpu_mp = (gpu_mode == gpuConfigOptions.MP.value) + + # Validate gpu_mode is one of the expected values + valid_gpu_modes = {e.value for e in gpuConfigOptions} + if gpu_mode not in valid_gpu_modes: + raise MFCException( + f"Invalid GPU mode '{gpu_mode}'. Must be one of: {', '.join(sorted(valid_gpu_modes))}" + ) + + gpu_enabled = gpu_mode != gpuConfigOptions.NONE.value + gpu_acc = gpu_mode == gpuConfigOptions.ACC.value + gpu_mp = gpu_mode == gpuConfigOptions.MP.value content = __get_template().render( **{**ARGS(), 'targets': targets}, diff --git a/toolchain/templates/frontier.mako b/toolchain/templates/frontier.mako index f4a530d69e..474baf0586 100644 --- a/toolchain/templates/frontier.mako +++ b/toolchain/templates/frontier.mako @@ -45,7 +45,7 @@ echo export MPICH_GPU_SUPPORT_ENABLED=0 % endif -%if unified: +% if unified: export CRAY_ACC_USE_UNIFIED_MEM=1 % endif diff --git a/toolchain/templates/phoenix.mako b/toolchain/templates/phoenix.mako index c2ad4aab38..3115bb688c 100644 --- a/toolchain/templates/phoenix.mako +++ b/toolchain/templates/phoenix.mako @@ -19,7 +19,7 @@ % endif % if gpu_enabled: #SBATCH --gres=gpu:V100:${tasks_per_node} -#SBATCH --mem-per-gpu=16G\ +#SBATCH --mem-per-gpu=16G % endif % if email: #SBATCH --mail-user=${email} diff --git a/toolchain/templates/summit.mako b/toolchain/templates/summit.mako deleted file mode 100644 index 8b94964c35..0000000000 --- a/toolchain/templates/summit.mako +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env bash - -<%namespace name="helpers" file="helpers.mako"/> - -% if engine == 'batch': -#BSUB -J {{{name}}} -#BSUB -nnodes {{{nodes}}} -#BSUB -W {{{walltime[:-3]}}} -#BSUB -N -% if account: -#BSUB -P {{{account}}} -% endif -% endif - -${helpers.template_prologue()} - -ok ":) Loading modules:\n" -cd "${MFC_ROOT_DIR}" -. ./mfc.sh load -c s -m ${'g' if gpu_enabled else 'c'} -cd - > /dev/null -echo - -% for target in targets: - ${helpers.run_prologue(target)} - - % if not mpi: - (set -x; ${rofiler} "${target.get_install_binpath(case)}") - % else: - (set -x; ${profiler} \ - jsrun \ - ${'--smpiargs="-gpu"' if gpu_enabled else ''} \ - --nrs ${tasks_per_node*nodes} \ - --cpu_per_rs 1 \ - --gpu_per_rs ${1 if gpu_enabled else 0} \ - --tasks_per_rs 1 \ - "${target.get_install_binpath(case)}") - % endif - - ${helpers.run_epilogue(target)} - - echo -% endfor - -${helpers.template_epilogue()}