Skip to content

Commit 30fecc3

Browse files
authored
feat: cpu-function altered to support cpus-per-gpu, too (#28)
feat: cpu-function altered to support cpus-per-gpu, too (#28)
This PR works in combination with snakemake/snakemake-executor-plugin-slurm#173, only. It sees changes in the function to get the cpu settings. - it is possible to omit cpu-settings upon submission, now. Required, because apparently some clusters do not allow this for GPU jobs (which is crazy, but the way we do it now should not break workflows) - it is possible now to request CPUs using `--cpus-per-gpu`, too. (Only for GPU jobs, of course.) <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit - **New Features** - Enhanced SLURM executor plugin with improved GPU job support - Added more flexible CPU and GPU resource allocation handling - **Bug Fixes** - Improved resource specification logic for different cluster configurations - Better handling of CPU allocation for GPU and non-GPU jobs <!-- end of auto-generated comment: release notes by coderabbit.ai -->
1 parent 13b72f5 commit 30fecc3

File tree

1 file changed

+34
-13
lines changed

1 file changed

+34
-13
lines changed

snakemake_executor_plugin_slurm_jobstep/__init__.py

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ def __post_init__(self):
4545
# These environment variables are set by SLURM.
4646
# only needed for commented out jobstep handling below
4747
self.jobid = os.getenv("SLURM_JOB_ID")
48+
# we consider this job to be a GPU job, if a GPU has been reserved
49+
self.gpu_job = os.getenv("SLURM_GPUS")
4850

4951
def run_job(self, job: JobExecutorInterface):
5052
# Implement here how to run a job.
@@ -92,16 +94,12 @@ def run_job(self, job: JobExecutorInterface):
9294
# # now: the last one
9395
# # this way, we ensure that level jobs depending on the current level
9496
# # get started
95-
# jobsteps[level_list[-1]] = subprocess.Popen(
96-
# get_call(level_list[-1], aux="--dependency=singleton"), shell=True
97-
# )
98-
9997
if "mpi" in job.resources.keys():
10098
# MPI job:
10199
# No need to prepend `srun`, as this will happen inside of the job's shell
102100
# command or script (!).
103101
# The following call invokes snakemake, which in turn takes care of all
104-
# auxilliary work around the actual command
102+
# auxiliary work around the actual command
105103
# like remote file support, benchmark setup, error handling, etc.
106104
# AND there can be stuff around the srun call within the job, like any
107105
# commands which should be executed before.
@@ -119,8 +117,8 @@ def run_job(self, job: JobExecutorInterface):
119117
# has set the resources correctly.
120118

121119
call = "srun -n1 --cpu-bind=q "
122-
call += f"--cpus-per-task {get_cpus_per_task(job)} "
123-
call += f"{self.format_job_exec(job)}"
120+
call += f" {get_cpu_setting(job, self.gpu_job)} "
121+
call += f" {self.format_job_exec(job)}"
124122

125123
self.logger.debug(f"This job is a group job: {job.is_group()}")
126124
self.logger.debug(f"The call for this job is: {call}")
@@ -155,14 +153,37 @@ def get_exec_mode(self) -> ExecMode:
155153
return ExecMode.REMOTE
156154

157155

def get_cpu_setting(job: JobExecutorInterface, gpu: bool) -> str:
    """Return the srun CPU-allocation flag for *job*.

    Depending on the job's resources this is ``--cpus-per-task=<n>``,
    ``--cpus-per-gpu=<n>`` (GPU jobs only), or an empty string when the
    user explicitly opted out of a CPU setting (some clusters forbid CPU
    flags, e.g. in GPU partitions).

    Args:
        job: the job whose ``threads`` and ``resources`` are inspected.
        gpu: truthy if this job reserved a GPU (``SLURM_GPUS`` was set).

    Raises:
        WorkflowError: if ``cpus_per_task`` / ``cpus_per_gpu`` is not an int.
    """
    # Per default, we assume that Snakemake's threads double as the CPUs
    # per task. Explicit resource settings below override this.
    cpus_per_task = job.threads
    # cpus_per_task and cpus_per_gpu are mutually exclusive;
    # cpus_per_task wins if both are given.
    if job.resources.get("cpus_per_task"):
        cpus_per_task = job.resources.cpus_per_task
        if not isinstance(cpus_per_task, int):
            raise WorkflowError(
                f"cpus_per_task must be an integer, but is {cpus_per_task}"
            )
        # If explicitly set to < 0, return an empty string —
        # some clusters do not allow CPU settings (e.g. in GPU partitions).
        if cpus_per_task < 0:
            return ""
        # Ensure that at least 1 CPU is requested,
        # because 0 is not allowed by SLURM.
        return f"--cpus-per-task={max(1, cpus_per_task)}"
    elif gpu and job.resources.get("cpus_per_gpu"):
        cpus_per_gpu = job.resources.cpus_per_gpu
        if not isinstance(cpus_per_gpu, int):
            raise WorkflowError(
                f"cpus_per_gpu must be an integer, but is {cpus_per_gpu}"
            )
        # If explicitly set to <= 0, return an empty string —
        # some clusters do not allow CPU settings (e.g. in GPU partitions).
        # Currently, 0 is not allowed by SLURM.
        if cpus_per_gpu <= 0:
            return ""
        return f"--cpus-per-gpu={cpus_per_gpu}"
    else:
        # Fall back to Snakemake's thread count; clamp to >= 1 because
        # SLURM rejects a request for 0 CPUs (restores the safeguard the
        # pre-refactor get_cpus_per_task() applied here).
        return f"--cpus-per-task={max(1, cpus_per_task)}"

0 commit comments

Comments
 (0)