Skip to content

Commit 7097ced

Browse files
committed
Add fetch running jobs in API to return all vec-inf jobs running for the user
1 parent ba3303c commit 7097ced

File tree

3 files changed

+50
-1
lines changed

3 files changed

+50
-1
lines changed

vec_inf/client/_helper.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -721,7 +721,7 @@ def _get_base_status_data(self) -> StatusResponse:
721721
Basic status information for the job
722722
"""
723723
try:
724-
job_name = self.job_status["JobName"]
724+
job_name = self.job_status["JobName"].removesuffix("-vec-inf")
725725
job_state = self.job_status["JobState"]
726726
except KeyError:
727727
job_name = "UNAVAILABLE"

vec_inf/client/_slurm_script_generator.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@ def _generate_shebang(self) -> str:
8989
for arg, value in SLURM_JOB_CONFIG_ARGS.items():
9090
if self.params.get(value):
9191
shebang.append(f"#SBATCH --{arg}={self.params[value]}")
92+
if value == "model_name":
93+
shebang[-1] += "-vec-inf"
9294
if self.is_multinode:
9395
shebang += SLURM_SCRIPT_TEMPLATE["shebang"]["multinode"]
9496
return "\n".join(shebang)
@@ -328,6 +330,8 @@ def _generate_batch_slurm_script_shebang(self) -> str:
328330
model_params = self.params["models"][model_name]
329331
if model_params.get(value) and value not in ["out_file", "err_file"]:
330332
shebang.append(f"#SBATCH --{arg}={model_params[value]}")
333+
if value == "model_name":
334+
shebang[-1] += "-vec-inf"
331335
shebang[-1] += "\n"
332336
shebang.append(BATCH_SLURM_SCRIPT_TEMPLATE["hetjob"])
333337
# Remove the last hetjob line

vec_inf/client/api.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
import time
1515
import warnings
1616
from pathlib import Path
17+
import re
18+
import subprocess
1719
from typing import Any, Optional, Union
1820

1921
from vec_inf.client._exceptions import (
@@ -181,6 +183,49 @@ def batch_launch_models(
181183
)
182184
return model_launcher.launch()
183185

186+
def fetch_running_jobs(self) -> list[str]:
    """Fetch the job IDs of the current user's vec-inf jobs.

    Lists the user's jobs with ``squeue --me`` and then queries
    ``scontrol show job`` for each one to read its full ``JobName``.
    A job is kept when that name ends with the ``-vec-inf`` suffix.

    Returns
    -------
    list[str]
        IDs (first column of the ``squeue`` output) of the matching jobs.
        Empty if the user has no matching jobs or if the Slurm command
        line tools (``squeue`` / ``scontrol``) are not installed.

    Raises
    ------
    SlurmJobError
        If ``squeue`` exits with a non-zero status.
    """
    try:
        squeue = subprocess.run(
            ["squeue", "--me", "--noheader"],
            capture_output=True,
            text=True,
            check=True,
        )
    except FileNotFoundError:
        # squeue is not installed on this host: report "no jobs" as the
        # documented contract promises instead of leaking an OSError.
        return []
    except subprocess.CalledProcessError as e:
        raise SlurmJobError(f"Error running slurm command: {e}") from e

    # Example scontrol output: "JobId=12345 JobName=my-model-vec-inf ..."
    job_name_pattern = re.compile(r"\bJobName=(\S+)")

    matching_ids = []
    for line in squeue.stdout.splitlines():
        fields = line.split()
        if not fields:
            continue
        jid = fields[0]

        try:
            sctl = subprocess.run(
                ["scontrol", "show", "job", "-o", jid],
                capture_output=True,
                text=True,
                check=True,
            )
        except subprocess.CalledProcessError:
            # Job might have finished between squeue and scontrol; skip
            continue
        except FileNotFoundError:
            # Without scontrol no job name can be verified at all.
            return []

        found = job_name_pattern.search(sctl.stdout)
        if found and found.group(1).endswith("-vec-inf"):
            matching_ids.append(jid)

    return matching_ids
227+
228+
184229
def get_status(self, slurm_job_id: str) -> StatusResponse:
185230
"""Get the status of a running model.
186231

0 commit comments

Comments
 (0)