|
14 | 14 | import time |
15 | 15 | import warnings |
16 | 16 | from pathlib import Path |
| 17 | +import re |
| 18 | +import subprocess |
17 | 19 | from typing import Any, Optional, Union |
18 | 20 |
|
19 | 21 | from vec_inf.client._exceptions import ( |
@@ -181,6 +183,49 @@ def batch_launch_models( |
181 | 183 | ) |
182 | 184 | return model_launcher.launch() |
183 | 185 |
|
def fetch_running_jobs(self) -> list[str]:
    """Fetch the vec-inf job IDs that squeue lists for the current user.

    Every job reported by ``squeue --me`` is looked up with
    ``scontrol show job`` to obtain its ``JobName`` field; a job is kept
    when that name ends with ``-vec-inf``.

    Returns
    -------
    list[str]
        IDs of the matching jobs, in the order squeue reported them.
        Empty when there are no matching jobs or when the ``squeue``
        executable cannot be found.

    Raises
    ------
    SlurmJobError
        If ``squeue`` runs but exits with a non-zero status.
    """
    try:
        squeue = subprocess.run(
            ["squeue", "--me", "--noheader"],
            capture_output=True,
            text=True,
            check=True,
        )
    except FileNotFoundError:
        # squeue is not installed / not on PATH: honour the documented
        # "empty list if squeue unavailable" contract instead of crashing.
        return []
    except subprocess.CalledProcessError as e:
        raise SlurmJobError(f"Error running slurm command: {e}") from e

    # The first whitespace-separated column of each squeue row is the job ID.
    job_ids = [
        line.split()[0] for line in squeue.stdout.splitlines() if line.strip()
    ]

    # Example scontrol output: "JobId=12345 JobName=my-long-job-name-vec-inf ..."
    job_name_pattern = re.compile(r"\bJobName=([^\s]+)")

    matching_ids: list[str] = []
    for job_id in job_ids:
        try:
            sctl = subprocess.run(
                ["scontrol", "show", "job", "-o", job_id],
                capture_output=True,
                text=True,
                check=True,
            )
        except subprocess.CalledProcessError:
            # Job might have finished between squeue and scontrol; skip it.
            continue

        match = job_name_pattern.search(sctl.stdout)
        if match and match.group(1).endswith("-vec-inf"):
            matching_ids.append(job_id)

    return matching_ids
| 227 | + |
| 228 | + |
184 | 229 | def get_status(self, slurm_job_id: str) -> StatusResponse: |
185 | 230 | """Get the status of a running model. |
186 | 231 |
|
|
0 commit comments