|
46 | 46 | from .efficiency_report import create_efficiency_report |
47 | 47 | from .submit_string import get_submit_command |
48 | 48 | from .partitions import read_partition_file, get_best_partition |
49 | | -from .validation import validate_slurm_extra, validate_executor_settings |
| 49 | +from .validation import ( |
| 50 | + validate_slurm_extra, |
| 51 | + validate_executor_settings, |
| 52 | + validate_status_command_settings, |
| 53 | +) |
50 | 54 |
|
51 | 55 |
|
52 | 56 | def _get_status_command_default(): |
@@ -266,8 +270,7 @@ class ExecutorSettings(ExecutorSettingsBase): |
266 | 270 |
|
267 | 271 | def __post_init__(self): |
268 | 272 | """Validate settings after initialization.""" |
269 | | - # Run all validation checks |
270 | | - validate_executor_settins(self) |
| 273 | + validate_executor_settings(self) |
271 | 274 |
|
272 | 275 |
|
273 | 276 | # Required: |
@@ -329,61 +332,10 @@ def __post_init__(self, test_mode: bool = False): |
329 | 332 | else None |
330 | 333 | ) |
331 | 334 | atexit.register(self.clean_old_logs) |
332 | | - |
333 | | - # Validate status_command configuration if the field exists |
334 | | - self._validate_status_command_settings() |
335 | | - |
336 | | - def _validate_status_command_settings(self): |
337 | | - """Validate and provide feedback about status_command configuration.""" |
338 | | - if hasattr(self.workflow.executor_settings, "status_command"): |
339 | | - status_command = self.workflow.executor_settings.status_command |
340 | | - if status_command: |
341 | | - min_job_age = get_min_job_age() |
342 | | - sacct_available = is_query_tool_available("sacct") |
343 | | - |
344 | | - # Threshold: 3x initial status check interval (default 40s) |
345 | | - initial_interval = getattr( |
346 | | - self.workflow.executor_settings, |
347 | | - "init_seconds_before_status_checks", |
348 | | - 40, |
349 | | - ) |
350 | | - dynamic_check_threshold = 3 * initial_interval |
351 | | - |
352 | | - if not sacct_available and status_command == "sacct": |
353 | | - self.logger.warning( |
354 | | - "The 'sacct' command is not available on this system. " |
355 | | - "Using 'squeue' instead for job status queries." |
356 | | - ) |
357 | | - elif sacct_available and min_job_age is not None: |
358 | | - if ( |
359 | | - min_job_age < dynamic_check_threshold |
360 | | - and status_command == "squeue" |
361 | | - ): |
362 | | - self.logger.warning( |
363 | | - f"MinJobAge is {min_job_age} seconds " |
364 | | - f"(< {dynamic_check_threshold}s). " |
365 | | - f"This may cause 'squeue' to miss recently finished jobs " |
366 | | - "that have been purged from slurmctld, leading to job " |
367 | | - "status queries being impossible with 'squeue'. " |
368 | | - "Consider using 'sacct' instead or let your admini- " |
369 | | - "strator increase MinJobAge. " |
370 | | - "(Threshold is 3x status check interval: 3 x " |
371 | | - f"{initial_interval}s = " |
372 | | - f"{dynamic_check_threshold}s)" |
373 | | - ) |
374 | | - elif ( |
375 | | - min_job_age >= dynamic_check_threshold |
376 | | - and status_command == "sacct" |
377 | | - ): |
378 | | - self.logger.warning( |
379 | | - f"MinJobAge is {min_job_age} seconds (>= " |
380 | | - f"{dynamic_check_threshold}s). " |
381 | | - f"The 'squeue' command should work reliably for " |
382 | | - "status queries. " |
383 | | - "(Threshold is 3x status check interval: 3 x " |
384 | | - f"{initial_interval}s = " |
385 | | - f"{dynamic_check_threshold}s)" |
386 | | - ) |
| 335 | + # moved validation to validation.py |
| 336 | + validate_status_command_settings( |
| 337 | + self.workflow.executor_settings, self.logger |
| 338 | + ) |
387 | 339 |
|
388 | 340 | def get_status_command(self): |
389 | 341 | """Get the status command to use, with fallback logic.""" |
@@ -654,13 +606,32 @@ async def check_active_jobs( |
654 | 606 | missing_sacct_status = set() |
655 | 607 |
|
656 | 608 | # decide which status command to use |
657 | | - status_command = self.get_status_command() |
658 | | - # Getting the actual command with parameters. |
659 | | - # Here, the command will be a list generated with |
660 | | - # shlex.split(). |
661 | | - if status_command == "sacct": |
| 609 | + status_command_name = self.get_status_command() |
| 610 | + min_job_age = get_min_job_age() |
| 611 | + initial_interval = getattr( |
| 612 | + self.workflow.executor_settings, |
| 613 | + "init_seconds_before_status_checks", |
| 614 | + 40, |
| 615 | + ) |
| 616 | + dynamic_check_threshold = 3 * initial_interval |
| 617 | + if status_command_name == "squeue": |
| 618 | + if ( |
| 619 | + min_job_age is None |
| 620 | + or min_job_age < dynamic_check_threshold |
| 621 | + ) and is_query_tool_available("sacct"): |
| 622 | + self.logger.info( |
| 623 | + "Falling back to 'sacct' for status queries " |
| 624 | + f"(MinJobAge={min_job_age}; threshold={dynamic_check_threshold}s)." |
| 625 | + ) |
| 626 | + status_command_name = "sacct" |
| 627 | + if status_command_name == "sacct" and not is_query_tool_available("sacct"): |
| 628 | + self.logger.info( |
| 629 | + "'sacct' unavailable, using 'squeue' for status queries." |
| 630 | + ) |
| 631 | + status_command_name = "squeue" |
| 632 | + if status_command_name == "sacct": |
662 | 633 | status_command = query_job_status_sacct(self.run_uuid) |
663 | | - elif status_command == "squeue": |
| 634 | + else: |
664 | 635 | status_command = query_job_status_squeue(self.run_uuid) |
665 | 636 |
|
666 | 637 | # this code is inspired by the snakemake profile: |
|
0 commit comments