
Commit 11da920

Merge branch 'main' into feature/misc-fixes

2 parents: 1938311 + 6d21da4

File tree: 8 files changed (+181, -151 lines)


MODEL_TRACKING.md

Lines changed: 7 additions & 2 deletions
```diff
@@ -166,8 +166,8 @@ This document tracks all model weights available in the `/model-weights` directo
 | Model | Configuration |
 |:------|:-------------|
 | `Qwen3-14B` ||
-| `Qwen3-8B` | |
-| `Qwen3-32B` | |
+| `Qwen3-8B` | |
+| `Qwen3-32B` | |
 | `Qwen3-235B-A22B` ||
 | `Qwen3-Embedding-8B` ||
 
@@ -187,6 +187,11 @@ This document tracks all model weights available in the `/model-weights` directo
 | `DeepSeek-Coder-V2-Lite-Instruct` ||
 | `deepseek-math-7b-instruct` ||
 
+### OpenAI: GPT-OSS
+| Model | Configuration |
+|:------|:-------------|
+| `gpt-oss-120b` ||
+
 ### Other LLM Models
 | Model | Configuration |
 |:------|:-------------|
```

README.md

Lines changed: 2 additions & 2 deletions
````diff
@@ -7,7 +7,7 @@
 [![code checks](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml)
 [![docs](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml)
 [![codecov](https://codecov.io/github/VectorInstitute/vector-inference/branch/main/graph/badge.svg?token=NI88QSIGAC)](https://app.codecov.io/github/VectorInstitute/vector-inference/tree/main)
-[![vLLM](https://img.shields.io/badge/vLLM-0.10.1.1-blue)](https://docs.vllm.ai/en/v0.10.1.1/)
+[![vLLM](https://img.shields.io/badge/vLLM-0.11.0-blue)](https://docs.vllm.ai/en/v0.11.0/)
 ![GitHub License](https://img.shields.io/github/license/VectorInstitute/vector-inference)
 
 This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **This package runs natively on the Vector Institute cluster environments**. To adapt to other environments, follow the instructions in [Installation](#installation).
@@ -20,7 +20,7 @@ If you are using the Vector cluster environment, and you don't need any customiz
 ```bash
 pip install vec-inf
 ```
-Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.10.1.1`.
+Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.11.0`.
 
 If you'd like to use `vec-inf` on your own Slurm cluster, you would need to update the configuration files, there are 3 ways to do it:
 * Clone the repository and update the `environment.yaml` and the `models.yaml` file in [`vec_inf/config`](vec_inf/config/), then install from source by running `pip install .`.
````

docs/index.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -12,7 +12,7 @@ If you are using the Vector cluster environment, and you don't need any customiz
 pip install vec-inf
 ```
 
-Otherwise, we recommend using the provided [`Dockerfile`](https://github.com/VectorInstitute/vector-inference/blob/main/Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.10.1.1`.
+Otherwise, we recommend using the provided [`Dockerfile`](https://github.com/VectorInstitute/vector-inference/blob/main/Dockerfile) to set up your own environment with the package. The latest image has `vLLM` version `0.11.0`.
 
 If you'd like to use `vec-inf` on your own Slurm cluster, you would need to update the configuration files, there are 3 ways to do it:
 * Clone the repository and update the `environment.yaml` and the `models.yaml` file in [`vec_inf/config`](https://github.com/VectorInstitute/vector-inference/blob/main/vec_inf/config), then install from source by running `pip install .`.
````

pyproject.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,6 +1,6 @@
 [project]
 name = "vec-inf"
-version = "0.7.1"
+version = "0.7.2"
 description = "Efficient LLM inference on Slurm clusters using vLLM."
 readme = "README.md"
 authors = [{name = "Marshall Wang", email = "marshall.wang@vectorinstitute.ai"}]
```

vec_inf/cli/_helper.py

Lines changed: 42 additions & 27 deletions
```diff
@@ -36,6 +36,43 @@ def __init__(self, model_name: str, params: dict[str, Any]):
         self.model_name = model_name
         self.params = params
 
+    def _add_resource_allocation_details(self, table: Table) -> None:
+        """Add resource allocation details to the table."""
+        optional_fields = [
+            ("account", "Account"),
+            ("work_dir", "Working Directory"),
+            ("resource_type", "Resource Type"),
+            ("partition", "Partition"),
+            ("qos", "QoS"),
+        ]
+        for key, label in optional_fields:
+            if self.params.get(key):
+                table.add_row(label, self.params[key])
+
+    def _add_vllm_config(self, table: Table) -> None:
+        """Add vLLM configuration details to the table."""
+        if self.params.get("vllm_args"):
+            table.add_row("vLLM Arguments:", style="magenta")
+            for arg, value in self.params["vllm_args"].items():
+                table.add_row(f" {arg}:", str(value))
+
+    def _add_env_vars(self, table: Table) -> None:
+        """Add environment variable configuration details to the table."""
+        if self.params.get("env"):
+            table.add_row("Environment Variables", style="magenta")
+            for arg, value in self.params["env"].items():
+                table.add_row(f" {arg}:", str(value))
+
+    def _add_bind_paths(self, table: Table) -> None:
+        """Add bind path configuration details to the table."""
+        if self.params.get("bind"):
+            table.add_row("Bind Paths", style="magenta")
+            for path in self.params["bind"].split(","):
+                host = target = path
+                if ":" in path:
+                    host, target = path.split(":")
+                table.add_row(f" {host}:", target)
+
     def format_table_output(self) -> Table:
         """Format output as rich Table.
 
@@ -59,16 +96,7 @@ def format_table_output(self) -> Table:
         table.add_row("Vocabulary Size", self.params["vocab_size"])
 
         # Add resource allocation details
-        if self.params.get("account"):
-            table.add_row("Account", self.params["account"])
-        if self.params.get("work_dir"):
-            table.add_row("Working Directory", self.params["work_dir"])
-        if self.params.get("resource_type"):
-            table.add_row("Resource Type", self.params["resource_type"])
-        if self.params.get("partition"):
-            table.add_row("Partition", self.params["partition"])
-        if self.params.get("qos"):
-            table.add_row("QoS", self.params["qos"])
+        self._add_resource_allocation_details(table)
         table.add_row("Time Limit", self.params["time"])
         table.add_row("Num Nodes", self.params["num_nodes"])
         table.add_row("GPUs/Node", self.params["gpus_per_node"])
@@ -84,23 +112,10 @@ def format_table_output(self) -> Table:
         )
         table.add_row("Log Directory", self.params["log_dir"])
 
-        # Add vLLM configuration details
-        table.add_row("vLLM Arguments:", style="magenta")
-        for arg, value in self.params["vllm_args"].items():
-            table.add_row(f" {arg}:", str(value))
-
-        # Add environment variable configuration details
-        table.add_row("Environment Variables", style="magenta")
-        for arg, value in self.params["env"].items():
-            table.add_row(f" {arg}:", str(value))
-
-        # Add bind path configuration details
-        table.add_row("Bind Paths", style="magenta")
-        for path in self.params["bind"].split(","):
-            host = target = path
-            if ":" in path:
-                host, target = path.split(":")
-            table.add_row(f" {host}:", target)
+        # Add configuration details
+        self._add_vllm_config(table)
+        self._add_env_vars(table)
+        self._add_bind_paths(table)
 
         return table
 
```
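The new `_add_bind_paths` helper parses Apptainer/Singularity-style bind strings: comma-separated entries, each either a bare path or a `host:target` pair. A minimal standalone sketch of that convention (the `parse_bind_paths` name and sample paths are illustrative, not part of the package):

```python
# Illustrative sketch only -- mirrors the splitting logic in _add_bind_paths.
# A bare path binds to itself; "host:target" maps a host path into the container.
def parse_bind_paths(bind: str) -> list[tuple[str, str]]:
    pairs = []
    for path in bind.split(","):
        host = target = path  # bare path: host and target are the same
        if ":" in path:
            host, target = path.split(":")
        pairs.append((host, target))
    return pairs


print(parse_bind_paths("/scratch:/scratch,/model-weights"))
# [('/scratch', '/scratch'), ('/model-weights', '/model-weights')]
```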

vec_inf/client/_helper.py

Lines changed: 61 additions & 26 deletions
```diff
@@ -196,23 +196,14 @@ def _process_env_vars(self, env_arg: str) -> dict[str, str]:
                 print(f"WARNING: Could not parse env var: {line}")
         return env_vars
 
-    def _get_launch_params(self) -> dict[str, Any]:
-        """Prepare launch parameters, set log dir, and validate required fields.
-
-        Returns
-        -------
-        dict[str, Any]
-            Dictionary of prepared launch parameters
+    def _apply_cli_overrides(self, params: dict[str, Any]) -> None:
+        """Apply CLI argument overrides to params.
 
-        Raises
-        ------
-        MissingRequiredFieldsError
-            If required fields are missing or tensor parallel size is not specified
-            when using multiple GPUs
+        Parameters
+        ----------
+        params : dict[str, Any]
+            Dictionary of launch parameters to override
         """
-        params = self.model_config.model_dump(exclude_none=True)
-
-        # Override config defaults with CLI arguments
         if self.kwargs.get("vllm_args"):
             vllm_args = self._process_vllm_args(self.kwargs["vllm_args"])
             for key, value in vllm_args.items():
@@ -225,13 +216,29 @@ def _get_launch_params(self) -> dict[str, Any]:
                 params["env"][key] = str(value)
             del self.kwargs["env"]
 
+        if self.kwargs.get("bind") and params.get("bind"):
+            params["bind"] = f"{params['bind']},{self.kwargs['bind']}"
+            del self.kwargs["bind"]
+
         for key, value in self.kwargs.items():
             params[key] = value
 
-        # Check for required fields without default vals, will raise an error if missing
-        utils.check_required_fields(params)
+    def _validate_resource_allocation(self, params: dict[str, Any]) -> None:
+        """Validate resource allocation and parallelization settings.
 
-        # Validate resource allocation and parallelization settings
+        Parameters
+        ----------
+        params : dict[str, Any]
+            Dictionary of launch parameters to validate
+
+        Raises
+        ------
+        MissingRequiredFieldsError
+            If tensor parallel size is not specified when using multiple GPUs
+        ValueError
+            If total # of GPUs requested is not a power of two
+            If mismatch between total # of GPUs requested and parallelization settings
+        """
         if (
             int(params["gpus_per_node"]) > 1
             and params["vllm_args"].get("--tensor-parallel-size") is None
@@ -252,19 +259,18 @@ def _get_launch_params(self) -> dict[str, Any]:
                 "Mismatch between total number of GPUs requested and parallelization settings"
             )
 
-        # Convert gpus_per_node and resource_type to gres
-        resource_type = params.get("resource_type")
-        if resource_type:
-            params["gres"] = f"gpu:{resource_type}:{params['gpus_per_node']}"
-        else:
-            params["gres"] = f"gpu:{params['gpus_per_node']}"
+    def _setup_log_files(self, params: dict[str, Any]) -> None:
+        """Set up log directory and file paths.
 
-        # Create log directory
+        Parameters
+        ----------
+        params : dict[str, Any]
+            Dictionary of launch parameters to set up log files
+        """
         params["log_dir"] = Path(params["log_dir"], params["model_family"]).expanduser()
         params["log_dir"].mkdir(parents=True, exist_ok=True)
         params["src_dir"] = SRC_DIR
 
-        # Construct slurm log file paths
         params["out_file"] = (
             f"{params['log_dir']}/{self.model_name}.%j/{self.model_name}.%j.out"
         )
@@ -275,6 +281,35 @@ def _get_launch_params(self) -> dict[str, Any]:
             f"{params['log_dir']}/{self.model_name}.$SLURM_JOB_ID/{self.model_name}.$SLURM_JOB_ID.json"
         )
 
+    def _get_launch_params(self) -> dict[str, Any]:
+        """Prepare launch parameters, set log dir, and validate required fields.
+
+        Returns
+        -------
+        dict[str, Any]
+            Dictionary of prepared launch parameters
+        """
+        params = self.model_config.model_dump(exclude_none=True)
+
+        # Override config defaults with CLI arguments
+        self._apply_cli_overrides(params)
+
+        # Check for required fields without default vals, will raise an error if missing
+        utils.check_required_fields(params)
+
+        # Validate resource allocation and parallelization settings
+        self._validate_resource_allocation(params)
+
+        # Convert gpus_per_node and resource_type to gres
+        resource_type = params.get("resource_type")
+        if resource_type:
+            params["gres"] = f"gpu:{resource_type}:{params['gpus_per_node']}"
+        else:
+            params["gres"] = f"gpu:{params['gpus_per_node']}"
+
+        # Setup log files
+        self._setup_log_files(params)
+
         # Convert path to string for JSON serialization
         for field in params:
             if field in ["vllm_args", "env"]:
```
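The refactor decomposes `_get_launch_params` into single-purpose steps: apply CLI overrides, check required fields, validate the GPU topology, build the Slurm `gres` string, and set up log files. A rough standalone sketch of the topology check and `gres` construction (assumed semantics and a hypothetical function name; the package's actual checks live in `_validate_resource_allocation`):

```python
# Hypothetical standalone sketch (not the package's API): the GPU-topology
# rules described in the _validate_resource_allocation docstring, plus the
# gres string built afterwards in _get_launch_params.
def validate_and_build_gres(
    num_nodes: int,
    gpus_per_node: int,
    tensor_parallel_size: int,
    pipeline_parallel_size: int = 1,
    resource_type: str | None = None,
) -> str:
    total_gpus = num_nodes * gpus_per_node
    # Power-of-two check via bit trick: n & (n - 1) == 0 for n = 1, 2, 4, 8, ...
    if total_gpus < 1 or total_gpus & (total_gpus - 1) != 0:
        raise ValueError("Total number of GPUs requested is not a power of two")
    if tensor_parallel_size * pipeline_parallel_size != total_gpus:
        raise ValueError(
            "Mismatch between total number of GPUs requested and parallelization settings"
        )
    # gres format: gpu:<count>, or gpu:<type>:<count> when a resource type is set
    if resource_type:
        return f"gpu:{resource_type}:{gpus_per_node}"
    return f"gpu:{gpus_per_node}"


print(validate_and_build_gres(2, 4, tensor_parallel_size=8, resource_type="a100"))
# gpu:a100:4
```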

vec_inf/client/_utils.py

Lines changed: 54 additions & 5 deletions
```diff
@@ -108,15 +108,64 @@ def is_server_running(
     if isinstance(log_content, str):
         return log_content
 
-    status: Union[str, tuple[ModelStatus, str]] = ModelStatus.LAUNCHING
+    # Patterns that indicate fatal errors (not just warnings)
+    fatal_error_patterns = [
+        "traceback",
+        "exception",
+        "fatal error",
+        "critical error",
+        "failed to",
+        "could not",
+        "unable to",
+        "error:",
+    ]
+
+    # Patterns to ignore (non-fatal warnings/info messages)
+    ignore_patterns = [
+        "deprecated",
+        "futurewarning",
+        "userwarning",
+        "deprecationwarning",
+        "slurmstepd: error:",  # SLURM cancellation messages (often after server started)
+    ]
+
+    ready_signature_found = False
+    fatal_error_line = None
 
     for line in log_content:
-        if "error" in line.lower():
-            status = (ModelStatus.FAILED, line.strip("\n"))
+        line_lower = line.lower()
+
+        # Check for ready signature first - if found, server is running
         if MODEL_READY_SIGNATURE in line:
-            status = "RUNNING"
+            ready_signature_found = True
+            # Continue checking to see if there are errors after startup
+
+        # Check for fatal errors (only if we haven't seen ready signature yet)
+        if not ready_signature_found:
+            # Skip lines that match ignore patterns
+            if any(ignore_pattern in line_lower for ignore_pattern in ignore_patterns):
+                continue
 
-    return status
+            # Check for fatal error patterns
+            for pattern in fatal_error_patterns:
+                if pattern in line_lower:
+                    # Additional check: skip if it's part of a warning message
+                    # (warnings often contain "error:" but aren't fatal)
+                    if "warning" in line_lower and "error:" in line_lower:
+                        continue
+                    fatal_error_line = line.strip("\n")
+                    break
+
+    # If we found a fatal error, mark as failed
+    if fatal_error_line:
+        return (ModelStatus.FAILED, fatal_error_line)
+
+    # If ready signature was found and no fatal errors, server is running
+    if ready_signature_found:
+        return "RUNNING"
+
+    # Otherwise, still launching
+    return ModelStatus.LAUNCHING
 
 
 def get_base_url(slurm_job_name: str, slurm_job_id: str, log_dir: str) -> str:
```
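The rewritten `is_server_running` no longer fails a job on any line containing "error": warning-like lines are skipped, fatal patterns only count before the ready signature appears, and errors logged after startup (such as Slurm cancellation messages) do not flip a RUNNING server to FAILED. A self-contained toy run of the same classification logic (the ready signature and log lines below are made up for illustration):

```python
# Toy reproduction of the new classification; MODEL_READY_SIGNATURE and the
# log lines below are placeholders, not the package's real values.
MODEL_READY_SIGNATURE = "INFO:     Application startup complete."

FATAL = ["traceback", "exception", "fatal error", "critical error",
         "failed to", "could not", "unable to", "error:"]
IGNORE = ["deprecated", "futurewarning", "userwarning",
          "deprecationwarning", "slurmstepd: error:"]


def classify(log_lines: list[str]) -> str:
    """Mirror of the commit's scan: ready flag wins, warning-like lines are ignored."""
    ready, fatal_line = False, None
    for line in log_lines:
        low = line.lower()
        if MODEL_READY_SIGNATURE in line:
            ready = True
        if not ready:
            if any(p in low for p in IGNORE):
                continue
            for p in FATAL:
                if p in low and not ("warning" in low and "error:" in low):
                    fatal_line = line.strip("\n")
                    break
    if fatal_line:
        return f"FAILED: {fatal_line}"
    return "RUNNING" if ready else "LAUNCHING"


print(classify(["Loading model weights..."]))                    # LAUNCHING
print(classify(["DeprecationWarning: use --foo instead"]))       # LAUNCHING (ignored)
print(classify(["Traceback (most recent call last):"]))          # FAILED: ...
print(classify([MODEL_READY_SIGNATURE,
                "slurmstepd: error: *** JOB CANCELLED ***"]))    # RUNNING
```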
