Skip to content

Commit 170c61a

Browse files
authored
Merge branch 'main' into slurm_dependency
2 parents 85eb85a + 019ca54 commit 170c61a

File tree

9 files changed: +44 −52 lines changed

Dockerfile

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:12.3.1-devel-ubuntu20.04
+FROM nvidia/cuda:12.4.1-devel-ubuntu20.04
 
 # Non-interactive apt-get commands
 ARG DEBIAN_FRONTEND=noninteractive
@@ -41,8 +41,10 @@ COPY . /vec-inf
 
 # Install project dependencies with build requirements
 RUN PIP_INDEX_URL="https://download.pytorch.org/whl/cu121" uv pip install --system -e .[dev]
-# Install Flash Attention
+# Install FlashAttention
 RUN python3.10 -m pip install flash-attn --no-build-isolation
+# Install FlashInfer
+RUN python3.10 -m pip install flashinfer-python -i https://flashinfer.ai/whl/cu124/torch2.6/
 
 # Final configuration
 RUN mkdir -p /vec-inf/nccl && \

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
 ----------------------------------------------------
 
 [![PyPI](https://img.shields.io/pypi/v/vec-inf)](https://pypi.org/project/vec-inf)
-[![downloads](https://img.shields.io/pypi/dm/vec-inf)]
+[![downloads](https://img.shields.io/pypi/dm/vec-inf)](https://pypistats.org/packages/vec-inf)
 [![code checks](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml)
 [![docs](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs.yml)
 [![codecov](https://codecov.io/github/VectorInstitute/vector-inference/branch/main/graph/badge.svg?token=NI88QSIGAC)](https://app.codecov.io/github/VectorInstitute/vector-inference/tree/main)

vec_inf/cli/_cli.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
     MetricsResponseFormatter,
     StatusResponseFormatter,
 )
-from vec_inf.client import LaunchOptions, LaunchOptionsDict, VecInfClient
+from vec_inf.client import LaunchOptions, VecInfClient
 
 
 CONSOLE = Console()
@@ -63,6 +63,11 @@ def cli() -> None:
     type=int,
     help="Number of GPUs/node to use, default to suggested resource allocation for model",
 )
+@click.option(
+    "--account",
+    type=str,
+    help="Charge resources used by this job to specified account.",
+)
 @click.option(
     "--qos",
     type=str,
@@ -142,17 +147,18 @@ def launch(
     """
     try:
         # Convert cli_kwargs to LaunchOptions
-        kwargs = {k: v for k, v in cli_kwargs.items() if k != "json_mode"}
-        # Cast the dictionary to LaunchOptionsDict
-        options_dict: LaunchOptionsDict = kwargs  # type: ignore
-        launch_options = LaunchOptions(**options_dict)
+        json_mode = cli_kwargs["json_mode"]
+        del cli_kwargs["json_mode"]
+
+        launch_options = LaunchOptions(**cli_kwargs)  # type: ignore
 
         # Start the client and launch model inference server
         client = VecInfClient()
         launch_response = client.launch_model(model_name, launch_options)
 
         # Display launch information
         launch_formatter = LaunchResponseFormatter(model_name, launch_response.config)
+
         if cli_kwargs.get("json_mode"):
             click.echo(json.dumps(launch_response.config))
         else:

vec_inf/client/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
 from vec_inf.client.config import ModelConfig
 from vec_inf.client.models import (
     LaunchOptions,
-    LaunchOptionsDict,
     LaunchResponse,
     MetricsResponse,
     ModelInfo,
@@ -28,6 +27,5 @@
     "ModelStatus",
     "ModelType",
     "LaunchOptions",
-    "LaunchOptionsDict",
     "ModelConfig",
 ]

vec_inf/client/_client_vars.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
 SLURM_JOB_CONFIG_ARGS = {
     "job-name": "model_name",
     "partition": "partition",
+    "account": "account",
     "qos": "qos",
     "time": "time",
     "nodes": "num_nodes",
@@ -66,6 +67,13 @@
     "error": "err_file",
 }
 
+# vLLM engine args mapping between short and long names
+VLLM_SHORT_TO_LONG_MAP = {
+    "-tp": "--tensor-parallel-size",
+    "-pp": "--pipeline-parallel-size",
+    "-O": "--compilation-config",
+}
+
 
 # Slurm script templates
 class ShebangConfig(TypedDict):

vec_inf/client/_helper.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
     KEY_METRICS,
     REQUIRED_FIELDS,
     SRC_DIR,
+    VLLM_SHORT_TO_LONG_MAP,
 )
 from vec_inf.client._exceptions import (
     MissingRequiredFieldsError,
@@ -156,9 +157,14 @@ def _process_vllm_args(self, arg_string: str) -> dict[str, Any]:
         for arg in arg_string.split(","):
             if "=" in arg:
                 key, value = arg.split("=")
-                vllm_args[key] = value
+                if key.strip() in VLLM_SHORT_TO_LONG_MAP:
+                    key = VLLM_SHORT_TO_LONG_MAP[key.strip()]
+                vllm_args[key.strip()] = value.strip()
+            elif "-O" in arg.strip():
+                key = VLLM_SHORT_TO_LONG_MAP["-O"]
+                vllm_args[key] = arg.strip()[2:].strip()
             else:
-                vllm_args[arg] = True
+                vllm_args[arg.strip()] = True
         return vllm_args
 
     def _get_launch_params(self) -> dict[str, Any]:
@@ -175,7 +181,7 @@ def _get_launch_params(self) -> dict[str, Any]:
         If required fields are missing or tensor parallel size is not specified
         when using multiple GPUs
         """
-        params = self.model_config.model_dump()
+        params = self.model_config.model_dump(exclude_none=True)
 
         # Override config defaults with CLI arguments
         if self.kwargs.get("vllm_args"):

vec_inf/client/_slurm_script_generator.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,8 @@ def _generate_shebang(self) -> str:
         """
         shebang = [SLURM_SCRIPT_TEMPLATE["shebang"]["base"]]
         for arg, value in SLURM_JOB_CONFIG_ARGS.items():
-            shebang.append(f"#SBATCH --{arg}={self.params[value]}")
+            if self.params.get(value):
+                shebang.append(f"#SBATCH --{arg}={self.params[value]}")
         if self.is_multinode:
             shebang += SLURM_SCRIPT_TEMPLATE["shebang"]["multinode"]
         return "\n".join(shebang)

vec_inf/client/config.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ class ModelConfig(BaseModel):
         Memory allocation per node in GB format (e.g., '32G')
     vocab_size : int
         Size of the model's vocabulary (1-1,000,000)
+    account : Optional[str], optional
+        Charge resources used by this job to specified account.
     qos : Union[QOS, str], optional
         Quality of Service tier for job scheduling
     time : str, optional
@@ -92,6 +94,9 @@ class ModelConfig(BaseModel):
         description="Memory per node",
     )
     vocab_size: int = Field(..., gt=0, le=1_000_000)
+    account: Optional[str] = Field(
+        default=None, description="Account name for job scheduling"
+    )
     qos: Union[QOS, str] = Field(
         default=cast(str, DEFAULT_ARGS["qos"]), description="Quality of Service tier"
     )

vec_inf/client/models.py

Lines changed: 4 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
 
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import Any, Optional, TypedDict, Union
+from typing import Any, Optional, Union
 
 
 class ModelStatus(str, Enum):
@@ -164,6 +164,8 @@ class LaunchOptions:
         Number of nodes to allocate
     gpus_per_node : int, optional
         Number of GPUs per node
+    account : str, optional
+        Account name for job scheduling
     qos : str, optional
         Quality of Service level
     time : str, optional
@@ -187,6 +189,7 @@ class LaunchOptions:
     partition: Optional[str] = None
     num_nodes: Optional[int] = None
     gpus_per_node: Optional[int] = None
+    account: Optional[str] = None
     qos: Optional[str] = None
     time: Optional[str] = None
     vocab_size: Optional[int] = None
@@ -197,43 +200,6 @@ class LaunchOptions:
     vllm_args: Optional[str] = None
 
 
-class LaunchOptionsDict(TypedDict):
-    """TypedDict for LaunchOptions.
-
-    A TypedDict representation of LaunchOptions for type checking and
-    serialization purposes. All fields are optional and may be None.
-
-    Attributes
-    ----------
-    model_family : str, optional
-        Family/architecture of the model
-    model_variant : str, optional
-        Specific variant/version of the model
-    partition : str, optional
-        SLURM partition to use
-    num_nodes : int, optional
-        Number of nodes to allocate
-    gpus_per_node : int, optional
-        Number of GPUs per node
-    qos : str, optional
-        Quality of Service level
-    time : str, optional
-        Time limit for the job
-    vocab_size : int, optional
-        Size of model vocabulary
-    data_type : str, optional
-        Data type for model weights
-    venv : str, optional
-        Virtual environment to use
-    log_dir : str, optional
-        Directory for logs
-    model_weights_parent_dir : str, optional
-        Parent directory containing model weights
-    vllm_args : str, optional
-        Additional arguments for vLLM
-    """
-
-
 @dataclass
 class ModelInfo:
     """Information about an available model.

0 commit comments

Comments (0)