Merge pull request #76 from VectorInstitute/nmb_vlm_models

XkunW · web-flow · commit a4aad27c89ee · 2025-03-20T11:15:14.000-04:00
* Onboard new VLM models 
* Add prefix cache hit rate metric to metrics command
diff --git a/tests/vec_inf/cli/test_cli.py b/tests/vec_inf/cli/test_cli.py
@@ -68,6 +68,9 @@ def _exists(p):
         # Allow access to the default config file
         if str(p).endswith("config/models.yaml"):
             return True
+        # Allow access to the default log directory
+        if str(p).endswith(".vec-inf-logs"):
+            return True
         # Use mock_exists for other paths
         return mock_exists(p)
 
@@ -143,14 +146,27 @@ def test_paths():
 def mock_truediv(test_paths):
     """Fixture providing path joining mock."""
 
-    def _mock_truediv(self, other):
-        if str(self) == str(test_paths["weights_dir"]) and other == "unknown-model":
-            return test_paths["unknown_model"]
-        if str(self) == str(test_paths["log_dir"]):
-            return test_paths["log_dir"] / other
-        if str(self) == str(test_paths["log_dir"] / "model_family_placeholder"):
-            return test_paths["log_dir"] / "model_family_placeholder" / other
-        return Path(str(self)) / str(other)
+    def _mock_truediv(*args):  # stand-in for Path.__truediv__: (other,) or (self, other)
+        # Called with a single argument: treat it as `other`; there is no left operand
+        if len(args) == 1:
+            other = args[0]
+            return test_paths.get(other, Path(str(other)))  # known key, else Path(other)
+
+        # Two arguments: unpack as (self, other); any other count raises ValueError here
+        self, other = args
+        specific_paths = {  # (str(self), other) -> result; values are rebuilt on each call
+            (str(test_paths["weights_dir"]), "unknown-model"): test_paths[
+                "unknown_model"
+            ],
+            (str(test_paths["log_dir"]), other): test_paths["log_dir"] / other,  # self is log_dir, any other
+            (
+                str(test_paths["log_dir"] / "model_family_placeholder"),
+                other,
+            ): test_paths["log_dir"] / "model_family_placeholder" / other,
+            ("/home/user", ".vec-inf-logs"): test_paths["log_dir"],  # mocked home's log dir -> test log dir
+        }
+
+        return specific_paths.get((str(self), other), Path(str(self)) / str(other))  # no match: plain join
 
     return _mock_truediv
 
@@ -201,12 +217,26 @@ def base_patches(test_paths, mock_truediv, debug_helper):
             "pathlib.Path.parent", return_value=debug_helper.config_file.parent.parent
         ),
         patch("pathlib.Path.__truediv__", side_effect=mock_truediv),
+        patch("pathlib.Path.iterdir", return_value=[]),  # Mock empty directory listing
         patch("json.dump"),
         patch("pathlib.Path.touch"),
         patch("vec_inf.cli._helper.Path", return_value=test_paths["weights_dir"]),
+        patch(
+            "pathlib.Path.home", return_value=Path("/home/user")
+        ),  # Mock home directory
     ]
 
 
+@pytest.fixture
+def apply_base_patches(base_patches):
+    """Fixture to apply all base patches."""
+    with ExitStack() as stack:
+        # Enter every patch on one stack so they are all exited when the `with` block ends
+        for patch_obj in base_patches:
+            stack.enter_context(patch_obj)
+        yield  # the test runs here with the patches entered; nothing is yielded to it
+
+
 def test_launch_command_success(runner, mock_launch_output, path_exists, debug_helper):
     """Test successful model launch with minimal required arguments."""
     test_log_dir = Path("/tmp/test_vec_inf_logs")
@@ -374,7 +404,7 @@ def test_list_single_model(runner):
 
 
 def test_metrics_command_pending_server(
-    runner, mock_status_output, path_exists, debug_helper
+    runner, mock_status_output, path_exists, debug_helper, apply_base_patches
 ):
     """Test metrics command when server is pending."""
     with (
@@ -398,7 +428,7 @@ def test_metrics_command_pending_server(
 
 
 def test_metrics_command_server_not_ready(
-    runner, mock_status_output, path_exists, debug_helper
+    runner, mock_status_output, path_exists, debug_helper, apply_base_patches
 ):
     """Test metrics command when server is running but not ready."""
     with (
@@ -420,7 +450,7 @@ def test_metrics_command_server_not_ready(
 
 @patch("vec_inf.cli._helper.requests.get")
 def test_metrics_command_server_ready(
-    mock_get, runner, mock_status_output, path_exists, debug_helper
+    mock_get, runner, mock_status_output, path_exists, debug_helper, apply_base_patches
 ):
     """Test metrics command when server is ready and returning metrics."""
     metrics_response = """
@@ -459,7 +489,7 @@ def test_metrics_command_server_ready(
 
 @patch("vec_inf.cli._helper.requests.get")
 def test_metrics_command_request_failed(
-    mock_get, runner, mock_status_output, path_exists, debug_helper
+    mock_get, runner, mock_status_output, path_exists, debug_helper, apply_base_patches
 ):
     """Test metrics command when request to metrics endpoint fails."""
     mock_get.side_effect = requests.exceptions.RequestException("Connection refused")
diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py
@@ -107,8 +107,8 @@ def cli() -> None:
 )
 @click.option(
     "--compilation-config",
-    type=click.Choice(["0", "1", "2", "3"]),
-    help="torch.compile optimization level, accepts '0', '1', '2', or '3', default to '0', which means no optimization is applied",
+    type=click.Choice(["0", "3"]),
+    help="torch.compile optimization level, accepts '0' or '3', default to '0', which means no optimization is applied",
 )
 @click.option(
     "--enforce-eager",
diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py
@@ -352,6 +352,7 @@ def __init__(self, slurm_job_id: int, log_dir: Optional[str] = None):
         self.log_dir = log_dir
         self.status_info = self._get_status_info()
         self.metrics_url = self._build_metrics_url()
+        self.enabled_prefix_caching = self._check_prefix_caching()
 
         self._prev_prompt_tokens: float = 0.0
         self._prev_generation_tokens: float = 0.0
@@ -386,6 +387,18 @@ def _build_metrics_url(self) -> str:
             (parsed.scheme, parsed.netloc, f"{clean_path}/metrics", "", "", "")
         )
 
+    def _check_prefix_caching(self) -> bool:
+        """Check if prefix caching is enabled."""
+        job_json = utils.read_slurm_log(  # request the "json" log for this job and model
+            cast(str, self.status_info["model_name"]),
+            self.slurm_job_id,
+            "json",
+            self.log_dir,
+        )
+        if isinstance(job_json, str):  # a str result (e.g. "LOG DIR NOT FOUND") means no usable JSON
+            return False
+        return bool(cast(dict[str, str], job_json).get("enable_prefix_caching", False))  # NOTE(review): bool("False") is True; confirm the stored value is a real bool
+
     def fetch_metrics(self) -> Union[dict[str, float], str]:
         """Fetch metrics from the endpoint."""
         try:
@@ -476,6 +489,10 @@ def _parse_metrics(self, metrics_text: str) -> dict[str, float]:
             "vllm:cpu_cache_usage_perc": "cpu_cache_usage",
         }
 
+        if self.enabled_prefix_caching:
+            key_metrics["vllm:gpu_prefix_cache_hit_rate"] = "gpu_prefix_cache_hit_rate"
+            key_metrics["vllm:cpu_prefix_cache_hit_rate"] = "cpu_prefix_cache_hit_rate"
+
         parsed: dict[str, float] = {}
         for line in metrics_text.split("\n"):
             if line.startswith("#") or not line.strip():
@@ -532,6 +549,16 @@ def display_metrics(self, table: Table, metrics: dict[str, float]) -> None:
             f"{metrics.get('cpu_cache_usage', 0) * 100:.1f}%",
         )
 
+        if self.enabled_prefix_caching:
+            table.add_row(
+                "GPU Prefix Cache Hit Rate",
+                f"{metrics.get('gpu_prefix_cache_hit_rate', 0) * 100:.1f}%",
+            )
+            table.add_row(
+                "CPU Prefix Cache Hit Rate",
+                f"{metrics.get('cpu_prefix_cache_hit_rate', 0) * 100:.1f}%",
+            )
+
         # Show average latency if available
         if "avg_request_latency" in metrics:
             table.add_row(
diff --git a/vec_inf/cli/_utils.py b/vec_inf/cli/_utils.py
@@ -35,6 +35,8 @@ def read_slurm_log(
     if not log_dir:
         # Default log directory
         models_dir = Path.home() / ".vec-inf-logs"
+        if not models_dir.exists():
+            return "LOG DIR NOT FOUND"
         # Iterate over all dirs in models_dir, sorted by dir name length in desc order
         for directory in sorted(
             [d for d in models_dir.iterdir() if d.is_dir()],
diff --git a/vec_inf/config/README.md b/vec_inf/config/README.md
@@ -162,6 +162,13 @@ More profiling metrics coming soon!
 
 ## Vision Language Models
 
+### [allenai: Molmo](https://huggingface.co/collections/allenai/molmo-66f379e6fe3b8ef090a8ca19)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Molmo-7B-D-0924`](https://huggingface.co/allenai/Molmo-7B-D-0924) | 1x a40 | - tokens/s | - tokens/s |
+
+
 ### [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
@@ -181,6 +188,7 @@ More profiling metrics coming soon!
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
 | [`Phi-3-vision-128k-instruct`](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
+| [`Phi-3.5-vision-instruct`](https://huggingface.co/microsoft/Phi-3.5-vision-instruct) | 2x a40 | - tokens/s | - tokens/s |
 
 ### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
 
@@ -199,6 +207,27 @@ More profiling metrics coming soon!
 |:----------:|:----------:|:----------:|:----------:|
 | [`Pixtral-12B-2409`](https://huggingface.co/mistralai/Pixtral-12B-2409) | 1x a40 | - tokens/s | - tokens/s |
 
+### [OpenGVLab: InternVL2.5](https://huggingface.co/collections/OpenGVLab/internvl25-673e1019b66e2218f68d7c1c)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`InternVL2_5-8B`](https://huggingface.co/OpenGVLab/InternVL2_5-8B) | 1x a40 | - tokens/s | - tokens/s |
+| [`InternVL2_5-26B`](https://huggingface.co/OpenGVLab/InternVL2_5-26B) | 2x a40 | - tokens/s | - tokens/s |
+| [`InternVL2_5-38B`](https://huggingface.co/OpenGVLab/InternVL2_5-38B) | 4x a40 | - tokens/s | - tokens/s |
+
+### [THUDM: GLM-4](https://huggingface.co/collections/THUDM/glm-4-665fcf188c414b03c2f7e3b7)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`glm-4v-9b`](https://huggingface.co/THUDM/glm-4v-9b) | 1x a40 | - tokens/s | - tokens/s |
+
+### [DeepSeek: DeepSeek-VL2](https://huggingface.co/collections/deepseek-ai/deepseek-vl2-675c22accc456d3beb4613ab)
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`deepseek-vl2`](https://huggingface.co/deepseek-ai/deepseek-vl2) | 2x a40 | - tokens/s | - tokens/s |
+| [`deepseek-vl2-small`](https://huggingface.co/deepseek-ai/deepseek-vl2-small) | 1x a40 | - tokens/s | - tokens/s |
+
+
 ## Text Embedding Models
 
 ### [Liang Wang: e5](https://huggingface.co/intfloat)
@@ -225,3 +254,4 @@ More profiling metrics coming soon!
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
 | [`Qwen2.5-Math-RM-72B`](https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B) | 4x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-Math-PRM-7B`](https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B) | 1x a40 | - tokens/s | - tokens/s |
diff --git a/vec_inf/config/models.yaml b/vec_inf/config/models.yaml

Original file line number	Diff line number	Diff line change
`@@ -107,8 +107,8 @@ def cli() -> None:`
`107`	`107`	`)`
`108`	`108`	`@click.option(`
`109`	`109`	`"--compilation-config",`
`110`		`- type=click.Choice(["0", "1", "2", "3"]),`
`111`		`- help="torch.compile optimization level, accepts '0', '1', '2', or '3', default to '0', which means no optimization is applied",`
	`110`	`+ type=click.Choice(["0", "3"]),`
	`111`	`+ help="torch.compile optimization level, accepts '0' or '3', default to '0', which means no optimization is applied",`
`112`	`112`	`)`
`113`	`113`	`@click.option(`
`114`	`114`	`"--enforce-eager",`