Skip to content

Commit a4aad27

Browse files
authored
Merge pull request #76 from VectorInstitute/nmb_vlm_models
* Onboard new VLM models
* Add prefix cache hit rate metric to metrics command
2 parents 9ef73c0 + 784b6cb commit a4aad27

File tree

6 files changed

+188
-441
lines changed

tests/vec_inf/cli/test_cli.py

Lines changed: 42 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -68,6 +68,9 @@ def _exists(p):
6868
# Allow access to the default config file
6969
if str(p).endswith("config/models.yaml"):
7070
return True
71+
# Allow access to the default log directory
72+
if str(p).endswith(".vec-inf-logs"):
73+
return True
7174
# Use mock_exists for other paths
7275
return mock_exists(p)
7376

@@ -143,14 +146,27 @@ def test_paths():
143146
def mock_truediv(test_paths):
144147
"""Fixture providing path joining mock."""
145148

146-
def _mock_truediv(self, other):
147-
if str(self) == str(test_paths["weights_dir"]) and other == "unknown-model":
148-
return test_paths["unknown_model"]
149-
if str(self) == str(test_paths["log_dir"]):
150-
return test_paths["log_dir"] / other
151-
if str(self) == str(test_paths["log_dir"] / "model_family_placeholder"):
152-
return test_paths["log_dir"] / "model_family_placeholder" / other
153-
return Path(str(self)) / str(other)
149+
def _mock_truediv(*args):
150+
# Handle the case where it's called with just one argument
151+
if len(args) == 1:
152+
other = args[0]
153+
return test_paths.get(other, Path(str(other)))
154+
155+
# Normal case with self and other
156+
self, other = args
157+
specific_paths = {
158+
(str(test_paths["weights_dir"]), "unknown-model"): test_paths[
159+
"unknown_model"
160+
],
161+
(str(test_paths["log_dir"]), other): test_paths["log_dir"] / other,
162+
(
163+
str(test_paths["log_dir"] / "model_family_placeholder"),
164+
other,
165+
): test_paths["log_dir"] / "model_family_placeholder" / other,
166+
("/home/user", ".vec-inf-logs"): test_paths["log_dir"],
167+
}
168+
169+
return specific_paths.get((str(self), other), Path(str(self)) / str(other))
154170

155171
return _mock_truediv
156172

@@ -201,12 +217,26 @@ def base_patches(test_paths, mock_truediv, debug_helper):
201217
"pathlib.Path.parent", return_value=debug_helper.config_file.parent.parent
202218
),
203219
patch("pathlib.Path.__truediv__", side_effect=mock_truediv),
220+
patch("pathlib.Path.iterdir", return_value=[]), # Mock empty directory listing
204221
patch("json.dump"),
205222
patch("pathlib.Path.touch"),
206223
patch("vec_inf.cli._helper.Path", return_value=test_paths["weights_dir"]),
224+
patch(
225+
"pathlib.Path.home", return_value=Path("/home/user")
226+
), # Mock home directory
207227
]
208228

209229

230+
@pytest.fixture
231+
def apply_base_patches(base_patches):
232+
"""Fixture to apply all base patches."""
233+
with ExitStack() as stack:
234+
# Apply all patches
235+
for patch_obj in base_patches:
236+
stack.enter_context(patch_obj)
237+
yield
238+
239+
210240
def test_launch_command_success(runner, mock_launch_output, path_exists, debug_helper):
211241
"""Test successful model launch with minimal required arguments."""
212242
test_log_dir = Path("/tmp/test_vec_inf_logs")
@@ -374,7 +404,7 @@ def test_list_single_model(runner):
374404

375405

376406
def test_metrics_command_pending_server(
377-
runner, mock_status_output, path_exists, debug_helper
407+
runner, mock_status_output, path_exists, debug_helper, apply_base_patches
378408
):
379409
"""Test metrics command when server is pending."""
380410
with (
@@ -398,7 +428,7 @@ def test_metrics_command_pending_server(
398428

399429

400430
def test_metrics_command_server_not_ready(
401-
runner, mock_status_output, path_exists, debug_helper
431+
runner, mock_status_output, path_exists, debug_helper, apply_base_patches
402432
):
403433
"""Test metrics command when server is running but not ready."""
404434
with (
@@ -420,7 +450,7 @@ def test_metrics_command_server_not_ready(
420450

421451
@patch("vec_inf.cli._helper.requests.get")
422452
def test_metrics_command_server_ready(
423-
mock_get, runner, mock_status_output, path_exists, debug_helper
453+
mock_get, runner, mock_status_output, path_exists, debug_helper, apply_base_patches
424454
):
425455
"""Test metrics command when server is ready and returning metrics."""
426456
metrics_response = """
@@ -459,7 +489,7 @@ def test_metrics_command_server_ready(
459489

460490
@patch("vec_inf.cli._helper.requests.get")
461491
def test_metrics_command_request_failed(
462-
mock_get, runner, mock_status_output, path_exists, debug_helper
492+
mock_get, runner, mock_status_output, path_exists, debug_helper, apply_base_patches
463493
):
464494
"""Test metrics command when request to metrics endpoint fails."""
465495
mock_get.side_effect = requests.exceptions.RequestException("Connection refused")

vec_inf/cli/_cli.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -107,8 +107,8 @@ def cli() -> None:
107107
)
108108
@click.option(
109109
"--compilation-config",
110-
type=click.Choice(["0", "1", "2", "3"]),
111-
help="torch.compile optimization level, accepts '0', '1', '2', or '3', default to '0', which means no optimization is applied",
110+
type=click.Choice(["0", "3"]),
111+
help="torch.compile optimization level, accepts '0' or '3', default to '0', which means no optimization is applied",
112112
)
113113
@click.option(
114114
"--enforce-eager",

vec_inf/cli/_helper.py

Lines changed: 27 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -352,6 +352,7 @@ def __init__(self, slurm_job_id: int, log_dir: Optional[str] = None):
352352
self.log_dir = log_dir
353353
self.status_info = self._get_status_info()
354354
self.metrics_url = self._build_metrics_url()
355+
self.enabled_prefix_caching = self._check_prefix_caching()
355356

356357
self._prev_prompt_tokens: float = 0.0
357358
self._prev_generation_tokens: float = 0.0
@@ -386,6 +387,18 @@ def _build_metrics_url(self) -> str:
386387
(parsed.scheme, parsed.netloc, f"{clean_path}/metrics", "", "", "")
387388
)
388389

390+
def _check_prefix_caching(self) -> bool:
391+
"""Check if prefix caching is enabled."""
392+
job_json = utils.read_slurm_log(
393+
cast(str, self.status_info["model_name"]),
394+
self.slurm_job_id,
395+
"json",
396+
self.log_dir,
397+
)
398+
if isinstance(job_json, str):
399+
return False
400+
return bool(cast(dict[str, str], job_json).get("enable_prefix_caching", False))
401+
389402
def fetch_metrics(self) -> Union[dict[str, float], str]:
390403
"""Fetch metrics from the endpoint."""
391404
try:
@@ -476,6 +489,10 @@ def _parse_metrics(self, metrics_text: str) -> dict[str, float]:
476489
"vllm:cpu_cache_usage_perc": "cpu_cache_usage",
477490
}
478491

492+
if self.enabled_prefix_caching:
493+
key_metrics["vllm:gpu_prefix_cache_hit_rate"] = "gpu_prefix_cache_hit_rate"
494+
key_metrics["vllm:cpu_prefix_cache_hit_rate"] = "cpu_prefix_cache_hit_rate"
495+
479496
parsed: dict[str, float] = {}
480497
for line in metrics_text.split("\n"):
481498
if line.startswith("#") or not line.strip():
@@ -532,6 +549,16 @@ def display_metrics(self, table: Table, metrics: dict[str, float]) -> None:
532549
f"{metrics.get('cpu_cache_usage', 0) * 100:.1f}%",
533550
)
534551

552+
if self.enabled_prefix_caching:
553+
table.add_row(
554+
"GPU Prefix Cache Hit Rate",
555+
f"{metrics.get('gpu_prefix_cache_hit_rate', 0) * 100:.1f}%",
556+
)
557+
table.add_row(
558+
"CPU Prefix Cache Hit Rate",
559+
f"{metrics.get('cpu_prefix_cache_hit_rate', 0) * 100:.1f}%",
560+
)
561+
535562
# Show average latency if available
536563
if "avg_request_latency" in metrics:
537564
table.add_row(

vec_inf/cli/_utils.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,8 @@ def read_slurm_log(
3535
if not log_dir:
3636
# Default log directory
3737
models_dir = Path.home() / ".vec-inf-logs"
38+
if not models_dir.exists():
39+
return "LOG DIR NOT FOUND"
3840
# Iterate over all dirs in models_dir, sorted by dir name length in desc order
3941
for directory in sorted(
4042
[d for d in models_dir.iterdir() if d.is_dir()],

vec_inf/config/README.md

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -162,6 +162,13 @@ More profiling metrics coming soon!
162162

163163
## Vision Language Models
164164

165+
### [allenai: Molmo](https://huggingface.co/collections/allenai/molmo-66f379e6fe3b8ef090a8ca19)
166+
167+
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
168+
|:----------:|:----------:|:----------:|:----------:|
169+
| [`Molmo-7B-D-0924`](https://huggingface.co/allenai/Molmo-7B-D-0924) | 1x a40 | - tokens/s | - tokens/s |
170+
171+
165172
### [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
166173

167174
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
@@ -181,6 +188,7 @@ More profiling metrics coming soon!
181188
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
182189
|:----------:|:----------:|:----------:|:----------:|
183190
| [`Phi-3-vision-128k-instruct`](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
191+
| [`Phi-3.5-vision-instruct`](https://huggingface.co/microsoft/Phi-3.5-vision-instruct) | 2x a40 | - tokens/s | - tokens/s |
184192

185193
### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
186194

@@ -199,6 +207,27 @@ More profiling metrics coming soon!
199207
|:----------:|:----------:|:----------:|:----------:|
200208
| [`Pixtral-12B-2409`](https://huggingface.co/mistralai/Pixtral-12B-2409) | 1x a40 | - tokens/s | - tokens/s |
201209

210+
### [OpenGVLab: InternVL2.5](https://huggingface.co/collections/OpenGVLab/internvl25-673e1019b66e2218f68d7c1c)
211+
212+
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
213+
|:----------:|:----------:|:----------:|:----------:|
214+
| [`InternVL2_5-8B`](https://huggingface.co/OpenGVLab/InternVL2_5-8B) | 1x a40 | - tokens/s | - tokens/s |
215+
| [`InternVL2_5-26B`](https://huggingface.co/OpenGVLab/InternVL2_5-26B) | 2x a40 | - tokens/s | - tokens/s |
216+
| [`InternVL2_5-38B`](https://huggingface.co/OpenGVLab/InternVL2_5-38B) | 4x a40 | - tokens/s | - tokens/s |
217+
218+
### [THUDM: GLM-4](https://huggingface.co/collections/THUDM/glm-4-665fcf188c414b03c2f7e3b7)
219+
220+
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
221+
|:----------:|:----------:|:----------:|:----------:|
222+
| [`glm-4v-9b`](https://huggingface.co/THUDM/glm-4v-9b) | 1x a40 | - tokens/s | - tokens/s |
223+
224+
### [DeepSeek: DeepSeek-VL2](https://huggingface.co/collections/deepseek-ai/deepseek-vl2-675c22accc456d3beb4613ab)
225+
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
226+
|:----------:|:----------:|:----------:|:----------:|
227+
| [`deepseek-vl2`](https://huggingface.co/deepseek-ai/deepseek-vl2) | 2x a40 | - tokens/s | - tokens/s |
228+
| [`deepseek-vl2-small`](https://huggingface.co/deepseek-ai/deepseek-vl2-small) | 1x a40 | - tokens/s | - tokens/s |
229+
230+
202231
## Text Embedding Models
203232

204233
### [Liang Wang: e5](https://huggingface.co/intfloat)
@@ -225,3 +254,4 @@ More profiling metrics coming soon!
225254
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
226255
|:----------:|:----------:|:----------:|:----------:|
227256
| [`Qwen2.5-Math-RM-72B`](https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B) | 4x a40 | - tokens/s | - tokens/s |
257+
| [`Qwen2.5-Math-PRM-7B`](https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B) | 1x a40 | - tokens/s | - tokens/s |

0 commit comments

Comments
 (0)