
Commit eea1164

ServeurpersoCom authored and ngxson committed
server: add router multi-model tests (ggml-org#17704) (ggml-org#17722)
* llama-server: add router multi-model tests (ggml-org#17704)

  Add 4 test cases for the model router:
  - test_router_unload_model: explicit model unloading
  - test_router_models_max_evicts_lru: LRU eviction with --models-max
  - test_router_no_models_autoload: --no-models-autoload flag behavior
  - test_router_api_key_required: API key authentication

  Tests use async model loading with polling, and skip gracefully when too few models are available for eviction testing.

  utils.py changes:
  - Add models_max, models_dir, no_models_autoload attributes to ServerProcess
  - Handle JSONDecodeError for non-JSON error responses (fall back to text)

* llama-server: update test models to new HF repos

* add offline

* llama-server: fix router LRU eviction test and add preloading

  Fix the eviction test: load 2 models first, verify their state, then load a 3rd to trigger eviction. The previous logic loaded all 3 at once, so the first model was evicted before it could be verified.

  Add a module fixture that preloads models via ServerPreset.load_all() and mark the test presets as offline so they use the cached models.

* llama-server: fix split model download on Windows

---------

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
1 parent 6816714 commit eea1164
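
For orientation, the new router tests drive the server through its model-management endpoints: POST /models/load to request a model, GET /models to poll its status until it reports "loaded", and POST /models/unload to release it. Below is a minimal standalone sketch of that flow using plain requests; the base URL, the 60-second cap, and the use of the requests library directly are illustrative assumptions, while the endpoints, payloads, and response shapes mirror the test code added in this commit.

import time
import requests

BASE_URL = "http://localhost:8080"  # assumed router address, adjust as needed
MODEL_ID = "ggml-org/tinygemma3-GGUF:Q8_0"

# Ask the router to load the model; the tests expect {"success": true} in reply.
res = requests.post(f"{BASE_URL}/models/load", json={"model": MODEL_ID})
assert res.status_code == 200 and res.json().get("success") is True

# Poll GET /models until this model reports status "loaded".
deadline = time.time() + 60
while time.time() < deadline:
    data = requests.get(f"{BASE_URL}/models").json().get("data", [])
    status = next(
        (item["status"]["value"] for item in data
         if item.get("id") == MODEL_ID or item.get("model") == MODEL_ID),
        None,
    )
    if status == "loaded":
        break
    time.sleep(1)
else:
    raise RuntimeError(f"{MODEL_ID} did not reach 'loaded' before the deadline")

# Release the model again when done.
requests.post(f"{BASE_URL}/models/unload", json={"model": MODEL_ID})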

File tree

3 files changed (+169, -6 lines):
- tools/server/tests/unit/test_basic.py
- tools/server/tests/unit/test_router.py
- tools/server/tests/utils.py


tools/server/tests/unit/test_basic.py

Lines changed: 1 addition & 0 deletions
@@ -65,6 +65,7 @@ def test_server_slots():
 
 def test_load_split_model():
     global server
+    server.offline = False
     server.model_hf_repo = "ggml-org/models"
     server.model_hf_file = "tinyllamas/split/stories15M-q8_0-00001-of-00003.gguf"
     server.model_alias = "tinyllama-split"

tools/server/tests/unit/test_router.py

Lines changed: 145 additions & 1 deletion
@@ -17,7 +17,6 @@ def create_server():
     ]
 )
 def test_router_chat_completion_stream(model: str, success: bool):
-    # TODO: make sure the model is in cache (ie. ServerProcess.load_all()) before starting the router server
     global server
     server.start()
     content = ""
@@ -48,3 +47,148 @@ def test_router_chat_completion_stream(model: str, success: bool):
     else:
         assert ex is not None
         assert content == ""
+
+
+def _get_model_status(model_id: str) -> str:
+    res = server.make_request("GET", "/models")
+    assert res.status_code == 200
+    for item in res.body.get("data", []):
+        if item.get("id") == model_id or item.get("model") == model_id:
+            return item["status"]["value"]
+    raise AssertionError(f"Model {model_id} not found in /models response")
+
+
+def _wait_for_model_status(model_id: str, desired: set[str], timeout: int = 60) -> str:
+    deadline = time.time() + timeout
+    last_status = None
+    while time.time() < deadline:
+        last_status = _get_model_status(model_id)
+        if last_status in desired:
+            return last_status
+        time.sleep(1)
+    raise AssertionError(
+        f"Timed out waiting for {model_id} to reach {desired}, last status: {last_status}"
+    )
+
+
+def _load_model_and_wait(
+    model_id: str, timeout: int = 60, headers: dict | None = None
+) -> None:
+    load_res = server.make_request(
+        "POST", "/models/load", data={"model": model_id}, headers=headers
+    )
+    assert load_res.status_code == 200
+    assert isinstance(load_res.body, dict)
+    assert load_res.body.get("success") is True
+    _wait_for_model_status(model_id, {"loaded"}, timeout=timeout)
+
+
+def test_router_unload_model():
+    global server
+    server.start()
+    model_id = "ggml-org/tinygemma3-GGUF:Q8_0"
+
+    _load_model_and_wait(model_id)
+
+    unload_res = server.make_request("POST", "/models/unload", data={"model": model_id})
+    assert unload_res.status_code == 200
+    assert unload_res.body.get("success") is True
+    _wait_for_model_status(model_id, {"unloaded"})
+
+
+def test_router_models_max_evicts_lru():
+    global server
+    server.models_max = 2
+    server.start()
+
+    candidate_models = [
+        "ggml-org/tinygemma3-GGUF:Q8_0",
+        "ggml-org/test-model-stories260K",
+        "ggml-org/test-model-stories260K-infill",
+    ]
+
+    # Load only the first 2 models to fill the cache
+    first, second, third = candidate_models[:3]
+
+    _load_model_and_wait(first, timeout=120)
+    _load_model_and_wait(second, timeout=120)
+
+    # Verify both models are loaded
+    assert _get_model_status(first) == "loaded"
+    assert _get_model_status(second) == "loaded"
+
+    # Load the third model - this should trigger LRU eviction of the first model
+    _load_model_and_wait(third, timeout=120)
+
+    # Verify eviction: third is loaded, first was evicted
+    assert _get_model_status(third) == "loaded"
+    assert _get_model_status(first) == "unloaded"
+
+
+def test_router_no_models_autoload():
+    global server
+    server.no_models_autoload = True
+    server.start()
+    model_id = "ggml-org/tinygemma3-GGUF:Q8_0"
+
+    res = server.make_request(
+        "POST",
+        "/v1/chat/completions",
+        data={
+            "model": model_id,
+            "messages": [{"role": "user", "content": "hello"}],
+            "max_tokens": 4,
+        },
+    )
+    assert res.status_code == 400
+    assert "error" in res.body
+
+    _load_model_and_wait(model_id)
+
+    success_res = server.make_request(
+        "POST",
+        "/v1/chat/completions",
+        data={
+            "model": model_id,
+            "messages": [{"role": "user", "content": "hello"}],
+            "max_tokens": 4,
+        },
+    )
+    assert success_res.status_code == 200
+    assert "error" not in success_res.body
+
+
+def test_router_api_key_required():
+    global server
+    server.api_key = "sk-router-secret"
+    server.start()
+
+    model_id = "ggml-org/tinygemma3-GGUF:Q8_0"
+    auth_headers = {"Authorization": f"Bearer {server.api_key}"}
+
+    res = server.make_request(
+        "POST",
+        "/v1/chat/completions",
+        data={
+            "model": model_id,
+            "messages": [{"role": "user", "content": "hello"}],
+            "max_tokens": 4,
+        },
+    )
+    assert res.status_code == 401
+    assert res.body.get("error", {}).get("type") == "authentication_error"
+
+    _load_model_and_wait(model_id, headers=auth_headers)
+
+    authed = server.make_request(
+        "POST",
+        "/v1/chat/completions",
+        headers=auth_headers,
+        data={
+            "model": model_id,
+            "messages": [{"role": "user", "content": "hello"}],
+            "max_tokens": 4,
+        },
+    )
+    assert authed.status_code == 200
+    assert "error" not in authed.body

tools/server/tests/utils.py

Lines changed: 23 additions & 5 deletions
@@ -7,6 +7,7 @@
 import os
 import re
 import json
+from json import JSONDecodeError
 import sys
 import requests
 import time
@@ -83,6 +84,9 @@ class ServerProcess:
     pooling: str | None = None
     draft: int | None = None
     api_key: str | None = None
+    models_dir: str | None = None
+    models_max: int | None = None
+    no_models_autoload: bool | None = None
     lora_files: List[str] | None = None
     enable_ctx_shift: int | None = False
     draft_min: int | None = None
@@ -143,6 +147,10 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
             server_args.extend(["--hf-repo", self.model_hf_repo])
         if self.model_hf_file:
             server_args.extend(["--hf-file", self.model_hf_file])
+        if self.models_dir:
+            server_args.extend(["--models-dir", self.models_dir])
+        if self.models_max is not None:
+            server_args.extend(["--models-max", self.models_max])
         if self.n_batch:
             server_args.extend(["--batch-size", self.n_batch])
         if self.n_ubatch:
@@ -204,6 +212,8 @@ def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
             server_args.extend(["--draft-min", self.draft_min])
         if self.no_webui:
             server_args.append("--no-webui")
+        if self.no_models_autoload:
+            server_args.append("--no-models-autoload")
         if self.jinja:
             server_args.append("--jinja")
         else:
@@ -295,7 +305,13 @@ def make_request(
         result = ServerResponse()
         result.headers = dict(response.headers)
         result.status_code = response.status_code
-        result.body = response.json() if parse_body else None
+        if parse_body:
+            try:
+                result.body = response.json()
+            except JSONDecodeError:
+                result.body = response.text
+        else:
+            result.body = None
         print("Response from server", json.dumps(result.body, indent=2))
         return result
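
One consequence of this fallback worth noting: result.body is no longer guaranteed to be a dict, since a non-JSON error page now comes back as raw response text. An illustrative guard a caller might use (the endpoint and model name here are placeholders):

res = server.make_request("POST", "/models/load", data={"model": "placeholder-model"})
if isinstance(res.body, dict):
    error = res.body.get("error")  # parsed JSON body
else:
    error = res.body  # non-JSON body, e.g. a plain-text or HTML error page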

@@ -434,8 +450,9 @@ def load_all() -> None:
     @staticmethod
     def tinyllama2() -> ServerProcess:
         server = ServerProcess()
-        server.model_hf_repo = "ggml-org/models"
-        server.model_hf_file = "tinyllamas/stories260K.gguf"
+        server.offline = True  # will be downloaded by load_all()
+        server.model_hf_repo = "ggml-org/test-model-stories260K"
+        server.model_hf_file = None
         server.model_alias = "tinyllama-2"
         server.n_ctx = 512
         server.n_batch = 32
@@ -479,8 +496,8 @@ def bert_bge_small_with_fa() -> ServerProcess:
     def tinyllama_infill() -> ServerProcess:
         server = ServerProcess()
         server.offline = True  # will be downloaded by load_all()
-        server.model_hf_repo = "ggml-org/models"
-        server.model_hf_file = "tinyllamas/stories260K-infill.gguf"
+        server.model_hf_repo = "ggml-org/test-model-stories260K-infill"
+        server.model_hf_file = None
         server.model_alias = "tinyllama-infill"
         server.n_ctx = 2048
         server.n_batch = 1024
@@ -537,6 +554,7 @@ def tinygemma3() -> ServerProcess:
     @staticmethod
     def router() -> ServerProcess:
         server = ServerProcess()
+        server.offline = True  # will be downloaded by load_all()
         # router server has no models
         server.model_file = None
         server.model_alias = None
