Skip to content

Commit 78ffec5

Browse files
committed
allow custom-path models
1 parent 3de65f8 commit 78ffec5

File tree

3 files changed

+138
-22
lines changed

3 files changed

+138
-22
lines changed

tools/server/README.md

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1470,6 +1470,11 @@ llama-server
14701470

14711471
### Model sources
14721472

1473+
There are 3 possible sources for model files:
1474+
1. Cached models (controlled by the `LLAMA_CACHE` environment variable)
1475+
2. Custom model directory (set via the `--models-dir` argument)
1476+
3. Custom preset (set via the `--models-preset` argument)
1477+
14731478
By default, the router looks for models in the cache. You can add Hugging Face models to the cache with:
14741479

14751480
```sh
@@ -1514,6 +1519,51 @@ llama-server -ctx 8192 -n 1024 -np 2
15141519

15151520
Note: model instances inherit both command line arguments and environment variables from the router server.
15161521

1522+
Alternatively, you can also add a GGUF-based preset (see the next section).
1523+
1524+
### Model presets
1525+
1526+
Model presets allow advanced users to define custom configurations using an `.ini` file:
1527+
1528+
```sh
1529+
llama-server --models-preset ./my-models.ini
1530+
```
1531+
1532+
Each section in the file defines a new preset. Keys within a section correspond to command-line arguments (without leading dashes). For example, the argument `--n-gpu-layers 123` is written as `n-gpu-layers = 123`.
1533+
1534+
Short argument forms (e.g., `c`, `ngl`) and environment variable names (e.g., `LLAMA_ARG_N_GPU_LAYERS`) are also supported as keys.
1535+
1536+
Example:
1537+
1538+
```ini
1539+
version = 1
1540+
1541+
; If the key corresponds to an existing model on the server,
1542+
; this will be used as the default config for that model
1543+
[ggml-org/MY-MODEL-GGUF:Q8_0]
1544+
; string value
1545+
chat-template = chatml
1546+
; numeric value
1547+
n-gpu-layers = 123
1548+
; boolean value
1549+
jinja = false
1550+
; shorthand argument (for example, context size)
1551+
c = 4096
1552+
; environment variable name
1553+
LLAMA_ARG_CACHE_RAM = 0
1554+
; file paths are relative to server's CWD
1555+
model-draft = ./my-models/draft.gguf
1556+
; but it's RECOMMENDED to use absolute path
1557+
model-draft = /Users/abc/my-models/draft.gguf
1558+
1559+
; If the key does NOT correspond to an existing model,
1560+
; you need to specify at least the model path
1561+
[custom_model]
1562+
model = /Users/abc/my-awesome-model-Q4_K_M.gguf
1563+
```
1564+
1565+
Note: some arguments are controlled by the router (e.g., host, port, API key, HF repo, model alias). They will be removed or overwritten upon loading.
1566+
15171567
### Routing requests
15181568

15191569
Requests are routed according to the requested model name.

tools/server/server-models.cpp

Lines changed: 83 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,6 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para
145145
SRV_INF("Loaded %zu presets from %s\n", presets.size(), presets_path.c_str());
146146
}
147147

148-
common_params_parse(argc, argv, LLAMA_EXAMPLE_SERVER, base_args);
149-
150148
// populate reserved args (will be appended by the router)
151149
for (auto & opt : ctx_params.options) {
152150
if (opt.env == nullptr) {
@@ -159,14 +157,17 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para
159157
env == "LLAMA_ARG_API_KEY" ||
160158
env == "LLAMA_ARG_MODELS_DIR" ||
161159
env == "LLAMA_ARG_MODELS_MAX" ||
162-
env == "LLAMA_ARG_NO_MODELS_AUTOLOAD" ||
163160
env == "LLAMA_ARG_MODEL" ||
164161
env == "LLAMA_ARG_MMPROJ" ||
165-
env == "LLAMA_ARG_HF_REPO") {
162+
env == "LLAMA_ARG_HF_REPO" ||
163+
env == "LLAMA_ARG_NO_MODELS_AUTOLOAD") {
166164
control_args[env] = opt;
167165
}
168166
}
169167

168+
// read base args from router's argv
169+
common_params_parse(argc, argv, LLAMA_EXAMPLE_SERVER, base_args);
170+
170171
// remove any router-controlled args from base_args
171172
for (const auto & cargs : control_args) {
172173
auto it = base_args.find(cargs.second);
@@ -186,14 +187,21 @@ common_preset server_presets::get_preset(const std::string & name) {
186187

187188
void server_presets::render_args(server_model_meta & meta) {
188189
common_preset preset = meta.preset; // copy
190+
// merging 3 kinds of args:
191+
// 1. model-specific args (from preset)
189192
// force removing control args if any
190193
for (auto & cargs : control_args) {
191-
preset.options.erase(cargs.second);
194+
if (preset.options.find(cargs.second) != preset.options.end()) {
195+
SRV_WRN("Preset '%s' contains reserved arg '%s', removing it\n", preset.name.c_str(), cargs.second.args[0]);
196+
preset.options.erase(cargs.second);
197+
}
192198
}
199+
// 2. base args (from router)
193200
// inherit from base args
194201
for (const auto & [arg, value] : base_args) {
195202
preset.options[arg] = value;
196203
}
204+
// 3. control args (from router)
197205
// set control values
198206
preset.options[control_args["LLAMA_ARG_PORT"]] = std::to_string(meta.port);
199207
preset.options[control_args["LLAMA_ARG_ALIAS"]] = meta.name;
@@ -231,8 +239,54 @@ server_models::server_models(
231239
LOG_WRN("failed to get server executable path: %s\n", e.what());
232240
LOG_WRN("using original argv[0] as fallback: %s\n", base_args[0].c_str());
233241
}
234-
// TODO: allow refreshing cached model list
235-
// add cached models
242+
load_models();
243+
}
244+
245+
void server_models::add_model(server_model_meta && meta) {
246+
if (mapping.find(meta.name) != mapping.end()) {
247+
throw std::runtime_error(string_format("model '%s' appears multiple times", meta.name.c_str()));
248+
}
249+
presets.render_args(meta); // populate meta.args
250+
std::string name = meta.name;
251+
mapping[name] = instance_t{
252+
/* subproc */ std::make_shared<subprocess_s>(),
253+
/* th */ std::thread(),
254+
/* meta */ std::move(meta)
255+
};
256+
}
257+
258+
static std::vector<local_model> list_custom_path_models(server_presets & presets) {
259+
// detect any custom-path models in presets
260+
std::vector<local_model> custom_models;
261+
for (auto & [model_name, preset] : presets.presets) {
262+
local_model model;
263+
model.name = model_name;
264+
std::vector<common_arg> to_erase;
265+
for (auto & [arg, value] : preset.options) {
266+
std::string env(arg.env ? arg.env : "");
267+
if (env == "LLAMA_ARG_MODEL") {
268+
model.path = value;
269+
to_erase.push_back(arg);
270+
}
271+
if (env == "LLAMA_ARG_MMPROJ") {
272+
model.path_mmproj = value;
273+
to_erase.push_back(arg);
274+
}
275+
}
276+
for (auto & arg : to_erase) {
277+
preset.options.erase(arg);
278+
}
279+
if (!model.name.empty() && !model.path.empty()) {
280+
custom_models.push_back(model);
281+
}
282+
}
283+
return custom_models;
284+
}
285+
286+
// TODO: allow refreshing cached model list
287+
void server_models::load_models() {
288+
// loading models from 3 sources:
289+
// 1. cached models
236290
auto cached_models = common_list_cached_models();
237291
for (const auto & model : cached_models) {
238292
server_model_meta meta{
@@ -247,16 +301,11 @@ server_models::server_models(
247301
/* args */ std::vector<std::string>(),
248302
/* exit_code */ 0
249303
};
250-
presets.render_args(meta); // populate meta.args
251-
mapping[meta.name] = instance_t{
252-
/* subproc */ std::make_shared<subprocess_s>(),
253-
/* th */ std::thread(),
254-
/* meta */ meta
255-
};
304+
add_model(std::move(meta));
256305
}
257-
// add local models specificed via --models-dir
258-
if (!params.models_dir.empty()) {
259-
auto local_models = list_local_models(params.models_dir);
306+
// 2. local models specified via --models-dir
307+
if (!base_params.models_dir.empty()) {
308+
auto local_models = list_local_models(base_params.models_dir);
260309
for (const auto & model : local_models) {
261310
if (mapping.find(model.name) != mapping.end()) {
262311
// already exists in cached models, skip
@@ -274,14 +323,26 @@ server_models::server_models(
274323
/* args */ std::vector<std::string>(),
275324
/* exit_code */ 0
276325
};
277-
presets.render_args(meta); // populate meta.args
278-
mapping[meta.name] = instance_t{
279-
/* subproc */ std::make_shared<subprocess_s>(),
280-
/* th */ std::thread(),
281-
/* meta */ meta
282-
};
326+
add_model(std::move(meta));
283327
}
284328
}
329+
// 3. custom-path models specified in presets
330+
auto custom_models = list_custom_path_models(presets);
331+
for (const auto & model : custom_models) {
332+
server_model_meta meta{
333+
/* preset */ presets.get_preset(model.name),
334+
/* name */ model.name,
335+
/* path */ model.path,
336+
/* path_mmproj */ model.path_mmproj,
337+
/* in_cache */ false,
338+
/* port */ 0,
339+
/* status */ SERVER_MODEL_STATUS_UNLOADED,
340+
/* last_used */ 0,
341+
/* args */ std::vector<std::string>(),
342+
/* exit_code */ 0
343+
};
344+
add_model(std::move(meta));
345+
}
285346
// log available models
286347
SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size());
287348
for (const auto & [name, inst] : mapping) {

tools/server/server-models.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,9 +107,14 @@ struct server_models {
107107
// unload least recently used models if the limit is reached
108108
void unload_lru();
109109

110+
// not thread-safe, caller must hold mutex
111+
void add_model(server_model_meta && meta);
112+
110113
public:
111114
server_models(const common_params & params, int argc, char ** argv, char ** envp);
112115

116+
void load_models();
117+
113118
// check if a model instance exists
114119
bool has_model(const std::string & name);
115120

0 commit comments

Comments
 (0)