allow custom-path models

ngxson · ngxson · commit 78ffec51c381 · 2025-12-08T21:42:16.000+01:00
diff --git a/tools/server/README.md b/tools/server/README.md
@@ -1470,6 +1470,11 @@ llama-server
 
 ### Model sources
 
+There are 3 possible sources for model files:
+1. Cached models (controlled by the `LLAMA_CACHE` environment variable)
+2. Custom model directory (set via the `--models-dir` argument)
+3. Custom preset (set via the `--models-preset` argument)
+
 By default, the router looks for models in the cache. You can add Hugging Face models to the cache with:
 
 ```sh
@@ -1514,6 +1519,51 @@ llama-server -ctx 8192 -n 1024 -np 2
 
 Note: model instances inherit both command line arguments and environment variables from the router server.
 
+Alternatively, you can also add GGUF based preset (see next section)
+
+### Model presets
+
+Model presets allow advanced users to define custom configurations using an `.ini` file:
+
+```sh
+llama-server --models-preset ./my-models.ini
+```
+
+Each section in the file defines a new preset. Keys within a section correspond to command-line arguments (without leading dashes). For example, the argument `--n-gpu-layer 123` is written as `n-gpu-layer = 123`.
+
+Short argument forms (e.g., `c`, `ngl`) and environment variable names (e.g., `LLAMA_ARG_N_GPU_LAYERS`) are also supported as keys.
+
+Example:
+
+```ini
+version = 1
+
+; If the key corresponds to an existing model on the server,
+; this will be used as the default config for that model
+[ggml-org/MY-MODEL-GGUF:Q8_0]
+; string value
+chat-template = chatml
+; numeric value
+n-gpu-layer = 123
+; boolean value
+jinja = false
+; shorthand argument (for example, context size)
+c = 4096
+; environment variable name
+LLAMA_ARG_CACHE_RAM = 0
+; file paths are relative to server's CWD
+model-draft = ./my-models/draft.gguf
+; but it's RECOMMENDED to use absolute path
+model-draft = /Users/abc/my-models/draft.gguf
+
+; If the key does NOT correspond to an existing model,
+; you need to specify at least the model path
+[custom_model]
+model = /Users/abc/my-awesome-model-Q4_K_M.gguf
+```
+
+Note: some arguments are controlled by router (e.g., host, port, API key, HF repo, model alias). They will be removed or overwritten upload loading.
+
 ### Routing requests
 
 Requests are routed according to the requested model name.
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
@@ -145,8 +145,6 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para
         SRV_INF("Loaded %zu presets from %s\n", presets.size(), presets_path.c_str());
     }
 
-    common_params_parse(argc, argv, LLAMA_EXAMPLE_SERVER, base_args);
-
     // populate reserved args (will be appended by the router)
     for (auto & opt : ctx_params.options) {
         if (opt.env == nullptr) {
@@ -159,14 +157,17 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para
             env == "LLAMA_ARG_API_KEY" ||
             env == "LLAMA_ARG_MODELS_DIR" ||
             env == "LLAMA_ARG_MODELS_MAX" ||
-            env == "LLAMA_ARG_NO_MODELS_AUTOLOAD" ||
             env == "LLAMA_ARG_MODEL" ||
             env == "LLAMA_ARG_MMPROJ" ||
-            env == "LLAMA_ARG_HF_REPO") {
+            env == "LLAMA_ARG_HF_REPO" ||
+            env == "LLAMA_ARG_NO_MODELS_AUTOLOAD") {
             control_args[env] = opt;
         }
     }
 
+    // read base args from router's argv
+    common_params_parse(argc, argv, LLAMA_EXAMPLE_SERVER, base_args);
+
     // remove any router-controlled args from base_args
     for (const auto & cargs : control_args) {
         auto it = base_args.find(cargs.second);
@@ -186,14 +187,21 @@ common_preset server_presets::get_preset(const std::string & name) {
 
 void server_presets::render_args(server_model_meta & meta) {
     common_preset preset = meta.preset; // copy
+    // merging 3 kinds of args:
+    // 1. model-specific args (from preset)
     // force removing control args if any
     for (auto & cargs : control_args) {
-        preset.options.erase(cargs.second);
+        if (preset.options.find(cargs.second) != preset.options.end()) {
+            SRV_WRN("Preset '%s' contains reserved arg '%s', removing it\n", preset.name.c_str(), cargs.second.args[0]);
+            preset.options.erase(cargs.second);
+        }
     }
+    // 2. base args (from router)
     // inherit from base args
     for (const auto & [arg, value] : base_args) {
         preset.options[arg] = value;
     }
+    // 3. control args (from router)
     // set control values
     preset.options[control_args["LLAMA_ARG_PORT"]] = std::to_string(meta.port);
     preset.options[control_args["LLAMA_ARG_ALIAS"]] = meta.name;
@@ -231,8 +239,54 @@ server_models::server_models(
         LOG_WRN("failed to get server executable path: %s\n", e.what());
         LOG_WRN("using original argv[0] as fallback: %s\n", base_args[0].c_str());
     }
-    // TODO: allow refreshing cached model list
-    // add cached models
+    load_models();
+}
+
+void server_models::add_model(server_model_meta && meta) {
+    if (mapping.find(meta.name) != mapping.end()) {
+        throw std::runtime_error(string_format("model '%s' appears multiple times", meta.name.c_str()));
+    }
+    presets.render_args(meta); // populate meta.args
+    std::string name = meta.name;
+    mapping[name] = instance_t{
+        /* subproc */ std::make_shared<subprocess_s>(),
+        /* th      */ std::thread(),
+        /* meta    */ std::move(meta)
+    };
+}
+
+static std::vector<local_model> list_custom_path_models(server_presets & presets) {
+    // detect any custom-path models in presets
+    std::vector<local_model> custom_models;
+    for (auto & [model_name, preset] : presets.presets) {
+        local_model model;
+        model.name = model_name;
+        std::vector<common_arg> to_erase;
+        for (auto & [arg, value] : preset.options) {
+            std::string env(arg.env ? arg.env : "");
+            if (env == "LLAMA_ARG_MODEL") {
+                model.path = value;
+                to_erase.push_back(arg);
+            }
+            if (env == "LLAMA_ARG_MMPROJ") {
+                model.path_mmproj = value;
+                to_erase.push_back(arg);
+            }
+        }
+        for (auto & arg : to_erase) {
+            preset.options.erase(arg);
+        }
+        if (!model.name.empty() && !model.path.empty()) {
+            custom_models.push_back(model);
+        }
+    }
+    return custom_models;
+}
+
+// TODO: allow refreshing cached model list
+void server_models::load_models() {
+    // loading models from 3 sources:
+    // 1. cached models
     auto cached_models = common_list_cached_models();
     for (const auto & model : cached_models) {
         server_model_meta meta{
@@ -247,16 +301,11 @@ server_models::server_models(
             /* args        */ std::vector<std::string>(),
             /* exit_code   */ 0
         };
-        presets.render_args(meta); // populate meta.args
-        mapping[meta.name] = instance_t{
-            /* subproc */ std::make_shared<subprocess_s>(),
-            /* th      */ std::thread(),
-            /* meta    */ meta
-        };
+        add_model(std::move(meta));
     }
-    // add local models specificed via --models-dir
-    if (!params.models_dir.empty()) {
-        auto local_models = list_local_models(params.models_dir);
+    // 2. local models specificed via --models-dir
+    if (!base_params.models_dir.empty()) {
+        auto local_models = list_local_models(base_params.models_dir);
         for (const auto & model : local_models) {
             if (mapping.find(model.name) != mapping.end()) {
                 // already exists in cached models, skip
@@ -274,14 +323,26 @@ server_models::server_models(
                 /* args        */ std::vector<std::string>(),
                 /* exit_code   */ 0
             };
-            presets.render_args(meta); // populate meta.args
-            mapping[meta.name] = instance_t{
-                /* subproc */ std::make_shared<subprocess_s>(),
-                /* th      */ std::thread(),
-                /* meta    */ meta
-            };
+            add_model(std::move(meta));
         }
     }
+    // 3. custom-path models specified in presets
+    auto custom_models = list_custom_path_models(presets);
+    for (const auto & model : custom_models) {
+        server_model_meta meta{
+            /* preset      */ presets.get_preset(model.name),
+            /* name        */ model.name,
+            /* path        */ model.path,
+            /* path_mmproj */ model.path_mmproj,
+            /* in_cache    */ false,
+            /* port        */ 0,
+            /* status      */ SERVER_MODEL_STATUS_UNLOADED,
+            /* last_used   */ 0,
+            /* args        */ std::vector<std::string>(),
+            /* exit_code   */ 0
+        };
+        add_model(std::move(meta));
+    }
     // log available models
     SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size());
     for (const auto & [name, inst] : mapping) {
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
@@ -107,9 +107,14 @@ struct server_models {
     // unload least recently used models if the limit is reached
     void unload_lru();
 
+    // not thread-safe, caller must hold mutex
+    void add_model(server_model_meta && meta);
+
 public:
     server_models(const common_params & params, int argc, char ** argv, char ** envp);
 
+    void load_models();
+
     // check if a model instance exists
     bool has_model(const std::string & name);