
Commit d99547b

Merge branch 'ggml-org:master' into phi3v_implementation
2 parents: 3c5c7ad + f3a9674

File tree: 9 files changed, +109 −27 lines

9 files changed

+109
-27
lines changed

.github/workflows/winget.yml

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@ jobs:
   update:
     name: Update Winget Package
     runs-on: ubuntu-latest
+    if: ${{ github.repository.owner.login == 'ggml-org' }}
 
     steps:
       - name: Install cargo binstall

convert_hf_to_gguf.py

Lines changed: 4 additions & 0 deletions
@@ -2842,6 +2842,10 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        # TODO: probably not worth supporting quantized weight, as official BF16 is also available
+        if name.endswith("weight_scale_inv"):
+            raise ValueError("This is a quantized weight, please use BF16 weight instead")
+
         name = name.replace("language_model.", "")
         if "multi_modal_projector" in name or "vision_tower" in name:
             return []

src/llama-mmap.cpp

Lines changed: 1 addition & 1 deletion
@@ -485,7 +485,7 @@ struct llama_mlock::impl {
         if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
             suggest = false;
         }
-        if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
+        if (suggest && ((uint64_t)lock_limit.rlim_max > (uint64_t)lock_limit.rlim_cur + size)) {
             suggest = false;
         }
 #endif
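
The cast matters because `rlim_t` is not guaranteed to match the type of `size`; mixing them in `rlim_cur + size` can trigger implicit-conversion warnings or unintended wraparound where the types differ in width. A compilable, standalone restatement of the patched check, assuming a POSIX system (the byte count is made up for illustration):

// sketch: the RLIMIT_MEMLOCK check from the patch, done in one unsigned 64-bit type
#include <sys/resource.h>
#include <cstdint>
#include <cstdio>

int main() {
    rlimit lock_limit {};
    uint64_t size = 16ull * 1024 * 1024; // hypothetical number of extra bytes to mlock

    bool suggest = true; // whether to suggest touching RLIMIT_MEMLOCK, mirroring the original logic
    if (getrlimit(RLIMIT_MEMLOCK, &lock_limit) != 0) {
        suggest = false;
    }
    // widen both rlim_t values so the addition and comparison happen in one unsigned 64-bit type
    if (suggest && ((uint64_t) lock_limit.rlim_max > (uint64_t) lock_limit.rlim_cur + size)) {
        suggest = false;
    }
    std::printf("suggest: %s\n", suggest ? "yes" : "no");
    return 0;
}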

tools/server/server-common.cpp

Lines changed: 8 additions & 3 deletions
@@ -1263,7 +1263,11 @@ json convert_anthropic_to_oai(const json & body) {
     return oai_body;
 }
 
-json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64) {
+json format_embeddings_response_oaicompat(
+        const json & request,
+        const std::string & model_name,
+        const json & embeddings,
+        bool use_base64) {
     json data = json::array();
     int32_t n_tokens = 0;
     int i = 0;
@@ -1293,7 +1297,7 @@ json format_embeddings_response_oaicompat(const json & request, const json & emb
     }
 
     json res = json {
-        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+        {"model", json_value(request, "model", model_name)},
         {"object", "list"},
         {"usage", json {
             {"prompt_tokens", n_tokens},
@@ -1307,6 +1311,7 @@ json format_embeddings_response_oaicompat(const json & request, const json & emb
 
 json format_response_rerank(
         const json & request,
+        const std::string & model_name,
         const json & ranks,
         bool is_tei_format,
         std::vector<std::string> & texts,
@@ -1338,7 +1343,7 @@ json format_response_rerank(
     if (is_tei_format) return results;
 
     json res = json{
-        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+        {"model", json_value(request, "model", model_name)},
         {"object", "list"},
         {"usage", json{
             {"prompt_tokens", n_tokens},

tools/server/server-common.h

Lines changed: 6 additions & 3 deletions
@@ -13,8 +13,6 @@
 #include <vector>
 #include <cinttypes>
 
-#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
-
 const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
 
 using json = nlohmann::ordered_json;
@@ -298,11 +296,16 @@ json oaicompat_chat_params_parse(
 json convert_anthropic_to_oai(const json & body);
 
 // TODO: move it to server-task.cpp
-json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false);
+json format_embeddings_response_oaicompat(
+        const json & request,
+        const std::string & model_name,
+        const json & embeddings,
+        bool use_base64 = false);
 
 // TODO: move it to server-task.cpp
 json format_response_rerank(
         const json & request,
+        const std::string & model_name,
         const json & ranks,
         bool is_tei_format,
         std::vector<std::string> & texts,

tools/server/server-context.cpp

Lines changed: 22 additions & 6 deletions
@@ -17,6 +17,7 @@
 #include <cinttypes>
 #include <memory>
 #include <unordered_set>
+#include <filesystem>
 
 // fix problem with std::min and std::max
 #if defined(_WIN32)
@@ -518,6 +519,8 @@ struct server_context_impl {
     // Necessary similarity of prompt for slot selection
     float slot_prompt_similarity = 0.0f;
 
+    std::string model_name; // name of the loaded model, to be used by API
+
     common_chat_templates_ptr chat_templates;
     oaicompat_parser_options oai_parser_opt;
 
@@ -758,6 +761,18 @@ struct server_context_impl {
         }
         SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
 
+        if (!params_base.model_alias.empty()) {
+            // user explicitly specified model name
+            model_name = params_base.model_alias;
+        } else if (!params_base.model.name.empty()) {
+            // use model name in registry format (for models in cache)
+            model_name = params_base.model.name;
+        } else {
+            // fallback: derive model name from file name
+            auto model_path = std::filesystem::path(params_base.model.path);
+            model_name = model_path.filename().string();
+        }
+
         // thinking is enabled if:
         // 1. It's not explicitly disabled (reasoning_budget == 0)
         // 2. The chat template supports it
@@ -2611,7 +2626,7 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
         // OAI-compat
         task.params.res_type = res_type;
         task.params.oaicompat_cmpl_id = completion_id;
-        // oaicompat_model is already populated by params_from_json_cmpl
+        task.params.oaicompat_model = ctx_server.model_name;
 
         tasks.push_back(std::move(task));
     }
@@ -2939,7 +2954,7 @@ void server_routes::init_routes() {
         json data = {
             { "default_generation_settings", default_generation_settings_for_props },
             { "total_slots", ctx_server.params_base.n_parallel },
-            { "model_alias", ctx_server.params_base.model_alias },
+            { "model_alias", ctx_server.model_name },
             { "model_path", ctx_server.params_base.model.path },
             { "modalities", json {
                 {"vision", ctx_server.oai_parser_opt.allow_image},
@@ -3181,8 +3196,8 @@
         json models = {
             {"models", {
                 {
-                    {"name", params.model_alias.empty() ? params.model.path : params.model_alias},
-                    {"model", params.model_alias.empty() ? params.model.path : params.model_alias},
+                    {"name", ctx_server.model_name},
+                    {"model", ctx_server.model_name},
                     {"modified_at", ""},
                     {"size", ""},
                     {"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash
@@ -3204,7 +3219,7 @@
             {"object", "list"},
             {"data", {
                 {
-                    {"id", params.model_alias.empty() ? params.model.path : params.model_alias},
+                    {"id", ctx_server.model_name},
                     {"object", "model"},
                     {"created", std::time(0)},
                     {"owned_by", "llamacpp"},
@@ -3351,6 +3366,7 @@
         // write JSON response
         json root = format_response_rerank(
             body,
+            ctx_server.model_name,
             responses,
             is_tei_format,
             documents,
@@ -3613,7 +3629,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_embeddings_impl(cons
 
         // write JSON response
         json root = res_type == TASK_RESPONSE_TYPE_OAI_EMBD
-            ? format_embeddings_response_oaicompat(body, ctx_server.model_name, responses, use_base64)
+            ? format_embeddings_response_oaicompat(body, ctx_server.model_name, responses, use_base64)
             : json(responses);
         res->ok(root);
         return res;
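
The added block resolves the API-visible model name once at startup: an explicit alias first, then the registry-style name for cached models, then the file name of the model as a last resort. A minimal sketch of the same fallback chain as a free function (the function and parameter names are illustrative, not the server's actual API):

#include <filesystem>
#include <iostream>
#include <string>

// pick the name reported by the API: alias > registry name > model file name
static std::string resolve_model_name(const std::string & alias,
                                      const std::string & registry_name,
                                      const std::string & model_path) {
    if (!alias.empty()) {
        return alias;           // user explicitly specified a name
    }
    if (!registry_name.empty()) {
        return registry_name;   // e.g. a cached model pulled by name
    }
    return std::filesystem::path(model_path).filename().string(); // fallback: file name
}

int main() {
    std::cout << resolve_model_name("", "", "/models/some-model-q4_k_m.gguf") << "\n"; // some-model-q4_k_m.gguf
    std::cout << resolve_model_name("my-alias", "", "/models/some-model-q4_k_m.gguf") << "\n"; // my-alias
    return 0;
}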

tools/server/server-models.cpp

Lines changed: 63 additions & 8 deletions
@@ -24,8 +24,55 @@
 #include <unistd.h>
 #endif
 
+#if defined(__APPLE__) && defined(__MACH__)
+// macOS: use _NSGetExecutablePath to get the executable path
+#include <mach-o/dyld.h>
+#include <limits.h>
+#endif
+
 #define CMD_EXIT "exit"
 
+static std::filesystem::path get_server_exec_path() {
+#if defined(_WIN32)
+    wchar_t buf[32768] = { 0 }; // Large buffer to handle long paths
+    DWORD len = GetModuleFileNameW(nullptr, buf, _countof(buf));
+    if (len == 0 || len >= _countof(buf)) {
+        throw std::runtime_error("GetModuleFileNameW failed or path too long");
+    }
+    return std::filesystem::path(buf);
+#elif defined(__APPLE__) && defined(__MACH__)
+    char small_path[PATH_MAX];
+    uint32_t size = sizeof(small_path);
+
+    if (_NSGetExecutablePath(small_path, &size) == 0) {
+        // resolve any symlinks to get absolute path
+        try {
+            return std::filesystem::canonical(std::filesystem::path(small_path));
+        } catch (...) {
+            return std::filesystem::path(small_path);
+        }
+    } else {
+        // buffer was too small, allocate required size and call again
+        std::vector<char> buf(size);
+        if (_NSGetExecutablePath(buf.data(), &size) == 0) {
+            try {
+                return std::filesystem::canonical(std::filesystem::path(buf.data()));
+            } catch (...) {
+                return std::filesystem::path(buf.data());
+            }
+        }
+        throw std::runtime_error("_NSGetExecutablePath failed after buffer resize");
+    }
+#else
+    char path[FILENAME_MAX];
+    ssize_t count = readlink("/proc/self/exe", path, FILENAME_MAX);
+    if (count <= 0) {
+        throw std::runtime_error("failed to resolve /proc/self/exe");
+    }
+    return std::filesystem::path(std::string(path, count));
+#endif
+}
+
 struct local_model {
     std::string name;
     std::string path;
@@ -99,6 +146,14 @@ server_models::server_models(
     for (char ** env = envp; *env != nullptr; env++) {
         base_env.push_back(std::string(*env));
     }
+    GGML_ASSERT(!base_args.empty());
+    // set binary path
+    try {
+        base_args[0] = get_server_exec_path().string();
+    } catch (const std::exception & e) {
+        LOG_WRN("failed to get server executable path: %s\n", e.what());
+        LOG_WRN("using original argv[0] as fallback: %s\n", base_args[0].c_str());
+    }
     // TODO: allow refreshing cached model list
     // add cached models
     auto cached_models = common_list_cached_models();
@@ -587,26 +642,26 @@ static void res_ok(std::unique_ptr<server_http_res> & res, const json & response
     res->data = safe_json_to_str(response_data);
 }
 
-static void res_error(std::unique_ptr<server_http_res> & res, const json & error_data) {
+static void res_err(std::unique_ptr<server_http_res> & res, const json & error_data) {
     res->status = json_value(error_data, "code", 500);
     res->data = safe_json_to_str({{ "error", error_data }});
 }
 
 static bool router_validate_model(const std::string & name, server_models & models, bool models_autoload, std::unique_ptr<server_http_res> & res) {
     if (name.empty()) {
-        res_error(res, format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST));
+        res_err(res, format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST));
         return false;
     }
     auto meta = models.get_meta(name);
     if (!meta.has_value()) {
-        res_error(res, format_error_response("model not found", ERROR_TYPE_INVALID_REQUEST));
+        res_err(res, format_error_response("model not found", ERROR_TYPE_INVALID_REQUEST));
         return false;
     }
     if (models_autoload) {
         models.ensure_model_loaded(name);
     } else {
         if (meta->status != SERVER_MODEL_STATUS_LOADED) {
-            res_error(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
+            res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
             return false;
         }
     }
@@ -706,11 +761,11 @@ void server_models_routes::init_routes() {
         std::string name = json_value(body, "model", std::string());
         auto model = models.get_meta(name);
         if (!model.has_value()) {
-            res_error(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND));
+            res_err(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND));
             return res;
         }
         if (model->status == SERVER_MODEL_STATUS_LOADED) {
-            res_error(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST));
+            res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST));
             return res;
         }
         models.load(name, false);
@@ -768,11 +823,11 @@ void server_models_routes::init_routes() {
         std::string name = json_value(body, "model", std::string());
         auto model = models.get_meta(name);
         if (!model.has_value()) {
-            res_error(res, format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST));
+            res_err(res, format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST));
             return res;
         }
        if (model->status != SERVER_MODEL_STATUS_LOADED) {
-            res_error(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
+            res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
             return res;
         }
         models.unload(name);
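
The point of overwriting `base_args[0]` is that the router later respawns the server binary for each model, and the original `argv[0]` can be a bare or relative name that no longer resolves once the working directory or PATH differs; the OS-level query gives an absolute path instead. A minimal Linux-only usage sketch of the same idea (illustrative, reduced to the `/proc/self/exe` branch; not the server's code):

#include <filesystem>
#include <stdexcept>
#include <string>
#include <vector>
#include <cstdio>
#include <unistd.h>

// Linux-only variant: resolve the running binary via /proc/self/exe
static std::filesystem::path get_exec_path() {
    char path[FILENAME_MAX];
    ssize_t count = readlink("/proc/self/exe", path, sizeof(path));
    if (count <= 0) {
        throw std::runtime_error("failed to resolve /proc/self/exe");
    }
    return std::filesystem::path(std::string(path, (size_t) count));
}

int main(int argc, char ** argv) {
    std::vector<std::string> base_args(argv, argv + argc);
    try {
        base_args[0] = get_exec_path().string(); // absolute path survives a later chdir()
    } catch (const std::exception & e) {
        std::fprintf(stderr, "keeping argv[0] (%s): %s\n", base_args[0].c_str(), e.what());
    }
    std::printf("would respawn as: %s\n", base_args[0].c_str());
    return 0;
}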

tools/server/server-task.cpp

Lines changed: 0 additions & 3 deletions
@@ -450,9 +450,6 @@ task_params server_task::params_from_json_cmpl(
         }
     }
 
-    std::string model_name = params_base.model_alias.empty() ? DEFAULT_OAICOMPAT_MODEL : params_base.model_alias;
-    params.oaicompat_model = json_value(data, "model", model_name);
-
     return params;
 }
 
tools/server/tests/unit/test_chat_completion.py

Lines changed: 4 additions & 3 deletions
@@ -41,7 +41,8 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
     assert res.status_code == 200
     assert "cmpl" in res.body["id"]  # make sure the completion id has the expected format
     assert res.body["system_fingerprint"].startswith("b")
-    assert res.body["model"] == model if model is not None else server.model_alias
+    # we no longer reflect back the model name, see https://github.com/ggml-org/llama.cpp/pull/17668
+    # assert res.body["model"] == model if model is not None else server.model_alias
     assert res.body["usage"]["prompt_tokens"] == n_prompt
     assert res.body["usage"]["completion_tokens"] == n_predicted
     choice = res.body["choices"][0]
@@ -59,7 +60,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
 )
 def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
     global server
-    server.model_alias = None  # try using DEFAULT_OAICOMPAT_MODEL
+    server.model_alias = "llama-test-model"
     server.start()
     res = server.make_stream_request("POST", "/chat/completions", data={
         "max_tokens": max_tokens,
@@ -81,7 +82,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
     else:
         assert "role" not in choice["delta"]
     assert data["system_fingerprint"].startswith("b")
-    assert "gpt-3.5" in data["model"]  # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
+    assert data["model"] == "llama-test-model"
     if last_cmpl_id is None:
         last_cmpl_id = data["id"]
     assert last_cmpl_id == data["id"]  # make sure the completion id is the same for all events in the stream
