From 4a34e0e6ea830c52cd6cb24f055eb4fde2a88104 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Mon, 1 Dec 2025 22:58:49 +0100
Subject: [PATCH 1/3] server: remove default "gpt-3.5-turbo" model name

---
 tools/server/server-common.cpp                 | 11 ++++++++---
 tools/server/server-common.h                   |  9 ++++++---
 tools/server/server-context.cpp                | 27 ++++++++++++++++++++++-----
 tools/server/server-task.cpp                   |  2 +-
 tools/server/server-task.h                     |  1 +
 .../server/tests/unit/test_chat_completion.py |  4 ++--
 6 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index 0bbc4e858f2..f48ea5b62a5 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -1263,7 +1263,11 @@ json convert_anthropic_to_oai(const json & body) {
     return oai_body;
 }

-json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64) {
+json format_embeddings_response_oaicompat(
+    const json & request,
+    const std::string & model_name,
+    const json & embeddings,
+    bool use_base64) {
     json data = json::array();
     int32_t n_tokens = 0;
     int i = 0;
@@ -1293,7 +1297,7 @@ json format_embeddings_response_oaicompat(const json & emb
     }

     json res = json {
-        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+        {"model", json_value(request, "model", model_name)},
         {"object", "list"},
         {"usage", json {
             {"prompt_tokens", n_tokens},
@@ -1307,6 +1311,7 @@ json format_embeddings_response_oaicompat(const json & emb

 json format_response_rerank(
     const json & request,
+    const std::string & model_name,
     const json & ranks,
     bool is_tei_format,
     std::vector<std::string> & texts,
@@ -1338,7 +1343,7 @@ json format_response_rerank(
     if (is_tei_format) return results;

     json res = json{
-        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+        {"model", json_value(request, "model", model_name)},
         {"object", "list"},
         {"usage", json{
             {"prompt_tokens", n_tokens},
diff --git a/tools/server/server-common.h b/tools/server/server-common.h
index ab8aabbad03..51ae9ea8a96 100644
--- a/tools/server/server-common.h
+++ b/tools/server/server-common.h
@@ -13,8 +13,6 @@
 #include
 #include

-#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
-
 const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);

 using json = nlohmann::ordered_json;
@@ -298,11 +296,16 @@ json oaicompat_chat_params_parse(
 json convert_anthropic_to_oai(const json & body);

 // TODO: move it to server-task.cpp
-json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false);
+json format_embeddings_response_oaicompat(
+    const json & request,
+    const std::string & model_name,
+    const json & embeddings,
+    bool use_base64 = false);

 // TODO: move it to server-task.cpp
 json format_response_rerank(
     const json & request,
+    const std::string & model_name,
     const json & ranks,
     bool is_tei_format,
     std::vector<std::string> & texts,
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index e992db70f16..08b3186fc44 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include <filesystem>

 // fix problem with std::min and std::max
 #if defined(_WIN32)
@@ -518,6 +519,8 @@ struct server_context_impl {
     // Necessary similarity of prompt for slot selection
     float slot_prompt_similarity = 0.0f;

+    std::string model_name; // name of the loaded model, to be used by API
+
     common_chat_templates_ptr chat_templates;

     oaicompat_parser_options oai_parser_opt;
@@ -758,6 +761,18 @@ struct server_context_impl {
         }
         SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");

+        if (!params_base.model_alias.empty()) {
+            // user explicitly specified model name
+            model_name = params_base.model_alias;
+        } else if (!params_base.model.name.empty()) {
+            // use model name in registry format (for models in cache)
+            model_name = params_base.model.name;
+        } else {
+            // fallback: derive model name from file name
+            auto model_path = std::filesystem::path(params_base.model.path);
+            model_name = model_path.filename().string();
+        }
+
         // thinking is enabled if:
         // 1. It's not explicitly disabled (reasoning_budget == 0)
         // 2. The chat template supports it
@@ -2605,6 +2620,7 @@ static std::unique_ptr handle_completions_impl(
         task.params = server_task::params_from_json_cmpl(
             ctx_server.ctx,
             ctx_server.params_base,
+            ctx_server.model_name,
             data);
         task.id_slot = json_value(data, "id_slot", -1);

@@ -2939,7 +2955,7 @@ void server_routes::init_routes() {
         json data = {
             { "default_generation_settings", default_generation_settings_for_props },
             { "total_slots", ctx_server.params_base.n_parallel },
-            { "model_alias", ctx_server.params_base.model_alias },
+            { "model_alias", ctx_server.model_name },
             { "model_path", ctx_server.params_base.model.path },
             { "modalities", json {
                 {"vision", ctx_server.oai_parser_opt.allow_image},
@@ -3181,8 +3197,8 @@ void server_routes::init_routes() {
         json models = {
             {"models", {
                 {
-                    {"name", params.model_alias.empty() ? params.model.path : params.model_alias},
-                    {"model", params.model_alias.empty() ? params.model.path : params.model_alias},
+                    {"name", ctx_server.model_name},
+                    {"model", ctx_server.model_name},
                     {"modified_at", ""},
                     {"size", ""},
                     {"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash
@@ -3204,7 +3220,7 @@ void server_routes::init_routes() {
             {"object", "list"},
             {"data", {
                 {
-                    {"id", params.model_alias.empty() ? params.model.path : params.model_alias},
+                    {"id", ctx_server.model_name},
                     {"object", "model"},
                     {"created", std::time(0)},
                     {"owned_by", "llamacpp"},
@@ -3351,6 +3367,7 @@ void server_routes::init_routes() {
         // write JSON response
         json root = format_response_rerank(
             body,
+            ctx_server.model_name,
             responses,
             is_tei_format,
             documents,
@@ -3613,7 +3630,7 @@ std::unique_ptr server_routes::handle_embeddings_impl(cons

     // write JSON response
     json root = res_type == TASK_RESPONSE_TYPE_OAI_EMBD
-        ? format_embeddings_response_oaicompat(body, responses, use_base64)
+        ? format_embeddings_response_oaicompat(body, ctx_server.model_name, responses, use_base64)
         : json(responses);
     res->ok(root);
     return res;
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index b447a1ef6da..8b818abfcfd 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -147,6 +147,7 @@ json task_params::to_json(bool only_metrics) const {
 task_params server_task::params_from_json_cmpl(
     const llama_context * ctx,
     const common_params & params_base,
+    const std::string & model_name,
     const json & data) {
     const llama_model * model = llama_get_model(ctx);
     const llama_vocab * vocab = llama_model_get_vocab(model);
@@ -450,7 +451,6 @@ task_params server_task::params_from_json_cmpl(
         }
     }

-    std::string model_name = params_base.model_alias.empty() ? DEFAULT_OAICOMPAT_MODEL : params_base.model_alias;
     params.oaicompat_model = json_value(data, "model", model_name);

     return params;
diff --git a/tools/server/server-task.h b/tools/server/server-task.h
index a22d7cab116..e0529fa86e8 100644
--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@@ -120,6 +120,7 @@ struct server_task {
     static task_params params_from_json_cmpl(
         const llama_context * ctx,
         const common_params & params_base,
+        const std::string & model_name,
         const json & data);

     // utility function
diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py
index 392e0efecdb..7f95add2122 100644
--- a/tools/server/tests/unit/test_chat_completion.py
+++ b/tools/server/tests/unit/test_chat_completion.py
@@ -59,7 +59,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
 )
 def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
     global server
-    server.model_alias = None  # try using DEFAULT_OAICOMPAT_MODEL
+    server.model_alias = "llama-test-model"
     server.start()
     res = server.make_stream_request("POST", "/chat/completions", data={
         "max_tokens": max_tokens,
@@ -81,7 +81,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
         else:
             assert "role" not in choice["delta"]
         assert data["system_fingerprint"].startswith("b")
-        assert "gpt-3.5" in data["model"]  # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
+        assert data["model"] == "llama-test-model"
         if last_cmpl_id is None:
             last_cmpl_id = data["id"]
         assert last_cmpl_id == data["id"]  # make sure the completion id is the same for all events in the stream

From b3081e02444f563c7b156840c04bc61035e19e8a Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Mon, 1 Dec 2025 23:23:43 +0100
Subject: [PATCH 2/3] do not reflect back model name from request

---
 tools/server/server-context.cpp | 3 +--
 tools/server/server-task.cpp    | 3 ---
 tools/server/server-task.h      | 1 -
 3 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 08b3186fc44..aac1a70bb2b 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -2620,14 +2620,13 @@ static std::unique_ptr handle_completions_impl(
         task.params = server_task::params_from_json_cmpl(
             ctx_server.ctx,
             ctx_server.params_base,
-            ctx_server.model_name,
             data);
         task.id_slot = json_value(data, "id_slot", -1);

         // OAI-compat
         task.params.res_type = res_type;
         task.params.oaicompat_cmpl_id = completion_id;
-        // oaicompat_model is already populated by params_from_json_cmpl
+        task.params.oaicompat_model = ctx_server.model_name;

         tasks.push_back(std::move(task));
     }
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index 8b818abfcfd..3f59127fb2f 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -147,7 +147,6 @@ json task_params::to_json(bool only_metrics) const {
 task_params server_task::params_from_json_cmpl(
     const llama_context * ctx,
     const common_params & params_base,
-    const std::string & model_name,
     const json & data) {
     const llama_model * model = llama_get_model(ctx);
     const llama_vocab * vocab = llama_model_get_vocab(model);
@@ -451,8 +450,6 @@ task_params server_task::params_from_json_cmpl(
         }
     }

-    params.oaicompat_model = json_value(data, "model", model_name);
-
     return params;
 }

diff --git a/tools/server/server-task.h b/tools/server/server-task.h
index e0529fa86e8..a22d7cab116 100644
--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@@ -120,7 +120,6 @@ struct server_task {
     static task_params params_from_json_cmpl(
         const llama_context * ctx,
         const common_params & params_base,
-        const std::string & model_name,
         const json & data);

     // utility function

From cab54f318bb23350018b722f23e1dc2ab986f4ea Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Tue, 2 Dec 2025 10:16:19 +0100
Subject: [PATCH 3/3] fix test

---
 tools/server/tests/unit/test_chat_completion.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py
index 7f95add2122..093cec91555 100644
--- a/tools/server/tests/unit/test_chat_completion.py
+++ b/tools/server/tests/unit/test_chat_completion.py
@@ -41,7 +41,8 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
     assert res.status_code == 200
     assert "cmpl" in res.body["id"]  # make sure the completion id has the expected format
     assert res.body["system_fingerprint"].startswith("b")
-    assert res.body["model"] == model if model is not None else server.model_alias
+    # we no longer reflect back the model name, see https://github.com/ggml-org/llama.cpp/pull/17668
+    # assert res.body["model"] == model if model is not None else server.model_alias
     assert res.body["usage"]["prompt_tokens"] == n_prompt
     assert res.body["usage"]["completion_tokens"] == n_predicted
     choice = res.body["choices"][0]
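
A minimal, self-contained sketch of the name-resolution order that PATCH 1/3 adds to server_context_impl: explicit alias first, then the registry-style name used for cached models, then the GGUF file name as a fallback. The model_params_stub type below is a hypothetical stand-in for the relevant fields of common_params; only the precedence logic mirrors the patch.

    #include <filesystem>
    #include <iostream>
    #include <string>

    // Hypothetical stand-in for the common_params fields the patch reads.
    struct model_params_stub {
        std::string model_alias; // set via --alias
        std::string model_name;  // registry-style name (models pulled into the cache)
        std::string model_path;  // path to the GGUF file on disk
    };

    // Same precedence as the block added in PATCH 1/3:
    // alias > registry name > file name derived from the model path.
    static std::string resolve_model_name(const model_params_stub & p) {
        if (!p.model_alias.empty()) {
            return p.model_alias; // user explicitly specified a name
        }
        if (!p.model_name.empty()) {
            return p.model_name;  // registry format, e.g. for cached models
        }
        return std::filesystem::path(p.model_path).filename().string();
    }

    int main() {
        model_params_stub p;
        p.model_path = "/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf";
        std::cout << resolve_model_name(p) << "\n"; // Llama-3.2-1B-Instruct-Q4_K_M.gguf

        p.model_alias = "my-local-llama";
        std::cout << resolve_model_name(p) << "\n"; // my-local-llama
        return 0;
    }

After this series, /props, the OAI-compatible models listing, and the Ollama-style models listing all report this resolved name instead of falling back to "gpt-3.5-turbo" or to the raw model path.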
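
PATCH 2/3 changes which value ends up in oaicompat_model: previously params_from_json_cmpl() read the "model" string from the request body and only fell back to the server-side name, so responses echoed whatever the client sent; now handle_completions_impl() always assigns ctx_server.model_name. A toy before/after sketch, using nlohmann::json and a simplified json_value_str() helper in place of the server's own json_value():

    #include <iostream>
    #include <string>
    #include <nlohmann/json.hpp>

    using json = nlohmann::ordered_json;

    // Simplified stand-in for the server's json_value() helper: return the
    // string field if present, otherwise the supplied default.
    static std::string json_value_str(const json & body, const std::string & key, const std::string & def) {
        return body.contains(key) && body.at(key).is_string() ? body.at(key).get<std::string>() : def;
    }

    int main() {
        const std::string model_name = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"; // resolved at startup
        const json request = { {"model", "whatever-the-client-sent"} };

        // Before PATCH 2/3: the request's "model" value was reflected back.
        const std::string before = json_value_str(request, "model", model_name);

        // After PATCH 2/3: the response always carries the loaded model's name.
        const std::string after = model_name;

        std::cout << "before: " << before << "\n"  // whatever-the-client-sent
                  << "after:  " << after  << "\n"; // Llama-3.2-1B-Instruct-Q4_K_M.gguf
        return 0;
    }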