
Commit d99547b

Merge branch 'ggml-org:master' into phi3v_implementation
2 parents: 3c5c7ad + f3a9674

File tree: 9 files changed, +109 −27 lines

9 files changed

+109
-27
lines changed

.github/workflows/winget.yml

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@ jobs:
   update:
     name: Update Winget Package
     runs-on: ubuntu-latest
+    if: ${{ github.repository.owner.login == 'ggml-org' }}
 
     steps:
       - name: Install cargo binstall

convert_hf_to_gguf.py

Lines changed: 4 additions & 0 deletions
@@ -2842,6 +2842,10 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        # TODO: probably not worth supporting quantized weight, as official BF16 is also available
+        if name.endswith("weight_scale_inv"):
+            raise ValueError("This is a quantized weight, please use BF16 weight instead")
+
         name = name.replace("language_model.", "")
         if "multi_modal_projector" in name or "vision_tower" in name:
             return []

src/llama-mmap.cpp

Lines changed: 1 addition & 1 deletion
@@ -485,7 +485,7 @@ struct llama_mlock::impl {
         if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
             suggest = false;
         }
-        if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
+        if (suggest && ((uint64_t)lock_limit.rlim_max > (uint64_t)lock_limit.rlim_cur + size)) {
             suggest = false;
         }
 #endif
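
The cast matters because `rlim_t` is not guaranteed to match the type of `size`; mixing them in `rlim_cur + size` can trigger implicit-conversion warnings or unintended wraparound where the types differ in width. A compilable, standalone restatement of the patched check, assuming a POSIX system (the byte count is made up for illustration):

// sketch: the RLIMIT_MEMLOCK check from the patch, done in one unsigned 64-bit type
#include <sys/resource.h>
#include <cstdint>
#include <cstdio>

int main() {
    rlimit lock_limit {};
    uint64_t size = 16ull * 1024 * 1024; // hypothetical number of extra bytes to mlock

    bool suggest = true; // whether to suggest touching RLIMIT_MEMLOCK, mirroring the original logic
    if (getrlimit(RLIMIT_MEMLOCK, &lock_limit) != 0) {
        suggest = false;
    }
    // widen both rlim_t values so the addition and comparison happen in one unsigned 64-bit type
    if (suggest && ((uint64_t) lock_limit.rlim_max > (uint64_t) lock_limit.rlim_cur + size)) {
        suggest = false;
    }
    std::printf("suggest: %s\n", suggest ? "yes" : "no");
    return 0;
}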

tools/server/server-common.cpp

Lines changed: 8 additions & 3 deletions
@@ -1263,7 +1263,11 @@ json convert_anthropic_to_oai(const json & body) {
     return oai_body;
 }
 
-json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64) {
+json format_embeddings_response_oaicompat(
+        const json & request,
+        const std::string & model_name,
+        const json & embeddings,
+        bool use_base64) {
     json data = json::array();
     int32_t n_tokens = 0;
     int i = 0;
@@ -1293,7 +1297,7 @@ json format_embeddings_response_oaicompat(const json & request, const json & emb
     }
 
     json res = json {
-        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+        {"model", json_value(request, "model", model_name)},
         {"object", "list"},
         {"usage", json {
             {"prompt_tokens", n_tokens},
@@ -1307,6 +1311,7 @@ json format_embeddings_response_oaicompat(const json & request, const json & emb
 
 json format_response_rerank(
         const json & request,
+        const std::string & model_name,
         const json & ranks,
         bool is_tei_format,
         std::vector<std::string> & texts,
@@ -1338,7 +1343,7 @@ json format_response_rerank(
     if (is_tei_format) return results;
 
     json res = json{
-        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+        {"model", json_value(request, "model", model_name)},
         {"object", "list"},
         {"usage", json{
             {"prompt_tokens", n_tokens},

tools/server/server-common.h

Lines changed: 6 additions & 3 deletions
@@ -13,8 +13,6 @@
 #include <vector>
 #include <cinttypes>
 
-#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo"
-
 const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
 
 using json = nlohmann::ordered_json;
@@ -298,11 +296,16 @@ json oaicompat_chat_params_parse(
 json convert_anthropic_to_oai(const json & body);
 
 // TODO: move it to server-task.cpp
-json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false);
+json format_embeddings_response_oaicompat(
+        const json & request,
+        const std::string & model_name,
+        const json & embeddings,
+        bool use_base64 = false);
 
 // TODO: move it to server-task.cpp
 json format_response_rerank(
         const json & request,
+        const std::string & model_name,
         const json & ranks,
         bool is_tei_format,
         std::vector<std::string> & texts,

tools/server/server-context.cpp

Lines changed: 22 additions & 6 deletions
@@ -17,6 +17,7 @@
 #include <cinttypes>
 #include <memory>
 #include <unordered_set>
+#include <filesystem>
 
 // fix problem with std::min and std::max
 #if defined(_WIN32)
@@ -518,6 +519,8 @@ struct server_context_impl {
     // Necessary similarity of prompt for slot selection
     float slot_prompt_similarity = 0.0f;
 
+    std::string model_name; // name of the loaded model, to be used by API
+
     common_chat_templates_ptr chat_templates;
     oaicompat_parser_options oai_parser_opt;
 
@@ -758,6 +761,18 @@ struct server_context_impl {
         }
         SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
 
+        if (!params_base.model_alias.empty()) {
+            // user explicitly specified model name
+            model_name = params_base.model_alias;
+        } else if (!params_base.model.name.empty()) {
+            // use model name in registry format (for models in cache)
+            model_name = params_base.model.name;
+        } else {
+            // fallback: derive model name from file name
+            auto model_path = std::filesystem::path(params_base.model.path);
+            model_name = model_path.filename().string();
+        }
+
         // thinking is enabled if:
         // 1. It's not explicitly disabled (reasoning_budget == 0)
         // 2. The chat template supports it
@@ -2611,7 +2626,7 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
         // OAI-compat
         task.params.res_type = res_type;
         task.params.oaicompat_cmpl_id = completion_id;
-        // oaicompat_model is already populated by params_from_json_cmpl
+        task.params.oaicompat_model = ctx_server.model_name;
 
         tasks.push_back(std::move(task));
     }
@@ -2939,7 +2954,7 @@ void server_routes::init_routes() {
         json data = {
             { "default_generation_settings", default_generation_settings_for_props },
             { "total_slots", ctx_server.params_base.n_parallel },
-            { "model_alias", ctx_server.params_base.model_alias },
+            { "model_alias", ctx_server.model_name },
             { "model_path", ctx_server.params_base.model.path },
             { "modalities", json {
                 {"vision", ctx_server.oai_parser_opt.allow_image},
@@ -3181,8 +3196,8 @@
         json models = {
             {"models", {
                 {
-                    {"name", params.model_alias.empty() ? params.model.path : params.model_alias},
-                    {"model", params.model_alias.empty() ? params.model.path : params.model_alias},
+                    {"name", ctx_server.model_name},
+                    {"model", ctx_server.model_name},
                     {"modified_at", ""},
                     {"size", ""},
                     {"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash
@@ -3204,7 +3219,7 @@
             {"object", "list"},
             {"data", {
                 {
-                    {"id", params.model_alias.empty() ? params.model.path : params.model_alias},
+                    {"id", ctx_server.model_name},
                     {"object", "model"},
                     {"created", std::time(0)},
                     {"owned_by", "llamacpp"},
@@ -3351,6 +3366,7 @@
         // write JSON response
         json root = format_response_rerank(
             body,
+            ctx_server.model_name,
             responses,
             is_tei_format,
             documents,
@@ -3613,7 +3629,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_embeddings_impl(cons
 
         // write JSON response
         json root = res_type == TASK_RESPONSE_TYPE_OAI_EMBD
-            ? format_embeddings_response_oaicompat(body, ctx_server.model_name, responses, use_base64)
+            ? format_embeddings_response_oaicompat(body, ctx_server.model_name, responses, use_base64)
             : json(responses);
         res->ok(root);
         return res;
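
The added block resolves the API-visible model name once at startup: an explicit alias first, then the registry-style name for cached models, then the file name of the model as a last resort. A minimal sketch of the same fallback chain as a free function (the function and parameter names are illustrative, not the server's actual API):

#include <filesystem>
#include <iostream>
#include <string>

// pick the name reported by the API: alias > registry name > model file name
static std::string resolve_model_name(const std::string & alias,
                                      const std::string & registry_name,
                                      const std::string & model_path) {
    if (!alias.empty()) {
        return alias;           // user explicitly specified a name
    }
    if (!registry_name.empty()) {
        return registry_name;   // e.g. a cached model pulled by name
    }
    return std::filesystem::path(model_path).filename().string(); // fallback: file name
}

int main() {
    std::cout << resolve_model_name("", "", "/models/some-model-q4_k_m.gguf") << "\n"; // some-model-q4_k_m.gguf
    std::cout << resolve_model_name("my-alias", "", "/models/some-model-q4_k_m.gguf") << "\n"; // my-alias
    return 0;
}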

tools/server/server-models.cpp

Lines changed: 63 additions & 8 deletions
@@ -24,8 +24,55 @@
 #include <unistd.h>
 #endif
 
+#if defined(__APPLE__) && defined(__MACH__)
+// macOS: use _NSGetExecutablePath to get the executable path
+#include <mach-o/dyld.h>
+#include <limits.h>
+#endif
+
 #define CMD_EXIT "exit"
 
+static std::filesystem::path get_server_exec_path() {
+#if defined(_WIN32)
+    wchar_t buf[32768] = { 0 }; // Large buffer to handle long paths
+    DWORD len = GetModuleFileNameW(nullptr, buf, _countof(buf));
+    if (len == 0 || len >= _countof(buf)) {
+        throw std::runtime_error("GetModuleFileNameW failed or path too long");
+    }
+    return std::filesystem::path(buf);
+#elif defined(__APPLE__) && defined(__MACH__)
+    char small_path[PATH_MAX];
+    uint32_t size = sizeof(small_path);
+
+    if (_NSGetExecutablePath(small_path, &size) == 0) {
+        // resolve any symlinks to get absolute path
+        try {
+            return std::filesystem::canonical(std::filesystem::path(small_path));
+        } catch (...) {
+            return std::filesystem::path(small_path);
+        }
+    } else {
+        // buffer was too small, allocate required size and call again
+        std::vector<char> buf(size);
+        if (_NSGetExecutablePath(buf.data(), &size) == 0) {
+            try {
+                return std::filesystem::canonical(std::filesystem::path(buf.data()));
+            } catch (...) {
+                return std::filesystem::path(buf.data());
+            }
+        }
+        throw std::runtime_error("_NSGetExecutablePath failed after buffer resize");
+    }
+#else
+    char path[FILENAME_MAX];
+    ssize_t count = readlink("/proc/self/exe", path, FILENAME_MAX);
+    if (count <= 0) {
+        throw std::runtime_error("failed to resolve /proc/self/exe");
+    }
+    return std::filesystem::path(std::string(path, count));
+#endif
+}
+
 struct local_model {
     std::string name;
     std::string path;
@@ -99,6 +146,14 @@ server_models::server_models(
     for (char ** env = envp; *env != nullptr; env++) {
         base_env.push_back(std::string(*env));
     }
+    GGML_ASSERT(!base_args.empty());
+    // set binary path
+    try {
+        base_args[0] = get_server_exec_path().string();
+    } catch (const std::exception & e) {
+        LOG_WRN("failed to get server executable path: %s\n", e.what());
+        LOG_WRN("using original argv[0] as fallback: %s\n", base_args[0].c_str());
+    }
     // TODO: allow refreshing cached model list
     // add cached models
     auto cached_models = common_list_cached_models();
@@ -587,26 +642,26 @@ static void res_ok(std::unique_ptr<server_http_res> & res, const json & response
     res->data = safe_json_to_str(response_data);
 }
 
-static void res_error(std::unique_ptr<server_http_res> & res, const json & error_data) {
+static void res_err(std::unique_ptr<server_http_res> & res, const json & error_data) {
     res->status = json_value(error_data, "code", 500);
     res->data = safe_json_to_str({{ "error", error_data }});
 }
 
 static bool router_validate_model(const std::string & name, server_models & models, bool models_autoload, std::unique_ptr<server_http_res> & res) {
     if (name.empty()) {
-        res_error(res, format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST));
+        res_err(res, format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST));
         return false;
     }
     auto meta = models.get_meta(name);
     if (!meta.has_value()) {
-        res_error(res, format_error_response("model not found", ERROR_TYPE_INVALID_REQUEST));
+        res_err(res, format_error_response("model not found", ERROR_TYPE_INVALID_REQUEST));
         return false;
     }
     if (models_autoload) {
         models.ensure_model_loaded(name);
     } else {
         if (meta->status != SERVER_MODEL_STATUS_LOADED) {
-            res_error(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
+            res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
             return false;
         }
     }
@@ -706,11 +761,11 @@ void server_models_routes::init_routes() {
         std::string name = json_value(body, "model", std::string());
         auto model = models.get_meta(name);
         if (!model.has_value()) {
-            res_error(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND));
+            res_err(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND));
             return res;
         }
         if (model->status == SERVER_MODEL_STATUS_LOADED) {
-            res_error(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST));
+            res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST));
             return res;
         }
         models.load(name, false);
@@ -768,11 +823,11 @@ void server_models_routes::init_routes() {
         std::string name = json_value(body, "model", std::string());
         auto model = models.get_meta(name);
         if (!model.has_value()) {
-            res_error(res, format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST));
+            res_err(res, format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST));
             return res;
         }
        if (model->status != SERVER_MODEL_STATUS_LOADED) {
-            res_error(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
+            res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
             return res;
         }
         models.unload(name);
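
The point of overwriting `base_args[0]` is that the router later respawns the server binary for each model, and the original `argv[0]` can be a bare or relative name that no longer resolves once the working directory or PATH differs; the OS-level query gives an absolute path instead. A minimal Linux-only usage sketch of the same idea (illustrative, reduced to the `/proc/self/exe` branch; not the server's code):

#include <filesystem>
#include <stdexcept>
#include <string>
#include <vector>
#include <cstdio>
#include <unistd.h>

// Linux-only variant: resolve the running binary via /proc/self/exe
static std::filesystem::path get_exec_path() {
    char path[FILENAME_MAX];
    ssize_t count = readlink("/proc/self/exe", path, sizeof(path));
    if (count <= 0) {
        throw std::runtime_error("failed to resolve /proc/self/exe");
    }
    return std::filesystem::path(std::string(path, (size_t) count));
}

int main(int argc, char ** argv) {
    std::vector<std::string> base_args(argv, argv + argc);
    try {
        base_args[0] = get_exec_path().string(); // absolute path survives a later chdir()
    } catch (const std::exception & e) {
        std::fprintf(stderr, "keeping argv[0] (%s): %s\n", base_args[0].c_str(), e.what());
    }
    std::printf("would respawn as: %s\n", base_args[0].c_str());
    return 0;
}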

tools/server/server-task.cpp

Lines changed: 0 additions & 3 deletions
@@ -450,9 +450,6 @@ task_params server_task::params_from_json_cmpl(
         }
     }
 
-    std::string model_name = params_base.model_alias.empty() ? DEFAULT_OAICOMPAT_MODEL : params_base.model_alias;
-    params.oaicompat_model = json_value(data, "model", model_name);
-
     return params;
 }
 
tools/server/tests/unit/test_chat_completion.py

Lines changed: 4 additions & 3 deletions
@@ -41,7 +41,8 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
     assert res.status_code == 200
     assert "cmpl" in res.body["id"]  # make sure the completion id has the expected format
     assert res.body["system_fingerprint"].startswith("b")
-    assert res.body["model"] == model if model is not None else server.model_alias
+    # we no longer reflect back the model name, see https://github.com/ggml-org/llama.cpp/pull/17668
+    # assert res.body["model"] == model if model is not None else server.model_alias
     assert res.body["usage"]["prompt_tokens"] == n_prompt
     assert res.body["usage"]["completion_tokens"] == n_predicted
     choice = res.body["choices"][0]
@@ -59,7 +60,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
 )
 def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
     global server
-    server.model_alias = None  # try using DEFAULT_OAICOMPAT_MODEL
+    server.model_alias = "llama-test-model"
     server.start()
     res = server.make_stream_request("POST", "/chat/completions", data={
         "max_tokens": max_tokens,
@@ -81,7 +82,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
     else:
         assert "role" not in choice["delta"]
     assert data["system_fingerprint"].startswith("b")
-    assert "gpt-3.5" in data["model"]  # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
+    assert data["model"] == "llama-test-model"
     if last_cmpl_id is None:
         last_cmpl_id = data["id"]
     assert last_cmpl_id == data["id"]  # make sure the completion id is the same for all events in the stream
