Skip to content

Commit 9206ae1

Browse files
committed
feat: sync llama.cpp to b6709
Updates llama.cpp from b6638 to b6709, adding LFM2-MoE architecture support.

Changes:
- Updated third_party/llama.cpp submodule to b6709
- Synced cpp/ directory via scripts/bootstrap.sh
- Added LLM_ARCH_LFM2MOE for LiquidAI hybrid models
- Updated version.ts to build 6709

Tested with LiquidAI LFM2-1.2B models on iOS.

References:
- Release: https://github.com/ggml-org/llama.cpp/releases/tag/b6709
- PR: ggml-org/llama.cpp#16464
1 parent c56b150 commit 9206ae1

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+1524
-386
lines changed

cpp/chat-parser.cpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,35 @@ bool common_chat_msg_parser::add_tool_calls(const json & arr) {
7575
}
7676
return true;
7777
}
78+
79+
bool common_chat_msg_parser::add_tool_call_short_form(const json & tool_call) {
80+
if (!tool_call.is_object() || tool_call.size() != 1) {
81+
return false;
82+
}
83+
84+
// Get the tool name (the single key in the object)
85+
auto it = tool_call.begin();
86+
std::string name = it.key();
87+
88+
if (name.empty()) {
89+
return false;
90+
}
91+
92+
// Get the arguments (the nested object)
93+
const json & args_json = it.value();
94+
std::string arguments = "";
95+
96+
if (args_json.is_object()) {
97+
arguments = args_json.dump();
98+
} else if (args_json.is_string()) {
99+
arguments = args_json;
100+
} else if (!args_json.is_null()) {
101+
// For other types, convert to string representation
102+
arguments = args_json.dump();
103+
}
104+
105+
return add_tool_call(name, "", arguments);
106+
}
78107
void common_chat_msg_parser::finish() {
79108
if (!is_partial_ && pos_ != input_.size()) {
80109
throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_));

cpp/chat-parser.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@ class common_chat_msg_parser {
6464
// Adds an array of tool calls using their "name", "id" and "arguments" fields.
6565
bool add_tool_calls(const nlohmann::ordered_json & arr);
6666

67+
// Adds a tool call using the short form: { "tool_name": { "arg1": val, "arg2": val } }
68+
bool add_tool_call_short_form(const nlohmann::ordered_json & tool_call);
69+
6770
void finish();
6871

6972
bool consume_spaces();

cpp/chat.cpp

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -612,6 +612,7 @@ const char * common_chat_format_name(common_chat_format format) {
612612
case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
613613
case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
614614
case COMMON_CHAT_FORMAT_MISTRAL_NEMO: return "Mistral Nemo";
615+
case COMMON_CHAT_FORMAT_MAGISTRAL: return "Magistral";
615616
case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x";
616617
case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools";
617618
case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1";
@@ -625,6 +626,7 @@ const char * common_chat_format_name(common_chat_format format) {
625626
case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
626627
case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
627628
case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
629+
case COMMON_CHAT_FORMAT_APERTUS: return "Apertus";
628630
default:
629631
throw std::runtime_error("Unknown chat format");
630632
}
@@ -788,6 +790,7 @@ static std::string apply(
788790
}
789791
tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
790792
tmpl_inputs.extra_context = inputs.extra_context;
793+
tmpl_inputs.extra_context["enable_thinking"] = inputs.enable_thinking;
791794
if (additional_context) {
792795
tmpl_inputs.extra_context.merge_patch(*additional_context);
793796
}
@@ -968,6 +971,65 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
968971
data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
969972
return data;
970973
}
974+
975+
// Builds chat params for the Magistral template family.
// Reasoning is delimited by [THINK]...[/THINK]; tool calls are emitted as a
// JSON array prefixed by the [TOOL_CALLS] token.
static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;
    data.prompt = apply(tmpl, inputs);
    data.format = COMMON_CHAT_FORMAT_MAGISTRAL;
    // Keep the reasoning delimiters from being mangled during detokenization.
    data.preserved_tokens = {
        "[THINK]",
        "[/THINK]",
    };

    if (inputs.tools.is_array() && !inputs.tools.empty()) {
        // Lazy grammar: only constrain output once the trigger token appears,
        // unless the caller *requires* a tool call.
        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
            // One JSON schema per tool:
            // { name: <const>, arguments: <tool parameters>, id: 9 alnum chars }.
            auto schemas = json::array();
            foreach_function(inputs.tools, [&](const json & tool) {
                const auto & function = tool.at("function");
                schemas.push_back({
                    {"type", "object"},
                    {"properties", {
                        {"name", {
                            {"type", "string"},
                            {"const", function.at("name")},
                        }},
                        {"arguments", function.at("parameters")},
                        {"id", {
                            {"type", "string"},
                            {"pattern", "^[a-zA-Z0-9]{9}$"},
                        }},
                    }},
                    {"required", json::array({"name", "arguments", "id"})},
                });
            });
            // The model emits an array of tool calls; allow any of the tools.
            auto schema = json {
                {"type", "array"},
                {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
                {"minItems", 1},
            };
            if (!inputs.parallel_tool_calls) {
                // At most one call when parallel tool calls are disabled.
                schema["maxItems"] = 1;
            }
            builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
        });
        data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"});
        data.preserved_tokens.push_back("[TOOL_CALLS]");
    } else {
        // No tools: honor an explicit JSON schema or a raw grammar, which are
        // mutually exclusive.
        data.grammar_lazy = false;
        if (!inputs.json_schema.is_null()) {
            if (!inputs.grammar.empty()) {
                throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
            }
            data.grammar = json_schema_to_grammar(inputs.json_schema);
        } else {
            data.grammar = inputs.grammar;
        }
    }

    return data;
}
9711033
static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
9721034
if (!builder.syntax().parse_tool_calls) {
9731035
builder.add_content(builder.consume_rest());
@@ -978,6 +1040,18 @@ static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
9781040
parse_prefixed_json_tool_call_array(builder, prefix);
9791041
}
9801042

1043+
// Parses Magistral output: optional [THINK]...[/THINK] reasoning followed by
// either plain content or a [TOOL_CALLS]-prefixed JSON tool-call array.
static void common_chat_parse_magistral(common_chat_msg_parser & builder) {
    builder.try_parse_reasoning("[THINK]", "[/THINK]");

    if (builder.syntax().parse_tool_calls) {
        static const common_regex tool_calls_prefix(regex_escape("[TOOL_CALLS]"));
        parse_prefixed_json_tool_call_array(builder, tool_calls_prefix);
    } else {
        // Tool-call parsing disabled: everything after the reasoning is content.
        builder.add_content(builder.consume_rest());
    }
}
1054+
9811055
static common_chat_params common_chat_params_init_command_r7b(const common_chat_template & tmpl, const struct templates_params & inputs) {
9821056
common_chat_params data;
9831057

@@ -1250,6 +1324,75 @@ static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_
12501324
}
12511325
return data;
12521326
}
1327+
1328+
// Builds chat params for the Apertus template family.
// Reasoning is wrapped in <|inner_prefix|>...<|inner_suffix|>; tool calls are
// a JSON array wrapped in <|tools_prefix|>...<|tools_suffix|>, each entry in
// the short form { "tool_name": { ...arguments... } }.
static common_chat_params common_chat_params_init_apertus(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;

    // Generate the prompt using the apply() function with the template
    data.prompt = apply(tmpl, inputs);
    data.format = COMMON_CHAT_FORMAT_APERTUS;

    // Handle thinking tags appropriately based on inputs.enable_thinking
    if (string_ends_with(data.prompt, "<|inner_prefix|>")) {
        if (!inputs.enable_thinking) {
            // Thinking disabled: close the reasoning section immediately.
            data.prompt += "<|inner_suffix|>";
        } else {
            // The template left the reasoning section open; grammar and trigger
            // below must account for the already-consumed opening tag.
            data.thinking_forced_open = true;
        }
    }

    // When tools are present, build grammar for the <|tools_prefix|> format
    if (!inputs.tools.is_null() && inputs.tools.is_array() && !inputs.tools.empty()) {
        data.grammar_lazy = true;
        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
            // One schema per tool, matching the short form:
            // an object whose single required property is the tool's name.
            auto schemas = json::array();
            foreach_function(inputs.tools, [&](const json & tool) {
                const auto & function = tool.at("function");
                schemas.push_back({
                    { "type", "object" },
                    { "properties",
                      {
                          { function.at("name"), function.at("parameters") }
                      } },
                    { "required", json::array({ function.at("name") }) },
                });
            });
            auto schema = json{
                { "type", "array" },
                { "items", schemas.size() == 1 ? schemas[0] : json{ { "anyOf", schemas } } },
                { "minItems", 1 },
            };
            if (!inputs.parallel_tool_calls) {
                // At most one call when parallel tool calls are disabled.
                schema["maxItems"] = 1;
            }
            builder.add_rule("root",
                             std::string(data.thinking_forced_open ? "( \"<|inner_suffix|>\" space )? " : "") +
                                 "\"<|tools_prefix|>\"" + builder.add_schema("tool_calls", schema) + "\"<|tools_suffix|>\"");
        });
        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
                                          // If thinking_forced_open, then we capture the <|inner_suffix|> tag in the grammar,
                                          // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
                                          std::string(data.thinking_forced_open ?
                                                          "[\\s\\S]*?(<\\|inner_suffix\\|>\\s*)" :
                                                          "(?:<\\|inner_prefix\\|>[\\s\\S]*?<\\|inner_suffix\\|>\\s*)?") +
                                              "(<\\|tools_prefix\\|>)[\\s\\S]*" });
        // Keep the Apertus control tokens intact through detokenization.
        data.preserved_tokens = {
            "<|system_start|>",
            "<|system_end|>",
            "<|developer_start|>",
            "<|developer_end|>",
            "<|user_start|>",
            "<|user_end|>",
            "<|assistant_start|>",
            "<|assistant_end|>",
            "<|inner_prefix|>",
            "<|inner_suffix|>",
            "<|tools_prefix|>",
            "<|tools_suffix|>",
        };
    }
    return data;
}
12531396
static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
12541397
if (!builder.syntax().parse_tool_calls) {
12551398
builder.add_content(builder.consume_rest());
@@ -2309,6 +2452,37 @@ static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
23092452
builder.add_content(builder.consume_rest());
23102453
}
23112454

2455+
// Parses Apertus output: <|inner_prefix|>...<|inner_suffix|> reasoning followed
// by an optional <|tools_prefix|>[ ... ]<|tools_suffix|> tool-call section,
// then any remaining content.
static void common_chat_parse_apertus(common_chat_msg_parser & builder) {
    // Reasoning comes first.
    builder.try_parse_reasoning("<|inner_prefix|>", "<|inner_suffix|>");
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    // Scan for the tool-call section.
    static const common_regex tools_prefix_regex(regex_escape("<|tools_prefix|>"));
    if (auto match = builder.try_find_regex(tools_prefix_regex)) {
        builder.move_to(match->groups[0].end);

        auto payload = builder.consume_json();
        if (!payload.json.is_array()) {
            throw common_chat_msg_partial_exception("Incomplete tool call");
        }
        builder.consume_spaces();
        if (!builder.try_consume_literal("<|tools_suffix|>")) {
            throw common_chat_msg_partial_exception("Incomplete tool call");
        }
        // Each array entry is a short-form call { "tool_name": { ...args... } }.
        // Non-object entries — and entries add_tool_call_short_form rejects —
        // are silently skipped.
        for (const auto & entry : payload.json) {
            if (entry.is_object()) {
                builder.add_tool_call_short_form(entry);
            }
        }
    }
    builder.add_content(builder.consume_rest());
}
2485+
23122486
static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
23132487
// Parse thinking tags first - this handles the main reasoning content
23142488
builder.try_parse_reasoning("<seed:think>", "</seed:think>");
@@ -2553,6 +2727,11 @@ static common_chat_params common_chat_templates_apply_jinja(
25532727
return common_chat_params_init_nemotron_v2(tmpl, params);
25542728
}
25552729

2730+
// Apertus format detection
2731+
if (src.find("<|system_start|>") != std::string::npos && src.find("<|tools_prefix|>") != std::string::npos) {
2732+
return common_chat_params_init_apertus(tmpl, params);
2733+
}
2734+
25562735
// Use generic handler when mixing tools + JSON schema.
25572736
// TODO: support that mix in handlers below.
25582737
if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2581,6 +2760,10 @@ static common_chat_params common_chat_templates_apply_jinja(
25812760
return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
25822761
}
25832762

2763+
if (src.find("[THINK]") != std::string::npos && src.find("[/THINK]") != std::string::npos) {
2764+
return common_chat_params_init_magistral(tmpl, params);
2765+
}
2766+
25842767
// Plain handler (no tools)
25852768
if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
25862769
return common_chat_params_init_without_tools(tmpl, params);
@@ -2681,6 +2864,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
26812864
case COMMON_CHAT_FORMAT_MISTRAL_NEMO:
26822865
common_chat_parse_mistral_nemo(builder);
26832866
break;
2867+
case COMMON_CHAT_FORMAT_MAGISTRAL:
2868+
common_chat_parse_magistral(builder);
2869+
break;
26842870
case COMMON_CHAT_FORMAT_LLAMA_3_X:
26852871
common_chat_parse_llama_3_1(builder);
26862872
break;
@@ -2720,6 +2906,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
27202906
case COMMON_CHAT_FORMAT_NEMOTRON_V2:
27212907
common_chat_parse_nemotron_v2(builder);
27222908
break;
2909+
case COMMON_CHAT_FORMAT_APERTUS:
2910+
common_chat_parse_apertus(builder);
2911+
break;
27232912
default:
27242913
throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
27252914
}

cpp/chat.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ enum common_chat_format {
112112
COMMON_CHAT_FORMAT_CONTENT_ONLY,
113113
COMMON_CHAT_FORMAT_GENERIC,
114114
COMMON_CHAT_FORMAT_MISTRAL_NEMO,
115+
COMMON_CHAT_FORMAT_MAGISTRAL,
115116
COMMON_CHAT_FORMAT_LLAMA_3_X,
116117
COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
117118
COMMON_CHAT_FORMAT_DEEPSEEK_R1,
@@ -125,6 +126,7 @@ enum common_chat_format {
125126
COMMON_CHAT_FORMAT_GPT_OSS,
126127
COMMON_CHAT_FORMAT_SEED_OSS,
127128
COMMON_CHAT_FORMAT_NEMOTRON_V2,
129+
COMMON_CHAT_FORMAT_APERTUS,
128130

129131
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
130132
};

cpp/common.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1141,6 +1141,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
11411141
mparams.use_mlock = params.use_mlock;
11421142
mparams.check_tensors = params.check_tensors;
11431143
mparams.use_extra_bufts = !params.no_extra_bufts;
1144+
mparams.no_host = params.no_host;
11441145

11451146
if (params.kv_overrides.empty()) {
11461147
mparams.kv_overrides = NULL;

cpp/common.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,7 @@ struct common_params {
393393
bool check_tensors = false; // validate tensor data
394394
bool no_op_offload = false; // globally disable offload host tensor operations to device
395395
bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
396+
bool no_host = false; // bypass host buffer allowing extra buffers to be used
396397

397398
bool single_turn = false; // single turn chat conversation
398399

@@ -428,7 +429,7 @@ struct common_params {
428429
int32_t timeout_write = timeout_read; // http write timeout in seconds
429430
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
430431
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
431-
int32_t n_swa_checkpoints = 3; // max number of SWA checkpoints per slot
432+
int32_t n_ctx_checkpoints = 3; // max number of context checkpoints per slot
432433

433434
std::string hostname = "127.0.0.1";
434435
std::string public_path = ""; // NOLINT

0 commit comments

Comments
 (0)