Skip to content

Commit 9206ae1

Browse files
committed
feat: sync llama.cpp to b6709
Updates llama.cpp from b6638 to b6709, adding LFM2-MoE architecture support.

Changes:
- Updated third_party/llama.cpp submodule to b6709
- Synced cpp/ directory via scripts/bootstrap.sh
- Added LLM_ARCH_LFM2MOE for LiquidAI hybrid models
- Updated version.ts to build 6709

Tested with LiquidAI LFM2-1.2B models on iOS.

References:
- Release: https://github.com/ggml-org/llama.cpp/releases/tag/b6709
- PR: ggml-org/llama.cpp#16464
1 parent c56b150 commit 9206ae1

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+1524
-386
lines changed

cpp/chat-parser.cpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,35 @@ bool common_chat_msg_parser::add_tool_calls(const json & arr) {
7575
}
7676
return true;
7777
}
78+
79+
bool common_chat_msg_parser::add_tool_call_short_form(const json & tool_call) {
80+
if (!tool_call.is_object() || tool_call.size() != 1) {
81+
return false;
82+
}
83+
84+
// Get the tool name (the single key in the object)
85+
auto it = tool_call.begin();
86+
std::string name = it.key();
87+
88+
if (name.empty()) {
89+
return false;
90+
}
91+
92+
// Get the arguments (the nested object)
93+
const json & args_json = it.value();
94+
std::string arguments = "";
95+
96+
if (args_json.is_object()) {
97+
arguments = args_json.dump();
98+
} else if (args_json.is_string()) {
99+
arguments = args_json;
100+
} else if (!args_json.is_null()) {
101+
// For other types, convert to string representation
102+
arguments = args_json.dump();
103+
}
104+
105+
return add_tool_call(name, "", arguments);
106+
}
78107
void common_chat_msg_parser::finish() {
79108
if (!is_partial_ && pos_ != input_.size()) {
80109
throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_));

cpp/chat-parser.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@ class common_chat_msg_parser {
6464
// Adds an array of tool calls using their "name", "id" and "arguments" fields.
6565
bool add_tool_calls(const nlohmann::ordered_json & arr);
6666

67+
// Adds a tool call using the short form: { "tool_name": { "arg1": val, "arg2": val } }
68+
bool add_tool_call_short_form(const nlohmann::ordered_json & tool_call);
69+
6770
void finish();
6871

6972
bool consume_spaces();

cpp/chat.cpp

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -612,6 +612,7 @@ const char * common_chat_format_name(common_chat_format format) {
612612
case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
613613
case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
614614
case COMMON_CHAT_FORMAT_MISTRAL_NEMO: return "Mistral Nemo";
615+
case COMMON_CHAT_FORMAT_MAGISTRAL: return "Magistral";
615616
case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x";
616617
case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools";
617618
case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1";
@@ -625,6 +626,7 @@ const char * common_chat_format_name(common_chat_format format) {
625626
case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
626627
case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
627628
case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
629+
case COMMON_CHAT_FORMAT_APERTUS: return "Apertus";
628630
default:
629631
throw std::runtime_error("Unknown chat format");
630632
}
@@ -788,6 +790,7 @@ static std::string apply(
788790
}
789791
tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
790792
tmpl_inputs.extra_context = inputs.extra_context;
793+
tmpl_inputs.extra_context["enable_thinking"] = inputs.enable_thinking;
791794
if (additional_context) {
792795
tmpl_inputs.extra_context.merge_patch(*additional_context);
793796
}
@@ -968,6 +971,65 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
968971
data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
969972
return data;
970973
}
974+
975+
// Builds chat params for the Magistral template family.
// Reasoning is delimited by [THINK]...[/THINK]; tool calls are emitted as a
// JSON array prefixed by the [TOOL_CALLS] token.
static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;
    data.prompt = apply(tmpl, inputs);
    data.format = COMMON_CHAT_FORMAT_MAGISTRAL;
    // Keep the reasoning delimiters from being mangled during detokenization.
    data.preserved_tokens = {
        "[THINK]",
        "[/THINK]",
    };

    if (inputs.tools.is_array() && !inputs.tools.empty()) {
        // Lazy grammar: only constrain output once the trigger token appears,
        // unless the caller *requires* a tool call.
        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
            // One JSON schema per tool:
            // { name: <const>, arguments: <tool parameters>, id: 9 alnum chars }.
            auto schemas = json::array();
            foreach_function(inputs.tools, [&](const json & tool) {
                const auto & function = tool.at("function");
                schemas.push_back({
                    {"type", "object"},
                    {"properties", {
                        {"name", {
                            {"type", "string"},
                            {"const", function.at("name")},
                        }},
                        {"arguments", function.at("parameters")},
                        {"id", {
                            {"type", "string"},
                            {"pattern", "^[a-zA-Z0-9]{9}$"},
                        }},
                    }},
                    {"required", json::array({"name", "arguments", "id"})},
                });
            });
            // The model emits an array of tool calls; allow any of the tools.
            auto schema = json {
                {"type", "array"},
                {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
                {"minItems", 1},
            };
            if (!inputs.parallel_tool_calls) {
                // At most one call when parallel tool calls are disabled.
                schema["maxItems"] = 1;
            }
            builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
        });
        data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"});
        data.preserved_tokens.push_back("[TOOL_CALLS]");
    } else {
        // No tools: honor an explicit JSON schema or a raw grammar, which are
        // mutually exclusive.
        data.grammar_lazy = false;
        if (!inputs.json_schema.is_null()) {
            if (!inputs.grammar.empty()) {
                throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
            }
            data.grammar = json_schema_to_grammar(inputs.json_schema);
        } else {
            data.grammar = inputs.grammar;
        }
    }

    return data;
}
9711033
static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
9721034
if (!builder.syntax().parse_tool_calls) {
9731035
builder.add_content(builder.consume_rest());
@@ -978,6 +1040,18 @@ static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
9781040
parse_prefixed_json_tool_call_array(builder, prefix);
9791041
}
9801042

1043+
// Parses Magistral output: optional [THINK]...[/THINK] reasoning followed by
// either plain content or a [TOOL_CALLS]-prefixed JSON tool-call array.
static void common_chat_parse_magistral(common_chat_msg_parser & builder) {
    builder.try_parse_reasoning("[THINK]", "[/THINK]");

    if (builder.syntax().parse_tool_calls) {
        static const common_regex tool_calls_prefix(regex_escape("[TOOL_CALLS]"));
        parse_prefixed_json_tool_call_array(builder, tool_calls_prefix);
    } else {
        // Tool-call parsing disabled: everything after the reasoning is content.
        builder.add_content(builder.consume_rest());
    }
}
1054+
9811055
static common_chat_params common_chat_params_init_command_r7b(const common_chat_template & tmpl, const struct templates_params & inputs) {
9821056
common_chat_params data;
9831057

@@ -1250,6 +1324,75 @@ static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_
12501324
}
12511325
return data;
12521326
}
1327+
1328+
// Builds chat params for the Apertus template family.
// Reasoning is wrapped in <|inner_prefix|>...<|inner_suffix|>; tool calls are
// a JSON array wrapped in <|tools_prefix|>...<|tools_suffix|>, each entry in
// the short form { "tool_name": { ...arguments... } }.
static common_chat_params common_chat_params_init_apertus(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;

    // Generate the prompt using the apply() function with the template
    data.prompt = apply(tmpl, inputs);
    data.format = COMMON_CHAT_FORMAT_APERTUS;

    // Handle thinking tags appropriately based on inputs.enable_thinking
    if (string_ends_with(data.prompt, "<|inner_prefix|>")) {
        if (!inputs.enable_thinking) {
            // Thinking disabled: close the reasoning section immediately.
            data.prompt += "<|inner_suffix|>";
        } else {
            // The template left the reasoning section open; grammar and trigger
            // below must account for the already-consumed opening tag.
            data.thinking_forced_open = true;
        }
    }

    // When tools are present, build grammar for the <|tools_prefix|> format
    if (!inputs.tools.is_null() && inputs.tools.is_array() && !inputs.tools.empty()) {
        data.grammar_lazy = true;
        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
            // One schema per tool, matching the short form:
            // an object whose single required property is the tool's name.
            auto schemas = json::array();
            foreach_function(inputs.tools, [&](const json & tool) {
                const auto & function = tool.at("function");
                schemas.push_back({
                    { "type", "object" },
                    { "properties",
                      {
                          { function.at("name"), function.at("parameters") }
                      } },
                    { "required", json::array({ function.at("name") }) },
                });
            });
            auto schema = json{
                { "type", "array" },
                { "items", schemas.size() == 1 ? schemas[0] : json{ { "anyOf", schemas } } },
                { "minItems", 1 },
            };
            if (!inputs.parallel_tool_calls) {
                // At most one call when parallel tool calls are disabled.
                schema["maxItems"] = 1;
            }
            builder.add_rule("root",
                             std::string(data.thinking_forced_open ? "( \"<|inner_suffix|>\" space )? " : "") +
                                 "\"<|tools_prefix|>\"" + builder.add_schema("tool_calls", schema) + "\"<|tools_suffix|>\"");
        });
        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
                                          // If thinking_forced_open, then we capture the <|inner_suffix|> tag in the grammar,
                                          // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
                                          std::string(data.thinking_forced_open ?
                                                          "[\\s\\S]*?(<\\|inner_suffix\\|>\\s*)" :
                                                          "(?:<\\|inner_prefix\\|>[\\s\\S]*?<\\|inner_suffix\\|>\\s*)?") +
                                              "(<\\|tools_prefix\\|>)[\\s\\S]*" });
        // Keep the Apertus control tokens intact through detokenization.
        data.preserved_tokens = {
            "<|system_start|>",
            "<|system_end|>",
            "<|developer_start|>",
            "<|developer_end|>",
            "<|user_start|>",
            "<|user_end|>",
            "<|assistant_start|>",
            "<|assistant_end|>",
            "<|inner_prefix|>",
            "<|inner_suffix|>",
            "<|tools_prefix|>",
            "<|tools_suffix|>",
        };
    }
    return data;
}
12531396
static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
12541397
if (!builder.syntax().parse_tool_calls) {
12551398
builder.add_content(builder.consume_rest());
@@ -2309,6 +2452,37 @@ static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
23092452
builder.add_content(builder.consume_rest());
23102453
}
23112454

2455+
// Parses Apertus output: <|inner_prefix|>...<|inner_suffix|> reasoning followed
// by an optional <|tools_prefix|>[ ... ]<|tools_suffix|> tool-call section,
// then any remaining content.
static void common_chat_parse_apertus(common_chat_msg_parser & builder) {
    // Reasoning comes first.
    builder.try_parse_reasoning("<|inner_prefix|>", "<|inner_suffix|>");
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    // Scan for the tool-call section.
    static const common_regex tools_prefix_regex(regex_escape("<|tools_prefix|>"));
    if (auto match = builder.try_find_regex(tools_prefix_regex)) {
        builder.move_to(match->groups[0].end);

        auto payload = builder.consume_json();
        if (!payload.json.is_array()) {
            throw common_chat_msg_partial_exception("Incomplete tool call");
        }
        builder.consume_spaces();
        if (!builder.try_consume_literal("<|tools_suffix|>")) {
            throw common_chat_msg_partial_exception("Incomplete tool call");
        }
        // Each array entry is a short-form call { "tool_name": { ...args... } }.
        // Non-object entries — and entries add_tool_call_short_form rejects —
        // are silently skipped.
        for (const auto & entry : payload.json) {
            if (entry.is_object()) {
                builder.add_tool_call_short_form(entry);
            }
        }
    }
    builder.add_content(builder.consume_rest());
}
2485+
23122486
static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
23132487
// Parse thinking tags first - this handles the main reasoning content
23142488
builder.try_parse_reasoning("<seed:think>", "</seed:think>");
@@ -2553,6 +2727,11 @@ static common_chat_params common_chat_templates_apply_jinja(
25532727
return common_chat_params_init_nemotron_v2(tmpl, params);
25542728
}
25552729

2730+
// Apertus format detection
2731+
if (src.find("<|system_start|>") != std::string::npos && src.find("<|tools_prefix|>") != std::string::npos) {
2732+
return common_chat_params_init_apertus(tmpl, params);
2733+
}
2734+
25562735
// Use generic handler when mixing tools + JSON schema.
25572736
// TODO: support that mix in handlers below.
25582737
if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2581,6 +2760,10 @@ static common_chat_params common_chat_templates_apply_jinja(
25812760
return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
25822761
}
25832762

2763+
if (src.find("[THINK]") != std::string::npos && src.find("[/THINK]") != std::string::npos) {
2764+
return common_chat_params_init_magistral(tmpl, params);
2765+
}
2766+
25842767
// Plain handler (no tools)
25852768
if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
25862769
return common_chat_params_init_without_tools(tmpl, params);
@@ -2681,6 +2864,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
26812864
case COMMON_CHAT_FORMAT_MISTRAL_NEMO:
26822865
common_chat_parse_mistral_nemo(builder);
26832866
break;
2867+
case COMMON_CHAT_FORMAT_MAGISTRAL:
2868+
common_chat_parse_magistral(builder);
2869+
break;
26842870
case COMMON_CHAT_FORMAT_LLAMA_3_X:
26852871
common_chat_parse_llama_3_1(builder);
26862872
break;
@@ -2720,6 +2906,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
27202906
case COMMON_CHAT_FORMAT_NEMOTRON_V2:
27212907
common_chat_parse_nemotron_v2(builder);
27222908
break;
2909+
case COMMON_CHAT_FORMAT_APERTUS:
2910+
common_chat_parse_apertus(builder);
2911+
break;
27232912
default:
27242913
throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
27252914
}

cpp/chat.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ enum common_chat_format {
112112
COMMON_CHAT_FORMAT_CONTENT_ONLY,
113113
COMMON_CHAT_FORMAT_GENERIC,
114114
COMMON_CHAT_FORMAT_MISTRAL_NEMO,
115+
COMMON_CHAT_FORMAT_MAGISTRAL,
115116
COMMON_CHAT_FORMAT_LLAMA_3_X,
116117
COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
117118
COMMON_CHAT_FORMAT_DEEPSEEK_R1,
@@ -125,6 +126,7 @@ enum common_chat_format {
125126
COMMON_CHAT_FORMAT_GPT_OSS,
126127
COMMON_CHAT_FORMAT_SEED_OSS,
127128
COMMON_CHAT_FORMAT_NEMOTRON_V2,
129+
COMMON_CHAT_FORMAT_APERTUS,
128130

129131
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
130132
};

cpp/common.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1141,6 +1141,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
11411141
mparams.use_mlock = params.use_mlock;
11421142
mparams.check_tensors = params.check_tensors;
11431143
mparams.use_extra_bufts = !params.no_extra_bufts;
1144+
mparams.no_host = params.no_host;
11441145

11451146
if (params.kv_overrides.empty()) {
11461147
mparams.kv_overrides = NULL;

cpp/common.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,7 @@ struct common_params {
393393
bool check_tensors = false; // validate tensor data
394394
bool no_op_offload = false; // globally disable offload host tensor operations to device
395395
bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
396+
bool no_host = false; // bypass host buffer allowing extra buffers to be used
396397

397398
bool single_turn = false; // single turn chat conversation
398399

@@ -428,7 +429,7 @@ struct common_params {
428429
int32_t timeout_write = timeout_read; // http write timeout in seconds
429430
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
430431
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
431-
int32_t n_swa_checkpoints = 3; // max number of SWA checkpoints per slot
432+
int32_t n_ctx_checkpoints = 3; // max number of context checkpoints per slot
432433

433434
std::string hostname = "127.0.0.1";
434435
std::string public_path = ""; // NOLINT

0 commit comments

Comments
 (0)