Skip to content

Commit 44bdb75

Browse files
committed
support chat parsing for gpt-oss
1 parent 6b30372 commit 44bdb75

File tree

5 files changed

+36
-4
lines changed

5 files changed

+36
-4
lines changed

common/arg.cpp

Lines changed: 2 additions & 1 deletion
```diff
@@ -2922,11 +2922,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
         "- none: leaves thoughts unparsed in `message.content`\n"
         "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
-        "(default: deepseek)",
+        "(default: auto)",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
             else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
             else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+            else if (value == "auto") { params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; }
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
```

common/chat.cpp

Lines changed: 29 additions & 0 deletions
```diff
@@ -592,6 +592,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
         case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
+        case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
         default:
             throw std::runtime_error("Unknown chat format");
     }
@@ -1289,6 +1290,26 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
         tool_calls_end);
 }

+static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+    auto prompt = apply(tmpl, inputs);
+
+    data.prompt = prompt;
+    data.format = COMMON_CHAT_FORMAT_GPT_OSS;
+
+    // TODO: support tool calls in GPT-OSS?
+
+    return data;
+}
+static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
+    // TODO @ngxson : this won't work with --special enabled, we should fix that
+    builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>");
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+}
+
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
     LOG_DBG("%s\n", __func__);
     common_chat_params data;
@@ -1774,6 +1795,11 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_hermes_2_pro(tmpl, params);
     }

+    // GPT-OSS
+    if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
+        return common_chat_params_init_gpt_oss(tmpl, params);
+    }
+
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -1925,6 +1951,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_COMMAND_R7B:
             common_chat_parse_command_r7b(builder);
             break;
+        case COMMON_CHAT_FORMAT_GPT_OSS:
+            common_chat_parse_gpt_oss(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }
```

common/chat.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -109,6 +109,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
+    COMMON_CHAT_FORMAT_GPT_OSS,

     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
```

common/common.h

Lines changed: 2 additions & 1 deletion
```diff
@@ -236,6 +236,7 @@ struct common_params_diffusion {

 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
+    COMMON_REASONING_FORMAT_AUTO,
     COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
     COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };
@@ -394,7 +395,7 @@ struct common_params {
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
     int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
```

src/llama-vocab.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -2330,9 +2330,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         }
     }

-    // @ngxson : quick hack for gpt-oss
+    // @ngxson : quick hack for gpt-oss, always render these tokens
     for (const auto & t : token_to_id) {
-        if (t.first == "<|channel|>" || t.first == "<|message|>") {
+        if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>") {
             id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
         }
     }
```

0 commit comments

Comments
 (0)