@@ -174,6 +174,7 @@ struct server_task_result {
     std::vector<llama_token> tokens;
 
     bool stream;
+    bool include_usage;
     std::string prompt;
     // slot_params generation_params;
 
@@ -501,22 +502,22 @@ struct server_task_result {
             {"model",   oaicompat_model},
             {"object",  "chat.completion.chunk"},
         });
-
-        // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
-        // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
-        deltas.push_back({
-            {"choices", json::array()},
-            {"created", t},
-            {"id",      oaicompat_cmpl_id},
-            {"model",   oaicompat_model},
-            {"object",  "chat.completion.chunk"},
-            {"usage", json {
-                {"completion_tokens", n_decoded},
-                {"prompt_tokens",     n_prompt_tokens},
-                {"total_tokens",      n_decoded + n_prompt_tokens},
-            }},
-        });
-
+        if (include_usage) {
+            // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
+            // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
+            deltas.push_back({
+                {"choices", json::array()},
+                {"created", t},
+                {"id",      oaicompat_cmpl_id},
+                {"model",   oaicompat_model},
+                {"object",  "chat.completion.chunk"},
+                {"usage", json {
+                    {"completion_tokens", n_decoded},
+                    {"prompt_tokens",     n_prompt_tokens},
+                    {"total_tokens",      n_decoded + n_prompt_tokens},
+                }},
+            });
+        }
         if (timings.prompt_n >= 0) {
             deltas.back().push_back({"timings", timings.to_json()});
         }
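For reference, the final chunk emitted by the new `if (include_usage)` block follows the linked OpenAI streaming spec: an empty `choices` array plus a `usage` object. Below is a minimal standalone sketch (not part of the patch) that builds such a chunk with nlohmann::json; the id, model, timestamp, and token counts are placeholder values, not taken from the server code.

```cpp
#include <iostream>
#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

int main() {
    const int n_decoded       = 32;   // placeholder: number of generated tokens
    const int n_prompt_tokens = 128;  // placeholder: number of prompt tokens

    // Usage-only chunk: empty "choices" array, token counts under "usage".
    json usage_chunk = {
        {"choices", json::array()},
        {"created", 1700000000},              // placeholder unix timestamp
        {"id",      "chatcmpl-example"},      // placeholder completion id
        {"model",   "example-model"},         // placeholder model name
        {"object",  "chat.completion.chunk"},
        {"usage", {
            {"completion_tokens", n_decoded},
            {"prompt_tokens",     n_prompt_tokens},
            {"total_tokens",      n_decoded + n_prompt_tokens},
        }},
    };

    // Printed as a server-sent event line, the way a streaming client sees it.
    std::cout << "data: " << usage_chunk.dump() << "\n\n";
    return 0;
}
```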
@@ -548,6 +549,7 @@ struct server_task_multi {
 
 struct slot_params {
     bool stream = true;
+    bool include_usage = false;
     bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
 
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -1360,7 +1362,7 @@ struct server_context {
         // thinking is enabled if:
         // 1. It's not explicitly disabled (reasoning_budget == 0)
         // 2. The chat template supports it
-        const bool enable_thinking = params.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
+        const bool enable_thinking = params.use_jinja && params.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
         // LLAMA_LOG_INFO("Enable thinking? %d\n", enable_thinking);
 
         oai_parser_opt = {
@@ -1515,6 +1517,8 @@ struct server_context {
         }
         slot.params.timings_per_token = json_value(data, "timings_per_token", false);
         slot.params.stream = json_value(data, "stream", false);
+        auto stream_opt = json_value(data, "stream_options", json::object());
+        slot.params.include_usage = json_value(stream_opt, "include_usage", false);
         slot.params.cache_prompt = json_value(data, "cache_prompt", true);
         slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict));
         slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
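The new parsing mirrors the OpenAI streaming API: the client opts into usage reporting via `stream_options.include_usage` in the request body. A minimal standalone sketch (not part of the patch) of that request shape is below, assuming nlohmann::json; `json_value_sketch` is a simplified, hypothetical stand-in for the server's `json_value` helper and only covers the present-else-default case.

```cpp
#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Simplified stand-in for the server's json_value helper: return the value
// stored under `key` if present and non-null, otherwise the default.
template <typename T>
static T json_value_sketch(const json & body, const std::string & key, const T & def) {
    return body.contains(key) && !body.at(key).is_null() ? body.at(key).get<T>() : def;
}

int main() {
    // Example OpenAI-compatible streaming request that asks for usage stats.
    const json data = json::parse(R"({
        "model": "example-model",
        "stream": true,
        "stream_options": { "include_usage": true },
        "messages": [ { "role": "user", "content": "Hello" } ]
    })");

    const bool stream        = json_value_sketch(data, "stream", false);
    const json stream_opt    = json_value_sketch(data, "stream_options", json::object());
    const bool include_usage = json_value_sketch(stream_opt, "include_usage", false);

    std::cout << "stream=" << stream << " include_usage=" << include_usage << std::endl;
    return 0;
}
```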
@@ -2207,6 +2211,7 @@ struct server_context {
         res.error = false;
         res.stop = true; // to do: set value
         res.stream = slot.params.stream;
+        res.include_usage = slot.params.include_usage;
         res.content = slot.generated_text;
         res.oaicompat = slot.params.oaicompat;
         res.oaicompat_model = slot.params.oaicompat_model;