Skip to content

Commit bc750d5

Browse files
author
firecoperana
committed
handle reasoning content in webui
server : include usage statistics only when users request them (#16052)
server : only attempt to enable thinking if using jinja (#15967)
1 parent 37aba08 commit bc750d5

File tree

4 files changed

+81
-55
lines changed

4 files changed

+81
-55
lines changed
192 Bytes
Binary file not shown.

examples/server/server.cpp

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ struct server_task_result {
174174
std::vector<llama_token> tokens;
175175

176176
bool stream;
177+
bool include_usage;
177178
std::string prompt;
178179
//slot_params generation_params;
179180

@@ -501,22 +502,22 @@ struct server_task_result {
501502
{"model", oaicompat_model},
502503
{"object", "chat.completion.chunk"},
503504
});
504-
505-
// OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
506-
// https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
507-
deltas.push_back({
508-
{"choices", json::array()},
509-
{"created", t},
510-
{"id", oaicompat_cmpl_id},
511-
{"model", oaicompat_model},
512-
{"object", "chat.completion.chunk"},
513-
{"usage", json {
514-
{"completion_tokens", n_decoded},
515-
{"prompt_tokens", n_prompt_tokens},
516-
{"total_tokens", n_decoded + n_prompt_tokens},
517-
}},
518-
});
519-
505+
if (include_usage) {
506+
// OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
507+
// https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
508+
deltas.push_back({
509+
{"choices", json::array()},
510+
{"created", t},
511+
{"id", oaicompat_cmpl_id},
512+
{"model", oaicompat_model},
513+
{"object", "chat.completion.chunk"},
514+
{"usage", json {
515+
{"completion_tokens", n_decoded},
516+
{"prompt_tokens", n_prompt_tokens},
517+
{"total_tokens", n_decoded + n_prompt_tokens},
518+
}},
519+
});
520+
}
520521
if (timings.prompt_n >= 0) {
521522
deltas.back().push_back({ "timings", timings.to_json() });
522523
}
@@ -548,6 +549,7 @@ struct server_task_multi {
548549

549550
struct slot_params {
550551
bool stream = true;
552+
bool include_usage = false;
551553
bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
552554

553555
int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -1360,7 +1362,7 @@ struct server_context {
13601362
// thinking is enabled if:
13611363
// 1. It's not explicitly disabled (reasoning_budget == 0)
13621364
// 2. The chat template supports it
1363-
const bool enable_thinking = params.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
1365+
const bool enable_thinking = params.use_jinja && params.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
13641366
//LLAMA_LOG_INFO("Enable thinking? %d\n", enable_thinking);
13651367

13661368
oai_parser_opt = {
@@ -1515,6 +1517,8 @@ struct server_context {
15151517
}
15161518
slot.params.timings_per_token = json_value(data, "timings_per_token", false);
15171519
slot.params.stream = json_value(data, "stream", false);
1520+
auto stream_opt = json_value(data, "stream_options", json::object());
1521+
slot.params.include_usage = json_value(stream_opt, "include_usage", false);
15181522
slot.params.cache_prompt = json_value(data, "cache_prompt", true);
15191523
slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict));
15201524
slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
@@ -2207,6 +2211,7 @@ struct server_context {
22072211
res.error = false;
22082212
res.stop = true; // to do: set value
22092213
res.stream = slot.params.stream;
2214+
res.include_usage = slot.params.include_usage;
22102215
res.content = slot.generated_text;
22112216
res.oaicompat = slot.params.oaicompat;
22122217
res.oaicompat_model = slot.params.oaicompat_model;

0 commit comments

Comments
 (0)