@@ -1291,10 +1291,11 @@ struct server_context_impl {

         res->index = slot.task->index;
         // in stream mode, content and tokens are already in last partial chunk
-        res->content = slot.task->params.stream ? "" : slot.generated_text;
         if (slot.task->params.stream) {
+            res->content = "";
             res->tokens = llama_tokens{};
         } else {
+            res->content = std::move(slot.generated_text);
             res->tokens = std::move(slot.generated_tokens);
         }
         res->timings = slot.get_timings();
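The reordering above makes the ownership transfer explicit: in stream mode the text and tokens were already delivered through partial chunks, so the final result stays empty, while in non-stream mode both buffers are now moved out of the slot instead of copied. A minimal standalone sketch of the same pattern, using hypothetical slot_t/result_t stand-ins rather than the server's real types:

    #include <string>
    #include <utility>
    #include <vector>

    // hypothetical stand-ins for the server's slot and result types
    struct result_t {
        std::string      content;
        std::vector<int> tokens;
    };

    struct slot_t {
        bool             stream = false;
        std::string      generated_text;
        std::vector<int> generated_tokens;
    };

    result_t finalize(slot_t & slot) {
        result_t res;
        if (slot.stream) {
            // partial chunks already carried the data; the final result stays empty
            res.content = "";
            res.tokens  = {};
        } else {
            // transfer ownership instead of copying the accumulated buffers
            res.content = std::move(slot.generated_text);
            res.tokens  = std::move(slot.generated_tokens);
        }
        return res;
    }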
@@ -2591,6 +2592,7 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
         inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
     }
     tasks.reserve(inputs.size());
+    states.reserve(inputs.size());
     for (size_t i = 0; i < inputs.size(); i++) {
         server_task task = server_task(type);

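Reserving states alongside tasks is a small consistency fix: both vectors gain exactly one element per input prompt inside the loop, so sizing them up front avoids incremental reallocation. A short illustration of the pattern, with placeholder element types rather than the server's task/state structs:

    #include <cstddef>
    #include <vector>

    // reserve both output vectors once, then append in lockstep;
    // int is a placeholder for the real task and state types
    void fill(std::vector<int> & tasks, std::vector<int> & states, size_t n) {
        tasks.reserve(n);
        states.reserve(n);
        for (size_t i = 0; i < n; i++) {
            tasks.push_back(static_cast<int>(i));
            states.push_back(static_cast<int>(i));
        }
    }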
@@ -2608,9 +2610,9 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
         task.params.res_type = res_type;
         task.params.oaicompat_cmpl_id = completion_id;
         task.params.oaicompat_model = ctx_server.model_name;
-        states.emplace_back(task.params.oaicompat_chat_syntax);

         tasks.push_back(std::move(task));
+        states.push_back(task.params.oaicompat_chat_syntax);
     }

     rd.post_tasks(std::move(tasks));
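Note the new ordering: states.push_back reads task.params.oaicompat_chat_syntax after the task has been moved into tasks. That is only sound if the syntax member consists of trivially copyable fields (enums and flags), in which case moving the parent struct simply copies it and leaves the source intact. The sketch below demonstrates that assumption with hypothetical types:

    #include <cassert>
    #include <string>
    #include <utility>
    #include <vector>

    // hypothetical flags-only syntax descriptor: trivially copyable,
    // so moving an enclosing struct leaves the source member unchanged
    struct chat_syntax_t {
        int  format           = 0;
        bool parse_tool_calls = true;
    };

    struct task_t {
        std::string   id;     // movable member: valid but unspecified after a move
        chat_syntax_t syntax; // trivially copyable member: unaffected by the move
    };

    int main() {
        std::vector<task_t>        tasks;
        std::vector<chat_syntax_t> states;

        task_t task;
        task.syntax.format = 7;

        tasks.push_back(std::move(task));
        // reading the flags-only member after the move still yields 7
        states.push_back(task.syntax);
        assert(states.back().format == 7);
        return 0;
    }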
@@ -2639,7 +2641,6 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
             // if single request, return single object instead of array
             res->ok(arr.size() == 1 ? arr[0] : arr);
         }
-
     } else {
         // in streaming mode, the first error must be treated as non-stream response
         // this is to match the OAI API behavior