From dc0d88628383a1a8d8c93d5e621b068ba7782999 Mon Sep 17 00:00:00 2001
From: Christina <truffle@gmail.com>
Date: Sat, 6 Dec 2025 09:34:41 -0600
Subject: [PATCH] docs: clarify --ctx-size and --parallel interaction in
 arg.cpp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When using --parallel N, the --ctx-size value is the TOTAL context,
which is divided evenly among all slots; it is not the per-slot context.
This is a common source of confusion (see #11681, #5732).

Examples:
- --ctx-size 4096 --parallel 4 → each slot gets 1024 tokens
- To get 4096 tokens per slot with 4 parallel slots, use --ctx-size 16384

Update the help text in arg.cpp (the source of the auto-generated docs)
for both the --ctx-size and --parallel flags to clarify this behavior.

Fixes #11681
---
 common/arg.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 9e062ee7a1b..5ad328d549a 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -920,7 +920,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
     add_opt(common_arg(
         {"-c", "--ctx-size"}, "N",
-        string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
+        string_format("size of the prompt context (default: %d, 0 = loaded from model). "
+            "Note: when using --parallel N, this is the TOTAL context divided among all slots, "
+            "not per-slot. For X tokens per slot with N parallel slots, use --ctx-size X*N", params.n_ctx),
         [](common_params & params, int value) {
             params.n_ctx = value;
         }
@@ -1756,7 +1758,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
     add_opt(common_arg(
         {"-np", "--parallel"}, "N",
-        string_format("number of parallel sequences to decode (default: %d)", params.n_parallel),
+        string_format("number of parallel sequences to decode (default: %d). "
+            "Note: total context (--ctx-size) is divided equally among parallel slots", params.n_parallel),
         [](common_params & params, int value) {
             params.n_parallel = value;
         }