From dc0d88628383a1a8d8c93d5e621b068ba7782999 Mon Sep 17 00:00:00 2001 From: Christina Date: Sat, 6 Dec 2025 09:34:41 -0600 Subject: [PATCH] docs: clarify --ctx-size and --parallel interaction in arg.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using --parallel N, the --ctx-size value is the TOTAL context divided among all slots, not the per-slot context. This is a common source of confusion (see #11681, #5732). Examples: - --ctx-size 4096 --parallel 4 → each slot gets 1024 tokens - To get 4096 tokens per slot with 4 parallel slots, use --ctx-size 16384 Updated the help text in arg.cpp (the source for auto-generated docs) for both --ctx-size and --parallel flags to clarify this behavior. Fixes #11681 --- common/arg.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 9e062ee7a1b..5ad328d549a 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -920,7 +920,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-c", "--ctx-size"}, "N", - string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), + string_format("size of the prompt context (default: %d, 0 = loaded from model). " + "Note: when using --parallel N, this is the TOTAL context divided among all slots, " + "not per-slot. For X tokens per slot with N parallel slots, use --ctx-size X*N", params.n_ctx), [](common_params & params, int value) { params.n_ctx = value; } @@ -1756,7 +1758,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_ARG_DEFRAG_THOLD")); add_opt(common_arg( {"-np", "--parallel"}, "N", - string_format("number of parallel sequences to decode (default: %d)", params.n_parallel), + string_format("number of parallel sequences to decode (default: %d). " + "Note: total context (--ctx-size) is divided equally among parallel slots", params.n_parallel), [](common_params & params, int value) { params.n_parallel = value; }