Skip to content

Commit 0d1c070

Browse files
committed
Update handling of max-num-batched-tokens in CLI and SLURM scripts
1 parent cfd3e91 commit 0d1c070

File tree

5 files changed

+19
-11
lines changed

vec_inf/cli/_cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def cli() -> None:
5252
@click.option(
5353
"--max-num-batched-tokens",
5454
type=int,
55-
help="Maximum number of batched tokens per iteration, defaults to min(2048, max-num-seqs), pairs with --enable-chunked-prefill to control the batch size at the prefill stage",
55+
help="Maximum number of batched tokens per iteration, defaults to 2048 if --enable-chunked-prefill is set, else None",
5656
)
5757
@click.option(
5858
"--partition",

vec_inf/cli/_config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ class ModelConfig(BaseModel):
4747
max_num_seqs: int = Field(
4848
default=256, gt=0, le=1024, description="Maximum concurrent request sequences"
4949
)
50-
max_num_batched_tokens: int = Field(
51-
default=2048,
50+
max_num_batched_tokens: Optional[int] = Field(
51+
default=None,
5252
gt=0,
5353
le=1_000_000,
5454
description="Maximum batched tokens per iteration",

vec_inf/cli/_helper.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -117,10 +117,6 @@ def _get_launch_params(self) -> dict[str, Any]:
117117
if params["max_model_len"] > 32_000: # this is the default behavior of vLLM
118118
params["enable_chunked_prefill"] = True
119119

120-
params["max_num_batched_tokens"] = min(
121-
params["max_num_batched_tokens"], params["max_model_len"]
122-
)
123-
124120
# Validate required fields
125121
if not REQUIRED_FIELDS.issubset(set(params.keys())):
126122
raise click.ClickException(

vec_inf/multinode_vllm.slurm

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,12 @@ else
102102
export ENABLE_CHUNKED_PREFILL=""
103103
fi
104104

105+
if [ -z "$MAX_NUM_BATCHED_TOKENS" ] || [ "$MAX_NUM_BATCHED_TOKENS" = "None" ]; then
106+
export MAX_NUM_BATCHED_TOKENS=""
107+
else
108+
export MAX_NUM_BATCHED_TOKENS="--max-num-batched-tokens=$MAX_NUM_BATCHED_TOKENS"
109+
fi
110+
105111
# Activate vllm venv
106112
if [ "$VENV_BASE" = "singularity" ]; then
107113
singularity exec --nv --bind ${MODEL_WEIGHTS}:${MODEL_WEIGHTS} $SINGULARITY_IMAGE \
@@ -118,9 +124,9 @@ if [ "$VENV_BASE" = "singularity" ]; then
118124
--max-model-len ${MAX_MODEL_LEN} \
119125
--max-num-seqs ${MAX_NUM_SEQS} \
120126
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
121-
--max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS} \
122127
--compilation-config ${COMPILATION_CONFIG} \
123128
--task ${TASK} \
129+
${MAX_NUM_BATCHED_TOKENS} \
124130
${ENABLE_PREFIX_CACHING} \
125131
${ENABLE_CHUNKED_PREFILL} \
126132
${ENFORCE_EAGER}
@@ -139,9 +145,9 @@ else
139145
--max-model-len ${MAX_MODEL_LEN} \
140146
--max-num-seqs ${MAX_NUM_SEQS} \
141147
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
142-
--max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS} \
143148
--compilation-config ${COMPILATION_CONFIG} \
144149
--task ${TASK} \
150+
${MAX_NUM_BATCHED_TOKENS} \
145151
${ENABLE_PREFIX_CACHING} \
146152
${ENABLE_CHUNKED_PREFILL} \
147153
${ENFORCE_EAGER}

vec_inf/vllm.slurm

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,12 @@ else
3535
export ENABLE_CHUNKED_PREFILL=""
3636
fi
3737

38+
if [ -z "$MAX_NUM_BATCHED_TOKENS" ] || [ "$MAX_NUM_BATCHED_TOKENS" = "None" ]; then
39+
export MAX_NUM_BATCHED_TOKENS=""
40+
else
41+
export MAX_NUM_BATCHED_TOKENS="--max-num-batched-tokens=$MAX_NUM_BATCHED_TOKENS"
42+
fi
43+
3844
# Activate vllm venv
3945
if [ "$VENV_BASE" = "singularity" ]; then
4046
export SINGULARITY_IMAGE=/model-weights/vec-inf-shared/vector-inference_latest.sif
@@ -54,9 +60,9 @@ if [ "$VENV_BASE" = "singularity" ]; then
5460
--max-model-len ${MAX_MODEL_LEN} \
5561
--max-num-seqs ${MAX_NUM_SEQS} \
5662
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
57-
--max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS} \
5863
--compilation-config ${COMPILATION_CONFIG} \
5964
--task ${TASK} \
65+
${MAX_NUM_BATCHED_TOKENS} \
6066
${ENABLE_PREFIX_CACHING} \
6167
${ENABLE_CHUNKED_PREFILL} \
6268
${ENFORCE_EAGER}
@@ -75,9 +81,9 @@ else
7581
--max-model-len ${MAX_MODEL_LEN} \
7682
--max-num-seqs ${MAX_NUM_SEQS} \
7783
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
78-
--max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS} \
7984
--compilation-config ${COMPILATION_CONFIG} \
8085
--task ${TASK} \
86+
${MAX_NUM_BATCHED_TOKENS} \
8187
${ENABLE_PREFIX_CACHING} \
8288
${ENABLE_CHUNKED_PREFILL} \
8389
${ENFORCE_EAGER}

0 commit comments

Comments (0)