Skip to content

Commit 0d1c070

Browse files
committed
Update handling of max-num-batched-tokens in CLI and SLURM scripts
1 parent cfd3e91 commit 0d1c070

File tree

5 files changed

+19
-11
lines changed

vec_inf/cli/_cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def cli() -> None:
5252
@click.option(
5353
"--max-num-batched-tokens",
5454
type=int,
55-
help="Maximum number of batched tokens per iteration, defaults to min(2048, max-num-seqs), pairs with --enable-chunked-prefill to control the batch size at the prefill stage",
55+
help="Maximum number of batched tokens per iteration, defaults to 2048 if --enable-chunked-prefill is set, else None",
5656
)
5757
@click.option(
5858
"--partition",

vec_inf/cli/_config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ class ModelConfig(BaseModel):
4747
max_num_seqs: int = Field(
4848
default=256, gt=0, le=1024, description="Maximum concurrent request sequences"
4949
)
50-
max_num_batched_tokens: int = Field(
51-
default=2048,
50+
max_num_batched_tokens: Optional[int] = Field(
51+
default=None,
5252
gt=0,
5353
le=1_000_000,
5454
description="Maximum batched tokens per iteration",

vec_inf/cli/_helper.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -117,10 +117,6 @@ def _get_launch_params(self) -> dict[str, Any]:
117117
if params["max_model_len"] > 32_000: # this is the default behavior of vLLM
118118
params["enable_chunked_prefill"] = True
119119

120-
params["max_num_batched_tokens"] = min(
121-
params["max_num_batched_tokens"], params["max_model_len"]
122-
)
123-
124120
# Validate required fields
125121
if not REQUIRED_FIELDS.issubset(set(params.keys())):
126122
raise click.ClickException(

vec_inf/multinode_vllm.slurm

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,12 @@ else
102102
export ENABLE_CHUNKED_PREFILL=""
103103
fi
104104

105+
if [ -z "$MAX_NUM_BATCHED_TOKENS" ] || [ "$MAX_NUM_BATCHED_TOKENS" = "None" ]; then
106+
export MAX_NUM_BATCHED_TOKENS=""
107+
else
108+
export MAX_NUM_BATCHED_TOKENS="--max-num-batched-tokens=$MAX_NUM_BATCHED_TOKENS"
109+
fi
110+
105111
# Activate vllm venv
106112
if [ "$VENV_BASE" = "singularity" ]; then
107113
singularity exec --nv --bind ${MODEL_WEIGHTS}:${MODEL_WEIGHTS} $SINGULARITY_IMAGE \
@@ -118,9 +124,9 @@ if [ "$VENV_BASE" = "singularity" ]; then
118124
--max-model-len ${MAX_MODEL_LEN} \
119125
--max-num-seqs ${MAX_NUM_SEQS} \
120126
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
121-
--max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS} \
122127
--compilation-config ${COMPILATION_CONFIG} \
123128
--task ${TASK} \
129+
${MAX_NUM_BATCHED_TOKENS} \
124130
${ENABLE_PREFIX_CACHING} \
125131
${ENABLE_CHUNKED_PREFILL} \
126132
${ENFORCE_EAGER}
@@ -139,9 +145,9 @@ else
139145
--max-model-len ${MAX_MODEL_LEN} \
140146
--max-num-seqs ${MAX_NUM_SEQS} \
141147
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
142-
--max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS} \
143148
--compilation-config ${COMPILATION_CONFIG} \
144149
--task ${TASK} \
150+
${MAX_NUM_BATCHED_TOKENS} \
145151
${ENABLE_PREFIX_CACHING} \
146152
${ENABLE_CHUNKED_PREFILL} \
147153
${ENFORCE_EAGER}

vec_inf/vllm.slurm

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,12 @@ else
3535
export ENABLE_CHUNKED_PREFILL=""
3636
fi
3737

38+
if [ -z "$MAX_NUM_BATCHED_TOKENS" ] || [ "$MAX_NUM_BATCHED_TOKENS" = "None" ]; then
39+
export MAX_NUM_BATCHED_TOKENS=""
40+
else
41+
export MAX_NUM_BATCHED_TOKENS="--max-num-batched-tokens=$MAX_NUM_BATCHED_TOKENS"
42+
fi
43+
3844
# Activate vllm venv
3945
if [ "$VENV_BASE" = "singularity" ]; then
4046
export SINGULARITY_IMAGE=/model-weights/vec-inf-shared/vector-inference_latest.sif
@@ -54,9 +60,9 @@ if [ "$VENV_BASE" = "singularity" ]; then
5460
--max-model-len ${MAX_MODEL_LEN} \
5561
--max-num-seqs ${MAX_NUM_SEQS} \
5662
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
57-
--max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS} \
5863
--compilation-config ${COMPILATION_CONFIG} \
5964
--task ${TASK} \
65+
${MAX_NUM_BATCHED_TOKENS} \
6066
${ENABLE_PREFIX_CACHING} \
6167
${ENABLE_CHUNKED_PREFILL} \
6268
${ENFORCE_EAGER}
@@ -75,9 +81,9 @@ else
7581
--max-model-len ${MAX_MODEL_LEN} \
7682
--max-num-seqs ${MAX_NUM_SEQS} \
7783
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
78-
--max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS} \
7984
--compilation-config ${COMPILATION_CONFIG} \
8085
--task ${TASK} \
86+
${MAX_NUM_BATCHED_TOKENS} \
8187
${ENABLE_PREFIX_CACHING} \
8288
${ENABLE_CHUNKED_PREFILL} \
8389
${ENFORCE_EAGER}

0 commit comments

Comments (0)