Skip to content

Commit 2607f0d

Browse files
committed
Add new CLI options for prefix caching, chunked prefill, and max batched tokens
1 parent a96b900 commit 2607f0d

File tree

6 files changed

+105
-2
lines changed

6 files changed

+105
-2
lines changed

vec_inf/cli/_cli.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,21 @@ def cli() -> None:
3939
type=float,
4040
help="GPU memory utilization, default to 0.9",
4141
)
42+
@click.option(
43+
"--enable-prefix-caching",
44+
type=click.Choice(["True", "False"]),
45+
help="Enable automatic prefix caching, accepts 'True' or 'False', default to 'False'",
46+
)
47+
@click.option(
48+
"--enable-chunked-prefill",
49+
type=click.Choice(["True", "False"]),
50+
help="Enable chunked prefill, accepts 'True' or 'False', default to 'True' if max-model-len > 32k, else 'False'",
51+
)
52+
@click.option(
53+
"--max-num-batched-tokens",
54+
type=int,
55+
help="Maximum number of batched tokens per iteration, defaults to min(2048, max-model-len), pairs with --enable-chunked-prefill to control the batch size at the prefill stage",
56+
)
4257
@click.option(
4358
"--partition",
4459
type=str,
@@ -90,6 +105,11 @@ def cli() -> None:
90105
type=str,
91106
help="Enable pipeline parallelism, accepts 'True' or 'False', default to 'True' for supported models",
92107
)
108+
@click.option(
109+
"--compilation-config",
110+
type=click.Choice(["0", "3"]),
111+
help="torch.compile optimization level, accepts '0' or '3', default to '0', which means no optimization is applied",
112+
)
93113
@click.option(
94114
"--enforce-eager",
95115
type=str,

vec_inf/cli/_config.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,12 @@ class ModelConfig(BaseModel):
4747
max_num_seqs: int = Field(
4848
default=256, gt=0, le=1024, description="Maximum concurrent request sequences"
4949
)
50+
max_num_batched_tokens: int = Field(
51+
default=2048,
52+
gt=0,
53+
le=1_000_000,
54+
description="Maximum batched tokens per iteration",
55+
)
5056
gpu_memory_utilization: float = Field(
5157
default=0.9, gt=0.0, le=1.0, description="GPU memory utilization"
5258
)

vec_inf/cli/_helper.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,12 @@ def _get_launch_params(self) -> dict[str, Any]:
8787
params = self.model_config.model_dump()
8888

8989
# Process boolean fields
90-
for bool_field in ["pipeline_parallelism", "enforce_eager"]:
90+
for bool_field in [
91+
"pipeline_parallelism",
92+
"enforce_eager",
93+
"enable_prefix_caching",
94+
"enable_chunked_prefill",
95+
]:
9196
if (value := self.cli_kwargs.get(bool_field)) is not None:
9297
params[bool_field] = utils.convert_boolean_value(value)
9398

@@ -97,9 +102,25 @@ def _get_launch_params(self) -> dict[str, Any]:
97102
"json_mode",
98103
"pipeline_parallelism",
99104
"enforce_eager",
105+
"enable_prefix_caching",
106+
"enable_chunked_prefill",
100107
]:
101108
params[key] = value
102109

110+
if "compilation_config" not in params:
111+
params["compilation_config"] = "0"
112+
if "enable_prefix_caching" not in params:
113+
params["enable_prefix_caching"] = False
114+
if "enable_chunked_prefill" not in params:
115+
params["enable_chunked_prefill"] = False
116+
117+
if params["max_model_len"] > 32_000: # this is the default behavior of vLLM
118+
params["enable_chunked_prefill"] = True
119+
120+
params["max_num_batched_tokens"] = min(
121+
params["max_num_batched_tokens"], params["max_model_len"]
122+
)
123+
103124
# Validate required fields
104125
if not REQUIRED_FIELDS.issubset(set(params.keys())):
105126
raise click.ClickException(
@@ -126,6 +147,10 @@ def set_env_vars(self) -> None:
126147
os.environ["GPU_MEMORY_UTILIZATION"] = self.params["gpu_memory_utilization"]
127148
os.environ["TASK"] = VLLM_TASK_MAP[self.params["model_type"]]
128149
os.environ["PIPELINE_PARALLELISM"] = self.params["pipeline_parallelism"]
150+
os.environ["ENABLE_PREFIX_CACHING"] = self.params["enable_prefix_caching"]
151+
os.environ["ENABLE_CHUNKED_PREFILL"] = self.params["enable_chunked_prefill"]
152+
os.environ["MAX_NUM_BATCHED_TOKENS"] = self.params["max_num_batched_tokens"]
153+
os.environ["COMPILATION_CONFIG"] = self.params["compilation_config"]
129154
os.environ["ENFORCE_EAGER"] = self.params["enforce_eager"]
130155
os.environ["SRC_DIR"] = SRC_DIR
131156
os.environ["MODEL_WEIGHTS"] = str(
@@ -183,6 +208,10 @@ def format_table_output(self, job_id: str) -> Table:
183208
table.add_row("Max Num Seqs", self.params["max_num_seqs"])
184209
table.add_row("GPU Memory Utilization", self.params["gpu_memory_utilization"])
185210
table.add_row("Pipeline Parallelism", self.params["pipeline_parallelism"])
211+
table.add_row("Enable Prefix Caching", self.params["enable_prefix_caching"])
212+
table.add_row("Enable Chunked Prefill", self.params["enable_chunked_prefill"])
213+
table.add_row("Max Num Batched Tokens", self.params["max_num_batched_tokens"])
214+
table.add_row("Compilation Config", self.params["compilation_config"])
186215
table.add_row("Enforce Eager", self.params["enforce_eager"])
187216
table.add_row("Model Weights Directory", os.environ.get("MODEL_WEIGHTS"))
188217
table.add_row("Log Directory", self.params["log_dir"])

vec_inf/cli/_utils.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,14 @@ def get_latest_metric(log_lines: list[str]) -> Union[str, dict[str, str]]:
174174
key, value = metric.split(": ")
175175
latest_metric[key] = value
176176
break
177+
if "Prefix cache hit rate" in line:
178+
# Parse the metric values from the line
179+
metrics_str = line.split("] ")[1].strip()
180+
prefix, metrics_str = metrics_str.split(": ", 1)
181+
metrics_list = metrics_str.split(", ")
182+
for metric in metrics_list:
183+
key, value = metric.split(": ")
184+
latest_metric[f"{key} {prefix}"] = value
177185
except Exception as e:
178186
return f"[red]Error reading log file: {e}[/red]"
179187

vec_inf/multinode_vllm.slurm

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,18 @@ else
9090
export ENFORCE_EAGER=""
9191
fi
9292

93+
if [ "$ENABLE_PREFIX_CACHING" = "True" ]; then
94+
export ENABLE_PREFIX_CACHING="--enable-prefix-caching"
95+
else
96+
export ENABLE_PREFIX_CACHING="--no-enable-prefix-caching"
97+
fi
98+
99+
if [ "$ENABLE_CHUNKED_PREFILL" = "True" ]; then
100+
export ENABLE_CHUNKED_PREFILL="--enable-chunked-prefill"
101+
else
102+
export ENABLE_CHUNKED_PREFILL=""
103+
fi
104+
93105
# Activate vllm venv
94106
if [ "$VENV_BASE" = "singularity" ]; then
95107
singularity exec --nv --bind ${MODEL_WEIGHTS}:${MODEL_WEIGHTS} $SINGULARITY_IMAGE \
@@ -105,8 +117,11 @@ if [ "$VENV_BASE" = "singularity" ]; then
105117
--max-logprobs ${MAX_LOGPROBS} \
106118
--max-model-len ${MAX_MODEL_LEN} \
107119
--max-num-seqs ${MAX_NUM_SEQS} \
108117
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
120+
--max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS} \
121+
--compilation-config ${COMPILATION_CONFIG} \
109122
--task ${TASK} \
123+
${ENABLE_PREFIX_CACHING} \
124+
${ENABLE_CHUNKED_PREFILL} \
110125
${ENFORCE_EAGER}
111126
else
112127
source ${VENV_BASE}/bin/activate
@@ -123,6 +138,10 @@ else
123138
--max-model-len ${MAX_MODEL_LEN} \
124139
--max-num-seqs ${MAX_NUM_SEQS} \
125140
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
141+
--max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS} \
142+
--compilation-config ${COMPILATION_CONFIG} \
126143
--task ${TASK} \
144+
${ENABLE_PREFIX_CACHING} \
145+
${ENABLE_CHUNKED_PREFILL} \
127146
${ENFORCE_EAGER}
128147
fi

vec_inf/vllm.slurm

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,18 @@ else
2323
export ENFORCE_EAGER=""
2424
fi
2525

26+
if [ "$ENABLE_PREFIX_CACHING" = "True" ]; then
27+
export ENABLE_PREFIX_CACHING="--enable-prefix-caching"
28+
else
29+
export ENABLE_PREFIX_CACHING="--no-enable-prefix-caching"
30+
fi
31+
32+
if [ "$ENABLE_CHUNKED_PREFILL" = "True" ]; then
33+
export ENABLE_CHUNKED_PREFILL="--enable-chunked-prefill"
34+
else
35+
export ENABLE_CHUNKED_PREFILL=""
36+
fi
37+
2638
# Activate vllm venv
2739
if [ "$VENV_BASE" = "singularity" ]; then
2840
export SINGULARITY_IMAGE=/model-weights/vec-inf-shared/vector-inference_latest.sif
@@ -42,8 +54,13 @@ if [ "$VENV_BASE" = "singularity" ]; then
4254
--max-model-len ${MAX_MODEL_LEN} \
4355
--max-num-seqs ${MAX_NUM_SEQS} \
4456
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
57+
--max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS} \
58+
--compilation-config ${COMPILATION_CONFIG} \
4559
--task ${TASK} \
60+
${ENABLE_PREFIX_CACHING} \
61+
${ENABLE_CHUNKED_PREFILL} \
4662
${ENFORCE_EAGER}
63+
4764
else
4865
source ${VENV_BASE}/bin/activate
4966
python3 -m vllm.entrypoints.openai.api_server \
@@ -58,6 +75,10 @@ else
5875
--max-model-len ${MAX_MODEL_LEN} \
5976
--max-num-seqs ${MAX_NUM_SEQS} \
6077
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
78+
--max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS} \
79+
--compilation-config ${COMPILATION_CONFIG} \
6180
--task ${TASK} \
81+
${ENABLE_PREFIX_CACHING} \
82+
${ENABLE_CHUNKED_PREFILL} \
6283
${ENFORCE_EAGER}
6384
fi

0 commit comments

Comments
 (0)