
Commit 9ef73c0

Merge pull request #68 from VectorInstitute/fco/expose_more_args
- Add new CLI options: --enable-prefix-caching, --enable-chunked-prefill, --max-num-batched-tokens, --compilation-config
- Update all boolean options to flags
2 parents 3794604 + 5bf5ac7 commit 9ef73c0

7 files changed: +117 −35 lines

tests/vec_inf/cli/test_utils.py

Lines changed: 0 additions & 17 deletions

```diff
@@ -8,7 +8,6 @@
 
 from vec_inf.cli._utils import (
     MODEL_READY_SIGNATURE,
-    convert_boolean_value,
     create_table,
     get_base_url,
     is_server_running,
@@ -223,19 +222,3 @@ def test_load_config_invalid_user_model(tmp_path):
         assert "validation error" in str(excinfo.value).lower()
         assert "model_type" in str(excinfo.value)
         assert "num_gpus" in str(excinfo.value)
-
-
-def test_convert_boolean_value_with_string():
-    """Testing string inputs."""
-    assert convert_boolean_value("true") is True
-    assert convert_boolean_value("TRUE") is True
-    assert convert_boolean_value("false") is False
-    assert convert_boolean_value("random_string") is False
-
-
-def test_convert_boolean_value_with_numeric_and_boolean():
-    """Testing integer and boolean inputs."""
-    assert convert_boolean_value(1) is True
-    assert convert_boolean_value(0) is False
-    assert convert_boolean_value(True) is True
-    assert convert_boolean_value(False) is False
```

vec_inf/cli/_cli.py

Lines changed: 24 additions & 4 deletions

```diff
@@ -39,6 +39,21 @@ def cli() -> None:
     type=float,
     help="GPU memory utilization, default to 0.9",
 )
+@click.option(
+    "--enable-prefix-caching",
+    is_flag=True,
+    help="Enables automatic prefix caching",
+)
+@click.option(
+    "--enable-chunked-prefill",
+    is_flag=True,
+    help="Enable chunked prefill, enabled by default if max number of sequences > 32k",
+)
+@click.option(
+    "--max-num-batched-tokens",
+    type=int,
+    help="Maximum number of batched tokens per iteration, defaults to 2048 if --enable-chunked-prefill is set, else None",
+)
 @click.option(
     "--partition",
     type=str,
@@ -87,13 +102,18 @@ def cli() -> None:
 )
 @click.option(
     "--pipeline-parallelism",
-    type=str,
-    help="Enable pipeline parallelism, accepts 'True' or 'False', default to 'True' for supported models",
+    is_flag=True,
+    help="Enable pipeline parallelism, enabled by default for supported models",
+)
+@click.option(
+    "--compilation-config",
+    type=click.Choice(["0", "1", "2", "3"]),
+    help="torch.compile optimization level, accepts '0', '1', '2', or '3', default to '0', which means no optimization is applied",
 )
 @click.option(
     "--enforce-eager",
-    type=str,
-    help="Always use eager-mode PyTorch, accepts 'True' or 'False', default to 'False' for custom models if not set",
+    is_flag=True,
+    help="Always use eager-mode PyTorch",
 )
 @click.option(
     "--json-mode",
```

vec_inf/cli/_config.py

Lines changed: 6 additions & 0 deletions

```diff
@@ -47,6 +47,12 @@ class ModelConfig(BaseModel):
     max_num_seqs: int = Field(
         default=256, gt=0, le=1024, description="Maximum concurrent request sequences"
     )
+    compilation_config: int = Field(
+        default=0,
+        gt=-1,
+        le=4,
+        description="torch.compile optimization level",
+    )
     gpu_memory_utilization: float = Field(
         default=0.9, gt=0.0, le=1.0, description="GPU memory utilization"
     )
```
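A quick check of the new field's bounds, as a standalone pydantic sketch (note that `gt=-1, le=4` admits integers 0 through 4, slightly wider than the CLI's `Choice(["0", "1", "2", "3"])`):

```python
# Minimal reproduction of the compilation_config field's validation bounds.
from pydantic import BaseModel, Field, ValidationError


class Cfg(BaseModel):
    compilation_config: int = Field(
        default=0, gt=-1, le=4, description="torch.compile optimization level"
    )


print(Cfg().compilation_config)                      # 0 -- default, no optimization
print(Cfg(compilation_config=3).compilation_config)  # 3 -- highest level exposed by the CLI

try:
    Cfg(compilation_config=5)
except ValidationError as exc:
    print(exc)  # rejected: 5 violates the le=4 bound
```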

vec_inf/cli/_helper.py

Lines changed: 34 additions & 7 deletions

```diff
@@ -34,6 +34,13 @@
     "max_model_len",
 }
 
+BOOLEAN_FIELDS = {
+    "pipeline_parallelism",
+    "enforce_eager",
+    "enable_prefix_caching",
+    "enable_chunked_prefill",
+}
+
 LD_LIBRARY_PATH = "/scratch/ssd001/pkgs/cudnn-11.7-v8.5.0.96/lib/:/scratch/ssd001/pkgs/cuda-11.7/targets/x86_64-linux/lib/"
 SRC_DIR = str(Path(__file__).parent.parent)
 
@@ -90,16 +97,15 @@ def _get_launch_params(self) -> dict[str, Any]:
         params = self.model_config.model_dump()
 
         # Process boolean fields
-        for bool_field in ["pipeline_parallelism", "enforce_eager"]:
-            if (value := self.cli_kwargs.get(bool_field)) is not None:
-                params[bool_field] = utils.convert_boolean_value(value)
+        for bool_field in BOOLEAN_FIELDS:
+            if self.cli_kwargs[bool_field]:
+                params[bool_field] = True
 
         # Merge other overrides
         for key, value in self.cli_kwargs.items():
             if value is not None and key not in [
                 "json_mode",
-                "pipeline_parallelism",
-                "enforce_eager",
+                *BOOLEAN_FIELDS,
             ]:
                 params[key] = value
 
@@ -129,7 +135,7 @@ def set_env_vars(self) -> None:
         os.environ["GPU_MEMORY_UTILIZATION"] = self.params["gpu_memory_utilization"]
         os.environ["TASK"] = VLLM_TASK_MAP[self.params["model_type"]]
         os.environ["PIPELINE_PARALLELISM"] = self.params["pipeline_parallelism"]
-        os.environ["ENFORCE_EAGER"] = self.params["enforce_eager"]
+        os.environ["COMPILATION_CONFIG"] = self.params["compilation_config"]
         os.environ["SRC_DIR"] = SRC_DIR
         os.environ["MODEL_WEIGHTS"] = str(
             Path(self.params["model_weights_parent_dir"], self.model_name)
@@ -138,6 +144,15 @@ def set_env_vars(self) -> None:
         os.environ["VENV_BASE"] = self.params["venv"]
         os.environ["LOG_DIR"] = self.params["log_dir"]
 
+        if self.params.get("enable_prefix_caching"):
+            os.environ["ENABLE_PREFIX_CACHING"] = self.params["enable_prefix_caching"]
+        if self.params.get("enable_chunked_prefill"):
+            os.environ["ENABLE_CHUNKED_PREFILL"] = self.params["enable_chunked_prefill"]
+        if self.params.get("max_num_batched_tokens"):
+            os.environ["MAX_NUM_BATCHED_TOKENS"] = self.params["max_num_batched_tokens"]
+        if self.params.get("enforce_eager"):
+            os.environ["ENFORCE_EAGER"] = self.params["enforce_eager"]
+
     def build_launch_command(self) -> str:
         """Construct the full launch command with parameters."""
         # Base command
@@ -185,8 +200,20 @@ def format_table_output(self, job_id: str) -> Table:
         table.add_row("Max Model Length", self.params["max_model_len"])
         table.add_row("Max Num Seqs", self.params["max_num_seqs"])
         table.add_row("GPU Memory Utilization", self.params["gpu_memory_utilization"])
+        table.add_row("Compilation Config", self.params["compilation_config"])
         table.add_row("Pipeline Parallelism", self.params["pipeline_parallelism"])
-        table.add_row("Enforce Eager", self.params["enforce_eager"])
+        if self.params.get("enable_prefix_caching"):
+            table.add_row("Enable Prefix Caching", self.params["enable_prefix_caching"])
+        if self.params.get("enable_chunked_prefill"):
+            table.add_row(
+                "Enable Chunked Prefill", self.params["enable_chunked_prefill"]
+            )
+        if self.params.get("max_num_batched_tokens"):
+            table.add_row(
+                "Max Num Batched Tokens", self.params["max_num_batched_tokens"]
+            )
+        if self.params.get("enforce_eager"):
+            table.add_row("Enforce Eager", self.params["enforce_eager"])
         table.add_row("Model Weights Directory", os.environ.get("MODEL_WEIGHTS"))
         table.add_row("Log Directory", self.params["log_dir"])
```
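The reworked `_get_launch_params` logic is worth spelling out: an `is_flag` option that was omitted arrives as `False`, so the loop only ever flips a field to `True`, and an unset flag can no longer override a model config default. A standalone sketch of that merge (illustrative names, not the actual class):

```python
# Standalone sketch of the flag-merge pattern in _get_launch_params.
from typing import Any

BOOLEAN_FIELDS = {
    "pipeline_parallelism",
    "enforce_eager",
    "enable_prefix_caching",
    "enable_chunked_prefill",
}


def merge_launch_params(config: dict[str, Any], cli_kwargs: dict[str, Any]) -> dict[str, Any]:
    params = dict(config)
    # Flags only switch fields on; an omitted flag (False) leaves the config default alone.
    for bool_field in BOOLEAN_FIELDS:
        if cli_kwargs.get(bool_field):
            params[bool_field] = True
    # Non-flag options override the config only when explicitly provided.
    for key, value in cli_kwargs.items():
        if value is not None and key not in {"json_mode", *BOOLEAN_FIELDS}:
            params[key] = value
    return params


print(merge_launch_params(
    {"pipeline_parallelism": True, "max_num_seqs": 256},
    {"enforce_eager": True, "enable_prefix_caching": False, "max_num_batched_tokens": 4096},
))
# {'pipeline_parallelism': True, 'max_num_seqs': 256, 'enforce_eager': True, 'max_num_batched_tokens': 4096}
```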

vec_inf/cli/_utils.py

Lines changed: 0 additions & 7 deletions

```diff
@@ -158,10 +158,3 @@ def load_config() -> list[ModelConfig]:
         ModelConfig(model_name=name, **model_data)
         for name, model_data in config.get("models", {}).items()
     ]
-
-
-def convert_boolean_value(value: Union[str, int, bool]) -> bool:
-    """Convert various input types to boolean strings."""
-    if isinstance(value, str):
-        return value.lower() == "true"
-    return bool(value)
```

vec_inf/multinode_vllm.slurm

Lines changed: 26 additions & 0 deletions

```diff
@@ -90,6 +90,24 @@ else
     export ENFORCE_EAGER=""
 fi
 
+if [ "$ENABLE_PREFIX_CACHING" = "True" ]; then
+    export ENABLE_PREFIX_CACHING="--enable-prefix-caching"
+else
+    export ENABLE_PREFIX_CACHING=""
+fi
+
+if [ "$ENABLE_CHUNKED_PREFILL" = "True" ]; then
+    export ENABLE_CHUNKED_PREFILL="--enable-chunked-prefill"
+else
+    export ENABLE_CHUNKED_PREFILL=""
+fi
+
+if [ -z "$MAX_NUM_BATCHED_TOKENS" ]; then
+    export MAX_NUM_BATCHED_TOKENS=""
+else
+    export MAX_NUM_BATCHED_TOKENS="--max-num-batched-tokens=$MAX_NUM_BATCHED_TOKENS"
+fi
+
 # Activate vllm venv
 if [ "$VENV_BASE" = "singularity" ]; then
     singularity exec --nv --bind ${MODEL_WEIGHTS}:${MODEL_WEIGHTS} $SINGULARITY_IMAGE \
@@ -106,7 +124,11 @@ if [ "$VENV_BASE" = "singularity" ]; then
         --max-model-len ${MAX_MODEL_LEN} \
         --max-num-seqs ${MAX_NUM_SEQS} \
         --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
+        --compilation-config ${COMPILATION_CONFIG} \
         --task ${TASK} \
+        ${MAX_NUM_BATCHED_TOKENS} \
+        ${ENABLE_PREFIX_CACHING} \
+        ${ENABLE_CHUNKED_PREFILL} \
         ${ENFORCE_EAGER}
 else
     source ${VENV_BASE}/bin/activate
@@ -123,6 +145,10 @@ else
         --max-model-len ${MAX_MODEL_LEN} \
         --max-num-seqs ${MAX_NUM_SEQS} \
         --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
+        --compilation-config ${COMPILATION_CONFIG} \
         --task ${TASK} \
+        ${MAX_NUM_BATCHED_TOKENS} \
+        ${ENABLE_PREFIX_CACHING} \
+        ${ENABLE_CHUNKED_PREFILL} \
         ${ENFORCE_EAGER}
 fi
```
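Both SLURM scripts (this one and vllm.slurm below) use the same convention: each optional vLLM argument is stored in an environment variable that holds either the fully formed flag or the empty string, and the unquoted `${...}` expansion drops empty values from the assembled command. A rough Python analogue of that assembly, for illustration only (the helper function here is hypothetical):

```python
# Rough Python analogue of the scripts' optional-flag assembly.
import os
import shlex


def optional_vllm_args() -> list[str]:
    args: list[str] = []
    if os.environ.get("MAX_NUM_BATCHED_TOKENS"):
        args.append(f"--max-num-batched-tokens={os.environ['MAX_NUM_BATCHED_TOKENS']}")
    if os.environ.get("ENABLE_PREFIX_CACHING") == "True":
        args.append("--enable-prefix-caching")
    if os.environ.get("ENABLE_CHUNKED_PREFILL") == "True":
        args.append("--enable-chunked-prefill")
    return args


os.environ["ENABLE_PREFIX_CACHING"] = "True"
os.environ["MAX_NUM_BATCHED_TOKENS"] = "4096"
print(shlex.join(["python3", "-m", "vllm.entrypoints.openai.api_server", *optional_vllm_args()]))
# python3 -m vllm.entrypoints.openai.api_server --max-num-batched-tokens=4096 --enable-prefix-caching
```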

vec_inf/vllm.slurm

Lines changed: 27 additions & 0 deletions

```diff
@@ -23,6 +23,24 @@ else
     export ENFORCE_EAGER=""
 fi
 
+if [ "$ENABLE_PREFIX_CACHING" = "True" ]; then
+    export ENABLE_PREFIX_CACHING="--enable-prefix-caching"
+else
+    export ENABLE_PREFIX_CACHING=""
+fi
+
+if [ "$ENABLE_CHUNKED_PREFILL" = "True" ]; then
+    export ENABLE_CHUNKED_PREFILL="--enable-chunked-prefill"
+else
+    export ENABLE_CHUNKED_PREFILL=""
+fi
+
+if [ -z "$MAX_NUM_BATCHED_TOKENS" ]; then
+    export MAX_NUM_BATCHED_TOKENS=""
+else
+    export MAX_NUM_BATCHED_TOKENS="--max-num-batched-tokens=$MAX_NUM_BATCHED_TOKENS"
+fi
+
 # Activate vllm venv
 if [ "$VENV_BASE" = "singularity" ]; then
     export SINGULARITY_IMAGE=/model-weights/vec-inf-shared/vector-inference_latest.sif
@@ -42,8 +60,13 @@ if [ "$VENV_BASE" = "singularity" ]; then
         --max-model-len ${MAX_MODEL_LEN} \
         --max-num-seqs ${MAX_NUM_SEQS} \
         --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
+        --compilation-config ${COMPILATION_CONFIG} \
         --task ${TASK} \
+        ${MAX_NUM_BATCHED_TOKENS} \
+        ${ENABLE_PREFIX_CACHING} \
+        ${ENABLE_CHUNKED_PREFILL} \
         ${ENFORCE_EAGER}
+
 else
     source ${VENV_BASE}/bin/activate
     python3 -m vllm.entrypoints.openai.api_server \
@@ -58,6 +81,10 @@ else
         --max-model-len ${MAX_MODEL_LEN} \
         --max-num-seqs ${MAX_NUM_SEQS} \
         --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} \
+        --compilation-config ${COMPILATION_CONFIG} \
         --task ${TASK} \
+        ${MAX_NUM_BATCHED_TOKENS} \
+        ${ENABLE_PREFIX_CACHING} \
+        ${ENABLE_CHUNKED_PREFILL} \
         ${ENFORCE_EAGER}
 fi
```
