Enable default values for common params, add max_num_seqs param

XkunW · XkunW · commit 487aef80f30b · 2024-10-31T16:36:16.000-04:00
diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py
@@ -1,3 +1,4 @@
+import inspect
 import os
 import time
 from typing import Optional
@@ -27,9 +28,19 @@ def cli():
 @click.option(
     "--max-model-len",
     type=int,
-    help="Model context length. If unspecified, will be automatically derived from the model config.",
+    help="Model context length. Default value set based on suggested resource allocation.",
+)
+@click.option(
+    "--max-num-seqs",
+    type=int,
+    help="Maximum number of sequences to process in a single request",
+)
+@click.option(
+    "--partition",
+    type=str,
+    default="a40",
+    help="Type of compute partition, default to a40"
 )
-@click.option("--partition", type=str, help="Type of compute partition, default to a40")
 @click.option(
     "--num-nodes",
     type=int,
@@ -43,29 +54,37 @@ def cli():
 @click.option(
     "--qos",
     type=str,
-    help="Quality of service, default depends on suggested resource allocation required for the model",
+    default="m2",
+    help="Quality of service, default set to m2",
 )
 @click.option(
     "--time",
     type=str,
-    help="Time limit for job, this should comply with QoS, default to max walltime of the chosen QoS",
+    default="08:00:00",
+    help="Time limit for job, this should comply with QoS, default to max walltime of m2",
 )
 @click.option(
     "--vocab-size",
     type=int,
     help="Vocabulary size, this option is intended for custom models",
 )
-@click.option("--data-type", type=str, help="Model data type, default to auto")
-@click.option("--venv", type=str, help="Path to virtual environment")
+@click.option("--data-type", type=str, default="auto", help="Model data type, default to auto")
+@click.option(
+    "--venv",
+    type=str,
+    default="singularity",
+    help="Path to virtual environment, default to preconfigured singularity container"
+)
 @click.option(
     "--log-dir",
     type=str,
-    help="Path to slurm log directory, default to .vec-inf-logs in home directory",
+    default="default",
+    help="Path to slurm log directory, default to .vec-inf-logs in user home directory",
 )
 @click.option(
     "--pipeline-parallelism",
     type=str,
-    help="Enable pipeline parallelism, accepts 'true' or 'false', defaults to 'true' for supported models"
+    help="Enable pipeline parallelism, accepts 'True' or 'False', default to 'True' for supported models",
 )
 @click.option(
     "--json-mode",
@@ -77,6 +96,7 @@ def launch(
     model_family: Optional[str] = None,
     model_variant: Optional[str] = None,
     max_model_len: Optional[int] = None,
+    max_num_seqs: Optional[int] = None,
     partition: Optional[str] = None,
     num_nodes: Optional[int] = None,
     num_gpus: Optional[int] = None,
@@ -92,8 +112,9 @@ def launch(
     """
     Launch a model on the cluster
     """
-    
-    pipeline_parallelism = pipeline_parallelism is None or pipeline_parallelism.lower() == "true"
+
+    if isinstance(pipeline_parallelism, str):
+        pipeline_parallelism = pipeline_parallelism.lower() == "true"
 
     launch_script_path = os.path.join(
         os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "launch_server.sh"
diff --git a/vec_inf/launch_server.sh b/vec_inf/launch_server.sh
@@ -12,17 +12,18 @@ while [[ "$#" -gt 0 ]]; do
         --num-nodes) num_nodes="$2"; shift ;;
         --num-gpus) num_gpus="$2"; shift ;;
         --max-model-len) max_model_len="$2"; shift ;;
+        --max-num-seqs) max_num_seqs="$2"; shift ;;
         --vocab-size) vocab_size="$2"; shift ;;
         --data-type) data_type="$2"; shift ;;
-        --venv) virtual_env="$2"; shift ;;
+        --venv) venv="$2"; shift ;;
         --log-dir) log_dir="$2"; shift ;;
         --pipeline-parallelism) pipeline_parallelism="$2"; shift ;;
         *) echo "Unknown parameter passed: $1"; exit 1 ;;
     esac
     shift
 done
 
-required_vars=(model_family model_variant partition qos walltime num_nodes num_gpus max_model_len vocab_size pipeline_parallelism)
+required_vars=(model_family model_variant partition qos walltime num_nodes num_gpus max_model_len vocab_size data_type venv log_dir)
 
 for var in "$required_vars[@]"; do
     if [ -z "$!var" ]; then
@@ -40,22 +41,20 @@ export NUM_NODES=$num_nodes
 export NUM_GPUS=$num_gpus
 export VLLM_MAX_MODEL_LEN=$max_model_len
 export VLLM_MAX_LOGPROBS=$vocab_size
-export PIPELINE_PARALLELISM=$pipeline_parallelism
-# For custom models, the following are set to default if not specified
-export VLLM_DATA_TYPE="auto"
-export VENV_BASE="singularity"
-export LOG_DIR="default"
-
-if [ -n "$data_type" ]; then
-    export VLLM_DATA_TYPE=$data_type
+export VLLM_DATA_TYPE=$data_type
+export VENV_BASE=$venv
+export LOG_DIR=$log_dir
+
+if [ -n "$max_num_seqs" ]; then
+    export VLLM_MAX_NUM_SEQS=$max_num_seqs
+else 
+    export VLLM_MAX_NUM_SEQS=256
 fi
 
-if [ -n "$virtual_env" ]; then
-    export VENV_BASE=$virtual_env
-fi
-
-if [ -n "$log_dir" ]; then
-    export LOG_DIR=$log_dir
+if [ -n "$pipeline_parallelism" ]; then
+    export PIPELINE_PARALLELISM=$pipeline_parallelism
+else
+    export PIPELINE_PARALLELISM="False"
 fi
 
 # ================================= Set default environment variables ======================================
@@ -100,6 +99,8 @@ echo GPUs per Node: $NUM_GPUS
 echo QOS: $QOS
 echo Walltime: $WALLTIME
 echo Data Type: $VLLM_DATA_TYPE
+echo Max Model Length: $VLLM_MAX_MODEL_LEN
+echo Max Num Seqs: $VLLM_MAX_NUM_SEQS
 
 is_special=""
 if [ "$NUM_NODES" -gt 1 ]; then
diff --git a/vec_inf/multinode_vllm.slurm b/vec_inf/multinode_vllm.slurm
@@ -95,7 +95,8 @@ if [ "$VENV_BASE" = "singularity" ]; then
     --dtype ${VLLM_DATA_TYPE} \
     --trust-remote-code \
     --max-logprobs ${VLLM_MAX_LOGPROBS} \
-    --max-model-len ${VLLM_MAX_MODEL_LEN}
+    --max-model-len ${VLLM_MAX_MODEL_LEN} \
+    --max-num-seqs ${VLLM_MAX_NUM_SEQS}
 else
     source ${VENV_BASE}/bin/activate
     python3 -m vllm.entrypoints.openai.api_server \
@@ -108,5 +109,6 @@ else
     --dtype ${VLLM_DATA_TYPE} \
     --trust-remote-code \
     --max-logprobs ${VLLM_MAX_LOGPROBS} \
-    --max-model-len ${VLLM_MAX_MODEL_LEN}
+    --max-model-len ${VLLM_MAX_MODEL_LEN} \
+    --max-num-seqs ${VLLM_MAX_NUM_SEQS}
 fi
diff --git a/vec_inf/vllm.slurm b/vec_inf/vllm.slurm
@@ -31,7 +31,8 @@ if [ "$VENV_BASE" = "singularity" ]; then
     --dtype ${VLLM_DATA_TYPE} \
     --max-logprobs ${VLLM_MAX_LOGPROBS} \
     --trust-remote-code \
-    --max-model-len ${VLLM_MAX_MODEL_LEN}
+    --max-model-len ${VLLM_MAX_MODEL_LEN} \
+    --max-num-seqs ${VLLM_MAX_NUM_SEQS}
 else
     source ${VENV_BASE}/bin/activate
     python3 -m vllm.entrypoints.openai.api_server \
@@ -43,5 +44,6 @@ else
     --dtype ${VLLM_DATA_TYPE} \
     --max-logprobs ${VLLM_MAX_LOGPROBS} \
     --trust-remote-code \
-    --max-model-len ${VLLM_MAX_MODEL_LEN}
+    --max-model-len ${VLLM_MAX_MODEL_LEN} \
+    --max-num-seqs ${VLLM_MAX_NUM_SEQS}
 fi