Added --enforce-eager option, added support for reward modeling models, migrated from pandas to polars, and refactored based on mypy findings

XkunW · XkunW · commit e7b6871fefef · 2024-11-27T18:36:12.000-05:00
diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py
@@ -1,9 +1,10 @@
 import os
 import time
-from typing import Optional
+from typing import Optional, cast
 
 import click
-import pandas as pd
+
+import polars as pl
 from rich.columns import Columns
 from rich.console import Console
 from rich.live import Live
@@ -91,6 +92,11 @@ def cli():
     type=str,
     help="Enable pipeline parallelism, accepts 'True' or 'False', default to 'True' for supported models",
 )
+@click.option(
+    "--enforce-eager",
+    type=str,
+    help="Always use eager-mode PyTorch, accepts 'True' or 'False', default to 'False' for custom models if not set",
+)
 @click.option(
     "--json-mode",
     is_flag=True,
@@ -113,14 +119,17 @@ def launch(
     log_dir: Optional[str] = None,
     model_weights_parent_dir: Optional[str] = None,
     pipeline_parallelism: Optional[str] = None,
+    enforce_eager: Optional[str] = None,
     json_mode: bool = False,
 ) -> None:
     """
     Launch a model on the cluster
     """
 
     if isinstance(pipeline_parallelism, str):
-        pipeline_parallelism = pipeline_parallelism.lower() == "true"
+        pipeline_parallelism = (
+            "True" if pipeline_parallelism.lower() == "true" else "False"
+        )
 
     launch_script_path = os.path.join(
         os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "launch_server.sh"
@@ -129,15 +138,15 @@ def launch(
 
     models_df = utils.load_models_df()
 
-    if model_name in models_df["model_name"].values:
+    if model_name in models_df["model_name"].to_list():
         default_args = utils.load_default_args(models_df, model_name)
         for arg in default_args:
             if arg in locals() and locals()[arg] is not None:
                 default_args[arg] = locals()[arg]
             renamed_arg = arg.replace("_", "-")
             launch_cmd += f" --{renamed_arg} {default_args[arg]}"
     else:
-        model_args = models_df.columns.tolist()
+        model_args = models_df.columns
         model_args.remove("model_name")
         model_args.remove("model_type")
         for arg in model_args:
@@ -265,45 +274,58 @@ def shutdown(slurm_job_id: int) -> None:
     is_flag=True,
     help="Output in JSON string",
 )
-def list(model_name: Optional[str] = None, json_mode: bool = False) -> None:
+def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> None:
     """
     List all available models, or get default setup of a specific model
     """
 
-    def list_model(model_name: str, models_df: pd.DataFrame, json_mode: bool):
-        if model_name not in models_df["model_name"].values:
+    def list_model(model_name: str, models_df: pl.DataFrame, json_mode: bool):
+        if model_name not in models_df["model_name"].to_list():
             raise ValueError(f"Model name {model_name} not found in available models")
 
         excluded_keys = {"venv", "log_dir"}
-        model_row = models_df.loc[models_df["model_name"] == model_name]
+        model_row = models_df.filter(models_df["model_name"] == model_name)
 
         if json_mode:
-            filtered_model_row = model_row.drop(columns=excluded_keys, errors="ignore")
-            click.echo(filtered_model_row.to_json(orient="records"))
+            filtered_model_row = model_row.drop(excluded_keys, strict=False)
+            click.echo(filtered_model_row.to_dicts()[0])
             return
         table = utils.create_table(key_title="Model Config", value_title="Value")
-        for _, row in model_row.iterrows():
+        for row in model_row.to_dicts():
             for key, value in row.items():
                 if key not in excluded_keys:
                     table.add_row(key, str(value))
         CONSOLE.print(table)
 
-    def list_all(models_df: pd.DataFrame, json_mode: bool):
+    def list_all(models_df: pl.DataFrame, json_mode: bool):
         if json_mode:
-            click.echo(models_df["model_name"].to_json(orient="records"))
+            click.echo(models_df["model_name"].to_list())
             return
         panels = []
         model_type_colors = {
             "LLM": "cyan",
             "VLM": "bright_blue",
             "Text Embedding": "purple",
+            "Reward Modeling": "bright_magenta",
         }
-        custom_order = ["LLM", "VLM", "Text Embedding"]
-        models_df["model_type"] = pd.Categorical(
-            models_df["model_type"], categories=custom_order, ordered=True
+
+        models_df = models_df.with_columns(
+            pl.when(pl.col("model_type") == "LLM")
+            .then(0)
+            .when(pl.col("model_type") == "VLM")
+            .then(1)
+            .when(pl.col("model_type") == "Text Embedding")
+            .then(2)
+            .when(pl.col("model_type") == "Reward Modeling")
+            .then(3)
+            .otherwise(-1)
+            .alias("model_type_order")
         )
-        models_df = models_df.sort_values(by="model_type")
-        for _, row in models_df.iterrows():
+
+        models_df = models_df.sort("model_type_order")
+        models_df = models_df.drop("model_type_order")
+
+        for row in models_df.to_dicts():
             panel_color = model_type_colors.get(row["model_type"], "white")
             styled_text = (
                 f"[magenta]{row['model_family']}[/magenta]-{row['model_variant']}"
@@ -336,10 +358,22 @@ def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None:
 
     with Live(refresh_per_second=1, console=CONSOLE) as live:
         while True:
-            out_logs = utils.read_slurm_log(slurm_job_name, slurm_job_id, "out", log_dir)
-            metrics = utils.get_latest_metric(out_logs)
+            out_logs = utils.read_slurm_log(
+                slurm_job_name, slurm_job_id, "out", log_dir
+            )
+            # if out_logs is a string, then it is an error message
+            if isinstance(out_logs, str):
+                live.update(out_logs)
+                break
+            out_logs = cast(list, out_logs)
+            latest_metrics = utils.get_latest_metric(out_logs)
+            # if latest_metrics is a string, then it is an error message
+            if isinstance(latest_metrics, str):
+                live.update(latest_metrics)
+                break
+            latest_metrics = cast(dict, latest_metrics)
             table = utils.create_table(key_title="Metric", value_title="Value")
-            for key, value in metrics.items():
+            for key, value in latest_metrics.items():
                 table.add_row(key, value)
 
             live.update(table)
diff --git a/vec_inf/cli/_utils.py b/vec_inf/cli/_utils.py
@@ -1,8 +1,8 @@
 import os
 import subprocess
-from typing import Optional, Union
+from typing import Optional, Union, cast
 
-import pandas as pd
+import polars as pl
 import requests
 from rich.table import Table
 
@@ -35,9 +35,11 @@ def read_slurm_log(
                 log_dir = os.path.join(models_dir, dir)
                 break
 
+    log_dir = cast(str, log_dir)
+
     try:
         file_path = os.path.join(
-            log_dir,  
+            log_dir,
             f"{slurm_job_name}.{slurm_job_id}.{slurm_log_type}",
         )
         with open(file_path, "r") as file:
@@ -58,13 +60,15 @@ def is_server_running(
     if isinstance(log_content, str):
         return log_content
 
-    status = None
+    status: Union[str, tuple[str, str]] = "LAUNCHING"
+
     for line in log_content:
         if "error" in line.lower():
             status = ("FAILED", line.strip("\n"))
         if MODEL_READY_SIGNATURE in line:
             status = "RUNNING"
-    return "LAUNCHING" if not status else status
+
+    return status
 
 
 def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]) -> str:
@@ -115,11 +119,11 @@ def create_table(
     return table
 
 
-def load_models_df() -> pd.DataFrame:
+def load_models_df() -> pl.DataFrame:
     """
     Load the models dataframe
     """
-    models_df = pd.read_csv(
+    models_df = pl.read_csv(
         os.path.join(
             os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
             "models/models.csv",
@@ -128,14 +132,14 @@ def load_models_df() -> pd.DataFrame:
     return models_df
 
 
-def load_default_args(models_df: pd.DataFrame, model_name: str) -> dict:
+def load_default_args(models_df: pl.DataFrame, model_name: str) -> dict:
     """
     Load the default arguments for a model
     """
-    row_data = models_df.loc[models_df["model_name"] == model_name]
-    default_args = row_data.iloc[0].to_dict()
-    default_args.pop("model_name")
-    default_args.pop("model_type")
+    row_data = models_df.filter(models_df["model_name"] == model_name)
+    default_args = row_data.to_dicts()[0]
+    default_args.pop("model_name", None)
+    default_args.pop("model_type", None)
     return default_args
 
 
@@ -147,9 +151,9 @@ def get_latest_metric(log_lines: list[str]) -> dict | str:
         for line in reversed(log_lines):
             if "Avg prompt throughput" in line:
                 # Parse the metric values from the line
-                metrics = line.split("] ")[1].strip().strip(".")
-                metrics = metrics.split(", ")
-                for metric in metrics:
+                metrics_str = line.split("] ")[1].strip().strip(".")
+                metrics_list = metrics_str.split(", ")
+                for metric in metrics_list:
                     key, value = metric.split(": ")
                     latest_metric[key] = value
                 break