
Commit 773c178

x574chen and Xiaotong Chen authored
Support transformers run Vision model (#49)
* add transformers to run qwen2-vl vision model for accuracy baseline
* update multimodal requirements: add dashinfer
* Skip TensorRT package import if TensorRT is not installed
* use flash_attention_2 and update doc

Co-authored-by: Xiaotong Chen <cxt459847@alibaba-inc.com>
1 parent 96de58b commit 773c178

File tree: 8 files changed, +66 / −34 lines


docs/sphinx/vlm/vlm_offline_inference_en.rst

Lines changed: 11 additions & 6 deletions
@@ -111,21 +111,26 @@ You can also use OpenAI's Python client library:
 
 Launching with CLI
 -------------------------
-You can also opt to install dashinfer-vlm locally and use command line to launch server.
+You can install dashinfer-vlm locally and use the command line to launch the server by following these steps. We highly recommend using NVIDIA PyTorch Containers `nvcr.io/nvidia/pytorch:xx.xx-py3` for setup.
 
-1. Pull dashinfer docker image (see :ref:`docker-label`)
-2. Install TensorRT Python package, and download TensorRT GA build from NVIDIA Developer Zone.
+1. (Optional when TensorRT is installed) Install TensorRT Python package, and download TensorRT GA build from NVIDIA Developer Zone.
 
 .. code-block:: bash
 
     wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.5.0/tars/TensorRT-10.5.0.18.Linux.x86_64-gnu.cuda-12.6.tar.gz
     tar -xvzf TensorRT-10.5.0.18.Linux.x86_64-gnu.cuda-12.6.tar.gz
+    pip install `pwd`/TensorRT-10.5.0.18/python/tensorrt-10.5.0-cp310-none-linux_x86_64.whl
     export LD_LIBRARY_PATH=`pwd`/TensorRT-10.5.0.18/lib
 
-3. Install dashinfer Python Package from `release <https://github.com/modelscope/dash-infer/releases>`_
-4. Install dashinfer-vlm: ``pip install dashinfer-vlm``.
+2. Install dashinfer-vlm: ``pip install dashinfer-vlm``, or install from source code
 
-Now you can launch server with command line:
+.. code-block:: bash
+
+    git clone https://github.com/modelscope/dash-infer.git
+    cd dash-infer/multimodal/
+    pip install -e ./
+
+3. Launch server with command line:
 
 .. code-block:: bash
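Since the TensorRT step is now optional, a quick environment check (a minimal Python sketch, not part of the documented steps) shows which --vision_engine values are usable before launching the server:

import importlib.util

# TensorRT is only required for --vision_engine tensorrt; the transformers backend works without it.
has_trt = importlib.util.find_spec("tensorrt") is not None
print("usable vision engines:", ["tensorrt", "transformers"] if has_trt else ["transformers"])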

multimodal/dashinfer_vlm/api_server/config.py

Lines changed: 6 additions & 1 deletion
@@ -48,7 +48,7 @@ def add_context_args(parser):
         "--vision_engine",
         type=str,
         default="tensorrt",
-        choices=["tensorrt"],
+        choices=["tensorrt", "transformers"],
         help="engine to run vision model",
     )
     group.add_argument(
@@ -76,6 +76,11 @@ def add_context_args(parser):
         action="store_true",
         help="enable FP8",
     )
+    group.add_argument(
+        "--dtype",
+        default="bfloat16",
+        choices=["bfloat16", "float16"],
+    )
     group.add_argument(
         "--min-pixels",
         default=4*28*28,
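A minimal sketch of how the new flags combine, using a bare argparse parser rather than the actual add_context_args() group (names and defaults mirror the diff above):

import argparse

# Hypothetical stand-in for the dashinfer_vlm argument group; only the changed flags are shown.
parser = argparse.ArgumentParser()
parser.add_argument("--vision_engine", type=str, default="tensorrt",
                    choices=["tensorrt", "transformers"],
                    help="engine to run vision model")
parser.add_argument("--dtype", default="bfloat16", choices=["bfloat16", "float16"])

args = parser.parse_args(["--vision_engine", "transformers", "--dtype", "float16"])
print(args.vision_engine, args.dtype)  # -> transformers float16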

multimodal/dashinfer_vlm/api_server/conversation.py

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ def get_content(self, content) -> Tuple:
                 image_list = []
             else:
                 if image_list[0] == "image":
-                    text = "<|vision_start|><|vision_end|>\n" * (len(image_list) - 1)
+                    text = "<|vision_start|><|vision_end|>" * (len(image_list) - 1)
                 elif image_list[0] == "video":
                     text = "<|vision_start|><|vision_end|>\n"
                 else:
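The only change is dropping the trailing newline between repeated image placeholders. A small illustration of the resulting prompt fragment (the contents of image_list are hypothetical; as in the diff, the first element is the modality tag):

image_list = ["image", "url_1", "url_2", "url_3"]  # hypothetical: modality tag + three images

old_text = "<|vision_start|><|vision_end|>\n" * (len(image_list) - 1)
new_text = "<|vision_start|><|vision_end|>" * (len(image_list) - 1)

print(repr(old_text))  # placeholders separated by newlines
print(repr(new_text))  # placeholders concatenated directly (no separating newline)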

multimodal/dashinfer_vlm/api_server/protocol/openai_api_protocol.py

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ class ChatCompletionRequest(BaseModel):
     ]
     temperature: Optional[float] = 0.7
     top_p: Optional[float] = 1.0
-    top_k: Optional[int] = 1
+    top_k: Optional[int] = 0
     n: Optional[int] = 1
     max_tokens: Optional[int] = None
     max_completion_tokens: Optional[int] = None

multimodal/dashinfer_vlm/api_server/server.py

Lines changed: 3 additions & 1 deletion
@@ -79,7 +79,7 @@ def init():
     home_dir = os.environ.get("HOME") or "/root"
     output_dir = os.path.join(home_dir, ".cache/as_model/", model.split("/")[-1])
     model_name = "model"
-    data_type = "bfloat16"
+    data_type = context.get("dtype")
 
     model_loader = HuggingFaceVLModel(
         model,
@@ -474,7 +474,9 @@ def get_vl_request(
         "min_length": 5,
         "frequency_penalty": frequency_penalty,
         "presence_penalty": presence_penalty,
+        # "repetition_penalty": 1.05,
         "length_penalty": 1,
+        "stop_words_ids": [[151643], [151644], [151645]],
         "eos_token_id": context.get("eos_token_id"),
         "seed": 1234567,
     }
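The new stop_words_ids are Qwen2 special-token IDs. A quick way to confirm what they decode to, assuming a Qwen2-VL tokenizer is available (the checkpoint name below is illustrative, not taken from this commit):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
# Expected: ['<|endoftext|>', '<|im_start|>', '<|im_end|>'] for the Qwen2 tokenizer family.
print(tok.convert_ids_to_tokens([151643, 151644, 151645]))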

multimodal/dashinfer_vlm/vl_inference/runtime/hie_worker.py

Lines changed: 17 additions & 9 deletions
@@ -11,7 +11,11 @@
 import threading
 import queue
 
-from ..utils.trt.vit_process import VisualTRT_V2
+try:
+    from ..utils.trt.vit_process import VisualTRT_V2
+except Exception:
+    pass
+
 import torch
 import numpy as np
 import time
@@ -75,11 +79,11 @@ def run(self):
         if self.model_type == "QWEN2-VL":
             # warm up
             image = torch.randn(
-                10080,
+                2436,
                 1176,
                 dtype=torch.float16 if self.precision == "fp16" else torch.float32,
             )
-            grid_thw = torch.tensor([[1, 120, 84]], dtype=torch.int64)
+            grid_thw = torch.tensor([[1, 58, 42]], dtype=torch.int64)
             first_grid = grid_thw[0, 0].item()
             batch_tensor = torch.zeros(first_grid)
             dict(
@@ -93,6 +97,10 @@ def run(self):
             self.model = VisualTRT_V2(
                 vit_engine_path=self.model_path, trt_vit_config=self.trt_vit_config
             )
+        elif self.backend == "transformers":
+            self.model = self.model_path.to(self.device)
+            with torch.no_grad():
+                self.model(image.to(self.device), grid_thw=grid_thw.to(self.device))
         elif self.backend == "hie":
             raise NotImplementedError
         else:
@@ -126,9 +134,6 @@ def get_vit_result(self, image, input_info):
         if self.model_type == "QWEN1-VL":
             output = self.model(image, use_flashattn=True)
         elif self.model_type == "QWEN2-VL":
-            # grid_thw = torch.tensor(
-            #     [input_info["vit_grid_t"], input_info["vit_grid_h"], input_info["vit_grid_w"]], dtype=torch.int32
-            # ).unsqueeze(0)
             grid_thw = np.array(
                 [
                     [
@@ -144,11 +149,14 @@ def get_vit_result(self, image, input_info):
             batch_tensor = torch.zeros(first_grid).to(
                 dtype=torch.int32, device=self.device
             )
-            # output = self.model(image, grid_thw, batch_tensor)
-            output = self.model(image, grid_thw, batch_tensor)
-            # print("vit output shape: ", output.shape)
+            if self.backend == "tensorrt":
+                output = self.model(image, grid_thw, batch_tensor)
+            elif self.backend == "transformers":
+                with torch.no_grad():
+                    output = self.model(image, grid_thw=grid_thw)
         else:
             output = self.model(image.contiguous().to(self.device), input_info)
+
         return output
 
     def process_request(self, task: VitRequest) -> None:
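The new warm-up shapes are internally consistent: grid_thw = [[1, 58, 42]] describes 1 × 58 × 42 = 2436 patches, matching the first dimension of the warm-up tensor, and 1176 is the flattened per-patch feature size with Qwen2-VL's default patching (3 channels × temporal patch 2 × 14 × 14). A standalone sanity check of that relationship (a sketch, not taken from the repository):

import torch

grid_thw = torch.tensor([[1, 58, 42]], dtype=torch.int64)
num_patches = int(grid_thw.prod(dim=-1).sum())   # 1 * 58 * 42 = 2436
patch_dim = 3 * 2 * 14 * 14                      # channels * temporal patch * patch_h * patch_w = 1176

image = torch.randn(num_patches, patch_dim, dtype=torch.float16)
assert image.shape == (2436, 1176)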

multimodal/dashinfer_vlm/vl_inference/utils/model_loader.py

Lines changed: 26 additions & 14 deletions
@@ -5,6 +5,7 @@
 import os
 import torch
 import glob
+import warnings
 from modelscope import snapshot_download
 from transformers import Qwen2VLForConditionalGeneration, AutoConfig, AutoTokenizer
 from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig
@@ -13,8 +14,20 @@
 from dashinfer import allspark
 from dashinfer.allspark.model_loader import HuggingFaceModel, ModelSerializerException
 from dashinfer.allspark.model_config import QWen2ConfigAdapter
-from .trt.onnx_to_plan import ONNX_TRT
-
+try:
+    from .trt.onnx_to_plan import ONNX_TRT
+except Exception:
+    warnings.warn("TensorRT package is not available", ImportWarning)
+
+def dtype_to_torch_dtype(dtype):
+    if dtype == "float32":
+        return torch.float32
+    elif dtype == "float16":
+        return torch.float16
+    elif dtype == "bfloat16":
+        return torch.bfloat16
+    else:
+        raise ValueError("unsupported data type: {}".format(dtype))
 
 class HuggingFaceVLModel(HuggingFaceModel):
     def __init__(
@@ -49,6 +62,8 @@ def load_model(
             self.torch_model = Qwen2VLForConditionalGeneration.from_pretrained(
                 self.hf_model_path,
                 trust_remote_code=self.trust_remote_code,
+                torch_dtype=dtype_to_torch_dtype(self.data_type),
+                device_map="cpu",
                 **kwargs,
             ).eval()
             self.vit_config = Qwen2VLVisionConfig.from_pretrained(
@@ -62,17 +77,6 @@ def load_model(
                 trust_remote_code=self.trust_remote_code,
                 **kwargs,
             )
-            self.torch_model = self.torch_model.cpu()
-
-            if self.data_type == "float32":
-                self.torch_model.float()
-            elif self.data_type == "float16":
-                self.torch_model.half()
-            elif self.data_type == "bfloat16":
-                self.torch_model.bfloat16()
-            else:
-                self.torch_model = None
-                raise ValueError("unsupported data type: {}".format(self.data_type))
         except Exception as e:
             print(
                 f"exception when load model: {self.hf_model_path} , exception: {e}"
@@ -122,9 +126,17 @@ def serialize(
             onnx_trt_obj = ONNX_TRT(self.hf_model_path)
             onnx_trt_obj.export_onnx(onnxFile)
             onnx_trt_obj.generate_trt_engine(onnxFile, self.vision_model_path)
+        elif self.vision_engine == "transformers":
+            visual_model = Qwen2VLForConditionalGeneration.from_pretrained(
+                self.hf_model_path,
+                trust_remote_code=self.trust_remote_code,
+                torch_dtype=dtype_to_torch_dtype(self.data_type),
+                device_map="cpu",
+                attn_implementation="flash_attention_2",
+            ).visual.eval()
+            self.vision_model_path = visual_model
         else:
             raise ValueError(f"unsupported engine {self.vision_engine}")
-
         # Convert Allspark LLM
         enable_quant = self.fp8
         weight_only_quant=False
multimodal/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-tensorrt==10.5.0
+dashinfer@https://github.com/modelscope/dash-infer/releases/download/v2.0.0-rc3/dashinfer-2.0.0rc3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
 av
 numpy==1.24.3
 requests==2.32.3
