diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py index e83addd0c092..2950db571e6e 100644 --- a/vllm/model_executor/models/hunyuan_vision.py +++ b/vllm/model_executor/models/hunyuan_vision.py @@ -847,7 +847,7 @@ def get_xdrope_input_positions( .expand(-1, llm_grid_w + 1) .reshape(-1) ) - h_index[pos : pos + token_num] = 0 + t_index[pos : pos + token_num] = image_index if xd_num == 4: llm_positions = torch.stack([p_index, w_index, h_index, t_index]) diff --git a/vllm/transformers_utils/processors/hunyuan_vl_image.py b/vllm/transformers_utils/processors/hunyuan_vl_image.py index 0a7e7865c783..0b10ae249dbb 100644 --- a/vllm/transformers_utils/processors/hunyuan_vl_image.py +++ b/vllm/transformers_utils/processors/hunyuan_vl_image.py @@ -195,9 +195,9 @@ def _preprocess( processed_images = [] for image in images: if do_resize: - resized_width, resized_height = smart_resize( - width, - height, + resized_height, resized_width = smart_resize( + height=height, + width=width, factor=patch_size * merge_size, min_pixels=self.min_pixels, max_pixels=self.max_pixels,