diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py
index e83addd0c092..b13d8ca5fea1 100644
--- a/vllm/model_executor/models/hunyuan_vision.py
+++ b/vllm/model_executor/models/hunyuan_vision.py
@@ -847,7 +847,7 @@ def get_xdrope_input_positions(
                 .expand(-1, llm_grid_w + 1)
                 .reshape(-1)
             )
-            h_index[pos : pos + token_num] = 0
+            t_index[pos : pos + token_num] = 0
 
             if xd_num == 4:
                 llm_positions = torch.stack([p_index, w_index, h_index, t_index])
diff --git a/vllm/transformers_utils/processors/hunyuan_vl_image.py b/vllm/transformers_utils/processors/hunyuan_vl_image.py
index 0a7e7865c783..0f5680825ef5 100644
--- a/vllm/transformers_utils/processors/hunyuan_vl_image.py
+++ b/vllm/transformers_utils/processors/hunyuan_vl_image.py
@@ -195,9 +195,9 @@ def _preprocess(
         processed_images = []
         for image in images:
             if do_resize:
-                resized_width, resized_height = smart_resize(
-                    width,
+                resized_height, resized_width = smart_resize(
                     height,
+                    width,
                     factor=patch_size * merge_size,
                     min_pixels=self.min_pixels,
                     max_pixels=self.max_pixels,
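
For context on the second hunk: the call site is updated so that both the argument order and the unpacking follow the `(height, width) -> (resized_height, resized_width)` convention of `smart_resize`. The sketch below is a minimal, hypothetical stand-in for that function (the name `smart_resize_sketch`, its defaults, and the rounding logic are illustrative assumptions modeled on Qwen2-VL-style processors, not vLLM's actual implementation); it only demonstrates the height-first contract that the corrected call relies on.

```python
import math


def smart_resize_sketch(
    height: int,
    width: int,
    factor: int = 28,
    min_pixels: int = 56 * 56,
    max_pixels: int = 14 * 14 * 4 * 1280,
) -> tuple[int, int]:
    """Hypothetical, simplified stand-in for the processor's smart_resize.

    Takes (height, width) and returns (resized_height, resized_width),
    each rounded to a multiple of `factor`, with the total pixel count
    rescaled to stay within [min_pixels, max_pixels].
    """
    h_bar = max(factor, round(height / factor) * factor)
    w_bar = max(factor, round(width / factor) * factor)
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = math.floor(height / beta / factor) * factor
        w_bar = math.floor(width / beta / factor) * factor
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor
    return h_bar, w_bar


# Mirrors the corrected call site: pass (height, width) and unpack
# (resized_height, resized_width) in the same order.
resized_height, resized_width = smart_resize_sketch(height=1080, width=1920)
print(resized_height, resized_width)
```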