From 3609feea7851a63ab38ade4212ba9fd3c3ce1403 Mon Sep 17 00:00:00 2001 From: grider-transwithai Date: Wed, 26 Nov 2025 01:58:52 +0800 Subject: [PATCH] [Bugfix] Fix HunyuanVL XD-RoPE and smart_resize bugs Fix two critical bugs in HunyuanVL implementation: 1. XD-RoPE height index zeroing: The h_index was incorrectly being zeroed instead of t_index, removing vertical spatial awareness for image tokens. Fixed by setting t_index to 0 (correct for static images) instead of h_index. 2. smart_resize argument swap: The function was called with (width, height) but expects (height, width), and return values were assigned incorrectly. Fixed argument order and assignment to match function signature. Signed-off-by: grider-transwithai --- vllm/model_executor/models/hunyuan_vision.py | 2 +- vllm/transformers_utils/processors/hunyuan_vl_image.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py index e83addd0c092..b13d8ca5fea1 100644 --- a/vllm/model_executor/models/hunyuan_vision.py +++ b/vllm/model_executor/models/hunyuan_vision.py @@ -847,7 +847,7 @@ def get_xdrope_input_positions( .expand(-1, llm_grid_w + 1) .reshape(-1) ) - h_index[pos : pos + token_num] = 0 + t_index[pos : pos + token_num] = 0 if xd_num == 4: llm_positions = torch.stack([p_index, w_index, h_index, t_index]) diff --git a/vllm/transformers_utils/processors/hunyuan_vl_image.py b/vllm/transformers_utils/processors/hunyuan_vl_image.py index 0a7e7865c783..0f5680825ef5 100644 --- a/vllm/transformers_utils/processors/hunyuan_vl_image.py +++ b/vllm/transformers_utils/processors/hunyuan_vl_image.py @@ -195,9 +195,9 @@ def _preprocess( processed_images = [] for image in images: if do_resize: - resized_width, resized_height = smart_resize( - width, + resized_height, resized_width = smart_resize( height, + width, factor=patch_size * merge_size, min_pixels=self.min_pixels, max_pixels=self.max_pixels,