From 3609feea7851a63ab38ade4212ba9fd3c3ce1403 Mon Sep 17 00:00:00 2001
From: grider-transwithai <grider@transwith.ai>
Date: Wed, 26 Nov 2025 01:58:52 +0800
Subject: [PATCH] [Bugfix] Fix HunyuanVL XD-RoPE and smart_resize bugs

Fix two critical bugs in the HunyuanVL implementation:

1. XD-RoPE height index zeroing: The h_index was incorrectly being
   zeroed instead of t_index, removing vertical spatial awareness
   for image tokens. Fixed by setting t_index to 0 (correct for
   static images) instead of h_index.

2. smart_resize argument swap: The function was called with (width,
   height) but expects (height, width), and its result was unpacked
   as (resized_width, resized_height). Fixed by passing (height,
   width) and unpacking into (resized_height, resized_width) to
   match the function signature.

Signed-off-by: grider-transwithai <grider@transwith.ai>
---
 vllm/model_executor/models/hunyuan_vision.py           | 2 +-
 vllm/transformers_utils/processors/hunyuan_vl_image.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py
index e83addd0c092..b13d8ca5fea1 100644
--- a/vllm/model_executor/models/hunyuan_vision.py
+++ b/vllm/model_executor/models/hunyuan_vision.py
@@ -847,7 +847,7 @@ def get_xdrope_input_positions(
                 .expand(-1, llm_grid_w + 1)
                 .reshape(-1)
             )
-            h_index[pos : pos + token_num] = 0
+            t_index[pos : pos + token_num] = 0
 
         if xd_num == 4:
             llm_positions = torch.stack([p_index, w_index, h_index, t_index])
diff --git a/vllm/transformers_utils/processors/hunyuan_vl_image.py b/vllm/transformers_utils/processors/hunyuan_vl_image.py
index 0a7e7865c783..0f5680825ef5 100644
--- a/vllm/transformers_utils/processors/hunyuan_vl_image.py
+++ b/vllm/transformers_utils/processors/hunyuan_vl_image.py
@@ -195,9 +195,9 @@ def _preprocess(
         processed_images = []
         for image in images:
             if do_resize:
-                resized_width, resized_height = smart_resize(
-                    width,
+                resized_height, resized_width = smart_resize(
                     height,
+                    width,
                     factor=patch_size * merge_size,
                     min_pixels=self.min_pixels,
                     max_pixels=self.max_pixels,