From de8e3280d02f904173863db9a6cf149c47e0f3ba Mon Sep 17 00:00:00 2001
From: Injae Ryou
Date: Thu, 27 Nov 2025 05:08:42 +0000
Subject: [PATCH 1/6] feat: improve tokenizer loading when loading gguf models

Signed-off-by: Injae Ryou
---
 vllm/config/model.py                  |  8 ----
 vllm/transformers_utils/gguf_utils.py | 60 +++++++++++++++++++++++++++
 vllm/transformers_utils/tokenizer.py  |  9 +++-
 3 files changed, 68 insertions(+), 9 deletions(-)

diff --git a/vllm/config/model.py b/vllm/config/model.py
index 25972f097f53..33cb81773e1a 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -40,7 +40,6 @@
 )
 from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri
 from vllm.transformers_utils.utils import (
-    is_gguf,
     is_remote_gguf,
     maybe_model_redirect,
     split_remote_gguf,
@@ -440,13 +439,6 @@ def __post_init__(
         self.model = maybe_model_redirect(self.model)
         # The tokenizer is consistent with the model by default.
         if self.tokenizer is None:
-            # Check if this is a GGUF model (either local file or remote GGUF)
-            if is_gguf(self.model):
-                raise ValueError(
-                    "Using a tokenizer is mandatory when loading a GGUF model. "
-                    "Please specify the tokenizer path or name using the "
-                    "--tokenizer argument."
-                )
             self.tokenizer = self.model
         if self.tokenizer_revision is None:
             self.tokenizer_revision = self.revision
diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py
index 2bf59c91a3bb..3fde14baf489 100644
--- a/vllm/transformers_utils/gguf_utils.py
+++ b/vllm/transformers_utils/gguf_utils.py
@@ -2,10 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """GGUF utility functions."""
 
+import fnmatch
 from pathlib import Path
 
 import gguf
 from gguf.constants import Keys, VisionProjectorType
+from huggingface_hub import HfFileSystem
 from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig
 
 from vllm.logger import init_logger
@@ -164,3 +166,61 @@ def maybe_patch_hf_config_from_gguf(
         hf_config = new_hf_config
 
     return hf_config
+
+
+def get_gguf_file_path_from_hf(
+    repo_id: str | Path,
+    quant_type: str,
+    revision: str | None = None,
+) -> str | None:
+    """Get the GGUF file path from HuggingFace Hub based on repo_id and quant_type.
+
+    Args:
+        repo_id: The HuggingFace repository ID (e.g., "Qwen/Qwen3-0.6B")
+        quant_type: The quantization type (e.g., "Q4_K_M", "F16")
+        revision: Optional revision/branch name
+
+    Returns:
+        The path to the GGUF file on HuggingFace Hub (e.g., "filename.gguf"),
+        or None if not found
+    """
+    repo_id = str(repo_id)
+    try:
+        fs = HfFileSystem()
+        # List all files in the repository
+        file_list = fs.ls(repo_id, detail=False, revision=revision)
+
+        # Patterns to match GGUF files with the quant_type
+        patterns = [
+            f"*-{quant_type}.gguf",
+            f"*-{quant_type}-*.gguf",
+            f"*/*-{quant_type}.gguf",
+            f"*/*-{quant_type}-*.gguf",
+        ]
+
+        # Find matching files
+        matching_files = []
+        for pattern in patterns:
+            matches = fnmatch.filter(file_list, pattern)
+            matching_files.extend(matches)
+
+        if not matching_files:
+            logger.warning(
+                "No GGUF file found in %s with quant_type %s", repo_id, quant_type
+            )
+            return None
+
+        # Sort to ensure consistent ordering (prefer non-sharded files)
+        matching_files.sort(key=lambda x: (x.count("-"), x))
+        gguf_filename = matching_files[0]
+
+        return gguf_filename.replace(repo_id + "/", "", 1)
+    except Exception as e:
+        logger.warning(
+            "Failed to get GGUF file path from HuggingFace Hub for %s "
+            "with quant_type %s: %s",
+            repo_id,
+            quant_type,
+            e,
+        )
+        return None
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index f0e0ba8ef424..fd69ff97836b 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -19,6 +19,7 @@
     get_sentence_transformer_tokenizer_config,
     list_filtered_repo_files,
 )
+from vllm.transformers_utils.gguf_utils import get_gguf_file_path_from_hf
 from vllm.transformers_utils.tokenizers import MistralTokenizer
 from vllm.transformers_utils.utils import (
     check_gguf_file,
@@ -190,7 +191,13 @@ def get_tokenizer(
         kwargs["gguf_file"] = Path(tokenizer_name).name
         tokenizer_name = Path(tokenizer_name).parent
     elif is_remote_gguf(tokenizer_name):
-        tokenizer_name, _ = split_remote_gguf(tokenizer_name)
+        tokenizer_name, quant_type = split_remote_gguf(tokenizer_name)
+        # Get the HuggingFace Hub path for the GGUF file
+        kwargs["gguf_file"] = get_gguf_file_path_from_hf(
+            tokenizer_name,
+            quant_type,
+            revision=revision,
+        )
 
     # if `tokenizer_mode` == "auto", check if tokenizer can be loaded via Mistral format
     # first to use official Mistral tokenizer if possible.

From 9ee0a0c2bbfb826f29f0de795d757f26a1f77df2 Mon Sep 17 00:00:00 2001
From: Injae Ryou
Date: Thu, 27 Nov 2025 14:16:01 +0900
Subject: [PATCH 2/6] raise error when remote gguf file is None

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: Injae Ryou
---
 vllm/transformers_utils/tokenizer.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index fd69ff97836b..4d6200aea3de 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -193,11 +193,17 @@ def get_tokenizer(
     elif is_remote_gguf(tokenizer_name):
         tokenizer_name, quant_type = split_remote_gguf(tokenizer_name)
         # Get the HuggingFace Hub path for the GGUF file
-        kwargs["gguf_file"] = get_gguf_file_path_from_hf(
+        gguf_file = get_gguf_file_path_from_hf(
             tokenizer_name,
             quant_type,
             revision=revision,
         )
+        if gguf_file is None:
+            raise ValueError(
+                f"Could not find GGUF file for repo {tokenizer_name} "
+                f"with quantization {quant_type}."
+            )
+        kwargs["gguf_file"] = gguf_file
 
     # if `tokenizer_mode` == "auto", check if tokenizer can be loaded via Mistral format
     # first to use official Mistral tokenizer if possible.

From 34f3d15aa64a83571c1653cf88394c3d40d363fe Mon Sep 17 00:00:00 2001
From: Injae Ryou
Date: Thu, 27 Nov 2025 05:19:50 +0000
Subject: [PATCH 3/6] refactor: catch specific huggingface_hub errors

Signed-off-by: Injae Ryou
---
 vllm/transformers_utils/gguf_utils.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py
index 3fde14baf489..367e2ab2208d 100644
--- a/vllm/transformers_utils/gguf_utils.py
+++ b/vllm/transformers_utils/gguf_utils.py
@@ -8,6 +8,11 @@
 import gguf
 from gguf.constants import Keys, VisionProjectorType
 from huggingface_hub import HfFileSystem
+from huggingface_hub.errors import (
+    HfHubHTTPError,
+    RepositoryNotFoundError,
+    RevisionNotFoundError,
+)
 from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig
 
 from vllm.logger import init_logger
@@ -215,7 +220,7 @@ def get_gguf_file_path_from_hf(
         gguf_filename = matching_files[0]
 
         return gguf_filename.replace(repo_id + "/", "", 1)
-    except Exception as e:
+    except (RepositoryNotFoundError, RevisionNotFoundError, HfHubHTTPError) as e:
         logger.warning(
             "Failed to get GGUF file path from HuggingFace Hub for %s "
             "with quant_type %s: %s",

From bc4e6b273d3f90af0266276b2cce4eaf6efc8f2e Mon Sep 17 00:00:00 2001
From: Injae Ryou
Date: Thu, 27 Nov 2025 22:57:08 +0900
Subject: [PATCH 4/6] refactor: reuse existing function - list_filtered_repo_files

Signed-off-by: Injae Ryou
---
 vllm/transformers_utils/gguf_utils.py | 69 ++++++++++-----------------
 vllm/transformers_utils/tokenizer.py  |  5 --
 2 files changed, 23 insertions(+), 51 deletions(-)

diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py
index 367e2ab2208d..9e70ccaf4519 100644
--- a/vllm/transformers_utils/gguf_utils.py
+++ b/vllm/transformers_utils/gguf_utils.py
@@ -7,15 +7,9 @@
 
 import gguf
 from gguf.constants import Keys, VisionProjectorType
-from huggingface_hub import HfFileSystem
-from huggingface_hub.errors import (
-    HfHubHTTPError,
-    RepositoryNotFoundError,
-    RevisionNotFoundError,
-)
 from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig
-
 from vllm.logger import init_logger
+from vllm.transformers_utils.config import list_filtered_repo_files
 
 logger = init_logger(__name__)
 
@@ -177,55 +171,38 @@ def get_gguf_file_path_from_hf(
     repo_id: str | Path,
     quant_type: str,
     revision: str | None = None,
-) -> str | None:
+) -> str:
     """Get the GGUF file path from HuggingFace Hub based on repo_id and quant_type.
 
     Args:
         repo_id: The HuggingFace repository ID (e.g., "Qwen/Qwen3-0.6B")
         quant_type: The quantization type (e.g., "Q4_K_M", "F16")
         revision: Optional revision/branch name
 
     Returns:
-        The path to the GGUF file on HuggingFace Hub (e.g., "filename.gguf"),
-        or None if not found
+        The path to the GGUF file on HuggingFace Hub (e.g., "filename.gguf").
     """
     repo_id = str(repo_id)
-    try:
-        fs = HfFileSystem()
-        # List all files in the repository
-        file_list = fs.ls(repo_id, detail=False, revision=revision)
-
-        # Patterns to match GGUF files with the quant_type
-        patterns = [
-            f"*-{quant_type}.gguf",
-            f"*-{quant_type}-*.gguf",
-            f"*/*-{quant_type}.gguf",
-            f"*/*-{quant_type}-*.gguf",
-        ]
-
-        # Find matching files
-        matching_files = []
-        for pattern in patterns:
-            matches = fnmatch.filter(file_list, pattern)
-            matching_files.extend(matches)
-
-        if not matching_files:
-            logger.warning(
-                "No GGUF file found in %s with quant_type %s", repo_id, quant_type
-            )
-            return None
-
-        # Sort to ensure consistent ordering (prefer non-sharded files)
-        matching_files.sort(key=lambda x: (x.count("-"), x))
-        gguf_filename = matching_files[0]
-
-        return gguf_filename.replace(repo_id + "/", "", 1)
-    except (RepositoryNotFoundError, RevisionNotFoundError, HfHubHTTPError) as e:
-        logger.warning(
-            "Failed to get GGUF file path from HuggingFace Hub for %s "
-            "with quant_type %s: %s",
-            repo_id,
-            quant_type,
-            e,
-        )
-        return None
+    gguf_patterns = [
+        f"*-{quant_type}.gguf",
+        f"*-{quant_type}-*.gguf",
+        f"*/*-{quant_type}.gguf",
+        f"*/*-{quant_type}-*.gguf",
+    ]
+    matching_files = list_filtered_repo_files(
+        repo_id,
+        allow_patterns=gguf_patterns,
+        revision=revision,
+    )
+    
+    if len(matching_files) == 0:
+        raise ValueError(
+            f"Could not find GGUF file for repo {repo_id} "
+            f"with quantization {quant_type}."
+        )
+
+    # Sort to ensure consistent ordering (prefer non-sharded files)
+    matching_files.sort(key=lambda x: (x.count("-"), x))
+    gguf_filename = matching_files[0]
+    return gguf_filename
+
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 4d6200aea3de..929dc8bf481c 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -198,12 +198,7 @@ def get_tokenizer(
             quant_type,
             revision=revision,
         )
-        if gguf_file is None:
-            raise ValueError(
-                f"Could not find GGUF file for repo {tokenizer_name} "
-                f"with quantization {quant_type}."
-            )
         kwargs["gguf_file"] = gguf_file
 
     # if `tokenizer_mode` == "auto", check if tokenizer can be loaded via Mistral format
     # first to use official Mistral tokenizer if possible.

From ac41103b9e8c4acb731bf970333b00167e644d61 Mon Sep 17 00:00:00 2001
From: Injae Ryou
Date: Thu, 27 Nov 2025 23:06:11 +0900
Subject: [PATCH 5/6] chore: pre-commit

Signed-off-by: Injae Ryou
---
 vllm/transformers_utils/gguf_utils.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py
index 9e70ccaf4519..f727b1b4726b 100644
--- a/vllm/transformers_utils/gguf_utils.py
+++ b/vllm/transformers_utils/gguf_utils.py
@@ -2,12 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """GGUF utility functions."""
 
-import fnmatch
 from pathlib import Path
 
 import gguf
 from gguf.constants import Keys, VisionProjectorType
 from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig
+
 from vllm.logger import init_logger
 from vllm.transformers_utils.config import list_filtered_repo_files
 
@@ -194,15 +194,14 @@ def get_gguf_file_path_from_hf(
         allow_patterns=gguf_patterns,
         revision=revision,
     )
-    
+
     if len(matching_files) == 0:
         raise ValueError(
             f"Could not find GGUF file for repo {repo_id} "
             f"with quantization {quant_type}."
         )
 
     # Sort to ensure consistent ordering (prefer non-sharded files)
     matching_files.sort(key=lambda x: (x.count("-"), x))
     gguf_filename = matching_files[0]
     return gguf_filename
-

From 46f1b44dca7ef6550bb1172f82625b00fce6d178 Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Thu, 27 Nov 2025 22:37:28 +0800
Subject: [PATCH 6/6] raise invalid tokenizer error early for mm gguf

Signed-off-by: Isotr0py
---
 vllm/config/model.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/vllm/config/model.py b/vllm/config/model.py
index 33cb81773e1a..80e67d3e1c34 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -40,6 +40,7 @@
 )
 from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri
 from vllm.transformers_utils.utils import (
+    is_gguf,
     is_remote_gguf,
     maybe_model_redirect,
     split_remote_gguf,
@@ -692,6 +693,14 @@ def _task_to_convert(task: TaskOption) -> ConvertType:
 
         self.multimodal_config = MultiModalConfig(**mm_config_kwargs)
 
+        # Multimodal GGUF models must use the original repo for mm processing
+        if is_gguf(self.tokenizer) and self.is_multimodal_model:
+            raise ValueError(
+                "Loading a multimodal GGUF model requires the original "
+                "tokenizer. Please specify the unquantized HF model's "
+                "repo name or path using the --tokenizer argument."
+            )
+
         if self.disable_sliding_window:
             # Set after get_and_verify_max_len to ensure that max_model_len
             # can be correctly capped to sliding window size
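
Note for reviewers: the quant-type matching and the shard-preference ordering used by
get_gguf_file_path_from_hf can be exercised standalone with only the stdlib. A minimal
sketch follows; the repo listing below is hypothetical, not taken from a real repo:

import fnmatch

quant_type = "Q4_K_M"
# Hypothetical repo listing; sharded GGUF files carry "-NNNNN-of-NNNNN" parts.
files = [
    "model-F16.gguf",
    "model-Q4_K_M.gguf",
    "model-Q4_K_M-00001-of-00002.gguf",
    "model-Q4_K_M-00002-of-00002.gguf",
]
patterns = [
    f"*-{quant_type}.gguf",
    f"*-{quant_type}-*.gguf",
    f"*/*-{quant_type}.gguf",
    f"*/*-{quant_type}-*.gguf",
]
matches = [f for p in patterns for f in fnmatch.filter(files, p)]
# Fewer "-" separators means a non-sharded file, so it sorts first.
matches.sort(key=lambda x: (x.count("-"), x))
print(matches[0])  # -> model-Q4_K_M.gguf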