From de8e3280d02f904173863db9a6cf149c47e0f3ba Mon Sep 17 00:00:00 2001
From: Injae Ryou
Date: Thu, 27 Nov 2025 05:08:42 +0000
Subject: [PATCH 1/6] feat: improve tokenizer loading when loading gguf models

Signed-off-by: Injae Ryou
---
 vllm/config/model.py                  |  8 ----
 vllm/transformers_utils/gguf_utils.py | 60 +++++++++++++++++++++++++++
 vllm/transformers_utils/tokenizer.py  |  9 +++-
 3 files changed, 68 insertions(+), 9 deletions(-)

diff --git a/vllm/config/model.py b/vllm/config/model.py
index 25972f097f53..33cb81773e1a 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -40,7 +40,6 @@
 )
 from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri
 from vllm.transformers_utils.utils import (
-    is_gguf,
     is_remote_gguf,
     maybe_model_redirect,
     split_remote_gguf,
@@ -440,13 +439,6 @@ def __post_init__(
         self.model = maybe_model_redirect(self.model)
         # The tokenizer is consistent with the model by default.
         if self.tokenizer is None:
-            # Check if this is a GGUF model (either local file or remote GGUF)
-            if is_gguf(self.model):
-                raise ValueError(
-                    "Using a tokenizer is mandatory when loading a GGUF model. "
-                    "Please specify the tokenizer path or name using the "
-                    "--tokenizer argument."
-                )
             self.tokenizer = self.model
         if self.tokenizer_revision is None:
             self.tokenizer_revision = self.revision
diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py
index 2bf59c91a3bb..3fde14baf489 100644
--- a/vllm/transformers_utils/gguf_utils.py
+++ b/vllm/transformers_utils/gguf_utils.py
@@ -2,10 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """GGUF utility functions."""
 
+import fnmatch
 from pathlib import Path
 
 import gguf
 from gguf.constants import Keys, VisionProjectorType
+from huggingface_hub import HfFileSystem
 from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig
 
 from vllm.logger import init_logger
@@ -164,3 +166,61 @@ def maybe_patch_hf_config_from_gguf(
         hf_config = new_hf_config
 
     return hf_config
+
+
+def get_gguf_file_path_from_hf(
+    repo_id: str | Path,
+    quant_type: str,
+    revision: str | None = None,
+) -> str | None:
+    """Get the GGUF file path from HuggingFace Hub based on repo_id and quant_type.
+
+    Args:
+        repo_id: The HuggingFace repository ID (e.g., "Qwen/Qwen3-0.6B")
+        quant_type: The quantization type (e.g., "Q4_K_M", "F16")
+        revision: Optional revision/branch name
+
+    Returns:
+        The path to the GGUF file on HuggingFace Hub (e.g., "filename.gguf"),
+        or None if not found
+    """
+    repo_id = str(repo_id)
+    try:
+        fs = HfFileSystem()
+        # List all files in the repository
+        file_list = fs.ls(repo_id, detail=False, revision=revision)
+
+        # Patterns to match GGUF files with the quant_type
+        patterns = [
+            f"*-{quant_type}.gguf",
+            f"*-{quant_type}-*.gguf",
+            f"*/*-{quant_type}.gguf",
+            f"*/*-{quant_type}-*.gguf",
+        ]
+
+        # Find matching files
+        matching_files = []
+        for pattern in patterns:
+            matches = fnmatch.filter(file_list, pattern)
+            matching_files.extend(matches)
+
+        if not matching_files:
+            logger.warning(
+                "No GGUF file found in %s with quant_type %s", repo_id, quant_type
+            )
+            return None
+
+        # Sort to ensure consistent ordering (prefer non-sharded files)
+        matching_files.sort(key=lambda x: (x.count("-"), x))
+        gguf_filename = matching_files[0]
+
+        return gguf_filename.replace(repo_id + "/", "", 1)
+    except Exception as e:
+        logger.warning(
+            "Failed to get GGUF file path from HuggingFace Hub for %s "
+            "with quant_type %s: %s",
+            repo_id,
+            quant_type,
+            e,
+        )
+        return None
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index f0e0ba8ef424..fd69ff97836b 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -19,6 +19,7 @@
     get_sentence_transformer_tokenizer_config,
     list_filtered_repo_files,
 )
+from vllm.transformers_utils.gguf_utils import get_gguf_file_path_from_hf
 from vllm.transformers_utils.tokenizers import MistralTokenizer
 from vllm.transformers_utils.utils import (
     check_gguf_file,
@@ -190,7 +191,13 @@ def get_tokenizer(
         kwargs["gguf_file"] = Path(tokenizer_name).name
         tokenizer_name = Path(tokenizer_name).parent
     elif is_remote_gguf(tokenizer_name):
-        tokenizer_name, _ = split_remote_gguf(tokenizer_name)
+        tokenizer_name, quant_type = split_remote_gguf(tokenizer_name)
+        # Get the HuggingFace Hub path for the GGUF file
+        kwargs["gguf_file"] = get_gguf_file_path_from_hf(
+            tokenizer_name,
+            quant_type,
+            revision=revision,
+        )
 
     # if `tokenizer_mode` == "auto", check if tokenizer can be loaded via Mistral format
     # first to use official Mistral tokenizer if possible.

From 9ee0a0c2bbfb826f29f0de795d757f26a1f77df2 Mon Sep 17 00:00:00 2001
From: Injae Ryou
Date: Thu, 27 Nov 2025 14:16:01 +0900
Subject: [PATCH 2/6] raise error when remote gguf file is None

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: Injae Ryou
---
 vllm/transformers_utils/tokenizer.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index fd69ff97836b..4d6200aea3de 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -193,11 +193,17 @@ def get_tokenizer(
     elif is_remote_gguf(tokenizer_name):
         tokenizer_name, quant_type = split_remote_gguf(tokenizer_name)
         # Get the HuggingFace Hub path for the GGUF file
-        kwargs["gguf_file"] = get_gguf_file_path_from_hf(
+        gguf_file = get_gguf_file_path_from_hf(
             tokenizer_name,
             quant_type,
             revision=revision,
         )
+        if gguf_file is None:
+            raise ValueError(
+                f"Could not find GGUF file for repo {tokenizer_name} "
+                f"with quantization {quant_type}."
+            )
+        kwargs["gguf_file"] = gguf_file
 
     # if `tokenizer_mode` == "auto", check if tokenizer can be loaded via Mistral format
     # first to use official Mistral tokenizer if possible.

From 34f3d15aa64a83571c1653cf88394c3d40d363fe Mon Sep 17 00:00:00 2001
From: Injae Ryou
Date: Thu, 27 Nov 2025 05:19:50 +0000
Subject: [PATCH 3/6] refactor: catch specific huggingface_hub errors

Signed-off-by: Injae Ryou
---
 vllm/transformers_utils/gguf_utils.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py
index 3fde14baf489..367e2ab2208d 100644
--- a/vllm/transformers_utils/gguf_utils.py
+++ b/vllm/transformers_utils/gguf_utils.py
@@ -8,6 +8,11 @@
 import gguf
 from gguf.constants import Keys, VisionProjectorType
 from huggingface_hub import HfFileSystem
+from huggingface_hub.errors import (
+    HfHubHTTPError,
+    RepositoryNotFoundError,
+    RevisionNotFoundError,
+)
 from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig
 
 from vllm.logger import init_logger
@@ -215,7 +220,7 @@ def get_gguf_file_path_from_hf(
         gguf_filename = matching_files[0]
 
         return gguf_filename.replace(repo_id + "/", "", 1)
-    except Exception as e:
+    except (RepositoryNotFoundError, RevisionNotFoundError, HfHubHTTPError) as e:
         logger.warning(
             "Failed to get GGUF file path from HuggingFace Hub for %s "
             "with quant_type %s: %s",

From bc4e6b273d3f90af0266276b2cce4eaf6efc8f2e Mon Sep 17 00:00:00 2001
From: Injae Ryou
Date: Thu, 27 Nov 2025 22:57:08 +0900
Subject: [PATCH 4/6] refactor: reuse existing function - list_filtered_repo_files

Signed-off-by: Injae Ryou
---
 vllm/transformers_utils/gguf_utils.py | 69 ++++++++++-----------------
 vllm/transformers_utils/tokenizer.py  |  5 --
 2 files changed, 23 insertions(+), 51 deletions(-)

diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py
index 367e2ab2208d..9e70ccaf4519 100644
--- a/vllm/transformers_utils/gguf_utils.py
+++ b/vllm/transformers_utils/gguf_utils.py
@@ -7,15 +7,9 @@
 
 import gguf
 from gguf.constants import Keys, VisionProjectorType
-from huggingface_hub import HfFileSystem
-from huggingface_hub.errors import (
-    HfHubHTTPError,
-    RepositoryNotFoundError,
-    RevisionNotFoundError,
-)
 from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig
-
 from vllm.logger import init_logger
+from vllm.transformers_utils.config import list_filtered_repo_files
 
 logger = init_logger(__name__)
 
@@ -177,55 +171,38 @@ def get_gguf_file_path_from_hf(
     repo_id: str | Path,
     quant_type: str,
     revision: str | None = None,
-) -> str | None:
+) -> str:
     """Get the GGUF file path from HuggingFace Hub based on repo_id and quant_type.
 
     Args:
         repo_id: The HuggingFace repository ID (e.g., "Qwen/Qwen3-0.6B")
         quant_type: The quantization type (e.g., "Q4_K_M", "F16")
         revision: Optional revision/branch name
 
     Returns:
-        The path to the GGUF file on HuggingFace Hub (e.g., "filename.gguf"),
-        or None if not found
+        The path to the GGUF file on HuggingFace Hub (e.g., "filename.gguf").
     """
     repo_id = str(repo_id)
-    try:
-        fs = HfFileSystem()
-        # List all files in the repository
-        file_list = fs.ls(repo_id, detail=False, revision=revision)
-
-        # Patterns to match GGUF files with the quant_type
-        patterns = [
-            f"*-{quant_type}.gguf",
-            f"*-{quant_type}-*.gguf",
-            f"*/*-{quant_type}.gguf",
-            f"*/*-{quant_type}-*.gguf",
-        ]
-
-        # Find matching files
-        matching_files = []
-        for pattern in patterns:
-            matches = fnmatch.filter(file_list, pattern)
-            matching_files.extend(matches)
-
-        if not matching_files:
-            logger.warning(
-                "No GGUF file found in %s with quant_type %s", repo_id, quant_type
-            )
-            return None
-
-        # Sort to ensure consistent ordering (prefer non-sharded files)
-        matching_files.sort(key=lambda x: (x.count("-"), x))
-        gguf_filename = matching_files[0]
-
-        return gguf_filename.replace(repo_id + "/", "", 1)
-    except (RepositoryNotFoundError, RevisionNotFoundError, HfHubHTTPError) as e:
-        logger.warning(
-            "Failed to get GGUF file path from HuggingFace Hub for %s "
-            "with quant_type %s: %s",
-            repo_id,
-            quant_type,
-            e,
-        )
-        return None
+    gguf_patterns = [
+        f"*-{quant_type}.gguf",
+        f"*-{quant_type}-*.gguf",
+        f"*/*-{quant_type}.gguf",
+        f"*/*-{quant_type}-*.gguf",
+    ]
+    matching_files = list_filtered_repo_files(
+        repo_id,
+        allow_patterns=gguf_patterns,
+        revision=revision,
+    )
+    
+    if len(matching_files) == 0:
+        raise ValueError(
+            f"Could not find GGUF file for repo {repo_id} "
+            f"with quantization {quant_type}."
+        )
+
+    # Sort to ensure consistent ordering (prefer non-sharded files)
+    matching_files.sort(key=lambda x: (x.count("-"), x))
+    gguf_filename = matching_files[0]
+    return gguf_filename
+
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 4d6200aea3de..929dc8bf481c 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -198,12 +198,7 @@ def get_tokenizer(
             quant_type,
             revision=revision,
         )
-        if gguf_file is None:
-            raise ValueError(
-                f"Could not find GGUF file for repo {tokenizer_name} "
-                f"with quantization {quant_type}."
-            )
         kwargs["gguf_file"] = gguf_file
 
     # if `tokenizer_mode` == "auto", check if tokenizer can be loaded via Mistral format
     # first to use official Mistral tokenizer if possible.

From ac41103b9e8c4acb731bf970333b00167e644d61 Mon Sep 17 00:00:00 2001
From: Injae Ryou
Date: Thu, 27 Nov 2025 23:06:11 +0900
Subject: [PATCH 5/6] chore: pre-commit

Signed-off-by: Injae Ryou
---
 vllm/transformers_utils/gguf_utils.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py
index 9e70ccaf4519..f727b1b4726b 100644
--- a/vllm/transformers_utils/gguf_utils.py
+++ b/vllm/transformers_utils/gguf_utils.py
@@ -2,12 +2,12 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """GGUF utility functions."""
 
-import fnmatch
 from pathlib import Path
 
 import gguf
 from gguf.constants import Keys, VisionProjectorType
 from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig
+
 from vllm.logger import init_logger
 from vllm.transformers_utils.config import list_filtered_repo_files
 
@@ -194,15 +194,14 @@ def get_gguf_file_path_from_hf(
         allow_patterns=gguf_patterns,
         revision=revision,
     )
-    
+
     if len(matching_files) == 0:
         raise ValueError(
             f"Could not find GGUF file for repo {repo_id} "
             f"with quantization {quant_type}."
         )
 
     # Sort to ensure consistent ordering (prefer non-sharded files)
     matching_files.sort(key=lambda x: (x.count("-"), x))
     gguf_filename = matching_files[0]
     return gguf_filename
-

From 46f1b44dca7ef6550bb1172f82625b00fce6d178 Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Thu, 27 Nov 2025 22:37:28 +0800
Subject: [PATCH 6/6] raise invalid tokenizer error early for mm gguf

Signed-off-by: Isotr0py
---
 vllm/config/model.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/vllm/config/model.py b/vllm/config/model.py
index 33cb81773e1a..80e67d3e1c34 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -40,6 +40,7 @@
 )
 from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri
 from vllm.transformers_utils.utils import (
+    is_gguf,
     is_remote_gguf,
     maybe_model_redirect,
     split_remote_gguf,
@@ -692,6 +693,14 @@ def _task_to_convert(task: TaskOption) -> ConvertType:
 
         self.multimodal_config = MultiModalConfig(**mm_config_kwargs)
 
+        # Multimodal GGUF models must use the original repo for mm processing
+        if is_gguf(self.tokenizer) and self.is_multimodal_model:
+            raise ValueError(
+                "Loading a multimodal GGUF model requires the original "
+                "tokenizer. Please specify the unquantized HF model's "
+                "repo name or path using the --tokenizer argument."
+            )
+
         if self.disable_sliding_window:
             # Set after get_and_verify_max_len to ensure that max_model_len
             # can be correctly capped to sliding window size
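
Note for reviewers: the quant-type matching and the shard-preference ordering used by
get_gguf_file_path_from_hf can be exercised standalone with only the stdlib. A minimal
sketch follows; the repo listing below is hypothetical, not taken from a real repo:

import fnmatch

quant_type = "Q4_K_M"
# Hypothetical repo listing; sharded GGUF files carry "-NNNNN-of-NNNNN" parts.
files = [
    "model-F16.gguf",
    "model-Q4_K_M.gguf",
    "model-Q4_K_M-00001-of-00002.gguf",
    "model-Q4_K_M-00002-of-00002.gguf",
]
patterns = [
    f"*-{quant_type}.gguf",
    f"*-{quant_type}-*.gguf",
    f"*/*-{quant_type}.gguf",
    f"*/*-{quant_type}-*.gguf",
]
matches = [f for p in patterns for f in fnmatch.filter(files, p)]
# Fewer "-" separators means a non-sharded file, so it sorts first.
matches.sort(key=lambda x: (x.count("-"), x))
print(matches[0])  # -> model-Q4_K_M.gguf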