diff --git a/vllm/config/model.py b/vllm/config/model.py
index 84311596b660..c6601f9de519 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -440,13 +440,6 @@ def __post_init__(
         self.model = maybe_model_redirect(self.model)
         # The tokenizer is consistent with the model by default.
         if self.tokenizer is None:
-            # Check if this is a GGUF model (either local file or remote GGUF)
-            if is_gguf(self.model):
-                raise ValueError(
-                    "Using a tokenizer is mandatory when loading a GGUF model. "
-                    "Please specify the tokenizer path or name using the "
-                    "--tokenizer argument."
-                )
             self.tokenizer = self.model
         if self.tokenizer_revision is None:
             self.tokenizer_revision = self.revision
@@ -700,6 +693,14 @@ def _task_to_convert(task: TaskOption) -> ConvertType:
 
             self.multimodal_config = MultiModalConfig(**mm_config_kwargs)
 
+        # Multimodal GGUF models must use the original repo for mm processing
+        if is_gguf(self.tokenizer) and self.is_multimodal_model:
+            raise ValueError(
+                "Loading a multimodal GGUF model requires the original "
+                "tokenizer. Please specify the unquantized HF model's "
+                "repo name or path using the --tokenizer argument."
+            )
+
         if self.disable_sliding_window:
             # Set after get_and_verify_max_len to ensure that max_model_len
             # can be correctly capped to sliding window size
diff --git a/vllm/transformers_utils/gguf_utils.py b/vllm/transformers_utils/gguf_utils.py
index 2bf59c91a3bb..f727b1b4726b 100644
--- a/vllm/transformers_utils/gguf_utils.py
+++ b/vllm/transformers_utils/gguf_utils.py
@@ -9,6 +9,7 @@
 from transformers import Gemma3Config, PretrainedConfig, SiglipVisionConfig
 
 from vllm.logger import init_logger
+from vllm.transformers_utils.config import list_filtered_repo_files
 
 logger = init_logger(__name__)
 
@@ -164,3 +165,43 @@ def maybe_patch_hf_config_from_gguf(
         hf_config = new_hf_config
 
     return hf_config
+
+
+def get_gguf_file_path_from_hf(
+    repo_id: str | Path,
+    quant_type: str,
+    revision: str | None = None,
+) -> str:
+    """Get the GGUF file path from HuggingFace Hub for repo_id and quant_type.
+
+    Args:
+        repo_id: The HuggingFace repository ID (e.g., "Qwen/Qwen3-0.6B")
+        quant_type: The quantization type (e.g., "Q4_K_M", "F16")
+        revision: Optional revision/branch name
+
+    Returns:
+        The path to the GGUF file on HuggingFace Hub (e.g., "filename.gguf").
+    """
+    repo_id = str(repo_id)
+    gguf_patterns = [
+        f"*-{quant_type}.gguf",
+        f"*-{quant_type}-*.gguf",
+        f"*/*-{quant_type}.gguf",
+        f"*/*-{quant_type}-*.gguf",
+    ]
+    matching_files = list_filtered_repo_files(
+        repo_id,
+        allow_patterns=gguf_patterns,
+        revision=revision,
+    )
+
+    if len(matching_files) == 0:
+        raise ValueError(
+            f"Could not find GGUF file for repo {repo_id} "
+            f"with quantization {quant_type}."
+        )
+
+    # Sort to ensure consistent ordering (prefer non-sharded files)
+    matching_files.sort(key=lambda x: (x.count("-"), x))
+    gguf_filename = matching_files[0]
+    return gguf_filename
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index f0e0ba8ef424..929dc8bf481c 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -19,6 +19,7 @@
     get_sentence_transformer_tokenizer_config,
     list_filtered_repo_files,
 )
+from vllm.transformers_utils.gguf_utils import get_gguf_file_path_from_hf
 from vllm.transformers_utils.tokenizers import MistralTokenizer
 from vllm.transformers_utils.utils import (
     check_gguf_file,
@@ -190,7 +191,14 @@ def get_tokenizer(
         kwargs["gguf_file"] = Path(tokenizer_name).name
         tokenizer_name = Path(tokenizer_name).parent
     elif is_remote_gguf(tokenizer_name):
-        tokenizer_name, _ = split_remote_gguf(tokenizer_name)
+        tokenizer_name, quant_type = split_remote_gguf(tokenizer_name)
+        # Get the HuggingFace Hub path for the GGUF file
+        gguf_file = get_gguf_file_path_from_hf(
+            tokenizer_name,
+            quant_type,
+            revision=revision,
+        )
+        kwargs["gguf_file"] = gguf_file
 
     # if `tokenizer_mode` == "auto", check if tokenizer can be loaded via Mistral format
     # first to use official Mistral tokenizer if possible.
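
Usage sketch of the new helper added above (not part of the diff): the repo name and quantization suffix below are hypothetical, and Hub access via list_filtered_repo_files is assumed to succeed.

    from vllm.transformers_utils.gguf_utils import get_gguf_file_path_from_hf

    # Hypothetical repo/quant pair; any repo publishing files matching
    # "*-Q4_K_M.gguf" would work the same way.
    gguf_file = get_gguf_file_path_from_hf("some-org/some-model-GGUF", "Q4_K_M")

    # get_tokenizer() then forwards this filename as kwargs["gguf_file"] so the
    # tokenizer can be loaded directly from the GGUF repo.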