diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 4590b239212..f0c73adfb54 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4202,7 +4202,7 @@ def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor:
         return torch.stack([true_row, false_row], dim=0)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if "model.vision_" in name:
+        if "model.vision_" in name or "vision_tower" in name or "multi_modal_projector" in name or "image_newline" in name:
             # skip multimodal tensors
             return []
 
@@ -4286,6 +4286,71 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_mask_token_id(mask_token_id)
 
 
+@ModelBase.register("RForConditionalGeneration")
+class RVisionModel(MmprojModel):
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        # R model uses a 2-layer MLP projector similar to LFM2, but without patch merging
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
+        # R model doesn't use patch merging, so scale_factor=1
+        self.gguf_writer.add_vision_projector_scale_factor(1)
+        self.gguf_writer.add_vision_use_gelu(True)
+
+        # Add the preprocessor longest edge size
+        preproc_image_size = self.preprocessor_config.get("size", {}).get("longest_edge", self.image_size)
+        self.gguf_writer.add_vision_preproc_image_size(preproc_image_size)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+        is_vision_tensor = "vision_tower" in name or "vision_model" in name or "multi_modal_projector" in name or "image_newline" in name
+
+        if is_vision_tensor:
+            # Handle image_newline specifically (before stripping prefix)
+            if "image_newline" in name:
+                return [("model.image_newline", data_torch)]
+
+            # Strip the model. prefix if present
+            if name.startswith("model."):
+                name = name[6:]  # Remove "model."
+
+            # Map R model projector tensors to LFM2 format
+            if "multi_modal_projector" in name:
+                if "pre_norm.weight" in name:
+                    return [("mm.input_norm.weight", data_torch)]
+                elif "pre_norm.bias" in name:
+                    return [("mm.input_norm.bias", data_torch)]
+                elif "linear_1.weight" in name:
+                    return [("mm.1.weight", data_torch)]
+                elif "linear_1.bias" in name:
+                    return [("mm.1.bias", data_torch)]
+                elif "linear_2.weight" in name:
+                    return [("mm.2.weight", data_torch)]
+                elif "linear_2.bias" in name:
+                    return [("mm.2.bias", data_torch)]
+
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return []  # skip other tensors
+
+
+@ModelBase.register("RForConditionalGeneration")
+class RTextModel(Qwen3Model):
+    model_arch = gguf.MODEL_ARCH.QWEN3
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        # Skip vision tensors - they go in the mmproj file
+        if "vision_tower" in name or "vision_model" in name or "multi_modal_projector" in name or "image_newline" in name:
+            return []
+
+        # Strip model.language_model. prefix if present
+        if name.startswith("model.language_model."):
+            name = name.replace("model.language_model.", "model.")
+
+        # Use Qwen3 handling for text model tensors
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration")
 class Qwen3VLVisionModel(MmprojModel):
     def __init__(self, *args, **kwargs):
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 3ed08a0fec6..cb56ab68104 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -2444,6 +2444,11 @@ struct clip_graph {
     // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
     // support dynamic resolution
     ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor) {
+        // If scale_factor is 1, no merging is needed (e.g., for R model)
+        if (scale_factor == 1) {
+            return cur;
+        }
+
         GGML_ASSERT(scale_factor > 1);
 
         const int n_embd = cur->ne[0];
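
For reference (not part of the patch itself): a minimal standalone sketch of the HF -> GGUF rename table that RVisionModel.modify_tensors applies to the projector tensors, useful for sanity-checking the mapping in isolation. PROJECTOR_MAP and map_projector_name are hypothetical names introduced here for illustration only; the mapping pairs themselves are taken directly from the hunk above.

# Illustration only, not part of the patch: reproduces the projector
# tensor-name mapping from RVisionModel.modify_tensors as a pure function.
# PROJECTOR_MAP / map_projector_name are hypothetical helpers.

PROJECTOR_MAP = {
    "pre_norm.weight": "mm.input_norm.weight",
    "pre_norm.bias":   "mm.input_norm.bias",
    "linear_1.weight": "mm.1.weight",
    "linear_1.bias":   "mm.1.bias",
    "linear_2.weight": "mm.2.weight",
    "linear_2.bias":   "mm.2.bias",
}

def map_projector_name(name: str) -> str | None:
    # Mirror the patch's substring checks: only projector tensors are mapped.
    if "multi_modal_projector" not in name:
        return None
    for suffix, gguf_name in PROJECTOR_MAP.items():
        if suffix in name:
            return gguf_name
    return None

if __name__ == "__main__":
    assert map_projector_name("model.multi_modal_projector.linear_1.weight") == "mm.1.weight"
    assert map_projector_name("model.vision_tower.blocks.0.attn.qkv.weight") is None
    print("projector name mapping OK")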