Commit e2f15ef
Add support for R-4B multimodal model
This commit adds support for the R-4B model (YannQi/R-4B), a multimodal large language model with auto-thinking capabilities.

Changes:

- convert_hf_to_gguf.py: added RVisionModel and RTextModel classes to handle the R model architecture (RForConditionalGeneration)
  - RVisionModel uses the LFM2 projector type with scale_factor=1 (no patch merging)
  - RTextModel extends Qwen3Model for the language component
  - proper tensor name mapping for the projector (pre_norm, linear_1, linear_2)
- tools/mtmd/clip.cpp: modified build_patch_merge_permute() to support scale_factor=1, which skips patch merging for models that don't need it
  - the R model uses a SigLIP vision encoder with 729 tokens (27x27 patches)
  - projector: LayerNorm → Linear → GELU → Linear (no patch downsampling)

Architecture:

- Base text model: Qwen3-4B
- Vision encoder: SigLIP (384x384, patch size 14)
- Projector: 2-layer MLP with pre-normalization (no patch merging)
- Feature selection: full (keeps all 729 vision tokens)

Tested with llama-mtmd-cli; successfully generates English responses with Chinese internal reasoning (<think> tags).
1 parent 817d743 commit e2f15ef
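The projector described in the commit message is small enough to sketch end to end. Below is a minimal PyTorch rendering of the LayerNorm → Linear → GELU → Linear stack; the dimensions are assumptions for this sketch (a SigLIP hidden size of 1152 and a Qwen3-4B embedding width of 2560), not values read from the checkpoint:

import torch
import torch.nn as nn

class RProjectorSketch(nn.Module):
    # Illustrative sketch of the projector as described in the commit message:
    # pre-LayerNorm, then a 2-layer MLP with GELU, and no patch merging.
    # Both dimensions are assumptions for demonstration purposes only.
    def __init__(self, vision_dim: int = 1152, text_dim: int = 2560):
        super().__init__()
        self.pre_norm = nn.LayerNorm(vision_dim)         # -> mm.input_norm.{weight,bias}
        self.linear_1 = nn.Linear(vision_dim, text_dim)  # -> mm.1.{weight,bias}
        self.act = nn.GELU()
        self.linear_2 = nn.Linear(text_dim, text_dim)    # -> mm.2.{weight,bias}

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: [batch, 729, vision_dim]; all 27x27 SigLIP patch tokens are kept,
        # since scale_factor == 1 means no pixel-shuffle downsampling.
        return self.linear_2(self.act(self.linear_1(self.pre_norm(x))))

# A 384x384 input with patch size 14 gives 384 // 14 = 27 patches per side,
# so the projector sees 27 * 27 = 729 vision tokens per image.
x = torch.randn(1, 27 * 27, 1152)
print(RProjectorSketch()(x).shape)  # torch.Size([1, 729, 2560])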

File tree

convert_hf_to_gguf.py
tools/mtmd/clip.cpp

2 files changed: +71 −1 lines changed

convert_hf_to_gguf.py

Lines changed: 66 additions & 1 deletion
@@ -4202,7 +4202,7 @@ def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor:
         return torch.stack([true_row, false_row], dim=0)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if "model.vision_" in name:
+        if "model.vision_" in name or "vision_tower" in name or "multi_modal_projector" in name or "image_newline" in name:
             # skip multimodal tensors
             return []
 
@@ -4286,6 +4286,71 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_mask_token_id(mask_token_id)
 
 
+@ModelBase.register("RForConditionalGeneration")
+class RVisionModel(MmprojModel):
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        # R model uses a 2-layer MLP projector similar to LFM2, but without patch merging
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
+        # R model doesn't use patch merging, so scale_factor=1
+        self.gguf_writer.add_vision_projector_scale_factor(1)
+        self.gguf_writer.add_vision_use_gelu(True)
+
+        # Add the preprocessor longest edge size
+        preproc_image_size = self.preprocessor_config.get("size", {}).get("longest_edge", self.image_size)
+        self.gguf_writer.add_vision_preproc_image_size(preproc_image_size)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+        is_vision_tensor = "vision_tower" in name or "vision_model" in name or "multi_modal_projector" in name or "image_newline" in name
+
+        if is_vision_tensor:
+            # Handle image_newline specifically (before stripping prefix)
+            if "image_newline" in name:
+                return [("model.image_newline", data_torch)]
+
+            # Strip the model. prefix if present
+            if name.startswith("model."):
+                name = name[6:]  # Remove "model."
+
+            # Map R model projector tensors to LFM2 format
+            if "multi_modal_projector" in name:
+                if "pre_norm.weight" in name:
+                    return [("mm.input_norm.weight", data_torch)]
+                elif "pre_norm.bias" in name:
+                    return [("mm.input_norm.bias", data_torch)]
+                elif "linear_1.weight" in name:
+                    return [("mm.1.weight", data_torch)]
+                elif "linear_1.bias" in name:
+                    return [("mm.1.bias", data_torch)]
+                elif "linear_2.weight" in name:
+                    return [("mm.2.weight", data_torch)]
+                elif "linear_2.bias" in name:
+                    return [("mm.2.bias", data_torch)]
+
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return []  # skip other tensors
+
+
+@ModelBase.register("RForConditionalGeneration")
+class RTextModel(Qwen3Model):
+    model_arch = gguf.MODEL_ARCH.QWEN3
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        # Skip vision tensors - they go in the mmproj file
+        if "vision_tower" in name or "vision_model" in name or "multi_modal_projector" in name or "image_newline" in name:
+            return []
+
+        # Strip model.language_model. prefix if present
+        if name.startswith("model.language_model."):
+            name = name.replace("model.language_model.", "model.")
+
+        # Use Qwen3 handling for text model tensors
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration")
 class Qwen3VLVisionModel(MmprojModel):
     def __init__(self, *args, **kwargs):
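To make the tensor renaming above concrete, here is a standalone sketch of the projector name mapping. The helper map_projector_name and the lookup table are hypothetical; in the diff the equivalent logic lives inline in RVisionModel.modify_tensors:

# Standalone sketch of the projector renaming done in RVisionModel.modify_tensors.
# The function name and mapping table are illustrative, not part of the actual change.
_PROJECTOR_MAP = {
    "pre_norm.weight": "mm.input_norm.weight",
    "pre_norm.bias":   "mm.input_norm.bias",
    "linear_1.weight": "mm.1.weight",
    "linear_1.bias":   "mm.1.bias",
    "linear_2.weight": "mm.2.weight",
    "linear_2.bias":   "mm.2.bias",
}

def map_projector_name(name: str) -> str | None:
    if name.startswith("model."):
        name = name[6:]  # same prefix strip as in the diff
    if "multi_modal_projector" not in name:
        return None
    for suffix, gguf_name in _PROJECTOR_MAP.items():
        if name.endswith(suffix):
            return gguf_name
    return None

print(map_projector_name("model.multi_modal_projector.linear_1.weight"))  # mm.1.weight
print(map_projector_name("model.multi_modal_projector.pre_norm.bias"))    # mm.input_norm.bias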

tools/mtmd/clip.cpp

Lines changed: 5 additions & 0 deletions
@@ -2444,6 +2444,11 @@ struct clip_graph {
     // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
     // support dynamic resolution
     ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor) {
+        // If scale_factor is 1, no merging is needed (e.g., for R model)
+        if (scale_factor == 1) {
+            return cur;
+        }
+
         GGML_ASSERT(scale_factor > 1);
 
         const int n_embd = cur->ne[0];
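For intuition about what the new early return skips: build_patch_merge_permute performs a pixel-shuffle style merge that folds each scale_factor x scale_factor block of patches into a single token with scale_factor^2 * n_embd channels, so scale_factor == 1 is simply an identity. A rough PyTorch sketch of the same reshape, assuming a square, evenly divisible patch grid (the ggml version also handles dynamic resolution, which this sketch does not):

import torch

def patch_merge_sketch(x: torch.Tensor, scale_factor: int) -> torch.Tensor:
    # x: [n_tokens, n_embd], a patch grid flattened row-major (n_tokens = side * side).
    # Mirrors the early return added to build_patch_merge_permute: with
    # scale_factor == 1 the input passes through unchanged.
    if scale_factor == 1:
        return x
    n_tokens, n_embd = x.shape
    side = int(n_tokens ** 0.5)  # assume a square patch grid
    s = scale_factor
    # Split rows and columns into blocks of s, then gather each s*s block
    # into one token with s * s * n_embd channels.
    x = x.view(side // s, s, side // s, s, n_embd)
    x = x.permute(0, 2, 1, 3, 4).reshape((side // s) ** 2, s * s * n_embd)
    return x

tokens = torch.randn(729, 1152)             # 27x27 SigLIP grid, as in the R model
print(patch_merge_sketch(tokens, 1).shape)  # torch.Size([729, 1152]), unchanged
grid = torch.randn(676, 1152)               # 26x26 grid, divisible by 2
print(patch_merge_sketch(grid, 2).shape)    # torch.Size([169, 4608])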
