Commit e2f15ef
Add support for R-4B multimodal model
This commit adds support for the R-4B model (YannQi/R-4B), a multimodal large language model with auto-thinking capabilities.

Changes:

- convert_hf_to_gguf.py: added RVisionModel and RTextModel classes to handle the R model architecture (RForConditionalGeneration)
  - RVisionModel uses the LFM2 projector type with scale_factor=1 (no patch merging)
  - RTextModel extends Qwen3Model for the language component
  - proper tensor name mapping for the projector (pre_norm, linear_1, linear_2)
- tools/mtmd/clip.cpp: modified build_patch_merge_permute() to support scale_factor=1, which skips patch merging for models that don't need it
  - the R model uses a SigLIP vision encoder with 729 tokens (27x27 patches)
  - projector: LayerNorm → Linear → GELU → Linear (no patch downsampling)

Architecture:

- Base text model: Qwen3-4B
- Vision encoder: SigLIP (384x384, patch size 14)
- Projector: 2-layer MLP with pre-normalization (no patch merging)
- Feature selection: full (keeps all 729 vision tokens)

Tested with llama-mtmd-cli; successfully generates English responses with Chinese internal reasoning (<think> tags).
1 parent 817d743 commit e2f15ef
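The projector described in the commit message is small enough to sketch end to end. Below is a minimal PyTorch rendering of the LayerNorm → Linear → GELU → Linear stack; the dimensions are assumptions for this sketch (a SigLIP hidden size of 1152 and a Qwen3-4B embedding width of 2560), not values read from the checkpoint:

import torch
import torch.nn as nn

class RProjectorSketch(nn.Module):
    # Illustrative sketch of the projector as described in the commit message:
    # pre-LayerNorm, then a 2-layer MLP with GELU, and no patch merging.
    # Both dimensions are assumptions for demonstration purposes only.
    def __init__(self, vision_dim: int = 1152, text_dim: int = 2560):
        super().__init__()
        self.pre_norm = nn.LayerNorm(vision_dim)         # -> mm.input_norm.{weight,bias}
        self.linear_1 = nn.Linear(vision_dim, text_dim)  # -> mm.1.{weight,bias}
        self.act = nn.GELU()
        self.linear_2 = nn.Linear(text_dim, text_dim)    # -> mm.2.{weight,bias}

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: [batch, 729, vision_dim]; all 27x27 SigLIP patch tokens are kept,
        # since scale_factor == 1 means no pixel-shuffle downsampling.
        return self.linear_2(self.act(self.linear_1(self.pre_norm(x))))

# A 384x384 input with patch size 14 gives 384 // 14 = 27 patches per side,
# so the projector sees 27 * 27 = 729 vision tokens per image.
x = torch.randn(1, 27 * 27, 1152)
print(RProjectorSketch()(x).shape)  # torch.Size([1, 729, 2560])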

File tree

convert_hf_to_gguf.py
tools/mtmd/clip.cpp

2 files changed: +71 −1 lines changed

convert_hf_to_gguf.py

Lines changed: 66 additions & 1 deletion
@@ -4202,7 +4202,7 @@ def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor:
         return torch.stack([true_row, false_row], dim=0)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if "model.vision_" in name:
+        if "model.vision_" in name or "vision_tower" in name or "multi_modal_projector" in name or "image_newline" in name:
             # skip multimodal tensors
             return []
 
@@ -4286,6 +4286,71 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_mask_token_id(mask_token_id)
 
 
+@ModelBase.register("RForConditionalGeneration")
+class RVisionModel(MmprojModel):
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        # R model uses a 2-layer MLP projector similar to LFM2, but without patch merging
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2)
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
+        # R model doesn't use patch merging, so scale_factor=1
+        self.gguf_writer.add_vision_projector_scale_factor(1)
+        self.gguf_writer.add_vision_use_gelu(True)
+
+        # Add the preprocessor longest edge size
+        preproc_image_size = self.preprocessor_config.get("size", {}).get("longest_edge", self.image_size)
+        self.gguf_writer.add_vision_preproc_image_size(preproc_image_size)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+        is_vision_tensor = "vision_tower" in name or "vision_model" in name or "multi_modal_projector" in name or "image_newline" in name
+
+        if is_vision_tensor:
+            # Handle image_newline specifically (before stripping prefix)
+            if "image_newline" in name:
+                return [("model.image_newline", data_torch)]
+
+            # Strip the model. prefix if present
+            if name.startswith("model."):
+                name = name[6:]  # Remove "model."
+
+            # Map R model projector tensors to LFM2 format
+            if "multi_modal_projector" in name:
+                if "pre_norm.weight" in name:
+                    return [("mm.input_norm.weight", data_torch)]
+                elif "pre_norm.bias" in name:
+                    return [("mm.input_norm.bias", data_torch)]
+                elif "linear_1.weight" in name:
+                    return [("mm.1.weight", data_torch)]
+                elif "linear_1.bias" in name:
+                    return [("mm.1.bias", data_torch)]
+                elif "linear_2.weight" in name:
+                    return [("mm.2.weight", data_torch)]
+                elif "linear_2.bias" in name:
+                    return [("mm.2.bias", data_torch)]
+
+            return [(self.map_tensor_name(name), data_torch)]
+
+        return []  # skip other tensors
+
+
+@ModelBase.register("RForConditionalGeneration")
+class RTextModel(Qwen3Model):
+    model_arch = gguf.MODEL_ARCH.QWEN3
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        # Skip vision tensors - they go in the mmproj file
+        if "vision_tower" in name or "vision_model" in name or "multi_modal_projector" in name or "image_newline" in name:
+            return []
+
+        # Strip model.language_model. prefix if present
+        if name.startswith("model.language_model."):
+            name = name.replace("model.language_model.", "model.")
+
+        # Use Qwen3 handling for text model tensors
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration")
 class Qwen3VLVisionModel(MmprojModel):
     def __init__(self, *args, **kwargs):
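To make the tensor renaming above concrete, here is a standalone sketch of the projector name mapping. The helper map_projector_name and the lookup table are hypothetical; in the diff the equivalent logic lives inline in RVisionModel.modify_tensors:

# Standalone sketch of the projector renaming done in RVisionModel.modify_tensors.
# The function name and mapping table are illustrative, not part of the actual change.
_PROJECTOR_MAP = {
    "pre_norm.weight": "mm.input_norm.weight",
    "pre_norm.bias":   "mm.input_norm.bias",
    "linear_1.weight": "mm.1.weight",
    "linear_1.bias":   "mm.1.bias",
    "linear_2.weight": "mm.2.weight",
    "linear_2.bias":   "mm.2.bias",
}

def map_projector_name(name: str) -> str | None:
    if name.startswith("model."):
        name = name[6:]  # same prefix strip as in the diff
    if "multi_modal_projector" not in name:
        return None
    for suffix, gguf_name in _PROJECTOR_MAP.items():
        if name.endswith(suffix):
            return gguf_name
    return None

print(map_projector_name("model.multi_modal_projector.linear_1.weight"))  # mm.1.weight
print(map_projector_name("model.multi_modal_projector.pre_norm.bias"))    # mm.input_norm.bias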

tools/mtmd/clip.cpp

Lines changed: 5 additions & 0 deletions
@@ -2444,6 +2444,11 @@ struct clip_graph {
     // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
     // support dynamic resolution
     ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor) {
+        // If scale_factor is 1, no merging is needed (e.g., for R model)
+        if (scale_factor == 1) {
+            return cur;
+        }
+
         GGML_ASSERT(scale_factor > 1);
 
         const int n_embd = cur->ne[0];
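For intuition about what the new early return skips: build_patch_merge_permute performs a pixel-shuffle style merge that folds each scale_factor x scale_factor block of patches into a single token with scale_factor^2 * n_embd channels, so scale_factor == 1 is simply an identity. A rough PyTorch sketch of the same reshape, assuming a square, evenly divisible patch grid (the ggml version also handles dynamic resolution, which this sketch does not):

import torch

def patch_merge_sketch(x: torch.Tensor, scale_factor: int) -> torch.Tensor:
    # x: [n_tokens, n_embd], a patch grid flattened row-major (n_tokens = side * side).
    # Mirrors the early return added to build_patch_merge_permute: with
    # scale_factor == 1 the input passes through unchanged.
    if scale_factor == 1:
        return x
    n_tokens, n_embd = x.shape
    side = int(n_tokens ** 0.5)  # assume a square patch grid
    s = scale_factor
    # Split rows and columns into blocks of s, then gather each s*s block
    # into one token with s * s * n_embd channels.
    x = x.view(side // s, s, side // s, s, n_embd)
    x = x.permute(0, 2, 1, 3, 4).reshape((side // s) ** 2, s * s * n_embd)
    return x

tokens = torch.randn(729, 1152)             # 27x27 SigLIP grid, as in the R model
print(patch_merge_sketch(tokens, 1).shape)  # torch.Size([729, 1152]), unchanged
grid = torch.randn(676, 1152)               # 26x26 grid, divisible by 2
print(patch_merge_sketch(grid, 2).shape)    # torch.Size([169, 4608])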
