From 3e623b91a5aa98e660bbc06cd894ffd69919fb5d Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 3 Dec 2025 12:36:45 +0100
Subject: [PATCH 1/9] convert: support Mistral 3 Large MoE

---
 convert_hf_to_gguf.py | 129 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 119 insertions(+), 10 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 8ddb6d04cd9..27b0e2836c1 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -9912,17 +9912,124 @@ def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mis
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        if "yarn" in self.hparams:
-            yarn_params = self.hparams["yarn"]
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(yarn_params["factor"])
-            self.gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_params["beta"])
-            self.gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_params["alpha"])
-            self.gguf_writer.add_rope_scaling_yarn_log_mul(1.0)  # mscale_all_dim
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"])
+        MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
+
+    @staticmethod
+    def set_mistral_config(gguf_writer: gguf.GGUFWriter, hparams: dict):
+        if "yarn" in hparams:
+            yarn_params = hparams["yarn"]
+            gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            gguf_writer.add_rope_scaling_factor(yarn_params["factor"])
+            gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_params["beta"])
+            gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_params["alpha"])
+            gguf_writer.add_rope_scaling_yarn_log_mul(1.0)  # mscale_all_dim
+            gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"])
+
+        if "llama_4_scaling" in hparams:
+            gguf_writer.add_attn_temperature_scale(hparams["llama_4_scaling"]["beta"])
+
+
+class MistralMoeModel(DeepseekV2Model):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK2
+    model_name = "Mistral"
+    hf_arch = ""
+    is_mistral_format = True
+    undo_permute = False
 
-        if "llama_4_scaling" in self.hparams:
-            self.gguf_writer.add_attn_temperature_scale(self.hparams["llama_4_scaling"]["beta"])
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        logger.info("Using MistralMoeModel")
+        # ref: https://github.com/vllm-project/vllm/blob/b294e28db2c5dee61bc25157664edcada8b90b31/vllm/transformers_utils/configs/mistral.py
+        config = self.hparams
+        # Mistral key -> HF key
+        config_mapping = {
+            "dim": "hidden_size",
+            "norm_eps": "rms_norm_eps",
+            "n_kv_heads": "num_key_value_heads",
+            "n_layers": "num_hidden_layers",
+            "n_heads": "num_attention_heads",
+            "hidden_dim": "intermediate_size",
+        }
+        # HF key -> (Mistral key, default value)
+        top_level_mapping_with_default = {
+            "model_type": ("model_type", "transformer"),
+            "hidden_act": ("activation", "silu"),
+            "tie_word_embeddings": ("tied_embeddings", False),
+            "max_seq_len": ("max_seq_len", config.get("max_position_embeddings", 128_000)),
+            "max_position_embeddings": ("max_position_embeddings", 128_000),
+        }
+        for key, new_key in config_mapping.items():
+            if key in config:
+                config[new_key] = config[key]
+        for new_key, (key, default_value) in top_level_mapping_with_default.items():
+            config[new_key] = config.get(key, default_value)
+        moe_config_map = {
+            "route_every_n": "moe_layer_freq",
+            "first_k_dense_replace": "first_k_dense_replace",
+            "num_experts_per_tok": "num_experts_per_tok",
+            "num_experts": "n_routed_experts",
+            "expert_hidden_dim": "moe_intermediate_size",
+            "routed_scale": "routed_scaling_factor",
+            "num_shared_experts": "n_shared_experts",
+            "num_expert_groups": "n_group",
+            "num_expert_groups_per_tok": "topk_group",
+        }
+        moe = config["moe"]
+        for key, new_key in moe_config_map.items():
+            if key in moe:
+                config[new_key] = moe[key]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
+
+    # TODO @ngxson : this should be in tensor_mapping, but I don't have time for now
+    # copied from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/mistral_large_3.py
+    remapping = {
+        r"layers\.(\d+)\.attention_norm\.weight": r"model.layers.\1.input_layernorm.weight",  # noqa: E501
+        r"layers\.(\d+)\.attention\.wq_a\.(\w+)": r"model.layers.\1.self_attn.q_a_proj.\2",  # noqa: E501
+        r"layers\.(\d+)\.attention\.q_a_norm\.weight": r"model.layers.\1.self_attn.q_a_layernorm.weight",  # noqa: E501
+        r"layers\.(\d+)\.attention\.wq_b\.(\w+)": r"model.layers.\1.self_attn.q_b_proj.\2",  # noqa: E501
+        r"layers\.(\d+)\.attention\.wkv_a_with_mqa\.(\w+)": r"model.layers.\1.self_attn.kv_a_proj_with_mqa.\2",  # noqa: E501
+        r"layers\.(\d+)\.attention\.kv_a_norm\.weight": r"model.layers.\1.self_attn.kv_a_layernorm.weight",  # noqa: E501
+        r"layers\.(\d+)\.attention\.wkv_b\.(\w+)": r"model.layers.\1.self_attn.kv_b_proj.\2",  # noqa: E501
+        r"layers\.(\d+)\.attention\.wo\.(\w+)": r"model.layers.\1.self_attn.o_proj.\2",  # noqa: E501
+        r"layers\.(\d+)\.ffn_norm\.weight": r"model.layers.\1.post_attention_layernorm.weight",  # noqa: E501
+        r"layers\.(\d+)\.feed_forward\.w1\.(\w+)": r"model.layers.\1.mlp.gate_proj.\2",  # noqa: E501
+        r"layers\.(\d+)\.feed_forward\.w2\.(\w+)": r"model.layers.\1.mlp.down_proj.\2",  # noqa: E501
+        r"layers\.(\d+)\.feed_forward\.w3\.(\w+)": r"model.layers.\1.mlp.up_proj.\2",  # noqa: E501
+        r"layers\.(\d+)\.gate\.weight": r"model.layers.\1.mlp.gate.weight",  # noqa: E501
+        r"layers\.(\d+)\.shared_experts\.w1\.(\w+)": r"model.layers.\1.mlp.shared_experts.gate_proj.\2",  # noqa: E501
+        r"layers\.(\d+)\.shared_experts\.w2\.(\w+)": r"model.layers.\1.mlp.shared_experts.down_proj.\2",  # noqa: E501
+        r"layers\.(\d+)\.shared_experts\.w3\.(\w+)": r"model.layers.\1.mlp.shared_experts.up_proj.\2",  # noqa: E501
+        r"layers\.(\d+)\.experts\.(\d+)\.w1\.(\w+)": r"model.layers.\1.mlp.experts.\2.gate_proj.\3",  # noqa: E501
+        r"layers\.(\d+)\.experts\.(\d+)\.w2\.(\w+)": r"model.layers.\1.mlp.experts.\2.down_proj.\3",  # noqa: E501
+        r"layers\.(\d+)\.experts\.(\d+)\.w3\.(\w+)": r"model.layers.\1.mlp.experts.\2.up_proj.\3",  # noqa: E501
+        r"norm\.weight": "model.norm.weight",  # noqa: E501
+        r"tok_embeddings\.weight": "model.embed_tokens.weight",  # noqa: E501
+        r"output\.weight": "lm_head.weight",  # noqa: E501
+    }
+
+    def _remap_mistral_to_ds(self, name: str) -> str:
+        for k, v in self.remapping.items():
+            match = re.fullmatch(k, name)
+            if match:
+                name = re.sub(k, v, name)
+                break
+        else:
+            raise ValueError(f"Cannot remap {name}")
+
+        # Remapping scale names. We could do this in the regex above but it
+        # would triple the number of lines for most layers.
+        if name.endswith(".qscale_act"):
+            name = re.sub(r"\.qscale_act$", ".input_scale", name)
+        elif name.endswith(".qscale_weight"):
+            name = re.sub(r"\.qscale_weight$", ".weight_scale", name)
+        return name
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        name = self._remap_mistral_to_ds(name)
+        return super().modify_tensors(data_torch, name, bid)
 
 
 class PixtralModel(LlavaVisionModel):
@@ -10478,6 +10585,8 @@ def main() -> None:
     elif args.mmproj:
         assert hparams.get("vision_encoder") is not None, "This model does not support multimodal"
         model_class = PixtralModel
+    elif "moe" in hparams:
+        model_class = MistralMoeModel
    else:
         model_class = MistralModel
 
From 1a308dad60fe62d9756d98f502c50b802ce42e0e Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen
Date: Wed, 3 Dec 2025 12:44:36 +0100
Subject: [PATCH 2/9] filter out vision tensors, add missing keys

---
 convert_hf_to_gguf.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 27b0e2836c1..1e8b85de760 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -9978,6 +9978,9 @@ def __init__(self, *args, **kwargs):
         for key, new_key in moe_config_map.items():
             if key in moe:
                 config[new_key] = moe[key]
+        config["topk_method"] = None
+        config["norm_topk_prob"] = True
+        config["scoring_func"] = "softmax"
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
@@ -10028,6 +10031,8 @@ def _remap_mistral_to_ds(self, name: str) -> str:
         return name
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        if name.startswith("vision_") or name.startswith("patch_merger.") or "mm_projector" in name:
+            return []
         name = self._remap_mistral_to_ds(name)
         return super().modify_tensors(data_torch, name, bid)
 

From 08e0a4effabd883b55fae64e48ebe612bf87b7d6 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 3 Dec 2025 14:08:44 +0100
Subject: [PATCH 3/9] handle vocab

---
 convert_hf_to_gguf.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 1e8b85de760..281591a28e7 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -9982,6 +9982,9 @@ def __init__(self, *args, **kwargs):
         config["norm_topk_prob"] = True
         config["scoring_func"] = "softmax"
 
+    def set_vocab(self):
+        LlamaModel._set_vocab_mistral(self)  # type: ignore
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         MistralModel.set_mistral_config(self.gguf_writer, self.hparams)

From 249eda4e6d6252308bb331404ef8184e19d7512f Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 3 Dec 2025 14:22:32 +0100
Subject: [PATCH 4/9] add temperature_length

---
 convert_hf_to_gguf.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 281591a28e7..9862f4cad8b 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -9988,6 +9988,8 @@ def set_vocab(self):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
+        yarn_params = self.hparams["yarn"]
+        self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"])
 
     # TODO @ngxson : this should be in tensor_mapping, but I don't have time for now
     # copied from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/mistral_large_3.py
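Before the next patch, a short illustration (again not part of the series) of the hparams this converter path reads and the writer calls that consume them. The field and method names come from the diffs in PATCH 1/9 and PATCH 4/9; the numeric values below are invented purely for illustration:

# hypothetical excerpt of the hparams dict loaded from a Mistral MoE checkpoint config
hparams = {
    "yarn": {
        "factor": 16.0,                             # -> add_rope_scaling_factor
        "beta": 32.0,                               # -> add_rope_scaling_yarn_beta_fast
        "alpha": 1.0,                               # -> add_rope_scaling_yarn_beta_slow
        "original_max_position_embeddings": 32768,  # -> add_rope_scaling_orig_ctx_len
                                                    #    and add_attn_temperature_length (PATCH 4/9)
    },
    "llama_4_scaling": {
        "beta": 0.5,                                # -> add_attn_temperature_scale
    },
}
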
From aebab5f110d5885a7f8a467eb3575275dcce2e04 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 3 Dec 2025 14:24:23 +0100
Subject: [PATCH 5/9] fix mscale_all_dim

---
 convert_hf_to_gguf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 9862f4cad8b..e07bebd1c70 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -9990,6 +9990,7 @@ def set_gguf_parameters(self):
         MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
         yarn_params = self.hparams["yarn"]
         self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"])
+        self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1)  # mscale_all_dim * 0.1
 
     # TODO @ngxson : this should be in tensor_mapping, but I don't have time for now
     # copied from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/mistral_large_3.py

From 646e47ddd72f7e6163831d0454979c63f47dfde9 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen
Date: Wed, 3 Dec 2025 19:09:43 +0100
Subject: [PATCH 6/9] clean up

---
 convert_hf_to_gguf.py          | 188 ++++++++++++++-------------------
 gguf-py/gguf/tensor_mapping.py |  10 ++
 2 files changed, 90 insertions(+), 108 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index e07bebd1c70..49ca06840e7 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1524,6 +1524,67 @@ def _set_vocab_interns1(self):
         special_vocab._set_special_token("bos", 151643)
         special_vocab.add_to_gguf(self.gguf_writer)
 
+    def _set_vocab_mistral(self):
+        if not _mistral_common_installed:
+            raise ImportError(_mistral_import_error_msg)
+
+        vocab = MistralVocab(self.dir_model)
+        logger.info(
+            f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
+        )
+
+        self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
+
+        tokens = []
+        scores = []
+        toktypes = []
+
+        for text, score, toktype in vocab.all_tokens():
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        assert len(tokens) == vocab.vocab_size, (
+            f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
+        )
+
+        if vocab.tokenizer_type == MistralTokenizerType.tekken:
+            self.gguf_writer.add_tokenizer_pre("tekken")
+            self.gguf_writer.add_token_merges(
+                vocab.extract_vocab_merges_from_model()
+            )
+
+        logger.info(
+            f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
+        )
+
+        self.gguf_writer.add_bos_token_id(vocab.bos_id)
+        self.gguf_writer.add_eos_token_id(vocab.eos_id)
+        self.gguf_writer.add_unk_token_id(vocab.unk_id)
+        self.gguf_writer.add_pad_token_id(vocab.pad_id)
+
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_vocab_size(vocab.vocab_size)
+
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(False)
+
+        template_dir = Path(__file__).parent / "models/templates/"
+
+        if not self.is_mistral_format or not self.disable_mistral_community_chat_template:
+            # Log only for Mistral format that the official tokenization and detokenization is via `mistral-common`.
+            if self.is_mistral_format:
+                logger.info(
+                    "Using a Mistral community chat template. These templates can be subject to errors in early days or weeks after a release. "
+                    "Mistral recommends to use `mistral-common` to perform tokenization and detokenization."
+                )
+            template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format)
+            self.gguf_writer.add_chat_template(template)
+        else:
+            logger.info("Not using a Mistral community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`.")
+
 
 class MmprojModel(ModelBase):
     model_type = ModelType.MMPROJ
@@ -2294,67 +2355,6 @@ def __init__(self, *args, **kwargs):
         if self.hf_arch == "VLlama3ForCausalLM":
             self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
 
-    def _set_vocab_mistral(self):
-        if not _mistral_common_installed:
-            raise ImportError(_mistral_import_error_msg)
-
-        vocab = MistralVocab(self.dir_model)
-        logger.info(
-            f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
-        )
-
-        self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
-
-        tokens = []
-        scores = []
-        toktypes = []
-
-        for text, score, toktype in vocab.all_tokens():
-            tokens.append(text)
-            scores.append(score)
-            toktypes.append(toktype)
-
-        assert len(tokens) == vocab.vocab_size, (
-            f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
-        )
-
-        if vocab.tokenizer_type == MistralTokenizerType.tekken:
-            self.gguf_writer.add_tokenizer_pre("tekken")
-            self.gguf_writer.add_token_merges(
-                vocab.extract_vocab_merges_from_model()
-            )
-
-        logger.info(
-            f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
-        )
-
-        self.gguf_writer.add_bos_token_id(vocab.bos_id)
-        self.gguf_writer.add_eos_token_id(vocab.eos_id)
-        self.gguf_writer.add_unk_token_id(vocab.unk_id)
-        self.gguf_writer.add_pad_token_id(vocab.pad_id)
-
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-        self.gguf_writer.add_vocab_size(vocab.vocab_size)
-
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(False)
-
-        template_dir = Path(__file__).parent / "models/templates/"
-
-        if not self.is_mistral_format or not self.disable_mistral_community_chat_template:
-            # Log only for Mistral format that the official tokenization and detokenization is via `mistral-common`.
-            if self.is_mistral_format:
-                logger.info(
-                    "Using a Mistral community chat template. These templates can be subject to errors in early days or weeks after a release. "
-                    "Mistral recommends to use `mistral-common` to perform tokenization and detokenization."
-                )
-            template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format)
-            self.gguf_writer.add_chat_template(template)
-        else:
-            logger.info("Not using a Mistral community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`.")
-
     def set_vocab(self):
         if self.is_mistral_format:
             return self._set_vocab_mistral()
@@ -9934,11 +9934,12 @@ class MistralMoeModel(DeepseekV2Model):
     model_name = "Mistral"
     hf_arch = ""
     is_mistral_format = True
-    undo_permute = False
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         logger.info("Using MistralMoeModel")
+        # remap hparams from Mistral MoE format to DeepseekV2 format
+        # we do this way to be able to reuse DeepseekV2Model set_gguf_parameters logic
         # ref: https://github.com/vllm-project/vllm/blob/b294e28db2c5dee61bc25157664edcada8b90b31/vllm/transformers_utils/configs/mistral.py
         config = self.hparams
         # Mistral key -> HF key
@@ -9958,11 +9959,13 @@ def __init__(self, *args, **kwargs):
             "max_seq_len": ("max_seq_len", config.get("max_position_embeddings", 128_000)),
             "max_position_embeddings": ("max_position_embeddings", 128_000),
         }
+        # mapping top-level keys
         for key, new_key in config_mapping.items():
             if key in config:
                 config[new_key] = config[key]
         for new_key, (key, default_value) in top_level_mapping_with_default.items():
             config[new_key] = config.get(key, default_value)
+        # mapping MoE-specific keys
         moe_config_map = {
             "route_every_n": "moe_layer_freq",
             "first_k_dense_replace": "first_k_dense_replace",
@@ -9978,12 +9981,13 @@ def __init__(self, *args, **kwargs):
         for key, new_key in moe_config_map.items():
             if key in moe:
                 config[new_key] = moe[key]
+        # provide missing values
         config["topk_method"] = None
         config["norm_topk_prob"] = True
         config["scoring_func"] = "softmax"
 
     def set_vocab(self):
-        LlamaModel._set_vocab_mistral(self)  # type: ignore
+        self._set_vocab_mistral()
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
@@ -9992,54 +9996,22 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"])
         self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1)  # mscale_all_dim * 0.1
 
-    # TODO @ngxson : this should be in tensor_mapping, but I don't have time for now
-    # copied from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/mistral_large_3.py
-    remapping = {
-        r"layers\.(\d+)\.attention_norm\.weight": r"model.layers.\1.input_layernorm.weight",  # noqa: E501
-        r"layers\.(\d+)\.attention\.wq_a\.(\w+)": r"model.layers.\1.self_attn.q_a_proj.\2",  # noqa: E501
-        r"layers\.(\d+)\.attention\.q_a_norm\.weight": r"model.layers.\1.self_attn.q_a_layernorm.weight",  # noqa: E501
-        r"layers\.(\d+)\.attention\.wq_b\.(\w+)": r"model.layers.\1.self_attn.q_b_proj.\2",  # noqa: E501
-        r"layers\.(\d+)\.attention\.wkv_a_with_mqa\.(\w+)": r"model.layers.\1.self_attn.kv_a_proj_with_mqa.\2",  # noqa: E501
-        r"layers\.(\d+)\.attention\.kv_a_norm\.weight": r"model.layers.\1.self_attn.kv_a_layernorm.weight",  # noqa: E501
-        r"layers\.(\d+)\.attention\.wkv_b\.(\w+)": r"model.layers.\1.self_attn.kv_b_proj.\2",  # noqa: E501
-        r"layers\.(\d+)\.attention\.wo\.(\w+)": r"model.layers.\1.self_attn.o_proj.\2",  # noqa: E501
-        r"layers\.(\d+)\.ffn_norm\.weight": r"model.layers.\1.post_attention_layernorm.weight",  # noqa: E501
-        r"layers\.(\d+)\.feed_forward\.w1\.(\w+)": r"model.layers.\1.mlp.gate_proj.\2",  # noqa: E501
-        r"layers\.(\d+)\.feed_forward\.w2\.(\w+)": r"model.layers.\1.mlp.down_proj.\2",  # noqa: E501
-        r"layers\.(\d+)\.feed_forward\.w3\.(\w+)": r"model.layers.\1.mlp.up_proj.\2",  # noqa: E501
-        r"layers\.(\d+)\.gate\.weight": r"model.layers.\1.mlp.gate.weight",  # noqa: E501
-        r"layers\.(\d+)\.shared_experts\.w1\.(\w+)": r"model.layers.\1.mlp.shared_experts.gate_proj.\2",  # noqa: E501
-        r"layers\.(\d+)\.shared_experts\.w2\.(\w+)": r"model.layers.\1.mlp.shared_experts.down_proj.\2",  # noqa: E501
-        r"layers\.(\d+)\.shared_experts\.w3\.(\w+)": r"model.layers.\1.mlp.shared_experts.up_proj.\2",  # noqa: E501
-        r"layers\.(\d+)\.experts\.(\d+)\.w1\.(\w+)": r"model.layers.\1.mlp.experts.\2.gate_proj.\3",  # noqa: E501
-        r"layers\.(\d+)\.experts\.(\d+)\.w2\.(\w+)": r"model.layers.\1.mlp.experts.\2.down_proj.\3",  # noqa: E501
-        r"layers\.(\d+)\.experts\.(\d+)\.w3\.(\w+)": r"model.layers.\1.mlp.experts.\2.up_proj.\3",  # noqa: E501
-        r"norm\.weight": "model.norm.weight",  # noqa: E501
-        r"tok_embeddings\.weight": "model.embed_tokens.weight",  # noqa: E501
-        r"output\.weight": "lm_head.weight",  # noqa: E501
-    }
-
-    def _remap_mistral_to_ds(self, name: str) -> str:
-        for k, v in self.remapping.items():
-            match = re.fullmatch(k, name)
-            if match:
-                name = re.sub(k, v, name)
-                break
-        else:
-            raise ValueError(f"Cannot remap {name}")
-
-        # Remapping scale names. We could do this in the regex above but it
-        # would triple the number of lines for most layers.
-        if name.endswith(".qscale_act"):
-            name = re.sub(r"\.qscale_act$", ".input_scale", name)
-        elif name.endswith(".qscale_weight"):
-            name = re.sub(r"\.qscale_weight$", ".weight_scale", name)
-        return name
-
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
         if name.startswith("vision_") or name.startswith("patch_merger.") or "mm_projector" in name:
             return []
-        name = self._remap_mistral_to_ds(name)
+
+        # rename certain tensors so that we can reuse DeepseekV2Model modify_tensors logic
+        if name.endswith(".qscale_act"):
+            name = name.replace(".qscale_act", ".input_scale")
+        if name.endswith(".qscale_weight"):
+            name = name.replace(".qscale_weight", ".weight_scale")
+        if ".experts." in name:
+            name = name.replace(".experts.", ".mlp.experts.")
+            name = name.replace(".w1.", ".gate_proj.")
+            name = name.replace(".w2.", ".down_proj.")
+            name = name.replace(".w3.", ".up_proj.")
+            name = "model." + name
+
         return super().modify_tensors(data_torch, name, bid)
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index a7b09739791..5346885409c 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -376,6 +376,7 @@ class TensorNameMap:
             "model.layers.{bid}.block_sparse_moe.primary_router",  # smallthinker
             "model.layers.{bid}.feed_forward.gate",  # lfm2moe
             "model.layers.{bid}.mlp.router.gate",  # afmoe
+            "layers.{bid}.gate",  # mistral-large
         ),
 
         MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
@@ -450,6 +451,7 @@ class TensorNameMap:
             "model.layers.{bid}.feed_forward.shared_expert.up_proj",  # llama4
             "model.layers.{bid}.feed_forward.down_proj",
             "model.layers.{bid}.mlp.shared_mlp.up_proj",  # hunyuan
+            "layers.{bid}.shared_experts.w3",  # mistral-large
         ),
 
         MODEL_TENSOR.FFN_UP_CHEXP: (
@@ -496,6 +498,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.shared_experts.gate_proj",  # deepseek deepseek2
             "model.layers.{bid}.feed_forward.shared_expert.gate_proj",  # llama4
             "model.layers.{bid}.mlp.shared_mlp.gate_proj",  # hunyuan
+            "layers.{bid}.shared_experts.w1",  # mistral-large
         ),
 
         MODEL_TENSOR.FFN_GATE_CHEXP: (
@@ -557,6 +560,7 @@ class TensorNameMap:
             "model.layers.{bid}.feed_forward.shared_expert.down_proj",  # llama4
             "model.layers.{bid}.shared_mlp.output_linear",  # granitemoe
             "model.layers.{bid}.mlp.shared_mlp.down_proj",  # hunyuan
+            "layers.{bid}.shared_experts.w2",  # mistral-large
         ),
 
         MODEL_TENSOR.FFN_DOWN_CHEXP: (
@@ -924,18 +928,22 @@ class TensorNameMap:
 
         MODEL_TENSOR.ATTN_Q_A: (
             "model.layers.{bid}.self_attn.q_a_proj",  # deepseek2
+            "layers.{bid}.attention.wq_a",  # mistral-large
         ),
 
         MODEL_TENSOR.ATTN_Q_B: (
             "model.layers.{bid}.self_attn.q_b_proj",  # deepseek2
+            "layers.{bid}.attention.wq_b",  # mistral-large
         ),
 
         MODEL_TENSOR.ATTN_KV_A_MQA: (
             "model.layers.{bid}.self_attn.kv_a_proj_with_mqa",  # deepseek2
+            "layers.{bid}.attention.wkv_a_with_mqa",  # mistral-large
         ),
 
         MODEL_TENSOR.ATTN_KV_B: (
             "model.layers.{bid}.self_attn.kv_b_proj",  # deepseek2
+            "layers.{bid}.attention.wkv_b",  # mistral-large
         ),
 
         MODEL_TENSOR.ATTN_K_B: (
@@ -948,10 +956,12 @@ class TensorNameMap:
 
         MODEL_TENSOR.ATTN_Q_A_NORM: (
             "model.layers.{bid}.self_attn.q_a_layernorm",  # deepseek2
+            "layers.{bid}.attention.q_a_norm",  # mistral-large
         ),
 
         MODEL_TENSOR.ATTN_KV_A_NORM: (
             "model.layers.{bid}.self_attn.kv_a_layernorm",  # deepseek2
+            "layers.{bid}.attention.kv_a_norm",  # mistral-large
         ),
 
         MODEL_TENSOR.ATTN_SUB_NORM: (

From 49c4e2ddcb9dd2c3435f0b5d0139d5e969fabd94 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen
Date: Fri, 5 Dec 2025 21:55:51 +0100
Subject: [PATCH 7/9] Apply suggestions from code review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sigbjørn Skjæret
---
 convert_hf_to_gguf.py          |  2 ++
 gguf-py/gguf/tensor_mapping.py | 12 ++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 5fb3d9af617..c641989c19b 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -10017,6 +10017,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
             name = name.replace(".qscale_act", ".input_scale")
         if name.endswith(".qscale_weight"):
             name = name.replace(".qscale_weight", ".weight_scale")
+        if ".wkv_b." in name:
+            name = name.replace(".wkv_b.", ".kv_b_proj.")
         if ".experts." in name:
             name = name.replace(".experts.", ".mlp.experts.")
             name = name.replace(".w1.", ".gate_proj.")
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 5346885409c..b24612c3829 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -952,6 +952,18 @@ class TensorNameMap:
 
         MODEL_TENSOR.ATTN_V_B: (
             "model.layers.{bid}.self_attn.v_b_proj",  # deepseek2
+        MODEL_TENSOR.ATTN_KV_B: (
+            "model.layers.{bid}.self_attn.kv_b_proj",  # deepseek2
+        ),
+
+        MODEL_TENSOR.ATTN_K_B: (
+            "model.layers.{bid}.self_attn.k_b_proj",  # deepseek2
+            "layers.{bid}.attention.k_b_proj",  # mistral-large
+        ),
+
+        MODEL_TENSOR.ATTN_V_B: (
+            "model.layers.{bid}.self_attn.v_b_proj",  # deepseek2
+            "layers.{bid}.attention.v_b_proj",  # mistral-large
         ),
 
         MODEL_TENSOR.ATTN_Q_A_NORM: (

From 2f8c2efc4fc4631870fe5813a15184212353d308 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Fri, 5 Dec 2025 22:01:53 +0100
Subject: [PATCH 8/9] fix

---
 gguf-py/gguf/tensor_mapping.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index b24612c3829..d5007882f4e 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -946,16 +946,6 @@ class TensorNameMap:
             "layers.{bid}.attention.wkv_b",  # mistral-large
         ),
 
-        MODEL_TENSOR.ATTN_K_B: (
-            "model.layers.{bid}.self_attn.k_b_proj",  # deepseek2
-        ),
-
-        MODEL_TENSOR.ATTN_V_B: (
-            "model.layers.{bid}.self_attn.v_b_proj",  # deepseek2
-        MODEL_TENSOR.ATTN_KV_B: (
-            "model.layers.{bid}.self_attn.kv_b_proj",  # deepseek2
-        ),
-
         MODEL_TENSOR.ATTN_K_B: (
             "model.layers.{bid}.self_attn.k_b_proj",  # deepseek2
             "layers.{bid}.attention.k_b_proj",  # mistral-large

From 15f78b670c19fffce5e080ba35c3ebaf970ff331 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen
Date: Fri, 5 Dec 2025 22:05:25 +0100
Subject: [PATCH 9/9] Update gguf-py/gguf/tensor_mapping.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sigbjørn Skjæret
---
 gguf-py/gguf/tensor_mapping.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index d5007882f4e..d9c87da1946 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -943,7 +943,6 @@ class TensorNameMap:
 
         MODEL_TENSOR.ATTN_KV_B: (
             "model.layers.{bid}.self_attn.kv_b_proj",  # deepseek2
-            "layers.{bid}.attention.wkv_b",  # mistral-large
         ),
 
         MODEL_TENSOR.ATTN_K_B: (