
Commit dbc15a7

ngxson and CISC authored
convert: support Mistral 3 Large MoE (#17730)
* convert: support Mistral 3 Large MoE
* filter out vision tensors, add missing keys
* handle vocab
* add temperature_length
* fix mscale_all_dim
* clean up
* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* fix
* Update gguf-py/gguf/tensor_mapping.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
1 parent c6c5e85 commit dbc15a7

File tree

2 files changed: +188 -83 lines changed

convert_hf_to_gguf.py

Lines changed: 177 additions & 83 deletions
@@ -1524,6 +1524,79 @@ def _set_vocab_interns1(self):
         special_vocab._set_special_token("bos", 151643)
         special_vocab.add_to_gguf(self.gguf_writer)
 
+    def _set_vocab_mistral(self):
+        if not _mistral_common_installed:
+            raise ImportError(_mistral_import_error_msg)
+
+        vocab = MistralVocab(self.dir_model)
+        logger.info(
+            f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
+        )
+
+        self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
+
+        tokens = []
+        scores = []
+        toktypes = []
+
+        for text, score, toktype in vocab.all_tokens():
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        assert len(tokens) == vocab.vocab_size, (
+            f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
+        )
+
+        if vocab.tokenizer_type == MistralTokenizerType.tekken:
+            self.gguf_writer.add_tokenizer_pre("tekken")
+            self.gguf_writer.add_token_merges(
+                vocab.extract_vocab_merges_from_model()
+            )
+
+        logger.info(
+            f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
+        )
+
+        self.gguf_writer.add_bos_token_id(vocab.bos_id)
+        self.gguf_writer.add_eos_token_id(vocab.eos_id)
+        self.gguf_writer.add_unk_token_id(vocab.unk_id)
+        self.gguf_writer.add_pad_token_id(vocab.pad_id)
+
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_vocab_size(vocab.vocab_size)
+
+        self.gguf_writer.add_add_bos_token(True)
+        self.gguf_writer.add_add_eos_token(False)
+
+        local_template_file_path = self.dir_model / "chat_template.jinja"
+
+        if self.is_mistral_format and local_template_file_path.is_file():
+            # Ministral-3 and other new Mistral models come with chat templates.
+            # ref: https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512/tree/main
+            logger.info("Using an existing Mistral local chat template.")
+
+            with open(local_template_file_path, "r", encoding="utf-8") as f:
+                template = f.read()
+        elif not self.is_mistral_format or not self.disable_mistral_community_chat_template:
+            template_dir = Path(__file__).parent / "models/templates/"
+
+            # Log only for Mistral format that the official tokenization and detokenization is via `mistral-common`.
+            if self.is_mistral_format:
+                logger.info(
+                    "Using a Mistral community chat template. These templates can be subject to errors in early days or weeks after a release. "
+                    "Mistral recommends to use `mistral-common` to perform tokenization and detokenization."
+                )
+            template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format)
+        else:
+            logger.info("Not using a Mistral local or community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`.")
+            template = None
+
+        if template is not None:
+            self.gguf_writer.add_chat_template(template)
+
 
 class MmprojModel(ModelBase):
     model_type = ModelType.MMPROJ
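Note: the relocated `_set_vocab_mistral` resolves the chat template in a fixed priority order: a local `chat_template.jinja` shipped with the checkpoint, then a community template bundled under `models/templates/`, then none (in which case tokenization and detokenization are expected to go through `mistral-common`). A minimal standalone sketch of that fallback order; `resolve_chat_template` and `load_community_template` are illustrative names, not part of the converter:

from pathlib import Path
from typing import Callable, Optional

def resolve_chat_template(
    model_dir: Path,
    is_mistral_format: bool,
    disable_community_template: bool,
    load_community_template: Callable[[], str],
) -> Optional[str]:
    # 1) newer Mistral checkpoints ship their own Jinja template next to the weights
    local_template = model_dir / "chat_template.jinja"
    if is_mistral_format and local_template.is_file():
        return local_template.read_text(encoding="utf-8")
    # 2) otherwise fall back to a community template (get_community_chat_template in the converter)
    if not is_mistral_format or not disable_community_template:
        return load_community_template()
    # 3) no template at all: tokenize/detokenize via mistral-common instead
    return None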
@@ -2294,79 +2367,6 @@ def __init__(self, *args, **kwargs):
         if self.hf_arch == "VLlama3ForCausalLM":
             self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
 
-    def _set_vocab_mistral(self):
-        if not _mistral_common_installed:
-            raise ImportError(_mistral_import_error_msg)
-
-        vocab = MistralVocab(self.dir_model)
-        logger.info(
-            f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
-        )
-
-        self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
-
-        tokens = []
-        scores = []
-        toktypes = []
-
-        for text, score, toktype in vocab.all_tokens():
-            tokens.append(text)
-            scores.append(score)
-            toktypes.append(toktype)
-
-        assert len(tokens) == vocab.vocab_size, (
-            f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
-        )
-
-        if vocab.tokenizer_type == MistralTokenizerType.tekken:
-            self.gguf_writer.add_tokenizer_pre("tekken")
-            self.gguf_writer.add_token_merges(
-                vocab.extract_vocab_merges_from_model()
-            )
-
-        logger.info(
-            f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
-        )
-
-        self.gguf_writer.add_bos_token_id(vocab.bos_id)
-        self.gguf_writer.add_eos_token_id(vocab.eos_id)
-        self.gguf_writer.add_unk_token_id(vocab.unk_id)
-        self.gguf_writer.add_pad_token_id(vocab.pad_id)
-
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-        self.gguf_writer.add_vocab_size(vocab.vocab_size)
-
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(False)
-
-        local_template_file_path = self.dir_model / "chat_template.jinja"
-
-        if self.is_mistral_format and local_template_file_path.is_file():
-            # Ministral-3 and other new Mistral models come with chat templates.
-            # ref: https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512/tree/main
-            logger.info("Using an existing Mistral local chat template.")
-
-            with open(local_template_file_path, "r", encoding="utf-8") as f:
-                template = f.read()
-        elif not self.is_mistral_format or not self.disable_mistral_community_chat_template:
-            template_dir = Path(__file__).parent / "models/templates/"
-
-            # Log only for Mistral format that the official tokenization and detokenization is via `mistral-common`.
-            if self.is_mistral_format:
-                logger.info(
-                    "Using a Mistral community chat template. These templates can be subject to errors in early days or weeks after a release. "
-                    "Mistral recommends to use `mistral-common` to perform tokenization and detokenization."
-                )
-            template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format)
-        else:
-            logger.info("Not using a Mistral local or community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`.")
-            template = None
-
-        if template is not None:
-            self.gguf_writer.add_chat_template(template)
-
     def set_vocab(self):
         if self.is_mistral_format:
             return self._set_vocab_mistral()
@@ -9924,17 +9924,109 @@ def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mis
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        if "yarn" in self.hparams:
-            yarn_params = self.hparams["yarn"]
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(yarn_params["factor"])
-            self.gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_params["beta"])
-            self.gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_params["alpha"])
-            self.gguf_writer.add_rope_scaling_yarn_log_mul(1.0) # mscale_all_dim
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"])
+        MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
 
-        if "llama_4_scaling" in self.hparams:
-            self.gguf_writer.add_attn_temperature_scale(self.hparams["llama_4_scaling"]["beta"])
+    @staticmethod
+    def set_mistral_config(gguf_writer: gguf.GGUFWriter, hparams: dict):
+        if "yarn" in hparams:
+            yarn_params = hparams["yarn"]
+            gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            gguf_writer.add_rope_scaling_factor(yarn_params["factor"])
+            gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_params["beta"])
+            gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_params["alpha"])
+            gguf_writer.add_rope_scaling_yarn_log_mul(1.0) # mscale_all_dim
+            gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"])
+
+        if "llama_4_scaling" in hparams:
+            gguf_writer.add_attn_temperature_scale(hparams["llama_4_scaling"]["beta"])
+
+
+class MistralMoeModel(DeepseekV2Model):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK2
+    model_name = "Mistral"
+    hf_arch = ""
+    is_mistral_format = True
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        logger.info("Using MistralMoeModel")
+        # remap hparams from Mistral MoE format to DeepseekV2 format
+        # we do this way to be able to reuse DeepseekV2Model set_gguf_parameters logic
+        # ref: https://github.com/vllm-project/vllm/blob/b294e28db2c5dee61bc25157664edcada8b90b31/vllm/transformers_utils/configs/mistral.py
+        config = self.hparams
+        # Mistral key -> HF key
+        config_mapping = {
+            "dim": "hidden_size",
+            "norm_eps": "rms_norm_eps",
+            "n_kv_heads": "num_key_value_heads",
+            "n_layers": "num_hidden_layers",
+            "n_heads": "num_attention_heads",
+            "hidden_dim": "intermediate_size",
+        }
+        # HF key -> (Mistral key, default value)
+        top_level_mapping_with_default = {
+            "model_type": ("model_type", "transformer"),
+            "hidden_act": ("activation", "silu"),
+            "tie_word_embeddings": ("tied_embeddings", False),
+            "max_seq_len": ("max_seq_len", config.get("max_position_embeddings", 128_000)),
+            "max_position_embeddings": ("max_position_embeddings", 128_000),
+        }
+        # mapping top-level keys
+        for key, new_key in config_mapping.items():
+            if key in config:
+                config[new_key] = config[key]
+        for new_key, (key, default_value) in top_level_mapping_with_default.items():
+            config[new_key] = config.get(key, default_value)
+        # mapping MoE-specific keys
+        moe_config_map = {
+            "route_every_n": "moe_layer_freq",
+            "first_k_dense_replace": "first_k_dense_replace",
+            "num_experts_per_tok": "num_experts_per_tok",
+            "num_experts": "n_routed_experts",
+            "expert_hidden_dim": "moe_intermediate_size",
+            "routed_scale": "routed_scaling_factor",
+            "num_shared_experts": "n_shared_experts",
+            "num_expert_groups": "n_group",
+            "num_expert_groups_per_tok": "topk_group",
+        }
+        moe = config["moe"]
+        for key, new_key in moe_config_map.items():
+            if key in moe:
+                config[new_key] = moe[key]
+        # provide missing values
+        config["topk_method"] = None
+        config["norm_topk_prob"] = True
+        config["scoring_func"] = "softmax"
+
+    def set_vocab(self):
+        self._set_vocab_mistral()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
+        yarn_params = self.hparams["yarn"]
+        self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"])
+        self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1) # mscale_all_dim * 0.1
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        if name.startswith("vision_") or name.startswith("patch_merger.") or "mm_projector" in name:
+            return []
+
+        # rename certain tensors so that we can reuse DeepseekV2Model modify_tensors logic
+        if name.endswith(".qscale_act"):
+            name = name.replace(".qscale_act", ".input_scale")
+        if name.endswith(".qscale_weight"):
+            name = name.replace(".qscale_weight", ".weight_scale")
+        if ".wkv_b." in name:
+            name = name.replace(".wkv_b.", ".kv_b_proj.")
+        if ".experts." in name:
+            name = name.replace(".experts.", ".mlp.experts.")
+            name = name.replace(".w1.", ".gate_proj.")
+            name = name.replace(".w2.", ".down_proj.")
+            name = name.replace(".w3.", ".up_proj.")
+            name = "model." + name
+
+        return super().modify_tensors(data_torch, name, bid)
 
 
 class PixtralModel(LlavaVisionModel):
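To make the hparams remapping above concrete, here is a hedged, self-contained sketch that pushes a made-up, heavily truncated Mistral-format params.json through the same top-level key mapping. The numeric values are invented for illustration; only the key names come from the tables in `MistralMoeModel.__init__`:

# Hypothetical, truncated params.json contents; values are invented for illustration.
mistral_hparams = {
    "dim": 12288,
    "n_layers": 88,
    "n_heads": 96,
    "n_kv_heads": 8,
    "hidden_dim": 28672,
    "norm_eps": 1e-5,
    "moe": {"num_experts": 128, "num_experts_per_tok": 4, "expert_hidden_dim": 2048},
}

config = dict(mistral_hparams)

# Same Mistral-key -> HF-key table as in MistralMoeModel.__init__ above.
config_mapping = {
    "dim": "hidden_size",
    "norm_eps": "rms_norm_eps",
    "n_kv_heads": "num_key_value_heads",
    "n_layers": "num_hidden_layers",
    "n_heads": "num_attention_heads",
    "hidden_dim": "intermediate_size",
}
for key, new_key in config_mapping.items():
    if key in config:
        config[new_key] = config[key]

# The "moe" block is flattened the same way, e.g. num_experts -> n_routed_experts.
for key, new_key in {"num_experts": "n_routed_experts", "expert_hidden_dim": "moe_intermediate_size"}.items():
    if key in config["moe"]:
        config[new_key] = config["moe"][key]

# DeepseekV2Model.set_gguf_parameters can now read the HF-style keys it expects:
assert config["hidden_size"] == 12288 and config["num_hidden_layers"] == 88
assert config["n_routed_experts"] == 128 and config["moe_intermediate_size"] == 2048

Note that the remap only adds HF-style aliases next to the original Mistral keys; nothing is deleted from the config dict.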
@@ -10490,6 +10582,8 @@ def main() -> None:
         elif args.mmproj:
             assert hparams.get("vision_encoder") is not None, "This model does not support multimodal"
             model_class = PixtralModel
+        elif "moe" in hparams:
+            model_class = MistralMoeModel
         else:
             model_class = MistralModel
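For reference, the new dispatch in `main()` keys purely off the shape of the Mistral params.json. A small sketch of that routing decision, returning class names as strings only for illustration (the real code instantiates the classes directly):

def pick_mistral_model_class(hparams: dict, want_mmproj: bool) -> str:
    # mirrors the Mistral-format branch of main() above
    if want_mmproj:
        assert hparams.get("vision_encoder") is not None, "This model does not support multimodal"
        return "PixtralModel"
    if "moe" in hparams:  # new in this commit: MoE checkpoints carry a top-level "moe" section
        return "MistralMoeModel"
    return "MistralModel"

assert pick_mistral_model_class({"moe": {"num_experts": 128}}, want_mmproj=False) == "MistralMoeModel"
assert pick_mistral_model_class({"dim": 4096}, want_mmproj=False) == "MistralModel"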

gguf-py/gguf/tensor_mapping.py

Lines changed: 11 additions & 0 deletions
@@ -376,6 +376,7 @@ class TensorNameMap:
             "model.layers.{bid}.block_sparse_moe.primary_router", # smallthinker
             "model.layers.{bid}.feed_forward.gate", # lfm2moe
             "model.layers.{bid}.mlp.router.gate", # afmoe
+            "layers.{bid}.gate", # mistral-large
         ),
 
         MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
@@ -450,6 +451,7 @@ class TensorNameMap:
             "model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4
             "model.layers.{bid}.feed_forward.down_proj",
             "model.layers.{bid}.mlp.shared_mlp.up_proj", # hunyuan
+            "layers.{bid}.shared_experts.w3", # mistral-large
         ),
 
         MODEL_TENSOR.FFN_UP_CHEXP: (
@@ -496,6 +498,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2
             "model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4
             "model.layers.{bid}.mlp.shared_mlp.gate_proj", # hunyuan
+            "layers.{bid}.shared_experts.w1", # mistral-large
         ),
 
         MODEL_TENSOR.FFN_GATE_CHEXP: (
@@ -557,6 +560,7 @@ class TensorNameMap:
             "model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4
             "model.layers.{bid}.shared_mlp.output_linear", # granitemoe
             "model.layers.{bid}.mlp.shared_mlp.down_proj", # hunyuan
+            "layers.{bid}.shared_experts.w2", # mistral-large
         ),
 
         MODEL_TENSOR.FFN_DOWN_CHEXP: (
@@ -924,14 +928,17 @@ class TensorNameMap:
 
         MODEL_TENSOR.ATTN_Q_A: (
             "model.layers.{bid}.self_attn.q_a_proj", # deepseek2
+            "layers.{bid}.attention.wq_a", # mistral-large
         ),
 
         MODEL_TENSOR.ATTN_Q_B: (
             "model.layers.{bid}.self_attn.q_b_proj", # deepseek2
+            "layers.{bid}.attention.wq_b", # mistral-large
         ),
 
         MODEL_TENSOR.ATTN_KV_A_MQA: (
             "model.layers.{bid}.self_attn.kv_a_proj_with_mqa", # deepseek2
+            "layers.{bid}.attention.wkv_a_with_mqa", # mistral-large
         ),
 
         MODEL_TENSOR.ATTN_KV_B: (
@@ -940,18 +947,22 @@ class TensorNameMap:
 
         MODEL_TENSOR.ATTN_K_B: (
             "model.layers.{bid}.self_attn.k_b_proj", # deepseek2
+            "layers.{bid}.attention.k_b_proj", # mistral-large
         ),
 
         MODEL_TENSOR.ATTN_V_B: (
             "model.layers.{bid}.self_attn.v_b_proj", # deepseek2
+            "layers.{bid}.attention.v_b_proj", # mistral-large
         ),
 
         MODEL_TENSOR.ATTN_Q_A_NORM: (
             "model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2
+            "layers.{bid}.attention.q_a_norm", # mistral-large
         ),
 
         MODEL_TENSOR.ATTN_KV_A_NORM: (
             "model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2
+            "layers.{bid}.attention.kv_a_norm", # mistral-large
         ),
 
         MODEL_TENSOR.ATTN_SUB_NORM: (
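The new `# mistral-large` entries let raw Mistral-format tensor names (no `model.` prefix) resolve through the existing DEEPSEEK2 mapping. A hedged sketch of such a lookup, assuming the gguf-py package from this repository is importable; the exact GGUF-side output names are whatever the DEEPSEEK2 tensor-name table defines (roughly `blk.0.attn_q_a.weight` and similar):

import gguf
from gguf.tensor_mapping import get_tensor_name_map

# Two layers are enough for the demo; real conversions pass the actual block count.
tmap = get_tensor_name_map(gguf.MODEL_ARCH.DEEPSEEK2, 2)

for raw_name in (
    "layers.0.attention.wq_a.weight",     # new MLA q_a projection entry
    "layers.0.shared_experts.w3.weight",  # new shared-expert up projection (w3) entry
    "layers.1.gate.weight",               # new MoE router gate entry
):
    print(raw_name, "->", tmap.get_name(raw_name, try_suffixes=(".weight", ".bias")))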
