@@ -1581,10 +1581,27 @@ def __init__(self, *args, **kwargs):
 
         # load preprocessor config
         self.preprocessor_config = {}
-        if not self.is_mistral_format:
-            with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
+
+        # prefer preprocessor_config.json if possible
+        preprocessor_config_path = self.dir_model / "preprocessor_config.json"
+        if preprocessor_config_path.is_file():
+            with open(preprocessor_config_path, "r", encoding="utf-8") as f:
                 self.preprocessor_config = json.load(f)
 
+        # prefer processor_config.json if possible
+        processor_config_path = self.dir_model / "processor_config.json"
+        if processor_config_path.is_file():
+            with open(processor_config_path, "r", encoding="utf-8") as f:
+                cfg = json.load(f)
+                # move image_processor to root level for compat
+                if "image_processor" in cfg:
+                    cfg = {
+                        **cfg,
+                        **cfg["image_processor"],
+                    }
+                # merge configs
+                self.preprocessor_config = {**self.preprocessor_config, **cfg}
+
     def get_vision_config(self) -> dict[str, Any] | None:
         config_name = "vision_config" if not self.is_mistral_format else "vision_encoder"
         return self.global_config.get(config_name)
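The hunk above drops the `is_mistral_format` gate: each of `preprocessor_config.json` and `processor_config.json` is now read whenever the file exists, and the two dicts are merged with `processor_config.json` winning on key conflicts (after any nested `image_processor` object is hoisted to the root). A minimal sketch of the merge semantics, with both file contents invented for illustration:

    # invented stand-ins for the two JSON files
    preprocessor_config = {"image_mean": [0.5, 0.5, 0.5], "size": 1024}  # preprocessor_config.json
    cfg = {"patch_size": 16, "image_processor": {"size": 2048}}          # processor_config.json

    if "image_processor" in cfg:
        cfg = {**cfg, **cfg["image_processor"]}  # hoist nested keys to the root
    merged = {**preprocessor_config, **cfg}

    assert merged["size"] == 2048  # later keys win: processor_config.json overrides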
@@ -2797,7 +2814,32 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
 @ModelBase.register("Mistral3ForConditionalGeneration")
 class Mistral3Model(LlamaModel):
-    model_arch = gguf.MODEL_ARCH.LLAMA
+    model_arch = gguf.MODEL_ARCH.MISTRAL3
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # for compatibility, we use LLAMA arch for older models
+        # TODO: remove this once everyone has migrated to newer version of llama.cpp
+        if self.hparams.get("model_type") != "ministral3":
+            self.model_arch = gguf.MODEL_ARCH.LLAMA
+            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
+            self.gguf_writer.add_architecture()
+            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        rope_params = self.hparams.get("rope_parameters")
+        if self.hparams.get("model_type") == "ministral3":
+            assert rope_params is not None, "ministral3 must have 'rope_parameters' config"
+            assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'"
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_params["factor"])
+            self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_params["beta_fast"])
+            self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_params["beta_slow"])
+            self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"])
+            self.gguf_writer.add_rope_freq_base(rope_params["rope_theta"])
+            self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
         name = name.replace("language_model.", "")
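Two things happen in the hunk above. First, `__init__` keeps writing the legacy LLAMA arch for anything that is not `model_type == "ministral3"`, re-pointing `gguf_writer.arch` and rebuilding the tensor name map after `super().__init__()` has already set them up for MISTRAL3, so older checkpoints stay loadable in existing llama.cpp builds. Second, for `ministral3` the converter requires a YaRN-style `rope_parameters` block in the HF config and forwards it to the GGUF writer. Going only by the keys read above, the expected config block would look roughly like this (all values invented for illustration):

    # hypothetical config.json excerpt; key names taken from the reads above, values invented
    rope_parameters = {
        "rope_type": "yarn",
        "factor": 4.0,                              # -> add_rope_scaling_factor
        "beta_fast": 32.0,                          # -> add_rope_scaling_yarn_beta_fast
        "beta_slow": 1.0,                           # -> add_rope_scaling_yarn_beta_slow
        "mscale_all_dim": 1.0,                      # -> add_rope_scaling_yarn_log_mul
        "original_max_position_embeddings": 32768,  # -> add_rope_scaling_orig_ctx_len
        "rope_theta": 1000000.0,                    # -> add_rope_freq_base
        "llama_4_scaling_beta": 0.1,                # -> add_attn_temperature_scale
    }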
@@ -9809,12 +9851,22 @@ def modify_tensors(self, data_torch, name, bid):
 
 
 class MistralModel(LlamaModel):
-    model_arch = gguf.MODEL_ARCH.LLAMA
+    model_arch = gguf.MODEL_ARCH.MISTRAL3
     model_name = "Mistral"
     hf_arch = ""
     is_mistral_format = True
     undo_permute = False
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # for compatibility, we use LLAMA arch for older models
+        # TODO: remove this once everyone migrates to newer version of llama.cpp
+        if "llama_4_scaling" not in self.hparams:
+            self.model_arch = gguf.MODEL_ARCH.LLAMA
+            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
+            self.gguf_writer.add_architecture()
+            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
     @staticmethod
     def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool):
         assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg
@@ -9854,6 +9906,20 @@ def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mis
 
         return template
 
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if "yarn" in self.hparams:
+            yarn_params = self.hparams["yarn"]
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(yarn_params["factor"])
+            self.gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_params["beta"])
+            self.gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_params["alpha"])
+            self.gguf_writer.add_rope_scaling_yarn_log_mul(1.0)  # mscale_all_dim
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"])
+
+        if "llama_4_scaling" in self.hparams:
+            self.gguf_writer.add_attn_temperature_scale(self.hparams["llama_4_scaling"]["beta"])
+
 
 class PixtralModel(LlavaVisionModel):
     model_name = "Pixtral"
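The Mistral-native path above (the `set_gguf_parameters` addition in `MistralModel`) maps the same YaRN metadata from Mistral's own spelling: `beta` feeds `add_rope_scaling_yarn_beta_fast`, `alpha` feeds `add_rope_scaling_yarn_beta_slow`, and the log multiplier is hardcoded to 1.0. A hypothetical `params.json` excerpt matching the keys read above (values invented):

    # hypothetical Mistral-format params.json excerpt; values invented
    hparams = {
        "yarn": {
            "factor": 4.0,
            "beta": 32.0,   # -> add_rope_scaling_yarn_beta_fast
            "alpha": 1.0,   # -> add_rope_scaling_yarn_beta_slow
            "original_max_position_embeddings": 32768,
        },
        "llama_4_scaling": {"beta": 0.1},  # -> add_attn_temperature_scale
    }

The presence of the `llama_4_scaling` key is also what `MistralModel.__init__` uses to choose between the new MISTRAL3 arch and the legacy LLAMA fallback.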