@@ -7713,9 +7713,112 @@ def set_vocab(self):
             self.gguf_writer.add_chat_template(chat_template)
 
 
-@ModelBase.register("OpenAIMoeForCausalLM")
-class OpenAIMoeModel(TextModel):
-    model_arch = gguf.MODEL_ARCH.OPENAI_MOE
+@ModelBase.register("GptOssForCausalLM")
+class GptOssModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.GPT_OSS
+
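+    # The HF checkpoint stores MXFP4 expert weights as separate "*_blocks" tensors
+    # (two FP4 values packed per byte) and "*_scales" tensors (one E8M0 exponent per
+    # 32-value block, bias 127); the two helpers below either repack them into ggml's
+    # MXFP4 block format or dequantize them to F16.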
+    def repack_mxfp4(self, new_name: str, blocks: Tensor, scales: Tensor):
+        assert blocks.dtype == torch.uint8
+        assert scales.dtype == torch.uint8
+        scales = scales.unsqueeze(-1)
+        assert len(blocks.shape) == 4
+        assert len(scales.shape) == 4
+        new_data = torch.cat([scales, blocks], dim=-1)
+        new_data = new_data.numpy()
+        new_shape = [scales.shape[0], scales.shape[1], scales.shape[2] * 32]
+        logger.info(f"Repacked {new_name} with shape {new_shape} and quantization MXFP4")
+        self.gguf_writer.add_tensor(new_name, new_data, new_shape, gguf.GGMLQuantizationType.MXFP4)
+
+    def convert_moe_packed_tensors(
+        self,
+        new_name: str,
+        blocks,
+        scales,
+        *,
+        dtype: torch.dtype = torch.float16,
+        rows_per_chunk: int = 32768 * 1024,
+    ):
+        import math
+
+        scales = scales.to(torch.int32) - 127
+
+        assert blocks.shape[:-1] == scales.shape, f"{blocks.shape=} does not match {scales.shape=}"
+
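+        # FP4 (E2M1) lookup table; each packed byte holds two 4-bit indices into it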
+        FP4_VALUES = [
+            +0.0,
+            +0.5,
+            +1.0,
+            +1.5,
+            +2.0,
+            +3.0,
+            +4.0,
+            +6.0,
+            -0.0,
+            -0.5,
+            -1.0,
+            -1.5,
+            -2.0,
+            -3.0,
+            -4.0,
+            -6.0,
+        ]
+        blocks = blocks.to(device="cpu")
+        scales = scales.to(device="cpu")
+        lut = torch.tensor(FP4_VALUES, dtype=dtype, device=blocks.device)
+
+        *prefix_shape, G, B = blocks.shape
+        rows_total = math.prod(prefix_shape) * G
+
+        blocks = blocks.reshape(rows_total, B)
+        scales = scales.reshape(rows_total, 1)
+
+        out = torch.empty(rows_total, B * 2, dtype=dtype, device="cpu")
+
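+        # dequantize in row chunks to bound peak memory: map nibbles through the LUT,
+        # then scale each row (one 32-value block) by 2^(scale - 127) via ldexp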
+        for r0 in range(0, rows_total, rows_per_chunk):
+            r1 = min(r0 + rows_per_chunk, rows_total)
+
+            blk = blocks[r0:r1]
+            exp = scales[r0:r1]
+
+            # nibble indices -> int64
+            idx_lo = (blk & 0x0F).to(torch.long)
+            idx_hi = (blk >> 4).to(torch.long)
+
+            sub = out[r0:r1]
+            sub[:, 0::2] = lut[idx_lo]
+            sub[:, 1::2] = lut[idx_hi]
+
+            torch.ldexp(sub, exp, out=sub)
+            del idx_lo, idx_hi, blk, exp
+
+        out = out.reshape(*prefix_shape, G, B * 2).view(*prefix_shape, G * B * 2)
+        out = out.numpy()
+        logger.info(f"Unpacked {new_name} with shape {out.shape} from MXFP4 to F16")
+        print(out.dtype, out.device, out.shape)
+        self.gguf_writer.add_tensor(new_name, out)
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+        blocks0: Tensor = torch.zeros(1)
+        blocks1: Tensor = torch.zeros(1)
+        # we assume that tensors are loaded in the correct order
+        for name, data_torch in self.get_tensors():
+            if "mlp.experts.down_proj_blocks" in name:
+                blocks0 = data_torch
+            elif "mlp.experts.down_proj_scales" in name:
+                new_name = self.map_tensor_name(name.replace("_scales", ".weight"))
+                #self.repack_mxfp4(new_name, blocks0, data_torch)
+                self.convert_moe_packed_tensors(new_name, blocks0, data_torch)
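+            # gate_up_proj interleaves gate and up along dim 1: even rows -> gate, odd rows -> up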
+            elif "mlp.experts.gate_up_proj_blocks" in name:
+                blocks0, blocks1 = data_torch[:, ::2, :, :], data_torch[:, 1::2, :, :]
+            elif "mlp.experts.gate_up_proj_scales" in name:
+                scales0, scales1 = data_torch[:, ::2, :], data_torch[:, 1::2, :]
+                new_name_gate = self.map_tensor_name(name.replace("gate_up_proj_scales", "gate_proj.weight"))
+                new_name_up = self.map_tensor_name(name.replace("gate_up_proj_scales", "up_proj.weight"))
+                # self.repack_mxfp4(new_name_gate, blocks0, scales0)
+                # self.repack_mxfp4(new_name_up, blocks1, scales1)
+                self.convert_moe_packed_tensors(new_name_gate, blocks0, scales0)
+                self.convert_moe_packed_tensors(new_name_up, blocks1, scales1)
+        return []
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
@@ -7728,32 +7831,20 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             if name.endswith("_bias"):
                 name = name.replace("down_proj_bias", "down_proj.bias")
             else:
-                name = name.replace("down_proj", "down_proj.weight")
-                data_torch = data_torch.transpose(-1, -2)
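+                # MXFP4 down_proj blocks/scales are converted in generate_extra_tensors()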
+                return []
 
         # split the gate_up into gate and up
         if "gate_up_proj" in name:
             if name.endswith("_bias"):
                 name_up = name.replace("gate_up_proj_bias", "up_proj.bias")
                 name_gate = name.replace("gate_up_proj_bias", "gate_proj.bias")
-                #dim_half = data_torch.shape[-1] // 2
-                #gate_proj_bias, up_proj_bias = data_torch.split(dim_half, dim=-1)
                 gate_proj_bias, up_proj_bias = data_torch[..., ::2], data_torch[..., 1::2]
                 return [
                     (self.map_tensor_name(name_gate), gate_proj_bias),
                     (self.map_tensor_name(name_up), up_proj_bias)
                 ]
             else:
-                name_up = name.replace("gate_up_proj", "up_proj.weight")
-                name_gate = name.replace("gate_up_proj", "gate_proj.weight")
-                #dim_half = data_torch.shape[-1] // 2
-                #gate_proj_weight, up_proj_weight = data_torch.transpose(-1, -2).split(dim_half, dim=-2)
-                data_torch = data_torch.transpose(-1, -2)
-                gate_proj_weight, up_proj_weight = data_torch[:, ::2, :], data_torch[:, 1::2, :]
-                return [
-                    (self.map_tensor_name(name_gate), gate_proj_weight),
-                    (self.map_tensor_name(name_up), up_proj_weight)
-                ]
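+                # MXFP4 gate_up_proj blocks/scales are converted in generate_extra_tensors()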
+                return []
 
         return [(self.map_tensor_name(name), data_torch)]
 
@@ -7767,7 +7858,7 @@ def set_gguf_parameters(self):
 
         rope_scaling = self.hparams.get("rope_scaling") or {}
         rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
-        assert rope_type == "yarn", f"OpenAI MoE only supports yarn rope scaling, got {rope_type}"
+        assert rope_type == "yarn", f"GPT-OSS only supports yarn rope scaling, got {rope_type}"
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
         self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
         self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling.get("original_max_position_embeddings", 4096))
@@ -7912,6 +8003,7 @@ class LazyTorchTensor(gguf.LazyBase):
     _dtype_map: dict[torch.dtype, type] = {
         torch.float16: np.float16,
         torch.float32: np.float32,
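+        # uint8 covers the raw MXFP4 "blocks"/"scales" tensors loaded lazily above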
+        torch.uint8: np.uint8,
     }
 
     # used for safetensors slices