@@ -345,7 +345,7 @@ def get_non_negative_vision_feature_layers(v_hparams):
345345 if args .clip_model_is_siglip :
346346 visual_projection_dim = 0
347347 else :
348- visual_projection_dim = v_hparams .get ("projection_dim" , config [ "projection_dim" ] )
348+ visual_projection_dim = v_hparams .get ("projection_dim" , config . get ( "projection_dim" , 0 ) )
349349
350350 # set vision_model hparams
351351 fout .add_uint32 ("clip.vision.image_size" , v_hparams ["image_size" ])
@@ -431,30 +431,35 @@ def get_non_negative_vision_feature_layers(v_hparams):
431431 fout .add_array ("clip.vision.image_mean" , image_mean )
432432 fout .add_array ("clip.vision.image_std" , image_std )
433433
434- use_gelu = v_hparams [ "hidden_act" ] == "gelu"
434+ use_gelu = v_hparams . get ( "hidden_act" , "" ) == "gelu"
435435fout .add_bool ("clip.use_gelu" , use_gelu )
436436
437437
438438if has_llava_projector :
439439 # By default, we drop the last layer for llava projector
440440 # models unless we have explicitly set vision feature layers
441441 if feature_layers is None :
442- model .vision_model .encoder .layers .pop (- 1 )
442+ # Phi-3-vision keeps its CLIP encoder under model.vision_embed_tokens.img_processor, so drop the last layer there
443+ if args .projector_type == "phi3_v" :
444+ model .model .vision_embed_tokens .img_processor .vision_model .encoder .layers .pop (- 1 )
445+ else :
446+ model .vision_model .encoder .layers .pop (- 1 )
443447 else :
444448 model .vision_model .encoder .layers = model .vision_model .encoder .layers [:max (feature_layers )]
445449
446- projector = torch .load (args .llava_projector )
447- for name , data in projector .items ():
448- name = get_tensor_name (name )
449- # pw and dw conv ndim==4
450- if data .ndim == 2 or data .ndim == 4 :
451- data = data .squeeze ().numpy ().astype (np .float16 )
452- else :
453- data = data .squeeze ().numpy ().astype (np .float32 )
450+ if args .llava_projector :
451+ projector = torch .load (args .llava_projector )
452+ for name , data in projector .items ():
453+ name = get_tensor_name (name )
454+ # pw and dw conv ndim==4
455+ if data .ndim == 2 or data .ndim == 4 :
456+ data = data .squeeze ().numpy ().astype (np .float16 )
457+ else :
458+ data = data .squeeze ().numpy ().astype (np .float32 )
454459
455- fout .add_tensor (name , data )
460+ fout .add_tensor (name , data )
456461
457- print ("Projector tensors added\n " )
462+ print ("Projector tensors added\n " )
458463
459464print ("Processing model tensors..." )
460465state_dict = model .state_dict ()
0 commit comments