Skip to content

Commit be9f100

Browse files
Fixing some parsing issues
1 parent 0fcca6a commit be9f100

File tree

1 file changed

+18
-13
lines changed

1 file changed

+18
-13
lines changed

tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ def get_non_negative_vision_feature_layers(v_hparams):
345345
if args.clip_model_is_siglip:
346346
visual_projection_dim = 0
347347
else:
348-
visual_projection_dim = v_hparams.get("projection_dim", config["projection_dim"])
348+
visual_projection_dim = v_hparams.get("projection_dim", config.get("projection_dim", 0))
349349

350350
# set vision_model hparams
351351
fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
@@ -431,30 +431,35 @@ def get_non_negative_vision_feature_layers(v_hparams):
431431
fout.add_array("clip.vision.image_mean", image_mean)
432432
fout.add_array("clip.vision.image_std", image_std)
433433

434-
use_gelu = v_hparams["hidden_act"] == "gelu"
434+
use_gelu = v_hparams.get("hidden_act","") == "gelu"
435435
fout.add_bool("clip.use_gelu", use_gelu)
436436

437437

438438
if has_llava_projector:
439439
# By default, we drop the last layer for llava projector
440440
# models unless we have explicitly set vision feature layers
441441
if feature_layers is None:
442-
model.vision_model.encoder.layers.pop(-1)
442+
# Phi-3 Specific Keys
443+
if args.projector_type == "phi3_v":
444+
model.model.vision_embed_tokens.img_processor.vision_model.encoder.layers.pop(-1)
445+
else:
446+
model.vision_model.encoder.layers.pop(-1)
443447
else:
444448
model.vision_model.encoder.layers = model.vision_model.encoder.layers[:max(feature_layers)]
445449

446-
projector = torch.load(args.llava_projector)
447-
for name, data in projector.items():
448-
name = get_tensor_name(name)
449-
# pw and dw conv ndim==4
450-
if data.ndim == 2 or data.ndim == 4:
451-
data = data.squeeze().numpy().astype(np.float16)
452-
else:
453-
data = data.squeeze().numpy().astype(np.float32)
450+
if args.llava_projector:
451+
projector = torch.load(args.llava_projector)
452+
for name, data in projector.items():
453+
name = get_tensor_name(name)
454+
# pw and dw conv ndim==4
455+
if data.ndim == 2 or data.ndim == 4:
456+
data = data.squeeze().numpy().astype(np.float16)
457+
else:
458+
data = data.squeeze().numpy().astype(np.float32)
454459

455-
fout.add_tensor(name, data)
460+
fout.add_tensor(name, data)
456461

457-
print("Projector tensors added\n")
462+
print("Projector tensors added\n")
458463

459464
print("Processing model tensors...")
460465
state_dict = model.state_dict()

0 commit comments

Comments (0)