@@ -676,7 +676,25 @@ struct clip_graph {
676676 const int enc_n_patches = enc_image_size / enc_patch_size; // 64
677677
678678 ggml_tensor * inpL = build_enc_inp (inp_raw, enc_patch_size, enc_n_patches, enc_n_embd);
679- ggml_tensor * cur = ggml_add (ctx0, inpL, model.pos_embed );
679+ ggml_tensor * cur = nullptr ;
680+
681+ const auto tgt_size = inpL->ne [1 ];
682+ const auto str_size = model.pos_embed ->ne [1 ];
683+ if (str_size != tgt_size) {
684+ ggml_tensor * new_pos_embed = ggml_interpolate (
685+ ctx0,
686+ model.pos_embed ,
687+ tgt_size,
688+ tgt_size,
689+ enc_n_embd,
690+ 1 ,
691+ ggml_scale_mode::GGML_SCALE_MODE_BICUBIC
692+ );
693+ new_pos_embed = ggml_cont (ctx0, ggml_permute (ctx0, new_pos_embed, 2 ,1 ,0 ,3 ));
694+ cur = ggml_add (ctx0, inpL, new_pos_embed);
695+ } else {
696+ cur = ggml_add (ctx0, inpL, model.pos_embed );
697+ }
680698
681699 // loop over layers
682700 for (int il = 0 ; il < _depth; il++) {
@@ -840,10 +858,11 @@ struct clip_graph {
840858 ggml_tensor * global_features_2 = build_dp_ocr_clip (global_features_1);
841859
842860 // FIXME remove n_patches is hardcoded
843- int clip_n_patches = 256 ; // FIXME hardcoded for sam 1024x1024 with 16x16 patches
844861
845862 // torch global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
846863 global_features_1 = ggml_cont (ctx0,ggml_permute (ctx0, global_features_1,2 ,1 ,0 ,3 ));
864+ int clip_n_patches = global_features_1->ne [1 ] * global_features_1->ne [2 ];
865+
847866 // flatten 2nd and 3rd dims
848867 global_features_1 = ggml_reshape_2d (ctx0, global_features_1, global_features_1->ne [0 ], clip_n_patches);
849868
@@ -874,21 +893,24 @@ struct clip_graph {
874893 GGML_ASSERT (model.view_seperator != nullptr );
875894
876895 // 1) global_features: [n_dim, h*w] -> [n_dim, w, h] -> [h, w, n_dim]
877- ggml_tensor * t = ggml_reshape_4d (ctx0, global_features, 1280 , 16 , 16 , 1 ); // (n_dim, w, h)
896+ const auto h = static_cast <int >(std::sqrt (static_cast <float >(global_features->ne [1 ])));
897+ const auto w = h;
898+ const auto n_dim = global_features->ne [0 ];
899+ ggml_tensor * t = ggml_reshape_4d (ctx0, global_features, n_dim, h, w, 1 ); // (n_dim, w, h)
878900 t = ggml_cont (ctx0, ggml_permute (ctx0, t, 2 , 1 , 0 , 3 )); // (h, w, n_dim)
879901 ggml_tensor * nl = ggml_cont (ctx0,ggml_permute (ctx0, model.image_newline , 2 , 1 , 0 , 3 ));
880- nl = ggml_repeat_4d (ctx0, nl, 16 , 1 , 1280 , 1 ); // n_pos rows
902+ nl = ggml_repeat_4d (ctx0, nl, h , 1 , n_dim , 1 ); // n_pos rows
881903
882904
883905 // 2) image_newline: [n_dim] -> [1, 1, n_dim] -> repeat to [h, 1, n_dim]
884906 t = ggml_concat (ctx0, t, nl, 1 ); // (h, w+1, n_dim)
885907
886- t = ggml_reshape_2d (ctx0, t, 1280 , 16 * (16 + 1 )); // (n_dim, h*(w+1))
908+ t = ggml_reshape_2d (ctx0, t, n_dim, h * (h + 1 )); // (n_dim, h*(w+1))
887909
888910
889911 // 5) append view_separator as an extra "token":
890912 // view_separator: [n_dim] -> [n_dim, 1]
891- ggml_tensor * vs = ggml_reshape_2d (ctx0, model.view_seperator , 1280 , 1 ); // (n_dim, 1)
913+ ggml_tensor * vs = ggml_reshape_2d (ctx0, model.view_seperator , n_dim , 1 ); // (n_dim, 1)
892914
893915 // concat along token dimension (dim=1):
894916 t = ggml_concat (ctx0, t, vs, 1 ); // (n_dim, h*(w+1) + 1)
@@ -1547,10 +1569,35 @@ struct clip_graph {
15471569 ggml_tensor * inp = ggml_cpy (ctx0, patch_embeds, ggml_dup_tensor (ctx0, patch_embeds));
15481570
15491571
1550- const int n_pos = 257 ; // +1 for [CLS]
15511572 inp = ggml_cont (ctx0,ggml_permute (ctx0, inp,2 ,1 ,0 ,3 ));
15521573 inp = ggml_reshape_2d (ctx0, inp, n_embd, inp->ne [1 ]*inp->ne [2 ]*inp->ne [3 ]);
15531574
1575+ ggml_tensor * new_pos_embd = ggml_cpy (ctx0, model.position_embeddings , ggml_dup_tensor (ctx0, model.position_embeddings ));
1576+
1577+ int n_pos = new_pos_embd->ne [1 ]; // +1 for [CLS]
1578+ const auto tgt_size = static_cast <int >(std::sqrt (inp->ne [1 ]));
1579+ const auto src_size = static_cast <int >(std::sqrt (n_pos - 1 ));
1580+
1581+
1582+ if (tgt_size != src_size) {
1583+ // ggml_tensor * old_pos_embd = ggml_new_tensor_2d(ctx0, model.position_embeddings->type, model.position_embeddings->ne[0], str_size * str_size);
1584+ ggml_tensor * old_pos_embd = ggml_view_2d (ctx0, new_pos_embd,
1585+ new_pos_embd->ne [0 ], src_size * src_size,
1586+ ggml_row_size (new_pos_embd->type , new_pos_embd->ne [0 ]), 0 );
1587+ ggml_tensor * cls_tok = ggml_view_2d (ctx0, new_pos_embd,
1588+ new_pos_embd->ne [0 ], 1 ,
1589+ ggml_row_size (new_pos_embd->type , new_pos_embd->ne [0 ]), src_size * src_size);
1590+ new_pos_embd = ggml_interpolate (ctx0,
1591+ old_pos_embd,
1592+ tgt_size,
1593+ tgt_size,
1594+ new_pos_embd->ne [0 ], 1 , GGML_SCALE_MODE_BICUBIC);
1595+ new_pos_embd = ggml_reshape_3d (ctx0, new_pos_embd, n_embd, tgt_size * tgt_size, 1 );
1596+ // new_pos_embd = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embd, 2,1,0,3));
1597+ new_pos_embd = ggml_concat (ctx0, new_pos_embd, cls_tok, 1 );
1598+ n_pos = tgt_size * tgt_size + 1 ;
1599+ }
1600+
15541601
15551602
15561603 // add CLS token
@@ -1560,11 +1607,8 @@ struct clip_graph {
15601607 norm_type norm_t = NORM_TYPE_NORMAL;
15611608
15621609 // for selecting learned pos embd, used by ViT
1563- ggml_tensor * positions = ggml_new_tensor_1d (ctx0, GGML_TYPE_I32, n_pos);
1564- ggml_set_name (positions, " positions" );
1565- ggml_set_input (positions);
1566-
1567- ggml_tensor * learned_pos_embd = ggml_get_rows (ctx0, model.position_embeddings , positions);
1610+ ggml_tensor * positions = ggml_cast (ctx0, ggml_arange (ctx0, 0 , n_pos, 1 ), GGML_TYPE_I32);
1611+ ggml_tensor * learned_pos_embd = ggml_get_rows (ctx0, new_pos_embd, positions);
15681612
15691613
15701614 ggml_tensor * cur = build_vit (inp, n_pos, norm_t , hparams.ffn_op , learned_pos_embd,
@@ -2525,7 +2569,27 @@ struct clip_graph {
25252569 const int64_t C = rel_pos->ne [0 ]; // channels
25262570 const int64_t L = rel_pos->ne [1 ]; // length
25272571
2528- GGML_ASSERT (2 *std::max (q_size, k_size) - 1 == L);
2572+ // GGML_ASSERT(2*std::max(q_size, k_size) - 1 == L);
2573+
2574+ const auto max_rel_dist = 2 *std::max (q_size, k_size) - 1 ;
2575+ ggml_tensor * rel_pos_resized = rel_pos;
2576+
2577+ if (max_rel_dist != L) {
2578+ // Linear interpolation
2579+ const auto scale = L / static_cast <float >(max_rel_dist);
2580+ ggml_tensor * indices = ggml_arange (ctx, 0 .0f , static_cast <float >(max_rel_dist), 1 .0f );
2581+ indices = ggml_scale_inplace (ctx, indices, scale);
2582+ ggml_tensor * indices_floor= ggml_cast (ctx, ggml_floor (ctx, indices), GGML_TYPE_I32);
2583+ ggml_tensor * indices_ceil = ggml_cast (ctx, ggml_ceil (ctx, indices), GGML_TYPE_I32);
2584+ ggml_tensor * weights = ggml_sub (ctx, indices, indices_floor);
2585+ ggml_tensor * ws1 = ggml_scale_bias (ctx, weights, -1 .0f , 1 .0f );
2586+ rel_pos_resized = ggml_cont (ctx , ggml_permute (ctx, rel_pos_resized, 1 , 0 , 2 , 3 )); // [C, L] for ggml_get_rows
2587+ ggml_tensor * rs1 = ggml_cont (ctx, ggml_permute (ctx, ggml_get_rows (ctx, rel_pos_resized, indices_floor), 1 , 0 , 2 , 3 )); // lower rows
2588+ rs1 = ggml_mul (ctx, rs1, ws1); // lower rows
2589+ ggml_tensor * rs2 = ggml_cont (ctx, ggml_permute (ctx, ggml_get_rows (ctx, rel_pos_resized, indices_ceil), 1 , 0 , 2 , 3 )); // upper rows
2590+ rs2 = ggml_mul (ctx, rs2, weights); // upper rows
2591+ rel_pos_resized = ggml_add (ctx,rs1, rs2);
2592+ }
25292593
25302594 // -------------------------------------------------
25312595 // 1) q_idx ← arange(0..q_size-1) [q_size]
@@ -5007,7 +5071,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
50075071 if (!params.crop_mode ) {
50085072 /* Native Resolution (Tiny/Small/Base/Large) */
50095073
5010- const int native_resolutions[] = {
5074+ const int native_resolutions[] = {
50115075 512 /* tiny */ , 640 /* small */ , 1024 /* base */ , 1280 /* large */
50125076 };
50135077 // original image size
@@ -5060,7 +5124,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
50605124 img_tool::resize (*img, *scaled_img, clip_image_size{new_w, new_h},
50615125 img_tool::RESIZE_ALGO_BICUBIC);
50625126
5063- // Use mean color for padding
5127+ // Use mean color for padding
50645128 unsigned char pad_r = static_cast <unsigned char >(params.image_mean [0 ] * 255 .0f );
50655129 unsigned char pad_g = static_cast <unsigned char >(params.image_mean [1 ] * 255 .0f );
50665130 unsigned char pad_b = static_cast <unsigned char >(params.image_mean [2 ] * 255 .0f );
@@ -5352,6 +5416,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
53525416 int x_patch = img->nx / (params.patch_size );
53535417
53545418 n_patches += x_patch + 1 ;
5419+ n_patches = 1280 ;
5420+
53555421
53565422 } break ;
53575423 default :
@@ -5690,14 +5756,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
56905756 } break ;
56915757 case PROJECTOR_TYPE_DEEPSEEKOCR:
56925758 {
5693- // FIXME we need correct this when all model configs are set correctly
5694- // n_patch is not correct right now
5695- int32_t n_pos = 16 * 16 + 1 ; // hardcode for now
5696- std::vector<int32_t > positions (n_pos);
5697- for (int i = 0 ; i < n_pos; i++) {
5698- positions[i] = i;
5699- }
5700- set_input_i32 (" positions" , positions);
57015759 } break ;
57025760 case PROJECTOR_TYPE_LLAMA4:
57035761 {
0 commit comments