@@ -662,19 +662,19 @@ struct clip_graph {
662662
663663 ggml_cgraph * build_deepseek_ocr () {
664664 // patch embedding
665- ggml_tensor * inp_raw = build_inp_raw ();
666- ggml_tensor * sam_out = build_sam (inp_raw);
665+ ggml_tensor * inp_raw = build_inp_raw ();
666+ ggml_tensor * sam_out = build_sam (inp_raw);
667667 ggml_tensor * clip_out = build_dsocr_clip (sam_out);
668-
668+
669669 int clip_n_patches = sam_out->ne [0 ] * sam_out->ne [1 ];
670-
671- sam_out = ggml_cont (ctx0, ggml_permute (ctx0, sam_out, 1 , 2 , 0 , 3 ));
672- sam_out = ggml_reshape_2d (ctx0, sam_out, sam_out->ne [0 ], clip_n_patches);
670+
671+ sam_out = ggml_cont (ctx0, ggml_permute (ctx0, sam_out, 1 , 2 , 0 , 3 ));
672+ sam_out = ggml_reshape_2d (ctx0, sam_out, sam_out->ne [0 ], clip_n_patches);
673673 clip_out = ggml_view_2d (ctx0, clip_out, n_embd, clip_n_patches, clip_out->nb [1 ], clip_out->nb [1 ]);
674-
674+
675675 ggml_tensor * cur;
676676 cur = ggml_concat (ctx0, clip_out, sam_out, 0 );
677- cur = ggml_reshape_2d (ctx0, cur, 2 * n_embd,clip_n_patches);
677+ cur = ggml_reshape_2d (ctx0, cur, 2 * n_embd, clip_n_patches);
678678 cur = ggml_cont (ctx0, cur);
679679 cur = ggml_mul_mat (ctx0, model.fc_w , cur);
680680 cur = ggml_add (ctx0, cur, model.fc_b );
@@ -687,10 +687,10 @@ struct clip_graph {
687687 ggml_tensor * vs;
688688
689689 imgnl = ggml_repeat_4d (ctx0, model.image_newline , n_dim, 1 , h, 1 );
690- vs = ggml_reshape_2d (ctx0, model.view_seperator , n_dim, 1 ); // (n_dim, 1)
691- cur = ggml_reshape_3d (ctx0, cur, n_dim, w, h);
692- cur = ggml_reshape_2d (ctx0, ggml_concat (ctx0, cur, imgnl, 1 ), n_dim, (w+ 1 )* h);
693- cur = ggml_concat (ctx0, cur, vs, 1 ); // (n_dim, h*(w+1) + 1)
690+ vs = ggml_reshape_2d (ctx0, model.view_seperator , n_dim, 1 ); // (n_dim, 1)
691+ cur = ggml_reshape_3d (ctx0, cur, n_dim, w, h);
692+ cur = ggml_reshape_2d (ctx0, ggml_concat (ctx0, cur, imgnl, 1 ), n_dim, (w + 1 ) * h);
693+ cur = ggml_concat (ctx0, cur, vs, 1 ); // (n_dim, h*(w+1) + 1)
694694
695695 cb (cur, " dsocr_output" , -1 );
696696
@@ -2156,7 +2156,7 @@ struct clip_graph {
21562156 ggml_tensor * Qcur;
21572157 ggml_tensor * Kcur;
21582158 ggml_tensor * Vcur;
2159-
2159+
21602160 if (layer.qkv_w ) {
21612161 ggml_tensor * QKV;
21622162
@@ -2181,12 +2181,12 @@ struct clip_graph {
21812181 if (layer.q_b ) {
21822182 Qcur = ggml_add (ctx0, Qcur, layer.q_b );
21832183 }
2184-
2184+
21852185 Kcur = ggml_mul_mat (ctx0, layer.k_w , cur);
21862186 if (layer.k_b ) {
21872187 Kcur = ggml_add (ctx0, Kcur, layer.k_b );
21882188 }
2189-
2189+
21902190 Vcur = ggml_mul_mat (ctx0, layer.v_w , cur);
21912191 if (layer.v_b ) {
21922192 Vcur = ggml_add (ctx0, Vcur, layer.v_b );
@@ -2591,7 +2591,7 @@ struct clip_graph {
25912591 } else {
25922592 ggml_tensor * v = ggml_permute (ctx0, v_cur, 1 , 2 , 0 , 3 );
25932593 v = ggml_cont (ctx0, v);
2594-
2594+
25952595 ggml_tensor * kq = ggml_mul_mat (ctx0, k, q);
25962596 // F32 may not be needed for vision encoders?
25972597 // ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
@@ -2727,11 +2727,11 @@ struct clip_graph {
27272727 const int d_heads = n_embd / n_heads;
27282728
27292729 ggml_tensor * inpL;
2730-
2730+
27312731 inpL = ggml_conv_2d_sk_p0 (ctx0, model.patch_embed_proj_w , inp_raw);
27322732 inpL = ggml_add (ctx0, inpL, ggml_reshape_3d (ctx0, model.patch_embed_proj_b , 1 , 1 , n_embd));
27332733 inpL = ggml_cont (ctx0, ggml_permute (ctx0, inpL, 1 , 2 , 0 , 3 ));
2734-
2734+
27352735 ggml_tensor * cur;
27362736 const auto tgt_size = inpL->ne [1 ];
27372737 const auto str_size = model.pos_embed ->ne [1 ];
@@ -2776,7 +2776,7 @@ struct clip_graph {
27762776 // self-attention
27772777 {
27782778 const int B = cur->ne [3 ];
2779-
2779+
27802780 cur = ggml_mul_mat (ctx0, layer.qkv_w , cur);
27812781 cur = ggml_add (ctx0, cur, layer.qkv_b );
27822782 cur = ggml_cont (ctx0, cur); // Ensure tensor is contiguous before reshape
@@ -2853,7 +2853,7 @@ struct clip_graph {
28532853 cur = ggml_cont (ctx0, ggml_permute (ctx0, cur, 1 , 2 , 0 , 3 ));
28542854 cur = build_norm (cur, model.neck_1_w , model.neck_1_b , NORM_TYPE_NORMAL, hparams.eps , -1 );
28552855 cur = ggml_cont (ctx0, ggml_permute (ctx0, cur, 2 , 0 , 1 , 3 ));
2856-
2856+
28572857 cur = ggml_conv_2d (ctx0, model.neck_2_w , cur, 1 , 1 , 1 , 1 , 1 , 1 );
28582858 cur = ggml_cont (ctx0, ggml_permute (ctx0, cur, 1 , 2 , 0 , 3 ));
28592859 cur = build_norm (cur, model.neck_3_w , model.neck_3_b , NORM_TYPE_NORMAL, hparams.eps , -1 );
@@ -2883,7 +2883,7 @@ struct clip_graph {
28832883 if (tgt_size != src_size) {
28842884 ggml_tensor * old_pos_embd;
28852885 ggml_tensor * cls_tok;
2886-
2886+
28872887 old_pos_embd = ggml_view_2d (
28882888 ctx0, new_pos_embd,
28892889 new_pos_embd->ne [0 ], src_size * src_size,
@@ -2912,7 +2912,7 @@ struct clip_graph {
29122912 ggml_tensor * positions = ggml_cast (ctx0, ggml_arange (ctx0, 0 , n_pos, 1 ), GGML_TYPE_I32);
29132913 ggml_tensor * learned_pos_embd = ggml_get_rows (ctx0, new_pos_embd, positions);
29142914
2915- ggml_tensor * cur = build_vit (inp, n_pos, NORM_TYPE_NORMAL, ffn_op_type::FFN_GELU_QUICK,
2915+ ggml_tensor * cur = build_vit (inp, n_pos, NORM_TYPE_NORMAL, ffn_op_type::FFN_GELU_QUICK,
29162916 learned_pos_embd, nullptr ); // shape [1024, 16, 16]
29172917
29182918 ggml_build_forward_expand (gf, cur);
@@ -5193,11 +5193,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
51935193 const int orig_h = original_size.height ;
51945194 const int orig_area = orig_h * orig_w;
51955195 std::array<uint8_t , 3u > color;
5196-
5196+
51975197 for (int i = 0 ; i < 3 ; i++) {
51985198 color[i] = (int )(255 * params.image_mean [i]);
51995199 }
5200-
5200+
52015201 int mode_i = 0 ;
52025202 int min_diff = orig_area;
52035203
@@ -5212,7 +5212,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
52125212 if (mode_i < 2 ) {
52135213 /* Native Resolution (Tiny/Small) */
52145214 const int image_size = native_resolutions[mode_i];
5215-
5215+
52165216 // Just resize the image to image_size × image_size
52175217 clip_image_u8_ptr resized_img (clip_image_u8_init ());
52185218 img_tool::resize (*img, *resized_img,
@@ -5229,7 +5229,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
52295229 else if (mode_i < 4 ) {
52305230 /* Native Resolution (Base/Large) */
52315231 const int image_size = native_resolutions[mode_i];
5232-
5232+
52335233 // Resize maintaining aspect ratio, then pad to square
52345234 float scale = std::min (
52355235 static_cast <float >(image_size) / orig_w,
@@ -5286,7 +5286,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
52865286 else {
52875287 GGML_ABORT (" DeepSeek-OCR hasn't supported Gundam/Gundam-Master yet" );
52885288 /* Dynamic Resolution (Gundam/Gundam-Master) */
5289-
5289+
52905290 // configurable, or read from params
52915291 const int min_num = 2 ;
52925292 const int max_num = 9 ;
@@ -5295,18 +5295,18 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
52955295 // original image size
52965296 const int orig_w = original_size.width ;
52975297 const int orig_h = original_size.height ;
5298-
5298+
52995299 // create overview image (thumbnail)
53005300 clip_image_u8_ptr overview_img (clip_image_u8_init ());
5301- img_tool::resize (*img, *overview_img, { image_size, image_size },
5301+ img_tool::resize (*img, *overview_img, { image_size, image_size },
53025302 img_tool::RESIZE_ALGO_BICUBIC_PILLOW, true , color);
53035303 clip_image_f32_ptr overview_f32 (clip_image_f32_init ());
53045304 normalize_image_u8_to_f32 (*overview_img, *overview_f32, params.image_mean , params.image_std );
53055305 res_imgs->entries .push_back (std::move (overview_f32));
53065306
53075307 // build candidate grids (cols, rows)
53085308 auto target_ratios = ds_build_target_ratios (min_num, max_num);
5309-
5309+
53105310 // pick the grid that best matches the original aspect ratio
53115311 const float aspect_ratio = static_cast <float >(orig_w) / static_cast <float >(orig_h);
53125312 auto best = ds_find_closest_ratio (aspect_ratio, target_ratios, orig_w, orig_h, image_size);
@@ -5315,7 +5315,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
53155315
53165316 // resize to refined size (no padding, direct resize)
53175317 clip_image_u8_ptr refined_img (clip_image_u8_init ());
5318- img_tool::resize (*img, *refined_img, { image_size * grid_cols, image_size * grid_rows },
5318+ img_tool::resize (*img, *refined_img, { image_size * grid_cols, image_size * grid_rows },
53195319 img_tool::RESIZE_ALGO_BICUBIC_PILLOW, false );
53205320
53215321 // crop slices from the refined image
0 commit comments