@@ -193,8 +193,6 @@ struct clip_hparams {
     int32_t attn_window_size = 0;
     int32_t n_wa_pattern = 0;
 
-    bool crop_mode = false;
-
     // audio
     int32_t n_mel_bins = 0; // whisper preprocessor
     int32_t proj_stack_factor = 0; // ultravox
@@ -208,6 +206,9 @@ struct clip_hparams {
     int32_t custom_image_min_tokens = -1;
     int32_t custom_image_max_tokens = -1;
 
+    // DeepSeek-OCR resolution mode
+    enum clip_dsocr_mode dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_AUTO;
+
     void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
         const int cur_merge = n_merge == 0 ? 1 : n_merge;
         const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
@@ -512,6 +513,7 @@ struct clip_ctx {
         if (ctx_params.image_max_tokens > 0) {
             model.hparams.custom_image_max_tokens = ctx_params.image_max_tokens;
         }
+        model.hparams.dsocr_mode = ctx_params.dsocr_mode;
 
         backend_ptrs.push_back(backend_cpu);
         backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
@@ -3403,7 +3405,6 @@ struct clip_model_loader {
                     hparams.patch_size = 16;
                     hparams.image_size = 1024;
                     hparams.warmup_image_size = 1024;
-                    hparams.crop_mode = false;
                 } break;
             default:
                 break;
@@ -5054,9 +5055,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 }
             } break;
         case PROJECTOR_TYPE_DEEPSEEKOCR:
-            if (!params.crop_mode) {
-                /* Native Resolution (Tiny/Small/Base/Large) */
-
+            {
                 const int native_resolutions[] = {
                     512 /* tiny */, 640 /* small */, 1024 /* base */, 1280 /* large */
                 };
@@ -5065,29 +5064,44 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 const int orig_h = original_size.height;
                 const int orig_area = orig_h * orig_w;
                 std::array<uint8_t, 3u> color;
-
+
                 for (int i = 0; i < 3; i++) {
                     color[i] = (int)(255 * params.image_mean[i]);
                 }
-
-                // mode selection logic (find most suitable resolution)
+
                 int mode_i = 0;
-                int min_diff = orig_area;
-
-                for (int i = 0; i < 4; i++) {
-                    int r = native_resolutions[i];
-                    if (std::abs(orig_area - r*r) < min_diff) {
-                        mode_i = i;
-                        min_diff = std::abs(orig_area - r*r);
+
+                if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_TINY) {
+                    mode_i = 0;
+                } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_SMALL) {
+                    mode_i = 1;
+                } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_BASE) {
+                    mode_i = 2;
+                } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_LARGE) {
+                    mode_i = 3;
+                } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_GUNDAM) {
+                    mode_i = 4;
+                } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_GUNDAM_MASTER) {
+                    mode_i = 5;
+                } else {
+                    if (params.dsocr_mode != clip_dsocr_mode::CLIP_DSOCR_MODE_AUTO) {
+                        LOG_WRN("%s: unknown dsocr_mode, using auto mode\n", __func__);
+                    }
+                    int min_diff = orig_area;
+                    for (int i = 0; i < 4; i++) {
+                        int r = native_resolutions[i];
+                        if (std::abs(orig_area - r*r) < min_diff) {
+                            mode_i = i;
+                            min_diff = std::abs(orig_area - r*r);
+                        }
                     }
                 }
 
-                const int image_size = native_resolutions[mode_i];
-
                 if (mode_i < 2) {
-                    // TINY/SMALL MODE: Direct resize (no slicing)
+                    /* Native Resolution (Tiny/Small) */
+                    const int image_size = native_resolutions[mode_i];
+
                     // Just resize the image to image_size × image_size
-
                     clip_image_u8_ptr resized_img(clip_image_u8_init());
                     img_tool::resize(*img, *resized_img,
                                      clip_image_size{image_size, image_size},
@@ -5100,10 +5114,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                     res_imgs->grid_x = 1;
                     res_imgs->grid_y = 1;
                 }
-                else {
-                    // BASE/LARGE MODE: Resize with aspect ratio + padding
+                else if (mode_i < 4) {
+                    /* Native Resolution (Base/Large) */
+                    const int image_size = native_resolutions[mode_i];
+
                     // Resize maintaining aspect ratio, then pad to square
-
                     float scale = std::min(
                         static_cast<float>(image_size) / orig_w,
                         static_cast<float>(image_size) / orig_h
@@ -5120,7 +5135,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                     unsigned char pad_g = static_cast<unsigned char>(params.image_mean[1] * 255.0f);
                     unsigned char pad_b = static_cast<unsigned char>(params.image_mean[2] * 255.0f);
 
-                    // Step 2: Pad to image_size × image_size (center padding)
+                    // Pad to image_size × image_size (center padding)
                     clip_image_u8_ptr padded_img(clip_image_u8_init());
                     padded_img->nx = image_size;
                     padded_img->ny = image_size;
@@ -5148,76 +5163,77 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                         }
                     }
 
-                    // Step 3: Normalize and output
+                    // Normalize and output
                     clip_image_f32_ptr res(clip_image_f32_init());
                     normalize_image_u8_to_f32(*padded_img, *res, params.image_mean, params.image_std);
                     res_imgs->entries.push_back(std::move(res));
 
                     res_imgs->grid_x = 1;
                     res_imgs->grid_y = 1;
                 }
-            }
-            else {
-                /* Dynamic Resolution (Gundam/Gundam-M) */
-
-                // configurable, or read from params
-                const int min_num = 2;
-                const int max_num = 9;
-                const int image_size = params.image_size; // typically 640
-                // const bool use_thumbnail = true; // mimic python's use_thumbnail
-
-                // original image size
-                const int orig_w = original_size.width;
-                const int orig_h = original_size.height;
-
-                // 1) build candidate grids (cols, rows)
-                auto target_ratios = ds_build_target_ratios(min_num, max_num);
-
-                // 2) pick the grid that best matches the original aspect ratio
-                const float aspect_ratio = static_cast<float>(orig_w) / static_cast<float>(orig_h);
-                auto best = ds_find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_w, orig_h, image_size);
-                const int grid_cols = best.first; // how many tiles horizontally
-                const int grid_rows = best.second; // how many tiles vertically
-
-                // 3) compute the target (forced) size - python did:
-                //    target_width = image_size * cols
-                //    target_height = image_size * rows
-                const clip_image_size refined_size{ image_size * grid_cols, image_size * grid_rows };
-
-                // 4) prepare slice instructions, same style as the idefics3 branch
-                llava_uhd::slice_instructions instructions;
-                instructions.overview_size = clip_image_size{ image_size, image_size }; // for thumbnail/global
-                instructions.refined_size = refined_size;
-                instructions.grid_size = clip_image_size{ grid_cols, grid_rows };
-
-                // in deepseek python they always produce *full* 640x640 blocks,
-                // so we can do a simple double loop over rows/cols:
-                for (int r = 0; r < grid_rows; ++r) {
-                    for (int c = 0; c < grid_cols; ++c) {
-                        const int x = c * image_size;
-                        const int y = r * image_size;
-
-                        instructions.slices.push_back(llava_uhd::slice_coordinates{
-                            /* x */ x,
-                            /* y */ y,
-                            /* size */ clip_image_size{ image_size, image_size }
-                        });
+                else {
+                    GGML_ABORT("DeepSeek-OCR: Gundam/Gundam-Master haven't been tested yet.\n");
+                    /* Dynamic Resolution (Gundam/Gundam-Master) */
+
+                    // configurable, or read from params
+                    const int min_num = 2;
+                    const int max_num = 9;
+                    const int image_size = params.image_size; // typically 640
+                    // const bool use_thumbnail = true; // mimic python's use_thumbnail
+
+                    // original image size
+                    const int orig_w = original_size.width;
+                    const int orig_h = original_size.height;
+
+                    // 1) build candidate grids (cols, rows)
+                    auto target_ratios = ds_build_target_ratios(min_num, max_num);
+
+                    // 2) pick the grid that best matches the original aspect ratio
+                    const float aspect_ratio = static_cast<float>(orig_w) / static_cast<float>(orig_h);
+                    auto best = ds_find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_w, orig_h, image_size);
+                    const int grid_cols = best.first; // how many tiles horizontally
+                    const int grid_rows = best.second; // how many tiles vertically
+
+                    // 3) compute the target (forced) size - python did:
+                    //    target_width = image_size * cols
+                    //    target_height = image_size * rows
+                    const clip_image_size refined_size{ image_size * grid_cols, image_size * grid_rows };
+
+                    // 4) prepare slice instructions, same style as the idefics3 branch
+                    llava_uhd::slice_instructions instructions;
+                    instructions.overview_size = clip_image_size{ image_size, image_size }; // for thumbnail/global
+                    instructions.refined_size = refined_size;
+                    instructions.grid_size = clip_image_size{ grid_cols, grid_rows };
+
+                    // in deepseek python they always produce *full* 640x640 blocks,
+                    // so we can do a simple double loop over rows/cols:
+                    for (int r = 0; r < grid_rows; ++r) {
+                        for (int c = 0; c < grid_cols; ++c) {
+                            const int x = c * image_size;
+                            const int y = r * image_size;
+
+                            instructions.slices.push_back(llava_uhd::slice_coordinates{
+                                /* x */ x,
+                                /* y */ y,
+                                /* size */ clip_image_size{ image_size, image_size }
+                            });
+                        }
                     }
+
+                    // 5) run the actual slicing (this should: resize to refined_size, then crop every slice)
+                    auto imgs = llava_uhd::slice_image(img, instructions);
+
+                    // 7) cast & normalize like the idefics3 branch
+                    for (size_t i = 0; i < imgs.size(); ++i) {
+                        clip_image_f32_ptr res(clip_image_f32_init());
+                        normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
+                        res_imgs->entries.push_back(std::move(res));
+                    }
+
+                    // keep the grid info - the model may need to know how to reassemble / attend
+                    res_imgs->grid_x = grid_cols;
+                    res_imgs->grid_y = grid_rows;
                 }
-
-                // 5) run the actual slicing (this should: resize to refined_size, then crop every slice)
-                auto imgs = llava_uhd::slice_image(img, instructions);
-
-                // 7) cast & normalize like the idefics3 branch
-                for (size_t i = 0; i < imgs.size(); ++i) {
-                    clip_image_f32_ptr res(clip_image_f32_init());
-                    normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
-                    res_imgs->entries.push_back(std::move(res));
-                }
-
-                // keep the grid info - the model may need to know how to reassemble / attend
-                res_imgs->grid_x = grid_cols;
-                res_imgs->grid_y = grid_rows;
             }
             break;
52235239
@@ -5807,7 +5823,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
 
                 for (auto & p : patterns) {
                     if (tname_s == p) {
-                        save_tensor_to_file(t);
+                        save_tensor_to_file(t, data.data());
                         is_stored = true;
                         break;
                     }
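
Note: the hunks above use a clip_dsocr_mode enum and a dsocr_mode field on the context parameters whose declarations are outside this diff. Below is a minimal sketch of what that declaration could look like, assuming it lives alongside the other clip context parameters; the enumerator names are taken from the code above, while the ordering, numeric values, and header location are assumptions.

    // Hypothetical declaration (e.g. in clip.h); only the enumerator names are taken from the diff.
    // AUTO picks the nearest native resolution by pixel area; TINY/SMALL/BASE/LARGE force
    // 512/640/1024/1280; GUNDAM/GUNDAM_MASTER select the dynamic-resolution (tiled) paths.
    enum clip_dsocr_mode {
        CLIP_DSOCR_MODE_AUTO = 0,
        CLIP_DSOCR_MODE_TINY,
        CLIP_DSOCR_MODE_SMALL,
        CLIP_DSOCR_MODE_BASE,
        CLIP_DSOCR_MODE_LARGE,
        CLIP_DSOCR_MODE_GUNDAM,
        CLIP_DSOCR_MODE_GUNDAM_MASTER,
    };

    // Sketch of how a caller might force a mode when creating the context, assuming the field
    // is added to the params struct passed to clip_init (as the clip_ctx hunk suggests):
    //     ctx_params.dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_BASE; // force 1024x1024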