Commit c5f4c64

mtmd : add --dsocr-mode CLI argument for DeepSeek-OCR resolution control & all native resolution modes work
1 parent 5543094 commit c5f4c64

9 files changed: 159 additions and 88 deletions

common/arg.cpp

Lines changed: 15 additions & 0 deletions

@@ -1824,6 +1824,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.image_max_tokens = value;
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
+    add_opt(common_arg(
+        {"--dsocr-mode"}, "MODE",
+        "DeepSeek-OCR resolution mode, one of:\n"
+        "- auto (default): automatically select resolution\n"
+        "- tiny, small, base, large: native resolution\n"
+        "- gundam, gundam-master: dynamic resolution",
+        [](common_params & params, const std::string & value) {
+            if (value == "auto" || value == "tiny" || value == "small" || value == "base" ||
+                value == "large" || value == "gundam" || value == "gundam-master") {
+                params.dsocr_mode = value;
+            } else {
+                throw std::invalid_argument("invalid value");
+            }
+        }
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_DSOCR_MODE"));
     if (llama_supports_rpc()) {
         add_opt(common_arg(
             {"--rpc"}, "SERVERS",

common/common.h

Lines changed: 1 addition & 0 deletions

@@ -433,6 +433,7 @@ struct common_params {
     std::vector<std::string> image; // path to image file(s)
     int image_min_tokens = -1;
     int image_max_tokens = -1;
+    std::string dsocr_mode = "auto"; // DeepSeek-OCR resolution mode: auto, tiny, small, base, large, gundam, gundam-master

     // finetune
     struct lr_opt lr;

ggml/src/ggml-cuda/upscale.cu

Lines changed: 2 additions & 0 deletions

@@ -214,5 +214,7 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         upscale_f32_bicubic_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
                                  src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
                                  sf0, sf1, sf2, sf3, pixel_offset, stream);
+    } else {
+        GGML_ABORT("fatal error");
     }
 }

tools/mtmd/clip-impl.h

Lines changed: 2 additions & 2 deletions

@@ -569,7 +569,7 @@ static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
     printf(" ]\n");
 }

-static void save_tensor_to_file(const struct ggml_tensor * tensor) {
+static void save_tensor_to_file(const struct ggml_tensor * tensor, const uint8_t * data_ptr) {
     char filename[512];
     snprintf(filename, sizeof(filename), "%s_cpp.txt", tensor->name);

@@ -589,7 +589,7 @@ static void save_tensor_to_file(const struct ggml_tensor * tensor) {
             (long long)total_elements);
     }

-    uint8_t * data = (uint8_t *) tensor->data;
+    const uint8_t * data = (data_ptr) ? data_ptr : (uint8_t *) tensor->data;
     ggml_type type = tensor->type;
     const int64_t * ne = tensor->ne;
     const size_t * nb = tensor->nb;
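
With the new data_ptr parameter, save_tensor_to_file can dump tensor contents that the caller has already copied to host memory, while passing nullptr keeps the previous behaviour of reading tensor->data directly. A minimal sketch of the intended call pattern, assuming ggml_tensor * t points at the tensor of interest (the variable names are illustrative, not from the commit):

// copy the tensor out of its (possibly GPU) backend buffer, then dump the host copy
std::vector<uint8_t> host_copy(ggml_nbytes(t));
ggml_backend_tensor_get(t, host_copy.data(), 0, host_copy.size());
save_tensor_to_file(t, host_copy.data());

// a tensor that already lives in host memory can still be dumped the old way
save_tensor_to_file(t, nullptr);

This mirrors the updated call site in clip.cpp below, which now passes data.data() instead of letting the function read tensor->data.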

tools/mtmd/clip.cpp

Lines changed: 102 additions & 86 deletions

@@ -193,8 +193,6 @@ struct clip_hparams {
     int32_t attn_window_size = 0;
     int32_t n_wa_pattern = 0;

-    bool crop_mode = false;
-
     // audio
     int32_t n_mel_bins = 0; // whisper preprocessor
     int32_t proj_stack_factor = 0; // ultravox
@@ -208,6 +206,9 @@
     int32_t custom_image_min_tokens = -1;
     int32_t custom_image_max_tokens = -1;

+    // DeepSeek-OCR resolution mode
+    enum clip_dsocr_mode dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_AUTO;
+
     void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
         const int cur_merge = n_merge == 0 ? 1 : n_merge;
         const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
@@ -512,6 +513,7 @@ struct clip_ctx {
         if (ctx_params.image_max_tokens > 0) {
             model.hparams.custom_image_max_tokens = ctx_params.image_max_tokens;
         }
+        model.hparams.dsocr_mode = ctx_params.dsocr_mode;

         backend_ptrs.push_back(backend_cpu);
         backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
@@ -3403,7 +3405,6 @@ struct clip_model_loader {
                     hparams.patch_size = 16;
                     hparams.image_size = 1024;
                     hparams.warmup_image_size = 1024;
-                    hparams.crop_mode = false;
                 } break;
             default:
                 break;
@@ -5054,9 +5055,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                }
            } break;
        case PROJECTOR_TYPE_DEEPSEEKOCR:
-           if (!params.crop_mode) {
-               /* Native Resolution (Tiny/Small/Base/Large) */
-
+           {
                const int native_resolutions[] = {
                    512 /* tiny */, 640 /* small */, 1024 /* base */, 1280 /* large */
                };
@@ -5065,29 +5064,44 @@
                const int orig_h = original_size.height;
                const int orig_area = orig_h * orig_w;
                std::array<uint8_t, 3u> color;
-
+
                for (int i = 0; i < 3; i++) {
                    color[i] = (int)(255 * params.image_mean[i]);
                }
-
-               // mode selection logic (find most suitable resolution)
+
                int mode_i = 0;
-               int min_diff = orig_area;
-
-               for (int i = 0; i < 4; i++) {
-                   int r = native_resolutions[i];
-                   if (std::abs(orig_area - r*r) < min_diff) {
-                       mode_i = i;
-                       min_diff = std::abs(orig_area - r*r);
+
+               if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_TINY) {
+                   mode_i = 0;
+               } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_SMALL) {
+                   mode_i = 1;
+               } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_BASE) {
+                   mode_i = 2;
+               } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_LARGE) {
+                   mode_i = 3;
+               } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_GUNDAM) {
+                   mode_i = 4;
+               } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_GUNDAM_MASTER) {
+                   mode_i = 5;
+               } else {
+                   if (params.dsocr_mode != clip_dsocr_mode::CLIP_DSOCR_MODE_AUTO) {
+                       LOG_WRN("%s: unknown dsocr_mode, using auto mode\n", __func__);
+                   }
+                   int min_diff = orig_area;
+                   for (int i = 0; i < 4; i++) {
+                       int r = native_resolutions[i];
+                       if (std::abs(orig_area - r*r) < min_diff) {
+                           mode_i = i;
+                           min_diff = std::abs(orig_area - r*r);
+                       }
                    }
                }

-               const int image_size = native_resolutions[mode_i];
-
                if (mode_i < 2) {
-                   // TINY/SMALL MODE: Direct resize (no slicing)
+                   /* Native Resolution (Tiny/Small) */
+                   const int image_size = native_resolutions[mode_i];
+
                    // Just resize the image to image_size × image_size
-
                    clip_image_u8_ptr resized_img(clip_image_u8_init());
                    img_tool::resize(*img, *resized_img,
                                     clip_image_size{image_size, image_size},
@@ -5100,10 +5114,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                    res_imgs->grid_x = 1;
                    res_imgs->grid_y = 1;
                }
-               else {
-                   // BASE/LARGE MODE: Resize with aspect ratio + padding
+               else if (mode_i < 4) {
+                   /* Native Resolution (Base/Large) */
+                   const int image_size = native_resolutions[mode_i];
+
                    // Resize maintaining aspect ratio, then pad to square
-
                    float scale = std::min(
                        static_cast<float>(image_size) / orig_w,
                        static_cast<float>(image_size) / orig_h
@@ -5120,7 +5135,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                    unsigned char pad_g = static_cast<unsigned char>(params.image_mean[1] * 255.0f);
                    unsigned char pad_b = static_cast<unsigned char>(params.image_mean[2] * 255.0f);

-                   // Step 2: Pad to image_size × image_size (center padding)
+                   // Pad to image_size × image_size (center padding)
                    clip_image_u8_ptr padded_img(clip_image_u8_init());
                    padded_img->nx = image_size;
                    padded_img->ny = image_size;
@@ -5148,76 +5163,77 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                        }
                    }

-                   // Step 3: Normalize and output
+                   // Normalize and output
                    clip_image_f32_ptr res(clip_image_f32_init());
                    normalize_image_u8_to_f32(*padded_img, *res, params.image_mean, params.image_std);
                    res_imgs->entries.push_back(std::move(res));

                    res_imgs->grid_x = 1;
                    res_imgs->grid_y = 1;
                }
-           }
-           else {
-               /* Dynamic Resolution (Gundam/Gundam-M) */
-
-               // configurable, or read from params
-               const int min_num = 2;
-               const int max_num = 9;
-               const int image_size = params.image_size; // typically 640
-               // const bool use_thumbnail = true; // mimic python's use_thumbnail
-
-               // original image size
-               const int orig_w = original_size.width;
-               const int orig_h = original_size.height;
-
-               // 1) build candidate grids (cols, rows)
-               auto target_ratios = ds_build_target_ratios(min_num, max_num);
-
-               // 2) pick the grid that best matches the original aspect ratio
-               const float aspect_ratio = static_cast<float>(orig_w) / static_cast<float>(orig_h);
-               auto best = ds_find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_w, orig_h, image_size);
-               const int grid_cols = best.first; // how many tiles horizontally
-               const int grid_rows = best.second; // how many tiles vertically
-
-               // 3) compute the target (forced) size — python did:
-               // target_width = image_size * cols
-               // target_height = image_size * rows
-               const clip_image_size refined_size{ image_size * grid_cols, image_size * grid_rows };
-
-               // 4) prepare slice instructions, same style as the idefics3 branch
-               llava_uhd::slice_instructions instructions;
-               instructions.overview_size = clip_image_size{ image_size, image_size }; // for thumbnail/global
-               instructions.refined_size = refined_size;
-               instructions.grid_size = clip_image_size{ grid_cols, grid_rows };
-
-               // in deepseek python they always produce *full* 640x640 blocks,
-               // so we can do a simple double loop over rows/cols:
-               for (int r = 0; r < grid_rows; ++r) {
-                   for (int c = 0; c < grid_cols; ++c) {
-                       const int x = c * image_size;
-                       const int y = r * image_size;
-
-                       instructions.slices.push_back(llava_uhd::slice_coordinates{
-                           /* x */ x,
-                           /* y */ y,
-                           /* size */ clip_image_size{ image_size, image_size }
-                       });
+               else {
+                   GGML_ABORT("DeepSeek-OCR: Gundam/Gundam-Master haven't been tested yet.\n");
+                   /* Dynamic Resolution (Gundam/Gundam-Master) */
+
+                   // configurable, or read from params
+                   const int min_num = 2;
+                   const int max_num = 9;
+                   const int image_size = params.image_size; // typically 640
+                   // const bool use_thumbnail = true; // mimic python's use_thumbnail
+
+                   // original image size
+                   const int orig_w = original_size.width;
+                   const int orig_h = original_size.height;
+
+                   // 1) build candidate grids (cols, rows)
+                   auto target_ratios = ds_build_target_ratios(min_num, max_num);
+
+                   // 2) pick the grid that best matches the original aspect ratio
+                   const float aspect_ratio = static_cast<float>(orig_w) / static_cast<float>(orig_h);
+                   auto best = ds_find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_w, orig_h, image_size);
+                   const int grid_cols = best.first; // how many tiles horizontally
+                   const int grid_rows = best.second; // how many tiles vertically
+
+                   // 3) compute the target (forced) size — python did:
+                   // target_width = image_size * cols
+                   // target_height = image_size * rows
+                   const clip_image_size refined_size{ image_size * grid_cols, image_size * grid_rows };
+
+                   // 4) prepare slice instructions, same style as the idefics3 branch
+                   llava_uhd::slice_instructions instructions;
+                   instructions.overview_size = clip_image_size{ image_size, image_size }; // for thumbnail/global
+                   instructions.refined_size = refined_size;
+                   instructions.grid_size = clip_image_size{ grid_cols, grid_rows };
+
+                   // in deepseek python they always produce *full* 640x640 blocks,
+                   // so we can do a simple double loop over rows/cols:
+                   for (int r = 0; r < grid_rows; ++r) {
+                       for (int c = 0; c < grid_cols; ++c) {
+                           const int x = c * image_size;
+                           const int y = r * image_size;
+
+                           instructions.slices.push_back(llava_uhd::slice_coordinates{
+                               /* x */ x,
+                               /* y */ y,
+                               /* size */ clip_image_size{ image_size, image_size }
+                           });
+                       }
                    }
                }
+
+               // 5) run the actual slicing (this should: resize to refined_size, then crop every slice)
+               auto imgs = llava_uhd::slice_image(img, instructions);
+
+               // 7) cast & normalize like the idefics3 branch
+               for (size_t i = 0; i < imgs.size(); ++i) {
+                   clip_image_f32_ptr res(clip_image_f32_init());
+                   normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
+                   res_imgs->entries.push_back(std::move(res));
+               }
+
+               // keep the grid info — the model may need to know how to reassemble / attend
+               res_imgs->grid_x = grid_cols;
+               res_imgs->grid_y = grid_rows;
            }
-
-           // 5) run the actual slicing (this should: resize to refined_size, then crop every slice)
-           auto imgs = llava_uhd::slice_image(img, instructions);
-
-           // 7) cast & normalize like the idefics3 branch
-           for (size_t i = 0; i < imgs.size(); ++i) {
-               clip_image_f32_ptr res(clip_image_f32_init());
-               normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
-               res_imgs->entries.push_back(std::move(res));
-           }
-
-           // keep the grid info — the model may need to know how to reassemble / attend
-           res_imgs->grid_x = grid_cols;
-           res_imgs->grid_y = grid_rows;
        }
        break;

@@ -5807,7 +5823,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima

        for (auto & p : patterns) {
            if (tname_s == p) {
-               save_tensor_to_file(t);
+               save_tensor_to_file(t, data.data());
                is_stored = true;
                break;
            }
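
Read end to end, the new DeepSeek-OCR preprocessing path has two stages: pick a native resolution (forced by --dsocr-mode or, in auto mode, the one whose square area is closest to the input area), then either resize directly to a square (tiny/small) or resize with the aspect ratio preserved and center-pad to a square (base/large); the gundam paths are currently guarded by the GGML_ABORT above. A standalone sketch of just that arithmetic, outside the clip.cpp types (the struct and function names here are illustrative, not from the commit):

// Illustrative sketch of DeepSeek-OCR native-resolution selection and the
// base/large resize-and-pad arithmetic; all names are made up for the example.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>

struct ds_plan {
    int  target_size;   // selected native resolution (512/640/1024/1280)
    int  resized_w;     // size after the aspect-preserving resize (base/large only)
    int  resized_h;
    int  pad_x;         // left offset of the resized image inside the padded square
    int  pad_y;         // top offset
    bool direct_resize; // tiny/small: just resize to target_size x target_size
};

static ds_plan ds_plan_native(int orig_w, int orig_h, int forced_mode /* 0..3, or -1 = auto */) {
    const int native_resolutions[] = { 512 /* tiny */, 640 /* small */, 1024 /* base */, 1280 /* large */ };

    int mode_i = 0;
    if (forced_mode >= 0 && forced_mode < 4) {
        mode_i = forced_mode;
    } else {
        // auto: pick the resolution whose square area is closest to the input area
        const int orig_area = orig_w * orig_h;
        int min_diff = orig_area;
        for (int i = 0; i < 4; i++) {
            const int r = native_resolutions[i];
            if (std::abs(orig_area - r * r) < min_diff) {
                mode_i   = i;
                min_diff = std::abs(orig_area - r * r);
            }
        }
    }

    ds_plan plan = {};
    plan.target_size   = native_resolutions[mode_i];
    plan.direct_resize = (mode_i < 2);
    if (!plan.direct_resize) {
        // base/large: scale so the whole image fits inside the square, then center-pad
        const float scale = std::min(
            (float) plan.target_size / orig_w,
            (float) plan.target_size / orig_h);
        plan.resized_w = (int) std::round(orig_w * scale);
        plan.resized_h = (int) std::round(orig_h * scale);
        plan.pad_x = (plan.target_size - plan.resized_w) / 2;
        plan.pad_y = (plan.target_size - plan.resized_h) / 2;
    }
    return plan;
}

int main() {
    // an 800x1100 page: 1024^2 is the closest square area, so auto picks base
    const ds_plan p = ds_plan_native(800, 1100, /*forced_mode=*/-1);
    std::printf("target=%d resized=%dx%d pad=(%d,%d)\n",
                p.target_size, p.resized_w, p.resized_h, p.pad_x, p.pad_y);
    return 0;
}

Compiled on its own, the example picks base (1024) for an 800x1100 page and reports a 745x1024 resize with a 139-column left pad; the exact rounding of the real img_tool::resize may differ slightly.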

tools/mtmd/clip.h

Lines changed: 11 additions & 0 deletions

@@ -29,11 +29,22 @@ enum clip_flash_attn_type {
     CLIP_FLASH_ATTN_TYPE_ENABLED = 1,
 };

+enum clip_dsocr_mode {
+    CLIP_DSOCR_MODE_AUTO,
+    CLIP_DSOCR_MODE_TINY,
+    CLIP_DSOCR_MODE_SMALL,
+    CLIP_DSOCR_MODE_BASE,
+    CLIP_DSOCR_MODE_LARGE,
+    CLIP_DSOCR_MODE_GUNDAM,
+    CLIP_DSOCR_MODE_GUNDAM_MASTER,
+};
+
 struct clip_context_params {
     bool use_gpu;
     enum clip_flash_attn_type flash_attn_type;
     int image_min_tokens;
     int image_max_tokens;
+    enum clip_dsocr_mode dsocr_mode;
 };

 struct clip_init_result {

tools/mtmd/mtmd-cli.cpp

Lines changed: 1 addition & 0 deletions

@@ -138,6 +138,7 @@ struct mtmd_cli_context {
        mparams.flash_attn_type = params.flash_attn_type;
        mparams.image_min_tokens = params.image_min_tokens;
        mparams.image_max_tokens = params.image_max_tokens;
+       mparams.dsocr_mode = params.dsocr_mode.c_str();
        ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
        if (!ctx_vision.get()) {
            LOG_ERR("Failed to load vision model from %s\n", clip_path);
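
The CLI hands the mode over as a C string (params.dsocr_mode.c_str()), while clip_context_params expects the clip_dsocr_mode enum added in clip.h above, so a string-to-enum conversion has to happen somewhere in between, presumably in the mtmd implementation, which is among the changed files not shown on this page. A minimal sketch of such a mapping, with a hypothetical helper name rather than the commit's actual code:

// sketch only — hypothetical helper; the real conversion is not shown on this page
#include <cstring>
#include "clip.h"

static clip_dsocr_mode dsocr_mode_from_str(const char * s) {
    if (s == nullptr || std::strcmp(s, "auto") == 0) return CLIP_DSOCR_MODE_AUTO;
    if (std::strcmp(s, "tiny")          == 0) return CLIP_DSOCR_MODE_TINY;
    if (std::strcmp(s, "small")         == 0) return CLIP_DSOCR_MODE_SMALL;
    if (std::strcmp(s, "base")          == 0) return CLIP_DSOCR_MODE_BASE;
    if (std::strcmp(s, "large")         == 0) return CLIP_DSOCR_MODE_LARGE;
    if (std::strcmp(s, "gundam")        == 0) return CLIP_DSOCR_MODE_GUNDAM;
    if (std::strcmp(s, "gundam-master") == 0) return CLIP_DSOCR_MODE_GUNDAM_MASTER;
    return CLIP_DSOCR_MODE_AUTO; // arg.cpp already rejects anything else, so this is only a safety net
}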
