Commit c5f4c64

mtmd : add --dsocr-mode CLI argument for DeepSeek-OCR resolution control & all native resolution modes work
1 parent 5543094 commit c5f4c64

9 files changed: 159 additions and 88 deletions

common/arg.cpp

Lines changed: 15 additions & 0 deletions

@@ -1824,6 +1824,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.image_max_tokens = value;
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
+    add_opt(common_arg(
+        {"--dsocr-mode"}, "MODE",
+        "DeepSeek-OCR resolution mode, one of:\n"
+        "- auto (default): automatically select resolution\n"
+        "- tiny, small, base, large: native resolution\n"
+        "- gundam, gundam-master: dynamic resolution",
+        [](common_params & params, const std::string & value) {
+            if (value == "auto" || value == "tiny" || value == "small" || value == "base" ||
+                value == "large" || value == "gundam" || value == "gundam-master") {
+                params.dsocr_mode = value;
+            } else {
+                throw std::invalid_argument("invalid value");
+            }
+        }
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_DSOCR_MODE"));
     if (llama_supports_rpc()) {
         add_opt(common_arg(
             {"--rpc"}, "SERVERS",

common/common.h

Lines changed: 1 addition & 0 deletions

@@ -433,6 +433,7 @@ struct common_params {
     std::vector<std::string> image; // path to image file(s)
     int image_min_tokens = -1;
     int image_max_tokens = -1;
+    std::string dsocr_mode = "auto"; // DeepSeek-OCR resolution mode: auto, tiny, small, base, large, gundam, gundam-master

     // finetune
     struct lr_opt lr;

ggml/src/ggml-cuda/upscale.cu

Lines changed: 2 additions & 0 deletions

@@ -214,5 +214,7 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         upscale_f32_bicubic_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
                                  src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
                                  sf0, sf1, sf2, sf3, pixel_offset, stream);
+    } else {
+        GGML_ABORT("fatal error");
     }
 }

tools/mtmd/clip-impl.h

Lines changed: 2 additions & 2 deletions

@@ -569,7 +569,7 @@ static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
     printf(" ]\n");
 }

-static void save_tensor_to_file(const struct ggml_tensor * tensor) {
+static void save_tensor_to_file(const struct ggml_tensor * tensor, const uint8_t * data_ptr) {
     char filename[512];
     snprintf(filename, sizeof(filename), "%s_cpp.txt", tensor->name);

@@ -589,7 +589,7 @@ static void save_tensor_to_file(const struct ggml_tensor * tensor) {
             (long long)total_elements);
     }

-    uint8_t * data = (uint8_t *) tensor->data;
+    const uint8_t * data = (data_ptr) ? data_ptr : (uint8_t *) tensor->data;
     ggml_type type = tensor->type;
     const int64_t * ne = tensor->ne;
     const size_t * nb = tensor->nb;
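
With the new data_ptr parameter, save_tensor_to_file can dump tensor contents that the caller has already copied to host memory, while passing nullptr keeps the previous behaviour of reading tensor->data directly. A minimal sketch of the intended call pattern, assuming ggml_tensor * t points at the tensor of interest (the variable names are illustrative, not from the commit):

// copy the tensor out of its (possibly GPU) backend buffer, then dump the host copy
std::vector<uint8_t> host_copy(ggml_nbytes(t));
ggml_backend_tensor_get(t, host_copy.data(), 0, host_copy.size());
save_tensor_to_file(t, host_copy.data());

// a tensor that already lives in host memory can still be dumped the old way
save_tensor_to_file(t, nullptr);

This mirrors the updated call site in clip.cpp below, which now passes data.data() instead of letting the function read tensor->data.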

tools/mtmd/clip.cpp

Lines changed: 102 additions & 86 deletions

@@ -193,8 +193,6 @@ struct clip_hparams {
     int32_t attn_window_size = 0;
     int32_t n_wa_pattern = 0;

-    bool crop_mode = false;
-
     // audio
     int32_t n_mel_bins = 0; // whisper preprocessor
     int32_t proj_stack_factor = 0; // ultravox
@@ -208,6 +206,9 @@
     int32_t custom_image_min_tokens = -1;
     int32_t custom_image_max_tokens = -1;

+    // DeepSeek-OCR resolution mode
+    enum clip_dsocr_mode dsocr_mode = clip_dsocr_mode::CLIP_DSOCR_MODE_AUTO;
+
     void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
         const int cur_merge = n_merge == 0 ? 1 : n_merge;
         const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
@@ -512,6 +513,7 @@ struct clip_ctx {
         if (ctx_params.image_max_tokens > 0) {
             model.hparams.custom_image_max_tokens = ctx_params.image_max_tokens;
         }
+        model.hparams.dsocr_mode = ctx_params.dsocr_mode;

         backend_ptrs.push_back(backend_cpu);
         backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
@@ -3403,7 +3405,6 @@ struct clip_model_loader {
                     hparams.patch_size = 16;
                     hparams.image_size = 1024;
                     hparams.warmup_image_size = 1024;
-                    hparams.crop_mode = false;
                 } break;
             default:
                 break;
@@ -5054,9 +5055,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                }
            } break;
        case PROJECTOR_TYPE_DEEPSEEKOCR:
-           if (!params.crop_mode) {
-               /* Native Resolution (Tiny/Small/Base/Large) */
-
+           {
                const int native_resolutions[] = {
                    512 /* tiny */, 640 /* small */, 1024 /* base */, 1280 /* large */
                };
@@ -5065,29 +5064,44 @@
                const int orig_h = original_size.height;
                const int orig_area = orig_h * orig_w;
                std::array<uint8_t, 3u> color;
-
+
                for (int i = 0; i < 3; i++) {
                    color[i] = (int)(255 * params.image_mean[i]);
                }
-
-               // mode selection logic (find most suitable resolution)
+
                int mode_i = 0;
-               int min_diff = orig_area;
-
-               for (int i = 0; i < 4; i++) {
-                   int r = native_resolutions[i];
-                   if (std::abs(orig_area - r*r) < min_diff) {
-                       mode_i = i;
-                       min_diff = std::abs(orig_area - r*r);
+
+               if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_TINY) {
+                   mode_i = 0;
+               } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_SMALL) {
+                   mode_i = 1;
+               } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_BASE) {
+                   mode_i = 2;
+               } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_LARGE) {
+                   mode_i = 3;
+               } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_GUNDAM) {
+                   mode_i = 4;
+               } else if (params.dsocr_mode == clip_dsocr_mode::CLIP_DSOCR_MODE_GUNDAM_MASTER) {
+                   mode_i = 5;
+               } else {
+                   if (params.dsocr_mode != clip_dsocr_mode::CLIP_DSOCR_MODE_AUTO) {
+                       LOG_WRN("%s: unknown dsocr_mode, using auto mode\n", __func__);
+                   }
+                   int min_diff = orig_area;
+                   for (int i = 0; i < 4; i++) {
+                       int r = native_resolutions[i];
+                       if (std::abs(orig_area - r*r) < min_diff) {
+                           mode_i = i;
+                           min_diff = std::abs(orig_area - r*r);
+                       }
                    }
                }

-               const int image_size = native_resolutions[mode_i];
-
                if (mode_i < 2) {
-                   // TINY/SMALL MODE: Direct resize (no slicing)
+                   /* Native Resolution (Tiny/Small) */
+                   const int image_size = native_resolutions[mode_i];
+
                    // Just resize the image to image_size × image_size
-
                    clip_image_u8_ptr resized_img(clip_image_u8_init());
                    img_tool::resize(*img, *resized_img,
                                     clip_image_size{image_size, image_size},
@@ -5100,10 +5114,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                    res_imgs->grid_x = 1;
                    res_imgs->grid_y = 1;
                }
-               else {
-                   // BASE/LARGE MODE: Resize with aspect ratio + padding
+               else if (mode_i < 4) {
+                   /* Native Resolution (Base/Large) */
+                   const int image_size = native_resolutions[mode_i];
+
                    // Resize maintaining aspect ratio, then pad to square
-
                    float scale = std::min(
                        static_cast<float>(image_size) / orig_w,
                        static_cast<float>(image_size) / orig_h
@@ -5120,7 +5135,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                    unsigned char pad_g = static_cast<unsigned char>(params.image_mean[1] * 255.0f);
                    unsigned char pad_b = static_cast<unsigned char>(params.image_mean[2] * 255.0f);

-                   // Step 2: Pad to image_size × image_size (center padding)
+                   // Pad to image_size × image_size (center padding)
                    clip_image_u8_ptr padded_img(clip_image_u8_init());
                    padded_img->nx = image_size;
                    padded_img->ny = image_size;
@@ -5148,76 +5163,77 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                        }
                    }

-                   // Step 3: Normalize and output
+                   // Normalize and output
                    clip_image_f32_ptr res(clip_image_f32_init());
                    normalize_image_u8_to_f32(*padded_img, *res, params.image_mean, params.image_std);
                    res_imgs->entries.push_back(std::move(res));

                    res_imgs->grid_x = 1;
                    res_imgs->grid_y = 1;
                }
-           }
-           else {
-               /* Dynamic Resolution (Gundam/Gundam-M) */
-
-               // configurable, or read from params
-               const int min_num = 2;
-               const int max_num = 9;
-               const int image_size = params.image_size; // typically 640
-               // const bool use_thumbnail = true; // mimic python's use_thumbnail
-
-               // original image size
-               const int orig_w = original_size.width;
-               const int orig_h = original_size.height;
-
-               // 1) build candidate grids (cols, rows)
-               auto target_ratios = ds_build_target_ratios(min_num, max_num);
-
-               // 2) pick the grid that best matches the original aspect ratio
-               const float aspect_ratio = static_cast<float>(orig_w) / static_cast<float>(orig_h);
-               auto best = ds_find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_w, orig_h, image_size);
-               const int grid_cols = best.first; // how many tiles horizontally
-               const int grid_rows = best.second; // how many tiles vertically
-
-               // 3) compute the target (forced) size — python did:
-               // target_width = image_size * cols
-               // target_height = image_size * rows
-               const clip_image_size refined_size{ image_size * grid_cols, image_size * grid_rows };
-
-               // 4) prepare slice instructions, same style as the idefics3 branch
-               llava_uhd::slice_instructions instructions;
-               instructions.overview_size = clip_image_size{ image_size, image_size }; // for thumbnail/global
-               instructions.refined_size = refined_size;
-               instructions.grid_size = clip_image_size{ grid_cols, grid_rows };
-
-               // in deepseek python they always produce *full* 640x640 blocks,
-               // so we can do a simple double loop over rows/cols:
-               for (int r = 0; r < grid_rows; ++r) {
-                   for (int c = 0; c < grid_cols; ++c) {
-                       const int x = c * image_size;
-                       const int y = r * image_size;
-
-                       instructions.slices.push_back(llava_uhd::slice_coordinates{
-                           /* x */ x,
-                           /* y */ y,
-                           /* size */ clip_image_size{ image_size, image_size }
-                       });
+               else {
+                   GGML_ABORT("DeepSeek-OCR: Gundam/Gundam-Master haven't been tested yet.\n");
+                   /* Dynamic Resolution (Gundam/Gundam-Master) */
+
+                   // configurable, or read from params
+                   const int min_num = 2;
+                   const int max_num = 9;
+                   const int image_size = params.image_size; // typically 640
+                   // const bool use_thumbnail = true; // mimic python's use_thumbnail
+
+                   // original image size
+                   const int orig_w = original_size.width;
+                   const int orig_h = original_size.height;
+
+                   // 1) build candidate grids (cols, rows)
+                   auto target_ratios = ds_build_target_ratios(min_num, max_num);
+
+                   // 2) pick the grid that best matches the original aspect ratio
+                   const float aspect_ratio = static_cast<float>(orig_w) / static_cast<float>(orig_h);
+                   auto best = ds_find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_w, orig_h, image_size);
+                   const int grid_cols = best.first; // how many tiles horizontally
+                   const int grid_rows = best.second; // how many tiles vertically
+
+                   // 3) compute the target (forced) size — python did:
+                   // target_width = image_size * cols
+                   // target_height = image_size * rows
+                   const clip_image_size refined_size{ image_size * grid_cols, image_size * grid_rows };
+
+                   // 4) prepare slice instructions, same style as the idefics3 branch
+                   llava_uhd::slice_instructions instructions;
+                   instructions.overview_size = clip_image_size{ image_size, image_size }; // for thumbnail/global
+                   instructions.refined_size = refined_size;
+                   instructions.grid_size = clip_image_size{ grid_cols, grid_rows };
+
+                   // in deepseek python they always produce *full* 640x640 blocks,
+                   // so we can do a simple double loop over rows/cols:
+                   for (int r = 0; r < grid_rows; ++r) {
+                       for (int c = 0; c < grid_cols; ++c) {
+                           const int x = c * image_size;
+                           const int y = r * image_size;
+
+                           instructions.slices.push_back(llava_uhd::slice_coordinates{
+                               /* x */ x,
+                               /* y */ y,
+                               /* size */ clip_image_size{ image_size, image_size }
+                           });
+                       }
                    }
                }
+
+               // 5) run the actual slicing (this should: resize to refined_size, then crop every slice)
+               auto imgs = llava_uhd::slice_image(img, instructions);
+
+               // 7) cast & normalize like the idefics3 branch
+               for (size_t i = 0; i < imgs.size(); ++i) {
+                   clip_image_f32_ptr res(clip_image_f32_init());
+                   normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
+                   res_imgs->entries.push_back(std::move(res));
+               }
+
+               // keep the grid info — the model may need to know how to reassemble / attend
+               res_imgs->grid_x = grid_cols;
+               res_imgs->grid_y = grid_rows;
            }
-
-           // 5) run the actual slicing (this should: resize to refined_size, then crop every slice)
-           auto imgs = llava_uhd::slice_image(img, instructions);
-
-           // 7) cast & normalize like the idefics3 branch
-           for (size_t i = 0; i < imgs.size(); ++i) {
-               clip_image_f32_ptr res(clip_image_f32_init());
-               normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
-               res_imgs->entries.push_back(std::move(res));
-           }
-
-           // keep the grid info — the model may need to know how to reassemble / attend
-           res_imgs->grid_x = grid_cols;
-           res_imgs->grid_y = grid_rows;
        }
        break;

@@ -5807,7 +5823,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima

        for (auto & p : patterns) {
            if (tname_s == p) {
-               save_tensor_to_file(t);
+               save_tensor_to_file(t, data.data());
                is_stored = true;
                break;
            }
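
Read end to end, the new DeepSeek-OCR preprocessing path has two stages: pick a native resolution (forced by --dsocr-mode or, in auto mode, the one whose square area is closest to the input area), then either resize directly to a square (tiny/small) or resize with the aspect ratio preserved and center-pad to a square (base/large); the gundam paths are currently guarded by the GGML_ABORT above. A standalone sketch of just that arithmetic, outside the clip.cpp types (the struct and function names here are illustrative, not from the commit):

// Illustrative sketch of DeepSeek-OCR native-resolution selection and the
// base/large resize-and-pad arithmetic; all names are made up for the example.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>

struct ds_plan {
    int  target_size;   // selected native resolution (512/640/1024/1280)
    int  resized_w;     // size after the aspect-preserving resize (base/large only)
    int  resized_h;
    int  pad_x;         // left offset of the resized image inside the padded square
    int  pad_y;         // top offset
    bool direct_resize; // tiny/small: just resize to target_size x target_size
};

static ds_plan ds_plan_native(int orig_w, int orig_h, int forced_mode /* 0..3, or -1 = auto */) {
    const int native_resolutions[] = { 512 /* tiny */, 640 /* small */, 1024 /* base */, 1280 /* large */ };

    int mode_i = 0;
    if (forced_mode >= 0 && forced_mode < 4) {
        mode_i = forced_mode;
    } else {
        // auto: pick the resolution whose square area is closest to the input area
        const int orig_area = orig_w * orig_h;
        int min_diff = orig_area;
        for (int i = 0; i < 4; i++) {
            const int r = native_resolutions[i];
            if (std::abs(orig_area - r * r) < min_diff) {
                mode_i   = i;
                min_diff = std::abs(orig_area - r * r);
            }
        }
    }

    ds_plan plan = {};
    plan.target_size   = native_resolutions[mode_i];
    plan.direct_resize = (mode_i < 2);
    if (!plan.direct_resize) {
        // base/large: scale so the whole image fits inside the square, then center-pad
        const float scale = std::min(
            (float) plan.target_size / orig_w,
            (float) plan.target_size / orig_h);
        plan.resized_w = (int) std::round(orig_w * scale);
        plan.resized_h = (int) std::round(orig_h * scale);
        plan.pad_x = (plan.target_size - plan.resized_w) / 2;
        plan.pad_y = (plan.target_size - plan.resized_h) / 2;
    }
    return plan;
}

int main() {
    // an 800x1100 page: 1024^2 is the closest square area, so auto picks base
    const ds_plan p = ds_plan_native(800, 1100, /*forced_mode=*/-1);
    std::printf("target=%d resized=%dx%d pad=(%d,%d)\n",
                p.target_size, p.resized_w, p.resized_h, p.pad_x, p.pad_y);
    return 0;
}

Compiled on its own, the example picks base (1024) for an 800x1100 page and reports a 745x1024 resize with a 139-column left pad; the exact rounding of the real img_tool::resize may differ slightly.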

tools/mtmd/clip.h

Lines changed: 11 additions & 0 deletions

@@ -29,11 +29,22 @@ enum clip_flash_attn_type {
     CLIP_FLASH_ATTN_TYPE_ENABLED = 1,
 };

+enum clip_dsocr_mode {
+    CLIP_DSOCR_MODE_AUTO,
+    CLIP_DSOCR_MODE_TINY,
+    CLIP_DSOCR_MODE_SMALL,
+    CLIP_DSOCR_MODE_BASE,
+    CLIP_DSOCR_MODE_LARGE,
+    CLIP_DSOCR_MODE_GUNDAM,
+    CLIP_DSOCR_MODE_GUNDAM_MASTER,
+};
+
 struct clip_context_params {
     bool use_gpu;
     enum clip_flash_attn_type flash_attn_type;
     int image_min_tokens;
     int image_max_tokens;
+    enum clip_dsocr_mode dsocr_mode;
 };

 struct clip_init_result {

tools/mtmd/mtmd-cli.cpp

Lines changed: 1 addition & 0 deletions

@@ -138,6 +138,7 @@ struct mtmd_cli_context {
        mparams.flash_attn_type = params.flash_attn_type;
        mparams.image_min_tokens = params.image_min_tokens;
        mparams.image_max_tokens = params.image_max_tokens;
+       mparams.dsocr_mode = params.dsocr_mode.c_str();
        ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
        if (!ctx_vision.get()) {
            LOG_ERR("Failed to load vision model from %s\n", clip_path);
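
The CLI hands the mode over as a C string (params.dsocr_mode.c_str()), while clip_context_params expects the clip_dsocr_mode enum added in clip.h above, so a string-to-enum conversion has to happen somewhere in between, presumably in the mtmd implementation, which is among the changed files not shown on this page. A minimal sketch of such a mapping, with a hypothetical helper name rather than the commit's actual code:

// sketch only — hypothetical helper; the real conversion is not shown on this page
#include <cstring>
#include "clip.h"

static clip_dsocr_mode dsocr_mode_from_str(const char * s) {
    if (s == nullptr || std::strcmp(s, "auto") == 0) return CLIP_DSOCR_MODE_AUTO;
    if (std::strcmp(s, "tiny")          == 0) return CLIP_DSOCR_MODE_TINY;
    if (std::strcmp(s, "small")         == 0) return CLIP_DSOCR_MODE_SMALL;
    if (std::strcmp(s, "base")          == 0) return CLIP_DSOCR_MODE_BASE;
    if (std::strcmp(s, "large")         == 0) return CLIP_DSOCR_MODE_LARGE;
    if (std::strcmp(s, "gundam")        == 0) return CLIP_DSOCR_MODE_GUNDAM;
    if (std::strcmp(s, "gundam-master") == 0) return CLIP_DSOCR_MODE_GUNDAM_MASTER;
    return CLIP_DSOCR_MODE_AUTO; // arg.cpp already rejects anything else, so this is only a safety net
}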
