Skip to content

Commit 206f8ab

Browse files
committed
- dynamic resizing
- changes concern PR #4
1 parent 7941f5d commit 206f8ab

File tree

1 file changed

+81
-23
lines changed

1 file changed

+81
-23
lines changed

tools/mtmd/clip.cpp

Lines changed: 81 additions & 23 deletions
Original file line number · Diff line number · Diff line change
@@ -676,7 +676,25 @@ struct clip_graph {
676676
const int enc_n_patches = enc_image_size / enc_patch_size; // 64
677677

678678
ggml_tensor * inpL = build_enc_inp(inp_raw, enc_patch_size, enc_n_patches, enc_n_embd);
679-
ggml_tensor * cur = ggml_add(ctx0, inpL, model.pos_embed);
679+
ggml_tensor * cur = nullptr;
680+
681+
const auto tgt_size = inpL->ne[1];
682+
const auto str_size = model.pos_embed->ne[1];
683+
if (str_size != tgt_size) {
684+
ggml_tensor * new_pos_embed = ggml_interpolate(
685+
ctx0,
686+
model.pos_embed,
687+
tgt_size,
688+
tgt_size,
689+
enc_n_embd,
690+
1,
691+
ggml_scale_mode::GGML_SCALE_MODE_BICUBIC
692+
);
693+
new_pos_embed = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embed, 2,1,0,3));
694+
cur = ggml_add(ctx0, inpL, new_pos_embed);
695+
} else {
696+
cur = ggml_add(ctx0, inpL, model.pos_embed);
697+
}
680698

681699
// loop over layers
682700
for (int il = 0; il < _depth; il++) {
@@ -840,10 +858,11 @@ struct clip_graph {
840858
ggml_tensor * global_features_2 = build_dp_ocr_clip(global_features_1);
841859

842860
// FIXME remove n_patches is hardcoded
843-
int clip_n_patches = 256; // FIXME hardcoded for sam 1024x1024 with 16x16 patches
844861

845862
// torch global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
846863
global_features_1 = ggml_cont(ctx0,ggml_permute(ctx0, global_features_1,2,1,0,3));
864+
int clip_n_patches = global_features_1->ne[1] * global_features_1->ne[2];
865+
847866
// flatten 2nd and 3rd dims
848867
global_features_1 = ggml_reshape_2d(ctx0, global_features_1, global_features_1->ne[0], clip_n_patches);
849868

@@ -874,21 +893,24 @@ struct clip_graph {
874893
GGML_ASSERT(model.view_seperator != nullptr);
875894

876895
// 1) global_features: [n_dim, h*w] -> [n_dim, w, h] -> [h, w, n_dim]
877-
ggml_tensor * t = ggml_reshape_4d(ctx0, global_features, 1280, 16, 16, 1); // (n_dim, w, h)
896+
const auto h = static_cast<int>(std::sqrt(static_cast<float>(global_features->ne[1])));
897+
const auto w = h;
898+
const auto n_dim = global_features->ne[0];
899+
ggml_tensor * t = ggml_reshape_4d(ctx0, global_features, n_dim, h, w, 1); // (n_dim, w, h)
878900
t = ggml_cont(ctx0, ggml_permute(ctx0, t, 2, 1, 0, 3)); // (h, w, n_dim)
879901
ggml_tensor * nl = ggml_cont(ctx0,ggml_permute(ctx0, model.image_newline, 2, 1, 0, 3));
880-
nl = ggml_repeat_4d(ctx0, nl, 16, 1, 1280, 1); // n_pos rows
902+
nl = ggml_repeat_4d(ctx0, nl, h, 1, n_dim, 1); // n_pos rows
881903

882904

883905
// 2) image_newline: [n_dim] -> [1, 1, n_dim] -> repeat to [h, 1, n_dim]
884906
t = ggml_concat(ctx0, t, nl, 1); // (h, w+1, n_dim)
885907

886-
t = ggml_reshape_2d(ctx0, t, 1280, 16 * (16 + 1)); // (n_dim, h*(w+1))
908+
t = ggml_reshape_2d(ctx0, t, n_dim, h* (h + 1)); // (n_dim, h*(w+1))
887909

888910

889911
// 5) append view_separator as an extra "token":
890912
// view_separator: [n_dim] -> [n_dim, 1]
891-
ggml_tensor * vs = ggml_reshape_2d(ctx0, model.view_seperator, 1280, 1); // (n_dim, 1)
913+
ggml_tensor * vs = ggml_reshape_2d(ctx0, model.view_seperator, n_dim, 1); // (n_dim, 1)
892914

893915
// concat along token dimension (dim=1):
894916
t = ggml_concat(ctx0, t, vs, 1); // (n_dim, h*(w+1) + 1)
@@ -1547,10 +1569,35 @@ struct clip_graph {
15471569
ggml_tensor * inp = ggml_cpy(ctx0, patch_embeds, ggml_dup_tensor(ctx0, patch_embeds));
15481570

15491571

1550-
const int n_pos = 257; // +1 for [CLS]
15511572
inp = ggml_cont(ctx0,ggml_permute(ctx0, inp,2,1,0,3));
15521573
inp = ggml_reshape_2d(ctx0, inp, n_embd, inp->ne[1]*inp->ne[2]*inp->ne[3]);
15531574

1575+
ggml_tensor * new_pos_embd = ggml_cpy(ctx0, model.position_embeddings, ggml_dup_tensor(ctx0, model.position_embeddings));
1576+
1577+
int n_pos = new_pos_embd->ne[1]; // +1 for [CLS]
1578+
const auto tgt_size = static_cast<int>(std::sqrt(inp->ne[1]));
1579+
const auto src_size = static_cast<int>(std::sqrt(n_pos - 1));
1580+
1581+
1582+
if (tgt_size != src_size) {
1583+
//ggml_tensor * old_pos_embd = ggml_new_tensor_2d(ctx0, model.position_embeddings->type, model.position_embeddings->ne[0], str_size * str_size);
1584+
ggml_tensor * old_pos_embd = ggml_view_2d(ctx0, new_pos_embd,
1585+
new_pos_embd->ne[0], src_size * src_size,
1586+
ggml_row_size(new_pos_embd->type, new_pos_embd->ne[0]), 0);
1587+
ggml_tensor * cls_tok = ggml_view_2d(ctx0, new_pos_embd,
1588+
new_pos_embd->ne[0], 1,
1589+
ggml_row_size(new_pos_embd->type, new_pos_embd->ne[0]), src_size * src_size);
1590+
new_pos_embd = ggml_interpolate(ctx0,
1591+
old_pos_embd,
1592+
tgt_size,
1593+
tgt_size,
1594+
new_pos_embd->ne[0], 1, GGML_SCALE_MODE_BICUBIC);
1595+
new_pos_embd = ggml_reshape_3d(ctx0, new_pos_embd, n_embd, tgt_size * tgt_size, 1);
1596+
//new_pos_embd = ggml_cont(ctx0, ggml_permute(ctx0, new_pos_embd, 2,1,0,3));
1597+
new_pos_embd = ggml_concat(ctx0, new_pos_embd, cls_tok, 1);
1598+
n_pos = tgt_size * tgt_size + 1;
1599+
}
1600+
15541601

15551602

15561603
// add CLS token
@@ -1560,11 +1607,8 @@ struct clip_graph {
15601607
norm_type norm_t = NORM_TYPE_NORMAL;
15611608

15621609
// for selecting learned pos embd, used by ViT
1563-
ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
1564-
ggml_set_name(positions, "positions");
1565-
ggml_set_input(positions);
1566-
1567-
ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
1610+
ggml_tensor * positions = ggml_cast(ctx0, ggml_arange(ctx0, 0, n_pos, 1), GGML_TYPE_I32);
1611+
ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, new_pos_embd, positions);
15681612

15691613

15701614
ggml_tensor * cur = build_vit(inp, n_pos, norm_t, hparams.ffn_op, learned_pos_embd,
@@ -2525,7 +2569,27 @@ struct clip_graph {
25252569
const int64_t C = rel_pos->ne[0]; // channels
25262570
const int64_t L = rel_pos->ne[1]; // length
25272571

2528-
GGML_ASSERT(2*std::max(q_size, k_size) - 1 == L);
2572+
//GGML_ASSERT(2*std::max(q_size, k_size) - 1 == L);
2573+
2574+
const auto max_rel_dist = 2*std::max(q_size, k_size) - 1;
2575+
ggml_tensor * rel_pos_resized = rel_pos;
2576+
2577+
if (max_rel_dist != L) {
2578+
// Linear interpolation
2579+
const auto scale = L / static_cast<float>(max_rel_dist);
2580+
ggml_tensor * indices = ggml_arange(ctx, 0.0f, static_cast<float>(max_rel_dist), 1.0f);
2581+
indices = ggml_scale_inplace(ctx, indices, scale);
2582+
ggml_tensor * indices_floor= ggml_cast(ctx, ggml_floor(ctx, indices), GGML_TYPE_I32);
2583+
ggml_tensor * indices_ceil = ggml_cast(ctx, ggml_ceil(ctx, indices), GGML_TYPE_I32);
2584+
ggml_tensor * weights = ggml_sub(ctx, indices, indices_floor);
2585+
ggml_tensor * ws1 = ggml_scale_bias(ctx, weights, -1.0f, 1.0f);
2586+
rel_pos_resized = ggml_cont(ctx , ggml_permute(ctx, rel_pos_resized, 1, 0, 2, 3)); // [C, L] for ggml_get_rows
2587+
ggml_tensor * rs1 = ggml_cont(ctx, ggml_permute(ctx, ggml_get_rows(ctx, rel_pos_resized, indices_floor), 1, 0, 2, 3)); // lower rows
2588+
rs1 = ggml_mul(ctx, rs1, ws1); // lower rows
2589+
ggml_tensor * rs2 = ggml_cont(ctx, ggml_permute(ctx, ggml_get_rows(ctx, rel_pos_resized, indices_ceil), 1, 0, 2, 3)); // upper rows
2590+
rs2 = ggml_mul(ctx, rs2, weights); // upper rows
2591+
rel_pos_resized = ggml_add(ctx,rs1, rs2);
2592+
}
25292593

25302594
// -------------------------------------------------
25312595
// 1) q_idx ← arange(0..q_size-1) [q_size]
@@ -5007,7 +5071,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
50075071
if (!params.crop_mode) {
50085072
/* Native Resolution (Tiny/Small/Base/Large) */
50095073

5010-
const int native_resolutions[] = {
5074+
const int native_resolutions[] = {
50115075
512 /* tiny */, 640 /* small */, 1024 /* base */, 1280 /* large */
50125076
};
50135077
// original image size
@@ -5060,7 +5124,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
50605124
img_tool::resize(*img, *scaled_img, clip_image_size{new_w, new_h},
50615125
img_tool::RESIZE_ALGO_BICUBIC);
50625126

5063-
// Use mean color for padding
5127+
// Use mean color for padding
50645128
unsigned char pad_r = static_cast<unsigned char>(params.image_mean[0] * 255.0f);
50655129
unsigned char pad_g = static_cast<unsigned char>(params.image_mean[1] * 255.0f);
50665130
unsigned char pad_b = static_cast<unsigned char>(params.image_mean[2] * 255.0f);
@@ -5352,6 +5416,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
53525416
int x_patch = img->nx / (params.patch_size);
53535417

53545418
n_patches += x_patch + 1;
5419+
n_patches = 1280;
5420+
53555421

53565422
} break;
53575423
default:
@@ -5690,14 +5756,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
56905756
} break;
56915757
case PROJECTOR_TYPE_DEEPSEEKOCR:
56925758
{
5693-
//FIXME we need correct this when all model configs are set correctly
5694-
//n_patch is not correct right now
5695-
int32_t n_pos = 16 * 16 + 1; //hardcode for now
5696-
std::vector<int32_t> positions(n_pos);
5697-
for (int i = 0; i < n_pos; i++) {
5698-
positions[i] = i;
5699-
}
5700-
set_input_i32("positions", positions);
57015759
} break;
57025760
case PROJECTOR_TYPE_LLAMA4:
57035761
{

0 commit comments

Comments
 (0)