Commit c6b2c93

mtmd: some small clean up (ggml-org#17909)
* clip: add support for fused qkv in build_vit
* use build_ffn whenever possible
* fix internvl
* mtmd-cli: move image marker to the beginning of the prompt
* test script: support custom args
1 parent 34a6d86 commit c6b2c93

3 files changed: +126 additions, -81 deletions
tools/mtmd/clip.cpp

Lines changed: 91 additions & 61 deletions
@@ -595,11 +595,12 @@ struct clip_graph {
     cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
     cur = ggml_add(ctx0, cur, model.mm_input_norm_b);

-    cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-    cur = ggml_add(ctx0, cur, model.mm_1_b);
-    cur = ggml_gelu(ctx0, cur);
-    cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-    cur = ggml_add(ctx0, cur, model.mm_2_b);
+    cur = build_ffn(cur,
+            model.mm_1_w, model.mm_1_b,
+            nullptr, nullptr,
+            model.mm_2_w, model.mm_2_b,
+            FFN_GELU,
+            -1);

 } else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) {
     cur = build_ffn(cur,
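
Every projector touched by these build_ffn hunks is the same gate-less two-layer MLP, which is why one helper can absorb them all. As a reading aid, here is a minimal C++ sketch of the equivalence, written against the ggml calls visible in this diff; it illustrates what the removed lines computed and is not the actual body of clip.cpp's build_ffn:

#include "ggml.h"

// Sketch: with no gate weights, build_ffn(cur, up_w, up_b, nullptr, nullptr,
// down_w, down_b, FFN_GELU, -1) builds the same graph as this sequence.
static ggml_tensor * ffn_gelu_by_hand(
        ggml_context * ctx0, ggml_tensor * cur,
        ggml_tensor * up_w,   ggml_tensor * up_b,
        ggml_tensor * down_w, ggml_tensor * down_b) {
    cur = ggml_mul_mat(ctx0, up_w, cur);                 // first linear layer
    if (up_b)   { cur = ggml_add(ctx0, cur, up_b); }     // optional bias
    cur = ggml_gelu(ctx0, cur);                          // GELU activation
    cur = ggml_mul_mat(ctx0, down_w, cur);               // second linear layer
    if (down_b) { cur = ggml_add(ctx0, cur, down_b); }   // optional bias
    return cur;
}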
@@ -667,16 +668,12 @@ struct clip_graph {

 // LlavaMultiModalProjector (always using GELU activation)
 {
-    cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-    if (model.mm_1_b) {
-        cur = ggml_add(ctx0, cur, model.mm_1_b);
-    }
-
-    cur = ggml_gelu(ctx0, cur);
-    cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-    if (model.mm_2_b) {
-        cur = ggml_add(ctx0, cur, model.mm_2_b);
-    }
+    cur = build_ffn(cur,
+            model.mm_1_w, model.mm_1_b,
+            nullptr, nullptr,
+            model.mm_2_w, model.mm_2_b,
+            FFN_GELU,
+            -1);
 }

 // arrangement of the [IMG_BREAK] token
@@ -866,16 +863,12 @@ struct clip_graph {
 // multimodal projection
 ggml_tensor * embeddings = inpL;
 embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
-
-embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
-embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
-
-// GELU activation
-embeddings = ggml_gelu(ctx0, embeddings);
-
-// Second linear layer
-embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
-embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
+embeddings = build_ffn(embeddings,
+        model.mm_0_w, model.mm_0_b,
+        nullptr, nullptr,
+        model.mm_1_w, model.mm_1_b,
+        FFN_GELU,
+        -1);

 if (use_window_attn) {
     window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
@@ -1253,11 +1246,12 @@ struct clip_graph {
     // projector LayerNorm uses pytorch's default eps = 1e-5
     // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
     cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
-    cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-    cur = ggml_add(ctx0, cur, model.mm_1_b);
-    cur = ggml_gelu(ctx0, cur);
-    cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
-    cur = ggml_add(ctx0, cur, model.mm_3_b);
+    cur = build_ffn(cur,
+            model.mm_1_w, model.mm_1_b,
+            nullptr, nullptr,
+            model.mm_3_w, model.mm_3_b,
+            FFN_GELU,
+            -1);
 }

 // build the graph
@@ -1408,11 +1402,12 @@ struct clip_graph {
     cb(cur, "proj_inp_normed", -1);

     // projection mlp
-    cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-    cur = ggml_add(ctx0, cur, model.mm_1_b);
-    cur = ggml_gelu(ctx0, cur);
-    cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
-    cur = ggml_add(ctx0, cur, model.mm_2_b);
+    cur = build_ffn(cur,
+            model.mm_1_w, model.mm_1_b,
+            nullptr, nullptr,
+            model.mm_2_w, model.mm_2_b,
+            FFN_GELU,
+            -1);
     cb(cur, "proj_out", -1);
 }

@@ -1883,9 +1878,12 @@ struct clip_graph {

 } else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) {
     // projector
-    cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-    cur = ggml_gelu_erf(ctx0, cur);
-    cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+    cur = build_ffn(cur,
+            model.mm_1_w, model.mm_1_b,
+            nullptr, nullptr,
+            model.mm_2_w, model.mm_2_b,
+            FFN_GELU_ERF,
+            -1);

 } else {
     GGML_ABORT("%s: unknown projector type", __func__);
@@ -2070,34 +2068,66 @@ struct clip_graph {

     // self-attention
     {
-        ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
-        if (layer.q_b) {
-            Qcur = ggml_add(ctx0, Qcur, layer.q_b);
-        }
+        ggml_tensor * Qcur = nullptr;
+        ggml_tensor * Kcur = nullptr;
+        ggml_tensor * Vcur = nullptr;
+        if (layer.qkv_w != nullptr) {
+            // fused qkv
+            cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
+            if (layer.qkv_b != nullptr) {
+                cur = ggml_add(ctx0, cur, layer.qkv_b);
+            }

-        ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
-        if (layer.k_b) {
-            Kcur = ggml_add(ctx0, Kcur, layer.k_b);
-        }
+            Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                /* nb1    */ ggml_row_size(cur->type, d_head),
+                /* nb2    */ cur->nb[1],
+                /* offset */ 0);

-        ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
-        if (layer.v_b) {
-            Vcur = ggml_add(ctx0, Vcur, layer.v_b);
-        }
+            Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                /* nb1    */ ggml_row_size(cur->type, d_head),
+                /* nb2    */ cur->nb[1],
+                /* offset */ ggml_row_size(cur->type, n_embd));

-        if (layer.q_norm) {
-            Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
-            cb(Qcur, "Qcur_norm", il);
-        }
+            Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                /* nb1    */ ggml_row_size(cur->type, d_head),
+                /* nb2    */ cur->nb[1],
+                /* offset */ ggml_row_size(cur->type, 2 * n_embd));

-        if (layer.k_norm) {
-            Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
-            cb(Kcur, "Kcur_norm", il);
-        }
+            // TODO: q/k norm requires row size == n_embd, while here it's d_head
+            // we can add support in the future if needed
+            GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr);

-        Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
-        Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
-        Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+        } else {
+            // separate q, k, v
+            Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+            if (layer.q_b) {
+                Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+            }
+
+            Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+            if (layer.k_b) {
+                Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+            }
+
+            Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+            if (layer.v_b) {
+                Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+            }
+
+            if (layer.q_norm) {
+                Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
+                cb(Qcur, "Qcur_norm", il);
+            }
+
+            if (layer.k_norm) {
+                Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
+                cb(Kcur, "Kcur_norm", il);
+            }
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+        }

         cb(Qcur, "Qcur", il);
         cb(Kcur, "Kcur", il);

tools/mtmd/mtmd-cli.cpp

Lines changed: 3 additions & 1 deletion
@@ -318,7 +318,9 @@ int main(int argc, char ** argv) {
     g_is_generating = true;
     if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
         for (size_t i = 0; i < params.image.size(); i++) {
-            params.prompt += mtmd_default_marker();
+            // most models require the marker before each image
+            // ref: https://github.com/ggml-org/llama.cpp/pull/17616
+            params.prompt = mtmd_default_marker() + params.prompt;
         }
     }
     common_chat_msg msg;
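
To make the behavior change concrete, here is a hypothetical standalone helper mirroring the loop above (place_markers is an invented name; mtmd_default_marker() returns the <__media__> placeholder that also appears in tests.sh below). With two images and no explicit marker in the prompt, the markers now land before the prompt text instead of being appended after it:

#include <cstddef>
#include <string>

// Invented illustration of the new marker placement; not code from the commit.
static std::string place_markers(std::string prompt, size_t n_images,
                                 const std::string & marker) {
    if (prompt.find(marker) == std::string::npos) {
        for (size_t i = 0; i < n_images; i++) {
            prompt = marker + prompt; // prepend, one marker per image
        }
    }
    return prompt;
}
// place_markers("describe the images", 2, "<__media__>")
//   -> "<__media__><__media__>describe the images"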

tools/mtmd/tests.sh

Lines changed: 32 additions & 19 deletions
@@ -32,33 +32,42 @@ fi

 arr_prefix=()
 arr_hf=()
-arr_tmpl=() # chat template
+arr_extra_args=()
 arr_file=()

 add_test_vision() {
     local hf=$1
-    local tmpl=${2:-""} # default to empty string if not provided
+    shift
+    local extra_args=""
+    if [ $# -gt 0 ]; then
+        extra_args=$(printf " %q" "$@")
+    fi
     arr_prefix+=("[vision]")
     arr_hf+=("$hf")
-    arr_tmpl+=("$tmpl")
+    arr_extra_args+=("$extra_args")
     arr_file+=("test-1.jpeg")
 }

 add_test_audio() {
     local hf=$1
+    shift
+    local extra_args=""
+    if [ $# -gt 0 ]; then
+        extra_args=$(printf " %q" "$@")
+    fi
     arr_prefix+=("[audio] ")
     arr_hf+=("$hf")
-    arr_tmpl+=("") # no need for chat tmpl
+    arr_extra_args+=("$extra_args")
     arr_file+=("test-2.mp3")
 }

 add_test_vision "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
 add_test_vision "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
 add_test_vision "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
 add_test_vision "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
-add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
-add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna"
-add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" "vicuna"
+add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M" -p "name of the newspaper?<__media__>"
+add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" --chat-template vicuna
+add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" --chat-template vicuna
 add_test_vision "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
 add_test_vision "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
 add_test_vision "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
@@ -79,7 +88,7 @@ add_test_audio "ggml-org/Voxtral-Mini-3B-2507-GGUF:Q4_K_M"
 # to test the big models, run: ./tests.sh big
 if [ "$RUN_BIG_TESTS" = true ]; then
     add_test_vision "ggml-org/pixtral-12b-GGUF:Q4_K_M"
-    add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" "mistral-v7"
+    add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" --chat-template mistral-v7
     add_test_vision "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
     add_test_vision "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
     add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
@@ -89,7 +98,7 @@ if [ "$RUN_BIG_TESTS" = true ]; then
     add_test_vision "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
     add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
     # add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
-    add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M"
+    # add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M" # not always working

     add_test_audio "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M"
     add_test_audio "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
@@ -122,31 +131,35 @@ for i in "${!arr_hf[@]}"; do
     bin="llama-mtmd-cli"
     prefix="${arr_prefix[$i]}"
     hf="${arr_hf[$i]}"
-    tmpl="${arr_tmpl[$i]}"
+    extra_args="${arr_extra_args[$i]}"
     inp_file="${arr_file[$i]}"

     echo "Running test with binary: $bin and HF model: $hf"
     echo ""
     echo ""

-    output=$(\
-        "$PROJ_ROOT/build/bin/$bin" \
-        -hf "$hf" \
-        --image $SCRIPT_DIR/$inp_file \
-        -p "what is the publisher name of the newspaper?" \
+    cmd="$(printf %q "$PROJ_ROOT/build/bin/$bin") \
+        -hf $(printf %q "$hf") \
+        --image $(printf %q "$SCRIPT_DIR/$inp_file") \
         --temp 0 -n 128 \
-        ${tmpl:+--chat-template "$tmpl"} \
-        2>&1 | tee /dev/tty)
+        ${extra_args}"
+
+    # if extra_args does not contain -p, we add a default prompt
+    if ! [[ "$extra_args" =~ "-p" ]]; then
+        cmd+=" -p \"what is the publisher name of the newspaper?\""
+    fi
+
+    output=$(eval "$cmd" 2>&1 | tee /dev/tty)

     echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log

     # either contains "new york" or both "men" and "walk"
     if echo "$output" | grep -iq "new york" \
         || (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
     then
-        result="$prefix \033[32mOK\033[0m: $bin $hf"
+        result="$prefix \033[32mOK\033[0m: $hf"
     else
-        result="$prefix \033[31mFAIL\033[0m: $bin $hf"
+        result="$prefix \033[31mFAIL\033[0m: $hf"
     fi
     echo -e "$result"
     arr_res+=("$result")
