Commit 14e10f5

Merge branch 'ikawrakow:main' into main
2 parents 7ba8b54 + 0a3e1d1 commit 14e10f5

21 files changed: 1950 additions & 492 deletions

common/common.cpp

Lines changed: 24 additions & 3 deletions
@@ -1047,11 +1047,21 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.mmproj_use_gpu = false;
         return true;
     }
-    if (arg == "--image") {
+    if (arg == "--image" || arg == "--audio") {
         CHECK_ARG
         params.image.emplace_back(argv[i]);
         return true;
     }
+    if (arg == "--image-min-tokens") {
+        CHECK_ARG
+        params.image_min_tokens = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--image-max-tokens") {
+        CHECK_ARG
+        params.image_max_tokens = std::stoi(argv[i]);
+        return true;
+    }
     if (arg == "-i" || arg == "--interactive") {
         params.interactive = true;
         return true;
@@ -2190,6 +2200,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "multi-modality" });
     options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
     options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
+    options.push_back({ "*", " --image-min-tokens N", "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)"});
+    options.push_back({ "*", " --image-max-tokens N", "maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)" });
     options.push_back({ "*", " --no-context-shift", "disable context-shift." });
     options.push_back({ "*", "--context-shift (auto|on|off|0|1)", "set context-shift (default: %s)", params.ctx_shift ? "on" : "off" });
     options.push_back({ "backend" });
@@ -2992,11 +3004,20 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
 
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
     auto cparams = llama_context_default_params();
+    int n_batch = params.n_batch;
+    int n_ubatch = params.n_ubatch;
+
+    // temporary fix for qwen mtmd
+    if (!params.mmproj.path.empty()) {
+        n_batch = std::max(params.n_batch, params.n_ubatch);
+        n_ubatch = params.n_batch;
+        fprintf(stdout, "Adjust batch size for mtmd: u_batch = %d, batch = %d\n", n_ubatch, n_batch);
+    }
 
     cparams.n_ctx = params.n_ctx;
     cparams.n_seq_max = params.n_parallel;
-    cparams.n_batch = params.n_batch;
-    cparams.n_ubatch = params.n_ubatch;
+    cparams.n_batch = n_batch;
+    cparams.n_ubatch = n_ubatch;
     cparams.n_threads = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
     cparams.seed = params.seed;
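
A minimal standalone sketch (not part of the commit) of what the adjustment above does, assuming the usual llama.cpp defaults of n_batch = 2048 and n_ubatch = 512; the intent appears to be that the micro-batch is raised so multimodal embeddings can be decoded in a single ubatch:

#include <algorithm>
#include <cstdio>

int main() {
    // assumed defaults; in the real code these come from gpt_params
    int params_n_batch  = 2048;
    int params_n_ubatch = 512;

    // same adjustment as in llama_context_params_from_gpt_params above
    int n_batch  = std::max(params_n_batch, params_n_ubatch); // 2048
    int n_ubatch = params_n_batch;                            // 2048

    std::printf("Adjust batch size for mtmd: u_batch = %d, batch = %d\n", n_ubatch, n_batch);
    return 0;
}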

common/common.h

Lines changed: 2 additions & 0 deletions
@@ -287,6 +287,8 @@ struct gpt_params {
     bool mmproj_use_gpu = true; // use GPU for multimodal model
     bool no_mmproj = false; // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)
+    int image_min_tokens = -1;
+    int image_max_tokens = -1;
 
     // embedding
     bool embedding = false; // get only sentence embedding

examples/mtmd/clip-impl.h

Lines changed: 17 additions & 0 deletions
@@ -29,7 +29,9 @@
 #define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
 
 // vision-specific
+#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities
 #define KEY_IMAGE_SIZE "clip.vision.image_size"
+#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
 #define KEY_PATCH_SIZE "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN "clip.vision.image_mean"
 #define KEY_IMAGE_STD "clip.vision.image_std"
@@ -47,6 +49,7 @@
 #define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
 
 // audio-specific
+#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities
 #define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
 #define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"
 
@@ -117,6 +120,14 @@
 #define TN_MM_NORM_PRE "mm.a.norm_pre.%s"
 #define TN_MM_NORM_MID "mm.a.norm_mid.%s"
 
+// cogvlm
+#define TN_MM_POST_FC_NORM "mm.post_fc_norm.%s"
+#define TN_MM_H_TO_4H "mm.up.%s"
+#define TN_MM_GATE "mm.gate.%s"
+#define TN_MM_4H_TO_H "mm.down.%s"
+#define TN_TOK_BOI "v.boi"
+#define TN_TOK_EOI "v.eoi"
+
 // align x to upper multiple of n
 #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
 
@@ -141,6 +152,9 @@ enum projector_type {
     PROJECTOR_TYPE_VOXTRAL,
     PROJECTOR_TYPE_LFM2,
     PROJECTOR_TYPE_KIMIVL,
+    PROJECTOR_TYPE_LIGHTONOCR,
+    PROJECTOR_TYPE_COGVLM,
+    PROJECTOR_TYPE_JANUS_PRO,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -164,6 +178,9 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_VOXTRAL, "voxtral"},
     { PROJECTOR_TYPE_LFM2, "lfm2"},
     { PROJECTOR_TYPE_KIMIVL, "kimivl"},
+    { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
+    { PROJECTOR_TYPE_COGVLM, "cogvlm"},
+    { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
 };
 
 static projector_type clip_projector_type_from_string(const std::string & str) {
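
The hunk ends at the declaration of clip_projector_type_from_string, whose body lies outside this diff. As a hedged, self-contained sketch (inferred from the PROJECTOR_TYPE_NAMES map above, not code from this commit), the reverse lookup could be implemented like this:

#include <map>
#include <string>

// trimmed-down copies of the enum and name map shown in clip-impl.h above
enum projector_type {
    PROJECTOR_TYPE_LIGHTONOCR,
    PROJECTOR_TYPE_COGVLM,
    PROJECTOR_TYPE_JANUS_PRO,
    PROJECTOR_TYPE_UNKNOWN,
};

static const std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_LIGHTONOCR, "lightonocr" },
    { PROJECTOR_TYPE_COGVLM,     "cogvlm"     },
    { PROJECTOR_TYPE_JANUS_PRO,  "janus_pro"  },
};

// assumed implementation: scan the enum-to-name map and return the matching
// enum value, falling back to PROJECTOR_TYPE_UNKNOWN for unrecognized names
static projector_type clip_projector_type_from_string(const std::string & str) {
    for (const auto & kv : PROJECTOR_TYPE_NAMES) {
        if (kv.second == str) {
            return kv.first;
        }
    }
    return PROJECTOR_TYPE_UNKNOWN;
}

With a lookup of this shape, the "lightonocr", "cogvlm" and "janus_pro" names added in this commit resolve to their new enum values.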
