
Commit 4d7d994

Merge pull request #8 from sfallah/sf/deepseek-ocr-cleanup
Sf/deepseek ocr cleanup
2 parents 7451b84 + fc3f625 commit 4d7d994

File tree

7 files changed: +131 -69 lines changed

convert_hf_to_gguf.py

Lines changed: 12 additions & 52 deletions
@@ -1579,15 +1579,7 @@ def __init__(self, *args, **kwargs):
 
         # TODO @ngxson : this is a hack to support both vision and audio encoders
         have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder
-        self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys, True)
-        # FIXME: DeepseekOCRVisionModel specific hack
-        if self.block_count is None:
-            if isinstance(self, DeepseekOCRVisionModel):
-                clip_block_count = self.hparams['layers']
-                if clip_block_count is not None:
-                    self.block_count = clip_block_count
-        if self.block_count is None:
-            raise KeyError(f"could not find block count using any of: {self.n_block_keys}")
+        self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys)
         self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
 
         # load preprocessor config
@@ -6003,16 +5995,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
 @ModelBase.register("DeepseekOCRForCausalLM")
 class DeepseekOCRVisionModel(MmprojModel):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        proc_fname = self.dir_model / "processor_config.json"
-
-        if proc_fname.is_file():
-            with open(proc_fname, "r") as f:
-                self.preprocessor_config = json.load(f)
-
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
@@ -6071,27 +6053,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if ".attn.rel_pos_h" in name or ".attn.rel_pos_w" in name:
             return [(self.map_tensor_name(name, try_suffixes=("",)), data_torch)]
 
-        if name.startswith("model.vision_model.transformer.layers."):
-            # process visual tensors
-            # split QKV tensors if needed
-            if ".qkv_proj." in name:
-                if data_torch.ndim == 2: # weight
-                    c3, _ = data_torch.shape
-                else: # bias
-                    c3 = data_torch.shape[0]
-                assert c3 % 3 == 0
-                c = c3 // 3
-                wq = data_torch[:c]
-                wk = data_torch[c: c * 2]
-                wv = data_torch[c * 2:]
-                return [
-                    (self.map_tensor_name(name.replace("qkv", "q")), wq),
-                    (self.map_tensor_name(name.replace("qkv", "k")), wk),
-                    (self.map_tensor_name(name.replace("qkv", "v")), wv),
-                ]
-            else:
-                return [(self.map_tensor_name(name), data_torch)]
-
         return [(self.map_tensor_name(name), data_torch)]
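With this cleanup the QKV split no longer happens at conversion time; the fused tensor is written to the GGUF as-is and split at graph-build time in tools/mtmd/clip.cpp (see that file's hunk below). For reference, a minimal standalone sketch of the split the removed code performed, assuming only that Q, K and V are stacked along dim 0 (toy shapes, purely illustrative):

import numpy as np

def split_qkv(fused: np.ndarray):
    # fused is [3*d_out, d_in] for a weight or [3*d_out] for a bias
    c3 = fused.shape[0]
    assert c3 % 3 == 0
    c = c3 // 3
    return fused[:c], fused[c:c * 2], fused[c * 2:]

wq, wk, wv = split_qkv(np.zeros((3 * 8, 16), dtype=np.float32))  # toy weight
assert wq.shape == (8, 16)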

@@ -7263,12 +7224,20 @@ def prepare_tensors(self):
 @ModelBase.register(
     "DeepseekV2ForCausalLM",
     "DeepseekV3ForCausalLM",
-    "DeepseekOCRForCausalLM",
     "KimiVLForConditionalGeneration",
 )
 class DeepseekV2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        vision_config = self.hparams.get('vision_config', {}).get('width', {})
+
+        if 'clip-l-14-224' in vision_config and 'sam_vit_b' in vision_config:
+            self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
+            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
+            self.gguf_writer.add_architecture()
+
     def set_vocab(self):
         try:
             self._set_vocab_gpt2()
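The new __init__ detects the OCR variant from the checkpoint's vision_config rather than from a layer count (compare the set_gguf_parameters hunk below). A hedged sketch of the config.json shape this lookup assumes, where vision_config.width is a dict keyed by encoder name; the width values here are invented for illustration:

hparams = {
    "vision_config": {
        "width": {
            "clip-l-14-224": 1024,  # hypothetical value
            "sam_vit_b": 768,       # hypothetical value
        }
    }
}
width = hparams.get('vision_config', {}).get('width', {})
assert 'clip-l-14-224' in width and 'sam_vit_b' in width  # selects DEEPSEEK2OCR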
@@ -7324,7 +7293,7 @@ def set_vocab(self):
             raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!")
 
     def set_gguf_parameters(self):
-        is_ocr = (self.hparams["num_hidden_layers"] == 12)
+        is_ocr = (self.model_arch == gguf.MODEL_ARCH.DEEPSEEK2OCR)
 
         if is_ocr:
             self.hparams['rope_theta'] = self.hparams.get('rope_theta', 10000.0)
@@ -7335,11 +7304,9 @@ def set_gguf_parameters(self):
 
         super().set_gguf_parameters()
         hparams = self.hparams
-        kv_lora_rank = hparams["q_lora_rank"] if hparams["q_lora_rank"] is not None else 512
+        kv_lora_rank = hparams["kv_lora_rank"] if hparams.get("kv_lora_rank") is not None else 512
         routed_scaling_factor = hparams.get("routed_scaling_factor", 1.0)
         norm_topk_prob = hparams.get("norm_topk_prob", False)
-        scoring_func = hparams.get("scoring_func", "softmax")
-
         self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
         if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
@@ -7361,12 +7328,6 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
         self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
 
-        if scoring_func == "sigmoid":
-            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
-        elif scoring_func == "softmax":
-            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
-        else:
-            raise ValueError(f"Unsupported scoring_func value: {scoring_func}")
         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
 
         rope_scaling = self.hparams.get("rope_scaling") or {}
@@ -7462,7 +7423,6 @@ def prepare_tensors(self):
         if len(experts) > 0:
             raise ValueError(f"Unprocessed experts: {experts}")
 
-
 @ModelBase.register("MiniMaxM2ForCausalLM")
 class MiniMaxM2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.MINIMAXM2

gguf-py/gguf/constants.py

Lines changed: 38 additions & 0 deletions
@@ -408,6 +408,7 @@ class MODEL_ARCH(IntEnum):
     ARCTIC = auto()
     DEEPSEEK = auto()
     DEEPSEEK2 = auto()
+    DEEPSEEK2OCR = auto()
     CHATGLM = auto()
     GLM4 = auto()
     GLM4_MOE = auto()
@@ -797,6 +798,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.ARCTIC: "arctic",
     MODEL_ARCH.DEEPSEEK: "deepseek",
     MODEL_ARCH.DEEPSEEK2: "deepseek2",
+    MODEL_ARCH.DEEPSEEK2OCR: "deepseek2-ocr",
     MODEL_ARCH.CHATGLM: "chatglm",
     MODEL_ARCH.GLM4: "glm4",
     MODEL_ARCH.GLM4_MOE: "glm4moe",
@@ -2375,6 +2377,38 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_UP_SHEXP,
         MODEL_TENSOR.FFN_EXP_PROBS_B,
     ],
+    MODEL_ARCH.DEEPSEEK2OCR: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_A,
+        MODEL_TENSOR.ATTN_Q_B,
+        MODEL_TENSOR.ATTN_KV_A_MQA,
+        MODEL_TENSOR.ATTN_KV_B,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_B,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_V_B,
+        MODEL_TENSOR.ATTN_Q_A_NORM,
+        MODEL_TENSOR.ATTN_KV_A_NORM,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+        MODEL_TENSOR.FFN_EXP_PROBS_B,
+    ],
     MODEL_ARCH.ERNIE4_5_MOE: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -3192,6 +3226,10 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
     ],
+    MODEL_ARCH.DEEPSEEK2OCR: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
     MODEL_ARCH.CHATGLM: [
         MODEL_TENSOR.ROPE_FREQS,
    ],
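A quick sketch of how these constants are consumed when a GGUF file is written, mirroring the MODEL_ARCH_NAMES lookup in the convert_hf_to_gguf.py hunk above (the output file name is illustrative):

import gguf  # the gguf-py package from this repo

arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
writer = gguf.GGUFWriter("deepseek2-ocr.gguf", gguf.MODEL_ARCH_NAMES[arch])
writer.add_architecture()  # writes general.architecture = "deepseek2-ocr"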

src/llama-arch.cpp

Lines changed: 35 additions & 0 deletions
@@ -66,6 +66,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_ARCTIC, "arctic" },
     { LLM_ARCH_DEEPSEEK, "deepseek" },
     { LLM_ARCH_DEEPSEEK2, "deepseek2" },
+    { LLM_ARCH_DEEPSEEK2OCR, "deepseek2-ocr" },
     { LLM_ARCH_CHATGLM, "chatglm" },
     { LLM_ARCH_GLM4, "glm4" },
     { LLM_ARCH_GLM4_MOE, "glm4moe" },
@@ -1549,6 +1550,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
     },
 },
+{
+    LLM_ARCH_DEEPSEEK2OCR,
+    {
+        { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+        { LLM_TENSOR_OUTPUT, "output" },
+        { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+        { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" },
+        { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
+        { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+        { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+        { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+        { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
+        { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
+        { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
+        { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
+        { LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" },
+        { LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" },
+        { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+        { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+        { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+        { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+        { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+        { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+        { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+        { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+        { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+        { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+        { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+        { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+    },
+},
 {
     LLM_ARCH_PLM,
     {
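The per-layer names above are printf-style templates; the %d is filled with the block index when tensors are resolved at load time. Trivially, in Python:

pattern = "blk.%d.attn_kv_a_mqa"
print([pattern % i for i in range(3)])
# ['blk.0.attn_kv_a_mqa', 'blk.1.attn_kv_a_mqa', 'blk.2.attn_kv_a_mqa']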

src/llama-arch.h

Lines changed: 1 addition & 0 deletions
@@ -70,6 +70,7 @@ enum llm_arch {
     LLM_ARCH_ARCTIC,
     LLM_ARCH_DEEPSEEK,
     LLM_ARCH_DEEPSEEK2,
+    LLM_ARCH_DEEPSEEK2OCR,
     LLM_ARCH_CHATGLM,
     LLM_ARCH_GLM4,
     LLM_ARCH_GLM4_MOE,

src/llama-kv-cache.cpp

Lines changed: 1 addition & 1 deletion
@@ -1385,7 +1385,7 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
 
     // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
    // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-    const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
+    const float yarn_attn_factor = (model.arch == LLM_ARCH_DEEPSEEK2 || model.arch == LLM_ARCH_DEEPSEEK2OCR)
         ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
         : cparams.yarn_attn_factor;
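DEEPSEEK2OCR now takes the same DeepSeek2-specific YaRN mscale path as DEEPSEEK2 when shifting the KV cache. As a standalone sketch of the expression above (the freq_scale value is illustrative):

import math

def deepseek2_yarn_attn_factor(freq_scale: float) -> float:
    # 1 / (1 + 0.1 * ln(1/freq_scale)), per the branch above
    return 1.0 / (1.0 + 0.1 * math.log(1.0 / freq_scale))

print(deepseek2_yarn_attn_factor(0.25))  # 4x context extension -> ~0.8782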

src/llama-model.cpp

Lines changed: 7 additions & 3 deletions
@@ -1605,10 +1605,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
             } break;
         case LLM_ARCH_DEEPSEEK2:
+        case LLM_ARCH_DEEPSEEK2OCR:
             {
                 // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
                 bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
-                bool is_ocr = (name.find("ocr") != std::string::npos || name.find("OCR") != std::string::npos);
+                bool is_ocr = (arch == LLM_ARCH_DEEPSEEK2OCR);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
                 if (!is_lite && !is_ocr) {
@@ -4659,10 +4660,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 }
             } break;
         case LLM_ARCH_DEEPSEEK2:
+        case LLM_ARCH_DEEPSEEK2OCR:
             {
                 // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
                 const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
-                const bool is_ocr = (name.find("ocr") != std::string::npos || name.find("OCR") != std::string::npos);
+                const bool is_ocr = (arch == LLM_ARCH_DEEPSEEK2OCR);
 
                 const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);

@@ -6879,7 +6881,7 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
     }
 
-    if (arch == LLM_ARCH_DEEPSEEK2) {
+    if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_DEEPSEEK2OCR) {
         LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
         LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
         LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
@@ -7406,6 +7408,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                 llm = std::make_unique<llm_build_deepseek>(*this, params);
             } break;
         case LLM_ARCH_DEEPSEEK2:
+        case LLM_ARCH_DEEPSEEK2OCR:
             {
                 llm = std::make_unique<llm_build_deepseek2>(*this, params);
             } break;
@@ -7754,6 +7757,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ARCTIC:
         case LLM_ARCH_DEEPSEEK:
         case LLM_ARCH_DEEPSEEK2:
+        case LLM_ARCH_DEEPSEEK2OCR:
         case LLM_ARCH_PLM:
         case LLM_ARCH_CHATGLM:
         case LLM_ARCH_GLM4:

tools/mtmd/clip.cpp

Lines changed: 37 additions & 13 deletions
@@ -2152,19 +2152,44 @@ struct clip_graph {
 
         // self-attention
         {
-            ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
-            if (layer.q_b) {
-                Qcur = ggml_add(ctx0, Qcur, layer.q_b);
-            }
-
-            ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
-            if (layer.k_b) {
-                Kcur = ggml_add(ctx0, Kcur, layer.k_b);
-            }
+            ggml_tensor * Qcur;
+            ggml_tensor * Kcur;
+            ggml_tensor * Vcur;
+
+            if (layer.qkv_w) {
+                ggml_tensor * QKV;
 
-            ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
-            if (layer.v_b) {
-                Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+                QKV = ggml_mul_mat(ctx0, layer.qkv_w, cur);
+                if (layer.qkv_b) {
+                    QKV = ggml_add(ctx0, QKV, layer.qkv_b);
+                }
+                QKV = ggml_reshape_4d(ctx0, QKV, cur->ne[0], 3, cur->ne[1]*cur->ne[2], cur->ne[3]);
+
+                const int ne0 = QKV->ne[0];
+                const int ne2 = QKV->ne[2];
+                const int ne3 = QKV->ne[3];
+                const int nb1 = QKV->nb[1];
+                const int nb2 = QKV->nb[2];
+                const int nb3 = QKV->nb[3];
+
+                Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, QKV, ne0, ne2, ne3, nb2, nb3, 0*nb1));
+                Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, QKV, ne0, ne2, ne3, nb2, nb3, 1*nb1));
+                Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, QKV, ne0, ne2, ne3, nb2, nb3, 2*nb1));
+            } else {
+                Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+                if (layer.q_b) {
+                    Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+                }
+
+                Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+                if (layer.k_b) {
+                    Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+                }
+
+                Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+                if (layer.v_b) {
+                    Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+                }
             }
 
             if (layer.q_norm) {
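The fused path above reshapes the QKV activation so that dim 1 indexes the Q/K/V slot, then takes one strided view per slot. A hedged numpy analog (ggml's dim 0 is the fastest-varying, so the axis order is reversed relative to a row-major array; toy sizes):

import numpy as np

n_embd, n_tokens = 8, 4
fused = np.arange(n_tokens * 3 * n_embd, dtype=np.float32).reshape(n_tokens, 3 * n_embd)

qkv = fused.reshape(n_tokens, 3, n_embd)   # mirrors ggml_reshape_4d above
q, k, v = qkv[:, 0], qkv[:, 1], qkv[:, 2]  # mirrors the three ggml_view_3d calls
assert q.shape == (n_tokens, n_embd)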
@@ -2260,7 +2285,6 @@
         const int64_t C = rel_pos->ne[0]; // channels
         const int64_t L = rel_pos->ne[1]; // length
 
-        //GGML_ASSERT(2*std::max(q_size, k_size) - 1 == L);
 
         const auto max_rel_dist = 2*std::max(q_size, k_size) - 1;
         ggml_tensor * rel_pos_resized = rel_pos;
