Commit 4250fb4

Update llama.cpp
1 parent 83d2bb3 commit 4250fb4

File tree

1 file changed: +8 -7 lines changed


src/llama.cpp

Lines changed: 8 additions & 7 deletions
@@ -1439,6 +1439,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
 { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
 { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
 { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
 // NextN/MTP tensors - preserved but unused (in final layer, dynamic layer number)
 { LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.eh_proj" },
 { LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.embed_tokens" },
@@ -9035,9 +9036,9 @@ static bool llm_load_tensors(
 model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
 }

-// --- NextN / MTP tensors (preserved but unused), on the final layer ---
+// NextN/MTP tensors (preserved but unused) - only in final layer (46 for Air, 92 for GLM-4.5)
 {
-const int final_layer = n_layer - 1;
+const int final_layer = n_layer - 1; // NextN tensors are in the last layer only
 // EH_PROJ: [2*embd, embd]
 create_tensor(ctx_for_layer(final_layer),
 tn(LLM_TENSOR_NEXTN_EH_PROJ, final_layer),
@@ -9081,9 +9082,9 @@ static bool llm_load_tensors(
 layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
 layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
 layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
-layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, llama_model_loader::TENSOR_NOT_REQUIRED);
-layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
-layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, 0);
+layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, 0);
+layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, 0);

 layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);

@@ -9106,8 +9107,8 @@ static bool llm_load_tensors(
 create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
 // gate bias
 layer.ffn_exp_probs_b =
-create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), { n_expert },
-llama_model_loader::TENSOR_NOT_REQUIRED);
+create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_EXP_PROBS_B, i), { n_expert },
+0);

 if (n_expert == 0) {
 GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
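Taken together, the hunks register a dedicated name for the expert-probability bias (LLM_TENSOR_FFN_EXP_PROBS_B mapped to "blk.%d.exp_probs_b"), load that bias under its own name rather than as an FFN_GATE_INP "bias", and make it and the attention Q/K/V biases required tensors (final argument 0 instead of llama_model_loader::TENSOR_NOT_REQUIRED). As a minimal sketch of the naming convention only - format_tensor_name() below is a hypothetical stand-in for llama.cpp's tn() helper, whose real signature differs - the %d placeholder expands to the block (layer) index:

// Hypothetical illustration, not the real llama.cpp tn() helper: shows how a
// template such as "blk.%d.exp_probs_b" expands to a per-layer tensor name.
#include <cstdio>
#include <string>

static std::string format_tensor_name(const char * templ, int block_idx) {
    char buf[128];
    // Substitute the block index into the "%d" placeholder of the template.
    std::snprintf(buf, sizeof(buf), templ, block_idx);
    return buf;
}

int main() {
    // With the mapping added in this commit, layer 5's expert-probability
    // bias would be looked up under the name printed below.
    std::printf("%s\n", format_tensor_name("blk.%d.exp_probs_b", 5).c_str()); // blk.5.exp_probs_b
    return 0;
}

Presumably, making these tensors required means model loading now reports a missing-tensor error when they are absent from the GGUF file instead of silently leaving the corresponding pointers null.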
