@@ -1439,6 +1439,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
     { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
     { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
     { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+    { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
     // NextN/MTP tensors - preserved but unused (in final layer, dynamic layer number)
     { LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.eh_proj" },
     { LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.embed_tokens" },
@@ -9035,9 +9036,9 @@ static bool llm_load_tensors(
         model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
     }

-    // --- NextN / MTP tensors (preserved but unused), on the final layer ---
+    // NextN/MTP tensors (preserved but unused) - only in the final layer (46 for Air, 92 for GLM-4.5)
     {
-        const int final_layer = n_layer - 1;
+        const int final_layer = n_layer - 1; // NextN tensors are in the last layer only
         // EH_PROJ: [2*embd, embd]
         create_tensor(ctx_for_layer(final_layer),
             tn(LLM_TENSOR_NEXTN_EH_PROJ, final_layer),
@@ -9081,9 +9082,9 @@ static bool llm_load_tensors(
         layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
         layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
         layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
-        layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, llama_model_loader::TENSOR_NOT_REQUIRED);
-        layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
-        layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
+        layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, 0);
+        layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, 0);
+        layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, 0);

         layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);

@@ -9106,8 +9107,8 @@ static bool llm_load_tensors(
             create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
         // gate bias
         layer.ffn_exp_probs_b =
-            create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), { n_expert },
-                llama_model_loader::TENSOR_NOT_REQUIRED);
+            create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_EXP_PROBS_B, i), { n_expert },
+                0);

         if (n_expert == 0) {
             GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
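
For context on the new tensor: blk.%d.exp_probs_b holds a per-expert routing bias (the expert-score correction bias of DeepSeek-style sigmoid routing, which GLM-4.5 also uses). It is added to the gate's expert scores when choosing the top-k experts, while the unbiased scores are still what weight the selected experts' outputs. Below is a minimal, self-contained C++ sketch of that selection step only; the function and variable names are illustrative and this is not the graph code llama.cpp builds in its MoE FFN.

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

// Illustrative sketch (names are hypothetical, not llama.cpp's MoE build code):
// the per-expert bias exp_probs_b is added to the router scores only to decide
// WHICH experts are selected; the unbiased scores are typically what is then
// normalized to weight the selected experts' outputs.
static std::vector<int> select_topk_experts(const std::vector<float> & scores,
                                            const std::vector<float> & exp_probs_b,
                                            size_t k) {
    std::vector<int> idx(scores.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
        [&](int a, int b) {
            return scores[a] + exp_probs_b[a] > scores[b] + exp_probs_b[b];
        });
    idx.resize(k);
    return idx;
}

int main() {
    const std::vector<float> scores      = {0.10f, 0.40f, 0.30f, 0.20f};
    const std::vector<float> exp_probs_b = {0.00f, -0.50f, 0.00f, 0.25f};
    // Without the bias, experts 1 and 2 would win; with it, experts 3 and 2 do.
    for (int e : select_topk_experts(scores, exp_probs_b, 2)) {
        std::printf("selected expert %d\n", e);
    }
    return 0;
}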