@@ -1439,7 +1439,6 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
            { LLM_TENSOR_FFN_GATE_SHEXP,     "blk.%d.ffn_gate_shexp" },
            { LLM_TENSOR_FFN_DOWN_SHEXP,     "blk.%d.ffn_down_shexp" },
            { LLM_TENSOR_FFN_UP_SHEXP,       "blk.%d.ffn_up_shexp" },
-           { LLM_TENSOR_FFN_EXP_PROBS_B,    "blk.%d.exp_probs_b" },
            // NextN/MTP tensors - preserved but unused (in final layer, dynamic layer number)
            { LLM_TENSOR_NEXTN_EH_PROJ,      "blk.%d.eh_proj" },
            { LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.embed_tokens" },
@@ -9036,9 +9035,9 @@ static bool llm_load_tensors(
             model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
         }

-        // NextN/ MTP tensors (preserved but unused) - only in final layer (46 for Air, 92 for GLM-4.5)
+        // --- NextN / MTP tensors (preserved but unused), on the final layer ---
         {
-            const int final_layer = n_layer - 1; // NextN tensors are in the last layer only
+            const int final_layer = n_layer - 1;
             // EH_PROJ: [2*embd, embd]
             create_tensor(ctx_for_layer(final_layer),
                 tn(LLM_TENSOR_NEXTN_EH_PROJ, final_layer),
@@ -9082,9 +9081,9 @@ static bool llm_load_tensors(
             layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
             layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
             layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
-            layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, 0 );
-            layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, 0 );
-            layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, 0 );
+            layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, llama_model_loader::TENSOR_NOT_REQUIRED );
+            layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED );
+            layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED );

             layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);

@@ -9107,8 +9106,8 @@ static bool llm_load_tensors(
                 create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
             // gate bias
             layer.ffn_exp_probs_b =
-                create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_EXP_PROBS_B , i), { n_expert },
-                    0 );
+                create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "bias" , i), { n_expert },
+                    llama_model_loader::TENSOR_NOT_REQUIRED );

             if (n_expert == 0) {
                 GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
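For context on the bias lines switching from `0` to `llama_model_loader::TENSOR_NOT_REQUIRED`: that flag makes a tensor optional, so a checkpoint that ships without attention biases or without the expert-gate bias still loads, with the corresponding pointer left null. The following is a minimal, self-contained sketch of that pattern only; the names (`fake_tensor`, `g_tensors`, the simplified `create_tensor`) are hypothetical and not llama.cpp's actual loader API.

```cpp
#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>

struct fake_tensor { std::string name; };

enum tensor_flags : int {
    TENSOR_REQUIRED     = 0,
    TENSOR_NOT_REQUIRED = 1 << 0,  // stands in for llama_model_loader::TENSOR_NOT_REQUIRED
};

// Stand-in for the model's tensor table, keyed by GGUF tensor name.
// Note: no "blk.0.attn_q.bias" entry, mimicking a checkpoint without attention biases.
static std::map<std::string, fake_tensor> g_tensors = {
    { "blk.0.attn_q.weight", { "blk.0.attn_q.weight" } },
};

// Optional-tensor lookup: missing + NOT_REQUIRED -> nullptr, missing + required -> error.
static fake_tensor * create_tensor(const std::string & name, int flags) {
    auto it = g_tensors.find(name);
    if (it != g_tensors.end()) {
        return &it->second;
    }
    if (flags & TENSOR_NOT_REQUIRED) {
        return nullptr;  // caller must tolerate a null tensor (e.g. skip the bias add)
    }
    throw std::runtime_error("missing required tensor: " + name);
}

int main() {
    fake_tensor * wq = create_tensor("blk.0.attn_q.weight", TENSOR_REQUIRED);
    fake_tensor * bq = create_tensor("blk.0.attn_q.bias",   TENSOR_NOT_REQUIRED);
    std::printf("wq=%s bq=%s\n", wq->name.c_str(), bq ? bq->name.c_str() : "(absent)");
    return 0;
}
```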