@@ -653,11 +653,12 @@ ggml_tensor * llm_build_context::llm_build_ffn(
         auto split_u = u->splits[id];
         auto split_g = g->splits[id];
         auto split_d = d->splits[id];
-        GGML_ASSERT((!split_u && !split_g && split_d) || (split_u && split_g && split_d));
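+        // the up/gate/down splits for a given device must be all present or all absent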
+        GGML_ASSERT((!split_u && !split_g && !split_d) || (split_u && split_g && split_d));
         if (!split_u) continue;
         auto cur = input;
         if (ffn_norm && ffn_norm->extra) {
             auto norm = (ggml_split_tensor_t *)ffn_norm->extra;
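+            // a device that holds FFN weight splits must also hold the norm split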
+            GGML_ASSERT(norm->splits[id]);
             cur = llm_build_norm(ctx, input, lctx.model.hparams, norm->splits[id], NULL, LLM_NORM_RMS, cb, il);
             cb(cur, "ffn_inp_normed", il_cb);
         }
@@ -1088,6 +1089,7 @@ llm_expert_gating_func_type gating_op,
     auto cur = input;
     if (ffn_norm) {
         auto the_ffn_norm = ffn_norm->extra ? ((ggml_split_tensor_t *)ffn_norm->extra)->splits[lctx.model.main_gpu] : ffn_norm;
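+        // when the norm tensor is split, the main GPU must hold a split of it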
+        GGML_ASSERT(the_ffn_norm);
         cur = llm_build_norm(ctx, input, lctx.model.hparams, the_ffn_norm, nullptr, LLM_NORM_RMS, cb, il);
         cb(cur, "ffn_inp_normed", il);
     }
@@ -1109,17 +1111,18 @@ llm_expert_gating_func_type gating_op,
                                gating_op, cb, il, graph);
     cb(routed_out, "routed_out", il);
     ggml_build_forward_expand(graph, routed_out);
-    // printf("Using non-split llm_build_moe_ffn for layer %d. n_before = %d, n_now = %d\n", il, n_before, graph->n_nodes);

     if (up_shexp && gate_shexp && down_shexp) {
         if (split_up_shexp) {
-            // printf("Using split ffn for shared experts in layer %d\n", il);
-            std::vector<ggml_tensor *> results(split_up_shexp->n_device);
+            std::vector<ggml_tensor *> results; results.reserve(split_up_shexp->n_device);
             GGML_ASSERT(!split_up_b_shexp || split_up_b_shexp->n_device == split_up_shexp->n_device);
             GGML_ASSERT(!split_gate_b_shexp || split_gate_b_shexp->n_device == split_up_shexp->n_device);
             GGML_ASSERT(!split_down_b_shexp || split_down_b_shexp->n_device == split_up_shexp->n_device);
             for (int id = 0; id < split_up_shexp->n_device; ++id) {
                 int il_cb = 1000*id + il;
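+                // shared-expert splits must be all-or-none per device; skip devices without one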
+                GGML_ASSERT((split_up_shexp->splits[id] && split_gate_shexp->splits[id] && split_down_shexp->splits[id]) ||
+                            (!split_up_shexp->splits[id] && !split_gate_shexp->splits[id] && !split_down_shexp->splits[id]));
+                if (!split_up_shexp->splits[id]) continue;
                 auto the_ffn_norm = ffn_norm ? ffn_norm->extra ? ((ggml_split_tensor_t *)ffn_norm->extra)->splits[id] : ffn_norm : nullptr;
                 auto shared_out = llm_build_ffn(ctx, lctx, the_ffn_norm, input,
                         split_up_shexp->splits[id], split_up_b_shexp ? split_up_b_shexp->splits[id] : nullptr, nullptr,
@@ -1130,17 +1133,19 @@ llm_expert_gating_func_type gating_op,
                 if (shared_out->ne[1] > 32) {
                     shared_out = ggml_cast(ctx, shared_out, GGML_TYPE_F16);
                 }
-            results[id] = shared_out;
+                results.push_back(shared_out);
             }
-        cur = ggml_add(ctx, results[0], results[1]);
-        if (cur->ne[1] > 32) {
-            // Force a graph split
+            GGML_ASSERT(!results.empty());
+            if (results.size() == 1) {
+                cur = results.front();
+            } else {
+                cur = ggml_add(ctx, results[0], results[1]);
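+                // force a graph split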
                 cur->op_params[0] = 0xff;
-        }
-        cb(cur, "ffn_shared_combined", il);
-        for (int id = 2; id < int(results.size()); ++id) {
-            cur = ggml_add(ctx, cur, results[id]);
                 cb(cur, "ffn_shared_combined", il);
+                for (int id = 2; id < int(results.size()); ++id) {
+                    cur = ggml_add(ctx, cur, results[id]);
+                    cb(cur, "ffn_shared_combined", il);
+                }
             }
             if (routed_out->ne[1] > 32) {
                 auto routed_out_f16 = ggml_cast(ctx, routed_out, GGML_TYPE_F16);
@@ -1150,7 +1155,6 @@ llm_expert_gating_func_type gating_op,
             }
             cb(cur, "ffn_out", il);
         } else {
-            // printf("Using non-split ffn for shared experts in layer %d\n", il);
             auto shared_out = llm_build_ffn(ctx, lctx, nullptr, cur,
                     up_shexp, up_b_shexp, nullptr,
                     gate_shexp, gate_b_shexp, nullptr,
@@ -1170,14 +1174,17 @@ llm_expert_gating_func_type gating_op,
     }
     GGML_ASSERT(split_up_exps && split_gate_exps && split_down_exps);
     GGML_ASSERT(split_up_exps->n_device == split_gate_exps->n_device && split_up_exps->n_device == split_down_exps->n_device);
-    std::vector<ggml_tensor *> results(split_up_exps->n_device);
+    std::vector<ggml_tensor *> results; results.reserve(split_up_exps->n_device);
     GGML_ASSERT((!split_up_shexp && !split_gate_shexp && !split_down_shexp) ||
                 ( split_up_shexp &&  split_gate_shexp &&  split_down_shexp));
     auto split_gate_inp = (ggml_split_tensor_t *)gate_inp->extra;
     GGML_ASSERT(split_gate_inp && split_gate_inp->n_device == split_up_exps->n_device);
     auto split_exp_probs_b = exp_probs_b ? (ggml_split_tensor_t *)exp_probs_b->extra : nullptr;
     GGML_ASSERT(!split_exp_probs_b || split_exp_probs_b->n_device == split_up_exps->n_device);
     for (int id = 0; id < split_up_exps->n_device; ++id) {
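+        // routed-expert splits must be all-or-none per device; skip devices without one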
+        GGML_ASSERT((split_up_exps->splits[id] && split_gate_exps->splits[id] && split_down_exps->splits[id]) ||
+                    (!split_up_exps->splits[id] && !split_gate_exps->splits[id] && !split_down_exps->splits[id]));
+        if (!split_up_exps->splits[id]) continue;
         int il_cb = 1000*(id + 1) + il;
         auto cur = input;
         if (ffn_norm) {
@@ -1220,8 +1227,9 @@ llm_expert_gating_func_type gating_op,
            cur = ggml_cast(ctx, cur, GGML_TYPE_F16);
            cb(cur, "ffn_out_f16", il_cb);
        }
-        results[id] = cur;
+        results.push_back(cur);
     }
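+    // at least one device must hold a slice of the experts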
+    GGML_ASSERT(!results.empty());
     if (results.size() == 1) return results.front();

     auto cur = ggml_add(ctx, results[0], results[1]);
@@ -1660,10 +1668,15 @@ static ggml_tensor * build_output(llama_context & lctx, ggml_context * ctx, ggml
             }
             cb(o.back(), "output", id);
         }
-        if (o.size() == 1) cur = o.front();
-        cur = ggml_concat(ctx, o[0], o[1], 0);
-        for (int id = 2; id < int(o.size()); ++id) {
-            cur = ggml_concat(ctx, cur, o[id], 0);
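+        // with a single device output there is nothing to concatenate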
+        GGML_ASSERT(!o.empty());
+        if (o.size() == 1) {
+            cur = o.front();
+        }
+        else {
+            cur = ggml_concat(ctx, o[0], o[1], 0);
+            for (int id = 2; id < int(o.size()); ++id) {
+                cur = ggml_concat(ctx, cur, o[id], 0);
+            }
         }
     } else {
         if (output_norm) {
@@ -1721,7 +1734,7 @@ ggml_cgraph * llm_build_context::build_llama() {

         // self-attention
         if (use_rope) {
-            cur = build_std_attention(gf, inpL, inp_pos, nullptr, this_KQ_mask, nullptr, kq_scale, hparams.f_attention_scale, this_n_swa, il);
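+            // the extra nullptr is the new inp_attn_scale argument (no temperature scaling here)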
+            cur = build_std_attention(gf, inpL, inp_pos, nullptr, this_KQ_mask, nullptr, nullptr, kq_scale, hparams.f_attention_scale, this_n_swa, il);
         }
         else {

@@ -1880,6 +1893,96 @@ ggml_cgraph * llm_build_context::build_llama() {
     return gf;
 }

+ggml_cgraph * llm_build_context::build_mistral3() {
+    auto gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+    // inp_pos - contains the positions
+    struct ggml_tensor * inp_pos = build_inp_pos();
+
+    // (optional) temperature tuning
+    ggml_tensor * inp_attn_scale = nullptr;
+    if (hparams.f_attn_temp_scale != 0.0f) {
+        inp_attn_scale = build_input_scale(n_tokens);
+    }
+
+    ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    //const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
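+    // f_attention_scale is passed to build_std_attention below, which appears to apply it itself, hence 1.f here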
+    const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : 1.f;
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        auto rope_factors = build_rope_factors(il);
+
+        cur = build_std_attention(gf, inpL, inp_pos, rope_factors, KQ_mask, nullptr, inp_attn_scale, kq_scale, hparams.f_attention_scale, 0, il);
+
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            cb(cur, "last_attn", il);
+            cb(inpSA, "last_ffn_inp", il);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network
+        if (model.layers[il].ffn_gate_inp == nullptr) {
+            // non-MoE
+            cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp,
+                    model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   nullptr,
+                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, nullptr,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf);
+            cb(cur, "ffn_out", il);
+        } else {
+            // MoE branch
+            cur = llm_build_std_moe_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp,
+                    model.layers[il].ffn_gate_inp, nullptr,
+                    model.layers[il].ffn_up_exps, nullptr,
+                    model.layers[il].ffn_gate_exps, nullptr,
+                    model.layers[il].ffn_down_exps, nullptr,
+                    model.layers[il].ffn_exp_probs_b,
+                    nullptr, nullptr, // we don't have shared experts
+                    nullptr, nullptr,
+                    nullptr, nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true, false, 0.0f,
+                    LLM_EXPERT_GATING_FUNC_SOFTMAX,
+                    LLM_FFN_SILU, cb, il, gf);
+        }
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        cur = lctx.cvec.apply_to(ctx0, cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_output(lctx, ctx0, cur, model.output, model.output_norm, cb);
+    cb(cur, "result_output", -1);
+
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
+
 ggml_cgraph * llm_build_context::build_deci() {
     struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

@@ -3815,7 +3918,7 @@ ggml_cgraph * llm_build_context::build_qwen3moe() {
         // cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il);
         // cb(cur, "attn_norm", il);

-        cur = build_std_attention(gf, inpL, inp_pos, nullptr, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), 0.0f, 0, il);
+        cur = build_std_attention(gf, inpL, inp_pos, nullptr, KQ_mask, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), 0.0f, 0, il);

         if (il == n_layer - 1) {
             // skip computing output for unused tokens
@@ -6694,7 +6797,7 @@ ggml_cgraph * llm_build_context::build_glm4_moe() {

         // self-attention
         if (rope_cache == nullptr) {
-            cur = build_std_attention(gf, inpL, inp_pos, nullptr, KQ_mask, nullptr, kq_scale, 0.0f, 0, il);
+            cur = build_std_attention(gf, inpL, inp_pos, nullptr, KQ_mask, nullptr, nullptr, kq_scale, 0.0f, 0, il);
         } else {
             // Pre-attention norm
             cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il);
@@ -9173,6 +9276,10 @@ ggml_cgraph * llm_build_context::llama_build_graph(
             {
                 result = llm.build_smollm3();
             } break;
+        case LLM_ARCH_MISTRAL3:
+            {
+                result = llm.build_mistral3();
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -9193,7 +9300,7 @@ ggml_cgraph * llm_build_context::llama_build_graph(
 }

 ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tensor * input, ggml_tensor * inp_pos, ggml_tensor * rope_factors_in,
-        ggml_tensor * KQ_mask, ggml_tensor * sinks, float KQ_scale, float f_attn_scale, int n_swa, int il) {
+        ggml_tensor * KQ_mask, ggml_tensor * sinks, ggml_tensor * inp_attn_scale, float KQ_scale, float f_attn_scale, int n_swa, int il) {
     if (!model.layers[il].wqkv && !model.layers[il].wqk && cparams.flash_attn &&
         model.layers[il].wq->extra && model.layers[il].wk->extra && model.layers[il].wv->extra && model.layers[il].wo->extra) {
         if (kv_self.k_l[il]->extra && kv_self.v_l[il]->extra) {
@@ -9264,6 +9371,10 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                                  ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Qcur, "Qcur", il_cb);
                 cb(Kcur, "Kcur", il_cb);
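+                // optionally apply per-token attention temperature scaling to Q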
+                if (inp_attn_scale) {
+                    Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                    cb(Qcur, "Qcur_temp_scaled", il_cb);
+                }
                 ggml_build_forward_expand(gf, Qcur);
                 ggml_build_forward_expand(gf, Kcur);
                 ggml_build_forward_expand(gf, Vcur);
@@ -9357,6 +9468,7 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                 ggml_build_forward_expand(gf, cur);
                 attn.push_back(cur);
             }
+            GGML_ASSERT(!attn.empty());
             if (attn.size() == 1) return attn.front();
             auto cur = ggml_add(ctx0, attn[0], attn[1]);
             cb(cur, "combine_attn", il);
@@ -9365,10 +9477,6 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                 cur = ggml_add(ctx0, cur, attn[id]);
                 cb(cur, "combine_attn", il);
             }
-            // TODO: for more than 2 GPUs, do we need to add another forced graph split?
-            //if (attn.size() > 2) {
-            //    cur->op_params[0] = 0xff;
-            //}
             return cur;
         }
     }
@@ -9392,6 +9500,11 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
     cb(Qcur, "Qcur", il);
     cb(Kcur, "Kcur", il);

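+    // optionally apply per-token attention temperature scaling to Q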
+    if (inp_attn_scale) {
+        Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+        cb(Qcur, "Qcur_temp_scaled", il);
+    }
+
     cur = llm_build_kv(ctx0, lctx, kv_self, gf,
             model.layers[il].wo, model.layers[il].bo,
             Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, KQ_scale, cb, il, sinks, n_swa);