Commit d70a752

Merge branch 'ikawrakow:main' into main
2 parents: f54d377 + 0896171

10 files changed (+214 −32 lines)


ggml/src/ggml-cuda/binbcast.cu (1 addition, 3 deletions)

@@ -360,9 +360,7 @@ static void ggml_cuda_op_scale_tensor(ggml_backend_cuda_context & ctx, ggml_tens
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    float scale;
-    memcpy(&scale, dst->src[1]->data, sizeof(float));
+    GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32);
 
     scale_f32_cuda_l(src0_d, dst_d, dst->src[1]->data, ggml_nelements(src0), stream);
 }
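The removed lines copied the scale value into a host-side float that the remaining code never used, since scale_f32_cuda_l already receives the scale tensor's data pointer directly; the replacement only asserts that the scale tensor really is F32. A minimal standalone sketch of that pattern, using hypothetical stand-ins rather than the real ggml/CUDA types:

    #include <cassert>
    #include <cstdio>

    enum class tensor_type { F32, F16 };

    struct tensor {                 // hypothetical stand-in for ggml_tensor
        tensor_type type;
        void *      data;
    };

    // hypothetical stand-in for scale_f32_cuda_l: it takes the scale as a pointer,
    // so the value can stay wherever the tensor's data lives (device memory in CUDA)
    static void scale_launcher(const float * src, float * dst, const void * scale_ptr, int n) {
        const float s = *static_cast<const float *>(scale_ptr);
        for (int i = 0; i < n; ++i) dst[i] = src[i] * s;
    }

    int main() {
        float scale_val = 2.0f;
        float src[3] = {1.0f, 2.0f, 3.0f};
        float dst[3] = {};
        tensor scale{tensor_type::F32, &scale_val};

        assert(scale.type == tensor_type::F32);   // mirrors the added GGML_ASSERT
        scale_launcher(src, dst, scale.data, 3);  // no host-side copy of the value
        std::printf("%g %g %g\n", dst[0], dst[1], dst[2]);
        return 0;
    }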

src/llama-arch.cpp (2 additions, 0 deletions)

@@ -68,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_BAILINGMOE2, "bailingmoe2" },
     { LLM_ARCH_MINIMAX_M2, "minimax-m2" },
     { LLM_ARCH_SMOLLM3, "smollm3" },
+    { LLM_ARCH_MISTRAL3, "mistral3" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 

@@ -142,6 +143,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
     { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
     { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
+    { LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
 

src/llama-arch.h (2 additions, 0 deletions)

@@ -67,6 +67,7 @@ enum llm_arch {
     LLM_ARCH_BAILINGMOE2,
     LLM_ARCH_MINIMAX_M2,
     LLM_ARCH_SMOLLM3,
+    LLM_ARCH_MISTRAL3,
     LLM_ARCH_UNKNOWN,
 };
 

@@ -135,6 +136,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_OUTPUT_SCALE,
     LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
+    LLM_KV_ATTENTION_TEMPERATURE_SCALE,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
 
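Both files follow the usual registration pattern for a new architecture and a new GGUF key: an enum entry plus a matching string in the corresponding name table. A trimmed-down, hypothetical sketch of that enum-to-name mapping (not the real llama.cpp tables or lookup helpers):

    #include <cstdio>
    #include <map>
    #include <string>

    enum llm_arch {                       // reduced stand-in for the real enum
        LLM_ARCH_SMOLLM3,
        LLM_ARCH_MISTRAL3,                // new entry, added just before UNKNOWN
        LLM_ARCH_UNKNOWN,
    };

    static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
        { LLM_ARCH_SMOLLM3,  "smollm3"   },
        { LLM_ARCH_MISTRAL3, "mistral3"  },
        { LLM_ARCH_UNKNOWN,  "(unknown)" },
    };

    // hypothetical reverse lookup, as a loader would do when reading the arch name
    static llm_arch arch_from_string(const std::string & name) {
        for (const auto & kv : LLM_ARCH_NAMES) {
            if (name == kv.second) return kv.first;
        }
        return LLM_ARCH_UNKNOWN;
    }

    int main() {
        std::printf("%s\n", LLM_ARCH_NAMES.at(LLM_ARCH_MISTRAL3));              // mistral3
        std::printf("%d\n", arch_from_string("mistral3") == LLM_ARCH_MISTRAL3); // 1
        return 0;
    }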

src/llama-build-context.cpp (140 additions, 27 deletions)

@@ -653,11 +653,12 @@ ggml_tensor * llm_build_context::llm_build_ffn(
         auto split_u = u->splits[id];
         auto split_g = g->splits[id];
         auto split_d = d->splits[id];
-        GGML_ASSERT((!split_u && !split_g && split_d) || (split_u && split_g && split_d));
+        GGML_ASSERT((!split_u && !split_g && !split_d) || (split_u && split_g && split_d));
         if (!split_u) continue;
         auto cur = input;
         if (ffn_norm && ffn_norm->extra) {
             auto norm = (ggml_split_tensor_t *)ffn_norm->extra;
+            GGML_ASSERT(norm->splits[id]);
             cur = llm_build_norm(ctx, input, lctx.model.hparams, norm->splits[id], NULL, LLM_NORM_RMS, cb, il);
             cb(cur, "ffn_inp_normed", il_cb);
         }

@@ -1088,6 +1089,7 @@ llm_expert_gating_func_type gating_op,
     auto cur = input;
     if (ffn_norm) {
         auto the_ffn_norm = ffn_norm->extra ? ((ggml_split_tensor_t *)ffn_norm->extra)->splits[lctx.model.main_gpu] : ffn_norm;
+        GGML_ASSERT(the_ffn_norm);
         cur = llm_build_norm(ctx, input, lctx.model.hparams, the_ffn_norm, nullptr, LLM_NORM_RMS, cb, il);
         cb(cur, "ffn_inp_normed", il);
     }

@@ -1109,17 +1111,18 @@ llm_expert_gating_func_type gating_op,
                                          gating_op, cb, il, graph);
     cb(routed_out, "routed_out", il);
     ggml_build_forward_expand(graph, routed_out);
-    //printf("Using non-split llm_build_moe_ffn for layer %d. n_before = %d, n_now = %d\n", il, n_before, graph->n_nodes);
 
     if (up_shexp && gate_shexp && down_shexp) {
         if (split_up_shexp) {
-            //printf("Using split ffn for shared experts in layer %d\n", il);
-            std::vector<ggml_tensor *> results(split_up_shexp->n_device);
+            std::vector<ggml_tensor *> results; results.reserve(split_up_shexp->n_device);
             GGML_ASSERT(!split_up_b_shexp || split_up_b_shexp->n_device == split_up_shexp->n_device);
             GGML_ASSERT(!split_gate_b_shexp || split_gate_b_shexp->n_device == split_up_shexp->n_device);
             GGML_ASSERT(!split_down_b_shexp || split_down_b_shexp->n_device == split_up_shexp->n_device);
             for (int id = 0; id < split_up_shexp->n_device; ++id) {
                 int il_cb = 1000*id + il;
+                GGML_ASSERT((split_up_shexp->splits[id] && split_gate_shexp->splits[id] && split_down_shexp->splits[id]) ||
+                            (!split_up_shexp->splits[id] && !split_gate_shexp->splits[id] && !split_down_shexp->splits[id]));
+                if (!split_up_shexp->splits[id]) continue;
                 auto the_ffn_norm = ffn_norm ? ffn_norm->extra ? ((ggml_split_tensor_t *)ffn_norm->extra)->splits[id] : ffn_norm : nullptr;
                 auto shared_out = llm_build_ffn(ctx, lctx, the_ffn_norm, input,
                         split_up_shexp->splits[id], split_up_b_shexp ? split_up_b_shexp->splits[id] : nullptr, nullptr,
@@ -1130,17 +1133,19 @@ llm_expert_gating_func_type gating_op,
                 if (shared_out->ne[1] > 32) {
                     shared_out = ggml_cast(ctx, shared_out, GGML_TYPE_F16);
                 }
-                results[id] = shared_out;
+                results.push_back(shared_out);
             }
-            cur = ggml_add(ctx, results[0], results[1]);
-            if (cur->ne[1] > 32) {
-                // Force a graph split
+            GGML_ASSERT(!results.empty());
+            if (results.size() == 1) {
+                cur = results.front();
+            } else {
+                cur = ggml_add(ctx, results[0], results[1]);
                 cur->op_params[0] = 0xff;
-            }
-            cb(cur, "ffn_shared_combined", il);
-            for (int id = 2; id < int(results.size()); ++id) {
-                cur = ggml_add(ctx, cur, results[id]);
                 cb(cur, "ffn_shared_combined", il);
+                for (int id = 2; id < int(results.size()); ++id) {
+                    cur = ggml_add(ctx, cur, results[id]);
+                    cb(cur, "ffn_shared_combined", il);
+                }
             }
             if (routed_out->ne[1] > 32) {
                 auto routed_out_f16 = ggml_cast(ctx, routed_out, GGML_TYPE_F16);

@@ -1150,7 +1155,6 @@ llm_expert_gating_func_type gating_op,
             }
             cb(cur, "ffn_out", il);
         } else {
-            //printf("Using non-split ffn for shared experts in layer %d\n", il);
             auto shared_out = llm_build_ffn(ctx, lctx, nullptr, cur,
                     up_shexp, up_b_shexp, nullptr,
                     gate_shexp, gate_b_shexp, nullptr,

@@ -1170,14 +1174,17 @@ llm_expert_gating_func_type gating_op,
     }
     GGML_ASSERT(split_up_exps && split_gate_exps && split_down_exps);
     GGML_ASSERT(split_up_exps->n_device == split_gate_exps->n_device && split_up_exps->n_device == split_down_exps->n_device);
-    std::vector<ggml_tensor *> results(split_up_exps->n_device);
+    std::vector<ggml_tensor *> results; results.reserve(split_up_exps->n_device);
     GGML_ASSERT((!split_up_shexp && !split_gate_shexp && !split_down_shexp) ||
                 ( split_up_shexp && split_gate_shexp && split_down_shexp));
     auto split_gate_inp = (ggml_split_tensor_t *)gate_inp->extra;
     GGML_ASSERT(split_gate_inp && split_gate_inp->n_device == split_up_exps->n_device);
     auto split_exp_probs_b = exp_probs_b ? (ggml_split_tensor_t *)exp_probs_b->extra : nullptr;
     GGML_ASSERT(!split_exp_probs_b || split_exp_probs_b->n_device == split_up_exps->n_device);
     for (int id = 0; id < split_up_exps->n_device; ++id) {
+        GGML_ASSERT((split_up_exps->splits[id] && split_gate_exps->splits[id] && split_down_exps->splits[id]) ||
+                    (!split_up_exps->splits[id] && !split_gate_exps->splits[id] && !split_down_exps->splits[id]));
+        if (!split_up_exps->splits[id]) continue;
         int il_cb = 1000*(id + 1) + il;
         auto cur = input;
         if (ffn_norm) {

@@ -1220,8 +1227,9 @@ llm_expert_gating_func_type gating_op,
             cur = ggml_cast(ctx, cur, GGML_TYPE_F16);
             cb(cur, "ffn_out_f16", il_cb);
         }
-        results[id] = cur;
+        results.push_back(cur);
     }
+    GGML_ASSERT(!results.empty());
     if (results.size() == 1) return results.front();
 
     auto cur = ggml_add(ctx, results[0], results[1]);
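These hunks change the per-device result handling: the vector is now filled with reserve()/push_back() so devices whose split is absent are simply skipped, and the combine step asserts the vector is non-empty and avoids the add when only one device contributed. A standalone sketch of that reduction pattern, with plain floats standing in for the ggml tensors:

    #include <cassert>
    #include <cstdio>
    #include <vector>

    // combine per-device partial results; hypothetical stand-in where the real code
    // sums ggml tensors with ggml_add and skips devices that hold no split
    static float combine_partials(const std::vector<float *> & splits) {
        std::vector<float> results;
        results.reserve(splits.size());
        for (float * s : splits) {
            if (!s) continue;            // this device has no part of the tensor
            results.push_back(*s);
        }
        assert(!results.empty());        // at least one device must contribute
        if (results.size() == 1) return results.front();
        float cur = results[0] + results[1];
        for (size_t id = 2; id < results.size(); ++id) {
            cur += results[id];
        }
        return cur;
    }

    int main() {
        float a = 1.0f, c = 3.0f;
        std::vector<float *> splits = { &a, nullptr, &c };   // device 1 holds nothing
        std::printf("%g\n", combine_partials(splits));       // prints 4
        return 0;
    }
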
@@ -1660,10 +1668,15 @@ static ggml_tensor * build_output(llama_context & lctx, ggml_context * ctx, ggml
             }
             cb(o.back(), "output", id);
         }
-        if (o.size() == 1) cur = o.front();
-        cur = ggml_concat(ctx, o[0], o[1], 0);
-        for (int id = 2; id < int(o.size()); ++id) {
-            cur = ggml_concat(ctx, cur, o[id], 0);
+        GGML_ASSERT(!o.empty());
+        if (o.size() == 1) {
+            cur = o.front();
+        }
+        else {
+            cur = ggml_concat(ctx, o[0], o[1], 0);
+            for (int id = 2; id < int(o.size()); ++id) {
+                cur = ggml_concat(ctx, cur, o[id], 0);
+            }
         }
     } else {
         if (output_norm) {

@@ -1721,7 +1734,7 @@ ggml_cgraph * llm_build_context::build_llama() {
 
         // self-attention
         if (use_rope) {
-            cur = build_std_attention(gf, inpL, inp_pos, nullptr, this_KQ_mask, nullptr, kq_scale, hparams.f_attention_scale, this_n_swa, il);
+            cur = build_std_attention(gf, inpL, inp_pos, nullptr, this_KQ_mask, nullptr, nullptr, kq_scale, hparams.f_attention_scale, this_n_swa, il);
         }
         else {
 
@@ -1880,6 +1893,96 @@ ggml_cgraph * llm_build_context::build_llama() {
     return gf;
 }
 
+ggml_cgraph * llm_build_context::build_mistral3() {
+    auto gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+    // inp_pos - contains the positions
+    struct ggml_tensor * inp_pos = build_inp_pos();
+
+    // (optional) temperature tuning
+    ggml_tensor * inp_attn_scale = nullptr;
+    if (hparams.f_attn_temp_scale != 0.0f) {
+        inp_attn_scale = build_input_scale(n_tokens);
+    }
+
+    ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    //const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+    const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : 1.f;
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        auto rope_factors = build_rope_factors(il);
+
+        cur = build_std_attention(gf, inpL, inp_pos, rope_factors, KQ_mask, nullptr, inp_attn_scale, kq_scale, hparams.f_attention_scale, 0, il);
+
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            cb(cur, "last_attn", il);
+            cb(inpSA, "last_ffn_inp", il);
+        }
+
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network (non-MoE)
+        if (model.layers[il].ffn_gate_inp == nullptr) {
+            // non-MoE
+            cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp,
+                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr,
+                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, nullptr,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf);
+            cb(cur, "ffn_out", il);
+        } else {
+            // MoE branch
+            cur = llm_build_std_moe_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp,
+                    model.layers[il].ffn_gate_inp, nullptr,
+                    model.layers[il].ffn_up_exps, nullptr,
+                    model.layers[il].ffn_gate_exps, nullptr,
+                    model.layers[il].ffn_down_exps, nullptr,
+                    model.layers[il].ffn_exp_probs_b,
+                    nullptr, nullptr, // we don't have shared experts
+                    nullptr, nullptr,
+                    nullptr, nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true, false, 0.0f,
+                    LLM_EXPERT_GATING_FUNC_SOFTMAX,
+                    LLM_FFN_SILU, cb, il, gf);
+        }
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        cur = lctx.cvec.apply_to(ctx0, cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_output(lctx, ctx0, cur, model.output, model.output_norm, cb);
+    cb(cur, "result_output", -1);
+
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
+
 ggml_cgraph * llm_build_context::build_deci() {
     struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
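For MoE layers, the new build_mistral3() goes through llm_build_std_moe_ffn with LLM_EXPERT_GATING_FUNC_SOFTMAX, n_expert experts, n_expert_used active per token, and no shared experts. As a rough illustration only (not the ggml implementation, and ignoring flags not shown here), one common variant of softmax top-k routing for a single token, with the weights renormalised over the selected experts, looks like this:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <utility>
    #include <vector>

    // pick the top-k experts for one token from router logits and return
    // softmax weights renormalised over the selected experts
    static std::vector<std::pair<int, float>> route_token(const std::vector<float> & logits, int k) {
        std::vector<int> order(logits.size());
        for (size_t i = 0; i < order.size(); ++i) order[i] = (int) i;
        std::partial_sort(order.begin(), order.begin() + k, order.end(),
                          [&](int a, int b) { return logits[a] > logits[b]; });

        std::vector<std::pair<int, float>> picked;
        float sum = 0.0f;
        for (int i = 0; i < k; ++i) {
            float w = std::exp(logits[order[i]]);
            picked.push_back({order[i], w});
            sum += w;
        }
        for (auto & p : picked) p.second /= sum;   // weights over the k selected experts sum to 1
        return picked;
    }

    int main() {
        std::vector<float> logits = {0.1f, 2.0f, -1.0f, 1.5f};  // 4 experts, route to 2
        for (const auto & p : route_token(logits, 2)) {
            std::printf("expert %d weight %.3f\n", p.first, p.second);
        }
        return 0;
    }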

@@ -3815,7 +3918,7 @@ ggml_cgraph * llm_build_context::build_qwen3moe() {
         //cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il);
         //cb(cur, "attn_norm", il);
 
-        cur = build_std_attention(gf, inpL, inp_pos, nullptr, KQ_mask, nullptr, 1.0f/sqrtf(float(n_embd_head)), 0.0f, 0, il);
+        cur = build_std_attention(gf, inpL, inp_pos, nullptr, KQ_mask, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), 0.0f, 0, il);
 
         if (il == n_layer - 1) {
             // skip computing output for unused tokens

@@ -6694,7 +6797,7 @@ ggml_cgraph * llm_build_context::build_glm4_moe() {
 
         // self-attention
         if (rope_cache == nullptr) {
-            cur = build_std_attention(gf, inpL, inp_pos, nullptr, KQ_mask, nullptr, kq_scale, 0.0f, 0, il);
+            cur = build_std_attention(gf, inpL, inp_pos, nullptr, KQ_mask, nullptr, nullptr, kq_scale, 0.0f, 0, il);
         } else {
             // Pre-attention norm
             cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il);

@@ -9173,6 +9276,10 @@ ggml_cgraph * llm_build_context::llama_build_graph(
             {
                 result = llm.build_smollm3();
             } break;
+        case LLM_ARCH_MISTRAL3:
+            {
+                result = llm.build_mistral3();
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -9193,7 +9300,7 @@ ggml_cgraph * llm_build_context::llama_build_graph(
 }
 
 ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tensor * input, ggml_tensor * inp_pos, ggml_tensor * rope_factors_in,
-        ggml_tensor * KQ_mask, ggml_tensor * sinks, float KQ_scale, float f_attn_scale, int n_swa, int il) {
+        ggml_tensor * KQ_mask, ggml_tensor * sinks, ggml_tensor * inp_attn_scale, float KQ_scale, float f_attn_scale, int n_swa, int il) {
     if (!model.layers[il].wqkv && !model.layers[il].wqk && cparams.flash_attn &&
         model.layers[il].wq->extra && model.layers[il].wk->extra && model.layers[il].wv->extra && model.layers[il].wo->extra) {
         if (kv_self.k_l[il]->extra && kv_self.v_l[il]->extra) {

@@ -9264,6 +9371,10 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                                 ext_factor, attn_factor, beta_fast, beta_slow);
             cb(Qcur, "Qcur", il_cb);
             cb(Kcur, "Kcur", il_cb);
+            if (inp_attn_scale) {
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_temp_scaled", il_cb);
+            }
             ggml_build_forward_expand(gf, Qcur);
             ggml_build_forward_expand(gf, Kcur);
             ggml_build_forward_expand(gf, Vcur);

@@ -9357,6 +9468,7 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
             ggml_build_forward_expand(gf, cur);
             attn.push_back(cur);
         }
+        GGML_ASSERT(!attn.empty());
         if (attn.size() == 1) return attn.front();
         auto cur = ggml_add(ctx0, attn[0], attn[1]);
         cb(cur, "combine_attn", il);

@@ -9365,10 +9477,6 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
             cur = ggml_add(ctx0, cur, attn[id]);
             cb(cur, "combine_attn", il);
         }
-        // TODO: for more than 2 GPUs, do we need to add another forced graph split?
-        //if (attn.size() > 2) {
-        //    cur->op_params[0] = 0xff;
-        //}
         return cur;
     }
 }

@@ -9392,6 +9500,11 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
     cb(Qcur, "Qcur", il);
     cb(Kcur, "Kcur", il);
 
+    if (inp_attn_scale) {
+        Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+        cb(Qcur, "Qcur_temp_scaled", il);
+    }
+
     cur = llm_build_kv(ctx0, lctx, kv_self, gf,
             model.layers[il].wo, model.layers[il].bo,
             Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, KQ_scale, cb, il, sinks, n_swa);
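The extra inp_attn_scale parameter threads the optional temperature-tuning input through build_std_attention; when present it is multiplied into Qcur after RoPE (the Qcur_temp_scaled callback above). Scaling a token's query vector by s scales all of that token's attention logits by s, since (s*q)·k = s*(q·k); a tiny numeric check of that identity:

    #include <cstdio>

    // dot product of two small vectors
    static float dot(const float * a, const float * b, int n) {
        float acc = 0.0f;
        for (int i = 0; i < n; ++i) acc += a[i] * b[i];
        return acc;
    }

    int main() {
        const float q[4] = {0.5f, -1.0f, 2.0f, 0.25f};
        const float k[4] = {1.0f,  0.5f, -0.5f, 2.0f};
        const float s    = 0.7f;                     // per-token temperature scale

        float q_scaled[4];
        for (int i = 0; i < 4; ++i) q_scaled[i] = s * q[i];

        // scaling Q first gives the same logit as scaling the raw score afterwards
        std::printf("%f\n", dot(q_scaled, k, 4));    // (s*q)·k
        std::printf("%f\n", s * dot(q, k, 4));       // s*(q·k)
        return 0;
    }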

src/llama-build-context.h (3 additions, 1 deletion)

@@ -160,6 +160,8 @@ struct llm_build_context {
 
     ggml_cgraph * build_llama();
 
+    ggml_cgraph * build_mistral3();
+
     ggml_cgraph * build_deci();
 
     ggml_cgraph * build_baichuan();

@@ -405,6 +407,6 @@ llm_expert_gating_func_type gating_op,
     static ggml_cgraph * llama_build_graph(llama_context & lctx, const llama_batch & batch, bool worst_case);
 
     ggml_tensor * build_std_attention(ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * inp_pos, ggml_tensor * rope_factors,
-            ggml_tensor * KQ_mask, ggml_tensor * sinks, float KQ_scale, float f_attn_scale, int n_swa, int il);
+            ggml_tensor * KQ_mask, ggml_tensor * sinks, ggml_tensor * inp_attn_scale, float KQ_scale, float f_attn_scale, int n_swa, int il);
 
 };
