@@ -13904,27 +13904,12 @@ struct llm_build_deepseek3_2 : public llm_graph_context {
             ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, Vcur, inp_attn->get_v_idxs(), il));
             ggml_build_forward_expand(gf, inp_attn->get_kq_mask());
 
-            // Optional: DeepSeek V3.2 FP8 K-side KV cache write
+            // Optional: DeepSeek V3.2 FP8 K-side KV cache pack (custom op)
             if (model.kv_fp8_ds32) {
-                const int64_t D_latent = kv_lora_rank;
-                const int64_t D_rope = n_rot;
-                const int64_t D_total = D_latent + D_rope;
-                GGML_UNUSED(D_total);
                 ggml_tensor * k_fp8_in = ggml_concat(ctx0, kv_cmpr, k_pe, 0); // [D_total,1,n_tokens]
-
-                llama_kv_cache::slot_info sinfo_fp8;
-                sinfo_fp8.s0 = 0;
-                sinfo_fp8.s1 = 0;
-                sinfo_fp8.strm = { 0 };
-                sinfo_fp8.idxs = { std::vector<uint32_t>(model.kv_fp8_ds32->get_size()) };
-                for (uint32_t i = 0; i < model.kv_fp8_ds32->get_size(); ++i) {
-                    sinfo_fp8.idxs[0][i] = i;
-                }
-
-                ggml_tensor * k_idxs = inp_attn->get_k_idxs();
-                ggml_build_forward_expand(
-                    gf,
-                    model.kv_fp8_ds32->cpy_k(ctx0, k_fp8_in, k_idxs, il, sinfo_fp8));
+                ggml_tensor * k_idxs = inp_attn->get_k_idxs();
+                ggml_tensor * pack_node = model.kv_fp8_ds32->build_k_pack_node(ctx0, k_fp8_in, k_idxs, il);
+                ggml_build_forward_expand(gf, pack_node);
             }
         }
 
@@ -14060,27 +14045,12 @@ struct llm_build_deepseek3_2 : public llm_graph_context {
             ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, Vcur, inp_attn->get_v_idxs(), il));
             ggml_build_forward_expand(gf, inp_attn->get_kq_mask());
 
-            // Optional: DeepSeek V3.2 FP8 K-side KV cache write
+            // Optional: DeepSeek V3.2 FP8 K-side KV cache pack (custom op)
             if (model.kv_fp8_ds32) {
-                const int64_t D_latent = kv_lora_rank;
-                const int64_t D_rope = n_rot;
-                const int64_t D_total = D_latent + D_rope;
-                GGML_UNUSED(D_total);
                 ggml_tensor * k_fp8_in = ggml_concat(ctx0, kv_cmpr, k_pe, 0); // [D_total,1,n_tokens]
-
-                llama_kv_cache::slot_info sinfo_fp8;
-                sinfo_fp8.s0 = 0;
-                sinfo_fp8.s1 = 0;
-                sinfo_fp8.strm = { 0 };
-                sinfo_fp8.idxs = { std::vector<uint32_t>(model.kv_fp8_ds32->get_size()) };
-                for (uint32_t i = 0; i < model.kv_fp8_ds32->get_size(); ++i) {
-                    sinfo_fp8.idxs[0][i] = i;
-                }
-
-                ggml_tensor * k_idxs = inp_attn->get_k_idxs();
-                ggml_build_forward_expand(
-                    gf,
-                    model.kv_fp8_ds32->cpy_k(ctx0, k_fp8_in, k_idxs, il, sinfo_fp8));
+                ggml_tensor * k_idxs = inp_attn->get_k_idxs();
+                ggml_tensor * pack_node = model.kv_fp8_ds32->build_k_pack_node(ctx0, k_fp8_in, k_idxs, il);
+                ggml_build_forward_expand(gf, pack_node);
             }
         }
 
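In the new form the graph builder no longer constructs a `slot_info` by hand (the identity `sinfo_fp8.idxs` loop removed above): it concatenates the latent and rope parts, fetches the same K index tensor the unified cache already uses, and expands a single pack node that `model.kv_fp8_ds32` builds itself. For orientation only, below is a minimal CPU-side sketch of how such a `build_k_pack_node`-style helper could be wired up with ggml's generic custom-op API (`ggml_map_custom2`). The class name, the `pack_k_fp8` callback, the `layer_ctx_t` userdata, and the FP8 conversion stub are assumptions made for illustration; they are not the implementation behind `kv_fp8_ds32`.

```cpp
// Minimal CPU-only sketch, assuming ggml's generic custom-op API (ggml_map_custom2).
// All names here (llama_kv_fp8_ds32_sketch, pack_k_fp8, layer_ctx_t, fp32_to_fp8_stub)
// are hypothetical and do NOT mirror the PR's actual implementation.
#include "ggml.h"

#include <cstdint>
#include <cstring>
#include <vector>

struct llama_kv_fp8_ds32_sketch {
    // one FP8 buffer per layer: n_ctx cells, each d_total bytes (1 byte per element)
    std::vector<std::vector<uint8_t>> buf;
    int64_t d_total = 0;

    // per-layer userdata handed to the custom op; must outlive graph compute
    struct layer_ctx_t { llama_kv_fp8_ds32_sketch * self; int il; };
    std::vector<layer_ctx_t> layer_ctx;

    void init(int n_layer, int64_t n_ctx, int64_t d_total_in) {
        d_total = d_total_in;
        buf.assign(n_layer, std::vector<uint8_t>(n_ctx*d_total_in));
        layer_ctx.assign(n_layer, {});
    }

    // graph-build time: return one node whose evaluation packs k_cur into the FP8 buffer
    ggml_tensor * build_k_pack_node(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int il) {
        // k_cur  : F32 [d_total, 1, n_tokens], kv_cmpr concatenated with k_pe
        // k_idxs : I64 [n_tokens], destination cell indices from the KV cache
        layer_ctx[il] = { this, il };
        return ggml_map_custom2(ctx, k_cur, k_idxs, pack_k_fp8, GGML_N_TASKS_MAX, &layer_ctx[il]);
    }

    // compute time: quantize each token row and scatter it to its cache cell
    static void pack_k_fp8(ggml_tensor * dst, const ggml_tensor * a, const ggml_tensor * b,
                           int ith, int nth, void * userdata) {
        const auto * lc   = (const layer_ctx_t *) userdata;
        auto       * self = lc->self;

        const int64_t d_total  = a->ne[0];
        const int64_t n_tokens = a->ne[2];

        for (int64_t t = ith; t < n_tokens; t += nth) { // split tokens over threads
            const float   * src  = (const float   *)((const char *) a->data + t*a->nb[2]);
            const int64_t   cell = ((const int64_t *) b->data)[t];
            uint8_t       * out  = self->buf[lc->il].data() + cell*d_total;
            for (int64_t i = 0; i < d_total; ++i) {
                out[i] = fp32_to_fp8_stub(src[i]);
            }
        }

        GGML_UNUSED(dst); // dst is only a graph sink; the pack happens as a side effect
    }

    // placeholder: a real FP8 path would do proper E4M3 rounding/saturation, likely
    // in a backend kernel rather than on the host
    static uint8_t fp32_to_fp8_stub(float x) {
        uint32_t u;
        std::memcpy(&u, &x, sizeof(u));
        return (uint8_t)(u >> 24); // top byte of the FP32 bits; NOT a real FP8 encoding
    }
};
```

The design point the sketch tries to capture is that all per-cell addressing moves to compute time inside the custom op, so the graph builder only wires tensors together, in the same spirit as the `cpy_v` call on `mctx_cur` a few lines earlier in the diff.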