
Commit e952efa

Add the FP8 pack custom op hook and replace the unsafe pointer-based wiring.
1 parent 5112464 commit e952efa

File tree: 3 files changed, +64 / -38 lines

src/llama-kv-cache-fp8.cpp

Lines changed: 54 additions & 0 deletions
@@ -769,6 +769,60 @@ bool llama_kv_cache_fp8::state_read_data(llama_io_read_i & io, uint32_t strm, ui
     return false;
 }
 
+
+
+struct kv_dsmla_pack_userdata {
+    int32_t il;
+    int32_t kv_size;
+    int32_t n_stream;
+};
+
+static void kv_dsmla_pack_custom(ggml_tensor * dst, int ith, int nth, void * userdata) {
+    GGML_UNUSED(dst);
+    GGML_UNUSED(ith);
+    GGML_UNUSED(nth);
+    GGML_UNUSED(userdata);
+    // CPU stub: real work is performed in CUDA backend via specialized handler.
+}
+
+ggml_tensor * llama_kv_cache_fp8::build_k_pack_node(
+        ggml_context * ctx,
+        ggml_tensor * k_latent_rope,
+        ggml_tensor * k_idxs,
+        int32_t il) const {
+    GGML_ASSERT(ctx != nullptr);
+    GGML_ASSERT(k_latent_rope != nullptr);
+    GGML_ASSERT(k_idxs != nullptr);
+    GGML_ASSERT(k_idxs->type == GGML_TYPE_I64);
+
+    if (model.arch != LLM_ARCH_DEEPSEEK3_2) {
+        return k_latent_rope;
+    }
+
+    const kv_layer_fp8 * lyr = get_layer(il);
+    if (lyr == nullptr || lyr->k_blob == nullptr) {
+        return k_latent_rope;
+    }
+
+    kv_dsmla_pack_userdata * ud = new kv_dsmla_pack_userdata;
+    ud->il = il;
+    ud->kv_size = (int32_t) get_size();
+    ud->n_stream = (int32_t) get_n_stream();
+
+    ggml_tensor * args[3] = { k_latent_rope, k_idxs, lyr->k_blob };
+    ggml_tensor * node = ggml_custom_4d(
+        ctx,
+        GGML_TYPE_F32,
+        1, 1, 1, 1,
+        args,
+        3,
+        kv_dsmla_pack_custom,
+        GGML_N_TASKS_MAX,
+        ud);
+
+    return node;
+}
+
 // Accessors for K/V are left unimplemented for now since the FP8 cache
 // is not yet used in any graph. They will be filled in when wiring the
 // cache to DeepSeek V3.2.
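
The CPU callback above is intentionally a stub. As a rough illustration only: ggml_custom_4d() records the args array in dst->src[] and hands userdata back to the callback, so a non-stub handler (on the CPU or in a backend) could recover its inputs as sketched below. The example function name, the thread partitioning, and the FP8 write are placeholders; the actual k_blob packing layout is not defined by this commit.

// Illustrative sketch only: how a non-stub handler could unpack its inputs.
// Reuses the kv_dsmla_pack_userdata struct added above; the FP8 write itself
// is left as a placeholder because the blob layout is not part of this commit.
#include <algorithm>   // std::min
#include <cstdint>
#include "ggml.h"

static void kv_dsmla_pack_example(ggml_tensor * dst, int ith, int nth, void * userdata) {
    const auto * ud = static_cast<const kv_dsmla_pack_userdata *>(userdata);

    // ggml_custom_4d() stores the args array in dst->src[], in order
    const ggml_tensor * k_latent_rope = dst->src[0]; // [D_latent + D_rope, 1, n_tokens], F32
    const ggml_tensor * k_idxs        = dst->src[1]; // [n_tokens], I64 destination slots
    ggml_tensor       * k_blob        = dst->src[2]; // per-layer FP8 blob (layer ud->il)

    const int64_t n_tokens = k_latent_rope->ne[2];

    // split the tokens across nth threads; this thread handles [t0, t1)
    const int64_t per_thread = (n_tokens + nth - 1) / nth;
    const int64_t t0 = ith * per_thread;
    const int64_t t1 = std::min(n_tokens, t0 + per_thread);

    for (int64_t t = t0; t < t1; ++t) {
        const int64_t slot = ((const int64_t *) k_idxs->data)[t]; // assumes host-visible data
        GGML_ASSERT(slot >= 0 && slot < ud->kv_size);

        // ... quantize the row of k_latent_rope for token t to FP8 and write it
        //     at `slot` inside k_blob (scale/layout depend on the cache design) ...
        GGML_UNUSED(k_blob);
    }
}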

src/llama-kv-cache-fp8.h

Lines changed: 2 additions & 0 deletions
@@ -80,6 +80,8 @@ class llama_kv_cache_fp8 : public llama_memory_i {
     void set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
     void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
 
+    ggml_tensor * build_k_pack_node(ggml_context * ctx, ggml_tensor * k_latent_rope, ggml_tensor * k_idxs, int32_t il) const;
+
 private:
     const llama_model & model;
     const llama_hparams & hparams;

src/llama-model.cpp

Lines changed: 8 additions & 38 deletions
@@ -13904,27 +13904,12 @@ struct llm_build_deepseek3_2 : public llm_graph_context {
         ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, Vcur, inp_attn->get_v_idxs(), il));
         ggml_build_forward_expand(gf, inp_attn->get_kq_mask());
 
-        // Optional: DeepSeek V3.2 FP8 K-side KV cache write
+        // Optional: DeepSeek V3.2 FP8 K-side KV cache pack (custom op)
         if (model.kv_fp8_ds32) {
-            const int64_t D_latent = kv_lora_rank;
-            const int64_t D_rope = n_rot;
-            const int64_t D_total = D_latent + D_rope;
-            GGML_UNUSED(D_total);
             ggml_tensor * k_fp8_in = ggml_concat(ctx0, kv_cmpr, k_pe, 0); // [D_total,1,n_tokens]
-
-            llama_kv_cache::slot_info sinfo_fp8;
-            sinfo_fp8.s0 = 0;
-            sinfo_fp8.s1 = 0;
-            sinfo_fp8.strm = { 0 };
-            sinfo_fp8.idxs = { std::vector<uint32_t>(model.kv_fp8_ds32->get_size()) };
-            for (uint32_t i = 0; i < model.kv_fp8_ds32->get_size(); ++i) {
-                sinfo_fp8.idxs[0][i] = i;
-            }
-
-            ggml_tensor * k_idxs = inp_attn->get_k_idxs();
-            ggml_build_forward_expand(
-                gf,
-                model.kv_fp8_ds32->cpy_k(ctx0, k_fp8_in, k_idxs, il, sinfo_fp8));
+            ggml_tensor * k_idxs = inp_attn->get_k_idxs();
+            ggml_tensor * pack_node = model.kv_fp8_ds32->build_k_pack_node(ctx0, k_fp8_in, k_idxs, il);
+            ggml_build_forward_expand(gf, pack_node);
         }
     }

@@ -14060,27 +14045,12 @@ struct llm_build_deepseek3_2 : public llm_graph_context {
         ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, Vcur, inp_attn->get_v_idxs(), il));
         ggml_build_forward_expand(gf, inp_attn->get_kq_mask());
 
-        // Optional: DeepSeek V3.2 FP8 K-side KV cache write
+        // Optional: DeepSeek V3.2 FP8 K-side KV cache pack (custom op)
         if (model.kv_fp8_ds32) {
-            const int64_t D_latent = kv_lora_rank;
-            const int64_t D_rope = n_rot;
-            const int64_t D_total = D_latent + D_rope;
-            GGML_UNUSED(D_total);
             ggml_tensor * k_fp8_in = ggml_concat(ctx0, kv_cmpr, k_pe, 0); // [D_total,1,n_tokens]
-
-            llama_kv_cache::slot_info sinfo_fp8;
-            sinfo_fp8.s0 = 0;
-            sinfo_fp8.s1 = 0;
-            sinfo_fp8.strm = { 0 };
-            sinfo_fp8.idxs = { std::vector<uint32_t>(model.kv_fp8_ds32->get_size()) };
-            for (uint32_t i = 0; i < model.kv_fp8_ds32->get_size(); ++i) {
-                sinfo_fp8.idxs[0][i] = i;
-            }
-
-            ggml_tensor * k_idxs = inp_attn->get_k_idxs();
-            ggml_build_forward_expand(
-                gf,
-                model.kv_fp8_ds32->cpy_k(ctx0, k_fp8_in, k_idxs, il, sinfo_fp8));
+            ggml_tensor * k_idxs = inp_attn->get_k_idxs();
+            ggml_tensor * pack_node = model.kv_fp8_ds32->build_k_pack_node(ctx0, k_fp8_in, k_idxs, il);
+            ggml_build_forward_expand(gf, pack_node);
         }
     }
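
For reference, the hook pattern used in this wiring (a one-element dummy output whose only purpose is to pull a side-effecting callback into the graph) can be exercised in isolation roughly as follows. This is a sketch under assumptions: the tensor shapes, the pack_userdata struct, and the printf body are made up for illustration, and ggml-cpu.h is assumed to provide ggml_graph_compute_with_ctx in current ggml builds.

// Standalone sketch of the same ggml_custom_4d() hook pattern (ggml.h / ggml-cpu.h
// assumed; tensor shapes and names here are illustrative only).
#include <cstdio>
#include "ggml.h"
#include "ggml-cpu.h"   // ggml_graph_compute_with_ctx in recent ggml builds

struct pack_userdata {
    int32_t il; // layer index, mirroring the commit's kv_dsmla_pack_userdata
};

static void pack_custom(ggml_tensor * dst, int ith, int nth, void * userdata) {
    // the args passed to ggml_custom_4d() arrive as dst->src[0..n_args-1]
    const ggml_tensor * src = dst->src[0];
    const auto * ud = static_cast<const pack_userdata *>(userdata);
    if (ith == 0) {
        std::printf("pack layer %d: %lld input elements, %d threads\n",
                    ud->il, (long long) ggml_nelements(src), nth);
    }
}

int main() {
    ggml_init_params params = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ false };
    ggml_context * ctx = ggml_init(params);

    // stand-in for k_fp8_in: [D_total, 1, n_tokens]
    ggml_tensor * k_in = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 576, 1, 8);

    pack_userdata ud = { /*.il =*/ 0 };
    ggml_tensor * args[1] = { k_in };

    // one-element dummy output: the node exists only for its side effect
    ggml_tensor * pack_node = ggml_custom_4d(ctx, GGML_TYPE_F32, 1, 1, 1, 1,
                                             args, 1, pack_custom, GGML_N_TASKS_MAX, &ud);

    ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, pack_node); // expanding the node is what schedules the pack
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 4);

    ggml_free(ctx);
    return 0;
}

Note that the commit heap-allocates its userdata with new so it remains valid after graph construction; stack storage is enough in this toy example only because the userdata is not needed past ggml_graph_compute_with_ctx().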
