Commit e68b55b

Remove unused use_fp16 argument to idx_compute_scores_tile

1 parent a5ae544
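The change is mechanical: the bool use_fp16 parameter was accepted but never read inside idx_compute_scores_tile, so the declaration, the definition, and every call site simply drop the trailing argument. As a before/after sketch, taken verbatim from the src/llama-sparse-topk.cpp hunk below:

    // before: the unused flag is threaded through every call
    scores_tc = llama::sparse_attn_indexer::idx_compute_scores_tile(ctx, q3d, k_indexer_f16, weights, k_scale_2d, D, H, Tc, kv_end, t0, use_fp16);
    // after: the argument list now ends at t0
    scores_tc = llama::sparse_attn_indexer::idx_compute_scores_tile(ctx, q3d, k_indexer_f16, weights, k_scale_2d, D, H, Tc, kv_end, t0);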

6 files changed: 7 additions, 11 deletions

src/llama-sparse-indexer.cpp

Lines changed: 1 addition & 3 deletions

@@ -51,8 +51,7 @@ ggml_tensor * sparse_attn_indexer::idx_compute_scores_tile(
         ggml_tensor * k_scale_2d,
         int64_t D, int64_t H,
         int64_t Tc, int64_t kv_end,
-        int64_t t0,
-        bool use_fp16) {
+        int64_t t0) {
     const char * __prof_env = getenv("LLAMA_SPARSE_PROF");
     bool prof = (__prof_env && atoi(__prof_env) != 0);
     double t0_us = 0.0;
@@ -184,7 +183,6 @@ ggml_tensor * sparse_attn_indexer::idx_compute_scores_tile(
                 float v = out[(size_t)i + (size_t)kv * (size_t)tc];
                 fprintf(stderr, " C[%d,%d]= % .6f", i, tc, v);
             }
-            fprintf(stderr, "");
         }
     }
src/llama-sparse-indexer.h

Lines changed: 1 addition & 2 deletions

@@ -32,8 +32,7 @@ struct sparse_attn_indexer {
         ggml_tensor * k_scale_2d,
         int64_t D, int64_t H,
         int64_t Tc, int64_t kv_end,
-        int64_t t0,
-        bool use_fp16);
+        int64_t t0);

     // Build KV-aware top-k token indices using the Lightning Indexer tensors.
     // If mctx is nullptr, uses freshly computed K_indexer directly without cache writes.
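Pieced together from the hunk above and the call sites in this commit, the post-change declaration looks roughly as follows. Only the trailing parameters appear in the hunk, so the leading names (ctx, q3d, k_indexer_f16, weights) are inferred from the src/llama-sparse-topk.cpp calls and are an assumption, not the literal header text; likewise, static is inferred from the tests calling through the class name:

    // sketch only: leading parameters inferred from call sites, not copied from the header
    static ggml_tensor * idx_compute_scores_tile(
            ggml_context * ctx,            // inferred: graph-build context
            ggml_tensor  * q3d,            // inferred: indexer queries
            ggml_tensor  * k_indexer_f16,  // inferred: indexer keys
            ggml_tensor  * weights,        // inferred: per-head indexer weights
            ggml_tensor  * k_scale_2d,
            int64_t D, int64_t H,
            int64_t Tc, int64_t kv_end,
            int64_t t0);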

src/llama-sparse-topk.cpp

Lines changed: 2 additions & 2 deletions

@@ -505,7 +505,7 @@ ggml_tensor * sparse_attn_topk::select_topk_tokens_indexer_kvaware(
     }

     if (dbg && t0 == 0) {
-        ggml_tensor * ref_scores = llama::sparse_attn_indexer::idx_compute_scores_tile(ctx, q3d, k_indexer_f16, weights, k_scale_2d, D, H, Tc, kv_end, t0, use_fp16);
+        ggml_tensor * ref_scores = llama::sparse_attn_indexer::idx_compute_scores_tile(ctx, q3d, k_indexer_f16, weights, k_scale_2d, D, H, Tc, kv_end, t0);
         ggml_tensor * ref_head = ggml_view_2d(ctx, ref_scores, std::min<int64_t>(kv_end, (int64_t)8), std::min<int64_t>(Tc, (int64_t)4), ref_scores->nb[1], 0);
         cb(ref_head, "idxkv_scores_ref_head", -1);
         if (gf) { ggml_set_output(ref_head); ggml_build_forward_expand(gf, ref_head); }
@@ -542,7 +542,7 @@
         }

     } else {
-        scores_tc = llama::sparse_attn_indexer::idx_compute_scores_tile(ctx, q3d, k_indexer_f16, weights, k_scale_2d, D, H, Tc, kv_end, t0, use_fp16);
+        scores_tc = llama::sparse_attn_indexer::idx_compute_scores_tile(ctx, q3d, k_indexer_f16, weights, k_scale_2d, D, H, Tc, kv_end, t0);
     }
 }
tests/test-indexer-fused-op-cuda.cpp

Lines changed: 1 addition & 2 deletions

@@ -293,8 +293,7 @@ int main() {
         H_ref,
         Tc_ref,
         kv_ref,
-        /*t0=*/0,
-        /*use_fp16=*/false);
+        /*t0=*/0);

     ggml_cgraph * gf_ref = ggml_new_graph(ctx_ref);
     ggml_build_forward_expand(gf_ref, scores_ref);

tests/test-indexer-scores-tile.cpp

Lines changed: 1 addition & 1 deletion

@@ -85,7 +85,7 @@ int main() {
     std::memcpy(ks1d->data, KS.data(), ggml_nbytes(ks1d));

     ggml_tensor * scores = sparse_attn_indexer::idx_compute_scores_tile(
-        ctx, q3d, a_k, w2d, ks2d, D, H, Tc, kv, 0, /*use_fp16=*/false);
+        ctx, q3d, a_k, w2d, ks2d, D, H, Tc, kv, 0);

     const int iters = 10;
tests/test-indexer-triplet-vs-fused.cpp

Lines changed: 1 addition & 1 deletion

@@ -139,7 +139,7 @@ int main() {
     ggml_tensor * scores_tc = sparse_attn_indexer::idx_compute_scores_tile(
         cpu_ctx.ctx, q3d, a_k, w2d, k_scale_2d,
         D_index, H_index, T, N_kv,
-        /*t0=*/0, /*use_fp16=*/false);
+        /*t0=*/0);

     ggml_cgraph * gf_cpu = ggml_new_graph(cpu_ctx.ctx);
     ggml_build_forward_expand(gf_cpu, scores_tc);
