Adjust fattn kernel selection logic: return BEST_FATTN_KERNEL_VEC when Q->ne[1] * gqa_ratio_eff <= 2

JohannesGaessler · JohannesGaessler · commit 394ced5ff8e4 · 2025-11-29T18:39:58.000+01:00
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
@@ -306,6 +306,9 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
         while (gqa_ratio % (2*gqa_ratio_eff) == 0 && gqa_ratio_eff < ncols2_max) {
             gqa_ratio_eff *= 2;
         }
+        if (Q->ne[1] * gqa_ratio_eff <= 2) {
+            return BEST_FATTN_KERNEL_VEC;
+        }
         if (Q->ne[1] * gqa_ratio_eff <= 16) {
             return BEST_FATTN_KERNEL_TILE; // On Volta tensor cores are only faster for sufficiently large matrices.
         }

Original file line number	Diff line number	Diff line change
`@@ -306,6 +306,9 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const`
`306`	`306`	`while (gqa_ratio % (2*gqa_ratio_eff) == 0 && gqa_ratio_eff < ncols2_max) {`
`307`	`307`	`gqa_ratio_eff *= 2;`
`308`	`308`	`}`
	`309`	`+ if (Q->ne[1] * gqa_ratio_eff <= 2) {`
	`310`	`+ return BEST_FATTN_KERNEL_VEC;`
	`311`	`+ }`
`309`	`312`	`if (Q->ne[1] * gqa_ratio_eff <= 16) {`
`310`	`313`	`return BEST_FATTN_KERNEL_TILE; // On Volta tensor cores are only faster for sufficiently large matrices.`
`311`	`314`	`}`