
Commit 820fecf

Fix CUDA forward crash when seqlen_q == 1
2 parents: c73d643 + 1af076b

3 files changed (+82, -75 lines)


benchmarks/forward_equivalence.py

Lines changed: 21 additions & 20 deletions
@@ -518,27 +518,28 @@ def test_cuda_forward_equivalence(accuracy_threshold=0.95):
     # If you encounter NAN issues when running multiple configurations, try running a single configuration
     test_configs = [
         # (batch_size, num_heads, num_kv_heads, query_len, key_len, head_dim, is_causal)
-        # (1, 1, 1, 64, 64, 32, True),
-        # (1, 1, 1, 64, 64, 32, False),
-        # (1, 1, 1, 128, 128, 32, True),
-        # (1, 1, 1, 128, 128, 32, False),
-        # (1, 1, 1, 256, 256, 32, True),
-        # (1, 1, 1, 256, 256, 32, False),
-        # (1, 1, 1, 512, 512, 32, True),
-        # (1, 1, 1, 512, 512, 32, False),
-        # (1, 1, 1, 1024, 1024, 32, True),
-        # (1, 1, 1, 1024, 1024, 32, False),
-        # (1, 1, 1, 2048, 2048, 32, True),
-        # (1, 1, 1, 2048, 2048, 32, False),
+        (1, 1, 1, 64, 64, 32, True),
+        (1, 1, 1, 64, 64, 32, False),
+        (1, 1, 1, 128, 128, 32, True),
+        (1, 1, 1, 128, 128, 32, False),
+        (1, 1, 1, 256, 256, 32, True),
+        (1, 1, 1, 256, 256, 32, False),
+        (1, 1, 1, 512, 512, 32, True),
+        (1, 1, 1, 512, 512, 32, False),
+        (1, 1, 1, 1024, 1024, 32, True),
+        (1, 1, 1, 1024, 1024, 32, False),
+        (1, 1, 1, 2048, 2048, 32, True),
+        (1, 1, 1, 2048, 2048, 32, False),
         (1, 1, 1, 4096, 4096, 32, True),
-        # (1, 1, 1, 4096, 4096, 32, False),
-        # (1, 2, 1, 64, 64, 32, True),
-        # (2, 1, 1, 128, 128, 32, True),
-        # (2, 2, 1, 128, 128, 32, True),
-        # (1, 2, 1, 64, 64, 128, True),
-        # (1, 2, 1, 128, 128, 128, True),
-        # (1, 2, 1, 256, 256, 128, True),
-        # (1, 2, 1, 512, 512, 128, True),
+        (1, 1, 1, 4096, 4096, 32, False),
+        (1, 2, 1, 64, 64, 32, True),
+        (2, 1, 1, 128, 128, 32, True),
+        (2, 2, 1, 128, 128, 32, True),
+        (1, 2, 1, 64, 64, 128, True),
+        (1, 2, 1, 128, 128, 128, True),
+        (1, 2, 1, 256, 256, 128, True),
+        (1, 2, 1, 3, 512, 128, True),
+        (1, 2, 1, 1, 512, 128, True),
     ]

     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
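
Note: the two appended configurations exercise short query lengths, including the query_len == 1 decode case this commit fixes. As a rough, non-authoritative sketch of the tensor shapes such a configuration involves (layouts taken from the CHECK_SHAPE calls in csrc/flash_api.cpp below; the attention entry point is a hypothetical placeholder, not the project's actual API):

import torch

# Illustrative shapes for the (batch, heads, kv_heads, query_len, key_len, head_dim, is_causal)
# = (1, 2, 1, 1, 512, 128, True) entry; device and dtype are placeholders.
batch, num_heads, num_kv_heads, q_len, k_len, head_dim = 1, 2, 1, 1, 512, 128
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

q = torch.randn(batch, q_len, num_heads, head_dim, device=device)
k = torch.randn(batch, k_len, num_kv_heads, head_dim, device=device)
v = torch.randn(batch, k_len, num_kv_heads, head_dim, device=device)
# mask and bias follow the (batch, num_kv_heads, query_len, key_len) layout checked in flash_api.cpp.
mask = torch.ones(batch, num_kv_heads, q_len, k_len, device=device)
bias = torch.zeros(batch, num_kv_heads, q_len, k_len, device=device)
# out = dynamic_mask_attention_forward(q, k, v, mask, bias, is_causal=True)  # hypothetical name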

benchmarks/forward_performance.py

Lines changed: 49 additions & 49 deletions
@@ -732,57 +732,57 @@ def run_performance_benchmark(test_type='all', num_runs=3, warmup_runs=2):
         (1, 2, 1, 4096, 4096, 128, 2048, True),
         (1, 2, 1, 8192, 8192, 128, 2048, True),
         (1, 2, 1, 16384, 16384, 128, 2048, True),
-        # (1, 2, 1, 32768, 32768, 128, 2048, True),
+        (1, 2, 1, 32768, 32768, 128, 2048, True),

         # # Inference
-        # (1, 2, 1, 2, 256, 128, 2048, True),
-        # (1, 2, 1, 2, 512, 128, 2048, True),
-        # (1, 2, 1, 2, 1024, 128, 2048, True),
-        # (1, 2, 1, 2, 2048, 128, 2048, True),
-        # (1, 2, 1, 2, 4096, 128, 2048, True),
-        # (1, 2, 1, 2, 8192, 128, 2048, True),
-        # (1, 2, 1, 2, 16384, 128, 2048, True),
-        # (1, 2, 1, 2, 32768, 128, 2048, True),
-        (1, 2, 1, 2, 65536, 128, 2048, True),
-        # (1, 2, 1, 2, 131072, 128, 2048, True),
-        # (1, 2, 1, 2, 262144, 128, 2048, True),
-        # (1, 2, 1, 2, 524288, 128, 2048, True),
-
-        # # Vary batch size
-        # (1, 2, 1, 4096, 4096, 32, 2048, True),
-        # (2, 2, 1, 4096, 4096, 32, 2048, True),
-        # (4, 2, 1, 4096, 4096, 32, 2048, True),
-        # (8, 2, 1, 4096, 4096, 32, 2048, True),
-
-        # # Vary head count
-        # (1, 1, 1, 4096, 4096, 32, 2048, True),
-        # (1, 2, 1, 4096, 4096, 32, 2048, True),
-        # (1, 4, 1, 4096, 4096, 32, 2048, True),
-        # (1, 8, 2, 4096, 4096, 32, 2048, True),
-
-        # # Vary head dimension
-        # (1, 2, 1, 4096, 4096, 32, 2048, True),
-        # (1, 2, 1, 4096, 4096, 64, 2048, True),
-        # (1, 2, 1, 4096, 4096, 96, 2048, True),
-        # (1, 2, 1, 4096, 4096, 128, 2048, True),
-        # (1, 2, 1, 4096, 4096, 192, 2048, True),
-        # (1, 2, 1, 4096, 4096, 256, 2048, True),
-
-        # # Vary keep_window_size
-        # (1, 2, 1, 32768, 32768, 128, 32, True),
-        # (1, 2, 1, 32768, 32768, 128, 64, True),
-        # (1, 2, 1, 32768, 32768, 128, 128, True),
-        # (1, 2, 1, 32768, 32768, 128, 256, True),
-        # (1, 2, 1, 32768, 32768, 128, 512, True),
-        # (1, 2, 1, 32768, 32768, 128, 1024, True),
-        # (1, 2, 1, 32768, 32768, 128, 2048, True),
-        # (1, 2, 1, 32768, 32768, 128, 4096, True),
-        # (1, 2, 1, 32768, 32768, 128, 8192, True),
-        # (1, 2, 1, 32768, 32768, 128, 16384, True),
-        # (1, 2, 1, 32768, 32768, 128, 32768, True),
-
-        # # Test non-causal
-        # (1, 2, 1, 4096, 4096, 128, 2048, False),
+        (1, 2, 1, 1, 256, 128, 2048, True),
+        (1, 2, 1, 1, 512, 128, 2048, True),
+        (1, 2, 1, 1, 1024, 128, 2048, True),
+        (1, 2, 1, 1, 2048, 128, 2048, True),
+        (1, 2, 1, 1, 4096, 128, 2048, True),
+        (1, 2, 1, 1, 8192, 128, 2048, True),
+        (1, 2, 1, 1, 16384, 128, 2048, True),
+        (1, 2, 1, 1, 32768, 128, 2048, True),
+        (1, 2, 1, 1, 65536, 128, 2048, True),
+        (1, 2, 1, 1, 131072, 128, 2048, True),
+        (1, 2, 1, 1, 262144, 128, 2048, True),
+        (1, 2, 1, 1, 524288, 128, 2048, True),
+
+        # Vary batch size
+        (1, 2, 1, 4096, 4096, 32, 2048, True),
+        (2, 2, 1, 4096, 4096, 32, 2048, True),
+        (4, 2, 1, 4096, 4096, 32, 2048, True),
+        (8, 2, 1, 4096, 4096, 32, 2048, True),
+
+        # Vary head count
+        (1, 1, 1, 4096, 4096, 32, 2048, True),
+        (1, 2, 1, 4096, 4096, 32, 2048, True),
+        (1, 4, 1, 4096, 4096, 32, 2048, True),
+        (1, 8, 2, 4096, 4096, 32, 2048, True),
+
+        # Vary head dimension
+        (1, 2, 1, 4096, 4096, 32, 2048, True),
+        (1, 2, 1, 4096, 4096, 64, 2048, True),
+        (1, 2, 1, 4096, 4096, 96, 2048, True),
+        (1, 2, 1, 4096, 4096, 128, 2048, True),
+        (1, 2, 1, 4096, 4096, 192, 2048, True),
+        (1, 2, 1, 4096, 4096, 256, 2048, True),
+
+        # Vary keep_window_size
+        (1, 2, 1, 32768, 32768, 128, 32, True),
+        (1, 2, 1, 32768, 32768, 128, 64, True),
+        (1, 2, 1, 32768, 32768, 128, 128, True),
+        (1, 2, 1, 32768, 32768, 128, 256, True),
+        (1, 2, 1, 32768, 32768, 128, 512, True),
+        (1, 2, 1, 32768, 32768, 128, 1024, True),
+        (1, 2, 1, 32768, 32768, 128, 2048, True),
+        (1, 2, 1, 32768, 32768, 128, 4096, True),
+        (1, 2, 1, 32768, 32768, 128, 8192, True),
+        (1, 2, 1, 32768, 32768, 128, 16384, True),
+        (1, 2, 1, 32768, 32768, 128, 32768, True),
+
+        # Test non-causal
+        (1, 2, 1, 4096, 4096, 128, 2048, False),
     ]

     print(f"\n📊 Benchmark Results (averaged over {num_runs} runs):")
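
Note: with the inference rows enabled, the benchmark also covers decode-style forwards (query_len == 1) over long key lengths. For reference, a minimal, generic way to time one such call with CUDA events might look like the sketch below; this is not the benchmark's own timing code, and fn is a stand-in for whatever forward implementation is being profiled, reusing the warmup_runs/num_runs parameters from run_performance_benchmark's signature.

import torch

def time_forward(fn, warmup_runs=2, num_runs=3):
    # Warm up to exclude one-time compilation/allocation costs, then average CUDA-event timings.
    for _ in range(warmup_runs):
        fn()
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(num_runs):
        fn()
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / num_runs  # average milliseconds per forward call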

csrc/flash_api.cpp

Lines changed: 12 additions & 6 deletions
@@ -116,7 +116,7 @@ void set_params_fprop(

     // Set the different scale values.
     #ifdef FLASHATTENTION_DISABLE_SOFTCAP
-    TORCH_CHECK(softcap <= 0.0, "This flash attention build does not support softcap.");
+    TORCH_CHECK(softcap <= 0.0, "This flash dynamic mask attention build does not support softcap.");
     #endif
     if (softcap > 0.0) {
         params.softcap = softmax_scale / softcap;
@@ -133,7 +133,7 @@ void set_params_fprop(
     params.is_seqlens_k_cumulative = true;

     #ifdef FLASHATTENTION_DISABLE_UNEVEN_K
-    TORCH_CHECK(d == d_rounded, "This flash attention build does not support headdim not being a multiple of 32.");
+    TORCH_CHECK(d == d_rounded, "This flash dynamic mask attention build does not support headdim not being a multiple of 32.");
     #endif

     params.unpadded_lse = unpadded_lse;
@@ -231,7 +231,7 @@ void run_mha_fwd(Flash_fwd_params &params, cudaStream_t stream, bool force_split
     FP16_SWITCH(!params.is_bf16, [&] {
         HEADDIM_SWITCH(params.d, [&] {
             BOOL_SWITCH(params.is_causal, Is_causal, [&] {
-                if (params.num_splits <= 1 && !force_split_kernel) { // If we don't set it num_splits == 0
+                if (params.num_splits <= 1 && !force_split_kernel) { // If we don't set it num_splits == 0
                     run_mha_fwd_<elem_type, kHeadDim, Is_causal>(params, stream);
                 } else {
                     run_mha_fwd_splitkv_dispatch<elem_type, kHeadDim, Is_causal>(params, stream);
@@ -354,6 +354,8 @@ mha_fwd(
     TORCH_CHECK(q.stride(-1) == 1, "Input tensor must have contiguous last dimension");
     TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
     TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+    TORCH_CHECK(mask.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+    TORCH_CHECK(bias.stride(-1) == 1, "Input tensor must have contiguous last dimension");

     const auto sizes = q.sizes();

@@ -375,17 +377,21 @@ mha_fwd(
     // H/t Daniel Haziza
     const int seqlenq_ngroups_swapped = seqlen_q == 1 && num_heads > num_heads_k && head_size % 8 == 0;
     const int ngroups = num_heads / num_heads_k;
+    at::Tensor mask_view = mask;
+    at::Tensor bias_view = bias;
     if (seqlenq_ngroups_swapped) {
         q = q.reshape({batch_size, num_heads_k, ngroups, head_size}).transpose(1, 2);
+        mask_view = mask.expand({batch_size, num_heads_k, ngroups, seqlen_k});
+        bias_view = bias.expand({batch_size, num_heads_k, ngroups, seqlen_k});
         seqlen_q = ngroups;
         num_heads = num_heads_k;
     }

     CHECK_SHAPE(q, batch_size, seqlen_q, num_heads, head_size);
     CHECK_SHAPE(k, batch_size, seqlen_k, num_heads_k, head_size);
     CHECK_SHAPE(v, batch_size, seqlen_k, num_heads_k, head_size);
-    CHECK_SHAPE(mask, batch_size, num_heads_k, seqlen_q, seqlen_k);
-    CHECK_SHAPE(bias, batch_size, num_heads_k, seqlen_q, seqlen_k);
+    CHECK_SHAPE(mask_view, batch_size, num_heads_k, seqlen_q, seqlen_k);
+    CHECK_SHAPE(bias_view, batch_size, num_heads_k, seqlen_q, seqlen_k);

     at::Tensor out;
     if (out_.has_value()) {
@@ -425,7 +431,7 @@ mha_fwd(
         seqlen_q_rounded, seqlen_k_rounded,
         num_heads, num_heads_k,
         head_size, head_size_rounded,
-        q, k, v, mask, bias, out,
+        q, k, v, mask_view, bias_view, out,
         /*cu_seqlens_q_d=*/nullptr,
         /*cu_seqlens_k_d=*/nullptr,
         /*seqused_k=*/nullptr,
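
Note: the seqlen_q == 1 handling above can be pictured in PyTorch terms as folding the single query row into ngroups extra query positions per KV head, while mask and bias are broadcast along that new dimension rather than copied. A shape-only sketch (illustrative, not part of the extension code):

import torch

# Shape-only mirror of the seqlenq_ngroups_swapped branch in mha_fwd; values are arbitrary examples.
batch_size, num_heads, num_heads_k, seqlen_k, head_size = 1, 8, 2, 512, 128
ngroups = num_heads // num_heads_k

q = torch.randn(batch_size, 1, num_heads, head_size)      # (B, seqlen_q=1, H, D)
mask = torch.ones(batch_size, num_heads_k, 1, seqlen_k)   # (B, H_k, seqlen_q=1, S_k)
bias = torch.zeros(batch_size, num_heads_k, 1, seqlen_k)

# Fold the query-head groups into the query-length dimension: (B, 1, H, D) -> (B, ngroups, H_k, D).
q = q.reshape(batch_size, num_heads_k, ngroups, head_size).transpose(1, 2)
# Broadcast mask/bias across the new ngroups query rows without materializing copies.
mask_view = mask.expand(batch_size, num_heads_k, ngroups, seqlen_k)
bias_view = bias.expand(batch_size, num_heads_k, ngroups, seqlen_k)

assert q.shape == (batch_size, ngroups, num_heads_k, head_size)
assert mask_view.shape == (batch_size, num_heads_k, ngroups, seqlen_k)
assert bias_view.shape == (batch_size, num_heads_k, ngroups, seqlen_k)

The expanded views keep the CHECK_SHAPE assertions satisfied once seqlen_q becomes ngroups and num_heads becomes num_heads_k, which is what the previous code missed for mask and bias.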
