Adds support for mask and bias tensors in backward kernel

LoserCheems · LoserCheems · commit 4c3627a0c8a6 · 2025-08-16T22:03:53.000+08:00
Extends the identity-tensor setup in `compute_dq_dk_dv_1colblock` with mask and bias identity tensors (`cMask`, `cBias`), alongside the existing query (`cQ`) and key/value (`cKV`) identity tensors.

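Enables proper handling of attention masks and bias terms during the backward pass by partitioning the mask and bias identity tensors with `gmem_thr_copy_Mask` and `gmem_thr_copy_Bias`, producing `tMaskcMask` and `tBiascBias` in the same way `tQcQ` and `tKVcKV` are produced with `gmem_thr_copy_QKV`.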
diff --git a/csrc/src/flash_bwd_kernel.h b/csrc/src/flash_bwd_kernel.h
@@ -409,8 +409,14 @@ inline __device__ void compute_dq_dk_dv_1colblock(const Params &params, const in
 
     Tensor cQ = make_identity_tensor(make_shape(size<0>(sQ), size<1>(sQ)));             // (BLK_M, BLK_K) -> (blk_m, blk_k)
     Tensor cKV = make_identity_tensor(make_shape(size<0>(sK), size<1>(sK)));            // (BLK_N, BLK_K) -> (blk_n, blk_k)
-    Tensor tQcQ = gmem_thr_copy_QKV.partition_D(cQ);
-    Tensor tKVcKV = gmem_thr_copy_QKV.partition_D(cKV);
+    Tensor cMask = make_identity_tensor(make_shape(size<0>(sMask), size<1>(sMask)));    // (BLK_M, BLK_N) -> (blk_m, blk_n)
+    Tensor cBias = make_identity_tensor(make_shape(size<0>(sBias), size<1>(sBias)));    // (BLK_M, BLK_N) -> (blk_m, blk_n)
+
+    // Repeat the partitioning with identity layouts
+    Tensor tQcQ = gmem_thr_copy_QKV.partition_D(cQ);                                    // (ACPY, ACPY_M, ACPY_K) -> (blk_m, blk_k)
+    Tensor tKVcKV = gmem_thr_copy_QKV.partition_D(cKV);                                 // (BCPY, BCPY_N, BCPY_K) -> (blk_n, blk_k)
+    Tensor tMaskcMask = gmem_thr_copy_Mask.partition_D(cMask);                          // (MaskCPY, MaskCPY_M, MaskCPY_N) -> (blk_m, blk_n)
+    Tensor tBiascBias = gmem_thr_copy_Bias.partition_D(cBias);                          // (BiasCPY, BiasCPY_M, BiasCPY_N) -> (blk_m, blk_n)
 
     // Allocate predicate tensors for k
     Tensor tQpQ = make_tensor<bool>(make_shape(size<2>(tQsQ)));