Skip to content

Commit 5ee5acf

Browse files
committed
Adds mask and bias tensor support to backward kernel
Introduces gMask, gBias, and gdBias tensor declarations to enable attention masking and bias functionality in the backward pass. Extends the kernel to handle masked attention computations and bias gradient calculations for more flexible attention mechanisms.
1 parent be9c3ac commit 5ee5acf

File tree

1 file changed

+15
-0
lines changed

1 file changed

+15
-0
lines changed

csrc/src/flash_bwd_kernel.h

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -143,6 +143,21 @@ inline __device__ void compute_dq_dk_dv_1colblock(const Params &params, const in
143143
Shape<Int<kBlockN>, Int<kHeadDim>>{},
144144
make_stride(params.v_row_stride, _1{})
145145
);
146+
Tensor gMask = make_tensor(
147+
make_gmem_ptr(reinterpret_cast<Element *>(params.mask_ptr) + row_offset_mask),
148+
Shape<Int<kBlockM>, Int<kBlockN>>{},
149+
make_stride(params.mask_row_stride, _1{})
150+
);
151+
Tensor gBias = make_tensor(
152+
make_gmem_ptr(reinterpret_cast<Element *>(params.bias_ptr) + row_offset_bias),
153+
Shape<Int<kBlockM>, Int<kBlockN>>{},
154+
make_stride(params.bias_row_stride, _1{})
155+
);
156+
Tensor gdBias = make_tensor(
157+
make_gmem_ptr(reinterpret_cast<Element *>(params.dbias_ptr) + row_offset_dbias),
158+
Shape<Int<kBlockM>, Int<kBlockN>>{},
159+
make_stride(params.dbias_row_stride, _1{})
160+
);
146161
Tensor gdO = make_tensor(
147162
make_gmem_ptr(reinterpret_cast<Element *>(params.do_ptr) + row_offset_do),
148163
Shape<Int<kBlockM>, Int<kHeadDim>>{},

0 commit comments

Comments (0)