Fix: Correct K/V dimension mismatch in path_attn bwd kernels by changing K/BK to V/BV for v and dv operations (#633)

ReyJerry · web-flow · commit 1c403c3a8289 · 2025-11-05T14:46:10.000-05:00
* changing K/BK to V/BV for v and dv operations

* correcting the v tensor's loading layout and block size, and applying the necessary transpose in the dot product
diff --git a/fla/ops/path_attn/parallel_path_bwd_inter_dkv.py b/fla/ops/path_attn/parallel_path_bwd_inter_dkv.py
@@ -77,7 +77,7 @@ def parallel_path_bwd_dkv_kernel(
     # load query
     p_k = tl.make_block_ptr(k, (T, K), (H*K, 1), (i_t * BT, 0), (BT, BK), (1, 0))
     b_k = tl.load(p_k, boundary_check=(0, 1))
-    p_v = tl.make_block_ptr(v, (T, K), (H*K, 1), (i_t * BT, 0), (BT, BK), (1, 0))
+    p_v = tl.make_block_ptr(v, (T, V), (H*V, 1), (i_t * BT, 0), (BT, BV), (1, 0))
     b_v = tl.load(p_v, boundary_check=(0, 1))
 
     if USE_GATE:
@@ -90,7 +90,7 @@ def parallel_path_bwd_dkv_kernel(
         b_dg_cumsum_k = None
 
     b_dk = tl.zeros([BT, BK], dtype=tl.float32)
-    b_dv = tl.zeros([BT, BK], dtype=tl.float32)
+    b_dv = tl.zeros([BT, BV], dtype=tl.float32)
 
     last_chunk_start = tl.floor(i_t*BT / S).to(tl.int32) * S
     idx_j = (tl.floor(i_t * BT / S).to(tl.int32) + 1).to(tl.int32)
@@ -127,7 +127,7 @@ def parallel_path_bwd_dkv_kernel(
     tl.store(p_dk, b_dk.to(dk.dtype.element_ty), boundary_check=(0, 1))
     mask = i_t * BT + tl.arange(0, BT) < T
     tl.atomic_add(
-        dv + (i_t * BT + tl.arange(0, BT))[:, None] * HQ * K + tl.arange(0, BK)[None, :],
+        dv + (i_t * BT + tl.arange(0, BT))[:, None] * HQ * V + tl.arange(0, BV)[None, :],
         b_dv,
         mask=mask[:, None],
         sem='relaxed',
diff --git a/fla/ops/path_attn/parallel_path_bwd_inter_dqh.py b/fla/ops/path_attn/parallel_path_bwd_inter_dqh.py
@@ -118,9 +118,9 @@ def parallel_path_bwd_dq_kernel(
                 b_A = b_A + b_g_cumsum_q[:, None] - b_g_cumsum_k[None, :]
             b_A = exp2(b_A * sm_scale - b_l[:, None])
             b_A = tl.where(m_t[:, None], b_A, 0)
-            p_v = tl.make_block_ptr(v, (V, T), (1, V*H), (0, offset), (BK, BS), (0, 1))
+            p_v = tl.make_block_ptr(v, (T, V), (H*V, 1), (offset, 0), (BS, BV), (1, 0))
             b_v = tl.load(p_v, boundary_check=(0, 1))
-            b_dp = tl.dot(b_do, b_v.to(b_do.dtype))
+            b_dp = tl.dot(b_do, tl.trans(b_v).to(b_do.dtype))
             b_dA = (b_dp - b_delta[:, None]) * b_A * scale
             b_dq += tl.dot(b_dA.to(b_k.dtype), b_k)
             if USE_GATE: