2 parents 1a78757 + 099f0a1, commit f837ebc
jax/_src/nn/functions.py
@@ -880,7 +880,7 @@ def _get_padding_mask_encoded(T, q_seqlen):
 
 def _apply_masks(logits, mask, is_causal, q_seqlen, kv_seqlen,
                  local_window_size):
-  if mask is None and not is_causal and q_seqlen is None and kv_seqlen is None:
+  if mask is None and not is_causal and q_seqlen is None and kv_seqlen is None and local_window_size is None:
     return logits
 
   combined_mask = jnp.ones_like(logits, dtype=bool)
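The changed condition now also requires local_window_size to be None before taking the early return, so a local window passed without any explicit mask still reaches the mask-building path below. As a rough sketch (an illustrative helper, not code from this commit or from JAX internals), a (left, right) window corresponds to a banded boolean mask over query/key positions:

import jax.numpy as jnp

def local_window_mask(T, S, left, right):
  # Query position i may attend to key positions j with i - left <= j <= i + right.
  q_pos = jnp.arange(T)[:, None]
  kv_pos = jnp.arange(S)[None, :]
  return (kv_pos >= q_pos - left) & (kv_pos <= q_pos + right)

# With a (1, 1) window each query sees only its immediate neighbors, so the
# attention output differs from a wide window such as (32, 32).
print(local_window_mask(4, 4, 1, 1))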
tests/nn_test.py
@@ -791,6 +791,27 @@ def testLog1mExpGrad(self):
         atol=1e-3,
     )
 
+  def testDotProductAttention_localWindowSizeWithoutMask(self):
+    dtype = jnp.float32
+    B, S, T, N, H = 2, 128, 128, 4, 32
+    keys = random.split(random.PRNGKey(0), 3)
+    Q = random.normal(keys[0], (B, T, N, H), dtype)
+    K = random.normal(keys[1], (B, S, N, H), dtype)
+    V = random.normal(keys[2], (B, S, N, H), dtype)
+
+    output_large_window = nn.dot_product_attention(
+        Q, K, V, mask=None, local_window_size=(32, 32)
+    )
+
+    output_small_window = nn.dot_product_attention(
+        Q, K, V, mask=None, local_window_size=(1, 1)
+    )
+
+    self.assertFalse(
+        jnp.allclose(output_large_window, output_small_window),
+        "Attention output should differ with different local_window_size, even without a mask.",
+    )
+
 
 InitializerRecord = collections.namedtuple(
     "InitializerRecord",