Commit 795c11c

yashk2810 authored and Google-ML-Automation committed
Make unreduced + scan_over_layers + microbatching loop work with jax.grad + reduced annotations, in addition to custom_vjp
PiperOrigin-RevId: 814975937
1 parent b974342 commit 795c11c
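
What the title describes, in ordinary JAX terms: a jax.lax.scan over stacked layer weights, nested inside a microbatching loop that accumulates gradients before applying an update. The sketch below is illustrative only; it is not the test added in this commit, it carries no sharding or unreduced/reduced annotations, and the shapes, loss, and 0.01 update rule are placeholder choices that borrow the test's naming.

import jax
import jax.numpy as jnp

def model(stacked_ws, xs_mubatch):
  # Apply one layer per scan step over the leading (layer) axis of the
  # stacked weights, then reduce to a scalar loss.
  def scan_over_layers(carry_xs, w):
    return jnp.dot(carry_xs, w), None
  out, _ = jax.lax.scan(scan_over_layers, xs_mubatch, stacked_ws)
  return jnp.sum(out)

def step(stacked_ws, xs):
  # Accumulate gradients over microbatches, then apply a plain SGD update.
  def mubatch_loop_body(stacked_grad_acc, xs_mubatch):
    g = jax.grad(model)(stacked_ws, xs_mubatch)
    return jax.tree.map(jnp.add, stacked_grad_acc, g), None
  zero = jax.tree.map(jnp.zeros_like, stacked_ws)
  stacked_grad_acc, _ = jax.lax.scan(mubatch_loop_body, zero, xs)
  return jax.tree.map(lambda W, g: W - g * 0.01, stacked_ws, stacked_grad_acc)

stacked_ws = jnp.stack([jnp.ones((4, 4)) for _ in range(4)], axis=0)
xs = jnp.ones((2, 2, 4))   # 2 microbatches, each of shape (2, 4)
new_ws = step(stacked_ws, xs)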

File tree

4 files changed: +41 −19 lines

  jax/_src/lax/lax.py
  jax/_src/lax/utils.py
  tests/BUILD
  tests/pjit_test.py


jax/_src/lax/lax.py

Lines changed: 2 additions & 1 deletion

@@ -7083,7 +7083,8 @@ def _squeeze_sharding_rule(operand, *, dimensions):
   dims_set = set(dimensions)
   new_spec = tuple(s for i, s in enumerate(operand.sharding.spec)
                    if i not in dims_set)
-  return operand.sharding.update(spec=new_spec)
+  return operand.sharding.update(
+      spec=operand.sharding.spec.update(partitions=new_spec))
 
 def _compute_squeeze_shape(shape, dimensions):
   dims_set = set(dimensions)
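
The behavioral point of this one-line change: rebuilding the result sharding with spec.update(partitions=...) keeps whatever else the operand's spec carries (notably unreduced/reduced annotations) while swapping in the surviving partitions, rather than constructing the spec from a bare tuple of axes. A minimal standalone sketch, not part of the diff; axis names are arbitrary and it assumes the PartitionSpec unreduced= keyword and spec.update(partitions=...) method used in the hunk above.

from jax.sharding import PartitionSpec as P

spec = P(None, 'y', unreduced={'x'})                   # spec carrying an unreduced axis
keep = tuple(s for i, s in enumerate(spec) if i != 0)  # drop the squeezed dim 0

rebuilt = P(*keep)                        # ('y',) -- the unreduced={'x'} annotation is gone
preserved = spec.update(partitions=keep)  # ('y',) with unreduced={'x'} intact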

jax/_src/lax/utils.py

Lines changed: 11 additions & 5 deletions

@@ -69,14 +69,18 @@ def _get_abstract_mesh_from_avals(in_avals) -> mesh_lib.AbstractMesh:
     m = a.sharding.mesh
   return mesh_lib.empty_abstract_mesh if m is None else m
 
-def call_unreduced_rule(prim, unreduced_rule, out_s, *avals, **kwargs):
+def call_unreduced_rule(prim, unreduced_rule, out_s, num_out, *avals, **kwargs):
   if unreduced_rule is not None:
     return unreduced_rule(out_s, *avals, **kwargs)
 
   if any(a.sharding.spec.unreduced for a in avals):
     raise NotImplementedError(
         f'unreduced rule for {prim.name} is not implemented. Please file an'
         ' issue at https://github.com/jax-ml/jax/issues')
+  if any(s.spec.unreduced for s in ([out_s] if num_out is None else out_s)):
+    raise NotImplementedError(
+        f'unreduced rule for {prim.name} is not implemented. Please file an'
+        ' issue at https://github.com/jax-ml/jax/issues')
   return out_s
 
 def call_sharding_rule(prim, sh_rule, unreduced_rule, num_out, *avals, **kwargs):

@@ -85,9 +89,11 @@ def call_sharding_rule(prim, sh_rule, unreduced_rule, num_out, *avals, **kwargs)
   if ((cur_mesh.empty or cur_mesh._are_all_axes_auto_or_manual) and
       (aval_mesh.empty or aval_mesh._are_all_axes_auto_or_manual)):
     aval_mesh = cur_mesh if aval_mesh.empty else aval_mesh
-    s = NamedSharding(aval_mesh, P())
-    s = call_unreduced_rule(prim, unreduced_rule, s, *avals, **kwargs)
-    return s if num_out is None else [s] * num_out
+    out_s = NamedSharding(aval_mesh, P())
+    out_s = out_s if num_out is None else [out_s] * num_out
+    out_s = call_unreduced_rule(prim, unreduced_rule, out_s, num_out,
+                                *avals, **kwargs)
+    return out_s
   if sh_rule is None:
     raise core.ShardingTypeError(
         f'sharding rule for {prim.name} is not implemented. Please file an'

@@ -96,7 +102,7 @@ def call_sharding_rule(prim, sh_rule, unreduced_rule, num_out, *avals, **kwargs)
         ' mode via: `jax.sharding.auto_axes(fun, out_shardings=...)`')
   out_sharding = sh_rule(*avals, **kwargs)
   out_sharding = call_unreduced_rule(prim, unreduced_rule, out_sharding,
-                                     *avals, **kwargs)
+                                     num_out, *avals, **kwargs)
   return out_sharding
 
 def call_shape_dtype_sharding_rule(prim, shape_rule, dtype_rule, sharding_rule,
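
Restating the new plumbing in call_unreduced_rule: out_s is a single sharding when num_out is None and a list of per-output shardings otherwise, so the check first normalizes to a list; if any input or requested output carries an unreduced axis and the primitive has no unreduced rule, a NotImplementedError is raised. A hypothetical helper (not part of JAX) capturing just that normalization:

def _any_output_unreduced(out_s, num_out):
  # out_s: one sharding when num_out is None, else a list with one sharding per output.
  outs = [out_s] if num_out is None else out_s
  return any(s.spec.unreduced for s in outs)

# Hypothetical usage, assuming NamedSharding/P as in the hunk above:
#   _any_output_unreduced(NamedSharding(mesh, P(unreduced={'x'})), None)  -> True
#   _any_output_unreduced([NamedSharding(mesh, P())] * 2, 2)              -> False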

tests/BUILD

Lines changed: 2 additions & 2 deletions

@@ -425,9 +425,9 @@ jax_multiplatform_test(
         "gpu_h100x2",
     ],
     shard_count = {
-        "cpu": 3,
+        "cpu": 5,
         "gpu": 2,
-        "tpu": 2,
+        "tpu": 5,
     },
     tags = ["multiaccelerator"],
     deps = [

tests/pjit_test.py

Lines changed: 26 additions & 11 deletions

@@ -9375,8 +9375,12 @@ def f(x, y):
                              ' same'):
       f(arr1, arr2)
 
+  @parameterized.named_parameters(
+      ('custom_vjp', True),
+      ('grad', False),
+  )
   @jtu.with_explicit_mesh((2,), 'x')
-  def test_scan_over_layers_minibatch_unreduced(self, mesh):
+  def test_scan_over_layers_minibatch_unreduced(self, use_custom_vjp, mesh):
     if ifrt_version < 30:
       self.skipTest('Requires ifrt_version >= 30')
     if not jtu.if_cloud_tpu_at_least(2025, 9, 21):

@@ -9385,17 +9389,21 @@ def test_scan_over_layers_minibatch_unreduced(self, mesh):
     def assert_unreduced(val):
       self.assertEqual(val.aval.sharding.spec.unreduced, {'x'})
 
-    @jax.custom_vjp
-    def f(xs, w):
-      return jnp.dot(xs, w)
+    if use_custom_vjp:
+      @jax.custom_vjp
+      def f(xs, w):
+        return jnp.dot(xs, w)
 
-    def f_fwd(xs, w):
-      return f(xs, w), (xs, w)
+      def f_fwd(xs, w):
+        return f(xs, w), (xs, w)
 
-    def f_bwd(res, g):
-      xs, w = res
-      return jnp.dot(g, w), jnp.dot(xs.T, g, out_sharding=P(unreduced={'x'}))
-    f.defvjp(f_fwd, f_bwd)
+      def f_bwd(res, g):
+        xs, w = res
+        return jnp.dot(g, w), jnp.dot(xs.T, g, out_sharding=P(unreduced={'x'}))
+      f.defvjp(f_fwd, f_bwd)
+    else:
+      def f(xs, w):
+        return jnp.dot(xs, w)
 
     def model(stacked_ws, xs_mubatch):
       def scan_over_layers(carry_xs, w):

@@ -9423,7 +9431,14 @@ def mubatch_loop_body(stacked_grad_acc, xs_mubatch):
       return jax.tree.map(
           lambda W, g: W - g * 0.01, stacked_ws, stacked_grad_acc)
 
-    ws = tuple(jax.device_put(jnp.ones((4, 4)), P()) for _ in range(4))
+    if use_custom_vjp:
+      ws = tuple(jax.device_put(jnp.ones((4, 4)), P()) for _ in range(4))
+    else:
+      # Mark `w` with `reduced={'x'}` so that on the bwd pass we will induce
+      # an `unreduced={'x'}`.
+      ws = tuple(jax.device_put(jnp.ones((4, 4)), P(reduced={'x'}))
+                 for _ in range(4))
+
     xs = jax.device_put(jnp.ones((2, 2, 4)), P(None, 'x', None))
     stacked_ws = jnp.stack(ws, axis=0)
     step(stacked_ws, xs)  # doesn't crash
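
The new grad branch in a nutshell: with the weights annotated P(reduced={'x'}), jax.grad is expected to produce a weight gradient carrying unreduced={'x'} (per-shard partial sums not yet all-reduced), which is what the custom_vjp branch constructs explicitly via out_sharding=P(unreduced={'x'}). The condensed example below is hypothetical and separate from the test; it assumes an explicit-axes mesh built with jax.make_mesh(..., axis_types=...) and activated with jax.sharding.use_mesh, and a JAX build new enough to support the reduced/unreduced annotations used in this diff.

import jax
import jax.numpy as jnp
from jax.sharding import AxisType, PartitionSpec as P

mesh = jax.make_mesh((2,), ('x',), axis_types=(AxisType.Explicit,))
with jax.sharding.use_mesh(mesh):
  xs = jax.device_put(jnp.ones((2, 4)), P('x', None))      # batch split over 'x'
  w = jax.device_put(jnp.ones((4, 4)), P(reduced={'x'}))    # reduced annotation on 'x'

  def loss(w, xs):
    return jnp.sum(jnp.dot(xs, w))

  g = jax.grad(loss)(w, xs)
  # Expected under the assumptions above: g's sharding spec has unreduced == {'x'},
  # mirroring the assert_unreduced check in the test.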
