Commit b974342

yashk2810 authored and Google-ML-Automation committed
Add support for unreduced + scan over layers.
This requires preserving the unreduced property through broadcast_in_dim and dynamic_update_slice, which works because both operations are linear.

PiperOrigin-RevId: 814943483
1 parent a7afcf7 commit b974342
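
A minimal sketch of the pattern this commit enables, distilled from the new test in tests/pjit_test.py below: per-microbatch gradients are produced with their all-reduce over the data axis deferred (marked unreduced), accumulated through a lax.scan carry, and reduced once per batch. The mesh setup via jax.make_mesh / jax.sharding.use_mesh, the jax.sharding.reshard path, and the assumption of at least two devices stand in for the test's jtu.with_explicit_mesh decorator and are not part of this commit.

import jax
import jax.numpy as jnp
from jax.sharding import AxisType, PartitionSpec as P

# Assumption: at least two devices; the test uses jtu.with_explicit_mesh instead.
mesh = jax.make_mesh((2,), ('x',), axis_types=(AxisType.Explicit,))

with jax.sharding.use_mesh(mesh):
  w = jnp.ones((4, 4))                                          # one layer's weights
  xs = jax.device_put(jnp.ones((2, 8, 4)), P(None, 'x', None))  # microbatches, data split on 'x'

  def grad_mubatch(w, x_mb):
    # The contraction over the 'x'-sharded dimension is left unreduced:
    # each shard keeps a partial sum instead of all-reducing right away.
    return jnp.dot(x_mb.T, x_mb @ w, out_sharding=P(unreduced={'x'}))

  @jax.jit
  def accumulate(w, xs):
    # reshard mirrors the test's `reshard`; its exact import path may differ by JAX version.
    acc0 = jax.sharding.reshard(jnp.zeros_like(w), P(unreduced={'x'}))
    def body(acc, x_mb):
      return acc + grad_mubatch(w, x_mb), None                  # carry stays unreduced across steps
    acc, _ = jax.lax.scan(body, acc0, xs)
    return jax.sharding.reshard(acc, P())                       # one all-reduce per batch

  accumulate(w, xs)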

File tree

  jax/_src/core.py
  jax/_src/lax/control_flow/loops.py
  jax/_src/lax/lax.py
  jax/_src/lax/slicing.py
  tests/pjit_test.py

5 files changed: +83 -6 lines changed


jax/_src/core.py

Lines changed: 6 additions & 3 deletions
@@ -3090,7 +3090,9 @@ def _map_shaped_array(
   assert axis is None or aval.shape[axis] == size
   if axis is None:
     return aval
-  sharding = aval.sharding.update(spec=tuple_delete(aval.sharding.spec, axis))
+  aval_s = aval.sharding
+  sharding = aval_s.update(
+      spec=aval_s.spec.update(partitions=tuple_delete(aval_s.spec, axis)))
   return ShapedArray(tuple_delete(aval.shape, axis), aval.dtype,
                      weak_type=aval.weak_type, sharding=sharding, vma=aval.vma,
                      memory_space=aval.memory_space)
@@ -3101,8 +3103,9 @@ def _unmap_shaped_array(
   if axis is None:
     return aval
   elif type(axis) is int:
-    sharding = aval.sharding.update(spec=tuple_insert(
-        aval.sharding.spec, axis, explicit_mesh_axis))
+    aval_s = aval.sharding
+    sharding = aval_s.update(spec=aval_s.spec.update(partitions=tuple_insert(
+        aval_s.spec, axis, explicit_mesh_axis)))
     return ShapedArray(tuple_insert(aval.shape, axis, size), aval.dtype,
                        weak_type=aval.weak_type, sharding=sharding,
                        vma=aval.vma, memory_space=aval.memory_space)
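
The switch from rebuilding the spec tuple to spec.update(partitions=...) matters because the spec attached to an aval's sharding now carries the unreduced axis set alongside the per-dimension partitions; passing a bare tuple to sharding.update(spec=...) would construct a fresh spec and drop it. A toy sketch with hypothetical classes (not JAX's internals) of the invariant the new vmap map/unmap rules preserve:

from dataclasses import dataclass, replace

@dataclass(frozen=True)
class Spec:                      # stand-in for a partition spec with extra metadata
  partitions: tuple
  unreduced: frozenset = frozenset()
  def update(self, **kw):
    return replace(self, **kw)

def tuple_delete(t, i):
  return t[:i] + t[i + 1:]

spec = Spec(partitions=('x', None), unreduced=frozenset({'x'}))

# Old pattern: rebuild the spec from a bare tuple -> the unreduced set is lost.
rebuilt = Spec(partitions=tuple_delete(spec.partitions, 1))
assert rebuilt.unreduced == frozenset()

# New pattern: update only the partitions -> the unreduced set survives.
mapped = spec.update(partitions=tuple_delete(spec.partitions, 1))
assert mapped.unreduced == frozenset({'x'})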

jax/_src/lax/control_flow/loops.py

Lines changed: 3 additions & 1 deletion
@@ -531,6 +531,7 @@ def cond_fun(while_carry):
   # knows not to AR at the boundary of while. This is a no-op at the trace level
   # but during lowering time, it inserts an extra sharding constraint.
   carry = tree_map(_constrain_unreduced, carry)
+  ys = tree_map(_constrain_unreduced, ys)
   return [*carry, *ys]
 
 def _constrain_unreduced(val):
@@ -544,7 +545,8 @@ def _split_leading(sz, x):
 def _concat(a, b): return lax.concatenate([a, b], 0)
 
 def _empty_array(prefix, length_spec, aval):
-  sharding = aval.sharding.update(spec=(*length_spec, *aval.sharding.spec))
+  sharding = aval.sharding.update(spec=aval.sharding.spec.update(
+      partitions=(*length_spec, *aval.sharding.spec)))
   # TODO(yashkatariya): Replace `lax.empty2` with `lax.empty` once
   # AllocateBuffer issues are fixed. Also delete `empty2` after this usage is
   # removed. Basically uncomment the following 2 lines.

jax/_src/lax/lax.py

Lines changed: 2 additions & 1 deletion
@@ -6528,7 +6528,8 @@ def _broadcast_in_dim_sharding_rule(operand, *, shape, broadcast_dimensions,
   orig_spec = iter(operand.sharding.spec)
   new_spec = [next(orig_spec) if i in bds else None for i in range(len(shape))]
   assert next(orig_spec, None) is None
-  return operand.sharding.update(spec=new_spec)
+  return operand.sharding.update(
+      spec=operand.sharding.spec.update(partitions=new_spec))
 
 def _broadcast_in_dim_typecheck_rule(
     _, operand, *dyn_shape, shape, broadcast_dimensions, sharding):
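
The dimension mapping in this rule is unchanged: each operand dimension's spec entry lands at its position in broadcast_dimensions and every newly broadcast dimension gets None; the only change is routing the result through spec.update(partitions=...) so any unreduced axes on the operand's spec carry over to the output. A worked example of the comprehension above, with made-up shapes:

# Hypothetical operand: shape (8, 4) sharded ('x', None), broadcast to (2, 8, 4).
operand_spec = ('x', None)
shape = (2, 8, 4)
broadcast_dimensions = (1, 2)    # operand dim i maps to output dim broadcast_dimensions[i]

bds = set(broadcast_dimensions)
orig_spec = iter(operand_spec)
new_spec = [next(orig_spec) if i in bds else None for i in range(len(shape))]
assert next(orig_spec, None) is None   # every operand entry was consumed
assert new_spec == [None, 'x', None]   # the new leading dimension is unsharded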

jax/_src/lax/slicing.py

Lines changed: 11 additions & 1 deletion
@@ -1670,6 +1670,15 @@ def _dynamic_update_slice_sharding_rule(operand, update, *start_indices):
       f" {update.str_short(mesh_axis_types=True)}.")
   return operand.sharding
 
+def _dynamic_update_slice_unreduced_rule(out_s, operand, update, *start_indices):
+  if operand.sharding.spec.unreduced != update.sharding.spec.unreduced:
+    raise core.ShardingTypeError(
+        "dynamic_update_slice operand and update must be unreduced along the"
+        " same axes. Got operand sharding"
+        f" {operand.str_short(mesh_axis_types=True)} and update sharding"
+        f" {update.str_short(mesh_axis_types=True)}.")
+  return out_s
+
 def _dynamic_update_slice_dtype_rule(operand, update, *start_indices):
   lax.check_same_dtypes("dynamic_update_slice", operand, update)
   if any(i.dtype != start_indices[0].dtype or
@@ -1735,7 +1744,8 @@ def _dynamic_update_slice_batching_rule(batched_args, batch_dims):
 dynamic_update_slice_p = standard_primitive(
     _dynamic_update_slice_shape_rule, _dynamic_update_slice_dtype_rule,
     'dynamic_update_slice', sharding_rule=_dynamic_update_slice_sharding_rule,
-    vma_rule=partial(core.standard_vma_rule, 'dynamic_update_slice'))
+    vma_rule=partial(core.standard_vma_rule, 'dynamic_update_slice'),
+    unreduced_rule=_dynamic_update_slice_unreduced_rule)
 ad.primitive_jvps[dynamic_update_slice_p] = _dynamic_update_slice_jvp
 ad.primitive_transposes[dynamic_update_slice_p] = \
     _dynamic_update_slice_transpose_rule
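
The new unreduced rule only checks consistency: the buffer being updated and the update must be unreduced over exactly the same mesh axes, since writing a fully reduced value into an unreduced buffer (or vice versa) would make the deferred all-reduce produce the wrong value for that slice. A minimal sketch of that check with a hypothetical helper (not the rule's real signature):

def check_same_unreduced(operand_unreduced: frozenset, update_unreduced: frozenset) -> None:
  # Mirrors the invariant enforced by _dynamic_update_slice_unreduced_rule.
  if operand_unreduced != update_unreduced:
    raise ValueError(
        "dynamic_update_slice operand and update must be unreduced along the "
        f"same axes; got {set(operand_unreduced)} vs {set(update_unreduced)}")

check_same_unreduced(frozenset({'x'}), frozenset({'x'}))  # OK
# check_same_unreduced(frozenset({'x'}), frozenset())     # would raise ValueError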

tests/pjit_test.py

Lines changed: 61 additions & 0 deletions
@@ -9375,6 +9375,67 @@ def f(x, y):
         ' same'):
       f(arr1, arr2)
 
+  @jtu.with_explicit_mesh((2,), 'x')
+  def test_scan_over_layers_minibatch_unreduced(self, mesh):
+    if ifrt_version < 30:
+      self.skipTest('Requires ifrt_version >= 30')
+    if not jtu.if_cloud_tpu_at_least(2025, 9, 21):
+      self.skipTest("Requires libtpu built after 2025-09-21")
+
+    def assert_unreduced(val):
+      self.assertEqual(val.aval.sharding.spec.unreduced, {'x'})
+
+    @jax.custom_vjp
+    def f(xs, w):
+      return jnp.dot(xs, w)
+
+    def f_fwd(xs, w):
+      return f(xs, w), (xs, w)
+
+    def f_bwd(res, g):
+      xs, w = res
+      return jnp.dot(g, w), jnp.dot(xs.T, g, out_sharding=P(unreduced={'x'}))
+    f.defvjp(f_fwd, f_bwd)
+
+    def model(stacked_ws, xs_mubatch):
+      def scan_over_layers(carry_xs, w):
+        return f(carry_xs, w), None
+      final_xs, _ = jax.lax.scan(scan_over_layers, xs_mubatch, stacked_ws)
+      return jnp.sum(final_xs)
+
+    @partial(jax.jit, donate_argnums=(0,))
+    def step(stacked_ws, xs):
+      def mubatch_loop_body(stacked_grad_acc, xs_mubatch):
+        grad = jax.grad(model)(stacked_ws, xs_mubatch)
+        assert_unreduced(grad)
+        assert_unreduced(stacked_grad_acc)
+        stacked_grad_acc = jax.tree.map(jnp.add, stacked_grad_acc, grad)
+        assert_unreduced(stacked_grad_acc)
+        return stacked_grad_acc, None
+
+      stacked_grad_acc = jax.tree.map(jnp.zeros_like, stacked_ws)
+      stacked_grad_acc = reshard(stacked_grad_acc, P(unreduced={'x'}))
+      stacked_grad_acc, _ = jax.lax.scan(
+          mubatch_loop_body, stacked_grad_acc, xs)
+      assert_unreduced(stacked_grad_acc)
+      # AR once for a batch
+      stacked_grad_acc = reshard(stacked_grad_acc, P())
+      return jax.tree.map(
+          lambda W, g: W - g * 0.01, stacked_ws, stacked_grad_acc)
+
+    ws = tuple(jax.device_put(jnp.ones((4, 4)), P()) for _ in range(4))
+    xs = jax.device_put(jnp.ones((2, 2, 4)), P(None, 'x', None))
+    stacked_ws = jnp.stack(ws, axis=0)
+    step(stacked_ws, xs)  # doesn't crash
+
+    compiled_text = step.lower(stacked_ws, xs).compile().as_text()
+    if compiled_text is not None:
+      if jtu.test_device_matches(['gpu']):
+        self.assertEqual(compiled_text.count('all-reduce-start('), 1)
+        self.assertEqual(compiled_text.count('all-reduce-done('), 1)
+      else:
+        self.assertEqual(compiled_text.count('all-reduce('), 1)
+
 
 @jtu.pytest_mark_if_available('multiaccelerator')
 class PJitErrorTest(jtu.JaxTestCase):
