
Commit d5bf94e

apaszke authored and Google-ML-Automation committed
[Pallas:MGPU] Add support for collective scale/sparse metadata copies to TMEM
PiperOrigin-RevId: 841879848
1 parent 31e61ae · commit d5bf94e

File tree

3 files changed: +154 -8 lines changed

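The gist of the change: `async_copy_scales_to_tmem` and `async_copy_sparse_metadata_to_tmem` gain a `collective_axis` argument so that, in a 2-CTA cluster, the leader block can issue a single TMEM copy backing a collective `tcgen05_mma`. Below is a minimal usage sketch distilled from the test added in this commit (tests/pallas/mosaic_gpu_test.py); the ref and barrier names are placeholders taken from that test, not part of the API.

# Minimal sketch (adapted from the new test): a kernel launched with
# cluster=(2,) and cluster_names=("x",). Ref and barrier names are illustrative.
cluster_idx = lax.axis_index("x")

@pl.when(cluster_idx == 0)  # only the leader block issues the copies and the MMA
def _leader_block():
  plgpu.barrier_wait(tma_barrier)
  plgpu.async_copy_scales_to_tmem(
      lhs_scales_smem, lhs_scales_tmem, collective_axis="x")
  plgpu.async_copy_scales_to_tmem(
      rhs_scales_smem, rhs_scales_tmem, collective_axis="x")
  plgpu.tcgen05_mma(
      acc_tmem, lhs_smem, plgpu.transpose_ref(rhs_smem, (1, 0)), mma_barrier,
      a_scale=lhs_scales_tmem, b_scale=rhs_scales_tmem,
      accumulate=False, collective_axis="x")

plgpu.barrier_wait(mma_barrier)  # both blocks wait before reading the accumulator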

jax/_src/pallas/mosaic_gpu/primitives.py

Lines changed: 26 additions & 6 deletions

@@ -53,6 +53,7 @@
 import numpy as np


+AxisName = jax_core.AxisName
 WARP_SIZE = 32
 WARPGROUP_SIZE = 128

@@ -3202,7 +3203,10 @@ def _async_store_tmem_lowering_rule_wg(
 async_copy_scales_to_tmem_p = jax_core.Primitive("async_copy_scales_to_tmem")
 async_copy_scales_to_tmem_p.multiple_results = True

-def async_copy_scales_to_tmem(smem_ref: _Ref, tmem_ref: _Ref):
+
+def async_copy_scales_to_tmem(
+    smem_ref: _Ref, tmem_ref: _Ref, collective_axis: AxisName | None = None,
+):
   """Copies the MMA scales from SMEM to TMEM.

   The copy is performed asynchronously and can be awaited by calling
@@ -3226,12 +3230,17 @@ def async_copy_scales_to_tmem(smem_ref: _Ref, tmem_ref: _Ref):
   async_copy_scales_to_tmem_p.bind(
       smem_ref, tmem_ref, *flat_smem_transforms, *flat_tmem_transforms,
       smem_tree=smem_transforms_treedef, tmem_tree=tmem_transforms_treedef,
+      collective_axis=collective_axis,
   )

+
 async_copy_sparse_metadata_to_tmem_p = jax_core.Primitive("async_copy_sparse_metadata_to_tmem")
 async_copy_sparse_metadata_to_tmem_p.multiple_results = True

-def async_copy_sparse_metadata_to_tmem(smem_ref: _Ref, tmem_ref: _Ref):
+
+def async_copy_sparse_metadata_to_tmem(
+    smem_ref: _Ref, tmem_ref: _Ref, collective_axis: AxisName | None = None
+):
   """Copies the MMA sparse metadata from SMEM to TMEM.

   The copy is performed asynchronously and can be awaited by calling
@@ -3255,19 +3264,21 @@ def async_copy_sparse_metadata_to_tmem(smem_ref: _Ref, tmem_ref: _Ref):
   async_copy_sparse_metadata_to_tmem_p.bind(
       smem_ref, tmem_ref, *flat_smem_transforms, *flat_tmem_transforms,
       smem_tree=smem_transforms_treedef, tmem_tree=tmem_transforms_treedef,
+      collective_axis=collective_axis,
   )

+
 @async_copy_scales_to_tmem_p.def_effectful_abstract_eval
 @async_copy_sparse_metadata_to_tmem_p.def_effectful_abstract_eval
-def _async_copy_to_tmem_abstract_eval(smem_ref, tmem_ref, *avals_flat, smem_tree, tmem_tree):
+def _async_copy_to_tmem_abstract_eval(smem_ref, tmem_ref, *_args, **_kwargs):
   if smem_ref.memory_space != gpu_core.MemorySpace.SMEM:
     raise ValueError("async_copy_scales_to_tmem source must be an SMEM ref")
   if tmem_ref.memory_space != gpu_core.MemorySpace.TMEM:
     raise ValueError("async_copy_scales_to_tmem target must be a TMEM ref")
   return (), {gpu_core._memory_effect}

 def _async_copy_to_tmem_lowering_rule(
-    impl, ctx: lowering.LoweringRuleContext, smem_ref, tmem_ref, *leaves, smem_tree, tmem_tree
+    impl, ctx: lowering.LoweringRuleContext, smem_ref, tmem_ref, *leaves, smem_tree, tmem_tree, collective_axis
 ):
   assert isinstance(tmem_ref, tcgen05.TMEMRef)
   smem_leaves, tmem_leaves = util.split_list(leaves, [smem_tree.num_leaves])
@@ -3279,8 +3290,17 @@ def _async_copy_to_tmem_lowering_rule(
     raise NotImplementedError(f"Unimplemented transforms for SMEM refs: {smem_transforms}")
   if tmem_transforms:
     raise NotImplementedError(f"Unimplemented transforms for TMEM refs: {tmem_transforms}")
-  with mgpu.when(ctx.module_ctx.single_lane_predicate):
-    impl(smem_ref, tmem_ref)
+
+  predicate = ctx.module_ctx.single_lane_predicate
+  if collective_axis is not None:
+    is_leader_block = _collective_mma_predicate(ctx, collective_axis)
+    predicate = arith_dialect.andi(predicate, is_leader_block)
+    collective = True
+  else:
+    collective = False
+
+  with mgpu.when(predicate):
+    impl(smem_ref, tmem_ref, collective=collective)
   return ()

 @lowering.register_lowering_rule(
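Reading the lowering rule above: the copy is always issued from a single lane, and when `collective_axis` is set that predicate is AND-ed with a leader-block predicate so only the cluster's leader block issues it, with the underlying copy switched to its collective form. An annotated restatement of the gating follows (the comments are added here, not part of the commit, and the note on the collective flag is an assumption about the underlying Mosaic GPU copy):

  predicate = ctx.module_ctx.single_lane_predicate  # one lane per block issues the copy
  if collective_axis is not None:
    # Restrict further to the leader block of the cluster along `collective_axis`.
    is_leader_block = _collective_mma_predicate(ctx, collective_axis)
    predicate = arith_dialect.andi(predicate, is_leader_block)
    collective = True  # presumably selects the CTA-pair variant of the tcgen05 copy
  else:
    collective = False

  with mgpu.when(predicate):
    impl(smem_ref, tmem_ref, collective=collective)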

tests/mosaic/gpu_test.py

Lines changed: 1 addition & 2 deletions

@@ -1760,11 +1760,10 @@ def format_scales(scales):

   @parameterized.product(
       m=(256,),
-      n=(64, 128, 256),
+      n=(256,),
       scale_jax_dtype=(jnp.float8_e8m0fnu, jnp.float8_e4m3fn),
   )
   def test_mma_block_scaled_collective(self, m, n, scale_jax_dtype):
-    m, n = 256, 256
     in_jax_dtype = jnp.float4_e2m1fn
     out_jax_dtype = jnp.float32
     scale_block = 32 if scale_jax_dtype == jnp.float8_e8m0fnu else 16

tests/pallas/mosaic_gpu_test.py

Lines changed: 127 additions & 0 deletions

@@ -3768,6 +3768,133 @@ def format_scales(scales):
     )
     np.testing.assert_allclose(result, expected, rtol=1e-3)

+  @parameterized.product(
+      m=[256],
+      n=[256],
+      scale_jax_dtype=[jnp.float8_e8m0fnu, jnp.float8_e4m3fn],
+  )
+  def test_collective_scaled_matmul(self, m, n, scale_jax_dtype):
+    self.skip_if_wg_semantics()
+
+    in_jax_dtype = jnp.float4_e2m1fn
+    out_jax_dtype = jnp.float32
+    scale_block = 32 if scale_jax_dtype == jnp.float8_e8m0fnu else 16
+    swizzle = 128
+    k_steps = 2
+    swizzle_elems = 8 * swizzle // dtypes.itemsize_bits(in_jax_dtype)
+    k = swizzle_elems * k_steps
+    tiling = (8, swizzle_elems)
+    transforms = (
+        plgpu.TilingTransform(tiling), plgpu.SwizzleTransform(swizzle)
+    )
+    out_transforms = self.default_transforms(dtype=out_jax_dtype)
+
+    m_block = m // 2
+    n_block = n // 2
+
+    def kernel(lhs_gmem, rhs_gmem, lhs_scales_gmem, rhs_scales_gmem, out_gmem,
+               lhs_smem, rhs_smem, lhs_scales_smem, rhs_scales_smem, out_smem,
+               tma_barrier, mma_barrier,
+               acc_tmem, lhs_scales_tmem, rhs_scales_tmem):
+      plgpu.copy_gmem_to_smem(lhs_gmem, lhs_smem, tma_barrier,
+                              collective_axes="x", partitioned_axis=0)
+      plgpu.copy_gmem_to_smem(rhs_gmem, rhs_smem, tma_barrier,
+                              collective_axes="x", partitioned_axis=0)
+      plgpu.copy_gmem_to_smem(lhs_scales_gmem, lhs_scales_smem, tma_barrier,
+                              collective_axes="x", partitioned_axis=0)
+      # RHS scales are replicated (multicast)
+      plgpu.copy_gmem_to_smem(rhs_scales_gmem, rhs_scales_smem, tma_barrier,
+                              collective_axes="x", partitioned_axis=None)
+      cluster_idx = lax.axis_index("x")
+
+      @pl.when(cluster_idx == 0)
+      def _leader_block():
+        plgpu.barrier_wait(tma_barrier)
+        plgpu.async_copy_scales_to_tmem(lhs_scales_smem, lhs_scales_tmem, collective_axis="x")
+        plgpu.async_copy_scales_to_tmem(rhs_scales_smem, rhs_scales_tmem, collective_axis="x")
+        plgpu.tcgen05_mma(
+            acc_tmem,
+            lhs_smem,
+            plgpu.transpose_ref(rhs_smem, (1, 0)),
+            mma_barrier,
+            a_scale=lhs_scales_tmem,
+            b_scale=rhs_scales_tmem,
+            accumulate=False,
+            collective_axis="x"
+        )
+      plgpu.barrier_wait(mma_barrier)
+
+      out_smem[...] = plgpu.async_load_tmem(acc_tmem)
+      plgpu.commit_smem()
+      slice_out = pl.ds(cluster_idx * m_block, m_block)
+      plgpu.copy_smem_to_gmem(out_smem, out_gmem.at[slice_out, :])
+      plgpu.wait_smem_to_gmem(0)
+
+    scratch_shapes = [
+        plgpu.SMEM((m_block, k), in_jax_dtype, transforms=transforms),
+        plgpu.SMEM((n_block, k), in_jax_dtype, transforms=transforms),
+        plgpu.SMEM((m_block // 128, k // (scale_block * 4), 32, 16), scale_jax_dtype),
+        plgpu.SMEM((n // 128, k // (scale_block * 4), 32, 16), scale_jax_dtype),
+        plgpu.SMEM((m_block, n), out_jax_dtype, transforms=out_transforms),
+        plgpu.Barrier(num_arrivals=4),
+        plgpu.Barrier(orders_tensor_core=True),
+        plgpu.TMEM((m_block, n), out_jax_dtype, collective=True),
+        plgpu.TMEM((m_block, k // scale_block), scale_jax_dtype,
+                   layout=plgpu.TMEMLayout.SCALES_LAYOUT, collective=True),
+        plgpu.TMEM((n, k // scale_block), scale_jax_dtype,
+                   layout=plgpu.TMEMLayout.SCALES_LAYOUT, collective=True),
+    ]
+
+    f = self.kernel(
+        kernel,
+        out_shape=jax.ShapeDtypeStruct((m, n), out_jax_dtype),
+        grid=(1,),
+        grid_names=("_",),
+        cluster=(2,),
+        cluster_names=("x",),
+        scratch_shapes=scratch_shapes,
+    )
+
+    x = jax.random.uniform(jax.random.key(1), shape=(m, k), dtype=jnp.float32).astype(in_jax_dtype)
+    y = jax.random.uniform(jax.random.key(2), shape=(n, k), dtype=jnp.float32).astype(in_jax_dtype)
+
+    ka, kb = jax.random.split(jax.random.key(1234), 2)
+    if scale_jax_dtype == jnp.float8_e8m0fnu:
+      x_scale = jax.lax.bitcast_convert_type(
+          jax.random.randint(ka, (m, k // scale_block), 122, 132, dtype=jnp.uint8),
+          scale_jax_dtype
+      )
+      y_scale = jax.lax.bitcast_convert_type(
+          jax.random.randint(kb, (n, k // scale_block), 122, 132, dtype=jnp.uint8),
+          scale_jax_dtype
+      )
+    else:
+      x_scale = jnp.abs(
+          jax.random.normal(ka, (m, k // scale_block), dtype=jnp.float32).astype(scale_jax_dtype)
+      )
+      y_scale = jnp.abs(
+          jax.random.normal(kb, (n, k // scale_block), dtype=jnp.float32).astype(scale_jax_dtype)
+      )
+
+    def format_scales(scales):
+      mn, k = scales.shape
+      assert mn % 128 == 0 and k % 4 == 0
+      return (
+          scales.reshape(mn // 128, 4, 32, k // 4, 4)
+          .transpose(0, 3, 2, 1, 4)
+          .reshape(mn // 128, k // 4, 32, 16)
+      )
+
+    result = f(x, y, format_scales(x_scale), format_scales(y_scale))
+
+    x_logical_scale = jnp.repeat(x_scale, scale_block, axis=1).astype(jnp.float32)
+    y_logical_scale = jnp.repeat(y_scale, scale_block, axis=1).astype(jnp.float32)
+    expected = jnp.dot(
+        x.astype(jnp.float32) * x_logical_scale,
+        (y.astype(jnp.float32) * y_logical_scale).T,
+    )
+    np.testing.assert_allclose(result, expected, rtol=1e-3)
+
   @parameterized.product(
       m=[128],
       n=[128, 256],
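A quick shape check on the new test for the float8_e8m0fnu case (a sketch; the numbers follow directly from the constants used above):

# Shape arithmetic for the e8m0 parametrization of test_collective_scaled_matmul.
m = n = 256
swizzle, k_steps, scale_block = 128, 2, 32
swizzle_elems = 8 * swizzle // 4             # float4_e2m1fn is 4 bits wide -> 256
k = swizzle_elems * k_steps                  # 512
print((m, k // scale_block))                 # (256, 16): raw per-operand scale matrix
# format_scales rearranges (256, 16) into (mn // 128, k' // 4, 32, 16) = (2, 4, 32, 16).
# The LHS scales are partitioned across the two blocks, so each block's SMEM scratch
# is (1, 4, 32, 16); the RHS scales are replicated, hence the (2, 4, 32, 16) scratch.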
