@@ -1345,7 +1345,13 @@ def pmap(
     donate_argnums: int | Iterable[int] = (),
     global_arg_shapes: tuple[tuple[int, ...], ...] | None = None,
 ) -> Any:
-  """Parallel map with support for collective operations.
+  """Old way of doing parallel map. Use :py:func:`jax.shard_map` instead.
+
+  .. note::
+    While :py:func:`jax.pmap` works, you should probably use
+    :py:func:`jax.shard_map` or ``jax.smap`` instead. shard_map supports more
+    efficient autodiff, and is more composable in the multi-controller setting.
+    See https://docs.jax.dev/en/latest/notebooks/shard_map.html for examples.
 
   .. note::
     :py:func:`pmap` is now implemented in terms of :py:func:`jit` and
@@ -1510,26 +1516,6 @@ def pmap(
   are important particularly in the case of nested :py:func:`pmap` functions,
   where collective operations can operate over distinct axes:
 
-  >>> from functools import partial
-  >>> import jax
-  >>>
-  >>> @partial(pmap, axis_name='rows')
-  ... @partial(pmap, axis_name='cols')
-  ... def normalize(x):
-  ...   row_normed = x / jax.lax.psum(x, 'rows')
-  ...   col_normed = x / jax.lax.psum(x, 'cols')
-  ...   doubly_normed = x / jax.lax.psum(x, ('rows', 'cols'))
-  ...   return row_normed, col_normed, doubly_normed
-  >>>
-  >>> x = jnp.arange(8.).reshape((4, 2))
-  >>> row_normed, col_normed, doubly_normed = normalize(x)  # doctest: +SKIP
-  >>> print(row_normed.sum(0))  # doctest: +SKIP
-  [ 1. 1.]
-  >>> print(col_normed.sum(1))  # doctest: +SKIP
-  [ 1. 1. 1. 1.]
-  >>> print(doubly_normed.sum((0, 1)))  # doctest: +SKIP
-  1.0
-
   On multi-process platforms, collective operations operate over all devices,
   including those on other processes. For example, assuming the following code
   runs on two processes with 4 XLA devices each:
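
For reference, the nested-axes example removed above can be expressed with the recommended jax.shard_map API roughly as in the sketch below. This is an illustrative sketch rather than code from the JAX docs: it assumes a recent JAX release that exposes jax.shard_map and jax.make_mesh at the top level (older releases provide jax.experimental.shard_map), and a machine with 8 addressable devices.

import jax
import jax.numpy as jnp
from jax.sharding import PartitionSpec as P

# Assumption: 8 devices arranged as a 4x2 mesh; adjust the shape to your hardware.
mesh = jax.make_mesh((4, 2), ('rows', 'cols'))

def normalize(x):
  # Inside shard_map, collectives refer to mesh axis names directly.
  row_normed = x / jax.lax.psum(x, 'rows')
  col_normed = x / jax.lax.psum(x, 'cols')
  doubly_normed = x / jax.lax.psum(x, ('rows', 'cols'))
  return row_normed, col_normed, doubly_normed

# Each device sees a (1, 1) block of the (4, 2) input, mirroring the
# per-device scalars of the nested pmap version.
normalize_sharded = jax.shard_map(
    normalize, mesh=mesh,
    in_specs=P('rows', 'cols'), out_specs=P('rows', 'cols'))

x = jnp.arange(8.).reshape((4, 2))
row_normed, col_normed, doubly_normed = normalize_sharded(x)
print(row_normed.sum(0))          # expected: [1. 1.]
print(col_normed.sum(1))          # expected: [1. 1. 1. 1.]
print(doubly_normed.sum((0, 1)))  # expected: 1.0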