Merge remote-tracking branch 'upstream/main' into sum-string-dtype

AbelJSanchez · AbelJSanchez · commit ad91b9c54a78 · 2025-12-04T18:01:15.000-08:00
diff --git a/doc/source/user_guide/copy_on_write.rst b/doc/source/user_guide/copy_on_write.rst
@@ -249,9 +249,9 @@ two subsequent indexing operations, e.g.
     In [3]: df
     Out[3]:
        foo  bar
-    0  100    4
+    0    1    4
     1    2    5
-    2    3    6
+    2  100    6
 
 The column ``foo`` was updated where the column ``bar`` is greater than 5.
 This violated the CoW principles though, because it would have to modify the
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -1321,6 +1321,7 @@ Groupby/resample/rolling
 - Bug in :meth:`Series.resample` raising error when resampling non-nanosecond resolutions out of bounds for nanosecond precision (:issue:`57427`)
 - Bug in :meth:`Series.rolling.var` and :meth:`Series.rolling.std` computing incorrect results due to numerical instability. (:issue:`47721`, :issue:`52407`, :issue:`54518`, :issue:`55343`)
 - Bug in :meth:`DataFrame.groupby` methods when operating on NumPy-nullable data failing when the NA mask was not C-contiguous (:issue:`61031`)
+- Bug in :meth:`DataFrame.groupby` where modifying a Series used as the grouping key after calling :meth:`DataFrame.groupby`, but prior to the groupby operation, would change the result (:issue:`63219`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -948,7 +948,7 @@ def value_counts_internal(
             result = Series(counts, index=idx, name=name, copy=False)
 
     if sort:
-        result = result.sort_values(ascending=ascending)
+        result = result.sort_values(ascending=ascending, kind="stable")
 
     if normalize:
         result = result / counts.sum()
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -993,7 +993,12 @@ def value_counts(
             If True then the object returned will contain the relative
             frequencies of the unique values.
         sort : bool, default True
-            Sort by frequencies when True. Preserve the order of the data when False.
+            Stable sort by frequencies when True. Preserve the order of the data
+            when False.
+
+            .. versionchanged:: 3.0.0
+
+                Prior to 3.0.0, the sort was unstable.
         ascending : bool, default False
             Sort in ascending order.
         bins : int, optional
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -7761,11 +7761,16 @@ def value_counts(
         normalize : bool, default False
             Return proportions rather than frequencies.
         sort : bool, default True
-            Sort by frequencies when True. Preserve the order of the data when False.
+            Stable sort by frequencies when True. Preserve the order of the data
+            when False.
 
             .. versionchanged:: 3.0.0
 
                 Prior to 3.0.0, ``sort=False`` would sort by the columns values.
+
+            .. versionchanged:: 3.0.0
+
+                Prior to 3.0.0, the sort was unstable.
         ascending : bool, default False
             Sort in ascending order.
         dropna : bool, default True
@@ -7875,7 +7880,7 @@ def value_counts(
         counts.name = name
 
         if sort:
-            counts = counts.sort_values(ascending=ascending)
+            counts = counts.sort_values(ascending=ascending, kind="stable")
         if normalize:
             counts /= counts.sum()
 
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -2884,8 +2884,8 @@ def value_counts(
         normalize : bool, default False
             Return proportions rather than frequencies.
         sort : bool, default True
-            Sort by frequencies when True. When False, non-grouping columns will appear
-            in the order they occur in within groups.
+            Stable sort by frequencies when True. When False, non-grouping
+            columns will appear in the order they occur within groups.
 
             .. versionchanged:: 3.0.0
 
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
@@ -460,6 +460,8 @@ def __init__(
         dropna: bool = True,
         uniques: ArrayLike | None = None,
     ) -> None:
+        if isinstance(grouper, Series):
+            grouper = grouper.copy(deep=False)
         self.level = level
         self._orig_grouper = grouper
         grouping_vector = _convert_grouper(index, grouper)
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -2706,14 +2706,18 @@ def swaplevel(self, i=-2, j=-1) -> MultiIndex:
 
         Calling this method does not change the ordering of the values.
 
+        Default is to swap the last two levels of the MultiIndex.
+
         Parameters
         ----------
         i : int, str, default -2
             First level of index to be swapped. Can pass level name as string.
-            Type of parameters can be mixed.
+            Type of parameters can be mixed. If i is a negative int, the first
+            level is indexed relative to the end of the MultiIndex.
         j : int, str, default -1
             Second level of index to be swapped. Can pass level name as string.
-            Type of parameters can be mixed.
+            Type of parameters can be mixed. If j is a negative int, the second
+            level is indexed relative to the end of the MultiIndex.
 
         Returns
         -------
@@ -2729,20 +2733,33 @@ def swaplevel(self, i=-2, j=-1) -> MultiIndex:
         Examples
         --------
         >>> mi = pd.MultiIndex(
-        ...     levels=[["a", "b"], ["bb", "aa"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]
+        ...     levels=[["a", "b"], ["bb", "aa"], ["aaa", "bbb"]],
+        ...     codes=[[0, 0, 1, 1], [0, 1, 0, 1], [1, 0, 1, 0]],
         ... )
         >>> mi
-        MultiIndex([('a', 'bb'),
-                    ('a', 'aa'),
-                    ('b', 'bb'),
-                    ('b', 'aa')],
+        MultiIndex([('a', 'bb', 'bbb'),
+                    ('a', 'aa', 'aaa'),
+                    ('b', 'bb', 'bbb'),
+                    ('b', 'aa', 'aaa')],
                    )
-        >>> mi.swaplevel(0, 1)
-        MultiIndex([('bb', 'a'),
-                    ('aa', 'a'),
-                    ('bb', 'b'),
-                    ('aa', 'b')],
+        >>> mi.swaplevel()
+        MultiIndex([('a', 'bbb', 'bb'),
+                    ('a', 'aaa', 'aa'),
+                    ('b', 'bbb', 'bb'),
+                    ('b', 'aaa', 'aa')],
+                   )
+        >>> mi.swaplevel(0)
+        MultiIndex([('bbb', 'bb', 'a'),
+                    ('aaa', 'aa', 'a'),
+                    ('bbb', 'bb', 'b'),
+                    ('aaa', 'aa', 'b')],
                    )
+        >>> mi.swaplevel(0, 1)
+        MultiIndex([('bb', 'a', 'bbb'),
+                    ('aa', 'a', 'aaa'),
+                    ('bb', 'b', 'bbb'),
+                    ('aa', 'b', 'aaa')],
+                   )
         """
         new_levels = list(self.levels)
         new_codes = list(self.codes)
diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py
@@ -96,7 +96,7 @@ def read_feather(
     path : str, path object, or file-like object
         String, path object (implementing ``os.PathLike[str]``), or file-like
         object implementing a binary ``read()`` function. The string could be a URL.
-        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
+        Valid URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
         expected. A local file could be: ``file://localhost/path/to/table.feather``.
     columns : sequence, default None
         If not provided, all columns are read.
diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py
@@ -226,6 +226,19 @@ def test_groupby_column_index_in_references():
     tm.assert_frame_equal(result, expected)
 
 
+def test_groupby_modify_series():
+    # https://github.com/pandas-dev/pandas/issues/63219
+    # Modifying a Series after using it to groupby should not impact
+    # the groupby operation.
+    ser = Series([1, 2, 1])
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    gb = df.groupby(ser)
+    ser.iloc[0] = 100
+    result = gb.sum()
+    expected = DataFrame({"a": [4, 2], "b": [10, 5]}, index=[1, 2])
+    tm.assert_frame_equal(result, expected)
+
+
 def test_rename_columns():
     # Case: renaming columns returns a new dataframe
     # + afterwards modifying the result
diff --git a/pandas/tests/frame/methods/test_equals.py b/pandas/tests/frame/methods/test_equals.py
@@ -1,6 +1,7 @@
 import numpy as np
 
 from pandas import (
+    Categorical,
     DataFrame,
     date_range,
 )
@@ -83,3 +84,16 @@ def test_equals(self):
         df3 = df1.set_index(["floats"], append=True)
         df2 = df1.set_index(["floats"], append=True)
         assert df3.equals(df2)
+
+    def test_equals_categorical_categories_order(self):
+        cat1 = Categorical(["a", "b", "a"], categories=["a", "b"])
+        cat2 = Categorical(["a", "b", "a"], categories=["b", "a"])
+        df1 = DataFrame({"c": cat1})
+        df2 = DataFrame({"c": cat2})
+
+        assert df1.equals(df2)
+
+        cat3 = Categorical(["a", "b", "a"], categories=["a", "b", "c"])
+        df3 = DataFrame({"c": cat3})
+
+        assert not df1.equals(df3)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -1446,6 +1446,19 @@ def test_value_counts_series(self):
         )
         tm.assert_series_equal(result, expected)
 
+    def test_value_counts_stability(self):
+        # GH 63155
+        arr = np.random.default_rng(2).integers(0, 32, 64)
+        result = algos.value_counts_internal(arr, sort=True)
+
+        value_counts = Series(arr).value_counts(sort=False)
+        expected = value_counts.sort_values(ascending=False, kind="stable")
+        tm.assert_series_equal(result, expected)
+
+        unstable_sorted = value_counts.sort_values(ascending=False, kind="quicksort")
+        with pytest.raises(AssertionError):
+            tm.assert_series_equal(result, unstable_sorted)
+
 
 class TestDuplicated:
     def test_duplicated_with_nas(self):