[SPARK-38491][PYTHON] Support ignore_index of Series.sort_values

### What changes were proposed in this pull request? Support `ignore_index` of `Series.sort_values`, in which the resulting axis will be labeled `0, 1, …, n - 1`. ### Why are the changes needed? To reach parity with pandas. Older pandas support `ignore_index` as well: ```py >>> pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5, None, 7], "b": [7, 6, 5, 4, 3, 2, 1]}, index=np.random.rand(7)) >>> pdf.sort_values("b", ignore_index=True) a b 0 7.0 1 1 NaN 2 2 5.0 3 3 4.0 4 4 3.0 5 5 2.0 6 6 1.0 7 >>> pd.__version__ '1.0.0' ``` ### Does this PR introduce _any_ user-facing change? Yes. `ignore_index` of `Series.sort_values` is supported. ```py >>> psdf = ps.DataFrame({"a": [1, 2, 3, 4, 5, None, 7], "b": [7, 6, 5, 4, 3, 2, 1]}, index=np.random.rand(7)) >>> psdf a b 0.971253 1.0 7 0.401039 2.0 6 0.322310 3.0 5 0.932521 4.0 4 0.058432 5.0 3 0.122754 NaN 2 0.842971 7.0 1 >>> psdf.sort_values("b", ignore_index=True) a b 0 7.0 1 1 NaN 2 2 5.0 3 3 4.0 4 4 3.0 5 5 2.0 6 6 1.0 7 ``` ### How was this patch tested? Unit tests. Closes apache#35794 from xinrong-databricks/frame.sort_values. Authored-by: Xinrong Meng <[email protected]> Signed-off-by: Hyukjin Kwon <[email protected]>
mantov5 · Mar 11, 2022 · 36023c2 · 36023c2
1 parent 34e3029
commit 36023c2
Show file tree

Hide file tree

Showing 2 changed files with 73 additions and 14 deletions.
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
@@ -6855,6 +6855,7 @@ def sort_values(
         ascending: Union[bool, List[bool]] = True,
         inplace: bool = False,
         na_position: str = "last",
+        ignore_index: bool = False,
     ) -> Optional["DataFrame"]:
         """
         Sort by the values along either axis.
@@ -6870,6 +6871,8 @@ def sort_values(
              if True, perform operation in-place
         na_position : {'first', 'last'}, default 'last'
              `first` puts NaNs at the beginning, `last` puts NaNs at the end
+        ignore_index : bool, default False
+            If True, the resulting axis will be labeled 0, 1, …, n - 1.
 
         Returns
         -------
@@ -6882,34 +6885,45 @@ def sort_values(
         ...     'col2': [2, 9, 8, 7, 4],
         ...     'col3': [0, 9, 4, 2, 3],
         ...   },
-        ...   columns=['col1', 'col2', 'col3'])
+        ...   columns=['col1', 'col2', 'col3'],
+        ...   index=['a', 'b', 'c', 'd', 'e'])
         >>> df
            col1  col2  col3
-        0     A     2     0
-        1     B     9     9
-        2  None     8     4
-        3     D     7     2
-        4     C     4     3
+        a     A     2     0
+        b     B     9     9
+        c  None     8     4
+        d     D     7     2
+        e     C     4     3
 
         Sort by col1
 
         >>> df.sort_values(by=['col1'])
            col1  col2  col3
+        a     A     2     0
+        b     B     9     9
+        e     C     4     3
+        d     D     7     2
+        c  None     8     4
+
+        Ignore index for the resulting axis
+
+        >>> df.sort_values(by=['col1'], ignore_index=True)
+           col1  col2  col3
         0     A     2     0
         1     B     9     9
-        4     C     4     3
+        2     C     4     3
         3     D     7     2
-        2  None     8     4
+        4  None     8     4
 
         Sort Descending
 
         >>> df.sort_values(by='col1', ascending=False)
            col1  col2  col3
-        3     D     7     2
-        4     C     4     3
-        1     B     9     9
-        0     A     2     0
-        2  None     8     4
+        d     D     7     2
+        e     C     4     3
+        b     B     9     9
+        a     A     2     0
+        c  None     8     4
 
         Sort by multiple columns
 
@@ -6945,11 +6959,14 @@ def sort_values(
             new_by.append(ser.spark.column)
 
         psdf = self._sort(by=new_by, ascending=ascending, na_position=na_position)
+
         if inplace:
+            if ignore_index:
+                psdf.reset_index(drop=True, inplace=inplace)
             self._update_internal_frame(psdf._internal)
             return None
         else:
-            return psdf
+            return psdf.reset_index(drop=True) if ignore_index else psdf
 
     def sort_index(
         self,

diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py
@@ -1558,6 +1558,9 @@ def test_sort_values(self):
         psdf = ps.from_pandas(pdf)
 
         self.assert_eq(psdf.sort_values("b"), pdf.sort_values("b"))
+        self.assert_eq(
+            psdf.sort_values("b", ignore_index=True), pdf.sort_values("b", ignore_index=True)
+        )
 
         for ascending in [True, False]:
             for na_position in ["first", "last"]:
@@ -1567,6 +1570,10 @@ def test_sort_values(self):
                 )
 
         self.assert_eq(psdf.sort_values(["a", "b"]), pdf.sort_values(["a", "b"]))
+        self.assert_eq(
+            psdf.sort_values(["a", "b"], ignore_index=True),
+            pdf.sort_values(["a", "b"], ignore_index=True),
+        )
         self.assert_eq(
             psdf.sort_values(["a", "b"], ascending=[False, True]),
             pdf.sort_values(["a", "b"], ascending=[False, True]),
@@ -1587,6 +1594,41 @@ def test_sort_values(self):
         self.assert_eq(psdf, pdf)
         self.assert_eq(psserA, pserA)
 
+        pdf = pd.DataFrame(
+            {"a": [1, 2, 3, 4, 5, None, 7], "b": [7, 6, 5, 4, 3, 2, 1]}, index=np.random.rand(7)
+        )
+        psdf = ps.from_pandas(pdf)
+        pserA = pdf.a
+        psserA = psdf.a
+        self.assert_eq(
+            psdf.sort_values("b", inplace=True, ignore_index=True),
+            pdf.sort_values("b", inplace=True, ignore_index=True),
+        )
+        self.assert_eq(psdf, pdf)
+        self.assert_eq(psserA, pserA)
+
+        # multi-index indexes
+
+        pdf = pd.DataFrame(
+            {"a": [1, 2, 3, 4, 5, None, 7], "b": [7, 6, 5, 4, 3, 2, 1]},
+            index=pd.MultiIndex.from_tuples(
+                [
+                    ("bar", "one"),
+                    ("bar", "two"),
+                    ("baz", "one"),
+                    ("baz", "two"),
+                    ("foo", "one"),
+                    ("foo", "two"),
+                    ("qux", "one"),
+                ]
+            ),
+        )
+        psdf = ps.from_pandas(pdf)
+        self.assert_eq(psdf.sort_values("b"), pdf.sort_values("b"))
+        self.assert_eq(
+            psdf.sort_values("b", ignore_index=True), pdf.sort_values("b", ignore_index=True)
+        )
+
         # multi-index columns
         pdf = pd.DataFrame(
             {("X", 10): [1, 2, 3, 4, 5, None, 7], ("X", 20): [7, 6, 5, 4, 3, 2, 1]},