Skip to content

Commit

Permalink
[SPARK-38491][PYTHON] Support ignore_index of Series.sort_values
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?
Support the `ignore_index` parameter of `DataFrame.sort_values`. When `ignore_index=True`, the resulting axis is labeled `0, 1, …, n - 1` instead of keeping the original index labels.

### Why are the changes needed?
To reach parity with pandas.

Older pandas versions (for example, 1.0.0, shown below) support `ignore_index` as well:
```py
>>> pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5, None, 7], "b": [7, 6, 5, 4, 3, 2, 1]}, index=np.random.rand(7))
>>> pdf.sort_values("b", ignore_index=True)
     a  b
0  7.0  1
1  NaN  2
2  5.0  3
3  4.0  4
4  3.0  5
5  2.0  6
6  1.0  7
>>> pd.__version__
'1.0.0'
```

### Does this PR introduce _any_ user-facing change?
Yes. `DataFrame.sort_values` now supports the `ignore_index` parameter.

```py
>>> psdf = ps.DataFrame({"a": [1, 2, 3, 4, 5, None, 7], "b": [7, 6, 5, 4, 3, 2, 1]}, index=np.random.rand(7))
>>> psdf
            a  b
0.971253  1.0  7
0.401039  2.0  6
0.322310  3.0  5
0.932521  4.0  4
0.058432  5.0  3
0.122754  NaN  2
0.842971  7.0  1
>>> psdf.sort_values("b", ignore_index=True)
     a  b
0  7.0  1
1  NaN  2
2  5.0  3
3  4.0  4
4  3.0  5
5  2.0  6
6  1.0  7
```

### How was this patch tested?
Unit tests.

Closes apache#35794 from xinrong-databricks/frame.sort_values.

Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
  • Loading branch information
xinrong-meng authored and HyukjinKwon committed Mar 11, 2022
1 parent 34e3029 commit 36023c2
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 14 deletions.
45 changes: 31 additions & 14 deletions python/pyspark/pandas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6855,6 +6855,7 @@ def sort_values(
ascending: Union[bool, List[bool]] = True,
inplace: bool = False,
na_position: str = "last",
ignore_index: bool = False,
) -> Optional["DataFrame"]:
"""
Sort by the values along either axis.
Expand All @@ -6870,6 +6871,8 @@ def sort_values(
if True, perform operation in-place
na_position : {'first', 'last'}, default 'last'
`first` puts NaNs at the beginning, `last` puts NaNs at the end
ignore_index : bool, default False
If True, the resulting axis will be labeled 0, 1, …, n - 1.
Returns
-------
Expand All @@ -6882,34 +6885,45 @@ def sort_values(
... 'col2': [2, 9, 8, 7, 4],
... 'col3': [0, 9, 4, 2, 3],
... },
... columns=['col1', 'col2', 'col3'])
... columns=['col1', 'col2', 'col3'],
... index=['a', 'b', 'c', 'd', 'e'])
>>> df
col1 col2 col3
0 A 2 0
1 B 9 9
2 None 8 4
3 D 7 2
4 C 4 3
a A 2 0
b B 9 9
c None 8 4
d D 7 2
e C 4 3
Sort by col1
>>> df.sort_values(by=['col1'])
col1 col2 col3
a A 2 0
b B 9 9
e C 4 3
d D 7 2
c None 8 4
Ignore index for the resulting axis
>>> df.sort_values(by=['col1'], ignore_index=True)
col1 col2 col3
0 A 2 0
1 B 9 9
4 C 4 3
2 C 4 3
3 D 7 2
2 None 8 4
4 None 8 4
Sort Descending
>>> df.sort_values(by='col1', ascending=False)
col1 col2 col3
3 D 7 2
4 C 4 3
1 B 9 9
0 A 2 0
2 None 8 4
d D 7 2
e C 4 3
b B 9 9
a A 2 0
c None 8 4
Sort by multiple columns
Expand Down Expand Up @@ -6945,11 +6959,14 @@ def sort_values(
new_by.append(ser.spark.column)

psdf = self._sort(by=new_by, ascending=ascending, na_position=na_position)

if inplace:
if ignore_index:
psdf.reset_index(drop=True, inplace=inplace)
self._update_internal_frame(psdf._internal)
return None
else:
return psdf
return psdf.reset_index(drop=True) if ignore_index else psdf

def sort_index(
self,
Expand Down
42 changes: 42 additions & 0 deletions python/pyspark/pandas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1558,6 +1558,9 @@ def test_sort_values(self):
psdf = ps.from_pandas(pdf)

self.assert_eq(psdf.sort_values("b"), pdf.sort_values("b"))
self.assert_eq(
psdf.sort_values("b", ignore_index=True), pdf.sort_values("b", ignore_index=True)
)

for ascending in [True, False]:
for na_position in ["first", "last"]:
Expand All @@ -1567,6 +1570,10 @@ def test_sort_values(self):
)

self.assert_eq(psdf.sort_values(["a", "b"]), pdf.sort_values(["a", "b"]))
self.assert_eq(
psdf.sort_values(["a", "b"], ignore_index=True),
pdf.sort_values(["a", "b"], ignore_index=True),
)
self.assert_eq(
psdf.sort_values(["a", "b"], ascending=[False, True]),
pdf.sort_values(["a", "b"], ascending=[False, True]),
Expand All @@ -1587,6 +1594,41 @@ def test_sort_values(self):
self.assert_eq(psdf, pdf)
self.assert_eq(psserA, pserA)

pdf = pd.DataFrame(
{"a": [1, 2, 3, 4, 5, None, 7], "b": [7, 6, 5, 4, 3, 2, 1]}, index=np.random.rand(7)
)
psdf = ps.from_pandas(pdf)
pserA = pdf.a
psserA = psdf.a
self.assert_eq(
psdf.sort_values("b", inplace=True, ignore_index=True),
pdf.sort_values("b", inplace=True, ignore_index=True),
)
self.assert_eq(psdf, pdf)
self.assert_eq(psserA, pserA)

# multi-index indexes

pdf = pd.DataFrame(
{"a": [1, 2, 3, 4, 5, None, 7], "b": [7, 6, 5, 4, 3, 2, 1]},
index=pd.MultiIndex.from_tuples(
[
("bar", "one"),
("bar", "two"),
("baz", "one"),
("baz", "two"),
("foo", "one"),
("foo", "two"),
("qux", "one"),
]
),
)
psdf = ps.from_pandas(pdf)
self.assert_eq(psdf.sort_values("b"), pdf.sort_values("b"))
self.assert_eq(
psdf.sort_values("b", ignore_index=True), pdf.sort_values("b", ignore_index=True)
)

# multi-index columns
pdf = pd.DataFrame(
{("X", 10): [1, 2, 3, 4, 5, None, 7], ("X", 20): [7, 6, 5, 4, 3, 2, 1]},
Expand Down

0 comments on commit 36023c2

Please sign in to comment.