[SPARK-40579][PS] GroupBy.first should skip NULLs

### What changes were proposed in this pull request?

Make `GroupBy.first` skip nulls.

### Why are the changes needed?

To fix the behavior difference between pandas and pandas-on-Spark:

```
In [1]:
   ...: import pandas as pd
   ...: import numpy as np
   ...: import pyspark.pandas as ps
   ...:
   ...: pdf = pd.DataFrame({"A": [1, 2, 1, 2],"B": [-1.5, np.nan, -3.2, 0.1],})
   ...: psdf = ps.from_pandas(pdf)
   ...:

In [2]: pdf.groupby("A").first()
Out[2]:
     B
A
1 -1.5
2  0.1

In [3]: psdf.groupby("A").first()
     B
A
1 -1.5
2  NaN
```

### Does this PR introduce _any_ user-facing change?

Yes, the updated `GroupBy.first` will skip NULLs.

### How was this patch tested?

Added a unit test (UT).

Closes apache#38017 from zhengruifeng/ps_first_skip_na.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
mrpengbei · Sep 28, 2022 · e932e0a · e932e0a
1 parent a96ac7e
commit e932e0a
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 1 deletion.
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
@@ -449,7 +449,8 @@ def first(self, numeric_only: Optional[bool] = False) -> FrameLike:
         2  False  3
         """
         return self._reduce_for_stat_function(
-            F.first, accepted_spark_types=(NumericType, BooleanType) if numeric_only else None
+            lambda col: F.first(col, ignorenulls=True),
+            accepted_spark_types=(NumericType, BooleanType) if numeric_only else None,
         )
 
     def last(self, numeric_only: Optional[bool] = False) -> FrameLike:

diff --git a/python/pyspark/pandas/tests/test_groupby.py b/python/pyspark/pandas/tests/test_groupby.py
@@ -1419,6 +1419,17 @@ def test_first(self):
         self._test_stat_func(lambda groupby_obj: groupby_obj.first(numeric_only=None))
         self._test_stat_func(lambda groupby_obj: groupby_obj.first(numeric_only=True))
 
+        pdf = pd.DataFrame(
+            {
+                "A": [1, 2, 1, 2],
+                "B": [-1.5, np.nan, -3.2, 0.1],
+            }
+        )
+        psdf = ps.from_pandas(pdf)
+        self.assert_eq(
+            pdf.groupby("A").first().sort_index(), psdf.groupby("A").first().sort_index()
+        )
+
     def test_last(self):
         self._test_stat_func(lambda groupby_obj: groupby_obj.last())
         self._test_stat_func(lambda groupby_obj: groupby_obj.last(numeric_only=None))