[SPARK-32511][FOLLOW-UP][SQL][R][PYTHON] Add dropFields to SparkR and PySpark

### What changes were proposed in this pull request?

This PR adds a `dropFields` method (usage sketch below) to:

- PySpark `Column`
- SparkR `Column`
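
For illustration only, a minimal PySpark sketch of the new method, mirroring the doctests and unit tests added in this patch. It assumes a Spark build that includes this change (3.1.0+); the `SparkSession` setup and sample data are illustrative and not part of the PR.

```python
# Hedged sketch: session setup and data are illustrative, not part of this PR.
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[1]").appName("dropFields-sketch").getOrCreate()
df = spark.createDataFrame([Row(a=Row(b=1, c=2, d=Row(e=3, f=4)))])

# Drop a top-level field of the struct column "a" ...
df.withColumn("a", df["a"].dropFields("b")).show()

# ... or a nested field, using dot notation.
df.withColumn("a", df["a"].dropFields("d.e")).show()

spark.stop()
```

The SparkR equivalent is `dropFields(df$a, "b")`, as exercised by the new `test_sparkSQL.R` cases in this diff.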

### Why are the changes needed?

Feature parity.

### Does this PR introduce _any_ user-facing change?

No changes to existing behavior; this only adds a new API.

### How was this patch tested?

- New unit tests.
- Manual verification of examples / doctests.
- Manual run of MyPy tests.

Closes apache#29967 from zero323/SPARK-32511-FOLLOW-UP-PYSPARK-SPARKR.

Authored-by: zero323 <[email protected]>
Signed-off-by: HyukjinKwon <[email protected]>
zero323 authored and HyukjinKwon committed Oct 8, 2020
1 parent 37e1b0c commit 473b3ba
Showing 7 changed files with 165 additions and 1 deletion.
1 change: 1 addition & 0 deletions R/pkg/NAMESPACE
@@ -272,6 +272,7 @@ exportMethods("%<=>%",
"degrees",
"dense_rank",
"desc",
"dropFields",
"element_at",
"encode",
"endsWith",
69 changes: 69 additions & 0 deletions R/pkg/R/column.R
@@ -387,3 +387,72 @@ setMethod("withField",
jc <- callJMethod(x@jc, "withField", fieldName, col@jc)
column(jc)
})

#' dropFields
#'
#' Drops fields in a struct \code{Column} by name.
#'
#' @param x a Column
#' @param ... names of the fields to be dropped.
#'
#' @rdname dropFields
#' @aliases dropFields dropFields,Column-method
#' @examples
#' \dontrun{
#' df <- select(
#' createDataFrame(iris),
#' alias(
#' struct(
#' column("Sepal_Width"), column("Sepal_Length"),
#' alias(
#' struct(
#' column("Petal_Width"), column("Petal_Length"),
#' alias(
#' column("Petal_Width") * column("Petal_Length"),
#' "Petal_Product"
#' )
#' ),
#' "Petal"
#' )
#' ),
#' "dimensions"
#' )
#' )
#' head(withColumn(df, "dimensions", dropFields(df$dimensions, "Petal")))
#'
#' head(
#' withColumn(
#' df, "dimensions",
#' dropFields(df$dimensions, "Sepal_Width", "Sepal_Length")
#' )
#' )
#'
#' # This method supports dropping multiple nested fields directly e.g.
#' head(
#' withColumn(
#' df, "dimensions",
#' dropFields(df$dimensions, "Petal.Petal_Width", "Petal.Petal_Length")
#' )
#' )
#'
#' # However, if you are going to add/replace multiple nested fields,
#' # it is preferred to extract out the nested struct before
#' # adding/replacing multiple fields e.g.
#' head(
#' withColumn(
#' df, "dimensions",
#' withField(
#' column("dimensions"),
#' "Petal",
#' dropFields(column("dimensions.Petal"), "Petal_Width", "Petal_Length")
#' )
#' )
#' )
#' }
#' @note dropFields since 3.1.0
setMethod("dropFields",
signature(x = "Column"),
function(x, ...) {
jc <- callJMethod(x@jc, "dropFields", list(...))
column(jc)
})
3 changes: 3 additions & 0 deletions R/pkg/R/generics.R
@@ -732,6 +732,9 @@ setGeneric("%<=>%", function(x, value) { standardGeneric("%<=>%") })
#' @rdname withField
setGeneric("withField", function(x, fieldName, col) { standardGeneric("withField") })

#' @rdname dropFields
setGeneric("dropFields", function(x, ...) { standardGeneric("dropFields") })

###################### WindowSpec Methods ##########################

#' @rdname partitionBy
19 changes: 18 additions & 1 deletion R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1809,7 +1809,7 @@ test_that("column functions", {
expect_equal(actual, expected)

# Test withField
lines <- c("{\"Person\": {\"name\":\"Bob\", \"age\":24}}")
lines <- c("{\"Person\": {\"name\":\"Bob\", \"age\":24, \"height\": 170}}")
jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
writeLines(lines, jsonPath)
df <- read.df(jsonPath, "json")
@@ -1820,6 +1820,23 @@
)
)
expect_equal(result, data.frame(dummy = 42))

# Test dropFields
expect_setequal(
colnames(select(
withColumn(df, "Person", dropFields(df$Person, "age")),
column("Person.*")
)),
c("name", "height")
)

expect_equal(
colnames(select(
withColumn(df, "Person", dropFields(df$Person, "height", "name")),
column("Person.*")
)),
"age"
)
})

test_that("column binary mathfunctions", {
51 changes: 51 additions & 0 deletions python/pyspark/sql/column.py
@@ -358,6 +358,57 @@ def withField(self, fieldName, col):

return Column(self._jc.withField(fieldName, col._jc))

@since(3.1)
def dropFields(self, *fieldNames):
"""
An expression that drops fields in :class:`StructType` by name.

>>> from pyspark.sql import Row
>>> from pyspark.sql.functions import col, lit
>>> df = spark.createDataFrame([
... Row(a=Row(b=1, c=2, d=3, e=Row(f=4, g=5, h=6)))])
>>> df.withColumn('a', df['a'].dropFields('b')).show()
+-----------------+
| a|
+-----------------+
|{2, 3, {4, 5, 6}}|
+-----------------+
>>> df.withColumn('a', df['a'].dropFields('b', 'c')).show()
+--------------+
| a|
+--------------+
|{3, {4, 5, 6}}|
+--------------+

This method supports dropping multiple nested fields directly e.g.
>>> df.withColumn("a", col("a").dropFields("e.g", "e.h")).show()
+--------------+
| a|
+--------------+
|{1, 2, 3, {4}}|
+--------------+

However, if you are going to add/replace multiple nested fields,
it is preferred to extract out the nested struct before
adding/replacing multiple fields e.g.
>>> df.select(col("a").withField(
... "e", col("a.e").dropFields("g", "h")).alias("a")
... ).show()
+--------------+
| a|
+--------------+
|{1, 2, 3, {4}}|
+--------------+
"""
sc = SparkContext._active_spark_context

jc = self._jc.dropFields(_to_seq(sc, fieldNames))
return Column(jc)

def __getattr__(self, item):
if item.startswith("__"):
raise AttributeError(item)
1 change: 1 addition & 0 deletions python/pyspark/sql/column.pyi
@@ -80,6 +80,7 @@ class Column:
def getItem(self, key: Any) -> Column: ...
def getField(self, name: Any) -> Column: ...
def withField(self, fieldName: str, col: Column) -> Column: ...
def dropFields(self, *fieldNames: str) -> Column: ...
def __getattr__(self, item: Any) -> Column: ...
def __iter__(self) -> None: ...
def rlike(self, item: str) -> Column: ...
22 changes: 22 additions & 0 deletions python/pyspark/sql/tests/test_column.py
@@ -156,6 +156,28 @@ def test_with_field(self):
'fieldName should be a string',
lambda: df['a'].withField(col('b'), lit(3)))

def test_drop_fields(self):
df = self.spark.createDataFrame([Row(a=Row(b=1, c=2, d=Row(e=3, f=4)))])
self.assertIsInstance(df["a"].dropFields("b"), Column)
self.assertIsInstance(df["a"].dropFields("b", "c"), Column)
self.assertIsInstance(df["a"].dropFields("d.e"), Column)

result = df.select(
df["a"].dropFields("b").alias("a1"),
df["a"].dropFields("d.e").alias("a2"),
).first().asDict(True)

self.assertTrue(
"b" not in result["a1"] and
"c" in result["a1"] and
"d" in result["a1"]
)

self.assertTrue(
"e" not in result["a2"]["d"] and
"f" in result["a2"]["d"]
)

if __name__ == "__main__":
import unittest
from pyspark.sql.tests.test_column import * # noqa: F401
