[SPARK-32511][FOLLOW-UP][SQL][R][PYTHON] Add dropFields to SparkR and PySpark

### What changes were proposed in this pull request?

This PR adds a `dropFields` method (usage sketch below) to:

- PySpark `Column`
- SparkR `Column`
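
For illustration only, a minimal PySpark sketch of the new method, mirroring the doctests and unit tests added in this patch. It assumes a Spark build that includes this change (3.1.0+); the `SparkSession` setup and sample data are illustrative and not part of the PR.

```python
# Hedged sketch: session setup and data are illustrative, not part of this PR.
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[1]").appName("dropFields-sketch").getOrCreate()
df = spark.createDataFrame([Row(a=Row(b=1, c=2, d=Row(e=3, f=4)))])

# Drop a top-level field of the struct column "a" ...
df.withColumn("a", df["a"].dropFields("b")).show()

# ... or a nested field, using dot notation.
df.withColumn("a", df["a"].dropFields("d.e")).show()

spark.stop()
```

The SparkR equivalent is `dropFields(df$a, "b")`, as exercised by the new `test_sparkSQL.R` cases in this diff.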

### Why are the changes needed?

Feature parity.

### Does this PR introduce _any_ user-facing change?

No changes to existing behavior; this only adds a new API.

### How was this patch tested?

- New unit tests.
- Manual verification of examples / doctests.
- Manual run of MyPy tests.

Closes apache#29967 from zero323/SPARK-32511-FOLLOW-UP-PYSPARK-SPARKR.

Authored-by: zero323 <[email protected]>
Signed-off-by: HyukjinKwon <[email protected]>
zero323 authored and HyukjinKwon committed Oct 8, 2020
1 parent 37e1b0c commit 473b3ba
Showing 7 changed files with 165 additions and 1 deletion.
1 change: 1 addition & 0 deletions R/pkg/NAMESPACE
@@ -272,6 +272,7 @@ exportMethods("%<=>%",
"degrees",
"dense_rank",
"desc",
"dropFields",
"element_at",
"encode",
"endsWith",
69 changes: 69 additions & 0 deletions R/pkg/R/column.R
@@ -387,3 +387,72 @@ setMethod("withField",
jc <- callJMethod(x@jc, "withField", fieldName, col@jc)
column(jc)
})

#' dropFields
#'
#' Drops fields in a struct \code{Column} by name.
#'
#' @param x a Column
#' @param ... names of the fields to be dropped.
#'
#' @rdname dropFields
#' @aliases dropFields dropFields,Column-method
#' @examples
#' \dontrun{
#' df <- select(
#' createDataFrame(iris),
#' alias(
#' struct(
#' column("Sepal_Width"), column("Sepal_Length"),
#' alias(
#' struct(
#' column("Petal_Width"), column("Petal_Length"),
#' alias(
#' column("Petal_Width") * column("Petal_Length"),
#' "Petal_Product"
#' )
#' ),
#' "Petal"
#' )
#' ),
#' "dimensions"
#' )
#' )
#' head(withColumn(df, "dimensions", dropFields(df$dimensions, "Petal")))
#'
#' head(
#' withColumn(
#' df, "dimensions",
#' dropFields(df$dimensions, "Sepal_Width", "Sepal_Length")
#' )
#' )
#'
#' # This method supports dropping multiple nested fields directly e.g.
#' head(
#' withColumn(
#' df, "dimensions",
#' dropFields(df$dimensions, "Petal.Petal_Width", "Petal.Petal_Length")
#' )
#' )
#'
#' # However, if you are going to add/replace multiple nested fields,
#' # it is preferred to extract out the nested struct before
#' # adding/replacing multiple fields e.g.
#' head(
#' withColumn(
#' df, "dimensions",
#' withField(
#' column("dimensions"),
#' "Petal",
#' dropFields(column("dimensions.Petal"), "Petal_Width", "Petal_Length")
#' )
#' )
#' )
#' }
#' @note dropFields since 3.1.0
setMethod("dropFields",
signature(x = "Column"),
function(x, ...) {
jc <- callJMethod(x@jc, "dropFields", list(...))
column(jc)
})
3 changes: 3 additions & 0 deletions R/pkg/R/generics.R
@@ -732,6 +732,9 @@ setGeneric("%<=>%", function(x, value) { standardGeneric("%<=>%") })
#' @rdname withField
setGeneric("withField", function(x, fieldName, col) { standardGeneric("withField") })

#' @rdname dropFields
setGeneric("dropFields", function(x, ...) { standardGeneric("dropFields") })

###################### WindowSpec Methods ##########################

#' @rdname partitionBy
19 changes: 18 additions & 1 deletion R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1809,7 +1809,7 @@ test_that("column functions", {
expect_equal(actual, expected)

# Test withField
lines <- c("{\"Person\": {\"name\":\"Bob\", \"age\":24}}")
lines <- c("{\"Person\": {\"name\":\"Bob\", \"age\":24, \"height\": 170}}")
jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
writeLines(lines, jsonPath)
df <- read.df(jsonPath, "json")
@@ -1820,6 +1820,23 @@
)
)
expect_equal(result, data.frame(dummy = 42))

# Test dropFields
expect_setequal(
colnames(select(
withColumn(df, "Person", dropFields(df$Person, "age")),
column("Person.*")
)),
c("name", "height")
)

expect_equal(
colnames(select(
withColumn(df, "Person", dropFields(df$Person, "height", "name")),
column("Person.*")
)),
"age"
)
})

test_that("column binary mathfunctions", {
51 changes: 51 additions & 0 deletions python/pyspark/sql/column.py
@@ -358,6 +358,57 @@ def withField(self, fieldName, col):

return Column(self._jc.withField(fieldName, col._jc))

@since(3.1)
def dropFields(self, *fieldNames):
"""
An expression that drops fields in :class:`StructType` by name.

>>> from pyspark.sql import Row
>>> from pyspark.sql.functions import col, lit
>>> df = spark.createDataFrame([
... Row(a=Row(b=1, c=2, d=3, e=Row(f=4, g=5, h=6)))])
>>> df.withColumn('a', df['a'].dropFields('b')).show()
+-----------------+
| a|
+-----------------+
|{2, 3, {4, 5, 6}}|
+-----------------+
>>> df.withColumn('a', df['a'].dropFields('b', 'c')).show()
+--------------+
| a|
+--------------+
|{3, {4, 5, 6}}|
+--------------+

This method supports dropping multiple nested fields directly e.g.
>>> df.withColumn("a", col("a").dropFields("e.g", "e.h")).show()
+--------------+
| a|
+--------------+
|{1, 2, 3, {4}}|
+--------------+

However, if you are going to add/replace multiple nested fields,
it is preferred to extract out the nested struct before
adding/replacing multiple fields e.g.
>>> df.select(col("a").withField(
... "e", col("a.e").dropFields("g", "h")).alias("a")
... ).show()
+--------------+
| a|
+--------------+
|{1, 2, 3, {4}}|
+--------------+
"""
sc = SparkContext._active_spark_context

jc = self._jc.dropFields(_to_seq(sc, fieldNames))
return Column(jc)

def __getattr__(self, item):
if item.startswith("__"):
raise AttributeError(item)
1 change: 1 addition & 0 deletions python/pyspark/sql/column.pyi
@@ -80,6 +80,7 @@ class Column:
def getItem(self, key: Any) -> Column: ...
def getField(self, name: Any) -> Column: ...
def withField(self, fieldName: str, col: Column) -> Column: ...
def dropFields(self, *fieldNames: str) -> Column: ...
def __getattr__(self, item: Any) -> Column: ...
def __iter__(self) -> None: ...
def rlike(self, item: str) -> Column: ...
22 changes: 22 additions & 0 deletions python/pyspark/sql/tests/test_column.py
@@ -156,6 +156,28 @@ def test_with_field(self):
'fieldName should be a string',
lambda: df['a'].withField(col('b'), lit(3)))

def test_drop_fields(self):
df = self.spark.createDataFrame([Row(a=Row(b=1, c=2, d=Row(e=3, f=4)))])
self.assertIsInstance(df["a"].dropFields("b"), Column)
self.assertIsInstance(df["a"].dropFields("b", "c"), Column)
self.assertIsInstance(df["a"].dropFields("d.e"), Column)

result = df.select(
df["a"].dropFields("b").alias("a1"),
df["a"].dropFields("d.e").alias("a2"),
).first().asDict(True)

self.assertTrue(
"b" not in result["a1"] and
"c" in result["a1"] and
"d" in result["a1"]
)

self.assertTrue(
"e" not in result["a2"]["d"] and
"f" in result["a2"]["d"]
)

if __name__ == "__main__":
import unittest
from pyspark.sql.tests.test_column import * # noqa: F401
