Skip to content

Commit

Permalink
[SPARK-19714][DOCS] Clarify Bucketizer handling of invalid input
Browse files Browse the repository at this point in the history
## What changes were proposed in this pull request?

Clarify Bucketizer handleInvalid docs. Just a resubmit of apache#17169

## How was this patch tested?

N/A

Closes apache#23003 from srowen/SPARK-19714.

Authored-by: Sean Owen <[email protected]>
Signed-off-by: Sean Owen <[email protected]>
  • Loading branch information
srowen committed Nov 11, 2018
1 parent aec0af4 commit 510ec77
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
def setOutputCol(value: String): this.type = set(outputCol, value)

/**
* Param for how to handle invalid entries. Options are 'skip' (filter out rows with
* Param for how to handle invalid entries containing NaN values. Values outside the splits
* will always be treated as errors. Options are 'skip' (filter out rows with
* invalid values), 'error' (throw an error), or 'keep' (keep invalid values in a special
* additional bucket). Note that in the multiple column case, the invalid handling is applied
* to all columns. That said, for 'error' it will throw an error if any invalids are found in
Expand All @@ -99,7 +100,8 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String
*/
@Since("2.1.0")
override val handleInvalid: Param[String] = new Param[String](this, "handleInvalid",
"how to handle invalid entries. Options are skip (filter out rows with invalid values), " +
"how to handle invalid entries containing NaN values. Values outside the splits will always " +
"be treated as errors. Options are skip (filter out rows with invalid values), " +
"error (throw an error), or keep (keep invalid values in a special additional bucket).",
ParamValidators.inArray(Bucketizer.supportedHandleInvalids))

Expand Down
5 changes: 3 additions & 2 deletions python/pyspark/ml/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,8 +361,9 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasHandleInvalid,
"splits specified will be treated as errors.",
typeConverter=TypeConverters.toListFloat)

handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries. " +
"Options are 'skip' (filter out rows with invalid values), " +
handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries "
"containing NaN values. Values outside the splits will always be treated "
"as errors. Options are 'skip' (filter out rows with invalid values), " +
"'error' (throw an error), or 'keep' (keep invalid values in a special " +
"additional bucket).",
typeConverter=TypeConverters.toString)
Expand Down

0 comments on commit 510ec77

Please sign in to comment.