diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala index f99649f7fa164..0b989b0d7d253 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala @@ -89,7 +89,8 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String def setOutputCol(value: String): this.type = set(outputCol, value) /** - * Param for how to handle invalid entries. Options are 'skip' (filter out rows with + * Param for how to handle invalid entries containing NaN values. Values outside the splits + * will always be treated as errors. Options are 'skip' (filter out rows with * invalid values), 'error' (throw an error), or 'keep' (keep invalid values in a special * additional bucket). Note that in the multiple column case, the invalid handling is applied * to all columns. That said for 'error' it will throw an error if any invalids are found in @@ -99,7 +100,8 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String */ @Since("2.1.0") override val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", - "how to handle invalid entries. Options are skip (filter out rows with invalid values), " + + "how to handle invalid entries containing NaN values. 
Values outside the splits will always " + + "be treated as errors. Options are skip (filter out rows with invalid values), " + "error (throw an error), or keep (keep invalid values in a special additional bucket).", ParamValidators.inArray(Bucketizer.supportedHandleInvalids)) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index eccb7acae5b98..3d23700242594 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -361,8 +361,9 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, HasHandleInvalid, "splits specified will be treated as errors.", typeConverter=TypeConverters.toListFloat) - handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries. " + - "Options are 'skip' (filter out rows with invalid values), " + + handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries " + "containing NaN values. Values outside the splits will always be treated " + "as errors. Options are 'skip' (filter out rows with invalid values), " + "'error' (throw an error), or 'keep' (keep invalid values in a special " + "additional bucket).", typeConverter=TypeConverters.toString)