Skip to content

Commit eb91729

Browse files
holdenk authored and jkbradley committed
[SPARK-10509][PYSPARK] Reduce excessive param boiler plate code
The current python ml params require cut-and-pasting the param setup and description between the class & ```__init__``` methods. Remove this possible case of errors & simplify use of custom params by adding a ```_copy_new_parent``` method to param so as to avoid cut and pasting (and cut and pasting at different indentation levels urgh). Author: Holden Karau <[email protected]> Closes apache#10216 from holdenk/SPARK-10509-excessive-param-boiler-plate-code.
1 parent 19fdb21 commit eb91729

12 files changed

+43
-317
lines changed

python/pyspark/ml/classification.py

-32
Original file line number | Diff line number | Diff line change
@@ -72,7 +72,6 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
7272
.. versionadded:: 1.3.0
7373
"""
7474

75-
# a placeholder to make it appear in the generated doc
7675
threshold = Param(Params._dummy(), "threshold",
7776
"Threshold in binary classification prediction, in range [0, 1]." +
7877
" If threshold and thresholds are both set, they must match.")
@@ -92,10 +91,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
9291
super(LogisticRegression, self).__init__()
9392
self._java_obj = self._new_java_obj(
9493
"org.apache.spark.ml.classification.LogisticRegression", self.uid)
95-
#: param for threshold in binary classification, in range [0, 1].
96-
self.threshold = Param(self, "threshold",
97-
"Threshold in binary classification prediction, in range [0, 1]." +
98-
" If threshold and thresholds are both set, they must match.")
9994
self._setDefault(maxIter=100, regParam=0.1, tol=1E-6, threshold=0.5)
10095
kwargs = self.__init__._input_kwargs
10196
self.setParams(**kwargs)
@@ -232,18 +227,13 @@ class TreeClassifierParams(object):
232227
"""
233228
supportedImpurities = ["entropy", "gini"]
234229

235-
# a placeholder to make it appear in the generated doc
236230
impurity = Param(Params._dummy(), "impurity",
237231
"Criterion used for information gain calculation (case-insensitive). " +
238232
"Supported options: " +
239233
", ".join(supportedImpurities))
240234

241235
def __init__(self):
242236
super(TreeClassifierParams, self).__init__()
243-
#: param for Criterion used for information gain calculation (case-insensitive).
244-
self.impurity = Param(self, "impurity", "Criterion used for information " +
245-
"gain calculation (case-insensitive). Supported options: " +
246-
", ".join(self.supportedImpurities))
247237

248238
@since("1.6.0")
249239
def setImpurity(self, value):
@@ -485,7 +475,6 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
485475
.. versionadded:: 1.4.0
486476
"""
487477

488-
# a placeholder to make it appear in the generated doc
489478
lossType = Param(Params._dummy(), "lossType",
490479
"Loss function which GBT tries to minimize (case-insensitive). " +
491480
"Supported options: " + ", ".join(GBTParams.supportedLossTypes))
@@ -504,10 +493,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
504493
super(GBTClassifier, self).__init__()
505494
self._java_obj = self._new_java_obj(
506495
"org.apache.spark.ml.classification.GBTClassifier", self.uid)
507-
#: param for Loss function which GBT tries to minimize (case-insensitive).
508-
self.lossType = Param(self, "lossType",
509-
"Loss function which GBT tries to minimize (case-insensitive). " +
510-
"Supported options: " + ", ".join(GBTParams.supportedLossTypes))
511496
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0,
512497
maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
513498
lossType="logistic", maxIter=20, stepSize=0.1)
@@ -597,7 +582,6 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H
597582
.. versionadded:: 1.5.0
598583
"""
599584

600-
# a placeholder to make it appear in the generated doc
601585
smoothing = Param(Params._dummy(), "smoothing", "The smoothing parameter, should be >= 0, " +
602586
"default is 1.0")
603587
modelType = Param(Params._dummy(), "modelType", "The model type which is a string " +
@@ -615,13 +599,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
615599
super(NaiveBayes, self).__init__()
616600
self._java_obj = self._new_java_obj(
617601
"org.apache.spark.ml.classification.NaiveBayes", self.uid)
618-
#: param for the smoothing parameter.
619-
self.smoothing = Param(self, "smoothing", "The smoothing parameter, should be >= 0, " +
620-
"default is 1.0")
621-
#: param for the model type.
622-
self.modelType = Param(self, "modelType", "The model type which is a string " +
623-
"(case-sensitive). Supported options: multinomial (default) " +
624-
"and bernoulli.")
625602
self._setDefault(smoothing=1.0, modelType="multinomial")
626603
kwargs = self.__init__._input_kwargs
627604
self.setParams(**kwargs)
@@ -734,7 +711,6 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol,
734711
.. versionadded:: 1.6.0
735712
"""
736713

737-
# a placeholder to make it appear in the generated doc
738714
layers = Param(Params._dummy(), "layers", "Sizes of layers from input layer to output layer " +
739715
"E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with 100 " +
740716
"neurons and output layer of 10 neurons, default is [1, 1].")
@@ -753,14 +729,6 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
753729
super(MultilayerPerceptronClassifier, self).__init__()
754730
self._java_obj = self._new_java_obj(
755731
"org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid)
756-
self.layers = Param(self, "layers", "Sizes of layers from input layer to output layer " +
757-
"E.g., Array(780, 100, 10) means 780 inputs, one hidden layer with " +
758-
"100 neurons and output layer of 10 neurons, default is [1, 1].")
759-
self.blockSize = Param(self, "blockSize", "Block size for stacking input data in " +
760-
"matrices. Data is stacked within partitions. If block size is " +
761-
"more than remaining data in a partition then it is adjusted to " +
762-
"the size of this data. Recommended size is between 10 and 1000, " +
763-
"default is 128.")
764732
self._setDefault(maxIter=100, tol=1E-4, layers=[1, 1], blockSize=128)
765733
kwargs = self.__init__._input_kwargs
766734
self.setParams(**kwargs)

python/pyspark/ml/clustering.py

-7
Original file line number | Diff line number | Diff line change
@@ -73,7 +73,6 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
7373
.. versionadded:: 1.5.0
7474
"""
7575

76-
# a placeholder to make it appear in the generated doc
7776
k = Param(Params._dummy(), "k", "number of clusters to create")
7877
initMode = Param(Params._dummy(), "initMode",
7978
"the initialization algorithm. This can be either \"random\" to " +
@@ -90,12 +89,6 @@ def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
9089
"""
9190
super(KMeans, self).__init__()
9291
self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.KMeans", self.uid)
93-
self.k = Param(self, "k", "number of clusters to create")
94-
self.initMode = Param(self, "initMode",
95-
"the initialization algorithm. This can be either \"random\" to " +
96-
"choose random points as initial cluster centers, or \"k-means||\" " +
97-
"to use a parallel variant of k-means++")
98-
self.initSteps = Param(self, "initSteps", "steps for k-means initialization mode")
9992
self._setDefault(k=2, initMode="k-means||", initSteps=5, tol=1e-4, maxIter=20)
10093
kwargs = self.__init__._input_kwargs
10194
self.setParams(**kwargs)

python/pyspark/ml/evaluation.py

-12
Original file line number | Diff line number | Diff line change
@@ -124,7 +124,6 @@ class BinaryClassificationEvaluator(JavaEvaluator, HasLabelCol, HasRawPrediction
124124
.. versionadded:: 1.4.0
125125
"""
126126

127-
# a placeholder to make it appear in the generated doc
128127
metricName = Param(Params._dummy(), "metricName",
129128
"metric name in evaluation (areaUnderROC|areaUnderPR)")
130129

@@ -138,9 +137,6 @@ def __init__(self, rawPredictionCol="rawPrediction", labelCol="label",
138137
super(BinaryClassificationEvaluator, self).__init__()
139138
self._java_obj = self._new_java_obj(
140139
"org.apache.spark.ml.evaluation.BinaryClassificationEvaluator", self.uid)
141-
#: param for metric name in evaluation (areaUnderROC|areaUnderPR)
142-
self.metricName = Param(self, "metricName",
143-
"metric name in evaluation (areaUnderROC|areaUnderPR)")
144140
self._setDefault(rawPredictionCol="rawPrediction", labelCol="label",
145141
metricName="areaUnderROC")
146142
kwargs = self.__init__._input_kwargs
@@ -210,9 +206,6 @@ def __init__(self, predictionCol="prediction", labelCol="label",
210206
super(RegressionEvaluator, self).__init__()
211207
self._java_obj = self._new_java_obj(
212208
"org.apache.spark.ml.evaluation.RegressionEvaluator", self.uid)
213-
#: param for metric name in evaluation (mse|rmse|r2|mae)
214-
self.metricName = Param(self, "metricName",
215-
"metric name in evaluation (mse|rmse|r2|mae)")
216209
self._setDefault(predictionCol="prediction", labelCol="label",
217210
metricName="rmse")
218211
kwargs = self.__init__._input_kwargs
@@ -265,7 +258,6 @@ class MulticlassClassificationEvaluator(JavaEvaluator, HasLabelCol, HasPredictio
265258
266259
.. versionadded:: 1.5.0
267260
"""
268-
# a placeholder to make it appear in the generated doc
269261
metricName = Param(Params._dummy(), "metricName",
270262
"metric name in evaluation "
271263
"(f1|precision|recall|weightedPrecision|weightedRecall)")
@@ -280,10 +272,6 @@ def __init__(self, predictionCol="prediction", labelCol="label",
280272
super(MulticlassClassificationEvaluator, self).__init__()
281273
self._java_obj = self._new_java_obj(
282274
"org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator", self.uid)
283-
# param for metric name in evaluation (f1|precision|recall|weightedPrecision|weightedRecall)
284-
self.metricName = Param(self, "metricName",
285-
"metric name in evaluation"
286-
" (f1|precision|recall|weightedPrecision|weightedRecall)")
287275
self._setDefault(predictionCol="prediction", labelCol="label",
288276
metricName="f1")
289277
kwargs = self.__init__._input_kwargs

0 commit comments

Comments (0)