[SPARK-37836][PYTHON][INFRA] Enable F841, E722, E305 and E226 for PEP 8 compliance

### What changes were proposed in this pull request?

This PR proposes to enable the following flake8 rules (a short illustrative sketch of what each rule flags follows the list):

- [F841](https://www.flake8rules.com/rules/F841.html): Local variable name is assigned to but never used
- [E722](https://www.flake8rules.com/rules/E722.html): Do not use bare except, specify exception instead
- [E305](https://www.flake8rules.com/rules/E305.html): Expected 2 blank lines after end of function or class
- [E226](https://www.flake8rules.com/rules/E226.html): Missing whitespace around arithmetic operator
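
As a quick illustration (the snippet below is made up for this description and is not taken from the Spark codebase), this is roughly what each of the four newly enabled rules flags, and what the compliant version looks like:

```python
# Hypothetical example only -- names like parse_port are not from Spark.

def parse_port(value):
    unused = "leftover"        # F841: local variable assigned but never used
    try:
        return int(value)
    except:                    # E722: bare except; name the exception instead
        return 0
print(parse_port("8080"))      # E305: expected 2 blank lines after the function above
area = 3*7                     # E226: missing whitespace around the arithmetic operator


# The same code with all four violations fixed:
def parse_port_fixed(value):
    try:
        return int(value)
    except ValueError:
        return 0


print(parse_port_fixed("8080"))
area = 3 * 7
```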

We should probably also enable the rules below eventually:

- [E731](https://www.flake8rules.com/rules/E731.html): Do not assign a lambda expression, use a def
- [E741](https://www.flake8rules.com/rules/E741.html): Do not use variables named 'I', 'O', or 'l'
- [W503](https://www.flake8rules.com/rules/W503.html): Line break occurred before a binary operator
- [W504](https://www.flake8rules.com/rules/W504.html): Line break occurred after a binary operator

but they are not enabled in this PR because:
- There are too many instances to fix; it is probably better to handle them in separate PRs.
- Some require real code changes.
- W503 and W504 conflict with each other (see the sketch after this list), so we should investigate a bit more first.
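
For context on the last point, here is a made-up snippet (not from Spark) showing why W503 and W504 cannot both be satisfied once a binary expression is wrapped across lines:

```python
# Hypothetical names; this only illustrates the W503/W504 conflict.
income = 1000
deductions = 200

# Break *before* the operator: W504 is satisfied, but W503 flags this line.
taxable = (income
           - deductions)

# Break *after* the operator: W503 is satisfied, but W504 flags this line.
taxable = (income -
           deductions)
```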

### Why are the changes needed?

To comply with PEP 8.

### Does this PR introduce _any_ user-facing change?

No, dev-only.

### How was this patch tested?

I manually tested it by running the Python linter.

Closes apache#35126 from HyukjinKwon/enable-more-rules.

Authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
HyukjinKwon committed Jan 7, 2022
1 parent a1180e4 commit 9000339
Showing 55 changed files with 107 additions and 103 deletions.
2 changes: 0 additions & 2 deletions dev/.gitignore

This file was deleted.

2 changes: 2 additions & 0 deletions dev/create-release/generate-contributors.py
@@ -75,6 +75,8 @@
 def print_indented(_list):
     for x in _list:
         print(" %s" % x)
+
+
 if yesOrNoPrompt("Show all commits?"):
     print_indented(new_commits)
 print("==================================================================================\n")
1 change: 1 addition & 0 deletions dev/create-release/releaseutils.py
@@ -147,6 +147,7 @@ def get_commits(tag):
         commits.append(commit)
     return commits
 
+
 # Maintain a mapping for translating issue types to contributions in the release notes
 # This serves an additional function of warning the user against unknown issue types
 # Note: This list is partially derived from this link:
1 change: 1 addition & 0 deletions dev/create-release/translate-contributors.py
@@ -137,6 +137,7 @@ def generate_candidates(author, issues):
         candidates[i] = (candidate, source)
     return candidates
 
+
 # Translate each invalid author by searching for possible candidates from GitHub and JIRA
 # In interactive mode, this script presents the user with a list of choices and have the user
 # select from this list. Additionally, the user may also choose to enter a custom name.
2 changes: 1 addition & 1 deletion dev/github_jira_sync.py
@@ -161,7 +161,7 @@ def reset_pr_labels(pr_num, jira_components):
     try:
         page = get_json(get_url(JIRA_API_BASE + "/rest/api/2/issue/" + issue + "/remotelink"))
         existing_links = map(lambda l: l['object']['url'], page)
-    except:
+    except BaseException:
         print("Failure reading JIRA %s (does it exist?)" % issue)
         print(sys.exc_info()[0])
         continue
12 changes: 5 additions & 7 deletions dev/merge_spark_pr.py
@@ -147,9 +147,6 @@ def merge_pr(pr_num, target_ref, title, body, pr_repo_desc):
     distinct_authors = list(filter(lambda x: x != primary_author, distinct_authors))
     distinct_authors.insert(0, primary_author)
 
-    commits = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name,
-                       '--pretty=format:%h [%an] %s']).split("\n\n")
-
     merge_message_flags = []
 
     merge_message_flags += ["-m", title]
@@ -304,7 +301,7 @@ def resolve_jira_issue(merge_branches, comment, default_jira_id=""):
                       "again (or leave blank and fix manually)." % (", ".join(fix_versions)))
         except KeyboardInterrupt:
             raise
-        except:
+        except BaseException:
             traceback.print_exc()
             print("Error setting fix version(s), try again (or leave blank and fix manually)")
 
@@ -350,14 +347,14 @@ def choose_jira_assignee(issue, asf_jira):
                 try:
                     id = int(raw_assignee)
                     assignee = candidates[id]
-                except:
+                except BaseException:
                     # assume it's a user id, and try to assign (might fail, we just prompt again)
                     assignee = asf_jira.user(raw_assignee)
                 asf_jira.assign_issue(issue.key, assignee.name)
                 return assignee
         except KeyboardInterrupt:
             raise
-        except:
+        except BaseException:
             traceback.print_exc()
             print("Error assigning JIRA, try again (or leave blank and fix manually)")
 
@@ -562,13 +559,14 @@ def main():
         print("Could not find jira-python library. Run 'sudo pip3 install jira' to install.")
         print("Exiting without trying to close the associated JIRA.")
 
+
 if __name__ == "__main__":
     import doctest
     (failure_count, test_count) = doctest.testmod()
     if failure_count:
         sys.exit(-1)
     try:
         main()
-    except:
+    except BaseException:
         clean_up()
         raise
1 change: 1 addition & 0 deletions dev/sparktestsupport/modules.py
@@ -94,6 +94,7 @@ def __ne__(self, other):
     def __hash__(self):
         return hash(self.name)
 
+
 tags = Module(
     name="tags",
     dependencies=[],
3 changes: 2 additions & 1 deletion dev/sparktestsupport/utils.py
@@ -77,7 +77,7 @@ def identify_changed_files_from_git_commits(patch_sha, target_branch=None, targe
         raise AttributeError("must specify either target_branch or target_ref, not both")
     if target_branch is not None:
         diff_target = target_branch
-        run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)])
+        run_cmd(['git', 'fetch', 'origin', str(target_branch + ':' + target_branch)])
     else:
         diff_target = target_ref
     raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target],
@@ -155,5 +155,6 @@ def _test():
     if failure_count:
         sys.exit(-1)
 
+
 if __name__ == "__main__":
     _test()
20 changes: 12 additions & 8 deletions dev/tox.ini
@@ -16,20 +16,24 @@
 
 [flake8]
 ignore =
-    E203,
-    E226,
-    E305,
-    E402,
-    E722,
+    E203, # Skip as black formatter adds a whitespace around ':'.
+    E402, # Module top level import is disabled for optional import check, etc.
+    F403, # Using wildcard discouraged but F401 can detect. Disabled to reduce the usage of noqa.
+    # 1. Type hints with def are treated as redefinition (e.g., functions.log).
+    # 2. Some are used for testing.
+    F811,
+
+    # Below rules should be enabled in the future.
     E731,
     E741,
-    F403,
-    F811,
-    F841,
     W503,
     W504,
 per-file-ignores =
+    # F405 is ignored as shared.py is auto-generated.
+    # E501 can be removed after SPARK-37419.
     python/pyspark/ml/param/shared.py: F405 E501,
+    # Examples contain some unused variables.
+    examples/src/main/python/sql/datasource.py: F841,
 exclude =
     */target/*,
     docs/.local_ruby_bundle/,
1 change: 1 addition & 0 deletions examples/src/main/python/logistic_regression.py
@@ -42,6 +42,7 @@ def readPointBatch(iterator):
         matrix[i] = np.fromstring(s.replace(',', ' '), dtype=np.float32, sep=' ')
     return [matrix]
 
+
 if __name__ == "__main__":
 
     if len(sys.argv) != 3:
2 changes: 1 addition & 1 deletion examples/src/main/python/ml/bucketizer_example.py
@@ -37,7 +37,7 @@
     # Transform original data into its bucket index.
     bucketedData = bucketizer.transform(dataFrame)
 
-    print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits())-1))
+    print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits()) - 1))
     bucketedData.show()
     # $example off$
 
1 change: 1 addition & 0 deletions examples/src/main/python/sql/basic.py
@@ -197,6 +197,7 @@ def programmatic_schema_example(spark):
     # +-------+
     # $example off:programmatic_schema$
 
+
 if __name__ == "__main__":
     # $example on:init_session$
     spark = SparkSession \
1 change: 1 addition & 0 deletions examples/src/main/python/status_api_demo.py
@@ -63,5 +63,6 @@ def run():
     print("Job results are:", result.get())
     sc.stop()
 
+
 if __name__ == "__main__":
     main()
2 changes: 1 addition & 1 deletion examples/src/main/python/streaming/hdfs_wordcount.py
@@ -41,7 +41,7 @@
     lines = ssc.textFileStream(sys.argv[1])
     counts = lines.flatMap(lambda line: line.split(" "))\
         .map(lambda x: (x, 1))\
-        .reduceByKey(lambda a, b: a+b)
+        .reduceByKey(lambda a, b: a + b)
     counts.pprint()
 
     ssc.start()
2 changes: 1 addition & 1 deletion examples/src/main/python/streaming/network_wordcount.py
@@ -40,7 +40,7 @@
     lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
     counts = lines.flatMap(lambda line: line.split(" "))\
         .map(lambda word: (word, 1))\
-        .reduceByKey(lambda a, b: a+b)
+        .reduceByKey(lambda a, b: a + b)
     counts.pprint()
 
     ssc.start()
@@ -43,6 +43,7 @@ def print_happiest_words(rdd):
     for tuple in top_list:
         print("%s (%d happiness)" % (tuple[1], tuple[0]))
 
+
 if __name__ == "__main__":
     if len(sys.argv) != 3:
         print("Usage: network_wordjoinsentiments.py <hostname> <port>", file=sys.stderr)
@@ -95,6 +95,7 @@ def filterFunc(wordCount):
     wordCounts.foreachRDD(echo)
     return ssc
 
+
 if __name__ == "__main__":
     if len(sys.argv) != 5:
         print("Usage: recoverable_network_wordcount.py <hostname> <port> "
@@ -75,7 +75,7 @@ def process(time, rdd):
             wordCountsDataFrame = \
                 spark.sql("select word, count(*) as total from words group by word")
             wordCountsDataFrame.show()
-        except:
+        except BaseException:
             pass
 
     words.foreachRDD(process)
@@ -75,7 +75,7 @@
         ssc, appName, streamName, endpointUrl, regionName, InitialPositionInStream.LATEST, 2)
     counts = lines.flatMap(lambda line: line.split(" ")) \
         .map(lambda word: (word, 1)) \
-        .reduceByKey(lambda a, b: a+b)
+        .reduceByKey(lambda a, b: a + b)
     counts.pprint()
 
     ssc.start()
2 changes: 1 addition & 1 deletion python/pyspark/context.py
@@ -209,7 +209,7 @@ def __init__(
                 profiler_cls,
                 udf_profiler_cls,
             )
-        except:
+        except BaseException:
             # If an error occurs, clean up in order to allow future SparkContext creation:
             self.stop()
             raise
2 changes: 1 addition & 1 deletion python/pyspark/daemon.py
@@ -196,7 +196,7 @@ def handle_sigterm(*args):
                                 pass
                             break
                         gc.collect()
-                except:
+                except BaseException:
                     traceback.print_exc()
                     os._exit(1)
                 else:
2 changes: 1 addition & 1 deletion python/pyspark/ml/linalg/__init__.py
@@ -60,7 +60,7 @@
     import scipy.sparse
 
     _have_scipy = True
-except:
+except BaseException:
     # No SciPy in environment, but that's okay
     _have_scipy = False
 
2 changes: 1 addition & 1 deletion python/pyspark/ml/tests/test_persistence.py
@@ -379,7 +379,7 @@ def _run_test_onevsrest(self, LogisticRegressionCls):
         )
 
         lr = LogisticRegressionCls(maxIter=5, regParam=0.01)
-        ovr = OneVsRest(classifier=lr)
+        OneVsRest(classifier=lr)
 
         def reload_and_compare(ovr, suffix):
             model = ovr.fit(df)
6 changes: 1 addition & 5 deletions python/pyspark/ml/tests/test_tuning.py
@@ -80,11 +80,7 @@ def _fit(self, dataset):
 class ParamGridBuilderTests(SparkSessionTestCase):
     def test_addGrid(self):
         with self.assertRaises(TypeError):
-            grid = (
-                ParamGridBuilder()
-                .addGrid("must be an instance of Param", ["not", "string"])
-                .build()
-            )
+            (ParamGridBuilder().addGrid("must be an instance of Param", ["not", "string"]).build())
 
 
 class ValidatorTestUtilsMixin:
2 changes: 1 addition & 1 deletion python/pyspark/ml/tests/test_wrapper.py
@@ -67,7 +67,7 @@ def condition():
 
         try:
             summary.__del__()
-        except:
+        except BaseException:
             pass
 
         def condition():
4 changes: 2 additions & 2 deletions python/pyspark/mllib/classification.py
@@ -161,7 +161,7 @@ class LogisticRegressionModel(LinearClassificationModel):
     >>> from shutil import rmtree
     >>> try:
     ...     rmtree(path)
-    ... except:
+    ... except BaseException:
     ...     pass
     >>> multi_class_data = [
     ...     LabeledPoint(0.0, [0.0, 1.0, 0.0]),
@@ -537,7 +537,7 @@ class SVMModel(LinearClassificationModel):
     >>> from shutil import rmtree
     >>> try:
     ...     rmtree(path)
-    ... except:
+    ... except BaseException:
     ...     pass
     """
 
2 changes: 1 addition & 1 deletion python/pyspark/mllib/linalg/__init__.py
@@ -63,7 +63,7 @@
     import scipy.sparse
 
     _have_scipy = True
-except:
+except BaseException:
     # No SciPy in environment, but that's okay
     _have_scipy = False
 
6 changes: 3 additions & 3 deletions python/pyspark/mllib/regression.py
@@ -180,7 +180,7 @@ class LinearRegressionModel(LinearRegressionModelBase):
     >>> from shutil import rmtree
     >>> try:
     ...     rmtree(path)
-    ... except:
+    ... except BaseException:
     ...     pass
     >>> data = [
     ...     LabeledPoint(0.0, SparseVector(1, {0: 0.0})),
@@ -383,7 +383,7 @@ class LassoModel(LinearRegressionModelBase):
     >>> from shutil import rmtree
     >>> try:
     ...     rmtree(path)
-    ... except:
+    ... except BaseException:
     ...     pass
     >>> data = [
     ...     LabeledPoint(0.0, SparseVector(1, {0: 0.0})),
@@ -557,7 +557,7 @@ class RidgeRegressionModel(LinearRegressionModelBase):
     >>> from shutil import rmtree
     >>> try:
     ...     rmtree(path)
-    ... except:
+    ... except BaseException:
     ...     pass
     >>> data = [
     ...     LabeledPoint(0.0, SparseVector(1, {0: 0.0})),
2 changes: 1 addition & 1 deletion python/pyspark/mllib/tests/test_algorithms.py
@@ -295,7 +295,7 @@ def test_regression(self):
         GradientBoostedTrees.trainRegressor(
             rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=32
         )
-        with self.assertRaises(Exception) as cm:
+        with self.assertRaises(Exception):
             GradientBoostedTrees.trainRegressor(
                 rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=1
             )
1 change: 0 additions & 1 deletion python/pyspark/mllib/tests/test_streaming_algorithms.py
@@ -303,7 +303,6 @@ def collect_errors(rdd):
             true, predicted = zip(*rdd.collect())
             errors.append(self.calculate_accuracy_error(true, predicted))
 
-        true_predicted = []
         input_stream = self.ssc.queueStream(input_batches)
         predict_stream = self.ssc.queueStream(predict_batches)
         slr.trainOn(input_stream)
2 changes: 1 addition & 1 deletion python/pyspark/mllib/tests/test_util.py
@@ -61,7 +61,7 @@ def test_load_vectors(self):
             self.assertEqual(len(ret), 2)
             self.assertEqual(ret[0], DenseVector([1.0, 2.0, 3.0]))
             self.assertEqual(ret[1], DenseVector([1.0, 2.0, 3.0]))
-        except:
+        except BaseException:
             self.fail()
         finally:
             shutil.rmtree(load_vectors_path)
1 change: 0 additions & 1 deletion python/pyspark/pandas/data_type_ops/base.py
@@ -383,7 +383,6 @@ def eq(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
         from pyspark.pandas.frame import DataFrame
         from pyspark.pandas.internal import NATURAL_ORDER_COLUMN_NAME, InternalField
 
-        len_right = len(right)
         if len(left) != len(right):
             raise ValueError("Lengths must be equal")
 