[SPARK-37836][PYTHON][INFRA] Enable F841, E722, E305 and E226 for PEP 8 compliance

### What changes were proposed in this pull request?

This PR proposes to enable the following flake8 rules (a short illustrative sketch of what each rule flags follows the list):

- [F841](https://www.flake8rules.com/rules/F841.html): Local variable name is assigned to but never used
- [E722](https://www.flake8rules.com/rules/E722.html): Do not use bare except, specify exception instead
- [E305](https://www.flake8rules.com/rules/E305.html): Expected 2 blank lines after end of function or class
- [E226](https://www.flake8rules.com/rules/E226.html): Missing whitespace around arithmetic operator
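
As a quick illustration (the snippet below is made up for this description and is not taken from the Spark codebase), this is roughly what each of the four newly enabled rules flags, and what the compliant version looks like:

```python
# Hypothetical example only -- names like parse_port are not from Spark.

def parse_port(value):
    unused = "leftover"        # F841: local variable assigned but never used
    try:
        return int(value)
    except:                    # E722: bare except; name the exception instead
        return 0
print(parse_port("8080"))      # E305: expected 2 blank lines after the function above
area = 3*7                     # E226: missing whitespace around the arithmetic operator


# The same code with all four violations fixed:
def parse_port_fixed(value):
    try:
        return int(value)
    except ValueError:
        return 0


print(parse_port_fixed("8080"))
area = 3 * 7
```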

We should probably also enable the rules below eventually:

- [E731](https://www.flake8rules.com/rules/E731.html): Do not assign a lambda expression, use a def
- [E741](https://www.flake8rules.com/rules/E741.html): Do not use variables named 'I', 'O', or 'l'
- [W503](https://www.flake8rules.com/rules/W503.html): Line break occurred before a binary operator
- [W504](https://www.flake8rules.com/rules/W504.html): Line break occurred after a binary operator

but they are not enabled in this PR because:
- There are too many instances to fix; it is probably better to handle them in separate PRs.
- Some require real code changes.
- W503 and W504 conflict with each other (see the sketch after this list), so we should investigate a bit more first.
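
For context on the last point, here is a made-up snippet (not from Spark) showing why W503 and W504 cannot both be satisfied once a binary expression is wrapped across lines:

```python
# Hypothetical names; this only illustrates the W503/W504 conflict.
income = 1000
deductions = 200

# Break *before* the operator: W504 is satisfied, but W503 flags this line.
taxable = (income
           - deductions)

# Break *after* the operator: W503 is satisfied, but W504 flags this line.
taxable = (income -
           deductions)
```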

### Why are the changes needed?

To comply with PEP 8.

### Does this PR introduce _any_ user-facing change?

No, dev-only.

### How was this patch tested?

I manually tested it by running the Python linter.

Closes apache#35126 from HyukjinKwon/enable-more-rules.

Authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
HyukjinKwon committed Jan 7, 2022
1 parent a1180e4 commit 9000339
Showing 55 changed files with 107 additions and 103 deletions.
2 changes: 0 additions & 2 deletions dev/.gitignore

This file was deleted.

2 changes: 2 additions & 0 deletions dev/create-release/generate-contributors.py
@@ -75,6 +75,8 @@
 def print_indented(_list):
     for x in _list:
         print(" %s" % x)
+
+
 if yesOrNoPrompt("Show all commits?"):
     print_indented(new_commits)
 print("==================================================================================\n")
1 change: 1 addition & 0 deletions dev/create-release/releaseutils.py
@@ -147,6 +147,7 @@ def get_commits(tag):
         commits.append(commit)
     return commits
 
+
 # Maintain a mapping for translating issue types to contributions in the release notes
 # This serves an additional function of warning the user against unknown issue types
 # Note: This list is partially derived from this link:
1 change: 1 addition & 0 deletions dev/create-release/translate-contributors.py
@@ -137,6 +137,7 @@ def generate_candidates(author, issues):
         candidates[i] = (candidate, source)
     return candidates
 
+
 # Translate each invalid author by searching for possible candidates from GitHub and JIRA
 # In interactive mode, this script presents the user with a list of choices and have the user
 # select from this list. Additionally, the user may also choose to enter a custom name.
2 changes: 1 addition & 1 deletion dev/github_jira_sync.py
@@ -161,7 +161,7 @@ def reset_pr_labels(pr_num, jira_components):
     try:
         page = get_json(get_url(JIRA_API_BASE + "/rest/api/2/issue/" + issue + "/remotelink"))
         existing_links = map(lambda l: l['object']['url'], page)
-    except:
+    except BaseException:
         print("Failure reading JIRA %s (does it exist?)" % issue)
         print(sys.exc_info()[0])
         continue
12 changes: 5 additions & 7 deletions dev/merge_spark_pr.py
@@ -147,9 +147,6 @@ def merge_pr(pr_num, target_ref, title, body, pr_repo_desc):
     distinct_authors = list(filter(lambda x: x != primary_author, distinct_authors))
     distinct_authors.insert(0, primary_author)
 
-    commits = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name,
-                       '--pretty=format:%h [%an] %s']).split("\n\n")
-
     merge_message_flags = []
 
     merge_message_flags += ["-m", title]
@@ -304,7 +301,7 @@ def resolve_jira_issue(merge_branches, comment, default_jira_id=""):
                       "again (or leave blank and fix manually)." % (", ".join(fix_versions)))
         except KeyboardInterrupt:
             raise
-        except:
+        except BaseException:
             traceback.print_exc()
             print("Error setting fix version(s), try again (or leave blank and fix manually)")
 
@@ -350,14 +347,14 @@ def choose_jira_assignee(issue, asf_jira):
                 try:
                     id = int(raw_assignee)
                     assignee = candidates[id]
-                except:
+                except BaseException:
                     # assume it's a user id, and try to assign (might fail, we just prompt again)
                     assignee = asf_jira.user(raw_assignee)
                 asf_jira.assign_issue(issue.key, assignee.name)
                 return assignee
         except KeyboardInterrupt:
             raise
-        except:
+        except BaseException:
             traceback.print_exc()
             print("Error assigning JIRA, try again (or leave blank and fix manually)")
 
@@ -562,13 +559,14 @@ def main():
         print("Could not find jira-python library. Run 'sudo pip3 install jira' to install.")
         print("Exiting without trying to close the associated JIRA.")
 
+
 if __name__ == "__main__":
     import doctest
     (failure_count, test_count) = doctest.testmod()
     if failure_count:
         sys.exit(-1)
     try:
         main()
-    except:
+    except BaseException:
         clean_up()
         raise
1 change: 1 addition & 0 deletions dev/sparktestsupport/modules.py
@@ -94,6 +94,7 @@ def __ne__(self, other):
     def __hash__(self):
         return hash(self.name)
 
+
 tags = Module(
     name="tags",
     dependencies=[],
3 changes: 2 additions & 1 deletion dev/sparktestsupport/utils.py
@@ -77,7 +77,7 @@ def identify_changed_files_from_git_commits(patch_sha, target_branch=None, targe
         raise AttributeError("must specify either target_branch or target_ref, not both")
     if target_branch is not None:
         diff_target = target_branch
-        run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)])
+        run_cmd(['git', 'fetch', 'origin', str(target_branch + ':' + target_branch)])
     else:
         diff_target = target_ref
     raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target],
@@ -155,5 +155,6 @@ def _test():
     if failure_count:
         sys.exit(-1)
 
+
 if __name__ == "__main__":
     _test()
20 changes: 12 additions & 8 deletions dev/tox.ini
@@ -16,20 +16,24 @@
 
 [flake8]
 ignore =
-    E203,
-    E226,
-    E305,
-    E402,
-    E722,
+    E203, # Skip as black formatter adds a whitespace around ':'.
+    E402, # Module top level import is disabled for optional import check, etc.
+    F403, # Using wildcard discouraged but F401 can detect. Disabled to reduce the usage of noqa.
+    # 1. Type hints with def are treated as redefinition (e.g., functions.log).
+    # 2. Some are used for testing.
+    F811,
+
+    # Below rules should be enabled in the future.
     E731,
     E741,
-    F403,
-    F811,
-    F841,
     W503,
     W504,
 per-file-ignores =
+    # F405 is ignored as shared.py is auto-generated.
+    # E501 can be removed after SPARK-37419.
     python/pyspark/ml/param/shared.py: F405 E501,
+    # Examples contain some unused variables.
+    examples/src/main/python/sql/datasource.py: F841,
 exclude =
     */target/*,
     docs/.local_ruby_bundle/,
1 change: 1 addition & 0 deletions examples/src/main/python/logistic_regression.py
@@ -42,6 +42,7 @@ def readPointBatch(iterator):
         matrix[i] = np.fromstring(s.replace(',', ' '), dtype=np.float32, sep=' ')
     return [matrix]
 
+
 if __name__ == "__main__":
 
     if len(sys.argv) != 3:
2 changes: 1 addition & 1 deletion examples/src/main/python/ml/bucketizer_example.py
@@ -37,7 +37,7 @@
     # Transform original data into its bucket index.
     bucketedData = bucketizer.transform(dataFrame)
 
-    print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits())-1))
+    print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits()) - 1))
     bucketedData.show()
     # $example off$
 
1 change: 1 addition & 0 deletions examples/src/main/python/sql/basic.py
@@ -197,6 +197,7 @@ def programmatic_schema_example(spark):
     # +-------+
     # $example off:programmatic_schema$
 
+
 if __name__ == "__main__":
     # $example on:init_session$
     spark = SparkSession \
1 change: 1 addition & 0 deletions examples/src/main/python/status_api_demo.py
@@ -63,5 +63,6 @@ def run():
     print("Job results are:", result.get())
     sc.stop()
 
+
 if __name__ == "__main__":
     main()
2 changes: 1 addition & 1 deletion examples/src/main/python/streaming/hdfs_wordcount.py
@@ -41,7 +41,7 @@
     lines = ssc.textFileStream(sys.argv[1])
     counts = lines.flatMap(lambda line: line.split(" "))\
         .map(lambda x: (x, 1))\
-        .reduceByKey(lambda a, b: a+b)
+        .reduceByKey(lambda a, b: a + b)
     counts.pprint()
 
     ssc.start()
2 changes: 1 addition & 1 deletion examples/src/main/python/streaming/network_wordcount.py
@@ -40,7 +40,7 @@
     lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
     counts = lines.flatMap(lambda line: line.split(" "))\
         .map(lambda word: (word, 1))\
-        .reduceByKey(lambda a, b: a+b)
+        .reduceByKey(lambda a, b: a + b)
     counts.pprint()
 
     ssc.start()
@@ -43,6 +43,7 @@ def print_happiest_words(rdd):
     for tuple in top_list:
         print("%s (%d happiness)" % (tuple[1], tuple[0]))
 
+
 if __name__ == "__main__":
     if len(sys.argv) != 3:
         print("Usage: network_wordjoinsentiments.py <hostname> <port>", file=sys.stderr)
@@ -95,6 +95,7 @@ def filterFunc(wordCount):
     wordCounts.foreachRDD(echo)
     return ssc
 
+
 if __name__ == "__main__":
     if len(sys.argv) != 5:
         print("Usage: recoverable_network_wordcount.py <hostname> <port> "
@@ -75,7 +75,7 @@ def process(time, rdd):
             wordCountsDataFrame = \
                 spark.sql("select word, count(*) as total from words group by word")
             wordCountsDataFrame.show()
-        except:
+        except BaseException:
             pass
 
     words.foreachRDD(process)
@@ -75,7 +75,7 @@
         ssc, appName, streamName, endpointUrl, regionName, InitialPositionInStream.LATEST, 2)
     counts = lines.flatMap(lambda line: line.split(" ")) \
         .map(lambda word: (word, 1)) \
-        .reduceByKey(lambda a, b: a+b)
+        .reduceByKey(lambda a, b: a + b)
     counts.pprint()
 
     ssc.start()
2 changes: 1 addition & 1 deletion python/pyspark/context.py
@@ -209,7 +209,7 @@ def __init__(
                 profiler_cls,
                 udf_profiler_cls,
             )
-        except:
+        except BaseException:
             # If an error occurs, clean up in order to allow future SparkContext creation:
             self.stop()
             raise
2 changes: 1 addition & 1 deletion python/pyspark/daemon.py
@@ -196,7 +196,7 @@ def handle_sigterm(*args):
                                 pass
                             break
                         gc.collect()
-                except:
+                except BaseException:
                     traceback.print_exc()
                     os._exit(1)
                 else:
2 changes: 1 addition & 1 deletion python/pyspark/ml/linalg/__init__.py
@@ -60,7 +60,7 @@
     import scipy.sparse
 
     _have_scipy = True
-except:
+except BaseException:
     # No SciPy in environment, but that's okay
     _have_scipy = False
 
2 changes: 1 addition & 1 deletion python/pyspark/ml/tests/test_persistence.py
@@ -379,7 +379,7 @@ def _run_test_onevsrest(self, LogisticRegressionCls):
         )
 
         lr = LogisticRegressionCls(maxIter=5, regParam=0.01)
-        ovr = OneVsRest(classifier=lr)
+        OneVsRest(classifier=lr)
 
         def reload_and_compare(ovr, suffix):
             model = ovr.fit(df)
6 changes: 1 addition & 5 deletions python/pyspark/ml/tests/test_tuning.py
@@ -80,11 +80,7 @@ def _fit(self, dataset):
 class ParamGridBuilderTests(SparkSessionTestCase):
     def test_addGrid(self):
         with self.assertRaises(TypeError):
-            grid = (
-                ParamGridBuilder()
-                .addGrid("must be an instance of Param", ["not", "string"])
-                .build()
-            )
+            (ParamGridBuilder().addGrid("must be an instance of Param", ["not", "string"]).build())
 
 
 class ValidatorTestUtilsMixin:
2 changes: 1 addition & 1 deletion python/pyspark/ml/tests/test_wrapper.py
@@ -67,7 +67,7 @@ def condition():
 
         try:
             summary.__del__()
-        except:
+        except BaseException:
             pass
 
         def condition():
4 changes: 2 additions & 2 deletions python/pyspark/mllib/classification.py
@@ -161,7 +161,7 @@ class LogisticRegressionModel(LinearClassificationModel):
     >>> from shutil import rmtree
     >>> try:
     ...     rmtree(path)
-    ... except:
+    ... except BaseException:
     ...     pass
     >>> multi_class_data = [
     ...     LabeledPoint(0.0, [0.0, 1.0, 0.0]),
@@ -537,7 +537,7 @@ class SVMModel(LinearClassificationModel):
     >>> from shutil import rmtree
     >>> try:
     ...     rmtree(path)
-    ... except:
+    ... except BaseException:
     ...     pass
     """
 
2 changes: 1 addition & 1 deletion python/pyspark/mllib/linalg/__init__.py
@@ -63,7 +63,7 @@
     import scipy.sparse
 
     _have_scipy = True
-except:
+except BaseException:
     # No SciPy in environment, but that's okay
     _have_scipy = False
 
6 changes: 3 additions & 3 deletions python/pyspark/mllib/regression.py
@@ -180,7 +180,7 @@ class LinearRegressionModel(LinearRegressionModelBase):
     >>> from shutil import rmtree
     >>> try:
     ...     rmtree(path)
-    ... except:
+    ... except BaseException:
     ...     pass
     >>> data = [
     ...     LabeledPoint(0.0, SparseVector(1, {0: 0.0})),
@@ -383,7 +383,7 @@ class LassoModel(LinearRegressionModelBase):
     >>> from shutil import rmtree
     >>> try:
     ...     rmtree(path)
-    ... except:
+    ... except BaseException:
     ...     pass
     >>> data = [
     ...     LabeledPoint(0.0, SparseVector(1, {0: 0.0})),
@@ -557,7 +557,7 @@ class RidgeRegressionModel(LinearRegressionModelBase):
     >>> from shutil import rmtree
     >>> try:
     ...     rmtree(path)
-    ... except:
+    ... except BaseException:
     ...     pass
     >>> data = [
     ...     LabeledPoint(0.0, SparseVector(1, {0: 0.0})),
2 changes: 1 addition & 1 deletion python/pyspark/mllib/tests/test_algorithms.py
@@ -295,7 +295,7 @@ def test_regression(self):
         GradientBoostedTrees.trainRegressor(
             rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=32
         )
-        with self.assertRaises(Exception) as cm:
+        with self.assertRaises(Exception):
             GradientBoostedTrees.trainRegressor(
                 rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4, maxBins=1
             )
1 change: 0 additions & 1 deletion python/pyspark/mllib/tests/test_streaming_algorithms.py
@@ -303,7 +303,6 @@ def collect_errors(rdd):
             true, predicted = zip(*rdd.collect())
             errors.append(self.calculate_accuracy_error(true, predicted))
 
-        true_predicted = []
         input_stream = self.ssc.queueStream(input_batches)
         predict_stream = self.ssc.queueStream(predict_batches)
         slr.trainOn(input_stream)
2 changes: 1 addition & 1 deletion python/pyspark/mllib/tests/test_util.py
@@ -61,7 +61,7 @@ def test_load_vectors(self):
             self.assertEqual(len(ret), 2)
             self.assertEqual(ret[0], DenseVector([1.0, 2.0, 3.0]))
             self.assertEqual(ret[1], DenseVector([1.0, 2.0, 3.0]))
-        except:
+        except BaseException:
             self.fail()
         finally:
             shutil.rmtree(load_vectors_path)
1 change: 0 additions & 1 deletion python/pyspark/pandas/data_type_ops/base.py
@@ -383,7 +383,6 @@ def eq(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
         from pyspark.pandas.frame import DataFrame
         from pyspark.pandas.internal import NATURAL_ORDER_COLUMN_NAME, InternalField
 
-        len_right = len(right)
         if len(left) != len(right):
             raise ValueError("Lengths must be equal")
 