
Commit ee74498

[SPARK-8725][PROJECT-INFRA] Test modules in topologically-sorted order in dev/run-tests
This patch improves our `dev/run-tests` script to test modules in a topologically-sorted order based on the modules' dependencies. This helps ensure that bugs in upstream projects are not misattributed to downstream projects simply because the downstream projects' tests were the first to exhibit the failure. Topological sorting also shortens the feedback loop when testing pull requests: if I make a change in SQL, then the SQL tests should run before MLlib, not after.

In addition, this patch updates our test module definitions to split `sql` into `catalyst`, `sql`, and `hive`, in order to allow more tests to be skipped when changing only `hive/` files.

Author: Josh Rosen <[email protected]>

Closes apache#10885 from JoshRosen/SPARK-8725.
1 parent fbf7623 commit ee74498
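
To make the new behavior concrete, below is a minimal, self-contained sketch of level-by-level topological sorting over a toy module graph. The topo_levels helper and the module names are illustrative stand-ins, not the vendored implementation added in this commit (see dev/sparktestsupport/toposort.py below).

# Illustrative sketch: emit modules so each is tested only after its
# dependencies. The real graph lives in dev/sparktestsupport/modules.py.
def topo_levels(graph):
    """Yield sets of items whose dependencies have all been emitted."""
    remaining = {k: set(v) for k, v in graph.items()}
    while remaining:
        ready = {k for k, deps in remaining.items() if not deps}
        if not ready:
            raise ValueError("Cyclic dependencies: %r" % remaining)
        yield ready
        remaining = {k: deps - ready
                     for k, deps in remaining.items() if k not in ready}

deps = {"catalyst": set(), "sql": {"catalyst"},
        "hive": {"sql"}, "hive-thriftserver": {"hive"}}
print([sorted(level) for level in topo_levels(deps)])
# [['catalyst'], ['sql'], ['hive'], ['hive-thriftserver']]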

File tree: 4 files changed, +162 -18 lines

NOTICE (+16)

@@ -650,3 +650,19 @@ For CSV functionality:
 */
 
 
+===============================================================================
+For dev/sparktestsupport/toposort.py:
+
+Copyright 2014 True Blade Systems, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.

dev/run-tests.py (+15 -10)

@@ -29,6 +29,7 @@
 
 from sparktestsupport import SPARK_HOME, USER_HOME, ERROR_CODES
 from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which
+from sparktestsupport.toposort import toposort_flatten, toposort
 import sparktestsupport.modules as modules
 
 
@@ -43,7 +44,7 @@ def determine_modules_for_files(filenames):
     If a file is not associated with a more specific submodule, then this method will consider that
     file to belong to the 'root' module.
 
-    >>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/test/foo"]))
+    >>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/core/foo"]))
     ['pyspark-core', 'sql']
     >>> [x.name for x in determine_modules_for_files(["file_not_matched_by_any_subproject"])]
     ['root']
@@ -99,14 +100,16 @@ def determine_modules_to_test(changed_modules):
     Given a set of modules that have changed, compute the transitive closure of those modules'
     dependent modules in order to determine the set of modules that should be tested.
 
-    >>> sorted(x.name for x in determine_modules_to_test([modules.root]))
+    Returns a topologically-sorted list of modules (ties are broken by sorting on module names).
+
+    >>> [x.name for x in determine_modules_to_test([modules.root])]
     ['root']
-    >>> sorted(x.name for x in determine_modules_to_test([modules.graphx]))
-    ['examples', 'graphx']
-    >>> x = sorted(x.name for x in determine_modules_to_test([modules.sql]))
+    >>> [x.name for x in determine_modules_to_test([modules.graphx])]
+    ['graphx', 'examples']
+    >>> x = [x.name for x in determine_modules_to_test([modules.sql])]
     >>> x # doctest: +NORMALIZE_WHITESPACE
-    ['examples', 'hive-thriftserver', 'mllib', 'pyspark-ml', \
-    'pyspark-mllib', 'pyspark-sql', 'sparkr', 'sql']
+    ['sql', 'hive', 'mllib', 'examples', 'hive-thriftserver', 'pyspark-sql', 'sparkr',
+     'pyspark-mllib', 'pyspark-ml']
     """
     # If we're going to have to run all of the tests, then we can just short-circuit
     # and return 'root'. No module depends on root, so if it appears then it will be
@@ -116,7 +119,9 @@ def determine_modules_to_test(changed_modules):
     modules_to_test = set()
     for module in changed_modules:
         modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules))
-    return modules_to_test.union(set(changed_modules))
+    modules_to_test = modules_to_test.union(set(changed_modules))
+    return toposort_flatten(
+        {m: set(m.dependencies).intersection(modules_to_test) for m in modules_to_test}, sort=True)
 
 
 def determine_tags_to_exclude(changed_modules):
@@ -377,12 +382,12 @@ def run_scala_tests_maven(test_profiles):
 
 def run_scala_tests_sbt(test_modules, test_profiles):
 
-    sbt_test_goals = set(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules))
+    sbt_test_goals = list(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules))
 
     if not sbt_test_goals:
         return
 
-    profiles_and_goals = test_profiles + list(sbt_test_goals)
+    profiles_and_goals = test_profiles + sbt_test_goals
 
     print("[info] Running Spark tests using SBT with these arguments: ",
           " ".join(profiles_and_goals))

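The new return path in determine_modules_to_test can be exercised in isolation. A minimal sketch, assuming dev/ is on sys.path (so the vendored sparktestsupport.toposort is importable) and using plain strings as stand-ins for Module objects:

from sparktestsupport.toposort import toposort_flatten  # vendored in this commit

# Each module to test maps to the subset of its dependencies that are also
# being tested, mirroring the dict comprehension in the diff above.
modules_to_test = {"catalyst", "sql", "hive", "examples"}
dependencies = {"catalyst": set(), "sql": {"catalyst"},
                "hive": {"sql"}, "examples": {"hive"}}

graph = {m: dependencies[m] & modules_to_test for m in modules_to_test}
print(toposort_flatten(graph, sort=True))
# ['catalyst', 'sql', 'hive', 'examples'] -- upstream modules come first

This ordering is also why run_scala_tests_sbt now builds sbt_test_goals as a list rather than a set: a set would discard the topological order that the module list now carries.
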
dev/sparktestsupport/modules.py (+46 -8)

@@ -15,12 +15,14 @@
 # limitations under the License.
 #
 
+from functools import total_ordering
 import itertools
 import re
 
 all_modules = []
 
 
+@total_ordering
 class Module(object):
     """
     A module is the basic abstraction in our test runner script. Each module consists of a set of
@@ -75,20 +77,56 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=
     def contains_file(self, filename):
         return any(re.match(p, filename) for p in self.source_file_prefixes)
 
+    def __repr__(self):
+        return "Module<%s>" % self.name
+
+    def __lt__(self, other):
+        return self.name < other.name
+
+    def __eq__(self, other):
+        return self.name == other.name
+
+    def __ne__(self, other):
+        return not (self.name == other.name)
+
+    def __hash__(self):
+        return hash(self.name)
+
+
+catalyst = Module(
+    name="catalyst",
+    dependencies=[],
+    source_file_regexes=[
+        "sql/catalyst/",
+    ],
+    sbt_test_goals=[
+        "catalyst/test",
+    ],
+)
+
 
 sql = Module(
     name="sql",
-    dependencies=[],
+    dependencies=[catalyst],
     source_file_regexes=[
-        "sql/(?!hive-thriftserver)",
+        "sql/core/",
+    ],
+    sbt_test_goals=[
+        "sql/test",
+    ],
+)
+
+hive = Module(
+    name="hive",
+    dependencies=[sql],
+    source_file_regexes=[
+        "sql/hive/",
         "bin/spark-sql",
     ],
     build_profile_flags=[
         "-Phive",
     ],
     sbt_test_goals=[
-        "catalyst/test",
-        "sql/test",
         "hive/test",
     ],
     test_tags=[
@@ -99,7 +137,7 @@ def contains_file(self, filename):
 
 hive_thriftserver = Module(
     name="hive-thriftserver",
-    dependencies=[sql],
+    dependencies=[hive],
    source_file_regexes=[
         "sql/hive-thriftserver",
         "sbin/start-thriftserver.sh",
@@ -282,7 +320,7 @@ def contains_file(self, filename):
 
 examples = Module(
     name="examples",
-    dependencies=[graphx, mllib, streaming, sql],
+    dependencies=[graphx, mllib, streaming, hive],
     source_file_regexes=[
         "examples/",
     ],
@@ -314,7 +352,7 @@ def contains_file(self, filename):
 
 pyspark_sql = Module(
     name="pyspark-sql",
-    dependencies=[pyspark_core, sql],
+    dependencies=[pyspark_core, hive],
     source_file_regexes=[
         "python/pyspark/sql"
     ],
@@ -404,7 +442,7 @@ def contains_file(self, filename):
 
 sparkr = Module(
     name="sparkr",
-    dependencies=[sql, mllib],
+    dependencies=[hive, mllib],
     source_file_regexes=[
         "R/",
     ],

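The new dunder methods on Module are what let modules flow through the sorter: toposort() stores them in sets and uses them as dict keys (hence __hash__ and __eq__), and toposort_flatten(sort=True) sorts them for deterministic tie-breaking (hence @total_ordering with __lt__). A stripped-down sketch of that pattern, with the constructor reduced to a name for brevity:

from functools import total_ordering

@total_ordering
class Module(object):
    # Sketch: only the identity/ordering behavior from the patch.
    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "Module<%s>" % self.name

    def __lt__(self, other):
        return self.name < other.name

    def __eq__(self, other):
        return self.name == other.name

    def __hash__(self):
        return hash(self.name)

# Hashability lets modules live in sets; ordering makes ties deterministic.
mods = {Module("sql"), Module("hive"), Module("catalyst")}
print(sorted(mods))  # [Module<catalyst>, Module<hive>, Module<sql>]
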
dev/sparktestsupport/toposort.py (new file, +85)

@@ -0,0 +1,85 @@
+#######################################################################
+# Implements a topological sort algorithm.
+#
+# Copyright 2014 True Blade Systems, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Notes:
+#  Based on http://code.activestate.com/recipes/578272-topological-sort
+#   with these major changes:
+#    Added unittests.
+#    Deleted doctests (maybe not the best idea in the world, but it cleans
+#     up the docstring).
+#    Moved functools import to the top of the file.
+#    Changed assert to a ValueError.
+#    Changed iter[items|keys] to [items|keys], for python 3
+#     compatibility; I don't think it matters for python 2, where these are
+#     now lists instead of iterables.
+#    Copy the input so as to leave it unmodified.
+#    Renamed function from toposort2 to toposort.
+#    Handle empty input.
+#    Switch tests to use set literals.
+#
+########################################################################
+
+from functools import reduce as _reduce
+
+
+__all__ = ['toposort', 'toposort_flatten']
+
+
+def toposort(data):
+    """Dependencies are expressed as a dictionary whose keys are items
+    and whose values are a set of dependent items. Output is a list of
+    sets in topological order. The first set consists of items with no
+    dependencies; each subsequent set consists of items that depend upon
+    items in the preceding sets.
+    """
+
+    # Special case empty input.
+    if len(data) == 0:
+        return
+
+    # Copy the input so as to leave it unmodified.
+    data = data.copy()
+
+    # Ignore self dependencies.
+    for k, v in data.items():
+        v.discard(k)
+    # Find all items that don't depend on anything.
+    extra_items_in_deps = _reduce(set.union, data.values()) - set(data.keys())
+    # Add empty dependencies where needed.
+    data.update({item: set() for item in extra_items_in_deps})
+    while True:
+        ordered = set(item for item, dep in data.items() if len(dep) == 0)
+        if not ordered:
+            break
+        yield ordered
+        data = {item: (dep - ordered)
+                for item, dep in data.items()
+                if item not in ordered}
+    if len(data) != 0:
+        raise ValueError('Cyclic dependencies exist among these items: {}'.format(
+            ', '.join(repr(x) for x in data.items())))
+
+
+def toposort_flatten(data, sort=True):
+    """Returns a single list of dependencies. For any set returned by
+    toposort(), those items are sorted and appended to the result (just to
+    make the results deterministic)."""
+
+    result = []
+    for d in toposort(data):
+        result.extend((sorted if sort else list)(d))
+    return result

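For reference, the two vendored helpers behave as follows; the numeric graph is just sample data, and the import assumes the script runs with dev/ on sys.path:

from sparktestsupport.toposort import toposort, toposort_flatten

# Each key depends on the items in its value set.
data = {2: {11}, 9: {11, 8}, 10: {11, 3}, 11: {7, 5}, 8: {7, 3}}

print(list(toposort(data)))
# [{3, 5, 7}, {8, 11}, {2, 9, 10}] -- each level depends only on earlier ones

print(toposort_flatten(data))
# [3, 5, 7, 8, 11, 2, 9, 10] -- levels flattened, sorted within each level

# Cyclic input raises ValueError instead of looping forever:
try:
    list(toposort({1: {2}, 2: {1}}))
except ValueError as e:
    print("error:", e)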