Skip to content

Commit

Permalink
[BE] Clean up trymerge code handling flaky failures (pytorch#110133)
Browse files Browse the repository at this point in the history
This is the 2nd part of pytorch#110054.  The flaky classification has been done on Dr.CI.  There is no need to download flaky rule files and do the check anymore.  Some tests are also updated with new examples because we mocked the list of flaky rules there.  Similar tests have been done on Dr.CI.

* [x] pytorch#110054
* [x] Clean up the flaky rules logic because it has already been implemented on Dr.CI
* [ ] Clean up the broken trunk logic for the same reason

Pull Request resolved: pytorch#110133
Approved by: https://github.com/clee2000
  • Loading branch information
huydhn authored and pytorchmergebot committed Sep 30, 2023
1 parent f7ba3e8 commit 81a7445
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 116 deletions.
Binary file modified .github/scripts/drci_mocks.json.gz
Binary file not shown.
Binary file modified .github/scripts/gql_mocks.json.gz
Binary file not shown.
Binary file modified .github/scripts/rockset_mocks.json.gz
Binary file not shown.
120 changes: 47 additions & 73 deletions .github/scripts/test_trymerge.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
from trymerge import (
categorize_checks,
find_matching_merge_rule,
FlakyRule,
get_classifications,
get_drci_classifications,
get_rockset_results,
Expand Down Expand Up @@ -226,16 +225,6 @@ def mocked_read_merge_rules_raise(repo: Any, org: str, project: str) -> List[Mer
raise RuntimeError("testing")


def empty_flaky_rules() -> List[FlakyRule]:
return []


def xla_is_flaky_rules() -> List[FlakyRule]:
return [
FlakyRule("xla", ["FAILED: Build did NOT complete successfully"]),
]


def xla_merge_rules(repo: Any, org: str, project: str) -> List[MergeRule]:
return [
MergeRule(
Expand All @@ -247,6 +236,7 @@ def xla_merge_rules(repo: Any, org: str, project: str) -> List[MergeRule]:
"EasyCLA",
"pull / linux-bionic-py3_8-clang8-xla / build",
"pull / linux-bionic-py3_8-clang8-xla / test (xla, 1, 1, linux.4xlarge)",
"inductor / cuda11.8-py3.10-gcc7-sm86 / test (inductor_torchbench_dynamic, 1, 1, linux.g5.4xlarge.nvidia.gpu)",
],
ignore_flaky_failures=False,
),
Expand All @@ -268,7 +258,6 @@ def commit_message(self, ref: str) -> str:
return "super awsome commit message"


@mock.patch("trymerge.read_flaky_rules", side_effect=empty_flaky_rules)
@mock.patch("trymerge.get_rockset_results", side_effect=empty_rockset_results)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch(
Expand Down Expand Up @@ -677,55 +666,74 @@ def test_get_merge_base(self, *args: Any) -> None:
)
class TestBypassFailures(TestCase):
def test_get_classifications(self, *args: Any) -> None:
flaky_rules = [
# Try a regex rule
FlakyRule("distributed", ["##\\[error\\]The operation [wW]as .+"])
]
pr = GitHubPR("pytorch", "pytorch", 92863)
pr = GitHubPR("pytorch", "pytorch", 109584)
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
pr.last_commit()["oid"],
pr.get_merge_base(),
flaky_rules,
[],
)
self.assertTrue(
checks[
"pull / linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.4xlarge)"
"pull / linux-focal-py3.11-clang10 / test (dynamo, 1, 2, linux.2xlarge)"
].classification
== "BROKEN_TRUNK"
)
self.assertTrue(
checks[
"trunk / win-vs2019-cpu-py3 / test (default, 2, 3, windows.4xlarge.nonephemeral)"
].classification
== "BROKEN_TRUNK"
)
self.assertTrue(
checks[
"pull / linux-focal-py3.7-gcc7 / test (distributed, 1, 2, linux.2xlarge)"
"pull / linux-jammy-py3.8-gcc11 / test (distributed, 1, 2, linux.2xlarge)"
].classification
== "BROKEN_TRUNK"
)
self.assertTrue(
checks[
"pull / linux-focal-cuda11.8-py3.10-gcc9 / test (distributed, 1, 3, linux.8xlarge.nvidia.gpu)"
].classification
== "FLAKY"
)

# Set the threshold larger or equal to the number of ok failures
pending, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=2
checks, list(checks.keys()), ok_failed_checks_threshold=6
)
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["FLAKY"]) == 1)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 1)
self.assertTrue(len(ignorable["FLAKY"]) == 2)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 4)

# Not set any threshold, defaults to -1 to ignore all flaky and broken trunk failures
pending, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["FLAKY"]) == 1)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 1)
self.assertTrue(len(ignorable["FLAKY"]) == 2)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 4)

# Set the threshold lower than the number of ok failures
pending, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=1
)
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 2)
self.assertTrue(len(ignorable["FLAKY"]) == 1)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 1)
self.assertTrue(len(failed) == 6)
self.assertTrue(len(ignorable["FLAKY"]) == 2)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 4)

# Set the threshold to 0 like when ignore_flaky_failures is on
pending, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=1
)
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 6)
self.assertTrue(len(ignorable["FLAKY"]) == 2)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 4)

def test_get_classifications_similar_failures(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 109750)
Expand All @@ -737,7 +745,6 @@ def test_get_classifications_similar_failures(self, *args: Any) -> None:
pr.last_commit()["oid"],
pr.get_merge_base(),
[],
[],
)
pending, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
Expand All @@ -754,7 +761,6 @@ def test_get_classifications_unstable(self, *args: Any) -> None:
pr.last_commit()["oid"],
pr.get_merge_base(),
[],
[],
)
workflow_name = "linux-bionic-cuda12.1-py3.10-gcc9-bazel-test"
job_name = "build-and-test (default, 1, 1, linux.4xlarge.nvidia.gpu, unstable)"
Expand Down Expand Up @@ -812,7 +818,6 @@ def test_get_classifications_broken_trunk(self, *args: Any) -> None:
pr.last_commit()["oid"],
pr.get_merge_base(),
[],
[],
)

pending, failed, _ = categorize_checks(checks, list(checks.keys()))
Expand All @@ -832,35 +837,28 @@ def test_ignore_current(self, *args: Any) -> None:
# current checks takes place after other classifications: flaky, unstable,
# or broken trunk. Only actual new failures should be kept in the list of
# ignore current checks to use to record force merge with actual failures
flaky_rules = [
FlakyRule("distributed", ["##\\[error\\]The operation was canceled."])
]
flaky = (
"pull / linux-focal-py3.7-gcc7 / test (distributed, 1, 2, linux.2xlarge)"
)
flaky = "pull / linux-focal-cuda11.8-py3.10-gcc9 / test (distributed, 1, 3, linux.8xlarge.nvidia.gpu)"
broken_trunk = (
"pull / linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.4xlarge)"
"pull / linux-focal-py3.11-clang10 / test (dynamo, 1, 2, linux.2xlarge)"
)

pr = GitHubPR("pytorch", "pytorch", 92863)
pr = GitHubPR("pytorch", "pytorch", 109584)
checks = pr.get_checkrun_conclusions()

# No broken trunk or flaky rules, then all failures are ignored when ic is used
# No broken trunk or flaky as the merge base is not set, these failures are
# counted as ignore current when ic is used
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
pr.last_commit()["oid"],
None,
[],
[broken_trunk, flaky],
)
self.assertTrue(checks[flaky].classification == "IGNORE_CURRENT_CHECK")
self.assertTrue(checks[broken_trunk].classification == "IGNORE_CURRENT_CHECK")
_, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=2
)
self.assertTrue(len(failed) == 0)
_, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(failed) == 4)
self.assertTrue(len(ignorable["IGNORE_CURRENT_CHECK"]) == 2)
self.assertTrue(len(ignorable["FLAKY"]) == 0)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 0)
Expand All @@ -874,46 +872,22 @@ def test_ignore_current(self, *args: Any) -> None:
checks,
pr.last_commit()["oid"],
pr.get_merge_base(),
flaky_rules,
[broken_trunk, flaky],
)
self.assertTrue(checks[flaky].classification == "FLAKY")
self.assertTrue(checks[broken_trunk].classification == "BROKEN_TRUNK")
_, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=2
)
_, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["IGNORE_CURRENT_CHECK"]) == 0)
self.assertTrue(len(ignorable["FLAKY"]) == 1)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 1)

# Broken trunk takes precedence over ignore current (no flaky rule is set here)
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
pr.last_commit()["oid"],
pr.get_merge_base(),
[],
[broken_trunk, flaky],
)
self.assertTrue(checks[flaky].classification == "IGNORE_CURRENT_CHECK")
self.assertTrue(checks[broken_trunk].classification == "BROKEN_TRUNK")
_, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=2
)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["IGNORE_CURRENT_CHECK"]) == 1)
self.assertTrue(len(ignorable["FLAKY"]) == 0)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 1)
self.assertTrue(len(ignorable["FLAKY"]) == 2)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 4)

@mock.patch("trymerge.read_flaky_rules", side_effect=xla_is_flaky_rules)
@mock.patch("trymerge.read_merge_rules", side_effect=xla_merge_rules)
def test_dont_ignore_flaky_failures(self, *args: Any) -> None:
"""
Regression test for https://github.com/pytorch/test-infra/issues/4126
"""
pr = GitHubPR("pytorch", "pytorch", 100369)
pr = GitHubPR("pytorch", "pytorch", 105312)
repo = DummyGitRepo()
# Check that failure is classified as flaky but still raises exception
with warnings.catch_warnings(record=True) as w, self.assertRaises(RuntimeError):
Expand Down
48 changes: 5 additions & 43 deletions .github/scripts/trymerge.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,29 +74,6 @@ def __init__(self, name: str, url: str, status: Optional[str]):
self.jobs: JobNameToStateDict = {}


class FlakyRule:
def __init__(self, name: str, captures: List[str]):
self.name = re.compile(name)
self.captures = [re.compile(r) for r in captures]

def matches(self, job: Optional[Dict[str, Any]]) -> bool:
return (
job is not None
and self.name.search(job.get("name", "")) is not None
and job.get("failure_captures") is not None
and all(
any(
r.search(capture) is not None
for capture in job.get("failure_captures", [])
)
for r in self.captures
)
)

def __repr__(self) -> str:
return f"FlakyRule[name='{self.name}', captures={self.captures}]"


GH_PR_REVIEWS_FRAGMENT = """
fragment PRReviews on PullRequestReviewConnection {
nodes {
Expand Down Expand Up @@ -1261,13 +1238,6 @@ def read_merge_rules(
return [MergeRule(**x) for x in rc]


@lru_cache(maxsize=None)
def read_flaky_rules() -> List[FlakyRule]:
# NOTE: This is currently hardcoded, can be extended to do per repo rules
FLAKY_RULES_URL = "https://raw.githubusercontent.com/pytorch/test-infra/generated-stats/stats/flaky-rules.json"
return _get_flaky_rules(FLAKY_RULES_URL)


def find_matching_merge_rule(
pr: GitHubPR,
repo: Optional[GitRepo] = None,
Expand Down Expand Up @@ -1298,7 +1268,6 @@ def find_matching_merge_rule(
reject_reason = f"No rule found to match PR. Please [report]{issue_link} this issue to DevX team."

rules = read_merge_rules(repo, pr.org, pr.project)
flaky_rules = read_flaky_rules()
if not rules:
reject_reason = f"Rejecting the merge as no rules are defined for the repository in {MERGE_RULE_PATH}"
raise RuntimeError(reject_reason)
Expand All @@ -1318,7 +1287,6 @@ def find_matching_merge_rule(
checks,
pr.last_commit()["oid"],
base_rev,
flaky_rules,
ignore_current_checks=ignore_current_checks,
)

Expand Down Expand Up @@ -1469,11 +1437,6 @@ def checks_to_markdown_bullets(
]


@retries_decorator(rc=[])
def _get_flaky_rules(url: str) -> List[FlakyRule]:
return [FlakyRule(**rule) for rule in gh_fetch_json_list(url)]


@retries_decorator()
def save_merge_record(
collection: str,
Expand Down Expand Up @@ -1620,7 +1583,6 @@ def is_broken_trunk(

def is_flaky(
head_job: Optional[Dict[str, Any]],
flaky_rules: List[FlakyRule],
drci_classifications: Any,
) -> bool:
if not head_job or not drci_classifications:
Expand All @@ -1630,7 +1592,7 @@ def is_flaky(
return any(
head_job["name"] == flaky["name"]
for flaky in drci_classifications.get("FLAKY", [])
) or any(rule.matches(head_job) for rule in flaky_rules)
)


def get_classifications(
Expand All @@ -1639,7 +1601,6 @@ def get_classifications(
checks: Dict[str, JobCheckState],
head_sha: str,
merge_base: Optional[str],
flaky_rules: List[FlakyRule],
ignore_current_checks: Optional[List[str]],
) -> Dict[str, JobCheckState]:
# Group by job name without shard id and suffix to correctly identify broken
Expand Down Expand Up @@ -1716,6 +1677,9 @@ def insert(
name_no_suffix = remove_job_name_suffix(name)
head_sha_job = head_sha_jobs.get(name_no_suffix, {}).get(name)

# NB: It's important to note that when it comes to ghstack and broken trunk classification,
# the current implementation of trymerge uses the base of the current PR in the stack, i.e.
# gh/user/xx/base, while Dr.CI uses the base of the whole stack. Both works though
if is_broken_trunk(head_sha_job, merge_base_jobs.get(name_no_suffix)):
checks_with_classifications[name] = JobCheckState(
check.name,
Expand All @@ -1727,7 +1691,7 @@ def insert(
)
continue

elif is_flaky(head_sha_job, flaky_rules, drci_classifications):
elif is_flaky(head_sha_job, drci_classifications):
checks_with_classifications[name] = JobCheckState(
check.name, check.url, check.status, "FLAKY", check.job_id, check.title
)
Expand Down Expand Up @@ -2019,7 +1983,6 @@ def merge(
start_time = time.time()
last_exception = ""
elapsed_time = 0.0
flaky_rules = read_flaky_rules()
ignore_current_checks = [
x[0] for x in ignore_current_checks_info
] # convert to List[str] for convenience
Expand Down Expand Up @@ -2057,7 +2020,6 @@ def merge(
checks,
pr.last_commit()["oid"],
pr.get_merge_base(),
flaky_rules,
ignore_current_checks=ignore_current_checks,
)
pending, failing, _ = categorize_checks(
Expand Down

0 comments on commit 81a7445

Please sign in to comment.