Add fallback functionality to semgrep scan when fetching config from …

…semgrep.dev (semgrep#8459) PR checklist: - [X] Purpose of the code is [evident to future readers](https://semgrep.dev/docs/contributing/contributing-code/#explaining-code) - [x] Tests included or PR comment includes a reproducible test plan - [ ] Documentation is up-to-date - [x] A changelog entry was [added to changelog.d](https://semgrep.dev/docs/contributing/contributing-code/#adding-a-changelog-entry) for any user-facing change - [ ] Change has no security implications (otherwise, ping security team) If you're unsure about any of this, please see: - [Contribution guidelines](https://semgrep.dev/docs/contributing/contributing-code)! - [One of the more specific guides located here](https://semgrep.dev/docs/contributing/contributing/)
rkialashaki · Aug 15, 2023 · 0a640f4 · 0a640f4
1 parent f493174
commit 0a640f4
Show file tree

Hide file tree

Showing 9 changed files with 150 additions and 59 deletions.
diff --git a/changelog.d/gh-8459.added b/changelog.d/gh-8459.added
@@ -0,0 +1 @@
+`semgrep scan` is now more resilient to failures when fetching config from semgrep.dev. If it can't fetch a config from semgrep.dev it will use backup infrastructure to fetch the most recent successful config for that customers environment.
diff --git a/cli/src/semgrep/config_resolver.py b/cli/src/semgrep/config_resolver.py
@@ -50,7 +50,6 @@
 from semgrep.util import is_config_suffix
 from semgrep.util import is_rules
 from semgrep.util import is_url
-from semgrep.util import terminal_wrap
 from semgrep.util import with_color
 from semgrep.verbose_logging import getLogger
 
@@ -89,7 +88,6 @@ class ConfigLoader:
     _origin = ConfigType.LOCAL
     _config_path = ""
     _project_url = None
-    _extra_headers: Dict[str, str] = {}
 
     def __init__(
         self,
@@ -106,6 +104,7 @@ def __init__(
         state = get_state()
         self._project_url = project_url
         self._origin = ConfigType.REGISTRY
+        self._supports_fallback_config = False
         if config_str == "r2c":
             state.metrics.add_feature("config", "r2c")
             self._config_path = "https://semgrep.dev/c/p/r2c"
@@ -115,9 +114,11 @@ def __init__(
         elif is_policy_id(config_str):
             state.metrics.add_feature("config", "policy")
             self._config_path = url_for_policy()
+            self._supports_fallback_config = True
         elif is_supply_chain(config_str):
             state.metrics.add_feature("config", "sca")
             self._config_path = url_for_supply_chain()
+            self._supports_fallback_config = True
         elif is_registry_id(config_str):
             state.metrics.add_feature("config", f"registry:prefix-{config_str[0]}")
             self._config_path = registry_id_to_url(config_str)
@@ -169,20 +170,55 @@ def _download_config(self) -> ConfigFile:
         """
         Download a configuration from semgrep.dev
         """
-        config_url = self._config_path
-        logger.debug(f"trying to download from {self._nice_semgrep_url(config_url)}")
         try:
-            config = ConfigFile(
-                None,
-                self._make_config_request(),
-                config_url,
-            )
-            logger.debug(f"finished downloading from {config_url}")
-            return config
-        except Exception as e:
-            raise SemgrepError(
-                terminal_wrap(f"Failed to download config from {config_url}: {str(e)}")
-            )
+            return self._download_config_from_url(self._config_path)
+        except Exception:
+            if self._supports_fallback_config:
+                try:
+                    fallback_url = re.sub(
+                        r"^[^?]*",  # replace everything but query params
+                        f"{get_state().env.fail_open_url}/config",
+                        self._config_path,
+                    )
+                    return self._download_config_from_url(fallback_url)
+                except Exception:
+                    pass
+
+            raise  # error from first fetch
+
+    def _download_config_from_url(self, url: str) -> ConfigFile:
+        app_session = get_state().app_session
+        logger.debug("Downloading config from %s", url)
+        error = f"Failed to download configuration from {url}"
+        try:
+            resp = app_session.get(url, headers={"Accept": "application/json"})
+            if resp.status_code == requests.codes.ok:
+                try:
+                    rule_config = resp.json()["rule_config"]
+
+                    # The backend wants to return native json, but we support a json string here too
+                    config_str = (
+                        rule_config
+                        if isinstance(rule_config, str)
+                        else json.dumps(rule_config)
+                    )
+
+                    return ConfigFile(None, config_str, url)
+                except Exception as ex:
+                    # catch JSONDecodeError, AssertionError, etc. is this needed?
+                    logger.debug("Failed to decode JSON: %s", repr(ex))
+                    return ConfigFile(
+                        None, resp.content.decode("utf-8", errors="replace"), url
+                    )
+                finally:
+                    logger.debug(f"Downloaded config from %s", url)
+
+            error += f" HTTP {resp.status_code}."
+        except requests.exceptions.RetryError as ex:
+            error += f" Failed after multiple attempts ({ex.args[0].reason})"
+
+        logger.debug(error)  # since the raised exception may be caught and suppressed
+        raise SemgrepError(error)
 
     def _load_config_from_local_path(self) -> List[ConfigFile]:
         """
@@ -209,24 +245,6 @@ def _load_config_from_local_path(self) -> List[ConfigFile]:
         logger.debug(f"Done loading local config from {loc}")
         return config
 
-    def _make_config_request(self) -> str:
-        app_session = get_state().app_session
-        resp = app_session.get(
-            self._config_path,
-            headers={"Accept": "application/json", **self._extra_headers},
-        )
-        if resp.status_code == requests.codes.ok:
-            try:
-                rule_config = resp.json()["rule_config"]
-                assert isinstance(rule_config, str)
-                return rule_config
-            except Exception:  # catch JSONDecodeError, AssertionError, etc.
-                return resp.content.decode("utf-8", errors="replace")
-        else:
-            raise SemgrepError(
-                f"bad status code: {resp.status_code} returned by config url: {self._config_path}"
-            )
-
     def is_registry_url(self) -> bool:
         return self._origin == ConfigType.REGISTRY
 
@@ -755,16 +773,17 @@ def is_policy_id(config_str: str) -> bool:
 
 def url_for_supply_chain() -> str:
     env = get_state().env
-    repo_name = os.environ.get("SEMGREP_REPO_NAME")
 
     # The app considers anything that will not POST back to it to be a dry_run
     params = {
         "sca": True,
         "dry_run": True,
         "full_scan": True,
-        "repo_name": repo_name,
         "semgrep_version": __VERSION__,
     }
+    if "SEMGREP_REPO_NAME" in os.environ:
+        params["repo_name"] = os.environ.get("SEMGREP_REPO_NAME")
+
     params_str = urlencode(params)
     return f"{env.semgrep_url}/{DEFAULT_SEMGREP_APP_CONFIG_URL}?{params_str}"
 

diff --git a/cli/src/semgrep/env.py b/cli/src/semgrep/env.py
@@ -16,6 +16,11 @@ def url(value: str) -> str:
     return value.rstrip("/")
 
 
+def migrate_fail_open_url(value: str) -> str:
+    # Supports the fail_open_url being the hostname without the path even when some folks might set the path
+    return url(value.replace("/failure", ""))
+
+
 @overload
 def EnvFactory(envvars: Union[str, Iterable[str]], default: str) -> str:
     ...
@@ -53,9 +58,9 @@ class Env:
     fail_open_url: str = field(
         default=EnvFactory(
             ["SEMGREP_FAIL_OPEN_URL"],
-            "https://fail-open.prod.semgrep.dev/failure",
+            "https://fail-open.prod.semgrep.dev",
         ),
-        converter=url,
+        converter=migrate_fail_open_url,
     )
     semgrep_url: str = field(
         default=EnvFactory(["SEMGREP_URL", "SEMGREP_APP_URL"], "https://semgrep.dev"),

diff --git a/cli/src/semgrep/error_handler.py b/cli/src/semgrep/error_handler.py
@@ -67,10 +67,12 @@ def send(self, exit_code: int) -> int:
         ):
             return exit_code
 
+        url = f"{state.env.fail_open_url}/failure"
+
         import traceback
 
         logger.debug(
-            f"Sending to fail-open endpoint {state.env.fail_open_url} since fail-open is configured to {self.suppress_errors}"
+            f"Sending to fail-open endpoint {url} since fail-open is configured to {self.suppress_errors}"
         )
 
         token = auth.get_token()
@@ -82,9 +84,7 @@ def send(self, exit_code: int) -> int:
             self.payload["error"] = traceback.format_exc()
 
         try:
-            requests.post(
-                state.env.fail_open_url, headers=headers, json=self.payload, timeout=3
-            )
+            requests.post(url, headers=headers, json=self.payload, timeout=3)
         except Exception as e:
             logger.debug(f"Error sending to fail-open endpoint: {e}")
 

diff --git a/cli/tests/e2e/test_ci.py b/cli/tests/e2e/test_ci.py
@@ -16,13 +16,15 @@
 from semgrep import __VERSION__
 from semgrep.app.scans import ScanHandler
 from semgrep.app.session import AppSession
+from semgrep.config_resolver import ConfigFile
 from semgrep.config_resolver import ConfigLoader
 from semgrep.error_handler import ErrorHandler
 from semgrep.meta import GithubMeta
 from semgrep.meta import GitlabMeta
 from semgrep.meta import GitMeta
 from semgrep.metrics import Metrics
 
+
 pytestmark = pytest.mark.kinda_slow
 
 REPO_ORG_NAME = "org_name"
@@ -301,7 +303,11 @@ def automocks(mocker):
         """
     ).lstrip()
 
-    mocker.patch.object(ConfigLoader, "_make_config_request", return_value=file_content)
+    mocker.patch.object(
+        ConfigLoader,
+        "_download_config_from_url",
+        side_effect=lambda url: ConfigFile(None, file_content, url),
+    )
     mocker.patch.object(
         ScanHandler,
         "_get_scan_config_from_app",
@@ -1539,7 +1545,11 @@ def test_query_dependency(
             sca-kind: upgrade-only
         """
     ).lstrip()
-    mocker.patch.object(ConfigLoader, "_make_config_request", return_value=file_content)
+    mocker.patch.object(
+        ConfigLoader,
+        "_download_config_from_url",
+        side_effect=lambda url: ConfigFile(None, file_content, url),
+    )
     mocker.patch.object(
         ScanHandler,
         "_get_scan_config_from_app",
@@ -1613,7 +1623,11 @@ def test_existing_supply_chain_finding(
             sca-kind: upgrade-only
         """
     ).lstrip()
-    mocker.patch.object(ConfigLoader, "_make_config_request", return_value=file_content)
+    mocker.patch.object(
+        ConfigLoader,
+        "_download_config_from_url",
+        side_effect=lambda url: ConfigFile(None, file_content, url),
+    )
     mocker.patch.object(
         ScanHandler,
         "_get_scan_config_from_app",

diff --git a/cli/tests/e2e/test_config_resolver.py b/cli/tests/e2e/test_config_resolver.py
@@ -4,22 +4,30 @@
 from tests.semgrep_runner import SemgrepRunner
 
 from semgrep.cli import cli
+from semgrep.config_resolver import ConfigFile
 from semgrep.config_resolver import ConfigLoader
+from semgrep.error import SemgrepError
 
 
 @pytest.mark.quick
 def test_new_feature_registry_config(monkeypatch, snapshot, mocker, tmp_path):
-    file_content = dedent(
-        """
-        rules:
-        - id: eqeq-bad
-          pattern-new-feature: $X == $X
-          message: "useless comparison"
-          languages: [python]
-          severity: ERROR
-        """
-    ).lstrip()
-    mocker.patch.object(ConfigLoader, "_make_config_request", return_value=file_content)
+    config_file = ConfigFile(
+        None,
+        dedent(
+            """
+            rules:
+            - id: eqeq-bad
+              pattern-new-feature: $X == $X
+              message: "useless comparison"
+              languages: [python]
+              severity: ERROR
+            """
+        ).lstrip(),
+        "https://semgrep.dev/p/ci",
+    )
+    mocker.patch.object(
+        ConfigLoader, "_download_config_from_url", return_value=config_file
+    )
 
     runner = SemgrepRunner(
         env={
@@ -29,3 +37,37 @@ def test_new_feature_registry_config(monkeypatch, snapshot, mocker, tmp_path):
     )
     result = runner.invoke(cli, ["scan", "--config", "p/ci"])
     snapshot.assert_match(result.output, "output.txt")
+
+
+@pytest.mark.quick
+def test_fallback_config_works(monkeypatch, snapshot, mocker, tmp_path):
+    config_file = ConfigFile(
+        None,
+        "rules: []",
+        "https://semgrep.dev/p/ci",
+    )
+    patched_download = mocker.patch.object(
+        ConfigLoader,
+        "_download_config_from_url",
+        side_effect=[
+            SemgrepError(
+                f"Failed to download configuration. HTTP 500 when fetching URL"
+            ),
+            config_file,
+        ],
+    )
+
+    runner = SemgrepRunner(
+        env={
+            "SEMGREP_SETTINGS_FILE": str(tmp_path / ".settings.yaml"),
+            "SEMGREP_APP_TOKEN": "",
+        }
+    )
+    result = runner.invoke(cli, ["scan", "--debug", "--config", "supply-chain"])
+
+    assert "https://semgrep.dev/api" in patched_download.call_args_list[0].args[0]
+    assert (
+        "https://fail-open.prod.semgrep.dev/"
+        in patched_download.call_args_list[1].args[0]
+    )
+    assert "loaded 1 configs" in result.stdout
diff --git a/cli/tests/e2e/test_login.py b/cli/tests/e2e/test_login.py
@@ -2,6 +2,8 @@
 from tests.semgrep_runner import SemgrepRunner
 
 from semgrep.cli import cli
+from semgrep.config_resolver import ConfigFile
+from semgrep.config_resolver import ConfigLoader
 
 
 @pytest.mark.slow
@@ -106,8 +108,14 @@ def test_login(tmp_path, mocker):
     assert "Need to set env var SEMGREP_REPO_NAME" in result.output
 
     # Run policy. Check that request is made to correct deployment + org/repo
+    mocked_config = mocker.patch.object(
+        ConfigLoader,
+        "_download_config_from_url",
+        side_effect=lambda url: ConfigFile(None, "invalid-conifg}", url),
+    )
     result = runner.invoke(
         cli, ["--config", "policy"], env={"SEMGREP_REPO_NAME": "org/repo"}
     )
     assert result.exit_code == 7
-    assert "https://semgrep.dev/api/agent/deployments/scans/config" in result.output
+    assert mocked_config.called
+    assert "remote-url_0 was not a mapping" in result.output
diff --git a/cli/tests/e2e/test_metrics.py b/cli/tests/e2e/test_metrics.py
@@ -19,8 +19,10 @@
 from tests.semgrep_runner import SemgrepRunner
 
 from semgrep.cli import cli
+from semgrep.config_resolver import ConfigFile
 from semgrep.profiling import ProfilingData
 
+
 # Test data to avoid making web calls in test code
 
 USELESS_EQEQ = """rules:
@@ -56,8 +58,8 @@ def __cmp__(...):
 @pytest.fixture(scope="function")
 def mock_config_request(monkeypatch: MonkeyPatch) -> Iterator[None]:
     monkeypatch.setattr(
-        "semgrep.config_resolver.ConfigLoader._make_config_request",
-        lambda s: USELESS_EQEQ,
+        "semgrep.config_resolver.ConfigLoader._download_config_from_url",
+        lambda s, url: ConfigFile(None, USELESS_EQEQ, url),
     )
     yield
 

diff --git a/cli/tests/unit/test_error_handler.py b/cli/tests/unit/test_error_handler.py
@@ -29,7 +29,7 @@ def mock_get_token(mocker):
 def mocked_state(mocker):
     mocked = mocker.MagicMock()
     mocked.app_session.user_agent = FAKE_USER_AGENT
-    mocked.env.fail_open_url = FAIL_OPEN_URL
+    mocked.env.fail_open_url = FAIL_OPEN_URL.replace("/failure", "")
     mocker.patch("semgrep.state.get_state", return_value=mocked)
     yield mocked
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		`semgrep scan` is now more resilient to failures when fetching config from semgrep.dev. If it can't fetch a config from semgrep.dev it will use backup infrastructure to fetch the most recent successful config for that customers environment.