Skip to content

Commit

Permalink
Add fallback functionality to semgrep scan when fetching config from …
Browse files Browse the repository at this point in the history
…semgrep.dev (semgrep#8459)

PR checklist:

- [X] Purpose of the code is [evident to future
readers](https://semgrep.dev/docs/contributing/contributing-code/#explaining-code)
- [x] Tests included or PR comment includes a reproducible test plan
- [ ] Documentation is up-to-date
- [x] A changelog entry was [added to
changelog.d](https://semgrep.dev/docs/contributing/contributing-code/#adding-a-changelog-entry)
for any user-facing change
- [ ] Change has no security implications (otherwise, ping security
team)

If you're unsure about any of this, please see:

- [Contribution
guidelines](https://semgrep.dev/docs/contributing/contributing-code)!
- [One of the more specific guides located
here](https://semgrep.dev/docs/contributing/contributing/)
  • Loading branch information
jbergler authored Aug 15, 2023
1 parent f493174 commit 0a640f4
Show file tree
Hide file tree
Showing 9 changed files with 150 additions and 59 deletions.
1 change: 1 addition & 0 deletions changelog.d/gh-8459.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
`semgrep scan` is now more resilient to failures when fetching config from semgrep.dev. If it can't fetch a config from semgrep.dev it will use backup infrastructure to fetch the most recent successful config for that customers environment.
89 changes: 54 additions & 35 deletions cli/src/semgrep/config_resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@
from semgrep.util import is_config_suffix
from semgrep.util import is_rules
from semgrep.util import is_url
from semgrep.util import terminal_wrap
from semgrep.util import with_color
from semgrep.verbose_logging import getLogger

Expand Down Expand Up @@ -89,7 +88,6 @@ class ConfigLoader:
_origin = ConfigType.LOCAL
_config_path = ""
_project_url = None
_extra_headers: Dict[str, str] = {}

def __init__(
self,
Expand All @@ -106,6 +104,7 @@ def __init__(
state = get_state()
self._project_url = project_url
self._origin = ConfigType.REGISTRY
self._supports_fallback_config = False
if config_str == "r2c":
state.metrics.add_feature("config", "r2c")
self._config_path = "https://semgrep.dev/c/p/r2c"
Expand All @@ -115,9 +114,11 @@ def __init__(
elif is_policy_id(config_str):
state.metrics.add_feature("config", "policy")
self._config_path = url_for_policy()
self._supports_fallback_config = True
elif is_supply_chain(config_str):
state.metrics.add_feature("config", "sca")
self._config_path = url_for_supply_chain()
self._supports_fallback_config = True
elif is_registry_id(config_str):
state.metrics.add_feature("config", f"registry:prefix-{config_str[0]}")
self._config_path = registry_id_to_url(config_str)
Expand Down Expand Up @@ -169,20 +170,55 @@ def _download_config(self) -> ConfigFile:
"""
Download a configuration from semgrep.dev
"""
config_url = self._config_path
logger.debug(f"trying to download from {self._nice_semgrep_url(config_url)}")
try:
config = ConfigFile(
None,
self._make_config_request(),
config_url,
)
logger.debug(f"finished downloading from {config_url}")
return config
except Exception as e:
raise SemgrepError(
terminal_wrap(f"Failed to download config from {config_url}: {str(e)}")
)
return self._download_config_from_url(self._config_path)
except Exception:
if self._supports_fallback_config:
try:
fallback_url = re.sub(
r"^[^?]*", # replace everything but query params
f"{get_state().env.fail_open_url}/config",
self._config_path,
)
return self._download_config_from_url(fallback_url)
except Exception:
pass

raise # error from first fetch

def _download_config_from_url(self, url: str) -> ConfigFile:
app_session = get_state().app_session
logger.debug("Downloading config from %s", url)
error = f"Failed to download configuration from {url}"
try:
resp = app_session.get(url, headers={"Accept": "application/json"})
if resp.status_code == requests.codes.ok:
try:
rule_config = resp.json()["rule_config"]

# The backend wants to return native json, but we support a json string here too
config_str = (
rule_config
if isinstance(rule_config, str)
else json.dumps(rule_config)
)

return ConfigFile(None, config_str, url)
except Exception as ex:
# catch JSONDecodeError, AssertionError, etc. is this needed?
logger.debug("Failed to decode JSON: %s", repr(ex))
return ConfigFile(
None, resp.content.decode("utf-8", errors="replace"), url
)
finally:
logger.debug(f"Downloaded config from %s", url)

error += f" HTTP {resp.status_code}."
except requests.exceptions.RetryError as ex:
error += f" Failed after multiple attempts ({ex.args[0].reason})"

logger.debug(error) # since the raised exception may be caught and suppressed
raise SemgrepError(error)

def _load_config_from_local_path(self) -> List[ConfigFile]:
"""
Expand All @@ -209,24 +245,6 @@ def _load_config_from_local_path(self) -> List[ConfigFile]:
logger.debug(f"Done loading local config from {loc}")
return config

def _make_config_request(self) -> str:
app_session = get_state().app_session
resp = app_session.get(
self._config_path,
headers={"Accept": "application/json", **self._extra_headers},
)
if resp.status_code == requests.codes.ok:
try:
rule_config = resp.json()["rule_config"]
assert isinstance(rule_config, str)
return rule_config
except Exception: # catch JSONDecodeError, AssertionError, etc.
return resp.content.decode("utf-8", errors="replace")
else:
raise SemgrepError(
f"bad status code: {resp.status_code} returned by config url: {self._config_path}"
)

def is_registry_url(self) -> bool:
return self._origin == ConfigType.REGISTRY

Expand Down Expand Up @@ -755,16 +773,17 @@ def is_policy_id(config_str: str) -> bool:

def url_for_supply_chain() -> str:
env = get_state().env
repo_name = os.environ.get("SEMGREP_REPO_NAME")

# The app considers anything that will not POST back to it to be a dry_run
params = {
"sca": True,
"dry_run": True,
"full_scan": True,
"repo_name": repo_name,
"semgrep_version": __VERSION__,
}
if "SEMGREP_REPO_NAME" in os.environ:
params["repo_name"] = os.environ.get("SEMGREP_REPO_NAME")

params_str = urlencode(params)
return f"{env.semgrep_url}/{DEFAULT_SEMGREP_APP_CONFIG_URL}?{params_str}"

Expand Down
9 changes: 7 additions & 2 deletions cli/src/semgrep/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ def url(value: str) -> str:
return value.rstrip("/")


def migrate_fail_open_url(value: str) -> str:
# Supports the fail_open_url being the hostname without the path even when some folks might set the path
return url(value.replace("/failure", ""))


@overload
def EnvFactory(envvars: Union[str, Iterable[str]], default: str) -> str:
...
Expand Down Expand Up @@ -53,9 +58,9 @@ class Env:
fail_open_url: str = field(
default=EnvFactory(
["SEMGREP_FAIL_OPEN_URL"],
"https://fail-open.prod.semgrep.dev/failure",
"https://fail-open.prod.semgrep.dev",
),
converter=url,
converter=migrate_fail_open_url,
)
semgrep_url: str = field(
default=EnvFactory(["SEMGREP_URL", "SEMGREP_APP_URL"], "https://semgrep.dev"),
Expand Down
8 changes: 4 additions & 4 deletions cli/src/semgrep/error_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,12 @@ def send(self, exit_code: int) -> int:
):
return exit_code

url = f"{state.env.fail_open_url}/failure"

import traceback

logger.debug(
f"Sending to fail-open endpoint {state.env.fail_open_url} since fail-open is configured to {self.suppress_errors}"
f"Sending to fail-open endpoint {url} since fail-open is configured to {self.suppress_errors}"
)

token = auth.get_token()
Expand All @@ -82,9 +84,7 @@ def send(self, exit_code: int) -> int:
self.payload["error"] = traceback.format_exc()

try:
requests.post(
state.env.fail_open_url, headers=headers, json=self.payload, timeout=3
)
requests.post(url, headers=headers, json=self.payload, timeout=3)
except Exception as e:
logger.debug(f"Error sending to fail-open endpoint: {e}")

Expand Down
20 changes: 17 additions & 3 deletions cli/tests/e2e/test_ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@
from semgrep import __VERSION__
from semgrep.app.scans import ScanHandler
from semgrep.app.session import AppSession
from semgrep.config_resolver import ConfigFile
from semgrep.config_resolver import ConfigLoader
from semgrep.error_handler import ErrorHandler
from semgrep.meta import GithubMeta
from semgrep.meta import GitlabMeta
from semgrep.meta import GitMeta
from semgrep.metrics import Metrics


pytestmark = pytest.mark.kinda_slow

REPO_ORG_NAME = "org_name"
Expand Down Expand Up @@ -301,7 +303,11 @@ def automocks(mocker):
"""
).lstrip()

mocker.patch.object(ConfigLoader, "_make_config_request", return_value=file_content)
mocker.patch.object(
ConfigLoader,
"_download_config_from_url",
side_effect=lambda url: ConfigFile(None, file_content, url),
)
mocker.patch.object(
ScanHandler,
"_get_scan_config_from_app",
Expand Down Expand Up @@ -1539,7 +1545,11 @@ def test_query_dependency(
sca-kind: upgrade-only
"""
).lstrip()
mocker.patch.object(ConfigLoader, "_make_config_request", return_value=file_content)
mocker.patch.object(
ConfigLoader,
"_download_config_from_url",
side_effect=lambda url: ConfigFile(None, file_content, url),
)
mocker.patch.object(
ScanHandler,
"_get_scan_config_from_app",
Expand Down Expand Up @@ -1613,7 +1623,11 @@ def test_existing_supply_chain_finding(
sca-kind: upgrade-only
"""
).lstrip()
mocker.patch.object(ConfigLoader, "_make_config_request", return_value=file_content)
mocker.patch.object(
ConfigLoader,
"_download_config_from_url",
side_effect=lambda url: ConfigFile(None, file_content, url),
)
mocker.patch.object(
ScanHandler,
"_get_scan_config_from_app",
Expand Down
64 changes: 53 additions & 11 deletions cli/tests/e2e/test_config_resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,30 @@
from tests.semgrep_runner import SemgrepRunner

from semgrep.cli import cli
from semgrep.config_resolver import ConfigFile
from semgrep.config_resolver import ConfigLoader
from semgrep.error import SemgrepError


@pytest.mark.quick
def test_new_feature_registry_config(monkeypatch, snapshot, mocker, tmp_path):
file_content = dedent(
"""
rules:
- id: eqeq-bad
pattern-new-feature: $X == $X
message: "useless comparison"
languages: [python]
severity: ERROR
"""
).lstrip()
mocker.patch.object(ConfigLoader, "_make_config_request", return_value=file_content)
config_file = ConfigFile(
None,
dedent(
"""
rules:
- id: eqeq-bad
pattern-new-feature: $X == $X
message: "useless comparison"
languages: [python]
severity: ERROR
"""
).lstrip(),
"https://semgrep.dev/p/ci",
)
mocker.patch.object(
ConfigLoader, "_download_config_from_url", return_value=config_file
)

runner = SemgrepRunner(
env={
Expand All @@ -29,3 +37,37 @@ def test_new_feature_registry_config(monkeypatch, snapshot, mocker, tmp_path):
)
result = runner.invoke(cli, ["scan", "--config", "p/ci"])
snapshot.assert_match(result.output, "output.txt")


@pytest.mark.quick
def test_fallback_config_works(monkeypatch, snapshot, mocker, tmp_path):
config_file = ConfigFile(
None,
"rules: []",
"https://semgrep.dev/p/ci",
)
patched_download = mocker.patch.object(
ConfigLoader,
"_download_config_from_url",
side_effect=[
SemgrepError(
f"Failed to download configuration. HTTP 500 when fetching URL"
),
config_file,
],
)

runner = SemgrepRunner(
env={
"SEMGREP_SETTINGS_FILE": str(tmp_path / ".settings.yaml"),
"SEMGREP_APP_TOKEN": "",
}
)
result = runner.invoke(cli, ["scan", "--debug", "--config", "supply-chain"])

assert "https://semgrep.dev/api" in patched_download.call_args_list[0].args[0]
assert (
"https://fail-open.prod.semgrep.dev/"
in patched_download.call_args_list[1].args[0]
)
assert "loaded 1 configs" in result.stdout
10 changes: 9 additions & 1 deletion cli/tests/e2e/test_login.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from tests.semgrep_runner import SemgrepRunner

from semgrep.cli import cli
from semgrep.config_resolver import ConfigFile
from semgrep.config_resolver import ConfigLoader


@pytest.mark.slow
Expand Down Expand Up @@ -106,8 +108,14 @@ def test_login(tmp_path, mocker):
assert "Need to set env var SEMGREP_REPO_NAME" in result.output

# Run policy. Check that request is made to correct deployment + org/repo
mocked_config = mocker.patch.object(
ConfigLoader,
"_download_config_from_url",
side_effect=lambda url: ConfigFile(None, "invalid-conifg}", url),
)
result = runner.invoke(
cli, ["--config", "policy"], env={"SEMGREP_REPO_NAME": "org/repo"}
)
assert result.exit_code == 7
assert "https://semgrep.dev/api/agent/deployments/scans/config" in result.output
assert mocked_config.called
assert "remote-url_0 was not a mapping" in result.output
6 changes: 4 additions & 2 deletions cli/tests/e2e/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@
from tests.semgrep_runner import SemgrepRunner

from semgrep.cli import cli
from semgrep.config_resolver import ConfigFile
from semgrep.profiling import ProfilingData


# Test data to avoid making web calls in test code

USELESS_EQEQ = """rules:
Expand Down Expand Up @@ -56,8 +58,8 @@ def __cmp__(...):
@pytest.fixture(scope="function")
def mock_config_request(monkeypatch: MonkeyPatch) -> Iterator[None]:
monkeypatch.setattr(
"semgrep.config_resolver.ConfigLoader._make_config_request",
lambda s: USELESS_EQEQ,
"semgrep.config_resolver.ConfigLoader._download_config_from_url",
lambda s, url: ConfigFile(None, USELESS_EQEQ, url),
)
yield

Expand Down
2 changes: 1 addition & 1 deletion cli/tests/unit/test_error_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def mock_get_token(mocker):
def mocked_state(mocker):
mocked = mocker.MagicMock()
mocked.app_session.user_agent = FAKE_USER_AGENT
mocked.env.fail_open_url = FAIL_OPEN_URL
mocked.env.fail_open_url = FAIL_OPEN_URL.replace("/failure", "")
mocker.patch("semgrep.state.get_state", return_value=mocked)
yield mocked

Expand Down

0 comments on commit 0a640f4

Please sign in to comment.