Skip to content

Commit

Permalink
feat(ssc): Surface Child Dependencies for Maven (semgrep#8373)
Browse files Browse the repository at this point in the history
This PR is part of the [Path to
Transitivity](https://www.notion.so/semgrep/Path-to-Transitivity-a1539071c8074acf986de73f4cc6778d)
project. In this PR, we introduce logic to parse each dependency's
children and modify the `FoundDependency` class to include the list of
packages that the dependency depends on. This data is then sent to the
app via the `/complete` endpoint, where we will construct the adjacency
list (see the scope doc linked above).

PR checklist:

- [ ] Purpose of the code is [evident to future
readers](https://semgrep.dev/docs/contributing/contributing-code/#explaining-code)
- [ ] Tests included or PR comment includes a reproducible test plan
- [ ] Documentation is up-to-date
- [ ] A changelog entry was [added to
changelog.d](https://semgrep.dev/docs/contributing/contributing-code/#adding-a-changelog-entry)
for any user-facing change
- [ ] Change has no security implications (otherwise, ping security
team)

If you're unsure about any of this, please see:

- [Contribution
guidelines](https://semgrep.dev/docs/contributing/contributing-code)!
- [One of the more specific guides located
here](https://semgrep.dev/docs/contributing/contributing/)

---------

Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Matthew McQuaid <[email protected]>
  • Loading branch information
3 people authored Aug 7, 2023
1 parent e332767 commit 176cfc3
Show file tree
Hide file tree
Showing 9 changed files with 213 additions and 37 deletions.
1 change: 1 addition & 0 deletions changelog.d/sc-996.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Maven dependency tree parsing now surfaces child dependencies for each package
64 changes: 53 additions & 11 deletions cli/src/semdep/parsers/pom_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
Based on the output of this maven plugin https://maven.apache.org/plugins/maven-dependency-plugin/tree-mojo.html
"""
from pathlib import Path
from typing import Any
from typing import List
from typing import Optional
from typing import Tuple
Expand All @@ -13,7 +14,9 @@
from semdep.parsers.util import DependencyFileToParse
from semdep.parsers.util import DependencyParserError
from semdep.parsers.util import mark_line
from semdep.parsers.util import ParsedDependency
from semdep.parsers.util import safe_parse_lockfile_and_manifest
from semgrep.semgrep_interfaces.semgrep_output_v1 import DependencyChild
from semgrep.semgrep_interfaces.semgrep_output_v1 import Direct
from semgrep.semgrep_interfaces.semgrep_output_v1 import Ecosystem
from semgrep.semgrep_interfaces.semgrep_output_v1 import FoundDependency
Expand Down Expand Up @@ -47,11 +50,16 @@
regex(r"((\| )|( ))*").bind(
lambda depth: (regex("(\\+- )|(\\\\- )"))
>> dep.map(
lambda d: (
Transitivity(Transitive() if len(depth) // 3 > 0 else Direct()),
d[0],
d[1],
)
lambda d: {
"line_number": 0,
"depth": len(depth) // 3,
"transitivity": Transitivity(
Transitive() if len(depth) // 3 > 0 else Direct()
),
"children": [],
"package": d[0],
"version": d[1],
}
)
# ignore lines that we don't recognize
| consume_line
Expand All @@ -67,6 +75,39 @@
)


def get_children(deps: List[Any]) -> List[ParsedDependency]:
    """
    Attach direct children to every dependency parsed from a Maven dependency tree.

    [deps] is a sequence of (line_number, dep) pairs. A dep of None is skipped;
    otherwise dep is a dict holding at least "depth", "package", "version" and a
    "children" list. Each dict is mutated in place: it is stamped with its
    "line_number" and its "children" list receives one DependencyChild per
    dependency found directly beneath it.

    Returns one ParsedDependency per non-None entry. An entry is emitted when its
    subtree is closed, so it always appears after all of its descendants.
    """
    # Chain of entries whose subtree is still open; depth strictly increases
    # from the bottom of the list to the top.
    open_ancestors: List[Any] = []
    finished = []
    for line_number, node in deps:
        if node is None:
            continue
        node["line_number"] = line_number

        # Anything at the same depth or deeper than [node] cannot be its parent,
        # so those subtrees are complete and can be frozen.
        while open_ancestors and open_ancestors[-1]["depth"] >= node["depth"]:
            finished.append(ParsedDependency.from_dict(open_ancestors.pop()))

        # Whatever remains on top is the closest shallower entry: the parent.
        if open_ancestors:
            open_ancestors[-1]["children"].append(
                DependencyChild(package=node["package"], version=node["version"])
            )
        open_ancestors.append(node)

    # Flush the entries that were still open when the input ended.
    while open_ancestors:
        finished.append(ParsedDependency.from_dict(open_ancestors.pop()))
    return finished


def parse_pom_tree(
tree_path: Path, _: Optional[Path]
) -> Tuple[List[FoundDependency], List[DependencyParserError]]:
Expand All @@ -78,23 +119,24 @@ def parse_pom_tree(
return [], errors
output = []
seen_matches = set()
for line_number, match in parsed_lockfile:
deps_with_children = get_children(parsed_lockfile)
for match in deps_with_children:
if match is None:
continue

if match in seen_matches:
continue
seen_matches.add(match)

transitivity, package, version = match
output.append(
FoundDependency(
package=package,
version=version,
package=match.package,
version=match.version,
ecosystem=Ecosystem(Maven()),
allowed_hashes={},
transitivity=transitivity,
line_number=line_number,
transitivity=match.transitivity,
line_number=match.line_number,
children=match.children,
)
)
return output, errors
89 changes: 63 additions & 26 deletions cli/src/semdep/parsers/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,21 @@
is a perfectly acceptable type annotation for Mypy, and evaluates immediately to string,
causing no runtime errors.
"""
from __future__ import annotations

from base64 import b16encode
from base64 import b64decode
from dataclasses import dataclass
from pathlib import Path
from re import escape
from typing import Any
from typing import Callable
from typing import cast
from typing import Dict
from typing import Generic
from typing import List
from typing import Optional
from typing import Set
from typing import Tuple
from typing import TypeVar
from typing import Union

from ruamel.yaml import YAMLError

Expand All @@ -36,6 +36,7 @@
from semdep.external.parsy import string
from semdep.external.parsy import success
from semgrep.console import console
from semgrep.semgrep_interfaces.semgrep_output_v1 import DependencyChild
from semgrep.semgrep_interfaces.semgrep_output_v1 import DependencyParserError
from semgrep.semgrep_interfaces.semgrep_output_v1 import Direct
from semgrep.semgrep_interfaces.semgrep_output_v1 import ScaParserName
Expand All @@ -55,15 +56,15 @@
Pos = Tuple[int, int]


def not_any(*chars: str) -> "Parser[str]":
def not_any(*chars: str) -> Parser[str]:
"""
[chars] must contain only single character strings.
A parser which matches a series of any character that is *not* in [chars] and returns a string
"""
return regex(f"[^{escape(''.join(chars))}]+").desc(f"Any char not in {list(chars)}")


def extract_npm_lockfile_hash(s: Optional[str]) -> Dict[str, List[str]]:
def extract_npm_lockfile_hash(s: str | None) -> dict[str, list[str]]:
"""
Go from:
sha512-aePbxDmcYW++PaqBsJ+HYUFwCdv4LVvdnhBy78E57PIor8/OVvhMrADFFEDh8DHDFRv/O9i3lPhsENjO7QX0+A==
Expand All @@ -89,28 +90,28 @@ def extract_npm_lockfile_hash(s: Optional[str]) -> Dict[str, List[str]]:
line_number = line_info.map(lambda t: t[0] + 1)


def mark_line(p: "Parser[A]") -> "Parser[Tuple[int,A]]":
def mark_line(p: Parser[A]) -> Parser[tuple[int, A]]:
"""
Returns a parser which gets the current line number, runs [p] and then produces a pair of the line number and the result of [p]
"""
return line_number.bind(lambda line: p.bind(lambda x: success((line, x))))


def pair(p1: "Parser[A]", p2: "Parser[B]") -> "Parser[Tuple[A,B]]":
def pair(p1: Parser[A], p2: Parser[B]) -> Parser[tuple[A, B]]:
"""
Returns a parser which runs [p1] then [p2] and produces a pair of the results
"""
return p1.bind(lambda a: p2.bind(lambda b: success((a, b))))


def triple(p1: "Parser[A]", p2: "Parser[B]", p3: "Parser[C]") -> "Parser[Tuple[A,B,C]]":
def triple(p1: Parser[A], p2: Parser[B], p3: Parser[C]) -> Parser[tuple[A, B, C]]:
"""
Returns a parser which runs [p1] then [p2] then [p3] and produces a triple of the results
"""
return p1.bind(lambda a: p2.bind(lambda b: p3.bind(lambda c: success((a, b, c)))))


def transitivity(manifest_deps: Optional[Set[A]], dep_sources: List[A]) -> Transitivity:
def transitivity(manifest_deps: set[A] | None, dep_sources: list[A]) -> Transitivity:
"""
Computes the transitivity of a package, based on the set of dependencies from a manifest file
[manifest_deps] can be None in the case where we did not find a manifest file
Expand All @@ -135,7 +136,7 @@ def transitivity(manifest_deps: Optional[Set[A]], dep_sources: List[A]) -> Trans
return Transitivity(Unknown())


def become(p1: "Parser[A]", p2: "Parser[A]") -> None:
def become(p1: Parser[A], p2: Parser[A]) -> None:
"""
Gives [p1] the behavior of [p2] by side effect.
Typed version of the [become] method on "forward declaration" parsers from semdep.external.parsy.
Expand All @@ -146,7 +147,7 @@ def become(p1: "Parser[A]", p2: "Parser[A]") -> None:
p1.__class__ = p2.__class__


def delay(p: Callable[[], "Parser[A]"]) -> "Parser[A]":
def delay(p: Callable[[], Parser[A]]) -> Parser[A]:
"""
For use when defining (mutually) recursive functions that return parsers. See yarn.py for an example.
Basically if you have some mutually recursive functions that produce parsers, evaluating one of
Expand All @@ -157,7 +158,7 @@ def delay(p: Callable[[], "Parser[A]"]) -> "Parser[A]":
return Parser(lambda x, y: p()(x, y))


def quoted(p: "Parser[A]") -> "Parser[A]":
def quoted(p: Parser[A]) -> Parser[A]:
"""
Parse [p], surrounded by quotes, ignoring the quotes in the output
"""
Expand All @@ -178,7 +179,7 @@ def upto(
include_other: bool = False,
consume_other: bool = False,
allow_newline: bool = False,
) -> "Parser[str]":
) -> Parser[str]:
"""
[s] must be a list of single character strings. These should be all the possible delimiters
you want to parse "up to"
Expand Down Expand Up @@ -221,14 +222,14 @@ def parse_error_to_str(e: ParseError) -> str:
@dataclass
class DependencyFileToParse(Generic[A]):
path: Path
parser: Union["Parser[A]", Callable[[str], A]]
parser: Parser[A] | Callable[[str], A]
parser_name: ScaParserName
preprocessor: Callable[[str], str] = lambda ξ: ξ # noqa: E731


def parse_dependency_file(
file_to_parse: Optional[DependencyFileToParse[A]],
) -> Union[A, DependencyParserError, None]:
file_to_parse: DependencyFileToParse[A] | None,
) -> A | DependencyParserError | None:
"""
Run [parser] on the text in [path]
Expand Down Expand Up @@ -301,8 +302,8 @@ def parse_dependency_file(

def safe_parse_lockfile_and_manifest(
lockfile_to_parse: DependencyFileToParse[A],
manifest_to_parse: Optional[DependencyFileToParse[B]],
) -> Tuple[Optional[A], Optional[B], List[DependencyParserError]]:
manifest_to_parse: DependencyFileToParse[B] | None,
) -> tuple[A | None, B | None, list[DependencyParserError]]:
"""
Parse a lockfile and a manifest file, returning the results along with a list of errors that occurred in either parser
"""
Expand All @@ -318,6 +319,42 @@ def safe_parse_lockfile_and_manifest(
return parsed_lockfile, parsed_manifest, errors


@dataclass(eq=False)
class ParsedDependency:
    """
    Snapshot of a dependency taken once lockfile parsing and the collection of
    its children are finished.

    Equality and hashing consider only package, version and transitivity;
    line_number and children are deliberately left out of the comparison.
    """

    line_number: int
    transitivity: Transitivity
    children: list[DependencyChild]
    package: str
    version: str

    @staticmethod
    def from_dict(d: dict[str, Any]) -> ParsedDependency:
        """
        Build a ParsedDependency from the keys "line_number", "transitivity",
        "children", "package" and "version" of [d]. Any other key is ignored and
        the children list is shallow-copied.
        """
        return ParsedDependency(
            d["line_number"],
            d["transitivity"],
            list(d["children"]),
            d["package"],
            d["version"],
        )

    def _identity(self) -> tuple[str, str, Transitivity]:
        # The fields that decide whether two parsed dependencies are the same
        return (self.package, self.version, self.transitivity)

    def __eq__(self, other: object) -> bool:
        if isinstance(other, ParsedDependency):
            return self._identity() == other._identity()
        return NotImplemented

    def __hash__(self) -> int:
        return hash(self._identity())


# A parser for JSON, using a line_number annotated JSON type. This is adapted from an example in the Parsy repo.
# It is almost identical except for the addition of types, line number tracking, and some minor renaming
# https://github.com/python-parsy/parsy/blob/master/examples/json.py
Expand All @@ -326,25 +363,25 @@ def safe_parse_lockfile_and_manifest(
@dataclass
class JSON:
line_number: int
value: Union[None, bool, str, float, int, List["JSON"], Dict[str, "JSON"]]
value: None | bool | str | float | int | list[JSON] | dict[str, JSON]

@staticmethod
def make(
marked: Tuple[
marked: tuple[
Pos,
Union[None, bool, str, float, int, List["JSON"], Dict[str, "JSON"]],
None | bool | str | float | int | list[JSON] | dict[str, JSON],
Pos,
]
) -> "JSON":
) -> JSON:
return JSON(marked[0][0] + 1, marked[1])

def as_dict(self) -> Dict[str, "JSON"]:
def as_dict(self) -> dict[str, JSON]:
return cast(Dict[str, "JSON"], self.value)

def as_str(self) -> str:
return cast(str, self.value)

def as_list(self) -> List["JSON"]:
def as_list(self) -> list[JSON]:
return cast(List["JSON"], self.value)

def as_int(self) -> int:
Expand All @@ -355,7 +392,7 @@ def as_int(self) -> int:
whitespace = regex(r"\s*")


def lexeme(p: "Parser[A]") -> "Parser[A]":
def lexeme(p: Parser[A]) -> Parser[A]:
return p << whitespace


Expand Down Expand Up @@ -387,7 +424,7 @@ def lexeme(p: "Parser[A]") -> "Parser[A]":
quoted_str = lexeme(quoted((string_part | string_esc).many().concat()))

# Data structures
json_value: "Parser[JSON]" = fail("forward ref")
json_value: Parser[JSON] = fail("forward ref")
object_pair = pair((quoted_str << colon), json_value)
json_object = lbrace >> object_pair.sep_by(comma).map(lambda x: dict(x)) << rbrace
array = lbrack >> json_value.sep_by(comma) << rbrack
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ SEMGREP_USER_AGENT_APPEND="pytest" SEMGREP_SETTINGS_FILE="<MASKED>" SEMGREP_VERS
},
"found_dependency": {
"allowed_hashes": {},
"children": [],
"ecosystem": "maven",
"line_number": 3,
"package": "org.apache.logging.log4j:log4j-core",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ SEMGREP_USER_AGENT_APPEND="pytest" SEMGREP_SETTINGS_FILE="<MASKED>" SEMGREP_VERS
},
"found_dependency": {
"allowed_hashes": {},
"children": [],
"ecosystem": "maven",
"line_number": 2,
"package": "com.google.inject:guice",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ SEMGREP_USER_AGENT_APPEND="pytest" SEMGREP_SETTINGS_FILE="<MASKED>" SEMGREP_VERS
},
"found_dependency": {
"allowed_hashes": {},
"children": [],
"ecosystem": "maven",
"line_number": 2,
"package": "com.google.inject:guice",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ SEMGREP_USER_AGENT_APPEND="pytest" SEMGREP_SETTINGS_FILE="<MASKED>" SEMGREP_VERS
},
"found_dependency": {
"allowed_hashes": {},
"children": [],
"ecosystem": "maven",
"line_number": 2,
"package": "com.google.inject:guice",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ SEMGREP_USER_AGENT_APPEND="pytest" SEMGREP_SETTINGS_FILE="<MASKED>" SEMGREP_VERS
},
"found_dependency": {
"allowed_hashes": {},
"children": [],
"ecosystem": "maven",
"line_number": 2,
"package": "com.google.inject:guice",
Expand Down
Loading

0 comments on commit 176cfc3

Please sign in to comment.