feat(ssc): Surface Child Dependencies for Maven (semgrep#8373)

This PR is part of the [Path to Transitivity](https://www.notion.so/semgrep/Path-to-Transitivity-a1539071c8074acf986de73f4cc6778d) project. In this PR, we introduce logic to parse each dependency's children and modify the `FoundDependency` class so that it now includes the list of packages that the dependency depends on. This data is then sent to the app via the `/complete` endpoint, where we will construct the adjacency list (see the scope doc linked above).

PR checklist:

- [ ] Purpose of the code is [evident to future readers](https://semgrep.dev/docs/contributing/contributing-code/#explaining-code)
- [ ] Tests included or PR comment includes a reproducible test plan
- [ ] Documentation is up-to-date
- [ ] A changelog entry was [added to changelog.d](https://semgrep.dev/docs/contributing/contributing-code/#adding-a-changelog-entry) for any user-facing change
- [ ] Change has no security implications (otherwise, ping security team)

If you're unsure about any of this, please see:

- [Contribution guidelines](https://semgrep.dev/docs/contributing/contributing-code)!
- [One of the more specific guides located here](https://semgrep.dev/docs/contributing/contributing/)

---------

Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Matthew McQuaid <[email protected]>
rkialashaki · Aug 7, 2023 · 176cfc3 · 176cfc3
1 parent e332767
commit 176cfc3
Show file tree

Hide file tree

Showing 9 changed files with 213 additions and 37 deletions.
diff --git a/changelog.d/sc-996.added b/changelog.d/sc-996.added
@@ -0,0 +1 @@
+Maven Dep Tree parsing now surfaces children dependencies per package
diff --git a/cli/src/semdep/parsers/pom_tree.py b/cli/src/semdep/parsers/pom_tree.py
@@ -3,6 +3,7 @@
 Based on the output of this maven plugin https://maven.apache.org/plugins/maven-dependency-plugin/tree-mojo.html
 """
 from pathlib import Path
+from typing import Any
 from typing import List
 from typing import Optional
 from typing import Tuple
@@ -13,7 +14,9 @@
 from semdep.parsers.util import DependencyFileToParse
 from semdep.parsers.util import DependencyParserError
 from semdep.parsers.util import mark_line
+from semdep.parsers.util import ParsedDependency
 from semdep.parsers.util import safe_parse_lockfile_and_manifest
+from semgrep.semgrep_interfaces.semgrep_output_v1 import DependencyChild
 from semgrep.semgrep_interfaces.semgrep_output_v1 import Direct
 from semgrep.semgrep_interfaces.semgrep_output_v1 import Ecosystem
 from semgrep.semgrep_interfaces.semgrep_output_v1 import FoundDependency
@@ -47,11 +50,16 @@
     regex(r"((\|  )|(   ))*").bind(
         lambda depth: (regex("(\\+- )|(\\\\- )"))
         >> dep.map(
-            lambda d: (
-                Transitivity(Transitive() if len(depth) // 3 > 0 else Direct()),
-                d[0],
-                d[1],
-            )
+            lambda d: {
+                "line_number": 0,
+                "depth": len(depth) // 3,
+                "transitivity": Transitivity(
+                    Transitive() if len(depth) // 3 > 0 else Direct()
+                ),
+                "children": [],
+                "package": d[0],
+                "version": d[1],
+            }
         )
         # ignore lines that we don't recognize
         | consume_line
@@ -67,6 +75,39 @@
 )
 
 
+def get_children(deps: List[Any]) -> List[ParsedDependency]:
+    stack: List[Any] = []
+    results = []
+    for line_number, dep in deps:
+        if dep is None:
+            continue
+        dep["line_number"] = line_number
+        if not stack:
+            stack.append(dep)
+            continue
+        if dep["depth"] == stack[-1]["depth"]:
+            results.append(ParsedDependency.from_dict(stack.pop()))
+            if stack:
+                child = DependencyChild(package=dep["package"], version=dep["version"])
+                stack[-1]["children"].append(child)
+            stack.append(dep)
+        elif dep["depth"] > stack[-1]["depth"]:
+            child = DependencyChild(package=dep["package"], version=dep["version"])
+            stack[-1]["children"].append(child)
+            stack.append(dep)
+        else:
+            while len(stack) > 0 and dep["depth"] <= stack[-1]["depth"]:
+                results.append(ParsedDependency.from_dict(stack.pop()))
+            if stack:
+                child = DependencyChild(package=dep["package"], version=dep["version"])
+                stack[-1]["children"].append(child)
+            stack.append(dep)
+
+    while len(stack) > 0:
+        results.append(ParsedDependency.from_dict(stack.pop()))
+    return results
+
+
 def parse_pom_tree(
     tree_path: Path, _: Optional[Path]
 ) -> Tuple[List[FoundDependency], List[DependencyParserError]]:
@@ -78,23 +119,24 @@ def parse_pom_tree(
         return [], errors
     output = []
     seen_matches = set()
-    for line_number, match in parsed_lockfile:
+    deps_with_children = get_children(parsed_lockfile)
+    for match in deps_with_children:
         if match is None:
             continue
 
         if match in seen_matches:
             continue
         seen_matches.add(match)
 
-        transitivity, package, version = match
         output.append(
             FoundDependency(
-                package=package,
-                version=version,
+                package=match.package,
+                version=match.version,
                 ecosystem=Ecosystem(Maven()),
                 allowed_hashes={},
-                transitivity=transitivity,
-                line_number=line_number,
+                transitivity=match.transitivity,
+                line_number=match.line_number,
+                children=match.children,
             )
         )
     return output, errors
diff --git a/cli/src/semdep/parsers/util.py b/cli/src/semdep/parsers/util.py
@@ -9,21 +9,21 @@
 is a perfectly acceptable type annotation for Mypy, and evaluates immediately to string,
 causing no runtime errors.
 """
+from __future__ import annotations
+
 from base64 import b16encode
 from base64 import b64decode
 from dataclasses import dataclass
 from pathlib import Path
 from re import escape
+from typing import Any
 from typing import Callable
 from typing import cast
 from typing import Dict
 from typing import Generic
 from typing import List
-from typing import Optional
-from typing import Set
 from typing import Tuple
 from typing import TypeVar
-from typing import Union
 
 from ruamel.yaml import YAMLError
 
@@ -36,6 +36,7 @@
 from semdep.external.parsy import string
 from semdep.external.parsy import success
 from semgrep.console import console
+from semgrep.semgrep_interfaces.semgrep_output_v1 import DependencyChild
 from semgrep.semgrep_interfaces.semgrep_output_v1 import DependencyParserError
 from semgrep.semgrep_interfaces.semgrep_output_v1 import Direct
 from semgrep.semgrep_interfaces.semgrep_output_v1 import ScaParserName
@@ -55,15 +56,15 @@
 Pos = Tuple[int, int]
 
 
-def not_any(*chars: str) -> "Parser[str]":
+def not_any(*chars: str) -> Parser[str]:
     """
     [chars] must contain only single character strings.
     A parser which matches a series of any character that is *not* in [chars] and returns a string
     """
     return regex(f"[^{escape(''.join(chars))}]+").desc(f"Any char not in {list(chars)}")
 
 
-def extract_npm_lockfile_hash(s: Optional[str]) -> Dict[str, List[str]]:
+def extract_npm_lockfile_hash(s: str | None) -> dict[str, list[str]]:
     """
     Go from:
         sha512-aePbxDmcYW++PaqBsJ+HYUFwCdv4LVvdnhBy78E57PIor8/OVvhMrADFFEDh8DHDFRv/O9i3lPhsENjO7QX0+A==
@@ -89,28 +90,28 @@ def extract_npm_lockfile_hash(s: Optional[str]) -> Dict[str, List[str]]:
 line_number = line_info.map(lambda t: t[0] + 1)
 
 
-def mark_line(p: "Parser[A]") -> "Parser[Tuple[int,A]]":
+def mark_line(p: Parser[A]) -> Parser[tuple[int, A]]:
     """
     Returns a parser which gets the current line number, runs [p] and then produces a pair of the line number and the result of [p]
     """
     return line_number.bind(lambda line: p.bind(lambda x: success((line, x))))
 
 
-def pair(p1: "Parser[A]", p2: "Parser[B]") -> "Parser[Tuple[A,B]]":
+def pair(p1: Parser[A], p2: Parser[B]) -> Parser[tuple[A, B]]:
     """
     Returns a parser which runs [p1] then [p2] and produces a pair of the results
     """
     return p1.bind(lambda a: p2.bind(lambda b: success((a, b))))
 
 
-def triple(p1: "Parser[A]", p2: "Parser[B]", p3: "Parser[C]") -> "Parser[Tuple[A,B,C]]":
+def triple(p1: Parser[A], p2: Parser[B], p3: Parser[C]) -> Parser[tuple[A, B, C]]:
     """
     Returns a parser which runs [p1] then [p2] then [p3] and produces a triple of the results
     """
     return p1.bind(lambda a: p2.bind(lambda b: p3.bind(lambda c: success((a, b, c)))))
 
 
-def transitivity(manifest_deps: Optional[Set[A]], dep_sources: List[A]) -> Transitivity:
+def transitivity(manifest_deps: set[A] | None, dep_sources: list[A]) -> Transitivity:
     """
     Computes the transitivity of a package, based on the set of dependencies from a manifest file
     [manifest_deps] can be None in the case where we did not find a manifest file
@@ -135,7 +136,7 @@ def transitivity(manifest_deps: Optional[Set[A]], dep_sources: List[A]) -> Trans
         return Transitivity(Unknown())
 
 
-def become(p1: "Parser[A]", p2: "Parser[A]") -> None:
+def become(p1: Parser[A], p2: Parser[A]) -> None:
     """
     Gives [p1] the behavior of [p2] by side effect.
     Typed version of the [become] method on "forward declaration" parsers from semdep.external.parsy.
@@ -146,7 +147,7 @@ def become(p1: "Parser[A]", p2: "Parser[A]") -> None:
     p1.__class__ = p2.__class__
 
 
-def delay(p: Callable[[], "Parser[A]"]) -> "Parser[A]":
+def delay(p: Callable[[], Parser[A]]) -> Parser[A]:
     """
     For use when defining (mutually) recursive functions that return parsers. See yarn.py for an example.
     Basically if you have some mutually recursive functions that produce parsers, evaluating one of
@@ -157,7 +158,7 @@ def delay(p: Callable[[], "Parser[A]"]) -> "Parser[A]":
     return Parser(lambda x, y: p()(x, y))
 
 
-def quoted(p: "Parser[A]") -> "Parser[A]":
+def quoted(p: Parser[A]) -> Parser[A]:
     """
     Parse [p], surrounded by quotes, ignoring the quotes in the output
     """
@@ -178,7 +179,7 @@ def upto(
     include_other: bool = False,
     consume_other: bool = False,
     allow_newline: bool = False,
-) -> "Parser[str]":
+) -> Parser[str]:
     """
     [s] must be a list of single character strings. These should be all the possible delimiters
     you want to parse "up to"
@@ -221,14 +222,14 @@ def parse_error_to_str(e: ParseError) -> str:
 @dataclass
 class DependencyFileToParse(Generic[A]):
     path: Path
-    parser: Union["Parser[A]", Callable[[str], A]]
+    parser: Parser[A] | Callable[[str], A]
     parser_name: ScaParserName
     preprocessor: Callable[[str], str] = lambda ξ: ξ  # noqa: E731
 
 
 def parse_dependency_file(
-    file_to_parse: Optional[DependencyFileToParse[A]],
-) -> Union[A, DependencyParserError, None]:
+    file_to_parse: DependencyFileToParse[A] | None,
+) -> A | DependencyParserError | None:
     """
     Run [parser] on the text in [path]
 
@@ -301,8 +302,8 @@ def parse_dependency_file(
 
 def safe_parse_lockfile_and_manifest(
     lockfile_to_parse: DependencyFileToParse[A],
-    manifest_to_parse: Optional[DependencyFileToParse[B]],
-) -> Tuple[Optional[A], Optional[B], List[DependencyParserError]]:
+    manifest_to_parse: DependencyFileToParse[B] | None,
+) -> tuple[A | None, B | None, list[DependencyParserError]]:
     """
     Parse a lockfile and a manifest file, returning the results along with a list of errors that occurred in either parser
     """
@@ -318,6 +319,42 @@ def safe_parse_lockfile_and_manifest(
     return parsed_lockfile, parsed_manifest, errors
 
 
+@dataclass(eq=False)
+class ParsedDependency:
+    """
+    A dependency parsed from a lockfile. Used for freezing dependency information after
+    parsing and children addition.
+    """
+
+    line_number: int
+    transitivity: Transitivity
+    children: list[DependencyChild]
+    package: str
+    version: str
+
+    @staticmethod
+    def from_dict(d: dict[str, Any]) -> ParsedDependency:
+        return ParsedDependency(
+            line_number=d["line_number"],
+            transitivity=d["transitivity"],
+            children=[child for child in d["children"]],
+            package=d["package"],
+            version=d["version"],
+        )
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, ParsedDependency):
+            return NotImplemented
+        return (
+            self.package == other.package
+            and self.version == other.version
+            and self.transitivity == other.transitivity
+        )
+
+    def __hash__(self) -> int:
+        return hash((self.package, self.version, self.transitivity))
+
+
 # A parser for JSON, using a line_number annotated JSON type. This is adapted from an example in the Parsy repo.
 # It is almost identical except for the addition of types, line number tracking, and some minor renaming
 # https://github.com/python-parsy/parsy/blob/master/examples/json.py
@@ -326,25 +363,25 @@ def safe_parse_lockfile_and_manifest(
 @dataclass
 class JSON:
     line_number: int
-    value: Union[None, bool, str, float, int, List["JSON"], Dict[str, "JSON"]]
+    value: None | bool | str | float | int | list[JSON] | dict[str, JSON]
 
     @staticmethod
     def make(
-        marked: Tuple[
+        marked: tuple[
             Pos,
-            Union[None, bool, str, float, int, List["JSON"], Dict[str, "JSON"]],
+            None | bool | str | float | int | list[JSON] | dict[str, JSON],
             Pos,
         ]
-    ) -> "JSON":
+    ) -> JSON:
         return JSON(marked[0][0] + 1, marked[1])
 
-    def as_dict(self) -> Dict[str, "JSON"]:
+    def as_dict(self) -> dict[str, JSON]:
         return cast(Dict[str, "JSON"], self.value)
 
     def as_str(self) -> str:
         return cast(str, self.value)
 
-    def as_list(self) -> List["JSON"]:
+    def as_list(self) -> list[JSON]:
         return cast(List["JSON"], self.value)
 
     def as_int(self) -> int:
@@ -355,7 +392,7 @@ def as_int(self) -> int:
 whitespace = regex(r"\s*")
 
 
-def lexeme(p: "Parser[A]") -> "Parser[A]":
+def lexeme(p: Parser[A]) -> Parser[A]:
     return p << whitespace
 
 
@@ -387,7 +424,7 @@ def lexeme(p: "Parser[A]") -> "Parser[A]":
 quoted_str = lexeme(quoted((string_part | string_esc).many().concat()))
 
 # Data structures
-json_value: "Parser[JSON]" = fail("forward ref")
+json_value: Parser[JSON] = fail("forward ref")
 object_pair = pair((quoted_str << colon), json_value)
 json_object = lbrace >> object_pair.sep_by(comma).map(lambda x: dict(x)) << rbrace
 array = lbrack >> json_value.sep_by(comma) << rbrack

diff --git a/...endency_aware_rules/rulesdependency_awarelog4shell.yaml-dependency_awarelog4j/results.txt b/...endency_aware_rules/rulesdependency_awarelog4shell.yaml-dependency_awarelog4j/results.txt
@@ -107,6 +107,7 @@ SEMGREP_USER_AGENT_APPEND="pytest" SEMGREP_SETTINGS_FILE="<MASKED>" SEMGREP_VERS
             },
             "found_dependency": {
               "allowed_hashes": {},
+              "children": [],
               "ecosystem": "maven",
               "line_number": 3,
               "package": "org.apache.logging.log4j:log4j-core",

diff --git a/...esdependency_awaremaven-guice.yaml-dependency_awaremaven_dep_tree_extra_field/results.txt b/...esdependency_awaremaven-guice.yaml-dependency_awaremaven_dep_tree_extra_field/results.txt
@@ -40,6 +40,7 @@ SEMGREP_USER_AGENT_APPEND="pytest" SEMGREP_SETTINGS_FILE="<MASKED>" SEMGREP_VERS
             },
             "found_dependency": {
               "allowed_hashes": {},
+              "children": [],
               "ecosystem": "maven",
               "line_number": 2,
               "package": "com.google.inject:guice",

diff --git a/...s/rulesdependency_awaremaven-guice.yaml-dependency_awaremaven_dep_tree_joined/results.txt b/...s/rulesdependency_awaremaven-guice.yaml-dependency_awaremaven_dep_tree_joined/results.txt
@@ -40,6 +40,7 @@ SEMGREP_USER_AGENT_APPEND="pytest" SEMGREP_SETTINGS_FILE="<MASKED>" SEMGREP_VERS
             },
             "found_dependency": {
               "allowed_hashes": {},
+              "children": [],
               "ecosystem": "maven",
               "line_number": 2,
               "package": "com.google.inject:guice",

diff --git a/...rulesdependency_awaremaven-guice.yaml-dependency_awaremaven_dep_tree_optional/results.txt b/...rulesdependency_awaremaven-guice.yaml-dependency_awaremaven_dep_tree_optional/results.txt
@@ -40,6 +40,7 @@ SEMGREP_USER_AGENT_APPEND="pytest" SEMGREP_SETTINGS_FILE="<MASKED>" SEMGREP_VERS
             },
             "found_dependency": {
               "allowed_hashes": {},
+              "children": [],
               "ecosystem": "maven",
               "line_number": 2,
               "package": "com.google.inject:guice",

diff --git a/...pendency_awaremaven-guice.yaml-dependency_awaremaven_dep_tree_release_version/results.txt b/...pendency_awaremaven-guice.yaml-dependency_awaremaven_dep_tree_release_version/results.txt
@@ -40,6 +40,7 @@ SEMGREP_USER_AGENT_APPEND="pytest" SEMGREP_SETTINGS_FILE="<MASKED>" SEMGREP_VERS
             },
             "found_dependency": {
               "allowed_hashes": {},
+              "children": [],
               "ecosystem": "maven",
               "line_number": 2,
               "package": "com.google.inject:guice",
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Maven Dep Tree parsing now surfaces children dependencies per package