-
Notifications
You must be signed in to change notification settings - Fork 28
feat(security): Add package name typosquatting detection #1059
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
92e5e6f
250f2e2
01ce3a8
a955da9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,280 @@ | ||
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. | ||
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. | ||
|
||
"""Analyzer checks if there is typosquatting presence in the package name.""" | ||
import logging | ||
import os | ||
|
||
from macaron.config.defaults import defaults | ||
from macaron.config.global_config import global_config | ||
from macaron.json_tools import JsonType | ||
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer | ||
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics | ||
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class TyposquattingPresenceAnalyzer(BaseHeuristicAnalyzer): | ||
"""Check whether the PyPI package has typosquatting presence.""" | ||
|
||
# Grid position (row, column) of every supported key on a QWERTY keyboard.
# Rows are listed top to bottom, starting with the digit row, and each row is
# treated as left-aligned, so the column is the key's index within its row.
KEYBOARD_LAYOUT = {
    key: (row, column)
    for row, keys in enumerate(("1234567890-", "qwertyuiop", "asdfghjkl", "zxcvbnm"))
    for column, key in enumerate(keys)
}
|
||
def __init__(self) -> None:
    """Register the heuristic and resolve its configuration.

    The settings come from ``_load_defaults``. A popular packages path set on
    ``global_config`` takes precedence over the one returned by ``_load_defaults``.
    """
    super().__init__(
        name="typosquatting_presence_analyzer",
        heuristic=Heuristics.TYPOSQUATTING_PRESENCE,
        depends_on=None,
    )

    configured_path, ratio_threshold, keyboard_cost, prefix_scaling, default_cost = self._load_defaults()
    self.popular_packages_path = configured_path
    self.distance_ratio_threshold = ratio_threshold
    self.keyboard = keyboard_cost
    self.scaling = prefix_scaling
    self.cost = default_cost

    # A path provided through the global configuration wins over defaults.ini.
    path_override = global_config.popular_packages_path
    if path_override is not None:
        self.popular_packages_path = path_override
|
||
def _load_defaults(self) -> tuple[str, float, float, float, float]:
    """Load the typosquatting settings from defaults.ini.

    The values are read from the ``heuristic.pypi`` section. Any option that is
    missing, and every option when the section itself is missing, falls back to
    the built-in value defined in this method.

    Returns
    -------
    tuple[str, float, float, float, float]:
        The path to the popular packages file, the distance ratio threshold,
        the substitution cost for keys that are neighbors on the keyboard, the
        prefix scaling factor, and the substitution cost for all other characters.
    """
    section_name = "heuristic.pypi"
    default_path = os.path.join(global_config.resources_path, "popular_packages.txt")
    # Built-in fallbacks, kept in one place so both return paths agree.
    default_distance_ratio_threshold = 0.95
    default_keyboard = 0.8
    default_scaling = 0.15
    default_cost = 1.0

    if not defaults.has_section(section_name):
        return (
            default_path,
            default_distance_ratio_threshold,
            default_keyboard,
            default_scaling,
            default_cost,
        )

    section = defaults[section_name]
    path = section.get("popular_packages_path", default_path)
    # Fall back to the default path if the path in defaults.ini is empty.
    if not path.strip():
        path = default_path
    return (
        path,
        section.getfloat("distance_ratio_threshold", default_distance_ratio_threshold),
        section.getfloat("keyboard", default_keyboard),
        section.getfloat("scaling", default_scaling),
        section.getfloat("cost", default_cost),
    )
|
||
def are_neighbors(self, char1: str, char2: str) -> bool:
    """Check if two characters are adjacent on a QWERTY keyboard.

    Adjacency is decided on the ``KEYBOARD_LAYOUT`` grid: two keys are neighbors
    when both their rows and their columns differ by at most one, so diagonal
    keys count as neighbors and a key is also reported as a neighbor of itself.

    Parameters
    ----------
    char1 : str
        The first character.
    char2 : str
        The second character.

    Returns
    -------
    bool
        True if the characters are neighbors, False otherwise. False is also
        returned when either character is not present in ``KEYBOARD_LAYOUT``.
    """
    first_position = self.KEYBOARD_LAYOUT.get(char1)
    second_position = self.KEYBOARD_LAYOUT.get(char2)
    # Compare against None explicitly rather than relying on tuple truthiness.
    if first_position is None or second_position is None:
        return False

    row_gap = abs(first_position[0] - second_position[0])
    column_gap = abs(first_position[1] - second_position[1])
    return row_gap <= 1 and column_gap <= 1
|
||
def substitution_func(self, char1: str, char2: str) -> float:
    """Return the cost of substituting one character for another.

    Parameters
    ----------
    char1 : str
        The first character.
    char2 : str
        The second character.

    Returns
    -------
    float
        0.0 when both characters are equal, ``self.keyboard`` when that value is
        non-zero and the characters are neighbors on a QWERTY keyboard, and
        ``self.cost`` in every other case.
    """
    if char1 == char2:
        return 0.0

    # The neighbor lookup is skipped entirely when the keyboard cost is zero.
    use_keyboard_cost = self.keyboard and self.are_neighbors(char1, char2)
    return self.keyboard if use_keyboard_cost else self.cost
|
||
def jaro_distance(self, package_name: str, popular_package_name: str) -> float:
    """Calculate the Jaro similarity between two package names.

    Despite the method name, the returned value is a similarity: identical
    names score 1.0 and names without any matching character score 0.0.
    Matched characters that are out of order are penalized with
    ``substitution_func`` instead of a fixed cost.

    Parameters
    ----------
    package_name : str
        The name of the package being analyzed.
    popular_package_name : str
        The name of a popular package to compare against.

    Returns
    -------
    float
        The Jaro similarity between the two package names.
    """
    if package_name == popular_package_name:
        return 1.0

    package_name_length = len(package_name)
    popular_package_name_length = len(popular_package_name)
    if package_name_length == 0 or popular_package_name_length == 0:
        return 0.0

    # Two equal characters only match when their positions differ by at most this
    # much. Clamp at zero so one-character names do not produce a negative window.
    match_distance = max(0, max(package_name_length, popular_package_name_length) // 2 - 1)

    package_name_matches = [False] * package_name_length
    popular_package_name_matches = [False] * popular_package_name_length
    matches = 0

    # Count matches: pair every character with the first unmatched equal character in its window.
    for package_index, package_char in enumerate(package_name):
        window_start = max(0, package_index - match_distance)
        window_end = min(package_index + match_distance + 1, popular_package_name_length)
        for popular_index in range(window_start, window_end):
            if popular_package_name_matches[popular_index]:
                continue
            if package_char == popular_package_name[popular_index]:
                package_name_matches[package_index] = True
                popular_package_name_matches[popular_index] = True
                matches += 1
                break

    if matches == 0:
        return 0.0

    # Walk the matched characters of both names in order and accumulate the cost
    # of every pair that differs. The cost is a float because it may be fractional.
    transpositions = 0.0
    popular_index = 0
    for package_index, package_char in enumerate(package_name):
        if not package_name_matches[package_index]:
            continue
        while not popular_package_name_matches[popular_index]:
            popular_index += 1
        popular_char = popular_package_name[popular_index]
        if package_char != popular_char:
            transpositions += self.substitution_func(package_char, popular_char)
        popular_index += 1

    # Every out-of-order pair was visited once from each side.
    transpositions /= 2.0

    return (
        matches / package_name_length
        + matches / popular_package_name_length
        + (matches - transpositions) / matches
    ) / 3.0
|
||
def ratio(self, package_name: str, popular_package_name: str) -> float:
    """Calculate the Jaro-Winkler ratio of two package names.

    The value returned by ``jaro_distance`` is raised by a bonus that grows with
    the number of leading characters both names share, counting at most four.

    Parameters
    ----------
    package_name : str
        The name of the package being analyzed.
    popular_package_name : str
        The name of a popular package to compare against.

    Returns
    -------
    float
        The Jaro-Winkler ratio, i.e. the Jaro value plus the prefix bonus
        weighted by ``self.scaling``.
    """
    max_prefix = 4
    similarity = self.jaro_distance(package_name, popular_package_name)

    # Count the shared leading characters, stopping at the first difference.
    shared_prefix = 0
    for own_char, popular_char in zip(package_name[:max_prefix], popular_package_name[:max_prefix]):
        if own_char != popular_char:
            break
        shared_prefix += 1

    return similarity + shared_prefix * self.scaling * (1 - similarity)
|
||
def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
    """Analyze the package.

    The package name is compared with every name in the popular packages file.
    A name that is itself on the list passes; a name whose Jaro-Winkler ratio to
    a listed name reaches ``self.distance_ratio_threshold`` fails.

    Parameters
    ----------
    pypi_package_json: PyPIPackageJsonAsset
        The PyPI package JSON asset object.

    Returns
    -------
    tuple[HeuristicResult, dict[str, JsonType]]:
        The result and related information collected during the analysis. The
        result is ``SKIP`` when the popular packages file is not configured,
        missing, unreadable, or contains no package names.
    """
    if not self.popular_packages_path or not os.path.exists(self.popular_packages_path):
        err_msg = f"Popular packages file not found or path not configured: {self.popular_packages_path}"
        logger.warning("%s. Skipping typosquatting check.", err_msg)
        return HeuristicResult.SKIP, {"error": err_msg}

    try:
        with open(self.popular_packages_path, encoding="utf-8") as file:
            stripped_lines = (line.strip() for line in file.read().splitlines())
            # Drop blank lines so that an empty string is never compared as a package name.
            popular_packages = [name for name in stripped_lines if name]
    except (OSError, UnicodeDecodeError) as error:
        # A file that is not valid UTF-8 raises UnicodeDecodeError, which is not an OSError.
        err_msg = f"Could not read popular packages file {self.popular_packages_path}: {error}"
        logger.error(err_msg)
        return HeuristicResult.SKIP, {"error": err_msg}

    # Without any reference names no comparison takes place, so do not report a pass.
    if not popular_packages:
        err_msg = f"Popular packages file is empty: {self.popular_packages_path}"
        logger.warning("%s. Skipping typosquatting check.", err_msg)
        return HeuristicResult.SKIP, {"error": err_msg}

    package_name = pypi_package_json.component_name

    # Check for an exact match before any similarity comparison. Otherwise a popular
    # package that resembles an earlier entry in the list (e.g. "boto3" and "boto")
    # would be flagged before its own entry is reached.
    if package_name in popular_packages:
        return HeuristicResult.PASS, {"package_name": package_name}

    for popular_package in popular_packages:
        distance_ratio = self.ratio(package_name, popular_package)
        if distance_ratio >= self.distance_ratio_threshold:
            logger.info(
                "Potential typosquatting detected: '%s' is similar to popular package '%s' (ratio: %.3f)",
                package_name,
                popular_package,
                distance_ratio,
            )
            return HeuristicResult.FAIL, {
                "package_name": package_name,
                "popular_package": popular_package,
                "similarity_ratio": distance_ratio,
            }

    return HeuristicResult.PASS, {"package_name": package_name}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Now that we have the path configurable in `defaults.ini`, could we remove the command-line argument for it? Currently, it looks like the command-line argument will override the `defaults.ini` value.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removing the command-line option would reduce flexibility for users who may want to specify a different list path dynamically, without modifying the config file.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it's best to remove the command-line option if we have opted for `defaults.ini`.
If the user wants to provide their own path, they can create a custom `defaults.ini` with the following content:
[heuristic.pypi]
popular_packages_path = /path/to/popular_packages.txt
This will only override the `popular_packages_path` value, while keeping the other values in `defaults.ini` the same. This approach is consistent with the other options we have for malware analysis heuristics. In this case, I think it's fine to do so without worrying about the loss of flexibility from not having a command-line parameter.
Besides, it's usually not a good idea to have two ways to achieve the same thing, as it could further confuse users (even though both ways are completely fine on their own).