Skip to content

feat(security): Add package name typosquatting detection #1059

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions src/macaron/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,16 @@ def main(argv: list[str] | None = None) -> None:
help="The directory where Macaron looks for already cloned repositories.",
)

main_parser.add_argument(
"-pp",
"--popular-packages-path",
required=False,
type=str,
default=None,
help="The path to the popular packages file used for typosquatting detection.",
dest="popular_packages_path",
)

# Add sub parsers for each action.
sub_parser = main_parser.add_subparsers(dest="action", help="Run macaron <action> --help for help")

Expand Down Expand Up @@ -579,6 +589,7 @@ def main(argv: list[str] | None = None) -> None:
build_log_path=os.path.join(args.output_dir, "build_log"),
debug_level=log_level,
local_repos_path=args.local_repos_path,
popular_packages_path=args.popular_packages_path,
resources_path=os.path.join(macaron.MACARON_PATH, "resources"),
)

Expand Down
11 changes: 11 additions & 0 deletions src/macaron/config/defaults.ini
Original file line number Diff line number Diff line change
Expand Up @@ -599,3 +599,14 @@ major_threshold = 20
epoch_threshold = 3
# The number of days +/- the day of publish the calendar versioning day may be.
day_publish_error = 4

# The threshold ratio for two packages to be considered similar.
distance_ratio_threshold = 0.95
# The cost for two characters that are close to each other on the keyboard.
keyboard = 0.8
# The scaling factor for the jaro winkler distance.
scaling = 0.15
# The cost for two characters that are not close to each other on the keyboard.
cost = 1.0
# The path to the file that contains the list of popular packages.
popular_packages_path =
5 changes: 5 additions & 0 deletions src/macaron/config/global_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ class GlobalConfig:
#: The path to the local .m2 Maven repository. This attribute is None if there is no available .m2 directory.
local_maven_repo: str | None = None

#: The path to the popular packages file.
popular_packages_path: str | None = None

def load(
self,
macaron_path: str,
Expand All @@ -57,6 +60,7 @@ def load(
debug_level: int,
local_repos_path: str,
resources_path: str,
popular_packages_path: str,
) -> None:
"""Initiate the GlobalConfig object.

Expand All @@ -81,6 +85,7 @@ def load(
self.debug_level = debug_level
self.local_repos_path = local_repos_path
self.resources_path = resources_path
self.popular_packages_path = popular_packages_path

def load_expectation_files(self, exp_path: str) -> None:
"""
Expand Down
3 changes: 3 additions & 0 deletions src/macaron/malware_analyzer/pypi_heuristics/heuristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ class Heuristics(str, Enum):
#: Indicates that the package has an unusually large version number for a single release.
ANOMALOUS_VERSION = "anomalous_version"

#: Indicates that the package name is similar to a popular package.
TYPOSQUATTING_PRESENCE = "typosquatting_presence"


class HeuristicResult(str, Enum):
"""Result type indicating the outcome of a heuristic."""
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,280 @@
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""Analyzer checks if there is typosquatting presence in the package name."""
import logging
import os

from macaron.config.defaults import defaults
from macaron.config.global_config import global_config
from macaron.json_tools import JsonType
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset

logger = logging.getLogger(__name__)


class TyposquattingPresenceAnalyzer(BaseHeuristicAnalyzer):
"""Check whether the PyPI package has typosquatting presence."""

# Maps each supported character to its (row, column) position on a QWERTY keyboard.
# Rows are listed top to bottom; the column is the character's offset within its row.
KEYBOARD_LAYOUT = {
    key: (row_index, column_index)
    for row_index, row_keys in enumerate(("1234567890-", "qwertyuiop", "asdfghjkl", "zxcvbnm"))
    for column_index, key in enumerate(row_keys)
}

def __init__(self) -> None:
super().__init__(
name="typosquatting_presence_analyzer", heuristic=Heuristics.TYPOSQUATTING_PRESENCE, depends_on=None
)
self.popular_packages_path, self.distance_ratio_threshold, self.keyboard, self.scaling, self.cost = (
self._load_defaults()
)

if global_config.popular_packages_path is not None:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now that we have the path configurable in defaults.ini, could we remove the command-line argument for it? Currently, it looks like the command-line argument will override the defaults.ini value.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removing the command-line option would reduce flexibility for users who may want to specify a different list path dynamically, without modifying the config file.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's best to remove the command-line option if we have opted to defaults.ini.
If the user wants to provide their own path, they can create a custom defaults.ini with the following content:

[heuristic.pypi]
popular_packages_path = <custom_value>

This will only override the popular_packages_path value, while keeping the other values in defaults.ini the same. This approach is consistent with the other options we have for malware analysis heuristics. In this case, I think it's fine to do so without worrying about losing the flexibility of a command-line parameter.

Besides, it's usually not a good idea to have two ways to achieve the same thing, as it could further confuse users (even though both ways are completely fine on their own).

self.popular_packages_path = global_config.popular_packages_path

def _load_defaults(self) -> tuple[str, float, float, float, float]:
    """Load the typosquatting detection settings from defaults.ini.

    The values are read from the ``heuristic.pypi`` section. If the section is missing,
    or the configured popular packages path is empty, built-in fallback values are used.

    Returns
    -------
    tuple[str, float, float, float, float]:
        The path to the popular packages file, the distance ratio threshold, the cost for
        neighboring keyboard characters, the Jaro-Winkler scaling factor, and the cost for
        characters that are not keyboard neighbors.
    """
    section_name = "heuristic.pypi"
    default_path = os.path.join(global_config.resources_path, "popular_packages.txt")
    if defaults.has_section(section_name):
        section = defaults[section_name]
        path = section.get("popular_packages_path", default_path)
        # Fall back to the default if the path in defaults.ini is empty.
        if not path.strip():
            path = default_path
        return (
            path,
            section.getfloat("distance_ratio_threshold", 0.95),
            section.getfloat("keyboard", 0.8),
            section.getfloat("scaling", 0.15),
            section.getfloat("cost", 1.0),
        )
    # The section is absent: use the same fallback values as above.
    return (
        default_path,
        0.95,
        0.8,
        0.15,
        1.0,
    )

def are_neighbors(self, char1: str, char2: str) -> bool:
    """Check if two characters are adjacent on a QWERTY keyboard.

    Two characters are adjacent when their positions in ``KEYBOARD_LAYOUT`` differ by at
    most one row and at most one column.

    Parameters
    ----------
    char1 : str
        The first character.
    char2 : str
        The second character.

    Returns
    -------
    bool
        True if the characters are neighbors, False otherwise. Characters that are not
        part of the keyboard layout are never neighbors.
    """
    first_position = self.KEYBOARD_LAYOUT.get(char1)
    if first_position is None:
        return False
    second_position = self.KEYBOARD_LAYOUT.get(char2)
    if second_position is None:
        return False

    first_row, first_column = first_position
    second_row, second_column = second_position
    row_gap = abs(first_row - second_row)
    column_gap = abs(first_column - second_column)
    return row_gap <= 1 and column_gap <= 1

def substitution_func(self, char1: str, char2: str) -> float:
    """Return the cost of substituting one character for another.

    Parameters
    ----------
    char1 : str
        The first character.
    char2 : str
        The second character.

    Returns
    -------
    float
        0.0 for identical characters, ``self.keyboard`` when a keyboard cost is set and
        the characters are neighbors on a QWERTY keyboard, and ``self.cost`` otherwise.
    """
    if char1 != char2:
        neighbor_cost = self.keyboard
        # A falsy keyboard cost disables the keyboard-aware discount entirely.
        if neighbor_cost and self.are_neighbors(char1, char2):
            return neighbor_cost
        return self.cost
    return 0.0

def jaro_distance(self, package_name: str, popular_package_name: str) -> float:
    """Calculate the Jaro distance between two package names.

    Out-of-order matched characters are weighted with ``self.substitution_func``
    instead of each counting as a full transposition.

    Parameters
    ----------
    package_name : str
        The name of the package being analyzed.
    popular_package_name : str
        The name of a popular package to compare against.

    Returns
    -------
    float
        The Jaro distance between the two package names: 1.0 for identical names and
        0.0 when either name is empty or no characters match.
    """
    if package_name == popular_package_name:
        return 1.0

    package_length = len(package_name)
    popular_length = len(popular_package_name)
    if not package_length or not popular_length:
        return 0.0

    # Equal characters only count as a match when they are at most this many positions apart.
    match_window = max(package_length, popular_length) // 2 - 1

    package_matched = [False] * package_length
    popular_matched = [False] * popular_length
    match_count = 0

    # Pair each character with the first unused equal character inside its window.
    for package_index, package_char in enumerate(package_name):
        window_start = max(0, package_index - match_window)
        window_end = min(package_index + match_window + 1, popular_length)
        for popular_index in range(window_start, window_end):
            if popular_matched[popular_index] or package_char != popular_package_name[popular_index]:
                continue
            package_matched[package_index] = True
            popular_matched[popular_index] = True
            match_count += 1
            break

    if not match_count:
        return 0.0

    # Walk the matched characters of both names in order; a pair that differs is out of order.
    matched_package_chars = [char for char, matched in zip(package_name, package_matched) if matched]
    matched_popular_chars = [char for char, matched in zip(popular_package_name, popular_matched) if matched]
    transposition_cost = 0.0
    for package_char, popular_char in zip(matched_package_chars, matched_popular_chars):
        if package_char != popular_char:
            transposition_cost += self.substitution_func(package_char, popular_char)

    # Every transposition is seen from both sides, so halve the accumulated cost.
    transposition_cost /= 2.0

    return (
        match_count / package_length
        + match_count / popular_length
        + (match_count - transposition_cost) / match_count
    ) / 3.0

def ratio(self, package_name: str, popular_package_name: str) -> float:
    """Calculate the Jaro-Winkler distance ratio.

    Parameters
    ----------
    package_name : str
        The name of the package being analyzed.
    popular_package_name : str
        The name of a popular package to compare against.

    Returns
    -------
    float
        The Jaro distance of the two names, boosted by ``self.scaling`` for every
        leading character they share, up to four characters.
    """
    max_prefix = 4
    jaro_dist = self.jaro_distance(package_name, popular_package_name)

    # Count the shared leading characters, stopping at the first difference.
    prefix_length = 0
    for package_char, popular_char in zip(package_name[:max_prefix], popular_package_name[:max_prefix]):
        if package_char != popular_char:
            break
        prefix_length += 1

    return jaro_dist + prefix_length * self.scaling * (1 - jaro_dist)

def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
    """Analyze the package.

    The package name is compared against every name in the popular packages file. A name
    that is itself in the list passes; a name whose similarity ratio to a popular package
    reaches ``self.distance_ratio_threshold`` fails.

    Parameters
    ----------
    pypi_package_json: PyPIPackageJsonAsset
        The PyPI package JSON asset object.

    Returns
    -------
    tuple[HeuristicResult, dict[str, JsonType]]:
        The result and related information collected during the analysis. The result is
        ``SKIP`` when the popular packages file is missing, unreadable, or empty.
    """
    if not self.popular_packages_path or not os.path.exists(self.popular_packages_path):
        err_msg = f"Popular packages file not found or path not configured: {self.popular_packages_path}"
        logger.warning("%s. Skipping typosquatting check.", err_msg)
        return HeuristicResult.SKIP, {"error": err_msg}

    try:
        with open(self.popular_packages_path, encoding="utf-8") as file:
            # Drop blank lines and surrounding whitespace so they are never compared as names.
            popular_packages = [name for name in map(str.strip, file.read().splitlines()) if name]
    except (OSError, UnicodeDecodeError) as error:
        err_msg = f"Could not read popular packages file {self.popular_packages_path}: {error}"
        logger.error(err_msg)
        return HeuristicResult.SKIP, {"error": err_msg}

    # Without any reference names the check cannot say anything about the package.
    if not popular_packages:
        err_msg = f"Popular packages file is empty: {self.popular_packages_path}"
        logger.warning("%s. Skipping typosquatting check.", err_msg)
        return HeuristicResult.SKIP, {"error": err_msg}

    package_name = pypi_package_json.component_name

    # A popular package must pass even if it resembles another popular package that
    # appears earlier in the list, so look for an exact match before any similarity check.
    if package_name in popular_packages:
        return HeuristicResult.PASS, {"package_name": package_name}

    for popular_package in popular_packages:
        distance_ratio = self.ratio(package_name, popular_package)
        if distance_ratio >= self.distance_ratio_threshold:
            logger.info(
                "Potential typosquatting detected: '%s' is similar to popular package '%s' (ratio: %.3f)",
                package_name,
                popular_package,
                distance_ratio,
            )
            return HeuristicResult.FAIL, {
                "package_name": package_name,
                "popular_package": popular_package,
                "similarity_ratio": distance_ratio,
            }

    return HeuristicResult.PASS, {"package_name": package_name}
Loading