Skip to content

refactor: replace unreachable project links heuristic with source code repo heuristic #983

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -49,18 +49,18 @@ macaron.malware\_analyzer.pypi\_heuristics.metadata.one\_release module
:undoc-members:
:show-inheritance:

macaron.malware\_analyzer.pypi\_heuristics.metadata.unchanged\_release module
macaron.malware\_analyzer.pypi\_heuristics.metadata.source\_code\_repo module
-----------------------------------------------------------------------------

.. automodule:: macaron.malware_analyzer.pypi_heuristics.metadata.unchanged_release
.. automodule:: macaron.malware_analyzer.pypi_heuristics.metadata.source_code_repo
:members:
:undoc-members:
:show-inheritance:

macaron.malware\_analyzer.pypi\_heuristics.metadata.unreachable\_project\_links module
--------------------------------------------------------------------------------------
macaron.malware\_analyzer.pypi\_heuristics.metadata.unchanged\_release module
-----------------------------------------------------------------------------

.. automodule:: macaron.malware_analyzer.pypi_heuristics.metadata.unreachable_project_links
.. automodule:: macaron.malware_analyzer.pypi_heuristics.metadata.unchanged_release
:members:
:undoc-members:
:show-inheritance:
Expand Down
8 changes: 4 additions & 4 deletions src/macaron/malware_analyzer/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ When a heuristic fails, with `HeuristicResult.FAIL`, then that is an indicator b
Repositories). Many malicious packages do not include any project links.
- **Rule**: Return `HeuristicResult.FAIL` when there are no project links; otherwise, return `HeuristicResult.PASS`.

2. **Unreachable Project Links**
- **Description**: Checks the accessibility of the project links. This is considered an auxiliary
heuristic since no cases have met this heuristic.
- **Rule**: Return `HeuristicResult.FAIL` if all project links are unreachable; otherwise, return `HeuristicResult.PASS`.
2. **Source Code Repo**
- **Description**: Checks whether Macaron was able to find a repository containing this package's source code (i.e., whether
one exists).
- **Rule**: Return `HeuristicResult.FAIL` if no repository was found; otherwise, return `HeuristicResult.PASS`.
- **Dependency**: Will be run if the Empty Project Link heuristic passes.

3. **One Release**
Expand Down
4 changes: 2 additions & 2 deletions src/macaron/malware_analyzer/pypi_heuristics/heuristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ class Heuristics(str, Enum):
#: Indicates that the package does not contain any project links (such as documentation or Git repository pages).
EMPTY_PROJECT_LINK = "empty_project_link"

#: Indicates that the package contains project links, but all of them are unreachable.
UNREACHABLE_PROJECT_LINKS = "unreachable_project_links"
#: Indicates that the source code repository for the package was not found.
SOURCE_CODE_REPO = "source_code_repo"

#: Indicates that the package contains only one release.
ONE_RELEASE = "one_release"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""The heuristic analyzer to check the project links."""
"""The heuristic analyzer to check if a source code repo was found."""

import logging

import requests

from macaron.json_tools import JsonType
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
Expand All @@ -15,17 +13,17 @@
logger: logging.Logger = logging.getLogger(__name__)


class UnreachableProjectLinksAnalyzer(BaseHeuristicAnalyzer):
class SourceCodeRepoAnalyzer(BaseHeuristicAnalyzer):
"""
Analyze the accessibility of the project links.
Analyze the accessibility of the source code repository.

If >= 1 project links are reachable, the analyzer consider the package as benign.
Passes if a repository was found and validated by the repo finder, otherwise fails.
"""

def __init__(self) -> None:
super().__init__(
name="unreachable_project_links_analyzer",
heuristic=Heuristics.UNREACHABLE_PROJECT_LINKS,
name="source_code_repo_analyzer",
heuristic=Heuristics.SOURCE_CODE_REPO,
depends_on=[(Heuristics.EMPTY_PROJECT_LINK, HeuristicResult.PASS)],
)

Expand All @@ -42,18 +40,7 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
tuple[HeuristicResult, dict[str, JsonType]]:
The result and related information collected during the analysis.
"""
project_links: dict | None = pypi_package_json.get_project_links()

if project_links is None:
return HeuristicResult.SKIP, {}

for link in project_links.values():
try:
response = requests.head(link, timeout=3)
if response.status_code < 400:
return HeuristicResult.PASS, {}
except requests.exceptions.RequestException as error:
logger.debug(error)
continue

return HeuristicResult.FAIL, {}
# If a sourcecode repo exists, then this will have already been validated
if not pypi_package_json.component.repository:
return HeuristicResult.FAIL, {}
return HeuristicResult.PASS, {}
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
from macaron.malware_analyzer.pypi_heuristics.metadata.empty_project_link import EmptyProjectLinkAnalyzer
from macaron.malware_analyzer.pypi_heuristics.metadata.high_release_frequency import HighReleaseFrequencyAnalyzer
from macaron.malware_analyzer.pypi_heuristics.metadata.one_release import OneReleaseAnalyzer
from macaron.malware_analyzer.pypi_heuristics.metadata.source_code_repo import SourceCodeRepoAnalyzer
from macaron.malware_analyzer.pypi_heuristics.metadata.unchanged_release import UnchangedReleaseAnalyzer
from macaron.malware_analyzer.pypi_heuristics.metadata.unreachable_project_links import UnreachableProjectLinksAnalyzer
from macaron.malware_analyzer.pypi_heuristics.metadata.wheel_absence import WheelAbsenceAnalyzer
from macaron.malware_analyzer.pypi_heuristics.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer
from macaron.malware_analyzer.pypi_heuristics.sourcecode.suspicious_setup import SuspiciousSetupAnalyzer
Expand Down Expand Up @@ -69,7 +69,7 @@ class MaliciousMetadataFacts(CheckFacts):
# When implementing new analyzer, appending the classes to this list
ANALYZERS: list = [
EmptyProjectLinkAnalyzer,
UnreachableProjectLinksAnalyzer,
SourceCodeRepoAnalyzer,
OneReleaseAnalyzer,
HighReleaseFrequencyAnalyzer,
UnchangedReleaseAnalyzer,
Expand Down Expand Up @@ -97,7 +97,7 @@ class MaliciousMetadataFacts(CheckFacts):
] = {
(
HeuristicResult.FAIL, # Empty Project
HeuristicResult.SKIP, # Unreachable Project Links
HeuristicResult.SKIP, # Source Code Repo
HeuristicResult.FAIL, # One Release
HeuristicResult.SKIP, # High Release Frequency
HeuristicResult.SKIP, # Unchanged Release
Expand All @@ -112,7 +112,7 @@ class MaliciousMetadataFacts(CheckFacts):
): Confidence.HIGH,
(
HeuristicResult.FAIL, # Empty Project
HeuristicResult.SKIP, # Unreachable Project Links
HeuristicResult.SKIP, # Source Code Repo
HeuristicResult.FAIL, # One Release
HeuristicResult.SKIP, # High Release Frequency
HeuristicResult.SKIP, # Unchanged Release
Expand All @@ -127,7 +127,7 @@ class MaliciousMetadataFacts(CheckFacts):
): Confidence.HIGH,
(
HeuristicResult.FAIL, # Empty Project
HeuristicResult.SKIP, # Unreachable Project Links
HeuristicResult.SKIP, # Source Code Repo
HeuristicResult.PASS, # One Release
HeuristicResult.FAIL, # High Release Frequency
HeuristicResult.FAIL, # Unchanged Release
Expand All @@ -141,7 +141,7 @@ class MaliciousMetadataFacts(CheckFacts):
): Confidence.HIGH,
(
HeuristicResult.FAIL, # Empty Project
HeuristicResult.SKIP, # Unreachable Project Links
HeuristicResult.SKIP, # Source Code Repo
HeuristicResult.PASS, # One Release
HeuristicResult.FAIL, # High Release Frequency
HeuristicResult.PASS, # Unchanged Release
Expand All @@ -155,7 +155,7 @@ class MaliciousMetadataFacts(CheckFacts):
): Confidence.HIGH,
(
HeuristicResult.FAIL, # Empty Project
HeuristicResult.SKIP, # Unreachable Project Links
HeuristicResult.SKIP, # Source Code Repo
HeuristicResult.PASS, # One Release
HeuristicResult.FAIL, # High Release Frequency
HeuristicResult.FAIL, # Unchanged Release
Expand All @@ -169,7 +169,7 @@ class MaliciousMetadataFacts(CheckFacts):
): Confidence.MEDIUM,
(
HeuristicResult.FAIL, # Empty Project
HeuristicResult.SKIP, # Unreachable Project Links
HeuristicResult.SKIP, # Source Code Repo
HeuristicResult.PASS, # One Release
HeuristicResult.FAIL, # High Release Frequency
HeuristicResult.FAIL, # Unchanged Release
Expand All @@ -183,21 +183,21 @@ class MaliciousMetadataFacts(CheckFacts):
): Confidence.MEDIUM,
(
HeuristicResult.PASS, # Empty Project
HeuristicResult.FAIL, # Unreachable Project Links
HeuristicResult.FAIL, # Source Code Repo
HeuristicResult.PASS, # One Release
HeuristicResult.FAIL, # High Release Frequency
HeuristicResult.PASS, # Unchanged Release
HeuristicResult.FAIL, # Closer Release Join Date
HeuristicResult.FAIL, # Suspicious Setup
HeuristicResult.FAIL, # Wheel Absence
HeuristicResult.SKIP, # Anomalous Version
# All project links are unreachable, frequent releases of multiple versions,
# No source code repo, frequent releases of multiple versions,
# and the maintainer released it shortly after account registration.
# The setup.py file contains suspicious imports and .whl file isn't present.
): Confidence.HIGH,
(
HeuristicResult.FAIL, # Empty Project
HeuristicResult.SKIP, # Unreachable Project Links
HeuristicResult.SKIP, # Source Code Repo
HeuristicResult.FAIL, # One Release
HeuristicResult.SKIP, # High Release Frequency
HeuristicResult.SKIP, # Unchanged Release
Expand All @@ -212,7 +212,7 @@ class MaliciousMetadataFacts(CheckFacts):
): Confidence.MEDIUM,
(
HeuristicResult.FAIL, # Empty Project
HeuristicResult.SKIP, # Unreachable Project Links
HeuristicResult.SKIP, # Source Code Repo
HeuristicResult.FAIL, # One Release
HeuristicResult.SKIP, # High Release Frequency
HeuristicResult.SKIP, # Unchanged Release
Expand All @@ -227,7 +227,7 @@ class MaliciousMetadataFacts(CheckFacts):
): Confidence.MEDIUM,
(
HeuristicResult.FAIL, # Empty Project
HeuristicResult.SKIP, # Unreachable Project Links
HeuristicResult.SKIP, # Source Code Repo
HeuristicResult.FAIL, # One Release
HeuristicResult.SKIP, # High Release Frequency
HeuristicResult.SKIP, # Unchanged Release
Expand Down
32 changes: 32 additions & 0 deletions tests/malware_analyzer/pypi/test_source_code_repo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""Tests for heuristic detecting malicious metadata from PyPI"""

from unittest.mock import MagicMock

import pytest

from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult
from macaron.malware_analyzer.pypi_heuristics.metadata.source_code_repo import SourceCodeRepoAnalyzer


@pytest.mark.parametrize(
    ("repository", "expected_result"),
    [
        pytest.param(None, HeuristicResult.FAIL, id="test_no_repo"),
        pytest.param(MagicMock(), HeuristicResult.PASS, id="test_valid_repo"),
    ],
)
def test_repo_existence(
    pypi_package_json: MagicMock,
    repository: MagicMock | None,
    expected_result: HeuristicResult,
) -> None:
    """Verify the analyzer's verdict for a missing and a present repository.

    The ``repository`` value is assigned to ``pypi_package_json.component.repository``
    before the analyzer runs: ``None`` is expected to yield ``HeuristicResult.FAIL``
    and a mock repository object is expected to yield ``HeuristicResult.PASS``.
    """
    # Arrange: attach (or withhold) the repository on the mocked component.
    pypi_package_json.component.repository = repository

    # Act: only the heuristic result matters here; the detail dict is ignored.
    verdict, _details = SourceCodeRepoAnalyzer().analyze(pypi_package_json)

    # Assert
    assert verdict == expected_result

This file was deleted.

Loading