oracle · art1f1c3R · Jun 4, 2025 · Jun 2, 2025 · Jun 2, 2025 · Jun 2, 2025
@@ -30,6 +30,7 @@ repos:
   - id: isort
     name: Sort import statements
     args: [--settings-path, pyproject.toml]
+    exclude: ^tests/malware_analyzer/pypi/resources/sourcecode_samples.*
 
 # Add Black code formatters.
 - repo: https://github.com/ambv/black
@@ -38,6 +39,7 @@ repos:
   - id: black
     name: Format code
     args: [--config, pyproject.toml]
+    exclude: ^tests/malware_analyzer/pypi/resources/sourcecode_samples.*
 - repo: https://github.com/asottile/blacken-docs
   rev: 1.19.1
   hooks:
@@ -65,6 +67,7 @@ repos:
     files: ^src/macaron/|^tests/
     types: [text, python]
     additional_dependencies: [flake8-bugbear==22.10.27, flake8-builtins==2.0.1, flake8-comprehensions==3.10.1, flake8-docstrings==1.6.0, flake8-mutable==1.2.0, flake8-noqa==1.4.0, flake8-pytest-style==1.6.0, flake8-rst-docstrings==0.3.0, pep8-naming==0.13.2]
+    exclude: ^tests/malware_analyzer/pypi/resources/sourcecode_samples.*
     args: [--config, .flake8]
 
 # Check GitHub Actions workflow files.
@@ -82,6 +85,7 @@ repos:
     entry: pylint
     language: python
     files: ^src/macaron/|^tests/
+    exclude: ^tests/malware_analyzer/pypi/resources/sourcecode_samples.*
     types: [text, python]
     args: [--rcfile, pyproject.toml]
 
@@ -94,6 +98,7 @@ repos:
     language: python
     files: ^src/macaron/|^tests/
     types: [text, python]
+    exclude: ^tests/malware_analyzer/pypi/resources/sourcecode_samples.*
     args: [--show-traceback, --config-file, pyproject.toml]
 
 # Check for potential security issues.
@@ -106,6 +111,7 @@ repos:
     files: ^src/macaron/|^tests/
     types: [text, python]
     additional_dependencies: ['bandit[toml]']
+    exclude: ^tests/malware_analyzer/pypi/resources/sourcecode_samples.*
 
 # Enable a whole bunch of useful helper hooks, too.
 # See https://pre-commit.com/hooks.html for more hooks.
@@ -197,6 +203,18 @@ repos:
     always_run: true
     pass_filenames: false
 
+# Checks that tests/malware_analyzer/pypi/resources/sourcecode_samples files do not have executable permissions
+# This is another measure to make sure the files can't be accidentally executed
+- repo: local
+  hooks:
+  - id: sourcecode-sample-permissions
+    name: Sourcecode sample executable permissions checker
+    entry: scripts/dev_scripts/samples_permissions_checker.sh
+    language: system
+    always_run: true
+    pass_filenames: false
+
+
 # A linter for Golang
 - repo: https://github.com/golangci/golangci-lint
   rev: v1.64.6

@@ -0,0 +1 @@
+# Items added to this file will be ignored by Semgrep.
@@ -72,6 +72,10 @@ See below for instructions to set up the development environment.
 - PRs should be merged using the `Squash and merge` strategy. In most cases a single commit with
 a detailed commit message body is preferred. Make sure to keep the `Signed-off-by` line in the body.
 
+### PyPI Malware Detection Contribution
+
+Please see the [README for the malware analyzer](./src/macaron/malware_analyzer/README.md) for information on contributing Heuristics and code patterns.
+
 ## Branching model
 
 * The `main` branch should be used as the base branch for pull requests. The `release` branch is designated for releases and should only be merged into when creating a new release for Macaron.

@@ -46,7 +46,7 @@ RUN : \
     && . .venv/bin/activate \
     && pip install --no-compile --no-cache-dir --upgrade pip setuptools \
     && find $HOME/dist -depth \( -type f \( -name "macaron-*.whl" \) \) -exec pip install --no-compile --no-cache-dir '{}' \; \
-    && pip uninstall semgrep \
+    && pip uninstall semgrep -y \
     && find $HOME/dist -depth \( -type f \( -name "semgrep-*.whl" \) \) -exec pip install --no-compile --no-cache-dir '{}' \; \
     && rm -rf $HOME/dist \
     && deactivate

@@ -9,6 +9,14 @@ macaron.malware\_analyzer.pypi\_heuristics.sourcecode package
 Submodules
 ----------
 
+macaron.malware\_analyzer.pypi\_heuristics.sourcecode.pypi\_sourcecode\_analyzer module
+---------------------------------------------------------------------------------------
+
+.. automodule:: macaron.malware_analyzer.pypi_heuristics.sourcecode.pypi_sourcecode_analyzer
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 macaron.malware\_analyzer.pypi\_heuristics.sourcecode.suspicious\_setup module
 ------------------------------------------------------------------------------
 

@@ -37,6 +37,7 @@ dependencies = [
     "beautifulsoup4 >= 4.12.0,<5.0.0",
     "problog >= 2.2.6,<3.0.0",
     "cryptography >=44.0.0,<45.0.0",
+    "semgrep == 1.113.0",
 ]
 keywords = []
 # https://pypi.org/classifiers/
@@ -119,12 +120,14 @@ Issues = "https://github.com/oracle/macaron/issues"
 [tool.bandit]
 tests = []
 skips = ["B101"]
-
+exclude_dirs = ['tests/malware_analyzer/pypi/resources/sourcecode_samples']
 
 # https://github.com/psf/black#configuration
 [tool.black]
 line-length = 120
-
+force-exclude = '''
+tests/malware_analyzer/pypi/resources/sourcecode_samples/
+'''
 
 # https://github.com/commitizen-tools/commitizen
 # https://commitizen-tools.github.io/commitizen/bump/
@@ -170,7 +173,6 @@ exclude = [
     "SECURITY.md",
 ]
 
-
 # https://pycqa.github.io/isort/
 [tool.isort]
 profile = "black"
@@ -181,7 +183,6 @@ skip_gitignore = true
 
 # https://mypy.readthedocs.io/en/stable/config_file.html#using-a-pyproject-toml
 [tool.mypy]
-# exclude=
 show_error_codes = true
 show_column_numbers = true
 check_untyped_defs = true
@@ -209,7 +210,6 @@ module = [
 ]
 ignore_missing_imports = true
 
-
 # https://pylint.pycqa.org/en/latest/user_guide/configuration/index.html
 [tool.pylint.MASTER]
 fail-under = 10.0
@@ -261,6 +261,7 @@ addopts = """-vv -ra --tb native \
     --doctest-modules --doctest-continue-on-failure --doctest-glob '*.rst' \
     --cov macaron \
     --ignore tests/integration \
+    --ignore tests/malware_analyzer/pypi/resources/sourcecode_samples \
 """  # Consider adding --pdb
 # https://docs.python.org/3/library/doctest.html#option-flags
 doctest_optionflags = "IGNORE_EXCEPTION_DETAIL"

@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2022 - 2025, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+#
+# Checks if the files in tests/malware_analyzer/pypi/resources/sourcecode_samples have executable permissions,
+# failing if any do.
+#
+
+# Strict bash options.
+#
+# -e:          exit immediately if a command fails (with non-zero return code),
+#              or if a function returns non-zero.
+#
+# -u:          treat unset variables and parameters as error when performing
+#              parameter expansion.
+#              In case a variable ${VAR} is unset but we still need to expand,
+#              use the syntax ${VAR:-} to expand it to an empty string.
+#
+# -o pipefail: set the return value of a pipeline to the value of the last
+#              (rightmost) command to exit with a non-zero status, or zero
+#              if all commands in the pipeline exit successfully.
+#
+# Reference: https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html.
+set -euo pipefail
+
+MACARON_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && cd ../.. && pwd)"
+SAMPLES_PATH="${MACARON_DIR}/tests/malware_analyzer/pypi/resources/sourcecode_samples"
+
+# Collect the git-tracked sample files that have any executable bit (user, group or other) set.
+executables=$( ( find "$SAMPLES_PATH" -type f -perm -u+x -o -type f -perm -g+x -o -type f -perm -o+x | sed "s|$MACARON_DIR/||"; git ls-files "$SAMPLES_PATH" --full-name) | sort | uniq -d)
+if [ -n "$executables" ]; then
+    echo "The following files should not have any executable permissions:"
+    echo "$executables"
+    exit 1
+fi
@@ -96,6 +96,10 @@ def analyze_slsa_levels_single(analyzer_single_args: argparse.Namespace) -> None
 
         global_config.local_maven_repo = user_provided_local_maven_repo
 
+    if analyzer_single_args.force_analyze_source and not analyzer_single_args.analyze_source:
+        logger.error("'--force-analyze-source' requires '--analyze-source'.")
+        sys.exit(os.EX_USAGE)
+
     analyzer = Analyzer(global_config.output_path, global_config.build_log_path)
 
     # Initiate reporters.
@@ -172,8 +176,9 @@ def analyze_slsa_levels_single(analyzer_single_args: argparse.Namespace) -> None
         analyzer_single_args.sbom_path,
         deps_depth,
         provenance_payload=prov_payload,
-        validate_malware=analyzer_single_args.validate_malware,
         verify_provenance=analyzer_single_args.verify_provenance,
+        analyze_source=analyzer_single_args.analyze_source,
+        force_analyze_source=analyzer_single_args.force_analyze_source,
     )
     sys.exit(status_code)
 
@@ -477,10 +482,22 @@ def main(argv: list[str] | None = None) -> None:
     )
 
     single_analyze_parser.add_argument(
-        "--validate-malware",
+        "--analyze-source",
         required=False,
         action="store_true",
-        help=("Enable malware validation."),
+        help=(
+            "For improved malware detection, analyze the source code of the"
+            + " (PyPI) package using a textual scan and dataflow analysis."
+        ),
+    )
+
+    single_analyze_parser.add_argument(
+        "--force-analyze-source",
+        required=False,
+        action="store_true",
+        help=(
+            "Forces PyPI sourcecode analysis to run regardless of other heuristic results. Requires '--analyze-source'."
+        ),
     )
 
     single_analyze_parser.add_argument(

@@ -611,3 +611,27 @@ scaling = 0.15
 cost = 1.0
 # The path to the file that contains the list of popular packages.
 popular_packages_path =
+
+# ==== The following sections are for source code analysis using Semgrep ====
+# rulesets: a reference to a 'ruleset' in this section refers to a Semgrep .yaml file containing one or more rules.
+# rules: a reference to a 'rule' in this section refers to an individual rule ID, specified by the '- id:' field in
+# the Semgrep .yaml file.
+# default rulesets: these are a collection of rulesets provided with Macaron which are run by default with the sourcecode
+# analyzer. These live in src/macaron/resources/pypi_malware_rules.
+# custom rulesets: this is a collection of user-provided rulesets, living inside the path provided to 'custom_semgrep_rules_path'.
+
+# disable default semgrep rulesets here (i.e. all rule IDs in a Semgrep .yaml file) using ruleset names, the name
+# without the .yaml extension. Currently, we disable the exfiltration rulesets by default due to a high false positive rate.
+# This list may not contain duplicated elements. Macaron's default ruleset names are all unique.
+disabled_default_rulesets = exfiltration
+# disable individual rules here (i.e. individual rule IDs inside a Semgrep .yaml file) using rule IDs. You may also
+# provide the IDs of your custom semgrep rules here too, as all Semgrep rule IDs must be unique. This list may not contain
+# duplicated elements.
+disabled_rules =
+# absolute path to a directory where a custom set of semgrep rules for source code analysis are stored. These will be included
+# with Macaron's default rules. The path will be normalised to the OS path type.
+custom_semgrep_rules_path =
+# disable custom semgrep rulesets here (i.e. all rule IDs in a Semgrep .yaml file) using ruleset names, the name without the
+# .yaml extension. Note, this will be ignored if a path to custom semgrep rules is not provided. This list may not contain
+# duplicated elements, meaning that ruleset names must be unique.
+disabled_custom_rulesets =
@@ -109,3 +109,7 @@ class HeuristicAnalyzerValueError(MacaronError):
 
 class LocalArtifactFinderError(MacaronError):
     """Happens when there is an error looking for local artifacts."""
+
+
+class SourceCodeError(MacaronError):
+    """Error for operations on package source code."""
@@ -1,4 +1,4 @@
-# Implementation of Heuristic Malware Detector
+# Implementation of Malware Detector
 
 ## PyPI Ecosystem
 
@@ -56,6 +56,20 @@ When a heuristic fails, with `HeuristicResult.FAIL`, then that is an indicator b
     - **Description**:  Checks if the package name is suspiciously similar to any package name in a predefined list of popular packages. The similarity check incorporates the Jaro-Winkler distance and considers keyboard layout proximity to identify potential typosquatting.
     - **Rule**: Return `HeuristicResult.FAIL` if the similarity ratio between the package name and any popular package name meets or exceeds a defined threshold; otherwise, return `HeuristicResult.PASS`.
     - **Dependency**: None.
+### Source Code Analysis with Semgrep
+
+The following analyzer has been included as an optional feature, available by supplying `--analyze-source` in the CLI to `macaron analyze`:
+
+**PyPI Source Code Analyzer**
+- **Description**: Uses Semgrep, with default rules written in `src/macaron/resources/pypi_malware_rules` and custom rules available by supplying a path to `custom_semgrep_rules_path` in `defaults.ini`, to scan the package `.tar` source code.
+- **Rule**: If any Semgrep rule is triggered, the heuristic fails with `HeuristicResult.FAIL` and subsequently fails the package with `CheckResultType.FAILED`. If no rule is triggered, the heuristic passes with `HeuristicResult.PASS` and the `CheckResultType` result from the combination of all other heuristics is maintained.
+- **Dependency**: Will be run if the Source Code Repo fails. This dependency can be bypassed by supplying `--force-analyze-source` in the CLI, along with `--analyze-source`.
+
+This feature is currently a work in progress, and supports detection of code obfuscation techniques and remote exfiltration behaviors. It uses Semgrep OSS for detection. `defaults.ini` may be used to provide custom rules and exclude them:
+- `disabled_default_rulesets`: supply to this a comma separated list of the names of default Semgrep rule files (excluding the `.yaml` extension) to disable all rule IDs in that file.
+- `disabled_rules`: supply to this a comma separated list of individual rule IDs to disable (from both the default and custom list).
+- `custom_semgrep_rules_path`: supply to this an absolute path to a directory containing custom Semgrep `.yaml` files to be run alongside the default ones.
+- `disabled_custom_rulesets`: supply to this a comma separated list of the names of custom Semgrep rule files (excluding the `.yaml` extension) to disable all rule IDs in that file.
 
 ### Contributing
 
@@ -64,13 +78,47 @@ When contributing an analyzer, it must meet the following requirements:
 - The analyzer must be implemented in a separate file, placed in the relevant folder based on what it analyzes ([metadata](./pypi_heuristics/metadata/) or [sourcecode](./pypi_heuristics/sourcecode/)).
 - The analyzer must inherit from the `BaseHeuristicAnalyzer` class and implement the `analyze` function, returning relevant information specific to the analysis.
 - The analyzer name must be added to [heuristics.py](./pypi_heuristics/heuristics.py) file so it can be used for rule combinations in [detect_malicious_metadata_check.py](../slsa_analyzer/checks/detect_malicious_metadata_check.py)
+- The analyzer must be added to the list of analyzers in `detect_malicious_metadata_check.py` to be run.
 - Update the `malware_rules_problog_model` in [detect_malicious_metadata_check.py](../slsa_analyzer/checks/detect_malicious_metadata_check.py) with logical statements where the heuristic should be included. When adding new rules, please follow the following guidelines:
    - Provide a [confidence value](../slsa_analyzer/checks/check_result.py) using the `Confidence` enum.
    - Ensure it is assigned to the `problog_result_access` string variable, otherwise it will not be queried and evaluated.
    - Assign a rule ID to the rule. This will be used to backtrack to determine if it was triggered.
    - Make sure to wrap pass/fail statements in `passed()` and `failed()`. Not doing so may result in undesirable behaviour, see the comments in the model for more details.
    - If there are commonly used combinations introduced by adding the heuristic, combine and justify them at the top of the static model (see `quickUndetailed` and `forceSetup` as current examples).
 
+**Contributing Code Pattern Rules**
+
+When contributing more Semgrep rules for `pypi_sourcecode_analyzer.py` to use, the following requirements must be met:
+
+- Semgrep `.yaml` Rules are stored in `src/macaron/resources/pypi_malware_rules` and are named based on the category of code behaviors they detect.
+- If the rule comes under one of the already defined categories, place it within that `.yaml` file, else create a new `.yaml` file using the category name.
+- Each rule ID must be prefixed by the category followed by a single underscore ('_'), so for obfuscation rules in `obfuscation.yaml` each rule ID is prefixed with `obfuscation_`, followed by an ID which uses a hyphen ('-') as a separator.
+- Tests must be written for each rule contributed. These are stored in `tests/malware_analyzer/pypi/test_pypi_sourcecode_analyzer.py`.
+- These tests are written on a per-category basis, running each category individually. Each category must have a folder under `tests/malware_analyzer/pypi/resources/sourcecode_samples`.
+- Within these folders, there must be sample code patterns for testing, and a file `expected_results.json` with the expected JSON output of the analyzer for that category.
+- Each sample code pattern `.py` file must not have executable permissions and must include code that prevents it from being accidentally imported or run. The current files use this method:
+
+```
+"""
+Running this code will not produce any malicious behavior, but code isolation measures are
+in place for safety.
+"""
+
+import sys
+
+# ensure no symbols are exported so this code cannot accidentally be used
+__all__ = []
+sys.exit()
+
+def test_function():
+    """
+    All code to be tested will be defined inside this function, so it is all local to it. This is
+    to isolate the code to be tested, as it exists to replicate the patterns present in malware
+    samples.
+    """
+    sys.exit()
+```
+
 ### Confidence Score Motivation
 
 The original seven heuristics which started this work were Empty Project Link, Unreachable Project Links, One Release, High Release Frequency, Unchanged Release, Closer Release Join Date, and Suspicious Setup. These heuristics (excluding those with a dependency) were run on 1167 packages from trusted organizations, with the following results:

@@ -40,6 +40,9 @@ class Heuristics(str, Enum):
     #: Indicates that the package name is similar to a popular package.
     TYPOSQUATTING_PRESENCE = "typosquatting_presence"
 
+    #: Indicates that the package source code contains suspicious code patterns.
+    SUSPICIOUS_PATTERNS = "suspicious_patterns"
+
 
 class HeuristicResult(str, Enum):
     """Result type indicating the outcome of a heuristic."""
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		# Items added to this file will be ignored by Semgrep.