docs: definitions provided for rules and rulesets

art1f1c3R · art1f1c3R · commit 489e69ff86c3 · 2025-06-02T15:06:55.000+10:00
Signed-off-by: Carl Flottmann <carl.flottmann@oracle.com>
diff --git a/src/macaron/config/defaults.ini b/src/macaron/config/defaults.ini
@@ -601,6 +601,14 @@ epoch_threshold = 3
 # The number of days +/- the day of publish the calendar versioning day may be.
 day_publish_error = 4
 
+# ==== The following sections are for source code analysis using Semgrep ====
+# rulesets: a reference to a 'ruleset' in this section refers to a Semgrep .yaml file containing one or more rules.
+# rules: a reference to a 'rule' in this section refers to an individual rule ID, specified by the '- id:' field in
+# the Semgrep .yaml file.
+# default rulesets: these are a collection of rulesets provided with Macaron which are run by default with the sourcecode
+# analyzer. These live in src/macaron/resources/pypi_malware_rules.
+# custom rulesets: this is a collection of user-provided rulesets, living inside the path provided to 'custom_semgrep_rules_path'.
+
 # disable default semgrep rulesets here (i.e. all rule IDs in a Semgrep .yaml file) using ruleset names, the name
 # without the .yaml suffix. Currently, we disable the exfiltration rulesets by default due to a high false positive rate.
 # This list may not contain duplicated elements. Macaron's default ruleset names are all unique.
diff --git a/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py b/src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py
@@ -238,10 +238,11 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
             if there is no source code available.
         """
         analysis_result: dict = {}
-        disabled_results: dict = (
-            {}
-        )  # since we have to run them anyway, return disabled rule findings for debug information
-        # only run semgrep open-source features, and disable 'nosemgrep' ignoring so this does not bypass our scan
+        # since we have to run them anyway, return disabled rule findings for debug information
+        disabled_results: dict = {}
+        # Here, we disable 'nosemgrep' ignoring so that it cannot be used to evade our scan (i.e. malware including
+        # 'nosemgrep' comments to prevent our scan from detecting those code lines). Read more about the 'nosemgrep'
+        # feature here: https://semgrep.dev/docs/ignoring-files-folders-code
         semgrep_commands: list[str] = ["semgrep", "scan", "--oss-only", "--disable-nosem"]
         result: HeuristicResult = HeuristicResult.PASS
 
@@ -302,6 +303,8 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
             # only work if `--experimental` is also supplied to enable experimental features, which we do not use.
             # Semgrep provides a relative path separated by '.' to the rule ID, where the rule ID is always the
             # final element in that path, so we use that to match our rule IDs.
+            # e.g. rule_id = src.macaron.resources.pypi_malware_rules.obfuscation_decode-and-execute, which comes from
+            # the rule ID 'obfuscation_decode-and-execute' inside 'obfuscation.yaml'.
             if rule_id.split(".")[-1] in self.disabled_rule_ids:
                 if rule_id not in self.disabled_rule_ids:
                     disabled_results[rule_id] = {"message": message, "detections": []}