Skip to content

Commit f00e63d

Browse files
committed
refactor: more efficient file traversing for semgrep rule IDs
Signed-off-by: Carl Flottmann <[email protected]>
1 parent 489e69f commit f00e63d

File tree

1 file changed

+32
-27
lines changed

1 file changed

+32
-27
lines changed

src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py

Lines changed: 32 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -185,35 +185,40 @@ def _extract_rule_ids(self, path: str, target_files: set[str]) -> set[str]:
185185
If any Semgrep rule file could not be safely loaded, or if its format was not in the expected Semgrep
186186
format, or if there were any files in 'target_files' not found when searching in 'path'.
187187
"""
188-
path_tree = glob.glob(os.path.join(path, "**", "*"), recursive=True)
189-
all_file_names = {os.path.basename(file) for file in path_tree if os.path.isfile(file)}
190-
if not target_files.issubset(all_file_names):
191-
error_msg = f"The following semgrep files were not found in {path}: {target_files - all_file_names}"
188+
# We keep a record of any file paths we couldn't find to provide a more useful error message, rather than raising
189+
# an error on the first missing file we see.
190+
missing_files: list[str] = []
191+
target_file_paths: list[str] = []
192+
rule_ids: set[str] = set()
193+
194+
for target_file in target_files:
195+
file_paths = glob.glob(os.path.join(path, "**", target_file), recursive=True)
196+
if not file_paths:
197+
missing_files.append(target_file)
198+
target_file_paths.extend(file_paths)
199+
200+
if missing_files:
201+
error_msg = f"The following semgrep files were not found in {path}: {missing_files}"
192202
logger.debug(error_msg)
193203
raise ConfigurationError(error_msg)
194204

195-
rule_ids = set()
196-
for root, _, files in os.walk(path):
197-
files_found = set.intersection(target_files, set(files))
198-
for filename in files_found:
199-
semgrep_ruleset_file = os.path.join(root, filename)
200-
201-
try:
202-
with open(semgrep_ruleset_file, encoding="utf-8") as file:
203-
semgrep_ruleset: dict[str, list] = yaml.safe_load(file.read())
204-
except yaml.YAMLError as yaml_error:
205-
error_msg = f"Unable to open semgrep rule file {semgrep_ruleset_file}: {yaml_error}."
206-
logger.debug(error_msg)
207-
raise ConfigurationError(error_msg) from yaml_error
208-
209-
# should be a top-level key "rules", and then a list of rules (dictionaries) with "id" entries
210-
try:
211-
for semgrep_rule in semgrep_ruleset["rules"]:
212-
rule_ids.add(semgrep_rule["id"])
213-
except (KeyError, TypeError) as format_error:
214-
error_msg = f"Invalid semgrep rule format for {semgrep_ruleset_file}: {format_error}."
215-
logger.debug(error_msg)
216-
raise ConfigurationError(error_msg) from format_error
205+
for file_path in target_file_paths:
206+
try:
207+
with open(file_path, encoding="utf-8") as file:
208+
semgrep_ruleset: dict[str, list] = yaml.safe_load(file.read())
209+
except yaml.YAMLError as yaml_error:
210+
error_msg = f"Unable to open semgrep rule file {file_path}: {yaml_error}."
211+
logger.debug(error_msg)
212+
raise ConfigurationError(error_msg) from yaml_error
213+
214+
# should be a top-level key "rules", and then a list of rules (dictionaries) with "id" entries
215+
try:
216+
for semgrep_rule in semgrep_ruleset["rules"]:
217+
rule_ids.add(semgrep_rule["id"])
218+
except (KeyError, TypeError) as format_error:
219+
error_msg = f"Invalid semgrep rule format for {file_path}: {format_error}."
220+
logger.debug(error_msg)
221+
raise ConfigurationError(error_msg) from format_error
217222

218223
return rule_ids
219224

@@ -306,7 +311,7 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
306311
# e.g. rule_id = src.macaron.resources.pypi_malware_rules.obfuscation_decode-and-execute, which comes from
307312
# the rule ID 'obfuscation_decode-and-execute' inside 'obfuscation.yaml'.
308313
if rule_id.split(".")[-1] in self.disabled_rule_ids:
309-
if rule_id not in self.disabled_rule_ids:
314+
if rule_id not in disabled_results:
310315
disabled_results[rule_id] = {"message": message, "detections": []}
311316
disabled_results[rule_id]["detections"].append({"file": file, "start": start, "end": end})
312317

0 commit comments

Comments
 (0)