Skip to content

Commit 38cc36b

Browse files
committed
fix: entire source code is no longer stored in memory
1 parent 1ece531 commit 38cc36b

File tree

3 files changed

+125
-55
lines changed

3 files changed

+125
-55
lines changed

src/macaron/malware_analyzer/pypi_heuristics/sourcecode/pypi_sourcecode_analyzer.py

Lines changed: 23 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
import yaml
2424

2525
from macaron.config.defaults import defaults
26-
from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError
26+
from macaron.errors import ConfigurationError, HeuristicAnalyzerValueError, SourceCodeError
2727
from macaron.json_tools import JsonType, json_extract
2828
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult
2929
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
@@ -231,31 +231,30 @@ def analyze_dataflow(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[Heu
231231
analysis_result: dict = {}
232232
result: HeuristicResult = HeuristicResult.SKIP
233233

234-
source_code = pypi_package_json.package_sourcecode
235-
if not source_code:
234+
try:
235+
for filename, content in pypi_package_json.iter_sourcecode():
236+
try:
237+
_ = ast.parse(content.decode("utf-8"))
238+
except (SyntaxError, ValueError) as ast_parse_error:
239+
logger.debug("File %s cannot be parsed as a python file: %s", filename, ast_parse_error)
240+
continue
241+
242+
# tracer = DataFlowTracer()
243+
# tracer.generate_symbol_table(content)
244+
245+
# functioncall_analyzer = FunctionCallAnalyzer(self.suspicious_pattern, tracer)
246+
# is_malware, detail_info = functioncall_analyzer.analyze(content)
247+
# if is_malware:
248+
# result = HeuristicResult.FAIL
249+
250+
# # TODO: Currently, the result collector does not handle the situation that
251+
# # multiple same filename. In the future, this will be replace with absolute path.
252+
# if detail_info:
253+
# analysis_result[filename] = detail_info
254+
except SourceCodeError as sourcecode_error:
236255
error_msg = "Unable to retrieve PyPI package source code"
237256
logger.debug(error_msg)
238-
raise HeuristicAnalyzerValueError(error_msg)
239-
240-
for filename, content in source_code.items():
241-
try:
242-
_ = ast.parse(content)
243-
except (SyntaxError, ValueError) as ast_parse_error:
244-
logger.debug("File %s cannot be parsed as a python file: %s", filename, ast_parse_error)
245-
continue
246-
247-
# tracer = DataFlowTracer()
248-
# tracer.generate_symbol_table(content)
249-
250-
# functioncall_analyzer = FunctionCallAnalyzer(self.suspicious_pattern, tracer)
251-
# is_malware, detail_info = functioncall_analyzer.analyze(content)
252-
# if is_malware:
253-
# result = HeuristicResult.FAIL
254-
255-
# # TODO: Currently, the result collector does not handle the situation that
256-
# # multiple same filename. In the future, this will be replace with absolute path.
257-
# if detail_info:
258-
# analysis_result[filename] = detail_info
257+
raise HeuristicAnalyzerValueError(error_msg) from sourcecode_error
259258

260259
return result, analysis_result
261260

src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -409,7 +409,6 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
409409
component=ctx.component,
410410
pypi_registry=pypi_registry,
411411
package_json={},
412-
package_sourcecode={},
413412
package_sourcecode_path="",
414413
)
415414

src/macaron/slsa_analyzer/package_registry/pypi_registry.py

Lines changed: 102 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import tarfile
1111
import tempfile
1212
import urllib.parse
13-
from collections.abc import Callable
13+
from collections.abc import Callable, Iterator
1414
from dataclasses import dataclass
1515
from datetime import datetime
1616

@@ -31,6 +31,10 @@
3131
logger: logging.Logger = logging.getLogger(__name__)
3232

3333

34+
def _handle_temp_dir_clean(function: Callable, path: str, onerror: tuple) -> None:
    """Convert a failed ``shutil.rmtree`` step into a ``SourceCodeError``.

    This is passed as the ``onerror`` callback of ``shutil.rmtree`` when a temporary
    source code directory is removed.

    Parameters
    ----------
    function: Callable
        The function that failed while removing the tree.
    path: str
        The path that was passed to the failing function.
    onerror: tuple
        The exception information supplied by ``shutil.rmtree``.

    Raises
    ------
    SourceCodeError
        Always; the message records the failing function, the path and the exception information.
    """
    failure_details = f"function={function}, path={path}, excinfo={onerror}"
    raise SourceCodeError(f"Error removing with shutil. {failure_details}")
36+
37+
3438
class PyPIRegistry(PackageRegistry):
3539
"""This class implements the pypi package registry."""
3640

@@ -187,10 +191,7 @@ def download_package_json(self, url: str) -> dict:
187191

188192
return res_obj
189193

190-
def _handle_temp_dir_clean(self, function: Callable, path: str, onerror: tuple) -> None:
191-
raise SourceCodeError(f"Error removing with shutil. function={function}, " f"path={path}, excinfo={onerror}")
192-
193-
def download_package_sourcecode(self, url: str) -> tuple[dict[str, bytes], str]:
194+
def download_package_sourcecode(self, url: str) -> str:
194195
"""Download the package source code from pypi registry.
195196
196197
Parameters
@@ -200,11 +201,14 @@ def download_package_sourcecode(self, url: str) -> tuple[dict[str, bytes], str]:
200201
201202
Returns
202203
-------
203-
tuple[dict[str, bytes], str]
204-
A dictionary of filenames and file contents, and the temp directory with the source code.
205-
"""
206-
sourcecode: dict = {}
204+
str
205+
The temp directory with the source code.
207206
207+
Raises
208+
------
209+
InvalidHTTPResponseError
210+
If the HTTP request to the registry fails or an unexpected response is returned.
211+
"""
208212
# Get name of file.
209213
_, _, file_name = url.rpartition("/")
210214
package_name = re.sub(r"\.tar\.gz$", "", file_name)
@@ -216,7 +220,7 @@ def download_package_sourcecode(self, url: str) -> tuple[dict[str, bytes], str]:
216220
error_msg = f"Unable to find package source code using URL: {url}"
217221
logger.debug(error_msg)
218222
try:
219-
shutil.rmtree(temp_dir, onerror=self._handle_temp_dir_clean)
223+
shutil.rmtree(temp_dir, onerror=_handle_temp_dir_clean)
220224
except SourceCodeError as tempdir_exception:
221225
tempdir_exception_msg = (
222226
f"Unable to cleanup temporary directory {temp_dir} for source code: {tempdir_exception}"
@@ -235,7 +239,7 @@ def download_package_sourcecode(self, url: str) -> tuple[dict[str, bytes], str]:
235239
error_msg = f"Error while streaming source file: {stream_error}"
236240
logger.debug(error_msg)
237241
try:
238-
shutil.rmtree(temp_dir, onerror=self._handle_temp_dir_clean)
242+
shutil.rmtree(temp_dir, onerror=_handle_temp_dir_clean)
239243
except SourceCodeError as tempdir_exception:
240244
tempdir_exception_msg = (
241245
f"Unable to cleanup temporary directory {temp_dir} for source code: {tempdir_exception}"
@@ -249,15 +253,11 @@ def download_package_sourcecode(self, url: str) -> tuple[dict[str, bytes], str]:
249253
with tarfile.open(source_file.name, "r:gz") as sourcecode_tar:
250254
sourcecode_tar.extractall(temp_dir, filter="data")
251255

252-
for member in sourcecode_tar.getmembers():
253-
if member.isfile() and (file_obj := sourcecode_tar.extractfile(member)):
254-
sourcecode[member.name] = file_obj.read()
255-
256256
except tarfile.ReadError as read_error:
257257
error_msg = f"Error reading source code tar file: {read_error}"
258258
logger.debug(error_msg)
259259
try:
260-
shutil.rmtree(temp_dir, onerror=self._handle_temp_dir_clean)
260+
shutil.rmtree(temp_dir, onerror=_handle_temp_dir_clean)
261261
except SourceCodeError as tempdir_exception:
262262
tempdir_exception_msg = (
263263
f"Unable to cleanup temporary directory {temp_dir} for source code: {tempdir_exception}"
@@ -266,11 +266,16 @@ def download_package_sourcecode(self, url: str) -> tuple[dict[str, bytes], str]:
266266

267267
raise InvalidHTTPResponseError(error_msg) from read_error
268268

269+
extracted_dir = os.listdir(temp_dir)
270+
if len(extracted_dir) == 1 and re.sub(".tar.gz$", "", file_name) == extracted_dir[0]:
271+
# structure used package name and version as top-level directory
272+
temp_dir = os.path.join(temp_dir, extracted_dir[0])
273+
269274
else:
270275
error_msg = f"Unable to extract source code from file {file_name}"
271276
logger.debug(error_msg)
272277
try:
273-
shutil.rmtree(temp_dir, onerror=self._handle_temp_dir_clean)
278+
shutil.rmtree(temp_dir, onerror=_handle_temp_dir_clean)
274279
except SourceCodeError as tempdir_exception:
275280
tempdir_exception_msg = (
276281
f"Unable to cleanup temporary directory {temp_dir} for source code: {tempdir_exception}"
@@ -281,7 +286,7 @@ def download_package_sourcecode(self, url: str) -> tuple[dict[str, bytes], str]:
281286
raise InvalidHTTPResponseError(error_msg)
282287

283288
logger.debug("Temporary download and unzip of %s stored in %s", file_name, temp_dir)
284-
return sourcecode, temp_dir
289+
return temp_dir
285290

286291
def get_package_page(self, package_name: str) -> str | None:
287292
"""Implement custom API to get package main page.
@@ -401,9 +406,6 @@ class PyPIPackageJsonAsset:
401406
#: The asset content.
402407
package_json: dict
403408

404-
#: The source code of the package hosted on PyPI
405-
package_sourcecode: dict
406-
407409
#: the source code temporary location name
408410
package_sourcecode_path: str
409411

@@ -537,7 +539,7 @@ def get_latest_release_upload_time(self) -> str | None:
537539
return None
538540

539541
def download_sourcecode(self) -> bool:
540-
"""Get the source code of the package and store it in the package_sourcecode attribute.
542+
"""Get the source code of the package and store it in a temporary directory.
541543
542544
Returns
543545
-------
@@ -547,26 +549,22 @@ def download_sourcecode(self) -> bool:
547549
url = self.get_sourcecode_url()
548550
if url:
549551
try:
550-
self.package_sourcecode, self.package_sourcecode_path = self.pypi_registry.download_package_sourcecode(
551-
url
552-
)
552+
self.package_sourcecode_path = self.pypi_registry.download_package_sourcecode(url)
553553
return True
554554
except InvalidHTTPResponseError as error:
555555
logger.debug(error)
556556
return False
557557

558-
def _handle_temp_dir_clean(self, function: Callable, path: str, onerror: tuple) -> None:
559-
raise SourceCodeError(f"Error removing with shutil. function={function}, " f"path={path}, excinfo={onerror}")
560-
561558
def cleanup_sourcecode(self) -> None:
562559
"""
563560
Delete the temporary directory created when downloading the source code.
564561
565-
The package source code is no longer accessible after this.
562+
The package source code is no longer accessible after this, and the package_sourcecode_path
563+
attribute is set to an empty string.
566564
"""
567565
if self.package_sourcecode_path:
568566
try:
569-
shutil.rmtree(self.package_sourcecode_path, onerror=self._handle_temp_dir_clean)
567+
shutil.rmtree(self.package_sourcecode_path, onerror=_handle_temp_dir_clean)
570568
self.package_sourcecode_path = ""
571569
except SourceCodeError as tempdir_exception:
572570
tempdir_exception_msg = (
@@ -575,3 +573,77 @@ def cleanup_sourcecode(self) -> None:
575573
)
576574
logger.debug(tempdir_exception_msg)
577575
raise tempdir_exception
576+
577+
def get_sourcecode_file_contents(self, path: str) -> bytes:
    """
    Read the raw bytes of one file from the downloaded package source code.

    A relative path is resolved against the package_sourcecode_path attribute; an absolute
    path is used as given.

    Parameters
    ----------
    path: str
        The file to open, either absolute or relative to package_sourcecode_path.

    Returns
    -------
    bytes
        The raw contents of the source code file.

    Raises
    ------
    SourceCodeError
        if the source code has not been downloaded, the file does not exist, or the file
        cannot be read.
    """
    source_root = self.package_sourcecode_path
    if not source_root:
        error_msg = "No source code files have been downloaded"
        logger.debug(error_msg)
        raise SourceCodeError(error_msg)

    # Absolute paths are taken verbatim; everything else is anchored at the source directory.
    target = path if os.path.isabs(path) else os.path.join(source_root, path)

    if not os.path.exists(target):
        error_msg = f"Unable to locate file {target}"
        logger.debug(error_msg)
        raise SourceCodeError(error_msg)

    try:
        with open(target, "rb") as handle:
            data = handle.read()
    except OSError as read_error:
        error_msg = f"Unable to read file {target}: {read_error}"
        logger.debug(error_msg)
        raise SourceCodeError(error_msg) from read_error

    return data
618+
619+
def iter_sourcecode(self) -> Iterator[tuple[str, bytes]]:
    """
    Iterate through all source code files.

    This is a generator, so the check that source code has been downloaded (and any
    resulting error) happens when iteration starts rather than when the method is called.
    Only one file's contents are read at a time.

    Yields
    ------
    tuple[str, bytes]
        The source code file path, and the raw contents of the source code file.

    Raises
    ------
    SourceCodeError
        if the source code has not been downloaded, or a source code file cannot be read.
    """
    if not self.package_sourcecode_path:
        error_msg = "No source code files have been downloaded"
        logger.debug(error_msg)
        raise SourceCodeError(error_msg)

    for root, _directories, files in os.walk(self.package_sourcecode_path):
        # os.path.join supplies the path separator itself. The previous code appended
        # os.linesep (a newline) to the working directory, producing paths that do not exist.
        root_path = os.getcwd() if root == "." else root

        for file in files:
            filepath = os.path.join(root_path, file)

            try:
                with open(filepath, "rb") as handle:
                    contents = handle.read()
            except OSError as read_error:
                # Report read failures with the same exception type as
                # get_sourcecode_file_contents so callers only need to handle SourceCodeError.
                error_msg = f"Unable to read file {filepath}: {read_error}"
                logger.debug(error_msg)
                raise SourceCodeError(error_msg) from read_error

            yield filepath, contents

0 commit comments

Comments
 (0)