Skip to content

Commit 09b0581

Browse files
committed
Use subprocess tar command to extract files from tarball
1 parent fd8d695 commit 09b0581

File tree

1 file changed

+53
-12
lines changed

1 file changed

+53
-12
lines changed

lib/pbench/server/cache_manager.py

Lines changed: 53 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import shutil
88
import subprocess
99
import tarfile
10+
from time import sleep
1011
from typing import Any, IO, Optional, Union
1112

1213
from pbench.common import MetadataLog, selinux
@@ -521,19 +522,59 @@ def extract(tarball_path: Path, path: str) -> Inventory:
521522
Raise:
522523
TarballNotFound on failure opening the tarball
523524
CacheExtractBadPath if the target cannot be extracted
525+
Any exception raised by subprocess.Popen()
526+
RuntimeError on unexpected failures (see message)
524527
"""
525-
try:
526-
tar = tarfile.open(tarball_path, "r:*")
527-
except Exception as exc:
528-
raise TarballNotFound(str(tarball_path)) from exc
529-
try:
530-
stream = tar.extractfile(str(path))
531-
except Exception as exc:
532-
raise CacheExtractBadPath(tarball_path, path) from exc
533-
else:
534-
if not stream:
535-
raise CacheExtractBadPath(tarball_path, path)
536-
return Inventory(stream, tar)
528+
tar_path = shutil.which("tar")
529+
if tar_path is None:
530+
raise RuntimeError("External 'tar' executable not found")
531+
532+
# The external tar utility offers better capabilities than the
533+
# Standard Library package, so run it in a subprocess: extract
534+
# the target member from the specified tar archive and direct it to
535+
# stdout; we expect only one occurrence of the target member, so stop
536+
# processing as soon as we find it instead of looking for additional
537+
# instances of it later in the archive -- this is a huge savings when
538+
# the archive is very large.
539+
tarproc = subprocess.Popen(
540+
[str(tar_path), "xOf", tarball_path, "--occurrence=1", path],
541+
stdin=subprocess.DEVNULL,
542+
stdout=subprocess.PIPE,
543+
stderr=subprocess.PIPE
544+
)
545+
546+
# Wait for one of two things to happen: either the subprocess produces
547+
# some output or it exits.
548+
while not tarproc.stdout.peek() and tarproc.poll() is None:
549+
sleep(0.02)
550+
551+
# If the return code is None (meaning the command is still running) or
552+
# is zero (meaning it completed successfully), then return the stream
553+
# containing the extracted file to our caller.
554+
if not tarproc.returncode:
555+
# Since we own the `tarproc` object, we don't need to return a
556+
# value for the second part of the Inventory object (this is an
557+
# artifact from when we used the Standard Library tarfile
558+
# package).
559+
return Inventory(tarproc.stdout, None)
560+
561+
# The tar command was invoked successfully (otherwise, the Popen()
562+
# constructor would have raised an exception), but it exited with
563+
# an error code. We have to glean what went wrong by looking at
564+
# stderr, which is fragile but the only option. Rather than
565+
# relying on looking for specific text, we assume that, if the
566+
# error references the tar file, the file was not found (or is
567+
# otherwise inaccessible) and if the error references the archive
568+
# member, then it was a bad path. (Failing those, report a generic
569+
# failure.)
570+
error_text = tarproc.stderr.read().decode()
571+
if str(tarball_path) in error_text:
572+
# "tar: /path/to/bad_tarball.tar.xz: Cannot open: No such file or directory"
573+
raise TarballNotFound(str(tarball_path))
574+
if path in error_text:
575+
# "tar: missing_member.txt: Not found in archive"
576+
raise CacheExtractBadPath(tarball_path, path)
577+
raise RuntimeError(f"Unexpected error from {tar_path}: {error_text!r}")
537578

538579
def get_inventory(self, path: str) -> Optional[JSONOBJECT]:
539580
"""Access the file stream of a tarball member file.

0 commit comments

Comments
 (0)