Skip to content

Commit 8647298

Browse files
Skn0ttMargarete01
andauthored
ENH: add support for reading .tar archives (#44787)
* Add reproduction test for .tar.gz archives co-authored-by: Margarete Dippel <[email protected]> * add support for .tar archives python's `tarfile` supports gzip, xz and bz2 encoding, so we don't need to make any special cases for that. co-authored-by: Margarete Dippel <[email protected]> * update doc comments * fix: pep8 errors * refactor: flip _compression_to_extension around to support multiple extensions on same compression co-authored-by: Margarete Dippel <[email protected] y.github.com> * refactor: detect tar files using existing extension mapping co-authored-by: Margarete Dippel <[email protected]> * feat: add support for writing tar files co-authored-by: Margarete Dippel <[email protected]> * feat: assure it respects .gz endings * feat: add "tar" entry to compressionoptions * chore: add whatsnew entry * fix: test_compression_size_fh * add tarfile to shared compression docs * fix formatting * pass through "mode" via compression args * fix pickle test * add class comment * sort imports * add _compression_to_extension back for backwards compatibility * fix some type warnings * fix: formatting * fix: mypy complaints * fix: more tests * fix: some error with xml * fix: interpreted text role * move to v1.5 whatsnw * add versionadded note * don't leave blank lines * add tests for zero files / multiple files * move _compression_to_extension to tests * revert added "mode" argument * add test to ensure that `compression.mode` works * compare strings, not bytes * replace carriage returns Co-authored-by: Margarete Dippel <[email protected]>
1 parent 4d9439e commit 8647298

File tree

17 files changed

+370
-57
lines changed

17 files changed

+370
-57
lines changed

doc/source/whatsnew/v1.5.0.rst

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,31 @@ as seen in the following example.
100100
1 2021-01-02 08:00:00 4
101101
2 2021-01-02 16:00:00 5
102102
103+
.. _whatsnew_150.enhancements.tar:
104+
105+
Reading directly from TAR archives
106+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
107+
108+
I/O methods like :func:`read_csv` or :meth:`DataFrame.to_json` now allow reading and writing
109+
directly on TAR archives (:issue:`44787`).
110+
111+
.. code-block:: python
112+
113+
df = pd.read_csv("./movement.tar.gz")
114+
# ...
115+
df.to_csv("./out.tar.gz")
116+
117+
This supports ``.tar``, ``.tar.gz``, ``.tar.bz2`` and ``.tar.xz`` archives.
118+
The compression method used is inferred from the filename.
119+
If the compression method cannot be inferred, use the ``compression`` argument:
120+
121+
.. code-block:: python
122+
123+
df = pd.read_csv(some_file_obj, compression={"method": "tar", "mode": "r:gz"}) # noqa F821
124+
125+
(``mode`` being one of ``tarfile.open``'s modes: https://docs.python.org/3/library/tarfile.html#tarfile.open)
126+
127+
103128
.. _whatsnew_150.enhancements.other:
104129

105130
Other enhancements

pandas/_testing/_io.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
import bz2
44
from functools import wraps
55
import gzip
6+
import io
67
import socket
8+
import tarfile
79
from typing import (
810
TYPE_CHECKING,
911
Any,
@@ -398,6 +400,14 @@ def write_to_compressed(compression, path, data, dest="test"):
398400
mode = "w"
399401
args = (dest, data)
400402
method = "writestr"
403+
elif compression == "tar":
404+
compress_method = tarfile.TarFile
405+
mode = "w"
406+
file = tarfile.TarInfo(name=dest)
407+
bytes = io.BytesIO(data)
408+
file.size = len(data)
409+
args = (file, bytes)
410+
method = "addfile"
401411
elif compression == "gzip":
402412
compress_method = gzip.GzipFile
403413
elif compression == "bz2":

pandas/_typing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,7 @@ def closed(self) -> bool:
256256
# compression keywords and compression
257257
CompressionDict = Dict[str, Any]
258258
CompressionOptions = Optional[
259-
Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"], CompressionDict]
259+
Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd", "tar"], CompressionDict]
260260
]
261261

262262
# types in DataFrameFormatter

pandas/conftest.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,7 @@ def other_closed(request):
294294
"bz2",
295295
"zip",
296296
"xz",
297+
"tar",
297298
pytest.param("zstd", marks=td.skip_if_no("zstandard")),
298299
]
299300
)
@@ -310,6 +311,7 @@ def compression(request):
310311
"bz2",
311312
"zip",
312313
"xz",
314+
"tar",
313315
pytest.param("zstd", marks=td.skip_if_no("zstandard")),
314316
]
315317
)

pandas/core/shared_docs.py

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -421,29 +421,43 @@
421421
] = """compression : str or dict, default 'infer'
422422
For on-the-fly compression of the output data. If 'infer' and '%s'
423423
path-like, then detect compression from the following extensions: '.gz',
424-
'.bz2', '.zip', '.xz', or '.zst' (otherwise no compression). Set to
425-
``None`` for no compression. Can also be a dict with key ``'method'`` set
426-
to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other
427-
key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``,
428-
``bz2.BZ2File``, or ``zstandard.ZstdDecompressor``, respectively. As an
429-
example, the following could be passed for faster compression and to create
424+
'.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
425+
(otherwise no compression).
426+
Set to ``None`` for no compression.
427+
Can also be a dict with key ``'method'`` set
428+
to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other
429+
key-value pairs are forwarded to
430+
``zipfile.ZipFile``, ``gzip.GzipFile``,
431+
``bz2.BZ2File``, ``zstandard.ZstdDecompressor`` or
432+
``tarfile.TarFile``, respectively.
433+
As an example, the following could be passed for faster compression and to create
430434
a reproducible gzip archive:
431-
``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``."""
435+
``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.
436+
437+
.. versionadded:: 1.5.0
438+
Added support for `.tar` files."""
432439

433440
_shared_docs[
434441
"decompression_options"
435442
] = """compression : str or dict, default 'infer'
436443
For on-the-fly decompression of on-disk data. If 'infer' and '%s' is
437444
path-like, then detect compression from the following extensions: '.gz',
438-
'.bz2', '.zip', '.xz', or '.zst' (otherwise no compression). If using
439-
'zip', the ZIP file must contain only one data file to be read in. Set to
440-
``None`` for no decompression. Can also be a dict with key ``'method'`` set
441-
to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other
442-
key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``,
443-
``bz2.BZ2File``, or ``zstandard.ZstdDecompressor``, respectively. As an
444-
example, the following could be passed for Zstandard decompression using a
445+
'.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
446+
(otherwise no compression).
447+
If using 'zip' or 'tar', the archive must contain only one data file to be read in.
448+
Set to ``None`` for no decompression.
449+
Can also be a dict with key ``'method'`` set
450+
to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other
451+
key-value pairs are forwarded to
452+
``zipfile.ZipFile``, ``gzip.GzipFile``,
453+
``bz2.BZ2File``, ``zstandard.ZstdDecompressor`` or
454+
``tarfile.TarFile``, respectively.
455+
As an example, the following could be passed for Zstandard decompression using a
445456
custom compression dictionary:
446-
``compression={'method': 'zstd', 'dict_data': my_compression_dict}``."""
457+
``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
458+
459+
.. versionadded:: 1.5.0
460+
Added support for `.tar` files."""
447461

448462
_shared_docs[
449463
"replace"

pandas/io/common.py

Lines changed: 151 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from io import (
1111
BufferedIOBase,
1212
BytesIO,
13+
FileIO,
1314
RawIOBase,
1415
StringIO,
1516
TextIOBase,
@@ -19,6 +20,7 @@
1920
import os
2021
from pathlib import Path
2122
import re
23+
import tarfile
2224
from typing import (
2325
IO,
2426
Any,
@@ -450,13 +452,18 @@ def file_path_to_url(path: str) -> str:
450452
return urljoin("file:", pathname2url(path))
451453

452454

453-
_compression_to_extension = {
454-
"gzip": ".gz",
455-
"bz2": ".bz2",
456-
"zip": ".zip",
457-
"xz": ".xz",
458-
"zstd": ".zst",
455+
_extension_to_compression = {
456+
".tar": "tar",
457+
".tar.gz": "tar",
458+
".tar.bz2": "tar",
459+
".tar.xz": "tar",
460+
".gz": "gzip",
461+
".bz2": "bz2",
462+
".zip": "zip",
463+
".xz": "xz",
464+
".zst": "zstd",
459465
}
466+
_supported_compressions = set(_extension_to_compression.values())
460467

461468

462469
def get_compression_method(
@@ -532,20 +539,18 @@ def infer_compression(
532539
return None
533540

534541
# Infer compression from the filename/URL extension
535-
for compression, extension in _compression_to_extension.items():
542+
for extension, compression in _extension_to_compression.items():
536543
if filepath_or_buffer.lower().endswith(extension):
537544
return compression
538545
return None
539546

540547
# Compression has been specified. Check that it's valid
541-
if compression in _compression_to_extension:
548+
if compression in _supported_compressions:
542549
return compression
543550

544551
# https://github.com/python/mypy/issues/5492
545552
# Unsupported operand types for + ("List[Optional[str]]" and "List[str]")
546-
valid = ["infer", None] + sorted(
547-
_compression_to_extension
548-
) # type: ignore[operator]
553+
valid = ["infer", None] + sorted(_supported_compressions) # type: ignore[operator]
549554
msg = (
550555
f"Unrecognized compression type: {compression}\n"
551556
f"Valid compression types are {valid}"
@@ -682,7 +687,7 @@ def get_handle(
682687
ioargs.encoding,
683688
ioargs.mode,
684689
errors,
685-
ioargs.compression["method"] not in _compression_to_extension,
690+
ioargs.compression["method"] not in _supported_compressions,
686691
)
687692

688693
is_path = isinstance(handle, str)
@@ -753,6 +758,30 @@ def get_handle(
753758
f"Only one file per ZIP: {zip_names}"
754759
)
755760

761+
# TAR Encoding
762+
elif compression == "tar":
763+
if "mode" not in compression_args:
764+
compression_args["mode"] = ioargs.mode
765+
if is_path:
766+
handle = _BytesTarFile.open(name=handle, **compression_args)
767+
else:
768+
handle = _BytesTarFile.open(fileobj=handle, **compression_args)
769+
assert isinstance(handle, _BytesTarFile)
770+
if handle.mode == "r":
771+
handles.append(handle)
772+
files = handle.getnames()
773+
if len(files) == 1:
774+
file = handle.extractfile(files[0])
775+
assert file is not None
776+
handle = file
777+
elif len(files) == 0:
778+
raise ValueError(f"Zero files found in TAR archive {path_or_buf}")
779+
else:
780+
raise ValueError(
781+
"Multiple files found in TAR archive. "
782+
f"Only one file per TAR archive: {files}"
783+
)
784+
756785
# XZ Compression
757786
elif compression == "xz":
758787
handle = get_lzma_file()(handle, ioargs.mode)
@@ -844,6 +873,116 @@ def get_handle(
844873
)
845874

846875

876+
# error: Definition of "__exit__" in base class "TarFile" is incompatible with
877+
# definition in base class "BytesIO" [misc]
878+
# error: Definition of "__enter__" in base class "TarFile" is incompatible with
879+
# definition in base class "BytesIO" [misc]
880+
# error: Definition of "__enter__" in base class "TarFile" is incompatible with
881+
# definition in base class "BinaryIO" [misc]
882+
# error: Definition of "__enter__" in base class "TarFile" is incompatible with
883+
# definition in base class "IO" [misc]
884+
# error: Definition of "read" in base class "TarFile" is incompatible with
885+
# definition in base class "BytesIO" [misc]
886+
# error: Definition of "read" in base class "TarFile" is incompatible with
887+
# definition in base class "IO" [misc]
888+
class _BytesTarFile(tarfile.TarFile, BytesIO): # type: ignore[misc]
889+
"""
890+
Wrapper for standard library class TarFile and allow the returned file-like
891+
handle to accept byte strings via `write` method.
892+
893+
BytesIO provides attributes of file-like object and TarFile.addfile writes
894+
bytes strings into a member of the archive.
895+
"""
896+
897+
# GH 17778
898+
def __init__(
899+
self,
900+
name: str | bytes | os.PathLike[str] | os.PathLike[bytes],
901+
mode: Literal["r", "a", "w", "x"],
902+
fileobj: FileIO,
903+
archive_name: str | None = None,
904+
**kwargs,
905+
):
906+
self.archive_name = archive_name
907+
self.multiple_write_buffer: BytesIO | None = None
908+
self._closing = False
909+
910+
super().__init__(name=name, mode=mode, fileobj=fileobj, **kwargs)
911+
912+
@classmethod
913+
def open(cls, name=None, mode="r", **kwargs):
914+
mode = mode.replace("b", "")
915+
return super().open(name=name, mode=cls.extend_mode(name, mode), **kwargs)
916+
917+
@classmethod
918+
def extend_mode(
919+
cls, name: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], mode: str
920+
) -> str:
921+
if mode != "w":
922+
return mode
923+
if isinstance(name, (os.PathLike, str)):
924+
filename = Path(name)
925+
if filename.suffix == ".gz":
926+
return mode + ":gz"
927+
elif filename.suffix == ".xz":
928+
return mode + ":xz"
929+
elif filename.suffix == ".bz2":
930+
return mode + ":bz2"
931+
return mode
932+
933+
def infer_filename(self):
934+
"""
935+
If an explicit archive_name is not given, we still want the file inside the zip
936+
file not to be named something.tar, because that causes confusion (GH39465).
937+
"""
938+
if isinstance(self.name, (os.PathLike, str)):
939+
# error: Argument 1 to "Path" has
940+
# incompatible type "Union[str, PathLike[str], PathLike[bytes]]";
941+
# expected "Union[str, PathLike[str]]" [arg-type]
942+
filename = Path(self.name) # type: ignore[arg-type]
943+
if filename.suffix == ".tar":
944+
return filename.with_suffix("").name
945+
if filename.suffix in [".tar.gz", ".tar.bz2", ".tar.xz"]:
946+
return filename.with_suffix("").with_suffix("").name
947+
return filename.name
948+
return None
949+
950+
def write(self, data):
951+
# buffer multiple write calls, write on flush
952+
if self.multiple_write_buffer is None:
953+
self.multiple_write_buffer = BytesIO()
954+
self.multiple_write_buffer.write(data)
955+
956+
def flush(self) -> None:
957+
# write to actual handle and close write buffer
958+
if self.multiple_write_buffer is None or self.multiple_write_buffer.closed:
959+
return
960+
961+
# TarFile needs a non-empty string
962+
archive_name = self.archive_name or self.infer_filename() or "tar"
963+
with self.multiple_write_buffer:
964+
value = self.multiple_write_buffer.getvalue()
965+
tarinfo = tarfile.TarInfo(name=archive_name)
966+
tarinfo.size = len(value)
967+
self.addfile(tarinfo, BytesIO(value))
968+
969+
def close(self):
970+
self.flush()
971+
super().close()
972+
973+
@property
974+
def closed(self):
975+
if self.multiple_write_buffer is None:
976+
return False
977+
return self.multiple_write_buffer.closed and super().closed
978+
979+
@closed.setter
980+
def closed(self, value):
981+
if not self._closing and value:
982+
self._closing = True
983+
self.close()
984+
985+
847986
# error: Definition of "__exit__" in base class "ZipFile" is incompatible with
848987
# definition in base class "BytesIO" [misc]
849988
# error: Definition of "__enter__" in base class "ZipFile" is incompatible with

pandas/tests/io/formats/test_to_csv.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,7 @@
1313
compat,
1414
)
1515
import pandas._testing as tm
16-
17-
import pandas.io.common as icom
16+
from pandas.tests.io.test_compression import _compression_to_extension
1817

1918

2019
class TestToCSV:
@@ -555,7 +554,7 @@ def test_to_csv_compression(self, compression_only, read_infer, to_infer):
555554

556555
# We'll complete file extension subsequently.
557556
filename = "test."
558-
filename += icom._compression_to_extension[compression]
557+
filename += _compression_to_extension[compression]
559558

560559
df = DataFrame({"A": [1]})
561560

0 commit comments

Comments
 (0)