diff --git a/MANIFEST.in b/MANIFEST.in
index c6ddc79eaa83c..78464c9aaedc8 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -36,6 +36,7 @@ global-exclude *.xpt
 global-exclude *.cpt
 global-exclude *.xz
 global-exclude *.zip
+global-exclude *.zst
 global-exclude *~
 global-exclude .DS_Store
 global-exclude .git*
diff --git a/ci/deps/actions-38-slow.yaml b/ci/deps/actions-38-slow.yaml
index cfafcd679e9b9..5b3ff947aef8a 100644
--- a/ci/deps/actions-38-slow.yaml
+++ b/ci/deps/actions-38-slow.yaml
@@ -34,3 +34,4 @@ dependencies:
   - xlsxwriter
   - xlwt
   - numba
+  - zstandard
diff --git a/ci/deps/actions-39-slow.yaml b/ci/deps/actions-39-slow.yaml
index 511ea8004af70..46e8ca17f096b 100644
--- a/ci/deps/actions-39-slow.yaml
+++ b/ci/deps/actions-39-slow.yaml
@@ -38,3 +38,4 @@ dependencies:
   - xlwt
   - pyreadstat
   - pyxlsb
+  - zstandard
diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml
index 81bc5a028e96c..0373599b787f0 100644
--- a/ci/deps/actions-39.yaml
+++ b/ci/deps/actions-39.yaml
@@ -37,3 +37,4 @@ dependencies:
   - xlwt
   - pyreadstat
   - pyxlsb
+  - zstandard
diff --git a/ci/deps/azure-macos-38.yaml b/ci/deps/azure-macos-38.yaml
index 03de63fed4f9c..472dc8754d13e 100644
--- a/ci/deps/azure-macos-38.yaml
+++ b/ci/deps/azure-macos-38.yaml
@@ -32,6 +32,7 @@ dependencies:
   - xlrd
   - xlsxwriter
   - xlwt
+  - zstandard
  - pip
  - pip:
    - cython>=0.29.24
diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml
index 9b68eb6f4c55b..aa2921ca7e496 100644
--- a/ci/deps/azure-windows-38.yaml
+++ b/ci/deps/azure-windows-38.yaml
@@ -32,3 +32,4 @@ dependencies:
   - xlrd
   - xlsxwriter
   - xlwt
+  - zstandard
diff --git a/ci/deps/azure-windows-39.yaml b/ci/deps/azure-windows-39.yaml
index e582de42cc17f..3d8fe05e2fce5 100644
--- a/ci/deps/azure-windows-39.yaml
+++ b/ci/deps/azure-windows-39.yaml
@@ -37,3 +37,4 @@ dependencies:
   - xlwt
   - pyreadstat
   - pyxlsb
+  - zstandard
diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-38-arm64.yaml
index 7912ce5f8a5fa..60608c3ee1a86 100644
--- a/ci/deps/circle-38-arm64.yaml
+++ b/ci/deps/circle-38-arm64.yaml
@@ -12,8 +12,10 @@ dependencies:
   # pandas dependencies
   - botocore>=1.11
+  - flask
+  - moto
   - numpy
   - python-dateutil
   - pytz
-  - flask
-  - moto
+  - zstandard
+  - pip
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index c054153fe4a43..28b9da137e86d 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -402,3 +402,13 @@ qtpy                      Clipboard I/O
 xclip                                        Clipboard I/O on linux
 xsel                                         Clipboard I/O on linux
 ========================= ================== =============================================================
+
+
+Compression
+^^^^^^^^^^^
+
+========================= ================== =============================================================
+Dependency                Minimum Version    Notes
+========================= ================== =============================================================
+Zstandard                 0.15.2             Zstandard compression
+========================= ================== =============================================================
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 403599297a492..9faef9b15bfb4 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -316,14 +316,14 @@ chunksize : int, default ``None``
 Quoting, compression, and file format
 +++++++++++++++++++++++++++++++++++++
 
-compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``, ``dict``}, default ``'infer'``
+compression : {``'infer'``, ``'gzip'``,
``'bz2'``, ``'zip'``, ``'xz'``, ``'zstd'``, ``None``, ``dict``}, default ``'infer'``
     For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
-    bz2, zip, or xz if ``filepath_or_buffer`` is path-like ending in '.gz', '.bz2',
-    '.zip', or '.xz', respectively, and no decompression otherwise. If using 'zip',
+    bz2, zip, xz, or zstandard if ``filepath_or_buffer`` is path-like ending in '.gz', '.bz2',
+    '.zip', '.xz', or '.zst', respectively, and no decompression otherwise. If using 'zip',
     the ZIP file must contain only one data file to be read in. Set to
     ``None`` for no decompression. Can also be a dict with key ``'method'``
-    set to one of {``'zip'``, ``'gzip'``, ``'bz2'``} and other key-value pairs are
-    forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, or ``bz2.BZ2File``.
+    set to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other key-value pairs are
+    forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, ``bz2.BZ2File``, or ``zstandard.ZstdDecompressor``.
     As an example, the following could be passed for faster compression and to
     create a reproducible gzip archive:
     ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.
@@ -4032,18 +4032,18 @@ Compressed pickle files
 '''''''''''''''''''''''
 
 :func:`read_pickle`, :meth:`DataFrame.to_pickle` and :meth:`Series.to_pickle` can read
-and write compressed pickle files. The compression types of ``gzip``, ``bz2``, ``xz`` are supported for reading and writing.
+and write compressed pickle files. The compression types of ``gzip``, ``bz2``, ``xz``, and ``zstd`` are supported for reading and writing.
 The ``zip`` file format only supports reading and must contain only one data file
 to be read.
 
 The compression type can be an explicit parameter or be inferred from the file extension.
-If 'infer', then use ``gzip``, ``bz2``, ``zip``, or ``xz`` if filename ends in ``'.gz'``, ``'.bz2'``, ``'.zip'``, or
-``'.xz'``, respectively.
+If 'infer', then use ``gzip``, ``bz2``, ``zip``, ``xz``, or ``zstd`` if filename ends in ``'.gz'``, ``'.bz2'``, ``'.zip'``,
+``'.xz'``, or ``'.zst'``, respectively.
 
 The compression parameter can also be a ``dict`` in order to pass options to the
 compression protocol. It must have a ``'method'`` key set to the name
 of the compression protocol, which must be one of
-{``'zip'``, ``'gzip'``, ``'bz2'``}. All other key-value pairs are passed to
+{``'zip'``, ``'gzip'``, ``'bz2'``, ``'xz'``, ``'zstd'``}. All other key-value pairs are passed to
 the underlying compression library.
 
 .. ipython:: python
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index ffd32e263aa50..fcfd429e1df0a 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -234,6 +234,7 @@ Other enhancements
 - :meth:`DataFrame.take` now raises a ``TypeError`` when passed a scalar for the indexer (:issue:`42875`)
 - :meth:`is_list_like` now identifies duck-arrays as list-like unless ``.ndim == 0`` (:issue:`35131`)
 - :class:`ExtensionDtype` and :class:`ExtensionArray` are now (de)serialized when exporting a :class:`DataFrame` with :meth:`DataFrame.to_json` using ``orient='table'`` (:issue:`20612`, :issue:`44705`).
+- Add support for `Zstandard <https://facebook.github.io/zstd/>`_ compression to :meth:`DataFrame.to_pickle`/:meth:`read_pickle` and friends (:issue:`43925`)
 -
diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py
index 8fe78fdd499ae..097af99dbfd88 100644
--- a/pandas/_testing/_io.py
+++ b/pandas/_testing/_io.py
@@ -15,6 +15,7 @@
     ReadPickleBuffer,
 )
 from pandas.compat import get_lzma_file
+from pandas.compat._optional import import_optional_dependency
 
 import pandas as pd
 from pandas._testing._random import rands
@@ -364,7 +365,7 @@ def write_to_compressed(compression, path, data, dest="test"):
 
     Parameters
     ----------
-    compression : {'gzip', 'bz2', 'zip', 'xz'}
+    compression : {'gzip', 'bz2', 'zip', 'xz', 'zstd'}
         The compression type to use.
     path : str
         The file path to write the data.
@@ -391,6 +392,8 @@ def write_to_compressed(compression, path, data, dest="test"):
         compress_method = gzip.GzipFile
     elif compression == "bz2":
         compress_method = bz2.BZ2File
+    elif compression == "zstd":
+        compress_method = import_optional_dependency("zstandard").open
     elif compression == "xz":
         compress_method = get_lzma_file()
     else:
diff --git a/pandas/_testing/contexts.py b/pandas/_testing/contexts.py
index aff3e9e5471de..5a77c06d65d07 100644
--- a/pandas/_testing/contexts.py
+++ b/pandas/_testing/contexts.py
@@ -29,7 +29,7 @@ def decompress_file(path, compression):
     path : str
         The path where the file is read from.
 
-    compression : {'gzip', 'bz2', 'zip', 'xz', None}
+    compression : {'gzip', 'bz2', 'zip', 'xz', 'zstd', None}
         Name of the decompression to use
 
     Returns
diff --git a/pandas/_typing.py b/pandas/_typing.py
index 95277e97eae98..eb5bb30238893 100644
--- a/pandas/_typing.py
+++ b/pandas/_typing.py
@@ -243,7 +243,7 @@ def closed(self) -> bool:
 # compression keywords and compression
 CompressionDict = Dict[str, Any]
 CompressionOptions = Optional[
-    Union[Literal["infer", "gzip", "bz2", "zip", "xz"], CompressionDict]
+    Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"], CompressionDict]
 ]
diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index 532150004f4ad..c3b7cfa3e0026 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -34,6 +34,7 @@
     "xlwt": "1.3.0",
     "xlsxwriter": "1.2.2",
     "numba": "0.50.1",
+    "zstandard": "0.15.2",
 }
 
 # A mapping from import name to package name (on PyPI) for packages where
diff --git a/pandas/conftest.py b/pandas/conftest.py
index 8c870bc98b5ff..be28dbe35fcb2 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -267,7 +267,16 @@ def other_closed(request):
     return request.param
 
 
-@pytest.fixture(params=[None, "gzip", "bz2", "zip", "xz"])
+@pytest.fixture(
+    params=[
+        None,
+        "gzip",
+        "bz2",
+        "zip",
+        "xz",
+        pytest.param("zstd", marks=td.skip_if_no("zstandard")),
+    ]
+)
 def compression(request):
     """
     Fixture for trying common compression types in compression tests.
@@ -275,7 +284,15 @@ def compression(request): return request.param -@pytest.fixture(params=["gzip", "bz2", "zip", "xz"]) +@pytest.fixture( + params=[ + "gzip", + "bz2", + "zip", + "xz", + pytest.param("zstd", marks=td.skip_if_no("zstandard")), + ] +) def compression_only(request): """ Fixture for trying common compression types in compression tests excluding diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3cd787748738e..7013c03bb2b67 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -133,7 +133,6 @@ from pandas.core import ( algorithms, common as com, - generic, nanops, ops, ) @@ -155,10 +154,7 @@ sanitize_array, sanitize_masked_array, ) -from pandas.core.generic import ( - NDFrame, - _shared_docs, -) +from pandas.core.generic import NDFrame from pandas.core.indexers import check_key_length from pandas.core.indexes.api import ( DatetimeIndex, @@ -194,6 +190,7 @@ ) from pandas.core.reshape.melt import melt from pandas.core.series import Series +from pandas.core.shared_docs import _shared_docs from pandas.core.sorting import ( get_group_index, lexsort_indexer, @@ -2482,7 +2479,10 @@ def _from_arrays( ) return cls(mgr) - @doc(storage_options=generic._shared_docs["storage_options"]) + @doc( + storage_options=_shared_docs["storage_options"], + compression_options=_shared_docs["compression_options"] % "path", + ) @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") def to_stata( self, @@ -2561,19 +2561,12 @@ def to_stata( format. Only available if version is 117. Storing strings in the StrL format can produce smaller dta files if strings have more than 8 characters and values are repeated. - compression : str or dict, default 'infer' - For on-the-fly compression of the output dta. If string, specifies - compression mode. If dict, value at key 'method' specifies - compression mode. Compression mode must be one of {{'infer', 'gzip', - 'bz2', 'zip', 'xz', None}}. If compression mode is 'infer' and - `fname` is path-like, then detect compression from the following - extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no - compression). If dict and compression mode is one of {{'zip', - 'gzip', 'bz2'}}, or inferred as one of the above, other entries - passed as additional compression options. + {compression_options} .. versionadded:: 1.1.0 + .. versionchanged:: 1.4.0 Zstandard support. + {storage_options} .. versionadded:: 1.2.0 @@ -2734,7 +2727,7 @@ def to_markdown( handles.handle.write(result) return None - @doc(storage_options=generic._shared_docs["storage_options"]) + @doc(storage_options=_shared_docs["storage_options"]) @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") def to_parquet( self, @@ -2939,7 +2932,10 @@ def to_html( render_links=render_links, ) - @doc(storage_options=generic._shared_docs["storage_options"]) + @doc( + storage_options=_shared_docs["storage_options"], + compression_options=_shared_docs["compression_options"] % "path_or_buffer", + ) def to_xml( self, path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, @@ -3016,12 +3012,10 @@ def to_xml( layout of elements and attributes from original output. This argument requires ``lxml`` to be installed. Only XSLT 1.0 scripts and not later versions is currently supported. - compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' - For on-the-fly decompression of on-disk data. If 'infer', then use - gzip, bz2, zip or xz if path_or_buffer is a string ending in - '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression - otherwise. 
If using 'zip', the ZIP file must contain only one data - file to be read in. Set to None for no decompression. + {compression_options} + + .. versionchanged:: 1.4.0 Zstandard support. + {storage_options} Returns diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a683599e20b77..bbdb88a4b04a7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2330,7 +2330,10 @@ def to_excel( ) @final - @doc(storage_options=_shared_docs["storage_options"]) + @doc( + storage_options=_shared_docs["storage_options"], + compression_options=_shared_docs["compression_options"] % "path_or_buf", + ) def to_json( self, path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, @@ -2406,12 +2409,10 @@ def to_json( If 'orient' is 'records' write out line-delimited json format. Will throw ValueError if incorrect 'orient' since others are not list-like. + {compression_options} - compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}} + .. versionchanged:: 1.4.0 Zstandard support. - A string representing the compression to use in the output file, - only used when the first argument is a filename. By default, the - compression is inferred from the filename. index : bool, default True Whether to include the index values in the JSON string. Not including the index (``index=False``) is only supported when @@ -2919,7 +2920,10 @@ def to_sql( ) @final - @doc(storage_options=_shared_docs["storage_options"]) + @doc( + storage_options=_shared_docs["storage_options"], + compression_options=_shared_docs["compression_options"] % "path", + ) def to_pickle( self, path, @@ -2934,17 +2938,7 @@ def to_pickle( ---------- path : str File path where the pickled object will be stored. - compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, \ - default 'infer' - A string representing the compression to use in the output file. By - default, infers from the file extension in specified path. - Compression mode may be any of the following possible - values: {{‘infer’, ‘gzip’, ‘bz2’, ‘zip’, ‘xz’, None}}. If compression - mode is ‘infer’ and path_or_buf is path-like, then detect - compression mode from the following extensions: - ‘.gz’, ‘.bz2’, ‘.zip’ or ‘.xz’. (otherwise no compression). - If dict given and mode is ‘zip’ or inferred as ‘zip’, other entries - passed as additional compression options. + {compression_options} protocol : int Int which indicates which protocol should be used by the pickler, default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible @@ -3338,7 +3332,10 @@ def to_latex( ) @final - @doc(storage_options=_shared_docs["storage_options"]) + @doc( + storage_options=_shared_docs["storage_options"], + compression_options=_shared_docs["compression_options"], + ) def to_csv( self, path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, @@ -3404,19 +3401,7 @@ def to_csv( A string representing the encoding to use in the output file, defaults to 'utf-8'. `encoding` is not supported if `path_or_buf` is a non-binary file object. - compression : str or dict, default 'infer' - If str, represents compression mode. If dict, value at 'method' is - the compression mode. Compression mode may be any of the following - possible values: {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}. If - compression mode is 'infer' and `path_or_buf` is path-like, then - detect compression mode from the following extensions: '.gz', - '.bz2', '.zip' or '.xz'. (otherwise no compression). 
If dict given - and mode is one of {{'zip', 'gzip', 'bz2'}}, or inferred as - one of the above, other entries passed as - additional compression options. - If `path_or_buf` is omitted or `None` or is a file opened in text - mode, this argument is ignored and an (uncompressed) string is - returned/written. + {compression_options} .. versionchanged:: 1.0.0 @@ -3427,8 +3412,7 @@ def to_csv( .. versionchanged:: 1.1.0 Passing compression options as keys in dict is - supported for compression modes 'gzip' and 'bz2' - as well as 'zip'. + supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'. .. versionchanged:: 1.2.0 diff --git a/pandas/core/series.py b/pandas/core/series.py index 77bc816fd52a1..15805c0aa94ed 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -90,7 +90,6 @@ from pandas.core import ( algorithms, base, - generic, missing, nanops, ops, @@ -197,7 +196,7 @@ def wrapper(self): # Series class -class Series(base.IndexOpsMixin, generic.NDFrame): +class Series(base.IndexOpsMixin, NDFrame): """ One-dimensional ndarray with axis labels (including time series). @@ -296,11 +295,11 @@ class Series(base.IndexOpsMixin, generic.NDFrame): _name: Hashable _metadata: list[str] = ["name"] - _internal_names_set = {"index"} | generic.NDFrame._internal_names_set + _internal_names_set = {"index"} | NDFrame._internal_names_set _accessors = {"dt", "cat", "str", "sparse"} _hidden_attrs = ( base.IndexOpsMixin._hidden_attrs - | generic.NDFrame._hidden_attrs + | NDFrame._hidden_attrs | frozenset(["compress", "ptp"]) ) @@ -455,7 +454,7 @@ def __init__( elif manager == "array": data = SingleArrayManager.from_array(data, index) - generic.NDFrame.__init__(self, data) + NDFrame.__init__(self, data) self.name = name self._set_axis(0, index, fastpath=True) @@ -891,7 +890,7 @@ def axes(self) -> list[Index]: # ---------------------------------------------------------------------- # Indexing Methods - @Appender(generic.NDFrame.take.__doc__) + @Appender(NDFrame.take.__doc__) def take(self, indices, axis=0, is_copy=None, **kwargs) -> Series: if is_copy is not None: warnings.warn( @@ -1579,7 +1578,7 @@ def to_string( @doc( klass=_shared_doc_kwargs["klass"], - storage_options=generic._shared_docs["storage_options"], + storage_options=_shared_docs["storage_options"], examples=dedent( """Examples -------- @@ -1872,7 +1871,7 @@ def _set_name(self, name, inplace=False) -> Series: Name: Max Speed, dtype: float64 """ ) - @Appender(generic._shared_docs["groupby"] % _shared_doc_kwargs) + @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) def groupby( self, by=None, @@ -2981,7 +2980,7 @@ def _construct_result( return out @doc( - generic._shared_docs["compare"], + _shared_docs["compare"], """ Returns ------- @@ -4237,7 +4236,7 @@ def _gotitem(self, key, ndim, subset=None) -> Series: ) @doc( - generic._shared_docs["aggregate"], + _shared_docs["aggregate"], klass=_shared_doc_kwargs["klass"], axis=_shared_doc_kwargs["axis"], see_also=_agg_see_also_doc, @@ -4599,7 +4598,7 @@ def set_axis(self, labels, axis: Axis = ..., inplace: bool = ...) 
-> Series | No
         axis_description_sub="",
         see_also_sub="",
     )
-    @Appender(generic.NDFrame.set_axis.__doc__)
+    @Appender(NDFrame.set_axis.__doc__)
     def set_axis(self, labels, axis: Axis = 0, inplace: bool = False):
         return super().set_axis(labels, axis=axis, inplace=inplace)
 
@@ -5227,7 +5226,7 @@ def _convert_dtypes(
     # error: Cannot determine type of 'isna'
     @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])  # type: ignore[has-type]
     def isna(self) -> Series:
-        return generic.NDFrame.isna(self)
+        return NDFrame.isna(self)
 
     # error: Cannot determine type of 'isna'
     @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])  # type: ignore[has-type]
diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py
index e3d6c0fc8d867..d853f728f64a3 100644
--- a/pandas/core/shared_docs.py
+++ b/pandas/core/shared_docs.py
@@ -402,6 +402,35 @@
 starting with "s3://", and "gcs://") the key-value pairs are forwarded to
 ``fsspec``. Please see ``fsspec`` and ``urllib`` for more details."""
 
+_shared_docs[
+    "compression_options"
+] = """compression : str or dict, default 'infer'
+    For on-the-fly compression of the output data. If 'infer' and '%s' is
+    path-like, then detect compression from the following extensions: '.gz',
+    '.bz2', '.zip', '.xz', or '.zst' (otherwise no compression). Set to
+    ``None`` for no compression. Can also be a dict with key ``'method'`` set
+    to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other
+    key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``,
+    ``bz2.BZ2File``, or ``zstandard.ZstdCompressor``, respectively. As an
+    example, the following could be passed for faster compression and to create
+    a reproducible gzip archive:
+    ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``."""
+
+_shared_docs[
+    "decompression_options"
+] = """compression : str or dict, default 'infer'
+    For on-the-fly decompression of on-disk data. If 'infer' and '%s' is
+    path-like, then detect compression from the following extensions: '.gz',
+    '.bz2', '.zip', '.xz', or '.zst' (otherwise no compression). If using
+    'zip', the ZIP file must contain only one data file to be read in. Set to
+    ``None`` for no decompression. Can also be a dict with key ``'method'`` set
+    to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other
+    key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``,
+    ``bz2.BZ2File``, or ``zstandard.ZstdDecompressor``, respectively.
As an + example, the following could be passed for Zstandard decompression using a + custom compression dictionary: + ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.""" + _shared_docs[ "replace" ] = """ diff --git a/pandas/io/common.py b/pandas/io/common.py index e12a7348b0075..1697dce38e9d8 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -5,6 +5,7 @@ import codecs from collections import abc import dataclasses +import functools import gzip from io import ( BufferedIOBase, @@ -49,10 +50,13 @@ ) from pandas.compat import get_lzma_file from pandas.compat._optional import import_optional_dependency +from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import is_file_like +from pandas.core.shared_docs import _shared_docs + _VALID_URLS = set(uses_relative + uses_netloc + uses_params) _VALID_URLS.discard("") @@ -245,6 +249,7 @@ def is_fsspec_url(url: FilePath | BaseBuffer) -> bool: ) +@doc(compression_options=_shared_docs["compression_options"] % "filepath_or_buffer") def _get_filepath_or_buffer( filepath_or_buffer: FilePath | BaseBuffer, encoding: str = "utf-8", @@ -260,7 +265,10 @@ def _get_filepath_or_buffer( ---------- filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path), or buffer - compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional + {compression_options} + + .. versionchanged:: 1.4.0 Zstandard support. + encoding : the encoding to use to decode bytes, default is 'utf-8' mode : str, optional @@ -443,7 +451,13 @@ def file_path_to_url(path: str) -> str: return urljoin("file:", pathname2url(path)) -_compression_to_extension = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz"} +_compression_to_extension = { + "gzip": ".gz", + "bz2": ".bz2", + "zip": ".zip", + "xz": ".xz", + "zstd": ".zst", +} def get_compression_method( @@ -481,6 +495,7 @@ def get_compression_method( return compression_method, compression_args +@doc(compression_options=_shared_docs["compression_options"] % "filepath_or_buffer") def infer_compression( filepath_or_buffer: FilePath | BaseBuffer, compression: str | None ) -> str | None: @@ -494,10 +509,9 @@ def infer_compression( ---------- filepath_or_buffer : str or file handle File path or object. - compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} - If 'infer' and `filepath_or_buffer` is path-like, then detect - compression from the following extensions: '.gz', '.bz2', '.zip', - or '.xz' (otherwise no compression). + {compression_options} + + .. versionchanged:: 1.4.0 Zstandard support. Returns ------- @@ -585,6 +599,7 @@ def get_handle( ... +@doc(compression_options=_shared_docs["compression_options"] % "path_or_buf") def get_handle( path_or_buf: FilePath | BaseBuffer, mode: str, @@ -607,26 +622,18 @@ def get_handle( Mode to open path_or_buf with. encoding : str or None Encoding to use. - compression : str or dict, default None - If string, specifies compression mode. If dict, value at key 'method' - specifies compression mode. Compression mode must be one of {'infer', - 'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer' - and `filepath_or_buffer` is path-like, then detect compression from - the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise - no compression). If dict and compression mode is one of - {'zip', 'gzip', 'bz2'}, or inferred as one of the above, - other entries passed as additional compression options. + {compression_options} .. 
versionchanged:: 1.0.0
-
        May now be a dict with key 'method' as compression mode
        and other keys as compression options if compression mode is 'zip'.
 
     .. versionchanged:: 1.1.0
-
        Passing compression options as keys in dict is now
-       supported for compression modes 'gzip' and 'bz2' as well as 'zip'.
+       supported for compression modes 'gzip', 'bz2', 'zstd' and 'zip'.
+
+    .. versionchanged:: 1.4.0 Zstandard support.
 
     memory_map : bool, default False
         See parsers._parser_params for more information.
@@ -689,8 +696,13 @@ def get_handle(
         check_parent_directory(str(handle))
 
     if compression:
-        # compression libraries do not like an explicit text-mode
-        ioargs.mode = ioargs.mode.replace("t", "")
+        if compression != "zstd":
+            # compression libraries do not like an explicit text-mode
+            ioargs.mode = ioargs.mode.replace("t", "")
+        elif compression == "zstd" and "b" not in ioargs.mode:
+            # python-zstandard defaults to text mode, but we always expect
+            # compression libraries to use binary mode.
+            ioargs.mode += "b"
 
         # GZ Compression
         if compression == "gzip":
@@ -749,6 +761,19 @@
         elif compression == "xz":
             handle = get_lzma_file()(handle, ioargs.mode)
 
+        # Zstd Compression
+        elif compression == "zstd":
+            zstd = import_optional_dependency("zstandard")
+            if "r" in ioargs.mode:
+                open_args = {"dctx": zstd.ZstdDecompressor(**compression_args)}
+            else:
+                open_args = {"cctx": zstd.ZstdCompressor(**compression_args)}
+            handle = zstd.open(
+                handle,
+                mode=ioargs.mode,
+                **open_args,
+            )
+
         # Unrecognized Compression
         else:
             msg = f"Unrecognized compression type: {compression}"
@@ -1101,6 +1126,24 @@ def _is_binary_mode(handle: FilePath | BaseBuffer, mode: str) -> bool:
     if issubclass(type(handle), text_classes):
         return False
 
-    # classes that expect bytes
-    binary_classes = (BufferedIOBase, RawIOBase)
-    return isinstance(handle, binary_classes) or "b" in getattr(handle, "mode", mode)
+    return isinstance(handle, _get_binary_io_classes()) or "b" in getattr(
+        handle, "mode", mode
+    )
+
+
+@functools.lru_cache
+def _get_binary_io_classes() -> tuple[type, ...]:
+    """IO classes that expect bytes"""
+    binary_classes: tuple[type, ...] = (BufferedIOBase, RawIOBase)
+
+    # python-zstandard doesn't use any of the builtin base classes; instead we
+    # have to use the `zstd.ZstdDecompressionReader` class for isinstance checks.
+    # Unfortunately `zstd.ZstdDecompressionReader` isn't exposed by python-zstandard
+    # so we have to get it from a `zstd.ZstdDecompressor` instance.
+    # See also https://github.com/indygreg/python-zstandard/pull/165.
+ zstd = import_optional_dependency("zstandard", errors="ignore") + if zstd is not None: + with zstd.ZstdDecompressor().stream_reader(b"") as reader: + binary_classes += (type(reader),) + + return binary_classes diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index e4547b527a6b9..9813b91419060 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -15,17 +15,17 @@ from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc -from pandas.core import generic from pandas.core.api import ( DataFrame, Int64Index, RangeIndex, ) +from pandas.core.shared_docs import _shared_docs from pandas.io.common import get_handle -@doc(storage_options=generic._shared_docs["storage_options"]) +@doc(storage_options=_shared_docs["storage_options"]) def to_feather( df: DataFrame, path: FilePath | WriteBuffer[bytes], @@ -93,7 +93,7 @@ def to_feather( feather.write_feather(df, handles.handle, **kwargs) -@doc(storage_options=generic._shared_docs["storage_options"]) +@doc(storage_options=_shared_docs["storage_options"]) def read_feather( path: FilePath | ReadBuffer[bytes], columns: Sequence[Hashable] | None = None, diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index af167964a48f4..1f1ca434a22c0 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -38,8 +38,8 @@ MultiIndex, PeriodIndex, ) -from pandas.core import generic import pandas.core.common as com +from pandas.core.shared_docs import _shared_docs from pandas.io.formats._color_data import CSS4_COLORS from pandas.io.formats.css import ( @@ -828,7 +828,7 @@ def get_formatted_cells(self) -> Iterable[ExcelCell]: cell.val = self._format_value(cell.val) yield cell - @doc(storage_options=generic._shared_docs["storage_options"]) + @doc(storage_options=_shared_docs["storage_options"]) def write( self, writer, diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index d9550f0940376..e0e077d2e96db 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -39,13 +39,13 @@ IndexSlice, RangeIndex, ) -from pandas.core import generic import pandas.core.common as com from pandas.core.frame import ( DataFrame, Series, ) from pandas.core.generic import NDFrame +from pandas.core.shared_docs import _shared_docs from pandas.io.formats.format import save_to_buffer @@ -442,7 +442,7 @@ def set_tooltips( @doc( NDFrame.to_excel, klass="Styler", - storage_options=generic._shared_docs["storage_options"], + storage_options=_shared_docs["storage_options"], ) def to_excel( self, diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index b997cd9bddd1e..8e05afaa06919 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -15,10 +15,12 @@ WriteBuffer, ) from pandas.errors import AbstractMethodError +from pandas.util._decorators import doc from pandas.core.dtypes.common import is_list_like from pandas.core.frame import DataFrame +from pandas.core.shared_docs import _shared_docs from pandas.io.common import get_handle from pandas.io.xml import ( @@ -27,6 +29,7 @@ ) +@doc(compression_options=_shared_docs["compression_options"] % "path_or_buffer") class BaseXMLFormatter: """ Subclass for formatting data in XML. @@ -74,9 +77,9 @@ class BaseXMLFormatter: stylesheet : str or file-like A URL, file, file-like object, or a raw string containing XSLT. - compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' - Compression type for on-the-fly decompression of on-disk data. 
- If 'infer', then use extension for gzip, bz2, zip or xz. + {compression_options} + + .. versionchanged:: 1.4.0 Zstandard support. storage_options : dict, optional Extra options that make sense for a particular storage connection, diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 21d89f18d4959..9d4998784222f 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -45,10 +45,10 @@ notna, to_datetime, ) -from pandas.core import generic from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.generic import NDFrame from pandas.core.reshape.concat import concat +from pandas.core.shared_docs import _shared_docs from pandas.io.common import ( IOHandles, @@ -312,7 +312,10 @@ def obj_to_write(self) -> NDFrame | Mapping[IndexLabel, Any]: return {"schema": self.schema, "data": self.obj} -@doc(storage_options=generic._shared_docs["storage_options"]) +@doc( + storage_options=_shared_docs["storage_options"], + decompression_options=_shared_docs["decompression_options"] % "path_or_buf", +) @deprecate_kwarg(old_arg_name="numpy", new_arg_name=None) @deprecate_nonkeyword_arguments( version="2.0", allowed_args=["path_or_buf"], stacklevel=3 @@ -473,12 +476,9 @@ def read_json( ``JsonReader`` is a context manager. - compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' - For on-the-fly decompression of on-disk data. If 'infer', then use - gzip, bz2, zip or xz if path_or_buf is a string ending in - '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression - otherwise. If using 'zip', the ZIP file must contain only one data - file to be read in. Set to None for no decompression. + {decompression_options} + + .. versionchanged:: 1.4.0 Zstandard support. nrows : int, optional The number of lines from the line-delimited jsonfile that has to be read. diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 56131d000b176..f70649b29a1eb 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -21,7 +21,7 @@ MultiIndex, get_option, ) -from pandas.core import generic +from pandas.core.shared_docs import _shared_docs from pandas.util.version import Version from pandas.io.common import ( @@ -351,7 +351,7 @@ def read( return result -@doc(storage_options=generic._shared_docs["storage_options"]) +@doc(storage_options=_shared_docs["storage_options"]) def to_parquet( df: DataFrame, path: FilePath | WriteBuffer[bytes] | None = None, @@ -434,7 +434,7 @@ def to_parquet( return None -@doc(storage_options=generic._shared_docs["storage_options"]) +@doc(storage_options=_shared_docs["storage_options"]) def read_parquet( path, engine: str = "auto", diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 47bc7ff95669b..98cf2ba0e022e 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -43,9 +43,9 @@ is_list_like, ) -from pandas.core import generic from pandas.core.frame import DataFrame from pandas.core.indexes.api import RangeIndex +from pandas.core.shared_docs import _shared_docs from pandas.io.common import validate_header_arg from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper @@ -280,12 +280,10 @@ .. versionchanged:: 1.2 ``TextFileReader`` is a context manager. -compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' - For on-the-fly decompression of on-disk data. If 'infer' and - `filepath_or_buffer` is path-like, then detect compression from the - following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no - decompression). 
If using 'zip', the ZIP file must contain only one data
-    file to be read in. Set to None for no decompression.
+{decompression_options}
+
+    .. versionchanged:: 1.4.0 Zstandard support.
+
 thousands : str, optional
     Thousands separator.
 decimal : str, default '.'
@@ -577,7 +575,8 @@ def _read(
         func_name="read_csv",
         summary="Read a comma-separated values (csv) file into DataFrame.",
         _default_sep="','",
-        storage_options=generic._shared_docs["storage_options"],
+        storage_options=_shared_docs["storage_options"],
+        decompression_options=_shared_docs["decompression_options"] % "filepath_or_buffer",
     )
 )
 def read_csv(
@@ -675,7 +674,8 @@ def read_csv(
         func_name="read_table",
         summary="Read general delimited file into DataFrame.",
         _default_sep=r"'\\t' (tab-stop)",
-        storage_options=generic._shared_docs["storage_options"],
+        storage_options=_shared_docs["storage_options"],
+        decompression_options=_shared_docs["decompression_options"] % "filepath_or_buffer",
    )
 )
 def read_table(
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
index 5e0a3e1646883..12156bf354919 100644
--- a/pandas/io/pickle.py
+++ b/pandas/io/pickle.py
@@ -15,12 +15,15 @@
 from pandas.compat import pickle_compat as pc
 from pandas.util._decorators import doc
 
-from pandas.core import generic
+from pandas.core.shared_docs import _shared_docs
 
 from pandas.io.common import get_handle
 
 
-@doc(storage_options=generic._shared_docs["storage_options"])
+@doc(
+    storage_options=_shared_docs["storage_options"],
+    compression_options=_shared_docs["compression_options"] % "filepath_or_buffer",
+)
 def to_pickle(
     obj: Any,
     filepath_or_buffer: FilePath | WriteBuffer[bytes],
@@ -41,12 +44,10 @@ def to_pickle(
 
         .. versionchanged:: 1.0.0
            Accept URL. URL has to be of S3 or GCS.
+    {compression_options}
+
+        .. versionchanged:: 1.4.0 Zstandard support.
 
-    compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
-        If 'infer' and 'path_or_url' is path-like, then detect compression from
-        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
-        compression) If 'infer' and 'path_or_url' is not path-like, then use
-        None (= no decompression).
     protocol : int
         Int which indicates which protocol should be used by the pickler,
         default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
@@ -111,7 +112,10 @@ def to_pickle(
         pickle.dump(obj, handles.handle, protocol=protocol)
 
 
-@doc(storage_options=generic._shared_docs["storage_options"])
+@doc(
+    storage_options=_shared_docs["storage_options"],
+    decompression_options=_shared_docs["decompression_options"] % "filepath_or_buffer",
+)
 def read_pickle(
     filepath_or_buffer: FilePath | ReadPickleBuffer,
     compression: CompressionOptions = "infer",
@@ -134,11 +138,9 @@ def read_pickle(
 
         .. versionchanged:: 1.0.0
            Accept URL. URL is not limited to S3 and GCS.
 
-    compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
-        If 'infer' and 'path_or_url' is path-like, then detect compression from
-        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
-        compression) If 'infer' and 'path_or_url' is not path-like, then use
-        None (= no decompression).
+    {decompression_options}
+
+        .. versionchanged:: 1.4.0 Zstandard support.
{storage_options}
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 672d6ec539124..fde2f32af9939 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -61,12 +61,12 @@
     to_datetime,
     to_timedelta,
 )
-from pandas.core import generic
 from pandas.core.arrays.boolean import BooleanDtype
 from pandas.core.arrays.integer import _IntegerDtype
 from pandas.core.frame import DataFrame
 from pandas.core.indexes.base import Index
 from pandas.core.series import Series
+from pandas.core.shared_docs import _shared_docs
 
 from pandas.io.common import get_handle
 
@@ -109,19 +109,6 @@
     Return StataReader object for iterations, returns chunks with
     given number of lines."""
 
-_compression_params = f"""\
-compression : str or dict, default None
-    If string, specifies compression mode. If dict, value at key 'method'
-    specifies compression mode. Compression mode must be one of {{'infer',
-    'gzip', 'bz2', 'zip', 'xz', None}}. If compression mode is 'infer'
-    and `filepath_or_buffer` is path-like, then detect compression from
-    the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
-    no compression). If dict and compression mode is one of
-    {{'zip', 'gzip', 'bz2'}}, or inferred as one of the above,
-    other entries passed as additional compression options.
-{generic._shared_docs["storage_options"]}"""
-
-
 _iterator_params = """\
 iterator : bool, default False
     Return StataReader object."""
@@ -153,7 +140,8 @@
 {_statafile_processing_params2}
 {_chunksize_params}
 {_iterator_params}
-{_compression_params}
+{_shared_docs["decompression_options"] % "filepath_or_buffer"}
+{_shared_docs["storage_options"]}
 
 Returns
 -------
@@ -216,7 +204,8 @@
 {_statafile_processing_params1}
 {_statafile_processing_params2}
 {_chunksize_params}
-{_compression_params}
+{_shared_docs["decompression_options"] % "filepath_or_buffer"}
+{_shared_docs["storage_options"]}
 
 {_reader_notes}
 """
@@ -2170,7 +2159,10 @@ def _dtype_to_default_stata_fmt(
     raise NotImplementedError(f"Data type {dtype} not supported.")
 
 
-@doc(storage_options=generic._shared_docs["storage_options"])
+@doc(
+    storage_options=_shared_docs["storage_options"],
+    compression_options=_shared_docs["compression_options"] % "fname",
+)
 class StataWriter(StataParser):
     """
     A class for writing Stata binary dta files
@@ -2202,18 +2194,12 @@ class StataWriter(StataParser):
     variable_labels : dict
         Dictionary containing columns as keys and variable labels as values.
         Each label must be 80 characters or smaller.
-    compression : str or dict, default 'infer'
-        For on-the-fly compression of the output dta. If string, specifies
-        compression mode. If dict, value at key 'method' specifies compression
-        mode. Compression mode must be one of {{'infer', 'gzip', 'bz2', 'zip',
-        'xz', None}}. If compression mode is 'infer' and `fname` is path-like,
-        then detect compression from the following extensions: '.gz', '.bz2',
-        '.zip', or '.xz' (otherwise no compression). If dict and compression
-        mode is one of {{'zip', 'gzip', 'bz2'}}, or inferred as one of the above,
-        other entries passed as additional compression options.
+    {compression_options}
 
         .. versionadded:: 1.1.0
 
+        .. versionchanged:: 1.4.0 Zstandard support.
+
     {storage_options}
 
         .. versionadded:: 1.2.0
@@ -3135,18 +3121,12 @@ class StataWriter117(StataWriter):
         Smaller columns can be converted by including the column name. Using
         StrLs can reduce output file size when strings are longer than 8
         characters, and either frequently repeated or sparse.
-    compression : str or dict, default 'infer'
-        For on-the-fly compression of the output dta. If string, specifies
-        compression mode.
If dict, value at key 'method' specifies compression - mode. Compression mode must be one of {'infer', 'gzip', 'bz2', 'zip', - 'xz', None}. If compression mode is 'infer' and `fname` is path-like, - then detect compression from the following extensions: '.gz', '.bz2', - '.zip', or '.xz' (otherwise no compression). If dict and compression - mode is one of {'zip', 'gzip', 'bz2'}, or inferred as one of the above, - other entries passed as additional compression options. + {compression_options} .. versionadded:: 1.1.0 + .. versionchanged:: 1.4.0 Zstandard support. + value_labels : dict of dicts Dictionary containing columns as keys and dictionaries of column value to labels as values. The combined length of all labels for a single @@ -3535,18 +3515,12 @@ class StataWriterUTF8(StataWriter117): The dta version to use. By default, uses the size of data to determine the version. 118 is used if data.shape[1] <= 32767, and 119 is used for storing larger DataFrames. - compression : str or dict, default 'infer' - For on-the-fly compression of the output dta. If string, specifies - compression mode. If dict, value at key 'method' specifies compression - mode. Compression mode must be one of {'infer', 'gzip', 'bz2', 'zip', - 'xz', None}. If compression mode is 'infer' and `fname` is path-like, - then detect compression from the following extensions: '.gz', '.bz2', - '.zip', or '.xz' (otherwise no compression). If dict and compression - mode is one of {'zip', 'gzip', 'bz2'}, or inferred as one of the above, - other entries passed as additional compression options. + {compression_options} .. versionadded:: 1.1.0 + .. versionchanged:: 1.4.0 Zstandard support. + value_labels : dict of dicts Dictionary containing columns as keys and dictionaries of column value to labels as values. The combined length of all labels for a single diff --git a/pandas/io/xml.py b/pandas/io/xml.py index a54546a37f284..d72f02fa817ce 100644 --- a/pandas/io/xml.py +++ b/pandas/io/xml.py @@ -34,6 +34,7 @@ from pandas.io.parsers import TextParser +@doc(decompression_options=_shared_docs["decompression_options"] % "path_or_buffer") class _XMLFrameParser: """ Internal subclass to parse XML into DataFrames. @@ -68,9 +69,9 @@ class _XMLFrameParser: URL, file, file-like object, or a raw string containing XSLT, `etree` does not support XSLT but retained for consistency. - compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' - Compression type for on-the-fly decompression of on-disk data. - If 'infer', then use extension for gzip, bz2, zip or xz. + {decompression_options} + + .. versionchanged:: 1.4.0 Zstandard support. storage_options : dict, optional Extra options that make sense for a particular storage connection, @@ -727,7 +728,10 @@ def _parse( return _data_to_frame(data=data_dicts, **kwargs) -@doc(storage_options=_shared_docs["storage_options"]) +@doc( + storage_options=_shared_docs["storage_options"], + decompression_options=_shared_docs["decompression_options"] % "path_or_buffer", +) def read_xml( path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str], xpath: str | None = "./*", @@ -801,12 +805,9 @@ def read_xml( transformation and not the original XML document. Only XSLT 1.0 scripts and not later versions is currently supported. - compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' - For on-the-fly decompression of on-disk data. 
If 'infer', then use - gzip, bz2, zip or xz if path_or_buffer is a string ending in - '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression - otherwise. If using 'zip', the ZIP file must contain only one data - file to be read in. Set to None for no decompression. + {decompression_options} + + .. versionchanged:: 1.4.0 Zstandard support. {storage_options} diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 8c5bd6ec170e5..86842f6a608d6 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -10,6 +10,7 @@ is_platform_mac, is_platform_windows, ) +import pandas.util._test_decorators as td import pandas._testing as tm @@ -184,3 +185,29 @@ def add_tips_files(bucket_name): while cli.list_buckets()["Buckets"] and timeout > 0: time.sleep(0.1) timeout -= 0.1 + + +_compression_formats_params = [ + (".no_compress", None), + ("", None), + (".gz", "gzip"), + (".GZ", "gzip"), + (".bz2", "bz2"), + (".BZ2", "bz2"), + (".zip", "zip"), + (".ZIP", "zip"), + (".xz", "xz"), + (".XZ", "xz"), + pytest.param((".zst", "zstd"), marks=td.skip_if_no("zstandard")), + pytest.param((".ZST", "zstd"), marks=td.skip_if_no("zstandard")), +] + + +@pytest.fixture(params=_compression_formats_params[1:]) +def compression_format(request): + return request.param + + +@pytest.fixture(params=_compression_formats_params) +def compression_ext(request): + return request.param[0] diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index bf17132d1b9c2..c04c79ab01e60 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -14,6 +14,8 @@ ) import pandas._testing as tm +import pandas.io.common as icom + class TestToCSV: def test_to_csv_with_single_column(self): @@ -515,13 +517,7 @@ def test_to_csv_compression(self, compression_only, read_infer, to_infer): # We'll complete file extension subsequently. filename = "test." - - if compression == "gzip": - filename += "gz" - else: - # xz --> .xz - # bz2 --> .bz2 - filename += compression + filename += icom._compression_to_extension[compression] df = DataFrame({"A": [1]}) @@ -538,7 +534,11 @@ def test_to_csv_compression_dict(self, compression_only): method = compression_only df = DataFrame({"ABC": [1]}) filename = "to_csv_compress_as_dict." - filename += "gz" if method == "gzip" else method + extension = { + "gzip": "gz", + "zstd": "zst", + }.get(method, method) + filename += extension with tm.ensure_clean(filename) as path: df.to_csv(path, compression={"method": method}) read_df = pd.read_csv(path, index_col=0) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index e0136520bdeb5..a752c93a8046a 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -7,6 +7,8 @@ import pandas as pd import pandas._testing as tm +import pandas.io.common as icom + def test_compression_roundtrip(compression): df = pd.DataFrame( @@ -97,13 +99,7 @@ def test_to_json_compression(compression_only, read_infer, to_infer): # We'll complete file extension subsequently. filename = "test." 
- - if compression == "gzip": - filename += "gz" - else: - # xz --> .xz - # bz2 --> .bz2 - filename += compression + filename += icom._compression_to_extension[compression] df = pd.DataFrame({"A": [1]}) diff --git a/pandas/tests/io/parser/data/salaries.csv.zst b/pandas/tests/io/parser/data/salaries.csv.zst new file mode 100644 index 0000000000000..20c9ed8a7e39f Binary files /dev/null and b/pandas/tests/io/parser/data/salaries.csv.zst differ diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 5aa0edfd8b46a..d97b594623023 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -12,6 +12,8 @@ from pandas import DataFrame import pandas._testing as tm +import pandas.io.common as icom + skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @@ -93,7 +95,7 @@ def test_compression(parser_and_data, compression_only, buffer, filename): parser, data, expected = parser_and_data compress_type = compression_only - ext = "gz" if compress_type == "gzip" else compress_type + ext = icom._compression_to_extension[compress_type] filename = filename if filename is None else filename.format(ext=ext) if filename and buffer: diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 36f53bb1bb155..235f19d86a6b3 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -16,6 +16,7 @@ from pandas import DataFrame import pandas._testing as tm +import pandas.io.common as icom from pandas.io.feather_format import read_feather from pandas.io.parsers import read_csv @@ -23,7 +24,7 @@ @pytest.mark.network @pytest.mark.parametrize( "compress_type, extension", - [("gzip", ".gz"), ("bz2", ".bz2"), ("zip", ".zip"), ("xz", ".xz")], + icom._compression_to_extension.items(), ) @pytest.mark.parametrize("mode", ["explicit", "infer"]) @pytest.mark.parametrize("engine", ["python", "c"]) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 584cf9e5331dc..63f783b01d1f3 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -22,6 +22,7 @@ ) import pandas._testing as tm +import pandas.io.common as icom from pandas.io.parsers import ( read_csv, read_fwf, @@ -655,7 +656,7 @@ def test_fwf_compression(compression_only, infer): 3333333333""".strip() compression = compression_only - extension = "gz" if compression == "gzip" else compression + extension = icom._compression_to_extension[compression] kwargs = {"widths": [5, 5], "names": ["one", "two"]} expected = read_fwf(StringIO(data), **kwargs) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index a00268d82a57d..f718a52a8a96b 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -100,22 +100,9 @@ def test_stringify_file_and_path_like(self): with fsspec.open(f"file://{path}", mode="wb") as fsspec_obj: assert fsspec_obj == icom.stringify_path(fsspec_obj) - @pytest.mark.parametrize( - "extension,expected", - [ - ("", None), - (".gz", "gzip"), - (".bz2", "bz2"), - (".zip", "zip"), - (".xz", "xz"), - (".GZ", "gzip"), - (".BZ2", "bz2"), - (".ZIP", "zip"), - (".XZ", "xz"), - ], - ) @pytest.mark.parametrize("path_type", path_types) - def test_infer_compression_from_path(self, extension, expected, path_type): + def test_infer_compression_from_path(self, compression_format, path_type): + extension, expected = compression_format path = path_type("foo/bar.csv" + 
extension) compression = icom.infer_compression(path, compression="infer") assert compression == expected diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 2e8e4a9017dbc..5e0da6f3ab3bb 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -16,6 +16,8 @@ import pandas._testing as tm from pandas.util import _test_decorators as td +import pandas.io.common as icom + @pytest.fixture def gcs_buffer(monkeypatch): @@ -142,10 +144,9 @@ def test_to_csv_compression_encoding_gcs(gcs_buffer, compression_only, encoding) tm.assert_frame_equal(df, read_df) # write compressed file with implicit compression - if compression_only == "gzip": - compression_only = "gz" + file_ext = icom._compression_to_extension[compression_only] compression["method"] = "infer" - path_gcs += f".{compression_only}" + path_gcs += f".{file_ext}" df.to_csv(path_gcs, compression=compression, encoding=encoding) res = gcs_buffer.getvalue() diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index aa80df1bcbd38..4700e307f2407 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -35,6 +35,7 @@ get_lzma_file, is_platform_little_endian, ) +from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td import pandas as pd @@ -45,6 +46,7 @@ ) import pandas._testing as tm +import pandas.io.common as icom from pandas.tseries.offsets import ( Day, MonthEnd, @@ -286,12 +288,8 @@ def get_random_path(): class TestCompression: - _compression_to_extension = { - None: ".none", - "gzip": ".gz", - "bz2": ".bz2", - "zip": ".zip", - "xz": ".xz", + _extension_to_compression = { + ext: compression for compression, ext in icom._compression_to_extension.items() } def compress_file(self, src_path, dest_path, compression): @@ -308,6 +306,8 @@ def compress_file(self, src_path, dest_path, compression): f.write(src_path, os.path.basename(src_path)) elif compression == "xz": f = get_lzma_file()(dest_path, "w") + elif compression == "zstd": + f = import_optional_dependency("zstandard").open(dest_path, "wb") else: msg = f"Unrecognized compression type: {compression}" raise ValueError(msg) @@ -344,16 +344,11 @@ def test_write_explicit_bad(self, compression, get_random_path): df = tm.makeDataFrame() df.to_pickle(path, compression=compression) - @pytest.mark.parametrize("ext", ["", ".gz", ".bz2", ".no_compress", ".xz"]) - def test_write_infer(self, ext, get_random_path): + def test_write_infer(self, compression_ext, get_random_path): base = get_random_path - path1 = base + ext + path1 = base + compression_ext path2 = base + ".raw" - compression = None - for c in self._compression_to_extension: - if self._compression_to_extension[c] == ext: - compression = c - break + compression = self._extension_to_compression.get(compression_ext.lower()) with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: df = tm.makeDataFrame() @@ -390,16 +385,11 @@ def test_read_explicit(self, compression, get_random_path): tm.assert_frame_equal(df, df2) - @pytest.mark.parametrize("ext", ["", ".gz", ".bz2", ".zip", ".no_compress", ".xz"]) - def test_read_infer(self, ext, get_random_path): + def test_read_infer(self, compression_ext, get_random_path): base = get_random_path path1 = base + ".raw" - path2 = base + ext - compression = None - for c in self._compression_to_extension: - if self._compression_to_extension[c] == ext: - compression = c - break + path2 = base + compression_ext + compression = 
self._extension_to_compression.get(compression_ext.lower()) with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: df = tm.makeDataFrame() diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index eb457d74c6a01..f0fd391c2a9c4 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -21,6 +21,7 @@ ) from pandas.core.indexes.api import ensure_index +import pandas.io.common as icom from pandas.io.parsers import read_csv from pandas.io.stata import ( CategoricalConversionWarning, @@ -1881,7 +1882,10 @@ def test_backward_compat(version, datapath): def test_compression(compression, version, use_dict, infer): file_name = "dta_inferred_compression.dta" if compression: - file_ext = "gz" if compression == "gzip" and not use_dict else compression + if use_dict: + file_ext = compression + else: + file_ext = icom._compression_to_extension[compression] file_name += f".{file_ext}" compression_arg = compression if infer: @@ -1902,6 +1906,10 @@ def test_compression(compression, version, use_dict, infer): elif compression == "bz2": with bz2.open(path, "rb") as comp: fp = io.BytesIO(comp.read()) + elif compression == "zstd": + zstd = pytest.importorskip("zstandard") + with zstd.open(path, "rb") as comp: + fp = io.BytesIO(comp.read()) elif compression == "xz": lzma = pytest.importorskip("lzma") with lzma.open(path, "rb") as comp: @@ -2032,7 +2040,7 @@ def test_compression_roundtrip(compression): def test_stata_compression(compression_only, read_infer, to_infer): compression = compression_only - ext = "gz" if compression == "gzip" else compression + ext = icom._compression_to_extension[compression] filename = f"test.{ext}" df = DataFrame( diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py index eea6c535f12b6..e0c2b3794a00c 100644 --- a/pandas/tests/io/xml/test_to_xml.py +++ b/pandas/tests/io/xml/test_to_xml.py @@ -17,6 +17,7 @@ ) import pandas._testing as tm +import pandas.io.common as icom from pandas.io.common import get_handle from pandas.io.xml import read_xml @@ -1273,15 +1274,14 @@ def test_style_to_json(): """ -@pytest.mark.parametrize("comp", ["bz2", "gzip", "xz", "zip"]) -def test_compression_output(parser, comp): +def test_compression_output(parser, compression_only): with tm.ensure_clean() as path: - geom_df.to_xml(path, parser=parser, compression=comp) + geom_df.to_xml(path, parser=parser, compression=compression_only) with get_handle( path, "r", - compression=comp, + compression=compression_only, ) as handle_obj: output = handle_obj.handle.read() @@ -1290,16 +1290,15 @@ def test_compression_output(parser, comp): assert geom_xml == output.strip() -@pytest.mark.parametrize("comp", ["bz2", "gzip", "xz", "zip"]) -@pytest.mark.parametrize("compfile", ["xml.bz2", "xml.gz", "xml.xz", "xml.zip"]) -def test_filename_and_suffix_comp(parser, comp, compfile): +def test_filename_and_suffix_comp(parser, compression_only): + compfile = "xml." 
+ icom._compression_to_extension[compression_only] with tm.ensure_clean(filename=compfile) as path: - geom_df.to_xml(path, parser=parser, compression=comp) + geom_df.to_xml(path, parser=parser, compression=compression_only) with get_handle( path, "r", - compression=comp, + compression=compression_only, ) as handle_obj: output = handle_obj.handle.read() diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py index 2e718073c4174..30ba95fd82bf2 100644 --- a/pandas/tests/io/xml/test_xml.py +++ b/pandas/tests/io/xml/test_xml.py @@ -4,12 +4,15 @@ BytesIO, StringIO, ) +from lzma import LZMAError import os from urllib.error import HTTPError +from zipfile import BadZipFile import numpy as np import pytest +from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td from pandas import DataFrame @@ -1014,56 +1017,40 @@ def test_online_stylesheet(): # COMPRESSION -@pytest.mark.parametrize("comp", ["bz2", "gzip", "xz", "zip"]) -def test_compression_read(parser, comp): +def test_compression_read(parser, compression_only): with tm.ensure_clean() as path: - geom_df.to_xml(path, index=False, parser=parser, compression=comp) + geom_df.to_xml(path, index=False, parser=parser, compression=compression_only) - xml_df = read_xml(path, parser=parser, compression=comp) + xml_df = read_xml(path, parser=parser, compression=compression_only) tm.assert_frame_equal(xml_df, geom_df) -@pytest.mark.parametrize("comp", ["gzip", "xz", "zip"]) -def test_wrong_compression_bz2(parser, comp): - with tm.ensure_clean() as path: - geom_df.to_xml(path, parser=parser, compression=comp) - - with pytest.raises(OSError, match="Invalid data stream"): - read_xml(path, parser=parser, compression="bz2") - - -@pytest.mark.parametrize("comp", ["bz2", "xz", "zip"]) -def test_wrong_compression_gz(parser, comp): - with tm.ensure_clean() as path: - geom_df.to_xml(path, parser=parser, compression=comp) - - with pytest.raises(OSError, match="Not a gzipped file"): - read_xml(path, parser=parser, compression="gzip") +def test_wrong_compression(parser, compression, compression_only): + actual_compression = compression + attempted_compression = compression_only + if actual_compression == attempted_compression: + return -@pytest.mark.parametrize("comp", ["bz2", "gzip", "zip"]) -def test_wrong_compression_xz(parser, comp): - lzma = pytest.importorskip("lzma") - - with tm.ensure_clean() as path: - geom_df.to_xml(path, parser=parser, compression=comp) - - with pytest.raises( - lzma.LZMAError, match="Input format not supported by decoder" - ): - read_xml(path, parser=parser, compression="xz") - - -@pytest.mark.parametrize("comp", ["bz2", "gzip", "xz"]) -def test_wrong_compression_zip(parser, comp): - from zipfile import BadZipFile + errors = { + "bz2": (OSError, "Invalid data stream"), + "gzip": (OSError, "Not a gzipped file"), + "zip": (BadZipFile, "File is not a zip file"), + } + zstd = import_optional_dependency("zstandard", errors="ignore") + if zstd is not None: + errors["zstd"] = (zstd.ZstdError, "Unknown frame descriptor") + lzma = import_optional_dependency("lzma", errors="ignore") + if lzma is not None: + errors["xz"] = (LZMAError, "Input format not supported by decoder") + error_cls, error_str = errors[attempted_compression] with tm.ensure_clean() as path: - geom_df.to_xml(path, parser=parser, compression=comp) + geom_df.to_xml(path, parser=parser, compression=actual_compression) - with pytest.raises(BadZipFile, match="File is not a zip file"): - read_xml(path, parser=parser, 
compression="zip") + with pytest.raises(error_cls, match=error_str): + read_xml(path, parser=parser, compression=attempted_compression) def test_unsuported_compression(datapath, parser):