Add Zstandard compression support #43925

Merged · 1 commit · Dec 22, 2021
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -36,6 +36,7 @@ global-exclude *.xpt
global-exclude *.cpt
global-exclude *.xz
global-exclude *.zip
global-exclude *.zst
global-exclude *~
global-exclude .DS_Store
global-exclude .git*
1 change: 1 addition & 0 deletions ci/deps/actions-38-slow.yaml
@@ -34,3 +34,4 @@ dependencies:
- xlsxwriter
- xlwt
- numba
- zstandard
1 change: 1 addition & 0 deletions ci/deps/actions-39-slow.yaml
@@ -38,3 +38,4 @@ dependencies:
- xlwt
- pyreadstat
- pyxlsb
- zstandard
1 change: 1 addition & 0 deletions ci/deps/actions-39.yaml
@@ -37,3 +37,4 @@ dependencies:
- xlwt
- pyreadstat
- pyxlsb
- zstandard
1 change: 1 addition & 0 deletions ci/deps/azure-macos-38.yaml
@@ -32,6 +32,7 @@ dependencies:
- xlrd
- xlsxwriter
- xlwt
- zstandard
- pip
- pip:
- cython>=0.29.24
1 change: 1 addition & 0 deletions ci/deps/azure-windows-38.yaml
@@ -32,3 +32,4 @@ dependencies:
- xlrd
- xlsxwriter
- xlwt
- zstandard
1 change: 1 addition & 0 deletions ci/deps/azure-windows-39.yaml
@@ -37,3 +37,4 @@ dependencies:
- xlwt
- pyreadstat
- pyxlsb
- zstandard
6 changes: 4 additions & 2 deletions ci/deps/circle-38-arm64.yaml
@@ -12,8 +12,10 @@ dependencies:

# pandas dependencies
- botocore>=1.11
- flask
- moto
- numpy
- python-dateutil
- pytz
- flask
- moto
- zstandard
- pip
10 changes: 10 additions & 0 deletions doc/source/getting_started/install.rst
@@ -402,3 +402,13 @@ qtpy Clipboard I/O
xclip Clipboard I/O on linux
xsel Clipboard I/O on linux
========================= ================== =============================================================


Compression
^^^^^^^^^^^

========================= ================== =============================================================
Dependency                Minimum Version    Notes
========================= ================== =============================================================
Zstandard                                    Zstandard compression
========================= ================== =============================================================
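Since zstandard is an optional dependency, its availability can be probed before requesting ``compression="zstd"``; a stdlib-only sketch (not pandas API — pandas itself raises an ``ImportError`` when the package is missing):

```python
import importlib.util

# Probe for the optional zstandard package without importing it;
# find_spec returns None when the package is not installed.
have_zstd = importlib.util.find_spec("zstandard") is not None
print("zstandard available:", have_zstd)
```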
18 changes: 9 additions & 9 deletions doc/source/user_guide/io.rst
@@ -316,14 +316,14 @@ chunksize : int, default ``None``
Quoting, compression, and file format
+++++++++++++++++++++++++++++++++++++

compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``, ``dict``}, default ``'infer'``
compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``'zstd'``, ``None``, ``dict``}, default ``'infer'``
For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
bz2, zip, or xz if ``filepath_or_buffer`` is path-like ending in '.gz', '.bz2',
'.zip', or '.xz', respectively, and no decompression otherwise. If using 'zip',
bz2, zip, xz, or zstandard if ``filepath_or_buffer`` is path-like ending in '.gz', '.bz2',
'.zip', '.xz', or '.zst', respectively, and no decompression otherwise. If using 'zip',
the ZIP file must contain only one data file to be read in.
Set to ``None`` for no decompression. Can also be a dict with key ``'method'``
set to one of {``'zip'``, ``'gzip'``, ``'bz2'``} and other key-value pairs are
forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, or ``bz2.BZ2File``.
set to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``} and other key-value pairs are
forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, ``bz2.BZ2File``, or ``zstandard.ZstdDecompressor``.
As an example, the following could be passed for faster compression and to
create a reproducible gzip archive:
``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.
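The dict form documented above can be exercised end to end. A sketch using the stdlib-backed gzip codec, mirroring the gzip example in the docstring; Zstandard behaves the same via ``'.zst'``/``'zstd'`` once the zstandard package is installed:

```python
import os
import tempfile

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

with tempfile.TemporaryDirectory() as tmp:
    # Extension-based inference: ".gz" selects gzip on read, just as
    # ".zst" selects Zstandard when the zstandard package is available.
    path = os.path.join(tmp, "data.csv.gz")
    # Extra dict keys are forwarded to gzip.GzipFile, as documented above.
    df.to_csv(
        path,
        index=False,
        compression={"method": "gzip", "compresslevel": 1, "mtime": 1},
    )
    roundtrip = pd.read_csv(path)
```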
@@ -4032,18 +4032,18 @@ Compressed pickle files
'''''''''''''''''''''''

:func:`read_pickle`, :meth:`DataFrame.to_pickle` and :meth:`Series.to_pickle` can read
and write compressed pickle files. The compression types of ``gzip``, ``bz2``, ``xz`` are supported for reading and writing.
and write compressed pickle files. The compression types ``gzip``, ``bz2``, ``xz``, and ``zstd`` are supported for reading and writing.
The ``zip`` file format only supports reading and must contain only one data file
to be read.

The compression type can be an explicit parameter or be inferred from the file extension.
If 'infer', then use ``gzip``, ``bz2``, ``zip``, or ``xz`` if filename ends in ``'.gz'``, ``'.bz2'``, ``'.zip'``, or
``'.xz'``, respectively.
If 'infer', then use ``gzip``, ``bz2``, ``zip``, ``xz``, or ``zstd`` if filename ends in ``'.gz'``, ``'.bz2'``, ``'.zip'``,
``'.xz'``, or ``'.zst'``, respectively.

The compression parameter can also be a ``dict`` in order to pass options to the
compression protocol. It must have a ``'method'`` key set to the name
of the compression protocol, which must be one of
{``'zip'``, ``'gzip'``, ``'bz2'``}. All other key-value pairs are passed to
{``'zip'``, ``'gzip'``, ``'bz2'``, ``'xz'``, ``'zstd'``}. All other key-value pairs are passed to
the underlying compression library.

.. ipython:: python
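The extension-inference rule described above can be checked with a quick pickle round trip; a sketch using ``xz`` (stdlib-backed), where ``'.zst'`` follows the identical rule once zstandard is installed:

```python
import os
import tempfile

import pandas as pd

df = pd.DataFrame({"v": range(5)})

with tempfile.TemporaryDirectory() as tmp:
    # ".xz" is inferred from the filename; no compression= argument needed.
    path = os.path.join(tmp, "frame.pkl.xz")
    df.to_pickle(path)
    back = pd.read_pickle(path)
```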
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
@@ -234,6 +234,7 @@ Other enhancements
- :meth:`DataFrame.take` now raises a ``TypeError`` when passed a scalar for the indexer (:issue:`42875`)
- :meth:`is_list_like` now identifies duck-arrays as list-like unless ``.ndim == 0`` (:issue:`35131`)
- :class:`ExtensionDtype` and :class:`ExtensionArray` are now (de)serialized when exporting a :class:`DataFrame` with :meth:`DataFrame.to_json` using ``orient='table'`` (:issue:`20612`, :issue:`44705`).
- Add support for `Zstandard <http://facebook.github.io/zstd/>`_ compression to :meth:`DataFrame.to_pickle`/:meth:`read_pickle` and friends (:issue:`43925`)
-


5 changes: 4 additions & 1 deletion pandas/_testing/_io.py
@@ -15,6 +15,7 @@
ReadPickleBuffer,
)
from pandas.compat import get_lzma_file
from pandas.compat._optional import import_optional_dependency

import pandas as pd
from pandas._testing._random import rands
@@ -364,7 +365,7 @@ def write_to_compressed(compression, path, data, dest="test"):

Parameters
----------
compression : {'gzip', 'bz2', 'zip', 'xz'}
compression : {'gzip', 'bz2', 'zip', 'xz', 'zstd'}
The compression type to use.
path : str
The file path to write the data.
@@ -391,6 +392,8 @@ def write_to_compressed(compression, path, data, dest="test"):
compress_method = gzip.GzipFile
elif compression == "bz2":
compress_method = bz2.BZ2File
elif compression == "zstd":
compress_method = import_optional_dependency("zstandard").open
elif compression == "xz":
compress_method = get_lzma_file()
else:
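The branch added above slots ``zstandard.open`` into a name-to-opener dispatch. A stdlib-only sketch of that pattern (the helper name is illustrative, not the pandas API):

```python
import bz2
import gzip
import lzma

# Map compression names to file-opening callables; the pandas helper adds
# "zstd" by resolving zstandard.open through import_optional_dependency.
_OPENERS = {"gzip": gzip.open, "bz2": bz2.open, "xz": lzma.open}


def open_compressed(path, method, mode="wb"):
    """Open path with the named compression codec, or raise ValueError."""
    try:
        opener = _OPENERS[method]
    except KeyError:
        raise ValueError(f"Unrecognized compression type: {method}") from None
    return opener(path, mode)
```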
2 changes: 1 addition & 1 deletion pandas/_testing/contexts.py
@@ -29,7 +29,7 @@ def decompress_file(path, compression):
path : str
The path where the file is read from.

compression : {'gzip', 'bz2', 'zip', 'xz', None}
compression : {'gzip', 'bz2', 'zip', 'xz', 'zstd', None}
Name of the decompression to use

Returns
2 changes: 1 addition & 1 deletion pandas/_typing.py
@@ -243,7 +243,7 @@ def closed(self) -> bool:
# compression keywords and compression
CompressionDict = Dict[str, Any]
CompressionOptions = Optional[
Union[Literal["infer", "gzip", "bz2", "zip", "xz"], CompressionDict]
Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd"], CompressionDict]
]


1 change: 1 addition & 0 deletions pandas/compat/_optional.py
@@ -34,6 +34,7 @@
"xlwt": "1.3.0",
"xlsxwriter": "1.2.2",
"numba": "0.50.1",
"zstandard": "0.15.2",
}

# A mapping from import name to package name (on PyPI) for packages where
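The ``VERSIONS`` entry feeds a version-gated optional-import helper. A simplified sketch of that pattern (the real ``import_optional_dependency`` also compares ``module.__version__`` against the minimum and supports soft-failure modes):

```python
import importlib

# Minimum supported versions for optional dependencies, as in the diff.
VERSIONS = {"zstandard": "0.15.2"}


def import_optional(name):
    """Import an optional dependency or raise a readable ImportError."""
    try:
        module = importlib.import_module(name)
    except ImportError:
        raise ImportError(
            f"Missing optional dependency '{name}'. "
            f"Use pip or conda to install {name}."
        ) from None
    # The real helper would check getattr(module, "__version__", None)
    # against VERSIONS.get(name) here.
    return module
```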
21 changes: 19 additions & 2 deletions pandas/conftest.py
@@ -267,15 +267,32 @@ def other_closed(request):
return request.param


@pytest.fixture(params=[None, "gzip", "bz2", "zip", "xz"])
@pytest.fixture(
params=[
None,
"gzip",
"bz2",
"zip",
"xz",
pytest.param("zstd", marks=td.skip_if_no("zstandard")),
]
)
def compression(request):
"""
Fixture for trying common compression types in compression tests.
"""
return request.param


@pytest.fixture(params=["gzip", "bz2", "zip", "xz"])
@pytest.fixture(
params=[
"gzip",
"bz2",
"zip",
"xz",
pytest.param("zstd", marks=td.skip_if_no("zstandard")),
]
)
def compression_only(request):
"""
Fixture for trying common compression types in compression tests excluding
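The fixture change above follows a standard pytest idiom: parametrize over compression names, but mark the ``"zstd"`` param to be skipped when zstandard is absent. A self-contained sketch using a stdlib availability check in place of pandas' ``td.skip_if_no``:

```python
import importlib.util

import pytest

# Skip the zstd param when the optional zstandard package is missing.
HAVE_ZSTD = importlib.util.find_spec("zstandard") is not None


@pytest.fixture(
    params=[
        "gzip",
        "bz2",
        pytest.param(
            "zstd",
            marks=pytest.mark.skipif(not HAVE_ZSTD, reason="zstandard not installed"),
        ),
    ]
)
def compression_only(request):
    """Common compression types, skipping zstd when unavailable."""
    return request.param
```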
42 changes: 18 additions & 24 deletions pandas/core/frame.py
@@ -133,7 +133,6 @@
from pandas.core import (
algorithms,
common as com,
generic,
nanops,
ops,
)
@@ -155,10 +154,7 @@
sanitize_array,
sanitize_masked_array,
)
from pandas.core.generic import (
NDFrame,
_shared_docs,
)
from pandas.core.generic import NDFrame
from pandas.core.indexers import check_key_length
from pandas.core.indexes.api import (
DatetimeIndex,
@@ -194,6 +190,7 @@
)
from pandas.core.reshape.melt import melt
from pandas.core.series import Series
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import (
get_group_index,
lexsort_indexer,
@@ -2482,7 +2479,10 @@ def _from_arrays(
)
return cls(mgr)

@doc(storage_options=generic._shared_docs["storage_options"])
@doc(
storage_options=_shared_docs["storage_options"],
compression_options=_shared_docs["compression_options"] % "path",
)
@deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
def to_stata(
self,
@@ -2561,19 +2561,12 @@ def to_stata(
format. Only available if version is 117. Storing strings in the
StrL format can produce smaller dta files if strings have more than
8 characters and values are repeated.
compression : str or dict, default 'infer'
For on-the-fly compression of the output dta. If string, specifies
compression mode. If dict, value at key 'method' specifies
compression mode. Compression mode must be one of {{'infer', 'gzip',
'bz2', 'zip', 'xz', None}}. If compression mode is 'infer' and
`fname` is path-like, then detect compression from the following
extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
compression). If dict and compression mode is one of {{'zip',
'gzip', 'bz2'}}, or inferred as one of the above, other entries
passed as additional compression options.
{compression_options}

.. versionadded:: 1.1.0

.. versionchanged:: 1.4.0 Zstandard support.

{storage_options}

.. versionadded:: 1.2.0
@@ -2734,7 +2727,7 @@ def to_markdown(
handles.handle.write(result)
return None

@doc(storage_options=generic._shared_docs["storage_options"])
@doc(storage_options=_shared_docs["storage_options"])
@deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
def to_parquet(
self,
@@ -2939,7 +2932,10 @@ def to_html(
render_links=render_links,
)

@doc(storage_options=generic._shared_docs["storage_options"])
@doc(
storage_options=_shared_docs["storage_options"],
compression_options=_shared_docs["compression_options"] % "path_or_buffer",
)
def to_xml(
self,
path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
@@ -3016,12 +3012,10 @@ def to_xml(
layout of elements and attributes from original output. This
argument requires ``lxml`` to be installed. Only XSLT 1.0
scripts and not later versions are currently supported.
compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer', then use
gzip, bz2, zip or xz if path_or_buffer is a string ending in
'.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression
otherwise. If using 'zip', the ZIP file must contain only one data
file to be read in. Set to None for no decompression.
{compression_options}

.. versionchanged:: 1.4.0 Zstandard support.

{storage_options}

Returns
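The ``@doc(...)`` changes above substitute a shared docstring snippet whose ``%s`` slot takes the per-method parameter name. A minimal sketch of that templating (the snippet text here is illustrative, not pandas' actual wording):

```python
# A shared docstring fragment with a %s slot for the parameter name,
# mirroring _shared_docs["compression_options"] % "path" in the diff.
_shared_docs = {
    "compression_options": (
        "compression : str or dict, default 'infer'\n"
        "    For on-the-fly compression of the output data, inferred from\n"
        "    the extension of ``%s`` ('.gz', '.bz2', '.zip', '.xz', '.zst')."
    )
}

# Each method substitutes its own parameter name into the shared text.
stata_snippet = _shared_docs["compression_options"] % "path"
xml_snippet = _shared_docs["compression_options"] % "path_or_buffer"
```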