diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 2c5263f447951..60346301d34d7 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -98,9 +98,9 @@ Other enhancements - Performance improvement in :func:`read_csv` (:issue:`52632`) with ``engine="c"`` - :meth:`Categorical.from_codes` has gotten a ``validate`` parameter (:issue:`50975`) - Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`) +- Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to ``lzma.LZMAFile`` (:issue:`52979`) - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`) - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`) -- .. --------------------------------------------------------------------------- .. _whatsnew_210.notable_bug_fixes: diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 6281475b6926f..7bddaad780b8c 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -455,10 +455,10 @@ (otherwise no compression). Set to ``None`` for no compression. Can also be a dict with key ``'method'`` set - to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other - key-value pairs are forwarded to + to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and + other key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, - ``bz2.BZ2File``, ``zstandard.ZstdCompressor`` or + ``bz2.BZ2File``, ``zstandard.ZstdCompressor``, ``lzma.LZMAFile`` or ``tarfile.TarFile``, respectively. As an example, the following could be passed for faster compression and to create a reproducible gzip archive: @@ -477,10 +477,10 @@ If using 'zip' or 'tar', the ZIP file must contain only one data file to be read in. Set to ``None`` for no decompression. 
Can also be a dict with key ``'method'`` set - to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other - key-value pairs are forwarded to + to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and + other key-value pairs are forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, - ``bz2.BZ2File``, ``zstandard.ZstdDecompressor`` or + ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or ``tarfile.TarFile``, respectively. As an example, the following could be passed for Zstandard decompression using a custom compression dictionary: diff --git a/pandas/io/common.py b/pandas/io/common.py index 7cdb50c629d21..43780a08a4339 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -825,8 +825,10 @@ def get_handle( elif compression == "xz": # error: Argument 1 to "LZMAFile" has incompatible type "Union[str, # BaseBuffer]"; expected "Optional[Union[Union[str, bytes, PathLike[str], - # PathLike[bytes]], IO[bytes]]]" - handle = get_lzma_file()(handle, ioargs.mode) # type: ignore[arg-type] + # PathLike[bytes]], IO[bytes]], None]" + handle = get_lzma_file()( + handle, ioargs.mode, **compression_args # type: ignore[arg-type] + ) # Zstd Compression elif compression == "zstd": diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index eadf35aedd708..ac11e2165eb6f 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -253,6 +253,28 @@ def test_gzip_compression_level(obj, method): assert compressed_size_default < compressed_size_fast +@pytest.mark.parametrize( + "obj", + [ + pd.DataFrame( + 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], + columns=["X", "Y", "Z"], + ), + pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"), + ], +) +@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"]) +def test_xz_compression_level_read(obj, method): + with tm.ensure_clean() as path: + getattr(obj, method)(path, 
compression="xz") + compressed_size_default = os.path.getsize(path) + getattr(obj, method)(path, compression={"method": "xz", "preset": 1}) + compressed_size_fast = os.path.getsize(path) + assert compressed_size_default < compressed_size_fast + if method == "to_csv": + pd.read_csv(path, compression="xz") + + @pytest.mark.parametrize( "obj", [