BUG/ENH: consistent gzip compression arguments #35645

Merged · 2 commits · Aug 13, 2020
11 changes: 7 additions & 4 deletions doc/source/user_guide/io.rst
@@ -287,16 +287,19 @@ Quoting, compression, and file format

compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``, ``dict``}, default ``'infer'``
For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
-bz2, zip, or xz if filepath_or_buffer is a string ending in '.gz', '.bz2',
+bz2, zip, or xz if ``filepath_or_buffer`` is path-like ending in '.gz', '.bz2',
'.zip', or '.xz', respectively, and no decompression otherwise. If using 'zip',
the ZIP file must contain only one data file to be read in.
Set to ``None`` for no decompression. Can also be a dict with key ``'method'``
-set to one of {``'zip'``, ``'gzip'``, ``'bz2'``}, and other keys set to
-compression settings. As an example, the following could be passed for
-faster compression: ``compression={'method': 'gzip', 'compresslevel': 1}``.
+set to one of {``'zip'``, ``'gzip'``, ``'bz2'``} and other key-value pairs are
+forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, or ``bz2.BZ2File``.
+As an example, the following could be passed for faster compression and to
+create a reproducible gzip archive:
+``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.

.. versionchanged:: 0.24.0 'infer' option added and set to default.
.. versionchanged:: 1.1.0 dict option extended to support ``gzip`` and ``bz2``.
+.. versionchanged:: 1.2.0 Previous versions forwarded dict entries for 'gzip' to `gzip.open`.
thousands : str, default ``None``
Thousands separator.
decimal : str, default ``'.'``
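As a quick illustration of the documented dict form (a sketch, not part of the diff; the file name is arbitrary): with this change the extra keys reach `gzip.GzipFile`, so pinning `mtime` makes repeated writes byte-identical.

```python
from pathlib import Path

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
opts = {"method": "gzip", "compresslevel": 1, "mtime": 1}

# Extra dict keys are forwarded to gzip.GzipFile; a pinned mtime keeps
# the wall clock out of the gzip header, so rewriting the same frame
# produces byte-identical output.
df.to_csv("frame.csv.gz", compression=opts)
first = Path("frame.csv.gz").read_bytes()
df.to_csv("frame.csv.gz", compression=opts)
assert Path("frame.csv.gz").read_bytes() == first
```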
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
@@ -235,6 +235,7 @@ I/O
- Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`)
- In :meth:`read_csv` `float_precision='round_trip'` now handles `decimal` and `thousands` parameters (:issue:`35365`)
- :meth:`to_pickle` and :meth:`read_pickle` were closing user-provided file objects (:issue:`35679`)
+- :meth:`to_csv` now always passes compression arguments for `'gzip'` to `gzip.GzipFile` (:issue:`28103`)

Plotting
^^^^^^^^
5 changes: 5 additions & 0 deletions pandas/_typing.py
@@ -109,3 +109,8 @@

# for arbitrary kwargs passed during reading/writing files
StorageOptions = Optional[Dict[str, Any]]


+# compression keywords and compression
+CompressionDict = Mapping[str, Optional[Union[str, int, bool]]]
+CompressionOptions = Optional[Union[str, CompressionDict]]
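For context, a hedged sketch of how the new aliases read at an annotation site (the function below is hypothetical, not part of pandas):

```python
from typing import Mapping, Optional, Union

# Mirrors the aliases added above.
CompressionDict = Mapping[str, Optional[Union[str, int, bool]]]
CompressionOptions = Optional[Union[str, CompressionDict]]

def open_compressed(path: str, compression: CompressionOptions = "infer") -> None:
    """Hypothetical: accepts None, a method string like 'gzip', or a
    dict such as {'method': 'gzip', 'compresslevel': 1, 'mtime': 1}."""
```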
13 changes: 10 additions & 3 deletions pandas/core/generic.py
@@ -35,6 +35,7 @@
from pandas._libs.tslibs import Tick, Timestamp, to_offset
from pandas._typing import (
Axis,
+    CompressionOptions,
FilePathOrBuffer,
FrameOrSeries,
JSONSerializable,
@@ -2058,7 +2059,7 @@ def to_json(
date_unit: str = "ms",
default_handler: Optional[Callable[[Any], JSONSerializable]] = None,
lines: bool_t = False,
-    compression: Optional[str] = "infer",
+    compression: CompressionOptions = "infer",
index: bool_t = True,
indent: Optional[int] = None,
storage_options: StorageOptions = None,
@@ -2646,7 +2647,7 @@ def to_sql(
def to_pickle(
self,
path,
-    compression: Optional[str] = "infer",
+    compression: CompressionOptions = "infer",
protocol: int = pickle.HIGHEST_PROTOCOL,
storage_options: StorageOptions = None,
) -> None:
@@ -3053,7 +3054,7 @@ def to_csv(
index_label: Optional[Union[bool_t, str, Sequence[Label]]] = None,
mode: str = "w",
encoding: Optional[str] = None,
-    compression: Optional[Union[str, Mapping[str, str]]] = "infer",
+    compression: CompressionOptions = "infer",
quoting: Optional[int] = None,
quotechar: str = '"',
line_terminator: Optional[str] = None,
@@ -3144,6 +3145,12 @@ def to_csv(

Compression is supported for binary file objects.

+.. versionchanged:: 1.2.0
+
+   Previous versions forwarded dict entries for 'gzip' to
+   `gzip.open` instead of `gzip.GzipFile` which prevented
+   setting `mtime`.

quoting : optional constant from csv module
Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
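A stdlib-only sketch of why the docstring note matters: `gzip.open` has no `mtime` parameter, while `gzip.GzipFile` does, so only the latter can pin the header timestamp.

```python
import gzip

data = b"hello"

# gzip.open(filename, mode, compresslevel, ...) accepts no mtime; the
# current time is embedded in the header, so output varies run to run.
with gzip.open("a.gz", "wb", compresslevel=1) as f:
    f.write(data)

# gzip.GzipFile exposes mtime; a fixed value gives reproducible bytes.
with gzip.GzipFile(filename="b.gz", mode="wb", compresslevel=1, mtime=1) as f:
    f.write(data)
```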
31 changes: 17 additions & 14 deletions pandas/io/common.py
@@ -18,7 +18,6 @@
Optional,
Tuple,
Type,
-    Union,
)
from urllib.parse import (
urljoin,
@@ -29,7 +28,12 @@
)
import zipfile

-from pandas._typing import FilePathOrBuffer, StorageOptions
+from pandas._typing import (
+    CompressionDict,
+    CompressionOptions,
+    FilePathOrBuffer,
+    StorageOptions,
+)
from pandas.compat import _get_lzma_file, _import_lzma
from pandas.compat._optional import import_optional_dependency

@@ -160,7 +164,7 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool:
def get_filepath_or_buffer(
filepath_or_buffer: FilePathOrBuffer,
encoding: Optional[str] = None,
-    compression: Optional[str] = None,
+    compression: CompressionOptions = None,
mode: Optional[str] = None,
storage_options: StorageOptions = None,
):
@@ -188,7 +192,7 @@ def get_filepath_or_buffer(

Returns
-------
-Tuple[FilePathOrBuffer, str, str, bool]
+Tuple[FilePathOrBuffer, str, CompressionOptions, bool]
Tuple containing the filepath or buffer, the encoding, the compression
and should_close.
"""
@@ -291,8 +295,8 @@ def file_path_to_url(path: str) -> str:


def get_compression_method(
-    compression: Optional[Union[str, Mapping[str, Any]]]
-) -> Tuple[Optional[str], Dict[str, Any]]:
+    compression: CompressionOptions,
+) -> Tuple[Optional[str], CompressionDict]:
"""
Simplifies a compression argument to a compression method string and
a mapping containing additional arguments.
@@ -316,7 +320,7 @@
if isinstance(compression, Mapping):
compression_args = dict(compression)
try:
-        compression_method = compression_args.pop("method")
+        compression_method = compression_args.pop("method")  # type: ignore
except KeyError as err:
raise ValueError("If mapping, compression must have key 'method'") from err
else:
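To make the contract above concrete, a sketch against this revision's internal API (`pandas.io.common` is private and may move):

```python
from pandas.io.common import get_compression_method

# A bare string means: that method, with no extra arguments.
assert get_compression_method("gzip") == ("gzip", {})

# A mapping is split into the method and the kwargs to forward.
method, kwargs = get_compression_method(
    {"method": "gzip", "compresslevel": 1, "mtime": 1}
)
assert method == "gzip"
assert kwargs == {"compresslevel": 1, "mtime": 1}

# A mapping without 'method' is rejected.
try:
    get_compression_method({"compresslevel": 1})
except ValueError as err:
    print(err)  # If mapping, compression must have key 'method'
```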
@@ -383,7 +387,7 @@ def get_handle(
path_or_buf,
mode: str,
encoding=None,
-    compression: Optional[Union[str, Mapping[str, Any]]] = None,
+    compression: CompressionOptions = None,
memory_map: bool = False,
is_text: bool = True,
errors=None,
@@ -464,16 +468,13 @@
# GZ Compression
if compression == "gzip":
if is_path:
-            f = gzip.open(path_or_buf, mode, **compression_args)
+            f = gzip.GzipFile(filename=path_or_buf, mode=mode, **compression_args)
else:
f = gzip.GzipFile(fileobj=path_or_buf, mode=mode, **compression_args)

# BZ Compression
elif compression == "bz2":
-        if is_path:
-            f = bz2.BZ2File(path_or_buf, mode, **compression_args)
-        else:
-            f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args)
+        f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args)

# ZIP Compression
elif compression == "zip":
@@ -577,7 +578,9 @@ def __init__(
if mode in ["wb", "rb"]:
mode = mode.replace("b", "")
self.archive_name = archive_name
-        super().__init__(file, mode, zipfile.ZIP_DEFLATED, **kwargs)
+        kwargs_zip: Dict[str, Any] = {"compression": zipfile.ZIP_DEFLATED}
+        kwargs_zip.update(kwargs)
+        super().__init__(file, mode, **kwargs_zip)

def write(self, data):
archive_name = self.filename
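One way to verify the new path branch end to end (a sketch; byte layout per RFC 1952): the gzip member header stores MTIME in bytes 4..7, so a pinned `mtime` should read back exactly.

```python
import struct

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
df.to_csv("frame.csv.gz", compression={"method": "gzip", "mtime": 1})

with open("frame.csv.gz", "rb") as f:
    header = f.read(8)

# RFC 1952: magic (2 bytes), CM, FLG, then MTIME as a little-endian uint32.
(mtime,) = struct.unpack("<I", header[4:8])
assert mtime == 1
```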
6 changes: 3 additions & 3 deletions pandas/io/formats/csvs.py
@@ -5,13 +5,13 @@
import csv as csvlib
from io import StringIO, TextIOWrapper
import os
-from typing import Hashable, List, Mapping, Optional, Sequence, Union
+from typing import Hashable, List, Optional, Sequence, Union
import warnings

import numpy as np

from pandas._libs import writers as libwriters
-from pandas._typing import FilePathOrBuffer, StorageOptions
+from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions

from pandas.core.dtypes.generic import (
ABCDatetimeIndex,
@@ -44,7 +44,7 @@ def __init__(
mode: str = "w",
encoding: Optional[str] = None,
errors: str = "strict",
-        compression: Union[str, Mapping[str, str], None] = "infer",
+        compression: CompressionOptions = "infer",
quoting: Optional[int] = None,
line_terminator="\n",
chunksize: Optional[int] = None,
31 changes: 22 additions & 9 deletions pandas/io/json/_json.py
@@ -3,13 +3,13 @@
from io import BytesIO, StringIO
from itertools import islice
import os
-from typing import Any, Callable, Optional, Type
+from typing import IO, Any, Callable, List, Optional, Type

import numpy as np

import pandas._libs.json as json
from pandas._libs.tslibs import iNaT
-from pandas._typing import JSONSerializable, StorageOptions
+from pandas._typing import CompressionOptions, JSONSerializable, StorageOptions
from pandas.errors import AbstractMethodError
from pandas.util._decorators import deprecate_kwarg, deprecate_nonkeyword_arguments

@@ -19,7 +19,12 @@
from pandas.core.construction import create_series_with_explicit_dtype
from pandas.core.reshape.concat import concat

-from pandas.io.common import get_filepath_or_buffer, get_handle, infer_compression
+from pandas.io.common import (
+    get_compression_method,
+    get_filepath_or_buffer,
+    get_handle,
+    infer_compression,
+)
from pandas.io.json._normalize import convert_to_line_delimits
from pandas.io.json._table_schema import build_table_schema, parse_table_schema
from pandas.io.parsers import _validate_integer
@@ -41,7 +46,7 @@ def to_json(
date_unit: str = "ms",
default_handler: Optional[Callable[[Any], JSONSerializable]] = None,
lines: bool = False,
-    compression: Optional[str] = "infer",
+    compression: CompressionOptions = "infer",
index: bool = True,
indent: int = 0,
storage_options: StorageOptions = None,
@@ -369,7 +374,7 @@ def read_json(
encoding=None,
lines: bool = False,
chunksize: Optional[int] = None,
compression="infer",
compression: CompressionOptions = "infer",
nrows: Optional[int] = None,
storage_options: StorageOptions = None,
):
@@ -607,7 +612,9 @@ def read_json(
if encoding is None:
encoding = "utf-8"

-    compression = infer_compression(path_or_buf, compression)
+    compression_method, compression = get_compression_method(compression)
+    compression_method = infer_compression(path_or_buf, compression_method)
+    compression = dict(compression, method=compression_method)
filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
path_or_buf,
encoding=encoding,
@@ -667,10 +674,13 @@ def __init__(
encoding,
lines: bool,
chunksize: Optional[int],
-        compression,
+        compression: CompressionOptions,
nrows: Optional[int],
):

+        compression_method, compression = get_compression_method(compression)
+        compression = dict(compression, method=compression_method)

self.orient = orient
self.typ = typ
self.dtype = dtype
@@ -687,6 +697,7 @@ def __init__(
self.nrows_seen = 0
self.should_close = False
self.nrows = nrows
+        self.file_handles: List[IO] = []

if self.chunksize is not None:
self.chunksize = _validate_integer("chunksize", self.chunksize, 1)
@@ -735,8 +746,8 @@ def _get_data_from_filepath(self, filepath_or_buffer):
except (TypeError, ValueError):
pass

-        if exists or self.compression is not None:
-            data, _ = get_handle(
+        if exists or self.compression["method"] is not None:
+            data, self.file_handles = get_handle(
filepath_or_buffer,
"r",
encoding=self.encoding,
@@ -816,6 +827,8 @@ def close(self):
self.open_stream.close()
except (IOError, AttributeError):
pass
+        for file_handle in self.file_handles:
+            file_handle.close()
Member Author: probably unrelated to the recent CI issues, but we should definitely close those handles.

Contributor: hmm, is there a ResourceWarning?

Member Author: I haven't seen any when reading/writing json files.
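For the thread above, one empirical check (a sketch, not part of the PR): CPython emits a ResourceWarning when a file object is finalized unclosed, so forcing a collection with the warning un-filtered makes a leaked handle visible.

```python
import gc
import warnings

import pandas as pd

pd.DataFrame({"a": [1, 2]}).to_json("frame.json.gz", compression="gzip")

with warnings.catch_warnings():
    warnings.simplefilter("always", ResourceWarning)
    pd.read_json("frame.json.gz", compression="gzip")
    # A leaked handle would surface as a ResourceWarning here, when its
    # finalizer runs during collection.
    gc.collect()
```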


def __next__(self):
if self.nrows:
8 changes: 4 additions & 4 deletions pandas/io/pickle.py
@@ -1,9 +1,9 @@
""" pickle compat """
import pickle
-from typing import Any, Optional
+from typing import Any
import warnings

-from pandas._typing import FilePathOrBuffer, StorageOptions
+from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions
from pandas.compat import pickle_compat as pc

from pandas.io.common import get_filepath_or_buffer, get_handle
@@ -12,7 +12,7 @@
def to_pickle(
obj: Any,
filepath_or_buffer: FilePathOrBuffer,
-    compression: Optional[str] = "infer",
+    compression: CompressionOptions = "infer",
protocol: int = pickle.HIGHEST_PROTOCOL,
storage_options: StorageOptions = None,
):
@@ -114,7 +114,7 @@ def to_pickle(

def read_pickle(
filepath_or_buffer: FilePathOrBuffer,
-    compression: Optional[str] = "infer",
+    compression: CompressionOptions = "infer",
storage_options: StorageOptions = None,
):
"""
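Since pickle now shares `CompressionOptions`, the dict form should round-trip there as well (a sketch; the `mtime` key is only meaningful for gzip):

```python
import pandas as pd

df = pd.DataFrame({"a": range(5)})

# 'infer' (the default) picks gzip from the '.gz' suffix on read;
# the dict form forwards its extra keys to gzip.GzipFile on write.
df.to_pickle("frame.pkl.gz", compression={"method": "gzip", "mtime": 1})
assert pd.read_pickle("frame.pkl.gz").equals(df)
```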
19 changes: 6 additions & 13 deletions pandas/io/stata.py
@@ -35,7 +35,7 @@

from pandas._libs.lib import infer_dtype
from pandas._libs.writers import max_len_string_array
-from pandas._typing import FilePathOrBuffer, Label, StorageOptions
+from pandas._typing import CompressionOptions, FilePathOrBuffer, Label, StorageOptions
from pandas.util._decorators import Appender

from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -1938,9 +1938,9 @@ def read_stata(

def _open_file_binary_write(
fname: FilePathOrBuffer,
-    compression: Union[str, Mapping[str, str], None],
+    compression: CompressionOptions,
storage_options: StorageOptions = None,
-) -> Tuple[BinaryIO, bool, Optional[Union[str, Mapping[str, str]]]]:
+) -> Tuple[BinaryIO, bool, CompressionOptions]:
"""
Open a binary file or no-op if file-like.

@@ -1978,17 +1978,10 @@ def _open_file_binary_write(
# Extract compression mode as given, if dict
compression_typ, compression_args = get_compression_method(compression)
compression_typ = infer_compression(fname, compression_typ)
-    path_or_buf, _, compression_typ, _ = get_filepath_or_buffer(
-        fname,
-        mode="wb",
-        compression=compression_typ,
-        storage_options=storage_options,
+    compression = dict(compression_args, method=compression_typ)
+    path_or_buf, _, compression, _ = get_filepath_or_buffer(
+        fname, mode="wb", compression=compression, storage_options=storage_options,
)
-    if compression_typ is not None:
-        compression = compression_args
-        compression["method"] = compression_typ
-    else:
-        compression = None
f, _ = get_handle(path_or_buf, "wb", compression=compression, is_text=False)
return f, True, compression
else:
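The rewritten stata path now uses the same normalize-then-forward idiom as the JSON reader; schematically (a sketch of the pattern against private helpers, not the exact pandas code):

```python
from pandas.io.common import get_compression_method, infer_compression

def normalize(fname, compression):
    # Split the user argument, infer the method from the extension if
    # requested, then fold everything back into one forwardable dict.
    method, args = get_compression_method(compression)
    method = infer_compression(fname, method)
    return dict(args, method=method)

print(normalize("data.dta.gz", "infer"))
# -> {'method': 'gzip'}
print(normalize("data.dta", {"method": "gzip", "mtime": 1}))
# -> {'mtime': 1, 'method': 'gzip'}
```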