From 87ac6bc8943ca55873a7ec6dbfa1a78c9bd31d36 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 9 Apr 2020 16:16:05 +0200 Subject: [PATCH 1/3] ENH: update feather IO for pyarrow 0.17 / Feather V2 --- doc/source/conf.py | 1 + doc/source/user_guide/io.rst | 6 ++---- pandas/core/frame.py | 12 +++++++++--- pandas/io/feather_format.py | 9 ++++++--- pandas/tests/io/test_feather.py | 17 ++++++++++++----- 5 files changed, 30 insertions(+), 15 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index d24483abd28e1..d2404b757ca11 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -416,6 +416,7 @@ "python": ("https://docs.python.org/3/", None), "scipy": ("https://docs.scipy.org/doc/scipy/reference/", None), "statsmodels": ("https://www.statsmodels.org/devel/", None), + "pyarrow": ("https://arrow.apache.org/docs/", None), } # extlinks alias diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index d721e00a0a0b6..910022fe49d81 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -4583,16 +4583,14 @@ frames efficient, and to make sharing data across data analysis languages easy. Feather is designed to faithfully serialize and de-serialize DataFrames, supporting all of the pandas dtypes, including extension dtypes such as categorical and datetime with tz. -Several caveats. +Several caveats: -* This is a newer library, and the format, though stable, is not guaranteed to be backward compatible - to the earlier versions. * The format will NOT write an ``Index``, or ``MultiIndex`` for the ``DataFrame`` and will raise an error if a non-default one is provided. You can ``.reset_index()`` to store the index or ``.reset_index(drop=True)`` to ignore it. * Duplicate column names and non-string columns names are not supported -* Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message +* Non supported types actual Python object types. These will raise a helpful error message on an attempt at serialization. See the `Full Documentation `__. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index aedbba755227d..6f0f8f881933b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2058,18 +2058,24 @@ def to_stata( writer.write_file() @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") - def to_feather(self, path) -> None: + def to_feather(self, path, **kwargs) -> None: """ - Write out the binary feather-format for DataFrames. + Write a DataFrame to the binary Feather format. Parameters ---------- path : str String file path. + **kwargs : + Additional keywords passed to :func:`pyarrow.feather.write_feather`. + Starting with pyarrow 0.17, this includes the `compression`, + `compression_level`, `chunksize` and `version` keywords. + + .. versionadded:: 1.1.0 """ from pandas.io.feather_format import to_feather - to_feather(self, path) + to_feather(self, path, **kwargs) @Appender( """ diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 5d4925620e75f..cd7045e7f2d2e 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -7,15 +7,18 @@ from pandas.io.common import stringify_path -def to_feather(df: DataFrame, path): +def to_feather(df: DataFrame, path, **kwargs): """ - Write a DataFrame to the feather-format + Write a DataFrame to the binary Feather format. Parameters ---------- df : DataFrame path : string file path, or file-like object + **kwargs : + Additional keywords passed to `pyarrow.feather.write_feather`. + .. versionadded:: 1.1.0 """ import_optional_dependency("pyarrow") from pyarrow import feather @@ -58,7 +61,7 @@ def to_feather(df: DataFrame, path): if df.columns.inferred_type not in valid_types: raise ValueError("feather must have string column names") - feather.write_feather(df, path) + feather.write_feather(df, path, **kwargs) def read_feather(path, columns=None, use_threads: bool = True): diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 0038df78dd866..8cb27e23dfe75 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -4,6 +4,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm @@ -27,15 +29,15 @@ def check_error_on_write(self, df, exc): with tm.ensure_clean() as path: to_feather(df, path) - def check_round_trip(self, df, expected=None, **kwargs): + def check_round_trip(self, df, expected=None, write_kwargs={}, **read_kwargs): if expected is None: expected = df with tm.ensure_clean() as path: - to_feather(df, path) + to_feather(df, path, **write_kwargs) - result = read_feather(path, **kwargs) + result = read_feather(path, **read_kwargs) tm.assert_frame_equal(result, expected) def test_error(self): @@ -102,8 +104,8 @@ def test_read_columns(self): def test_unsupported_other(self): - # period - df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) + # mixed python objects + df = pd.DataFrame({"a": ["a", 1, 2.0]}) # Some versions raise ValueError, others raise ArrowInvalid. self.check_error_on_write(df, Exception) @@ -148,3 +150,8 @@ def test_path_localpath(self): df = tm.makeDataFrame().reset_index() result = tm.round_trip_localpath(df.to_feather, pd.read_feather) tm.assert_frame_equal(df, result) + + @td.skip_if_no("pyarrow", min_version="0.16.1.dev") + def test_passthrough_keywords(self): + df = tm.makeDataFrame().reset_index() + self.check_round_trip(df, write_kwargs=dict(version=1)) From 0b61a797f259faa1ce86e055f4d3c6e1281a3905 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 9 Apr 2020 16:25:17 +0200 Subject: [PATCH 2/3] add period/timedelta to tests --- pandas/tests/io/test_feather.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 8cb27e23dfe75..0755501ee6285 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -73,6 +73,10 @@ def test_basic(self): "dtns": pd.date_range("20130101", periods=3, freq="ns"), } ) + if pyarrow_version >= LooseVersion("0.16.1.dev"): + df["periods"] = pd.period_range("2013", freq="M", periods=3) + df["timedeltas"] = pd.timedelta_range("1 day", periods=3) + df["intervals"] = pd.interval_range(0, 3, 3) assert df.dttz.dtype.tz.zone == "US/Eastern" self.check_round_trip(df) From a42ad0f829a8e952c2de933b7c38307252eecaa4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 9 Apr 2020 19:27:50 +0200 Subject: [PATCH 3/3] update io.rst + add whatsnew --- doc/source/user_guide/io.rst | 4 ++-- doc/source/whatsnew/v1.1.0.rst | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 910022fe49d81..f2152c43ceaba 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -4590,8 +4590,8 @@ Several caveats: can ``.reset_index()`` to store the index or ``.reset_index(drop=True)`` to ignore it. * Duplicate column names and non-string columns names are not supported -* Non supported types actual Python object types. These will raise a helpful error message - on an attempt at serialization. +* Actual Python objects in object dtype columns are not supported. These will + raise a helpful error message on an attempt at serialization. See the `Full Documentation `__. diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 8a7db87b75d7b..0ba845aa06489 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -88,7 +88,9 @@ Other enhancements - :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`). - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) - :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the object inside are unsortable, pass `sort=False` to suppress this warning (:issue:`33015`) -- +- The :meth:`DataFrame.to_feather` method now supports additional keyword + arguments (e.g. to set the compression) that are added in pyarrow 0.17 + (:issue:`33422`). .. ---------------------------------------------------------------------------