From 5ca280042da91a308301deb71ecc78e3efce349a Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 9 Aug 2021 10:11:03 -0700 Subject: [PATCH 1/9] Revert nullable dtypes support --- pandas/io/parquet.py | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index f0aeeb3e6c893..f3cac8c943052 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -309,20 +309,14 @@ def write( def read( self, path, columns=None, storage_options: StorageOptions = None, **kwargs ): - parquet_kwargs = {} + # We are disabling nullable dtypes for fastparquet pending discussion + parquet_kwargs = {"pandas_nulls": False} use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False) - # Technically works with 0.7.0, but was incorrect - # so lets just require 0.7.1 - if Version(self.api.__version__) >= Version("0.7.1"): - # Need to set even for use_nullable_dtypes = False, - # since our defaults differ - parquet_kwargs["pandas_nulls"] = use_nullable_dtypes - else: - if use_nullable_dtypes: - raise ValueError( - "The 'use_nullable_dtypes' argument is not supported for the " - "fastparquet engine for fastparquet versions less than 0.7.1" - ) + if use_nullable_dtypes: + raise ValueError( + "The 'use_nullable_dtypes' argument is not supported for the" + " fastparquet engine" + ) path = stringify_path(path) handles = None if is_fsspec_url(path): @@ -478,7 +472,8 @@ def read_parquet( use_nullable_dtypes : bool, default False If True, use dtypes that use ``pd.NA`` as missing value indicator - for the resulting DataFrame. + for the resulting DataFrame. (only applicable for the ``pyarrow`` + engine) As new dtypes are added that support ``pd.NA`` in the future, the output with this option will change to use those dtypes. Note: this is an experimental option, and behaviour (e.g. additional @@ -486,10 +481,6 @@ def read_parquet( .. versionadded:: 1.2.0 - .. versionchanged:: 1.3.2 - ``use_nullable_dtypes`` now works with the the ``fastparquet`` engine - if ``fastparquet`` is version 0.7.1 or higher. - **kwargs Any additional kwargs are passed to the engine. From c3f51f1bca9ac10713bf1adcb1e8bc4a1249e5c5 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 9 Aug 2021 10:14:47 -0700 Subject: [PATCH 2/9] Revert test changes & add test for existing behavior --- pandas/tests/io/test_parquet.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index b1f7f15dfa99a..59575a455e546 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -600,11 +600,9 @@ def test_use_nullable_dtypes(self, engine): import pyarrow.parquet as pq if engine == "fastparquet": - pytest.importorskip( - "fastparquet", - "0.7.1", - reason="fastparquet must be 0.7.1 or higher for nullable dtype support", - ) + # We are manually disabling fastparquet's + # nullable dtype support pending discussion + pytest.skip("Fastparquet nullable dtype support is disabled") table = pyarrow.table( { @@ -612,6 +610,8 @@ def test_use_nullable_dtypes(self, engine): "b": pyarrow.array([1, 2, 3, None], "uint8"), "c": pyarrow.array(["a", "b", "c", None]), "d": pyarrow.array([True, False, True, None]), + # Test that nullable dtypes used even in absence of nulls + "e": pyarrow.array([1,2,3,4], "int64"), } ) with tm.ensure_clean() as path: @@ -627,6 +627,7 @@ def test_use_nullable_dtypes(self, engine): "b": pd.array([1, 2, 3, None], dtype="UInt8"), "c": pd.array(["a", "b", "c", None], dtype="string"), "d": pd.array([True, False, True, None], dtype="boolean"), + "e": pd.array([1,2,3,4], dtype="Int64") } ) if engine == "fastparquet": From d2d43dfea4eb12d6abdb03bf08b73051d7e1182f Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 9 Aug 2021 10:15:43 -0700 Subject: [PATCH 3/9] Update v1.3.2.rst --- doc/source/whatsnew/v1.3.2.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.2.rst b/doc/source/whatsnew/v1.3.2.rst index f54cea744f4d2..df0af0b5ed156 100644 --- a/doc/source/whatsnew/v1.3.2.rst +++ b/doc/source/whatsnew/v1.3.2.rst @@ -44,7 +44,6 @@ Bug fixes Other ~~~~~ -- :meth:`pandas.read_parquet` now supports reading nullable dtypes with ``fastparquet`` versions above 0.7.1. - .. --------------------------------------------------------------------------- From 58dbfac552cd0210fe559bcb9330b338aa75c683 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 9 Aug 2021 10:23:08 -0700 Subject: [PATCH 4/9] Update test_parquet.py --- pandas/tests/io/test_parquet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 59575a455e546..5f801ad6abfcf 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -611,7 +611,7 @@ def test_use_nullable_dtypes(self, engine): "c": pyarrow.array(["a", "b", "c", None]), "d": pyarrow.array([True, False, True, None]), # Test that nullable dtypes used even in absence of nulls - "e": pyarrow.array([1,2,3,4], "int64"), + "e": pyarrow.array([1, 2, 3, 4], "int64"), } ) with tm.ensure_clean() as path: @@ -627,7 +627,7 @@ def test_use_nullable_dtypes(self, engine): "b": pd.array([1, 2, 3, None], dtype="UInt8"), "c": pd.array(["a", "b", "c", None], dtype="string"), "d": pd.array([True, False, True, None], dtype="boolean"), - "e": pd.array([1,2,3,4], dtype="Int64") + "e": pd.array([1, 2, 3, 4], dtype="Int64") } ) if engine == "fastparquet": From 56d5bda66ec05be323dc64490d64bd73f967c50d Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 9 Aug 2021 10:44:01 -0700 Subject: [PATCH 5/9] Update parquet.py --- pandas/io/parquet.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index f3cac8c943052..92109918b5005 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -309,9 +309,11 @@ def write( def read( self, path, columns=None, storage_options: StorageOptions = None, **kwargs ): - # We are disabling nullable dtypes for fastparquet pending discussion - parquet_kwargs = {"pandas_nulls": False} + parquet_kwargs = {} use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False) + if Version(self.api.__version__) >= Version("0.7.1"): + # We are disabling nullable dtypes for fastparquet pending discussion + parquet_kwargs["pandas_nulls"] = False if use_nullable_dtypes: raise ValueError( "The 'use_nullable_dtypes' argument is not supported for the" From a7c333c857c588d38791f76a925aaa449db910c9 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 9 Aug 2021 10:44:33 -0700 Subject: [PATCH 6/9] Update test_parquet.py --- pandas/tests/io/test_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 5f801ad6abfcf..3dbfcba35344c 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -627,7 +627,7 @@ def test_use_nullable_dtypes(self, engine): "b": pd.array([1, 2, 3, None], dtype="UInt8"), "c": pd.array(["a", "b", "c", None], dtype="string"), "d": pd.array([True, False, True, None], dtype="boolean"), - "e": pd.array([1, 2, 3, 4], dtype="Int64") + "e": pd.array([1, 2, 3, 4], dtype="Int64"), } ) if engine == "fastparquet": From 0a3d999784002773ef485ed30162027ad0a8a5a7 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 9 Aug 2021 10:54:52 -0700 Subject: [PATCH 7/9] Update parquet.py --- pandas/io/parquet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 92109918b5005..6ae367c912873 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -316,8 +316,8 @@ def read( parquet_kwargs["pandas_nulls"] = False if use_nullable_dtypes: raise ValueError( - "The 'use_nullable_dtypes' argument is not supported for the" - " fastparquet engine" + "The 'use_nullable_dtypes' argument is not supported for the " + "fastparquet engine" ) path = stringify_path(path) handles = None From 9edfd8cd33867a2d763caf071e20ff4f006a67ea Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 9 Aug 2021 13:15:09 -0700 Subject: [PATCH 8/9] typing? --- pandas/io/parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 6ae367c912873..ef02be8fa77a6 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -309,7 +309,7 @@ def write( def read( self, path, columns=None, storage_options: StorageOptions = None, **kwargs ): - parquet_kwargs = {} + parquet_kwargs: dict[bool, Any] = {} use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False) if Version(self.api.__version__) >= Version("0.7.1"): # We are disabling nullable dtypes for fastparquet pending discussion From acf36df561cf6853e726533a10b1ad8970e0adf4 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 9 Aug 2021 15:48:13 -0700 Subject: [PATCH 9/9] typing? --- pandas/io/parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index ef02be8fa77a6..49384cfb2e554 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -309,7 +309,7 @@ def write( def read( self, path, columns=None, storage_options: StorageOptions = None, **kwargs ): - parquet_kwargs: dict[bool, Any] = {} + parquet_kwargs: dict[str, Any] = {} use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False) if Version(self.api.__version__) >= Version("0.7.1"): # We are disabling nullable dtypes for fastparquet pending discussion