From 9d415ea3a750f8de0bf2c89d46be7315d86ccdb2 Mon Sep 17 00:00:00 2001 From: veljanin Date: Wed, 2 Oct 2024 09:56:12 +0200 Subject: [PATCH 1/9] Handling the case where converting empty categorical to 'pyarrow' dtype_backend results in error. Since conversion of non-empty categorical returns categorical of 'numpy_nullable' dtype_backend, now, instead of raising an error, we ensure empty categorical is returned as well. --- pandas/core/dtypes/cast.py | 8 ++++++-- .../tests/frame/methods/test_convert_dtypes.py | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index f11aefeeaaa00..9c0b61c542c65 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1145,12 +1145,16 @@ def convert_dtypes( and isna(input_array).all() ): import pyarrow as pa - + pa_type = pa.null() else: pa_type = to_pyarrow_type(base_dtype) if pa_type is not None: - inferred_dtype = ArrowDtype(pa_type) + if isna(input_array).all() and hasattr(input_array, 'categories'): + inferred_dtype = input_array.dtype + else: + inferred_dtype = ArrowDtype(pa_type) + elif dtype_backend == "numpy_nullable" and isinstance(inferred_dtype, ArrowDtype): # GH 53648 inferred_dtype = _arrow_dtype_mapping()[inferred_dtype.pyarrow_dtype] diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index d0f30204758d3..96ecf552c6fea 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -34,6 +34,22 @@ def test_convert_empty(self): # Empty DataFrame can pass convert_dtypes, see GH#40393 empty_df = pd.DataFrame() tm.assert_frame_equal(empty_df, empty_df.convert_dtypes()) + + def test_convert_empty_categorical_to_pyarrow(self): + df = pd.DataFrame( + { + "A": pd.Series(pd.Categorical([None] * 5)), + "B": pd.Series(pd.Categorical([None] * 5, categories=["B1", "B2"])), + } + ) + converted = 
df.convert_dtypes(dtype_backend="pyarrow") + expected = df + tm.assert_frame_equal(converted, expected) + + assert df.A.dtype == "category", "Dtype in column A is not 'category'" + assert df.B.dtype == "category", "Dtype in column B is not 'category'" + assert df.A.cat.categories.empty, "Categories in column A are not empty" + assert (df.B.cat.categories == ["B1", "B2"]).all(), "Categories in column A are not empty" def test_convert_dtypes_retain_column_names(self): # GH#41435 From e7c49118deafed8d153539cc6a013240fc023ed2 Mon Sep 17 00:00:00 2001 From: veljanin Date: Fri, 4 Oct 2024 11:48:40 +0200 Subject: [PATCH 2/9] additional revisions --- pandas/core/dtypes/cast.py | 5 ++-- .../frame/methods/test_convert_dtypes.py | 23 +++++++++++-------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 9c0b61c542c65..805cf3520c6d6 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1143,14 +1143,15 @@ def convert_dtypes( base_dtype.kind == "O" # type: ignore[union-attr] and input_array.size > 0 and isna(input_array).all() + and not isinstance(input_array.dtype, CategoricalDtype) ): import pyarrow as pa - + pa_type = pa.null() else: pa_type = to_pyarrow_type(base_dtype) if pa_type is not None: - if isna(input_array).all() and hasattr(input_array, 'categories'): + if isna(input_array).all() and hasattr(input_array, "categories"): inferred_dtype = input_array.dtype else: inferred_dtype = ArrowDtype(pa_type) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 96ecf552c6fea..8bceba7ec7ab4 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -34,22 +34,27 @@ def test_convert_empty(self): # Empty DataFrame can pass convert_dtypes, see GH#40393 empty_df = pd.DataFrame() tm.assert_frame_equal(empty_df, empty_df.convert_dtypes()) - + def 
test_convert_empty_categorical_to_pyarrow(self): + # GH#59934 df = pd.DataFrame( { - "A": pd.Series(pd.Categorical([None] * 5)), - "B": pd.Series(pd.Categorical([None] * 5, categories=["B1", "B2"])), - } + "A": pd.Categorical([None] * 5), + "B": pd.Categorical([None] * 5, categories=["B1", "B2"]), + } ) converted = df.convert_dtypes(dtype_backend="pyarrow") expected = df tm.assert_frame_equal(converted, expected) - - assert df.A.dtype == "category", "Dtype in column A is not 'category'" - assert df.B.dtype == "category", "Dtype in column B is not 'category'" - assert df.A.cat.categories.empty, "Categories in column A are not empty" - assert (df.B.cat.categories == ["B1", "B2"]).all(), "Categories in column A are not empty" + + assert converted.A.dtype == "category", "Dtype in column A is not 'category'" + assert converted.B.dtype == "category", "Dtype in column B is not 'category'" + assert converted.A.cat.categories.empty, "Categories in column A are not empty" + assert converted.B.cat.categories.__contains__( + "B1" + ) and converted.B.cat.categories.__contains__( + "B2" + ), "Categories in column B doesn't contain adequate categories" def test_convert_dtypes_retain_column_names(self): # GH#41435 From 02218af49e41111725f563e84e5251c469b76bab Mon Sep 17 00:00:00 2001 From: veljanin Date: Fri, 4 Oct 2024 11:53:11 +0200 Subject: [PATCH 3/9] removing the change for input_array... 
--- pandas/core/dtypes/cast.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 805cf3520c6d6..9f98040cfe987 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1151,10 +1151,7 @@ def convert_dtypes( else: pa_type = to_pyarrow_type(base_dtype) if pa_type is not None: - if isna(input_array).all() and hasattr(input_array, "categories"): - inferred_dtype = input_array.dtype - else: - inferred_dtype = ArrowDtype(pa_type) + inferred_dtype = ArrowDtype(pa_type) elif dtype_backend == "numpy_nullable" and isinstance(inferred_dtype, ArrowDtype): # GH 53648 From 67732812dd43981ca8020ed91e6a845539b05dd8 Mon Sep 17 00:00:00 2001 From: veljanin Date: Mon, 7 Oct 2024 08:20:04 +0200 Subject: [PATCH 4/9] reverting newline in Series.convert_dtypes and clarifying respective docs in whatsnew --- pandas/core/dtypes/cast.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 9f98040cfe987..e703e7092070d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1152,7 +1152,6 @@ def convert_dtypes( pa_type = to_pyarrow_type(base_dtype) if pa_type is not None: inferred_dtype = ArrowDtype(pa_type) - elif dtype_backend == "numpy_nullable" and isinstance(inferred_dtype, ArrowDtype): # GH 53648 inferred_dtype = _arrow_dtype_mapping()[inferred_dtype.pyarrow_dtype] From 2ca6467262a40296ef230c04d92a762dca6f7d97 Mon Sep 17 00:00:00 2001 From: veljanin Date: Mon, 14 Oct 2024 12:34:05 +0200 Subject: [PATCH 5/9] revised testing to resolve CI errors v2 --- pandas/tests/frame/methods/test_convert_dtypes.py | 10 +++++----- pandas/tests/series/methods/test_convert_dtypes.py | 1 + 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 8bceba7ec7ab4..215dc9bc68972 100644 --- 
a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm @@ -50,11 +52,9 @@ def test_convert_empty_categorical_to_pyarrow(self): assert converted.A.dtype == "category", "Dtype in column A is not 'category'" assert converted.B.dtype == "category", "Dtype in column B is not 'category'" assert converted.A.cat.categories.empty, "Categories in column A are not empty" - assert converted.B.cat.categories.__contains__( - "B1" - ) and converted.B.cat.categories.__contains__( - "B2" - ), "Categories in column B doesn't contain adequate categories" + assert converted.B.cat.categories.isin( + ["B1", "B2"] + ).all(), "Categories in column B doesn't contain adequate categories" def test_convert_dtypes_retain_column_names(self): # GH#41435 diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index d373386108ff6..9bc02b46400e2 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -4,6 +4,7 @@ import pytest from pandas._libs import lib +import pandas.util._test_decorators as td import pandas as pd import pandas._testing as tm From 33cd18b0c5289a6e9360226c4b81302cee873d23 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Sat, 15 Mar 2025 23:37:28 +0200 Subject: [PATCH 6/9] Finish the pr --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/tests/frame/methods/test_convert_dtypes.py | 8 +------- pandas/tests/series/methods/test_convert_dtypes.py | 13 +++++++++++++ 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 873c1e7cd41cc..8d2f02a90602e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -636,6 +636,7 @@ Bug fixes Categorical ^^^^^^^^^^^ - Bug in 
:func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`) +- Bug in :meth:`Series.convert_dtypes` with ``dtype_backend="pyarrow"`` where empty categorical series raise error or get converted to ``null[pyarrow]`` (:issue:`59934`) - Datetimelike diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 215dc9bc68972..ab847e2f8e81e 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -37,6 +37,7 @@ def test_convert_empty(self): empty_df = pd.DataFrame() tm.assert_frame_equal(empty_df, empty_df.convert_dtypes()) + @td.skip_if_no("pyarrow") def test_convert_empty_categorical_to_pyarrow(self): # GH#59934 df = pd.DataFrame( @@ -49,13 +50,6 @@ def test_convert_empty_categorical_to_pyarrow(self): expected = df tm.assert_frame_equal(converted, expected) - assert converted.A.dtype == "category", "Dtype in column A is not 'category'" - assert converted.B.dtype == "category", "Dtype in column B is not 'category'" - assert converted.A.cat.categories.empty, "Categories in column A are not empty" - assert converted.B.cat.categories.isin( - ["B1", "B2"] - ).all(), "Categories in column B doesn't contain adequate categories" - def test_convert_dtypes_retain_column_names(self): # GH#41435 df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 9bc02b46400e2..177921a6064bf 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -299,6 +299,19 @@ def test_convert_dtypes_pyarrow_null(self): expected = pd.Series([None, None], dtype=pd.ArrowDtype(pa.null())) tm.assert_series_equal(result, expected) + @td.skip_if_no("pyarrow") + def test_convert_empty_categorical_to_pyarrow(self): + # GH#59934 + ser1 = pd.Series(pd.Categorical([None] * 5)) + 
converted1 = ser1.convert_dtypes(dtype_backend="pyarrow") + expected = ser1 + tm.assert_series_equal(converted1, expected) + + ser2 = pd.Series(pd.Categorical([None] * 5, categories=["S1", "S2"])) + converted2 = ser2.convert_dtypes(dtype_backend="pyarrow") + expected = ser2 + tm.assert_series_equal(converted2, expected) + def test_convert_dtype_pyarrow_timezone_preserve(self): # GH 60237 pytest.importorskip("pyarrow") From 0b29d8272d4a0b42bc89f6e5b754eb58f14fd5ab Mon Sep 17 00:00:00 2001 From: Xiao Yuan Date: Mon, 17 Mar 2025 20:07:56 +0200 Subject: [PATCH 7/9] Update doc/source/whatsnew/v3.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 8d2f02a90602e..89b75fbdb0801 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -636,7 +636,7 @@ Bug fixes Categorical ^^^^^^^^^^^ - Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`) -- Bug in :meth:`Series.convert_dtypes` with ``dtype_backend="pyarrow"`` where empty categorical series raise error or get converted to ``null[pyarrow]`` (:issue:`59934`) +- Bug in :meth:`Series.convert_dtypes` with ``dtype_backend="pyarrow"`` where empty :class:`CategoricalDtype` :class:`Series` raised an error or got converted to ``null[pyarrow]`` (:issue:`59934`) - Datetimelike From 7c00c66dafac99a4e940ffb1f465e9712ff6d96a Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Mon, 17 Mar 2025 20:19:23 +0200 Subject: [PATCH 8/9] parameterize test --- .../tests/series/methods/test_convert_dtypes.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 177921a6064bf..324e03894e92c 100644 --- 
a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -300,17 +300,13 @@ def test_convert_dtypes_pyarrow_null(self): tm.assert_series_equal(result, expected) @td.skip_if_no("pyarrow") - def test_convert_empty_categorical_to_pyarrow(self): + @pytest.mark.parametrize("categories", [None, ["S1", "S2"]]) + def test_convert_empty_categorical_to_pyarrow(self, categories): # GH#59934 - ser1 = pd.Series(pd.Categorical([None] * 5)) - converted1 = ser1.convert_dtypes(dtype_backend="pyarrow") - expected = ser1 - tm.assert_series_equal(converted1, expected) - - ser2 = pd.Series(pd.Categorical([None] * 5, categories=["S1", "S2"])) - converted2 = ser2.convert_dtypes(dtype_backend="pyarrow") - expected = ser2 - tm.assert_series_equal(converted2, expected) + ser = pd.Series(pd.Categorical([None] * 5, categories=categories)) + converted = ser.convert_dtypes(dtype_backend="pyarrow") + expected = ser + tm.assert_series_equal(converted, expected) def test_convert_dtype_pyarrow_timezone_preserve(self): # GH 60237 From 90d62510bafa52e6978ae9daba6a90700d97eff3 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Mon, 17 Mar 2025 20:23:56 +0200 Subject: [PATCH 9/9] move condition --- pandas/core/dtypes/cast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index e703e7092070d..dae04ba6244d4 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1127,6 +1127,7 @@ def convert_dtypes( or ( inferred_dtype.kind not in "iufcb" and not isinstance(inferred_dtype, StringDtype) + and not isinstance(inferred_dtype, CategoricalDtype) ) ): if isinstance(inferred_dtype, PandasExtensionDtype) and not isinstance( @@ -1143,7 +1144,6 @@ def convert_dtypes( base_dtype.kind == "O" # type: ignore[union-attr] and input_array.size > 0 and isna(input_array).all() - and not isinstance(input_array.dtype, CategoricalDtype) ): import pyarrow as pa