diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index e860d59f2e5bd..51be96542ee38 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -272,6 +272,7 @@ Bug fixes Categorical ^^^^^^^^^^^ +- Bug in :meth:`Categorical.__repr__` and :meth:`Series.__repr__`, where :class:`Categorical` objects having categories backed by a :class:`pandas.api.extensions.ExtensionDtype` had null values show up as "NaN" instead of ``ExtensionDtype.na_value`` (:issue:`52681`) - Bug in :meth:`Series.map` , where the value of the ``na_action`` parameter was not used if the series held a :class:`Categorical` (:issue:`22527`). - diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index adb083c16a838..606b247442a43 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1599,6 +1599,11 @@ def _internal_get_values(self): if needs_i8_conversion(self.categories.dtype): return self.categories.take(self._codes, fill_value=NaT) elif is_integer_dtype(self.categories) and -1 in self._codes: + if isinstance(self.categories.dtype, ExtensionDtype): + # Nullable integer dtype + # Don't astype to object + fill_value = self.categories.dtype.na_value + return self.categories.take(self._codes, fill_value=fill_value) return self.categories.astype("object").take(self._codes, fill_value=np.nan) return np.array(self) @@ -1911,14 +1916,18 @@ def _formatter(self, boxed: bool = False): # Defer to CategoricalFormatter's formatter. 
return None - def _tidy_repr(self, max_vals: int = 10, footer: bool = True) -> str: + def _tidy_repr( + self, max_vals: int = 10, footer: bool = True, na_rep: str = "NaN" + ) -> str: """ a short repr displaying only max_vals and an optional (but default footer) """ num = max_vals // 2 - head = self[:num]._get_repr(length=False, footer=False) - tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False) + head = self[:num]._get_repr(length=False, footer=False, na_rep=na_rep) + tail = self[-(max_vals - num) :]._get_repr( + length=False, footer=False, na_rep=na_rep + ) result = f"{head[:-1]}, ..., {tail[1:]}" if footer: @@ -2001,12 +2010,19 @@ def __repr__(self) -> str: String representation. """ _maxlen = 10 + na_repr = "NaN" + if isinstance(self.categories.dtype, ExtensionDtype): + # np.nan should show up as NaN, not as nan + if self.categories.dtype.na_value is not np.nan: + na_repr = repr(self.categories.dtype.na_value) if len(self._codes) > _maxlen: - result = self._tidy_repr(_maxlen) + result = self._tidy_repr(_maxlen, na_rep=na_repr) elif len(self._codes) > 0: - result = self._get_repr(length=len(self) > _maxlen) + result = self._get_repr(length=len(self) > _maxlen, na_rep=na_repr) else: - msg = self._get_repr(length=False, footer=True).replace("\n", ", ") + msg = self._get_repr(length=False, footer=True, na_rep=na_repr).replace( + "\n", ", " + ) result = f"[], {msg}" return result diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index ffc44b30a3870..ffb8162b7964b 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -1,15 +1,19 @@ import numpy as np +import pytest from pandas import ( + NA, Categorical, CategoricalDtype, CategoricalIndex, Series, + array, date_range, option_context, period_range, timedelta_range, ) +import pandas._testing as tm class TestCategoricalReprWithFactor: @@ -253,6 +257,19 @@ def 
test_categorical_repr_int_with_nan(self): Categories (2, int64): [1, 2]""" assert repr(s) == s_exp + @pytest.mark.parametrize("values_dtype", tm.ALL_INT_EA_DTYPES) + def test_categorical_repr_nullable_int_NA(self, values_dtype): + arr = array([1, 2, np.nan], dtype=values_dtype) + c = Categorical(arr) + c_exp = f"""[1, 2, {NA}]\nCategories (2, {values_dtype}): [1, 2]""" + assert repr(c) == c_exp + + s = Series([1, 2, np.nan], dtype=values_dtype).astype("category") + s_exp = f"""0 1\n1 2\n2 +dtype: category +Categories (2, {values_dtype}): [1, 2]""" + assert repr(s) == s_exp + def test_categorical_repr_period(self): idx = period_range("2011-01-01 09:00", freq="H", periods=5) c = Categorical(idx)