diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 89a9da4a73b35..cee1778e05bb8 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -253,6 +253,34 @@ are returned. (:issue:`21521`) df.groupby("a").ffill() +``DataFrame`` describe on an empty categorical / object column will return top and freq +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When calling :meth:`DataFrame.describe` with an empty categorical / object +column, the 'top' and 'freq' rows were previously omitted, which was inconsistent with +the output for non-empty columns. Now the 'top' and 'freq' rows will always be included, +with :attr:`numpy.nan` in the case of an empty :class:`DataFrame` (:issue:`26397`) + +.. ipython:: python + + df = pd.DataFrame({"empty_col": pd.Categorical([])}) + df + +*Previous Behavior*: + +.. code-block:: python + + In [3]: df.describe() + Out[3]: + empty_col + count 0 + unique 0 + +*New Behavior*: + +.. 
ipython:: python + + df.describe() ``__str__`` methods now call ``__repr__`` rather than vica-versa ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 89b86c66d7b05..65c6babff38ba 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1478,7 +1478,7 @@ def value_counts(self, dropna=True): if dropna or clean: obs = code if clean else code[mask] - count = bincount(obs, minlength=ncat or None) + count = bincount(obs, minlength=ncat or 0) else: count = bincount(np.where(mask, code, ncat)) ix = np.append(ix, -1) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0596d0ab844ec..7ca2c52e18c41 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9920,6 +9920,12 @@ def describe_categorical_1d(data): names += ['top', 'freq'] result += [top, freq] + # If the DataFrame is empty, set 'top' and 'freq' to None + # to maintain output shape consistency + else: + names += ['top', 'freq'] + result += [None, None] + return pd.Series(result, index=names, name=data.name) def describe_1d(data): diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index effe7eb47323d..487ff7932ec5f 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -588,6 +588,16 @@ def test_describe_categorical(self): result = df3.describe() tm.assert_numpy_array_equal(result["cat"].values, result["s"].values) + def test_describe_empty_categorical_column(self): + # GH 26397 + # Ensure the index of an empty categorical DataFrame column + # also contains (count, unique, top, freq) + df = pd.DataFrame({"empty_col": Categorical([])}) + result = df.describe() + expected = DataFrame({'empty_col': [0, 0, None, None]}, + index=['count', 'unique', 'top', 'freq']) + tm.assert_frame_equal(result, expected) + def test_describe_categorical_columns(self): # GH 11558 columns = 
pd.CategoricalIndex(['int1', 'int2', 'obj'], @@ -608,6 +618,7 @@ def test_describe_categorical_columns(self): index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'], columns=exp_columns) + tm.assert_frame_equal(result, expected) tm.assert_categorical_equal(result.columns.values, expected.columns.values)