diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index ac7d30310be9e..f63563472f1f2 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1088,6 +1088,7 @@ Removal of prior version deprecations/changes - Arguments after ``expr`` in :meth:`DataFrame.eval` and :meth:`DataFrame.query` are keyword-only (:issue:`47587`) - Removed :meth:`Index._get_attributes_dict` (:issue:`50648`) - Removed :meth:`Series.__array_wrap__` (:issue:`50648`) +- Changed behavior of :meth:`.DataFrame.value_counts` to return a :class:`Series` with a :class:`MultiIndex` for any list-like ``subset`` (one element or not) but an :class:`Index` for a single label (:issue:`50829`) .. --------------------------------------------------------------------------- .. _whatsnew_200.performance: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4da8cb975b975..2520f54ae3d99 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6957,7 +6957,7 @@ def value_counts( Parameters ---------- - subset : list-like, optional + subset : label or list of labels, optional Columns to use when counting unique combinations. normalize : bool, default False Return proportions rather than frequencies. @@ -6981,9 +6981,10 @@ def value_counts( Notes ----- The returned Series will have a MultiIndex with one level per input - column. By default, rows that contain any NA values are omitted from - the result. By default, the resulting Series will be in descending - order so that the first element is the most frequently-occurring row. + column but an Index (non-multi) for a single label. By default, rows + that contain any NA values are omitted from the result. By default, + the resulting Series will be in descending order so that the first + element is the most frequently-occurring row. 
Examples -------- @@ -7049,6 +7050,13 @@ def value_counts( John Smith 1 NaN 1 Name: count, dtype: int64 + + >>> df.value_counts("first_name") + first_name + John 2 + Anne 1 + Beth 1 + Name: count, dtype: int64 """ if subset is None: subset = self.columns.tolist() @@ -7063,7 +7071,7 @@ def value_counts( counts /= counts.sum() # Force MultiIndex for single column - if len(subset) == 1: + if is_list_like(subset) and len(subset) == 1: counts.index = MultiIndex.from_arrays( [counts.index], names=[counts.index.name] ) diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index e8c129fd12bfd..355f05cd5156c 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import pandas as pd import pandas._testing as tm @@ -155,3 +156,22 @@ def test_data_frame_value_counts_dropna_false(nulls_fixture): ) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("columns", (["first_name", "middle_name"], [0, 1])) +def test_data_frame_value_counts_subset(nulls_fixture, columns): + # GH 50829 + df = pd.DataFrame( + { + columns[0]: ["John", "Anne", "John", "Beth"], + columns[1]: ["Smith", nulls_fixture, nulls_fixture, "Louise"], + }, + ) + result = df.value_counts(columns[0]) + expected = pd.Series( + data=[2, 1, 1], + index=pd.Index(["John", "Anne", "Beth"], name=columns[0]), + name="count", + ) + + tm.assert_series_equal(result, expected)