diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index b1c6172fb1261..9f654060d1c5b 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -176,6 +176,7 @@ Computations / descriptive stats DataFrame.std DataFrame.var DataFrame.nunique + DataFrame.value_counts Reindexing / selection / label manipulation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 2aa06f5c1c584..fe8bd50eb4263 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -78,7 +78,7 @@ Other API changes ^^^^^^^^^^^^^^^^^ - :meth:`pandas.api.types.infer_dtype` will now return "integer-na" for integer and ``np.nan`` mix (:issue:`27283`) -- +- Added :meth:`pandas.core.frame.DataFrame.value_counts` (:issue:`5377`). - .. _whatsnew_1000.deprecations: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c12208db983e2..c83331c84afc5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -90,7 +90,7 @@ from pandas.core.index import Index, ensure_index, ensure_index_from_sequences from pandas.core.indexes import base as ibase from pandas.core.indexes.datetimes import DatetimeIndex -from pandas.core.indexes.multi import maybe_droplevels +from pandas.core.indexes.multi import MultiIndex, maybe_droplevels from pandas.core.indexes.period import PeriodIndex from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable from pandas.core.internals import BlockManager @@ -8455,6 +8455,126 @@ def isin(self, values): self.columns, ) + def value_counts( + self, normalize=False, sort=True, ascending=False, bins=None, dropna=True + ): + """ + Return a Series containing counts of unique rows in the DataFrame. + + .. versionadded:: 1.0.0 + + The returned Series will have a MultiIndex with one level per input + column. + + By default, rows that contain any NaN value are omitted from the + results. 
+ + By default, the resulting series will be in descending order so that the + first element is the most frequently-occurring row. + + Parameters + ---------- + normalize : boolean, default False + If True then the Series returned will contain the relative + frequencies of the unique values. + sort : boolean, default True + Sort by frequencies. + ascending : boolean, default False + Sort in ascending order. + bins : integer, optional + This parameter is not yet supported and must be set to None (the + default value). It exists to ensure compatibility with + `Series.value_counts`. + + Rather than count values, group them into half-open bins, + a convenience for ``pd.cut``, only works with single-column numeric + data. + dropna : boolean, default True + This parameter is not yet supported and must be set to True (the + default value). It exists to ensure compatibility with + `Series.value_counts`. + + Don't include counts of rows containing NaN. + + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts: Equivalent method on Series. + + Examples + -------- + + >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6], + ... 'num_wings': [2, 0, 0, 0]}, + ... index=['falcon', 'dog', 'cat', 'ant']) + >>> df + num_legs num_wings + falcon 2 2 + dog 4 0 + cat 4 0 + ant 6 0 + + >>> df.value_counts() + num_legs num_wings + 4 0 2 + 6 0 1 + 2 2 1 + dtype: int64 + + >>> df.value_counts(sort=False) + num_legs num_wings + 2 2 1 + 4 0 2 + 6 0 1 + dtype: int64 + + >>> df.value_counts(ascending=True) + num_legs num_wings + 2 2 1 + 6 0 1 + 4 0 2 + dtype: int64 + + >>> df.value_counts(normalize=True) + num_legs num_wings + 4 0 0.50 + 6 0 0.25 + 2 2 0.25 + dtype: float64 + + >>> single_col_df = df[['num_legs']] + >>> single_col_df.value_counts(bins=4) + num_legs + (3.0, 4.0] 2 + (5.0, 6.0] 1 + (1.995, 3.0] 1 + (4.0, 5.0] 0 + dtype: int64 + """ + + # Some features not supported yet. 
+ if not dropna: + raise NotImplementedError( + "`dropna=False` not yet supported for dataframes." + ) + if bins is not None: + raise NotImplementedError( + "`bins` parameter not yet supported for dataframes." + ) + + counts = self.groupby(self.columns.tolist()).size() + if sort: + counts.sort_values(ascending=ascending, inplace=True) + if normalize: + counts /= counts.sum() + # Force MultiIndex index. + if len(self.columns) == 1: + counts.index = MultiIndex.from_arrays([counts.index]) + return counts + # ---------------------------------------------------------------------- # Add plotting methods to DataFrame plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index e99208ac78e15..5157d9d4edb27 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -2766,3 +2766,109 @@ def test_multiindex_column_lookup(self): result = df.nlargest(3, ("x", "b")) expected = df.iloc[[3, 2, 1]] tm.assert_frame_equal(result, expected) + + def test_data_frame_value_counts_unsorted(self): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + result = df.value_counts(sort=False) + expected = pd.Series( + data=[1, 2, 1], + index=pd.MultiIndex.from_arrays( + [(2, 4, 6), (2, 0, 0)], names=["num_legs", "num_wings"] + ), + ) + tm.assert_series_equal(result, expected) + + def test_data_frame_value_counts_ascending(self): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + result = df.value_counts(ascending=True) + expected = pd.Series( + data=[1, 1, 2], + index=pd.MultiIndex.from_arrays( + [(2, 6, 4), (2, 0, 0)], names=["num_legs", "num_wings"] + ), + ) + tm.assert_series_equal(result, expected) + + def test_data_frame_value_counts_default(self): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 
0]}, + index=["falcon", "dog", "cat", "ant"], + ) + result = df.value_counts() + expected = pd.Series( + data=[2, 1, 1], + index=pd.MultiIndex.from_arrays( + [(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"] + ), + ) + tm.assert_series_equal(result, expected) + + def test_data_frame_value_counts_normalize(self): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + result = df.value_counts(normalize=True) + expected = pd.Series( + data=[0.5, 0.25, 0.25], + index=pd.MultiIndex.from_arrays( + [(4, 6, 2), (0, 0, 2)], names=["num_legs", "num_wings"] + ), + ) + tm.assert_series_equal(result, expected) + + def test_data_frame_value_counts_dropna_not_supported_yet(self): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + with pytest.raises(NotImplementedError, match="not yet supported"): + df.value_counts(dropna=False) + + def test_data_frame_value_counts_bins_not_supported(self): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + with pytest.raises(NotImplementedError, match="not yet supported"): + df.value_counts(bins=2) + + def test_data_frame_value_counts_single_col_default(self): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + df_single_col = df[["num_legs"]] + result = df_single_col.value_counts() + expected = pd.Series( + data=[2, 1, 1], + index=pd.MultiIndex.from_arrays([[4, 6, 2]], names=["num_legs"]), + ) + tm.assert_series_equal(result, expected) + + def test_data_frame_value_counts_single_col_bins(self): + df = pd.DataFrame( + {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, + index=["falcon", "dog", "cat", "ant"], + ) + df_single_col = df[["num_legs"]] + with pytest.raises(NotImplementedError, match="not yet supported"): + _ = 
df_single_col.value_counts(bins=4) + + def test_data_frame_value_counts_empty(self): + df_no_cols = pd.DataFrame() + result = df_no_cols.value_counts() + expected = pd.Series([], dtype=np.int64) + tm.assert_series_equal(result, expected) + + def test_data_frame_value_counts_empty_normalize(self): + df_no_cols = pd.DataFrame() + result = df_no_cols.value_counts(normalize=True) + expected = pd.Series([], dtype=np.float64) + tm.assert_series_equal(result, expected)