diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index adbe73aa5c5ef..9f491302a4d6f 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -433,6 +433,20 @@ def time_frame_from_records_generator_nrows(self): +#----------------------------------------------------------------------------- +# nunique + +class frame_nunique(object): + + def setup(self): + self.data = np.random.randn(10000, 1000) + self.df = DataFrame(self.data) + + def time_frame_nunique(self): + self.df.nunique() + + + #----------------------------------------------------------------------------- # duplicated diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index ad58cd0fc6d70..fa68b122cc98d 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -251,6 +251,22 @@ def time_groupby_int_count(self): self.df.groupby(['key1', 'key2']).count() +#---------------------------------------------------------------------- +# nunique() speed + +class groupby_nunique(object): + + def setup(self): + self.n = 10000 + self.df = DataFrame({'key1': randint(0, 500, size=self.n), + 'key2': randint(0, 100, size=self.n), + 'ints': randint(0, 1000, size=self.n), + 'ints2': randint(0, 1000, size=self.n), }) + + def time_groupby_nunique(self): + self.df.groupby(['key1', 'key2']).nunique() + + #---------------------------------------------------------------------- # group with different functions per column diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 0873e4b34b0b1..a5b95dfc4f3f7 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -97,6 +97,9 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`) +- ``DataFrame`` has gained a ``nunique()`` method as a short-cut for ``.apply(lambda x: x.nunique())`` (counting the distinct 
values over an axis) (:issue:`14336`). +- New ``DataFrame.groupby().nunique()`` method as a short-cut for ``.apply(lambda g: g.apply(lambda x: x.nunique()))`` (counting the distinct values for all columns within each group) (:issue:`14336`). + - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) - Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d12b8af35469b..b872f35277402 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4969,6 +4969,38 @@ def f(x): return Series(result, index=labels) + def nunique(self, axis=0, dropna=True): + """ + Return Series with number of distinct observations over requested + axis. + + .. versionadded:: 0.20.0 + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + 0 or 'index' for row-wise, 1 or 'columns' for column-wise + dropna : boolean, default True + Don't include NaN in the counts. + + Returns + ------- + nunique : Series + + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]}) + >>> df.nunique() + A 3 + B 1 + + >>> df.nunique(axis=1) + 0 1 + 1 2 + 2 2 + """ + return self.apply(Series.nunique, axis=axis, dropna=dropna) + def idxmin(self, axis=0, skipna=True): """ Return index of first occurrence of minimum over requested axis. diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 7eba32b4932d0..ea361afdc3a60 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3899,6 +3899,54 @@ def count(self): return self._wrap_agged_blocks(data.items, list(blk)) + def nunique(self, dropna=True): + """ + Return DataFrame with number of distinct observations per group for + each column. + + .. versionadded:: 0.20.0 + + Parameters + ---------- + dropna : boolean, default True + Don't include NaN in the counts. 
+ + Returns + ------- + nunique : DataFrame + + Examples + -------- + >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam', + ... 'ham', 'ham'], + ... 'value1': [1, 5, 5, 2, 5, 5], + ... 'value2': list('abbaxy')}) + >>> df + id value1 value2 + 0 spam 1 a + 1 egg 5 b + 2 egg 5 b + 3 spam 2 a + 4 ham 5 x + 5 ham 5 y + + >>> df.groupby('id').nunique() + id value1 value2 + id + egg 1 1 1 + ham 1 1 2 + spam 1 2 1 + + # check for rows with the same id but conflicting values + >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any()) + id value1 value2 + 0 spam 1 a + 3 spam 2 a + 4 ham 5 x + 5 ham 5 y + """ + return self.apply(lambda g: g.apply(Series.nunique, dropna=dropna)) + from pandas.tools.plotting import boxplot_frame_groupby # noqa DataFrameGroupBy.boxplot = boxplot_frame_groupby diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index f6081e14d4081..4af26c3d0fdc0 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -16,6 +16,7 @@ MultiIndex, date_range, Timestamp) import pandas as pd import pandas.core.nanops as nanops +import pandas.core.algorithms as algorithms import pandas.formats.printing as printing import pandas.util.testing as tm @@ -410,6 +411,21 @@ def test_count(self): expected = Series(0, index=[]) tm.assert_series_equal(result, expected) + def test_nunique(self): + f = lambda s: len(algorithms.unique1d(s.dropna())) + self._check_stat_op('nunique', f, has_skipna=False, + check_dtype=False, check_dates=True) + + df = DataFrame({'A': [1, 1, 1], + 'B': [1, 2, 3], + 'C': [1, np.nan, 3]}) + tm.assert_series_equal(df.nunique(), Series({'A': 1, 'B': 3, 'C': 2})) + tm.assert_series_equal(df.nunique(dropna=False), + Series({'A': 1, 'B': 3, 'C': 3})) + tm.assert_series_equal(df.nunique(axis=1), Series({0: 1, 1: 2, 2: 2})) + tm.assert_series_equal(df.nunique(axis=1, dropna=False), + Series({0: 1, 1: 3, 2: 2})) + def test_sum(self): self._check_stat_op('sum', np.sum, 
has_numeric_only=True) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e87b5d04271e8..5cab941e74ce5 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2800,6 +2800,34 @@ def test_count_cross_type(self): # GH8169 result = df.groupby(['c', 'd']).count() tm.assert_frame_equal(result, expected) + def test_nunique(self): + df = DataFrame({ + 'A': list('abbacc'), + 'B': list('abxacc'), + 'C': list('abbacx'), + }) + + expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]}) + result = df.groupby('A', as_index=False).nunique() + tm.assert_frame_equal(result, expected) + + # as_index + expected.index = list('abc') + expected.index.name = 'A' + result = df.groupby('A').nunique() + tm.assert_frame_equal(result, expected) + + # with na + result = df.replace({'x': None}).groupby('A').nunique(dropna=False) + tm.assert_frame_equal(result, expected) + + # dropna + expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3}, + index=list('abc')) + expected.index.name = 'A' + result = df.replace({'x': None}).groupby('A').nunique() + tm.assert_frame_equal(result, expected) + def test_non_cython_api(self): # GH5610 @@ -5150,11 +5178,11 @@ def test_tab_completion(self): 'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max', 'mean', 'median', 'min', 'name', 'ngroups', 'nth', 'ohlc', 'plot', 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', - 'head', 'irow', 'describe', 'cummax', 'quantile', 'rank', - 'cumprod', 'tail', 'resample', 'cummin', 'fillna', 'cumsum', - 'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill', 'take', - 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', 'cov', - 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', + 'nunique', 'head', 'irow', 'describe', 'cummax', 'quantile', + 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna', + 'cumsum', 'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill', + 'take', 'tshift', 
'pct_change', 'any', 'mad', 'corr', 'corrwith', + 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding']) self.assertEqual(results, expected)