From 5c4b3256eed9ad9745f324932f754818d02c9dc5 Mon Sep 17 00:00:00 2001 From: Sebastian Bank Date: Fri, 7 Oct 2016 19:59:07 +0200 Subject: [PATCH 1/4] API: add DataFrame.nunique() and DataFrameGroupBy.nunique() --- doc/source/whatsnew/v0.20.0.txt | 3 +++ pandas/core/frame.py | 21 +++++++++++++++++++++ pandas/core/groupby.py | 15 +++++++++++++++ pandas/tests/frame/test_analytics.py | 5 +++++ pandas/tests/groupby/test_groupby.py | 20 +++++++++++++++----- 5 files changed, 59 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 0873e4b34b0b1..a5b95dfc4f3f7 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -97,6 +97,9 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`) +- ``DataFrame`` has gained a ``nunique()`` method as short-cut for ``.apply(lambda x: x.nunique())`` (counting the distinct values over an axis) (:issue:`14336`). +- New ``DataFrame.groupby().nunique()`` method as short-cut for ``.apply(lambda g: g.apply(lambda x: x.nunique()))`` (counting the distinct values for all columns within each group) (:issue:`14336`). + - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`) - Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d12b8af35469b..1ea9208d45da7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4969,6 +4969,27 @@ def f(x): return Series(result, index=labels) + def nunique(self, axis=0, dropna=True): + """ + Return Series with number of distinct observations over requested + axis. + + .. versionadded:: 0.20.0 + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + 0 or 'index' for row-wise, 1 or 'columns' for column-wise + dropna : boolean, default True + Don't include NaN in the counts. + + Returns + ------- + nunique : Series + """ + func = functools.partial(Series.nunique, dropna=dropna) + return self.apply(func, axis=axis) + def idxmin(self, axis=0, skipna=True): """ Return index of first occurrence of minimum over requested axis. diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 7eba32b4932d0..45b18127286e5 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3899,6 +3899,21 @@ def count(self): return self._wrap_agged_blocks(data.items, list(blk)) + def nunique(self, dropna=True): + """ + Return Series with number of distinct observations per group. + + .. versionadded:: 0.20.0 + + Parameters + ---------- + dropna : boolean, default True + Don't include NaN in the counts. + """ + from functools import partial + func = partial(Series.nunique, dropna=dropna) + return self.apply(lambda g: g.apply(func)) + from pandas.tools.plotting import boxplot_frame_groupby # noqa DataFrameGroupBy.boxplot = boxplot_frame_groupby diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index f6081e14d4081..612a563bd15b3 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -410,6 +410,11 @@ def test_count(self): expected = Series(0, index=[]) tm.assert_series_equal(result, expected) + def test_nunique(self): + f = lambda s: len(nanops.unique1d(s.dropna())) + self._check_stat_op('nunique', f, has_skipna=False, + check_dtype=False, check_dates=True) + def test_sum(self): self._check_stat_op('sum', np.sum, has_numeric_only=True) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e87b5d04271e8..2c117b9503779 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2800,6 +2800,16 @@ def test_count_cross_type(self): # GH8169 result = df.groupby(['c', 'd']).count() tm.assert_frame_equal(result, expected) + def test_nunique(self): + df = DataFrame({ + 'A': list('abbacc'), + 'B': list('abxacc'), + 'C': list('abbacx'), + }) + expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]}) + result = df.groupby('A', as_index=False).nunique() + tm.assert_frame_equal(result, expected) + def test_non_cython_api(self): # GH5610 @@ -5150,11 +5160,11 @@ def test_tab_completion(self): 'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max', 'mean', 'median', 'min', 'name', 'ngroups', 'nth', 'ohlc', 'plot', 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', - 'head', 'irow', 'describe', 'cummax', 'quantile', 'rank', - 'cumprod', 'tail', 'resample', 'cummin', 'fillna', 'cumsum', - 'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill', 'take', - 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', 'cov', - 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', + 'nunique', 'head', 'irow', 'describe', 'cummax', 'quantile', + 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna', + 'cumsum', 'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill', + 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', + 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding']) self.assertEqual(results, expected) From fd0f22d2a126d0003fc42b723084660970f5f0fb Mon Sep 17 00:00:00 2001 From: Sebastian Bank Date: Mon, 31 Oct 2016 18:08:19 +0100 Subject: [PATCH 2/4] add simple benchmarks --- asv_bench/benchmarks/frame_methods.py | 14 ++++++++++++++ asv_bench/benchmarks/groupby.py | 16 ++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index adbe73aa5c5ef..9f491302a4d6f 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -433,6 +433,20 @@ def time_frame_from_records_generator_nrows(self): +#----------------------------------------------------------------------------- +# nunique + +class frame_nunique(object): + + def setup(self): + self.data = np.random.randn(10000, 1000) + self.df = DataFrame(self.data) + + def time_frame_nunique(self): + self.df.nunique() + + + #----------------------------------------------------------------------------- # duplicated diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index ad58cd0fc6d70..fa68b122cc98d 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -251,6 +251,22 @@ def time_groupby_int_count(self): self.df.groupby(['key1', 'key2']).count() +#---------------------------------------------------------------------- +# nunique() speed + +class groupby_nunique(object): + + def setup(self): + self.n = 10000 + self.df = DataFrame({'key1': randint(0, 500, size=self.n), + 'key2': randint(0, 100, size=self.n), + 'ints': randint(0, 1000, size=self.n), + 'ints2': randint(0, 1000, size=self.n), }) + + def time_groupby_nunique(self): + self.df.groupby(['key1', 'key2']).nunique() + + #---------------------------------------------------------------------- # group with different functions per column From c8d3ac4da110a77f975107cc163eabe587972bd7 Mon Sep 17 00:00:00 2001 From: Sebastian Bank Date: Wed, 28 Dec 2016 15:47:51 +0100 Subject: [PATCH 3/4] extend docs and tests --- pandas/core/frame.py | 12 ++++++++++ pandas/core/groupby.py | 35 +++++++++++++++++++++++++++- pandas/tests/frame/test_analytics.py | 3 ++- pandas/tests/groupby/test_groupby.py | 18 ++++++++++++++ 4 files changed, 66 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1ea9208d45da7..bf9fb263093be 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4986,6 +4986,18 @@ def nunique(self, axis=0, dropna=True): Returns ------- nunique : Series + + Examples + -------- + >>> df = DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]}) + >>> df.nunique() + A 3 + B 1 + + >>> df.nunique(axis=1) + 0 1 + 1 2 + 2 2 """ func = functools.partial(Series.nunique, dropna=dropna) return self.apply(func, axis=axis) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 45b18127286e5..ff034118424dc 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3901,7 +3901,8 @@ def count(self): def nunique(self, dropna=True): """ - Return Series with number of distinct observations per group. + Return DataFrame with number of distinct observations per group for + each column. .. versionadded:: 0.20.0 @@ -3909,6 +3910,38 @@ def nunique(self, dropna=True): ---------- dropna : boolean, default True Don't include NaN in the counts. + + Returns + ------- + nunique: DataFrame + + Examples + -------- + >>> df = DataFrame({'id': ['spam', 'egg', 'egg', 'spam', 'ham', 'ham'], + ... 'value1': [1, 5, 5, 2, 5, 5], + ... 'value2': list('abbaxy')}) + >>> df + id value1 value2 + 0 spam 1 a + 1 egg 5 b + 2 egg 5 b + 3 spam 2 a + 4 ham 5 x + 5 ham 5 y + + >>> df.groupby('id').nunique() + id value1 value2 + id + egg 1 1 1 + ham 1 1 2 + spam 1 2 1 + + >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any()) + id value1 value2 + 0 spam 1 a + 3 spam 2 a + 4 ham 5 x + 5 ham 5 y """ from functools import partial func = partial(Series.nunique, dropna=dropna) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 612a563bd15b3..71ab2f46cf6f7 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -16,6 +16,7 @@ MultiIndex, date_range, Timestamp) import pandas as pd import pandas.core.nanops as nanops +import pandas.core.algorithms as algorithms import pandas.formats.printing as printing import pandas.util.testing as tm @@ -411,7 +412,7 @@ def test_count(self): tm.assert_series_equal(result, expected) def test_nunique(self): - f = lambda s: len(nanops.unique1d(s.dropna())) + f = lambda s: len(algorithms.unique1d(s.dropna())) self._check_stat_op('nunique', f, has_skipna=False, check_dtype=False, check_dates=True) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 2c117b9503779..5cab941e74ce5 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2806,10 +2806,28 @@ def test_nunique(self): 'B': list('abxacc'), 'C': list('abbacx'), }) + expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]}) result = df.groupby('A', as_index=False).nunique() tm.assert_frame_equal(result, expected) + # as_index + expected.index = list('abc') + expected.index.name = 'A' + result = df.groupby('A').nunique() + tm.assert_frame_equal(result, expected) + + # with na + result = df.replace({'x': None}).groupby('A').nunique(dropna=False) + tm.assert_frame_equal(result, expected) + + # dropna + expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3}, + index=list('abc')) + expected.index.name = 'A' + result = df.replace({'x': None}).groupby('A').nunique() + tm.assert_frame_equal(result, expected) + def test_non_cython_api(self): # GH5610 From a0558e7875ceb38b9897f0c878ef8f3cc746583e Mon Sep 17 00:00:00 2001 From: Sebastian Bank Date: Mon, 2 Jan 2017 18:41:00 +0100 Subject: [PATCH 4/4] use apply()-kwargs instead of partial, more tests, better examples --- pandas/core/frame.py | 5 ++--- pandas/core/groupby.py | 12 ++++++------ pandas/tests/frame/test_analytics.py | 10 ++++++++++ 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bf9fb263093be..b872f35277402 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4989,7 +4989,7 @@ def nunique(self, axis=0, dropna=True): Examples -------- - >>> df = DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]}) + >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]}) >>> df.nunique() A 3 B 1 @@ -4999,8 +4999,7 @@ def nunique(self, axis=0, dropna=True): 1 2 2 2 """ - func = functools.partial(Series.nunique, dropna=dropna) - return self.apply(func, axis=axis) + return self.apply(Series.nunique, axis=axis, dropna=dropna) def idxmin(self, axis=0, skipna=True): """ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index ff034118424dc..ea361afdc3a60 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3917,9 +3917,10 @@ def nunique(self, dropna=True): Examples -------- - >>> df = DataFrame({'id': ['spam', 'egg', 'egg', 'spam', 'ham', 'ham'], - ... 'value1': [1, 5, 5, 2, 5, 5], - ... 'value2': list('abbaxy')}) + >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam', + ... 'ham', 'ham'], + ... 'value1': [1, 5, 5, 2, 5, 5], + ... 'value2': list('abbaxy')}) >>> df id value1 value2 0 spam 1 a @@ -3936,6 +3937,7 @@ def nunique(self, dropna=True): ham 1 1 2 spam 1 2 1 + # check for rows with the same id but conflicting values >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any()) id value1 value2 0 spam 1 a @@ -3943,9 +3945,7 @@ def nunique(self, dropna=True): 4 ham 5 x 5 ham 5 y """ - from functools import partial - func = partial(Series.nunique, dropna=dropna) - return self.apply(lambda g: g.apply(func)) + return self.apply(lambda g: g.apply(Series.nunique, dropna=dropna)) from pandas.tools.plotting import boxplot_frame_groupby # noqa diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 71ab2f46cf6f7..4af26c3d0fdc0 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -416,6 +416,16 @@ def test_nunique(self): self._check_stat_op('nunique', f, has_skipna=False, check_dtype=False, check_dates=True) + df = DataFrame({'A': [1, 1, 1], + 'B': [1, 2, 3], + 'C': [1, np.nan, 3]}) + tm.assert_series_equal(df.nunique(), Series({'A': 1, 'B': 3, 'C': 2})) + tm.assert_series_equal(df.nunique(dropna=False), + Series({'A': 1, 'B': 3, 'C': 3})) + tm.assert_series_equal(df.nunique(axis=1), Series({0: 1, 1: 2, 2: 2})) + tm.assert_series_equal(df.nunique(axis=1, dropna=False), + Series({0: 1, 1: 3, 2: 2})) + def test_sum(self): self._check_stat_op('sum', np.sum, has_numeric_only=True)