From 5c4b3256eed9ad9745f324932f754818d02c9dc5 Mon Sep 17 00:00:00 2001
From: Sebastian Bank <sebastian.bank@uni-leipzig.de>
Date: Fri, 7 Oct 2016 19:59:07 +0200
Subject: [PATCH 1/4] API: add DataFrame.nunique() and
 DataFrameGroupBy.nunique()

---
 doc/source/whatsnew/v0.20.0.txt      |  3 +++
 pandas/core/frame.py                 | 21 +++++++++++++++++++++
 pandas/core/groupby.py               | 15 +++++++++++++++
 pandas/tests/frame/test_analytics.py |  5 +++++
 pandas/tests/groupby/test_groupby.py | 20 +++++++++++++++-----
 5 files changed, 59 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 0873e4b34b0b1..a5b95dfc4f3f7 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -97,6 +97,9 @@ Other enhancements
 ^^^^^^^^^^^^^^^^^^
 - ``Series.sort_index`` accepts parameters ``kind`` and ``na_position`` (:issue:`13589`, :issue:`14444`)
 
+- ``DataFrame`` has gained a ``nunique()`` method as a short-cut for ``.apply(lambda x: x.nunique())`` (counting the distinct values over an axis) (:issue:`14336`).
+- ``DataFrame.groupby()`` has gained a ``nunique()`` method as a short-cut for ``.apply(lambda g: g.apply(lambda x: x.nunique()))`` (counting the distinct values for all columns within each group) (:issue:`14336`).
+
 - ``pd.read_excel`` now preserves sheet order when using ``sheetname=None`` (:issue:`9930`)
 - Multiple offset aliases with decimal points are now supported (e.g. '0.5min' is parsed as '30s') (:issue:`8419`)
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index d12b8af35469b..1ea9208d45da7 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4969,6 +4969,27 @@ def f(x):
 
         return Series(result, index=labels)
 
+    def nunique(self, axis=0, dropna=True):
+        """
+        Return Series with number of distinct observations over requested
+        axis.
+
+        .. versionadded:: 0.20.0
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            0 or 'index' for row-wise, 1 or 'columns' for column-wise
+        dropna : boolean, default True
+            Don't include NaN in the counts.
+
+        Returns
+        -------
+        nunique : Series
+        """
+        func = functools.partial(Series.nunique, dropna=dropna)
+        return self.apply(func, axis=axis)
+
     def idxmin(self, axis=0, skipna=True):
         """
         Return index of first occurrence of minimum over requested axis.
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 7eba32b4932d0..45b18127286e5 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -3899,6 +3899,21 @@ def count(self):
 
         return self._wrap_agged_blocks(data.items, list(blk))
 
+    def nunique(self, dropna=True):
+        """
+        Return Series with number of distinct observations per group.
+
+        .. versionadded:: 0.20.0
+
+        Parameters
+        ----------
+        dropna : boolean, default True
+            Don't include NaN in the counts.
+        """
+        from functools import partial
+        func = partial(Series.nunique, dropna=dropna)
+        return self.apply(lambda g: g.apply(func))
+
 
 from pandas.tools.plotting import boxplot_frame_groupby  # noqa
 DataFrameGroupBy.boxplot = boxplot_frame_groupby
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index f6081e14d4081..612a563bd15b3 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -410,6 +410,11 @@ def test_count(self):
         expected = Series(0, index=[])
         tm.assert_series_equal(result, expected)
 
+    def test_nunique(self):
+        f = lambda s: len(nanops.unique1d(s.dropna()))
+        self._check_stat_op('nunique', f, has_skipna=False,
+                            check_dtype=False, check_dates=True)
+
     def test_sum(self):
         self._check_stat_op('sum', np.sum, has_numeric_only=True)
 
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index e87b5d04271e8..2c117b9503779 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2800,6 +2800,16 @@ def test_count_cross_type(self):  # GH8169
             result = df.groupby(['c', 'd']).count()
             tm.assert_frame_equal(result, expected)
 
+    def test_nunique(self):
+        df = DataFrame({
+            'A': list('abbacc'),
+            'B': list('abxacc'),
+            'C': list('abbacx'),
+        })
+        expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]})
+        result = df.groupby('A', as_index=False).nunique()
+        tm.assert_frame_equal(result, expected)
+
     def test_non_cython_api(self):
 
         # GH5610
@@ -5150,11 +5160,11 @@ def test_tab_completion(self):
              'first', 'get_group', 'groups', 'hist', 'indices', 'last', 'max',
              'mean', 'median', 'min', 'name', 'ngroups', 'nth', 'ohlc', 'plot',
              'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count',
-             'head', 'irow', 'describe', 'cummax', 'quantile', 'rank',
-             'cumprod', 'tail', 'resample', 'cummin', 'fillna', 'cumsum',
-             'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill', 'take',
-             'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', 'cov',
-             'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin',
+             'nunique', 'head', 'irow', 'describe', 'cummax', 'quantile',
+             'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna',
+             'cumsum', 'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill',
+             'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith',
+             'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin',
              'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding'])
         self.assertEqual(results, expected)
 

From fd0f22d2a126d0003fc42b723084660970f5f0fb Mon Sep 17 00:00:00 2001
From: Sebastian Bank <sebastian.bank@uni-leipzig.de>
Date: Mon, 31 Oct 2016 18:08:19 +0100
Subject: [PATCH 2/4] add simple benchmarks

---
 asv_bench/benchmarks/frame_methods.py | 14 ++++++++++++++
 asv_bench/benchmarks/groupby.py       | 16 ++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
index adbe73aa5c5ef..9f491302a4d6f 100644
--- a/asv_bench/benchmarks/frame_methods.py
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -433,6 +433,20 @@ def time_frame_from_records_generator_nrows(self):
 
 
 
+#-----------------------------------------------------------------------------
+# nunique
+
+class frame_nunique(object):
+
+    def setup(self):
+        self.data = np.random.randn(10000, 1000)
+        self.df = DataFrame(self.data)
+
+    def time_frame_nunique(self):
+        self.df.nunique()
+
+
+
 #-----------------------------------------------------------------------------
 # duplicated
 
diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index ad58cd0fc6d70..fa68b122cc98d 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -251,6 +251,22 @@ def time_groupby_int_count(self):
         self.df.groupby(['key1', 'key2']).count()
 
 
+#----------------------------------------------------------------------
+# nunique() speed
+
+class groupby_nunique(object):
+
+    def setup(self):
+        self.n = 10000
+        self.df = DataFrame({'key1': randint(0, 500, size=self.n),
+                             'key2': randint(0, 100, size=self.n),
+                             'ints': randint(0, 1000, size=self.n),
+                             'ints2': randint(0, 1000, size=self.n), })
+
+    def time_groupby_nunique(self):
+        self.df.groupby(['key1', 'key2']).nunique()
+
+
 #----------------------------------------------------------------------
 # group with different functions per column
 

From c8d3ac4da110a77f975107cc163eabe587972bd7 Mon Sep 17 00:00:00 2001
From: Sebastian Bank <sebastian.bank@uni-leipzig.de>
Date: Wed, 28 Dec 2016 15:47:51 +0100
Subject: [PATCH 3/4] extend docs and tests

---
 pandas/core/frame.py                 | 12 ++++++++++
 pandas/core/groupby.py               | 35 +++++++++++++++++++++++++++-
 pandas/tests/frame/test_analytics.py |  3 ++-
 pandas/tests/groupby/test_groupby.py | 18 ++++++++++++++
 4 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 1ea9208d45da7..bf9fb263093be 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4986,6 +4986,18 @@ def nunique(self, axis=0, dropna=True):
         Returns
         -------
         nunique : Series
+
+        Examples
+        --------
+        >>> df = DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]})
+        >>> df.nunique()
+        A    3
+        B    1
+
+        >>> df.nunique(axis=1)
+        0    1
+        1    2
+        2    2
         """
         func = functools.partial(Series.nunique, dropna=dropna)
         return self.apply(func, axis=axis)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 45b18127286e5..ff034118424dc 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -3901,7 +3901,8 @@ def count(self):
 
     def nunique(self, dropna=True):
         """
-        Return Series with number of distinct observations per group.
+        Return DataFrame with number of distinct observations per group for
+        each column.
 
         .. versionadded:: 0.20.0
 
@@ -3909,6 +3910,38 @@ def nunique(self, dropna=True):
         ----------
         dropna : boolean, default True
             Don't include NaN in the counts.
+
+        Returns
+        -------
+        nunique : DataFrame
+
+        Examples
+        --------
+        >>> df = DataFrame({'id': ['spam', 'egg', 'egg', 'spam', 'ham', 'ham'],
+        ...                 'value1': [1, 5, 5, 2, 5, 5],
+        ...                 'value2': list('abbaxy')})
+        >>> df
+             id  value1 value2
+        0  spam       1      a
+        1   egg       5      b
+        2   egg       5      b
+        3  spam       2      a
+        4   ham       5      x
+        5   ham       5      y
+
+        >>> df.groupby('id').nunique()
+            id  value1  value2
+        id
+        egg    1       1       1
+        ham    1       1       2
+        spam   1       2       1
+
+        >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any())
+             id  value1 value2
+        0  spam       1      a
+        3  spam       2      a
+        4   ham       5      x
+        5   ham       5      y
         """
         from functools import partial
         func = partial(Series.nunique, dropna=dropna)
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index 612a563bd15b3..71ab2f46cf6f7 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -16,6 +16,7 @@
                     MultiIndex, date_range, Timestamp)
 import pandas as pd
 import pandas.core.nanops as nanops
+import pandas.core.algorithms as algorithms
 import pandas.formats.printing as printing
 
 import pandas.util.testing as tm
@@ -411,7 +412,7 @@ def test_count(self):
         tm.assert_series_equal(result, expected)
 
     def test_nunique(self):
-        f = lambda s: len(nanops.unique1d(s.dropna()))
+        f = lambda s: len(algorithms.unique1d(s.dropna()))
         self._check_stat_op('nunique', f, has_skipna=False,
                             check_dtype=False, check_dates=True)
 
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 2c117b9503779..5cab941e74ce5 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2806,10 +2806,28 @@ def test_nunique(self):
             'B': list('abxacc'),
             'C': list('abbacx'),
         })
+
         expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]})
         result = df.groupby('A', as_index=False).nunique()
         tm.assert_frame_equal(result, expected)
 
+        # as_index
+        expected.index = list('abc')
+        expected.index.name = 'A'
+        result = df.groupby('A').nunique()
+        tm.assert_frame_equal(result, expected)
+
+        # with na
+        result = df.replace({'x': None}).groupby('A').nunique(dropna=False)
+        tm.assert_frame_equal(result, expected)
+
+        # dropna
+        expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3},
+                             index=list('abc'))
+        expected.index.name = 'A'
+        result = df.replace({'x': None}).groupby('A').nunique()
+        tm.assert_frame_equal(result, expected)
+
     def test_non_cython_api(self):
 
         # GH5610

From a0558e7875ceb38b9897f0c878ef8f3cc746583e Mon Sep 17 00:00:00 2001
From: Sebastian Bank <sebastian.bank@uni-leipzig.de>
Date: Mon, 2 Jan 2017 18:41:00 +0100
Subject: [PATCH 4/4] use apply()-kwargs instead of partial, more tests, better
 examples

---
 pandas/core/frame.py                 |  5 ++---
 pandas/core/groupby.py               | 12 ++++++------
 pandas/tests/frame/test_analytics.py | 10 ++++++++++
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index bf9fb263093be..b872f35277402 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4989,7 +4989,7 @@ def nunique(self, axis=0, dropna=True):
 
         Examples
         --------
-        >>> df = DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]})
+        >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]})
         >>> df.nunique()
         A    3
         B    1
@@ -4999,8 +4999,7 @@ def nunique(self, axis=0, dropna=True):
         1    2
         2    2
         """
-        func = functools.partial(Series.nunique, dropna=dropna)
-        return self.apply(func, axis=axis)
+        return self.apply(Series.nunique, axis=axis, dropna=dropna)
 
     def idxmin(self, axis=0, skipna=True):
         """
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index ff034118424dc..ea361afdc3a60 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -3917,9 +3917,10 @@ def nunique(self, dropna=True):
 
         Examples
         --------
-        >>> df = DataFrame({'id': ['spam', 'egg', 'egg', 'spam', 'ham', 'ham'],
-        ...                 'value1': [1, 5, 5, 2, 5, 5],
-        ...                 'value2': list('abbaxy')})
+        >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
+        ...                           'ham', 'ham'],
+        ...                    'value1': [1, 5, 5, 2, 5, 5],
+        ...                    'value2': list('abbaxy')})
         >>> df
              id  value1 value2
         0  spam       1      a
@@ -3936,6 +3937,7 @@ def nunique(self, dropna=True):
         ham    1       1       2
         spam   1       2       1
 
+        Check for rows with the same id but conflicting values:
         >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any())
              id  value1 value2
         0  spam       1      a
@@ -3943,9 +3945,7 @@ def nunique(self, dropna=True):
         4   ham       5      x
         5   ham       5      y
         """
-        from functools import partial
-        func = partial(Series.nunique, dropna=dropna)
-        return self.apply(lambda g: g.apply(func))
+        return self.apply(lambda g: g.apply(Series.nunique, dropna=dropna))
 
 
 from pandas.tools.plotting import boxplot_frame_groupby  # noqa
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index 71ab2f46cf6f7..4af26c3d0fdc0 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -416,6 +416,16 @@ def test_nunique(self):
         self._check_stat_op('nunique', f, has_skipna=False,
                             check_dtype=False, check_dates=True)
 
+        df = DataFrame({'A': [1, 1, 1],
+                        'B': [1, 2, 3],
+                        'C': [1, np.nan, 3]})
+        tm.assert_series_equal(df.nunique(), Series({'A': 1, 'B': 3, 'C': 2}))
+        tm.assert_series_equal(df.nunique(dropna=False),
+                               Series({'A': 1, 'B': 3, 'C': 3}))
+        tm.assert_series_equal(df.nunique(axis=1), Series({0: 1, 1: 2, 2: 2}))
+        tm.assert_series_equal(df.nunique(axis=1, dropna=False),
+                               Series({0: 1, 1: 3, 2: 2}))
+
     def test_sum(self):
         self._check_stat_op('sum', np.sum, has_numeric_only=True)