From ed0ff5d18110d85c5005cd14ff6bb42d45a7c906 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Thu, 8 Feb 2018 17:04:32 -0800
Subject: [PATCH 1/3] Consolidated groupby_helpers; added / cleaned tests

---
 pandas/_libs/groupby.pyx             | 99 ----------------------------
 pandas/_libs/groupby_helper.pxi.in   | 32 +++++----
 pandas/tests/groupby/test_groupby.py | 73 ++++++++++++--------
 3 files changed, 65 insertions(+), 139 deletions(-)
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index d75c3a71896e3..866683ce378ab 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -26,105 +26,6 @@ cdef double NaN = <double> np.NaN
 cdef double nan = NaN
 
 
-# TODO: aggregate multiple columns in single pass
-# ----------------------------------------------------------------------
-# first, nth, last
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_nth_object(ndarray[object, ndim=2] out,
-                     ndarray[int64_t] counts,
-                     ndarray[object, ndim=2] values,
-                     ndarray[int64_t] labels,
-                     int64_t rank,
-                     Py_ssize_t min_count=-1):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab
-        object val
-        float64_t count
-        ndarray[int64_t, ndim=2] nobs
-        ndarray[object, ndim=2] resx
-
-    assert min_count == -1, "'min_count' only used in add and prod"
-
-    nobs = np.zeros((<object> out).shape, dtype=np.int64)
-    resx = np.empty((<object> out).shape, dtype=object)
-
-    N, K = (<object> values).shape
-
-    for i in range(N):
-        lab = labels[i]
-        if lab < 0:
-            continue
-
-        counts[lab] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            if val == val:
-                nobs[lab, j] += 1
-                if nobs[lab, j] == rank:
-                    resx[lab, j] = val
-
-    for i in range(len(counts)):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = <object> nan
-            else:
-                out[i, j] = resx[i, j]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_last_object(ndarray[object, ndim=2] out,
-                      ndarray[int64_t] counts,
-                      ndarray[object, ndim=2] values,
-                      ndarray[int64_t] labels,
-                      Py_ssize_t min_count=-1):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab
-        object val
-        float64_t count
-        ndarray[object, ndim=2] resx
-        ndarray[int64_t, ndim=2] nobs
-
-    assert min_count == -1, "'min_count' only used in add and prod"
-
-    nobs = np.zeros((<object> out).shape, dtype=np.int64)
-    resx = np.empty((<object> out).shape, dtype=object)
-
-    N, K = (<object> values).shape
-
-    for i in range(N):
-        lab = labels[i]
-        if lab < 0:
-            continue
-
-        counts[lab] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            if val == val:
-                nobs[lab, j] += 1
-                resx[lab, j] = val
-
-    for i in range(len(counts)):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = resx[i, j]
-
-
 cdef inline float64_t median_linear(float64_t* a, int n) nogil:
     cdef int i, j, na_count = 0
     cdef float64_t result
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
index b24444c422efa..58a944a8241dd 100644
--- a/pandas/_libs/groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -325,7 +325,8 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 # name, c_type, dest_type2, nan_val
 dtypes = [('float64', 'float64_t', 'float64_t', 'NAN'),
           ('float32', 'float32_t', 'float32_t', 'NAN'),
-          ('int64', 'int64_t', 'int64_t', 'iNaT')]
+          ('int64', 'int64_t', 'int64_t', 'iNaT'),
+          ('object', 'object', 'object', 'NAN')]
 
 def get_dispatch(dtypes):
 
@@ -350,7 +351,7 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        {{dest_type2}} val, count
+        {{dest_type2}} val
         ndarray[{{dest_type2}}, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
 
@@ -360,11 +361,19 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
         raise AssertionError("len(index) != len(labels)")
 
     nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    {{if name=='object'}}
+    resx = np.empty((<object> out).shape, dtype=object)
+    {{else}}
     resx = np.empty_like(out)
+    {{endif}}
 
     N, K = (<object> values).shape
 
+    {{if name == "object"}}
+    if True:  # object dtype needs the GIL; placeholder for `with nogil:`
+    {{else}}
     with nogil:
+    {{endif}}
         for i in range(N):
             lab = labels[i]
             if lab < 0:
@@ -375,11 +384,7 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 val = values[i, j]
 
                 # not nan
-                {{if name == 'int64'}}
-                if val != {{nan_val}}:
-                {{else}}
                 if val == val and val != {{nan_val}}:
-                {{endif}}
                     nobs[lab, j] += 1
                     resx[lab, j] = val
 
@@ -390,7 +395,6 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 else:
                     out[i, j] = resx[i, j]
 
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
@@ -403,7 +407,7 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        {{dest_type2}} val, count
+        {{dest_type2}} val
         ndarray[{{dest_type2}}, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
 
@@ -413,11 +417,19 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
         raise AssertionError("len(index) != len(labels)")
 
     nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    {{if name=='object'}}
+    resx = np.empty((<object> out).shape, dtype=object)
+    {{else}}
     resx = np.empty_like(out)
+    {{endif}}
 
     N, K = (<object> values).shape
 
+    {{if name == "object"}}
+    if True:  # object dtype needs the GIL; placeholder for `with nogil:`
+    {{else}}
     with nogil:
+    {{endif}}
         for i in range(N):
             lab = labels[i]
             if lab < 0:
@@ -428,11 +440,7 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 val = values[i, j]
 
                 # not nan
-                {{if name == 'int64'}}
-                if val != {{nan_val}}:
-                {{else}}
                 if val == val and val != {{nan_val}}:
-                {{endif}}
                     nobs[lab, j] += 1
                     if nobs[lab, j] == rank:
                         resx[lab, j] = val
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 2db772ac54369..b16eaf1f11dac 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2252,7 +2252,19 @@ def test_median_empty_bins(self):
         expected = df.groupby(bins).agg(lambda x: x.median())
         assert_frame_equal(result, expected)
 
-    def test_groupby_non_arithmetic_agg_types(self):
+    @pytest.mark.parametrize("dtype", [
+        'int8', 'int16', 'int32', 'int64', 'float32', 'float64'])
+    @pytest.mark.parametrize("method,data", [
+        ('first', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}),
+        ('last', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}),
+        ('min', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}),
+        ('max', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}),
+        ('nth', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}],
+                 'args': [1]}),
+        ('count', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}],
+                   'out_type': 'int64'})
+    ])
+    def test_groupby_non_arithmetic_agg_types(self, dtype, method, data):
         # GH9311, GH6620
         df = pd.DataFrame(
             [{'a': 1, 'b': 1},
@@ -2260,39 +2272,44 @@ def test_groupby_non_arithmetic_agg_types(self):
              {'a': 2, 'b': 3},
              {'a': 2, 'b': 4}])
 
-        dtypes = ['int8', 'int16', 'int32', 'int64', 'float32', 'float64']
-
-        grp_exp = {'first': {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]},
-                   'last': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]},
-                   'min': {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]},
-                   'max': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]},
-                   'nth': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}],
-                           'args': [1]},
-                   'count': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}],
-                             'out_type': 'int64'}}
+        df['b'] = df.b.astype(dtype)
 
-        for dtype in dtypes:
-            df_in = df.copy()
-            df_in['b'] = df_in.b.astype(dtype)
+        if 'args' not in data:
+            data['args'] = []
 
-            for method, data in compat.iteritems(grp_exp):
-                if 'args' not in data:
-                    data['args'] = []
+        if 'out_type' in data:
+            out_type = data['out_type']
+        else:
+            out_type = dtype
 
-                if 'out_type' in data:
-                    out_type = data['out_type']
-                else:
-                    out_type = dtype
+        exp = data['df']
+        df_out = pd.DataFrame(exp)
 
-                exp = data['df']
-                df_out = pd.DataFrame(exp)
+        df_out['b'] = df_out.b.astype(out_type)
+        df_out.set_index('a', inplace=True)
 
-                df_out['b'] = df_out.b.astype(out_type)
-                df_out.set_index('a', inplace=True)
+        grpd = df.groupby('a')
+        t = getattr(grpd, method)(*data['args'])
+        assert_frame_equal(t, df_out)
 
-                grpd = df_in.groupby('a')
-                t = getattr(grpd, method)(*data['args'])
-                assert_frame_equal(t, df_out)
+    @pytest.mark.parametrize("method,exp,args", [
+        ('first', [('bar', 'quuz'), ('foo', 'baz')], []),
+        ('last', [('bar', 'grault'), ('foo', 'quux')], []),
+        ('nth', [('bar', 'corge'), ('foo', 'qux')], [1]),
+    ])
+    def test_groupby_get_nth_object(self, method, exp, args):
+        df = pd.DataFrame(
+            [{'a': 'foo', 'b': 'baz'},
+             {'a': 'foo', 'b': 'qux'},
+             {'a': 'foo', 'b': 'quux'},
+             {'a': 'bar', 'b': 'quuz'},
+             {'a': 'bar', 'b': 'corge'},
+             {'a': 'bar', 'b': 'grault'}])
+        exp_df = pd.DataFrame(exp, columns=['a', 'b'])
+        exp_df.set_index('a', inplace=True)
+        grpd = df.groupby('a')
+        t = getattr(grpd, method)(*args)
+        assert_frame_equal(t, exp_df)
 
     def test_groupby_non_arithmetic_agg_intlike_precision(self):
         # GH9311, GH6620

From 4b83a25a48f42952e2fec613ccd72006f5f0dca6 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Thu, 8 Feb 2018 19:29:30 -0800
Subject: [PATCH 2/3] Removed unnecessary get_nth_object test

---
 pandas/tests/groupby/test_groupby.py | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index b16eaf1f11dac..6eacd45deb7bc 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2292,25 +2292,6 @@ def test_groupby_non_arithmetic_agg_types(self, dtype, method, data):
         t = getattr(grpd, method)(*data['args'])
         assert_frame_equal(t, df_out)
 
-    @pytest.mark.parametrize("method,exp,args", [
-        ('first', [('bar', 'quuz'), ('foo', 'baz')], []),
-        ('last', [('bar', 'grault'), ('foo', 'quux')], []),
-        ('nth', [('bar', 'corge'), ('foo', 'qux')], [1]),
-    ])
-    def test_groupby_get_nth_object(self, method, exp, args):
-        df = pd.DataFrame(
-            [{'a': 'foo', 'b': 'baz'},
-             {'a': 'foo', 'b': 'qux'},
-             {'a': 'foo', 'b': 'quux'},
-             {'a': 'bar', 'b': 'quuz'},
-             {'a': 'bar', 'b': 'corge'},
-             {'a': 'bar', 'b': 'grault'}])
-        exp_df = pd.DataFrame(exp, columns=['a', 'b'])
-        exp_df.set_index('a', inplace=True)
-        grpd = df.groupby('a')
-        t = getattr(grpd, method)(*args)
-        assert_frame_equal(t, exp_df)
-
     def test_groupby_non_arithmetic_agg_intlike_precision(self):
         # GH9311, GH6620
         c = 24650000000000000

From 178c8283a7d143261c1ac4c08f8d0fed28a07148 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Sat, 10 Feb 2018 09:48:09 -0800
Subject: [PATCH 3/3] Prevented group_rank_object from being templated

---
 pandas/_libs/groupby_helper.pxi.in | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
index 58a944a8241dd..48dac7bf10362 100644
--- a/pandas/_libs/groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -317,7 +317,7 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 {{endfor}}
 
 #----------------------------------------------------------------------
-# group_nth, group_last
+# group_nth, group_last, group_rank
 #----------------------------------------------------------------------
 
 {{py:
@@ -453,6 +453,7 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                     out[i, j] = resx[i, j]
 
 
+{{if name != 'object'}}
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
@@ -616,6 +617,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out,
         if pct:
             for i in range(N):
                 out[i, 0] = out[i, 0] / grp_sizes[i, 0]
+{{endif}}
 {{endfor}}