From ed0ff5d18110d85c5005cd14ff6bb42d45a7c906 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 8 Feb 2018 17:04:32 -0800 Subject: [PATCH 1/3] Consolidated groupby_helpers; added / cleaned tests --- pandas/_libs/groupby.pyx | 99 ---------------------------- pandas/_libs/groupby_helper.pxi.in | 32 +++++---- pandas/tests/groupby/test_groupby.py | 73 ++++++++++++-------- 3 files changed, 65 insertions(+), 139 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index d75c3a71896e3..866683ce378ab 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -26,105 +26,6 @@ cdef double NaN = np.NaN cdef double nan = NaN -# TODO: aggregate multiple columns in single pass -# ---------------------------------------------------------------------- -# first, nth, last - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_nth_object(ndarray[object, ndim=2] out, - ndarray[int64_t] counts, - ndarray[object, ndim=2] values, - ndarray[int64_t] labels, - int64_t rank, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab - object val - float64_t count - ndarray[int64_t, ndim=2] nobs - ndarray[object, ndim=2] resx - - assert min_count == -1, "'min_count' only used in add and prod" - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty(( out).shape, dtype=object) - - N, K = ( values).shape - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_last_object(ndarray[object, ndim=2] out, - ndarray[int64_t] counts, - ndarray[object, ndim=2] values, - ndarray[int64_t] labels, - Py_ssize_t min_count=-1): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab - object val - float64_t count - ndarray[object, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - assert min_count == -1, "'min_count' only used in add and prod" - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty(( out).shape, dtype=object) - - N, K = ( values).shape - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(len(counts)): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = nan - else: - out[i, j] = resx[i, j] - - cdef inline float64_t median_linear(float64_t* a, int n) nogil: cdef int i, j, na_count = 0 cdef float64_t result diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index b24444c422efa..58a944a8241dd 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -325,7 +325,8 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, # name, c_type, dest_type2, nan_val dtypes = [('float64', 'float64_t', 'float64_t', 'NAN'), ('float32', 'float32_t', 'float32_t', 'NAN'), - ('int64', 'int64_t', 'int64_t', 'iNaT')] + ('int64', 'int64_t', 'int64_t', 'iNaT'), + ('object', 'object', 'object', 'NAN')] def get_dispatch(dtypes): @@ -350,7 +351,7 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{dest_type2}} val, count + {{dest_type2}} val ndarray[{{dest_type2}}, ndim=2] resx ndarray[int64_t, ndim=2] nobs @@ -360,11 +361,19 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, raise AssertionError("len(index) != len(labels)") nobs = np.zeros(( out).shape, dtype=np.int64) + {{if name=='object'}} + resx = np.empty(( out).shape, dtype=object) + {{else}} resx = np.empty_like(out) + {{endif}} N, K = ( values).shape + {{if name == "object"}} + if True: # make templating happy + {{else}} with nogil: + {{endif}} for i in range(N): lab = labels[i] if lab < 0: @@ -375,11 +384,7 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # not nan - {{if name == 'int64'}} - if val != {{nan_val}}: - {{else}} if val == val and val != {{nan_val}}: - {{endif}} nobs[lab, j] += 1 resx[lab, j] = val @@ -390,7 +395,6 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, else: out[i, j] = resx[i, j] - @cython.wraparound(False) @cython.boundscheck(False) def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, @@ -403,7 +407,7 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, """ cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - {{dest_type2}} val, count + {{dest_type2}} val ndarray[{{dest_type2}}, ndim=2] resx ndarray[int64_t, ndim=2] nobs @@ -413,11 +417,19 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, raise AssertionError("len(index) != len(labels)") nobs = np.zeros(( out).shape, dtype=np.int64) + {{if name=='object'}} + resx = np.empty(( out).shape, dtype=object) + {{else}} resx = np.empty_like(out) + {{endif}} N, K = ( values).shape + {{if name == "object"}} + if True: # make templating happy + {{else}} with nogil: + {{endif}} for i in range(N): lab = labels[i] if lab < 0: @@ -428,11 +440,7 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, val = values[i, j] # not nan - {{if name == 'int64'}} - if val != {{nan_val}}: - {{else}} if val == val and val != {{nan_val}}: - {{endif}} nobs[lab, j] += 1 if nobs[lab, j] == rank: resx[lab, j] = val diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 2db772ac54369..b16eaf1f11dac 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2252,7 +2252,19 @@ def test_median_empty_bins(self): expected = df.groupby(bins).agg(lambda x: x.median()) assert_frame_equal(result, expected) - def test_groupby_non_arithmetic_agg_types(self): + @pytest.mark.parametrize("dtype", [ + 'int8', 'int16', 'int32', 'int64', 'float32', 'float64']) + @pytest.mark.parametrize("method,data", [ + ('first', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}), + ('last', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}), + ('min', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}), + ('max', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}), + ('nth', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}], + 'args': [1]}), + ('count', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}], + 'out_type': 'int64'}) + ]) + def test_groupby_non_arithmetic_agg_types(self, dtype, method, data): # GH9311, GH6620 df = pd.DataFrame( [{'a': 1, 'b': 1}, @@ -2260,39 +2272,44 @@ def test_groupby_non_arithmetic_agg_types(self): {'a': 2, 'b': 3}, {'a': 2, 'b': 4}]) - dtypes = ['int8', 'int16', 'int32', 'int64', 'float32', 'float64'] - - grp_exp = {'first': {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}, - 'last': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}, - 'min': {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}, - 'max': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}, - 'nth': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}], - 'args': [1]}, - 'count': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}], - 'out_type': 'int64'}} + df['b'] = df.b.astype(dtype) - for dtype in dtypes: - df_in = df.copy() - df_in['b'] = df_in.b.astype(dtype) + if 'args' not in data: + data['args'] = [] - for method, data in compat.iteritems(grp_exp): - if 'args' not in data: - data['args'] = [] + if 'out_type' in data: + out_type = data['out_type'] + else: + out_type = dtype - if 'out_type' in data: - out_type = data['out_type'] - else: - out_type = dtype + exp = data['df'] + df_out = pd.DataFrame(exp) - exp = data['df'] - df_out = pd.DataFrame(exp) + df_out['b'] = df_out.b.astype(out_type) + df_out.set_index('a', inplace=True) - df_out['b'] = df_out.b.astype(out_type) - df_out.set_index('a', inplace=True) + grpd = df.groupby('a') + t = getattr(grpd, method)(*data['args']) + assert_frame_equal(t, df_out) - grpd = df_in.groupby('a') - t = getattr(grpd, method)(*data['args']) - assert_frame_equal(t, df_out) + @pytest.mark.parametrize("method,exp,args", [ + ('first', [('bar', 'quuz'), ('foo', 'baz')], []), + ('last', [('bar', 'grault'), ('foo', 'quux')], []), + ('nth', [('bar', 'corge'), ('foo', 'qux')], [1]), + ]) + def test_groupby_get_nth_object(self, method, exp, args): + df = pd.DataFrame( + [{'a': 'foo', 'b': 'baz'}, + {'a': 'foo', 'b': 'qux'}, + {'a': 'foo', 'b': 'quux'}, + {'a': 'bar', 'b': 'quuz'}, + {'a': 'bar', 'b': 'corge'}, + {'a': 'bar', 'b': 'grault'}]) + exp_df = pd.DataFrame(exp, columns=['a', 'b']) + exp_df.set_index('a', inplace=True) + grpd = df.groupby('a') + t = getattr(grpd, method)(*args) + assert_frame_equal(t, exp_df) def test_groupby_non_arithmetic_agg_intlike_precision(self): # GH9311, GH6620 From 4b83a25a48f42952e2fec613ccd72006f5f0dca6 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 8 Feb 2018 19:29:30 -0800 Subject: [PATCH 2/3] Removed unnecessary get_nth_object test --- pandas/tests/groupby/test_groupby.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b16eaf1f11dac..6eacd45deb7bc 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2292,25 +2292,6 @@ def test_groupby_non_arithmetic_agg_types(self, dtype, method, data): t = getattr(grpd, method)(*data['args']) assert_frame_equal(t, df_out) - @pytest.mark.parametrize("method,exp,args", [ - ('first', [('bar', 'quuz'), ('foo', 'baz')], []), - ('last', [('bar', 'grault'), ('foo', 'quux')], []), - ('nth', [('bar', 'corge'), ('foo', 'qux')], [1]), - ]) - def test_groupby_get_nth_object(self, method, exp, args): - df = pd.DataFrame( - [{'a': 'foo', 'b': 'baz'}, - {'a': 'foo', 'b': 'qux'}, - {'a': 'foo', 'b': 'quux'}, - {'a': 'bar', 'b': 'quuz'}, - {'a': 'bar', 'b': 'corge'}, - {'a': 'bar', 'b': 'grault'}]) - exp_df = pd.DataFrame(exp, columns=['a', 'b']) - exp_df.set_index('a', inplace=True) - grpd = df.groupby('a') - t = getattr(grpd, method)(*args) - assert_frame_equal(t, exp_df) - def test_groupby_non_arithmetic_agg_intlike_precision(self): # GH9311, GH6620 c = 24650000000000000 From 178c8283a7d143261c1ac4c08f8d0fed28a07148 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 10 Feb 2018 09:48:09 -0800 Subject: [PATCH 3/3] Prevented group_rank_object from being templated --- pandas/_libs/groupby_helper.pxi.in | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 58a944a8241dd..48dac7bf10362 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -317,7 +317,7 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, {{endfor}} #---------------------------------------------------------------------- -# group_nth, group_last +# group_nth, group_last, group_rank #---------------------------------------------------------------------- {{py: @@ -453,6 +453,7 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, out[i, j] = resx[i, j] +{{if name != 'object'}} @cython.boundscheck(False) @cython.wraparound(False) def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, @@ -616,6 +617,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, if pct: for i in range(N): out[i, 0] = out[i, 0] / grp_sizes[i, 0] +{{endif}} {{endfor}}