From b06f08ff138f2b690e60e2466b1c0c1bdd096452 Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sun, 16 Feb 2020 11:30:18 +0000 Subject: [PATCH 01/13] fix setting of index --- pandas/core/groupby/generic.py | 3 ++- pandas/tests/groupby/aggregate/test_aggregate.py | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1bb512aee39e2..a7c61d4497b2f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -956,7 +956,8 @@ def aggregate(self, func=None, *args, **kwargs): result = self._aggregate_frame(func) else: result.columns = Index( - result.columns.levels[0], name=self._selected_obj.columns.name + [i[:-1] if len(i) > 2 else i[0] for i in result.columns], + name=self._selected_obj.columns.name, ) if not self.as_index: diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 48f8de7e51ae4..8af354a095573 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -691,6 +691,17 @@ def test_agg_relabel_multiindex_duplicates(): tm.assert_frame_equal(result, expected) +def test_multiindex_custom_func(): + # GH 31777 + df = pd.DataFrame( + np.random.rand(10, 4), columns=pd.MultiIndex.from_product([[1, 2], [3, 4]]) + ) + grp = df.groupby(np.r_[np.ones(5), np.zeros(5)]) + result = grp.agg(lambda s: s.mean()) + expected = grp.agg("mean") + tm.assert_frame_equal(result, expected) + + def myfunc(s): return np.percentile(s, q=0.90) From 1d573b32688e97c89227f76eeec36d15cdd8c820 Mon Sep 17 00:00:00 2001 From: MarcoGorelli Date: Sun, 16 Feb 2020 21:33:11 +0000 Subject: [PATCH 02/13] simplify --- pandas/core/groupby/generic.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a7c61d4497b2f..a525b46d7615a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -955,10 +955,7 @@ def aggregate(self, func=None, *args, **kwargs): raise result = self._aggregate_frame(func) else: - result.columns = Index( - [i[:-1] if len(i) > 2 else i[0] for i in result.columns], - name=self._selected_obj.columns.name, - ) + result.columns = result.columns.droplevel(-1) if not self.as_index: self._insert_inaxis_grouper_inplace(result) From d8d588d9e68dfc5df550a0390846840eb3b29c59 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Mon, 17 Feb 2020 11:22:17 +0000 Subject: [PATCH 03/13] rename --- pandas/core/groupby/generic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a525b46d7615a..35ea075662ed9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -955,7 +955,9 @@ def aggregate(self, func=None, *args, **kwargs): raise result = self._aggregate_frame(func) else: - result.columns = result.columns.droplevel(-1) + result.columns = result.columns.droplevel(-1).rename( + self._selected_obj.columns.name + ) if not self.as_index: self._insert_inaxis_grouper_inplace(result) From fd3854aa35567a46ee848784879bdce409149db6 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Mon, 17 Feb 2020 11:56:21 +0000 Subject: [PATCH 04/13] fix renaming --- pandas/core/groupby/generic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 35ea075662ed9..d2ce05b4b00b5 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -955,9 +955,9 @@ def aggregate(self, func=None, *args, **kwargs): raise result = self._aggregate_frame(func) else: - result.columns = result.columns.droplevel(-1).rename( - self._selected_obj.columns.name - ) + result.columns = result.columns.rename( + [self._selected_obj.columns.name] * result.columns.nlevels + ).droplevel(-1) if not self.as_index: self._insert_inaxis_grouper_inplace(result) From e709a06ccf06160df677fc41240fe633f5b21ed5 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Thu, 20 Feb 2020 14:57:30 +0000 Subject: [PATCH 05/13] note gh number, parametrize test --- pandas/core/groupby/generic.py | 1 + .../tests/groupby/aggregate/test_aggregate.py | 23 ++++++++++++++----- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index d2ce05b4b00b5..50b0310dfd8ad 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -955,6 +955,7 @@ def aggregate(self, func=None, *args, **kwargs): raise result = self._aggregate_frame(func) else: + # GH 32040 result.columns = result.columns.rename( [self._selected_obj.columns.name] * result.columns.nlevels ).droplevel(-1) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 8af354a095573..8e79c5d598ecc 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -691,14 +691,25 @@ def test_agg_relabel_multiindex_duplicates(): tm.assert_frame_equal(result, expected) -def test_multiindex_custom_func(): +@pytest.mark.parametrize( + "func, expected_values", + [ + (lambda s: s.mean(), [[3, 2], [5.5, 8.0], [1.5, 3.0], [6.0, 5.5]]), + (np.mean, [[3.0, 2.0], [5.5, 8.0], [1.5, 3.0], [6.0, 5.5]]), + (np.nanmean, [[3.0, 2.0], [5.5, 8.0], [1.5, 3.0], [6.0, 5.5]]), + ], +) +def test_multiindex_custom_func(func, expected_values): # GH 31777 - df = pd.DataFrame( - np.random.rand(10, 4), columns=pd.MultiIndex.from_product([[1, 2], [3, 4]]) + data = [[1, 4, 2, 8], [5, 7, 1, 4], [2, 8, 1, 4], [2, 8, 5, 7]] + df = pd.DataFrame(data, columns=pd.MultiIndex.from_product([[1, 2], [3, 4]])) + grp = df.groupby(np.r_[np.zeros(2), np.ones(2)]) + result = grp.agg(func) + expected_keys = [(1, 3), (1, 4), (2, 3), (2, 4)] + expected = pd.DataFrame( + {key: value for key, value in zip(expected_keys, expected_values)}, + index=Index([0.0, 1.0], dtype=float), ) - grp = df.groupby(np.r_[np.ones(5), np.zeros(5)]) - result = grp.agg(lambda s: s.mean()) - expected = grp.agg("mean") tm.assert_frame_equal(result, expected) From 093bcd518e52de44e46e194b56fd36ee357204cb Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Thu, 27 Feb 2020 11:45:58 +0000 Subject: [PATCH 06/13] whatsnew --- doc/source/whatsnew/v1.0.2.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index 1b6098e6b6ac1..e3c1985c91d47 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -19,6 +19,7 @@ Fixed regressions - Fixed regression in :meth:`Series.align` when ``other`` is a DataFrame and ``method`` is not None (:issue:`31785`) - Fixed regression in :meth:`pandas.core.groupby.RollingGroupby.apply` where the ``raw`` parameter was ignored (:issue:`31754`) - Fixed regression in :meth:`rolling(..).corr() ` when using a time offset (:issue:`31789`) +- Fixed regression in :meth:`Groupby.aggregate` which was failing on frames with MultiIndex columns and a custom function (:issue:`31777`) - Fixed regression in :meth:`DataFrameGroupBy.nunique` which was modifying the original values if ``NaN`` values were present (:issue:`31950`) - Fixed regression where :func:`read_pickle` raised a ``UnicodeDecodeError`` when reading a py27 pickle with :class:`MultiIndex` column (:issue:`31988`). - Fixed regression in :class:`DataFrame` arithmetic operations with mis-matched columns (:issue:`31623`) From 88ce23b13b94afacfdec5baec3f99c1b6b39760b Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 4 Mar 2020 12:05:21 +0000 Subject: [PATCH 07/13] clearer comment --- pandas/core/groupby/generic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 50b0310dfd8ad..26eef0d4d0cb2 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -955,7 +955,8 @@ def aggregate(self, func=None, *args, **kwargs): raise result = self._aggregate_frame(func) else: - # GH 32040 + # select everything except for the last level, which is the one + # containing the name of the function, see GH 32040 result.columns = result.columns.rename( [self._selected_obj.columns.name] * result.columns.nlevels ).droplevel(-1) From 4e4bae888d1cf20617979084758eac25dc4efd09 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 4 Mar 2020 12:06:15 +0000 Subject: [PATCH 08/13] clearer comment --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 26eef0d4d0cb2..ebb9d63d1acd6 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -956,7 +956,7 @@ def aggregate(self, func=None, *args, **kwargs): result = self._aggregate_frame(func) else: # select everything except for the last level, which is the one - # containing the name of the function, see GH 32040 + # containing the name of the function(s), see GH 32040 result.columns = result.columns.rename( [self._selected_obj.columns.name] * result.columns.nlevels ).droplevel(-1) From 73caec2b7f47401132a9eb460e29e60ede560eb8 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 11 Mar 2020 09:46:38 +0000 Subject: [PATCH 09/13] construct expected from literal values for legibility --- .../tests/groupby/aggregate/test_aggregate.py | 25 ++++++------------- 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 8e79c5d598ecc..4f58690a00263 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -691,25 +691,14 @@ def test_agg_relabel_multiindex_duplicates(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "func, expected_values", - [ - (lambda s: s.mean(), [[3, 2], [5.5, 8.0], [1.5, 3.0], [6.0, 5.5]]), - (np.mean, [[3.0, 2.0], [5.5, 8.0], [1.5, 3.0], [6.0, 5.5]]), - (np.nanmean, [[3.0, 2.0], [5.5, 8.0], [1.5, 3.0], [6.0, 5.5]]), - ], -) -def test_multiindex_custom_func(func, expected_values): +@pytest.mark.parametrize("func", [lambda s: s.mean(), np.mean, np.nanmean]) +def test_multiindex_custom_func(func): # GH 31777 - data = [[1, 4, 2, 8], [5, 7, 1, 4], [2, 8, 1, 4], [2, 8, 5, 7]] - df = pd.DataFrame(data, columns=pd.MultiIndex.from_product([[1, 2], [3, 4]])) - grp = df.groupby(np.r_[np.zeros(2), np.ones(2)]) - result = grp.agg(func) - expected_keys = [(1, 3), (1, 4), (2, 3), (2, 4)] - expected = pd.DataFrame( - {key: value for key, value in zip(expected_keys, expected_values)}, - index=Index([0.0, 1.0], dtype=float), - ) + data = [[1, 4, 2], [5, 7, 1]] + df = pd.DataFrame(data, columns=pd.MultiIndex.from_arrays([[1, 1, 2], [3, 4, 3]])) + result = df.groupby(np.array([0, 1])).agg(func) + expected_dict = {(1, 3): {0: 1, 1: 5}, (1, 4): {0: 4, 1: 7}, (2, 3): {0: 2, 1: 1}} + expected = pd.DataFrame(expected_dict) tm.assert_frame_equal(result, expected) From bf048fa41180541264afd7e44952a9ad1ce9044b Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 11 Mar 2020 13:04:09 +0000 Subject: [PATCH 10/13] Use custom functions throughout --- pandas/tests/groupby/aggregate/test_aggregate.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 4f58690a00263..1265547653d7b 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -691,7 +691,9 @@ def test_agg_relabel_multiindex_duplicates(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("func", [lambda s: s.mean(), np.mean, np.nanmean]) +@pytest.mark.parametrize( + "func", [lambda s: s.mean(), lambda s: np.mean(s), lambda s: np.nanmean(s)] +) def test_multiindex_custom_func(func): # GH 31777 data = [[1, 4, 2], [5, 7, 1]] From 4389f6a0bdee301418053234da45a1dc0e509c77 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 11 Mar 2020 14:38:37 +0000 Subject: [PATCH 11/13] Update v1.0.2.rst --- doc/source/whatsnew/v1.0.2.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index bac4824fe5b8d..63d9ce162baff 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -17,10 +17,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.to_excel` when ``columns`` kwarg is passed (:issue:`31677`) - Fixed regression in :meth:`Series.align` when ``other`` is a DataFrame and ``method`` is not None (:issue:`31785`) -- Fixed regression in :meth:`pandas.core.groupby.RollingGroupby.apply` where the ``raw`` parameter was ignored (:issue:`31754`) - Fixed regression in :meth:`Groupby.aggregate` which was failing on frames with MultiIndex columns and a custom function (:issue:`31777`) -- Fixed regression in :meth:`pandas.core.window.Rolling.corr` when using a time offset (:issue:`31789`) -- Fixed regression in :meth:`pandas.core.groupby.DataFrameGroupBy.nunique` which was modifying the original values if ``NaN`` values were present (:issue:`31950`) - Fixed regression in ``groupby(..).rolling(..).apply()`` (``RollingGroupby``) where the ``raw`` parameter was ignored (:issue:`31754`) - Fixed regression in :meth:`rolling(..).corr() ` when using a time offset (:issue:`31789`) - Fixed regression in :meth:`groupby(..).nunique() ` which was modifying the original values if ``NaN`` values were present (:issue:`31950`) From 5beabaca614f46f22cc475158a726d26a9359692 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 11 Mar 2020 15:16:37 +0000 Subject: [PATCH 12/13] make whatsnew entry consistent --- doc/source/whatsnew/v1.0.2.rst | 2 +- pandas/core/groupby/generic.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index 63d9ce162baff..d6bf8e656051d 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -17,7 +17,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.to_excel` when ``columns`` kwarg is passed (:issue:`31677`) - Fixed regression in :meth:`Series.align` when ``other`` is a DataFrame and ``method`` is not None (:issue:`31785`) -- Fixed regression in :meth:`Groupby.aggregate` which was failing on frames with MultiIndex columns and a custom function (:issue:`31777`) +- Fixed regression in :meth:`groupby(..).agg() ` which was failing on frames with MultiIndex columns and a custom function (:issue:`31777`) - Fixed regression in ``groupby(..).rolling(..).apply()`` (``RollingGroupby``) where the ``raw`` parameter was ignored (:issue:`31754`) - Fixed regression in :meth:`rolling(..).corr() ` when using a time offset (:issue:`31789`) - Fixed regression in :meth:`groupby(..).nunique() ` which was modifying the original values if ``NaN`` values were present (:issue:`31950`) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index fda66f68f7adc..b7ac3048631c5 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -955,11 +955,9 @@ def aggregate(self, func=None, *args, **kwargs): raise result = self._aggregate_frame(func) else: - # select everything except for the last level, which is the one - # containing the name of the function(s), see GH 32040 - result.columns = result.columns.rename( - [self._selected_obj.columns.name] * result.columns.nlevels - ).droplevel(-1) + result.columns = Index( + result.columns.levels[0], name=self._selected_obj.columns.name + ) if not self.as_index: self._insert_inaxis_grouper_inplace(result) From a9fae8d9e8022e097301696dd9190d2aed08df62 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Wed, 11 Mar 2020 15:18:58 +0000 Subject: [PATCH 13/13] reinstate change --- pandas/core/groupby/generic.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b7ac3048631c5..fda66f68f7adc 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -955,9 +955,11 @@ def aggregate(self, func=None, *args, **kwargs): raise result = self._aggregate_frame(func) else: - result.columns = Index( - result.columns.levels[0], name=self._selected_obj.columns.name - ) + # select everything except for the last level, which is the one + # containing the name of the function(s), see GH 32040 + result.columns = result.columns.rename( + [self._selected_obj.columns.name] * result.columns.nlevels + ).droplevel(-1) if not self.as_index: self._insert_inaxis_grouper_inplace(result)