From 94b3bdbcf24562afb58403543134bf3f75c3e8f1 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 29 Mar 2020 03:10:49 +0200 Subject: [PATCH 1/9] BUG: GroupBy Doesn't Always Maintain Column Index Name #29764 --- pandas/core/groupby/generic.py | 8 ++++++-- pandas/core/groupby/groupby.py | 6 +++++- pandas/tests/groupby/test_groupby.py | 16 ++++++++++++++++ 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b7c071a8dfbbf..e8dfbab0da248 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1687,8 +1687,10 @@ def _wrap_aggregated_output( ------- DataFrame """ + idx_name = output.get("idx_name") + output.pop("idx_name", None) indexed_output = {key.position: val for key, val in output.items()} - columns = Index(key.label for key in output) + columns = Index([key.label for key in output], name=idx_name) result = DataFrame(indexed_output) result.columns = columns @@ -1720,8 +1722,10 @@ def _wrap_transformed_output( ------- DataFrame """ + idx_name = output.get("idx_name") + output.pop("idx_name", None) indexed_output = {key.position: val for key, val in output.items()} - columns = Index(key.label for key in output) + columns = Index([key.label for key in output], name=idx_name) result = DataFrame(indexed_output) result.columns = columns diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 86171944d0c78..31a4776bf81c8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2235,8 +2235,12 @@ def _get_cythonized_result( grouper = self.grouper labels, _, ngroups = grouper.group_info - output: Dict[base.OutputKey, np.ndarray] = {} + output: Dict[base.OutputKey, np.ndarray, str:str] = {} base_func = getattr(libgroupby, how) + from pandas.core.groupby.generic import DataFrameGroupBy + + if isinstance(self, DataFrameGroupBy): + output["idx_name"] = self.dtypes.columns.name for idx, obj in enumerate(self._iterate_slices()): name = obj.name diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b8d8f56512a69..ef0b7ac07f4f2 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2057,3 +2057,19 @@ def test_groups_repr_truncates(max_seq_items, expected): result = df.groupby(np.array(df.a)).groups.__repr__() assert result == expected + + +def test_groupby_column_index_name_lost(): + # GH: 29764 groupby loses index sometimes + df = pd.DataFrame([[1]], columns=pd.Index(["a"], name="idx")) + result = df.groupby([1]).sum() + expected = pd.DataFrame([1], columns=pd.Index(["a"], name="idx"), index=[1]) + tm.assert_frame_equal(result, expected) + + result = df.groupby([1]).any() + expected = pd.DataFrame([True], columns=pd.Index(["a"], name="idx"), index=[1]) + tm.assert_frame_equal(result, expected) + + result = df.groupby([1]).shift() + expected = pd.DataFrame([np.nan], columns=pd.Index(["a"], name="idx"), index=[0]) + tm.assert_frame_equal(result, expected) From 4b2012527218ff855a04483ec8af305a1203eb7a Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 29 Mar 2020 03:14:00 +0200 Subject: [PATCH 2/9] BUG: Add whats new entry #29764 --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 20415bba99476..30d88288c6a4f 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -403,6 +403,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) - Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean series (:issue:`32894`) +- Bug in :meth:`DataFrame.groupby` does not always maintain column index name for some functions (:issue:`29764`) Reshaping From 39825cc0dcb81e0c516889de6df9c4df5965b739 Mon Sep 17 00:00:00 2001 From: patrick Date: Sun, 29 Mar 2020 03:42:26 +0200 Subject: [PATCH 3/9] BUG: Modify whats new entry and shorten pop onto dict --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/groupby/generic.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 30d88288c6a4f..ff6d1dec488d3 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -403,7 +403,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) - Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean series (:issue:`32894`) -- Bug in :meth:`DataFrame.groupby` does not always maintain column index name for some functions (:issue:`29764`) +- Bug in :meth:`DataFrame.groupby` does not always maintain column index name for some functions (:issue:`29764`); affected functions: any, all, bfill, ffill, shift Reshaping diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index e8dfbab0da248..d1f4a678bbbfa 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1687,8 +1687,7 @@ def _wrap_aggregated_output( ------- DataFrame """ - idx_name = output.get("idx_name") - output.pop("idx_name", None) + idx_name = output.pop("idx_name", None) indexed_output = {key.position: val for key, val in output.items()} columns = Index([key.label for key in output], name=idx_name) @@ -1722,8 +1721,7 @@ def _wrap_transformed_output( ------- DataFrame """ - idx_name = output.get("idx_name") - output.pop("idx_name", None) + idx_name = output.pop("idx_name", None) indexed_output = {key.position: val for key, val in output.items()} columns = Index([key.label for key in output], name=idx_name) From d61ac2b508339ed745b4aa586dbbb0969e549585 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 29 Mar 2020 03:50:05 +0200 Subject: [PATCH 4/9] BUG: Modify whats new entry --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index ff6d1dec488d3..22fc95e84ee97 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -403,7 +403,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) - Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean series (:issue:`32894`) -- Bug in :meth:`DataFrame.groupby` does not always maintain column index name for some functions (:issue:`29764`); affected functions: any, all, bfill, ffill, shift +- Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`) Reshaping From 85ff9c850015662045550bf1c785b95c4b041689 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 29 Mar 2020 12:46:10 +0200 Subject: [PATCH 5/9] BUG: Avoid runtime import --- pandas/core/groupby/groupby.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 31a4776bf81c8..c3917469c565b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2237,10 +2237,9 @@ def _get_cythonized_result( labels, _, ngroups = grouper.group_info output: Dict[base.OutputKey, np.ndarray, str:str] = {} base_func = getattr(libgroupby, how) - from pandas.core.groupby.generic import DataFrameGroupBy - - if isinstance(self, DataFrameGroupBy): - output["idx_name"] = self.dtypes.columns.name + obj = self._selected_obj + if isinstance(obj, DataFrame): + output["idx_name"] = getattr(getattr(obj, "columns"), "name") for idx, obj in enumerate(self._iterate_slices()): name = obj.name From 1b765b363ebcf3df6bba40baba7da79f57bfc354 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 29 Mar 2020 20:59:00 +0200 Subject: [PATCH 6/9] BUG: Add new test and change getattr --- pandas/core/groupby/groupby.py | 2 +- pandas/tests/groupby/test_groupby.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c3917469c565b..2ac94dab220df 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2239,7 +2239,7 @@ def _get_cythonized_result( base_func = getattr(libgroupby, how) obj = self._selected_obj if isinstance(obj, DataFrame): - output["idx_name"] = getattr(getattr(obj, "columns"), "name") + output["idx_name"] = obj.columns.name for idx, obj in enumerate(self._iterate_slices()): name = obj.name diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index ef0b7ac07f4f2..20d7eb82ef9eb 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2073,3 +2073,21 @@ def test_groupby_column_index_name_lost(): result = df.groupby([1]).shift() expected = pd.DataFrame([np.nan], columns=pd.Index(["a"], name="idx"), index=[0]) tm.assert_frame_equal(result, expected) + + df = pd.DataFrame( + [[1, 1.0, -1.0], [1, np.nan, np.nan], [1, 2.0, -2.0]], + columns=pd.Index(["type", "a", "b"], name="idx"), + ) + result = df.groupby(["type"])[["a", "b"]].ffill() + expected = pd.DataFrame( + [[1.0, -1.0], [1.0, -1.0], [2.0, -2.0]], + columns=pd.Index(["a", "b"], name="idx"), + ) + tm.assert_frame_equal(result, expected) + + result = df.groupby(["type"])[["a", "b"]].bfill() + expected = pd.DataFrame( + [[1.0, -1.0], [2.0, -2.0], [2.0, -2.0]], + columns=pd.Index(["a", "b"], name="idx"), + ) + tm.assert_frame_equal(result, expected) From 1c3b16ceab747d0f29c7d6b3fc57cad26f247c15 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 12 Apr 2020 19:28:55 +0200 Subject: [PATCH 7/9] Parametrize unittests --- pandas/tests/groupby/test_groupby.py | 39 ++++++++++------------------ 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 20d7eb82ef9eb..d615fca90fe0d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2059,35 +2059,24 @@ def test_groups_repr_truncates(max_seq_items, expected): assert result == expected -def test_groupby_column_index_name_lost(): +@pytest.mark.parametrize("func", ["sum", "any", "shift"]) +def test_groupby_column_index_name_lost(func): # GH: 29764 groupby loses index sometimes - df = pd.DataFrame([[1]], columns=pd.Index(["a"], name="idx")) - result = df.groupby([1]).sum() - expected = pd.DataFrame([1], columns=pd.Index(["a"], name="idx"), index=[1]) - tm.assert_frame_equal(result, expected) - - result = df.groupby([1]).any() - expected = pd.DataFrame([True], columns=pd.Index(["a"], name="idx"), index=[1]) - tm.assert_frame_equal(result, expected) + expected = pd.Index(["a"], name="idx") + df = pd.DataFrame([[1]], columns=expected) + df_grouped = df.groupby([1]) + result = getattr(df_grouped, func)().columns + tm.assert_index_equal(result, expected) - result = df.groupby([1]).shift() - expected = pd.DataFrame([np.nan], columns=pd.Index(["a"], name="idx"), index=[0]) - tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("func", ["ffill", "bfill"]) +def test_groupby_column_index_name_lost_fill_funcs(func): + # GH: 29764 groupby loses index sometimes df = pd.DataFrame( [[1, 1.0, -1.0], [1, np.nan, np.nan], [1, 2.0, -2.0]], columns=pd.Index(["type", "a", "b"], name="idx"), ) - result = df.groupby(["type"])[["a", "b"]].ffill() - expected = pd.DataFrame( - [[1.0, -1.0], [1.0, -1.0], [2.0, -2.0]], - columns=pd.Index(["a", "b"], name="idx"), - ) - tm.assert_frame_equal(result, expected) - - result = df.groupby(["type"])[["a", "b"]].bfill() - expected = pd.DataFrame( - [[1.0, -1.0], [2.0, -2.0], [2.0, -2.0]], - columns=pd.Index(["a", "b"], name="idx"), - ) - tm.assert_frame_equal(result, expected) + df_grouped = df.groupby(["type"])[["a", "b"]] + result = getattr(df_grouped, func)().columns + expected = pd.Index(["a", "b"], name="idx") + tm.assert_index_equal(result, expected) From 2d8026fded6c7acab02e9c66acd074876359b1aa Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 20 Apr 2020 00:19:48 +0200 Subject: [PATCH 8/9] Change type hint --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index fcf8ad573e530..9489e685cc0fa 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2278,7 +2278,7 @@ def _get_cythonized_result( grouper = self.grouper labels, _, ngroups = grouper.group_info - output: Dict[base.OutputKey, np.ndarray, str:str] = {} + output: Dict[Union[base.OutputKey, str], Union[np.ndarray, str]] = {} base_func = getattr(libgroupby, how) obj = self._selected_obj if isinstance(obj, DataFrame): From b750394b0c4b04b615d4378bb6855186e76f2152 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 4 Aug 2020 15:38:28 +0200 Subject: [PATCH 9/9] Move whats new entry to 1.2 --- doc/source/whatsnew/v1.2.0.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index b16ca0a80c5b4..010c8a22dcc9f 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -132,9 +132,8 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ +- Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`) - -- - Reshaping ^^^^^^^^^