From 432ffba3f6ccaf9466ee3110039944fddbe636f2 Mon Sep 17 00:00:00 2001 From: th3nn3ss Date: Fri, 16 Sep 2022 02:49:57 +0100 Subject: [PATCH 01/10] fix to maintain consistency for apply UDF on empty inputs --- pandas/core/groupby/ops.py | 14 +++++++++----- pandas/tests/groupby/test_apply.py | 8 ++++++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index ba808e1f2e07f..23f97fcdfd3df 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -844,7 +844,6 @@ def apply( if not mutated and not _is_indexed_like(res, group_axes, axis): mutated = True result_values.append(res) - # getattr pattern for __name__ is needed for functools.partial objects if len(group_keys) == 0 and getattr(f, "__name__", None) not in [ "idxmin", @@ -852,10 +851,15 @@ def apply( "nanargmin", "nanargmax", ]: - # If group_keys is empty, then no function calls have been made, - # so we will not have raised even if this is an invalid dtype. - # So do one dummy call here to raise appropriate TypeError. - f(data.iloc[:0]) + try: + # If group_keys is empty, then no function calls have been made, + # so we will not have raised even if this is an invalid dtype. + # So do one dummy call here to raise appropriate TypeError. 
+ f(data.iloc[:0]) + except IndexError: + # If IndexError is raised, + # maintain consistency for all operations on empty groups + pass return result_values, mutated diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index b064c12f89c21..0909b1b408708 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1331,3 +1331,11 @@ def test_result_name_when_one_group(name): expected = Series([1, 2], name=name) tm.assert_series_equal(result, expected) + + +def test_empty_df(): + empty_df = pd.DataFrame({"a": [], "b": []}) + result = empty_df.groupby("a").b.apply(lambda x: x.values[-1]) + expected = empty_df.groupby("a").b.take([0]) + + tm.assert_series_equal(result, expected) From 519fa101f26bfa89038ebb1ce2d6a23c5f2f76ad Mon Sep 17 00:00:00 2001 From: th3nn3ss Date: Fri, 16 Sep 2022 03:12:52 +0100 Subject: [PATCH 02/10] use DataFrame instead of pd.DataFrame for test --- pandas/tests/groupby/test_apply.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 0909b1b408708..79ff6d5f96d42 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1334,8 +1334,10 @@ def test_result_name_when_one_group(name): def test_empty_df(): - empty_df = pd.DataFrame({"a": [], "b": []}) + empty_df = DataFrame({"a": [], "b": []}) + + # Both operations should return an empty series instead of IndexError for apply UDF result = empty_df.groupby("a").b.apply(lambda x: x.values[-1]) - expected = empty_df.groupby("a").b.take([0]) + expected = empty_df.groupby("a").b.agg("sum") tm.assert_series_equal(result, expected) From f734276cb08dd5d210497b8e5368c057424748c0 Mon Sep 17 00:00:00 2001 From: th3nn3ss Date: Mon, 19 Sep 2022 23:12:11 +0100 Subject: [PATCH 03/10] change apply function to only handle TypeError for when df is empty and func_name is in mad, skew, sum or prod --- 
pandas/core/groupby/ops.py | 23 +++++++++-------------- pandas/tests/groupby/test_apply.py | 22 ++++++++++++++++++++-- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 23f97fcdfd3df..1344f0ce45912 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -845,21 +845,16 @@ def apply( mutated = True result_values.append(res) # getattr pattern for __name__ is needed for functools.partial objects - if len(group_keys) == 0 and getattr(f, "__name__", None) not in [ - "idxmin", - "idxmax", - "nanargmin", - "nanargmax", + if len(group_keys) == 0 and getattr(f, "__name__", None) in [ + "mad", + "skew", + "sum", + "prod", ]: - try: - # If group_keys is empty, then no function calls have been made, - # so we will not have raised even if this is an invalid dtype. - # So do one dummy call here to raise appropriate TypeError. - f(data.iloc[:0]) - except IndexError: - # If IndexError is raised, - # maintain consistency for all operations on empty groups - pass + # If group_keys is empty, then no function calls have been made, + # so we will not have raised even if this is an invalid dtype. + # So do one dummy call here to raise appropriate TypeError. 
+ f(data.iloc[:0]) return result_values, mutated diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 79ff6d5f96d42..5f8c3513b70ab 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1337,7 +1337,25 @@ def test_empty_df(): empty_df = DataFrame({"a": [], "b": []}) # Both operations should return an empty series instead of IndexError for apply UDF - result = empty_df.groupby("a").b.apply(lambda x: x.values[-1]) - expected = empty_df.groupby("a").b.agg("sum") + result = empty_df.groupby("a", group_keys=True).b.apply(lambda x: x.values[-1]) + expected = empty_df.groupby("a", group_keys=True).b.agg("sum") tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "error_type", + [ + TypeError, + ValueError, + IndexError, + ], +) +def test_udf_raise_error_on_empty_df(error_type): + empty_df = DataFrame({"a": [], "b": []}) + + def f(group): + raise error_type + + # Exception should not be raised. + empty_df.groupby("a", group_keys=True).b.apply(f) From 1ad10e123aed6e7f72c34dbe4d5238bfd8a34738 Mon Sep 17 00:00:00 2001 From: th3nn3ss Date: Thu, 22 Sep 2022 18:34:22 +0100 Subject: [PATCH 04/10] improve test for udfs on empty inputs --- pandas/tests/groupby/test_apply.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 5f8c3513b70ab..3a8928f85cc01 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -3,6 +3,7 @@ datetime, ) from io import StringIO +from tkinter import S import numpy as np import pytest @@ -1334,13 +1335,19 @@ def test_result_name_when_one_group(name): def test_empty_df(): + # GH 47985 empty_df = DataFrame({"a": [], "b": []}) # Both operations should return an empty series instead of IndexError for apply UDF - result = empty_df.groupby("a", group_keys=True).b.apply(lambda x: x.values[-1]) - expected = 
empty_df.groupby("a", group_keys=True).b.agg("sum") + result1 = empty_df.groupby("a", group_keys=True).b.apply(lambda x: x.values[-1]) + result2 = empty_df.groupby("a", group_keys=True).b.agg("sum") - tm.assert_series_equal(result, expected) + expected = Series( + [], name="b", dtype="float64", index=Index([], dtype="float64", name="a") + ) + + tm.assert_series_equal(result1, expected) + tm.assert_series_equal(result2, expected) @pytest.mark.parametrize( @@ -1352,10 +1359,16 @@ def test_empty_df(): ], ) def test_udf_raise_error_on_empty_df(error_type): + # GH 47985 empty_df = DataFrame({"a": [], "b": []}) def f(group): raise error_type # Exception should not be raised. - empty_df.groupby("a", group_keys=True).b.apply(f) + result = empty_df.groupby("a", group_keys=True).b.apply(f) + expected = Series( + [], name="b", dtype="float64", index=Index([], dtype="float64", name="a") + ) + + tm.assert_series_equal(result, expected) From 29646583b0e2c67737ab88625165c5345b81f74f Mon Sep 17 00:00:00 2001 From: th3nn3ss Date: Thu, 22 Sep 2022 18:45:06 +0100 Subject: [PATCH 05/10] fix typo --- pandas/tests/groupby/test_apply.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 3a8928f85cc01..4cc2014192533 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -3,7 +3,6 @@ datetime, ) from io import StringIO -from tkinter import S import numpy as np import pytest From 98b303024c3b021fc7a6ec770fe1e01ebb0bc4d2 Mon Sep 17 00:00:00 2001 From: th3nn3ss Date: Fri, 23 Sep 2022 19:02:53 +0100 Subject: [PATCH 06/10] remove unrelated test --- pandas/tests/groupby/test_apply.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 4cc2014192533..a1e827f4e115c 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1347,27 +1347,3 @@ def 
test_empty_df(): tm.assert_series_equal(result1, expected) tm.assert_series_equal(result2, expected) - - -@pytest.mark.parametrize( - "error_type", - [ - TypeError, - ValueError, - IndexError, - ], -) -def test_udf_raise_error_on_empty_df(error_type): - # GH 47985 - empty_df = DataFrame({"a": [], "b": []}) - - def f(group): - raise error_type - - # Exception should not be raised. - result = empty_df.groupby("a", group_keys=True).b.apply(f) - expected = Series( - [], name="b", dtype="float64", index=Index([], dtype="float64", name="a") - ) - - tm.assert_series_equal(result, expected) From 60941f4522db2480d2a5e466c73438edaa47afad Mon Sep 17 00:00:00 2001 From: th3nn3ss Date: Sun, 25 Sep 2022 01:13:39 +0100 Subject: [PATCH 07/10] change test for empty df --- pandas/tests/groupby/test_apply.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index a1e827f4e115c..7ec978460803c 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1333,17 +1333,19 @@ def test_result_name_when_one_group(name): tm.assert_series_equal(result, expected) -def test_empty_df(): +@pytest.mark.parametrize( + "apply_func", [lambda x: x.values[-1], lambda gb: gb["b"].iloc[0]] +) +@pytest.mark.parametrize("op", ["mad", "skew", "sum", "prod"]) +def test_empty_df(apply_func, op): # GH 47985 empty_df = DataFrame({"a": [], "b": []}) + gb = empty_df.groupby("a", group_keys=True) + group = getattr(gb, "b") - # Both operations should return an empty series instead of IndexError for apply UDF - result1 = empty_df.groupby("a", group_keys=True).b.apply(lambda x: x.values[-1]) - result2 = empty_df.groupby("a", group_keys=True).b.agg("sum") - + result = group.apply(apply_func) if apply_func else group.agg(op) expected = Series( [], name="b", dtype="float64", index=Index([], dtype="float64", name="a") ) - tm.assert_series_equal(result1, expected) - 
tm.assert_series_equal(result2, expected) + tm.assert_series_equal(result, expected) From ee08f0fb1e36c4fb0bfa74b92fa2371558ad9192 Mon Sep 17 00:00:00 2001 From: th3nn3ss Date: Mon, 26 Sep 2022 07:49:24 +0100 Subject: [PATCH 08/10] fix test for udf on empty df --- pandas/tests/groupby/test_apply.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 7ec978460803c..47ea6a99ffea9 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1334,16 +1334,23 @@ def test_result_name_when_one_group(name): @pytest.mark.parametrize( - "apply_func", [lambda x: x.values[-1], lambda gb: gb["b"].iloc[0]] + "method, op", + [ + ("apply", lambda gb: gb.values[-1]), + ("apply", lambda gb: gb["b"].iloc[0]), + ("agg", "mad"), + ("agg", "skew"), + ("agg", "prod"), + ("agg", "sum"), + ], ) -@pytest.mark.parametrize("op", ["mad", "skew", "sum", "prod"]) -def test_empty_df(apply_func, op): +def test_empty_df(method, op): # GH 47985 empty_df = DataFrame({"a": [], "b": []}) gb = empty_df.groupby("a", group_keys=True) group = getattr(gb, "b") - result = group.apply(apply_func) if apply_func else group.agg(op) + result = getattr(group, method)(op) expected = Series( [], name="b", dtype="float64", index=Index([], dtype="float64", name="a") ) From 348b14affce7e80024fcc00fe9e1c8a3952d5e97 Mon Sep 17 00:00:00 2001 From: th3nn3ss Date: Tue, 27 Sep 2022 00:26:28 +0100 Subject: [PATCH 09/10] add to whatsnew documentation --- doc/source/whatsnew/v1.5.1.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v1.5.1.rst b/doc/source/whatsnew/v1.5.1.rst index da0bd746e3da5..4eec6f2927b3a 100644 --- a/doc/source/whatsnew/v1.5.1.rst +++ b/doc/source/whatsnew/v1.5.1.rst @@ -86,6 +86,9 @@ Bug fixes - Bug in :meth:`DataFrame.pivot_table` raising unexpected ``FutureWarning`` when setting datetime column as index (:issue:`48683`) - +Groupby +^^^^^^^ 
+- Bug in :meth:`DataFrameGroupBy.apply` invokes user defined function when called on an empty dataframe (:issue:`47985`) .. --------------------------------------------------------------------------- .. _whatsnew_151.other: From 6972fd4eefd1b1ec4f882dca68ce788197ebf16c Mon Sep 17 00:00:00 2001 From: th3nn3ss Date: Tue, 27 Sep 2022 00:39:18 +0100 Subject: [PATCH 10/10] fix whatsnew v1.5.1 documentation --- doc/source/whatsnew/v1.5.1.rst | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.5.1.rst b/doc/source/whatsnew/v1.5.1.rst index 3dcad77c72969..3157723167020 100644 --- a/doc/source/whatsnew/v1.5.1.rst +++ b/doc/source/whatsnew/v1.5.1.rst @@ -77,6 +77,7 @@ Fixed regressions - Fixed performance regression in :func:`factorize` when ``na_sentinel`` is not ``None`` and ``sort=False`` (:issue:`48620`) - Fixed regression causing an ``AttributeError`` during warning emitted if the provided table name in :meth:`DataFrame.to_sql` and the table name actually used in the database do not match (:issue:`48733`) - Fixed :meth:`.DataFrameGroupBy.size` not returning a Series when ``axis=1`` (:issue:`48738`) +- Fixed regression in :meth:`.DataFrameGroupBy.apply` when a user-defined function is called on an empty dataframe (:issue:`47985`) .. --------------------------------------------------------------------------- @@ -90,9 +91,6 @@ Bug fixes - Bug in :meth:`DataFrame.pivot_table` raising unexpected ``FutureWarning`` when setting datetime column as index (:issue:`48683`) - -Groupby -^^^^^^^ -- Bug in :meth:`DataFrameGroupBy.apply` invokes user defined function when called on an empty dataframe (:issue:`47985`) .. --------------------------------------------------------------------------- .. _whatsnew_151.other: