From 784e62108d9f5f438b2b220439744b0c5792e37b Mon Sep 17 00:00:00 2001 From: pilkibun Date: Thu, 25 Jul 2019 18:05:28 -0500 Subject: [PATCH 01/15] BUG: do not broadcast result of transformations in transform --- doc/source/whatsnew/v1.0.0.rst | 2 ++ pandas/core/groupby/base.py | 5 +++-- pandas/core/groupby/generic.py | 15 ++++++++++---- pandas/tests/groupby/conftest.py | 9 ++++++++- pandas/tests/groupby/test_transform.py | 28 ++++++++++++++++++++++++++ 5 files changed, 52 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index c352a36bf6de1..63629f26a44c7 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -78,6 +78,8 @@ Performance improvements Bug fixes ~~~~~~~~~ +- Previously, when :class:`pandas.core.groupby.GroupBy.transform` was passed the name of a transformation (`rank`, `ffill`, etc') it broadcast the results resulting in erroneous values. only reduction results and not the results of transformations. The return value in thse cases is the same as calling the named function +directly (e.g. `g.rank()`) (:issue:`14274`) (:issue:`19354`) (:issue:`22509`). Categorical ^^^^^^^^^^^ diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index fc3bb69afd0cb..2a9ea1d094e9d 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -144,7 +144,6 @@ def _gotitem(self, key, ndim, subset=None): [ "backfill", "bfill", - "corrwith", "cumcount", "cummax", "cummin", @@ -173,6 +172,8 @@ def _gotitem(self, key, ndim, subset=None): # are neither a transformation nor a reduction "corr", "cov", + # corrwith does not preserve shape, depending on `other` + "corrwith", "describe", "dtypes", "expanding", @@ -197,4 +198,4 @@ def _gotitem(self, key, ndim, subset=None): # Valid values of `name` for `groupby.transform(name)` # NOTE: do NOT edit this directly. New additions should be inserted # into the appropriate list above. -transform_kernel_whitelist = reduction_kernels | transformation_kernels +groupby_transform_whitelist = reduction_kernels | transformation_kernels diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1fef65349976b..737a763dac194 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -576,7 +576,7 @@ def transform(self, func, *args, **kwargs): func = self._get_cython_func(func) or func if isinstance(func, str): - if not (func in base.transform_kernel_whitelist): + if not (func in base.groupby_transform_whitelist): msg = "'{func}' is not a valid function name for transform(name)" raise ValueError(msg.format(func=func)) if func in base.cythonized_kernels: @@ -615,7 +615,11 @@ def _transform_fast(self, result, obj, func_nm): ids, _, ngroup = self.grouper.group_info output = [] for i, _ in enumerate(result.columns): - res = algorithms.take_1d(result.iloc[:, i].values, ids) + if func_nm in base.reduction_kernels: + # only broadcast results if we performed a reduction + res = algorithms.take_1d(result.iloc[:, i].values, ids) + else: + res = result.iloc[:, i].values if cast: res = self._try_cast(res, obj.iloc[:, i]) output.append(res) @@ -1014,7 +1018,7 @@ def transform(self, func, *args, **kwargs): func = self._get_cython_func(func) or func if isinstance(func, str): - if not (func in base.transform_kernel_whitelist): + if not (func in base.groupby_transform_whitelist): msg = "'{func}' is not a valid function name for transform(name)" raise ValueError(msg.format(func=func)) if func in base.cythonized_kernels: @@ -1072,7 +1076,10 @@ def _transform_fast(self, func, func_nm): ids, _, ngroup = self.grouper.group_info cast = self._transform_should_cast(func_nm) - out = algorithms.take_1d(func()._values, ids) + out = func() + if func_nm in base.reduction_kernels: + # only broadcast results if we performed a reduction + out = algorithms.take_1d(out._values, ids) if cast: out = self._try_cast(out, self.obj) return Series(out, index=self.obj.index, name=self.obj.name) diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 72e60c5099304..5c6caa6790153 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, MultiIndex -from pandas.core.groupby.base import reduction_kernels +from pandas.core.groupby.base import reduction_kernels, transformation_kernels from pandas.util import testing as tm @@ -105,6 +105,13 @@ def three_group(): ) +@pytest.fixture(params=sorted(transformation_kernels)) +def transformation_func(request): + """yields the string names of all groupby reduction functions, one at a time. + """ + return request.param + + @pytest.fixture(params=sorted(reduction_kernels)) def reduction_func(request): """yields the string names of all groupby reduction functions, one at a time. diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index d3972e6ba9008..04cac0e910d5c 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -1052,6 +1052,34 @@ def test_transform_agg_by_name(reduction_func, obj): assert len(set(DataFrame(result).iloc[-3:, -1])) == 1 +@pytest.mark.parametrize( + "obj", + [ + DataFrame( + dict(a=[0, 0, 0, 1, 1, 1], b=range(6)), index=["A", "B", "C", "D", "E", "F"] + ), + Series([0, 0, 0, 1, 1, 1], index=["A", "B", "C", "D", "E", "F"]), + ], +) +def test_transform_transformation_by_name(transformation_func, obj): + func = transformation_func + g = obj.groupby(np.repeat([0, 1], 3)) + + if func.startswith("cum"): # BUG + pytest.xfail("skip cum* function tests") + if func == "tshift": # BUG + pytest.xfail("tshift") + + args = {"fillna": [0]}.get(func, []) + + result = g.transform(func, *args) + expected = getattr(g, func)(*args) + if isinstance(obj, DataFrame): + tm.assert_frame_equal(result, expected) + else: + tm.assert_series_equal(result, expected) + + def test_transform_lambda_with_datetimetz(): # GH 27496 df = DataFrame( From 4ad587e341c01d1194303e964b133a191388c0e8 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Thu, 25 Jul 2019 18:22:13 -0500 Subject: [PATCH 02/15] comment --- pandas/tests/groupby/test_transform.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 04cac0e910d5c..cea3b50802b83 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -1073,6 +1073,8 @@ def test_transform_transformation_by_name(transformation_func, obj): args = {"fillna": [0]}.get(func, []) result = g.transform(func, *args) + # for transformations, g.transform.(name) should return the same result + # as `g.name()` expected = getattr(g, func)(*args) if isinstance(obj, DataFrame): tm.assert_frame_equal(result, expected) From b6216af11fdc4ff7448b623c3e064cbe1a8555da Mon Sep 17 00:00:00 2001 From: pilkibun Date: Thu, 25 Jul 2019 18:22:40 -0500 Subject: [PATCH 03/15] doc --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 63629f26a44c7..d03133cbd9fc2 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -78,7 +78,7 @@ Performance improvements Bug fixes ~~~~~~~~~ -- Previously, when :class:`pandas.core.groupby.GroupBy.transform` was passed the name of a transformation (`rank`, `ffill`, etc') it broadcast the results resulting in erroneous values. only reduction results and not the results of transformations. The return value in thse cases is the same as calling the named function +- Previously, when :class:`pandas.core.groupby.GroupBy.transform` was passed the name of a transformation (e.g. `rank`, `ffill`, etc') it broadcast the results resulting in erroneous values. only reduction results and not the results of transformations. The return value in thse cases is the same as calling the named function directly (e.g. `g.rank()`) (:issue:`14274`) (:issue:`19354`) (:issue:`22509`). Categorical From 15597f54bf09c883e971cdcd666be75c7f33f320 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Thu, 25 Jul 2019 18:23:01 -0500 Subject: [PATCH 04/15] doc --- doc/source/whatsnew/v1.0.0.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index d03133cbd9fc2..cc3c6336cb8b9 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -78,8 +78,7 @@ Performance improvements Bug fixes ~~~~~~~~~ -- Previously, when :class:`pandas.core.groupby.GroupBy.transform` was passed the name of a transformation (e.g. `rank`, `ffill`, etc') it broadcast the results resulting in erroneous values. only reduction results and not the results of transformations. The return value in thse cases is the same as calling the named function -directly (e.g. `g.rank()`) (:issue:`14274`) (:issue:`19354`) (:issue:`22509`). +- Previously, when :class:`pandas.core.groupby.GroupBy.transform` was passed the name of a transformation (e.g. `rank`, `ffill`, etc') it broadcast the results resulting in erroneous values. only reduction results and not the results of transformations. The return value in thse cases is the same as calling the named function directly (e.g. `g.rank()`) (:issue:`14274`) (:issue:`19354`) (:issue:`22509`). Categorical ^^^^^^^^^^^ From 0f77cbfb93da4466bb4a3d5f76b7bfc4ae842ad0 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Thu, 25 Jul 2019 20:00:36 -0500 Subject: [PATCH 05/15] doc --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index cc3c6336cb8b9..eb25bc71eac3b 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -78,7 +78,7 @@ Performance improvements Bug fixes ~~~~~~~~~ -- Previously, when :class:`pandas.core.groupby.GroupBy.transform` was passed the name of a transformation (e.g. `rank`, `ffill`, etc') it broadcast the results resulting in erroneous values. only reduction results and not the results of transformations. The return value in thse cases is the same as calling the named function directly (e.g. `g.rank()`) (:issue:`14274`) (:issue:`19354`) (:issue:`22509`). +- Previously, when :class:`pandas.core.groupby.GroupBy.transform` was passed the name of a transformation (e.g. `rank`, `ffill`, etc) it broadcast the results resulting in erroneous values. Only reduction results and not the results of transformations. The return value in these cases is the same as calling the named function directly (e.g. `g.rank()`) (:issue:`14274`) (:issue:`19354`) (:issue:`22509`). Categorical ^^^^^^^^^^^ From 880b2d84997b0a039998bd035c1b58defdf9e15c Mon Sep 17 00:00:00 2001 From: pilkibun Date: Thu, 25 Jul 2019 20:01:46 -0500 Subject: [PATCH 06/15] use _values --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 737a763dac194..11ac797992ae2 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -617,7 +617,7 @@ def _transform_fast(self, result, obj, func_nm): for i, _ in enumerate(result.columns): if func_nm in base.reduction_kernels: # only broadcast results if we performed a reduction - res = algorithms.take_1d(result.iloc[:, i].values, ids) + res = algorithms.take_1d(result.iloc[:, i]._values, ids) else: res = result.iloc[:, i].values if cast: From 36a15f29d258a157b7817634b9debd3f1fe6363b Mon Sep 17 00:00:00 2001 From: pilkibun Date: Fri, 26 Jul 2019 10:50:17 -0500 Subject: [PATCH 07/15] CLN: reserve g.transform(name) exclusively for broadcasting aggregations --- pandas/core/groupby/base.py | 2 +- pandas/core/groupby/generic.py | 22 +++++++++++-- pandas/tests/groupby/test_transform.py | 44 +++++++++----------------- 3 files changed, 36 insertions(+), 32 deletions(-) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 2a9ea1d094e9d..b23a75747e737 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -198,4 +198,4 @@ def _gotitem(self, key, ndim, subset=None): # Valid values of `name` for `groupby.transform(name)` # NOTE: do NOT edit this directly. New additions should be inserted # into the appropriate list above. -groupby_transform_whitelist = reduction_kernels | transformation_kernels +groupby_transform_whitelist = reduction_kernels diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 11ac797992ae2..ec0c6beed2638 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -577,7 +577,16 @@ def transform(self, func, *args, **kwargs): if isinstance(func, str): if not (func in base.groupby_transform_whitelist): - msg = "'{func}' is not a valid function name for transform(name)" + msg = ( + "`g.transform('func')` is used exclusively for " + "computing aggregations and broadcasting the results across groups. " + "'{func}' is not a valid aggregation name. " + ) + + if func in base.transformation_kernels | base.groupby_other_methods: + msg += "Perhaps you should try .{func}() instead?".format( + func=func + ) raise ValueError(msg.format(func=func)) if func in base.cythonized_kernels: # cythonized transformation or canned "reduction+broadcast" @@ -1019,7 +1028,16 @@ def transform(self, func, *args, **kwargs): if isinstance(func, str): if not (func in base.groupby_transform_whitelist): - msg = "'{func}' is not a valid function name for transform(name)" + msg = ( + "`g.transform('func')` is used exclusively for " + "computing aggregations and broadcasting the results across groups. " + "'{func}' is not a valid aggregation name. " + ) + + if func in base.transformation_kernels | base.groupby_other_methods: + msg += "Perhaps you should try .{func}() instead?".format( + func=func + ) raise ValueError(msg.format(func=func)) if func in base.cythonized_kernels: # cythonized transform or canned "agg+broadcast" diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index cea3b50802b83..aabf436a0c695 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -1007,17 +1007,19 @@ def test_transform_invalid_name_raises(): # GH#27486 df = DataFrame(dict(a=[0, 1, 1, 2])) g = df.groupby(["a", "b", "b", "c"]) - with pytest.raises(ValueError, match="not a valid function name"): + with pytest.raises(ValueError, match="exclusively"): g.transform("some_arbitrary_name") # method exists on the object, but is not a valid transformation/agg + # make sure the error suggests using the method directly. assert hasattr(g, "aggregate") # make sure the method exists - with pytest.raises(ValueError, match="not a valid function name"): + with pytest.raises(ValueError, match="exclusively.+you should try"): g.transform("aggregate") # Test SeriesGroupBy - g = df["a"].groupby(["a", "b", "b", "c"]) - with pytest.raises(ValueError, match="not a valid function name"): + ser = Series(range(4)) + g = ser.groupby(["a", "b", "b", "c"]) + with pytest.raises(ValueError, match="exclusively"): g.transform("some_arbitrary_name") @@ -1052,34 +1054,18 @@ def test_transform_agg_by_name(reduction_func, obj): assert len(set(DataFrame(result).iloc[-3:, -1])) == 1 -@pytest.mark.parametrize( - "obj", - [ - DataFrame( - dict(a=[0, 0, 0, 1, 1, 1], b=range(6)), index=["A", "B", "C", "D", "E", "F"] - ), - Series([0, 0, 0, 1, 1, 1], index=["A", "B", "C", "D", "E", "F"]), - ], -) -def test_transform_transformation_by_name(transformation_func, obj): +def test_transform_transformation_by_name(transformation_func): + """Make sure g.transform('name') raises a helpful error for non-agg + """ func = transformation_func + obj = DataFrame( + dict(a=[0, 0, 0, 1, 1, 1], b=range(6)), index=["A", "B", "C", "D", "E", "F"] + ) g = obj.groupby(np.repeat([0, 1], 3)) - if func.startswith("cum"): # BUG - pytest.xfail("skip cum* function tests") - if func == "tshift": # BUG - pytest.xfail("tshift") - - args = {"fillna": [0]}.get(func, []) - - result = g.transform(func, *args) - # for transformations, g.transform.(name) should return the same result - # as `g.name()` - expected = getattr(g, func)(*args) - if isinstance(obj, DataFrame): - tm.assert_frame_equal(result, expected) - else: - tm.assert_series_equal(result, expected) + match = "exclusively for.+you should try" + with pytest.raises(ValueError, match=match): + g.transform(func) def test_transform_lambda_with_datetimetz(): From 0c77a37ae658388c01311e5e7251cdc5a1727b31 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Fri, 26 Jul 2019 11:06:09 -0500 Subject: [PATCH 08/15] rank is not an agg --- pandas/tests/groupby/test_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index aabf436a0c695..f5f3267470c3a 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -765,7 +765,7 @@ def test_transform_with_non_scalar_group(): ), ], ) -@pytest.mark.parametrize("agg_func", ["count", "rank", "size"]) +@pytest.mark.parametrize("agg_func", ["count", "size"]) def test_transform_numeric_ret(cols, exp, comp_func, agg_func): if agg_func == "size" and isinstance(cols, list): pytest.xfail("'size' transformation not supported with NDFrameGroupy") From 09703f8d005aa36943c7b590bfe10104d8f1c5ce Mon Sep 17 00:00:00 2001 From: pilkibun Date: Fri, 26 Jul 2019 11:00:26 -0500 Subject: [PATCH 09/15] fix test --- pandas/tests/groupby/test_transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index f5f3267470c3a..39df4d6c81959 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -632,7 +632,7 @@ def test_cython_transform_series(op, args, targop): ) def test_groupby_cum_skipna(op, skipna, input, exp): df = pd.DataFrame(input) - result = df.groupby("key")["value"].transform(op, skipna=skipna) + result = getattr(df.groupby("key")["value"],op)(skipna=skipna) if isinstance(exp, dict): expected = exp[(op, skipna)] else: From 8eeb01a655ab7a2d8a5f77619a75409cc579dc6a Mon Sep 17 00:00:00 2001 From: pilkibun Date: Fri, 26 Jul 2019 11:05:42 -0500 Subject: [PATCH 10/15] fix tests --- pandas/tests/groupby/test_transform.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 39df4d6c81959..56622ce24e9bf 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -581,7 +581,7 @@ def test_cython_transform_series(op, args, targop): # print(data.head()) expected = data.groupby(labels).transform(targop) - tm.assert_series_equal(expected, data.groupby(labels).transform(op, *args)) + tm.assert_series_equal(expected, getattr(data.groupby(labels), op)(*args)) tm.assert_series_equal(expected, getattr(data.groupby(labels), op)(*args)) @@ -632,7 +632,7 @@ def test_cython_transform_series(op, args, targop): ) def test_groupby_cum_skipna(op, skipna, input, exp): df = pd.DataFrame(input) - result = getattr(df.groupby("key")["value"],op)(skipna=skipna) + result = getattr(df.groupby("key")["value"], op)(skipna=skipna) if isinstance(exp, dict): expected = exp[(op, skipna)] else: @@ -710,20 +710,17 @@ def test_cython_transform_frame(op, args, targop): expected = gb.apply(targop) expected = expected.sort_index(axis=1) - tm.assert_frame_equal(expected, gb.transform(op, *args).sort_index(axis=1)) tm.assert_frame_equal(expected, getattr(gb, op)(*args).sort_index(axis=1)) # individual columns for c in df: if c not in ["float", "int", "float_missing"] and op != "shift": msg = "No numeric types to aggregate" - with pytest.raises(DataError, match=msg): - gb[c].transform(op) with pytest.raises(DataError, match=msg): getattr(gb[c], op)() else: expected = gb[c].apply(targop) expected.name = c - tm.assert_series_equal(expected, gb[c].transform(op, *args)) + tm.assert_series_equal(expected, getattr(gb[c], op)(*args)) tm.assert_series_equal(expected, getattr(gb[c], op)(*args)) From c0a71ce6eaf90e1ddc366afe840d739e2d03721e Mon Sep 17 00:00:00 2001 From: pilkibun Date: Fri, 26 Jul 2019 11:35:45 -0500 Subject: [PATCH 11/15] checks --- pandas/core/groupby/generic.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ec0c6beed2638..647dc63326767 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -579,14 +579,14 @@ def transform(self, func, *args, **kwargs): if not (func in base.groupby_transform_whitelist): msg = ( "`g.transform('func')` is used exclusively for " - "computing aggregations and broadcasting the results across groups. " + "computing aggregations and broadcasting " + "the results across groups. " "'{func}' is not a valid aggregation name. " ) if func in base.transformation_kernels | base.groupby_other_methods: - msg += "Perhaps you should try .{func}() instead?".format( - func=func - ) + msg += "Perhaps you should try .{func}() instead?" + msg = msg.format(func=func) raise ValueError(msg.format(func=func)) if func in base.cythonized_kernels: # cythonized transformation or canned "reduction+broadcast" @@ -1030,14 +1030,14 @@ def transform(self, func, *args, **kwargs): if not (func in base.groupby_transform_whitelist): msg = ( "`g.transform('func')` is used exclusively for " - "computing aggregations and broadcasting the results across groups. " + "computing aggregations and broadcasting " + "the results across groups. " "'{func}' is not a valid aggregation name. " ) if func in base.transformation_kernels | base.groupby_other_methods: - msg += "Perhaps you should try .{func}() instead?".format( - func=func - ) + msg += "Perhaps you should try .{func}() instead?" + msg = msg.format(func=func) raise ValueError(msg.format(func=func)) if func in base.cythonized_kernels: # cythonized transform or canned "agg+broadcast" From 2ce2bb7a3196ad1dee74c38a4bcedd8f6be97fcc Mon Sep 17 00:00:00 2001 From: pilkibun Date: Fri, 26 Jul 2019 12:12:34 -0500 Subject: [PATCH 12/15] whatsnew --- doc/source/whatsnew/v1.0.0.rst | 57 ++++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index eb25bc71eac3b..707aab563cee6 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -39,8 +39,59 @@ Backwards incompatible API changes .. _whatsnew_1000.api.other: -- :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`). -- +Groupby.transform(name) accepts only aggregation names +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`DataFrameGroupBy.transform` and :meth:`SeriesGroupBy.transform` are used to either invoked +a callable, or to convert an aggregation into a transformation by replicating the resulting aggregation +value for each across the whole group. In previous releases, these methods did not +validate that the function name passed was actually the name of an aggregation. As a result, +users might get a cryptic error or worse, erroneous results. Starting with this release, passing +either of these method the name of anything except a known aggregation function name will raise +an exception. There is no change in the behavior associated with passing a callable. + +Users who relied on :meth:`DataFrameGroupBy.transform` or :meth:`SeriesGroupBy.transform` +for transformations such as :meth:`DataFrameGroupBy.rank`, :meth:`DataFrameGroupBy.ffill`, +etc, should instead call these method directly +(:issue:`27597`) (:issue:`14274`) (:issue:`19354`) (:issue:`22509`). + +.. ipython:: python + + df = pd.DataFrame([0,1,100,99]) + labels = [0,0,1,1] + g = df.groupby(labels) + +*Previous behavior*: + +.. code-block:: ipython + + In [1]: g.transform('ers >= Decepticons') + AttributeError: 'DataFrameGroupBy' object has no attribute 'ers >= Decepticons' + + g.transform('rank') + Out[14]: + 0 + 0 1.0 + 1 1.0 + 2 2.0 + 3 2.0 + + g.rank() + Out[15]: + 0 + 0 1.0 + 1 2.0 + 2 2.0 + 3 1.0 + +*New behavior*: + +.. ipython:: python + :okexcept: + + g.transform('ers >= Decepticons') + g.transform('rank') + Other API changes ^^^^^^^^^^^^^^^^^ @@ -78,7 +129,7 @@ Performance improvements Bug fixes ~~~~~~~~~ -- Previously, when :class:`pandas.core.groupby.GroupBy.transform` was passed the name of a transformation (e.g. `rank`, `ffill`, etc) it broadcast the results resulting in erroneous values. Only reduction results and not the results of transformations. The return value in these cases is the same as calling the named function directly (e.g. `g.rank()`) (:issue:`14274`) (:issue:`19354`) (:issue:`22509`). +- Categorical ^^^^^^^^^^^ From d4bafefc9cf1bcfbeceb2b912f09cab080398164 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Fri, 26 Jul 2019 13:38:30 -0500 Subject: [PATCH 13/15] checks --- doc/source/whatsnew/v1.0.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 707aab563cee6..abddf6efc85a6 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -57,8 +57,8 @@ etc, should instead call these method directly .. ipython:: python - df = pd.DataFrame([0,1,100,99]) - labels = [0,0,1,1] + df = pd.DataFrame([0, 1, 100, 99]) + labels = [0, 0, 1, 1] g = df.groupby(labels) *Previous behavior*: From 57e2122345120d036b2bd24d07a4274e154cacc3 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Fri, 26 Jul 2019 14:40:12 -0500 Subject: [PATCH 14/15] whatsnew --- doc/source/whatsnew/v1.0.0.rst | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index abddf6efc85a6..c768da7c9f81f 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -39,16 +39,15 @@ Backwards incompatible API changes .. _whatsnew_1000.api.other: -Groupby.transform(name) accepts only aggregation names -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:meth:`DataFrameGroupBy.transform` and :meth:`SeriesGroupBy.transform` are used to either invoked -a callable, or to convert an aggregation into a transformation by replicating the resulting aggregation -value for each across the whole group. In previous releases, these methods did not -validate that the function name passed was actually the name of an aggregation. As a result, -users might get a cryptic error or worse, erroneous results. Starting with this release, passing -either of these method the name of anything except a known aggregation function name will raise -an exception. There is no change in the behavior associated with passing a callable. +Groupby.transform(str) validates name is an aggregation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous releases, :meth:`DataFrameGroupBy.transform` and +:meth:`SeriesGroupBy.transform` did not validate that the function name +passed was actually the name of an aggregation. As a result, users might get a +cryptic error or worse, erroneous results. Starting with this release, these +methods will rised if the name of a non-aggregation is passed to them. There +is no change in the behavior associated with passing a callable. Users who relied on :meth:`DataFrameGroupBy.transform` or :meth:`SeriesGroupBy.transform` for transformations such as :meth:`DataFrameGroupBy.rank`, :meth:`DataFrameGroupBy.ffill`, From f942e5516824eca5a15bdf3b069ac88358ed1ec5 Mon Sep 17 00:00:00 2001 From: pilkibun Date: Fri, 26 Jul 2019 15:08:43 -0500 Subject: [PATCH 15/15] Update transform docstring --- pandas/core/groupby/groupby.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3d4dbd3f8d887..5568aacfd849f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -241,8 +241,9 @@ class providing the base-class of operations. Parameters ---------- -f : function - Function to apply to each group +func : callable or str + Callable to apply to each group OR + name of an aggregation function. Returns ------- @@ -257,6 +258,10 @@ class providing the base-class of operations. Each group is endowed the attribute 'name' in case you need to know which group you are working on. +If `func` is the name of an aggregation, the resulting value for +each group is replicated along the row axis to produce an output +with the same shape as the input. + The current implementation imposes three requirements on f: * f must return a value that either has the same shape as the input