From 384930275b4e7a6e8aa5e0f1b729bbe746fb1c04 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 28 Feb 2021 11:28:35 -0500 Subject: [PATCH] POC: aggregate always aggregates --- pandas/core/apply.py | 9 ++++----- pandas/core/generic.py | 3 +-- pandas/core/groupby/generic.py | 2 +- pandas/core/groupby/groupby.py | 2 +- pandas/core/groupby/ops.py | 11 +---------- pandas/tests/apply/test_series_apply.py | 19 +++++++++++++++---- .../tests/groupby/aggregate/test_aggregate.py | 13 +++++-------- pandas/tests/groupby/aggregate/test_other.py | 3 +-- pandas/tests/groupby/test_groupby.py | 10 ++++------ pandas/tests/test_multilevel.py | 2 +- 10 files changed, 34 insertions(+), 40 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index c159abe55b38c..5112d8fd41c3d 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -674,7 +674,9 @@ def agg(self): result = result.T if result is not None else result if result is None: - result = self.obj.apply(self.orig_f, axis, args=self.args, **self.kwargs) + results, res_index = self.apply_series_generator() + result = self.obj._constructor_sliced(results) + result.index = res_index return result @@ -1018,10 +1020,7 @@ def agg(self): # we cannot FIRST try the vectorized evaluation, because # then .agg and .apply would have different semantics if the # operation is actually defined on the Series, e.g. str - try: - result = self.obj.apply(f, *args, **kwargs) - except (ValueError, AttributeError, TypeError): - result = f(self.obj, *args, **kwargs) + result = f(self.obj, *args, **kwargs) return result diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 140f456926763..8cd7afbca256b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10285,9 +10285,8 @@ def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): grouped = self.groupby(level=level, axis=axis, sort=False) if hasattr(grouped, name) and skipna: return getattr(grouped, name)(**kwargs) - axis = self._get_axis_number(axis) method = getattr(type(self), name) - applyf = lambda x: method(x, axis=axis, skipna=skipna, **kwargs) + applyf = lambda x: method(x, skipna=skipna, **kwargs) return grouped.aggregate(applyf) @final diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2de5e81360a93..d030396f7e2f9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1010,7 +1010,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) if result is None: # grouper specific aggregations - if self.grouper.nkeys > 1: + if not self._obj_with_exclusions.empty or self.grouper.nkeys > 1: return self._python_agg_general(func, *args, **kwargs) elif args or kwargs: result = self._aggregate_frame(func, *args, **kwargs) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e11c296783476..a57bb86e2ff6d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1092,7 +1092,7 @@ def _agg_general( # apply a non-cython aggregation if result is None: - result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) + result = self.aggregate(lambda x: npfunc(x)) return result.__finalize__(self.obj, method="groupby") def _cython_agg_general( diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index a61e8872a7ce7..c691b55062101 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -760,7 +760,6 @@ def _aggregate_series_pure_python(self, obj: Series, func: F): counts = np.zeros(ngroups, dtype=int) result = np.empty(ngroups, dtype="O") - initialized = False splitter = get_splitter(obj, group_index, ngroups, axis=0) @@ -768,16 +767,8 @@ def _aggregate_series_pure_python(self, obj: Series, func: F): # Each step of this loop corresponds to # libreduction._BaseGrouper._apply_to_group - res = func(group) - res = libreduction.extract_result(res) - - if not initialized: - # We only do this validation on the first iteration - libreduction.check_result_array(res, 0) - initialized = True - counts[label] = group.shape[0] - result[label] = res + result[label] = func(group) result = lib.maybe_convert_objects(result, try_float=False) result = maybe_cast_result(result, obj, numeric_only=True) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index dcb5de29da320..e0323306d996b 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -298,17 +298,27 @@ def test_demo(): tm.assert_series_equal(result, expected) -def test_agg_apply_evaluate_lambdas_the_same(string_series): +def test_agg_apply_evaluate_lambdas(string_series): # test that we are evaluating row-by-row first # before vectorized evaluation + expected = string_series.astype(str) + result = string_series.apply(lambda x: str(x)) - expected = string_series.agg(lambda x: str(x)) tm.assert_series_equal(result, expected) result = string_series.apply(str) - expected = string_series.agg(str) tm.assert_series_equal(result, expected) + # GH 35725 + # Agg always aggs - applies the function to the entire Series + expected = str(string_series) + + result = string_series.agg(lambda x: str(x)) + assert result == expected + + result = string_series.agg(str) + assert result == expected + def test_with_nested_series(datetime_series): # GH 2316 @@ -318,7 +328,8 @@ def test_with_nested_series(datetime_series): tm.assert_frame_equal(result, expected) result = datetime_series.agg(lambda x: Series([x, x ** 2], index=["x", "x^2"])) - tm.assert_frame_equal(result, expected) + expected = Series([datetime_series, datetime_series ** 2], index=["x", "x^2"]) + tm.assert_series_equal(result, expected) def test_replicate_describe(string_series): diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index b7df1c8382daa..dfcfd4ff813d0 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -47,12 +47,10 @@ def test_agg_regression1(tsframe): def test_agg_must_agg(df): grouped = df.groupby("A")["C"] - - msg = "Must produce aggregated value" - with pytest.raises(Exception, match=msg): - grouped.agg(lambda x: x.describe()) - with pytest.raises(Exception, match=msg): - grouped.agg(lambda x: x.index[:2]) + result = grouped.agg(lambda x: x.describe()) + expected = Series({name: group.describe() for name, group in grouped}, name="C") + expected.index.name = "A" + tm.assert_series_equal(result, expected) def test_agg_ser_multi_key(df): @@ -127,9 +125,8 @@ def test_groupby_aggregation_multi_level_column(): data=lst, columns=MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 0), ("B", 1)]), ) - result = df.groupby(level=1, axis=1).sum() - expected = DataFrame({0: [2.0, 1, 1, 1], 1: [1, 0, 1, 1]}) + expected = DataFrame({0: [2, 1, 1, 1], 1: [1, 0, 1, 1]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 681192881c301..d38ae10be54c7 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -605,9 +605,8 @@ def test_agg_lambda_with_timezone(): ) result = df.groupby("tag").agg({"date": lambda e: e.head(1)}) expected = DataFrame( - [pd.Timestamp("2018-01-01", tz="UTC")], + {"date": [df["date"].iloc[:1]]}, index=Index([1], name="tag"), - columns=["date"], ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index afde1daca74c1..179170126e455 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -75,11 +75,9 @@ def test_basic(dtype): agged = grouped.agg(lambda x: group_constants[x.name] + x.mean()) assert agged[1] == 21 - # corner cases - msg = "Must produce aggregated value" - # exception raised is type Exception - with pytest.raises(Exception, match=msg): - grouped.aggregate(lambda x: x * 2) + result = grouped.aggregate(lambda x: x * 2) + expected = Series({name: group * 2 for name, group in grouped}) + tm.assert_series_equal(result, expected) def test_groupby_nonobject_dtype(mframe, df_mixed_floats): @@ -1026,7 +1024,7 @@ def test_groupby_with_hier_columns(): result = df.groupby(level=0).apply(lambda x: x.mean()) tm.assert_index_equal(result.columns, columns) - result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1)) + result = df.groupby(level=0, axis=1).agg(lambda x: x.mean()) tm.assert_index_equal(result.columns, Index(["A", "B"])) tm.assert_index_equal(result.index, df.index) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 8e6a636a8f602..b6d006439cfb6 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -214,7 +214,7 @@ def test_frame_group_ops( def aggf(x): pieces.append(x) - return getattr(x, op)(skipna=skipna, axis=axis) + return getattr(x, op)(skipna=skipna) leftside = grouped.agg(aggf) rightside = getattr(frame, op)(level=level, axis=axis, skipna=skipna)