From 1781ee5b28376c1dc54e1191bd307f49220596ae Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 6 May 2021 10:47:17 -0700 Subject: [PATCH 1/7] REF: preserve Index dtype in BlockManager._combine --- pandas/core/internals/array_manager.py | 6 +----- pandas/core/internals/managers.py | 17 +++++++++++------ pandas/tests/generic/test_generic.py | 8 ++++---- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 71e6d14e6a716..dd02771f735a6 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -274,9 +274,6 @@ def apply( else: new_axes = self._axes - if len(result_arrays) == 0: - return self.make_empty(new_axes) - # error: Argument 1 to "ArrayManager" has incompatible type "List[ndarray]"; # expected "List[Union[ndarray, ExtensionArray]]" return type(self)(result_arrays, new_axes) # type: ignore[arg-type] @@ -487,7 +484,7 @@ def _get_data_subset(self: T, predicate: Callable) -> T: indices = [i for i, arr in enumerate(self.arrays) if predicate(arr)] arrays = [self.arrays[i] for i in indices] # TODO copy? - new_axes = [self._axes[0], self._axes[1][np.array(indices, dtype="int64")]] + new_axes = [self._axes[0], self._axes[1][np.array(indices, dtype="intp")]] return type(self)(arrays, new_axes, verify_integrity=False) def get_bool_data(self: T, copy: bool = False) -> T: @@ -696,7 +693,6 @@ def _equal_values(self, other) -> bool: return True # TODO - # equals # to_dict diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 73f463997c085..a5b604868fb5d 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -345,9 +345,6 @@ def apply( if ignore_failures: return self._combine(result_blocks) - if len(result_blocks) == 0: - return self.make_empty(self.axes) - return type(self).from_blocks(result_blocks, self.axes) def where(self: T, other, cond, align: bool, errors: str) -> T: @@ -532,7 +529,15 @@ def _combine( ) -> T: """ return a new manager with the blocks """ if len(blocks) == 0: - return self.make_empty() + if self.ndim == 2: + # retain our own Index dtype + if index is not None: + axes = [self.items[:0], index] + else: + axes = [self.items[:0]] + self.axes[1:] + else: + axes = None + return self.make_empty(axes) # FIXME: optimization potential indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) @@ -1233,7 +1238,7 @@ def grouped_reduce(self: T, func: Callable, ignore_failures: bool = False) -> T: index = Index(range(result_blocks[0].values.shape[-1])) if ignore_failures: - return self._combine(result_blocks, index=index) + return self._combine(result_blocks, copy=False, index=index) return type(self).from_blocks(result_blocks, [self.axes[0], index]) @@ -1270,7 +1275,7 @@ def reduce( new_mgr = self._combine(res_blocks, copy=False, index=index) else: indexer = [] - new_mgr = type(self).from_blocks([], [Index([]), index]) + new_mgr = type(self).from_blocks([], [self.items[:0], index]) else: indexer = np.arange(self.shape[0]) new_mgr = type(self).from_blocks(res_blocks, [self.items, index]) diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index d07f843f4acfc..771d31aa6865b 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -85,7 +85,7 @@ def test_rename(self): # multiple axes at once - def test_get_numeric_data(self, using_array_manager): + def test_get_numeric_data(self): n = 4 kwargs = { @@ -100,9 +100,9 @@ def test_get_numeric_data(self, using_array_manager): # non-inclusion result = o._get_bool_data() expected = self._construct(n, value="empty", **kwargs) - if using_array_manager and isinstance(o, DataFrame): - # INFO(ArrayManager) preserve the dtype of the columns Index - expected.columns = expected.columns.astype("int64") + if isinstance(o, DataFrame): + # preserve columns dtype + expected.columns = o.columns[:0] self._compare(result, expected) # get the bool data From 5dda642d69e79ac6d1310ba5b3eff58be440c44f Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 6 May 2021 11:44:40 -0700 Subject: [PATCH 2/7] mypy fixup --- pandas/core/internals/managers.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index a5b604868fb5d..cdb7d8a6ccd45 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -535,9 +535,8 @@ def _combine( axes = [self.items[:0], index] else: axes = [self.items[:0]] + self.axes[1:] - else: - axes = None - return self.make_empty(axes) + return self.make_empty(axes) + return self.make_empty() # FIXME: optimization potential indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) From 8f4df41ae0dffd4036a0e3ce62ffed9b9615f682 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 5 May 2021 15:13:18 -0700 Subject: [PATCH 3/7] BUG/API: SeriesGroupBy reduction with numeric_only=True --- pandas/core/groupby/generic.py | 9 ++-- pandas/core/groupby/groupby.py | 47 ++++++++++++++++--- pandas/tests/groupby/aggregate/test_cython.py | 18 +++---- pandas/tests/groupby/test_groupby.py | 26 ++++++++-- pandas/tests/resample/test_datetime_index.py | 6 +-- 5 files changed, 81 insertions(+), 25 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 18506b871bda6..c733fd7adbbf0 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -346,7 +346,7 @@ def _aggregate_multiple_funcs(self, arg): return output def _cython_agg_general( - self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 + self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1 ): obj = self._selected_obj @@ -354,7 +354,10 @@ def _cython_agg_general( data = obj._mgr if numeric_only and not is_numeric_dtype(obj.dtype): - raise DataError("No numeric types to aggregate") + # GH#41291 match Series behavior + raise NotImplementedError( + f"{type(self).__name__}.{how} does not implement numeric_only." + ) # This is overkill because it is only called once, but is here to # mirror the array_func used in DataFrameGroupBy._cython_agg_general @@ -1069,7 +1072,7 @@ def _iterate_slices(self) -> Iterable[Series]: yield values def _cython_agg_general( - self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 + self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1 ) -> DataFrame: # Note: we never get here with how="ohlc"; that goes through SeriesGroupBy diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c5ef18c51a533..d5c9e9a02045e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1101,6 +1101,24 @@ def _wrap_transformed_output(self, output: Mapping[base.OutputKey, ArrayLike]): def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = False): raise AbstractMethodError(self) + def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: + """ + For SeriesGroupBy we want the default to be False (to match Series behavior). + For DataFrameGroupBy we want it to be True (for backwards-compat). + """ + # GH#41291 + if numeric_only is lib.no_default: + # i.e. not explicitly passed by user + if self.obj.ndim == 2: + # i.e. DataFrameGroupBy + numeric_only = True + else: + numeric_only = False + + # error: Incompatible return value type (got "Union[bool, NoDefault]", + # expected "bool") + return numeric_only # type: ignore[return-value] + # ----------------------------------------------------------------- # numba @@ -1290,6 +1308,7 @@ def _agg_general( alias: str, npfunc: Callable, ): + with group_selection_context(self): # try a cython aggregation if we can result = None @@ -1357,7 +1376,7 @@ def _agg_py_fallback( return ensure_block_shape(res_values, ndim=ndim) def _cython_agg_general( - self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 + self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1 ): raise AbstractMethodError(self) @@ -1599,7 +1618,7 @@ def count(self): @final @Substitution(name="groupby") @Substitution(see_also=_common_see_also) - def mean(self, numeric_only: bool = True): + def mean(self, numeric_only: bool | lib.NoDefault = lib.no_default): """ Compute mean of groups, excluding missing values. @@ -1647,6 +1666,8 @@ def mean(self, numeric_only: bool = True): 2 4.0 Name: B, dtype: float64 """ + numeric_only = self._resolve_numeric_only(numeric_only) + result = self._cython_agg_general( "mean", alt=lambda x: Series(x).mean(numeric_only=numeric_only), @@ -1657,7 +1678,7 @@ def mean(self, numeric_only: bool = True): @final @Substitution(name="groupby") @Appender(_common_see_also) - def median(self, numeric_only=True): + def median(self, numeric_only: bool | lib.NoDefault = lib.no_default): """ Compute median of groups, excluding missing values. @@ -1674,6 +1695,8 @@ def median(self, numeric_only=True): Series or DataFrame Median of values within each group. """ + numeric_only = self._resolve_numeric_only(numeric_only) + result = self._cython_agg_general( "median", alt=lambda x: Series(x).median(numeric_only=numeric_only), @@ -1731,8 +1754,9 @@ def var(self, ddof: int = 1): Variance of values within each group. """ if ddof == 1: + numeric_only = self._resolve_numeric_only(lib.no_default) return self._cython_agg_general( - "var", alt=lambda x: Series(x).var(ddof=ddof) + "var", alt=lambda x: Series(x).var(ddof=ddof), numeric_only=numeric_only ) else: func = lambda x: x.var(ddof=ddof) @@ -1797,7 +1821,10 @@ def size(self) -> FrameOrSeriesUnion: @final @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0) - def sum(self, numeric_only: bool = True, min_count: int = 0): + def sum( + self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0 + ): + numeric_only = self._resolve_numeric_only(numeric_only) # If we are grouping on categoricals we want unobserved categories to # return zero, rather than the default of NaN which the reindexing in @@ -1814,7 +1841,11 @@ def sum(self, numeric_only: bool = True, min_count: int = 0): @final @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0) - def prod(self, numeric_only: bool = True, min_count: int = 0): + def prod( + self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0 + ): + numeric_only = self._resolve_numeric_only(numeric_only) + return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod ) @@ -2739,7 +2770,7 @@ def _get_cythonized_result( how: str, cython_dtype: np.dtype, aggregate: bool = False, - numeric_only: bool = True, + numeric_only: bool | lib.NoDefault = lib.no_default, needs_counts: bool = False, needs_values: bool = False, needs_2d: bool = False, @@ -2807,6 +2838,8 @@ def _get_cythonized_result( ------- `Series` or `DataFrame` with filled values """ + numeric_only = self._resolve_numeric_only(numeric_only) + if result_is_index and aggregate: raise ValueError("'result_is_index' and 'aggregate' cannot both be True!") if post_processing and not callable(post_processing): diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index ded10ab11d5a8..8777615d4519b 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -91,7 +91,10 @@ def test_cython_agg_nothing_to_agg(): frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) msg = "No numeric types to aggregate" - with pytest.raises(DataError, match=msg): + with pytest.raises(NotImplementedError, match="does not implement"): + frame.groupby("a")["b"].mean(numeric_only=True) + + with pytest.raises(TypeError, match="Could not convert (foo|bar)*"): frame.groupby("a")["b"].mean() frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) @@ -107,9 +110,8 @@ def test_cython_agg_nothing_to_agg_with_dates(): "dates": pd.date_range("now", periods=50, freq="T"), } ) - msg = "No numeric types to aggregate" - with pytest.raises(DataError, match=msg): - frame.groupby("b").dates.mean() + with pytest.raises(NotImplementedError, match="does not implement"): + frame.groupby("b").dates.mean(numeric_only=True) def test_cython_agg_frame_columns(): @@ -170,7 +172,7 @@ def test__cython_agg_general(op, targop): df = DataFrame(np.random.randn(1000)) labels = np.random.randint(0, 50, size=1000).astype(float) - result = df.groupby(labels)._cython_agg_general(op) + result = df.groupby(labels)._cython_agg_general(op, alt=None, numeric_only=True) expected = df.groupby(labels).agg(targop) tm.assert_frame_equal(result, expected) @@ -192,7 +194,7 @@ def test_cython_agg_empty_buckets(op, targop, observed): # calling _cython_agg_general directly, instead of via the user API # which sets different values for min_count, so do that here. g = df.groupby(pd.cut(df[0], grps), observed=observed) - result = g._cython_agg_general(op) + result = g._cython_agg_general(op, alt=None, numeric_only=True) g = df.groupby(pd.cut(df[0], grps), observed=observed) expected = g.agg(lambda x: targop(x)) @@ -209,7 +211,7 @@ def test_cython_agg_empty_buckets_nanops(observed): grps = range(0, 25, 5) # add / sum result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( - "add" + "add", alt=None, numeric_only=True ) intervals = pd.interval_range(0, 20, freq=5) expected = DataFrame( @@ -223,7 +225,7 @@ def test_cython_agg_empty_buckets_nanops(observed): # prod result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general( - "prod" + "prod", alt=None, numeric_only=True ) expected = DataFrame( {"a": [1, 1, 1716, 1]}, diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index f716a3a44cd54..9dcf56db3d0b4 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1803,11 +1803,29 @@ def test_empty_groupby(columns, keys, values, method, op, request): df = df.iloc[:0] gb = df.groupby(keys)[columns] - if method == "attr": - result = getattr(gb, op)() - else: - result = getattr(gb, method)(op) + def get_result(): + if method == "attr": + return getattr(gb, op)() + else: + return getattr(gb, method)(op) + + if columns == "C": + # i.e. SeriesGroupBy + if op in ["prod", "sum"]: + # ops that require more than just ordered-ness + if method != "apply": + # FIXME: apply goes through different code path + if df.dtypes[0].kind == "M": + # GH#41291 + # datetime64 -> prod and sum are invalid + msg = "datetime64 type does not support" + with pytest.raises(TypeError, match=msg): + get_result() + + return + + result = get_result() expected = df.set_index(keys)[columns] if override_dtype is not None: expected = expected.astype(override_dtype) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 66cb2f2291e98..3bc7b539a3e07 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -61,7 +61,7 @@ def test_custom_grouper(index): g.ohlc() # doesn't use _cython_agg_general funcs = ["add", "mean", "prod", "min", "max", "var"] for f in funcs: - g._cython_agg_general(f) + g._cython_agg_general(f, alt=None, numeric_only=True) b = Grouper(freq=Minute(5), closed="right", label="right") g = s.groupby(b) @@ -69,7 +69,7 @@ def test_custom_grouper(index): g.ohlc() # doesn't use _cython_agg_general funcs = ["add", "mean", "prod", "min", "max", "var"] for f in funcs: - g._cython_agg_general(f) + g._cython_agg_general(f, alt=None, numeric_only=True) assert g.ngroups == 2593 assert notna(g.mean()).all() @@ -417,7 +417,7 @@ def test_resample_frame_basic(): # check all cython functions work funcs = ["add", "mean", "prod", "min", "max", "var"] for f in funcs: - g._cython_agg_general(f) + g._cython_agg_general(f, alt=None, numeric_only=True) result = df.resample("A").mean() tm.assert_series_equal(result["A"], df["A"].resample("A").mean()) From 467ab63ab7c9196d4fa14fcf0e91e3b4700bdcf9 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 6 May 2021 08:11:30 -0700 Subject: [PATCH 4/7] trim xfails --- pandas/tests/groupby/test_groupby.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9dcf56db3d0b4..7b526fe4c6605 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1739,7 +1739,15 @@ def test_pivot_table_values_key_error(): def test_empty_groupby(columns, keys, values, method, op, request): # GH8093 & GH26411 - if isinstance(values, Categorical) and len(keys) == 1 and method == "apply": + if ( + isinstance(values, Categorical) + and not isinstance(columns, list) + and op in ["sum", "prod"] + and method != "apply" + ): + # handled below GH#41291 + pass + elif isinstance(values, Categorical) and len(keys) == 1 and method == "apply": mark = pytest.mark.xfail(raises=TypeError, match="'str' object is not callable") request.node.add_marker(mark) elif ( @@ -1824,6 +1832,13 @@ def get_result(): get_result() return + elif isinstance(values, Categorical): + # GH#41291 + msg = "category type does not support" + with pytest.raises(TypeError, match=msg): + get_result() + + return result = get_result() expected = df.set_index(keys)[columns] From 2357da03bb1617a14696dd03f714208c35551571 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 28 May 2021 09:09:35 -0700 Subject: [PATCH 5/7] BUG: DataFrameGroupBy with numeric_only and empty non-numeric data --- pandas/_libs/groupby.pyx | 2 +- pandas/core/groupby/generic.py | 22 ++------ pandas/core/groupby/groupby.py | 20 +++---- .../tests/groupby/aggregate/test_aggregate.py | 5 +- pandas/tests/groupby/aggregate/test_cython.py | 7 ++- pandas/tests/groupby/aggregate/test_other.py | 16 +++++- pandas/tests/groupby/test_groupby.py | 52 ++++++++++++++++++- 7 files changed, 81 insertions(+), 43 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 7a286188c4e74..b72b927b3c2a8 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -516,7 +516,7 @@ def group_add(add_t[:, ::1] out, val = values[i, j] # not nan - if val == val: + if not checknull(val): nobs[lab, j] += 1 if nobs[lab, j] == 1: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b51fb2234e148..ec2d8b313599d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -67,10 +67,7 @@ validate_func_kwargs, ) from pandas.core.apply import GroupByApply -from pandas.core.base import ( - DataError, - SpecificationError, -) +from pandas.core.base import SpecificationError import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame @@ -516,16 +513,9 @@ def _cython_transform( obj = self._selected_obj - is_numeric = is_numeric_dtype(obj.dtype) - if numeric_only and not is_numeric: - raise DataError("No numeric types to aggregate") - - try: - result = self.grouper._cython_operation( - "transform", obj._values, how, axis, **kwargs - ) - except (NotImplementedError, TypeError): - raise DataError("No numeric types to aggregate") + result = self.grouper._cython_operation( + "transform", obj._values, how, axis, **kwargs + ) return obj._constructor(result, index=self.obj.index, name=obj.name) @@ -1064,7 +1054,6 @@ def _cython_agg_general( # Note: we never get here with how="ohlc"; that goes through SeriesGroupBy data: Manager2D = self._get_data_to_aggregate() - orig = data if numeric_only: data = data.get_numeric_data(copy=False) @@ -1087,9 +1076,6 @@ def array_func(values: ArrayLike) -> ArrayLike: # continue and exclude the block new_mgr = data.grouped_reduce(array_func, ignore_failures=True) - if not len(new_mgr) and len(orig): - # If the original Manager was already empty, no need to raise - raise DataError("No numeric types to aggregate") if len(new_mgr) < len(data): warnings.warn( f"Dropping invalid columns in {type(self).__name__}.{how} " diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b00a1160fb01b..6deb5bb1a76f0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1339,20 +1339,12 @@ def _agg_general( with group_selection_context(self): # try a cython aggregation if we can - result = None - try: - result = self._cython_agg_general( - how=alias, - alt=npfunc, - numeric_only=numeric_only, - min_count=min_count, - ) - except DataError: - pass - - # apply a non-cython aggregation - if result is None: - result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) + result = self._cython_agg_general( + how=alias, + alt=npfunc, + numeric_only=numeric_only, + min_count=min_count, + ) return result.__finalize__(self.obj, method="groupby") def _agg_py_fallback( diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index eb82e03aea82f..851dd7311183f 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -128,8 +128,9 @@ def test_groupby_aggregation_multi_level_column(): columns=MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 0), ("B", 1)]), ) - result = df.groupby(level=1, axis=1).sum() - expected = DataFrame({0: [2.0, 1, 1, 1], 1: [1, 0, 1, 1]}) + gb = df.groupby(level=1, axis=1) + result = gb.sum(numeric_only=False) + expected = DataFrame({0: [2.0, True, True, True], 1: [1, 0, 1, 1]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index cf1177d231e37..a035c5500e2dc 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -18,7 +18,6 @@ bdate_range, ) import pandas._testing as tm -from pandas.core.groupby.groupby import DataError @pytest.mark.parametrize( @@ -98,9 +97,9 @@ def test_cython_agg_nothing_to_agg(): frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25}) - msg = "No numeric types to aggregate" - with pytest.raises(DataError, match=msg): - frame[["b"]].groupby(frame["a"]).mean() + result = frame[["b"]].groupby(frame["a"]).mean() + expected = DataFrame([], index=frame["a"].sort_values().drop_duplicates()) + tm.assert_frame_equal(result, expected) def test_cython_agg_nothing_to_agg_with_dates(): diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 4d30543355d47..79990deed261d 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -433,15 +433,22 @@ def test_agg_over_numpy_arrays(): ], columns=["category", "arraydata"], ) - result = df.groupby("category").agg(sum) + gb = df.groupby("category") expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]] expected_index = Index([1, 2], name="category") expected_column = ["arraydata"] expected = DataFrame(expected_data, index=expected_index, columns=expected_column) + alt = gb.sum(numeric_only=False) + tm.assert_frame_equal(alt, expected) + + result = gb.agg("sum", numeric_only=False) tm.assert_frame_equal(result, expected) + # FIXME: the original version of this test called `gb.agg(sum)` + # and that raises TypeError if `numeric_only=False` is passed + @pytest.mark.parametrize("as_period", [True, False]) def test_agg_tzaware_non_datetime_result(as_period): @@ -524,9 +531,14 @@ def test_sum_uint64_overflow(): ) expected.index.name = 0 - result = df.groupby(0).sum() + result = df.groupby(0).sum(numeric_only=False) tm.assert_frame_equal(result, expected) + # out column is non-numeric, so with numeric_only=True it is dropped + result2 = df.groupby(0).sum(numeric_only=True) + expected2 = expected[[]] + tm.assert_frame_equal(result2, expected2) + @pytest.mark.parametrize( "structure, expected", diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 70bdfe92602b2..58a618bda0fb8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -638,7 +638,7 @@ def test_as_index_select_column(): def test_groupby_as_index_select_column_sum_empty_df(): # GH 35246 df = DataFrame(columns=["A", "B", "C"]) - left = df.groupby(by="A", as_index=False)["B"].sum() + left = df.groupby(by="A", as_index=False)["B"].sum(numeric_only=False) assert type(left) is DataFrame assert left.to_dict() == {"A": {}, "B": {}} @@ -1861,6 +1861,49 @@ def get_result(): get_result() return + else: + # ie. DataFrameGroupBy + if op in ["prod", "sum"]: + # ops that require more than just ordered-ness + if method != "apply": + # FIXME: apply goes through different code path + if df.dtypes[0].kind == "M": + # GH#41291 + # datetime64 -> prod and sum are invalid + result = get_result() + + # with numeric_only=True, these are dropped, and we get + # an empty DataFrame back + expected = df.set_index(keys)[[]] + tm.assert_equal(result, expected) + return + + elif isinstance(values, Categorical): + # GH#41291 + # Categorical doesn't implement sum or prod + result = get_result() + + # with numeric_only=True, these are dropped, and we get + # an empty DataFrame back + expected = df.set_index(keys)[[]] + if len(keys) != 1 and op == "prod": + # TODO: why just prod and not sum? + # Categorical is special without 'observed=True' + lev = Categorical([0], dtype=values.dtype) + mi = MultiIndex.from_product([lev, lev], names=["A", "B"]) + expected = DataFrame([], columns=[], index=mi) + + tm.assert_equal(result, expected) + return + + elif df.dtypes[0] == object: + # FIXME: the test is actually wrong here, xref #41341 + result = get_result() + # In this case we have list-of-list, will raise TypeError, + # and subsequently be dropped as nuisance columns + expected = df.set_index(keys)[[]] + tm.assert_equal(result, expected) + return result = get_result() expected = df.set_index(keys)[columns] @@ -2313,12 +2356,17 @@ def test_groupby_all_nan_groups_drop(): def test_groupby_empty_multi_column(): # GH 15106 - result = DataFrame(data=[], columns=["A", "B", "C"]).groupby(["A", "B"]).sum() + df = DataFrame(data=[], columns=["A", "B", "C"]) + gb = df.groupby(["A", "B"]) + result = gb.sum(numeric_only=False) expected = DataFrame( [], columns=["C"], index=MultiIndex([[], []], [[], []], names=["A", "B"]) ) tm.assert_frame_equal(result, expected) + result = gb.sum(numeric_only=True) + tm.assert_frame_equal(result, expected[[]]) + def test_groupby_filtered_df_std(): # GH 16174 From 0d26f7dfb406593aa689988a810c83fa9f7f3c11 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 28 May 2021 09:14:39 -0700 Subject: [PATCH 6/7] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index acf5202a28409..c2d07d91723ec 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -1053,6 +1053,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.transform` with non-unique columns incorrectly raising ``AttributeError`` (:issue:`41427`) - Bug in :meth:`Resampler.apply` with non-unique columns incorrectly dropping duplicated columns (:issue:`41445`) - Bug in :meth:`SeriesGroupBy` aggregations incorrectly returning empty :class:`Series` instead of raising ``TypeError`` on aggregations that are invalid for its dtype, e.g. ``.prod`` with ``datetime64[ns]`` dtype (:issue:`41342`) +- Bug in :class:`DataFrameGroupBy` aggregations incorrectly failing to drop columns with invalid dtypes for that aggregation when there are no valid columns (:issue:`41291`) - Bug in :meth:`DataFrame.rolling.__iter__` where ``on`` was not assigned to the index of the resulting objects (:issue:`40373`) - Bug in :meth:`DataFrameGroupBy.transform` and :meth:`DataFrameGroupBy.agg` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`41647`) From 8b7efb86bb6c5280a8a381f287937c065f6f5c8c Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 28 May 2021 15:58:06 -0700 Subject: [PATCH 7/7] update test --- pandas/core/groupby/generic.py | 9 ++++++--- .../tests/groupby/transform/test_transform.py | 19 ++++++++++++++----- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ec2d8b313599d..69f992f840c7c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -513,9 +513,12 @@ def _cython_transform( obj = self._selected_obj - result = self.grouper._cython_operation( - "transform", obj._values, how, axis, **kwargs - ) + try: + result = self.grouper._cython_operation( + "transform", obj._values, how, axis, **kwargs + ) + except NotImplementedError as err: + raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err return obj._constructor(result, index=self.obj.index, name=obj.name) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index b5092f83e1a9f..7682b2e7ace59 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -24,7 +24,6 @@ DataFrameGroupBy, SeriesGroupBy, ) -from pandas.core.groupby.groupby import DataError def assert_fp_equal(a, b): @@ -741,11 +740,21 @@ def test_cython_transform_frame(op, args, targop): tm.assert_frame_equal(expected, getattr(gb, op)(*args).sort_index(axis=1)) # individual columns for c in df: - if c not in ["float", "int", "float_missing"] and op != "shift": - msg = "No numeric types to aggregate" - with pytest.raises(DataError, match=msg): + if ( + c not in ["float", "int", "float_missing"] + and op != "shift" + and not (c == "timedelta" and op == "cumsum") + ): + msg = "|".join( + [ + "does not support .* operations", + ".* is not supported for object dtype", + "is not implemented for this dtype", + ] + ) + with pytest.raises(TypeError, match=msg): gb[c].transform(op) - with pytest.raises(DataError, match=msg): + with pytest.raises(TypeError, match=msg): getattr(gb[c], op)() else: expected = gb[c].apply(targop)