From 84a03470dc15af4eb86eda4e63be82cfa787ecee Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 14 Feb 2021 11:30:48 -0500 Subject: [PATCH 1/9] BUG: Empty result in df.groupby.agg on multiple keys has no columns --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/core/groupby/generic.py | 10 +++++++--- pandas/core/groupby/groupby.py | 4 ++-- pandas/tests/groupby/aggregate/test_aggregate.py | 10 ++++++---- pandas/tests/groupby/test_groupby.py | 13 ++++++++----- 5 files changed, 24 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 799bc88ffff4e..57f8ab2462af1 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -418,6 +418,7 @@ Groupby/resample/rolling - Bug in :meth:`Series.resample` would raise when the index was a :class:`PeriodIndex` consisting of ``NaT`` (:issue:`39227`) - Bug in :meth:`core.window.rolling.RollingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.corr` where the groupby column would return 0 instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:`39591`) - Bug in :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` where 1 would be returned instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:`39591`) +- Bug in :meth:`DataFrameGroupBy.agg` would return no columns when the result had no rows (:issue:`39809`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a7297923f1034..2fe603eea37aa 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -419,7 +419,11 @@ def _wrap_transformed_output( return result def _wrap_applied_output( - self, keys: Index, values: Optional[List[Any]], not_indexed_same: bool = False + self, + data: Series, + keys: Index, + values: Optional[List[Any]], + not_indexed_same: bool = False, ) -> FrameOrSeriesUnion: """ 
Wrap the output of SeriesGroupBy.apply into the expected result. @@ -1192,9 +1196,9 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: return self.obj._constructor(result, columns=result_columns) - def _wrap_applied_output(self, keys, values, not_indexed_same=False): + def _wrap_applied_output(self, data, keys, values, not_indexed_same=False): if len(keys) == 0: - return self.obj._constructor(index=keys) + return self.obj._constructor(index=keys, columns=data.columns) # GH12824 first_not_none = next(com.not_none(*values), None) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5758762c13984..ab72f3933a5fb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -948,7 +948,7 @@ def _python_apply_general( keys, values, mutated = self.grouper.apply(f, data, self.axis) return self._wrap_applied_output( - keys, values, not_indexed_same=mutated or self.mutated + data, keys, values, not_indexed_same=mutated or self.mutated ) def _iterate_slices(self) -> Iterable[Series]: @@ -1025,7 +1025,7 @@ def _wrap_aggregated_output( def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): raise AbstractMethodError(self) - def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): + def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = False): raise AbstractMethodError(self) @final diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 48527de6b2047..ed0b31d91b2fd 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -141,11 +141,13 @@ def test_agg_apply_corner(ts, tsframe): # DataFrame grouped = tsframe.groupby(tsframe["A"] * np.nan) exp_df = DataFrame( - columns=tsframe.columns, dtype=float, index=Index([], dtype=np.float64) + columns=tsframe.columns, + dtype=float, + index=Index([], name="A", 
dtype=np.float64), ) - tm.assert_frame_equal(grouped.sum(), exp_df, check_names=False) - tm.assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) - tm.assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], check_names=False) + tm.assert_frame_equal(grouped.sum(), exp_df) + tm.assert_frame_equal(grouped.agg(np.sum), exp_df) + tm.assert_frame_equal(grouped.apply(np.sum), exp_df.astype(object)) def test_agg_grouping_is_list_tuple(ts): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4dce7e8553be4..77edee938f41c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1716,13 +1716,16 @@ def test_pivot_table_values_key_error(): ) -def test_empty_dataframe_groupby(): - # GH8093 +@pytest.mark.parametrize("keys", [["A"], ["A", "B"]]) +def test_empty_dataframe_groupby(keys): + # GH8093 & GH39809 df = DataFrame(columns=["A", "B", "C"]) - result = df.groupby("A").sum() - expected = DataFrame(columns=["B", "C"], dtype=np.float64) - expected.index.name = "A" + result = df.groupby(keys).sum() + expected = DataFrame(columns=df.columns.difference(keys)) + if len(keys) == 1: + expected = expected.astype(float) + expected.index.name = keys[0] tm.assert_frame_equal(result, expected) From bb30001433d8dc08307001c231b67d6df9bcd40e Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 20 Feb 2021 15:24:37 -0500 Subject: [PATCH 2/9] BUG: Groupby ops on empty objects loses index, columns, dtypes --- doc/source/whatsnew/v1.3.0.rst | 3 +- pandas/core/groupby/generic.py | 13 +++++-- pandas/core/reshape/pivot.py | 10 ++---- .../tests/groupby/aggregate/test_aggregate.py | 2 +- pandas/tests/groupby/test_groupby.py | 36 +++++++++++++++---- .../tests/resample/test_resampler_grouper.py | 13 +++++++ pandas/tests/reshape/test_crosstab.py | 2 +- pandas/tests/reshape/test_pivot.py | 5 +-- 8 files changed, 62 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst 
b/doc/source/whatsnew/v1.3.0.rst index dbcaa7d88e0c1..f0403ce320ada 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -434,7 +434,7 @@ Groupby/resample/rolling - Bug in :meth:`Series.resample` would raise when the index was a :class:`PeriodIndex` consisting of ``NaT`` (:issue:`39227`) - Bug in :meth:`core.window.rolling.RollingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.corr` where the groupby column would return 0 instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:`39591`) - Bug in :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` where 1 would be returned instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:`39591`) -- Bug in :meth:`DataFrameGroupBy.agg` would return no columns when the result had no rows (:issue:`39809`) +- Bug in various Groupby operations on an empty ``Series`` or ``DataFrame`` would lose index, columns, and data types (:issue:`26411`) Reshaping ^^^^^^^^^ @@ -449,6 +449,7 @@ Reshaping - Bug in :meth:`DataFrame.sort_values` not reshaping index correctly after sorting on columns, when ``ignore_index=True`` (:issue:`39464`) - Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`) - Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``datetime64`` and ``timedelta64`` dtypes (:issue:`39574`) +- Bug in :meth:`DataFrame.pivot_table` returning a ``MultiIndex`` for a single value when operating on an empty ``DataFrame`` (:issue:`13483`) Sparse ^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 7ed9184e744bc..40a1f8d1a0439 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -461,6 +461,8 @@ def _wrap_applied_output( Parameters ---------- + data : Series + Input data for groupby operation. 
keys : Index Keys of groups that Series was grouped by. values : Optional[List[Any]] @@ -475,7 +477,10 @@ def _wrap_applied_output( if len(keys) == 0: # GH #6265 return self.obj._constructor( - [], name=self._selection_name, index=keys, dtype=np.float64 + [], + name=self._selection_name, + index=self.grouper.result_index, + dtype=data.dtype, ) assert values is not None @@ -1235,7 +1240,11 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: def _wrap_applied_output(self, data, keys, values, not_indexed_same=False): if len(keys) == 0: - return self.obj._constructor(index=keys, columns=data.columns) + result = self.obj._constructor( + index=self.grouper.result_index, columns=data.columns + ) + result = result.astype(data.dtypes.to_dict()) + return result # GH12824 first_not_none = next(com.not_none(*values), None) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 778e37bc07eb5..0df04f81c8a92 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -202,14 +202,8 @@ def pivot_table( ) # discard the top level - if ( - values_passed - and not values_multi - and not table.empty - and (table.columns.nlevels > 1) - ): - table = table[values[0]] - + if values_passed and not values_multi and table.columns.nlevels > 1: + table = table.droplevel(0, axis=1) if len(index) == 0 and len(columns) > 0: table = table.T diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 67c315f86795d..f2f9cfee178d9 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -153,7 +153,7 @@ def test_agg_apply_corner(ts, tsframe): ) tm.assert_frame_equal(grouped.sum(), exp_df) tm.assert_frame_equal(grouped.agg(np.sum), exp_df) - tm.assert_frame_equal(grouped.apply(np.sum), exp_df.astype(object)) + tm.assert_frame_equal(grouped.apply(np.sum), exp_df) def test_agg_grouping_is_list_tuple(ts): diff --git 
a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 77edee938f41c..ec5118af3214c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1716,18 +1716,40 @@ def test_pivot_table_values_key_error(): ) +@pytest.mark.parametrize("columns", ["C", ["C"]]) @pytest.mark.parametrize("keys", [["A"], ["A", "B"]]) -def test_empty_dataframe_groupby(keys): - # GH8093 & GH39809 +@pytest.mark.parametrize( + "dtypes", + [ + "object", + "int", + "float", + {"A": "object", "B": "int", "C": "float"}, + {"A": "int", "B": "float", "C": "object"}, + ], +) +@pytest.mark.parametrize( + "op, args", + [ + ["sum", ()], + ["agg", ("sum",)], + ["apply", ("sum",)], + ["transform", ("sum",)], + ], +) +def test_empty_dataframe_groupby(columns, keys, dtypes, op, args): + # GH8093 & GH26411 df = DataFrame(columns=["A", "B", "C"]) + df = df.astype(dtypes) - result = df.groupby(keys).sum() - expected = DataFrame(columns=df.columns.difference(keys)) + result = getattr(df.groupby(keys)[columns], op)(*args) + if op == "transform": + expected = df[columns] + else: + expected = df.set_index(keys)[columns] if len(keys) == 1: - expected = expected.astype(float) expected.index.name = keys[0] - - tm.assert_frame_equal(result, expected) + tm.assert_equal(result, expected) def test_tuple_as_grouping(): diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index a17ed44c4011a..50775b9ef3a47 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -10,6 +10,7 @@ from pandas import ( DataFrame, Series, + TimedeltaIndex, Timestamp, ) import pandas._testing as tm @@ -398,6 +399,18 @@ def test_resample_groupby_agg(): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("keys", [["a"], ["a", "b"]]) +def test_empty(keys): + # GH 26411 + df = pd.DataFrame([], columns=["a", "b"], index=TimedeltaIndex([])) + result = 
df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + expected = DataFrame(columns=["a", "b"]).set_index(keys, drop=False) + if len(keys) == 1: + expected.index.name = keys[0] + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("consolidate", [True, False]) def test_resample_groupby_agg_object_dtype_all_nan(consolidate): # https://github.com/pandas-dev/pandas/issues/39329 diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 86cde3eee874d..2570b303d3845 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -240,7 +240,7 @@ def test_crosstab_no_overlap(self): s2 = Series([4, 5, 6], index=[4, 5, 6]) actual = crosstab(s1, s2) - expected = DataFrame() + expected = DataFrame(index=Index([], dtype="int", name="row_0")) tm.assert_frame_equal(actual, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 19eba4305fdf6..8d2b4f2b325c2 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2040,7 +2040,7 @@ def test_pivot_table_aggfunc_scalar_dropna(self, dropna): tm.assert_frame_equal(result, expected) def test_pivot_table_empty_aggfunc(self): - # GH 9186 + # GH 9186 & GH 13483 df = DataFrame( { "A": [2, 2, 3, 3, 2], @@ -2050,7 +2050,8 @@ def test_pivot_table_empty_aggfunc(self): } ) result = df.pivot_table(index="A", columns="D", values="id", aggfunc=np.size) - expected = DataFrame() + expected = DataFrame(index=Index([], dtype="int64", name="A")) + expected.columns.name = "D" tm.assert_frame_equal(result, expected) def test_pivot_table_no_column_raises(self): From 2fc70ff956d91674f1b61bcf2558bc8a7a0b30b8 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 21 Feb 2021 09:06:53 -0500 Subject: [PATCH 3/9] Fixup and simplification for crosstab --- pandas/core/reshape/pivot.py | 9 +-------- pandas/tests/reshape/test_crosstab.py | 5 ++++- 2 files changed, 5 
insertions(+), 9 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 0c30cd299a602..d0026d7acbe65 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -644,7 +644,6 @@ def crosstab( **dict(zip(unique_colnames, columns)), } df = DataFrame(data, index=common_idx) - original_df_cols = df.columns if values is None: df["__dummy__"] = 0 @@ -654,7 +653,7 @@ def crosstab( kwargs = {"aggfunc": aggfunc} table = df.pivot_table( - ["__dummy__"], + "__dummy__", index=unique_rownames, columns=unique_colnames, margins=margins, @@ -663,12 +662,6 @@ def crosstab( **kwargs, ) - # GH18321, after pivoting, an extra top level of column index of `__dummy__` is - # created, and this extra level should not be included in the further steps - if not table.empty: - cols_diff = df.columns.difference(original_df_cols)[0] - table = table[cols_diff] - # Post-process if normalize is not False: table = _normalize( diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 2570b303d3845..1ecb408d49813 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -240,7 +240,10 @@ def test_crosstab_no_overlap(self): s2 = Series([4, 5, 6], index=[4, 5, 6]) actual = crosstab(s1, s2) - expected = DataFrame(index=Index([], dtype="int", name="row_0")) + expected = DataFrame( + index=Index([], dtype="int64", name="row_0"), + columns=Index([], dtype="int64", name="col_0"), + ) tm.assert_frame_equal(actual, expected) From af55c7d59373af31e00867071498158a33528594 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 22 Feb 2021 15:01:15 -0500 Subject: [PATCH 4/9] Added test for groupby ops --- pandas/tests/groupby/test_groupby.py | 43 ++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index ec5118af3214c..5c02a4d35be92 100644 --- 
a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1716,6 +1716,46 @@ def test_pivot_table_values_key_error(): ) +@pytest.mark.parametrize("columns", [["C"]]) +# @pytest.mark.parametrize("columns", ["C", ["C"]]) +@pytest.mark.parametrize("keys", [["A"], ["A", "B"]]) +@pytest.mark.parametrize( + "data", + [ + 3 * ["a"], + 3 * [0], + 3 * [0.0], + ["a", 0, 0.0], + [0, 0.0, "a"], + ], +) +def test_empty_ndframe_groupby(columns, keys, data, groupby_func): + # GH8093 & GH26411 + df = DataFrame([data], columns=["A", "B", "C"]) + + # Get resulting dtype + expected_err = None + try: + expected_dtypes = getattr( + df.groupby(keys)[columns], groupby_func + )().dtypes.to_dict() + except Exception as err: + expected_err = err + + df = df.iloc[:0] + if expected_err is None: + result = getattr(df.groupby(keys)[columns], groupby_func)() + expected = df.set_index(keys).astype(expected_dtypes)[columns] + if len(keys) == 1: + expected.index.name = keys[0] + tm.assert_equal(result, expected) + else: + import re + + with pytest.raises(type(expected_err), match=re.escape(str(expected_err))): + getattr(df.groupby(keys)[columns], groupby_func)() + + @pytest.mark.parametrize("columns", ["C", ["C"]]) @pytest.mark.parametrize("keys", [["A"], ["A", "B"]]) @pytest.mark.parametrize( @@ -1731,13 +1771,12 @@ def test_pivot_table_values_key_error(): @pytest.mark.parametrize( "op, args", [ - ["sum", ()], ["agg", ("sum",)], ["apply", ("sum",)], ["transform", ("sum",)], ], ) -def test_empty_dataframe_groupby(columns, keys, dtypes, op, args): +def test_empty_ndframe_groupby_udf(columns, keys, dtypes, op, args): # GH8093 & GH26411 df = DataFrame(columns=["A", "B", "C"]) df = df.astype(dtypes) From 4ec2eca3668e71adc92da2cb2a216c32cd2342f7 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 23 Feb 2021 10:21:47 -0500 Subject: [PATCH 5/9] whatsnew note, expanded test --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/tests/groupby/test_groupby.py | 87 
++++++++++------------------ 2 files changed, 30 insertions(+), 59 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 25131277c6517..1fb09cb257e71 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -436,7 +436,7 @@ Groupby/resample/rolling - Bug in :meth:`core.window.rolling.RollingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.corr` where the groupby column would return 0 instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:`39591`) - Bug in :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` where 1 would be returned instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:`39591`) - Bug in :meth:`.GroupBy.mean`, :meth:`.GroupBy.median` and :meth:`DataFrame.pivot_table` not propagating metadata (:issue:`28283`) -- Bug in various Groupby operations on an empty ``Series`` or ``DataFrame`` would lose index, columns, and data types (:issue:`26411`) +- Bug in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` on an empty ``Series`` or ``DataFrame`` would lose index, columns, and/or data types when directly using the methods ``idxmax``, ``idxmin``, ``mad``, ``min``, ``max``, ``sum``, ``prod``, and ``skew`` or using them through ``apply``, ``aggregate``, or ``resample`` (:issue:`26411`) - Reshaping diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 5c02a4d35be92..c5c91f8e94629 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -10,6 +10,7 @@ import pandas as pd from pandas import ( + Categorical, DataFrame, Grouper, Index, @@ -18,6 +19,7 @@ Timestamp, date_range, read_csv, + to_datetime, ) import pandas._testing as tm from pandas.core.base import SpecificationError @@ -1716,76 +1718,45 @@ def test_pivot_table_values_key_error(): ) -@pytest.mark.parametrize("columns", 
[["C"]]) -# @pytest.mark.parametrize("columns", ["C", ["C"]]) +@pytest.mark.parametrize("columns", ["C", ["C"]]) @pytest.mark.parametrize("keys", [["A"], ["A", "B"]]) @pytest.mark.parametrize( - "data", + "values", [ - 3 * ["a"], - 3 * [0], - 3 * [0.0], - ["a", 0, 0.0], - [0, 0.0, "a"], + [True], + [0], + [0.0], + ["a"], + [Categorical([0])], + [to_datetime(0)], + [date_range(0, 1, 1, tz="US/Eastern")], + [pd.array([0], dtype="Int64")], ], ) -def test_empty_ndframe_groupby(columns, keys, data, groupby_func): +@pytest.mark.parametrize("method", ["attr", "agg", "apply"]) +@pytest.mark.parametrize( + "op", ["idxmax", "idxmin", "mad", "min", "max", "sum", "prod", "skew"] +) +def test_empty_groupby(columns, keys, values, method, op, request): # GH8093 & GH26411 - df = DataFrame([data], columns=["A", "B", "C"]) - # Get resulting dtype - expected_err = None - try: - expected_dtypes = getattr( - df.groupby(keys)[columns], groupby_func - )().dtypes.to_dict() - except Exception as err: - expected_err = err + if isinstance(values[0], bool) and op in ("prod", "sum") and method != "apply": + request.node.add_marker( + pytest.mark.xfail(reason="wrong dtype from _wrap_series_output") + ) + df = DataFrame([3 * values], columns=list("ABC")) df = df.iloc[:0] - if expected_err is None: - result = getattr(df.groupby(keys)[columns], groupby_func)() - expected = df.set_index(keys).astype(expected_dtypes)[columns] - if len(keys) == 1: - expected.index.name = keys[0] - tm.assert_equal(result, expected) - else: - import re - with pytest.raises(type(expected_err), match=re.escape(str(expected_err))): - getattr(df.groupby(keys)[columns], groupby_func)() - - -@pytest.mark.parametrize("columns", ["C", ["C"]]) -@pytest.mark.parametrize("keys", [["A"], ["A", "B"]]) -@pytest.mark.parametrize( - "dtypes", - [ - "object", - "int", - "float", - {"A": "object", "B": "int", "C": "float"}, - {"A": "int", "B": "float", "C": "object"}, - ], -) -@pytest.mark.parametrize( - "op, args", - [ - ["agg", 
("sum",)], - ["apply", ("sum",)], - ["transform", ("sum",)], - ], -) -def test_empty_ndframe_groupby_udf(columns, keys, dtypes, op, args): - # GH8093 & GH26411 - df = DataFrame(columns=["A", "B", "C"]) - df = df.astype(dtypes) + args = () - result = getattr(df.groupby(keys)[columns], op)(*args) - if op == "transform": - expected = df[columns] + gb = df.groupby(keys)[columns] + if method == "attr": + result = getattr(gb, op)(*args) else: - expected = df.set_index(keys)[columns] + result = getattr(gb, method)(op, *args) + + expected = df.set_index(keys)[columns] if len(keys) == 1: expected.index.name = keys[0] tm.assert_equal(result, expected) From bd51562e955064ec53196acec72f1fcc8ca32d01 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 23 Feb 2021 10:24:38 -0500 Subject: [PATCH 6/9] remove unnecessary args from test --- pandas/tests/groupby/test_groupby.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c5c91f8e94629..a12146df83ecd 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1748,13 +1748,11 @@ def test_empty_groupby(columns, keys, values, method, op, request): df = DataFrame([3 * values], columns=list("ABC")) df = df.iloc[:0] - args = () - gb = df.groupby(keys)[columns] if method == "attr": - result = getattr(gb, op)(*args) + result = getattr(gb, op)() else: - result = getattr(gb, method)(op, *args) + result = getattr(gb, method)(op) expected = df.set_index(keys)[columns] if len(keys) == 1: From c4e1c0d589a8f870fe20bca0bdce68511af89bba Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 23 Feb 2021 10:37:39 -0500 Subject: [PATCH 7/9] Remove xfail --- pandas/tests/groupby/test_groupby.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index a12146df83ecd..fe6c48e3c66c7 100644 --- 
a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1740,10 +1740,10 @@ def test_pivot_table_values_key_error(): def test_empty_groupby(columns, keys, values, method, op, request): # GH8093 & GH26411 + override_dtype = None if isinstance(values[0], bool) and op in ("prod", "sum") and method != "apply": - request.node.add_marker( - pytest.mark.xfail(reason="wrong dtype from _wrap_series_output") - ) + # sum/product of bools is an integer + override_dtype = "int64" df = DataFrame([3 * values], columns=list("ABC")) df = df.iloc[:0] @@ -1755,6 +1755,8 @@ def test_empty_groupby(columns, keys, values, method, op, request): result = getattr(gb, method)(op) expected = df.set_index(keys)[columns] + if override_dtype is not None: + expected = expected.astype(override_dtype) if len(keys) == 1: expected.index.name = keys[0] tm.assert_equal(result, expected) From d00d5bcf894dc235c816ddbd8cda22943f351a3e Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 23 Feb 2021 10:39:13 -0500 Subject: [PATCH 8/9] Remove request argument --- pandas/tests/groupby/test_groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index fe6c48e3c66c7..6731790c89384 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1737,7 +1737,7 @@ def test_pivot_table_values_key_error(): @pytest.mark.parametrize( "op", ["idxmax", "idxmin", "mad", "min", "max", "sum", "prod", "skew"] ) -def test_empty_groupby(columns, keys, values, method, op, request): +def test_empty_groupby(columns, keys, values, method, op): # GH8093 & GH26411 override_dtype = None From 379e12a1505c0c4eea670b58b7ae0b30a8a89f03 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 24 Feb 2021 08:10:19 -0500 Subject: [PATCH 9/9] copy=False --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 40a1f8d1a0439..b06adbd96874f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1243,7 +1243,7 @@ def _wrap_applied_output(self, data, keys, values, not_indexed_same=False): result = self.obj._constructor( index=self.grouper.result_index, columns=data.columns ) - result = result.astype(data.dtypes.to_dict()) + result = result.astype(data.dtypes.to_dict(), copy=False) return result # GH12824