diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index c65502898195a..4378a164a2edd 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -439,6 +439,7 @@ Groupby/resample/rolling - Bug in :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` where 1 would be returned instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:`39591`) - Bug in :meth:`.GroupBy.mean`, :meth:`.GroupBy.median` and :meth:`DataFrame.pivot_table` not propagating metadata (:issue:`28283`) - Bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not calculating window bounds correctly when window is an offset and dates are in descending order (:issue:`40002`) +- Bug in :class:`SeriesGroupBy` and :class:`DataFrameGroupBy` on an empty ``Series`` or ``DataFrame`` would lose index, columns, and/or data types when directly using the methods ``idxmax``, ``idxmin``, ``mad``, ``min``, ``max``, ``sum``, ``prod``, and ``skew`` or using them through ``apply``, ``aggregate``, or ``resample`` (:issue:`26411`) - Reshaping @@ -454,6 +455,7 @@ Reshaping - Bug in :meth:`DataFrame.sort_values` not reshaping index correctly after sorting on columns, when ``ignore_index=True`` (:issue:`39464`) - Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`) - Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``datetime64`` and ``timedelta64`` dtypes (:issue:`39574`) +- Bug in :meth:`DataFrame.pivot_table` returning a ``MultiIndex`` for a single value when operating on and empty ``DataFrame`` (:issue:`13483`) Sparse ^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c1a277925de2a..b06adbd96874f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -450,13 +450,19 @@ def _wrap_transformed_output( return result def _wrap_applied_output( - self, keys: Index, values: Optional[List[Any]], not_indexed_same: bool = False + self, + data: Series, + keys: Index, + values: Optional[List[Any]], + not_indexed_same: bool = False, ) -> FrameOrSeriesUnion: """ Wrap the output of SeriesGroupBy.apply into the expected result. Parameters ---------- + data : Series + Input data for groupby operation. keys : Index Keys of groups that Series was grouped by. values : Optional[List[Any]] @@ -471,7 +477,10 @@ def _wrap_applied_output( if len(keys) == 0: # GH #6265 return self.obj._constructor( - [], name=self._selection_name, index=keys, dtype=np.float64 + [], + name=self._selection_name, + index=self.grouper.result_index, + dtype=data.dtype, ) assert values is not None @@ -1229,9 +1238,13 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: return self.obj._constructor(result, columns=result_columns) - def _wrap_applied_output(self, keys, values, not_indexed_same=False): + def _wrap_applied_output(self, data, keys, values, not_indexed_same=False): if len(keys) == 0: - return self.obj._constructor(index=keys) + result = self.obj._constructor( + index=self.grouper.result_index, columns=data.columns + ) + result = result.astype(data.dtypes.to_dict(), copy=False) + return result # GH12824 first_not_none = next(com.not_none(*values), None) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e939c184d501a..b9310794d7caa 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -981,7 +981,7 @@ def _python_apply_general( keys, values, mutated = self.grouper.apply(f, data, self.axis) return self._wrap_applied_output( - keys, values, not_indexed_same=mutated or self.mutated + data, keys, values, not_indexed_same=mutated or self.mutated ) def _iterate_slices(self) -> Iterable[Series]: @@ -1058,7 +1058,7 @@ def _wrap_aggregated_output( def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): raise AbstractMethodError(self) - def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): + def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = False): raise AbstractMethodError(self) @final diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 8feb379a82ada..d0026d7acbe65 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -236,14 +236,8 @@ def __internal_pivot_table( ) # discard the top level - if ( - values_passed - and not values_multi - and not table.empty - and (table.columns.nlevels > 1) - ): - table = table[values[0]] - + if values_passed and not values_multi and table.columns.nlevels > 1: + table = table.droplevel(0, axis=1) if len(index) == 0 and len(columns) > 0: table = table.T @@ -650,7 +644,6 @@ def crosstab( **dict(zip(unique_colnames, columns)), } df = DataFrame(data, index=common_idx) - original_df_cols = df.columns if values is None: df["__dummy__"] = 0 @@ -660,7 +653,7 @@ def crosstab( kwargs = {"aggfunc": aggfunc} table = df.pivot_table( - ["__dummy__"], + "__dummy__", index=unique_rownames, columns=unique_colnames, margins=margins, @@ -669,12 +662,6 @@ def crosstab( **kwargs, ) - # GH18321, after pivoting, an extra top level of column index of `__dummy__` is - # created, and this extra level should not be included in the further steps - if not table.empty: - cols_diff = df.columns.difference(original_df_cols)[0] - table = table[cols_diff] - # Post-process if normalize is not False: table = _normalize( diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 92b7aefa6dd8c..f2f9cfee178d9 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -147,11 +147,13 @@ def test_agg_apply_corner(ts, tsframe): # DataFrame grouped = tsframe.groupby(tsframe["A"] * np.nan) exp_df = DataFrame( - columns=tsframe.columns, dtype=float, index=Index([], dtype=np.float64) + columns=tsframe.columns, + dtype=float, + index=Index([], name="A", dtype=np.float64), ) - tm.assert_frame_equal(grouped.sum(), exp_df, check_names=False) - tm.assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) - tm.assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], check_names=False) + tm.assert_frame_equal(grouped.sum(), exp_df) + tm.assert_frame_equal(grouped.agg(np.sum), exp_df) + tm.assert_frame_equal(grouped.apply(np.sum), exp_df) def test_agg_grouping_is_list_tuple(ts): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4dce7e8553be4..6731790c89384 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -10,6 +10,7 @@ import pandas as pd from pandas import ( + Categorical, DataFrame, Grouper, Index, @@ -18,6 +19,7 @@ Timestamp, date_range, read_csv, + to_datetime, ) import pandas._testing as tm from pandas.core.base import SpecificationError @@ -1716,15 +1718,48 @@ def test_pivot_table_values_key_error(): ) -def test_empty_dataframe_groupby(): - # GH8093 - df = DataFrame(columns=["A", "B", "C"]) - - result = df.groupby("A").sum() - expected = DataFrame(columns=["B", "C"], dtype=np.float64) - expected.index.name = "A" - - tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("columns", ["C", ["C"]]) +@pytest.mark.parametrize("keys", [["A"], ["A", "B"]]) +@pytest.mark.parametrize( + "values", + [ + [True], + [0], + [0.0], + ["a"], + [Categorical([0])], + [to_datetime(0)], + [date_range(0, 1, 1, tz="US/Eastern")], + [pd.array([0], dtype="Int64")], + ], +) +@pytest.mark.parametrize("method", ["attr", "agg", "apply"]) +@pytest.mark.parametrize( + "op", ["idxmax", "idxmin", "mad", "min", "max", "sum", "prod", "skew"] +) +def test_empty_groupby(columns, keys, values, method, op): + # GH8093 & GH26411 + + override_dtype = None + if isinstance(values[0], bool) and op in ("prod", "sum") and method != "apply": + # sum/product of bools is an integer + override_dtype = "int64" + + df = DataFrame([3 * values], columns=list("ABC")) + df = df.iloc[:0] + + gb = df.groupby(keys)[columns] + if method == "attr": + result = getattr(gb, op)() + else: + result = getattr(gb, method)(op) + + expected = df.set_index(keys)[columns] + if override_dtype is not None: + expected = expected.astype(override_dtype) + if len(keys) == 1: + expected.index.name = keys[0] + tm.assert_equal(result, expected) def test_tuple_as_grouping(): diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index a17ed44c4011a..50775b9ef3a47 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -10,6 +10,7 @@ from pandas import ( DataFrame, Series, + TimedeltaIndex, Timestamp, ) import pandas._testing as tm @@ -398,6 +399,18 @@ def test_resample_groupby_agg(): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("keys", [["a"], ["a", "b"]]) +def test_empty(keys): + # GH 26411 + df = pd.DataFrame([], columns=["a", "b"], index=TimedeltaIndex([])) + result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean() + expected = DataFrame(columns=["a", "b"]).set_index(keys, drop=False) + if len(keys) == 1: + expected.index.name = keys[0] + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("consolidate", [True, False]) def test_resample_groupby_agg_object_dtype_all_nan(consolidate): # https://github.com/pandas-dev/pandas/issues/39329 diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 86cde3eee874d..1ecb408d49813 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -240,7 +240,10 @@ def test_crosstab_no_overlap(self): s2 = Series([4, 5, 6], index=[4, 5, 6]) actual = crosstab(s1, s2) - expected = DataFrame() + expected = DataFrame( + index=Index([], dtype="int64", name="row_0"), + columns=Index([], dtype="int64", name="col_0"), + ) tm.assert_frame_equal(actual, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 19eba4305fdf6..8d2b4f2b325c2 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2040,7 +2040,7 @@ def test_pivot_table_aggfunc_scalar_dropna(self, dropna): tm.assert_frame_equal(result, expected) def test_pivot_table_empty_aggfunc(self): - # GH 9186 + # GH 9186 & GH 13483 df = DataFrame( { "A": [2, 2, 3, 3, 2], @@ -2050,7 +2050,8 @@ def test_pivot_table_empty_aggfunc(self): } ) result = df.pivot_table(index="A", columns="D", values="id", aggfunc=np.size) - expected = DataFrame() + expected = DataFrame(index=Index([], dtype="int64", name="A")) + expected.columns.name = "D" tm.assert_frame_equal(result, expected) def test_pivot_table_no_column_raises(self):