From 5e5059e0e528d859f77bef148baf497800f14cbe Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 8 May 2021 15:05:49 -0700 Subject: [PATCH 1/3] REF: re-use dispatch methods in DataFrameGroupBy.nunique --- pandas/core/groupby/generic.py | 61 ++++++++++++++++------------ pandas/core/groupby/groupby.py | 4 +- pandas/tests/groupby/test_groupby.py | 13 ++++-- 3 files changed, 46 insertions(+), 32 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9287163053cac..7de99a7735a53 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1627,21 +1627,21 @@ def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: return self._reindex_output(result)._convert(datetime=True) - def _iterate_column_groupbys(self): - for i, colname in enumerate(self._selected_obj.columns): + def _iterate_column_groupbys(self, obj: FrameOrSeries): + for i, colname in enumerate(obj.columns): yield colname, SeriesGroupBy( - self._selected_obj.iloc[:, i], + obj.iloc[:, i], selection=colname, grouper=self.grouper, exclusions=self.exclusions, ) - def _apply_to_column_groupbys(self, func) -> DataFrame: + def _apply_to_column_groupbys(self, func, obj: FrameOrSeries) -> DataFrame: from pandas.core.reshape.concat import concat - columns = self._selected_obj.columns + columns = obj.columns results = [ - func(col_groupby) for _, col_groupby in self._iterate_column_groupbys() + func(col_groupby) for _, col_groupby in self._iterate_column_groupbys(obj) ] if not len(results): @@ -1730,39 +1730,46 @@ def nunique(self, dropna: bool = True) -> DataFrame: """ from pandas.core.reshape.concat import concat - # TODO: this is duplicative of how GroupBy naturally works - # Try to consolidate with normal wrapping functions - obj = self._obj_with_exclusions + if self.axis == 0: - iter_func = obj.items + results = self._apply_to_column_groupbys( + lambda sgb: sgb.nunique(dropna), obj=obj + ) + results.columns.names = obj.columns.names # TODO: do at higher level? 
else: + # see test_groupby_crash_on_nunique + # TODO: this is duplicative of how GroupBy naturally works + # Try to consolidate with normal wrapping functions + iter_func = obj.iterrows - res_list = [ - SeriesGroupBy(content, selection=label, grouper=self.grouper).nunique( - dropna - ) - for label, content in iter_func() - ] - if res_list: - results = concat(res_list, axis=1) - results = cast(DataFrame, results) - else: - # concat would raise - results = DataFrame( - [], index=self.grouper.result_index, columns=obj.columns[:0] - ) + res_list = [ + SeriesGroupBy(content, selection=label, grouper=self.grouper).nunique( + dropna + ) + for label, content in iter_func() + ] + if res_list: + results = concat(res_list, axis=1) + results = cast(DataFrame, results) + else: + # concat would raise + results = DataFrame( + [], index=self.grouper.result_index, columns=obj.columns[:0] + ) - if self.axis == 1: results = results.T - other_axis = 1 - self.axis - results._get_axis(other_axis).names = obj._get_axis(other_axis).names + results.index.names = obj.index.names + if results.index.equals(obj.index): + # retain freq attribute on DatetimeIndex/TimedeltaIndex + results.index = obj.index.copy() if not self.as_index: results.index = ibase.default_index(len(results)) self._insert_inaxis_grouper_inplace(results) + return results @Appender(DataFrame.idxmax.__doc__) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 1105c1bd1d782..d6b0e118cc7ce 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1904,7 +1904,9 @@ def ohlc(self) -> DataFrame: ) return self._reindex_output(result) - return self._apply_to_column_groupbys(lambda x: x.ohlc()) + return self._apply_to_column_groupbys( + lambda x: x.ohlc(), self._obj_with_exclusions + ) @final @doc(DataFrame.describe) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index f716a3a44cd54..7e3feae844061 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2060,23 +2060,28 @@ def test_dup_labels_output_shape(groupby_func, idx): def test_groupby_crash_on_nunique(axis): # Fix following 30253 + dti = date_range("2016-01-01", periods=2, name="foo") df = DataFrame({("A", "B"): [1, 2], ("A", "C"): [1, 3], ("D", "B"): [0, 0]}) + df.columns.names = ("bar", "baz") + df.index = dti axis_number = df._get_axis_number(axis) if not axis_number: df = df.T - result = df.groupby(axis=axis_number, level=0).nunique() + gb = df.groupby(axis=axis_number, level=0) + result = gb.nunique() - expected = DataFrame({"A": [1, 2], "D": [1, 1]}) + expected = DataFrame({"A": [1, 2], "D": [1, 1]}, index=dti) + expected.columns.name = "bar" if not axis_number: expected = expected.T tm.assert_frame_equal(result, expected) # same thing, but empty columns - gb = df[[]].groupby(axis=axis_number, level=0) - res = gb.nunique() + gb2 = df[[]].groupby(axis=axis_number, level=0) + res = gb2.nunique() exp = expected[[]] tm.assert_frame_equal(res, exp) From 88fa90973a358c1223f738183cd036ced0dacec1 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 8 May 2021 20:27:04 -0700 Subject: [PATCH 2/3] REF: re-use machinery in DataFrameGroupBy.nunique --- pandas/core/groupby/generic.py | 46 +++++++--------------------- pandas/tests/groupby/test_groupby.py | 13 ++++++-- 2 files changed, 21 insertions(+), 38 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 7de99a7735a53..76c53f2888a8f 100644 --- a/pandas/core/groupby/generic.py +++ 
b/pandas/core/groupby/generic.py @@ -22,7 +22,6 @@ Mapping, TypeVar, Union, - cast, ) import warnings @@ -1576,6 +1575,10 @@ def _wrap_aggregated_output( if self.axis == 1: result = result.T + if result.index.equals(self.obj.index): + # Retain e.g. DatetimeIndex/TimedeltaIndex freq + result.index = self.obj.index.copy() + # TODO: Do this more systematically return self._reindex_output(result) @@ -1728,43 +1731,16 @@ def nunique(self, dropna: bool = True) -> DataFrame: 4 ham 5 x 5 ham 5 y """ - from pandas.core.reshape.concat import concat - - obj = self._obj_with_exclusions - if self.axis == 0: - results = self._apply_to_column_groupbys( - lambda sgb: sgb.nunique(dropna), obj=obj - ) - results.columns.names = obj.columns.names # TODO: do at higher level? - else: + if self.axis != 0: # see test_groupby_crash_on_nunique - # TODO: this is duplicative of how GroupBy naturally works - # Try to consolidate with normal wrapping functions + return self._python_agg_general(lambda sgb: sgb.nunique(dropna)) - iter_func = obj.iterrows - - res_list = [ - SeriesGroupBy(content, selection=label, grouper=self.grouper).nunique( - dropna - ) - for label, content in iter_func() - ] - if res_list: - results = concat(res_list, axis=1) - results = cast(DataFrame, results) - else: - # concat would raise - results = DataFrame( - [], index=self.grouper.result_index, columns=obj.columns[:0] - ) - - results = results.T - - results.index.names = obj.index.names - if results.index.equals(obj.index): - # retain freq attribute on DatetimeIndex/TimedeltaIndex - results.index = obj.index.copy() + obj = self._obj_with_exclusions + results = self._apply_to_column_groupbys( + lambda sgb: sgb.nunique(dropna), obj=obj + ) + results.columns.names = obj.columns.names # TODO: do at higher level? 
if not self.as_index: results.index = ibase.default_index(len(results)) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 7e3feae844061..67d2af46ac8ee 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2079,10 +2079,17 @@ def test_groupby_crash_on_nunique(axis): tm.assert_frame_equal(result, expected) - # same thing, but empty columns - gb2 = df[[]].groupby(axis=axis_number, level=0) + if axis_number == 0: + # same thing, but empty columns + gb2 = df[[]].groupby(axis=axis_number, level=0) + exp = expected[[]] + else: + # same thing, but empty rows + gb2 = df.loc[[]].groupby(axis=axis_number, level=0) + # default for empty when we can't infer a dtype is float64 + exp = expected.loc[[]].astype(np.float64) + res = gb2.nunique() - exp = expected[[]] tm.assert_frame_equal(res, exp) From a8564272a698218a8ed2654b98b1272e43623b33 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 9 May 2021 09:17:45 -0700 Subject: [PATCH 3/3] fix xfail --- pandas/tests/resample/test_time_grouper.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 5dc64a33098f3..7cc2b7f72fb69 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -121,12 +121,8 @@ def test_aaa_group_order(): tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 5)), df[4::5]) -def test_aggregate_normal(request, resample_method): +def test_aggregate_normal(resample_method): """Check TimeGrouper's aggregation is identical as normal groupby.""" - if resample_method == "ohlc": - request.node.add_marker( - pytest.mark.xfail(reason="DataError: No numeric types to aggregate") - ) data = np.random.randn(20, 4) normal_df = DataFrame(data, columns=["A", "B", "C", "D"])
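
Note (illustrative only, not part of the patch series): a minimal sketch of the
user-facing behavior exercised by the updated test_groupby_crash_on_nunique
above. It assumes a pandas build with these patches applied and a version in
which DataFrame.groupby(..., axis=1) is still supported.

    import pandas as pd

    # Mirrors the test setup: MultiIndex columns named ("bar", "baz") and a
    # DatetimeIndex named "foo" (its freq should survive the groupby).
    dti = pd.date_range("2016-01-01", periods=2, name="foo")
    df = pd.DataFrame({("A", "B"): [1, 2], ("A", "C"): [1, 3], ("D", "B"): [0, 0]})
    df.columns.names = ("bar", "baz")
    df.index = dti

    # axis=1: group the columns by their first level and count unique values
    # per row.  After these patches this path no longer crashes and the result
    # keeps the original DatetimeIndex, including its name and freq.
    result = df.groupby(level=0, axis=1).nunique()
    # -> columns ["A", "D"] (name "bar"), index == dti, values [[1, 1], [2, 1]]

    # axis=0 on the transposed frame produces the transposed result, as the
    # test asserts.
    result_t = df.T.groupby(level=0, axis=0).nunique()
    # -> index ["A", "D"] (name "bar"), columns == dti, values [[1, 2], [1, 1]]

After patch 2/3, the axis != 0 case is routed through _python_agg_general and the
axis == 0 case through _apply_to_column_groupbys, so nunique reuses the regular
wrapping machinery instead of the hand-rolled concat logic it replaced. Patch 1
also dispatches DataFrameGroupBy.ohlc through the same helper over
_obj_with_exclusions rather than _selected_obj, which appears to be what allows
the ohlc xfail to be dropped from test_aggregate_normal in patch 3.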