REF: re-use machinery for DataFrameGroupBy.nunique #41390

Merged · 3 commits · May 10, 2021

53 changes: 18 additions & 35 deletions pandas/core/groupby/generic.py
@@ -22,7 +22,6 @@
     Mapping,
     TypeVar,
     Union,
-    cast,
 )
 import warnings

@@ -1576,6 +1575,10 @@ def _wrap_aggregated_output(

         if self.axis == 1:
             result = result.T
+            if result.index.equals(self.obj.index):
+                # Retain e.g. DatetimeIndex/TimedeltaIndex freq
+                result.index = self.obj.index.copy()
+                # TODO: Do this more systematically

         return self._reindex_output(result)
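
For context, a hedged illustration of what retaining the index buys (this assumes the axis=1 `nunique` path below ends up in this wrapper; the frame mirrors the updated test):

```python
import pandas as pd

dti = pd.date_range("2016-01-01", periods=2, freq="D", name="foo")
df = pd.DataFrame(
    {("A", "B"): [1, 2], ("A", "C"): [1, 3], ("D", "B"): [0, 0]}, index=dti
)
df.columns.names = ("bar", "baz")

# Grouping along the columns should hand back the original DatetimeIndex,
# freq included, rather than an equal-but-freq-less index.
result = df.groupby(axis=1, level=0).nunique()
print(result.index.freq)  # expected after this change: <Day>, not None
```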

@@ -1627,21 +1630,21 @@ def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame:

         return self._reindex_output(result)._convert(datetime=True)

-    def _iterate_column_groupbys(self):
-        for i, colname in enumerate(self._selected_obj.columns):
+    def _iterate_column_groupbys(self, obj: FrameOrSeries):
+        for i, colname in enumerate(obj.columns):
             yield colname, SeriesGroupBy(
-                self._selected_obj.iloc[:, i],
+                obj.iloc[:, i],
                 selection=colname,
                 grouper=self.grouper,
                 exclusions=self.exclusions,
             )

-    def _apply_to_column_groupbys(self, func) -> DataFrame:
+    def _apply_to_column_groupbys(self, func, obj: FrameOrSeries) -> DataFrame:
         from pandas.core.reshape.concat import concat

-        columns = self._selected_obj.columns
+        columns = obj.columns
         results = [
-            func(col_groupby) for _, col_groupby in self._iterate_column_groupbys()
+            func(col_groupby) for _, col_groupby in self._iterate_column_groupbys(obj)
         ]

         if not len(results):
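
Threading `obj` through explicitly lets callers decide what gets split column-wise (e.g. `_obj_with_exclusions`). As a minimal standalone sketch of the pattern, using only public API — `column_wise_apply` is an illustrative name, not a pandas internal:

```python
import pandas as pd

def column_wise_apply(df: pd.DataFrame, by: str, func) -> pd.DataFrame:
    # One SeriesGroupBy per column, all sharing the same grouping key,
    # then glue the per-column results back together along the columns.
    pieces = {col: func(df.groupby(by)[col]) for col in df.columns if col != by}
    return pd.concat(pieces, axis=1)

df = pd.DataFrame({"key": ["a", "a", "b"], "x": [1, 1, 2], "y": [3, 4, 4]})
print(column_wise_apply(df, "key", lambda sgb: sgb.nunique()))
# expected:
#      x  y
# key
# a    1  2
# b    1  1
```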
@@ -1728,41 +1731,21 @@ def nunique(self, dropna: bool = True) -> DataFrame:
         4   ham       5      x
         5   ham       5      y
         """
-        from pandas.core.reshape.concat import concat
-
-        # TODO: this is duplicative of how GroupBy naturally works
-        # Try to consolidate with normal wrapping functions
+        if self.axis != 0:
+            # see test_groupby_crash_on_nunique
+            return self._python_agg_general(lambda sgb: sgb.nunique(dropna))

         obj = self._obj_with_exclusions
-        if self.axis == 0:
-            iter_func = obj.items
-        else:
-            iter_func = obj.iterrows
-
-        res_list = [
-            SeriesGroupBy(content, selection=label, grouper=self.grouper).nunique(
-                dropna
-            )
-            for label, content in iter_func()
-        ]
-        if res_list:
-            results = concat(res_list, axis=1)
-            results = cast(DataFrame, results)
-        else:
-            # concat would raise
-            results = DataFrame(
-                [], index=self.grouper.result_index, columns=obj.columns[:0]
-            )
-
-        if self.axis == 1:
-            results = results.T
-
-        other_axis = 1 - self.axis
-        results._get_axis(other_axis).names = obj._get_axis(other_axis).names
+        results = self._apply_to_column_groupbys(
+            lambda sgb: sgb.nunique(dropna), obj=obj
+        )
+        results.columns.names = obj.columns.names  # TODO: do at higher level?

         if not self.as_index:
             results.index = ibase.default_index(len(results))
             self._insert_inaxis_grouper_inplace(results)

         return results

     @Appender(DataFrame.idxmax.__doc__)
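
The docstring example above is untouched by the refactor; user-facing behaviour should stay the same while the wrapping now goes through `_apply_to_column_groupbys`:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "id": ["spam", "egg", "egg", "spam", "ham", "ham"],
        "value1": [1, 5, 5, 2, 5, 5],
        "value2": list("abbaxy"),
    }
)
print(df.groupby("id").nunique())
# expected (as in the docstring above):
#       value1  value2
# id
# egg        1       1
# ham        1       2
# spam       2       1
```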
4 changes: 3 additions & 1 deletion pandas/core/groupby/groupby.py
@@ -1904,7 +1904,9 @@ def ohlc(self) -> DataFrame:
             )
             return self._reindex_output(result)

-        return self._apply_to_column_groupbys(lambda x: x.ohlc())
+        return self._apply_to_column_groupbys(
+            lambda x: x.ohlc(), self._obj_with_exclusions
+        )

     @final
     @doc(DataFrame.describe)
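
Passing `self._obj_with_exclusions` explicitly keeps the grouping key out of the columns being aggregated. A hedged sketch of the user-visible behaviour (the printed layout is approximate):

```python
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b", "b"], "x": [1.0, 3.0, 2.0, 0.5]})
print(df.groupby("key").ohlc())
# expected: one open/high/low/close block per remaining column, and no
# block for "key" itself, roughly:
#        x
#     open high  low close
# key
# a    1.0  3.0  1.0   3.0
# b    2.0  2.0  0.5   0.5
```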
24 changes: 18 additions & 6 deletions pandas/tests/groupby/test_groupby.py
Expand Up @@ -2060,24 +2060,36 @@ def test_dup_labels_output_shape(groupby_func, idx):

def test_groupby_crash_on_nunique(axis):
# Fix following 30253
dti = date_range("2016-01-01", periods=2, name="foo")
df = DataFrame({("A", "B"): [1, 2], ("A", "C"): [1, 3], ("D", "B"): [0, 0]})
df.columns.names = ("bar", "baz")
df.index = dti

axis_number = df._get_axis_number(axis)
if not axis_number:
df = df.T

result = df.groupby(axis=axis_number, level=0).nunique()
gb = df.groupby(axis=axis_number, level=0)
result = gb.nunique()

expected = DataFrame({"A": [1, 2], "D": [1, 1]})
expected = DataFrame({"A": [1, 2], "D": [1, 1]}, index=dti)
expected.columns.name = "bar"
if not axis_number:
expected = expected.T

tm.assert_frame_equal(result, expected)

# same thing, but empty columns
gb = df[[]].groupby(axis=axis_number, level=0)
res = gb.nunique()
exp = expected[[]]
if axis_number == 0:
# same thing, but empty columns
gb2 = df[[]].groupby(axis=axis_number, level=0)
exp = expected[[]]
else:
# same thing, but empty rows
gb2 = df.loc[[]].groupby(axis=axis_number, level=0)
# default for empty when we can't infer a dtype is float64
exp = expected.loc[[]].astype(np.float64)

res = gb2.nunique()
tm.assert_frame_equal(res, exp)


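
The empty-columns branch above corresponds to a selection with nothing left to aggregate; a hedged illustration with the public API:

```python
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "x": [1, 2, 2]})

# Only the grouping key is selected, so no aggregable columns remain;
# the result should still carry the group index, just with zero columns.
res = df[["key"]].groupby("key").nunique()
print(res.shape)        # expected: (2, 0)
print(list(res.index))  # expected: ['a', 'b']
```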
6 changes: 1 addition & 5 deletions pandas/tests/resample/test_time_grouper.py
@@ -121,12 +121,8 @@ def test_aaa_group_order():
     tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 5)), df[4::5])


-def test_aggregate_normal(request, resample_method):
+def test_aggregate_normal(resample_method):
     """Check TimeGrouper's aggregation is identical as normal groupby."""
-    if resample_method == "ohlc":
-        request.node.add_marker(
-            pytest.mark.xfail(reason="DataError: No numeric types to aggregate")
-        )

     data = np.random.randn(20, 4)
     normal_df = DataFrame(data, columns=["A", "B", "C", "D"])
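
With the xfail removed, the ohlc case now runs through the same comparison as the other resample methods. A condensed, hedged sketch of that check (structure mirrors the test; the frames are illustrative):

```python
from datetime import datetime

import numpy as np
import pandas as pd
import pandas._testing as tm

data = np.random.randn(20, 4)

normal_df = pd.DataFrame(data, columns=["A", "B", "C", "D"])
normal_df["key"] = [1, 2, 3, 4, 5] * 4

dt_df = pd.DataFrame(data, columns=["A", "B", "C", "D"])
dt_df["key"] = [datetime(2013, 1, day) for day in [1, 2, 3, 4, 5]] * 4

normal_grouped = normal_df.groupby("key")
dt_grouped = dt_df.groupby(pd.Grouper(freq="D", key="key"))

# TimeGrouper aggregation should now match the plain groupby, ohlc included.
expected = normal_grouped.ohlc()
expected.index = pd.date_range(start="2013-01-01", freq="D", periods=5, name="key")
tm.assert_frame_equal(expected, dt_grouped.ohlc())
```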