From aa02b839ee6dbe36c4552344b847b2b43eee97bb Mon Sep 17 00:00:00 2001 From: Richard Date: Sat, 25 Jul 2020 15:15:59 -0400 Subject: [PATCH 1/3] CLN: Clean/Simplify _wrap_applied_output --- pandas/core/groupby/generic.py | 235 +++++++++++------------------ pandas/core/indexes/api.py | 7 +- pandas/tests/groupby/test_apply.py | 8 +- 3 files changed, 97 insertions(+), 153 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ec7b14f27c5a1..c7659d902fd32 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1213,171 +1213,112 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: return self.obj._constructor(index=keys) - key_names = self.grouper.names - - # GH12824 + # GH12824 - If first value is None, can't assume all are None first_not_none = next(com.not_none(*values), None) if first_not_none is None: - # GH9684. If all values are None, then this will throw an error. - # We'd prefer it return an empty dataframe. + # GH9684 - All values are None, return an empty frame. return self.obj._constructor() - elif isinstance(first_not_none, DataFrame): + + if isinstance(first_not_none, DataFrame): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) - else: - if len(self.grouper.groupings) > 1: - key_index = self.grouper.result_index + if isinstance(first_not_none, NDFrame): + + # this is to silence a DeprecationWarning + # TODO: Remove when default dtype of empty Series is object + kwargs = first_not_none._construct_axes_dict() + if isinstance(first_not_none, Series): + backup = create_series_with_explicit_dtype( + **kwargs, dtype_if_empty=object + ) else: - ping = self.grouper.groupings[0] - if len(keys) == ping.ngroups: - key_index = ping.group_index - key_index.name = key_names[0] + backup = first_not_none._constructor(**kwargs) - key_lookup = Index(keys) - indexer = key_lookup.get_indexer(key_index) + values = [x if (x is not None) else backup for x in values] - # reorder the values - values = [values[i] for i in indexer] + key_index = self.grouper.result_index if self.as_index else None + v = values[0] - # update due to the potential reorder - first_not_none = next(com.not_none(*values), None) - else: + if not isinstance(v, (np.ndarray, Index, Series)): + # values are not series or array-like but scalars + # self._selection_name not passed through to Series as the + # result should not take the name of original selection + # of columns + if self.as_index: + return self.obj._constructor_sliced(values, index=key_index) + else: + result = DataFrame(values, index=key_index, columns=[self._selection]) + self._insert_inaxis_grouper_inplace(result) + return result + + if not isinstance(v, ABCSeries): + # GH1738: values is list of arrays of unequal lengths + # TODO: sure this is right? we used to do this + # after raising AttributeError above + return self.obj._constructor_sliced( + values, index=key_index, name=self._selection_name + ) - key_index = Index(keys, name=key_names[0]) + all_indexed_same = all_indexes_same((x.index for x in values)) + + # GH3596 - provide a reduction (Frame -> Series) if groups are unique + if self.squeeze: + # assign the name to this series + applied_index = self._selected_obj._get_axis(self.axis) + if len(values) == 1 and applied_index.nlevels == 1: + values[0].name = keys[0] + + # GH2893 + # we have series in the values array, we want to + # produce a series: + # if any of the sub-series are not indexed the same + # OR we don't have a multi-index and we have only a + # single values + return self._concat_objects( + keys, values, not_indexed_same=not_indexed_same + ) - # don't use the key indexer - if not self.as_index: - key_index = None + # still a series + # path added as of GH 5545 + elif all_indexed_same: + from pandas.core.reshape.concat import concat - # make Nones an empty object - if first_not_none is None: - return self.obj._constructor() - elif isinstance(first_not_none, NDFrame): + return concat(values) - # this is to silence a DeprecationWarning - # TODO: Remove when default dtype of empty Series is object - kwargs = first_not_none._construct_axes_dict() - if isinstance(first_not_none, Series): - backup = create_series_with_explicit_dtype( - **kwargs, dtype_if_empty=object - ) - else: - backup = first_not_none._constructor(**kwargs) - - values = [x if (x is not None) else backup for x in values] - - v = values[0] - - if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index: - if isinstance(v, Series): - applied_index = self._selected_obj._get_axis(self.axis) - all_indexed_same = all_indexes_same([x.index for x in values]) - singular_series = len(values) == 1 and applied_index.nlevels == 1 - - # GH3596 - # provide a reduction (Frame -> Series) if groups are - # unique - if self.squeeze: - # assign the name to this series - if singular_series: - values[0].name = keys[0] - - # GH2893 - # we have series in the values array, we want to - # produce a series: - # if any of the sub-series are not indexed the same - # OR we don't have a multi-index and we have only a - # single values - return self._concat_objects( - keys, values, not_indexed_same=not_indexed_same - ) - - # still a series - # path added as of GH 5545 - elif all_indexed_same: - from pandas.core.reshape.concat import concat - - return concat(values) - - if not all_indexed_same: - # GH 8467 - return self._concat_objects(keys, values, not_indexed_same=True) - - if self.axis == 0 and isinstance(v, ABCSeries): - # GH6124 if the list of Series have a consistent name, - # then propagate that name to the result. - index = v.index.copy() - if index.name is None: - # Only propagate the series name to the result - # if all series have a consistent name. If the - # series do not have a consistent name, do - # nothing. - names = {v.name for v in values} - if len(names) == 1: - index.name = list(names)[0] - - # normally use vstack as its faster than concat - # and if we have mi-columns - if ( - isinstance(v.index, MultiIndex) - or key_index is None - or isinstance(key_index, MultiIndex) - ): - stacked_values = np.vstack([np.asarray(v) for v in values]) - result = self.obj._constructor( - stacked_values, index=key_index, columns=index - ) - else: - # GH5788 instead of stacking; concat gets the - # dtypes correct - from pandas.core.reshape.concat import concat - - result = concat( - values, - keys=key_index, - names=key_index.names, - axis=self.axis, - ).unstack() - result.columns = index - elif isinstance(v, ABCSeries): - stacked_values = np.vstack([np.asarray(v) for v in values]) - result = self.obj._constructor( - stacked_values.T, index=v.index, columns=key_index - ) - elif not self.as_index: - # We add grouping column below, so create a frame here - result = DataFrame( - values, index=key_index, columns=[self._selection] - ) - else: - # GH#1738: values is list of arrays of unequal lengths - # fall through to the outer else clause - # TODO: sure this is right? we used to do this - # after raising AttributeError above - return self.obj._constructor_sliced( - values, index=key_index, name=self._selection_name - ) + if not all_indexed_same: + # GH 8467 + return self._concat_objects(keys, values, not_indexed_same=True) - # if we have date/time like in the original, then coerce dates - # as we are stacking can easily have object dtypes here - so = self._selected_obj - if so.ndim == 2 and so.dtypes.apply(needs_i8_conversion).any(): - result = _recast_datetimelike_result(result) - else: - result = result._convert(datetime=True) + stacked_values = np.vstack([np.asarray(v) for v in values]) - if not self.as_index: - self._insert_inaxis_grouper_inplace(result) + if self.axis == 0: + index = key_index + columns = v.index.copy() + if columns.name is None: + # GH6124 - propagate name of Series when it's consistent + names = {v.name for v in values} + if len(names) == 1: + columns.name = list(names)[0] + else: + index = v.index + columns = key_index + stacked_values = stacked_values.T - return self._reindex_output(result) + result = self.obj._constructor(stacked_values, index=index, columns=columns) - # values are not series or array-like but scalars - else: - # self._selection_name not passed through to Series as the - # result should not take the name of original selection - # of columns - return self.obj._constructor_sliced(values, index=key_index) + # if we have date/time like in the original, then coerce dates + # as we are stacking can easily have object dtypes here + so = self._selected_obj + if so.ndim == 2 and so.dtypes.apply(needs_i8_conversion).any(): + result = _recast_datetimelike_result(result) + else: + result = result._convert(datetime=True) + + if not self.as_index: + self._insert_inaxis_grouper_inplace(result) + + return self._reindex_output(result) def _transform_general( self, func, *args, engine="cython", engine_kwargs=None, **kwargs diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 4c5a70f4088ee..678753f684141 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -298,15 +298,16 @@ def all_indexes_same(indexes): Parameters ---------- - indexes : list of Index objects + indexes : iterable of Index objects Returns ------- bool True if all indexes contain the same elements, False otherwise. """ - first = indexes[0] - for index in indexes[1:]: + itr = iter(indexes) + first = next(itr) + for index in itr: if not first.equals(index): return False return True diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 5a1268bfb03db..8e8053d1296b5 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -868,13 +868,15 @@ def test_apply_multi_level_name(category): b = [1, 2] * 5 if category: b = pd.Categorical(b, categories=[1, 2, 3]) + expected_index = pd.CategoricalIndex([1, 2], categories=[1, 2, 3], name="B") + else: + expected_index = pd.Index([1, 2], name="B") df = pd.DataFrame( {"A": np.arange(10), "B": b, "C": list(range(10)), "D": list(range(10))} ).set_index(["A", "B"]) result = df.groupby("B").apply(lambda x: x.sum()) - expected = pd.DataFrame( - {"C": [20, 25], "D": [20, 25]}, index=pd.Index([1, 2], name="B") - ) + + expected = pd.DataFrame({"C": [20, 25], "D": [20, 25]}, index=expected_index) tm.assert_frame_equal(result, expected) assert df.index.names == ["A", "B"] From 21e1fcae71323c001bf07fa46687245e48d65bb9 Mon Sep 17 00:00:00 2001 From: Richard Date: Tue, 4 Aug 2020 18:04:12 -0400 Subject: [PATCH 2/3] Refactored if-else. --- pandas/core/groupby/generic.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c7659d902fd32..8710464769a12 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1223,6 +1223,8 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if isinstance(first_not_none, DataFrame): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) + key_index = self.grouper.result_index if self.as_index else None + if isinstance(first_not_none, NDFrame): # this is to silence a DeprecationWarning @@ -1236,11 +1238,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): backup = first_not_none._constructor(**kwargs) values = [x if (x is not None) else backup for x in values] - - key_index = self.grouper.result_index if self.as_index else None - v = values[0] - - if not isinstance(v, (np.ndarray, Index, Series)): + else: # values are not series or array-like but scalars # self._selection_name not passed through to Series as the # result should not take the name of original selection @@ -1252,6 +1250,8 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): self._insert_inaxis_grouper_inplace(result) return result + v = values[0] + if not isinstance(v, ABCSeries): # GH1738: values is list of arrays of unequal lengths # TODO: sure this is right? we used to do this From 059405cb15189446ed81bbd2f17de7f1704099ae Mon Sep 17 00:00:00 2001 From: Richard Date: Fri, 14 Aug 2020 13:52:40 -0400 Subject: [PATCH 3/3] Reworked logic for non-NDFrame cases --- pandas/core/groupby/generic.py | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 94fee533d98a6..449099e5ce073 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1231,20 +1231,8 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): key_index = self.grouper.result_index if self.as_index else None - if isinstance(first_not_none, NDFrame): - - # this is to silence a DeprecationWarning - # TODO: Remove when default dtype of empty Series is object - kwargs = first_not_none._construct_axes_dict() - if isinstance(first_not_none, Series): - backup = create_series_with_explicit_dtype( - **kwargs, dtype_if_empty=object - ) - else: - backup = first_not_none._constructor(**kwargs) + if not isinstance(first_not_none, (Series, np.ndarray, Index)): - values = [x if (x is not None) else backup for x in values] - else: # values are not series or array-like but scalars # self._selection_name not passed through to Series as the # result should not take the name of original selection @@ -1256,9 +1244,8 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): self._insert_inaxis_grouper_inplace(result) return result - v = values[0] + elif not isinstance(first_not_none, Series): - if not isinstance(v, ABCSeries): # GH1738: values is list of arrays of unequal lengths # TODO: sure this is right? we used to do this # after raising AttributeError above @@ -1266,6 +1253,14 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): values, index=key_index, name=self._selection_name ) + # this is to silence a DeprecationWarning + # TODO: Replace when default dtype of empty Series is object + # with backup = first_not_none._constructor(**kwargs) + kwargs = first_not_none._construct_axes_dict() + backup = create_series_with_explicit_dtype(**kwargs, dtype_if_empty=object) + values = [x if (x is not None) else backup for x in values] + + v = values[0] all_indexed_same = all_indexes_same((x.index for x in values)) # GH3596 - provide a reduction (Frame -> Series) if groups are unique @@ -1273,7 +1268,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # assign the name to this series applied_index = self._selected_obj._get_axis(self.axis) if len(values) == 1 and applied_index.nlevels == 1: - values[0].name = keys[0] + v.name = keys[0] # GH2893 # we have series in the values array, we want to