From dea76c10b9b6d6c031c22efb918af5b03780b4f6 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Wed, 22 Jan 2020 19:36:00 -0700 Subject: [PATCH 1/4] REGR: Fix GroupBy aggregation with ExtensionArray backed index --- pandas/_libs/reduction.pyx | 6 +++--- pandas/tests/groupby/aggregate/test_aggregate.py | 11 +++++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 8571761f77265..635c0e36d7457 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -158,7 +158,7 @@ cdef class _BaseGrouper: if util.is_array(values) and not values.flags.contiguous: # e.g. Categorical has no `flags` attribute values = values.copy() - index = dummy.index.values + index = dummy.index.to_numpy() if not index.flags.contiguous: index = index.copy() @@ -229,7 +229,7 @@ cdef class SeriesBinGrouper(_BaseGrouper): self.arr = values self.typ = series._constructor self.ityp = series.index._constructor - self.index = series.index.values + self.index = series.index.to_numpy() self.name = series.name self.dummy_arr, self.dummy_index = self._check_dummy(dummy) @@ -326,7 +326,7 @@ cdef class SeriesGrouper(_BaseGrouper): self.arr = values self.typ = series._constructor self.ityp = series.index._constructor - self.index = series.index.values + self.index = series.index.to_numpy() self.name = series.name self.dummy_arr, self.dummy_index = self._check_dummy(dummy) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 0a7272bbc131c..3629b0b52b3aa 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -360,6 +360,17 @@ def test_func_duplicates_raises(): df.groupby("A").agg(["min", "min"]) +@pytest.mark.parametrize( + "index", [pd.CategoricalIndex(list("abc")), pd.interval_range(0, 3)] +) +def test_agg_with_ea_backed_index(index): + # GH 31223 + df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) + result = df.groupby("group").agg({"value": Series.nunique}) + expected = DataFrame({"group": [1, 2], "value": [2, 1]}).set_index("group") + tm.assert_frame_equal(result, expected) + + class TestNamedAggregationSeries: def test_series_named_agg(self): df = pd.Series([1, 2, 3, 4]) From ed3a9a340bfff9287351489e5889eac32e774394 Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Sun, 26 Jan 2020 13:45:58 -0700 Subject: [PATCH 2/4] add back _has_complex_internals --- pandas/_libs/reduction.pyx | 6 +++--- pandas/core/apply.py | 7 +++---- pandas/core/groupby/ops.py | 4 ++-- pandas/core/indexes/base.py | 8 ++++++++ pandas/core/indexes/category.py | 5 +++++ pandas/core/indexes/interval.py | 5 +++++ pandas/core/indexes/multi.py | 5 +++++ pandas/core/indexes/period.py | 5 +++++ pandas/tests/groupby/aggregate/test_aggregate.py | 10 ++++++++-- 9 files changed, 44 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 635c0e36d7457..8571761f77265 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -158,7 +158,7 @@ cdef class _BaseGrouper: if util.is_array(values) and not values.flags.contiguous: # e.g. Categorical has no `flags` attribute values = values.copy() - index = dummy.index.to_numpy() + index = dummy.index.values if not index.flags.contiguous: index = index.copy() @@ -229,7 +229,7 @@ cdef class SeriesBinGrouper(_BaseGrouper): self.arr = values self.typ = series._constructor self.ityp = series.index._constructor - self.index = series.index.to_numpy() + self.index = series.index.values self.name = series.name self.dummy_arr, self.dummy_index = self._check_dummy(dummy) @@ -326,7 +326,7 @@ cdef class SeriesGrouper(_BaseGrouper): self.arr = values self.typ = series._constructor self.ityp = series.index._constructor - self.index = series.index.to_numpy() + self.index = series.index.values self.name = series.name self.dummy_arr, self.dummy_index = self._check_dummy(dummy) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index ca1be3154757a..9947866a76e3f 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -14,7 +14,7 @@ is_list_like, is_sequence, ) -from pandas.core.dtypes.generic import ABCMultiIndex, ABCSeries +from pandas.core.dtypes.generic import ABCSeries from pandas.core.construction import create_series_with_explicit_dtype @@ -278,9 +278,8 @@ def apply_standard(self): if ( self.result_type in ["reduce", None] and not self.dtypes.apply(is_extension_array_dtype).any() - # Disallow complex_internals since libreduction shortcut - # cannot handle MultiIndex - and not isinstance(self.agg_axis, ABCMultiIndex) + # Disallow complex_internals since libreduction shortcut raises a TypeError + and not self.agg_axis._has_complex_internals ): values = self.values diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 37067a1897a52..844ad9a7068ea 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -616,8 +616,8 @@ def agg_series(self, obj: Series, func): # TODO: can we get a performant workaround for EAs backed by ndarray? return self._aggregate_series_pure_python(obj, func) - elif isinstance(obj.index, MultiIndex): - # MultiIndex; Pre-empt TypeError in _aggregate_series_fast + elif obj.index._has_complex_internals: + # Pre-empt TypeError in _aggregate_series_fast return self._aggregate_series_pure_python(obj, func) try: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c158bdfbac441..7f55f4f3f1357 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4099,6 +4099,14 @@ def _assert_can_do_op(self, value): if not is_scalar(value): raise TypeError(f"'value' must be a scalar, passed: {type(value).__name__}") + @property + def _has_complex_internals(self): + """ + Indicates if an index is not directly backed by a numpy array + """ + # used to disable groupby tricks + return False + def _is_memory_usage_qualified(self) -> bool: """ Return a boolean if we need a qualified .info display. diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 268ab9ba4e4c4..4f02faacad3df 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -378,6 +378,11 @@ def values(self): """ return the underlying data, which is a Categorical """ return self._data + @property + def _has_complex_internals(self): + # to disable groupby tricks + return True + def _wrap_setop_result(self, other, result): name = get_op_result_name(self, other) # We use _shallow_copy rather than the Index implementation diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 3108c1a1afd0c..1bdab23d039fe 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -408,6 +408,11 @@ def values(self): """ return self._data + @property + def _has_complex_internals(self): + # to disable groupby tricks + return True + def __array_wrap__(self, result, context=None): # we don't want the superclass implementation return result diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a26a01ab7be21..d9546685e212e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1346,6 +1346,11 @@ def values(self): self._tuples = lib.fast_zip(values) return self._tuples + @property + def _has_complex_internals(self): + # to disable groupby tricks + return True + @cache_readonly def is_monotonic_increasing(self) -> bool: """ diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 2a40f4a6f6239..d2509aac8b5d1 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -255,6 +255,11 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs): def values(self): return np.asarray(self) + @property + def _has_complex_internals(self): + # to disable groupby tricks + return True + def _shallow_copy(self, values=None, **kwargs): # TODO: simplify, figure out type of values if values is None: diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 3629b0b52b3aa..67bdcc246579e 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -361,9 +361,15 @@ def test_func_duplicates_raises(): @pytest.mark.parametrize( - "index", [pd.CategoricalIndex(list("abc")), pd.interval_range(0, 3)] + "index", + [ + pd.CategoricalIndex(list("abc")), + pd.interval_range(0, 3), + pd.period_range("2020", periods=3, freq="D"), + pd.MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]), + ], ) -def test_agg_with_ea_backed_index(index): +def test_agg_index_has_complex_internals(index): # GH 31223 df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) result = df.groupby("group").agg({"value": Series.nunique}) From 1018ca9383e672fd6f606d0cc7adb4b1d82cc2bf Mon Sep 17 00:00:00 2001 From: Jeremy Schendel Date: Mon, 27 Jan 2020 09:01:32 -0700 Subject: [PATCH 3/4] prevent fast_apply case for _has_complex_internals --- pandas/core/groupby/ops.py | 4 ++-- pandas/tests/groupby/test_apply.py | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 844ad9a7068ea..679d3668523c2 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -164,8 +164,8 @@ def apply(self, f, data: FrameOrSeries, axis: int = 0): com.get_callable_name(f) not in base.plotting_methods and isinstance(splitter, FrameSplitter) and axis == 0 - # apply_frame_axis0 doesn't allow MultiIndex - and not isinstance(sdata.index, MultiIndex) + # fast_apply/libreduction doesn't allow non-numpy backed indexes + and not sdata.index._has_complex_internals ): try: result_values, mutated = splitter.fast_apply(f, group_keys) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index fc7b9f56002d8..c18ef73203914 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -811,3 +811,19 @@ def test_groupby_apply_datetime_result_dtypes(): index=["observation", "color", "mood", "intensity", "score"], ) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "index", + [ + pd.CategoricalIndex(list("abc")), + pd.interval_range(0, 3), + pd.period_range("2020", periods=3, freq="D"), + pd.MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]), + ], +) +def test_apply_index_has_complex_internals(index): + # GH 31248 + df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index) + result = df.groupby("group").apply(lambda x: x) + tm.assert_frame_equal(result, df) From cc400cd6d9a61c5ba0e7c1251cce8aee0b10a1a4 Mon Sep 17 00:00:00 2001 From: jschendel Date: Mon, 27 Jan 2020 18:07:07 -0700 Subject: [PATCH 4/4] update comment --- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/category.py | 2 +- pandas/core/indexes/interval.py | 2 +- pandas/core/indexes/multi.py | 2 +- pandas/core/indexes/period.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index edc97fccc89cf..00f0984d43578 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4114,7 +4114,7 @@ def _has_complex_internals(self): """ Indicates if an index is not directly backed by a numpy array """ - # used to disable groupby tricks + # used to avoid libreduction code paths, which raise or require conversion return False def _is_memory_usage_qualified(self) -> bool: diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 4f02faacad3df..36ba86bdc9f07 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -380,7 +380,7 @@ def values(self): @property def _has_complex_internals(self): - # to disable groupby tricks + # used to avoid libreduction code paths, which raise or require conversion return True def _wrap_setop_result(self, other, result): diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index af9264393c9b9..89db2b4a7a379 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -413,7 +413,7 @@ def values(self): @property def _has_complex_internals(self): - # to disable groupby tricks + # used to avoid libreduction code paths, which raise or require conversion return True def __array_wrap__(self, result, context=None): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 8e745a7ee38aa..2aa7a00707f24 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1348,7 +1348,7 @@ def values(self): @property def _has_complex_internals(self): - # to disable groupby tricks + # used to avoid libreduction code paths, which raise or require conversion return True @cache_readonly diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 4541038d44fd4..106aaeec4f8be 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -257,7 +257,7 @@ def values(self): @property def _has_complex_internals(self): - # to disable groupby tricks + # used to avoid libreduction code paths, which raise or require conversion return True def _shallow_copy(self, values=None, **kwargs):