Skip to content

Commit b3e9ae7

Browse files
PERF: groupby with string dtype (#43634)
1 parent ffbeda7 commit b3e9ae7

File tree

3 files changed: 40 additions and 1 deletion.

asv_bench/benchmarks/groupby.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -603,6 +603,38 @@ def time_sum(self):
603603
self.df.groupby(["a"])["b"].sum()
604604

605605

606+
class String:
    """Benchmark groupby aggregations on string-valued columns.

    Each benchmark run is parametrized by the column ``dtype`` and by the
    name of the aggregation ``method`` that is passed to ``agg``.
    """

    # GH#41596
    param_names = ["dtype", "method"]
    params = [
        ["str", "string[python]"],
        [
            "sum",
            "prod",
            "min",
            "max",
            "mean",
            "median",
            "var",
            "first",
            "last",
            "any",
            "all",
        ],
    ]

    def setup(self, dtype, method):
        """Build the frame that ``time_str_func`` aggregates.

        Parameters
        ----------
        dtype : str
            Dtype applied to every column of the frame.
        method : str
            Not used here; it is part of the signature because both
            parameters are passed to every method of the benchmark.
        """
        cols = list("abcdefghjkl")
        # 1,000,000 rows of random integers drawn from [0, 100), one column
        # per entry in ``cols``, converted to ``dtype`` by the constructor.
        self.df = DataFrame(
            np.random.randint(0, 100, size=(1_000_000, len(cols))),
            columns=cols,
            dtype=dtype,
        )

    def time_str_func(self, dtype, method):
        """Group by column ``"a"`` and apply ``method`` to all other columns.

        Parameters
        ----------
        dtype : str
            Not used here; the frame was already built with it in ``setup``.
        method : str
            Aggregation name handed to ``agg``.
        """
        self.df.groupby("a")[self.df.columns[1:]].agg(method)
636+
637+
606638
class Categories:
607639
def setup(self):
608640
N = 10 ** 5

doc/source/whatsnew/v1.3.4.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ Fixed regressions
1717
- Fixed regression in :meth:`merge` with integer and ``NaN`` keys failing with ``outer`` merge (:issue:`43550`)
1818
- Fixed regression in :meth:`DataFrame.corr` raising ``ValueError`` with ``method="spearman"`` on 32-bit platforms (:issue:`43588`)
1919
- Fixed performance regression in :meth:`MultiIndex.equals` (:issue:`43549`)
20+
- Fixed performance regression in :meth:`.GroupBy.first` and :meth:`.GroupBy.last` with :class:`StringDtype` (:issue:`41596`)
2021
- Fixed regression in :meth:`Series.cat.reorder_categories` failing to update the categories on the ``Series`` (:issue:`43232`)
2122
- Fixed regression in :meth:`Series.cat.categories` setter failing to update the categories on the ``Series`` (:issue:`43334`)
2223
- Fixed regression in :meth:`pandas.read_csv` raising ``UnicodeDecodeError`` exception when ``memory_map=True`` (:issue:`43540`)

pandas/core/groupby/ops.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@
8282
BaseMaskedArray,
8383
BaseMaskedDtype,
8484
)
85+
from pandas.core.arrays.string_ import StringDtype
8586
from pandas.core.frame import DataFrame
8687
from pandas.core.generic import NDFrame
8788
from pandas.core.groupby import grouper
@@ -348,6 +349,9 @@ def _ea_wrap_cython_operation(
348349
elif isinstance(values.dtype, FloatingDtype):
349350
# FloatingArray
350351
npvalues = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan)
352+
elif isinstance(values.dtype, StringDtype):
353+
# StringArray
354+
npvalues = values.to_numpy(object, na_value=np.nan)
351355
else:
352356
raise NotImplementedError(
353357
f"function is not implemented for this dtype: {values.dtype}"
@@ -375,7 +379,9 @@ def _reconstruct_ea_result(self, values, res_values):
375379
"""
376380
# TODO: allow EAs to override this logic
377381

378-
if isinstance(values.dtype, (BooleanDtype, _IntegerDtype, FloatingDtype)):
382+
if isinstance(
383+
values.dtype, (BooleanDtype, _IntegerDtype, FloatingDtype, StringDtype)
384+
):
379385
dtype = self._get_result_dtype(values.dtype)
380386
cls = dtype.construct_array_type()
381387
return cls._from_sequence(res_values, dtype=dtype)

0 commit comments

Comments
 (0)