diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index e12b00dd06b39..5f3671012e6d5 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -548,6 +548,32 @@ def time_groupby_sum(self):
         self.df.groupby(['a'])['b'].sum()
 
 
+class groupby_period(object):
+    # GH 14338
+    goal_time = 0.2
+
+    def make_grouper(self, N):
+        return pd.period_range('1900-01-01', freq='D', periods=N)
+
+    def setup(self):
+        N = 10000
+        self.grouper = self.make_grouper(N)
+        self.df = pd.DataFrame(np.random.randn(N, 2))
+
+    def time_groupby_sum(self):
+        self.df.groupby(self.grouper).sum()
+
+
+class groupby_datetime(groupby_period):
+    def make_grouper(self, N):
+        return pd.date_range('1900-01-01', freq='D', periods=N)
+
+
+class groupby_datetimetz(groupby_period):
+    def make_grouper(self, N):
+        return pd.date_range('1900-01-01', freq='D', periods=N,
+                             tz='US/Central')
+
 
 #----------------------------------------------------------------------
 # Series.value_counts
diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt
index 3edb8c1fa9071..8843a7849c200 100644
--- a/doc/source/whatsnew/v0.19.1.txt
+++ b/doc/source/whatsnew/v0.19.1.txt
@@ -20,7 +20,7 @@ Highlights include:
 
 Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
-
+- Fixed performance regression in factorization of ``Period`` data (:issue:`14338`)
 
 
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index ee59d6552bb2f..8644d4568e44d 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -285,18 +285,27 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
     note: an array of Periods will ignore sort as it
     returns an always sorted PeriodIndex
     """
-    from pandas import Index, Series, DatetimeIndex
-
-    vals = np.asarray(values)
-
-    # localize to UTC
-    is_datetimetz_type = is_datetimetz(values)
-    if is_datetimetz_type:
-        values = DatetimeIndex(values)
-        vals = values.asi8
+    from pandas import Index, Series, DatetimeIndex, PeriodIndex
+
+    # handling two possibilities here
+    # - for a numpy datetimelike simply view as i8 then cast back
+    # - for an extension datetimelike view as i8 then
+    #   reconstruct from boxed values to transfer metadata
+    dtype = None
+    if needs_i8_conversion(values):
+        if is_period_dtype(values):
+            values = PeriodIndex(values)
+            vals = values.asi8
+        elif is_datetimetz(values):
+            values = DatetimeIndex(values)
+            vals = values.asi8
+        else:
+            # numpy dtype
+            dtype = values.dtype
+            vals = values.view(np.int64)
+    else:
+        vals = np.asarray(values)
 
-    is_datetime = is_datetime64_dtype(vals)
-    is_timedelta = is_timedelta64_dtype(vals)
     (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)
 
     table = hash_klass(size_hint or len(vals))
@@ -311,13 +320,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
         uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
                                     assume_unique=True)
 
-    if is_datetimetz_type:
-        # reset tz
-        uniques = values._shallow_copy(uniques)
-    elif is_datetime:
-        uniques = uniques.astype('M8[ns]')
-    elif is_timedelta:
-        uniques = uniques.astype('m8[ns]')
+    if dtype is not None:
+        uniques = uniques.astype(dtype)
+
     if isinstance(values, Index):
         uniques = values._shallow_copy(uniques, name=None)
     elif isinstance(values, Series):
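
A minimal sketch of the round-trip behaviour the patched ``factorize`` describes in its comments, assuming a pandas build that includes this change (0.19.1 or later); the helper name ``check_uniques`` and the sample data are illustrative, not part of the PR:

# Illustrative check, not part of the patch: datetimelike inputs are hashed
# via an i8 view and the uniques come back with their original dtype or as
# a boxed PeriodIndex / tz-aware DatetimeIndex.
import numpy as np
import pandas as pd


def check_uniques(values):
    # hypothetical helper: factorize and return both outputs for inspection
    labels, uniques = pd.factorize(values)
    return labels, uniques


# extension datetimelike: uniques should come back boxed, metadata intact
labels, uniques = check_uniques(pd.period_range('1900-01-01', freq='D', periods=5))
print(type(uniques))   # PeriodIndex

labels, uniques = check_uniques(pd.date_range('1900-01-01', freq='D', periods=5,
                                              tz='US/Central'))
print(uniques.tz)      # tz preserved on the uniques

# plain numpy datetimelike: uniques are cast back from i8 to the input dtype
labels, uniques = check_uniques(np.array(['2016-01-01', '2016-01-02'],
                                         dtype='M8[ns]'))
print(uniques.dtype)   # datetime64[ns]

With this path in place, the new asv classes above (groupby_period, groupby_datetime, groupby_datetimetz) all exercise the same i8-based factorization during groupby.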