From d3b5992d3c3898675ff0d131c9f85a9f36b8dc07 Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 4 Oct 2016 20:59:58 -0500 Subject: [PATCH 1/5] PERF: period factorization --- asv_bench/benchmarks/groupby.py | 12 ++++++++++++ pandas/core/algorithms.py | 22 ++++++++++++---------- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index e12b00dd06b39..6d9262e4edf47 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -547,6 +547,18 @@ def setup(self): def time_groupby_sum(self): self.df.groupby(['a'])['b'].sum() +class groupby_period(object): + # GH 14338 + goal_time = 0.2 + + def setup(self): + N = 10000 + self.pi = pd.period_range('1900-01-01', freq='D', periods=N) + self.df = pd.DataFrame(np.random.randn(N, 2)) + + def time_groupby_sum(self): + self.df.groupby(self.pi).sum() + #---------------------------------------------------------------------- # Series.value_counts diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ee59d6552bb2f..4401a5828c6ae 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -14,6 +14,7 @@ is_categorical_dtype, is_extension_type, is_datetimetz, + is_period, is_period_dtype, is_period_arraylike, is_float_dtype, @@ -285,15 +286,19 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): note: an array of Periods will ignore sort as it returns an always sorted PeriodIndex """ - from pandas import Index, Series, DatetimeIndex + from pandas import Index, Series, DatetimeIndex, PeriodIndex - vals = np.asarray(values) - # localize to UTC - is_datetimetz_type = is_datetimetz(values) - if is_datetimetz_type: + if is_datetimetz(values): values = DatetimeIndex(values) - vals = values.asi8 + + if is_period_dtype(values): + values = PeriodIndex(values) + # period array interface goes to object so intercept + vals = values.view(np.int64) + else: + vals = np.asarray(values) + is_datetime = is_datetime64_dtype(vals) is_timedelta = is_timedelta64_dtype(vals) @@ -311,10 +316,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel, assume_unique=True) - if is_datetimetz_type: - # reset tz - uniques = values._shallow_copy(uniques) - elif is_datetime: + if is_datetime: uniques = uniques.astype('M8[ns]') elif is_timedelta: uniques = uniques.astype('m8[ns]') From c9c3d7ebb3371794c4971ad82b2ef2bcb8e1d86e Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 5 Oct 2016 05:07:45 -0500 Subject: [PATCH 2/5] lint --- asv_bench/benchmarks/groupby.py | 1 + pandas/core/algorithms.py | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 6d9262e4edf47..7d8cc9f2955ce 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -547,6 +547,7 @@ def setup(self): def time_groupby_sum(self): self.df.groupby(['a'])['b'].sum() + class groupby_period(object): # GH 14338 goal_time = 0.2 diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4401a5828c6ae..de95ca958ce84 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -14,7 +14,6 @@ is_categorical_dtype, is_extension_type, is_datetimetz, - is_period, is_period_dtype, is_period_arraylike, is_float_dtype, @@ -288,7 +287,6 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): """ from pandas import Index, Series, DatetimeIndex, PeriodIndex - if is_datetimetz(values): values = DatetimeIndex(values) @@ -299,7 +297,6 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): else: vals = np.asarray(values) - is_datetime = is_datetime64_dtype(vals) is_timedelta = is_timedelta64_dtype(vals) (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables) From 6ba030cc5ef94cd12e147351db29c5557aead36b Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 5 Oct 2016 16:58:43 -0500 Subject: [PATCH 3/5] refactor; add asv --- asv_bench/benchmarks/groupby.py | 17 +++++++++++++++-- pandas/core/algorithms.py | 32 +++++++++++++++++++------------- 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 7d8cc9f2955ce..f72fdcf9c80a6 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -554,12 +554,25 @@ class groupby_period(object): def setup(self): N = 10000 - self.pi = pd.period_range('1900-01-01', freq='D', periods=N) + self.grouper = self.make_grouper() self.df = pd.DataFrame(np.random.randn(N, 2)) + def make_grouper(self): + return pd.period_range('1900-01-01', freq='D', periods=N) + def time_groupby_sum(self): - self.df.groupby(self.pi).sum() + self.df.groupby(self.grouper).sum() + + +class groupby_datetime(groupby_period): + def make_grouper(self): + return pd.date_range('1900-01-01', freq='D', periods=N) + +class groupby_datetimetz(groupby_period): + def make_grouper(self): + return pd.date_range('1900-01-01', freq='D', periods=N + tz='US/Central') #---------------------------------------------------------------------- # Series.value_counts diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index de95ca958ce84..8644d4568e44d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -287,18 +287,25 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): """ from pandas import Index, Series, DatetimeIndex, PeriodIndex - if is_datetimetz(values): - values = DatetimeIndex(values) - - if is_period_dtype(values): - values = PeriodIndex(values) - # period array interface goes to object so intercept - vals = values.view(np.int64) + # handling two possibilities here + # - for a numpy datetimelike simply view as i8 then cast back + # - for an extension datetimelike view as i8 then + # reconstruct from boxed values to transfer metadata + dtype = None + if needs_i8_conversion(values): + if is_period_dtype(values): + values = PeriodIndex(values) + vals = values.asi8 + elif is_datetimetz(values): + values = DatetimeIndex(values) + vals = values.asi8 + else: + # numpy dtype + dtype = values.dtype + vals = values.view(np.int64) else: vals = np.asarray(values) - is_datetime = is_datetime64_dtype(vals) - is_timedelta = is_timedelta64_dtype(vals) (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables) table = hash_klass(size_hint or len(vals)) @@ -313,10 +320,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel, assume_unique=True) - if is_datetime: - uniques = uniques.astype('M8[ns]') - elif is_timedelta: - uniques = uniques.astype('m8[ns]') + if dtype is not None: + uniques = uniques.astype(dtype) + if isinstance(values, Index): uniques = values._shallow_copy(uniques, name=None) elif isinstance(values, Series): From 9b8ab75cb30bc7ed38e6155114c5f61c87faa93f Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 5 Oct 2016 18:11:32 -0500 Subject: [PATCH 4/5] fixup asv --- asv_bench/benchmarks/groupby.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index f72fdcf9c80a6..5f3671012e6d5 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -552,26 +552,26 @@ class groupby_period(object): # GH 14338 goal_time = 0.2 + def make_grouper(self, N): + return pd.period_range('1900-01-01', freq='D', periods=N) + def setup(self): N = 10000 - self.grouper = self.make_grouper() + self.grouper = self.make_grouper(N) self.df = pd.DataFrame(np.random.randn(N, 2)) - def make_grouper(self): - return pd.period_range('1900-01-01', freq='D', periods=N) - def time_groupby_sum(self): self.df.groupby(self.grouper).sum() class groupby_datetime(groupby_period): - def make_grouper(self): + def make_grouper(self, N): return pd.date_range('1900-01-01', freq='D', periods=N) class groupby_datetimetz(groupby_period): - def make_grouper(self): - return pd.date_range('1900-01-01', freq='D', periods=N + def make_grouper(self, N): + return pd.date_range('1900-01-01', freq='D', periods=N, tz='US/Central') #---------------------------------------------------------------------- From a77ccc21df5316a630d138d4a5afe62e3e1491f6 Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 11 Oct 2016 19:14:35 -0500 Subject: [PATCH 5/5] whatsnew --- doc/source/whatsnew/v0.19.1.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 3edb8c1fa9071..8843a7849c200 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -20,7 +20,7 @@ Highlights include: Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - + - Fixed performance regression in factorization of ``Period`` data (:issue:`14338`)