From 8bc61db736e77480e3d4bfa9d6dda89aeadbd661 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Thu, 15 Oct 2020 23:26:25 +0100 Subject: [PATCH 1/7] #36757 fix for speed issue --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8340f964fb44b..b80e33a780113 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1191,7 +1191,7 @@ def reset_identity(values): # when the ax has duplicates # so we resort to this # GH 14776, 30667 - if ax.has_duplicates: + if ax.has_duplicates and not result.axes[self.axis].equals(ax): indexer, _ = result.index.get_indexer_non_unique(ax.values) indexer = algorithms.unique1d(indexer) result = result.take(indexer, axis=self.axis) From 1f8f5939fb142513e1edf133a87b2d4d9578f945 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Thu, 15 Oct 2020 23:40:36 +0100 Subject: [PATCH 2/7] whatsnew --- doc/source/whatsnew/v1.1.4.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst index e63912ebc8fee..7a70d205046b8 100644 --- a/doc/source/whatsnew/v1.1.4.rst +++ b/doc/source/whatsnew/v1.1.4.rst @@ -22,7 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Bug in :meth:`GroupBy.fillna` that caused a performance regression between 1.0.5 and 1.1.X (:issue:`36757`) .. --------------------------------------------------------------------------- From 03bb0be060bbecc44e49cdef5dc691b73f66a651 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Fri, 16 Oct 2020 21:50:16 +0100 Subject: [PATCH 3/7] addressing comments --- doc/source/whatsnew/v1.1.4.rst | 2 +- pandas/core/groupby/groupby.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst index ef93aefe44fd3..2fd083bbfd25b 100644 --- a/doc/source/whatsnew/v1.1.4.rst +++ b/doc/source/whatsnew/v1.1.4.rst @@ -28,7 +28,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug causing ``groupby(...).sum()`` and similar to not preserve metadata (:issue:`29442`) -- Bug in :meth:`GroupBy.fillna` that caused a performance regression between 1.0.5 and 1.1.X (:issue:`36757`) +- Bug in :meth:`GroupBy.fillna` that introduced a performance regression after 1.0.5 (:issue:`36757`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6d651b4954754..ab01f99ba11f9 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1201,7 +1201,7 @@ def reset_identity(values): indexer = algorithms.unique1d(indexer) result = result.take(indexer, axis=self.axis) else: - result = result.reindex(ax, axis=self.axis) + result = result.reindex(ax, axis=self.axis, copy=False) elif self.group_keys: From 19ddf0d6d29c6a7b6c2d0342ba61eaf31d33dff6 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sat, 17 Oct 2020 00:13:39 +0100 Subject: [PATCH 4/7] added asv FillNA --- asv_bench/benchmarks/groupby.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index bda3ab71d1a00..d9f33e70b5f7b 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -358,6 +358,21 @@ def time_category_size(self): self.draws.groupby(self.cats).size() +class FillNA: + def setup(self): + N = 1500 + self.df = pd.DataFrame({ + 'group' : [1] * N + [2] * N, + 'value' : [np.nan, 1.0] * N + }).set_index('group') + + def time_df_ffill(self): + self.df.groupby('group').fillna(method='ffill') + + def time_srs_ffill(self): + self.df.groupby('group')['value'].fillna(method='ffill') + + class GroupByMethods: param_names = ["dtype", "method", "application"] From 5412aaa0faa9fa0ea1ace0332fbd088d03dfbc9d Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sat, 17 Oct 2020 00:24:18 +0100 Subject: [PATCH 5/7] fixing FillNA --- asv_bench/benchmarks/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index d9f33e70b5f7b..121125db99777 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -361,7 +361,7 @@ def time_category_size(self): class FillNA: def setup(self): N = 1500 - self.df = pd.DataFrame({ + self.df = DataFrame({ 'group' : [1] * N + [2] * N, 'value' : [np.nan, 1.0] * N }).set_index('group') From bafedb520279444c2629136a2aa775971929bcbb Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sat, 17 Oct 2020 01:06:35 +0100 Subject: [PATCH 6/7] black --- asv_bench/benchmarks/groupby.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 121125db99777..0fcec3431de1e 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -361,16 +361,15 @@ def time_category_size(self): class FillNA: def setup(self): N = 1500 - self.df = DataFrame({ - 'group' : [1] * N + [2] * N, - 'value' : [np.nan, 1.0] * N - }).set_index('group') - + self.df = DataFrame( + {"group": [1] * N + [2] * N, "value": [np.nan, 1.0] * N} + ).set_index("group") + def time_df_ffill(self): - self.df.groupby('group').fillna(method='ffill') - + self.df.groupby("group").fillna(method="ffill") + def time_srs_ffill(self): - self.df.groupby('group')['value'].fillna(method='ffill') + self.df.groupby("group")["value"].fillna(method="ffill") class GroupByMethods: From 8a93e0b8b1e2f2777c0916f2d7a0ab176a9082d8 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sat, 17 Oct 2020 22:51:15 +0100 Subject: [PATCH 7/7] addressing comments --- asv_bench/benchmarks/groupby.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 0fcec3431de1e..22f002e6cb79a 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -360,7 +360,7 @@ def time_category_size(self): class FillNA: def setup(self): - N = 1500 + N = 100 self.df = DataFrame( {"group": [1] * N + [2] * N, "value": [np.nan, 1.0] * N} ).set_index("group") @@ -368,9 +368,15 @@ def setup(self): def time_df_ffill(self): self.df.groupby("group").fillna(method="ffill") + def time_df_bfill(self): + self.df.groupby("group").fillna(method="bfill") + def time_srs_ffill(self): self.df.groupby("group")["value"].fillna(method="ffill") + def time_srs_bfill(self): + self.df.groupby("group")["value"].fillna(method="bfill") + class GroupByMethods: