From c2aae5e2fe796dcd9d5bccffd5417bd15f611c2f Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 15 Feb 2022 22:13:51 -0500 Subject: [PATCH 1/5] faster joins when left and/or right is empty --- pandas/core/indexes/base.py | 14 +++++++-- pandas/tests/reshape/merge/test_join.py | 41 +++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6afa6c7bd2bee..4a02b579e993f 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4544,12 +4544,22 @@ def join( if len(other) == 0 and how in ("left", "outer"): join_index = self._view() - rindexer = np.repeat(np.intp(-1), len(join_index)) + rindexer = np.broadcast_to(np.intp(-1), len(join_index)) return join_index, None, rindexer if len(self) == 0 and how in ("right", "outer"): join_index = other._view() - lindexer = np.repeat(np.intp(-1), len(join_index)) + lindexer = np.broadcast_to(np.intp(-1), len(join_index)) + return join_index, lindexer, None + + if len(self) == 0 and how in ("left", "inner", "cross"): + join_index = self._view() + rindexer = np.array([]) + return join_index, None, rindexer + + if len(other) == 0 and how in ("right", "inner", "cross"): + join_index = other._view() + lindexer = np.array([]) return join_index, lindexer, None if self._join_precedence < other._join_precedence: diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 7b932a3bb80c0..629c8d53ce9bd 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -881,3 +881,44 @@ def test_join_multiindex_not_alphabetical_categorical(categories, values): } ).set_index(["first", "second"]) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "left_empty, how, exp", + [ + (False, "left", "left"), + (False, "right", "empty"), + (False, "inner", "empty"), + (False, "outer", "left"), + (False, "cross", "empty"), + (True, "left", "empty"), + (True, "right", "right"), + (True, "inner", "empty"), + (True, "outer", "right"), + (True, "cross", "empty"), + ], +) +def test_join_empty(left_empty, how, exp): + + left = DataFrame({"A": [2, 1], "B": [3, 4]}, dtype="int64").set_index("A") + right = DataFrame({"A": [1], "C": [5]}, dtype="int64").set_index("A") + + if left_empty: + left = left.head(0) + else: + right = right.head(0) + + result = left.join(right, how=how) + + if exp == "left": + expected = DataFrame({"A": [2, 1], "B": [3, 4], "C": [np.nan, np.nan]}) + expected = expected.set_index("A") + elif exp == "right": + expected = DataFrame({"B": [np.nan], "A": [1], "C": [5]}) + expected = expected.set_index("A") + elif exp == "empty": + expected = DataFrame(index=Index([]), columns=["B", "C"], dtype="int64") + if how != "cross": + expected = expected.rename_axis("A") + + tm.assert_frame_equal(result, expected) From 9cde19900791b37500a1f9d69959e220a27f7410 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Tue, 15 Feb 2022 22:50:18 -0500 Subject: [PATCH 2/5] whatsnew --- doc/source/whatsnew/v1.5.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 71394a858aefe..58fe748491027 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -254,6 +254,7 @@ Performance improvements - Performance improvement in :meth:`.GroupBy.transform` when broadcasting values for user-defined functions (:issue:`45708`) - Performance improvement in :meth:`.GroupBy.transform` for user-defined functions when only a single group exists (:issue:`44977`) - Performance improvement in :func:`merge` when left and/or right are empty (:issue:`45838`) +- Performance improvement in :meth:`DataFrame.join` when left and/or right are empty (:issue:`46015`) - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`) - From dc5faf25caaf14af01d14f65b194c161e744ef53 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 16 Feb 2022 08:49:33 -0500 Subject: [PATCH 3/5] cleanup --- pandas/core/indexes/base.py | 38 ++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4a02b579e993f..5e38c3f3e6d00 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4542,25 +4542,25 @@ def join( if level is not None and (self._is_multi or other._is_multi): return self._join_level(other, level, how=how) - if len(other) == 0 and how in ("left", "outer"): - join_index = self._view() - rindexer = np.broadcast_to(np.intp(-1), len(join_index)) - return join_index, None, rindexer - - if len(self) == 0 and how in ("right", "outer"): - join_index = other._view() - lindexer = np.broadcast_to(np.intp(-1), len(join_index)) - return join_index, lindexer, None - - if len(self) == 0 and how in ("left", "inner", "cross"): - join_index = self._view() - rindexer = np.array([]) - return join_index, None, rindexer - - if len(other) == 0 and how in ("right", "inner", "cross"): - join_index = other._view() - lindexer = np.array([]) - return join_index, lindexer, None + if len(other) == 0: + if how in ("left", "outer"): + join_index = self._view() + rindexer = np.broadcast_to(np.intp(-1), len(join_index)) + return join_index, None, rindexer + elif how in ("right", "inner", "cross"): + join_index = other._view() + lindexer = np.array([]) + return join_index, lindexer, None + + if len(self) == 0: + if how in ("right", "outer"): + join_index = other._view() + lindexer = np.broadcast_to(np.intp(-1), len(join_index)) + return join_index, lindexer, None + elif how in ("left", "inner", "cross"): + join_index = self._view() + rindexer = np.array([]) + return join_index, None, rindexer if self._join_precedence < other._join_precedence: how = {"right": "left", "left": "right"}.get(how, how) From baf363f6daf4de589952aa87d07088aa620aa4da Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 16 Feb 2022 08:50:12 -0500 Subject: [PATCH 4/5] add asv for joining with empty frame --- asv_bench/benchmarks/join_merge.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 6e27d9355997f..dffe5ef8b4627 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -157,6 +157,12 @@ def setup(self): def time_left_outer_join_index(self): self.left.join(self.right, on="jim") + def time_join_left_empty(self): + self.left.iloc[:0].join(self.right, on="jim") + + def time_join_right_empty(self): + self.left.join(self.right.iloc[:0], on="jim") + class JoinNonUnique: # outer join of non-unique From 1f62198e15474e54deb39da886cdec8e4e785037 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Wed, 16 Feb 2022 09:13:44 -0500 Subject: [PATCH 5/5] asv --- asv_bench/benchmarks/join_merge.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index dffe5ef8b4627..e3c6bf9bd4e07 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -157,11 +157,18 @@ def setup(self): def time_left_outer_join_index(self): self.left.join(self.right, on="jim") - def time_join_left_empty(self): - self.left.iloc[:0].join(self.right, on="jim") - def time_join_right_empty(self): - self.left.join(self.right.iloc[:0], on="jim") +class JoinEmpty: + def setup(self): + N = 100_000 + self.df = DataFrame({"A": np.arange(N)}) + self.df_empty = DataFrame(columns=["B", "C"], dtype="int64") + + def time_inner_join_left_empty(self): + self.df_empty.join(self.df, how="inner") + + def time_inner_join_right_empty(self): + self.df.join(self.df_empty, how="inner") class JoinNonUnique: