From ad17b44c73217005358b8535d7aa8204247edfc8 Mon Sep 17 00:00:00 2001 From: Tyler Riccio Date: Sat, 1 Mar 2025 18:21:31 -0500 Subject: [PATCH 01/21] Implemented for core libraries --- narwhals/_arrow/dataframe.py | 3 ++- narwhals/_pandas_like/dataframe.py | 5 ++++- narwhals/dataframe.py | 10 ++++++---- tests/frame/join_test.py | 29 +++++++++++++++++++++++++++-- 4 files changed, 39 insertions(+), 8 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index e0709a84ff..d02e8ec109 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -405,7 +405,7 @@ def join( self: Self, other: Self, *, - how: Literal["left", "inner", "cross", "anti", "semi"], + how: Literal["left", "inner", "full", "cross", "anti", "semi"], left_on: list[str] | None, right_on: list[str] | None, suffix: str, @@ -415,6 +415,7 @@ def join( "semi": "left semi", "inner": "inner", "left": "left outer", + "full": "full outer", } if how == "cross": diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index e958575461..0732a790e8 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -599,11 +599,14 @@ def join( self: Self, other: Self, *, - how: Literal["left", "inner", "cross", "anti", "semi"], + how: Literal["left", "inner", "full", "outer", "cross", "anti", "semi"], left_on: list[str] | None, right_on: list[str] | None, suffix: str, ) -> Self: + if how == "full": + how = "outer" # pandas calls full joins outer + if how == "cross": if ( self._implementation is Implementation.MODIN diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index d89a9dbedb..ec9b4fd657 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -235,7 +235,7 @@ def join( self: Self, other: Self, on: str | list[str] | None = None, - how: Literal["inner", "left", "cross", "semi", "anti"] = "inner", + how: Literal["inner", "left", "full", "cross", "semi", "anti"] = "inner", *, left_on: str | list[str] | None = None, right_on: str | list[str] | None = None, @@ -245,7 +245,9 @@ def join( left_on = [left_on] if isinstance(left_on, str) else left_on right_on = [right_on] if isinstance(right_on, str) else right_on - if how not in (_supported_joins := ("inner", "left", "cross", "anti", "semi")): + if how not in ( + _supported_joins := ("inner", "left", "full", "cross", "anti", "semi") + ): msg = f"Only the following join strategies are supported: {_supported_joins}; found '{how}'." raise NotImplementedError(msg) @@ -1623,7 +1625,7 @@ def join( self: Self, other: Self, on: str | list[str] | None = None, - how: Literal["inner", "left", "cross", "semi", "anti"] = "inner", + how: Literal["inner", "left", "full", "cross", "semi", "anti"] = "inner", *, left_on: str | list[str] | None = None, right_on: str | list[str] | None = None, @@ -2914,7 +2916,7 @@ def join( self: Self, other: Self, on: str | list[str] | None = None, - how: Literal["inner", "left", "cross", "semi", "anti"] = "inner", + how: Literal["inner", "left", "full", "cross", "semi", "anti"] = "inner", *, left_on: str | list[str] | None = None, right_on: str | list[str] | None = None, diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index 8fb2997eea..ce2e0b83df 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -17,6 +17,31 @@ from tests.utils import assert_equal_data +def test_full_join(constructor: Constructor) -> None: + df1 = {"id": [1, 2, 3], "value1": ["A", "B", "C"]} + df2 = {"id": [2, 3, 4], "value2": ["X", "Y", "Z"]} + expected_coalesce = { + "id": [1, 2, 3, 4], + "value1": ["A", "B", "C", None], + "value2": [None, "X", "Y", "Z"], + } + expected_no_coalesce = { + "id": [None, 1, 2, 3], + "value1": [None, "A", "B", "C"], + "id_right": [4, None, 2, 3], + "value2": ["Z", None, "X", "Y"], + } + + df_left = nw_main.from_native(constructor(df1)) + df_right = nw_main.from_native(constructor(df2)) + result = df_left.join(df_right, left_on="id", right_on="id", how="full") + result = result.sort("id") + try: + assert_equal_data(result, expected_coalesce) + except AssertionError: # coealesce-join not implemented + assert_equal_data(result, expected_no_coalesce) + + def test_inner_join_two_keys(constructor: Constructor) -> None: data = { "antananarivo": [1, 3, 2], @@ -215,7 +240,7 @@ def test_semi_join( assert_equal_data(result, expected) -@pytest.mark.parametrize("how", ["right", "full"]) +@pytest.mark.parametrize("how", ["right"]) def test_join_not_implemented(constructor: Constructor, how: str) -> None: data = {"antananarivo": [1, 3, 2], "bob": [4, 4, 6], "zor ro": [7.0, 8.0, 9.0]} df = nw.from_native(constructor(data)) @@ -223,7 +248,7 @@ def test_join_not_implemented(constructor: Constructor, how: str) -> None: with pytest.raises( NotImplementedError, match=re.escape( - f"Only the following join strategies are supported: ('inner', 'left', 'cross', 'anti', 'semi'); found '{how}'." + f"Only the following join strategies are supported: ('inner', 'left', 'full', 'cross', 'anti', 'semi'); found '{how}'." ), ): df.join(df, left_on="antananarivo", right_on="antananarivo", how=how) # type: ignore[arg-type] From 2154e9e67fce1aad107415ab7ed1a058adc6b571 Mon Sep 17 00:00:00 2001 From: Tyler Riccio Date: Sun, 2 Mar 2025 10:45:08 -0500 Subject: [PATCH 02/21] Implement non coalescing keys for arrow full joins --- narwhals/_arrow/dataframe.py | 1 + tests/frame/join_test.py | 10 +--------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index d02e8ec109..fe9c9ef2e0 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -447,6 +447,7 @@ def join( right_keys=right_on, join_type=how_to_join_map[how], right_suffix=suffix, + coalesce_keys=False, # in compliance w/polars API ), ) diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index ce2e0b83df..4594fbb32e 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -20,11 +20,6 @@ def test_full_join(constructor: Constructor) -> None: df1 = {"id": [1, 2, 3], "value1": ["A", "B", "C"]} df2 = {"id": [2, 3, 4], "value2": ["X", "Y", "Z"]} - expected_coalesce = { - "id": [1, 2, 3, 4], - "value1": ["A", "B", "C", None], - "value2": [None, "X", "Y", "Z"], - } expected_no_coalesce = { "id": [None, 1, 2, 3], "value1": [None, "A", "B", "C"], @@ -36,10 +31,7 @@ def test_full_join(constructor: Constructor) -> None: df_right = nw_main.from_native(constructor(df2)) result = df_left.join(df_right, left_on="id", right_on="id", how="full") result = result.sort("id") - try: - assert_equal_data(result, expected_coalesce) - except AssertionError: # coealesce-join not implemented - assert_equal_data(result, expected_no_coalesce) + assert_equal_data(result, expected_no_coalesce) def test_inner_join_two_keys(constructor: Constructor) -> None: From 0b874c74b1485130f04d37fd661f63f5e70c5b29 Mon Sep 17 00:00:00 2001 From: Tyler Riccio Date: Sun, 2 Mar 2025 17:40:36 -0500 Subject: [PATCH 03/21] Remove pandas support for full join --- narwhals/_pandas_like/dataframe.py | 5 +---- tests/frame/join_test.py | 5 ++++- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 0732a790e8..9bb522b33d 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -599,14 +599,11 @@ def join( self: Self, other: Self, *, - how: Literal["left", "inner", "full", "outer", "cross", "anti", "semi"], + how: Literal["left", "inner", "outer", "cross", "anti", "semi"], left_on: list[str] | None, right_on: list[str] | None, suffix: str, ) -> Self: - if how == "full": - how = "outer" # pandas calls full joins outer - if how == "cross": if ( self._implementation is Implementation.MODIN diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index 4594fbb32e..4430bcd3eb 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -17,7 +17,10 @@ from tests.utils import assert_equal_data -def test_full_join(constructor: Constructor) -> None: +def test_full_join(constructor: Constructor, request: pytest.FixtureRequest) -> None: + if "pandas" in str(constructor): + request.applymarker(pytest.mark.xfail) + df1 = {"id": [1, 2, 3], "value1": ["A", "B", "C"]} df2 = {"id": [2, 3, 4], "value2": ["X", "Y", "Z"]} expected_no_coalesce = { From 0013fddd2ccaf712668848b13abaed895cf5bea6 Mon Sep 17 00:00:00 2001 From: Tyler Riccio Date: Sun, 2 Mar 2025 17:43:36 -0500 Subject: [PATCH 04/21] Clarify arrow coalesce behavior as full join specific --- narwhals/_arrow/dataframe.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index fe9c9ef2e0..5089f693f7 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -440,6 +440,9 @@ def join( .drop([key_token]), ) + coalesce_keys = True + if how == "full": # polars full join does not coalesce keys + coalesce_keys = False return self._from_native_frame( self._native_frame.join( other._native_frame, @@ -447,7 +450,7 @@ def join( right_keys=right_on, join_type=how_to_join_map[how], right_suffix=suffix, - coalesce_keys=False, # in compliance w/polars API + coalesce_keys=coalesce_keys, ), ) From cad1d1eb95c8d1fa4dc9e1226c866d94d9a2222b Mon Sep 17 00:00:00 2001 From: Tyler Riccio Date: Sun, 2 Mar 2025 18:02:08 -0500 Subject: [PATCH 05/21] Add libraries to exclusion list --- tests/frame/join_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index 4430bcd3eb..b337ec4f67 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -18,7 +18,8 @@ def test_full_join(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "pandas" in str(constructor): + not_implemented = ("pandas", "dask", "duckdb", "modin", "sqlframe") + if any(lib for lib in not_implemented if lib in str(constructor)): request.applymarker(pytest.mark.xfail) df1 = {"id": [1, 2, 3], "value1": ["A", "B", "C"]} From dd31087579f6dec65b2a4bb2bd66ee6bf6051cf6 Mon Sep 17 00:00:00 2001 From: Tyler Riccio Date: Sun, 2 Mar 2025 18:06:09 -0500 Subject: [PATCH 06/21] Add full join documentation --- narwhals/dataframe.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index ec9b4fd657..4e1c3b778c 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -1641,6 +1641,7 @@ def join( * *inner*: Returns rows that have matching values in both tables. * *left*: Returns all rows from the left table, and the matched rows from the right table. + * *full*: Returns all rows in both dataframes, with the suffix appended to the right join keys. * *cross*: Returns the Cartesian product of rows from both tables. * *semi*: Filter rows that have a match in the right table. * *anti*: Filter rows that do not have a match in the right table. @@ -2932,6 +2933,7 @@ def join( * *inner*: Returns rows that have matching values in both tables. * *left*: Returns all rows from the left table, and the matched rows from the right table. + * *full*: Returns all rows in both dataframes, with the suffix appended to the right join keys. * *cross*: Returns the Cartesian product of rows from both tables. * *semi*: Filter rows that have a match in the right table. * *anti*: Filter rows that do not have a match in the right table. From 19a44ed8c53a1cab3f779444e7776eaa0e0b7648 Mon Sep 17 00:00:00 2001 From: Tyler Riccio Date: Tue, 4 Mar 2025 18:32:34 -0500 Subject: [PATCH 07/21] Add pandas full join --- narwhals/_pandas_like/dataframe.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 9bb522b33d..15b283a137 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -599,7 +599,7 @@ def join( self: Self, other: Self, *, - how: Literal["left", "inner", "outer", "cross", "anti", "semi"], + how: Literal["left", "inner", "full", "cross", "anti", "semi"], left_on: list[str] | None, right_on: list[str] | None, suffix: str, @@ -722,6 +722,18 @@ def join( extra.append(f"{right_key}{suffix}") return self._from_native_frame(result_native.drop(columns=extra)) + if how == "full": + ## pandas does not retain keys post-join + ## we must append the suffix to each key before-hand + if right_on is None: # pragma: no cover + msg = "`right_on` cannot be `None` in full-join" + raise TypeError(msg) + right_on_suffixed = [f"{k}{suffix}" for k in right_on] + right_on_mapper: dict[str, str] = dict(zip(right_on, right_on_suffixed)) + other._native_frame = other._native_frame.rename(columns=right_on_mapper) + right_on = right_on_suffixed # we now have the suffixed keys + how = "outer" # type: ignore[assignment] + return self._from_native_frame( self._native_frame.merge( other._native_frame, From a0df78db6827fd5cc004461e4c470d69c1fa50de Mon Sep 17 00:00:00 2001 From: Tyler Riccio Date: Tue, 4 Mar 2025 18:44:52 -0500 Subject: [PATCH 08/21] Implement duckdb full join --- narwhals/_duckdb/dataframe.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index e1fa303bd9..89676e1da0 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -256,13 +256,15 @@ def join( self: Self, other: Self, *, - how: Literal["left", "inner", "cross", "anti", "semi"], + how: Literal["left", "inner", "full", "cross", "anti", "semi"], left_on: list[str] | None, right_on: list[str] | None, suffix: str, ) -> Self: original_alias = self._native_frame.alias + native_how = "outer" if how == "full" else how + if how == "cross": if self._backend_version < (1, 1, 4): msg = f"DuckDB>=1.1.4 is required for cross-join, found version: {self._backend_version}" @@ -278,16 +280,19 @@ def join( conditions = [ f'lhs."{left}" = rhs."{right}"' for left, right in zip(left_on, right_on) ] + if how == "full": # swap native how + native_how = "outer" + condition = " and ".join(conditions) rel = self._native_frame.set_alias("lhs").join( - other._native_frame.set_alias("rhs"), condition=condition, how=how + other._native_frame.set_alias("rhs"), condition=condition, how=native_how ) - if how in {"inner", "left", "cross"}: + if how in {"inner", "left", "cross", "outer"}: select = [f'lhs."{x}"' for x in self._native_frame.columns] for col in other._native_frame.columns: if col in self._native_frame.columns and ( - right_on is None or col not in right_on + right_on is None or col not in right_on or (native_how == "outer") ): select.append(f'rhs."{col}" as "{col}{suffix}"') elif right_on is None or col not in right_on: From 117a3f36f191d5b05092625aebce0084565464db Mon Sep 17 00:00:00 2001 From: Tyler Riccio Date: Tue, 4 Mar 2025 18:58:07 -0500 Subject: [PATCH 09/21] Fix duckdb not working for lazyframes --- narwhals/_duckdb/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index 89676e1da0..aa5393bd31 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -288,7 +288,7 @@ def join( other._native_frame.set_alias("rhs"), condition=condition, how=native_how ) - if how in {"inner", "left", "cross", "outer"}: + if native_how in {"inner", "left", "cross", "outer"}: select = [f'lhs."{x}"' for x in self._native_frame.columns] for col in other._native_frame.columns: if col in self._native_frame.columns and ( From dce56e36a80f53dec47fbfac7d84b2ad4bad65f3 Mon Sep 17 00:00:00 2001 From: Tyler Riccio Date: Tue, 4 Mar 2025 19:02:46 -0500 Subject: [PATCH 10/21] Add full join to dask --- narwhals/_dask/dataframe.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index e8dde409a3..0591a16a9b 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -266,7 +266,7 @@ def join( self: Self, other: Self, *, - how: Literal["left", "inner", "cross", "anti", "semi"], + how: Literal["left", "inner", "full", "cross", "anti", "semi"], left_on: list[str] | None, right_on: list[str] | None, suffix: str, @@ -361,6 +361,18 @@ def join( extra.append(f"{right_key}_right") return self._from_native_frame(result_native.drop(columns=extra)) + if how == "full": + ## dask does not retain keys post-join + ## we must append the suffix to each key before-hand + if right_on is None: # pragma: no cover + msg = "`right_on` cannot be `None` in full-join" + raise TypeError(msg) + right_on_suffixed = [f"{k}{suffix}" for k in right_on] + right_on_mapper: dict[str, str] = dict(zip(right_on, right_on_suffixed)) + other._native_frame = other._native_frame.rename(columns=right_on_mapper) + right_on = right_on_suffixed # we now have the suffixed keys + how = "outer" # type: ignore[assignment] + return self._from_native_frame( self._native_frame.merge( other._native_frame, From 122fa6a4b8ce92974f86d2e1c97a2466141ccf4a Mon Sep 17 00:00:00 2001 From: Tyler Riccio Date: Tue, 4 Mar 2025 21:43:16 -0500 Subject: [PATCH 11/21] Fix pandas no collision suffix --- narwhals/_pandas_like/dataframe.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 15b283a137..5a5f7cfc8b 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -723,15 +723,23 @@ def join( return self._from_native_frame(result_native.drop(columns=extra)) if how == "full": - ## pandas does not retain keys post-join - ## we must append the suffix to each key before-hand - if right_on is None: # pragma: no cover - msg = "`right_on` cannot be `None` in full-join" - raise TypeError(msg) - right_on_suffixed = [f"{k}{suffix}" for k in right_on] - right_on_mapper: dict[str, str] = dict(zip(right_on, right_on_suffixed)) + ## Pandas coalesces keys in full joins unless there's no collision + + # help mypy + assert right_on is not None # noqa: S101 + assert left_on is not None # noqa: S101 + + right_keys_suffixed: list[str] = [] + for key in right_on: + if key in left_on: + suffixed = f"{key}{suffix}" + right_keys_suffixed.append(suffixed) + continue + right_keys_suffixed.append(key) + + right_on_mapper: dict[str, str] = dict(zip(right_on, right_keys_suffixed)) other._native_frame = other._native_frame.rename(columns=right_on_mapper) - right_on = right_on_suffixed # we now have the suffixed keys + right_on = right_keys_suffixed # we now have the suffixed keys how = "outer" # type: ignore[assignment] return self._from_native_frame( From f1cbe302aef83c58124205fc8b384458e0ea0903 Mon Sep 17 00:00:00 2001 From: Tyler Riccio Date: Tue, 4 Mar 2025 21:44:03 -0500 Subject: [PATCH 12/21] Refine tests to cover more cases --- tests/frame/join_test.py | 99 ++++++++++++++++++++++++++++++++++------ 1 file changed, 85 insertions(+), 14 deletions(-) diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index b337ec4f67..f6859350b3 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -17,25 +17,96 @@ from tests.utils import assert_equal_data -def test_full_join(constructor: Constructor, request: pytest.FixtureRequest) -> None: - not_implemented = ("pandas", "dask", "duckdb", "modin", "sqlframe") +@pytest.mark.parametrize( + ("df1", "df2", "expected", "on", "left_on", "right_on"), + [ + ( + {"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"], "age": [25, 30, 35]}, + { + "id": [2, 3, 4], + "department": ["HR", "Engineering", "Marketing"], + "salary": [50000, 60000, 70000], + }, + { + "id": [1, 2, 3, None], + "name": ["Alice", "Bob", "Charlie", None], + "age": [25, 30, 35, None], + "id_right": [None, 2, 3, 4], + "department": [None, "HR", "Engineering", "Marketing"], + "salary": [None, 50000, 60000, 70000], + }, + None, + ["id"], + ["id"], + ), + ( + {"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"], "age": [25, 30, 35]}, + { + "id": [2, 3, 4], + "department": ["HR", "Engineering", "Marketing"], + "salary": [50000, 60000, 70000], + }, + { + "id": [1, 2, 3, None], + "name": ["Alice", "Bob", "Charlie", None], + "age": [25, 30, 35, None], + "id_right": [None, 2, 3, 4], + "department": [None, "HR", "Engineering", "Marketing"], + "salary": [None, 50000, 60000, 70000], + }, + "id", + None, + None, + ), + ( + { + "id": [1, 2, 3, 4], + "year": [2020, 2021, 2022, 2023], + "value1": [100, 200, 300, 400], + }, + { + "id": [2, 3, 4, 5], + "year_foo": [2021, 2022, 2023, 2024], + "value2": [500, 600, 700, 800], + }, + { + "id": [1, 2, 3, 4, None], + "year": [2020, 2021, 2022, 2023, None], + "value1": [100, 200, 300, 400, None], + "id_right": [None, 2, 3, 4, 5], + # since year is different, don't apply suffix + "year_foo": [None, 2021, 2022, 2023, 2024], + "value2": [None, 500, 600, 700, 800], + }, + None, + ["id", "year"], + ["id", "year_foo"], + ), + ], +) +def test_full_join( + df1: dict[str, list[Any]], + df2: dict[str, list[Any]], + expected: dict[str, list[Any]], + on: None | str | list[str], + left_on: None | str | list[str], + right_on: None | str | list[str], + constructor: Constructor, + request: pytest.FixtureRequest, +) -> None: + not_implemented = ("modin[pyarrow]", "sqlframe") if any(lib for lib in not_implemented if lib in str(constructor)): request.applymarker(pytest.mark.xfail) - df1 = {"id": [1, 2, 3], "value1": ["A", "B", "C"]} - df2 = {"id": [2, 3, 4], "value2": ["X", "Y", "Z"]} - expected_no_coalesce = { - "id": [None, 1, 2, 3], - "value1": [None, "A", "B", "C"], - "id_right": [4, None, 2, 3], - "value2": ["Z", None, "X", "Y"], - } - df_left = nw_main.from_native(constructor(df1)) df_right = nw_main.from_native(constructor(df2)) - result = df_left.join(df_right, left_on="id", right_on="id", how="full") - result = result.sort("id") - assert_equal_data(result, expected_no_coalesce) + result = df_left.join( + df_right, on=on, left_on=left_on, right_on=right_on, how="full" + ).sort("id", nulls_last=True) + assert_equal_data(result, expected) + + +pytest.main([__file__]) def test_inner_join_two_keys(constructor: Constructor) -> None: From 5c4b97d27bff38fcda1259a7362e3096bd81f2b7 Mon Sep 17 00:00:00 2001 From: Tyler Riccio Date: Sat, 8 Mar 2025 10:50:04 -0500 Subject: [PATCH 13/21] Abstract pandas and dask full join logic --- narwhals/_dask/dataframe.py | 13 +++++++------ narwhals/_pandas_like/dataframe.py | 17 +++++------------ narwhals/utils.py | 27 +++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 18 deletions(-) diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 0591a16a9b..a6c906a95c 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -16,7 +16,9 @@ from narwhals.typing import CompliantDataFrame from narwhals.typing import CompliantLazyFrame from narwhals.utils import Implementation +from narwhals.utils import _remap_join_keys from narwhals.utils import check_column_exists +from narwhals.utils import check_column_names_are_unique from narwhals.utils import generate_temporary_column_name from narwhals.utils import parse_columns_to_drop from narwhals.utils import parse_version @@ -364,13 +366,12 @@ def join( if how == "full": ## dask does not retain keys post-join ## we must append the suffix to each key before-hand - if right_on is None: # pragma: no cover - msg = "`right_on` cannot be `None` in full-join" - raise TypeError(msg) - right_on_suffixed = [f"{k}{suffix}" for k in right_on] - right_on_mapper: dict[str, str] = dict(zip(right_on, right_on_suffixed)) + assert left_on is not None # noqa: S101 + assert right_on is not None # noqa: S101 + right_on_mapper = _remap_join_keys(left_on, right_on, suffix) other._native_frame = other._native_frame.rename(columns=right_on_mapper) - right_on = right_on_suffixed # we now have the suffixed keys + check_column_names_are_unique(other._native_frame.columns) + right_on = list(right_on_mapper.values()) # we now have the suffixed keys how = "outer" # type: ignore[assignment] return self._from_native_frame( diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 5a5f7cfc8b..cb2fca5883 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -27,6 +27,7 @@ from narwhals.dependencies import is_numpy_array_1d from narwhals.exceptions import InvalidOperationError from narwhals.utils import Implementation +from narwhals.utils import _remap_join_keys from narwhals.utils import check_column_exists from narwhals.utils import generate_temporary_column_name from narwhals.utils import import_dtypes_module @@ -724,22 +725,14 @@ def join( if how == "full": ## Pandas coalesces keys in full joins unless there's no collision - - # help mypy - assert right_on is not None # noqa: S101 assert left_on is not None # noqa: S101 + assert right_on is not None # noqa: S101 - right_keys_suffixed: list[str] = [] - for key in right_on: - if key in left_on: - suffixed = f"{key}{suffix}" - right_keys_suffixed.append(suffixed) - continue - right_keys_suffixed.append(key) + right_on_mapper = _remap_join_keys(left_on, right_on, suffix) - right_on_mapper: dict[str, str] = dict(zip(right_on, right_keys_suffixed)) other._native_frame = other._native_frame.rename(columns=right_on_mapper) - right_on = right_keys_suffixed # we now have the suffixed keys + check_column_names_are_unique(other._native_frame.columns) + right_on = list(right_on_mapper.values()) # we now have the suffixed keys how = "outer" # type: ignore[assignment] return self._from_native_frame( diff --git a/narwhals/utils.py b/narwhals/utils.py index f5959bb550..d27edb1801 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -1392,3 +1392,30 @@ def _supports_dataframe_interchange(obj: Any) -> TypeIs[DataFrameLike]: def is_tracks_depth(obj: Implementation, /) -> TypeIs[_TracksDepth]: # pragma: no cover # Return `True` for implementations that utilize `CompliantExpr._depth`. return obj.is_pandas_like() or obj in {Implementation.PYARROW, Implementation.DASK} + + +def _remap_join_keys( + left_on: list[str], right_on: list[str], suffix: str +) -> dict[str, str]: + """Remap join keys to avoid collisions. + + If left keys collide with the right keys, append the suffix. + If there's no collision, let the right keys be. + + Args: + left_on (list[str]): Left keys. + right_on (list[str]): Right keys. + suffix (str): Suffix to append to right keys. + + Returns: + dict[str, str]: A map of old to new right keys.. + """ + right_keys_suffixed: list[str] = [] + for key in right_on: + if key in left_on: + suffixed = f"{key}{suffix}" + right_keys_suffixed.append(suffixed) + continue + right_keys_suffixed.append(key) + + return dict(zip(right_on, right_keys_suffixed)) From 5561daa9e3c721452b098277558ce21e607ef465 Mon Sep 17 00:00:00 2001 From: Tyler Riccio Date: Sat, 8 Mar 2025 10:51:49 -0500 Subject: [PATCH 14/21] Ensure duckdb join collision consistency --- narwhals/_duckdb/dataframe.py | 7 +++++-- tests/frame/join_test.py | 16 +++++++++++++++- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index aa5393bd31..9a44dfc2e6 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -291,8 +291,11 @@ def join( if native_how in {"inner", "left", "cross", "outer"}: select = [f'lhs."{x}"' for x in self._native_frame.columns] for col in other._native_frame.columns: - if col in self._native_frame.columns and ( - right_on is None or col not in right_on or (native_how == "outer") + col_in_lhs: bool = col in self._native_frame.columns + if native_how == "outer" and not col_in_lhs: + select.append(f'rhs."{col}"') + elif (native_how == "outer") or ( + col_in_lhs and (right_on is None or col not in right_on) ): select.append(f'rhs."{col}" as "{col}{suffix}"') elif right_on is None or col not in right_on: diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index f6859350b3..a7228e1d4c 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -10,6 +10,8 @@ import narwhals as nw_main # use nw_main in some tests for coverage import narwhals.stable.v1 as nw +from narwhals import LazyFrame +from narwhals.exceptions import DuplicateError from narwhals.utils import Implementation from tests.utils import DUCKDB_VERSION from tests.utils import PANDAS_VERSION @@ -105,8 +107,20 @@ def test_full_join( ).sort("id", nulls_last=True) assert_equal_data(result, expected) + # test duplication + if isinstance(df_left, LazyFrame): + return # cannot materialize columns for a check + df1 = {"foo": [1, 2, 3], "val1": [1, 2, 3]} + df2 = {"foo": [1, 2, 3], "foo_right": [1, 2, 3]} + df_left = nw_main.from_native(constructor(df1)) + df_right = nw_main.from_native(constructor(df2)) + + # polars throws `NarwhalsError`, everything else should raise `DuplicateError` + with pytest.raises((DuplicateError, nw.exceptions.NarwhalsError)): + df_left.join(df_right, on="foo", how="full") + -pytest.main([__file__]) +pytest.main([__file__, "--all-cpu-constructors"]) def test_inner_join_two_keys(constructor: Constructor) -> None: From f749accc9322a80d9cfff84f1aa185b2ed689ab5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 8 Mar 2025 15:57:34 +0000 Subject: [PATCH 15/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- narwhals/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/utils.py b/narwhals/utils.py index 0d6b458780..0fdf0c727e 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -1479,7 +1479,6 @@ def is_tracks_depth(obj: Implementation, /) -> TypeIs[_TracksDepth]: # pragma: return obj.is_pandas_like() or obj in {Implementation.PYARROW, Implementation.DASK} - def _remap_join_keys( left_on: list[str], right_on: list[str], suffix: str ) -> dict[str, str]: @@ -1506,6 +1505,7 @@ def _remap_join_keys( return dict(zip(right_on, right_keys_suffixed)) + # TODO @dangotbanned: Extend with runtime behavior for `v1.*` # See `narwhals.exceptions.NarwhalsUnstableWarning` def unstable(fn: _Fn, /) -> _Fn: From 9d0f578a4b3c6e1e5a781c19c69908aaefa63687 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Tue, 18 Mar 2025 22:11:49 +0100 Subject: [PATCH 16/21] support spark like --- narwhals/_dask/dataframe.py | 4 ++-- narwhals/_pandas_like/dataframe.py | 4 ++-- narwhals/_spark_like/dataframe.py | 34 +++++++++++++++++++++++++----- narwhals/utils.py | 2 +- tests/frame/join_test.py | 2 +- 5 files changed, 35 insertions(+), 11 deletions(-) diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 359d7cab59..a28cc58f46 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -16,7 +16,7 @@ from narwhals.typing import CompliantDataFrame from narwhals.typing import CompliantLazyFrame from narwhals.utils import Implementation -from narwhals.utils import _remap_join_keys +from narwhals.utils import _remap_full_join_keys from narwhals.utils import check_column_exists from narwhals.utils import check_column_names_are_unique from narwhals.utils import generate_temporary_column_name @@ -358,7 +358,7 @@ def join( assert left_on is not None # noqa: S101 assert right_on is not None # noqa: S101 - right_on_mapper = _remap_join_keys(left_on, right_on, suffix) + right_on_mapper = _remap_full_join_keys(left_on, right_on, suffix) other_native = other._native_frame other_native = other_native.rename(columns=right_on_mapper) diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 65adf55d7f..bdaa25b550 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -29,7 +29,7 @@ from narwhals.typing import CompliantDataFrame from narwhals.typing import CompliantLazyFrame from narwhals.utils import Implementation -from narwhals.utils import _remap_join_keys +from narwhals.utils import _remap_full_join_keys from narwhals.utils import check_column_exists from narwhals.utils import generate_temporary_column_name from narwhals.utils import import_dtypes_module @@ -723,7 +723,7 @@ def join( assert left_on is not None # noqa: S101 assert right_on is not None # noqa: S101 - right_on_mapper = _remap_join_keys(left_on, right_on, suffix) + right_on_mapper = _remap_full_join_keys(left_on, right_on, suffix) other_native = other._native_frame other_native = other_native.rename(columns=right_on_mapper) diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index fa44d440c4..d07f28f10c 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -1,6 +1,8 @@ from __future__ import annotations import warnings +from functools import reduce +from operator import and_ from typing import TYPE_CHECKING from typing import Any from typing import Iterator @@ -326,7 +328,7 @@ def unique( def join( self: Self, other: Self, - how: Literal["inner", "left", "cross", "semi", "anti"], + how: Literal["inner", "left", "cross", "semi", "anti", "full"], left_on: list[str] | None, right_on: list[str] | None, suffix: str, @@ -337,14 +339,23 @@ def join( left_columns = self.columns right_columns = other.columns + right_on_: list[str] = right_on or [] + left_on_: list[str] = left_on or [] + # create a mapping for columns on other # `right_on` columns will be renamed as `left_on` # the remaining columns will be either added the suffix or left unchanged. + right_cols_to_rename = ( + [c for c in right_columns if c not in right_on_] + if how != "full" + else right_columns + ) + rename_mapping = { - **dict(zip(right_on or [], left_on or [])), + **dict(zip(right_on_, left_on_)), **{ colname: f"{colname}{suffix}" if colname in left_columns else colname - for colname in list(set(right_columns).difference(set(right_on or []))) + for colname in right_cols_to_rename }, } other_native = other_native.select( @@ -361,11 +372,24 @@ def join( [ rename_mapping[colname] for colname in right_columns - if colname not in (right_on or []) + if colname not in right_on_ ] ) + elif how == "full": + col_order.extend(rename_mapping.values()) + + right_on_remapped = [rename_mapping.get(c, c) for c in right_on_] + on_ = reduce( + and_, + ( + getattr(self_native, left_key) == getattr(other_native, right_key) + for left_key, right_key in zip(left_on_, right_on_remapped) + ), + ) + how_native = "full outer" if how == "full" else how + return self._from_native_frame( - self_native.join(other_native, on=left_on, how=how).select(col_order) + self_native.join(other_native, on=on_, how=how_native).select(col_order) ) def explode(self: Self, columns: list[str]) -> Self: diff --git a/narwhals/utils.py b/narwhals/utils.py index a38e22418a..954cca3413 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -1504,7 +1504,7 @@ def is_tracks_depth(obj: Implementation, /) -> TypeIs[_TracksDepth]: # pragma: return obj.is_pandas_like() or obj in {Implementation.PYARROW, Implementation.DASK} -def _remap_join_keys( +def _remap_full_join_keys( left_on: Sequence[str], right_on: Sequence[str], suffix: str ) -> dict[str, str]: """Remap join keys to avoid collisions. diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index 9f445f8eda..57fd2663b7 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -96,7 +96,7 @@ def test_full_join( constructor: Constructor, request: pytest.FixtureRequest, ) -> None: - not_implemented = ("modin[pyarrow]", "sqlframe") + not_implemented = ("modin[pyarrow]",) if any(lib for lib in not_implemented if lib in str(constructor)): request.applymarker(pytest.mark.xfail) From 09c0ca711d720e925867f3d34e50f5ef5852d2be Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Tue, 18 Mar 2025 22:30:14 +0100 Subject: [PATCH 17/21] old polars (?) --- narwhals/_polars/dataframe.py | 47 ++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/narwhals/_polars/dataframe.py b/narwhals/_polars/dataframe.py index 439528bfac..6f4751a08a 100644 --- a/narwhals/_polars/dataframe.py +++ b/narwhals/_polars/dataframe.py @@ -61,7 +61,6 @@ class PolarsDataFrame: item: Method[Any] iter_rows: Method[Iterator[tuple[Any, ...]] | Iterator[Mapping[str, Any]]] is_unique: Method[PolarsSeries] - join: Method[Self] join_asof: Method[Self] rename: Method[Self] row: Method[tuple[Any, ...]] @@ -420,6 +419,29 @@ def pivot( def to_polars(self: Self) -> pl.DataFrame: return self._native_frame + def join( + self: Self, + other: Self, + *, + how: Literal["left", "inner", "full", "cross", "anti", "semi"], + left_on: list[str] | None, + right_on: list[str] | None, + suffix: str, + ) -> Self: + how_native = ( + "outer" if (self._backend_version < (1, 0, 0) and how == "full") else how + ) + + return self._from_native_frame( + self._native_frame.join( + other=other._native_frame, + how=how_native, + left_on=left_on, + right_on=right_on, + suffix=suffix, + ) + ) + class PolarsLazyFrame: def __init__( @@ -599,3 +621,26 @@ def simple_select(self, *column_names: str) -> Self: def aggregate(self: Self, *exprs: Any) -> Self: return self.select(*exprs) # type: ignore[no-any-return] + + def join( + self: Self, + other: Self, + *, + how: Literal["left", "inner", "full", "cross", "anti", "semi"], + left_on: list[str] | None, + right_on: list[str] | None, + suffix: str, + ) -> Self: + how_native = ( + "outer" if (self._backend_version < (1, 0, 0) and how == "full") else how + ) + + return self._from_native_frame( + self._native_frame.join( + other=other._native_frame, + how=how_native, + left_on=left_on, + right_on=right_on, + suffix=suffix, + ) + ) From ab541aa791d809fd1e96173720dd4a479a07d742 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Tue, 18 Mar 2025 22:30:38 +0100 Subject: [PATCH 18/21] split test --- tests/frame/join_test.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index 57fd2663b7..b4c2327d30 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -10,7 +10,6 @@ import narwhals as nw_main # use nw_main in some tests for coverage import narwhals.stable.v1 as nw -from narwhals import LazyFrame from narwhals.exceptions import DuplicateError from narwhals.utils import Implementation from tests.utils import DUCKDB_VERSION @@ -94,12 +93,7 @@ def test_full_join( left_on: None | str | list[str], right_on: None | str | list[str], constructor: Constructor, - request: pytest.FixtureRequest, ) -> None: - not_implemented = ("modin[pyarrow]",) - if any(lib for lib in not_implemented if lib in str(constructor)): - request.applymarker(pytest.mark.xfail) - df_left = nw_main.from_native(constructor(df1)) df_right = nw_main.from_native(constructor(df2)) result = df_left.join( @@ -107,17 +101,16 @@ def test_full_join( ).sort("id", nulls_last=True) assert_equal_data(result, expected) - # test duplication - if isinstance(df_left, LazyFrame): - return # cannot materialize columns for a check + +def test_full_join_duplicate(constructor: Constructor) -> None: df1 = {"foo": [1, 2, 3], "val1": [1, 2, 3]} df2 = {"foo": [1, 2, 3], "foo_right": [1, 2, 3]} - df_left = nw_main.from_native(constructor(df1)) - df_right = nw_main.from_native(constructor(df2)) + df_left = nw_main.from_native(constructor(df1)).lazy() + df_right = nw_main.from_native(constructor(df2)).lazy() # polars throws `NarwhalsError`, everything else should raise `DuplicateError` with pytest.raises((DuplicateError, nw.exceptions.NarwhalsError)): - df_left.join(df_right, on="foo", how="full") + df_left.join(df_right, on="foo", how="full").collect() def test_inner_join_two_keys(constructor: Constructor) -> None: From 14a5f0cfb9ddd65c0354d687d001b52e15cbeaa0 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Tue, 18 Mar 2025 22:40:20 +0100 Subject: [PATCH 19/21] align signature, fix spark like cross join on_ value --- narwhals/_arrow/dataframe.py | 2 +- narwhals/_compliant/dataframe.py | 2 +- narwhals/_dask/dataframe.py | 2 +- narwhals/_duckdb/dataframe.py | 2 +- narwhals/_pandas_like/dataframe.py | 2 +- narwhals/_polars/dataframe.py | 4 ++-- narwhals/_spark_like/dataframe.py | 18 +++++++++++------- 7 files changed, 18 insertions(+), 14 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 85abdac170..c5397aede7 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -404,7 +404,7 @@ def join( self: Self, other: Self, *, - how: Literal["left", "inner", "full", "cross", "anti", "semi"], + how: Literal["inner", "left", "full", "cross", "semi", "anti"], left_on: Sequence[str] | None, right_on: Sequence[str] | None, suffix: str, diff --git a/narwhals/_compliant/dataframe.py b/narwhals/_compliant/dataframe.py index b64c138834..423077ffd2 100644 --- a/narwhals/_compliant/dataframe.py +++ b/narwhals/_compliant/dataframe.py @@ -85,7 +85,7 @@ def join( self: Self, other: Self, *, - how: Literal["left", "inner", "cross", "anti", "semi"], + how: Literal["inner", "left", "full", "cross", "semi", "anti"], left_on: Sequence[str] | None, right_on: Sequence[str] | None, suffix: str, diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index a28cc58f46..987b84786f 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -255,7 +255,7 @@ def join( self: Self, other: Self, *, - how: Literal["left", "inner", "full", "cross", "anti", "semi"], + how: Literal["inner", "left", "full", "cross", "semi", "anti"], left_on: list[str] | None, right_on: list[str] | None, suffix: str, diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index ebb04fe632..8b9f3e2c8f 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -251,7 +251,7 @@ def join( self: Self, other: Self, *, - how: Literal["left", "inner", "full", "cross", "anti", "semi"], + how: Literal["inner", "left", "full", "cross", "semi", "anti"], left_on: list[str] | None, right_on: list[str] | None, suffix: str, diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index bdaa25b550..66f393713a 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -593,7 +593,7 @@ def join( self: Self, other: Self, *, - how: Literal["left", "inner", "full", "cross", "anti", "semi"], + how: Literal["inner", "left", "full", "cross", "semi", "anti"], left_on: Sequence[str] | None, right_on: Sequence[str] | None, suffix: str, diff --git a/narwhals/_polars/dataframe.py b/narwhals/_polars/dataframe.py index 6f4751a08a..6649eb30dc 100644 --- a/narwhals/_polars/dataframe.py +++ b/narwhals/_polars/dataframe.py @@ -423,7 +423,7 @@ def join( self: Self, other: Self, *, - how: Literal["left", "inner", "full", "cross", "anti", "semi"], + how: Literal["inner", "left", "full", "cross", "semi", "anti"], left_on: list[str] | None, right_on: list[str] | None, suffix: str, @@ -626,7 +626,7 @@ def join( self: Self, other: Self, *, - how: Literal["left", "inner", "full", "cross", "anti", "semi"], + how: Literal["inner", "left", "full", "cross", "semi", "anti"], left_on: list[str] | None, right_on: list[str] | None, suffix: str, diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index d07f28f10c..7a5b573fdf 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -328,7 +328,7 @@ def unique( def join( self: Self, other: Self, - how: Literal["inner", "left", "cross", "semi", "anti", "full"], + how: Literal["inner", "left", "full", "cross", "semi", "anti"], left_on: list[str] | None, right_on: list[str] | None, suffix: str, @@ -379,12 +379,16 @@ def join( col_order.extend(rename_mapping.values()) right_on_remapped = [rename_mapping.get(c, c) for c in right_on_] - on_ = reduce( - and_, - ( - getattr(self_native, left_key) == getattr(other_native, right_key) - for left_key, right_key in zip(left_on_, right_on_remapped) - ), + on_ = ( + reduce( + and_, + ( + getattr(self_native, left_key) == getattr(other_native, right_key) + for left_key, right_key in zip(left_on_, right_on_remapped) + ), + ) + if left_on_ + else None ) how_native = "full outer" if how == "full" else how From f434b9369b78d465e92922b46936b16a8459f839 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Tue, 18 Mar 2025 22:58:52 +0100 Subject: [PATCH 20/21] fix spark like --- narwhals/_spark_like/dataframe.py | 8 +++++--- tests/frame/join_test.py | 9 ++++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index 7a5b573fdf..0dac9b03e3 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -378,7 +378,7 @@ def join( elif how == "full": col_order.extend(rename_mapping.values()) - right_on_remapped = [rename_mapping.get(c, c) for c in right_on_] + right_on_remapped = [rename_mapping[c] for c in right_on_] on_ = ( reduce( and_, @@ -387,10 +387,12 @@ def join( for left_key, right_key in zip(left_on_, right_on_remapped) ), ) - if left_on_ + if how == "full" else None + if how == "cross" + else left_on_ ) - how_native = "full outer" if how == "full" else how + how_native = "full_outer" if how == "full" else how return self._from_native_frame( self_native.join(other_native, on=on_, how=how_native).select(col_order) diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index b4c2327d30..70f288201f 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -109,7 +109,14 @@ def test_full_join_duplicate(constructor: Constructor) -> None: df_right = nw_main.from_native(constructor(df2)).lazy() # polars throws `NarwhalsError`, everything else should raise `DuplicateError` - with pytest.raises((DuplicateError, nw.exceptions.NarwhalsError)): + exceptions = [DuplicateError, nw.exceptions.NarwhalsError] + + if "pyspark" in str(constructor) and "sqlframe" not in str(constructor): + from pyspark.errors import AnalysisException + + exceptions.append(AnalysisException) + + with pytest.raises(tuple(exceptions)): df_left.join(df_right, on="foo", how="full").collect() From d03ac8eddad01ca7b6ea23f3d3f9afee8b8501b7 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Tue, 18 Mar 2025 23:13:48 +0100 Subject: [PATCH 21/21] typing --- narwhals/_dask/dataframe.py | 8 ++++---- narwhals/_duckdb/dataframe.py | 4 ++-- narwhals/_polars/dataframe.py | 12 ++++++------ narwhals/_spark_like/dataframe.py | 10 +++++----- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 987b84786f..3790248e9b 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -256,8 +256,8 @@ def join( other: Self, *, how: Literal["inner", "left", "full", "cross", "semi", "anti"], - left_on: list[str] | None, - right_on: list[str] | None, + left_on: Sequence[str] | None, + right_on: Sequence[str] | None, suffix: str, ) -> Self: if how == "cross": @@ -288,7 +288,7 @@ def join( other_native = ( select_columns_by_name( other._native_frame, - right_on, + right_on, # type: ignore[arg-type] self._backend_version, self._implementation, ) @@ -315,7 +315,7 @@ def join( other_native = ( select_columns_by_name( other._native_frame, - right_on, + right_on, # type: ignore[arg-type] self._backend_version, self._implementation, ) diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index 8b9f3e2c8f..51c6493177 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -252,8 +252,8 @@ def join( other: Self, *, how: Literal["inner", "left", "full", "cross", "semi", "anti"], - left_on: list[str] | None, - right_on: list[str] | None, + left_on: Sequence[str] | None, + right_on: Sequence[str] | None, suffix: str, ) -> Self: original_alias = self._native_frame.alias diff --git a/narwhals/_polars/dataframe.py b/narwhals/_polars/dataframe.py index 6649eb30dc..8c68f9e292 100644 --- a/narwhals/_polars/dataframe.py +++ b/narwhals/_polars/dataframe.py @@ -424,8 +424,8 @@ def join( other: Self, *, how: Literal["inner", "left", "full", "cross", "semi", "anti"], - left_on: list[str] | None, - right_on: list[str] | None, + left_on: Sequence[str] | None, + right_on: Sequence[str] | None, suffix: str, ) -> Self: how_native = ( @@ -435,7 +435,7 @@ def join( return self._from_native_frame( self._native_frame.join( other=other._native_frame, - how=how_native, + how=how_native, # type: ignore[arg-type] left_on=left_on, right_on=right_on, suffix=suffix, @@ -627,8 +627,8 @@ def join( other: Self, *, how: Literal["inner", "left", "full", "cross", "semi", "anti"], - left_on: list[str] | None, - right_on: list[str] | None, + left_on: Sequence[str] | None, + right_on: Sequence[str] | None, suffix: str, ) -> Self: how_native = ( @@ -638,7 +638,7 @@ def join( return self._from_native_frame( self._native_frame.join( other=other._native_frame, - how=how_native, + how=how_native, # type: ignore[arg-type] left_on=left_on, right_on=right_on, suffix=suffix, diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index 0dac9b03e3..3351f28995 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -329,8 +329,8 @@ def join( self: Self, other: Self, how: Literal["inner", "left", "full", "cross", "semi", "anti"], - left_on: list[str] | None, - right_on: list[str] | None, + left_on: Sequence[str] | None, + right_on: Sequence[str] | None, suffix: str, ) -> Self: self_native = self._native_frame @@ -339,8 +339,8 @@ def join( left_columns = self.columns right_columns = other.columns - right_on_: list[str] = right_on or [] - left_on_: list[str] = left_on or [] + right_on_: Sequence[str] = right_on or [] + left_on_: Sequence[str] = left_on or [] # create a mapping for columns on other # `right_on` columns will be renamed as `left_on` @@ -395,7 +395,7 @@ def join( how_native = "full_outer" if how == "full" else how return self._from_native_frame( - self_native.join(other_native, on=on_, how=how_native).select(col_order) + self_native.join(other_native, on=on_, how=how_native).select(col_order) # type: ignore[arg-type] ) def explode(self: Self, columns: list[str]) -> Self: