Skip to content
Merged
2 changes: 1 addition & 1 deletion doc/source/user_guide/merging.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1273,7 +1273,7 @@ columns:

.. ipython:: python

result = pd.merge(left, right, on='k', suffixes=['_l', '_r'])
result = pd.merge(left, right, on='k', suffixes=('_l', '_r'))

.. ipython:: python
:suppress:
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,7 @@ Backwards incompatible API changes
- :func: `pandas.api.dtypes.is_string_dtype` no longer incorrectly identifies categorical series as string.
- :func:`read_excel` no longer takes ``**kwds`` arguments. This means that passing in keyword ``chunksize`` now raises a ``TypeError``
(previously raised a ``NotImplementedError``), while passing in keyword ``encoding`` now raises a ``TypeError`` (:issue:`34464`)
- :func: `merge` now checks ``suffixes`` parameter type to be ``tuple`` and raises ``TypeError``, whereas before a ``list`` or ``set`` were accepted and that the ``set`` could produce unexpected results (:issue:`33740`)

``MultiIndex.get_indexer`` interprets `method` argument differently
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Expand Down
19 changes: 12 additions & 7 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -667,10 +667,8 @@ def get_result(self):

join_index, left_indexer, right_indexer = self._get_join_info()

lsuf, rsuf = self.suffixes

llabels, rlabels = _items_overlap_with_suffix(
self.left._info_axis, lsuf, self.right._info_axis, rsuf
self.left._info_axis, self.right._info_axis, self.suffixes
)

lindexers = {1: left_indexer} if left_indexer is not None else {}
Expand Down Expand Up @@ -1484,10 +1482,8 @@ def __init__(
def get_result(self):
join_index, left_indexer, right_indexer = self._get_join_info()

lsuf, rsuf = self.suffixes

llabels, rlabels = _items_overlap_with_suffix(
self.left._info_axis, lsuf, self.right._info_axis, rsuf
self.left._info_axis, self.right._info_axis, self.suffixes
)

if self.fill_method == "ffill":
Expand Down Expand Up @@ -2067,17 +2063,26 @@ def _validate_operand(obj: FrameOrSeries) -> "DataFrame":
)


def _items_overlap_with_suffix(left: Index, lsuffix, right: Index, rsuffix):
def _items_overlap_with_suffix(left: Index, right: Index, suffixes: Tuple[str, str]):
"""
Suffixes type validation.

If two indices overlap, add suffixes to overlapping entries.

If corresponding suffix is empty, the entry is simply converted to string.

"""
if not isinstance(suffixes, tuple):
raise TypeError(
f"suffixes should be tuple of (str, str). But got {type(suffixes).__name__}"
)

to_rename = left.intersection(right)
if len(to_rename) == 0:
return left, right

lsuffix, rsuffix = suffixes

if not lsuffix and not rsuffix:
raise ValueError(f"columns overlap but no suffix specified: {to_rename}")

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/reshape/merge/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def test_inner_join(self):
_check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="inner")

def test_handle_overlap(self):
joined = merge(self.df, self.df2, on="key2", suffixes=[".foo", ".bar"])
joined = merge(self.df, self.df2, on="key2", suffixes=(".foo", ".bar"))

assert "key1.foo" in joined
assert "key1.bar" in joined
Expand All @@ -173,7 +173,7 @@ def test_handle_overlap_arbitrary_key(self):
self.df2,
left_on="key2",
right_on="key1",
suffixes=[".foo", ".bar"],
suffixes=(".foo", ".bar"),
)
assert "key1.foo" in joined
assert "key2.bar" in joined
Expand Down
39 changes: 25 additions & 14 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2004,8 +2004,8 @@ def test_merge_series(on, left_on, right_on, left_index, right_index, nm):
("b", "b", dict(suffixes=(None, "_y")), ["b", "b_y"]),
("a", "a", dict(suffixes=("_x", None)), ["a_x", "a"]),
("a", "b", dict(suffixes=("_x", None)), ["a", "b"]),
("a", "a", dict(suffixes=[None, "_x"]), ["a", "a_x"]),
(0, 0, dict(suffixes=["_a", None]), ["0_a", 0]),
("a", "a", dict(suffixes=(None, "_x")), ["a", "a_x"]),
(0, 0, dict(suffixes=("_a", None)), ["0_a", 0]),
("a", "a", dict(), ["a_x", "a_y"]),
(0, 0, dict(), ["0_x", "0_y"]),
],
Expand Down Expand Up @@ -2056,13 +2056,7 @@ def test_merge_duplicate_suffix(how, expected):

@pytest.mark.parametrize(
"col1, col2, suffixes",
[
("a", "a", [None, None]),
("a", "a", (None, None)),
("a", "a", ("", None)),
(0, 0, [None, None]),
(0, 0, (None, "")),
],
[("a", "a", (None, None)), ("a", "a", ("", None)), (0, 0, (None, ""))],
)
def test_merge_suffix_error(col1, col2, suffixes):
# issue: 24782
Expand All @@ -2075,18 +2069,35 @@ def test_merge_suffix_error(col1, col2, suffixes):
pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes)


@pytest.mark.parametrize("col1, col2, suffixes", [("a", "a", None), (0, 0, None)])
def test_merge_suffix_none_error(col1, col2, suffixes):
# issue: 24782
@pytest.mark.parametrize(
"col1, col2, suffixes", [("a", "a", {"a", "b"}), ("a", "a", None), (0, 0, None)],
)
def test_merge_suffix_type_error(col1, col2, suffixes):
a = pd.DataFrame({col1: [1, 2, 3]})
b = pd.DataFrame({col2: [3, 4, 5]})

# TODO: might reconsider current raise behaviour, see GH24782
msg = "iterable"
msg = (
f"suffixes should be tuple of \\(str, str\\). But got {type(suffixes).__name__}"
)
with pytest.raises(TypeError, match=msg):
pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes)


@pytest.mark.parametrize(
"col1, col2, suffixes, msg",
[
("a", "a", ("a", "b", "c"), r"too many values to unpack \(expected 2\)"),
("a", "a", tuple("a"), r"not enough values to unpack \(expected 2, got 1\)"),
],
)
def test_merge_suffix_length_error(col1, col2, suffixes, msg):
a = pd.DataFrame({col1: [1, 2, 3]})
b = pd.DataFrame({col2: [3, 4, 5]})

with pytest.raises(ValueError, match=msg):
pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes)


@pytest.mark.parametrize("cat_dtype", ["one", "two"])
@pytest.mark.parametrize("reverse", [True, False])
def test_merge_equal_cat_dtypes(cat_dtype, reverse):
Expand Down