diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 099e5bc48353a..6823ef65c2e37 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -847,6 +847,7 @@ Reshaping - Bug in :meth:`DataFrame.stack` with the new implementation where ``ValueError`` is raised when ``level=[]`` (:issue:`60740`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) - Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`) +- Bug in :meth:`DataFrame.merge` where user-provided suffixes could result in duplicate column names if the resulting names matched existing columns. Now raises a :class:`MergeError` in such cases. (:issue:`61402`) Sparse ^^^^^^ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 68d61da0cf7dd..ccaaa91a2d84a 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -3062,13 +3062,16 @@ def renamer(x, suffix: str | None): if not llabels.is_unique: # Only warn when duplicates are caused because of suffixes, already duplicated # columns in origin should not warn - dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist() + dups.extend(llabels[(llabels.duplicated()) & (~left.duplicated())].tolist()) if not rlabels.is_unique: dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist()) + # Suffix addition creates duplicate to pre-existing column name + dups.extend(llabels.intersection(right.difference(to_rename)).tolist()) + dups.extend(rlabels.intersection(left.difference(to_rename)).tolist()) if dups: raise MergeError( f"Passing 'suffixes' which cause duplicate columns {set(dups)} is " - f"not allowed.", + "not allowed.", ) return llabels, rlabels diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 
@pytest.mark.parametrize("suffixes", [("_dup", ""), ("", "_dup")])
def test_merge_for_suffix_collisions(suffixes):
    # GH#61402
    # Appending "_dup" to the overlapping "col2" produces "col2_dup", which is
    # already a column of ``wide``; merge must refuse instead of returning a
    # frame with duplicate column names.
    narrow = DataFrame({"col1": [1], "col2": [2]})
    wide = DataFrame({"col1": [1], "col2": [2], "col2_dup": [3]})

    # Run both operand orders: the pre-existing "col2_dup" may live in the
    # right frame or in the left frame, and for each suffix pair the renamed
    # column may come from either side. Together with the parametrization this
    # covers a renamed left label hitting an untouched right column, a renamed
    # right label hitting an untouched left column, and a renamed label
    # hitting a column of its own frame.
    for left, right in [(narrow, wide), (wide, narrow)]:
        with pytest.raises(MergeError, match="duplicate columns"):
            merge(left, right, on="col1", suffixes=suffixes)