diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 55bfb044fb31d..5397b1763242d 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -154,6 +154,8 @@ Other enhancements - ``times`` argument in :class:`.ExponentialMovingWindow` now accepts ``np.timedelta64`` (:issue:`47003`) - :class:`DataError`, :class:`SpecificationError`, :class:`SettingWithCopyError`, :class:`SettingWithCopyWarning`, and :class:`NumExprClobberingError` are now exposed in ``pandas.errors`` (:issue:`27656`) - Added ``check_like`` argument to :func:`testing.assert_series_equal` (:issue:`47247`) +- :func:`merge` and :meth:`DataFrame.merge` now allow passing ``None`` or ``(None, None)`` for the ``suffixes`` argument, keeping column labels unchanged in the resulting :class:`DataFrame`, potentially with duplicate column labels (:issue:`46885`) +- :meth:`DataFrame.join` now allows passing an empty string for the ``lsuffix`` and ``rsuffix`` arguments, keeping column labels unchanged in the resulting :class:`DataFrame`, potentially with duplicate column labels (:issue:`46885`) .. --------------------------------------------------------------------------- .. _whatsnew_150.notable_bug_fixes: diff --git a/pandas/_typing.py b/pandas/_typing.py index a85820a403fde..afa59e5485f62 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -111,7 +111,7 @@ IndexLabel = Union[Hashable, Sequence[Hashable]] Level = Union[Hashable, int] Shape = Tuple[int, ...] -Suffixes = Tuple[Optional[str], Optional[str]] +Suffixes = Optional[Tuple[Optional[str], Optional[str]]] Ordered = Optional[bool] JSONSerializable = Optional[Union[PythonScalar, List, Dict]] Frequency = Union[str, "DateOffset"] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4376c784bc847..0ada4db7ef5dc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -322,13 +322,13 @@ sort : bool, default False Sort the join keys lexicographically in the result DataFrame. 
If False, the order of the join keys depends on the join type (how keyword). -suffixes : list-like, default is ("_x", "_y") - A length-2 sequence where each element is optionally a string +suffixes : optional list-like, default is ("_x", "_y") + An optional length-2 sequence where each element is optionally a string indicating the suffix to add to overlapping column names in `left` and `right` respectively. Pass a value of `None` instead of a string to indicate that the column name from `left` or - `right` should be left as-is, with no suffix. At least one of the - values must not be None. + `right` should be left as-is, with no suffix. Pass `None` to keep + both column labels as-is, equivalent to `(None, None)`. copy : bool, default True If False, avoid copy if possible. indicator : bool or str, default False @@ -412,14 +412,17 @@ 4 bar 2 bar 6 5 baz 3 baz 7 -Merge DataFrames df1 and df2, but raise an exception if the DataFrames have -any overlapping columns. - ->>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False)) -Traceback (most recent call last): -... -ValueError: columns overlap but no suffix specified: - Index(['value'], dtype='object') +Merge DataFrames df1 and df2 with `None` as suffixes to keep +the original column names. + +>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=None) + lkey value rkey value +0 foo 1 foo 5 +1 foo 1 foo 8 +2 foo 5 foo 5 +3 foo 5 foo 8 +4 bar 2 bar 6 +5 baz 3 baz 7 >>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) >>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 4227d43c459d0..1b4da0a8774cc 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2359,7 +2359,7 @@ def _items_overlap_with_suffix( If corresponding suffix is empty, the entry is simply converted to string. 
""" - if not is_list_like(suffixes, allow_sets=False): + if not (is_list_like(suffixes, allow_sets=False) or suffixes is None): warnings.warn( f"Passing 'suffixes' as a {type(suffixes)}, is not supported and may give " "unexpected results. Provide 'suffixes' as a tuple instead. In the " @@ -2372,10 +2372,7 @@ def _items_overlap_with_suffix( if len(to_rename) == 0: return left, right - lsuffix, rsuffix = suffixes - - if not lsuffix and not rsuffix: - raise ValueError(f"columns overlap but no suffix specified: {to_rename}") + lsuffix, rsuffix = suffixes if suffixes else (None, None) def renamer(x, suffix): """ @@ -2386,12 +2383,12 @@ def renamer(x, suffix): Parameters ---------- - x : original column name + x : original column label suffix : str or None Returns ------- - x : renamed column name + x : renamed column label """ if x in to_rename and suffix is not None: return f"{x}{suffix}" diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index 0935856fb223a..fb8da70bcd2d6 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -280,12 +280,6 @@ def test_join_index(float_frame): with pytest.raises(ValueError, match="join method"): f.join(f2, how="foo") - # corner case - overlapping columns - msg = "columns overlap but no suffix" - for how in ("outer", "left", "inner"): - with pytest.raises(ValueError, match=msg): - float_frame.join(float_frame, how=how) - def test_join_index_more(float_frame): af = float_frame.loc[:, ["A", "B"]] diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index ccdfc3cd23790..b2bce22931643 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2209,6 +2209,10 @@ def test_merge_series(on, left_on, right_on, left_index, right_index, nm): (0, 0, {"suffixes": ("_a", None)}, ["0_a", 0]), ("a", "a", {}, ["a_x", "a_y"]), (0, 0, {}, ["0_x", "0_y"]), + (0, 0, 
{"suffixes": None}, [0, 0]), + (0, 0, {"suffixes": (None, None)}, [0, 0]), + ("a", "a", {"suffixes": None}, ["a", "a"]), + ("a", "a", {"suffixes": (None, None)}, ["a", "a"]), ], ) def test_merge_suffix(col1, col2, kwargs, expected_cols): @@ -2255,21 +2259,6 @@ def test_merge_duplicate_suffix(how, expected): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "col1, col2, suffixes", - [("a", "a", (None, None)), ("a", "a", ("", None)), (0, 0, (None, ""))], -) -def test_merge_suffix_error(col1, col2, suffixes): - # issue: 24782 - a = DataFrame({col1: [1, 2, 3]}) - b = DataFrame({col2: [3, 4, 5]}) - - # TODO: might reconsider current raise behaviour, see issue 24782 - msg = "columns overlap but no suffix specified" - with pytest.raises(ValueError, match=msg): - merge(a, b, left_index=True, right_index=True, suffixes=suffixes) - - @pytest.mark.parametrize("suffixes", [{"left", "right"}, {"left": 0, "right": 0}]) def test_merge_suffix_warns(suffixes): a = DataFrame({"a": [1, 2, 3]}) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 0dbe45eeb1e82..8b2a43e6512e1 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -636,12 +636,6 @@ def test_join_multi_levels_invalid(self, portfolio, household): ): household.join(portfolio, how="inner") - portfolio2 = portfolio.copy() - portfolio2.index.set_names(["household_id", "foo"]) - - with pytest.raises(ValueError, match="columns overlap but no suffix specified"): - portfolio2.join(portfolio, how="inner") - def test_join_multi_levels2(self): # some more advanced merges