pandas-dev · charlesdong1991 · Dec 3, 2018 · Jan 19, 2019 · Jul 30, 2019 · Nov 16, 2019
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -114,6 +114,7 @@ Other enhancements
 - Added ``encoding`` argument to :meth:`DataFrame.to_string` for non-ascii text (:issue:`28766`)
 - Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`)
 - :meth:`Styler.background_gradient` now accepts ``vmin`` and ``vmax`` arguments (:issue:`12145`)
+- Added ``suffixes`` argument to :func:`pandas.concat` to distinguish overlapping column names after concatenation (:issue:`21791`)
 
 Build Changes
 ^^^^^^^^^^^^^

diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
@@ -2,6 +2,10 @@
 concat routines
 """
 
+from collections import Counter
+from functools import partial
+from itertools import chain
+from typing import Optional
 import warnings
 
 import numpy as np
@@ -21,6 +25,7 @@
 )
 import pandas.core.indexes.base as ibase
 from pandas.core.internals import concatenate_block_managers
+from pandas.core.internals.managers import _transform_index
 
 # ---------------------------------------------------------------------
 # Concatenate DataFrame objects
@@ -37,6 +42,7 @@ def concat(
     names=None,
     verify_integrity: bool = False,
     sort=None,
+    suffixes: Optional[tuple] = None,
     copy: bool = True,
 ):
     """
@@ -94,6 +100,14 @@ def concat(
 
         .. versionadded:: 0.23.0
 
+    suffixes : tuple of str, default None
+        Suffixes to apply to overlapping column names, one for each concatenated
+        object. If the number of suffixes does not match the number of
+        concatenated objects, a ValueError is raised. If None, the output keeps
+        the duplicated column names as they are.
 suffixes : tuple of (str, str), default ('_x', '_y') 
     Suffix to apply to overlapping column names in the left and right 
     side, respectively. To raise an exception on overlapping columns use 
     (False, False). 
 suffixes : tuple of (str, str), default ('_x', '_y') 
     Suffix to apply to overlapping column names in the left and right 
     side, respectively. To raise an exception on overlapping columns use 
     (False, False). 
+
+        This has no effect if there are no overlapping column names or if ``axis=0``.
+
     copy : bool, default True
         If False, do not copy data unnecessarily.
 
@@ -238,6 +252,16 @@ def concat(
     Traceback (most recent call last):
         ...
     ValueError: Indexes have overlapping values: ['a']
+
+    If objects have overlapping column names when passing in ``axis=1``,
+    a tuple of suffixes can be given to add one suffix to each object respectively.
+
+    >>> df7 = pd.DataFrame({"a": [1, 2]})
+    >>> df8 = pd.DataFrame({"a": [3, 4], "b": [4, 6]})
+    >>> pd.concat([df7, df8], axis=1, suffixes=("_x", "_y"))
+      a_x  a_y  b
+    0   1    3  4
+    1   2    4  6
     """
     op = _Concatenator(
         objs,
@@ -251,6 +275,7 @@ def concat(
         verify_integrity=verify_integrity,
         copy=copy,
         sort=sort,
+        suffixes=suffixes,
     )
 
     return op.get_result()
@@ -274,6 +299,7 @@ def __init__(
         verify_integrity: bool = False,
         copy: bool = True,
         sort=False,
+        suffixes=None,
     ):
         if isinstance(objs, (NDFrame, str)):
             raise TypeError(
@@ -418,6 +444,16 @@ def __init__(
         self.names = names or getattr(keys, "names", None)
         self.levels = levels
         self.sort = sort
+        self.suffixes = suffixes
+
+        if self.axis == 0 and not self._is_series:
+
+            # If objs are not all Series and we concatenate along the columns
+            # (user-facing axis=1, which appears as self.axis == 0 here), check for
+            # overlapping column names and rename them with the given suffixes
+            self.objs = self._items_overlap_with_suffix(
+                self.objs, suffixes=self.suffixes
+            )
 
         self.ignore_index = ignore_index
         self.verify_integrity = verify_integrity
@@ -447,6 +483,10 @@ def get_result(self):
 
                 index, columns = self.new_axes
                 df = cons(data, index=index)
+
+                # before assigning columns to composed DataFrame, check if columns
+                # are overlapped
+                columns = self._items_overlap_with_suffix(columns, self.suffixes)
                 df.columns = columns
                 return df.__finalize__(self, method="concat")
 
@@ -585,6 +625,66 @@ def _maybe_check_integrity(self, concat_index: Index):
                     "{overlap!s}".format(overlap=overlap)
                 )
 
+    def _items_overlap_with_suffix(self, objs, suffixes):
+        """
+        Append per-object suffixes to column labels that occur more than once.
+
+        Parameters
+        ----------
+        objs : Index-like of column labels if ``self._is_series``, otherwise a
+            list of DataFrame-like objects exposing ``.columns``.
+        suffixes : tuple or None
+            One suffix per element of `objs`; ``None`` entries rename nothing.
+
+        Returns
+        -------
+        `objs` unchanged if nothing overlaps or `suffixes` is None; otherwise a
+        list of renamed labels (Series case) or of relabelled shallow copies.
+
+        Raises
+        ------
+        ValueError
+            If `suffixes` is not a tuple or its length differs from `objs`.
+        """
+        if self._is_series:
+            all_cols = list(objs)
+        else:
+            all_cols = chain.from_iterable(obj.columns for obj in objs)
+        to_rename = {col for col, cnt in Counter(all_cols).items() if cnt > 1}
+
+        if not to_rename or suffixes is None:
+            return objs
+
+        if not isinstance(suffixes, tuple):
+            raise ValueError(
+                f"Invalid type {type(suffixes)} is assigned to suffixes, only "
+                f"'tuple' is allowed."
+            )
+
+        if len(objs) != len(suffixes):
+            raise ValueError(
+                "Number of objects for concatenation is not "
+                "equal to number of suffixes"
+            )
+
+        def renamer(x, suffix):
+            """Return `x` with `suffix` appended if `x` overlaps, else `x`."""
+            if x in to_rename and suffix is not None:
+                return f"{x}{suffix}"
+            return x
+
+        if self._is_series:
+            return [renamer(col, suffix) for col, suffix in zip(objs, suffixes)]
+
+        renamed = []
+        for obj, suffix in zip(objs, suffixes):
+            col_renamer = partial(renamer, suffix=suffix)
+            # relabel a shallow copy so the objects passed in are not mutated
+            obj = obj.copy(deep=False)
+            obj.columns = _transform_index(obj.columns, col_renamer)
+            renamed.append(obj)
+        return renamed
+
 
 def _concat_indexes(indexes) -> Index:
     return indexes[0].append(indexes[1:])

diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
@@ -2776,3 +2776,137 @@ def test_concat_datetimeindex_freq():
     expected = pd.DataFrame(data[50:] + data[:50], index=dr[50:].append(dr[:50]))
     expected.index.freq = None
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("suffixes", ["_a", ("_x"), ["a", "b"]])
+def test_concat_suffixes_type(suffixes):
+    # GH 21791: non-tuple suffixes (str, list; ``("_x")`` is just a str) must raise
+    objs = [pd.Series([1, 2], name="a"), pd.DataFrame({"a": [2, 3]})]
+    with pytest.raises(ValueError, match="only 'tuple' is allowed"):
+        pd.concat(objs, axis=1, suffixes=suffixes)
+
+
+@pytest.mark.parametrize(
+    "objs, suffixes",
+    [
+        (
+            [
+                pd.Series([1, 2], name="a"),
+                pd.Series([2, 3], name="a"),
+                pd.Series([2, 3]),
+            ],
+            ("_x", "_y"),
+        ),
+        (
+            [
+                pd.DataFrame({"a": [1, 2]}),
+                pd.DataFrame({"a": [2, 3]}, pd.Series([1, 2])),
+            ],
+            ("_x", "_y", "_z", "_k"),
+        ),
+        (
+            [pd.DataFrame({"a": [1, 2]}), pd.DataFrame({"a": [2, 3]})],
+            ("_x", "_y", "_z"),
+        ),
+    ],
+)
+def test_concat_suffixes_length_unmatch_error(objs, suffixes):
+    # GH 21791: ValueError when columns overlap but len(suffixes) != len(objs)
+    # NOTE(review): case 2 passes the Series as the DataFrame's index; confirm intent
+    with pytest.raises(ValueError, match="Number of objects for concatenation is not"):
+        pd.concat(objs, axis=1, suffixes=suffixes)
+
+
+@pytest.mark.parametrize(
+    "objs, suffixes, expected",
+    [
+        (
+            [pd.Series([1, 2], name="a"), pd.Series([2, 3], name="a")],
+            ("_x", "_y"),
+            pd.DataFrame({"a_x": [1, 2], "a_y": [2, 3]}),
+        ),
+        (
+            [
+                pd.Series([1, 2]),
+                pd.Series([2, 3], name="b"),
+                pd.Series([3, 4], name="b"),
+            ],
+            ("_x", "_y", "_z"),
+            pd.DataFrame({0: [1, 2], "b_y": [2, 3], "b_z": [3, 4]}),
+        ),
+        (
+            [
+                pd.Series([1, 2], name="a"),
+                pd.Series([2, 3], name="b"),
+                pd.Series([3, 4], name="b"),
+                pd.Series([3, 5], name="a"),
+            ],
+            ("_x", "_y", "_z", "_k"),
+            pd.DataFrame({"a_x": [1, 2], "b_y": [2, 3], "b_z": [3, 4], "a_k": [3, 5]}),
+        ),
+    ],
+)
+def test_concat_suffixes_series(objs, suffixes, expected):
+    # GH 21791: all-Series input; only names occurring more than once get a suffix
+    output = pd.concat(objs, axis=1, suffixes=suffixes)
+    tm.assert_frame_equal(output, expected)
+
+
+@pytest.mark.parametrize(
+    "objs, suffixes, expected",
+    [
+        (
+            [pd.DataFrame({"a": [1, 2]}), pd.DataFrame({"a": [2, 3], "b": [3, 4]})],
+            ("_x", "_y"),
+            pd.DataFrame({"a_x": [1, 2], "a_y": [2, 3], "b": [3, 4]}),
+        ),
+        (
+            [
+                pd.DataFrame({"a": [1, 2], "b": [2, 3]}),
+                pd.DataFrame({"a": [2, 3]}),
+                pd.DataFrame({"a": [3, 4], "b": [4, 5], "c": [5, 6]}),
+            ],
+            ("_x", "_y", "_z"),
+            pd.DataFrame(
+                {
+                    "a_x": [1, 2],
+                    "b_x": [2, 3],
+                    "a_y": [2, 3],
+                    "a_z": [3, 4],
+                    "b_z": [4, 5],
+                    "c": [5, 6],
+                }
+            ),
+        ),
+    ],
+)
+def test_concat_suffixes_dataframes(objs, suffixes, expected):
+    # GH 21791: all-DataFrame input; only columns shared across frames get a suffix
+    output = pd.concat(objs, axis=1, suffixes=suffixes)
+    tm.assert_frame_equal(output, expected)
+
+
+@pytest.mark.parametrize(
+    "objs, suffixes, expected",
+    [
+        (
+            [pd.Series([1, 2], name="a"), pd.DataFrame({"a": [2, 3], "b": [2, 5]})],
+            ("_x", "_y"),
+            pd.DataFrame({"a_x": [1, 2], "a_y": [2, 3], "b": [2, 5]}),
+        ),
+        (
+            [
+                pd.Series([1, 2], name="a"),
+                pd.DataFrame({"a": [2, 3], "b": [2, 5]}),
+                pd.Series([3, 4], name="b"),
+            ],
+            ("_x", "_y", "_z"),
+            pd.DataFrame({"a_x": [1, 2], "a_y": [2, 3], "b_y": [2, 5], "b_z": [3, 4]}),
+        ),
+    ],
+)
+def test_concat_suffixes_mixed_series_dataframe(objs, suffixes, expected):
+    # GH 21791: mixed Series/DataFrame input; a Series name that clashes with a
+    # DataFrame column is suffixed like any other overlapping column
+    output = pd.concat(objs, axis=1, suffixes=suffixes)
+    tm.assert_frame_equal(output, expected)