Skip to content

ENH: Add suffixes argument for pd.concat #29669

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ Other enhancements
- Added ``encoding`` argument to :meth:`DataFrame.to_string` for non-ascii text (:issue:`28766`)
- Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`)
- :meth:`Styler.background_gradient` now accepts ``vmin`` and ``vmax`` arguments (:issue:`12145`)
- Added ``suffixes`` argument to :meth:`pandas.concat` to distinguish overlapping column names after concatenation (:issue:`21791`)

Build Changes
^^^^^^^^^^^^^
Expand Down
100 changes: 100 additions & 0 deletions pandas/core/reshape/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
concat routines
"""

from collections import Counter
from functools import partial
from itertools import chain
from typing import Optional
import warnings

import numpy as np
Expand All @@ -21,6 +25,7 @@
)
import pandas.core.indexes.base as ibase
from pandas.core.internals import concatenate_block_managers
from pandas.core.internals.managers import _transform_index

# ---------------------------------------------------------------------
# Concatenate DataFrame objects
Expand All @@ -37,6 +42,7 @@ def concat(
names=None,
verify_integrity: bool = False,
sort=None,
suffixes: Optional[tuple] = None,
copy: bool = True,
):
"""
Expand Down Expand Up @@ -94,6 +100,14 @@ def concat(

.. versionadded:: 0.23.0

suffixes : tuple of str, default None
Suffixes to apply to overlapping column names, one for each concatenated
object, in order. If the number of suffixes does not match the number of
concatenated objects, a ValueError is raised. If None, the output is left
as is, with duplicated column names.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want to replicate the behavior of merge in regards to suffixes=(False, False)?

pandas/pandas/core/frame.py

Lines 203 to 206 in e246c3b

suffixes : tuple of (str, str), default ('_x', '_y')
Suffix to apply to overlapping column names in the left and right
side, respectively. To raise an exception on overlapping columns use
(False, False).


This has no effect if there are no overlapping column names or if axis=0.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add .. versionadded:: 1.0.0?

copy : bool, default True
If False, do not copy data unnecessarily.

Expand Down Expand Up @@ -238,6 +252,16 @@ def concat(
Traceback (most recent call last):
...
ValueError: Indexes have overlapping values: ['a']

If objects have overlapping column names when passing ``axis=1``, a tuple
of suffixes can be given to add a suffix to each object, respectively.

>>> df7 = pd.DataFrame({"a": [1, 2]})
>>> df8 = pd.DataFrame({"a": [3, 4], "b": [4, 6]})
>>> pd.concat([df7, df8], axis=1, suffixes=("_x", "_y"))
a_x a_y b
0 1 3 4
1 2 4 6
"""
op = _Concatenator(
objs,
Expand All @@ -251,6 +275,7 @@ def concat(
verify_integrity=verify_integrity,
copy=copy,
sort=sort,
suffixes=suffixes,
)

return op.get_result()
Expand All @@ -274,6 +299,7 @@ def __init__(
verify_integrity: bool = False,
copy: bool = True,
sort=False,
suffixes=None,
):
if isinstance(objs, (NDFrame, str)):
raise TypeError(
Expand Down Expand Up @@ -418,6 +444,16 @@ def __init__(
self.names = names or getattr(keys, "names", None)
self.levels = levels
self.sort = sort
self.suffixes = suffixes

if self.axis == 0 and not self._is_series:

# If objs is not composed of pure Series, and if BlockManager axis is 1,
# then will check the overlapping of columns, and directly rename them
# if overlapping is the case
self.objs = self._items_overlap_with_suffix(
self.objs, suffixes=self.suffixes
)

self.ignore_index = ignore_index
self.verify_integrity = verify_integrity
Expand Down Expand Up @@ -447,6 +483,10 @@ def get_result(self):

index, columns = self.new_axes
df = cons(data, index=index)

# before assigning columns to composed DataFrame, check if columns
# are overlapped
columns = self._items_overlap_with_suffix(columns, self.suffixes)
df.columns = columns
return df.__finalize__(self, method="concat")

Expand Down Expand Up @@ -585,6 +625,66 @@ def _maybe_check_integrity(self, concat_index: Index):
"{overlap!s}".format(overlap=overlap)
)

def _items_overlap_with_suffix(self, objs, suffixes):
"""
Add suffixes to items whose names overlap.

Be aware that `objs` is Index-like when `self._is_series` is True, and a
collection of DataFrame-like objects when it is False.
"""
if self._is_series:

# when _is_series is True, objs are actually column Index
overlap_cols = list(objs)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you actually need to convert to a list here? I think leaving it as an Index should still work the same way?

else:
overlap_cols = chain.from_iterable([obj.columns for obj in objs])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think you need the list comprehension and could just do a generator expression instead.

to_rename = [col for col, cnt in Counter(overlap_cols).items() if cnt > 1]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe make this a set comprehension since this is only used for x in to_rename lookups.


if len(to_rename) == 0 or suffixes is None:
Copy link
Member

@jschendel jschendel Nov 18, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe a little more pythonic to check the truthiness of to_rename: len(to_rename) == 0 --> not to_rename

return objs

if not isinstance(suffixes, tuple):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this isinstance check maybe be moved to the start of the function? Might be clearer.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought so as well, but decided to leave it here for two reasons:

  1. this check is only needed if there are overlapping column names.
  2. the default is None, so when the default is used the original objs are returned directly, without the post-processing and checks below.
    Does that make sense? I am very open to suggestions!

raise ValueError(
f"Invalid type {type(suffixes)} is assigned to suffixes, only "
f"'tuple' is allowed."
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick: I don't think you need the second line to be an f-string

)

if len(objs) != len(suffixes):
raise ValueError(
"Number of objects for concatenation is not"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

needs a space at the end: is not "

"equal to number of suffixes"
)

def renamer(x, suffix):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like there's basically an identical definition of this function in core/reshape/merge.py, so would be nice to be able to reuse this. A little tricky in that this is a nested function, so maybe can do as a follow-up once things are set in stone.

"""
Rename the indices.

If there is overlap, and suffix is not None, add
suffix, otherwise, leave it as-is.

Parameters
----------
x : original column name
suffix : str or None

Returns
-------
x : renamed column name
"""
if x in to_rename and suffix is not None:
return f"{x}{suffix}"
return x

if self._is_series:
new_cols = [renamer(obj, suffix) for obj, suffix in zip(objs, suffixes)]
return new_cols

for obj, suffix in zip(objs, suffixes):
col_renamer = partial(renamer, suffix=suffix)
obj.columns = _transform_index(obj.columns, col_renamer)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this line is causing the original dataframes to be modified as well:

In [1]: import pandas as pd; pd.__version__
Out[1]: '0.26.0.dev0+947.gc8570707c'

In [2]: df1 = pd.DataFrame({'A': list('ab'), 'B': [0, 1]})

In [3]: df2 = pd.DataFrame({'A':list('ac'), 'C': [100, 200]})

In [4]: df3 = pd.concat([df1, df2], axis=1, suffixes=('_x', '_y'))

In [5]: df1.columns
Out[5]: Index(['A_x', 'B'], dtype='object')

In [6]: df1
Out[6]: 
  A_x  B
0   a  0
1   b  1

In [7]: df2.columns
Out[7]: Index(['A_y', 'C'], dtype='object')

In [8]: df2
Out[8]: 
  A_y    C
0   a  100
1   c  200

Root cause could be occurring elsewhere though, but it's an issue nonetheless. Would be nice to add a test for the above.


return objs


def _concat_indexes(indexes) -> Index:
    """
    Join several indexes into one by appending all but the first onto the first.
    """
    head, tail = indexes[0], indexes[1:]
    return head.append(tail)
Expand Down
134 changes: 134 additions & 0 deletions pandas/tests/reshape/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -2776,3 +2776,137 @@ def test_concat_datetimeindex_freq():
expected = pd.DataFrame(data[50:] + data[:50], index=dr[50:].append(dr[:50]))
expected.index.freq = None
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("suffixes", ["_a", ("_x"), ["a", "b"]])
def test_concat_suffixes_type(suffixes):
    """GH 21791: like pd.merge, ``suffixes`` must be a tuple; other types raise."""
    # Note that ("_x") is a plain str, not a one-element tuple.
    series = pd.Series([1, 2], name="a")
    frame = pd.DataFrame({"a": [2, 3]})
    with pytest.raises(ValueError, match="only 'tuple' is allowed"):
        pd.concat([series, frame], axis=1, suffixes=suffixes)


@pytest.mark.parametrize(
    "objs, suffixes",
    [
        # three Series, two suffixes
        (
            [
                pd.Series([1, 2], name="a"),
                pd.Series([2, 3], name="a"),
                pd.Series([2, 3]),
            ],
            ("_x", "_y"),
        ),
        # mixed DataFrame/Series input, four suffixes; the Series is its own
        # object to concatenate, not the index of the second DataFrame
        (
            [
                pd.DataFrame({"a": [1, 2]}),
                pd.DataFrame({"a": [2, 3]}),
                pd.Series([1, 2]),
            ],
            ("_x", "_y", "_z", "_k"),
        ),
        # two DataFrames, three suffixes
        (
            [pd.DataFrame({"a": [1, 2]}), pd.DataFrame({"a": [2, 3]})],
            ("_x", "_y", "_z"),
        ),
    ],
)
def test_concat_suffixes_length_unmatch_error(objs, suffixes):
    # GH 21791, check that a ValueError is raised when columns overlap but the
    # number of suffixes does not match the number of objs
    with pytest.raises(ValueError, match="Number of objects for concatenation is not"):
        pd.concat(objs, axis=1, suffixes=suffixes)


@pytest.mark.parametrize(
"objs, suffixes, expected",
[
(
[pd.Series([1, 2], name="a"), pd.Series([2, 3], name="a")],
("_x", "_y"),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you also add tests for if dupe suffixes are specified, e.g. ("_x", "_x")?

pd.DataFrame({"a_x": [1, 2], "a_y": [2, 3]}),
),
(
[
pd.Series([1, 2]),
pd.Series([2, 3], name="b"),
pd.Series([3, 4], name="b"),
],
("_x", "_y", "_z"),
pd.DataFrame({0: [1, 2], "b_y": [2, 3], "b_z": [3, 4]}),
),
(
[
pd.Series([1, 2], name="a"),
pd.Series([2, 3], name="b"),
pd.Series([3, 4], name="b"),
pd.Series([3, 5], name="a"),
],
("_x", "_y", "_z", "_k"),
pd.DataFrame({"a_x": [1, 2], "b_y": [2, 3], "b_z": [3, 4], "a_k": [3, 5]}),
),
],
)
def test_concat_suffixes_series(objs, suffixes, expected):
# GH 21791, test if suffixes is assigned correctly when objs are all Series
output = pd.concat(objs, axis=1, suffixes=suffixes)
tm.assert_frame_equal(output, expected)


@pytest.mark.parametrize(
    "objs, suffixes, expected",
    [
        (
            [pd.DataFrame({"a": [1, 2]}), pd.DataFrame({"a": [2, 3], "b": [3, 4]})],
            ("_x", "_y"),
            pd.DataFrame({"a_x": [1, 2], "a_y": [2, 3], "b": [3, 4]}),
        ),
        (
            [
                pd.DataFrame({"a": [1, 2], "b": [2, 3]}),
                pd.DataFrame({"a": [2, 3]}),
                pd.DataFrame({"a": [3, 4], "b": [4, 5], "c": [5, 6]}),
            ],
            ("_x", "_y", "_z"),
            pd.DataFrame(
                {
                    "a_x": [1, 2],
                    "b_x": [2, 3],
                    "a_y": [2, 3],
                    "a_z": [3, 4],
                    "b_z": [4, 5],
                    "c": [5, 6],
                }
            ),
        ),
    ],
)
def test_concat_suffixes_dataframes(objs, suffixes, expected):
    """GH 21791: suffixes are applied correctly when every obj is a DataFrame."""
    result = pd.concat(objs, axis=1, suffixes=suffixes)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "objs, suffixes, expected",
    [
        (
            [pd.Series([1, 2], name="a"), pd.DataFrame({"a": [2, 3], "b": [2, 5]})],
            ("_x", "_y"),
            pd.DataFrame({"a_x": [1, 2], "a_y": [2, 3], "b": [2, 5]}),
        ),
        (
            [
                pd.Series([1, 2], name="a"),
                pd.DataFrame({"a": [2, 3], "b": [2, 5]}),
                pd.Series([3, 4], name="b"),
            ],
            ("_x", "_y", "_z"),
            pd.DataFrame({"a_x": [1, 2], "a_y": [2, 3], "b_y": [2, 5], "b_z": [3, 4]}),
        ),
    ],
)
def test_concat_suffixes_mixed_series_dataframe(objs, suffixes, expected):
    """
    GH 21791: suffixes are applied correctly when objs mix Series and DataFrames.
    """
    result = pd.concat(objs, axis=1, suffixes=suffixes)
    tm.assert_frame_equal(result, expected)