-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: Add suffixes argument for pd.concat #29669
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7e461a1
1314059
8bcb313
a4f9d14
13a2930
901a21b
fd64695
b7e97d6
22e3250
fd53b09
1252f7b
233adee
f7d3d59
c857070
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,10 @@ | |
concat routines | ||
""" | ||
|
||
from collections import Counter | ||
from functools import partial | ||
from itertools import chain | ||
from typing import Optional | ||
import warnings | ||
|
||
import numpy as np | ||
|
@@ -21,6 +25,7 @@ | |
) | ||
import pandas.core.indexes.base as ibase | ||
from pandas.core.internals import concatenate_block_managers | ||
from pandas.core.internals.managers import _transform_index | ||
|
||
# --------------------------------------------------------------------- | ||
# Concatenate DataFrame objects | ||
|
@@ -37,6 +42,7 @@ def concat( | |
names=None, | ||
verify_integrity: bool = False, | ||
sort=None, | ||
suffixes: Optional[tuple] = None, | ||
copy: bool = True, | ||
): | ||
""" | ||
|
@@ -94,6 +100,14 @@ def concat( | |
|
||
.. versionadded:: 0.23.0 | ||
|
||
suffixes : tuple of str, default None | ||
Suffixes to apply to overlapping column names, one per concatenated object | ||
in order. If the length of suffixes does not match the number of | ||
concatenated objects, a ValueError is raised. If None, the output is left | ||
as is, with duplicated column names. | ||
|
||
This has no effect if there are no overlapping column names or if axis=0. | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add |
||
copy : bool, default True | ||
If False, do not copy data unnecessarily. | ||
|
||
|
@@ -238,6 +252,16 @@ def concat( | |
Traceback (most recent call last): | ||
... | ||
ValueError: Indexes have overlapping values: ['a'] | ||
|
||
If the objects have overlapping column names and ``axis=1`` is passed, | ||
``suffixes`` can be given as a tuple to rename them, one suffix per object. | ||
|
||
>>> df7 = pd.DataFrame({"a": [1, 2]}) | ||
>>> df8 = pd.DataFrame({"a": [3, 4], "b": [4, 6]}) | ||
>>> pd.concat([df7, df8], axis=1, suffixes=("_x", "_y")) | ||
a_x a_y b | ||
0 1 3 4 | ||
1 2 4 6 | ||
""" | ||
op = _Concatenator( | ||
objs, | ||
|
@@ -251,6 +275,7 @@ def concat( | |
verify_integrity=verify_integrity, | ||
copy=copy, | ||
sort=sort, | ||
suffixes=suffixes, | ||
) | ||
|
||
return op.get_result() | ||
|
@@ -274,6 +299,7 @@ def __init__( | |
verify_integrity: bool = False, | ||
copy: bool = True, | ||
sort=False, | ||
suffixes=None, | ||
): | ||
if isinstance(objs, (NDFrame, str)): | ||
raise TypeError( | ||
|
@@ -418,6 +444,16 @@ def __init__( | |
self.names = names or getattr(keys, "names", None) | ||
self.levels = levels | ||
self.sort = sort | ||
self.suffixes = suffixes | ||
|
||
if self.axis == 0 and not self._is_series: | ||
|
||
# If objs is not composed of pure Series, and if BlockManager axis is 1, | ||
# then will check the overlapping of columns, and directly rename them | ||
# if overlapping is the case | ||
self.objs = self._items_overlap_with_suffix( | ||
self.objs, suffixes=self.suffixes | ||
) | ||
|
||
self.ignore_index = ignore_index | ||
self.verify_integrity = verify_integrity | ||
|
@@ -447,6 +483,10 @@ def get_result(self): | |
|
||
index, columns = self.new_axes | ||
df = cons(data, index=index) | ||
|
||
# before assigning columns to composed DataFrame, check if columns | ||
# are overlapped | ||
columns = self._items_overlap_with_suffix(columns, self.suffixes) | ||
df.columns = columns | ||
return df.__finalize__(self, method="concat") | ||
|
||
|
@@ -585,6 +625,66 @@ def _maybe_check_integrity(self, concat_index: Index): | |
"{overlap!s}".format(overlap=overlap) | ||
) | ||
|
||
def _items_overlap_with_suffix(self, objs, suffixes):
    """
    Append suffixes to column labels that occur more than once across ``objs``.

    ``objs`` is read in one of two ways, depending on ``self._is_series``:
    when it is True, ``objs`` is iterated as the column labels themselves
    (one label per concatenated object); otherwise every element is expected
    to expose ``.columns``.

    Parameters
    ----------
    objs : iterable of column labels, or sequence of DataFrame-like objects
    suffixes : tuple or None
        One entry per element of ``objs``. A ``None`` entry leaves the labels
        of the matching object untouched.

    Returns
    -------
    ``objs`` itself when no label is duplicated or ``suffixes`` is None;
    otherwise a list of renamed labels (``self._is_series`` is True) or a
    list of renamed objects. The objects passed in are never modified.

    Raises
    ------
    ValueError
        If labels overlap and ``suffixes`` is not a tuple, or if its length
        differs from ``len(objs)``.
    """
    if self._is_series:
        # objs are the column labels themselves
        labels = objs
    else:
        labels = chain.from_iterable(obj.columns for obj in objs)

    # A set keeps the membership test in ``renamer`` cheap.
    to_rename = {label for label, cnt in Counter(labels).items() if cnt > 1}

    if not to_rename or suffixes is None:
        return objs

    # Validation deliberately happens only once an overlap is known, so a
    # badly typed ``suffixes`` is ignored when there is nothing to rename.
    if not isinstance(suffixes, tuple):
        raise ValueError(
            f"Invalid type {type(suffixes)} is assigned to suffixes, only "
            "'tuple' is allowed."
        )

    if len(objs) != len(suffixes):
        raise ValueError(
            "Number of objects for concatenation is not "
            "equal to number of suffixes"
        )

    def renamer(x, suffix):
        """
        Return ``x`` with ``suffix`` appended if ``x`` is a duplicated label.

        Parameters
        ----------
        x : original column name
        suffix : str or None

        Returns
        -------
        x : renamed column name
        """
        if x in to_rename and suffix is not None:
            return f"{x}{suffix}"
        return x

    if self._is_series:
        return [renamer(label, suffix) for label, suffix in zip(objs, suffixes)]

    # ``rename`` returns a new object, so the caller's frames keep their
    # original column labels; assigning to ``obj.columns`` would rename the
    # inputs in place.
    return [
        obj.rename(columns=partial(renamer, suffix=suffix))
        for obj, suffix in zip(objs, suffixes)
    ]
|
||
|
||
def _concat_indexes(indexes) -> Index:
    """Append every index after the first onto the first one."""
    head, tail = indexes[0], indexes[1:]
    return head.append(tail)
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2776,3 +2776,137 @@ def test_concat_datetimeindex_freq(): | |
expected = pd.DataFrame(data[50:] + data[:50], index=dr[50:].append(dr[:50])) | ||
expected.index.freq = None | ||
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
@pytest.mark.parametrize(
    "suffixes",
    [
        "_a",
        "_x",  # a parenthesised string is still a str, not a one-element tuple
        ["a", "b"],
    ],
)
def test_concat_suffixes_type(suffixes):
    # GH 21791: as with pd.merge, suffixes must be passed as a tuple
    series = pd.Series([1, 2], name="a")
    frame = pd.DataFrame({"a": [2, 3]})
    with pytest.raises(ValueError, match="only 'tuple' is allowed"):
        pd.concat([series, frame], axis=1, suffixes=suffixes)
|
||
|
||
@pytest.mark.parametrize(
    "objs, suffixes",
    [
        # all Series: three objects, two suffixes
        (
            [
                pd.Series([1, 2], name="a"),
                pd.Series([2, 3], name="a"),
                pd.Series([2, 3]),
            ],
            ("_x", "_y"),
        ),
        # mixed DataFrames and Series: three objects, four suffixes.
        # The Series is its own list element; passing it as the second
        # positional argument of DataFrame would only set that frame's index.
        (
            [
                pd.DataFrame({"a": [1, 2]}),
                pd.DataFrame({"a": [2, 3]}),
                pd.Series([1, 2]),
            ],
            ("_x", "_y", "_z", "_k"),
        ),
        # all DataFrames: two objects, three suffixes
        (
            [pd.DataFrame({"a": [1, 2]}), pd.DataFrame({"a": [2, 3]})],
            ("_x", "_y", "_z"),
        ),
    ],
)
def test_concat_suffixes_length_unmatch_error(objs, suffixes):
    # GH 21791: a ValueError is raised when columns overlap but the number of
    # suffixes does not match the number of objs
    with pytest.raises(ValueError, match="Number of objects for concatenation is not"):
        pd.concat(objs, axis=1, suffixes=suffixes)
|
||
|
||
# (objs, suffixes, expected) cases where every input is a Series
_SUFFIXES_SERIES_CASES = [
    (
        [pd.Series([1, 2], name="a"), pd.Series([2, 3], name="a")],
        ("_x", "_y"),
        pd.DataFrame({"a_x": [1, 2], "a_y": [2, 3]}),
    ),
    (
        [
            pd.Series([1, 2]),
            pd.Series([2, 3], name="b"),
            pd.Series([3, 4], name="b"),
        ],
        ("_x", "_y", "_z"),
        pd.DataFrame({0: [1, 2], "b_y": [2, 3], "b_z": [3, 4]}),
    ),
    (
        [
            pd.Series([1, 2], name="a"),
            pd.Series([2, 3], name="b"),
            pd.Series([3, 4], name="b"),
            pd.Series([3, 5], name="a"),
        ],
        ("_x", "_y", "_z", "_k"),
        pd.DataFrame({"a_x": [1, 2], "b_y": [2, 3], "b_z": [3, 4], "a_k": [3, 5]}),
    ),
]


@pytest.mark.parametrize("objs, suffixes, expected", _SUFFIXES_SERIES_CASES)
def test_concat_suffixes_series(objs, suffixes, expected):
    # GH 21791: suffixes are applied per object when all objs are Series
    result = pd.concat(objs, axis=1, suffixes=suffixes)
    tm.assert_frame_equal(result, expected)
|
||
|
||
# (objs, suffixes, expected) cases where every input is a DataFrame
_SUFFIXES_DATAFRAME_CASES = [
    (
        [pd.DataFrame({"a": [1, 2]}), pd.DataFrame({"a": [2, 3], "b": [3, 4]})],
        ("_x", "_y"),
        pd.DataFrame({"a_x": [1, 2], "a_y": [2, 3], "b": [3, 4]}),
    ),
    (
        [
            pd.DataFrame({"a": [1, 2], "b": [2, 3]}),
            pd.DataFrame({"a": [2, 3]}),
            pd.DataFrame({"a": [3, 4], "b": [4, 5], "c": [5, 6]}),
        ],
        ("_x", "_y", "_z"),
        pd.DataFrame(
            {
                "a_x": [1, 2],
                "b_x": [2, 3],
                "a_y": [2, 3],
                "a_z": [3, 4],
                "b_z": [4, 5],
                "c": [5, 6],
            }
        ),
    ),
]


@pytest.mark.parametrize("objs, suffixes, expected", _SUFFIXES_DATAFRAME_CASES)
def test_concat_suffixes_dataframes(objs, suffixes, expected):
    # GH 21791: suffixes are applied per object when all objs are DataFrames
    result = pd.concat(objs, axis=1, suffixes=suffixes)
    tm.assert_frame_equal(result, expected)
|
||
|
||
# (objs, suffixes, expected) cases mixing Series and DataFrame inputs
_SUFFIXES_MIXED_CASES = [
    (
        [pd.Series([1, 2], name="a"), pd.DataFrame({"a": [2, 3], "b": [2, 5]})],
        ("_x", "_y"),
        pd.DataFrame({"a_x": [1, 2], "a_y": [2, 3], "b": [2, 5]}),
    ),
    (
        [
            pd.Series([1, 2], name="a"),
            pd.DataFrame({"a": [2, 3], "b": [2, 5]}),
            pd.Series([3, 4], name="b"),
        ],
        ("_x", "_y", "_z"),
        pd.DataFrame({"a_x": [1, 2], "a_y": [2, 3], "b_y": [2, 5], "b_z": [3, 4]}),
    ),
]


@pytest.mark.parametrize("objs, suffixes, expected", _SUFFIXES_MIXED_CASES)
def test_concat_suffixes_mixed_series_dataframe(objs, suffixes, expected):
    # GH 21791: suffixes are applied per object when objs mix Series and
    # DataFrames
    result = pd.concat(objs, axis=1, suffixes=suffixes)
    tm.assert_frame_equal(result, expected)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we want to replicate the behavior of merge in regards to
suffixes=(False, False)
?pandas/pandas/core/frame.py
Lines 203 to 206 in e246c3b