From a748ccd24071e268998d5f26348b54485e3fdd7d Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Fri, 7 Jun 2019 16:39:53 -0400 Subject: [PATCH 01/36] BUG: Check for duplicate names in columns and index when calling crosstab --- pandas/core/reshape/pivot.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index d653dd87308cf..47cacedb57e06 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -543,6 +543,17 @@ def crosstab( common_idx = _get_objs_combined_axis(index + columns, intersect=True, sort=False) + if len(set(rownames)) != len(rownames): + raise ValueError("Duplicated index/row names not allowed") + if len(set(colnames)) != len(colnames): + raise ValueError("Duplicated column names not allowed") + + repeated_names = set(rownames).intersection(set(colnames)) + if repeated_names: + raise ValueError("Column and rows cannot share the same names. " + "Repeated names: {repeated_names}" + .format(repeated_names=", ".join(repeated_names))) + data = {} data.update(zip(rownames, index)) data.update(zip(colnames, columns)) From c5430abe1d7cb6eb93aca57b866dabfa4302c0b1 Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Sun, 9 Jun 2019 12:41:47 -0400 Subject: [PATCH 02/36] Updated test for duplicated names in crosstab --- pandas/tests/reshape/test_pivot.py | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 582084e3bfb5a..ee7a5dcd52f73 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2440,16 +2440,29 @@ def test_crosstab_with_numpy_size(self): ) tm.assert_frame_equal(result, expected) + def test_crosstab_dup_index_names(self): - # GH 13279 - s = pd.Series(range(3), name="foo") + s1 = pd.Series(range(3), name='foo') + s2 = s1 + 1 + + msg = "Column and rows cannot share the same names. Repeated names: foo" + with pytest.raises(ValueError, match=msg): + pd.crosstab(s1, s2) + with pytest.raises(ValueError, match=msg): + pd.crosstab(s1, [s2, s2.rename("bar")]) + + msg = "Duplicated column names not allowed" + with pytest.raises(ValueError, match=msg): + pd.crosstab(s1.rename("bar"), [s2, s2]) + with pytest.raises(ValueError, match=msg): + pd.crosstab(s1.rename("bar"), [s2.rename("one"), s2, s2]) + + msg = "Duplicated index/row names not allowed" + with pytest.raises(ValueError, match=msg): + pd.crosstab([s1, s1], s2.rename("bar")) + with pytest.raises(ValueError, match=msg): + pd.crosstab([s1, s1, s1.rename("one")], s2.rename("bar")) - result = pd.crosstab(s, s) - expected_index = pd.Index(range(3), name="foo") - expected = pd.DataFrame( - np.eye(3, dtype=np.int64), index=expected_index, columns=expected_index - ) - tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]]) def test_crosstab_tuple_name(self, names): From 5762bb68bffcdfdd1aabee9d309a52b2e1a05992 Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Sun, 9 Jun 2019 12:46:22 -0400 Subject: [PATCH 03/36] Flake8 compliance --- pandas/tests/reshape/test_pivot.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index ee7a5dcd52f73..aada1cf0d4062 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2445,7 +2445,8 @@ def test_crosstab_dup_index_names(self): s1 = pd.Series(range(3), name='foo') s2 = s1 + 1 - msg = "Column and rows cannot share the same names. Repeated names: foo" + msg = "Column and rows cannot share the same names. " \ + "Repeated names: foo" with pytest.raises(ValueError, match=msg): pd.crosstab(s1, s2) with pytest.raises(ValueError, match=msg): From 989bd5aba7f9bae541694a215cc6d2f032940647 Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Mon, 16 Sep 2019 21:19:03 -0400 Subject: [PATCH 04/36] Black formatting --- pandas/core/reshape/pivot.py | 9 ++++++--- pandas/tests/reshape/test_pivot.py | 9 ++++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 47cacedb57e06..38aa78dcd11c2 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -550,9 +550,12 @@ def crosstab( repeated_names = set(rownames).intersection(set(colnames)) if repeated_names: - raise ValueError("Column and rows cannot share the same names. " - "Repeated names: {repeated_names}" - .format(repeated_names=", ".join(repeated_names))) + raise ValueError( + "Column and rows cannot share the same names. " + "Repeated names: {repeated_names}".format( + repeated_names=", ".join(repeated_names) + ) + ) data = {} data.update(zip(rownames, index)) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index aada1cf0d4062..c859d57fd89a8 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2440,13 +2440,13 @@ def test_crosstab_with_numpy_size(self): ) tm.assert_frame_equal(result, expected) - def test_crosstab_dup_index_names(self): - s1 = pd.Series(range(3), name='foo') + # Duplicate names/index/rows in crosstab are not supported + + s1 = pd.Series(range(3), name="foo") s2 = s1 + 1 - msg = "Column and rows cannot share the same names. " \ - "Repeated names: foo" + msg = "Column and rows cannot share the same names. Repeated names: foo" with pytest.raises(ValueError, match=msg): pd.crosstab(s1, s2) with pytest.raises(ValueError, match=msg): @@ -2464,7 +2464,6 @@ def test_crosstab_dup_index_names(self): with pytest.raises(ValueError, match=msg): pd.crosstab([s1, s1, s1.rename("one")], s2.rename("bar")) - @pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]]) def test_crosstab_tuple_name(self, names): s1 = pd.Series(range(3), name=names[0]) From 672bd4b5d1a6bdd0780e9438728c1c72faea462f Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Thu, 19 Sep 2019 23:28:37 -0400 Subject: [PATCH 05/36] Creates name mapping to overcome duplicated columns issues in pd.crosstab() --- pandas/core/reshape/pivot.py | 96 +++++++++++++++++++++--------- pandas/tests/reshape/test_pivot.py | 53 ++++++++++++----- 2 files changed, 106 insertions(+), 43 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 38aa78dcd11c2..bfa46fb24d265 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -534,42 +534,38 @@ def crosstab( b 0 1 0 c 0 0 0 """ + if values is None and aggfunc is not None: + raise ValueError("aggfunc cannot be used without values.") + + if values is not None and aggfunc is None: + raise ValueError("values cannot be used without an aggfunc.") index = com.maybe_make_list(index) columns = com.maybe_make_list(columns) + common_idx = _get_objs_combined_axis(index + columns, intersect=True, sort=False) + rownames = _get_names(index, rownames, prefix="row") colnames = _get_names(columns, colnames, prefix="col") - common_idx = _get_objs_combined_axis(index + columns, intersect=True, sort=False) + # We create our own mapping of row and columns names + # to prevent issues with duplicate columns. GH Issue: #22529 + shared_col_row_names = set(rownames).intersection(set(colnames)) + row_names_mapper, unique_row_names = _build_names_mapper( + rownames, shared_col_row_names, "row" + ) + col_names_mapper, unique_col_names = _build_names_mapper( + colnames, shared_col_row_names, "col" + ) - if len(set(rownames)) != len(rownames): - raise ValueError("Duplicated index/row names not allowed") - if len(set(colnames)) != len(colnames): - raise ValueError("Duplicated column names not allowed") - - repeated_names = set(rownames).intersection(set(colnames)) - if repeated_names: - raise ValueError( - "Column and rows cannot share the same names. " - "Repeated names: {repeated_names}".format( - repeated_names=", ".join(repeated_names) - ) - ) + from pandas import DataFrame data = {} - data.update(zip(rownames, index)) - data.update(zip(colnames, columns)) - - if values is None and aggfunc is not None: - raise ValueError("aggfunc cannot be used without values.") - - if values is not None and aggfunc is None: - raise ValueError("values cannot be used without an aggfunc.") - - from pandas import DataFrame + data.update(zip(unique_row_names, index)) + data.update(zip(unique_col_names, columns)) df = DataFrame(data, index=common_idx) + if values is None: df["__dummy__"] = 0 kwargs = {"aggfunc": len, "fill_value": 0} @@ -579,12 +575,12 @@ def crosstab( table = df.pivot_table( "__dummy__", - index=rownames, - columns=colnames, + index=unique_row_names, + columns=unique_col_names, margins=margins, margins_name=margins_name, dropna=dropna, - **kwargs + **kwargs, ) # Post-process @@ -593,6 +589,11 @@ def crosstab( table, normalize=normalize, margins=margins, margins_name=margins_name ) + print(row_names_mapper) + print(col_names_mapper) + table = table.rename_axis(index=row_names_mapper, axis=0) + table = table.rename_axis(columns=col_names_mapper, axis=1) + return table @@ -692,3 +693,44 @@ def _get_names(arrs, names, prefix="row"): names = list(names) return names + + +def _get_duplicate_count(names_list): + seen_names = set() + duplicate_names = {} + for name in names_list: + if name not in seen_names: + duplicate_names[name] = 0 + seen_names.add(name) + else: + duplicate_names[name] += 1 + + return duplicate_names + + +def _build_names_mapper(names, shared_col_row_names, suffix): + + dup_names_count = _get_duplicate_count(names) + names_mapper = {} + unique_names = [] + + # We reserve the names so the number are in order of appearance + for name in reversed(names): + mapped_name = name + num_duplicates = dup_names_count[name] + + # Add a number if it is duplicated + if num_duplicates > 0: + mapped_name = f"{mapped_name}_{num_duplicates}" + dup_names_count[name] -= 1 + + # Add suffix if it is shared between column and rows + if name in shared_col_row_names: + mapped_name = f"{mapped_name}_{suffix}" + + names_mapper[mapped_name] = name + + # Creates a list of names in the original order + unique_names.insert(0, mapped_name) + + return names_mapper, unique_names diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index c859d57fd89a8..042f58aef05e1 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2441,28 +2441,49 @@ def test_crosstab_with_numpy_size(self): tm.assert_frame_equal(result, expected) def test_crosstab_dup_index_names(self): - # Duplicate names/index/rows in crosstab are not supported + # We test that duplicated column names do not produce issues + # GH Issue: #22529 + # Duplicate name shared between rows and columns s1 = pd.Series(range(3), name="foo") s2 = s1 + 1 + expected = pd.crosstab(s1, s2.rename("bar")) + result = pd.crosstab(s1, s2) + tm.assert_frame_equal( + result, expected.rename_axis(columns={"bar": "foo"}, axis=1) + ) + assert result.index.names == ["foo"] + assert result.columns.names == ["foo"] - msg = "Column and rows cannot share the same names. Repeated names: foo" - with pytest.raises(ValueError, match=msg): - pd.crosstab(s1, s2) - with pytest.raises(ValueError, match=msg): - pd.crosstab(s1, [s2, s2.rename("bar")]) + # Row names duplicated + s1 = pd.Series(range(3), name="foo") + s2 = s1 + 1 + s3 = pd.Series(range(3), name="bar_col") - msg = "Duplicated column names not allowed" - with pytest.raises(ValueError, match=msg): - pd.crosstab(s1.rename("bar"), [s2, s2]) - with pytest.raises(ValueError, match=msg): - pd.crosstab(s1.rename("bar"), [s2.rename("one"), s2, s2]) + expected = pd.crosstab([s1, s2.rename("bar")], s3) + result = pd.crosstab([s1, s2], s3) - msg = "Duplicated index/row names not allowed" - with pytest.raises(ValueError, match=msg): - pd.crosstab([s1, s1], s2.rename("bar")) - with pytest.raises(ValueError, match=msg): - pd.crosstab([s1, s1, s1.rename("one")], s2.rename("bar")) + tm.assert_frame_equal( + result, expected.rename_axis(index={"bar": "foo"}, axis=0) + ) + + assert result.index.names == ["foo", "foo"] + assert result.columns.names == ["bar_col"] + + # Column names duplicated + s1 = pd.Series(range(3), name="foo") + s2 = s1 + 1 + s3 = pd.Series(range(3), name="bar_row") + + expected = pd.crosstab(s3, [s1, s2.rename("bar")]) + result = pd.crosstab(s3, [s1, s2]) + + tm.assert_frame_equal( + result, expected.rename_axis(columns={"bar": "foo"}, axis=1) + ) + + assert result.index.names == ["bar_row"] + assert result.columns.names == ["foo", "foo"] @pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]]) def test_crosstab_tuple_name(self, names): From 1b67514ed3f502ea874dca4e5bab45765b4fa3da Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Thu, 19 Sep 2019 23:29:49 -0400 Subject: [PATCH 06/36] Removed debug prints --- pandas/core/reshape/pivot.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index bfa46fb24d265..5e25343b65419 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -589,8 +589,6 @@ def crosstab( table, normalize=normalize, margins=margins, margins_name=margins_name ) - print(row_names_mapper) - print(col_names_mapper) table = table.rename_axis(index=row_names_mapper, axis=0) table = table.rename_axis(columns=col_names_mapper, axis=1) From 4f2fa86100252e70f0e81f0fafe3c192e0af5f99 Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Thu, 19 Sep 2019 23:30:14 -0400 Subject: [PATCH 07/36] Removed empty line --- pandas/core/reshape/pivot.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 5e25343b65419..0728bc76ec5b4 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -707,7 +707,6 @@ def _get_duplicate_count(names_list): def _build_names_mapper(names, shared_col_row_names, suffix): - dup_names_count = _get_duplicate_count(names) names_mapper = {} unique_names = [] From fbff182b5dbc1edbfd40fba63595f70026f658cf Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Thu, 19 Sep 2019 23:31:20 -0400 Subject: [PATCH 08/36] Improved comments --- pandas/core/reshape/pivot.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 0728bc76ec5b4..bcf2be9c9b390 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -711,17 +711,17 @@ def _build_names_mapper(names, shared_col_row_names, suffix): names_mapper = {} unique_names = [] - # We reserve the names so the number are in order of appearance + # We reverse the names so the number are in order of appearance for name in reversed(names): mapped_name = name num_duplicates = dup_names_count[name] - # Add a number if it is duplicated + # Adds a number if name is duplicated within columns/rows if num_duplicates > 0: mapped_name = f"{mapped_name}_{num_duplicates}" dup_names_count[name] -= 1 - # Add suffix if it is shared between column and rows + # Add suffix name is shared between column and rows if name in shared_col_row_names: mapped_name = f"{mapped_name}_{suffix}" From 6071db52a3432e183b97bc18878066b9ec0775ba Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Fri, 20 Sep 2019 00:06:05 -0400 Subject: [PATCH 09/36] String manipulation compatible with <3.6 --- pandas/core/reshape/pivot.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index bcf2be9c9b390..32c5c427eb334 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -718,12 +718,16 @@ def _build_names_mapper(names, shared_col_row_names, suffix): # Adds a number if name is duplicated within columns/rows if num_duplicates > 0: - mapped_name = f"{mapped_name}_{num_duplicates}" + mapped_name = "{mapped_name}_{num_duplicates}".format( + mapped_name=mapped_name, num_duplicates=num_duplicates + ) dup_names_count[name] -= 1 # Add suffix name is shared between column and rows if name in shared_col_row_names: - mapped_name = f"{mapped_name}_{suffix}" + mapped_name = "{mapped_name}_{suffix}".format( + mapped_name=mapped_name, suffix=suffix + ) names_mapper[mapped_name] = name From 3d9c632238ef3a3467553e7be830069094628939 Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Sun, 22 Sep 2019 13:19:38 -0400 Subject: [PATCH 10/36] Replaced custom method with _value_counts_arraylike --- pandas/core/reshape/pivot.py | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 32c5c427eb334..2646d76aa322d 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -13,6 +13,7 @@ from pandas.core.reshape.concat import concat from pandas.core.reshape.util import cartesian_product from pandas.core.series import Series +from pandas.core.algorithms import _value_counts_arraylike # Note: We need to make sure `frame` is imported before `pivot`, otherwise @@ -693,35 +694,23 @@ def _get_names(arrs, names, prefix="row"): return names -def _get_duplicate_count(names_list): - seen_names = set() - duplicate_names = {} - for name in names_list: - if name not in seen_names: - duplicate_names[name] = 0 - seen_names.add(name) - else: - duplicate_names[name] += 1 - - return duplicate_names - - def _build_names_mapper(names, shared_col_row_names, suffix): - dup_names_count = _get_duplicate_count(names) + keys, counts = _value_counts_arraylike(names, dropna=False) + names_count = dict(zip(keys, counts)) + names_mapper = {} unique_names = [] - - # We reverse the names so the number are in order of appearance + # We reverse the names so the numbers are in the order given by the user for name in reversed(names): mapped_name = name - num_duplicates = dup_names_count[name] + name_count = names_count[name] # Adds a number if name is duplicated within columns/rows - if num_duplicates > 0: - mapped_name = "{mapped_name}_{num_duplicates}".format( - mapped_name=mapped_name, num_duplicates=num_duplicates + if name_count > 1: + mapped_name = "{mapped_name}_{name_count}".format( + mapped_name=mapped_name, name_count=name_count ) - dup_names_count[name] -= 1 + names_count[name] -= 1 # Add suffix name is shared between column and rows if name in shared_col_row_names: @@ -731,7 +720,7 @@ def _build_names_mapper(names, shared_col_row_names, suffix): names_mapper[mapped_name] = name - # Creates a list of names in the original order + # Creates a list of the new names in the original order unique_names.insert(0, mapped_name) return names_mapper, unique_names From d27233f5e8ea46b05e7a0373b5f982cba64cbc8e Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Tue, 24 Sep 2019 17:30:31 -0400 Subject: [PATCH 11/36] Resort imports --- pandas/core/reshape/pivot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 2646d76aa322d..dfc1ab771c06a 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -6,6 +6,7 @@ from pandas.core.dtypes.common import is_integer_dtype, is_list_like, is_scalar from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.algorithms import _value_counts_arraylike import pandas.core.common as com from pandas.core.frame import _shared_docs from pandas.core.groupby import Grouper @@ -13,7 +14,6 @@ from pandas.core.reshape.concat import concat from pandas.core.reshape.util import cartesian_product from pandas.core.series import Series -from pandas.core.algorithms import _value_counts_arraylike # Note: We need to make sure `frame` is imported before `pivot`, otherwise From 3e9333deb94a3ab1a94c868808497c4d24372f05 Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Mon, 18 Nov 2019 23:57:14 -0500 Subject: [PATCH 12/36] Added docstring and type hints to _build_names_mapper --- pandas/core/reshape/pivot.py | 26 +++++++++++++++++++++++++- pandas/tests/reshape/test_pivot.py | 2 +- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 2399fec928be0..6a8e641d08373 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -1,3 +1,5 @@ +from typing import List, Set, Dict, Tuple + import numpy as np from pandas.util._decorators import Appender, Substitution @@ -699,7 +701,29 @@ def _get_names(arrs, names, prefix="row"): return names -def _build_names_mapper(names, shared_col_row_names, suffix): +def _build_names_mapper( + names: List[str], shared_col_row_names: Set[str], suffix: str +) -> Tuple[Dict[str, str], List[str]]: + """ + Given a list of row or column names, creates a mapper of unique names to + column/row names. + + Parameters + ---------- + names : list + Names to be deduplicated. + shared_col_row_names : set or list + Values used both in rows and columns, so need additional deduplication. + suffix : str + Suffix to deduplicate values in shared_col_row_names + + Returns + ------- + names_mapper: dict + The keys are the unique names and the values are the original names. + unique_names: list + Unique names in the same order that names came in + """ keys, counts = _value_counts_arraylike(names, dropna=False) names_count = dict(zip(keys, counts)) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index ef86d398750ae..4c4f763ba79d7 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2487,7 +2487,7 @@ def test_crosstab_with_numpy_size(self): def test_crosstab_dup_index_names(self): # We test that duplicated column names do not produce issues - # GH Issue: #22529 + # GH Issue: #22529, GH#13279 # Duplicate name shared between rows and columns s1 = pd.Series(range(3), name="foo") From fe6f7fd131673ec598a8d319a32eaebf9cb824dc Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Tue, 19 Nov 2019 09:07:37 -0500 Subject: [PATCH 13/36] Sorted imports --- pandas/core/reshape/pivot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 6a8e641d08373..a969360223bfc 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -1,4 +1,4 @@ -from typing import List, Set, Dict, Tuple +from typing import Dict, List, Set, Tuple import numpy as np From 9f4a00ac4a5f530d432eae98c6b6eee3957d4bf6 Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Tue, 19 Nov 2019 09:38:22 -0500 Subject: [PATCH 14/36] Added types to unique_names --- pandas/core/reshape/pivot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index a969360223bfc..12fef6c5d3d07 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -728,7 +728,7 @@ def _build_names_mapper( names_count = dict(zip(keys, counts)) names_mapper = {} - unique_names = [] + unique_names: List[str] = [] # We reverse the names so the numbers are in the order given by the user for name in reversed(names): mapped_name = name From 788ac2b7c33479d421f4448da7aaae9559e21782 Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Wed, 20 Nov 2019 21:39:47 -0500 Subject: [PATCH 15/36] Updated tests --- pandas/core/reshape/pivot.py | 10 +++++----- pandas/tests/reshape/test_pivot.py | 24 ++++++++++++------------ 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 12fef6c5d3d07..af17afd8ed9c4 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -555,7 +555,7 @@ def crosstab( colnames = _get_names(columns, colnames, prefix="col") # We create our own mapping of row and columns names - # to prevent issues with duplicate columns. GH Issue: #22529 + # to prevent issues with duplicate columns/row names. GH Issue: #22529 shared_col_row_names = set(rownames).intersection(set(colnames)) row_names_mapper, unique_row_names = _build_names_mapper( rownames, shared_col_row_names, "row" @@ -566,10 +566,10 @@ def crosstab( from pandas import DataFrame - data = {} - data.update(zip(unique_row_names, index)) - data.update(zip(unique_col_names, columns)) - + data = { + **dict(zip(unique_row_names, index)), + **dict(zip(unique_col_names, columns)), + } df = DataFrame(data, index=common_idx) if values is None: diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 4c4f763ba79d7..dea52ec7a79bf 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2492,11 +2492,11 @@ def test_crosstab_dup_index_names(self): # Duplicate name shared between rows and columns s1 = pd.Series(range(3), name="foo") s2 = s1 + 1 - expected = pd.crosstab(s1, s2.rename("bar")) - result = pd.crosstab(s1, s2) - tm.assert_frame_equal( - result, expected.rename_axis(columns={"bar": "foo"}, axis=1) + expected = pd.crosstab(s1, s2.rename("bar")).rename_axis( + columns={"bar": "foo"}, axis=1 ) + result = pd.crosstab(s1, s2) + tm.assert_frame_equal(result, expected) assert result.index.names == ["foo"] assert result.columns.names == ["foo"] @@ -2505,12 +2505,12 @@ def test_crosstab_dup_index_names(self): s2 = s1 + 1 s3 = pd.Series(range(3), name="bar_col") - expected = pd.crosstab([s1, s2.rename("bar")], s3) + expected = pd.crosstab([s1, s2.rename("bar")], s3).rename_axis( + index={"bar": "foo"}, axis=0 + ) result = pd.crosstab([s1, s2], s3) - tm.assert_frame_equal( - result, expected.rename_axis(index={"bar": "foo"}, axis=0) - ) + tm.assert_frame_equal(result, expected) assert result.index.names == ["foo", "foo"] assert result.columns.names == ["bar_col"] @@ -2520,12 +2520,12 @@ def test_crosstab_dup_index_names(self): s2 = s1 + 1 s3 = pd.Series(range(3), name="bar_row") - expected = pd.crosstab(s3, [s1, s2.rename("bar")]) + expected = pd.crosstab(s3, [s1, s2.rename("bar")]).rename_axis( + columns={"bar": "foo"}, axis=1 + ) result = pd.crosstab(s3, [s1, s2]) - tm.assert_frame_equal( - result, expected.rename_axis(columns={"bar": "foo"}, axis=1) - ) + tm.assert_frame_equal(result, expected) assert result.index.names == ["bar_row"] assert result.columns.names == ["foo", "foo"] From d938a96e264efaeb24fe45dcd0ce59e7f7c935a2 Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Thu, 21 Nov 2019 09:14:29 -0500 Subject: [PATCH 16/36] Sorted imports --- pandas/core/reshape/pivot.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index d74878721cf3e..1b4f4629d02bd 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -1,21 +1,18 @@ -from typing import TYPE_CHECKING, Callable, Dict, List, Tuple, Set, Union +from typing import TYPE_CHECKING, Callable, Dict, List, Set, Tuple, Union import numpy as np - -from pandas.util._decorators import Appender, Substitution - +import pandas.core.common as com +from pandas.core.algorithms import _value_counts_arraylike from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import is_integer_dtype, is_list_like, is_scalar from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries - -from pandas.core.algorithms import _value_counts_arraylike -import pandas.core.common as com from pandas.core.frame import _shared_docs from pandas.core.groupby import Grouper from pandas.core.index import Index, MultiIndex, get_objs_combined_axis from pandas.core.reshape.concat import concat from pandas.core.reshape.util import cartesian_product from pandas.core.series import Series +from pandas.util._decorators import Appender, Substitution if TYPE_CHECKING: from pandas import DataFrame From d60308c12ac7cef4aa789aa5b98bd0b8a9f6121f Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Thu, 21 Nov 2019 17:10:32 -0500 Subject: [PATCH 17/36] Sorted imports --- pandas/core/reshape/pivot.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 1b4f4629d02bd..3239ff3794c5e 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -1,18 +1,21 @@ from typing import TYPE_CHECKING, Callable, Dict, List, Set, Tuple, Union import numpy as np -import pandas.core.common as com -from pandas.core.algorithms import _value_counts_arraylike + +from pandas.util._decorators import Appender, Substitution + from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import is_integer_dtype, is_list_like, is_scalar from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries + +from pandas.core.algorithms import _value_counts_arraylike +import pandas.core.common as com from pandas.core.frame import _shared_docs from pandas.core.groupby import Grouper from pandas.core.index import Index, MultiIndex, get_objs_combined_axis from pandas.core.reshape.concat import concat from pandas.core.reshape.util import cartesian_product from pandas.core.series import Series -from pandas.util._decorators import Appender, Substitution if TYPE_CHECKING: from pandas import DataFrame From cad2dea26166bc76da089e32221a9f276a2c02e5 Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Sat, 11 Apr 2020 11:33:12 -0400 Subject: [PATCH 18/36] Improved comment --- pandas/core/reshape/pivot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 997c140243f2d..f162bade41eca 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -777,7 +777,7 @@ def _build_names_mapper( ) names_count[name] -= 1 - # Add suffix name is shared between column and rows + # Add suffix name if column is shared between column and rows if name in shared_col_row_names: mapped_name = "{mapped_name}_{suffix}".format( mapped_name=mapped_name, suffix=suffix From f0261e97d01b5a3ed44616bf251f23bd18abfff7 Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Sat, 11 Apr 2020 13:45:00 -0400 Subject: [PATCH 19/36] Updated tests --- pandas/tests/reshape/test_crosstab.py | 46 +++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 8795af2e11122..6244051ad5f9e 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -534,15 +534,49 @@ def test_crosstab_with_numpy_size(self): tm.assert_frame_equal(result, expected) def test_crosstab_dup_index_names(self): - # GH 13279 - s = Series(range(3), name="foo") + # We test that duplicated column names do not produce issues + # GH Issue: #22529, GH#13279 - result = crosstab(s, s) - expected_index = Index(range(3), name="foo") - expected = DataFrame( - np.eye(3, dtype=np.int64), index=expected_index, columns=expected_index + # Duplicate name shared between rows and columns + s1 = pd.Series(range(3), name="foo") + s2 = s1 + 1 + expected = pd.crosstab(s1, s2.rename("bar")).rename_axis( + columns={"bar": "foo"}, axis=1 ) + result = pd.crosstab(s1, s2) tm.assert_frame_equal(result, expected) + assert result.index.names == ["foo"] + assert result.columns.names == ["foo"] + + # Row names duplicated + s1 = pd.Series(range(3), name="foo") + s2 = s1 + 1 + s3 = pd.Series(range(3), name="bar_col") + + expected = pd.crosstab([s1, s2.rename("bar")], s3).rename_axis( + index={"bar": "foo"}, axis=0 + ) + result = pd.crosstab([s1, s2], s3) + + tm.assert_frame_equal(result, expected) + + assert result.index.names == ["foo", "foo"] + assert result.columns.names == ["bar_col"] + + # Column names duplicated + s1 = pd.Series(range(3), name="foo") + s2 = s1 + 1 + s3 = pd.Series(range(3), name="bar_row") + + expected = pd.crosstab(s3, [s1, s2.rename("bar")]).rename_axis( + columns={"bar": "foo"}, axis=1 + ) + result = pd.crosstab(s3, [s1, s2]) + + tm.assert_frame_equal(result, expected) + + assert result.index.names == ["bar_row"] + assert result.columns.names == ["foo", "foo"] @pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]]) def test_crosstab_tuple_name(self, names): From 2733c17d29960be7a0e00886adf650bfd3a8694b Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Sat, 11 Apr 2020 13:46:37 -0400 Subject: [PATCH 20/36] Removed pd. reference in tests --- pandas/tests/reshape/test_crosstab.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 6244051ad5f9e..129dc1f8e09ff 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -538,9 +538,9 @@ def test_crosstab_dup_index_names(self): # GH Issue: #22529, GH#13279 # Duplicate name shared between rows and columns - s1 = pd.Series(range(3), name="foo") + s1 = Series(range(3), name="foo") s2 = s1 + 1 - expected = pd.crosstab(s1, s2.rename("bar")).rename_axis( + expected = crosstab(s1, s2.rename("bar")).rename_axis( columns={"bar": "foo"}, axis=1 ) result = pd.crosstab(s1, s2) @@ -549,14 +549,14 @@ def test_crosstab_dup_index_names(self): assert result.columns.names == ["foo"] # Row names duplicated - s1 = pd.Series(range(3), name="foo") + s1 = Series(range(3), name="foo") s2 = s1 + 1 - s3 = pd.Series(range(3), name="bar_col") + s3 = Series(range(3), name="bar_col") - expected = pd.crosstab([s1, s2.rename("bar")], s3).rename_axis( + expected = crosstab([s1, s2.rename("bar")], s3).rename_axis( index={"bar": "foo"}, axis=0 ) - result = pd.crosstab([s1, s2], s3) + result = crosstab([s1, s2], s3) tm.assert_frame_equal(result, expected) @@ -564,14 +564,14 @@ def test_crosstab_dup_index_names(self): assert result.columns.names == ["bar_col"] # Column names duplicated - s1 = pd.Series(range(3), name="foo") + s1 = Series(range(3), name="foo") s2 = s1 + 1 - s3 = pd.Series(range(3), name="bar_row") + s3 = Series(range(3), name="bar_row") - expected = pd.crosstab(s3, [s1, s2.rename("bar")]).rename_axis( + expected = crosstab(s3, [s1, s2.rename("bar")]).rename_axis( columns={"bar": "foo"}, axis=1 ) - result = pd.crosstab(s3, [s1, s2]) + result = crosstab(s3, [s1, s2]) tm.assert_frame_equal(result, expected) From f1b19a643cfebcd7751d046bbfa6ca14dde31b7d Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Sat, 11 Apr 2020 13:46:56 -0400 Subject: [PATCH 21/36] Removed pd. reference in tests --- pandas/tests/reshape/test_crosstab.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 129dc1f8e09ff..b0298b910d206 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -543,7 +543,7 @@ def test_crosstab_dup_index_names(self): expected = crosstab(s1, s2.rename("bar")).rename_axis( columns={"bar": "foo"}, axis=1 ) - result = pd.crosstab(s1, s2) + result = crosstab(s1, s2) tm.assert_frame_equal(result, expected) assert result.index.names == ["foo"] assert result.columns.names == ["foo"] From 85e010bffd2eefe2625e9f1c64091ea94ba54d85 Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Fri, 29 May 2020 19:27:05 -0400 Subject: [PATCH 22/36] Added Set import from typing --- pandas/core/reshape/pivot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 5e1141e06ae2d..718db61db3698 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -5,6 +5,7 @@ List, Optional, Sequence, + Set, Tuple, Union, cast, From 3ed74d994991b965a61a2ea5d9983941a2ae44be Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Sun, 12 Jul 2020 23:23:57 -0400 Subject: [PATCH 23/36] Renamed duplicated row/col names test --- pandas/tests/reshape/test_crosstab.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index b0298b910d206..2e21a55d97493 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -533,11 +533,11 @@ def test_crosstab_with_numpy_size(self): ) tm.assert_frame_equal(result, expected) - def test_crosstab_dup_index_names(self): - # We test that duplicated column names do not produce issues + def test_crosstab_duplicated_row_and_col_names(self): + # We test that duplicated row or column names do not produce issues # GH Issue: #22529, GH#13279 - # Duplicate name shared between rows and columns + # Same name in both rows and columns s1 = Series(range(3), name="foo") s2 = s1 + 1 expected = crosstab(s1, s2.rename("bar")).rename_axis( From e5c6cd953223548e8d7b537ac1e963d578f978de Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Sun, 12 Jul 2020 23:31:15 -0400 Subject: [PATCH 24/36] Added what's new entry --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5f93e08d51baa..b92ae2161263d 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1124,6 +1124,7 @@ Reshaping - Bug in :meth:`Series.where` with an empty Series and empty ``cond`` having non-bool dtype (:issue:`34592`) - Fixed regression where :meth:`DataFrame.apply` would raise ``ValueError`` for elements whth ``S`` dtype (:issue:`34529`) - Bug in :meth:`DataFrame.append` leading to sorting columns even when ``sort=False`` is specified (:issue:`35092`) +- Bug in :meth:`DataFrame.crosstab` when duplicated row or column names were used (:issue:`22529`) Sparse ^^^^^^ From dc1d5c5ddb0df20c375dc005d314b87bbc0e0abd Mon Sep 17 00:00:00 2001 From: Fernando Irarrazaval Date: Fri, 11 Sep 2020 13:10:42 -0400 Subject: [PATCH 25/36] Updated reference to value_counts_arraylike --- pandas/core/reshape/pivot.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 8beb5e9503b51..fa34258f89439 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -20,7 +20,7 @@ from pandas.core.dtypes.common import is_integer_dtype, is_list_like, is_scalar from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -from pandas.core.algorithms import _value_counts_arraylike +import pandas.core.algorithms as algos import pandas.core.common as com from pandas.core.frame import _shared_docs from pandas.core.groupby import Grouper @@ -773,7 +773,7 @@ def _build_names_mapper( unique_names: list Unique names in the same order that names came in """ - keys, counts = _value_counts_arraylike(names, dropna=False) + keys, counts = algos.value_counts_arraylike(names, dropna=False) names_count = dict(zip(keys, counts)) names_mapper = {} From 86c6ff1005ccd4fa7d017a5b37607fbe9f69f55b Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 22 Nov 2020 06:45:26 +0000 Subject: [PATCH 26/36] consolidate test --- pandas/tests/reshape/test_crosstab.py | 44 +++++++-------------------- 1 file changed, 11 insertions(+), 33 deletions(-) diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 1896d392b31a1..50b30cf8a73ac 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -536,50 +536,28 @@ def test_crosstab_with_numpy_size(self): tm.assert_frame_equal(result, expected) def test_crosstab_duplicated_row_and_col_names(self): - # We test that duplicated row or column names do not produce issues - # GH Issue: #22529, GH#13279 + # GH22529 - # Same name in both rows and columns s1 = Series(range(3), name="foo") - s2 = s1 + 1 - expected = crosstab(s1, s2.rename("bar")).rename_axis( - columns={"bar": "foo"}, axis=1 - ) - result = crosstab(s1, s2) - tm.assert_frame_equal(result, expected) - assert result.index.names == ["foo"] - assert result.columns.names == ["foo"] - - # Row names duplicated - s1 = Series(range(3), name="foo") - s2 = s1 + 1 - s3 = Series(range(3), name="bar_col") - - expected = crosstab([s1, s2.rename("bar")], s3).rename_axis( - index={"bar": "foo"}, axis=0 - ) - result = crosstab([s1, s2], s3) + s2_foo = Series(range(1, 4), name="foo") + s2_bar = Series(range(1, 4), name="bar") + s3 = Series(range(3), name="waldo") + result = crosstab(s1, s2_foo) + expected = crosstab(s1, s2_bar).rename_axis(columns={"bar": "foo"}, axis=1) tm.assert_frame_equal(result, expected) - assert result.index.names == ["foo", "foo"] - assert result.columns.names == ["bar_col"] - - # Column names duplicated - s1 = Series(range(3), name="foo") - s2 = s1 + 1 - s3 = Series(range(3), name="bar_row") + result = crosstab([s1, s2_foo], s3) + expected = crosstab([s1, s2_bar], s3).rename_axis(index={"bar": "foo"}, axis=0) + tm.assert_frame_equal(result, expected) - expected = crosstab(s3, [s1, s2.rename("bar")]).rename_axis( + result = crosstab(s3, [s1, s2_foo]) + expected = crosstab(s3, [s1, s2_bar]).rename_axis( columns={"bar": "foo"}, axis=1 ) - result = crosstab(s3, [s1, s2]) tm.assert_frame_equal(result, expected) - assert result.index.names == ["bar_row"] - assert result.columns.names == ["foo", "foo"] - @pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]]) def test_crosstab_tuple_name(self, names): s1 = Series(range(3), name=names[0]) From bbf77575f7232d10b5ff9f9ef549e1faf728fb26 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 22 Nov 2020 06:51:57 +0000 Subject: [PATCH 27/36] comments + cleanup --- pandas/tests/reshape/test_crosstab.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 50b30cf8a73ac..6faf64789c687 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -535,26 +535,31 @@ def test_crosstab_with_numpy_size(self): ) tm.assert_frame_equal(result, expected) - def test_crosstab_duplicated_row_and_col_names(self): - # GH22529 + def test_crosstab_duplicate_names(self): + # GH 13279 / 22529 s1 = Series(range(3), name="foo") s2_foo = Series(range(1, 4), name="foo") s2_bar = Series(range(1, 4), name="bar") s3 = Series(range(3), name="waldo") + # check result computed with duplicate labels against + # result computed with unique labels, then relabelled + mapper = {"bar": "foo"} + + # duplicate row, column labels result = crosstab(s1, s2_foo) - expected = crosstab(s1, s2_bar).rename_axis(columns={"bar": "foo"}, axis=1) + expected = crosstab(s1, s2_bar).rename_axis(columns=mapper, axis=1) tm.assert_frame_equal(result, expected) + # duplicate row, unique column labels result = crosstab([s1, s2_foo], s3) - expected = crosstab([s1, s2_bar], s3).rename_axis(index={"bar": "foo"}, axis=0) + expected = crosstab([s1, s2_bar], s3).rename_axis(index=mapper, axis=0) tm.assert_frame_equal(result, expected) + # unique row, duplicate column labels result = crosstab(s3, [s1, s2_foo]) - expected = crosstab(s3, [s1, s2_bar]).rename_axis( - columns={"bar": "foo"}, axis=1 - ) + expected = crosstab(s3, [s1, s2_bar]).rename_axis(columns=mapper, axis=1) tm.assert_frame_equal(result, expected) From 00f6ca30fcf8ee913dd8c97d7d56a2b0347c0d36 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 22 Nov 2020 17:00:07 +0000 Subject: [PATCH 28/36] rewrite _build_names_mapper for rownames, colnames args --- pandas/core/reshape/pivot.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 4b77d675e3f3a..19e05d2f3e42b 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -5,7 +5,6 @@ List, Optional, Sequence, - Set, Tuple, Union, cast, @@ -597,15 +596,9 @@ def crosstab( rownames = _get_names(index, rownames, prefix="row") colnames = _get_names(columns, colnames, prefix="col") - # We create our own mapping of row and columns names - # to prevent issues with duplicate columns/row names. GH Issue: #22529 - shared_col_row_names = set(rownames).intersection(set(colnames)) - row_names_mapper, unique_row_names = _build_names_mapper( - rownames, shared_col_row_names, "row" - ) - col_names_mapper, unique_col_names = _build_names_mapper( - colnames, shared_col_row_names, "col" - ) + # duplicate names mapped to unique names for pivot op + row_names_mapper, unique_row_names = _build_names_mapper(rownames, colnames, "row") + col_names_mapper, unique_col_names = _build_names_mapper(rownames, colnames, "col") from pandas import DataFrame @@ -749,7 +742,7 @@ def _get_names(arrs, names, prefix: str = "row"): def _build_names_mapper( - names: List[str], shared_col_row_names: Set[str], suffix: str + rownames: List[str], colnames: List[str], suffix: str ) -> Tuple[Dict[str, str], List[str]]: """ Given a list of row or column names, creates a mapper of unique names to @@ -771,6 +764,8 @@ def _build_names_mapper( unique_names: list Unique names in the same order that names came in """ + shared_col_row_names = set(rownames).intersection(set(colnames)) + names = rownames if suffix == "row" else colnames keys, counts = algos.value_counts_arraylike(names, dropna=False) names_count = dict(zip(keys, counts)) From 2a414916ed5566c3279ec0790e935022bf0cec31 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 22 Nov 2020 17:52:45 +0000 Subject: [PATCH 29/36] simplify _build_names_mapper --- pandas/core/reshape/pivot.py | 84 ++++++++++++++---------------------- 1 file changed, 33 insertions(+), 51 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 19e05d2f3e42b..ce8ba5bbed1b2 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -19,7 +19,6 @@ from pandas.core.dtypes.common import is_integer_dtype, is_list_like, is_scalar from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -import pandas.core.algorithms as algos import pandas.core.common as com from pandas.core.frame import _shared_docs from pandas.core.groupby import Grouper @@ -597,8 +596,12 @@ def crosstab( colnames = _get_names(columns, colnames, prefix="col") # duplicate names mapped to unique names for pivot op - row_names_mapper, unique_row_names = _build_names_mapper(rownames, colnames, "row") - col_names_mapper, unique_col_names = _build_names_mapper(rownames, colnames, "col") + ( + row_names_mapper, + unique_row_names, + col_names_mapper, + unique_col_names, + ) = _build_names_mapper(rownames, colnames) from pandas import DataFrame @@ -742,56 +745,35 @@ def _get_names(arrs, names, prefix: str = "row"): def _build_names_mapper( - rownames: List[str], colnames: List[str], suffix: str + rownames: List[str], colnames: List[str] ) -> Tuple[Dict[str, str], List[str]]: - """ - Given a list of row or column names, creates a mapper of unique names to - column/row names. - - Parameters - ---------- - names : list - Names to be deduplicated. - shared_col_row_names : set or list - Values used both in rows and columns, so need additional deduplication. - suffix : str - Suffix to deduplicate values in shared_col_row_names + def get_duplicates(names): + seen = set() + return {name for name in names if name not in seen and not seen.add(name)} - Returns - ------- - names_mapper: dict - The keys are the unique names and the values are the original names. - unique_names: list - Unique names in the same order that names came in - """ shared_col_row_names = set(rownames).intersection(set(colnames)) - names = rownames if suffix == "row" else colnames - keys, counts = algos.value_counts_arraylike(names, dropna=False) - names_count = dict(zip(keys, counts)) - - names_mapper = {} - unique_names: List[str] = [] - # We reverse the names so the numbers are in the order given by the user - for name in reversed(names): - mapped_name = name - name_count = names_count[name] - - # Adds a number if name is duplicated within columns/rows - if name_count > 1: - mapped_name = "{mapped_name}_{name_count}".format( - mapped_name=mapped_name, name_count=name_count - ) - names_count[name] -= 1 - - # Add suffix name if column is shared between column and rows - if name in shared_col_row_names: - mapped_name = "{mapped_name}_{suffix}".format( - mapped_name=mapped_name, suffix=suffix - ) - - names_mapper[mapped_name] = name + duplicate_names = ( + shared_col_row_names | get_duplicates(rownames) | get_duplicates(colnames) + ) - # Creates a list of the new names in the original order - unique_names.insert(0, mapped_name) + rownames_mapper = { + f"row_{i}": rowname + for i, rowname in enumerate(rownames) + if rowname in duplicate_names + } + unique_rownames = [ + f"row_{i}" if rowname in duplicate_names else rowname + for i, rowname in enumerate(rownames) + ] + + colnames_mapper = { + f"col_{i}": colname + for i, colname in enumerate(colnames) + if colname in duplicate_names + } + unique_colnames = [ + f"col_{i}" if colname in duplicate_names else colname + for i, colname in enumerate(colnames) + ] - return names_mapper, unique_names + return rownames_mapper, unique_rownames, colnames_mapper, unique_colnames From 975af3e6e24266f7ba19416727531f82fe9f7d0b Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 22 Nov 2020 17:59:54 +0000 Subject: [PATCH 30/36] rename row_names->rownames, col_names->colnames for consistency --- pandas/core/reshape/pivot.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index ce8ba5bbed1b2..97ace103fd97e 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -597,17 +597,17 @@ def crosstab( # duplicate names mapped to unique names for pivot op ( - row_names_mapper, - unique_row_names, - col_names_mapper, - unique_col_names, + rownames_mapper, + unique_rownames, + colnames_mapper, + unique_colnames, ) = _build_names_mapper(rownames, colnames) from pandas import DataFrame data = { - **dict(zip(unique_row_names, index)), - **dict(zip(unique_col_names, columns)), + **dict(zip(unique_rownames, index)), + **dict(zip(unique_colnames, columns)), } df = DataFrame(data, index=common_idx) original_df_cols = df.columns @@ -621,8 +621,8 @@ def crosstab( table = df.pivot_table( ["__dummy__"], - index=unique_row_names, - columns=unique_col_names, + index=unique_rownames, + columns=unique_colnames, margins=margins, margins_name=margins_name, dropna=dropna, @@ -641,8 +641,8 @@ def crosstab( table, normalize=normalize, margins=margins, margins_name=margins_name ) - table = table.rename_axis(index=row_names_mapper, axis=0) - table = table.rename_axis(columns=col_names_mapper, axis=1) + table = table.rename_axis(index=rownames_mapper, axis=0) + table = table.rename_axis(columns=colnames_mapper, axis=1) return table From 33f76c8e1e365b3d52a4b6ac4a2ba786b66aa1b6 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 22 Nov 2020 18:04:43 +0000 Subject: [PATCH 31/36] rewrite whatsnew --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 54c9964b54918..a7854b66475f5 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -712,7 +712,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ -- Bug in :meth:`DataFrame.crosstab` when duplicated row or column names were used (:issue:`22529`) +- Bug in :meth:`DataFrame.crosstab` was returning incorrect results on inputs with duplicate row names, duplicate column names or duplicate names between row and column labels (:issue:`22529`) - Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'`` or ``aggfunc='sum'`` returning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`31422`) - Bug in :func:`union_indexes` where input index names are not preserved in some cases. Affects :func:`concat` and :class:`DataFrame` constructor (:issue:`13475`) - Bug in func :meth:`crosstab` when using multiple columns with ``margins=True`` and ``normalize=True`` (:issue:`35144`) From c9a11dd9de08dee4ff6b2befd477c52a857c179a Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 22 Nov 2020 18:12:07 +0000 Subject: [PATCH 32/36] rename vars in _build_names_mapper --- pandas/core/reshape/pivot.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 97ace103fd97e..c884877305f0f 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -747,33 +747,25 @@ def _get_names(arrs, names, prefix: str = "row"): def _build_names_mapper( rownames: List[str], colnames: List[str] ) -> Tuple[Dict[str, str], List[str]]: - def get_duplicates(names): + def duplicates(names): seen = set() return {name for name in names if name not in seen and not seen.add(name)} - shared_col_row_names = set(rownames).intersection(set(colnames)) - duplicate_names = ( - shared_col_row_names | get_duplicates(rownames) | get_duplicates(colnames) - ) + shared_names = set(rownames).intersection(set(colnames)) + duplicates = duplicates(rownames) | duplicates(colnames) | shared_names rownames_mapper = { - f"row_{i}": rowname - for i, rowname in enumerate(rownames) - if rowname in duplicate_names + f"row_{i}": name for i, name in enumerate(rownames) if name in duplicates } unique_rownames = [ - f"row_{i}" if rowname in duplicate_names else rowname - for i, rowname in enumerate(rownames) + f"row_{i}" if name in duplicates else name for i, name in enumerate(rownames) ] colnames_mapper = { - f"col_{i}": colname - for i, colname in enumerate(colnames) - if colname in duplicate_names + f"col_{i}": name for i, name in enumerate(colnames) if name in duplicates } unique_colnames = [ - f"col_{i}" if colname in duplicate_names else colname - for i, colname in enumerate(colnames) + f"col_{i}" if name in duplicates else name for i, name in enumerate(colnames) ] return rownames_mapper, unique_rownames, colnames_mapper, unique_colnames From a8498844cbeec6602a368b99fa2c401e5a2758a9 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 22 Nov 2020 19:27:52 +0000 Subject: [PATCH 33/36] typing --- pandas/core/reshape/pivot.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index c884877305f0f..1eaf48ebb1de6 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -5,6 +5,7 @@ List, Optional, Sequence, + Set, Tuple, Union, cast, @@ -746,10 +747,10 @@ def _get_names(arrs, names, prefix: str = "row"): def _build_names_mapper( rownames: List[str], colnames: List[str] -) -> Tuple[Dict[str, str], List[str]]: +) -> Tuple[Dict[str, str], List[str], Dict[str, str], List[str]]: def duplicates(names): - seen = set() - return {name for name in names if name not in seen and not seen.add(name)} + seen: Set = set() + return {name for name in names if name not in seen} shared_names = set(rownames).intersection(set(colnames)) duplicates = duplicates(rownames) | duplicates(colnames) | shared_names From 0c44679b8720b11e409c454375a83601c6cc07cf Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 22 Nov 2020 19:55:58 +0000 Subject: [PATCH 34/36] more typing --- pandas/core/reshape/pivot.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 1eaf48ebb1de6..860eccd859590 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -748,25 +748,25 @@ def _get_names(arrs, names, prefix: str = "row"): def _build_names_mapper( rownames: List[str], colnames: List[str] ) -> Tuple[Dict[str, str], List[str], Dict[str, str], List[str]]: - def duplicates(names): + def get_duplicates(names): seen: Set = set() return {name for name in names if name not in seen} shared_names = set(rownames).intersection(set(colnames)) - duplicates = duplicates(rownames) | duplicates(colnames) | shared_names + dup_names = get_duplicates(rownames) | get_duplicates(colnames) | shared_names rownames_mapper = { - f"row_{i}": name for i, name in enumerate(rownames) if name in duplicates + f"row_{i}": name for i, name in enumerate(rownames) if name in dup_names } unique_rownames = [ - f"row_{i}" if name in duplicates else name for i, name in enumerate(rownames) + f"row_{i}" if name in dup_names else name for i, name in enumerate(rownames) ] colnames_mapper = { - f"col_{i}": name for i, name in enumerate(colnames) if name in duplicates + f"col_{i}": name for i, name in enumerate(colnames) if name in dup_names } unique_colnames = [ - f"col_{i}" if name in duplicates else name for i, name in enumerate(colnames) + f"col_{i}" if name in dup_names else name for i, name in enumerate(colnames) ] return rownames_mapper, unique_rownames, colnames_mapper, unique_colnames From e42011a3041cb94b59dbd2733ef87d1aca74af29 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Thu, 26 Nov 2020 22:52:53 -0500 Subject: [PATCH 35/36] review comment: add docstring --- pandas/core/reshape/pivot.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 860eccd859590..f84b0ef1c4932 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -748,6 +748,33 @@ def _get_names(arrs, names, prefix: str = "row"): def _build_names_mapper( rownames: List[str], colnames: List[str] ) -> Tuple[Dict[str, str], List[str], Dict[str, str], List[str]]: + """ + Given the names of a DataFrame's rows and columns, returns a set of unique row + and column names and mappers that convert to original names. + + A row or column name is replaced if it is duplicate among the rows of the inputs, + among the columns of the inputs or between the rows and the columns. + + Paramters + --------- + rownames: list[str] + colnames: list[str] + + Returns + ------- + Tuple(Dict[str, str], List[str], Dict[str, str], List[str]) + + rownames_mapper: + a dictionary with new row names as keys and original rownames as values + unique_rownames: + a list of rownames with duplicate names replaced by dummy names + colnames_mapper: + a dictionary with new column names as keys and original column names as values + unique_colnames: + a list of column names with duplicate names replaced by dummy names + + """ + def get_duplicates(names): seen: Set = set() return {name for name in names if name not in seen} From 5ca044771d0a718a61e5a06309cdd3a3cc3ebe5d Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Thu, 26 Nov 2020 22:53:58 -0500 Subject: [PATCH 36/36] more on docstring --- pandas/core/reshape/pivot.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index f84b0ef1c4932..22887cede51ed 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -764,13 +764,13 @@ def _build_names_mapper( ------- Tuple(Dict[str, str], List[str], Dict[str, str], List[str]) - rownames_mapper: + rownames_mapper: dict[str, str] a dictionary with new row names as keys and original rownames as values - unique_rownames: + unique_rownames: list[str] a list of rownames with duplicate names replaced by dummy names - colnames_mapper: + colnames_mapper: dict[str, str] a dictionary with new column names as keys and original column names as values - unique_colnames: + unique_colnames: list[str] a list of column names with duplicate names replaced by dummy names """