diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 049ccc0e6c4df..b16888338cda5 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -729,6 +729,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ +- Bug in :meth:`DataFrame.crosstab` was returning incorrect results on inputs with duplicate row names, duplicate column names or duplicate names between row and column labels (:issue:`22529`) - Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'`` or ``aggfunc='sum'`` returning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`31422`) - Bug in :func:`concat` and :class:`DataFrame` constructor where input index names are not preserved in some cases (:issue:`13475`) - Bug in func :meth:`crosstab` when using multiple columns with ``margins=True`` and ``normalize=True`` (:issue:`35144`) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index c1198cdfcda81..22887cede51ed 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -5,6 +5,7 @@ List, Optional, Sequence, + Set, Tuple, Union, cast, @@ -578,29 +579,37 @@ def crosstab( b 0 1 0 c 0 0 0 """ + if values is None and aggfunc is not None: + raise ValueError("aggfunc cannot be used without values.") + + if values is not None and aggfunc is None: + raise ValueError("values cannot be used without an aggfunc.") + index = com.maybe_make_list(index) columns = com.maybe_make_list(columns) - rownames = _get_names(index, rownames, prefix="row") - colnames = _get_names(columns, colnames, prefix="col") - common_idx = None pass_objs = [x for x in index + columns if isinstance(x, (ABCSeries, ABCDataFrame))] if pass_objs: common_idx = get_objs_combined_axis(pass_objs, intersect=True, sort=False) - data: Dict = {} - data.update(zip(rownames, index)) - data.update(zip(colnames, columns)) - - if values is None and aggfunc is not None: - raise ValueError("aggfunc cannot be used without values.") + rownames = _get_names(index, rownames, prefix="row") + colnames = _get_names(columns, colnames, prefix="col") - if values is not None and aggfunc is None: - raise ValueError("values cannot be used without an aggfunc.") + # duplicate names mapped to unique names for pivot op + ( + rownames_mapper, + unique_rownames, + colnames_mapper, + unique_colnames, + ) = _build_names_mapper(rownames, colnames) from pandas import DataFrame + data = { + **dict(zip(unique_rownames, index)), + **dict(zip(unique_colnames, columns)), + } df = DataFrame(data, index=common_idx) original_df_cols = df.columns @@ -613,8 +622,8 @@ def crosstab( table = df.pivot_table( ["__dummy__"], - index=rownames, - columns=colnames, + index=unique_rownames, + columns=unique_colnames, margins=margins, margins_name=margins_name, dropna=dropna, @@ -633,6 +642,9 @@ def crosstab( table, normalize=normalize, margins=margins, margins_name=margins_name ) + table = table.rename_axis(index=rownames_mapper, axis=0) + table = table.rename_axis(columns=colnames_mapper, axis=1) + return table @@ -731,3 +743,57 @@ def _get_names(arrs, names, prefix: str = "row"): names = list(names) return names + + +def _build_names_mapper( + rownames: List[str], colnames: List[str] +) -> Tuple[Dict[str, str], List[str], Dict[str, str], List[str]]: + """ + Given the names of a DataFrame's rows and columns, returns a set of unique row + and column names and mappers that convert to original names. + + A row or column name is replaced if it is duplicate among the rows of the inputs, + among the columns of the inputs or between the rows and the columns. + + Paramters + --------- + rownames: list[str] + colnames: list[str] + + Returns + ------- + Tuple(Dict[str, str], List[str], Dict[str, str], List[str]) + + rownames_mapper: dict[str, str] + a dictionary with new row names as keys and original rownames as values + unique_rownames: list[str] + a list of rownames with duplicate names replaced by dummy names + colnames_mapper: dict[str, str] + a dictionary with new column names as keys and original column names as values + unique_colnames: list[str] + a list of column names with duplicate names replaced by dummy names + + """ + + def get_duplicates(names): + seen: Set = set() + return {name for name in names if name not in seen} + + shared_names = set(rownames).intersection(set(colnames)) + dup_names = get_duplicates(rownames) | get_duplicates(colnames) | shared_names + + rownames_mapper = { + f"row_{i}": name for i, name in enumerate(rownames) if name in dup_names + } + unique_rownames = [ + f"row_{i}" if name in dup_names else name for i, name in enumerate(rownames) + ] + + colnames_mapper = { + f"col_{i}": name for i, name in enumerate(colnames) if name in dup_names + } + unique_colnames = [ + f"col_{i}" if name in dup_names else name for i, name in enumerate(colnames) + ] + + return rownames_mapper, unique_rownames, colnames_mapper, unique_colnames diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 5f6037276b31c..6faf64789c687 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -535,15 +535,32 @@ def test_crosstab_with_numpy_size(self): ) tm.assert_frame_equal(result, expected) - def test_crosstab_dup_index_names(self): - # GH 13279 - s = Series(range(3), name="foo") + def test_crosstab_duplicate_names(self): + # GH 13279 / 22529 + + s1 = Series(range(3), name="foo") + s2_foo = Series(range(1, 4), name="foo") + s2_bar = Series(range(1, 4), name="bar") + s3 = Series(range(3), name="waldo") + + # check result computed with duplicate labels against + # result computed with unique labels, then relabelled + mapper = {"bar": "foo"} + + # duplicate row, column labels + result = crosstab(s1, s2_foo) + expected = crosstab(s1, s2_bar).rename_axis(columns=mapper, axis=1) + tm.assert_frame_equal(result, expected) + + # duplicate row, unique column labels + result = crosstab([s1, s2_foo], s3) + expected = crosstab([s1, s2_bar], s3).rename_axis(index=mapper, axis=0) + tm.assert_frame_equal(result, expected) + + # unique row, duplicate column labels + result = crosstab(s3, [s1, s2_foo]) + expected = crosstab(s3, [s1, s2_bar]).rename_axis(columns=mapper, axis=1) - result = crosstab(s, s) - expected_index = Index(range(3), name="foo") - expected = DataFrame( - np.eye(3, dtype=np.int64), index=expected_index, columns=expected_index - ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]])