diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 86fc47dee09fc..6ffafe2fae9c7 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -509,6 +509,7 @@ Reshaping - Bug in :func:`DataFrame.merge` in which merging using ``Index`` objects as vectors raised an Exception (:issue:`19038`) - Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`) - Bug in timezone comparisons, manifesting as a conversion of the index to UTC in ``.concat()`` (:issue:`18523`) +- Bug in :func:`crosstab` where performing a crosstab operation on two ``Series`` with tuple names produced a ``DataFrame`` with incorrectly named output columns (:issue:`18321`) - Numeric diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 0e92fc4edce85..4a2f39fcab4a3 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -455,6 +455,9 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, from pandas import DataFrame df = DataFrame(data, index=common_idx) + common_cols_idx = df.columns + + # adding dummy column for calculation of pivot table if values is None: df['__dummy__'] = 0 kwargs = {'aggfunc': len, 'fill_value': 0} @@ -462,10 +465,15 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, df['__dummy__'] = values kwargs = {'aggfunc': aggfunc} - table = df.pivot_table('__dummy__', index=rownames, columns=colnames, + table = df.pivot_table(['__dummy__'], index=rownames, columns=colnames, margins=margins, margins_name=margins_name, dropna=dropna, **kwargs) + # the dummy column was added before pivoting, so it has to be removed + if not table.empty: + added_cols_idx = list(df.columns.difference(common_cols_idx))[0] + table = table[added_cols_idx] + # Post-process if normalize is not False: table = _normalize(table, normalize=normalize, margins=margins, diff --git 
a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 786c57a4a82df..5a6e7cd4d9bb3 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1628,7 +1628,8 @@ def test_crosstab_dup_index_names(self): pytest.raises(ValueError, pd.crosstab, s, s) @pytest.mark.parametrize("names", [['a', ('b', 'c')], - [('a', 'b'), 'c']]) + [('a', 'b'), 'c'], + [('a', 'b'), ('c', 'd')]]) def test_crosstab_tuple_name(self, names): s1 = pd.Series(range(3), name=names[0]) s2 = pd.Series(range(1, 4), name=names[1]) @@ -1638,3 +1639,21 @@ def test_crosstab_tuple_name(self, names): result = pd.crosstab(s1, s2) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("names, input_data, expected_data_out", [ + (['a', 'b'], [[1, 2, 3], [1, 1, 1]], [1, 1, 1]), + ([('a', 'b'), 'c'], [[1, 2, 2], [1, 1, 1]], [1, 2]), + ([('a', 'b'), ('c', 'd')], [[1, 2, 3], [1, 2, 3]], + np.eye(3, dtype=int))]) + def test_crosstab_cols_output(self, names, input_data, expected_data_out): + row_series = pd.Series(input_data[0], name=names[0]) + col_series = pd.Series(input_data[1], name=names[1]) + expected_crosstab = pd.DataFrame( + expected_data_out, + index=pd.Index(set(input_data[0]), name=names[0]), + columns=pd.Index(set(input_data[1]), name=names[1]) + ) + tm.assert_frame_equal( + pd.crosstab(row_series, col_series), expected_crosstab, + check_exact=True + )