Skip to content
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,7 @@ Reshaping
- Bug in :func:`DataFrame.merge` in which merging using ``Index`` objects as vectors raised an Exception (:issue:`19038`)
- Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`)
- Bug in timezone comparisons, manifesting as a conversion of the index to UTC in ``.concat()`` (:issue:`18523`)
- Bug in :func:`crosstab` in which cross-tabulating two ``Series`` with tuple names produced a ``DataFrame`` with incorrectly named output columns (:issue:`18321`)
-

Numeric
Expand Down
10 changes: 9 additions & 1 deletion pandas/core/reshape/pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,17 +455,25 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None,

from pandas import DataFrame
df = DataFrame(data, index=common_idx)
common_cols_idx = df.columns

# adding dummy column for calculation of pivot table
if values is None:
df['__dummy__'] = 0
kwargs = {'aggfunc': len, 'fill_value': 0}
else:
df['__dummy__'] = values
kwargs = {'aggfunc': aggfunc}

table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
table = df.pivot_table(['__dummy__'], index=rownames, columns=colnames,
margins=margins, margins_name=margins_name,
dropna=dropna, **kwargs)

# the dummy column was added before computing the pivot table, so it has to be removed from the result
if not table.empty:
added_cols_idx = list(df.columns.difference(common_cols_idx))[0]
table = table[added_cols_idx]

# Post-process
if normalize is not False:
table = _normalize(table, normalize=normalize, margins=margins,
Expand Down
21 changes: 20 additions & 1 deletion pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -1628,7 +1628,8 @@ def test_crosstab_dup_index_names(self):
pytest.raises(ValueError, pd.crosstab, s, s)

@pytest.mark.parametrize("names", [['a', ('b', 'c')],
[('a', 'b'), 'c']])
[('a', 'b'), 'c'],
[('a', 'b'), ('c', 'd')]])
def test_crosstab_tuple_name(self, names):
s1 = pd.Series(range(3), name=names[0])
s2 = pd.Series(range(1, 4), name=names[1])
Expand All @@ -1638,3 +1639,21 @@ def test_crosstab_tuple_name(self, names):

result = pd.crosstab(s1, s2)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("names, input_data, expected_data_out", [
(['a', 'b'], [[1, 2, 3], [1, 1, 1]], [1, 1, 1]),
([('a', 'b'), 'c'], [[1, 2, 2], [1, 1, 1]], [1, 2]),
([('a', 'b'), ('c', 'd')], [[1, 2, 3], [1, 2, 3]],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you split input_data into two parameters and call them row_data and col_data?

Copy link
Author

@ghost ghost Jan 24, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback I was hoping for this to be consistent with "names", which is consistent with the test above this test. Is there a reason to split these? In general, as long as readability isn't improved, I think consistency is important.

np.eye(3, dtype=int))])
def test_crosstab_cols_output(self, names, input_data, expected_data_out):
row_series = pd.Series(input_data[0], name=names[0])
col_series = pd.Series(input_data[1], name=names[1])
expected_crosstab = pd.DataFrame(
expected_data_out,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

call this expected

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a naming convention to comply with in pandas, apart from PEP 8? I don't see the benefit of changing from expected_crosstab to expected. What's the reason behind this required change?

index=pd.Index(set(input_data[0]), name=names[0]),
columns=pd.Index(set(input_data[1]), name=names[1])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason you are using set here?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm using set because input_data contains duplicate entries: pd.Index([1, 1, 1, 1]) != pd.Index([1]).

)
tm.assert_frame_equal(
pd.crosstab(row_series, col_series), expected_crosstab,
check_exact=True
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assign the result to a variable on a separate line first:

result = pd.crosstab....

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It won't improve readability that much. Is there a reason you are requesting an explicit variable here?

)