Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ Groupby/Resample/Rolling
- Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying an aggregation function to timezone aware data (:issue:`23683`)
- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`)
- Ensured that ordering of outputs in ``groupby`` aggregation functions is consistent across all versions of Python (:issue:`25692`)

- Ensured that result group order is correct when grouping on an ordered ``Categorical`` and specifying ``observed=True`` (:issue:`25871`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you use double back-ticks around Categorical

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done


Reshaping
^^^^^^^^^
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
if observed:
codes = algorithms.unique1d(self.grouper.codes)
codes = codes[codes != -1]
if sort or self.grouper.ordered:
codes = np.sort(codes)
else:
codes = np.arange(len(categories))

Expand Down
26 changes: 26 additions & 0 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,32 @@ def test_dataframe_categorical_with_nan(observed):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("ordered", [True, False])
@pytest.mark.parametrize("observed", [True, False])
@pytest.mark.parametrize("sort", [True, False])
def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
    """GH 25871: check groupby sorting on ordered Categoricals.

    The frame holds a Categorical column with one unobserved category
    ('AWOL') next to a plain Series carrying the very same values, so the
    'first' value aggregated for each group must equal that group's label.
    """
    raw_values = ['d', 'a', 'b', 'a', 'd', 'b']
    frame = pd.DataFrame({
        'cat': pd.Categorical(raw_values,
                              categories=['a', 'b', 'AWOL', 'd'],
                              ordered=ordered),
        'val': pd.Series(raw_values),
    })

    # Aggregate the plain values, grouped by the Categorical column.
    grouped = frame.groupby('cat', observed=observed, sort=sort)
    firsts = grouped['val'].aggregate('first')

    # Compare group labels against aggregated values, position by position.
    index_labels = pd.Series(firsts.index.array, dtype='object')
    first_values = pd.Series(firsts.array)
    if not observed:
        # With observed=False the index also contains 'AWOL', for which the
        # aggregation yields a missing value; substitute the label itself.
        first_values[first_values.isna()] = 'AWOL'
    tm.assert_equal(index_labels, first_values)


def test_datetime():
# GH9049: ensure backward compatibility
levels = pd.date_range('2014-01-01', periods=4)
Expand Down
29 changes: 14 additions & 15 deletions pandas/tests/groupby/test_grouping.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,28 +253,27 @@ def test_groupby_levels_and_columns(self):
tm.assert_frame_equal(by_levels, by_columns)

def test_groupby_categorical_index_and_columns(self, observed):
# GH18432
# GH18432, adapted for GH25871
columns = ['A', 'B', 'A', 'B']
categories = ['B', 'A']
data = np.ones((5, 4), int)
data = np.array([[1, 2, 1, 2],
[1, 2, 1, 2],
[1, 2, 1, 2],
[1, 2, 1, 2],
[1, 2, 1, 2]], int)
cat_columns = CategoricalIndex(columns,
categories=categories,
ordered=True)
df = DataFrame(data=data, columns=cat_columns)
result = df.groupby(axis=1, level=0, observed=observed).sum()
expected_data = 2 * np.ones((5, 2), int)

if observed:
# if we are not-observed we undergo a reindex
# so need to adjust the output as our expected sets us up
# to be non-observed
expected_columns = CategoricalIndex(['A', 'B'],
categories=categories,
ordered=True)
else:
expected_columns = CategoricalIndex(categories,
categories=categories,
ordered=True)
expected_data = np.array([[4, 2],
[4, 2],
[4, 2],
[4, 2],
[4, 2]], int)
expected_columns = CategoricalIndex(categories,
categories=categories,
ordered=True)
expected = DataFrame(data=expected_data, columns=expected_columns)
assert_frame_equal(result, expected)

Expand Down