Skip to content

Commit 2408c05

Browse files
committed
on attempting to do an ordering operation on an unordered Categorical,
show an OrderingWarning but let the operation succeed
1 parent 738f3b7 commit 2408c05

File tree

6 files changed

+70
-63
lines changed

6 files changed

+70
-63
lines changed

doc/source/categorical.rst

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -280,16 +280,12 @@ Sorting and Order
280280
The default for construction has changed in v0.16.0 to ``ordered=False``, from the prior implicit ``ordered=True``
281281

282282
If categorical data is ordered (``s.cat.ordered == True``), then the order of the categories has a
283-
meaning and certain operations are possible. If the categorical is unordered, a `TypeError` is
284-
raised.
283+
meaning and certain operations are possible. If the categorical is unordered, an ``OrderingWarning`` is shown.
285284

286285
.. ipython:: python
287286
288287
s = Series(Categorical(["a","b","c","a"], ordered=False))
289-
try:
290-
s.sort()
291-
except TypeError as e:
292-
print("TypeError: " + str(e))
288+
s.sort()
293289
s = Series(["a","b","c","a"]).astype('category',ordered=True)
294290
s.sort()
295291
s
@@ -343,8 +339,8 @@ necessarily make the sort order the same as the categories order.
343339

344340
.. note::
345341

346-
If the `Categorical` is not ordered, ``Series.min()`` and ``Series.max()`` will raise
347-
`TypeError`. Numeric operations like ``+``, ``-``, ``*``, ``/`` and operations based on them
342+
If the `Categorical` is not ordered, ``Series.min()`` and ``Series.max()`` will show an ``OrderingWarning``.
343+
Numeric operations like ``+``, ``-``, ``*``, ``/`` and operations based on them
348344
(e.g. ``Series.median()``, which would need to compute the mean between two values if the length
349345
of an array is even) do not work and raise a `TypeError`.
350346

doc/source/whatsnew/v0.16.0.txt

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -368,26 +368,7 @@ For ease of creation of series of categorical data, we have added the ability to
368368
s = Series(["a","b","c","a"]).astype('category',categories=list('abcdef'),ordered=False)
369369
s
370370

371-
.. warning::
372-
373-
This simple API change may have surprising effects if a user is relying on the previous defaulted behavior implicitly. In particular,
374-
sorting operations with a ``Categorical`` will now raise an error:
375-
376-
.. code-block:: python
377-
378-
In [1]: df = DataFrame({ 'A' : Series(list('aabc')).astype('category'), 'B' : np.arange(4) })
379-
380-
In [2]: df['A'].order()
381-
TypeError: Categorical not ordered
382-
you can use .as_ordered() to change the Categorical to an ordered one
383-
384-
In [3]: df.groupby('A').sum()
385-
ValueError: cannot sort by an unordered Categorical in the grouper
386-
you can set sort=False in the groupby expression or
387-
make the categorical ordered by using .as_ordered()
388-
389-
The solution is to make 'A' orderable, e.g. ``df['A'] = df['A'].cat.as_ordered()``
390-
371+
- In prior versions, trying to ``.order()/.argsort()/.searchsorted()`` on an unordered ``Categorical`` would raise a ``TypeError``. This has been relaxed in that the operation will now succeed but show an ``OrderingWarning``. This will perform the ordering in the order of the categories, then in order of appearance for the values within that category. This operation will NOT modify the existing object. (:issue:`9148`)
391372

392373
Indexing Changes
393374
~~~~~~~~~~~~~~~~

pandas/core/categorical.py

Lines changed: 47 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
from pandas.core.config import get_option
2525
from pandas.core import format as fmt
2626

27+
class OrderingWarning(Warning): pass
28+
2729
def _cat_compare_op(op):
2830
def f(self, other):
2931
# On python2, you can usually compare any type to any type, and Categoricals can be
@@ -828,6 +830,18 @@ def T(self):
828830
def nbytes(self):
829831
return self._codes.nbytes + self._categories.values.nbytes
830832

833+
def maybe_coerce_as_ordered(self):
834+
"""
835+
if we are not ordered, but try an ordering operation, let it succeed with a warning
836+
This may return a new copy of the object
837+
"""
838+
if not self.ordered:
839+
warn("Categorical is not ordered\n"
840+
"sort will be in the order of the categories\n"
841+
"you can use .as_ordered() to change the Categorical to an ordered one\n",
842+
OrderingWarning)
843+
return self
844+
831845
def searchsorted(self, v, side='left', sorter=None):
832846
"""Find indices where elements should be inserted to maintain order.
833847
@@ -847,6 +861,11 @@ def searchsorted(self, v, side='left', sorter=None):
847861
Optional array of integer indices that sort `self` into ascending
848862
order. They are typically the result of ``np.argsort``.
849863
864+
Warns
865+
-----
866+
OrderingWarning
867+
If the `Categorical` is not `ordered`.
868+
850869
Returns
851870
-------
852871
indices : array of ints
@@ -878,9 +897,7 @@ def searchsorted(self, v, side='left', sorter=None):
878897
>>> x.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4])
879898
array([3, 5]) # eggs after donuts, after switching milk and donuts
880899
"""
881-
if not self.ordered:
882-
raise ValueError("Categorical not ordered\n"
883-
"you can use .as_ordered() to change the Categorical to an ordered one\n")
900+
self = self.maybe_coerce_as_ordered()
884901

885902
from pandas.core.series import Series
886903
values_as_codes = self.categories.values.searchsorted(Series(v).values, side)
@@ -1003,13 +1020,17 @@ def argsort(self, ascending=True, **kwargs):
10031020
10041021
Only ordered Categoricals can be argsorted!
10051022
1023+
Warns
1024+
-----
1025+
OrderingWarning
1026+
If the `Categorical` is not `ordered`.
1027+
10061028
Returns
10071029
-------
10081030
argsorted : numpy array
10091031
"""
1010-
if not self.ordered:
1011-
raise TypeError("Categorical not ordered\n"
1012-
"you can use .as_ordered() to change the Categorical to an ordered one\n")
1032+
1033+
self = self.maybe_coerce_as_ordered()
10131034
result = np.argsort(self._codes.copy(), **kwargs)
10141035
if not ascending:
10151036
result = result[::-1]
@@ -1032,6 +1053,11 @@ def order(self, inplace=False, ascending=True, na_position='last'):
10321053
'first' puts NaNs at the beginning
10331054
'last' puts NaNs at the end
10341055
1056+
Warns
1057+
-----
1058+
OrderingWarning
1059+
If the `Categorical` is not `ordered`.
1060+
10351061
Returns
10361062
-------
10371063
y : Category or None
@@ -1040,9 +1066,8 @@ def order(self, inplace=False, ascending=True, na_position='last'):
10401066
--------
10411067
Category.sort
10421068
"""
1043-
if not self.ordered:
1044-
raise TypeError("Categorical not ordered\n"
1045-
"you can use .as_ordered() to change the Categorical to an ordered one\n")
1069+
1070+
self = self.maybe_coerce_as_ordered()
10461071
if na_position not in ['last','first']:
10471072
raise ValueError('invalid na_position: {!r}'.format(na_position))
10481073

@@ -1092,6 +1117,11 @@ def sort(self, inplace=True, ascending=True, na_position='last'):
10921117
'first' puts NaNs at the beginning
10931118
'last' puts NaNs at the end
10941119
1120+
Warns
1121+
-----
1122+
OrderingWarning
1123+
If the `Categorical` is not `ordered`.
1124+
10951125
Returns
10961126
-------
10971127
y : Category or None
@@ -1413,18 +1443,16 @@ def min(self, numeric_only=None, **kwargs):
14131443
14141444
Only ordered `Categoricals` have a minimum!
14151445
1416-
Raises
1417-
------
1418-
TypeError
1446+
Warns
1447+
-----
1448+
OrderingWarning
14191449
If the `Categorical` is not `ordered`.
14201450
14211451
Returns
14221452
-------
14231453
min : the minimum of this `Categorical`
14241454
"""
1425-
if not self.ordered:
1426-
raise TypeError("Categorical not ordered\n"
1427-
"you can use .as_ordered() to change the Categorical to an ordered one\n")
1455+
self = self.maybe_coerce_as_ordered()
14281456
if numeric_only:
14291457
good = self._codes != -1
14301458
pointer = self._codes[good].min(**kwargs)
@@ -1441,18 +1469,16 @@ def max(self, numeric_only=None, **kwargs):
14411469
14421470
Only ordered `Categoricals` have a maximum!
14431471
1444-
Raises
1445-
------
1446-
TypeError
1472+
Warns
1473+
-----
1474+
OrderingWarning
14471475
If the `Categorical` is not `ordered`.
14481476
14491477
Returns
14501478
-------
14511479
max : the maximum of this `Categorical`
14521480
"""
1453-
if not self.ordered:
1454-
raise TypeError("Categorical not ordered\n"
1455-
"you can use .as_ordered() to change the Categorical to an ordered one\n")
1481+
self = self.maybe_coerce_as_ordered()
14561482
if numeric_only:
14571483
good = self._codes != -1
14581484
pointer = self._codes[good].max(**kwargs)

pandas/core/groupby.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1926,10 +1926,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
19261926

19271927
# must have an ordered categorical
19281928
if self.sort:
1929-
if not self.grouper.ordered:
1930-
raise ValueError("cannot sort by an unordered Categorical in the grouper\n"
1931-
"you can set sort=False in the groupby expression or\n"
1932-
"make the categorical ordered by using .set_ordered(True)\n")
1929+
self.grouper = self.grouper.maybe_coerce_as_ordered()
19331930

19341931
# fix bug #GH8868 sort=False being ignored in categorical groupby
19351932
else:

pandas/tests/test_categorical.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
import pandas as pd
1313

1414
from pandas import Categorical, Index, Series, DataFrame, PeriodIndex, Timestamp
15-
15+
from pandas.core.categorical import OrderingWarning
1616
import pandas.core.common as com
1717
import pandas.compat as compat
1818
import pandas.util.testing as tm
@@ -838,8 +838,10 @@ def test_min_max(self):
838838

839839
# unordered cats have no min/max
840840
cat = Categorical(["a","b","c","d"], ordered=False)
841-
self.assertRaises(TypeError, lambda : cat.min())
842-
self.assertRaises(TypeError, lambda : cat.max())
841+
with tm.assert_produces_warning(OrderingWarning):
842+
cat.min()
843+
with tm.assert_produces_warning(OrderingWarning):
844+
cat.max()
843845
cat = Categorical(["a","b","c","d"], ordered=True)
844846
_min = cat.min()
845847
_max = cat.max()
@@ -920,7 +922,8 @@ def test_sort(self):
920922

921923
# unordered cats are not sortable
922924
cat = Categorical(["a","b","b","a"], ordered=False)
923-
self.assertRaises(TypeError, lambda : cat.sort())
925+
with tm.assert_produces_warning(OrderingWarning):
926+
cat.sort()
924927
cat = Categorical(["a","c","b","d"], ordered=True)
925928

926929
# order
@@ -1594,8 +1597,10 @@ def test_groupby_sort(self):
15941597
def test_min_max(self):
15951598
# unordered cats have no min/max
15961599
cat = Series(Categorical(["a","b","c","d"], ordered=False))
1597-
self.assertRaises(TypeError, lambda : cat.min())
1598-
self.assertRaises(TypeError, lambda : cat.max())
1600+
with tm.assert_produces_warning(OrderingWarning):
1601+
cat.min()
1602+
with tm.assert_produces_warning(OrderingWarning):
1603+
cat.max()
15991604

16001605
cat = Series(Categorical(["a","b","c","d"], ordered=True))
16011606
_min = cat.min()
@@ -1769,7 +1774,8 @@ def test_sort(self):
17691774

17701775
# unordered cats are not sortable
17711776
cat = Series(Categorical(["a","b","b","a"], ordered=False))
1772-
self.assertRaises(TypeError, lambda : cat.sort())
1777+
with tm.assert_produces_warning(OrderingWarning):
1778+
cat.sort()
17731779

17741780
cat = Series(Categorical(["a","c","b","d"], ordered=True))
17751781

@@ -1803,9 +1809,8 @@ def test_sort(self):
18031809
self.assertEqual(res["sort"].dtype, "category")
18041810
self.assertEqual(res["unsort"].dtype, "category")
18051811

1806-
def f():
1812+
with tm.assert_produces_warning(OrderingWarning):
18071813
df.sort(columns=["unsort"], ascending=False)
1808-
self.assertRaises(TypeError, f)
18091814

18101815
# multi-columns sort
18111816
# GH 7848

pandas/tests/test_groupby.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from pandas import date_range,bdate_range, Timestamp
1111
from pandas.core.index import Index, MultiIndex, Int64Index
1212
from pandas.core.api import Categorical, DataFrame
13+
from pandas.core.categorical import OrderingWarning
1314
from pandas.core.groupby import (SpecificationError, DataError,
1415
_nargsort, _lexsort_indexer)
1516
from pandas.core.series import Series
@@ -3299,7 +3300,8 @@ def test_groupby_sort_categorical(self):
32993300
result_nosort.index = index
33003301

33013302
col = 'range'
3302-
self.assertRaises(ValueError, lambda : df.groupby(col, sort=True).first())
3303+
with tm.assert_produces_warning(OrderingWarning):
3304+
df.groupby(col, sort=True).first()
33033305
assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())
33043306

33053307

0 commit comments

Comments
 (0)