Skip to content
Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ Bug Fixes

**Categorical**

- Bug in rendering :class:`Series` with ``Categorical`` dtype in rare conditions under Python 2.7 (:issue:`21002`)
-

**Timezones**
Expand Down
7 changes: 2 additions & 5 deletions pandas/_libs/hashing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ import numpy as np
from numpy cimport ndarray, uint8_t, uint32_t, uint64_t

from util cimport _checknull
from cpython cimport (PyString_Check,
PyBytes_Check,
from cpython cimport (PyBytes_Check,
PyUnicode_Check)
from libc.stdlib cimport malloc, free

Expand Down Expand Up @@ -62,9 +61,7 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
cdef list datas = []
for i in range(n):
val = arr[i]
if PyString_Check(val):
data = <bytes>val.encode(encoding)
elif PyBytes_Check(val):
if PyBytes_Check(val):
data = <bytes>val
elif PyUnicode_Check(val):
data = <bytes>val.encode(encoding)
Expand Down
30 changes: 30 additions & 0 deletions pandas/tests/series/test_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pandas import (Index, Series, DataFrame, date_range, option_context,
Categorical, period_range, timedelta_range)
from pandas.core.index import MultiIndex
from pandas.core.base import StringMixin

from pandas.compat import lrange, range, u
from pandas import compat
Expand Down Expand Up @@ -202,6 +203,35 @@ def test_latex_repr(self):

class TestCategoricalRepr(object):

def test_categorical_repr_unicode(self):
# GH#21002 if len(index) > 60, sys.getdefaultencoding()=='ascii',
# and we are working in PY2, then rendering a Categorical could raise
# UnicodeDecodeError by trying to decode when it shouldn't

class County(StringMixin):
    # Minimal stand-in object whose text form contains a non-ASCII
    # character (the accented "a" in the name).
    name = u'San Sebastián'
    state = u'PR'

    def __unicode__(self):
        # Text form is "<name>, <state>".
        return u', '.join((self.name, self.state))

cat = pd.Categorical([County() for n in range(61)])
idx = pd.Index(cat)
ser = idx.to_series()

if compat.PY3:
# no reloading of sys, just check that the default (utf8) works
# as expected
repr(ser)
str(ser)

else:
# set sys.defaultencoding to ascii, then change it back after
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you make this into a context manager in pandas.util.testing?

# the test
with tm.set_defaultencoding('ascii'):
repr(ser)
str(ser)

def test_categorical_repr(self):
a = Series(Categorical([1, 2, 3, 4]))
exp = u("0 1\n1 2\n2 3\n3 4\n" +
Expand Down
22 changes: 22 additions & 0 deletions pandas/util/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,28 @@ def _valid_locales(locales, normalize):
# Stdout / stderr decorators


@contextmanager
def set_defaultencoding(encoding):
    """
    Context manager that temporarily changes the default encoding.

    The value reported by ``sys.getdefaultencoding()`` is switched to
    `encoding` for the duration of the ``with`` block and put back to its
    previous value when the block exits, whether or not it raised.

    Parameters
    ----------
    encoding : str
        Name of the encoding to install as the default.

    Raises
    ------
    ValueError
        If the context is entered on anything other than Python 2.
    """
    if PY2:
        previous = sys.getdefaultencoding()
        # reload(sys) is done before calling sys.setdefaultencoding;
        # presumably the setter is not exposed on a freshly started
        # interpreter -- confirm against the Python 2 docs.
        reload(sys)  # noqa:F821
        sys.setdefaultencoding(encoding)
        try:
            yield
        finally:
            # Always put the original default encoding back.
            sys.setdefaultencoding(previous)
    else:
        raise ValueError("set_defaultencoding context is only available "
                         "in Python 2.")


def capture_stdout(f):
"""
Decorator to capture stdout in a buffer so that it can be checked
Expand Down