Skip to content

pd.lib.infer_dtype infers bytes in Python3 #10032

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 8, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.16.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ Enhancements

- ``DataFrame`` and ``Series`` now have a ``_constructor_expanddim`` property as an overridable constructor for data of one higher dimensionality. This should be used only when it is really needed; see :ref:`here <ref-subclassing-pandas>`.

- ``pd.lib.infer_dtype`` now returns ``'bytes'`` in Python 3 where appropriate (:issue:`10032`).

.. _whatsnew_0161.enhancements.categoricalindex:

CategoricalIndex
Expand Down
2 changes: 2 additions & 0 deletions pandas/compat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@

PY3 = (sys.version_info[0] >= 3)
PY3_2 = sys.version_info[:2] == (3, 2)
PY2 = sys.version_info[0] == 2


try:
import __builtin__ as builtins
Expand Down
66 changes: 42 additions & 24 deletions pandas/src/inference.pyx
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import sys
cimport util
from tslib import NaT
from datetime import datetime, timedelta
iNaT = util.get_nat()

cdef bint PY2 = sys.version_info[0] == 2

# core.common import for fast inference checks
def is_float(object obj):
return util.is_float_object(obj)
Expand Down Expand Up @@ -38,10 +41,10 @@ _TYPE_MAP = {
'f' : 'floating',
'complex128': 'complex',
'c' : 'complex',
'string': 'string',
'S' : 'string',
'unicode': 'unicode',
'U' : 'unicode',
'string': 'string' if PY2 else 'bytes',
'S' : 'string' if PY2 else 'bytes',
'unicode': 'unicode' if PY2 else 'string',
'U' : 'unicode' if PY2 else 'string',
'bool': 'boolean',
'b' : 'boolean',
'datetime64[ns]' : 'datetime64',
Expand Down Expand Up @@ -181,6 +184,10 @@ def infer_dtype(object _values):
if is_unicode_array(values):
return 'unicode'

elif PyBytes_Check(val):
if is_bytes_array(values):
return 'bytes'

elif is_timedelta(val):
if is_timedelta_or_timedelta64_array(values):
return 'timedelta'
Expand All @@ -196,11 +203,6 @@ def infer_dtype(object _values):

return 'mixed'

def infer_dtype_list(list values):
cdef:
Py_ssize_t i, n = len(values)
pass


def is_possible_datetimelike_array(object arr):
# determine if we have a possible datetimelike (or null-like) array
Expand Down Expand Up @@ -253,7 +255,6 @@ def is_bool_array(ndarray values):
cdef:
Py_ssize_t i, n = len(values)
ndarray[object] objbuf
object obj

if issubclass(values.dtype.type, np.bool_):
return True
Expand All @@ -277,7 +278,6 @@ def is_integer_array(ndarray values):
cdef:
Py_ssize_t i, n = len(values)
ndarray[object] objbuf
object obj

if issubclass(values.dtype.type, np.integer):
return True
Expand All @@ -298,7 +298,6 @@ def is_integer_float_array(ndarray values):
cdef:
Py_ssize_t i, n = len(values)
ndarray[object] objbuf
object obj

if issubclass(values.dtype.type, np.integer):
return True
Expand All @@ -321,7 +320,6 @@ def is_float_array(ndarray values):
cdef:
Py_ssize_t i, n = len(values)
ndarray[object] objbuf
object obj

if issubclass(values.dtype.type, np.floating):
return True
Expand All @@ -342,9 +340,9 @@ def is_string_array(ndarray values):
cdef:
Py_ssize_t i, n = len(values)
ndarray[object] objbuf
object obj

if issubclass(values.dtype.type, (np.string_, np.unicode_)):
if ((PY2 and issubclass(values.dtype.type, np.string_)) or
not PY2 and issubclass(values.dtype.type, np.unicode_)):
return True
elif values.dtype == np.object_:
objbuf = values
Expand All @@ -363,7 +361,6 @@ def is_unicode_array(ndarray values):
cdef:
Py_ssize_t i, n = len(values)
ndarray[object] objbuf
object obj

if issubclass(values.dtype.type, np.unicode_):
return True
Expand All @@ -381,8 +378,29 @@ def is_unicode_array(ndarray values):
return False


def is_bytes_array(ndarray values):
    """
    Check whether ``values`` holds only bytes.

    Returns True when the array's dtype type is a subclass of ``np.bytes_``,
    or when it is a non-empty object array in which every element passes
    ``PyBytes_Check``. Returns False for any other dtype, for an empty
    object array, and as soon as a non-bytes element is found.
    """
    cdef:
        Py_ssize_t idx, length = len(values)
        ndarray[object] items

    # Fixed-width bytes dtype: no need to look at individual elements.
    if issubclass(values.dtype.type, np.bytes_):
        return True

    # Only object arrays are inspected element by element.
    if not (values.dtype == np.object_):
        return False

    items = values

    # An empty object array is not considered a bytes array.
    if length == 0:
        return False

    for idx in range(length):
        if not PyBytes_Check(items[idx]):
            return False
    return True


def is_datetime_array(ndarray[object] values):
cdef int i, null_count = 0, n = len(values)
cdef Py_ssize_t i, null_count = 0, n = len(values)
cdef object v
if n == 0:
return False
Expand All @@ -399,7 +417,7 @@ def is_datetime_array(ndarray[object] values):
return null_count != n

def is_datetime64_array(ndarray values):
cdef int i, null_count = 0, n = len(values)
cdef Py_ssize_t i, null_count = 0, n = len(values)
cdef object v
if n == 0:
return False
Expand All @@ -416,7 +434,7 @@ def is_datetime64_array(ndarray values):
return null_count != n

def is_timedelta_array(ndarray values):
cdef int i, null_count = 0, n = len(values)
cdef Py_ssize_t i, null_count = 0, n = len(values)
cdef object v
if n == 0:
return False
Expand All @@ -431,7 +449,7 @@ def is_timedelta_array(ndarray values):
return null_count != n

def is_timedelta64_array(ndarray values):
cdef int i, null_count = 0, n = len(values)
cdef Py_ssize_t i, null_count = 0, n = len(values)
cdef object v
if n == 0:
return False
Expand All @@ -447,7 +465,7 @@ def is_timedelta64_array(ndarray values):

def is_timedelta_or_timedelta64_array(ndarray values):
""" infer with timedeltas and/or nat/none """
cdef int i, null_count = 0, n = len(values)
cdef Py_ssize_t i, null_count = 0, n = len(values)
cdef object v
if n == 0:
return False
Expand All @@ -462,7 +480,7 @@ def is_timedelta_or_timedelta64_array(ndarray values):
return null_count != n

def is_date_array(ndarray[object] values):
cdef int i, n = len(values)
cdef Py_ssize_t i, n = len(values)
if n == 0:
return False
for i in range(n):
Expand All @@ -471,7 +489,7 @@ def is_date_array(ndarray[object] values):
return True

def is_time_array(ndarray[object] values):
cdef int i, n = len(values)
cdef Py_ssize_t i, n = len(values)
if n == 0:
return False
for i in range(n):
Expand All @@ -484,7 +502,7 @@ def is_period(object o):
return isinstance(o,Period)

def is_period_array(ndarray[object] values):
cdef int i, n = len(values)
cdef Py_ssize_t i, n = len(values)
from pandas.tseries.period import Period

if n == 0:
Expand Down
13 changes: 12 additions & 1 deletion pandas/tests/test_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import pandas as pd
from pandas.lib import isscalar, item_from_zerodim, max_len_string_array
import pandas.util.testing as tm
from pandas.compat import u
from pandas.compat import u, PY2


class TestMisc(tm.TestCase):
Expand All @@ -28,6 +28,17 @@ def test_max_len_string_array(self):
tm.assertRaises(TypeError,
lambda: max_len_string_array(arr.astype('U')))

def test_infer_dtype_bytes(self):
    """infer_dtype labels bytes data 'string' on Python 2, 'bytes' otherwise."""
    if PY2:
        expected = 'string'
    else:
        expected = 'bytes'

    # fixed-width bytes ('S1') array
    bytes_arr = np.array(list('abc'), dtype='S1')
    self.assertEqual(pd.lib.infer_dtype(bytes_arr), expected)

    # the same values held in an object array
    object_arr = bytes_arr.astype(object)
    self.assertEqual(pd.lib.infer_dtype(object_arr), expected)


class TestIsscalar(tm.TestCase):

Expand Down
11 changes: 11 additions & 0 deletions pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -1802,6 +1802,17 @@ def test_index_str_accessor_visibility(self):
with self.assertRaisesRegexp(AttributeError, message):
idx.str

def test_method_on_bytes(self):
    """``.str.cat`` on object Series of bytes: TypeError on PY3, joins otherwise."""

    def bytes_series(items, dtype):
        # Build an object-dtype Series from a fixed-width bytes array.
        return Series(np.array(items, dtype).astype(object))

    left = bytes_series(list('abc'), 'S1')
    right = bytes_series(list('def'), 'S1')

    if compat.PY3:
        self.assertRaises(TypeError, left.str.cat, right)
        return

    # Python 2: element-wise concatenation is expected to succeed.
    joined = left.str.cat(right)
    wanted = bytes_series(['ad', 'be', 'cf'], 'S2')
    tm.assert_series_equal(joined, wanted)


if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
Expand Down