diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 811b32676fc6a..8d593b3b44002 100755 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -67,6 +67,8 @@ Enhancements - ``DataFrame`` and ``Series`` now have ``_constructor_expanddim`` property as overridable constructor for one higher dimensionality data. This should be used only when it is really needed, see :ref:`here ` +- ``pd.lib.infer_dtype`` now returns ``'bytes'`` in Python 3 where appropriate :issue:`10032`. + .. _whatsnew_0161.enhancements.categoricalindex: CategoricalIndex diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index bff6eb1f95abc..6be0facf2bffc 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -37,6 +37,8 @@ PY3 = (sys.version_info[0] >= 3) PY3_2 = sys.version_info[:2] == (3, 2) +PY2 = sys.version_info[0] == 2 + try: import __builtin__ as builtins diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index dbe6f2f1f8351..55d5e37fc19ee 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -1,8 +1,11 @@ +import sys cimport util from tslib import NaT from datetime import datetime, timedelta iNaT = util.get_nat() +cdef bint PY2 = sys.version_info[0] == 2 + # core.common import for fast inference checks def is_float(object obj): return util.is_float_object(obj) @@ -38,10 +41,10 @@ _TYPE_MAP = { 'f' : 'floating', 'complex128': 'complex', 'c' : 'complex', - 'string': 'string', - 'S' : 'string', - 'unicode': 'unicode', - 'U' : 'unicode', + 'string': 'string' if PY2 else 'bytes', + 'S' : 'string' if PY2 else 'bytes', + 'unicode': 'unicode' if PY2 else 'string', + 'U' : 'unicode' if PY2 else 'string', 'bool': 'boolean', 'b' : 'boolean', 'datetime64[ns]' : 'datetime64', @@ -181,6 +184,10 @@ def infer_dtype(object _values): if is_unicode_array(values): return 'unicode' + elif PyBytes_Check(val): + if is_bytes_array(values): + return 'bytes' + elif is_timedelta(val): if is_timedelta_or_timedelta64_array(values): return 'timedelta' @@ -196,11 +203,6 @@ def infer_dtype(object _values): return 'mixed' -def infer_dtype_list(list values): - cdef: - Py_ssize_t i, n = len(values) - pass - def is_possible_datetimelike_array(object arr): # determine if we have a possible datetimelike (or null-like) array @@ -253,7 +255,6 @@ def is_bool_array(ndarray values): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf - object obj if issubclass(values.dtype.type, np.bool_): return True @@ -277,7 +278,6 @@ def is_integer_array(ndarray values): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf - object obj if issubclass(values.dtype.type, np.integer): return True @@ -298,7 +298,6 @@ def is_integer_float_array(ndarray values): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf - object obj if issubclass(values.dtype.type, np.integer): return True @@ -321,7 +320,6 @@ def is_float_array(ndarray values): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf - object obj if issubclass(values.dtype.type, np.floating): return True @@ -342,9 +340,9 @@ def is_string_array(ndarray values): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf - object obj - if issubclass(values.dtype.type, (np.string_, np.unicode_)): + if ((PY2 and issubclass(values.dtype.type, np.string_)) or + not PY2 and issubclass(values.dtype.type, np.unicode_)): return True elif values.dtype == np.object_: objbuf = values @@ -363,7 +361,6 @@ def is_unicode_array(ndarray values): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf - object obj if issubclass(values.dtype.type, np.unicode_): return True @@ -381,8 +378,29 @@ def is_unicode_array(ndarray values): return False +def is_bytes_array(ndarray values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[object] objbuf + + if issubclass(values.dtype.type, np.bytes_): + return True + elif values.dtype == np.object_: + objbuf = values + + if n == 0: + return False + + for i in range(n): + if not PyBytes_Check(objbuf[i]): + return False + return True + else: + return False + + def is_datetime_array(ndarray[object] values): - cdef int i, null_count = 0, n = len(values) + cdef Py_ssize_t i, null_count = 0, n = len(values) cdef object v if n == 0: return False @@ -399,7 +417,7 @@ def is_datetime_array(ndarray[object] values): return null_count != n def is_datetime64_array(ndarray values): - cdef int i, null_count = 0, n = len(values) + cdef Py_ssize_t i, null_count = 0, n = len(values) cdef object v if n == 0: return False @@ -416,7 +434,7 @@ def is_datetime64_array(ndarray values): return null_count != n def is_timedelta_array(ndarray values): - cdef int i, null_count = 0, n = len(values) + cdef Py_ssize_t i, null_count = 0, n = len(values) cdef object v if n == 0: return False @@ -431,7 +449,7 @@ def is_timedelta_array(ndarray values): return null_count != n def is_timedelta64_array(ndarray values): - cdef int i, null_count = 0, n = len(values) + cdef Py_ssize_t i, null_count = 0, n = len(values) cdef object v if n == 0: return False @@ -447,7 +465,7 @@ def is_timedelta64_array(ndarray values): def is_timedelta_or_timedelta64_array(ndarray values): """ infer with timedeltas and/or nat/none """ - cdef int i, null_count = 0, n = len(values) + cdef Py_ssize_t i, null_count = 0, n = len(values) cdef object v if n == 0: return False @@ -462,7 +480,7 @@ def is_timedelta_or_timedelta64_array(ndarray values): return null_count != n def is_date_array(ndarray[object] values): - cdef int i, n = len(values) + cdef Py_ssize_t i, n = len(values) if n == 0: return False for i in range(n): @@ -471,7 +489,7 @@ def is_date_array(ndarray[object] values): return True def is_time_array(ndarray[object] values): - cdef int i, n = len(values) + cdef Py_ssize_t i, n = len(values) if n == 0: return False for i in range(n): @@ -484,7 +502,7 @@ def is_period(object o): return isinstance(o,Period) def is_period_array(ndarray[object] values): - cdef int i, n = len(values) + cdef Py_ssize_t i, n = len(values) from pandas.tseries.period import Period if n == 0: diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 4c134b25636a7..6d9bea29cf44d 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -6,7 +6,7 @@ import pandas as pd from pandas.lib import isscalar, item_from_zerodim, max_len_string_array import pandas.util.testing as tm -from pandas.compat import u +from pandas.compat import u, PY2 class TestMisc(tm.TestCase): @@ -28,6 +28,17 @@ def test_max_len_string_array(self): tm.assertRaises(TypeError, lambda: max_len_string_array(arr.astype('U'))) + def test_infer_dtype_bytes(self): + compare = 'string' if PY2 else 'bytes' + + # string array of bytes + arr = np.array(list('abc'), dtype='S1') + self.assertEqual(pd.lib.infer_dtype(arr), compare) + + # object array of bytes + arr = arr.astype(object) + self.assertEqual(pd.lib.infer_dtype(arr), compare) + class TestIsscalar(tm.TestCase): diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index c9b11810d83fe..b77b52c0e17a1 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -1802,6 +1802,17 @@ def test_index_str_accessor_visibility(self): with self.assertRaisesRegexp(AttributeError, message): idx.str + def test_method_on_bytes(self): + lhs = Series(np.array(list('abc'), 'S1').astype(object)) + rhs = Series(np.array(list('def'), 'S1').astype(object)) + if compat.PY3: + self.assertRaises(TypeError, lhs.str.cat, rhs) + else: + result = lhs.str.cat(rhs) + expected = Series(np.array(['ad', 'be', 'cf'], + 'S2').astype(object)) + tm.assert_series_equal(result, expected) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],