Skip to content

Commit 91a6958

Browse files
committed
API: Improved inference of datetime/timedelta with mixed null objects. Regression from 0.13.1 in interpretation of an object Index with all null elements (GH7431)
1 parent 586e317 commit 91a6958

File tree

10 files changed

+219
-68
lines changed

10 files changed

+219
-68
lines changed

doc/source/v0.14.1.txt

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,8 @@ API changes
4646
day = offsets.Day(normalize=True)
4747
day.apply(Timestamp('2014-01-01 09:00'))
4848

49-
50-
51-
52-
49+
- Improved inference of datetime/timedelta with mixed null objects. Fixed a regression from 0.13.1 in the interpretation of an object Index
50+
with all null elements (:issue:`7431`)
5351

5452
- Openpyxl now raises a ValueError on construction of the openpyxl writer
5553
instead of warning on pandas import (:issue:`7284`).

pandas/core/common.py

Lines changed: 72 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1782,24 +1782,79 @@ def _possibly_cast_to_datetime(value, dtype, coerce=False):
17821782
value.dtype == np.object_)):
17831783
pass
17841784

1785+
# try to infer if we have a datetimelike here
1786+
# otherwise pass thru
17851787
else:
1786-
# we might have a array (or single object) that is datetime like,
1787-
# and no dtype is passed don't change the value unless we find a
1788-
# datetime set
1789-
v = value
1790-
if not is_list_like(v):
1791-
v = [v]
1792-
if len(v):
1793-
inferred_type = lib.infer_dtype(v)
1794-
if inferred_type in ['datetime', 'datetime64']:
1795-
try:
1796-
value = tslib.array_to_datetime(np.array(v))
1797-
except:
1798-
pass
1799-
elif inferred_type in ['timedelta', 'timedelta64']:
1800-
from pandas.tseries.timedeltas import \
1801-
_possibly_cast_to_timedelta
1802-
value = _possibly_cast_to_timedelta(value, coerce='compat')
1788+
value = _possibly_infer_to_datetimelike(value)
1789+
1790+
return value
1791+
1792+
def _possibly_infer_to_datetimelike(value):
    """
    Try to coerce ``value`` to datetime64/timedelta64 when no dtype is given.

    ``value`` may be an array-like or a single object.  It is only changed
    when a datetime/timedelta is found: this is strict in that a
    datetime/timedelta is REQUIRED in addition to possible nulls/string-likes,
    so input made up ONLY of strings is not treated as datetimelike.

    Parameters
    ----------
    value : list-like or scalar
        A non list-like ``value`` is wrapped in a one-element list before
        being turned into an array.

    Returns
    -------
    ``value`` itself when it is empty or when the quick inference on its
    leading elements is not datetime/timedelta/mixed (or a 'mixed' input is
    not a possible datetimelike array).  Otherwise an ndarray with the shape
    of ``np.array(value)``: the converted data when a conversion succeeded,
    or the unconverted data when every attempted conversion failed.
    """
    v = value
    if not is_list_like(v):
        v = [v]
    v = np.array(v)
    shape = v.shape
    # the converters below operate on 1-d data; ``shape`` restores the layout
    v = v.ravel()

    if not len(v):
        return value

    def _try_datetime(arr):
        # safe coerce to datetime64
        try:
            return tslib.array_to_datetime(arr, raise_=True).reshape(shape)
        except Exception:
            # hand the data back with its original shape rather than the
            # flattened working copy, so a failed attempt never reshapes
            # what the caller gets
            return arr.reshape(shape)

    def _try_timedelta(arr):
        # safe coerce to timedelta64
        # imported here rather than at module level, as in the original code
        from pandas.tseries.timedeltas import to_timedelta

        # will try first with a string & object conversion
        try:
            return to_timedelta(arr).values.reshape(shape)
        except Exception:
            pass

        # this is for compat with numpy < 1.7
        # but string-likes will fail here
        from pandas.tseries.timedeltas import _possibly_cast_to_timedelta
        try:
            return _possibly_cast_to_timedelta(
                arr, coerce='compat').reshape(shape)
        except Exception:
            return arr.reshape(shape)

    # do a quick inference for perf: only the first (up to) three elements
    # are inspected
    inferred_type = lib.infer_dtype(v[:3])

    if inferred_type in ['datetime', 'datetime64']:
        value = _try_datetime(v)
    elif inferred_type in ['timedelta', 'timedelta64']:
        value = _try_timedelta(v)

    # it's possible to have nulls intermixed within the datetime or
    # timedelta; these will in general have an inferred_type of 'mixed',
    # so we have to try both datetime and timedelta
    #
    # try timedelta first to avoid spurious datetime conversions
    # e.g. '00:00:01' is a timedelta but technically is also a datetime
    elif inferred_type in ['mixed']:
        if lib.is_possible_datetimelike_array(_ensure_object(v)):
            value = _try_timedelta(v)
            # re-infer on flat data; still 'mixed' means the timedelta
            # attempt did not convert anything, so fall back to datetime
            if lib.infer_dtype(np.ravel(value)) in ['mixed']:
                value = _try_datetime(v)

    return value
18051860

pandas/core/internals.py

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
_NS_DTYPE, _TD_DTYPE, ABCSeries, is_list_like,
1313
ABCSparseSeries, _infer_dtype_from_scalar,
1414
_is_null_datelike_scalar,
15-
is_timedelta64_dtype, is_datetime64_dtype,)
15+
is_timedelta64_dtype, is_datetime64_dtype,
16+
_possibly_infer_to_datetimelike)
1617
from pandas.core.index import Index, MultiIndex, _ensure_index
1718
from pandas.core.indexing import (_maybe_convert_indices, _length_of_indexer)
1819
import pandas.core.common as com
@@ -1807,26 +1808,21 @@ def make_block(values, placement, klass=None, ndim=None,
18071808
elif issubclass(vtype, np.complexfloating):
18081809
klass = ComplexBlock
18091810

1810-
# try to infer a DatetimeBlock, or set to an ObjectBlock
18111811
else:
18121812

1813+
# we want to infer here whether it's a datetimelike if it's object type
1814+
# this is pretty strict in that it requires a datetime/timedelta
1815+
# value IN addition to possible nulls/strings
1816+
# an array of ONLY strings will not be inferred
18131817
if np.prod(values.shape):
1814-
flat = values.ravel()
1815-
1816-
# try with just the first element; we just need to see if
1817-
# this is a datetime or not
1818-
inferred_type = lib.infer_dtype(flat[0:1])
1819-
if inferred_type in ['datetime', 'datetime64']:
1820-
1821-
# we have an object array that has been inferred as
1822-
# datetime, so convert it
1823-
try:
1824-
values = tslib.array_to_datetime(
1825-
flat).reshape(values.shape)
1826-
if issubclass(values.dtype.type, np.datetime64):
1827-
klass = DatetimeBlock
1828-
except: # it already object, so leave it
1829-
pass
1818+
result = _possibly_infer_to_datetimelike(values)
1819+
vtype = result.dtype.type
1820+
if issubclass(vtype, np.datetime64):
1821+
klass = DatetimeBlock
1822+
values = result
1823+
elif (issubclass(vtype, np.timedelta64)):
1824+
klass = TimeDeltaBlock
1825+
values = result
18301826

18311827
if klass is None:
18321828
klass = ObjectBlock

pandas/io/tests/test_json/test_pandas.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
import os
55

66
import numpy as np
7-
8-
from pandas import Series, DataFrame, DatetimeIndex, Timestamp
7+
import nose
8+
from pandas import Series, DataFrame, DatetimeIndex, Timestamp, _np_version_under1p7
99
import pandas as pd
1010
read_json = pd.read_json
1111

@@ -600,11 +600,29 @@ def test_url(self):
600600
for c in ['created_at', 'closed_at', 'updated_at']:
601601
self.assertEqual(result[c].dtype, 'datetime64[ns]')
602602

603-
def test_default_handler(self):
603+
def test_timedelta(self):
    # round-trip timedelta data through to_json/read_json for both a
    # Series and a DataFrame, after checking the constructor inferred
    # timedelta64[ns]
    if _np_version_under1p7:
        raise nose.SkipTest("numpy < 1.7")

    from datetime import timedelta

    def as_timedelta(obj):
        # values read back from JSON are converted using unit='ms'
        return pd.to_timedelta(obj, unit='ms')

    deltas = [timedelta(23), timedelta(seconds=5)]

    ser = Series(deltas)
    self.assertEqual(ser.dtype, 'timedelta64[ns]')
    ser_roundtrip = pd.read_json(ser.to_json(), typ='series')
    assert_series_equal(ser, ser_roundtrip.apply(as_timedelta))

    df = DataFrame(deltas)
    self.assertEqual(df[0].dtype, 'timedelta64[ns]')
    df_roundtrip = pd.read_json(df.to_json())
    assert_frame_equal(df, df_roundtrip.apply(as_timedelta))
618+
619+
def test_default_handler(self):
620+
from datetime import timedelta
621+
622+
frame = DataFrame([timedelta(23), timedelta(seconds=5), 42])
606623
self.assertRaises(OverflowError, frame.to_json)
607-
expected = DataFrame([str(timedelta(23)), str(timedelta(seconds=5))])
624+
625+
expected = DataFrame([str(timedelta(23)), str(timedelta(seconds=5)), 42])
608626
assert_frame_equal(
609627
expected, pd.read_json(frame.to_json(default_handler=str)))
610628

pandas/src/inference.pyx

Lines changed: 60 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,27 @@ def infer_dtype_list(list values):
172172
pass
173173

174174

175+
def is_possible_datetimelike_array(object arr):
    """
    Determine if ``arr`` is a possible datetimelike (or null-like) array.

    Strings and nulls are skipped.  Any other element that is neither a
    datetime nor a timedelta makes the result False immediately.  Otherwise
    the result is True only if at least one datetime or timedelta was seen.
    """
    cdef:
        Py_ssize_t idx, count = len(arr)
        bint found_datetimelike = 0
        object item

    for idx in range(count):
        item = arr[idx]
        if util.is_string_object(item) or util._checknull(item):
            # strings and nulls neither confirm nor rule out a datetimelike
            continue
        if is_datetime(item) or is_timedelta(item):
            found_datetimelike = 1
        else:
            return False
    return found_datetimelike
195+
175196
cdef inline bint is_null_datetimelike(v):
176197
# determine if we have a null for a timedelta/datetime (or integer versions)
177198
if util._checknull(v):
@@ -331,61 +352,84 @@ def is_unicode_array(ndarray values):
331352

332353

333354
def is_datetime_array(ndarray[object] values):
334-
cdef int i, n = len(values)
355+
cdef int i, null_count = 0, n = len(values)
335356
cdef object v
336357
if n == 0:
337358
return False
359+
360+
# return False for all nulls
338361
for i in range(n):
339362
v = values[i]
340-
if not (is_datetime(v) or is_null_datetimelike(v)):
363+
if is_null_datetimelike(v):
364+
# we are a regular null
365+
if util._checknull(v):
366+
null_count += 1
367+
elif not is_datetime(v):
341368
return False
342-
return True
343-
369+
return null_count != n
344370

345371
def is_datetime64_array(ndarray values):
346-
cdef int i, n = len(values)
372+
cdef int i, null_count = 0, n = len(values)
347373
cdef object v
348374
if n == 0:
349375
return False
376+
377+
# return False for all nulls
350378
for i in range(n):
351379
v = values[i]
352-
if not (util.is_datetime64_object(v) or is_null_datetimelike(v)):
380+
if is_null_datetimelike(v):
381+
# we are a regular null
382+
if util._checknull(v):
383+
null_count += 1
384+
elif not util.is_datetime64_object(v):
353385
return False
354-
return True
386+
return null_count != n
355387

356388
def is_timedelta_array(ndarray values):
357-
cdef int i, n = len(values)
389+
cdef int i, null_count = 0, n = len(values)
358390
cdef object v
359391
if n == 0:
360392
return False
361393
for i in range(n):
362394
v = values[i]
363-
if not (PyDelta_Check(v) or is_null_datetimelike(v)):
395+
if is_null_datetimelike(v):
396+
# we are a regular null
397+
if util._checknull(v):
398+
null_count += 1
399+
elif not PyDelta_Check(v):
364400
return False
365-
return True
401+
return null_count != n
366402

367403
def is_timedelta64_array(ndarray values):
368-
cdef int i, n = len(values)
404+
cdef int i, null_count = 0, n = len(values)
369405
cdef object v
370406
if n == 0:
371407
return False
372408
for i in range(n):
373409
v = values[i]
374-
if not (util.is_timedelta64_object(v) or is_null_datetimelike(v)):
410+
if is_null_datetimelike(v):
411+
# we are a regular null
412+
if util._checknull(v):
413+
null_count += 1
414+
elif not util.is_timedelta64_object(v):
375415
return False
376-
return True
416+
return null_count != n
377417

378418
def is_timedelta_or_timedelta64_array(ndarray values):
379419
""" infer with timedeltas and/or nat/none """
380-
cdef int i, n = len(values)
420+
cdef int i, null_count = 0, n = len(values)
381421
cdef object v
382422
if n == 0:
383423
return False
384424
for i in range(n):
385425
v = values[i]
386-
if not (is_timedelta(v) or is_null_datetimelike(v)):
426+
if is_null_datetimelike(v):
427+
# we are a regular null
428+
if util._checknull(v):
429+
null_count += 1
430+
elif not is_timedelta(v):
387431
return False
388-
return True
432+
return null_count != n
389433

390434
def is_date_array(ndarray[object] values):
391435
cdef int i, n = len(values)

pandas/tests/test_series.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -581,6 +581,12 @@ def test_constructor_pass_none(self):
581581
s = Series(None, index=lrange(5), dtype=object)
582582
self.assertEqual(s.dtype, np.object_)
583583

584+
# GH 7431
585+
# inference on the index
586+
s = Series(index=np.array([None]))
587+
expected = Series(index=Index([None]))
588+
assert_series_equal(s,expected)
589+
584590
def test_constructor_cast(self):
585591
self.assertRaises(ValueError, Series, ['a', 'b', 'c'], dtype=float)
586592

@@ -669,6 +675,16 @@ def test_constructor_dtype_datetime64(self):
669675
self.assert_numpy_array_equal(series1.values,dates2)
670676
self.assertEqual(series1.dtype,object)
671677

678+
# these will correctly infer a datetime
679+
s = Series([None, pd.NaT, '2013-08-05 15:30:00.000001'])
680+
self.assertEqual(s.dtype,'datetime64[ns]')
681+
s = Series([np.nan, pd.NaT, '2013-08-05 15:30:00.000001'])
682+
self.assertEqual(s.dtype,'datetime64[ns]')
683+
s = Series([pd.NaT, None, '2013-08-05 15:30:00.000001'])
684+
self.assertEqual(s.dtype,'datetime64[ns]')
685+
s = Series([pd.NaT, np.nan, '2013-08-05 15:30:00.000001'])
686+
self.assertEqual(s.dtype,'datetime64[ns]')
687+
672688
def test_constructor_dict(self):
673689
d = {'a': 0., 'b': 1., 'c': 2.}
674690
result = Series(d, index=['b', 'c', 'd', 'a'])
@@ -2462,6 +2478,18 @@ def f():
24622478
td = Series([timedelta(days=i) for i in range(3)] + ['foo'])
24632479
self.assertEqual(td.dtype, 'object')
24642480

2481+
# these will correctly infer a timedelta
2482+
# but only on numpy >= 1.7 as the cython path will only be used
2483+
if not _np_version_under1p7:
2484+
s = Series([None, pd.NaT, '1 Day'])
2485+
self.assertEqual(s.dtype,'timedelta64[ns]')
2486+
s = Series([np.nan, pd.NaT, '1 Day'])
2487+
self.assertEqual(s.dtype,'timedelta64[ns]')
2488+
s = Series([pd.NaT, None, '1 Day'])
2489+
self.assertEqual(s.dtype,'timedelta64[ns]')
2490+
s = Series([pd.NaT, np.nan, '1 Day'])
2491+
self.assertEqual(s.dtype,'timedelta64[ns]')
2492+
24652493
def test_operators_timedelta64(self):
24662494

24672495
# invalid ops
@@ -2939,12 +2967,12 @@ def test_datetime64_fillna(self):
29392967

29402968
# GH 6587
29412969
# make sure that we are treating as integer when filling
2970+
# this also tests inference of a datetime-like with NaT's
29422971
s = Series([pd.NaT, pd.NaT, '2013-08-05 15:30:00.000001'])
29432972
expected = Series(['2013-08-05 15:30:00.000001', '2013-08-05 15:30:00.000001', '2013-08-05 15:30:00.000001'], dtype='M8[ns]')
29442973
result = s.fillna(method='backfill')
29452974
assert_series_equal(result, expected)
29462975

2947-
29482976
def test_fillna_int(self):
29492977
s = Series(np.random.randint(-100, 100, 50))
29502978
s.fillna(method='ffill', inplace=True)

0 commit comments

Comments
 (0)