diff --git a/doc/source/release.rst b/doc/source/release.rst index 5376e0396799e..101ec290a58cf 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -105,7 +105,7 @@ Improvements to existing features test to vbench (:issue:`4705` and :issue:`4722`) - Add ``axis`` and ``level`` keywords to ``where``, so that the ``other`` argument can now be an alignable pandas object. - - ``to_datetime`` with a format of 'YYYYMMDD' now parses much faster + - ``to_datetime`` with a format of '%Y%m%d' now parses much faster API Changes ~~~~~~~~~~~ diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index c9e643e25b761..d7ca9d9b371d4 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -845,9 +845,19 @@ def test_to_datetime_format_YYYYMMDD(self): assert_series_equal(result, expected) # with NaT + expected = Series([Timestamp("19801222"),Timestamp("19801222")] + [Timestamp("19810105")]*5) + expected[2] = np.nan s[2] = np.nan - self.assertRaises(ValueError, to_datetime, s,format='%Y%m%d') - self.assertRaises(ValueError, to_datetime, s.apply(str),format='%Y%m%d') + + result = to_datetime(s,format='%Y%m%d') + assert_series_equal(result, expected) + + # string with NaT + s = s.apply(str) + s[2] = 'nat' + result = to_datetime(s,format='%Y%m%d') + assert_series_equal(result, expected) + def test_to_datetime_format_microsecond(self): val = '01-Apr-2011 00:00:01.978' diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index cca4850c2c1bf..dd78bea385c61 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -106,8 +106,7 @@ def _convert_listlike(arg, box): # shortcut formatting here if format == '%Y%m%d': try: - carg = arg.astype(np.int64).astype(object) - result = lib.try_parse_year_month_day(carg/10000,carg/100 % 100, carg % 100) + result = _attempt_YYYYMMDD(arg) except: raise ValueError("cannot convert the input to '%Y%m%d' date format") @@ -144,6 +143,43 @@ 
def _convert_listlike(arg, box):
 class DateParseError(ValueError):
     pass
 
+def _attempt_YYYYMMDD(arg):
+    """ try to parse the YYYYMMDD/%Y%m%d format, mapping NaT-like entries to NaT;
+    arg is passed in as an object dtype, but could really hold ints, int-like
+    strings, floats (e.g. with nan) or strings with nan-like markers;
+    returns the parsed values, or None when every strategy fails """
+
+    def calc(carg):
+        # derive year / month / day from the YYYYMMDD number
+        carg = carg.astype(object)
+        return lib.try_parse_year_month_day(carg/10000,carg/100 % 100, carg % 100)
+
+    def calc_with_mask(carg,mask):
+        # parse where mask is True; every other slot becomes NaT
+        result = np.empty(carg.shape, dtype='M8[ns]')
+        iresult = result.view('i8')
+        iresult[~mask] = tslib.iNaT  # ~ inverts a bool mask (numpy rejects unary minus on bools)
+        result[mask] = calc(carg[mask].astype(np.float64).astype(np.int64)).astype('M8[ns]')
+        return result
+
+    # each strategy is best-effort: on failure fall through to the next one
+    # (except Exception, not a bare except, so KeyboardInterrupt/SystemExit propagate)
+    try:  # intlike / strings that are ints
+        return calc(arg.astype(np.int64))
+    except Exception:
+        pass
+    try:  # a float with actual np.nan
+        carg = arg.astype(np.float64)
+        return calc_with_mask(carg,com.notnull(carg))
+    except Exception:
+        pass
+    try:  # string with NaN-like
+        mask = ~lib.ismember(arg, tslib._nat_strings)
+        return calc_with_mask(arg,mask)
+    except Exception:
+        pass
+
+    return None
 
 # patterns for quarters like '4Q2005', '05Q1'
 qpat1full = re.compile(r'(\d)Q(\d\d\d\d)')
diff --git a/vb_suite/timeseries.py b/vb_suite/timeseries.py
index 999c3869daf62..353d7afc63cb3 100644
--- a/vb_suite/timeseries.py
+++ b/vb_suite/timeseries.py
@@ -154,16 +154,7 @@ def date_range(start=None, end=None, periods=None, freq=None):
 
 timeseries_to_datetime_YYYYMMDD = \
     Benchmark('to_datetime(strings,format="%Y%m%d")', setup,
-              start_date=datetime(2013, 9, 1))
-
-setup = common_setup + """
-rng = date_range('1/1/2000', periods=10000, freq='D')
-strings = Series(rng.year*10000+rng.month*100+rng.day,dtype=np.int64).apply(str)
-"""
-
-timeseries_to_datetime_YYYYMMDD_old = \
-    Benchmark('pandas.tslib.array_strptime(strings.values,"%Y%m%d")', setup,
-              start_date=datetime(2013, 9, 1))
+              start_date=datetime(2012, 7, 1))
 
 # ---- infer_freq
 # infer_freq