From 2d41f2f4cab66801f39f8d74c299b78f6c99e631 Mon Sep 17 00:00:00 2001 From: Nathalie Rud Date: Tue, 28 May 2019 17:08:24 +0100 Subject: [PATCH 1/5] BUG: ignore errors for invalid dates in to_datetime with coerce (#25512) parsing.try_parse_year_month_day() in _attempt_YYYYMMDD() throws not only ValueError but also OverflowError for incorrect dates. So handling of this error was added. --- pandas/core/tools/datetimes.py | 6 +++--- pandas/tests/indexes/datetimes/test_tools.py | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 817d539d4ad6f..a39b65ab72aa8 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -775,21 +775,21 @@ def calc_with_mask(carg, mask): # try intlike / strings that are ints try: return calc(arg.astype(np.int64)) - except ValueError: + except (ValueError, OverflowError): pass # a float with actual np.nan try: carg = arg.astype(np.float64) return calc_with_mask(carg, notna(carg)) - except ValueError: + except (ValueError, OverflowError): pass # string with NaN-like try: mask = ~algorithms.isin(arg, list(tslib.nat_strings)) return calc_with_mask(arg, mask) - except ValueError: + except (ValueError, OverflowError): pass return None diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index d62d8d1276fec..e0d85c2685a43 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -83,6 +83,21 @@ def test_to_datetime_format_YYYYMMDD(self, cache): result = to_datetime(s, format='%Y%m%d', cache=cache) assert_series_equal(result, expected) + # GH 25512 + # strings with invalid date values, errors=coerce + s = Series(['19801222', '20010012', '10019999', np.nan]) + result = pd.to_datetime(s, format='%Y%m%d', errors='coerce', + cache=cache) + expected = Series([Timestamp('19801222'), np.nan, np.nan, np.nan]) + 
tm.assert_series_equal(result, expected) + + # integers with invalid date values, errors=coerce + s = Series([20010012, 20190813, 20019999, np.nan]) + result = pd.to_datetime(s, format='%Y%m%d', errors='coerce', + cache=cache) + expected = Series([np.nan, Timestamp('20190813'), np.nan, np.nan]) + tm.assert_series_equal(result, expected) + # coercion # GH 7930 s = Series([20121231, 20141231, 99991231]) From 832b2ee7bb32c7298f7c4c857ba613e4bb2639e5 Mon Sep 17 00:00:00 2001 From: Nathalie Rud Date: Wed, 29 May 2019 13:12:18 +0100 Subject: [PATCH 2/5] TST: add 2 tests for to_datetime() for Series with invalid values and NaNs (#25512) --- pandas/tests/indexes/datetimes/test_tools.py | 22 ++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index e0d85c2685a43..76e53a72e9420 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -84,18 +84,32 @@ def test_to_datetime_format_YYYYMMDD(self, cache): assert_series_equal(result, expected) # GH 25512 - # strings with invalid date values, errors=coerce + # NaN before strings with invalid date values, errors=coerce + s = Series(['19801222', np.nan, '20010012', '10019999']) + result = pd.to_datetime(s, format='%Y%m%d', errors='coerce', + cache=cache) + expected = Series([Timestamp('19801222'), np.nan, np.nan, np.nan]) + tm.assert_series_equal(result, expected) + + # NaN after strings with invalid date values, errors=coerce s = Series(['19801222', '20010012', '10019999', np.nan]) result = pd.to_datetime(s, format='%Y%m%d', errors='coerce', cache=cache) expected = Series([Timestamp('19801222'), np.nan, np.nan, np.nan]) tm.assert_series_equal(result, expected) - # integers with invalid date values, errors=coerce - s = Series([20010012, 20190813, 20019999, np.nan]) + # NaN before integers with invalid date values, errors=coerce + s = Series([20190813, np.nan, 
20010012, 20019999]) + result = pd.to_datetime(s, format='%Y%m%d', errors='coerce', + cache=cache) + expected = Series([Timestamp('20190813'), np.nan, np.nan, np.nan]) + tm.assert_series_equal(result, expected) + + # NaN after integers with invalid date values, errors=coerce + s = Series([20190813, 20010012, np.nan, 20019999]) result = pd.to_datetime(s, format='%Y%m%d', errors='coerce', cache=cache) - expected = Series([np.nan, Timestamp('20190813'), np.nan, np.nan]) + expected = Series([Timestamp('20190813'), np.nan, np.nan, np.nan]) tm.assert_series_equal(result, expected) # coercion From e78b8a4959fda5a0ee884da29e1323558add8eba Mon Sep 17 00:00:00 2001 From: Nathalie Rud Date: Wed, 29 May 2019 16:06:57 +0100 Subject: [PATCH 3/5] TST: create separate test test_to_datetime_format_YYYYMMDD_overflow() for #25512 --- pandas/tests/indexes/datetimes/test_tools.py | 38 ++++++++++---------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 76e53a72e9420..6521591460680 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -83,48 +83,50 @@ def test_to_datetime_format_YYYYMMDD(self, cache): result = to_datetime(s, format='%Y%m%d', cache=cache) assert_series_equal(result, expected) + # coercion + # GH 7930 + s = Series([20121231, 20141231, 99991231]) + result = pd.to_datetime(s, format='%Y%m%d', errors='ignore', + cache=cache) + expected = Series([datetime(2012, 12, 31), + datetime(2014, 12, 31), datetime(9999, 12, 31)], + dtype=object) + tm.assert_series_equal(result, expected) + + result = pd.to_datetime(s, format='%Y%m%d', errors='coerce', + cache=cache) + expected = Series(['20121231', '20141231', 'NaT'], dtype='M8[ns]') + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('cache', [True, False]) + def test_to_datetime_format_YYYYMMDD_overflow(self, cache): # GH 25512 # NaN before 
strings with invalid date values, errors=coerce s = Series(['19801222', np.nan, '20010012', '10019999']) result = pd.to_datetime(s, format='%Y%m%d', errors='coerce', cache=cache) expected = Series([Timestamp('19801222'), np.nan, np.nan, np.nan]) - tm.assert_series_equal(result, expected) + assert_series_equal(result, expected) # NaN after strings with invalid date values, errors=coerce s = Series(['19801222', '20010012', '10019999', np.nan]) result = pd.to_datetime(s, format='%Y%m%d', errors='coerce', cache=cache) expected = Series([Timestamp('19801222'), np.nan, np.nan, np.nan]) - tm.assert_series_equal(result, expected) + assert_series_equal(result, expected) # NaN before integers with invalid date values, errors=coerce s = Series([20190813, np.nan, 20010012, 20019999]) result = pd.to_datetime(s, format='%Y%m%d', errors='coerce', cache=cache) expected = Series([Timestamp('20190813'), np.nan, np.nan, np.nan]) - tm.assert_series_equal(result, expected) + assert_series_equal(result, expected) # NaN after integers with invalid date values, errors=coerce s = Series([20190813, 20010012, np.nan, 20019999]) result = pd.to_datetime(s, format='%Y%m%d', errors='coerce', cache=cache) expected = Series([Timestamp('20190813'), np.nan, np.nan, np.nan]) - tm.assert_series_equal(result, expected) - - # coercion - # GH 7930 - s = Series([20121231, 20141231, 99991231]) - result = pd.to_datetime(s, format='%Y%m%d', errors='ignore', - cache=cache) - expected = Series([datetime(2012, 12, 31), - datetime(2014, 12, 31), datetime(9999, 12, 31)], - dtype=object) - tm.assert_series_equal(result, expected) - - result = pd.to_datetime(s, format='%Y%m%d', errors='coerce', - cache=cache) - expected = Series(['20121231', '20141231', 'NaT'], dtype='M8[ns]') assert_series_equal(result, expected) @pytest.mark.parametrize('cache', [True, False]) From 897dd8bb42a7a16855e7918d71e3230e240f1cd1 Mon Sep 17 00:00:00 2001 From: Nathalie Rud Date: Wed, 29 May 2019 16:29:01 +0100 Subject: [PATCH 4/5] DOC: 
add whatsnew for #25512 --- doc/source/whatsnew/v0.25.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 32faf7115f0fd..e81bc57317c89 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -359,6 +359,7 @@ Datetimelike - Bug in :class:`Series` and :class:`DataFrame` repr where ``np.datetime64('NaT')`` and ``np.timedelta64('NaT')`` with ``dtype=object`` would be represented as ``NaN`` (:issue:`25445`) - Bug in :func:`to_datetime` which does not replace the invalid argument with ``NaT`` when error is set to coerce (:issue:`26122`) - Bug in adding :class:`DateOffset` with nonzero month to :class:`DatetimeIndex` would raise ``ValueError`` (:issue:`26258`) +- Bug in :func:`to_datetime` which raises unhandled ``OverflowError`` when called with a mix of invalid dates and ``NaN`` values with ``format='%Y%m%d'`` and ``errors='coerce'`` (:issue:`25512`) Timedelta ^^^^^^^^^ From e32641e471d7ef64c7fa239314ded0337dc5e196 Mon Sep 17 00:00:00 2001 From: Nathalie Rud Date: Thu, 30 May 2019 13:48:22 +0100 Subject: [PATCH 5/5] TST: parametrize test for GH #25512 --- pandas/tests/indexes/datetimes/test_tools.py | 44 +++++++------------- 1 file changed, 16 insertions(+), 28 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 6521591460680..c507c31ee54dd 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -98,35 +98,23 @@ def test_to_datetime_format_YYYYMMDD(self, cache): expected = Series(['20121231', '20141231', 'NaT'], dtype='M8[ns]') tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('cache', [True, False]) - def test_to_datetime_format_YYYYMMDD_overflow(self, cache): + @pytest.mark.parametrize("input_s, expected", [ + # NaN before strings with invalid date values + [Series(['19801222', np.nan, '20010012', '10019999']), + 
Series([Timestamp('19801222'), np.nan, np.nan, np.nan])], + # NaN after strings with invalid date values + [Series(['19801222', '20010012', '10019999', np.nan]), + Series([Timestamp('19801222'), np.nan, np.nan, np.nan])], + # NaN before integers with invalid date values + [Series([20190813, np.nan, 20010012, 20019999]), + Series([Timestamp('20190813'), np.nan, np.nan, np.nan])], + # NaN after integers with invalid date values + [Series([20190813, 20010012, np.nan, 20019999]), + Series([Timestamp('20190813'), np.nan, np.nan, np.nan])]]) + def test_to_datetime_format_YYYYMMDD_overflow(self, input_s, expected): # GH 25512 - # NaN before strings with invalid date values, errors=coerce - s = Series(['19801222', np.nan, '20010012', '10019999']) - result = pd.to_datetime(s, format='%Y%m%d', errors='coerce', - cache=cache) - expected = Series([Timestamp('19801222'), np.nan, np.nan, np.nan]) - assert_series_equal(result, expected) - - # NaN after strings with invalid date values, errors=coerce - s = Series(['19801222', '20010012', '10019999', np.nan]) - result = pd.to_datetime(s, format='%Y%m%d', errors='coerce', - cache=cache) - expected = Series([Timestamp('19801222'), np.nan, np.nan, np.nan]) - assert_series_equal(result, expected) - - # NaN before integers with invalid date values, errors=coerce - s = Series([20190813, np.nan, 20010012, 20019999]) - result = pd.to_datetime(s, format='%Y%m%d', errors='coerce', - cache=cache) - expected = Series([Timestamp('20190813'), np.nan, np.nan, np.nan]) - assert_series_equal(result, expected) - - # NaN after integers with invalid date values, errors=coerce - s = Series([20190813, 20010012, np.nan, 20019999]) - result = pd.to_datetime(s, format='%Y%m%d', errors='coerce', - cache=cache) - expected = Series([Timestamp('20190813'), np.nan, np.nan, np.nan]) + # format='%Y%m%d', errors='coerce' + result = pd.to_datetime(input_s, format='%Y%m%d', errors='coerce') assert_series_equal(result, expected) @pytest.mark.parametrize('cache', 
[True, False])