diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 32faf7115f0fd..e81bc57317c89 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -359,6 +359,7 @@ Datetimelike - Bug in :class:`Series` and :class:`DataFrame` repr where ``np.datetime64('NaT')`` and ``np.timedelta64('NaT')`` with ``dtype=object`` would be represented as ``NaN`` (:issue:`25445`) - Bug in :func:`to_datetime` which does not replace the invalid argument with ``NaT`` when error is set to coerce (:issue:`26122`) - Bug in adding :class:`DateOffset` with nonzero month to :class:`DatetimeIndex` would raise ``ValueError`` (:issue:`26258`) +- Bug in :func:`to_datetime` which raises unhandled ``OverflowError`` when called with a mix of invalid dates and ``NaN`` values with ``format='%Y%m%d'`` and ``errors='coerce'`` (:issue:`25512`) Timedelta ^^^^^^^^^ diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 817d539d4ad6f..a39b65ab72aa8 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -775,21 +775,21 @@ def calc_with_mask(carg, mask): # try intlike / strings that are ints try: return calc(arg.astype(np.int64)) - except ValueError: + except (ValueError, OverflowError): pass # a float with actual np.nan try: carg = arg.astype(np.float64) return calc_with_mask(carg, notna(carg)) - except ValueError: + except (ValueError, OverflowError): pass # string with NaN-like try: mask = ~algorithms.isin(arg, list(tslib.nat_strings)) return calc_with_mask(arg, mask) - except ValueError: + except (ValueError, OverflowError): pass return None diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index d62d8d1276fec..c507c31ee54dd 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -96,6 +96,25 @@ def test_to_datetime_format_YYYYMMDD(self, cache): result = pd.to_datetime(s, format='%Y%m%d', 
errors='coerce', cache=cache) expected = Series(['20121231', '20141231', 'NaT'], dtype='M8[ns]') + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("input_s, expected", [ + # NaN before strings with invalid date values + [Series(['19801222', np.nan, '20010012', '10019999']), + Series([Timestamp('19801222'), np.nan, np.nan, np.nan])], + # NaN after strings with invalid date values + [Series(['19801222', '20010012', '10019999', np.nan]), + Series([Timestamp('19801222'), np.nan, np.nan, np.nan])], + # NaN before integers with invalid date values + [Series([20190813, np.nan, 20010012, 20019999]), + Series([Timestamp('20190813'), np.nan, np.nan, np.nan])], + # NaN after integers with invalid date values + [Series([20190813, 20010012, np.nan, 20019999]), + Series([Timestamp('20190813'), np.nan, np.nan, np.nan])]]) + def test_to_datetime_format_YYYYMMDD_overflow(self, input_s, expected): + # GH 25512 + # format='%Y%m%d', errors='coerce' + result = pd.to_datetime(input_s, format='%Y%m%d', errors='coerce') assert_series_equal(result, expected) @pytest.mark.parametrize('cache', [True, False])