From 3490468e43dd98d8cb6a957d0561a61307644c15 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Sun, 18 Dec 2022 19:12:12 +0000 Subject: [PATCH 1/8] share paths and fix bugs --- doc/source/whatsnew/v2.0.0.rst | 8 ++ pandas/_libs/tslib.pyi | 3 - pandas/_libs/tslib.pyx | 64 +--------- pandas/_libs/tslibs/parsing.pyx | 2 +- pandas/_libs/tslibs/strptime.pxd | 4 + pandas/_libs/tslibs/strptime.pyx | 87 ++++++++++++-- pandas/core/arrays/datetimes.py | 7 -- pandas/core/tools/datetimes.py | 65 +--------- pandas/tests/tools/test_to_datetime.py | 159 ++++++++++++++++++++++--- pandas/tests/tslibs/test_parsing.py | 18 ++- 10 files changed, 248 insertions(+), 169 deletions(-) create mode 100644 pandas/_libs/tslibs/strptime.pxd diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index e93dcebf20e3e..5a43b74a1830c 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -779,6 +779,7 @@ Performance improvements - Performance improvement in :func:`merge` when not merging on the index - the new index will now be :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49478`) - Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`) - Performance improvement in :func:`read_html` when there are multiple tables (:issue:`49929`) +- Performance improvement in :func:`to_datetime` when using ``'%Y%m%d'`` format (:issue:`17410`) .. --------------------------------------------------------------------------- .. _whatsnew_200.bug_fixes: @@ -808,6 +809,13 @@ Datetimelike - Bug in :meth:`Timestamp.round` when the ``freq`` argument has zero-duration (e.g. "0ns") returning incorrect results instead of raising (:issue:`49737`) - Bug in :func:`to_datetime` was not raising ``ValueError`` when invalid format was passed and ``errors`` was ``'ignore'`` or ``'coerce'`` (:issue:`50266`) - Bug in :class:`DateOffset` was throwing ``TypeError`` when constructing with milliseconds and another super-daily argument (:issue:`49897`) +- Bug in :func:`to_datetime` was not raising ``ValueError`` when parsing string with decimal date with format ``'%Y%m%d'`` (:issue:`50051`) +- Bug in :func:`to_datetime` was not converting ``None`` to ``NaT`` when parsing mixed-offset date strings with ISO8601 format (:issue:`50071`) +- Bug in :func:`to_datetime` was not returning input when parsing out-of-bounds date string with ``errors='ignore'`` and ``format='%Y%m%d'`` (:issue:`14487`) +- Bug in :func:`to_datetime` was converting timezone-naive ``datetime.datetime`` to timezone-aware when parsing with timezone-aware strings, ISO8601 format, and ``utc=False`` (:issue:`50254`) +- Bug in :func:`to_datetime` was throwing ``ValueError`` when parsing dates with ISO8601 format where some values were not zero-padded (:issue:`21422`) +- Bug in :func:`to_datetime` was giving incorrect results when using ``format='%Y%m%d'`` and ``errors='ignore'`` (:issue:`26493`) +- Bug in :func:`to_datetime` was failing to parse date strings ``'today'`` and ``'now'`` if ``format`` was not ISO8601 (:issue:`50359`) - Timedelta diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi index ab94c4d59c5fc..9819b5173db56 100644 --- a/pandas/_libs/tslib.pyi +++ b/pandas/_libs/tslib.pyi @@ -23,9 +23,6 @@ def array_to_datetime( dayfirst: bool = ..., yearfirst: bool = ..., utc: bool = ..., - require_iso8601: bool = ..., - format: str | None = ..., - exact: bool = ..., ) -> tuple[np.ndarray, tzinfo | None]: ... # returned ndarray may be object dtype or datetime64[ns] diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 976a53e9117de..caccc873fcf80 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -38,6 +38,7 @@ from pandas._libs.tslibs.np_datetime cimport ( pydatetime_to_dt64, string_to_dts, ) +from pandas._libs.tslibs.strptime cimport parse_today_now from pandas._libs.util cimport ( is_datetime64_object, is_float_object, @@ -401,9 +402,6 @@ cpdef array_to_datetime( bint dayfirst=False, bint yearfirst=False, bint utc=False, - bint require_iso8601=False, - format: str | None=None, - bint exact=True, ): """ Converts a 1D array of date-like values to a numpy array of either: @@ -430,8 +428,6 @@ cpdef array_to_datetime( yearfirst parsing behavior when encountering datetime strings utc : bool, default False indicator whether the dates should be UTC - require_iso8601 : bool, default False - indicator whether the datetime string should be iso8601 Returns ------- @@ -502,16 +498,6 @@ cpdef array_to_datetime( iresult[i] = get_datetime64_nanos(val, NPY_FR_ns) elif is_integer_object(val) or is_float_object(val): - if require_iso8601: - if is_coerce: - iresult[i] = NPY_NAT - continue - elif is_raise: - raise ValueError( - f"time data \"{val}\" at position {i} doesn't " - f"match format \"{format}\"" - ) - return values, tz_out # these must be ns unit by-definition seen_integer = True @@ -542,25 +528,13 @@ cpdef array_to_datetime( string_to_dts_failed = string_to_dts( val, &dts, &out_bestunit, &out_local, - &out_tzoffset, False, format, exact + &out_tzoffset, False, None, False ) if string_to_dts_failed: # An error at this point is a _parsing_ error # specifically _not_ OutOfBoundsDatetime - if _parse_today_now(val, &iresult[i], utc): + if parse_today_now(val, &iresult[i], utc): continue - elif require_iso8601: - # if requiring iso8601 strings, skip trying - # other formats - if is_coerce: - iresult[i] = NPY_NAT - continue - elif is_raise: - raise ValueError( - f"time data \"{val}\" at position {i} doesn't " - f"match format \"{format}\"" - ) - return values, tz_out try: py_dt = parse_datetime_string(val, @@ -623,18 +597,6 @@ cpdef array_to_datetime( if is_coerce: iresult[i] = NPY_NAT continue - elif require_iso8601 and isinstance(val, str): - # GH#19382 for just-barely-OutOfBounds falling back to - # dateutil parser will return incorrect result because - # it will ignore nanoseconds - if is_raise: - - # Still raise OutOfBoundsDatetime, - # as error message is informative. - raise - - assert is_ignore - return values, tz_out raise except OutOfBoundsDatetime: @@ -793,26 +755,6 @@ cdef _array_to_datetime_object( return oresult, None -cdef bint _parse_today_now(str val, int64_t* iresult, bint utc): - # We delay this check for as long as possible - # because it catches relatively rare cases - - # Multiply by 1000 to convert to nanos, since these methods naturally have - # microsecond resolution - if val == "now": - if utc: - iresult[0] = Timestamp.utcnow().value * 1000 - else: - # GH#18705 make sure to_datetime("now") matches Timestamp("now") - # Note using Timestamp.now() is faster than Timestamp("now") - iresult[0] = Timestamp.now().value * 1000 - return True - elif val == "today": - iresult[0] = Timestamp.today().value * 1000 - return True - return False - - def array_to_datetime_with_tz(ndarray values, tzinfo tz): """ Vectorized analogue to pd.Timestamp(value, tz=tz) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 26317da62c8d9..9d152265a1a4b 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -825,7 +825,7 @@ def format_is_iso(f: str) -> bint: but must be consistent. Leading 0s in dates and times are optional. """ iso_template = "%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}".format - excluded_formats = ["%Y%m%d", "%Y%m", "%Y"] + excluded_formats = ["%Y%m"] for date_sep in [" ", "/", "\\", "-", ".", ""]: for time_sep in [" ", "T"]: diff --git a/pandas/_libs/tslibs/strptime.pxd b/pandas/_libs/tslibs/strptime.pxd new file mode 100644 index 0000000000000..175195d4362e4 --- /dev/null +++ b/pandas/_libs/tslibs/strptime.pxd @@ -0,0 +1,4 @@ +from numpy cimport int64_t + + +cdef bint parse_today_now(str val, int64_t* iresult, bint utc) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index c1bc5fd0910f8..045b07e53f070 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -34,12 +34,14 @@ from pandas._libs.tslibs.nattype cimport ( c_nat_strings as nat_strings, ) from pandas._libs.tslibs.np_datetime cimport ( + NPY_DATETIMEUNIT, NPY_FR_ns, check_dts_bounds, npy_datetimestruct, npy_datetimestruct_to_datetime, pydate_to_dt64, pydatetime_to_dt64, + string_to_dts, ) from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.timestamps cimport _Timestamp @@ -48,9 +50,28 @@ from pandas._libs.util cimport ( is_float_object, is_integer_object, ) +from pandas._libs.tslibs.timestamps import Timestamp cnp.import_array() +cdef bint parse_today_now(str val, int64_t* iresult, bint utc): + # We delay this check for as long as possible + # because it catches relatively rare cases + + # Multiply by 1000 to convert to nanos, since these methods naturally have + # microsecond resolution + if val == "now": + if utc: + iresult[0] = Timestamp.utcnow().value * 1000 + else: + # GH#18705 make sure to_datetime("now") matches Timestamp("now") + # Note using Timestamp.now() is faster than Timestamp("now") + iresult[0] = Timestamp.now().value * 1000 + return True + elif val == "today": + iresult[0] = Timestamp.today().value * 1000 + return True + return False cdef dict _parse_code_table = {"y": 0, "Y": 1, @@ -94,6 +115,7 @@ def array_strptime( exact : matches must be exact if True, search if False errors : string specifying error handling, {'raise', 'ignore', 'coerce'} """ + from pandas._libs.tslibs.parsing import format_is_iso cdef: Py_ssize_t i, n = len(values) @@ -111,6 +133,9 @@ def array_strptime( bint found_naive = False bint found_tz = False tzinfo tz_out = None + bint iso_format = fmt is not None and format_is_iso(fmt) + NPY_DATETIMEUNIT out_bestunit + int out_local = 0, out_tzoffset = 0 assert is_raise or is_ignore or is_coerce @@ -232,17 +257,57 @@ def array_strptime( else: val = str(val) - # exact matching - if exact: - found = format_regex.match(val) - if not found: - raise ValueError(f"time data \"{val}\" at position {i} doesn't " - f"match format \"{fmt}\"") - if len(val) != found.end(): - raise ValueError( - f"unconverted data remains at position {i}: " - f'"{val[found.end():]}"' - ) + if iso_format: + string_to_dts_failed = string_to_dts( + val, &dts, &out_bestunit, &out_local, + &out_tzoffset, False, fmt, exact + ) + if not string_to_dts_failed: + # No error reported by string_to_dts, pick back up + # where we left off + value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) + if out_local == 1: + # Store the out_tzoffset in seconds + # since we store the total_seconds of + # dateutil.tz.tzoffset objects + tz = timezone(timedelta(minutes=out_tzoffset)) + result_timezone[i] = tz + out_local = 0 + out_tzoffset = 0 + iresult[i] = value + try: + check_dts_bounds(&dts) + except ValueError: + if is_coerce: + iresult[i] = NPY_NAT + continue + raise + continue + + if parse_today_now(val, &iresult[i], utc): + continue + + # Some ISO formats can't be parsed by string_to_dts + # For example, 6-digit YYYYMD. So, if there's an error, + # try the string-matching code below. + + # exact matching + if exact: + found = format_regex.match(val) + if not found: + if is_coerce: + iresult[i] = NPY_NAT + continue + raise ValueError(f"time data \"{val}\" at position {i} doesn't " + f"match format \"{fmt}\"") + if len(val) != found.end(): + if is_coerce: + iresult[i] = NPY_NAT + continue + raise ValueError( + f"unconverted data remains at position {i}: " + f'"{val[found.end():]}"' + ) # search else: diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 0162f54bf5225..608b38765621b 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2118,10 +2118,7 @@ def objects_to_datetime64ns( yearfirst, utc: bool = False, errors: DateTimeErrorChoices = "raise", - require_iso8601: bool = False, allow_object: bool = False, - format: str | None = None, - exact: bool = True, ): """ Convert data to array of timestamps. @@ -2134,7 +2131,6 @@ def objects_to_datetime64ns( utc : bool, default False Whether to convert/localize timestamps to UTC. errors : {'raise', 'ignore', 'coerce'} - require_iso8601 : bool, default False allow_object : bool Whether to return an object-dtype ndarray instead of raising if the data contains more than one timezone. @@ -2165,9 +2161,6 @@ def objects_to_datetime64ns( utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, - require_iso8601=require_iso8601, - format=format, - exact=exact, ) result = result.reshape(data.shape, order=order) except OverflowError as err: diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index a97a866a8406e..49661bd86b7cc 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -36,7 +36,6 @@ from pandas._libs.tslibs.conversion import precision_from_unit from pandas._libs.tslibs.parsing import ( DateParseError, - format_is_iso, guess_datetime_format, ) from pandas._libs.tslibs.strptime import array_strptime @@ -419,7 +418,6 @@ def _convert_listlike_datetimes( # warn if passing timedelta64, raise for PeriodDtype # NB: this must come after unit transformation - orig_arg = arg try: arg, _ = maybe_convert_dtype(arg, copy=False, tz=libtimezones.maybe_get_tz(tz)) except TypeError: @@ -432,24 +430,12 @@ def _convert_listlike_datetimes( raise arg = ensure_object(arg) - require_iso8601 = False if format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) - # There is a special fast-path for iso8601 formatted datetime strings - require_iso8601 = format is not None and format_is_iso(format) - - if format is not None and not require_iso8601: - return _to_datetime_with_format( - arg, - orig_arg, - name, - utc, - format, - exact, - errors, - ) + if format is not None: + return _array_strptime_with_fallback(arg, name, utc, format, exact, errors) result, tz_parsed = objects_to_datetime64ns( arg, @@ -457,10 +443,7 @@ def _convert_listlike_datetimes( yearfirst=yearfirst, utc=utc, errors=errors, - require_iso8601=require_iso8601, allow_object=True, - format=format, - exact=exact, ) if tz_parsed is not None: @@ -490,40 +473,6 @@ def _array_strptime_with_fallback( return _box_as_indexlike(result, utc=utc, name=name) -def _to_datetime_with_format( - arg, - orig_arg, - name, - utc: bool, - fmt: str, - exact: bool, - errors: str, -) -> Index: - """ - Try parsing with the given format. - """ - result = None - - # shortcut formatting here - if fmt == "%Y%m%d": - # pass orig_arg as float-dtype may have been converted to - # datetime64[ns] - orig_arg = ensure_object(orig_arg) - try: - # may return None without raising - result = _attempt_YYYYMMDD(orig_arg, errors=errors) - except (ValueError, TypeError, OutOfBoundsDatetime) as err: - raise ValueError( - "cannot convert the input to '%Y%m%d' date format" - ) from err - if result is not None: - return _box_as_indexlike(result, utc=utc, name=name) - - # fallback - res = _array_strptime_with_fallback(arg, name, utc, fmt, exact, errors) - return res - - def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: """ to_datetime specalized to the case where a 'unit' is passed. @@ -978,7 +927,7 @@ def to_datetime( in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`. >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore') - datetime.datetime(1300, 1, 1, 0, 0) + '13000101' >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') NaT @@ -1011,14 +960,12 @@ def to_datetime( Index([2020-10-25 02:00:00+02:00, 2020-10-25 04:00:00+01:00], dtype='object') - - A mix of timezone-aware and timezone-naive inputs is converted to - a timezone-aware :class:`DatetimeIndex` if the offsets of the timezone-aware - are constant: + - A mix of timezone-aware and timezone-naive inputs is also converted to + a simple :class:`Index` containing :class:`datetime.datetime` objects: >>> from datetime import datetime >>> pd.to_datetime(["2020-01-01 01:00:00-01:00", datetime(2020, 1, 1, 3, 0)]) - DatetimeIndex(['2020-01-01 01:00:00-01:00', '2020-01-01 02:00:00-01:00'], - dtype='datetime64[ns, UTC-01:00]', freq=None) + Index([2020-01-01 01:00:00-01:00, 2020-01-01 03:00:00], dtype='object') | diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 927388408cf27..4ff309c223fb2 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -132,8 +132,9 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache): # string with NaT ser2 = ser.apply(str) ser2[2] = "nat" - result = to_datetime(ser2, format="%Y%m%d", cache=cache) - tm.assert_series_equal(result, expected) + with pytest.raises(ValueError, match="unconverted data remains: .0"): + # https://github.com/pandas-dev/pandas/issues/50051 + to_datetime(ser2, format="%Y%m%d", cache=cache) def test_to_datetime_format_YYYYMM_with_nat(self, cache): # https://github.com/pandas-dev/pandas/issues/50237 @@ -148,15 +149,26 @@ def test_to_datetime_format_YYYYMM_with_nat(self, cache): def test_to_datetime_format_YYYYMMDD_ignore(self, cache): # coercion - # GH 7930 + # GH 7930, GH 14487 ser = Series([20121231, 20141231, 99991231]) result = to_datetime(ser, format="%Y%m%d", errors="ignore", cache=cache) expected = Series( - [datetime(2012, 12, 31), datetime(2014, 12, 31), datetime(9999, 12, 31)], + [20121231, 20141231, 99991231], dtype=object, ) tm.assert_series_equal(result, expected) + def test_to_datetime_format_YYYYMMDD_ignore_with_outofbounds(self, cache): + # https://github.com/pandas-dev/pandas/issues/26493 + result = to_datetime( + ["15010101", "20150101", np.nan], + format="%Y%m%d", + errors="ignore", + cache=cache, + ) + expected = Index(["15010101", "20150101", np.nan]) + tm.assert_index_equal(result, expected) + def test_to_datetime_format_YYYYMMDD_coercion(self, cache): # coercion # GH 7930 @@ -550,6 +562,26 @@ def test_to_datetime_mixed_date_and_string(self, format): ), id="all tz-aware, mixed offsets, with utc", ), + pytest.param( + False, + ["2000-01-01 01:00:00", "2000-01-01 02:00:00+00:00"], + Index( + [ + Timestamp("2000-01-01 01:00:00"), + Timestamp("2000-01-01 02:00:00+0000", tz="UTC"), + ], + ), + id="tz-aware string, naive pydatetime, without utc", + ), + pytest.param( + True, + ["2000-01-01 01:00:00", "2000-01-01 02:00:00+00:00"], + DatetimeIndex( + ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"], + dtype="datetime64[ns, UTC]", + ), + id="tz-aware string, naive pydatetime, with utc", + ), ], ) @pytest.mark.parametrize( @@ -560,6 +592,7 @@ def test_to_datetime_mixed_datetime_and_string_with_format( self, fmt, utc, args, expected, constructor ): # https://github.com/pandas-dev/pandas/issues/49298 + # https://github.com/pandas-dev/pandas/issues/50254 # note: ISO8601 formats go down a fastpath, so we need to check both # a ISO8601 format and a non-ISO8601 one ts1 = constructor(args[0]) @@ -567,6 +600,62 @@ def test_to_datetime_mixed_datetime_and_string_with_format( result = to_datetime([ts1, ts2], format=fmt, utc=utc) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( + "fmt, utc, expected", + [ + pytest.param( + "%Y-%m-%d %H:%M:%S%z", + True, + DatetimeIndex( + ["2000-01-01 08:00:00+00:00", "2000-01-02 00:00:00+00:00", "NaT"], + dtype="datetime64[ns, UTC]", + ), + id="ISO8601, UTC", + ), + pytest.param( + "%Y-%m-%d %H:%M:%S%z", + False, + Index( + [ + Timestamp("2000-01-01 09:00:00+0100", tz="UTC+01:00"), + Timestamp("2000-01-02 02:00:00+0200", tz="UTC+02:00"), + NaT, + ] + ), + id="ISO8601, non-UTC", + ), + pytest.param( + "%Y-%d-%m %H:%M:%S%z", + True, + DatetimeIndex( + ["2000-01-01 08:00:00+00:00", "2000-02-01 00:00:00+00:00", "NaT"], + dtype="datetime64[ns, UTC]", + ), + id="non-ISO8601, UTC", + ), + pytest.param( + "%Y-%d-%m %H:%M:%S%z", + False, + Index( + [ + Timestamp("2000-01-01 09:00:00+0100", tz="UTC+01:00"), + Timestamp("2000-02-01 02:00:00+0200", tz="UTC+02:00"), + NaT, + ] + ), + id="non-ISO8601, non-UTC", + ), + ], + ) + def test_to_datetime_mixed_offsets_with_none(self, fmt, utc, expected): + # https://github.com/pandas-dev/pandas/issues/50071 + result = to_datetime( + ["2000-01-01 09:00:00+01:00", "2000-01-02 02:00:00+02:00", None], + format=fmt, + utc=utc, + ) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( "fmt", ["%Y-%d-%m %H:%M:%S%z", "%Y-%m-%d %H:%M:%S%z"], @@ -836,6 +925,29 @@ def test_to_datetime_today(self, tz): def test_to_datetime_today_now_unicode_bytes(self, arg): to_datetime([arg]) + @pytest.mark.parametrize( + "format, expected_ds", + [ + ("%Y-%m-%d %H:%M:%S%z", "2020-01-03"), + ("%Y-%d-%m %H:%M:%S%z", "2020-03-01"), + (None, "2020-01-03"), + ], + ) + @pytest.mark.parametrize( + "string, attribute", + [ + ("now", "utcnow"), + ("today", "today"), + ], + ) + def test_to_datetime_now_with_format(self, format, expected_ds, string, attribute): + # https://github.com/pandas-dev/pandas/issues/50359 + result = to_datetime(["2020-01-03 00:00:00Z", string], format=format, utc=True) + expected = DatetimeIndex( + [expected_ds, getattr(Timestamp, attribute)()], dtype="datetime64[ns, UTC]" + ) + assert (expected - result).max().total_seconds() < 1e-2 + @pytest.mark.parametrize( "dt", [np.datetime64("2000-01-01"), np.datetime64("2000-01-02")] ) @@ -1953,10 +2065,7 @@ def test_to_datetime_iso8601_fails(self, input, format, exact): # `format` is longer than the string, so this fails regardless of `exact` with pytest.raises( ValueError, - match=( - rf"time data \"{input}\" at position 0 doesn't match format " - rf"\"{format}\"" - ), + match=rf"time data '{input}' does not match format '{format}'", ): to_datetime(input, format=format, exact=exact) @@ -1975,10 +2084,7 @@ def test_to_datetime_iso8601_exact_fails(self, input, format): # `format` is shorter than the date string, so only fails with `exact=True` with pytest.raises( ValueError, - match=( - rf"time data \"{input}\" at position 0 doesn't match format " - rf"\"{format}\"" - ), + match="unconverted data remains: |does not match format", ): to_datetime(input, format=format) @@ -2014,10 +2120,7 @@ def test_to_datetime_iso8601_separator(self, input, format): # https://github.com/pandas-dev/pandas/issues/12649 with pytest.raises( ValueError, - match=( - rf"time data \"{input}\" at position 0 doesn\'t match format " - rf"\"{format}\"" - ), + match=rf"time data \'{input}\' does not match format '{format}'", ): to_datetime(input, format=format) @@ -2043,6 +2146,28 @@ def test_to_datetime_iso8601_valid(self, input, format): result = to_datetime(input, format=format) assert result == expected + @pytest.mark.parametrize( + "input, format", + [ + ("2020-1", "%Y-%m"), + ("2020-1-1", "%Y-%m-%d"), + ("2020-1-1 0", "%Y-%m-%d %H"), + ("2020-1-1T0", "%Y-%m-%dT%H"), + ("2020-1-1 0:0", "%Y-%m-%d %H:%M"), + ("2020-1-1T0:0", "%Y-%m-%dT%H:%M"), + ("2020-1-1 0:0:0", "%Y-%m-%d %H:%M:%S"), + ("2020-1-1T0:0:0", "%Y-%m-%dT%H:%M:%S"), + ("2020-1-1T0:0:0.000", "%Y-%m-%dT%H:%M:%S.%f"), + ("2020-1-1T0:0:0.000000", "%Y-%m-%dT%H:%M:%S.%f"), + ("2020-1-1T0:0:0.000000000", "%Y-%m-%dT%H:%M:%S.%f"), + ], + ) + def test_to_datetime_iso8601_non_padded(self, input, format): + # https://github.com/pandas-dev/pandas/issues/21422 + expected = Timestamp(2020, 1, 1) + result = to_datetime(input, format=format) + assert result == expected + @pytest.mark.parametrize( "input, format", [ @@ -2552,7 +2677,7 @@ def test_day_not_in_month_raise(self, cache): @pytest.mark.parametrize("arg", ["2015-02-29", "2015-02-32", "2015-04-31"]) def test_day_not_in_month_raise_value(self, cache, arg): - msg = f'time data "{arg}" at position 0 doesn\'t match format "%Y-%m-%d"' + msg = "day is out of range for month|unconverted data remains" with pytest.raises(ValueError, match=msg): to_datetime(arg, errors="raise", format="%Y-%m-%d", cache=cache) diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 7675305e27d22..d4f731206e885 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -154,27 +154,25 @@ def test_parsers_month_freq(date_str, expected): ("2011-12-30T00:00:00", "%Y-%m-%dT%H:%M:%S"), ("2011-12-30T00:00:00UTC", "%Y-%m-%dT%H:%M:%S%Z"), ("2011-12-30T00:00:00Z", "%Y-%m-%dT%H:%M:%S%z"), - # The +9 format for offsets is supported by dateutil, - # but don't round-trip, see https://github.com/pandas-dev/pandas/issues/48921 - ("2011-12-30T00:00:00+9", None), - ("2011-12-30T00:00:00+09", None), + ("2011-12-30T00:00:00+9", "%Y-%m-%dT%H:%M:%S%z"), + ("2011-12-30T00:00:00+09", "%Y-%m-%dT%H:%M:%S%z"), ("2011-12-30T00:00:00+090", None), ("2011-12-30T00:00:00+0900", "%Y-%m-%dT%H:%M:%S%z"), ("2011-12-30T00:00:00-0900", "%Y-%m-%dT%H:%M:%S%z"), ("2011-12-30T00:00:00+09:00", "%Y-%m-%dT%H:%M:%S%z"), ("2011-12-30T00:00:00+09:000", None), - ("2011-12-30T00:00:00+9:0", None), + ("2011-12-30T00:00:00+9:0", "%Y-%m-%dT%H:%M:%S%z"), ("2011-12-30T00:00:00+09:", None), ("2011-12-30T00:00:00.000000UTC", "%Y-%m-%dT%H:%M:%S.%f%Z"), ("2011-12-30T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%f%z"), - ("2011-12-30T00:00:00.000000+9", None), - ("2011-12-30T00:00:00.000000+09", None), + ("2011-12-30T00:00:00.000000+9", "%Y-%m-%dT%H:%M:%S.%f%z"), + ("2011-12-30T00:00:00.000000+09", "%Y-%m-%dT%H:%M:%S.%f%z"), ("2011-12-30T00:00:00.000000+090", None), ("2011-12-30T00:00:00.000000+0900", "%Y-%m-%dT%H:%M:%S.%f%z"), ("2011-12-30T00:00:00.000000-0900", "%Y-%m-%dT%H:%M:%S.%f%z"), ("2011-12-30T00:00:00.000000+09:00", "%Y-%m-%dT%H:%M:%S.%f%z"), ("2011-12-30T00:00:00.000000+09:000", None), - ("2011-12-30T00:00:00.000000+9:0", None), + ("2011-12-30T00:00:00.000000+9:0", "%Y-%m-%dT%H:%M:%S.%f%z"), ("2011-12-30T00:00:00.000000+09:", None), ("2011-12-30 00:00:00.000000", "%Y-%m-%d %H:%M:%S.%f"), ("Tue 24 Aug 2021 01:30:48 AM", "%a %d %b %Y %H:%M:%S %p"), @@ -305,9 +303,9 @@ def test_parse_time_string_check_instance_type_raise_exception(): ("%Y-%m-%dT%H:%M:%S.%f", True), ("%Y-%m-%dT%H:%M:%S.%f%z", True), ("%Y-%m-%dT%H:%M:%S.%f%Z", False), - ("%Y%m%d", False), + ("%Y%m%d", True), ("%Y%m", False), - ("%Y", False), + ("%Y", True), ("%Y-%m-%d", True), ("%Y-%m", True), ], From 794f9e495bc3c7892079beb24d8943c0897b86ba Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 20 Dec 2022 15:49:30 +0000 Subject: [PATCH 2/8] move format_is_iso to strptime --- pandas/_libs/tslibs/parsing.pyi | 1 - pandas/_libs/tslibs/parsing.pyx | 20 -------------------- pandas/_libs/tslibs/strptime.pxd | 1 + pandas/_libs/tslibs/strptime.pyx | 22 ++++++++++++++++++++-- pandas/tests/tslibs/test_parsing.py | 7 +++++-- 5 files changed, 26 insertions(+), 25 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyi b/pandas/_libs/tslibs/parsing.pyi index c7244447edaf7..2e666249a76fc 100644 --- a/pandas/_libs/tslibs/parsing.pyi +++ b/pandas/_libs/tslibs/parsing.pyi @@ -40,7 +40,6 @@ def try_parse_datetime_components( minutes: npt.NDArray[np.object_], # object[:] seconds: npt.NDArray[np.object_], # object[:] ) -> npt.NDArray[np.object_]: ... -def format_is_iso(f: str) -> bool: ... def guess_datetime_format( dt_str, dayfirst: bool | None = ..., diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 9d152265a1a4b..aa95febfc9721 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -818,26 +818,6 @@ class _timelex: _DATEUTIL_LEXER_SPLIT = _timelex.split -def format_is_iso(f: str) -> bint: - """ - Does format match the iso8601 set that can be handled by the C parser? - Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different - but must be consistent. Leading 0s in dates and times are optional. - """ - iso_template = "%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}".format - excluded_formats = ["%Y%m"] - - for date_sep in [" ", "/", "\\", "-", ".", ""]: - for time_sep in [" ", "T"]: - for micro_or_tz in ["", "%z", ".%f", ".%f%z"]: - if (iso_template(date_sep=date_sep, - time_sep=time_sep, - micro_or_tz=micro_or_tz, - ).startswith(f) and f not in excluded_formats): - return True - return False - - def guess_datetime_format(dt_str: str, bint dayfirst=False) -> str | None: """ Guess the datetime format of a given datetime string. diff --git a/pandas/_libs/tslibs/strptime.pxd b/pandas/_libs/tslibs/strptime.pxd index 175195d4362e4..9eacc1ffb2fc0 100644 --- a/pandas/_libs/tslibs/strptime.pxd +++ b/pandas/_libs/tslibs/strptime.pxd @@ -1,4 +1,5 @@ from numpy cimport int64_t +cpdef bint format_is_iso(str f) cdef bint parse_today_now(str val, int64_t* iresult, bint utc) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 045b07e53f070..0f15aa91b9bd0 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -54,6 +54,26 @@ from pandas._libs.tslibs.timestamps import Timestamp cnp.import_array() +cpdef bint format_is_iso(f: str): + """ + Does format match the iso8601 set that can be handled by the C parser? + Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different + but must be consistent. Leading 0s in dates and times are optional. + """ + iso_template = "%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}".format + excluded_formats = ["%Y%m"] + + for date_sep in [" ", "/", "\\", "-", ".", ""]: + for time_sep in [" ", "T"]: + for micro_or_tz in ["", "%z", ".%f", ".%f%z"]: + if (iso_template(date_sep=date_sep, + time_sep=time_sep, + micro_or_tz=micro_or_tz, + ).startswith(f) and f not in excluded_formats): + return True + return False + + cdef bint parse_today_now(str val, int64_t* iresult, bint utc): # We delay this check for as long as possible # because it catches relatively rare cases @@ -115,8 +135,6 @@ def array_strptime( exact : matches must be exact if True, search if False errors : string specifying error handling, {'raise', 'ignore', 'coerce'} """ - from pandas._libs.tslibs.parsing import format_is_iso - cdef: Py_ssize_t i, n = len(values) npy_datetimestruct dts diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index d4f731206e885..6e64040d38830 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -8,7 +8,10 @@ import numpy as np import pytest -from pandas._libs.tslibs import parsing +from pandas._libs.tslibs import ( + parsing, + strptime, +) from pandas._libs.tslibs.parsing import parse_time_string import pandas.util._test_decorators as td @@ -312,7 +315,7 @@ def test_parse_time_string_check_instance_type_raise_exception(): ) def test_is_iso_format(fmt, expected): # see gh-41047 - result = parsing.format_is_iso(fmt) + result = strptime.format_is_iso(fmt) assert result == expected From 030bcb2f4ab2a5c9ea5f2e2da9bfd729dd12742f Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 20 Dec 2022 15:50:06 +0000 Subject: [PATCH 3/8] loosen bound in test --- pandas/tests/tools/test_to_datetime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 4ff309c223fb2..d5e91958c8ced 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -946,7 +946,7 @@ def test_to_datetime_now_with_format(self, format, expected_ds, string, attribut expected = DatetimeIndex( [expected_ds, getattr(Timestamp, attribute)()], dtype="datetime64[ns, UTC]" ) - assert (expected - result).max().total_seconds() < 1e-2 + assert (expected - result).max().total_seconds() < 1 @pytest.mark.parametrize( "dt", [np.datetime64("2000-01-01"), np.datetime64("2000-01-02")] From 38871c7d3e3390170a28efe4623ee7d3ae28b648 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 20 Dec 2022 16:19:34 +0000 Subject: [PATCH 4/8] keep format_is_iso as cdef, make def wrapper for test --- pandas/_libs/tslibs/strptime.pxd | 1 - pandas/_libs/tslibs/strptime.pyx | 7 ++++++- pandas/tests/tslibs/test_parsing.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslibs/strptime.pxd b/pandas/_libs/tslibs/strptime.pxd index 9eacc1ffb2fc0..175195d4362e4 100644 --- a/pandas/_libs/tslibs/strptime.pxd +++ b/pandas/_libs/tslibs/strptime.pxd @@ -1,5 +1,4 @@ from numpy cimport int64_t -cpdef bint format_is_iso(str f) cdef bint parse_today_now(str val, int64_t* iresult, bint utc) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 0f15aa91b9bd0..a8dff891c8815 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -54,7 +54,7 @@ from pandas._libs.tslibs.timestamps import Timestamp cnp.import_array() -cpdef bint format_is_iso(f: str): +cdef bint format_is_iso(f: str): """ Does format match the iso8601 set that can be handled by the C parser? Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different @@ -74,6 +74,11 @@ cpdef bint format_is_iso(f: str): return False +def _test_format_is_iso(f: str) -> bool: + """Only used in testing.""" + return format_is_iso(f) + + cdef bint parse_today_now(str val, int64_t* iresult, bint utc): # We delay this check for as long as possible # because it catches relatively rare cases diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 6e64040d38830..558c802fd70f6 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -315,7 +315,7 @@ def test_parse_time_string_check_instance_type_raise_exception(): ) def test_is_iso_format(fmt, expected): # see gh-41047 - result = strptime.format_is_iso(fmt) + result = strptime._test_format_is_iso(fmt) assert result == expected From 762dea8a9631d5e903575d4f96e9e73e159c3497 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 20 Dec 2022 16:22:14 +0000 Subject: [PATCH 5/8] use fantastic f-strings --- pandas/_libs/tslibs/strptime.pyx | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index a8dff891c8815..fd8c953dcbb36 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -60,16 +60,13 @@ cdef bint format_is_iso(f: str): Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different but must be consistent. Leading 0s in dates and times are optional. """ - iso_template = "%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}".format excluded_formats = ["%Y%m"] for date_sep in [" ", "/", "\\", "-", ".", ""]: for time_sep in [" ", "T"]: for micro_or_tz in ["", "%z", ".%f", ".%f%z"]: - if (iso_template(date_sep=date_sep, - time_sep=time_sep, - micro_or_tz=micro_or_tz, - ).startswith(f) and f not in excluded_formats): + iso_fmt = f"%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}" + if iso_fmt.startswith(f) and f not in excluded_formats: return True return False From 3da6cebf2d7e45cb5becf311bb31bb79c7fa09c1 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 20 Dec 2022 20:14:42 +0000 Subject: [PATCH 6/8] fixup --- pandas/tests/tools/test_to_datetime.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index d5e91958c8ced..5fb6bb82b87b9 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2065,7 +2065,10 @@ def test_to_datetime_iso8601_fails(self, input, format, exact): # `format` is longer than the string, so this fails regardless of `exact` with pytest.raises( ValueError, - match=rf"time data '{input}' does not match format '{format}'", + match=( + rf"time data '{input}' does not match format " + rf"\'{format}\' \((match|search)\) at position 0" + ), ): to_datetime(input, format=format, exact=exact) @@ -2084,7 +2087,7 @@ def test_to_datetime_iso8601_exact_fails(self, input, format): # `format` is shorter than the date string, so only fails with `exact=True` with pytest.raises( ValueError, - match="unconverted data remains: |does not match format", + match="(unconverted data remains: |does not match format).* at position 0", ): to_datetime(input, format=format) @@ -2120,7 +2123,10 @@ def test_to_datetime_iso8601_separator(self, input, format): # https://github.com/pandas-dev/pandas/issues/12649 with pytest.raises( ValueError, - match=rf"time data \'{input}\' does not match format '{format}'", + match=( + rf"time data '{input}' does not match format " + rf"'{format}' \(match\) at position 0" + ), ): to_datetime(input, format=format) @@ -2677,7 +2683,7 @@ def test_day_not_in_month_raise(self, cache): @pytest.mark.parametrize("arg", ["2015-02-29", "2015-02-32", "2015-04-31"]) def test_day_not_in_month_raise_value(self, cache, arg): - msg = "day is out of range for month|unconverted data remains" + msg = "(day is out of range for month|unconverted data remains).* at position 0" with pytest.raises(ValueError, match=msg): to_datetime(arg, errors="raise", format="%Y-%m-%d", cache=cache) From 392d23906bf8430ae495f64a5d55971cdd80925b Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Wed, 28 Dec 2022 11:37:06 +0000 Subject: [PATCH 7/8] fixup tests --- pandas/_libs/tslibs/strptime.pyx | 83 +++++++++++--------------- pandas/tests/tools/test_to_datetime.py | 30 ++++++---- 2 files changed, 56 insertions(+), 57 deletions(-) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index fd8c953dcbb36..eb59dbc24528b 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -137,6 +137,7 @@ def array_strptime( exact : matches must be exact if True, search if False errors : string specifying error handling, {'raise', 'ignore', 'coerce'} """ + cdef: Py_ssize_t i, n = len(values) npy_datetimestruct dts @@ -277,57 +278,45 @@ def array_strptime( else: val = str(val) - if iso_format: - string_to_dts_failed = string_to_dts( - val, &dts, &out_bestunit, &out_local, - &out_tzoffset, False, fmt, exact - ) - if not string_to_dts_failed: - # No error reported by string_to_dts, pick back up - # where we left off - value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) - if out_local == 1: - # Store the out_tzoffset in seconds - # since we store the total_seconds of - # dateutil.tz.tzoffset objects - tz = timezone(timedelta(minutes=out_tzoffset)) - result_timezone[i] = tz - out_local = 0 - out_tzoffset = 0 - iresult[i] = value - try: + if iso_format: + string_to_dts_failed = string_to_dts( + val, &dts, &out_bestunit, &out_local, + &out_tzoffset, False, fmt, exact + ) + if not string_to_dts_failed: + # No error reported by string_to_dts, pick back up + # where we left off + value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts) + if out_local == 1: + # Store the out_tzoffset in seconds + # since we store the total_seconds of + # dateutil.tz.tzoffset objects + tz = timezone(timedelta(minutes=out_tzoffset)) + result_timezone[i] = tz + out_local = 0 + out_tzoffset = 0 + iresult[i] = value check_dts_bounds(&dts) - except ValueError: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise - continue + continue - if parse_today_now(val, &iresult[i], utc): - continue + if parse_today_now(val, &iresult[i], utc): + continue - # Some ISO formats can't be parsed by string_to_dts - # For example, 6-digit YYYYMD. So, if there's an error, - # try the string-matching code below. + # Some ISO formats can't be parsed by string_to_dts + # For example, 6-digit YYYYMD. So, if there's an error, + # try the string-matching code below. - # exact matching - if exact: - found = format_regex.match(val) - if not found: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise ValueError(f"time data \"{val}\" at position {i} doesn't " - f"match format \"{fmt}\"") - if len(val) != found.end(): - if is_coerce: - iresult[i] = NPY_NAT - continue - raise ValueError( - f"unconverted data remains at position {i}: " - f'"{val[found.end():]}"' - ) + # exact matching + if exact: + found = format_regex.match(val) + if not found: + raise ValueError(f"time data \"{val}\" at position {i} doesn't " + f"match format \"{fmt}\"") + if len(val) != found.end(): + raise ValueError( + f"unconverted data remains at position {i}: " + f'"{val[found.end():]}"' + ) # search else: diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 5fb6bb82b87b9..ca76c10cf552d 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -132,7 +132,9 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache): # string with NaT ser2 = ser.apply(str) ser2[2] = "nat" - with pytest.raises(ValueError, match="unconverted data remains: .0"): + with pytest.raises( + ValueError, match='unconverted data remains at position 0: ".0' + ): # https://github.com/pandas-dev/pandas/issues/50051 to_datetime(ser2, format="%Y%m%d", cache=cache) @@ -2066,8 +2068,8 @@ def test_to_datetime_iso8601_fails(self, input, format, exact): with pytest.raises( ValueError, match=( - rf"time data '{input}' does not match format " - rf"\'{format}\' \((match|search)\) at position 0" + rf"time data \"{input}\" at position 0 doesn't match format " + rf"\"{format}\"" ), ): to_datetime(input, format=format, exact=exact) @@ -2085,10 +2087,13 @@ def test_to_datetime_iso8601_fails(self, input, format, exact): def test_to_datetime_iso8601_exact_fails(self, input, format): # https://github.com/pandas-dev/pandas/issues/12649 # `format` is shorter than the date string, so only fails with `exact=True` - with pytest.raises( - ValueError, - match="(unconverted data remains: |does not match format).* at position 0", - ): + msg = "|".join( + [ + '^unconverted data remains at position 0: ".*"$', + '^time data "0" at position 0 doesn\'t match format ".*"$', + ] + ) + with pytest.raises(ValueError, match=msg): to_datetime(input, format=format) @pytest.mark.parametrize( @@ -2124,8 +2129,8 @@ def test_to_datetime_iso8601_separator(self, input, format): with pytest.raises( ValueError, match=( - rf"time data '{input}' does not match format " - rf"'{format}' \(match\) at position 0" + rf"time data \"{input}\" at position 0 doesn\'t match format " + rf"\"{format}\"" ), ): to_datetime(input, format=format) @@ -2683,7 +2688,12 @@ def test_day_not_in_month_raise(self, cache): @pytest.mark.parametrize("arg", ["2015-02-29", "2015-02-32", "2015-04-31"]) def test_day_not_in_month_raise_value(self, cache, arg): - msg = "(day is out of range for month|unconverted data remains).* at position 0" + msg = "|".join( + [ + "^day is out of range for month$", + '^unconverted data remains at position 0: "2"$', + ] + ) with pytest.raises(ValueError, match=msg): to_datetime(arg, errors="raise", format="%Y-%m-%d", cache=cache) From bb3cd4dd9ece92c94ffc4842db46396d1d0eda2b Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Wed, 28 Dec 2022 16:22:13 +0000 Subject: [PATCH 8/8] fixup post-merge --- pandas/tests/tools/test_to_datetime.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 445f4986299f0..a0667899840f1 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -132,7 +132,7 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache): ser2 = ser.apply(str) ser2[2] = "nat" with pytest.raises( - ValueError, match='unconverted data remains at position 0: ".0' + ValueError, match='unconverted data remains: ".0", at position 0' ): # https://github.com/pandas-dev/pandas/issues/50051 to_datetime(ser2, format="%Y%m%d", cache=cache) @@ -2118,12 +2118,15 @@ def test_to_datetime_iso8601_fails(self, input, format, exact): def test_to_datetime_iso8601_exact_fails(self, input, format): # https://github.com/pandas-dev/pandas/issues/12649 # `format` is shorter than the date string, so only fails with `exact=True` + msg = "|".join( + [ + '^unconverted data remains: ".*", at position 0$', + 'time data ".*" doesn\'t match format ".*", at position 0', + ] + ) with pytest.raises( ValueError, - match=( - rf"time data \"{input}\" doesn't match format " - rf"\"{format}\", at position 0" - ), + match=(msg), ): to_datetime(input, format=format) @@ -2723,8 +2726,7 @@ def test_day_not_in_month_raise(self, cache): ( "2015-02-29", "%Y-%m-%d", - '^time data "2015-02-29" doesn\'t match format "%Y-%m-%d", ' - "at position 0$", + "^day is out of range for month, at position 0$", ), ( "2015-29-02", @@ -2734,8 +2736,7 @@ def test_day_not_in_month_raise(self, cache): ( "2015-02-32", "%Y-%m-%d", - '^time data "2015-02-32" doesn\'t match format "%Y-%m-%d", ' - "at position 0$", + '^unconverted data remains: "2", at position 0$', ), ( "2015-32-02", @@ -2746,8 +2747,7 @@ def test_day_not_in_month_raise(self, cache): ( "2015-04-31", "%Y-%m-%d", - '^time data "2015-04-31" doesn\'t match format "%Y-%m-%d", ' - "at position 0$", + "^day is out of range for month, at position 0$", ), ( "2015-31-04",