diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 6f005f912fe37..a112c632ceb25 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -204,6 +204,7 @@ If you use dates which start with the day first (i.e. European style), you can pass the ``dayfirst`` flag: .. ipython:: python + :okwarning: pd.to_datetime(["04-01-2012 10:00"], dayfirst=True) @@ -211,9 +212,10 @@ you can pass the ``dayfirst`` flag: .. warning:: - You see in the above example that ``dayfirst`` isn't strict, so if a date + You see in the above example that ``dayfirst`` isn't strict. If a date can't be parsed with the day being first it will be parsed as if - ``dayfirst`` were False. + ``dayfirst`` were False, and in the case of parsing delimited date strings + (e.g. ``31-12-2012``) then a warning will also be raised. If you pass a single string to ``to_datetime``, it returns a single ``Timestamp``. ``Timestamp`` can also accept string input, but it doesn't accept string parsing diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 205a49e7786a7..fc488504f1fdf 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -103,10 +103,20 @@ Notable bug fixes These are bug fixes that might have notable behavior changes. -.. _whatsnew_140.notable_bug_fixes.notable_bug_fix1: +.. _whatsnew_140.notable_bug_fixes.inconsistent_date_string_parsing: -notable_bug_fix1 -^^^^^^^^^^^^^^^^ +Inconsistent date string parsing +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``dayfirst`` option of :func:`to_datetime` isn't strict, and this can lead to surprising behaviour: + +.. ipython:: python + :okwarning: + + pd.to_datetime(["31-12-2021"], dayfirst=False) + +Now, a warning will be raised if a date string cannot be parsed in accordance with the given ``dayfirst`` value when +the value is a delimited date string (e.g. ``31-12-2012``). .. 
_whatsnew_140.notable_bug_fixes.notable_bug_fix2: @@ -253,6 +263,7 @@ Categorical Datetimelike ^^^^^^^^^^^^ - Bug in :class:`DataFrame` constructor unnecessarily copying non-datetimelike 2D object arrays (:issue:`39272`) +- :func:`to_datetime` would silently swap ``MM/DD/YYYY`` and ``DD/MM/YYYY`` formats if the given ``dayfirst`` option could not be respected - now, a warning is raised in the case of delimited date strings (e.g. ``31-12-2012``) (:issue:`12585`) - Timedelta diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 212e40b30848a..cfa16df367bce 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -3,6 +3,7 @@ Parsing functions for datetime and datetime-like strings. """ import re import time +import warnings from libc.string cimport strchr @@ -81,6 +82,11 @@ class DateParseError(ValueError): _DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0, second=0, microsecond=0) +PARSING_WARNING_MSG = ( + "Parsing '{date_string}' in {format} format. Provide format " + "or specify infer_datetime_format=True for consistent parsing." 
+) + cdef: set _not_datelike_strings = {'a', 'A', 'm', 'M', 'p', 'P', 't', 'T'} @@ -168,10 +174,28 @@ cdef inline object _parse_delimited_date(str date_string, bint dayfirst): # date_string can't be converted to date, above format return None, None + swapped_day_and_month = False if 1 <= month <= MAX_DAYS_IN_MONTH and 1 <= day <= MAX_DAYS_IN_MONTH \ and (month <= MAX_MONTH or day <= MAX_MONTH): if (month > MAX_MONTH or (day <= MAX_MONTH and dayfirst)) and can_swap: day, month = month, day + swapped_day_and_month = True + if dayfirst and not swapped_day_and_month: + warnings.warn( + PARSING_WARNING_MSG.format( + date_string=date_string, + format='MM/DD/YYYY' + ), + stacklevel=4, + ) + elif not dayfirst and swapped_day_and_month: + warnings.warn( + PARSING_WARNING_MSG.format( + date_string=date_string, + format='DD/MM/YYYY' + ), + stacklevel=4, + ) if PY_VERSION_HEX >= 0x03060100: # In Python <= 3.6.0 there is no range checking for invalid dates # in C api, thus we call faster C version for 3.6.1 or newer diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 8eac5f76fd455..3005dd958ab49 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -701,8 +701,14 @@ def to_datetime( Specify a date parse order if `arg` is str or its list-likes. If True, parses dates with the day first, eg 10/11/12 is parsed as 2012-11-10. - Warning: dayfirst=True is not strict, but will prefer to parse - with day first (this is a known bug, based on dateutil behavior). + + .. warning:: + + dayfirst=True is not strict, but will prefer to parse + with day first. If a delimited date string cannot be parsed in + accordance with the given `dayfirst` option, e.g. + ``to_datetime(['31-12-2021'])``, then a warning will be shown. + yearfirst : bool, default False Specify a date parse order if `arg` is str or its list-likes. @@ -711,8 +717,11 @@ def to_datetime( - If both dayfirst and yearfirst are True, yearfirst is preceded (same as dateutil). 
- Warning: yearfirst=True is not strict, but will prefer to parse - with year first (this is a known bug, based on dateutil behavior). + .. warning:: + + yearfirst=True is not strict, but will prefer to parse + with year first. + utc : bool, default None Return UTC DatetimeIndex if True (converting any tz-aware datetime.datetime objects as well). diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index c7b5efa5bf0c9..41f0b661611a6 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -8,6 +8,7 @@ datetime, ) from io import StringIO +import warnings from dateutil.parser import parse as du_parse from hypothesis import ( @@ -39,6 +40,7 @@ from pandas.core.indexes.datetimes import date_range import pandas.io.date_converters as conv +from pandas.io.parsers import read_csv # constant _DEFAULT_DATETIME = datetime(1, 1, 1) @@ -1556,16 +1558,16 @@ def test_invalid_parse_delimited_date(all_parsers, date_string): "date_string,dayfirst,expected", [ # %d/%m/%Y; month > 12 thus replacement - ("13/02/2019", False, datetime(2019, 2, 13)), ("13/02/2019", True, datetime(2019, 2, 13)), # %m/%d/%Y; day > 12 thus there will be no replacement ("02/13/2019", False, datetime(2019, 2, 13)), - ("02/13/2019", True, datetime(2019, 2, 13)), # %d/%m/%Y; dayfirst==True thus replacement ("04/02/2019", True, datetime(2019, 2, 4)), ], ) -def test_parse_delimited_date_swap(all_parsers, date_string, dayfirst, expected): +def test_parse_delimited_date_swap_no_warning( + all_parsers, date_string, dayfirst, expected +): parser = all_parsers expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") result = parser.read_csv( @@ -1574,6 +1576,30 @@ def test_parse_delimited_date_swap(all_parsers, date_string, dayfirst, expected) tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "date_string,dayfirst,expected", + [ + # %d/%m/%Y; month > 12 thus replacement + ("13/02/2019", False, 
datetime(2019, 2, 13)), + # %m/%d/%Y; day > 12 thus there will be no replacement + ("02/13/2019", True, datetime(2019, 2, 13)), + ], +) +def test_parse_delimited_date_swap_with_warning( + all_parsers, date_string, dayfirst, expected +): + parser = all_parsers + expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") + warning_msg = ( + "Provide format or specify infer_datetime_format=True for consistent parsing" + ) + with tm.assert_produces_warning(UserWarning, match=warning_msg): + result = parser.read_csv( + StringIO(date_string), header=None, dayfirst=dayfirst, parse_dates=[0] + ) + tm.assert_frame_equal(result, expected) + + def _helper_hypothesis_delimited_date(call, date_string, **kwargs): msg, result = None, None try: @@ -1602,9 +1628,11 @@ def test_hypothesis_delimited_date(date_format, dayfirst, delimiter, test_dateti except_in_dateutil, except_out_dateutil = None, None date_string = test_datetime.strftime(date_format.replace(" ", delimiter)) - except_out_dateutil, result = _helper_hypothesis_delimited_date( - parse_datetime_string, date_string, dayfirst=dayfirst - ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=UserWarning) + except_out_dateutil, result = _helper_hypothesis_delimited_date( + parse_datetime_string, date_string, dayfirst=dayfirst + ) except_in_dateutil, expected = _helper_hypothesis_delimited_date( du_parse, date_string, @@ -1674,3 +1702,95 @@ def test_date_parser_usecols_thousands(all_parsers): ) expected = DataFrame({"B": [3, 4], "C": [Timestamp("20-09-2001 01:00:00")] * 2}) tm.assert_frame_equal(result, expected) + + +def test_dayfirst_warnings(): + # GH 12585 + warning_msg_day_first = ( + "Parsing '31/12/2014' in DD/MM/YYYY format. Provide " + "format or specify infer_datetime_format=True for consistent parsing." + ) + warning_msg_month_first = ( + "Parsing '03/30/2011' in MM/DD/YYYY format. Provide " + "format or specify infer_datetime_format=True for consistent parsing." 
+ ) + + # CASE 1: valid input + input = "date\n31/12/2014\n10/03/2011" + expected_consistent = DatetimeIndex( + ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None, name="date" + ) + expected_inconsistent = DatetimeIndex( + ["2014-12-31", "2011-10-03"], dtype="datetime64[ns]", freq=None, name="date" + ) + + # A. dayfirst arg correct, no warning + res1 = read_csv( + StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date" + ).index + tm.assert_index_equal(expected_consistent, res1) + + # B. dayfirst arg incorrect, warning + incorrect output + with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): + res2 = read_csv( + StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" + ).index + tm.assert_index_equal(expected_inconsistent, res2) + + # C. dayfirst=False given explicitly (default arg not exercised), same as B + with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): + res3 = read_csv( + StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" + ).index + tm.assert_index_equal(expected_inconsistent, res3) + + # D. infer_datetime_format=True overrides dayfirst default + # no warning + correct result + res4 = read_csv( + StringIO(input), + parse_dates=["date"], + infer_datetime_format=True, + index_col="date", + ).index + tm.assert_index_equal(expected_consistent, res4) + + # CASE 2: invalid input + # cannot consistently process with single format + # warnings *always* raised + + # first in DD/MM/YYYY, second in MM/DD/YYYY + input = "date\n31/12/2014\n03/30/2011" + expected = DatetimeIndex( + ["2014-12-31", "2011-03-30"], dtype="datetime64[ns]", freq=None, name="date" + ) + + # A. use dayfirst=True + with tm.assert_produces_warning(UserWarning, match=warning_msg_month_first): + res5 = read_csv( + StringIO(input), parse_dates=["date"], dayfirst=True, index_col="date" + ).index + tm.assert_index_equal(expected, res5) + + # B. 
 use dayfirst=False + with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): + res6 = read_csv( + StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" + ).index + tm.assert_index_equal(expected, res6) + + # C. use dayfirst=False explicitly (default arg not exercised), same as B + with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): + res7 = read_csv( + StringIO(input), parse_dates=["date"], dayfirst=False, index_col="date" + ).index + tm.assert_index_equal(expected, res7) + + # D. use infer_datetime_format=True + with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): + res8 = read_csv( + StringIO(input), + parse_dates=["date"], + infer_datetime_format=True, + index_col="date", + ).index + tm.assert_index_equal(expected, res8) diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 3cc81ef851306..7e6c2a452f1a0 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -572,7 +572,7 @@ def test_to_timestamp_tz_arg(self, tzstr): with tm.assert_produces_warning(FutureWarning): p = Period("1/1/2005", freq="A").to_timestamp(freq="A", tz=tzstr) - exp = Timestamp("31/12/2005", tz="UTC").tz_convert(tzstr) + exp = Timestamp(day=31, month=12, year=2005, tz="UTC").tz_convert(tzstr) exp_zone = pytz.timezone(tzstr).normalize(p) assert p == exp diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 7351f50aea8c1..469a5caf7d694 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1839,6 +1839,75 @@ def test_dayfirst(self, cache): tm.assert_index_equal(expected, idx5) tm.assert_index_equal(expected, idx6) + def test_dayfirst_warnings(self): + # GH 12585 + warning_msg_day_first = ( + "Parsing '31/12/2014' in DD/MM/YYYY format. Provide " + "format or specify infer_datetime_format=True for consistent parsing." 
+ ) + warning_msg_month_first = ( + "Parsing '03/30/2011' in MM/DD/YYYY format. Provide " + "format or specify infer_datetime_format=True for consistent parsing." + ) + + # CASE 1: valid input + arr = ["31/12/2014", "10/03/2011"] + expected_consistent = DatetimeIndex( + ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None + ) + expected_inconsistent = DatetimeIndex( + ["2014-12-31", "2011-10-03"], dtype="datetime64[ns]", freq=None + ) + + # A. dayfirst arg correct, no warning + res1 = to_datetime(arr, dayfirst=True) + tm.assert_index_equal(expected_consistent, res1) + + # B. dayfirst arg incorrect, warning + incorrect output + with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): + res2 = to_datetime(arr, dayfirst=False) + tm.assert_index_equal(expected_inconsistent, res2) + + # C. dayfirst=False given explicitly (default arg not exercised), same as B + with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): + res3 = to_datetime(arr, dayfirst=False) + tm.assert_index_equal(expected_inconsistent, res3) + + # D. infer_datetime_format=True overrides dayfirst default + # no warning + correct result + res4 = to_datetime(arr, infer_datetime_format=True) + tm.assert_index_equal(expected_consistent, res4) + + # CASE 2: invalid input + # cannot consistently process with single format + # warnings *always* raised + + arr = ["31/12/2014", "03/30/2011"] + # first in DD/MM/YYYY, second in MM/DD/YYYY + expected = DatetimeIndex( + ["2014-12-31", "2011-03-30"], dtype="datetime64[ns]", freq=None + ) + + # A. use dayfirst=True + with tm.assert_produces_warning(UserWarning, match=warning_msg_month_first): + res5 = to_datetime(arr, dayfirst=True) + tm.assert_index_equal(expected, res5) + + # B. use dayfirst=False + with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): + res6 = to_datetime(arr, dayfirst=False) + tm.assert_index_equal(expected, res6) + + # C. 
 use dayfirst=False explicitly (default arg not exercised), same as B + with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): + res7 = to_datetime(arr, dayfirst=False) + tm.assert_index_equal(expected, res7) + + # D. use infer_datetime_format=True + with tm.assert_produces_warning(UserWarning, match=warning_msg_day_first): + res8 = to_datetime(arr, infer_datetime_format=True) + tm.assert_index_equal(expected, res8) + @pytest.mark.parametrize("klass", [DatetimeIndex, DatetimeArray]) def test_to_datetime_dta_tz(self, klass): # GH#27733