From 22127240efcaf0dcd5040450a6ffe60e23fea70e Mon Sep 17 00:00:00 2001 From: Vasilij N Litvinov Date: Tue, 5 Feb 2019 20:38:42 +0300 Subject: [PATCH 01/14] Implement no-raising version of parse_iso_8601_datetime --- pandas/_libs/tslibs/np_datetime.pxd | 3 + pandas/_libs/tslibs/np_datetime.pyx | 60 +++++++++++++- .../tslibs/src/datetime/np_datetime_strings.c | 78 +++++++++++++------ .../tslibs/src/datetime/np_datetime_strings.h | 8 +- 4 files changed, 123 insertions(+), 26 deletions(-) diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index 803c8cb18e3d5..fe89a4e2818e0 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -74,3 +74,6 @@ cdef NPY_DATETIMEUNIT get_datetime64_unit(object obj) nogil cdef int _string_to_dts(object val, npy_datetimestruct* dts, int* out_local, int* out_tzoffset) except? -1 +# see np_datetime.pyx for reasons of second _noexc version +cdef int _string_to_dts_noexc(object val, pandas_datetimestruct* dts, + int* out_local, int* out_tzoffset) except? -2 diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 016206b0b69f0..4df2014abf4df 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -34,6 +34,9 @@ cdef extern from "src/datetime/np_datetime_strings.h": int parse_iso_8601_datetime(const char *str, int len, npy_datetimestruct *out, int *out_local, int *out_tzoffset) + int parse_iso_8601_datetime_noexc(char *str, int len, + pandas_datetimestruct *out, + int *out_local, int *out_tzoffset) # ---------------------------------------------------------------------- @@ -175,6 +178,57 @@ cdef inline int _string_to_dts(object val, npy_datetimestruct* dts, Py_ssize_t length const char* buf - buf = get_c_string_buf_and_size(val, &length) - return parse_iso_8601_datetime(buf, length, - dts, out_local, out_tzoffset) + if isinstance(val, unicode): + val = PyUnicode_AsASCIIString(val) + + tmp = val + result = _cstring_to_dts(tmp, len(val), dts, out_local, out_tzoffset) + + if result == -1: + raise ValueError('Unable to parse %s' % str(val)) + return result + + +cdef inline int _cstring_to_dts(char *val, int length, + npy_datetimestruct* dts, + int* out_local, int* out_tzoffset) except? -1: + # Note: without this "extra layer" between _string_to_dts + # and parse_iso_8601_datetime, calling _string_to_dts raises + # `SystemError: returned a result with an error set` + # in Python3 + cdef: + int result + + result = parse_iso_8601_datetime(val, length, + dts, out_local, out_tzoffset) + return result + +# Slightly faster version that doesn't raise a ValueError +# if a date cannot be parsed, but it does raise ValueError if +# "val" supplied is not a valid ASCII string. +# Caller must check that return value == 0 to determine if parsing succeeded. +# NOTE: to stop exception propagation when date parsing failed +# this function is marked to cause an exception on a return value +# that can never happen in a real life. +cdef inline int _string_to_dts_noexc(object val, pandas_datetimestruct* dts, + int* out_local, int* out_tzoffset) except -2: + cdef: + int result + char *tmp + + if PyUnicode_Check(val): + val = PyUnicode_AsASCIIString(val) + + tmp = val + result = _cstring_to_dts_noexc(tmp, len(val), dts, out_local, out_tzoffset) + return result + +cdef inline int _cstring_to_dts_noexc(char *val, int length, + pandas_datetimestruct* dts, + int* out_local, int* out_tzoffset): + cdef: + int result + + result = parse_iso_8601_datetime_noexc(val, length, + dts, out_local, out_tzoffset) + return result diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index abeeaba1d1198..c8571bba1bdb4 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -66,9 +66,27 @@ This file implements string parsing and creation for NumPy datetime. * * Returns 0 on success, -1 on failure. */ -int parse_iso_8601_datetime(const char *str, int len, +static int __parse_iso_8601_datetime(char *str, int len, int want_exc, + pandas_datetimestruct *out, + int *out_local, int *out_tzoffset); + +int parse_iso_8601_datetime(char *str, int len, npy_datetimestruct *out, int *out_local, int *out_tzoffset) { + return __parse_iso_8601_datetime(str, len, 1, out, out_local, out_tzoffset); +} + +// slightly faster version of parse_iso_8601_datetime which +// doesn't set Python exceptions but still returns -1 on error +int parse_iso_8601_datetime_noexc(char *str, int len, + pandas_datetimestruct *out, + int *out_local, int *out_tzoffset) { + return __parse_iso_8601_datetime(str, len, 0, out, out_local, out_tzoffset); +} + +static int __parse_iso_8601_datetime(char *str, int len, int want_exc, + pandas_datetimestruct *out, + int *out_local, int *out_tzoffset) { int year_leap = 0; int i, numdigits; const char *substr; @@ -173,8 +191,10 @@ int parse_iso_8601_datetime(const char *str, int len, goto parse_error; } if (out->month < 1 || out->month > 12) { - PyErr_Format(PyExc_ValueError, - "Month out of range in datetime string \"%s\"", str); + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Month out of range in datetime string \"%s\"", str); + } goto error; } @@ -217,8 +237,10 @@ int parse_iso_8601_datetime(const char *str, int len, } if (out->day < 1 || out->day > days_per_month_table[year_leap][out->month - 1]) { - PyErr_Format(PyExc_ValueError, - "Day out of range in datetime string \"%s\"", str); + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Day out of range in datetime string \"%s\"", str); + } goto error; } @@ -251,8 +273,10 @@ int parse_iso_8601_datetime(const char *str, int len, ++substr; --sublen; if (out->hour >= 24) { - PyErr_Format(PyExc_ValueError, - "Hours out of range in datetime string \"%s\"", str); + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Hours out of range in datetime string \"%s\"", str); + } goto error; } } @@ -291,8 +315,10 @@ int parse_iso_8601_datetime(const char *str, int len, ++substr; --sublen; if (out->min >= 60) { - PyErr_Format(PyExc_ValueError, - "Minutes out of range in datetime string \"%s\"", str); + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Minutes out of range in datetime string \"%s\"", str); + } goto error; } } else if (!has_hms_sep) { @@ -328,8 +354,10 @@ int parse_iso_8601_datetime(const char *str, int len, ++substr; --sublen; if (out->sec >= 60) { - PyErr_Format(PyExc_ValueError, - "Seconds out of range in datetime string \"%s\"", str); + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Seconds out of range in datetime string \"%s\"", str); + } goto error; } } else if (!has_hms_sep) { @@ -438,10 +466,12 @@ int parse_iso_8601_datetime(const char *str, int len, substr += 2; sublen -= 2; if (offset_hour >= 24) { - PyErr_Format(PyExc_ValueError, - "Timezone hours offset out of range " - "in datetime string \"%s\"", - str); + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Timezone hours offset out of range " + "in datetime string \"%s\"", + str); + } goto error; } } else if (sublen >= 1 && isdigit(substr[0])) { @@ -466,10 +496,12 @@ int parse_iso_8601_datetime(const char *str, int len, substr += 2; sublen -= 2; if (offset_minute >= 60) { - PyErr_Format(PyExc_ValueError, - "Timezone minutes offset out of range " - "in datetime string \"%s\"", - str); + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Timezone minutes offset out of range " + "in datetime string \"%s\"", + str); + } goto error; } } else if (sublen >= 1 && isdigit(substr[0])) { @@ -507,9 +539,11 @@ int parse_iso_8601_datetime(const char *str, int len, return 0; parse_error: - PyErr_Format(PyExc_ValueError, - "Error parsing datetime string \"%s\" at position %d", str, - (int)(substr - str)); + if (want_exc) { + PyErr_Format(PyExc_ValueError, + "Error parsing datetime string \"%s\" at position %d", str, + (int)(substr - str)); + } return -1; error: diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h index 86ebe890810d6..29404e0a3a117 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h @@ -58,6 +58,13 @@ parse_iso_8601_datetime(const char *str, int len, npy_datetimestruct *out, int *out_local, int *out_tzoffset); +// slightly faster version of parse_iso_8601_datetime which +// doesn't set Python exceptions but still returns -1 on error +int +parse_iso_8601_datetime_noexc(char *str, int len, + pandas_datetimestruct *out, + int *out_local, + int *out_tzoffset); /* * Provides a string length to use for converting datetime @@ -79,5 +86,4 @@ get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base); int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, NPY_DATETIMEUNIT base); - #endif // PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_ From 6ca0156399b7f06321f51d964c58db19c0f96fb8 Mon Sep 17 00:00:00 2001 From: Vasilij N Litvinov Date: Tue, 5 Feb 2019 20:39:28 +0300 Subject: [PATCH 02/14] Use no-raising version of parse_iso_8601_datetime in array_to_datetime --- pandas/_libs/tslib.pyx | 100 ++++++++++-------- pandas/_libs/tslibs/np_datetime.pxd | 4 +- pandas/_libs/tslibs/np_datetime.pyx | 66 +++--------- .../tslibs/src/datetime/np_datetime_strings.c | 14 +-- .../tslibs/src/datetime/np_datetime_strings.h | 8 +- 5 files changed, 84 insertions(+), 108 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 50e3fb1c38cc7..c7a4e19adb7af 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -19,7 +19,7 @@ from pandas._libs.util cimport ( from pandas._libs.tslibs.c_timestamp cimport _Timestamp -from pandas._libs.tslibs.np_datetime cimport ( +from pandas._libs.tslibs.np_datetime cimport (_string_to_dts_noexc, check_dts_bounds, npy_datetimestruct, _string_to_dts, dt64_to_dtstruct, dtstruct_to_dt64, pydatetime_to_dt64, pydate_to_dt64, get_datetime64_value) from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime @@ -511,6 +511,7 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', int out_local=0, out_tzoffset=0 float offset_seconds, tz_offset set out_tzoffset_vals = set() + bint string_to_dts_failed # specify error conditions assert is_raise or is_ignore or is_coerce @@ -579,58 +580,65 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', continue try: - _string_to_dts(val, &dts, &out_local, &out_tzoffset) - except ValueError: - # A ValueError at this point is a _parsing_ error - # specifically _not_ OutOfBoundsDatetime - if _parse_today_now(val, &iresult[i]): - continue - elif require_iso8601: - # if requiring iso8601 strings, skip trying - # other formats - if is_coerce: - iresult[i] = NPY_NAT + string_to_dts_failed = _string_to_dts_noexc( + val, &dts, &out_local, + &out_tzoffset + ) != 0 + if string_to_dts_failed: + # An error at this point is a _parsing_ error + # specifically _not_ OutOfBoundsDatetime + if _parse_today_now(val, &iresult[i]): continue - elif is_raise: - raise ValueError("time data {val} doesn't " - "match format specified" - .format(val=val)) - return values, tz_out - - try: - py_dt = parse_datetime_string(val, - dayfirst=dayfirst, - yearfirst=yearfirst) - except Exception: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise TypeError("invalid string coercion to " - "datetime") - - # If the dateutil parser returned tzinfo, capture it - # to check if all arguments have the same tzinfo - tz = py_dt.utcoffset() - if tz is not None: - seen_datetime_offset = 1 - # dateutil timezone objects cannot be hashed, so - # store the UTC offsets in seconds instead - out_tzoffset_vals.add(tz.total_seconds()) - else: - # Add a marker for naive string, to track if we are - # parsing mixed naive and aware strings - out_tzoffset_vals.add('naive') - - _ts = convert_datetime_to_tsobject(py_dt, None) - iresult[i] = _ts.value + elif require_iso8601: + # if requiring iso8601 strings, skip trying + # other formats + if is_coerce: + iresult[i] = NPY_NAT + continue + elif is_raise: + raise ValueError("time data {val} doesn't " + "match format specified" + .format(val=val)) + return values, tz_out + + try: + py_dt = parse_datetime_string( + val, + dayfirst=dayfirst, + yearfirst=yearfirst + ) + except Exception: + if is_coerce: + iresult[i] = NPY_NAT + continue + raise TypeError("invalid string coercion to " + "datetime") + + # If the dateutil parser returned tzinfo, + # capture it to check if all arguments + # have the same tzinfo + tz = py_dt.utcoffset() + if tz is not None: + seen_datetime_offset = 1 + # dateutil timezone objects cannot be hashed, + # so store the UTC offsets in seconds instead + out_tzoffset_vals.add(tz.total_seconds()) + else: + # Add a marker for naive string, + # to track if we are + # parsing mixed naive and aware strings + out_tzoffset_vals.add('naive') + + _ts = convert_datetime_to_tsobject(py_dt, None) + iresult[i] = _ts.value except: # TODO: What exception are we concerned with here? if is_coerce: iresult[i] = NPY_NAT continue raise - else: - # No error raised by string_to_dts, pick back up + if not string_to_dts_failed: + # No error reported by string_to_dts, pick back up # where we left off value = dtstruct_to_dt64(&dts) if out_local == 1: diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index fe89a4e2818e0..1ea06c3bf299b 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -75,5 +75,5 @@ cdef NPY_DATETIMEUNIT get_datetime64_unit(object obj) nogil cdef int _string_to_dts(object val, npy_datetimestruct* dts, int* out_local, int* out_tzoffset) except? -1 # see np_datetime.pyx for reasons of second _noexc version -cdef int _string_to_dts_noexc(object val, pandas_datetimestruct* dts, - int* out_local, int* out_tzoffset) except? -2 +cdef int _string_to_dts_noexc(object val, npy_datetimestruct* dts, + int* out_local, int* out_tzoffset) diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 4df2014abf4df..e34d0ea1f65ed 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -1,4 +1,4 @@ -from cpython cimport Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE +from cpython cimport Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE, PyErr_Clear from cpython.datetime cimport (datetime, date, PyDateTime_IMPORT, @@ -34,8 +34,8 @@ cdef extern from "src/datetime/np_datetime_strings.h": int parse_iso_8601_datetime(const char *str, int len, npy_datetimestruct *out, int *out_local, int *out_tzoffset) - int parse_iso_8601_datetime_noexc(char *str, int len, - pandas_datetimestruct *out, + int parse_iso_8601_datetime_noexc(const char *str, int len, + npy_datetimestruct *out, int *out_local, int *out_tzoffset) @@ -178,57 +178,25 @@ cdef inline int _string_to_dts(object val, npy_datetimestruct* dts, Py_ssize_t length const char* buf - if isinstance(val, unicode): - val = PyUnicode_AsASCIIString(val) + buf = get_c_string_buf_and_size(val, &length) + return parse_iso_8601_datetime(buf, length, + dts, out_local, out_tzoffset) - tmp = val - result = _cstring_to_dts(tmp, len(val), dts, out_local, out_tzoffset) - - if result == -1: - raise ValueError('Unable to parse %s' % str(val)) - return result - - -cdef inline int _cstring_to_dts(char *val, int length, - npy_datetimestruct* dts, - int* out_local, int* out_tzoffset) except? -1: - # Note: without this "extra layer" between _string_to_dts - # and parse_iso_8601_datetime, calling _string_to_dts raises - # `SystemError: returned a result with an error set` - # in Python3 - cdef: - int result - - result = parse_iso_8601_datetime(val, length, - dts, out_local, out_tzoffset) - return result # Slightly faster version that doesn't raise a ValueError -# if a date cannot be parsed, but it does raise ValueError if -# "val" supplied is not a valid ASCII string. +# if a date cannot be parsed, it reports various errors via return result. # Caller must check that return value == 0 to determine if parsing succeeded. -# NOTE: to stop exception propagation when date parsing failed -# this function is marked to cause an exception on a return value -# that can never happen in a real life. -cdef inline int _string_to_dts_noexc(object val, pandas_datetimestruct* dts, - int* out_local, int* out_tzoffset) except -2: +cdef inline int _string_to_dts_noexc(object val, npy_datetimestruct* dts, + int* out_local, int* out_tzoffset): cdef: - int result - char *tmp - - if PyUnicode_Check(val): - val = PyUnicode_AsASCIIString(val) - - tmp = val - result = _cstring_to_dts_noexc(tmp, len(val), dts, out_local, out_tzoffset) - return result + Py_ssize_t length + const char* buf -cdef inline int _cstring_to_dts_noexc(char *val, int length, - pandas_datetimestruct* dts, - int* out_local, int* out_tzoffset): - cdef: - int result + buf = get_c_string_buf_and_size(val, &length) + if buf == NULL: + PyErr_Clear() + return -1 - result = parse_iso_8601_datetime_noexc(val, length, - dts, out_local, out_tzoffset) + result = parse_iso_8601_datetime_noexc(buf, length, + dts, out_local, out_tzoffset); return result diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index c8571bba1bdb4..6a1539ed8b039 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -66,11 +66,11 @@ This file implements string parsing and creation for NumPy datetime. * * Returns 0 on success, -1 on failure. */ -static int __parse_iso_8601_datetime(char *str, int len, int want_exc, - pandas_datetimestruct *out, +static int __parse_iso_8601_datetime(const char *str, int len, int want_exc, + npy_datetimestruct *out, int *out_local, int *out_tzoffset); -int parse_iso_8601_datetime(char *str, int len, +int parse_iso_8601_datetime(const char *str, int len, npy_datetimestruct *out, int *out_local, int *out_tzoffset) { return __parse_iso_8601_datetime(str, len, 1, out, out_local, out_tzoffset); @@ -78,14 +78,14 @@ int parse_iso_8601_datetime(char *str, int len, // slightly faster version of parse_iso_8601_datetime which // doesn't set Python exceptions but still returns -1 on error -int parse_iso_8601_datetime_noexc(char *str, int len, - pandas_datetimestruct *out, +int parse_iso_8601_datetime_noexc(const char *str, int len, + npy_datetimestruct *out, int *out_local, int *out_tzoffset) { return __parse_iso_8601_datetime(str, len, 0, out, out_local, out_tzoffset); } -static int __parse_iso_8601_datetime(char *str, int len, int want_exc, - pandas_datetimestruct *out, +static int __parse_iso_8601_datetime(const char *str, int len, int want_exc, + npy_datetimestruct *out, int *out_local, int *out_tzoffset) { int year_leap = 0; int i, numdigits; diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h index 29404e0a3a117..4de5d0ff9833c 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h @@ -61,10 +61,10 @@ parse_iso_8601_datetime(const char *str, int len, // slightly faster version of parse_iso_8601_datetime which // doesn't set Python exceptions but still returns -1 on error int -parse_iso_8601_datetime_noexc(char *str, int len, - pandas_datetimestruct *out, - int *out_local, - int *out_tzoffset); +parse_iso_8601_datetime_noexc(const char *str, int len, + npy_datetimestruct *out, + int *out_local, + int *out_tzoffset); /* * Provides a string length to use for converting datetime From e60a9330d16abb854137d853c32a636b44eaa057 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 26 Apr 2019 23:29:17 +0300 Subject: [PATCH 03/14] fixed linter errors --- pandas/_libs/tslib.pyx | 7 ++++--- pandas/_libs/tslibs/src/datetime/np_datetime_strings.c | 9 ++++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index c7a4e19adb7af..82be8db8cdcfb 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -19,9 +19,10 @@ from pandas._libs.util cimport ( from pandas._libs.tslibs.c_timestamp cimport _Timestamp -from pandas._libs.tslibs.np_datetime cimport (_string_to_dts_noexc, - check_dts_bounds, npy_datetimestruct, _string_to_dts, dt64_to_dtstruct, - dtstruct_to_dt64, pydatetime_to_dt64, pydate_to_dt64, get_datetime64_value) +from pandas._libs.tslibs.np_datetime cimport ( + _string_to_dts_noexc, check_dts_bounds, npy_datetimestruct, _string_to_dts, + dt64_to_dtstruct, dtstruct_to_dt64, pydatetime_to_dt64, pydate_to_dt64, + get_datetime64_value) from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.parsing import parse_datetime_string diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index 6a1539ed8b039..1aa84b63c6bad 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -275,7 +275,8 @@ static int __parse_iso_8601_datetime(const char *str, int len, int want_exc, if (out->hour >= 24) { if (want_exc) { PyErr_Format(PyExc_ValueError, - "Hours out of range in datetime string \"%s\"", str); + "Hours out of range in datetime string \"%s\"", + str); } goto error; } @@ -317,7 +318,8 @@ static int __parse_iso_8601_datetime(const char *str, int len, int want_exc, if (out->min >= 60) { if (want_exc) { PyErr_Format(PyExc_ValueError, - "Minutes out of range in datetime string \"%s\"", str); + "Minutes out of range in datetime string \"%s\"", + str); } goto error; } @@ -356,7 +358,8 @@ static int __parse_iso_8601_datetime(const char *str, int len, int want_exc, if (out->sec >= 60) { if (want_exc) { PyErr_Format(PyExc_ValueError, - "Seconds out of range in datetime string \"%s\"", str); + "Seconds out of range in datetime string \"%s\"", + str); } goto error; } From a22b52a6fb5be7b090d32007125d01ff9dbf1a48 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sat, 27 Apr 2019 07:15:45 +0300 Subject: [PATCH 04/14] removed comparison to 0 --- pandas/_libs/tslib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 82be8db8cdcfb..64f4f48a54811 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -584,7 +584,7 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', string_to_dts_failed = _string_to_dts_noexc( val, &dts, &out_local, &out_tzoffset - ) != 0 + ) if string_to_dts_failed: # An error at this point is a _parsing_ error # specifically _not_ OutOfBoundsDatetime From 9d3b739839d78de297d6b21d33a712dc9cdf8bb8 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sat, 27 Apr 2019 22:51:04 +0300 Subject: [PATCH 05/14] moved '_string_to_dts_noexc' from 'try' block --- pandas/_libs/tslib.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 64f4f48a54811..2b1303284922a 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -580,11 +580,11 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', iresult[i] = NPY_NAT continue + string_to_dts_failed = _string_to_dts_noexc( + val, &dts, &out_local, + &out_tzoffset + ) try: - string_to_dts_failed = _string_to_dts_noexc( - val, &dts, &out_local, - &out_tzoffset - ) if string_to_dts_failed: # An error at this point is a _parsing_ error # specifically _not_ OutOfBoundsDatetime From 10636caba2b6c66ed25e677b72acc9744d251445 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Sun, 28 Apr 2019 21:13:22 +0300 Subject: [PATCH 06/14] removed string-to-dts-noexc and parse_iso_8601_datetime_noexc --- pandas/_libs/tslib.pyx | 7 +- pandas/_libs/tslibs/conversion.pyx | 82 +++++++++++-------- pandas/_libs/tslibs/np_datetime.pxd | 5 +- pandas/_libs/tslibs/np_datetime.pyx | 30 +------ .../tslibs/src/datetime/np_datetime_strings.c | 20 +---- .../tslibs/src/datetime/np_datetime_strings.h | 9 +- pandas/tests/tslibs/test_parse_iso8601.py | 9 +- 7 files changed, 59 insertions(+), 103 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 2b1303284922a..e716dd7de2832 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -20,7 +20,7 @@ from pandas._libs.util cimport ( from pandas._libs.tslibs.c_timestamp cimport _Timestamp from pandas._libs.tslibs.np_datetime cimport ( - _string_to_dts_noexc, check_dts_bounds, npy_datetimestruct, _string_to_dts, + check_dts_bounds, npy_datetimestruct, _string_to_dts, dt64_to_dtstruct, dtstruct_to_dt64, pydatetime_to_dt64, pydate_to_dt64, get_datetime64_value) from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime @@ -205,7 +205,8 @@ def _test_parse_iso8601(object ts): elif ts == 'today': return Timestamp.now().normalize() - _string_to_dts(ts, &obj.dts, &out_local, &out_tzoffset) + if _string_to_dts(ts, &obj.dts, &out_local, &out_tzoffset) == -1: + raise ValueError obj.value = dtstruct_to_dt64(&obj.dts) check_dts_bounds(&obj.dts) if out_local == 1: @@ -580,7 +581,7 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', iresult[i] = NPY_NAT continue - string_to_dts_failed = _string_to_dts_noexc( + string_to_dts_failed = _string_to_dts( val, &dts, &out_local, &out_tzoffset ) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 17237a557443b..65782fed54504 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -443,47 +443,57 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, ts = datetime.now(tz) # equiv: datetime.today().replace(tzinfo=tz) else: - try: - _string_to_dts(ts, &obj.dts, &out_local, &out_tzoffset) - obj.value = dtstruct_to_dt64(&obj.dts) - check_dts_bounds(&obj.dts) - if out_local == 1: - obj.tzinfo = pytz.FixedOffset(out_tzoffset) - obj.value = tz_convert_single(obj.value, obj.tzinfo, UTC) - if tz is None: - check_dts_bounds(&obj.dts) - check_overflows(obj) - return obj - else: - # Keep the converter same as PyDateTime's - obj = convert_to_tsobject(obj.value, obj.tzinfo, - None, 0, 0) - dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day, - obj.dts.hour, obj.dts.min, obj.dts.sec, - obj.dts.us, obj.tzinfo) - obj = convert_datetime_to_tsobject( - dt, tz, nanos=obj.dts.ps // 1000) - return obj - - else: - ts = obj.value - if tz is not None: - # shift for localize_tso - ts = tz_localize_to_utc(np.array([ts], dtype='i8'), tz, - ambiguous='raise')[0] - - except OutOfBoundsDatetime: - # GH#19382 for just-barely-OutOfBounds falling back to dateutil - # parser will return incorrect result because it will ignore - # nanoseconds - raise - - except ValueError: + string_to_dts_failed = _string_to_dts( + ts, &obj.dts, &out_local, + &out_tzoffset + ) + if string_to_dts_failed: try: ts = parse_datetime_string(ts, dayfirst=dayfirst, yearfirst=yearfirst) except Exception: raise ValueError("could not convert string to Timestamp") + else: + try: + obj.value = dtstruct_to_dt64(&obj.dts) + check_dts_bounds(&obj.dts) + if out_local == 1: + obj.tzinfo = pytz.FixedOffset(out_tzoffset) + obj.value = tz_convert_single(obj.value, obj.tzinfo, UTC) + if tz is None: + check_dts_bounds(&obj.dts) + check_overflows(obj) + return obj + else: + # Keep the converter same as PyDateTime's + obj = convert_to_tsobject(obj.value, obj.tzinfo, + None, 0, 0) + dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day, + obj.dts.hour, obj.dts.min, obj.dts.sec, + obj.dts.us, obj.tzinfo) + obj = convert_datetime_to_tsobject( + dt, tz, nanos=obj.dts.ps // 1000) + return obj + + else: + ts = obj.value + if tz is not None: + # shift for localize_tso + ts = tz_localize_to_utc(np.array([ts], dtype='i8'), tz, + ambiguous='raise')[0] + + except OutOfBoundsDatetime: + # GH#19382 for just-barely-OutOfBounds falling back to dateutil + # parser will return incorrect result because it will ignore + # nanoseconds + raise + + except ValueError: + try: + ts = parse_datetime_string(ts, dayfirst=dayfirst, + yearfirst=yearfirst) + except Exception: + raise ValueError("could not convert string to Timestamp") return convert_to_tsobject(ts, tz, unit, dayfirst, yearfirst) diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index 1ea06c3bf299b..772a86e531ded 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -73,7 +73,4 @@ cdef npy_timedelta get_timedelta64_value(object obj) nogil cdef NPY_DATETIMEUNIT get_datetime64_unit(object obj) nogil cdef int _string_to_dts(object val, npy_datetimestruct* dts, - int* out_local, int* out_tzoffset) except? -1 -# see np_datetime.pyx for reasons of second _noexc version -cdef int _string_to_dts_noexc(object val, npy_datetimestruct* dts, - int* out_local, int* out_tzoffset) + int* out_local, int* out_tzoffset) diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index e34d0ea1f65ed..5ff1820f060f7 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -1,4 +1,4 @@ -from cpython cimport Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE, PyErr_Clear +from cpython cimport Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE from cpython.datetime cimport (datetime, date, PyDateTime_IMPORT, @@ -31,12 +31,9 @@ cdef extern from "src/datetime/np_datetime.h": npy_datetimestruct _NS_MIN_DTS, _NS_MAX_DTS cdef extern from "src/datetime/np_datetime_strings.h": - int parse_iso_8601_datetime(const char *str, int len, + int parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, int *out_local, int *out_tzoffset) - int parse_iso_8601_datetime_noexc(const char *str, int len, - npy_datetimestruct *out, - int *out_local, int *out_tzoffset) # ---------------------------------------------------------------------- @@ -173,30 +170,11 @@ cdef inline int64_t pydate_to_dt64(date val, npy_datetimestruct *dts): cdef inline int _string_to_dts(object val, npy_datetimestruct* dts, - int* out_local, int* out_tzoffset) except? -1: + int* out_local, int* out_tzoffset): cdef: Py_ssize_t length const char* buf buf = get_c_string_buf_and_size(val, &length) - return parse_iso_8601_datetime(buf, length, + return parse_iso_8601_datetime(buf, length, 0, dts, out_local, out_tzoffset) - - -# Slightly faster version that doesn't raise a ValueError -# if a date cannot be parsed, it reports various errors via return result. -# Caller must check that return value == 0 to determine if parsing succeeded. -cdef inline int _string_to_dts_noexc(object val, npy_datetimestruct* dts, - int* out_local, int* out_tzoffset): - cdef: - Py_ssize_t length - const char* buf - - buf = get_c_string_buf_and_size(val, &length) - if buf == NULL: - PyErr_Clear() - return -1 - - result = parse_iso_8601_datetime_noexc(buf, length, - dts, out_local, out_tzoffset); - return result diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c index 1aa84b63c6bad..54ed6ecff21e2 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.c @@ -66,25 +66,7 @@ This file implements string parsing and creation for NumPy datetime. * * Returns 0 on success, -1 on failure. */ -static int __parse_iso_8601_datetime(const char *str, int len, int want_exc, - npy_datetimestruct *out, - int *out_local, int *out_tzoffset); - -int parse_iso_8601_datetime(const char *str, int len, - npy_datetimestruct *out, - int *out_local, int *out_tzoffset) { - return __parse_iso_8601_datetime(str, len, 1, out, out_local, out_tzoffset); -} - -// slightly faster version of parse_iso_8601_datetime which -// doesn't set Python exceptions but still returns -1 on error -int parse_iso_8601_datetime_noexc(const char *str, int len, - npy_datetimestruct *out, - int *out_local, int *out_tzoffset) { - return __parse_iso_8601_datetime(str, len, 0, out, out_local, out_tzoffset); -} - -static int __parse_iso_8601_datetime(const char *str, int len, int want_exc, +int parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, int *out_local, int *out_tzoffset) { int year_leap = 0; diff --git a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h index 4de5d0ff9833c..880c34ea77638 100644 --- a/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h +++ b/pandas/_libs/tslibs/src/datetime/np_datetime_strings.h @@ -54,17 +54,10 @@ This file implements string parsing and creation for NumPy datetime. * Returns 0 on success, -1 on failure. */ int -parse_iso_8601_datetime(const char *str, int len, +parse_iso_8601_datetime(const char *str, int len, int want_exc, npy_datetimestruct *out, int *out_local, int *out_tzoffset); -// slightly faster version of parse_iso_8601_datetime which -// doesn't set Python exceptions but still returns -1 on error -int -parse_iso_8601_datetime_noexc(const char *str, int len, - npy_datetimestruct *out, - int *out_local, - int *out_tzoffset); /* * Provides a string length to use for converting datetime diff --git a/pandas/tests/tslibs/test_parse_iso8601.py b/pandas/tests/tslibs/test_parse_iso8601.py index 8c995f243a993..0b61a0cb5826b 100644 --- a/pandas/tests/tslibs/test_parse_iso8601.py +++ b/pandas/tests/tslibs/test_parse_iso8601.py @@ -46,18 +46,13 @@ def test_parsers_iso8601(date_str, exp): "20010101 12345Z", ]) def test_parsers_iso8601_invalid(date_str): - msg = "Error parsing datetime string \"{s}\"".format(s=date_str) - - with pytest.raises(ValueError, match=msg): + with pytest.raises(ValueError): tslib._test_parse_iso8601(date_str) def test_parsers_iso8601_invalid_offset_invalid(): date_str = "2001-01-01 12-34-56" - msg = ("Timezone hours offset out of range " - "in datetime string \"{s}\"".format(s=date_str)) - - with pytest.raises(ValueError, match=msg): + with pytest.raises(ValueError): tslib._test_parse_iso8601(date_str) From 91dada83391495840fd8964cf9128b4428de11a5 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 29 Apr 2019 09:48:49 +0300 Subject: [PATCH 07/14] added 'want_exc' param to '_string_to_dts' --- pandas/_libs/tslib.pyx | 104 ++++++++++------------ pandas/_libs/tslibs/conversion.pyx | 2 +- pandas/_libs/tslibs/np_datetime.pxd | 3 +- pandas/_libs/tslibs/np_datetime.pyx | 5 +- pandas/tests/tslibs/test_parse_iso8601.py | 9 +- 5 files changed, 61 insertions(+), 62 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index e716dd7de2832..f53f63c9e600c 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -205,8 +205,7 @@ def _test_parse_iso8601(object ts): elif ts == 'today': return Timestamp.now().normalize() - if _string_to_dts(ts, &obj.dts, &out_local, &out_tzoffset) == -1: - raise ValueError + _string_to_dts(ts, &obj.dts, &out_local, &out_tzoffset, True) obj.value = dtstruct_to_dt64(&obj.dts) check_dts_bounds(&obj.dts) if out_local == 1: @@ -583,62 +582,55 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', string_to_dts_failed = _string_to_dts( val, &dts, &out_local, - &out_tzoffset + &out_tzoffset, False ) - try: - if string_to_dts_failed: - # An error at this point is a _parsing_ error - # specifically _not_ OutOfBoundsDatetime - if _parse_today_now(val, &iresult[i]): - continue - elif require_iso8601: - # if requiring iso8601 strings, skip trying - # other formats - if is_coerce: - iresult[i] = NPY_NAT - continue - elif is_raise: - raise ValueError("time data {val} doesn't " - "match format specified" - .format(val=val)) - return values, tz_out - - try: - py_dt = parse_datetime_string( - val, - dayfirst=dayfirst, - yearfirst=yearfirst - ) - except Exception: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise TypeError("invalid string coercion to " - "datetime") - - # If the dateutil parser returned tzinfo, - # capture it to check if all arguments - # have the same tzinfo - tz = py_dt.utcoffset() - if tz is not None: - seen_datetime_offset = 1 - # dateutil timezone objects cannot be hashed, - # so store the UTC offsets in seconds instead - out_tzoffset_vals.add(tz.total_seconds()) - else: - # Add a marker for naive string, - # to track if we are - # parsing mixed naive and aware strings - out_tzoffset_vals.add('naive') - - _ts = convert_datetime_to_tsobject(py_dt, None) - iresult[i] = _ts.value - except: - # TODO: What exception are we concerned with here? - if is_coerce: - iresult[i] = NPY_NAT + if string_to_dts_failed: + # An error at this point is a _parsing_ error + # specifically _not_ OutOfBoundsDatetime + if _parse_today_now(val, &iresult[i]): continue - raise + elif require_iso8601: + # if requiring iso8601 strings, skip trying + # other formats + if is_coerce: + iresult[i] = NPY_NAT + continue + elif is_raise: + raise ValueError("time data {val} doesn't " + "match format specified" + .format(val=val)) + return values, tz_out + + try: + py_dt = parse_datetime_string( + val, + dayfirst=dayfirst, + yearfirst=yearfirst + ) + except Exception: + if is_coerce: + iresult[i] = NPY_NAT + continue + raise TypeError("invalid string coercion to " + "datetime") + + # If the dateutil parser returned tzinfo, + # capture it to check if all arguments + # have the same tzinfo + tz = py_dt.utcoffset() + if tz is not None: + seen_datetime_offset = 1 + # dateutil timezone objects cannot be hashed, + # so store the UTC offsets in seconds instead + out_tzoffset_vals.add(tz.total_seconds()) + else: + # Add a marker for naive string, + # to track if we are + # parsing mixed naive and aware strings + out_tzoffset_vals.add('naive') + + _ts = convert_datetime_to_tsobject(py_dt, None) + iresult[i] = _ts.value if not string_to_dts_failed: # No error reported by string_to_dts, pick back up # where we left off diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 65782fed54504..f3f5ee714684c 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -445,7 +445,7 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, else: string_to_dts_failed = _string_to_dts( ts, &obj.dts, &out_local, - &out_tzoffset + &out_tzoffset, False ) if string_to_dts_failed: try: diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index 772a86e531ded..020bcdf0a7b15 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -73,4 +73,5 @@ cdef npy_timedelta get_timedelta64_value(object obj) nogil cdef NPY_DATETIMEUNIT get_datetime64_unit(object obj) nogil cdef int _string_to_dts(object val, npy_datetimestruct* dts, - int* out_local, int* out_tzoffset) + int* out_local, int* out_tzoffset, + bint want_exc) except? -1 diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 5ff1820f060f7..7d362708015ce 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -170,11 +170,12 @@ cdef inline int64_t pydate_to_dt64(date val, npy_datetimestruct *dts): cdef inline int _string_to_dts(object val, npy_datetimestruct* dts, - int* out_local, int* out_tzoffset): + int* out_local, int* out_tzoffset, + bint want_exc) except? -1: cdef: Py_ssize_t length const char* buf buf = get_c_string_buf_and_size(val, &length) - return parse_iso_8601_datetime(buf, length, 0, + return parse_iso_8601_datetime(buf, length, want_exc, dts, out_local, out_tzoffset) diff --git a/pandas/tests/tslibs/test_parse_iso8601.py b/pandas/tests/tslibs/test_parse_iso8601.py index 0b61a0cb5826b..8c995f243a993 100644 --- a/pandas/tests/tslibs/test_parse_iso8601.py +++ b/pandas/tests/tslibs/test_parse_iso8601.py @@ -46,13 +46,18 @@ def test_parsers_iso8601(date_str, exp): "20010101 12345Z", ]) def test_parsers_iso8601_invalid(date_str): - with pytest.raises(ValueError): + msg = "Error parsing datetime string \"{s}\"".format(s=date_str) + + with pytest.raises(ValueError, match=msg): tslib._test_parse_iso8601(date_str) def test_parsers_iso8601_invalid_offset_invalid(): date_str = "2001-01-01 12-34-56" - with pytest.raises(ValueError): + msg = ("Timezone hours offset out of range " + "in datetime string \"{s}\"".format(s=date_str)) + + with pytest.raises(ValueError, match=msg): tslib._test_parse_iso8601(date_str) From 389253af8ac654393a200e69799500c5fa1aae5c Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 29 Apr 2019 11:43:42 +0300 Subject: [PATCH 08/14] fixed flake8 errors --- pandas/_libs/tslib.pyx | 21 +++++++++------------ pandas/_libs/tslibs/conversion.pyx | 6 +++--- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index f53f63c9e600c..aa403b227c74b 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -20,9 +20,8 @@ from pandas._libs.util cimport ( from pandas._libs.tslibs.c_timestamp cimport _Timestamp from pandas._libs.tslibs.np_datetime cimport ( - check_dts_bounds, npy_datetimestruct, _string_to_dts, - dt64_to_dtstruct, dtstruct_to_dt64, pydatetime_to_dt64, pydate_to_dt64, - get_datetime64_value) + check_dts_bounds, npy_datetimestruct, _string_to_dts, dt64_to_dtstruct, + dtstruct_to_dt64, pydatetime_to_dt64, pydate_to_dt64, get_datetime64_value) from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.parsing import parse_datetime_string @@ -597,8 +596,8 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', continue elif is_raise: raise ValueError("time data {val} doesn't " - "match format specified" - .format(val=val)) + "match format specified" + .format(val=val)) return values, tz_out try: @@ -614,18 +613,16 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', raise TypeError("invalid string coercion to " "datetime") - # If the dateutil parser returned tzinfo, - # capture it to check if all arguments - # have the same tzinfo + # If the dateutil parser returned tzinfo, capture it + # to check if all arguments have the same tzinfo tz = py_dt.utcoffset() if tz is not None: seen_datetime_offset = 1 - # dateutil timezone objects cannot be hashed, - # so store the UTC offsets in seconds instead + # dateutil timezone objects cannot be hashed, so + # store the UTC offsets in seconds instead out_tzoffset_vals.add(tz.total_seconds()) else: - # Add a marker for naive string, - # to track if we are + # Add a marker for naive string, to track if we are # parsing mixed naive and aware strings out_tzoffset_vals.add('naive') diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index f3f5ee714684c..fb81d5b9b8eb0 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -467,10 +467,10 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, else: # Keep the converter same as PyDateTime's obj = convert_to_tsobject(obj.value, obj.tzinfo, - None, 0, 0) + None, 0, 0) dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day, - obj.dts.hour, obj.dts.min, obj.dts.sec, - obj.dts.us, obj.tzinfo) + obj.dts.hour, obj.dts.min, obj.dts.sec, + obj.dts.us, obj.tzinfo) obj = convert_datetime_to_tsobject( dt, tz, nanos=obj.dts.ps // 1000) return obj From 1233eb4195dd5aa1f9245791c2047c91b5934d60 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 2 May 2019 13:39:15 +0300 Subject: [PATCH 09/14] removed extra 'check_dts_bounds' call and 'try/except Exception:' block --- pandas/_libs/tslibs/conversion.pyx | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index fb81d5b9b8eb0..cc7fc16613751 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -423,6 +423,7 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, _TSObject obj int out_local = 0, out_tzoffset = 0 datetime dt + bint do_parse_datetime_string = False if tz is not None: tz = maybe_get_tz(tz) @@ -447,21 +448,14 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, ts, &obj.dts, &out_local, &out_tzoffset, False ) - if string_to_dts_failed: - try: - ts = parse_datetime_string(ts, dayfirst=dayfirst, - yearfirst=yearfirst) - except Exception: - raise ValueError("could not convert string to Timestamp") - else: - try: + try: + if not string_to_dts_failed: obj.value = dtstruct_to_dt64(&obj.dts) check_dts_bounds(&obj.dts) if out_local == 1: obj.tzinfo = pytz.FixedOffset(out_tzoffset) obj.value = tz_convert_single(obj.value, obj.tzinfo, UTC) if tz is None: - check_dts_bounds(&obj.dts) check_overflows(obj) return obj else: @@ -482,13 +476,17 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, ts = tz_localize_to_utc(np.array([ts], dtype='i8'), tz, ambiguous='raise')[0] - except OutOfBoundsDatetime: - # GH#19382 for just-barely-OutOfBounds falling back to dateutil - # parser will return incorrect result because it will ignore - # nanoseconds - raise + except OutOfBoundsDatetime: + # GH#19382 for just-barely-OutOfBounds falling back to dateutil + # parser will return incorrect result because it will ignore + # nanoseconds + raise + + except ValueError: + do_parse_datetime_string = True - except ValueError: + finally: + if string_to_dts_failed or do_parse_datetime_string: try: ts = parse_datetime_string(ts, dayfirst=dayfirst, yearfirst=yearfirst) From 09115c6538ecf2d50db8701708a3d53a00f78439 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 2 May 2019 13:59:28 +0300 Subject: [PATCH 10/14] changed formatting for 'parse_datetime_string' call in tslib.pyx --- pandas/_libs/tslib.pyx | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index aa403b227c74b..68ad38dc37165 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -601,11 +601,9 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', return values, tz_out try: - py_dt = parse_datetime_string( - val, - dayfirst=dayfirst, - yearfirst=yearfirst - ) + py_dt = parse_datetime_string(val, + dayfirst=dayfirst, + yearfirst=yearfirst) except Exception: if is_coerce: iresult[i] = NPY_NAT From 43cc10b8c7fdbca1257eef7ed38c1a6e9e3f1f94 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 2 May 2019 18:13:39 +0300 Subject: [PATCH 11/14] added new helper function - 'setup_tsobject_tz_using_offset' --- pandas/_libs/tslibs/conversion.pyx | 52 ++++++++++++++++-------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index cc7fc16613751..8069ba6e3c93b 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -392,6 +392,25 @@ cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, return obj +cdef _TSObject setup_tsobject_tz_using_offset(_TSObject obj, + object tz, int tzoffset): + obj.tzinfo = pytz.FixedOffset(tzoffset) + obj.value = tz_convert_single(obj.value, obj.tzinfo, UTC) + if tz is None: + check_overflows(obj) + return obj + else: + # Keep the converter same as PyDateTime's + obj = convert_to_tsobject(obj.value, obj.tzinfo, + None, 0, 0) + dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day, + obj.dts.hour, obj.dts.min, obj.dts.sec, + obj.dts.us, obj.tzinfo) + obj = convert_datetime_to_tsobject( + dt, tz, nanos=obj.dts.ps // 1000) + return obj + + cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, bint dayfirst=False, bint yearfirst=False): @@ -450,25 +469,11 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, ) try: if not string_to_dts_failed: - obj.value = dtstruct_to_dt64(&obj.dts) check_dts_bounds(&obj.dts) + obj.value = dtstruct_to_dt64(&obj.dts) if out_local == 1: - obj.tzinfo = pytz.FixedOffset(out_tzoffset) - obj.value = tz_convert_single(obj.value, obj.tzinfo, UTC) - if tz is None: - check_overflows(obj) - return obj - else: - # Keep the converter same as PyDateTime's - obj = convert_to_tsobject(obj.value, obj.tzinfo, - None, 0, 0) - dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day, - obj.dts.hour, obj.dts.min, obj.dts.sec, - obj.dts.us, obj.tzinfo) - obj = convert_datetime_to_tsobject( - dt, tz, nanos=obj.dts.ps // 1000) - return obj - + return setup_tsobject_tz_using_offset(obj, tz, + out_tzoffset) else: ts = obj.value if tz is not None: @@ -485,13 +490,12 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, except ValueError: do_parse_datetime_string = True - finally: - if string_to_dts_failed or do_parse_datetime_string: - try: - ts = parse_datetime_string(ts, dayfirst=dayfirst, - yearfirst=yearfirst) - except Exception: - raise ValueError("could not convert string to Timestamp") + if string_to_dts_failed or do_parse_datetime_string: + try: + ts = parse_datetime_string(ts, dayfirst=dayfirst, + yearfirst=yearfirst) + except Exception: + raise ValueError("could not convert string to Timestamp") return convert_to_tsobject(ts, tz, unit, dayfirst, yearfirst) From ce242f27ad9ddb9eccc53b8c84e49227f6a68aba Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 2 May 2019 21:12:32 +0300 Subject: [PATCH 12/14] added doc-string --- pandas/_libs/tslibs/conversion.pyx | 65 +++++++++++++++++++----------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 8069ba6e3c93b..a19695dfd6bec 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -392,23 +392,42 @@ cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, return obj -cdef _TSObject setup_tsobject_tz_using_offset(_TSObject obj, - object tz, int tzoffset): - obj.tzinfo = pytz.FixedOffset(tzoffset) - obj.value = tz_convert_single(obj.value, obj.tzinfo, UTC) +cdef _TSObject create_tsobject_tz_using_offset(int64_t value, + object tz, int tzoffset): + """ + Create tsobject from numpy datetime64 using initial timezone offset + + Parameters + ---------- + value: int64_t + numpy dt64 + tz : tzinfo or None + timezone for the timezone-aware output. + tzoffset: int + + Returns + obj : _TSObject + ------- + + """ + cdef: + _TSObject obj + datetime dt + + tzinfo = pytz.FixedOffset(tzoffset) + value = tz_convert_single(value, tzinfo, UTC) + obj = convert_to_tsobject(value, tzinfo, None, 0, 0) if tz is None: check_overflows(obj) return obj - else: - # Keep the converter same as PyDateTime's - obj = convert_to_tsobject(obj.value, obj.tzinfo, - None, 0, 0) - dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day, - obj.dts.hour, obj.dts.min, obj.dts.sec, - obj.dts.us, obj.tzinfo) - obj = convert_datetime_to_tsobject( - dt, tz, nanos=obj.dts.ps // 1000) - return obj + + # Keep the converter same as PyDateTime's + dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day, + obj.dts.hour, obj.dts.min, obj.dts.sec, + obj.dts.us, obj.tzinfo) + obj = convert_datetime_to_tsobject( + dt, tz, nanos=obj.dts.ps // 1000) + return obj cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, @@ -439,16 +458,14 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, obj : _TSObject """ cdef: - _TSObject obj + npy_datetimestruct dts + int64_t value # numpy dt64 int out_local = 0, out_tzoffset = 0 - datetime dt bint do_parse_datetime_string = False if tz is not None: tz = maybe_get_tz(tz) - obj = _TSObject() - assert isinstance(ts, str) if len(ts) == 0 or ts in nat_strings: @@ -464,18 +481,18 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, # equiv: datetime.today().replace(tzinfo=tz) else: string_to_dts_failed = _string_to_dts( - ts, &obj.dts, &out_local, + ts, &dts, &out_local, &out_tzoffset, False ) try: if not string_to_dts_failed: - check_dts_bounds(&obj.dts) - obj.value = dtstruct_to_dt64(&obj.dts) + check_dts_bounds(&dts) + value = dtstruct_to_dt64(&dts) if out_local == 1: - return setup_tsobject_tz_using_offset(obj, tz, - out_tzoffset) + return create_tsobject_tz_using_offset(value, tz, + out_tzoffset) else: - ts = obj.value + ts = value if tz is not None: # shift for localize_tso ts = tz_localize_to_utc(np.array([ts], dtype='i8'), tz, From 543257ae0bac8f1ed9749c4809fb360fad8318a0 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 2 May 2019 23:18:17 +0300 Subject: [PATCH 13/14] fixed doc-string; changed parameters order in 'create_tsobject_tz_using_offset' func --- pandas/_libs/tslibs/conversion.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index a19695dfd6bec..718fcf94d1285 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -393,9 +393,10 @@ cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, cdef _TSObject create_tsobject_tz_using_offset(int64_t value, - object tz, int tzoffset): + int tzoffset, object tz=None): """ - Create tsobject from numpy datetime64 using initial timezone offset + Convert a numpy datetime64 `value`, along with initial timezone offset + `tzoffset` to a _TSObject (with timezone object `tz` - optional). Parameters ---------- @@ -406,9 +407,8 @@ cdef _TSObject create_tsobject_tz_using_offset(int64_t value, tzoffset: int Returns - obj : _TSObject ------- - + obj : _TSObject """ cdef: _TSObject obj @@ -489,8 +489,8 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, check_dts_bounds(&dts) value = dtstruct_to_dt64(&dts) if out_local == 1: - return create_tsobject_tz_using_offset(value, tz, - out_tzoffset) + return create_tsobject_tz_using_offset(value, + out_tzoffset, tz) else: ts = value if tz is not None: From cffa8a6185335cdee3c9448178a4eaaa497ada82 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 6 May 2019 09:52:33 +0300 Subject: [PATCH 14/14] fixed parameters order in 'create_tsobject_tz_using_offset' doc-string --- pandas/_libs/tslibs/conversion.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 718fcf94d1285..bee3e28874a05 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -402,9 +402,9 @@ cdef _TSObject create_tsobject_tz_using_offset(int64_t value, ---------- value: int64_t numpy dt64 + tzoffset: int tz : tzinfo or None timezone for the timezone-aware output. - tzoffset: int Returns -------