Skip to content

Commit 3283b81

Browse files
author
MarcoGorelli
committed
share paths and fix bugs
1 parent d800024 commit 3283b81

File tree

8 files changed

+101
-99
lines changed

8 files changed

+101
-99
lines changed

doc/source/whatsnew/v2.0.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -790,6 +790,8 @@ Datetimelike
790790
- Bug in :func:`to_datetime` was raising ``ValueError`` when parsing :class:`Timestamp`, ``datetime.datetime``, ``datetime.date``, or ``np.datetime64`` objects when non-ISO8601 ``format`` was passed (:issue:`49298`, :issue:`50036`)
791791
- Bug in :func:`to_datetime` was raising ``ValueError`` when parsing empty string and non-ISO8601 format was passed. Now, empty strings will be parsed as :class:`NaT`, for compatibility with how this is done for ISO8601 formats (:issue:`50251`)
792792
- Bug in :class:`Timestamp` was showing ``UserWarning``, which was not actionable by users, when parsing non-ISO8601 delimited date strings (:issue:`50232`)
793+
- Bug in :class:`Timestamp` was showing ``UserWarning`` which was not actionable by users (:issue:`50232`)
794+
- Bug in :func:`to_datetime` was raising ``ValueError`` when parsing empty string and non-ISO8601 format was passed. Now, empty strings will be parsed as :class:`NaT`, for compatibility with how this is done for ISO8601 formats (:issue:`50251`)
793795
-
794796

795797
Timedelta

pandas/_libs/tslib.pyx

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -410,8 +410,6 @@ cpdef array_to_datetime(
410410
bint yearfirst=False,
411411
bint utc=False,
412412
bint require_iso8601=False,
413-
format: str | None=None,
414-
bint exact=True,
415413
):
416414
"""
417415
Converts a 1D array of date-like values to a numpy array of either:
@@ -517,7 +515,7 @@ cpdef array_to_datetime(
517515
elif is_raise:
518516
raise ValueError(
519517
f"time data \"{val}\" at position {i} doesn't "
520-
f"match format \"{format}\""
518+
f"match ISO8601 standard"
521519
)
522520
return values, tz_out
523521
# these must be ns unit by-definition
@@ -550,7 +548,7 @@ cpdef array_to_datetime(
550548

551549
string_to_dts_failed = string_to_dts(
552550
val, &dts, &out_bestunit, &out_local,
553-
&out_tzoffset, False, format, exact
551+
&out_tzoffset, False, None, False
554552
)
555553
if string_to_dts_failed:
556554
# An error at this point is a _parsing_ error

pandas/_libs/tslibs/parsing.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -846,7 +846,7 @@ def format_is_iso(f: str) -> bint:
846846
but must be consistent. Leading 0s in dates and times are optional.
847847
"""
848848
iso_template = "%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}".format
849-
excluded_formats = ["%Y%m%d", "%Y%m", "%Y"]
849+
excluded_formats = ["%Y%m", "%Y"]
850850

851851
for date_sep in [" ", "/", "\\", "-", ".", ""]:
852852
for time_sep in [" ", "T"]:

pandas/_libs/tslibs/strptime.pyx

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,14 @@ from pandas._libs.tslibs.nattype cimport (
3434
c_nat_strings as nat_strings,
3535
)
3636
from pandas._libs.tslibs.np_datetime cimport (
37+
NPY_DATETIMEUNIT,
3738
NPY_FR_ns,
3839
check_dts_bounds,
3940
npy_datetimestruct,
4041
npy_datetimestruct_to_datetime,
4142
pydate_to_dt64,
4243
pydatetime_to_dt64,
44+
string_to_dts,
4345
)
4446
from pandas._libs.tslibs.timestamps cimport _Timestamp
4547
from pandas._libs.util cimport (
@@ -93,6 +95,7 @@ def array_strptime(
9395
exact : matches must be exact if True, search if False
9496
errors : string specifying error handling, {'raise', 'ignore', 'coerce'}
9597
"""
98+
from pandas._libs.tslibs.parsing import format_is_iso
9699

97100
cdef:
98101
Py_ssize_t i, n = len(values)
@@ -110,6 +113,9 @@ def array_strptime(
110113
bint found_naive = False
111114
bint found_tz = False
112115
tzinfo tz_out = None
116+
bint iso_format = fmt is not None and format_is_iso(fmt)
117+
NPY_DATETIMEUNIT out_bestunit
118+
int out_local = 0, out_tzoffset = 0
113119

114120
assert is_raise or is_ignore or is_coerce
115121

@@ -198,6 +204,39 @@ def array_strptime(
198204
else:
199205
val = str(val)
200206

207+
if iso_format:
208+
string_to_dts_failed = string_to_dts(
209+
val, &dts, &out_bestunit, &out_local,
210+
&out_tzoffset, False, fmt, exact
211+
)
212+
if not string_to_dts_failed:
213+
# No error reported by string_to_dts, pick back up
214+
# where we left off
215+
value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
216+
if out_local == 1:
217+
# Store the out_tzoffset in seconds
218+
# since we store the total_seconds of
219+
# dateutil.tz.tzoffset objects
220+
# out_tzoffset_vals.add(out_tzoffset * 60.)
221+
tz = timezone(timedelta(minutes=out_tzoffset))
222+
result_timezone[i] = tz
223+
# value = tz_localize_to_utc_single(value, tz)
224+
out_local = 0
225+
out_tzoffset = 0
226+
iresult[i] = value
227+
try:
228+
check_dts_bounds(&dts)
229+
except ValueError:
230+
if is_coerce:
231+
iresult[i] = NPY_NAT
232+
continue
233+
raise
234+
continue
235+
236+
# Some ISO formats can't be parsed by string_to_dts
237+
# For example, 6-digit YYYYMD. So, if there's an error,
238+
# try the string-matching code below.
239+
201240
# exact matching
202241
if exact:
203242
found = format_regex.match(val)

pandas/core/arrays/datetimes.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2120,8 +2120,6 @@ def objects_to_datetime64ns(
21202120
errors: DateTimeErrorChoices = "raise",
21212121
require_iso8601: bool = False,
21222122
allow_object: bool = False,
2123-
format: str | None = None,
2124-
exact: bool = True,
21252123
):
21262124
"""
21272125
Convert data to array of timestamps.
@@ -2166,8 +2164,6 @@ def objects_to_datetime64ns(
21662164
dayfirst=dayfirst,
21672165
yearfirst=yearfirst,
21682166
require_iso8601=require_iso8601,
2169-
format=format,
2170-
exact=exact,
21712167
)
21722168
result = result.reshape(data.shape, order=order)
21732169
except OverflowError as err:

pandas/core/tools/datetimes.py

Lines changed: 6 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636
from pandas._libs.tslibs.conversion import precision_from_unit
3737
from pandas._libs.tslibs.parsing import (
3838
DateParseError,
39-
format_is_iso,
4039
guess_datetime_format,
4140
)
4241
from pandas._libs.tslibs.strptime import array_strptime
@@ -419,7 +418,6 @@ def _convert_listlike_datetimes(
419418

420419
# warn if passing timedelta64, raise for PeriodDtype
421420
# NB: this must come after unit transformation
422-
orig_arg = arg
423421
try:
424422
arg, _ = maybe_convert_dtype(arg, copy=False, tz=libtimezones.maybe_get_tz(tz))
425423
except TypeError:
@@ -432,35 +430,20 @@ def _convert_listlike_datetimes(
432430
raise
433431

434432
arg = ensure_object(arg)
435-
require_iso8601 = False
436433

437434
if format is None:
438435
format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)
439436

440-
# There is a special fast-path for iso8601 formatted datetime strings
441-
require_iso8601 = format is not None and format_is_iso(format)
442-
443-
if format is not None and not require_iso8601:
444-
return _to_datetime_with_format(
445-
arg,
446-
orig_arg,
447-
name,
448-
utc,
449-
format,
450-
exact,
451-
errors,
452-
)
437+
if format is not None:
438+
return _array_strptime_with_fallback(arg, name, utc, format, exact, errors)
453439

454440
result, tz_parsed = objects_to_datetime64ns(
455441
arg,
456442
dayfirst=dayfirst,
457443
yearfirst=yearfirst,
458444
utc=utc,
459445
errors=errors,
460-
require_iso8601=require_iso8601,
461446
allow_object=True,
462-
format=format,
463-
exact=exact,
464447
)
465448

466449
if tz_parsed is not None:
@@ -512,40 +495,6 @@ def _array_strptime_with_fallback(
512495
return _box_as_indexlike(result, utc=utc, name=name)
513496

514497

515-
def _to_datetime_with_format(
516-
arg,
517-
orig_arg,
518-
name,
519-
utc: bool,
520-
fmt: str,
521-
exact: bool,
522-
errors: str,
523-
) -> Index:
524-
"""
525-
Try parsing with the given format.
526-
"""
527-
result = None
528-
529-
# shortcut formatting here
530-
if fmt == "%Y%m%d":
531-
# pass orig_arg as float-dtype may have been converted to
532-
# datetime64[ns]
533-
orig_arg = ensure_object(orig_arg)
534-
try:
535-
# may return None without raising
536-
result = _attempt_YYYYMMDD(orig_arg, errors=errors)
537-
except (ValueError, TypeError, OutOfBoundsDatetime) as err:
538-
raise ValueError(
539-
"cannot convert the input to '%Y%m%d' date format"
540-
) from err
541-
if result is not None:
542-
return _box_as_indexlike(result, utc=utc, name=name)
543-
544-
# fallback
545-
res = _array_strptime_with_fallback(arg, name, utc, fmt, exact, errors)
546-
return res
547-
548-
549498
def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
550499
"""
551500
to_datetime specialized to the case where a 'unit' is passed.
@@ -1000,7 +949,7 @@ def to_datetime(
1000949
in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`.
1001950
1002951
>>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore')
1003-
datetime.datetime(1300, 1, 1, 0, 0)
952+
'13000101'
1004953
>>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce')
1005954
NaT
1006955
@@ -1033,14 +982,12 @@ def to_datetime(
1033982
Index([2020-10-25 02:00:00+02:00, 2020-10-25 04:00:00+01:00],
1034983
dtype='object')
1035984
1036-
- A mix of timezone-aware and timezone-naive inputs is converted to
1037-
a timezone-aware :class:`DatetimeIndex` if the offsets of the timezone-aware
1038-
are constant:
985+
- A mix of timezone-aware and timezone-naive inputs is also converted to
986+
a simple :class:`Index` containing :class:`datetime.datetime` objects:
1039987
1040988
>>> from datetime import datetime
1041989
>>> pd.to_datetime(["2020-01-01 01:00:00-01:00", datetime(2020, 1, 1, 3, 0)])
1042-
DatetimeIndex(['2020-01-01 01:00:00-01:00', '2020-01-01 02:00:00-01:00'],
1043-
dtype='datetime64[ns, UTC-01:00]', freq=None)
990+
Index([2020-01-01 01:00:00-01:00, 2020-01-01 03:00:00], dtype='object')
1044991
1045992
|
1046993

pandas/tests/tools/test_to_datetime.py

Lines changed: 44 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -132,8 +132,8 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache):
132132
# string with NaT
133133
ser2 = ser.apply(str)
134134
ser2[2] = "nat"
135-
result = to_datetime(ser2, format="%Y%m%d", cache=cache)
136-
tm.assert_series_equal(result, expected)
135+
with pytest.raises(ValueError, match="unconverted data remains: .0"):
136+
to_datetime(ser2, format="%Y%m%d", cache=cache)
137137

138138
def test_to_datetime_format_YYYYMM_with_nat(self, cache):
139139
# https://github.com/pandas-dev/pandas/issues/50237
@@ -146,13 +146,25 @@ def test_to_datetime_format_YYYYMM_with_nat(self, cache):
146146
result = to_datetime(ser, format="%Y%m", cache=cache)
147147
tm.assert_series_equal(result, expected)
148148

149+
def test_to_datetime_format_YYYYMM_with_nat_2(self, cache):
150+
# is this different from above? can remove?
151+
# https://github.com/pandas-dev/pandas/issues/50237
152+
ser = Series([198012, 198012] + [198101] * 5)
153+
expected = Series(
154+
[Timestamp("19801201"), Timestamp("19801201")] + [Timestamp("19810101")] * 5
155+
)
156+
expected[2] = np.nan
157+
ser[2] = np.nan
158+
result = to_datetime(ser, format="%Y%m", cache=cache)
159+
tm.assert_series_equal(result, expected)
160+
149161
def test_to_datetime_format_YYYYMMDD_ignore(self, cache):
150162
# coercion
151163
# GH 7930
152164
ser = Series([20121231, 20141231, 99991231])
153165
result = to_datetime(ser, format="%Y%m%d", errors="ignore", cache=cache)
154166
expected = Series(
155-
[datetime(2012, 12, 31), datetime(2014, 12, 31), datetime(9999, 12, 31)],
167+
[20121231, 20141231, 99991231],
156168
dtype=object,
157169
)
158170
tm.assert_series_equal(result, expected)
@@ -1737,8 +1749,8 @@ def test_dataframe_coerce(self, cache):
17371749
df2 = DataFrame({"year": [2015, 2016], "month": [2, 20], "day": [4, 5]})
17381750

17391751
msg = (
1740-
"cannot assemble the datetimes: time data .+ does not "
1741-
r"match format '%Y%m%d' \(match\)"
1752+
r"cannot assemble the datetimes: time data '.*' does not match "
1753+
r"format '%Y%m%d' \(match\)"
17421754
)
17431755
with pytest.raises(ValueError, match=msg):
17441756
to_datetime(df2, cache=cache)
@@ -1873,10 +1885,7 @@ def test_to_datetime_iso8601_fails(self, input, format, exact):
18731885
# `format` is longer than the string, so this fails regardless of `exact`
18741886
with pytest.raises(
18751887
ValueError,
1876-
match=(
1877-
rf"time data \"{input}\" at position 0 doesn't match format "
1878-
rf"\"{format}\""
1879-
),
1888+
match=(rf"time data '{input}' does not match format '{format}'"),
18801889
):
18811890
to_datetime(input, format=format, exact=exact)
18821891

@@ -1895,10 +1904,7 @@ def test_to_datetime_iso8601_exact_fails(self, input, format):
18951904
# `format` is shorter than the date string, so only fails with `exact=True`
18961905
with pytest.raises(
18971906
ValueError,
1898-
match=(
1899-
rf"time data \"{input}\" at position 0 doesn't match format "
1900-
rf"\"{format}\""
1901-
),
1907+
match=("unconverted data remains: |does not match format"),
19021908
):
19031909
to_datetime(input, format=format)
19041910

@@ -1934,10 +1940,7 @@ def test_to_datetime_iso8601_separator(self, input, format):
19341940
# https://github.com/pandas-dev/pandas/issues/12649
19351941
with pytest.raises(
19361942
ValueError,
1937-
match=(
1938-
rf"time data \"{input}\" at position 0 doesn\'t match format "
1939-
rf"\"{format}\""
1940-
),
1943+
match=(rf"time data \'{input}\' does not match format '{format}'"),
19411944
):
19421945
to_datetime(input, format=format)
19431946

@@ -1963,6 +1966,28 @@ def test_to_datetime_iso8601_valid(self, input, format):
19631966
result = to_datetime(input, format=format)
19641967
assert result == expected
19651968

1969+
@pytest.mark.parametrize(
1970+
"input, format",
1971+
[
1972+
("2020-1", "%Y-%m"),
1973+
("2020-1-1", "%Y-%m-%d"),
1974+
("2020-1-1 0", "%Y-%m-%d %H"),
1975+
("2020-1-1T0", "%Y-%m-%dT%H"),
1976+
("2020-1-1 0:0", "%Y-%m-%d %H:%M"),
1977+
("2020-1-1T0:0", "%Y-%m-%dT%H:%M"),
1978+
("2020-1-1 0:0:0", "%Y-%m-%d %H:%M:%S"),
1979+
("2020-1-1T0:0:0", "%Y-%m-%dT%H:%M:%S"),
1980+
("2020-1-1T0:0:0.000", "%Y-%m-%dT%H:%M:%S.%f"),
1981+
("2020-1-1T0:0:0.000000", "%Y-%m-%dT%H:%M:%S.%f"),
1982+
("2020-1-1T0:0:0.000000000", "%Y-%m-%dT%H:%M:%S.%f"),
1983+
],
1984+
)
1985+
def test_to_datetime_iso8601_non_padded(self, input, format):
1986+
# https://github.com/pandas-dev/pandas/issues/21422
1987+
expected = Timestamp(2020, 1, 1)
1988+
result = to_datetime(input, format=format)
1989+
assert result == expected
1990+
19661991
@pytest.mark.parametrize(
19671992
"input, format",
19681993
[
@@ -2465,7 +2490,7 @@ def test_day_not_in_month_raise(self, cache):
24652490

24662491
@pytest.mark.parametrize("arg", ["2015-02-29", "2015-02-32", "2015-04-31"])
24672492
def test_day_not_in_month_raise_value(self, cache, arg):
2468-
msg = f'time data "{arg}" at position 0 doesn\'t match format "%Y-%m-%d"'
2493+
msg = "day is out of range for month|unconverted data remains"
24692494
with pytest.raises(ValueError, match=msg):
24702495
to_datetime(arg, errors="raise", format="%Y-%m-%d", cache=cache)
24712496

@@ -2859,10 +2884,7 @@ def test_incorrect_value_exception(self):
28592884
)
28602885
def test_to_datetime_out_of_bounds_with_format_arg(self, format, warning):
28612886
# see gh-23830
2862-
msg = (
2863-
"Out of bounds nanosecond timestamp: 2417-10-27 00:00:00 "
2864-
"present at position 0"
2865-
)
2887+
msg = "Out of bounds nanosecond timestamp: 2417-10-27 00:00:00"
28662888
with pytest.raises(OutOfBoundsDatetime, match=msg):
28672889
with tm.assert_produces_warning(warning, match="Could not infer format"):
28682890
to_datetime("2417-10-27 00:00:00", format=format)

0 commit comments

Comments
 (0)