-
-
Notifications
You must be signed in to change notification settings - Fork 18.7k
BUG: to_datetime with decimal number doesn't fail for %Y%m%d #50054
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
2004fb7
65149de
83a4ec6
754fdfe
69a487a
a004790
ad8a56a
22ca184
6e8280b
55dc073
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,11 +27,6 @@ def try_parse_dates( | |
dayfirst: bool = ..., | ||
default: datetime | None = ..., | ||
) -> npt.NDArray[np.object_]: ... | ||
def try_parse_year_month_day( | ||
years: npt.NDArray[np.object_], # object[:] | ||
months: npt.NDArray[np.object_], # object[:] | ||
days: npt.NDArray[np.object_], # object[:] | ||
) -> npt.NDArray[np.object_]: ... | ||
Comment on lines
-30
to
-34
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. removing this function altogether, special-casing The non-ISO8601 path can handle this format just fine, let's use that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does this impact perf? im assuming this was introduced as a fastpath |
||
def try_parse_datetime_components( | ||
years: npt.NDArray[np.object_], # object[:] | ||
months: npt.NDArray[np.object_], # object[:] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,8 +24,6 @@ | |
Timedelta, | ||
Timestamp, | ||
iNaT, | ||
nat_strings, | ||
parsing, | ||
timezones as libtimezones, | ||
) | ||
from pandas._libs.tslibs.parsing import ( | ||
|
@@ -38,7 +36,6 @@ | |
AnyArrayLike, | ||
ArrayLike, | ||
DateTimeErrorChoices, | ||
npt, | ||
) | ||
|
||
from pandas.core.dtypes.common import ( | ||
|
@@ -57,13 +54,11 @@ | |
ABCDataFrame, | ||
ABCSeries, | ||
) | ||
from pandas.core.dtypes.missing import notna | ||
|
||
from pandas.arrays import ( | ||
DatetimeArray, | ||
IntegerArray, | ||
) | ||
from pandas.core import algorithms | ||
from pandas.core.algorithms import unique | ||
from pandas.core.arrays.base import ExtensionArray | ||
from pandas.core.arrays.datetimes import ( | ||
|
@@ -407,7 +402,6 @@ def _convert_listlike_datetimes( | |
|
||
# warn if passing timedelta64, raise for PeriodDtype | ||
# NB: this must come after unit transformation | ||
orig_arg = arg | ||
try: | ||
arg, _ = maybe_convert_dtype(arg, copy=False, tz=libtimezones.maybe_get_tz(tz)) | ||
except TypeError: | ||
|
@@ -435,8 +429,8 @@ def _convert_listlike_datetimes( | |
require_iso8601 = not infer_datetime_format | ||
|
||
if format is not None and not require_iso8601: | ||
res = _to_datetime_with_format( | ||
arg, orig_arg, name, utc, format, exact, errors, infer_datetime_format | ||
res = _array_strptime_with_fallback( | ||
arg, name, utc, format, exact, errors, infer_datetime_format | ||
) | ||
if res is not None: | ||
return res | ||
|
@@ -510,43 +504,6 @@ def _array_strptime_with_fallback( | |
return _box_as_indexlike(result, utc=utc, name=name) | ||
|
||
|
||
def _to_datetime_with_format( | ||
arg, | ||
orig_arg, | ||
name, | ||
utc: bool, | ||
fmt: str, | ||
exact: bool, | ||
errors: str, | ||
infer_datetime_format: bool, | ||
) -> Index | None: | ||
""" | ||
Try parsing with the given format, returning None on failure. | ||
""" | ||
result = None | ||
|
||
# shortcut formatting here | ||
if fmt == "%Y%m%d": | ||
# pass orig_arg as float-dtype may have been converted to | ||
# datetime64[ns] | ||
orig_arg = ensure_object(orig_arg) | ||
try: | ||
# may return None without raising | ||
result = _attempt_YYYYMMDD(orig_arg, errors=errors) | ||
except (ValueError, TypeError, OutOfBoundsDatetime) as err: | ||
raise ValueError( | ||
"cannot convert the input to '%Y%m%d' date format" | ||
) from err | ||
if result is not None: | ||
return _box_as_indexlike(result, utc=utc, name=name) | ||
|
||
# fallback | ||
res = _array_strptime_with_fallback( | ||
arg, name, utc, fmt, exact, errors, infer_datetime_format | ||
) | ||
return res | ||
|
||
|
||
def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: | ||
""" | ||
to_datetime specalized to the case where a 'unit' is passed. | ||
|
@@ -976,7 +933,7 @@ def to_datetime( | |
in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`. | ||
|
||
>>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore') | ||
datetime.datetime(1300, 1, 1, 0, 0) | ||
'13000101' | ||
Comment on lines
932
to
+933
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this one also looks like it was wrong to begin with - we don't return
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. are there many other cases where we get pydatetime objects instead of Timestamps? If no, then i think it makes sense to tighten and say "you always get Timestamps or ignore/raise". If yes, then i think returning the pydatetime objects is better here than the string (and probably also for '1300' with '%Y') There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've opened #50072, let's move this discussion there |
||
>>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') | ||
NaT | ||
|
||
|
@@ -1244,60 +1201,6 @@ def coerce(values): | |
return values | ||
|
||
|
||
def _attempt_YYYYMMDD(arg: npt.NDArray[np.object_], errors: str) -> np.ndarray | None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think there's an issue about this fastpath not actually being fast. if this is removed, the issue is probably closeable There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately, not quite I've left a comment there, but removing it wouldn't fix the issue because 6-digit %Y%m%d dates wouldn't get parsed |
||
""" | ||
try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like, | ||
arg is a passed in as an object dtype, but could really be ints/strings | ||
with nan-like/or floats (e.g. with nan) | ||
|
||
Parameters | ||
---------- | ||
arg : np.ndarray[object] | ||
errors : {'raise','ignore','coerce'} | ||
""" | ||
|
||
def calc(carg): | ||
# calculate the actual result | ||
carg = carg.astype(object, copy=False) | ||
parsed = parsing.try_parse_year_month_day( | ||
carg / 10000, carg / 100 % 100, carg % 100 | ||
) | ||
return tslib.array_to_datetime(parsed, errors=errors)[0] | ||
|
||
def calc_with_mask(carg, mask): | ||
result = np.empty(carg.shape, dtype="M8[ns]") | ||
iresult = result.view("i8") | ||
iresult[~mask] = iNaT | ||
|
||
masked_result = calc(carg[mask].astype(np.float64).astype(np.int64)) | ||
result[mask] = masked_result.astype("M8[ns]") | ||
return result | ||
|
||
# try intlike / strings that are ints | ||
try: | ||
return calc(arg.astype(np.int64)) | ||
except (ValueError, OverflowError, TypeError): | ||
pass | ||
|
||
# a float with actual np.nan | ||
try: | ||
carg = arg.astype(np.float64) | ||
return calc_with_mask(carg, notna(carg)) | ||
except (ValueError, OverflowError, TypeError): | ||
pass | ||
|
||
# string with NaN-like | ||
try: | ||
# error: Argument 2 to "isin" has incompatible type "List[Any]"; expected | ||
# "Union[Union[ExtensionArray, ndarray], Index, Series]" | ||
mask = ~algorithms.isin(arg, list(nat_strings)) # type: ignore[arg-type] | ||
return calc_with_mask(arg, mask) | ||
except (ValueError, OverflowError, TypeError): | ||
pass | ||
|
||
return None | ||
|
||
|
||
__all__ = [ | ||
"DateParseError", | ||
"should_cache", | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -125,24 +125,21 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache): | |
expected[2] = np.nan | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't need to have There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks - I've marked as draft as there might be a better solution out there which doesn't degrade performance, just working on it |
||
ser[2] = np.nan | ||
|
||
result = to_datetime(ser, format="%Y%m%d", cache=cache) | ||
tm.assert_series_equal(result, expected) | ||
with pytest.raises(ValueError, match=None): | ||
to_datetime(ser, format="%Y%m%d", cache=cache) | ||
Comment on lines
-128
to
+130
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this should've always raised - |
||
|
||
# string with NaT | ||
ser2 = ser.apply(str) | ||
ser2[2] = "nat" | ||
result = to_datetime(ser2, format="%Y%m%d", cache=cache) | ||
tm.assert_series_equal(result, expected) | ||
with pytest.raises(ValueError, match=None): | ||
to_datetime(ser2, format="%Y%m%d", cache=cache) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Might be worth adding a test where the ser3 = ser.astype("Int64")
ser3[2] = pd.NA
to_datetime(ser3, format="%Y%m%d") |
||
|
||
def test_to_datetime_format_YYYYMMDD_ignore(self, cache): | ||
# coercion | ||
# GH 7930 | ||
ser = Series([20121231, 20141231, 99991231]) | ||
expected = Series([20121231, 20141231, 99991231], dtype=object) | ||
result = to_datetime(ser, format="%Y%m%d", errors="ignore", cache=cache) | ||
expected = Series( | ||
[datetime(2012, 12, 31), datetime(2014, 12, 31), datetime(9999, 12, 31)], | ||
dtype=object, | ||
) | ||
tm.assert_series_equal(result, expected) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test looks like it wasn't right to begin with. If the input is invalid, errors='ignore' should return the input that's what we do for literally any other format, e.g.
|
||
|
||
def test_to_datetime_format_YYYYMMDD_coercion(self, cache): | ||
|
@@ -249,7 +246,7 @@ def test_to_datetime_format_integer(self, cache): | |
# valid date, length == 8 | ||
[20121030, datetime(2012, 10, 30)], | ||
# short valid date, length == 6 | ||
[199934, datetime(1999, 3, 4)], | ||
[199934, 199934], | ||
MarcoGorelli marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# long integer date partially parsed to datetime(2012,1,1), length > 8 | ||
[2012010101, 2012010101], | ||
# invalid date partially parsed to datetime(2012,9,9), length == 8 | ||
|
@@ -1727,8 +1724,8 @@ def test_dataframe_coerce(self, cache): | |
df2 = DataFrame({"year": [2015, 2016], "month": [2, 20], "day": [4, 5]}) | ||
|
||
msg = ( | ||
"cannot assemble the datetimes: time data .+ does not " | ||
r"match format '%Y%m%d' \(match\)" | ||
r"cannot assemble the datetimes: time data .+ doesn't " | ||
r'match format "%Y%m%d"' | ||
) | ||
with pytest.raises(ValueError, match=msg): | ||
to_datetime(df2, cache=cache) | ||
|
@@ -1804,7 +1801,10 @@ def test_dataframe_mixed(self, cache): | |
def test_dataframe_float(self, cache): | ||
# float | ||
df = DataFrame({"year": [2000, 2001], "month": [1.5, 1], "day": [1, 1]}) | ||
msg = "cannot assemble the datetimes: unconverted data remains: 1" | ||
msg = ( | ||
r'cannot assemble the datetimes: time data "20000151" at ' | ||
r'position 0 doesn\'t match format "%Y%m%d"' | ||
) | ||
with pytest.raises(ValueError, match=msg): | ||
to_datetime(df, cache=cache) | ||
|
||
|
Uh oh!
There was an error while loading. Please reload this page.