From fa5d2e363554658cc5812644d249f458819c7328 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 30 Sep 2024 09:43:29 +0200 Subject: [PATCH 1/6] implement default precision timestamp using predefined options with default "ns" resolution --- xarray/coding/cftime_offsets.py | 18 +---- xarray/coding/times.py | 111 +++++++++++++++++++--------- xarray/core/options.py | 9 +++ xarray/core/pdcompat.py | 22 +++--- xarray/core/variable.py | 50 +++++++------ xarray/tests/test_cftime_offsets.py | 2 +- xarray/tests/test_concat.py | 2 +- xarray/tests/test_conventions.py | 4 +- xarray/tests/test_dataarray.py | 4 +- xarray/tests/test_dataset.py | 10 +-- xarray/tests/test_groupby.py | 4 +- xarray/tests/test_interp.py | 2 +- xarray/tests/test_plot.py | 2 +- xarray/tests/test_variable.py | 34 ++++----- 14 files changed, 161 insertions(+), 113 deletions(-) diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index c503e8ebcd3..0bf02345404 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -64,7 +64,7 @@ from xarray.core.pdcompat import ( NoDefault, count_not_none, - nanosecond_precision_timestamp, + default_precision_timestamp, no_default, ) from xarray.core.utils import emit_user_level_warning @@ -83,21 +83,13 @@ T_FreqStr = TypeVar("T_FreqStr", str, None) -def _nanosecond_precision_timestamp(*args, **kwargs): - # As of pandas version 3.0, pd.to_datetime(Timestamp(...)) will try to - # infer the appropriate datetime precision. Until xarray supports - # non-nanosecond precision times, we will use this constructor wrapper to - # explicitly create nanosecond-precision Timestamp objects. - return pd.Timestamp(*args, **kwargs).as_unit("ns") - - def get_date_type(calendar, use_cftime=True): """Return the cftime date type for a given calendar name.""" if cftime is None: raise ImportError("cftime is required for dates with non-standard calendars") else: if _is_standard_calendar(calendar) and not use_cftime: - return _nanosecond_precision_timestamp + return default_precision_timestamp calendars = { "noleap": cftime.DatetimeNoLeap, @@ -1475,10 +1467,8 @@ def date_range_like(source, calendar, use_cftime=None): if is_np_datetime_like(source.dtype): # We want to use datetime fields (datetime64 object don't have them) source_calendar = "standard" - # TODO: the strict enforcement of nanosecond precision Timestamps can be - # relaxed when addressing GitHub issue #7493. 
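# A reference sketch of the contract behind default_precision_timestamp (an
# illustration with assumed values, not the patched implementation; it assumes
# pandas >= 2.0, where pd.Timestamp is resolution-aware via .unit/.as_unit):
import pandas as pd

ts = pd.Timestamp("2000-01-01")  # pandas 2.x parses this at "s" resolution
ts_ns = ts.as_unit("ns")         # the old wrapper forced this unconditionally
# the new constructor upgrades only when the parsed unit is coarser than the
# configured default resolution, and leaves finer units untouched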
- source_start = nanosecond_precision_timestamp(source_start) - source_end = nanosecond_precision_timestamp(source_end) + source_start = default_precision_timestamp(source_start) + source_end = default_precision_timestamp(source_end) else: if isinstance(source, CFTimeIndex): source_calendar = source.calendar diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 9306bde47a3..43446a2a7d8 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -24,7 +24,8 @@ from xarray.core.common import contains_cftime_datetimes, is_np_datetime_like from xarray.core.duck_array_ops import asarray, ravel, reshape from xarray.core.formatting import first_n_items, format_timestamp, last_item -from xarray.core.pdcompat import nanosecond_precision_timestamp +from xarray.core.options import _get_datetime_resolution +from xarray.core.pdcompat import default_precision_timestamp from xarray.core.utils import emit_user_level_warning from xarray.core.variable import Variable from xarray.namedarray.parallelcompat import T_ChunkedArray, get_chunked_array_type @@ -193,9 +194,7 @@ def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]: # same as _unpack_netcdf_time_units but finalizes ref_date for # processing in encode_cf_datetime time_units, _ref_date = _unpack_netcdf_time_units(units) - # TODO: the strict enforcement of nanosecond precision Timestamps can be - # relaxed when addressing GitHub issue #7493. - ref_date = nanosecond_precision_timestamp(_ref_date) + ref_date = default_precision_timestamp(_ref_date) # If the ref_date Timestamp is timezone-aware, convert to UTC and # make it timezone-naive (GH 2649). if ref_date.tz is not None: @@ -266,20 +265,54 @@ def _decode_datetime_with_pandas( time_units, ref_date_str = _unpack_netcdf_time_units(units) time_units = _netcdf_to_numpy_timeunit(time_units) try: - # TODO: the strict enforcement of nanosecond precision Timestamps can be - # relaxed when addressing GitHub issue #7493.
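# For context on the reference-date handling in the decode path below (example
# value assumed; see GH 2649): a timezone-aware reference date is converted to
# UTC and stripped of its timezone before any arithmetic.
import pandas as pd

ref = pd.Timestamp("2000-01-01 00:00:00+02:00")
naive_utc = ref.tz_convert(None)  # Timestamp('1999-12-31 22:00:00'), tz removed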
- ref_date = nanosecond_precision_timestamp(ref_date_str) + # relaxed to non-nanosecond resolution + ref_date = pd.Timestamp(ref_date_str) + # strip tz information + if ref_date.tz is not None: + ref_date = ref_date.tz_convert(None) + # get default unit and delta + default_unit = _get_datetime_resolution() + default_delta = np.timedelta64(1, default_unit).astype("timedelta64[ns]") + # get ref_date and time delta + ref_date_delta = np.timedelta64(1, ref_date.unit).astype("timedelta64[ns]") + time_delta = np.timedelta64(1, time_units).astype("timedelta64[ns]") + # choose the highest resolution + new_time_units = { + ref_date_delta: ref_date.unit, + time_delta: time_units, + default_delta: default_unit, + }[min(default_delta, ref_date_delta, time_delta)] + # transform to the highest needed resolution + # this will raise accordingly + ref_date = ref_date.as_unit(new_time_units) except ValueError as err: # ValueError is raised by pd.Timestamp for non-ISO timestamp # strings, in which case we fall back to using cftime raise OutOfBoundsDatetime from err + dunit = ref_date.unit + with warnings.catch_warnings(): warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning) if flat_num_dates.size > 0: # avoid size 0 datetimes GH1329 - pd.to_timedelta(flat_num_dates.min(), time_units) + ref_date - pd.to_timedelta(flat_num_dates.max(), time_units) + ref_date + fnd_min, fnd_max = flat_num_dates.min(), flat_num_dates.max() + min_delta = fnd_min * np.timedelta64(1, time_units) + max_delta = fnd_max * np.timedelta64(1, time_units) + if not np.isnan(min_delta): + # todo: add meaningful error messages + # this will raise on overflow + (ref_date + min_delta).as_unit(dunit) + # this will raise on dtype overflow + if not np.int64(min_delta) == fnd_min: + raise OutOfBoundsTimedelta + if not np.isnan(max_delta): + # todo: add meaningful error message + # this will raise on overflow + (ref_date + max_delta).as_unit(dunit) + # this will raise on dtype overflow + if not np.int64(max_delta) == fnd_max: + raise OutOfBoundsTimedelta # To avoid integer overflow when converting to nanosecond units for integer # dtypes smaller than np.int64 cast all integer and unsigned integer dtype @@ -292,20 +325,25 @@ def _decode_datetime_with_pandas( elif flat_num_dates.dtype.kind in "f": flat_num_dates = flat_num_dates.astype(np.float64) - # Cast input ordinals to integers of nanoseconds because pd.to_timedelta - # works much faster when dealing with integers (GH 1399). - # properly handle NaN/NaT to prevent casting NaN to int + # keep NaT/nan mask nan = np.isnan(flat_num_dates) | (flat_num_dates == np.iinfo(np.int64).min) - flat_num_dates = flat_num_dates * _NS_PER_TIME_DELTA[time_units] - flat_num_dates_ns_int = np.zeros_like(flat_num_dates, dtype=np.int64) - flat_num_dates_ns_int[nan] = np.iinfo(np.int64).min - flat_num_dates_ns_int[~nan] = flat_num_dates[~nan].astype(np.int64) - # Use pd.to_timedelta to safely cast integer values to timedeltas, - # and add those to a Timestamp to safely produce a DatetimeIndex. This - # ensures that we do not encounter integer overflow at any point in the - # process without raising OutOfBoundsDatetime.
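# A runnable sketch of the resolution-promotion rule introduced above (the
# candidate units are assumed examples): one tick of each unit is expressed in
# a common "ns" base, and the smallest tick, i.e. the finest resolution, wins.
import numpy as np

candidates = {
    np.timedelta64(1, "s").astype("timedelta64[ns]"): "s",    # e.g. ref_date unit
    np.timedelta64(1, "ms").astype("timedelta64[ns]"): "ms",  # e.g. on-disk units
    np.timedelta64(1, "us").astype("timedelta64[ns]"): "us",  # e.g. default option
}
finest = candidates[min(candidates)]  # -> "us"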
- return (pd.to_timedelta(flat_num_dates_ns_int, "ns") + ref_date).values + # in case we need to change the unit, we fix the numbers here + # this should be safe, as errors would have been raised above + ns_time_unit = _NS_PER_TIME_DELTA[time_units] + ns_dunit = _NS_PER_TIME_DELTA[dunit] + if flat_num_dates.dtype.kind in "iuf" and (ns_time_unit > ns_dunit): + flat_num_dates *= np.int64(ns_time_unit / ns_dunit) + time_units = dunit + + # Cast input ordinals to integers and properly handle NaN/NaT + # to prevent casting NaN to int + flat_num_dates_int = np.zeros_like(flat_num_dates, dtype=np.int64) + flat_num_dates_int[nan] = np.iinfo(np.int64).min + flat_num_dates_int[~nan] = flat_num_dates[~nan].astype(np.int64) + + # cast to timedelta64[time_units] and add to ref_date + return ref_date + flat_num_dates_int.astype(f"timedelta64[{time_units}]") def decode_cf_datetime( @@ -370,7 +408,7 @@ def to_timedelta_unboxed(value, **kwargs): def to_datetime_unboxed(value, **kwargs): result = pd.to_datetime(value, **kwargs).to_numpy() - assert result.dtype == "datetime64[ns]" + assert result.dtype == f"datetime64[{_get_datetime_resolution()}]" return result @@ -390,7 +428,11 @@ def _unit_timedelta_cftime(units: str) -> timedelta: def _unit_timedelta_numpy(units: str) -> np.timedelta64: numpy_units = _netcdf_to_numpy_timeunit(units) - return np.timedelta64(_NS_PER_TIME_DELTA[numpy_units], "ns") + default_unit = _get_datetime_resolution() + return np.timedelta64( + int(_NS_PER_TIME_DELTA[numpy_units] / _NS_PER_TIME_DELTA[default_unit]), + default_unit, + ) def _infer_time_units_from_diff(unique_timedeltas) -> str: @@ -411,7 +453,10 @@ def _infer_time_units_from_diff(unique_timedeltas) -> str: def _time_units_to_timedelta64(units: str) -> np.timedelta64: - return np.timedelta64(1, _netcdf_to_numpy_timeunit(units)).astype("timedelta64[ns]") + default_unit = _get_datetime_resolution() + return np.timedelta64(1, _netcdf_to_numpy_timeunit(units)).astype( + f"timedelta64[{default_unit}]" + ) def infer_calendar_name(dates) -> CFCalendar: @@ -440,13 +485,11 @@ def infer_datetime_units(dates) -> str: unique time deltas in `dates`) """ dates = ravel(np.asarray(dates)) - if np.asarray(dates).dtype == "datetime64[ns]": + if np.issubdtype(np.asarray(dates).dtype, "datetime64"): dates = to_datetime_unboxed(dates) dates = dates[pd.notnull(dates)] reference_date = dates[0] if len(dates) > 0 else "1970-01-01" - # TODO: the strict enforcement of nanosecond precision Timestamps can be - # relaxed when addressing GitHub issue #7493. - reference_date = nanosecond_precision_timestamp(reference_date) + reference_date = default_precision_timestamp(reference_date) else: reference_date = dates[0] if len(dates) > 0 else "1970-01-01" reference_date = format_cftime_datetime(reference_date) @@ -479,9 +522,7 @@ def cftime_to_nptime(times, raise_on_invalid: bool = True) -> np.ndarray: If raise_on_invalid is True (default), invalid dates trigger a ValueError. Otherwise, the invalid element is replaced by np.NaT.""" times = np.asarray(times) - # TODO: the strict enforcement of nanosecond precision datetime values can - # be relaxed when addressing GitHub issue #7493. 
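# Aside on the "M8[...]" spelling used in the allocation just below (the unit
# is chosen here only for illustration): it is numpy shorthand for datetime64
# with an explicit resolution, so output containers now follow the option.
import numpy as np

assert np.dtype("M8[us]") == np.dtype("datetime64[us]")
out = np.empty((3,), dtype="M8[us]")  # microsecond-resolution container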
- new = np.empty(times.shape, dtype="M8[ns]") + new = np.empty(times.shape, dtype=f"M8[{_get_datetime_resolution()}]") dt: pd.Timestamp | Literal["NaT"] for i, t in np.ndenumerate(times): try: # NumPy casts it safely to np.datetime64[ns] for dates outside # 1678 to 2262 (this is not currently the case for # datetime.datetime). - dt = nanosecond_precision_timestamp( + dt = default_precision_timestamp( t.year, t.month, t.day, t.hour, t.minute, t.second, t.microsecond ) except ValueError as e: @@ -546,10 +587,8 @@ def convert_time_or_go_back(date, date_type): This is meant to convert end-of-month dates into a new calendar. """ - # TODO: the strict enforcement of nanosecond precision Timestamps can be - # relaxed when addressing GitHub issue #7493. if date_type == pd.Timestamp: - date_type = nanosecond_precision_timestamp + date_type = default_precision_timestamp try: return date_type( date.year, @@ -757,7 +796,7 @@ def _eagerly_encode_cf_datetime( if not _is_standard_calendar(calendar) or dates.dtype.kind == "O": # parse with cftime instead raise OutOfBoundsDatetime - assert dates.dtype == "datetime64[ns]" + assert np.issubdtype(dates.dtype, "datetime64") time_units, ref_date = _unpack_time_units_and_ref_date(units) time_delta = _time_units_to_timedelta64(time_units) diff --git a/xarray/core/options.py b/xarray/core/options.py index 2d69e4b6584..f1eaf24b05d 100644 --- a/xarray/core/options.py +++ b/xarray/core/options.py @@ -86,10 +86,12 @@ class T_Options(TypedDict): "use_flox": True, "use_numbagg": True, "use_opt_einsum": True, + "time_resolution": "ns", } _JOIN_OPTIONS = frozenset(["inner", "outer", "left", "right", "exact"]) _DISPLAY_OPTIONS = frozenset(["text", "html"]) +_TIME_RESOLUTION_OPTIONS = frozenset(["s", "ms", "us", "ns"]) def _positive_integer(value: Any) -> bool: @@ -117,6 +119,7 @@ def _positive_integer(value: Any) -> bool: "use_opt_einsum": lambda value: isinstance(value, bool), "use_flox": lambda value: isinstance(value, bool), "warn_for_unclosed_files": lambda value: isinstance(value, bool), + "time_resolution": _TIME_RESOLUTION_OPTIONS.__contains__, } @@ -158,6 +161,10 @@ def _get_keep_attrs(default: bool) -> bool: return _get_boolean_with_default("keep_attrs", default) +def _get_datetime_resolution() -> str: + return OPTIONS["time_resolution"] + + class set_options: """ Set options for xarray in a controlled context. @@ -258,6 +265,8 @@ class set_options: warn_for_unclosed_files : bool, default: False Whether or not to issue a warning when unclosed files are deallocated. This is mostly useful for debugging. + time_resolution : {"s", "ms", "us", "ns"}, default: "ns" + Time resolution used for CF encoding/decoding. Examples -------- diff --git a/xarray/core/pdcompat.py b/xarray/core/pdcompat.py index ae4febd6beb..a61d73ddc41 100644 --- a/xarray/core/pdcompat.py +++ b/xarray/core/pdcompat.py @@ -39,7 +39,8 @@ from typing import Literal import pandas as pd -from packaging.version import Version + +from xarray.core.options import _get_datetime_resolution def count_not_none(*args) -> int: @@ -73,13 +74,16 @@ def __repr__(self) -> str: NoDefault = Literal[_NoDefault.no_default] # For typing following pandas -def nanosecond_precision_timestamp(*args, **kwargs) -> pd.Timestamp: - """Return a nanosecond-precision Timestamp object. +def default_precision_timestamp(*args, **kwargs) -> pd.Timestamp: + """Return a Timestamp object with the default precision.
- Note this function should no longer be needed after addressing GitHub issue - #7493. + The xarray default is "ns". It can be overridden to any other resolution + in {"s", "ms", "us", "ns"}, e.g. by setting + set_options(time_resolution="us"). """ - if Version(pd.__version__) >= Version("2.0.0"): - return pd.Timestamp(*args, **kwargs).as_unit("ns") - else: - return pd.Timestamp(*args, **kwargs) + dt = pd.Timestamp(*args, **kwargs) + units = ["s", "ms", "us", "ns"] + default = _get_datetime_resolution() + if units.index(default) > units.index(dt.unit): + dt = dt.as_unit(default) + return dt diff --git a/xarray/core/variable.py b/xarray/core/variable.py index d8cf0fe7550..7e9d8f66eb2 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -28,7 +28,7 @@ VectorizedIndexer, as_indexable, ) -from xarray.core.options import OPTIONS, _get_keep_attrs +from xarray.core.options import OPTIONS, _get_datetime_resolution, _get_keep_attrs from xarray.core.utils import ( OrderedSet, _default, @@ -71,13 +71,11 @@ from xarray.namedarray.parallelcompat import ChunkManagerEntrypoint -NON_NANOSECOND_WARNING = ( - "Converting non-nanosecond precision {case} values to nanosecond precision. " - "This behavior can eventually be relaxed in xarray, as it is an artifact from " - "pandas which is now beginning to support non-nanosecond precision values. " - "This warning is caused by passing non-nanosecond np.datetime64 or " +NON_DEFAULTPRECISION_WARNING = ( + "Converting non-default precision {case} values to default precision. " + "This warning is caused by passing non-default np.datetime64 or " "np.timedelta64 values to the DataArray or Variable constructor; it can be " - "silenced by converting the values to nanosecond precision ahead of time." + "silenced by converting the values to default precision {res!r} ahead of time."
) @@ -198,34 +196,42 @@ def _maybe_wrap_data(data): return data -def _as_nanosecond_precision(data): +def _as_default_precision(data): + default_unit = _get_datetime_resolution() dtype = data.dtype - non_ns_datetime64 = ( + non_default_datetime64 = ( dtype.kind == "M" and isinstance(dtype, np.dtype) - and dtype != np.dtype("datetime64[ns]") + and dtype != np.dtype(f"datetime64[{default_unit}]") ) - non_ns_datetime_tz_dtype = ( - isinstance(dtype, pd.DatetimeTZDtype) and dtype.unit != "ns" + non_default_datetime_tz_dtype = ( + isinstance(dtype, pd.DatetimeTZDtype) and dtype.unit != default_unit ) - if non_ns_datetime64 or non_ns_datetime_tz_dtype: - utils.emit_user_level_warning(NON_NANOSECOND_WARNING.format(case="datetime")) + if non_default_datetime64 or non_default_datetime_tz_dtype: + utils.emit_user_level_warning( + NON_DEFAULTPRECISION_WARNING.format( + case="datetime", res=f"'{default_unit}'" + ) + ) if isinstance(dtype, pd.DatetimeTZDtype): - nanosecond_precision_dtype = pd.DatetimeTZDtype("ns", dtype.tz) + default_precision_dtype = pd.DatetimeTZDtype(default_unit, dtype.tz) else: - nanosecond_precision_dtype = "datetime64[ns]" - return duck_array_ops.astype(data, nanosecond_precision_dtype) + default_precision_dtype = f"datetime64[{default_unit}]" + return duck_array_ops.astype(data, default_precision_dtype) elif dtype.kind == "m" and dtype != np.dtype("timedelta64[ns]"): - utils.emit_user_level_warning(NON_NANOSECOND_WARNING.format(case="timedelta")) + utils.emit_user_level_warning( + NON_DEFAULTPRECISION_WARNING.format(case="timedelta", res="'ns'") + ) return duck_array_ops.astype(data, "timedelta64[ns]") else: return data def _possibly_convert_objects(values): + # todo: check wording wrt default precision vs non-nanosecond precision """Convert arrays of datetime.datetime and datetime.timedelta objects into datetime64 and timedelta64, according to the pandas convention. For the time - being, convert any non-default precision DatetimeIndex or TimedeltaIndex objects to nanosecond precision.
While pandas is relaxing this in version 2.0.0, in xarray we will need to make sure we are ready to handle non-nanosecond precision datetimes or timedeltas in our code before allowing @@ -236,7 +242,7 @@ def _possibly_convert_objects(values): """ as_series = pd.Series(values.ravel(), copy=False) if as_series.dtype.kind in "mM": - as_series = _as_nanosecond_precision(as_series) + as_series = _as_default_precision(as_series) result = np.asarray(as_series).reshape(values.shape) if not result.flags.writeable: # GH8843, pandas copy-on-write mode creates read-only arrays by default @@ -255,9 +261,9 @@ def _possibly_convert_datetime_or_timedelta_index(data): before allowing such values to pass through unchanged.""" if isinstance(data, PandasIndexingAdapter): if isinstance(data.array, pd.DatetimeIndex | pd.TimedeltaIndex): - data = PandasIndexingAdapter(_as_nanosecond_precision(data.array)) + data = PandasIndexingAdapter(_as_default_precision(data.array)) elif isinstance(data, pd.DatetimeIndex | pd.TimedeltaIndex): - data = _as_nanosecond_precision(data) + data = _as_default_precision(data) return data diff --git a/xarray/tests/test_cftime_offsets.py b/xarray/tests/test_cftime_offsets.py index 11e56e2adad..c9cf7270a9c 100644 --- a/xarray/tests/test_cftime_offsets.py +++ b/xarray/tests/test_cftime_offsets.py @@ -1496,7 +1496,7 @@ def test_date_range_like_same_calendar(): assert src is out -@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") +@pytest.mark.filterwarnings("ignore:Converting non-default") def test_date_range_like_errors(): src = date_range("1899-02-03", periods=20, freq="D", use_cftime=False) src = src[np.arange(20) != 10] # Remove 1 day so the frequency is not inferable. diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 226f376b581..e47c389c015 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -319,7 +319,7 @@ def test_concat_multiple_datasets_with_multiple_missing_variables() -> None: assert_identical(actual, expected) -@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") +@pytest.mark.filterwarnings("ignore:Converting non-default") def test_concat_type_of_missing_fill() -> None: datasets = create_typed_datasets(2, seed=123) expected1 = concat(datasets, dim="day", fill_value=dtypes.NA) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index e6c69fc1ee1..b5fbf8556d8 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -213,7 +213,7 @@ def test_deterministic_coords_encoding(self) -> None: vars, attrs = conventions.encode_dataset_coordinates(ds) assert attrs["coordinates"] == "bar baz" - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_emit_coordinates_attribute_in_attrs(self) -> None: orig = Dataset( {"a": 1, "b": 1}, @@ -231,7 +231,7 @@ def test_emit_coordinates_attribute_in_attrs(self) -> None: assert enc["b"].attrs.get("coordinates") == "t" assert "coordinates" not in enc["b"].encoding - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_emit_coordinates_attribute_in_encoding(self) -> None: orig = Dataset( {"a": 1, "b": 1}, diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 78db39c194e..50704a7570a 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3657,7 +3657,7 @@ def test_to_and_from_dict( actual_no_data 
= da.to_dict(data=False, encoding=encoding) assert expected_no_data == actual_no_data - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_to_and_from_dict_with_time_dim(self) -> None: x = np.random.randn(10, 3) t = pd.date_range("20130101", periods=10) @@ -3666,7 +3666,7 @@ def test_to_and_from_dict_with_time_dim(self) -> None: roundtripped = DataArray.from_dict(da.to_dict()) assert_identical(da, roundtripped) - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_to_and_from_dict_with_nan_nat(self) -> None: y = np.random.randn(10, 3) y[2] = np.nan diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index c6c32f85d10..f09d57915aa 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -122,7 +122,7 @@ def create_append_test_data(seed=None) -> tuple[Dataset, Dataset, Dataset]: bool_var_to_append = np.array([False, True], dtype=bool) with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Converting non-nanosecond") + warnings.filterwarnings("ignore", "Converting non-default") ds = xr.Dataset( data_vars={ "da": xr.DataArray( @@ -499,7 +499,7 @@ def test_constructor_1d(self) -> None: actual = Dataset({"x": [5, 6, 7, 8, 9]}) assert_identical(expected, actual) - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_constructor_0d(self) -> None: expected = Dataset({"x": ([], 1)}) for arg in [1, np.array(1), expected["x"]]: @@ -6070,7 +6070,7 @@ def test_dataset_math_auto_align(self) -> None: expected = ds + other.reindex_like(ds) assert_identical(expected, actual) - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_dataset_math_errors(self) -> None: ds = self.make_example_math_dataset() @@ -7164,7 +7164,7 @@ def test_differentiate(dask, edge_order) -> None: da.differentiate("x2d") -@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") +@pytest.mark.filterwarnings("ignore:Converting non-default") @pytest.mark.parametrize("dask", [True, False]) def test_differentiate_datetime(dask) -> None: rs = np.random.RandomState(42) @@ -7359,7 +7359,7 @@ def test_cumulative_integrate(dask) -> None: da.cumulative_integrate("x2d") -@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") +@pytest.mark.filterwarnings("ignore:Converting non-default") @pytest.mark.parametrize("dask", [True, False]) @pytest.mark.parametrize("which_datetime", ["np", "cftime"]) def test_trapezoid_datetime(dask, which_datetime) -> None: diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index dc869cc3a34..646e97c5998 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -607,7 +607,7 @@ def test_groupby_repr_datetime(obj) -> None: assert actual == expected -@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") +@pytest.mark.filterwarnings("ignore:Converting non-default") @pytest.mark.filterwarnings("ignore:invalid value encountered in divide:RuntimeWarning") @pytest.mark.filterwarnings("ignore:No index created for dimension id:UserWarning") def test_groupby_drops_nans() -> None: @@ -2124,7 +2124,7 @@ def test_upsample_interpolate(self) -> None: assert_allclose(expected, actual, rtol=1e-16) @requires_scipy - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + 
@pytest.mark.filterwarnings("ignore:Converting non-default") def test_upsample_interpolate_bug_2197(self) -> None: dates = pd.date_range("2007-02-01", "2007-03-01", freq="D") da = xr.DataArray(np.arange(len(dates)), [("time", dates)]) diff --git a/xarray/tests/test_interp.py b/xarray/tests/test_interp.py index d02e12dd695..1b7bb9c9181 100644 --- a/xarray/tests/test_interp.py +++ b/xarray/tests/test_interp.py @@ -641,7 +641,7 @@ def test_interp_like() -> None: pytest.param("2000-01-01T12:00", 0.5, marks=pytest.mark.xfail), ], ) -@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") +@pytest.mark.filterwarnings("ignore:Converting non-default") def test_datetime(x_new, expected) -> None: da = xr.DataArray( np.arange(24), diff --git a/xarray/tests/test_plot.py b/xarray/tests/test_plot.py index 2605e387360..2516d9ec547 100644 --- a/xarray/tests/test_plot.py +++ b/xarray/tests/test_plot.py @@ -2963,7 +2963,7 @@ def test_datetime_plot1d(self) -> None: # mpl.dates.AutoDateLocator passes and no other subclasses: assert type(ax.xaxis.get_major_locator()) is mpl.dates.AutoDateLocator - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_datetime_plot2d(self) -> None: # Test that matplotlib-native datetime works: da = DataArray( diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 1d430b6b27e..5748d54a347 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -200,7 +200,7 @@ def test_index_0d_string(self): x = self.cls(["x"], [value]) self._assertIndexedLikeNDArray(x, value, dtype) - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_index_0d_datetime(self): d = datetime(2000, 1, 1) x = self.cls(["x"], [d]) @@ -212,7 +212,7 @@ def test_index_0d_datetime(self): x = self.cls(["x"], pd.DatetimeIndex([d])) self._assertIndexedLikeNDArray(x, np.datetime64(d), "datetime64[ns]") - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_index_0d_timedelta64(self): td = timedelta(hours=1) @@ -253,7 +253,7 @@ def test_0d_object_array_with_list(self): assert_array_equal(x[0].data, listarray.squeeze()) assert_array_equal(x.squeeze().data, listarray.squeeze()) - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_index_and_concat_datetime(self): # regression test for #125 date_range = pd.date_range("2011-09-01", periods=10) @@ -274,7 +274,7 @@ def test_0d_time_data(self): expected = np.datetime64("2000-01-01", "ns") assert x[0].values == expected - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_datetime64_conversion(self): times = pd.date_range("2000-01-01", periods=3) for values, preserve_source in [ @@ -290,7 +290,7 @@ def test_datetime64_conversion(self): same_source = source_ndarray(v.values) is source_ndarray(values) assert preserve_source == same_source - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_timedelta64_conversion(self): times = pd.timedelta_range(start=0, periods=3) for values, preserve_source in [ @@ -311,14 +311,14 @@ def test_object_conversion(self): actual = self.cls("x", data) assert actual.dtype == data.dtype - 
@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_datetime64_valid_range(self): data = np.datetime64("1250-01-01", "us") pderror = pd.errors.OutOfBoundsDatetime with pytest.raises(pderror, match=r"Out of bounds nanosecond"): self.cls(["t"], [data]) - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_timedelta64_valid_range(self): data = np.timedelta64("200000", "D") pderror = pd.errors.OutOfBoundsTimedelta @@ -1076,7 +1076,7 @@ def test_numpy_same_methods(self): v = IndexVariable("x", np.arange(5)) assert 2 == v.searchsorted(2) - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_datetime64_conversion_scalar(self): expected = np.datetime64("2000-01-01", "ns") for values in [ @@ -1089,7 +1089,7 @@ def test_datetime64_conversion_scalar(self): assert v.values == expected assert v.values.dtype == np.dtype("datetime64[ns]") - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_timedelta64_conversion_scalar(self): expected = np.timedelta64(24 * 60 * 60 * 10**9, "ns") for values in [ @@ -1116,7 +1116,7 @@ def test_0d_datetime(self): assert v.dtype == np.dtype("datetime64[ns]") assert v.values == np.datetime64("2000-01-01", "ns") - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_0d_timedelta(self): for td in [pd.to_timedelta("1s"), np.timedelta64(1, "s")]: v = Variable([], td) @@ -1561,7 +1561,7 @@ def test_transpose(self): v.transpose(..., "not_a_dim", missing_dims="warn") assert_identical(expected_ell, actual) - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_transpose_0d(self): for value in [ 3.5, @@ -2623,7 +2623,7 @@ def test_masked_array(self): assert_array_equal(expected, actual) assert actual.dtype == expected.dtype - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_datetime(self): expected = np.datetime64("2000-01-01") actual = as_compatible_data(expected) @@ -2965,7 +2965,7 @@ def test_from_pint_wrapping_dask(self, Var): def test_datetime_conversion_warning(values, warns) -> None: dims = ["time"] if isinstance(values, np.ndarray | pd.Index | pd.Series) else [] if warns: - with pytest.warns(UserWarning, match="non-nanosecond precision datetime"): + with pytest.warns(UserWarning, match="non-default precision datetime"): var = Variable(dims, values) else: with warnings.catch_warnings(): @@ -3011,7 +3011,7 @@ def test_datetime_conversion_warning(values, warns) -> None: def test_pandas_two_only_datetime_conversion_warnings( data: pd.DatetimeIndex | pd.Series, dtype: str | pd.DatetimeTZDtype ) -> None: - with pytest.warns(UserWarning, match="non-nanosecond precision datetime"): + with pytest.warns(UserWarning, match="non-default precision datetime"): var = Variable(["time"], data.astype(dtype)) # type: ignore[arg-type] if var.dtype.kind == "M": @@ -3040,7 +3040,7 @@ def test_pandas_two_only_datetime_conversion_warnings( def test_timedelta_conversion_warning(values, warns) -> None: dims = ["time"] if isinstance(values, np.ndarray | pd.Index) else [] if warns: - with pytest.warns(UserWarning, 
match="non-nanosecond precision timedelta"): + with pytest.warns(UserWarning, match="non-default precision timedelta"): var = Variable(dims, values) else: with warnings.catch_warnings(): @@ -3054,7 +3054,7 @@ def test_pandas_two_only_timedelta_conversion_warning() -> None: # Note this test relies on a pandas feature that is only present in pandas # 2.0.0 and above, and so for now cannot be parametrized. data = pd.timedelta_range("1", periods=1).astype("timedelta64[s]") - with pytest.warns(UserWarning, match="non-nanosecond precision timedelta"): + with pytest.warns(UserWarning, match="non-default precision timedelta"): var = Variable(["time"], data) assert var.dtype == np.dtype("timedelta64[ns]") @@ -3070,6 +3070,6 @@ def test_pandas_two_only_timedelta_conversion_warning() -> None: ) def test_pandas_indexing_adapter_non_nanosecond_conversion(index, dtype) -> None: data = PandasIndexingAdapter(index.astype(f"{dtype}[s]")) - with pytest.warns(UserWarning, match="non-nanosecond precision"): + with pytest.warns(UserWarning, match="non-default precision"): var = Variable(["time"], data) assert var.dtype == np.dtype(f"{dtype}[ns]") From b0a325d3328ac97f769f11e8fd87cd51827d9572 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 4 Oct 2024 14:41:56 +0200 Subject: [PATCH 2/6] match test warning --- xarray/tests/test_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index f09d57915aa..cdab4120f85 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -3549,7 +3549,7 @@ def test_expand_dims_create_index_from_iterable(self): def test_expand_dims_non_nanosecond_conversion(self) -> None: # Regression test for https://github.com/pydata/xarray/issues/7493#issuecomment-1953091000 - with pytest.warns(UserWarning, match="non-nanosecond precision"): + with pytest.warns(UserWarning, match="non-default precision"): ds = Dataset().expand_dims({"time": [np.datetime64("2018-01-01", "s")]}) assert ds.time.dtype == np.dtype("datetime64[ns]") From f1940937fafd5b252ae94d66d598047f2ca1e07f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 4 Oct 2024 15:34:31 +0200 Subject: [PATCH 3/6] fix test issues, work around pandas _as_unit/as_unit issue --- xarray/coding/times.py | 33 +++++++++++++++++++++++++++------ xarray/core/pdcompat.py | 4 +++- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 43446a2a7d8..82e13e7907d 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -274,23 +274,36 @@ def _decode_datetime_with_pandas( default_unit = _get_datetime_resolution() default_delta = np.timedelta64(1, default_unit).astype("timedelta64[ns]") # get ref_date and time delta - ref_date_delta = np.timedelta64(1, ref_date.unit).astype("timedelta64[ns]") + ref_date_unit = ( + ref_date.unit + if hasattr(ref_date, "unit") + else np.datetime_data(ref_date.asm8)[0] + ) + ref_date_delta = np.timedelta64(1, ref_date_unit).astype("timedelta64[ns]") time_delta = np.timedelta64(1, time_units).astype("timedelta64[ns]") # choose the highest resolution new_time_units = { - ref_date_delta: ref_date.unit, + ref_date_delta: ref_date_unit, time_delta: time_units, default_delta: default_unit, }[min(default_delta, ref_date_delta, time_delta)] # transform to the highest needed resolution # this will raise accordingly - ref_date = ref_date.as_unit(new_time_units) + ref_date = ( + ref_date.as_unit(new_time_units) 
+ if hasattr(ref_date, "as_unit") + else ref_date._as_unit(new_time_units) + ) except ValueError as err: # ValueError is raised by pd.Timestamp for non-ISO timestamp # strings, in which case we fall back to using cftime raise OutOfBoundsDatetime from err - dunit = ref_date.unit + dunit = ( + ref_date.unit + if hasattr(ref_date, "unit") + else np.datetime_data(ref_date.asm8)[0] + ) with warnings.catch_warnings(): warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning) if flat_num_dates.size > 0: # avoid size 0 datetimes GH1329 @@ -302,14 +315,22 @@ def _decode_datetime_with_pandas( if not np.isnan(min_delta): # todo: add meaningful error messages # this will raise on overflow - (ref_date + min_delta).as_unit(dunit) + ( + (ref_date + min_delta).as_unit(dunit) + if hasattr(ref_date, "unit") + else (ref_date + min_delta)._as_unit(dunit) + ) # this will raise on dtype overflow if not np.int64(min_delta) == fnd_min: raise OutOfBoundsTimedelta if not np.isnan(max_delta): # todo: add meaningful error message # this will raise on overflow + ( + (ref_date + max_delta).as_unit(dunit) + if hasattr(ref_date, "unit") + else (ref_date + max_delta)._as_unit(dunit) + ) # this will raise on dtype overflow if not np.int64(max_delta) == fnd_max: raise OutOfBoundsTimedelta diff --git a/xarray/core/pdcompat.py b/xarray/core/pdcompat.py index a61d73ddc41..4503d851719 100644 --- a/xarray/core/pdcompat.py +++ b/xarray/core/pdcompat.py @@ -38,6 +38,7 @@ from enum import Enum from typing import Literal +import numpy as np import pandas as pd from xarray.core.options import _get_datetime_resolution def count_not_none(*args) -> int: @@ -84,6 +85,7 @@ def default_precision_timestamp(*args, **kwargs) -> pd.Timestamp: dt = pd.Timestamp(*args, **kwargs) units = ["s", "ms", "us", "ns"] default = _get_datetime_resolution() - if units.index(default) > units.index(dt.unit): + unit = dt.unit if hasattr(dt, "unit") else np.datetime_data(dt.asm8)[0] + if units.index(default) > units.index(unit): dt = dt.as_unit(default) return dt From 11ed1953194b34ae9c742fef712ee12ae62d1f7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 4 Oct 2024 15:49:03 +0200 Subject: [PATCH 4/6] fix mypy in options --- xarray/core/options.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/core/options.py b/xarray/core/options.py index f1eaf24b05d..9a4a564b759 100644 --- a/xarray/core/options.py +++ b/xarray/core/options.py @@ -32,6 +32,7 @@ "use_numbagg", "use_opt_einsum", "use_flox", + "time_resolution", ] class T_Options(TypedDict): @@ -59,6 +60,7 @@ class T_Options(TypedDict): use_flox: bool use_numbagg: bool use_opt_einsum: bool + time_resolution: Literal["s", "ms", "us", "ns"] OPTIONS: T_Options = { From f1940937fafd5b252ae94d66d598047f2ca1e07f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 5 Oct 2024 15:31:02 +0200 Subject: [PATCH 5/6] fix typing --- xarray/core/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/options.py b/xarray/core/options.py index 9a4a564b759..dd6a1620061 100644 --- a/xarray/core/options.py +++ b/xarray/core/options.py @@ -163,7 +163,7 @@ def _get_keep_attrs(default: bool) -> bool: return _get_boolean_with_default("keep_attrs", default) -def _get_datetime_resolution() -> str: +def _get_datetime_resolution() -> Literal["s", "ms", "us", "ns"]: return OPTIONS["time_resolution"] From cecd5613fb533b3f1f0ca4ba4eff057fae9ccd0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 7 Oct 2024 16:05:12 +0200 Subject: [PATCH 6/6] refactor out
_check_date_for_units_since_refdate --- xarray/coding/times.py | 50 +++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 82e13e7907d..b71c49e8b9e 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -244,6 +244,25 @@ def _decode_datetime_with_cftime( return np.array([], dtype=object) +def _check_date_for_units_since_refdate( + date, unit: str, ref_date: pd.Timestamp +) -> None: + delta = date * np.timedelta64(1, unit) + if not np.isnan(delta): + # this will raise on dtype overflow for integer dtypes + if date.dtype.kind in "iu" and not np.int64(delta) == date: + raise OutOfBoundsTimedelta( + "DType overflow in Datetime/Timedelta calculation." + ) + # this will raise on overflow + ref_date_unit = np.datetime_data(ref_date.asm8)[0] + ( + (ref_date + delta).as_unit(ref_date_unit) + if hasattr(ref_date, "as_unit") + else (ref_date + delta)._as_unit(ref_date_unit) + ) + + def _decode_datetime_with_pandas( flat_num_dates: np.ndarray, units: str, calendar: str ) -> np.ndarray: @@ -309,31 +328,12 @@ def _decode_datetime_with_pandas( warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning) if flat_num_dates.size > 0: # avoid size 0 datetimes GH1329 - fnd_min, fnd_max = flat_num_dates.min(), flat_num_dates.max() - min_delta = fnd_min * np.timedelta64(1, time_units) - max_delta = fnd_max * np.timedelta64(1, time_units) - if not np.isnan(min_delta): - # todo: add meaningful error messages - # this will raise on overflow - ( - (ref_date + min_delta).as_unit(dunit) - if hasattr(ref_date, "unit") - else (ref_date + min_delta)._as_unit(dunit) - ) - # this will raise on dtype overflow - if not np.int64(min_delta) == fnd_min: - raise OutOfBoundsTimedelta - if not np.isnan(max_delta): - # todo: add meaningful error message - # this will raise on overflow - ( - (ref_date + max_delta).as_unit(dunit) - if hasattr(ref_date, "unit") - else (ref_date + max_delta)._as_unit(dunit) - ) - # this will raise on dtype overflow - if not np.int64(max_delta) == fnd_max: - raise OutOfBoundsTimedelta + _check_date_for_units_since_refdate( + flat_num_dates.min(), time_units, ref_date + ) + _check_date_for_units_since_refdate( + flat_num_dates.max(), time_units, ref_date + ) # To avoid integer overflow when converting to nanosecond units for integer # dtypes smaller than np.int64 cast all integer and unsigned integer dtype
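# An end-to-end sketch of the option this series adds (example values are
# assumed; the exact output resolution follows the promotion rule above):
import numpy as np
import xarray as xr
from xarray.coding.times import decode_cf_datetime

with xr.set_options(time_resolution="s"):
    decoded = decode_cf_datetime(np.array([0, 86400]), "seconds since 2000-01-01")
    # expected: array(['2000-01-01T00:00:00', '2000-01-02T00:00:00'],
    #                 dtype='datetime64[s]')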