diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 151af2de66c..63785d72179 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -22,9 +22,15 @@ v0.16.3 (unreleased)
 Breaking changes
 ~~~~~~~~~~~~~~~~
 
+- As a result of :pull:`4684` the default units encoding for
+  datetime-like values (``np.datetime64[ns]`` or ``cftime.datetime``) will now
+  always be set such that ``int64`` values can be used. In the past, no units
+  finer than "seconds" were chosen, which sometimes meant that ``float64``
+  values were required, leading to inaccurate I/O round-trips.
 - remove deprecated ``autoclose`` kwargs from :py:func:`open_dataset` (:pull:
   `4725`). By `Aureliana Barghini <https://github.com/aurghs>`_
+
 
 New Features
 ~~~~~~~~~~~~
 
@@ -34,6 +40,12 @@ Bug fixes
 - :py:meth:`DataArray.resample` and :py:meth:`Dataset.resample` do not trigger computations anymore if :py:meth:`Dataset.weighted` or :py:meth:`DataArray.weighted` are applied (:issue:`4625`, :pull:`4668`). By `Julius Busecke <https://github.com/jbusecke>`_.
 - :py:func:`merge` with ``combine_attrs='override'`` makes a copy of the attrs (:issue:`4627`).
+- Xarray now uses ``int64`` values, wherever possible, when encoding
+  and decoding ``numpy.datetime64[ns]`` datetimes. This ensures that maximum
+  precision and accuracy are maintained in the round-tripping process
+  (:issue:`4045`, :pull:`4684`). It also enables encoding and decoding
+  standard-calendar dates with time units of nanoseconds (:pull:`4400`).
+  By `Spencer Clark <https://github.com/spencerkclark>`_ and `Mark Harfouche <https://github.com/hmaarrfk>`_.
 - :py:meth:`DataArray.astype`, :py:meth:`Dataset.astype` and :py:meth:`Variable.astype` support the ``order`` and ``subok`` parameters again. This fixes a regression introduced in version 0.16.1
   (:issue:`4644`, :pull:`4683`).
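The entries above can be illustrated with the encode_cf_datetime helper that
this patch modifies (a minimal sketch, not part of the patch, assuming a build
with these changes applied):

    import pandas as pd
    from xarray import coding

    # Millisecond-spaced times were previously encoded as float64 values
    # with "seconds since ..." units; they now encode as int64 values with
    # "milliseconds since ..." units.
    times = pd.date_range("2000-01-01", periods=3, freq="L")
    num, units, _ = coding.times.encode_cf_datetime(times)
    print(units)      # milliseconds since 2000-01-01 00:00:00
    print(num.dtype)  # int64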
diff --git a/xarray/coding/times.py b/xarray/coding/times.py
index 59f8b89743a..3d877a169f5 100644
--- a/xarray/coding/times.py
+++ b/xarray/coding/times.py
@@ -26,6 +26,7 @@
 _STANDARD_CALENDARS = {"standard", "gregorian", "proleptic_gregorian"}
 
 _NS_PER_TIME_DELTA = {
+    "ns": 1,
     "us": int(1e3),
     "ms": int(1e6),
     "s": int(1e9),
@@ -35,7 +36,15 @@
 }
 
 TIME_UNITS = frozenset(
-    ["days", "hours", "minutes", "seconds", "milliseconds", "microseconds"]
+    [
+        "days",
+        "hours",
+        "minutes",
+        "seconds",
+        "milliseconds",
+        "microseconds",
+        "nanoseconds",
+    ]
 )
 
 
@@ -44,6 +53,7 @@ def _netcdf_to_numpy_timeunit(units):
     if not units.endswith("s"):
         units = "%ss" % units
     return {
+        "nanoseconds": "ns",
         "microseconds": "us",
         "milliseconds": "ms",
         "seconds": "s",
@@ -151,21 +161,22 @@ def _decode_datetime_with_pandas(flat_num_dates, units, calendar):
         # strings, in which case we fall back to using cftime
         raise OutOfBoundsDatetime
 
-    # fixes: https://github.com/pydata/pandas/issues/14068
-    # these lines check if the the lowest or the highest value in dates
-    # cause an OutOfBoundsDatetime (Overflow) error
-    with warnings.catch_warnings():
-        warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning)
-        pd.to_timedelta(flat_num_dates.min(), delta) + ref_date
-        pd.to_timedelta(flat_num_dates.max(), delta) + ref_date
-
-    # Cast input dates to integers of nanoseconds because `pd.to_datetime`
-    # works much faster when dealing with integers
-    # make _NS_PER_TIME_DELTA an array to ensure type upcasting
-    flat_num_dates_ns_int = (
-        flat_num_dates.astype(np.float64) * _NS_PER_TIME_DELTA[delta]
-    ).astype(np.int64)
+    # To avoid integer overflow when converting to nanosecond units for integer
+    # dtypes smaller than np.int64, cast all integer-dtype arrays to np.int64
+    # (GH 2002).
+    if flat_num_dates.dtype.kind == "i":
+        flat_num_dates = flat_num_dates.astype(np.int64)
+
+    # Cast input ordinals to integers of nanoseconds because pd.to_timedelta
+    # works much faster when dealing with integers (GH 1399).
+    flat_num_dates_ns_int = (flat_num_dates * _NS_PER_TIME_DELTA[delta]).astype(
+        np.int64
+    )
+
+    # Use pd.to_timedelta to safely cast integer values to timedeltas, and
+    # add those to a Timestamp to produce a DatetimeIndex. This ensures that
+    # any integer overflow along the way raises OutOfBoundsDatetime rather
+    # than silently producing incorrect values.
     return (pd.to_timedelta(flat_num_dates_ns_int, "ns") + ref_date).values
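To illustrate the new decode path (a sketch, not part of the patch): with the
"ns" entry in _NS_PER_TIME_DELTA and the all-integer cast above, nanosecond
offsets now decode without passing through float64:

    import numpy as np
    from xarray.coding.times import decode_cf_datetime

    # 2**60 + 1 is not exactly representable in float64, so the old
    # float64-based cast would have rounded it; the int64 path is exact.
    num = np.array([0, 1, 2**60 + 1], dtype=np.int64)
    print(decode_cf_datetime(num, "nanoseconds since 1970-01-01"))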
@@ -252,11 +263,24 @@ def decode_cf_timedelta(num_timedeltas, units):
 
 
 def _infer_time_units_from_diff(unique_timedeltas):
-    for time_unit in ["days", "hours", "minutes", "seconds"]:
+    # Note that the modulus operator was only implemented for np.timedelta64
+    # arrays in NumPy 1.16.0. Once our minimum supported version of NumPy is
+    # at least 1.16.0 we will no longer need to cast unique_timedeltas to a
+    # TimedeltaIndex. In the meantime, however, the modulus operator works
+    # for TimedeltaIndex objects.
+    unique_deltas_as_index = pd.TimedeltaIndex(unique_timedeltas)
+    for time_unit in [
+        "days",
+        "hours",
+        "minutes",
+        "seconds",
+        "milliseconds",
+        "microseconds",
+        "nanoseconds",
+    ]:
         delta_ns = _NS_PER_TIME_DELTA[_netcdf_to_numpy_timeunit(time_unit)]
         unit_delta = np.timedelta64(delta_ns, "ns")
-        diffs = unique_timedeltas / unit_delta
-        if np.all(diffs == diffs.astype(int)):
+        if np.all(unique_deltas_as_index % unit_delta == np.timedelta64(0, "ns")):
             return time_unit
     return "seconds"
 
@@ -416,7 +440,15 @@ def encode_cf_datetime(dates, units=None, calendar=None):
         # Wrap the dates in a DatetimeIndex to do the subtraction to ensure
         # an OverflowError is raised if the ref_date is too far away from
         # dates to be encoded (GH 2272).
-        num = (pd.DatetimeIndex(dates.ravel()) - ref_date) / time_delta
+        dates_as_index = pd.DatetimeIndex(dates.ravel())
+        time_deltas = dates_as_index - ref_date
+
+        # Use floor division if time_delta evenly divides all differences,
+        # so that the result preserves an integer dtype where possible
+        # (GH 4045).
+        if np.all(time_deltas % time_delta == np.timedelta64(0, "ns")):
+            num = time_deltas // time_delta
+        else:
+            num = time_deltas / time_delta
         num = num.values.reshape(dates.shape)
 
     except (OutOfBoundsDatetime, OverflowError):
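The effect of the floor-division branch is easiest to see with explicit units
(a sketch, not part of the patch): offsets that the reference unit divides
evenly stay int64, while the old float64 behavior remains as the fallback:

    import pandas as pd
    from xarray.coding.times import encode_cf_datetime

    times = pd.date_range("2000-01-01", periods=3, freq="30T")

    # "minutes" evenly divides every offset -> floor division -> int64.
    num, _, _ = encode_cf_datetime(times, units="minutes since 2000-01-01")
    print(num.dtype)  # int64

    # "hours" does not divide 30-minute offsets -> true division -> float64.
    num, _, _ = encode_cf_datetime(times, units="hours since 2000-01-01")
    print(num.dtype)  # float64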
diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py
index d35cad019b7..dfd558f737e 100644
--- a/xarray/tests/test_coding_times.py
+++ b/xarray/tests/test_coding_times.py
@@ -6,7 +6,7 @@
 import pytest
 from pandas.errors import OutOfBoundsDatetime
 
-from xarray import DataArray, Dataset, Variable, coding, decode_cf
+from xarray import DataArray, Dataset, Variable, coding, conventions, decode_cf
 from xarray.coding.times import (
     cftime_to_nptime,
     decode_cf_datetime,
@@ -479,27 +479,36 @@ def test_decoded_cf_datetime_array_2d():
     assert_array_equal(np.asarray(result), expected)
 
 
+FREQUENCIES_TO_ENCODING_UNITS = {
+    "N": "nanoseconds",
+    "U": "microseconds",
+    "L": "milliseconds",
+    "S": "seconds",
+    "T": "minutes",
+    "H": "hours",
+    "D": "days",
+}
+
+
+@pytest.mark.parametrize(("freq", "units"), FREQUENCIES_TO_ENCODING_UNITS.items())
+def test_infer_datetime_units(freq, units):
+    dates = pd.date_range("2000", periods=2, freq=freq)
+    expected = f"{units} since 2000-01-01 00:00:00"
+    assert expected == coding.times.infer_datetime_units(dates)
+
+
 @pytest.mark.parametrize(
     ["dates", "expected"],
     [
-        (pd.date_range("1900-01-01", periods=5), "days since 1900-01-01 00:00:00"),
-        (
-            pd.date_range("1900-01-01 12:00:00", freq="H", periods=2),
-            "hours since 1900-01-01 12:00:00",
-        ),
         (
             pd.to_datetime(["1900-01-01", "1900-01-02", "NaT"]),
             "days since 1900-01-01 00:00:00",
         ),
-        (
-            pd.to_datetime(["1900-01-01", "1900-01-02T00:00:00.005"]),
-            "seconds since 1900-01-01 00:00:00",
-        ),
         (pd.to_datetime(["NaT", "1900-01-01"]), "days since 1900-01-01 00:00:00"),
         (pd.to_datetime(["NaT"]), "days since 1970-01-01 00:00:00"),
     ],
 )
-def test_infer_datetime_units(dates, expected):
+def test_infer_datetime_units_with_NaT(dates, expected):
     assert expected == coding.times.infer_datetime_units(dates)
 
 
@@ -535,6 +544,7 @@ def test_infer_cftime_datetime_units(calendar, date_args, expected):
         ("1h", "hours", np.int64(1)),
         ("1ms", "milliseconds", np.int64(1)),
         ("1us", "microseconds", np.int64(1)),
+        ("1ns", "nanoseconds", np.int64(1)),
         (["NaT", "0s", "1s"], None, [np.nan, 0, 1]),
         (["30m", "60m"], "hours", [0.5, 1.0]),
         ("NaT", "days", np.nan),
@@ -958,3 +968,30 @@ def test_decode_ambiguous_time_warns(calendar):
         assert not record
 
     np.testing.assert_array_equal(result, expected)
+
+
+@pytest.mark.parametrize("encoding_units", FREQUENCIES_TO_ENCODING_UNITS.values())
+@pytest.mark.parametrize("freq", FREQUENCIES_TO_ENCODING_UNITS.keys())
+def test_encode_cf_datetime_defaults_to_correct_dtype(encoding_units, freq):
+    times = pd.date_range("2000", periods=3, freq=freq)
+    units = f"{encoding_units} since 2000-01-01"
+    encoded, _, _ = coding.times.encode_cf_datetime(times, units)
+
+    numpy_timeunit = coding.times._netcdf_to_numpy_timeunit(encoding_units)
+    encoding_units_as_timedelta = np.timedelta64(1, numpy_timeunit)
+    if pd.to_timedelta(1, freq) >= encoding_units_as_timedelta:
+        assert encoded.dtype == np.int64
+    else:
+        assert encoded.dtype == np.float64
+
+
+@pytest.mark.parametrize("freq", FREQUENCIES_TO_ENCODING_UNITS.keys())
+def test_encode_decode_roundtrip(freq):
+    # See GH 4045. Prior to GH 4684 this test would fail for frequencies of
+    # "S", "L", "U", and "N".
+    initial_time = pd.date_range("1678-01-01", periods=1)
+    times = initial_time.append(pd.date_range("1968", periods=2, freq=freq))
+    variable = Variable(["time"], times)
+    encoded = conventions.encode_cf_variable(variable)
+    decoded = conventions.decode_cf_variable("time", encoded)
+    assert_equal(variable, decoded)
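The round-trip test above can also be reproduced interactively (a sketch, not
part of the patch). The dates span from 1678 (near pd.Timestamp.min) to 1968,
so at nanosecond resolution the encoded offsets sit close to the int64 limit,
which is exactly where the old float64-based encoding lost precision:

    import pandas as pd
    from xarray import Variable, conventions

    times = pd.date_range("1678-01-01", periods=1).append(
        pd.date_range("1968", periods=2, freq="N")
    )
    variable = Variable(["time"], times)
    encoded = conventions.encode_cf_variable(variable)
    decoded = conventions.decode_cf_variable("time", encoded)
    print((variable.values == decoded.values).all())  # True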