
Ensure maximum accuracy when encoding and decoding np.datetime64[ns] values #4684


Merged
Changes from all commits (20 commits)
637fb87
Use integers when possible to encode/decode times
spencerkclark Dec 12, 2020
e2efadb
Improve tests
spencerkclark Dec 12, 2020
6a14aa0
Further improvements to tests
spencerkclark Dec 12, 2020
9b8cfda
Remove optimization in favor of maximum correctness
spencerkclark Dec 12, 2020
ce4bfb5
Remove print statements
spencerkclark Dec 12, 2020
032b577
Restore optimization
spencerkclark Dec 12, 2020
f3a870f
Add a what's new entry
spencerkclark Dec 12, 2020
2740483
Add test for decoding timedeltas with nanosecond units too
spencerkclark Dec 12, 2020
0c2b41b
Some minor cleanups
spencerkclark Dec 12, 2020
db487ce
Add comment to motivate new test
spencerkclark Dec 12, 2020
4803882
Add some print statements to try and debug things on Windows
spencerkclark Dec 13, 2020
ce6d072
xfail round-trip test on Windows; remove print statements
spencerkclark Dec 13, 2020
41d24f1
Don't xfail Windows tests for now; we should figure out why they fail
spencerkclark Dec 13, 2020
57814f7
Fix things on Windows
spencerkclark Dec 13, 2020
3ff3cd8
Use pandas for divisibility check for older NumPy compatibility
spencerkclark Dec 13, 2020
3675008
Reduce changes needed; improve comments
spencerkclark Dec 13, 2020
ed2bce6
Checking remainder against zero nanoseconds is more straightforward
spencerkclark Dec 13, 2020
450037d
Add a note to the breaking changes section
spencerkclark Dec 16, 2020
d6dc260
Merge branch 'master' into encode-dates-with-ints-if-possible
spencerkclark Dec 17, 2020
2775a60
Merge branch 'master' into encode-dates-with-ints-if-possible
spencerkclark Jan 3, 2021
12 changes: 12 additions & 0 deletions doc/whats-new.rst
@@ -22,9 +22,15 @@ v0.16.3 (unreleased)

Breaking changes
~~~~~~~~~~~~~~~~
- As a result of :pull:`4684`, the default units encoding for
  datetime-like values (``np.datetime64[ns]`` or ``cftime.datetime``) will now
  always be set such that ``int64`` values can be used. In the past, no units
  finer than "seconds" were chosen, which sometimes required ``float64``
  values and led to inaccurate I/O round-trips.
- Remove deprecated ``autoclose`` kwarg from :py:func:`open_dataset` (:pull:`4725`).
By `Aureliana Barghini <https://github.com/aurghs>`_


New Features
~~~~~~~~~~~~

@@ -34,6 +40,12 @@ Bug fixes

- :py:meth:`DataArray.resample` and :py:meth:`Dataset.resample` do not trigger computations anymore if :py:meth:`Dataset.weighted` or :py:meth:`DataArray.weighted` are applied (:issue:`4625`, :pull:`4668`). By `Julius Busecke <https://github.com/jbusecke>`_.
- :py:func:`merge` with ``combine_attrs='override'`` makes a copy of the attrs (:issue:`4627`).
- By default, when possible, xarray will now always use values of type ``int64`` when encoding
and decoding ``numpy.datetime64[ns]`` datetimes. This ensures that maximum
precision and accuracy are maintained in the round-tripping process
(:issue:`4045`, :pull:`4684`). It also enables encoding and decoding standard calendar
dates with time units of nanoseconds (:pull:`4400`). By `Spencer Clark
<https://github.com/spencerkclark>`_ and `Mark Harfouche <http://github.com/hmaarrfk>`_.
- :py:meth:`DataArray.astype`, :py:meth:`Dataset.astype` and :py:meth:`Variable.astype` support
the ``order`` and ``subok`` parameters again. This fixes a regression introduced in version 0.16.1
(:issue:`4644`, :pull:`4683`).
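To see the behavior change described in these entries, here is a minimal sketch (the file name and the exact frequency are illustrative, and it assumes a version of xarray that includes this PR):

```python
import pandas as pd
import xarray as xr

# Nanosecond-spaced timestamps cannot be represented exactly as float64
# "seconds since ..." offsets, so they used to lose precision on round-trip.
times = pd.date_range("2000-01-01", periods=3, freq="N")  # 1 ns apart
ds = xr.Dataset(coords={"time": times})

ds.to_netcdf("times.nc")
with xr.open_dataset("times.nc") as roundtripped:
    # With int64 values and "nanoseconds since ..." units chosen
    # automatically, the round-trip is now exact.
    xr.testing.assert_identical(ds, roundtripped)
```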
70 changes: 51 additions & 19 deletions xarray/coding/times.py
@@ -26,6 +26,7 @@
_STANDARD_CALENDARS = {"standard", "gregorian", "proleptic_gregorian"}

_NS_PER_TIME_DELTA = {
"ns": 1,
"us": int(1e3),
"ms": int(1e6),
"s": int(1e9),
@@ -35,7 +36,15 @@
}

TIME_UNITS = frozenset(
["days", "hours", "minutes", "seconds", "milliseconds", "microseconds"]
[
"days",
"hours",
"minutes",
"seconds",
"milliseconds",
"microseconds",
"nanoseconds",
]
)


@@ -44,6 +53,7 @@ def _netcdf_to_numpy_timeunit(units):
if not units.endswith("s"):
units = "%ss" % units
return {
"nanoseconds": "ns",
"microseconds": "us",
"milliseconds": "ms",
"seconds": "s",
@@ -151,21 +161,22 @@ def _decode_datetime_with_pandas(flat_num_dates, units, calendar):
# strings, in which case we fall back to using cftime
raise OutOfBoundsDatetime

# fixes: https://github.com/pydata/pandas/issues/14068
# these lines check if the lowest or the highest value in dates
# causes an OutOfBoundsDatetime (Overflow) error
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning)
pd.to_timedelta(flat_num_dates.min(), delta) + ref_date
pd.to_timedelta(flat_num_dates.max(), delta) + ref_date

# Cast input dates to integers of nanoseconds because `pd.to_datetime`
# works much faster when dealing with integers
# make _NS_PER_TIME_DELTA an array to ensure type upcasting
flat_num_dates_ns_int = (
flat_num_dates.astype(np.float64) * _NS_PER_TIME_DELTA[delta]
).astype(np.int64)
# To avoid integer overflow when converting to nanosecond units for integer
# dtypes smaller than np.int64 cast all integer-dtype arrays to np.int64
# (GH 2002).
if flat_num_dates.dtype.kind == "i":
flat_num_dates = flat_num_dates.astype(np.int64)

# Cast input ordinals to integers of nanoseconds because pd.to_timedelta
# works much faster when dealing with integers (GH 1399).
flat_num_dates_ns_int = (flat_num_dates * _NS_PER_TIME_DELTA[delta]).astype(
np.int64
)

# Use pd.to_timedelta to safely cast integer values to timedeltas,
# and add those to a Timestamp to safely produce a DatetimeIndex. This
# ensures that we do not encounter integer overflow at any point in the
# process without raising OutOfBoundsDatetime.
return (pd.to_timedelta(flat_num_dates_ns_int, "ns") + ref_date).values
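As a standalone illustration of the decode path above (the array values and units are made up for the example):

```python
import numpy as np
import pandas as pd

# Ordinals as stored on disk, e.g. for "seconds since 2000-01-01".
flat_num_dates = np.array([0, 60, 120], dtype=np.int32)
ref_date = pd.Timestamp("2000-01-01")
ns_per_second = int(1e9)

# Upcast to int64 first so the multiplication below cannot overflow
# the original int32 dtype (GH 2002).
flat_num_dates_ns_int = flat_num_dates.astype(np.int64) * ns_per_second

# pd.to_timedelta on integer nanoseconds is fast, and adding the result
# to a Timestamp raises OutOfBoundsDatetime instead of silently wrapping.
dates = (pd.to_timedelta(flat_num_dates_ns_int, "ns") + ref_date).values
print(dates)  # datetime64[ns]: 00:00, 00:01, and 00:02 on 2000-01-01
```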


@@ -252,11 +263,24 @@ def decode_cf_timedelta(num_timedeltas, units):


def _infer_time_units_from_diff(unique_timedeltas):
for time_unit in ["days", "hours", "minutes", "seconds"]:
# Note that the modulus operator was only implemented for np.timedelta64
# arrays as of NumPy version 1.16.0. Once our minimum version of NumPy
# supported is greater than or equal to this we will no longer need to cast
# unique_timedeltas to a TimedeltaIndex. In the meantime, however, the
# modulus operator works for TimedeltaIndex objects.
unique_deltas_as_index = pd.TimedeltaIndex(unique_timedeltas)
for time_unit in [
"days",
"hours",
"minutes",
"seconds",
"milliseconds",
"microseconds",
"nanoseconds",
]:
delta_ns = _NS_PER_TIME_DELTA[_netcdf_to_numpy_timeunit(time_unit)]
unit_delta = np.timedelta64(delta_ns, "ns")
diffs = unique_timedeltas / unit_delta
if np.all(diffs == diffs.astype(int)):
if np.all(unique_deltas_as_index % unit_delta == np.timedelta64(0, "ns")):
return time_unit
return "seconds"

@@ -416,7 +440,15 @@ def encode_cf_datetime(dates, units=None, calendar=None):
# Wrap the dates in a DatetimeIndex to do the subtraction to ensure
# an OverflowError is raised if the ref_date is too far away from
# dates to be encoded (GH 2272).
num = (pd.DatetimeIndex(dates.ravel()) - ref_date) / time_delta
dates_as_index = pd.DatetimeIndex(dates.ravel())
time_deltas = dates_as_index - ref_date

# Use floor division if time_delta evenly divides all differences
# to preserve integer dtype if possible (GH 4045).
if np.all(time_deltas % time_delta == np.timedelta64(0, "ns")):
num = time_deltas // time_delta
else:
num = time_deltas / time_delta
num = num.values.reshape(dates.shape)

except (OutOfBoundsDatetime, OverflowError):
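The floor-division branch above is what keeps encoded values integral; a minimal sketch of that logic in isolation (the dates and units are illustrative):

```python
import numpy as np
import pandas as pd

dates_as_index = pd.DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-04"])
ref_date = pd.Timestamp("2000-01-01")
time_delta = np.timedelta64(1, "D")  # i.e. units of "days since 2000-01-01"

time_deltas = dates_as_index - ref_date
if np.all(time_deltas % time_delta == np.timedelta64(0, "ns")):
    # Floor division keeps int64, so the round-trip is lossless (GH 4045).
    num = time_deltas // time_delta
else:
    # True division falls back to float64.
    num = time_deltas / time_delta

print(num.values, num.values.dtype)  # [0 1 3] int64
```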
59 changes: 48 additions & 11 deletions xarray/tests/test_coding_times.py
@@ -6,7 +6,7 @@
import pytest
from pandas.errors import OutOfBoundsDatetime

from xarray import DataArray, Dataset, Variable, coding, decode_cf
from xarray import DataArray, Dataset, Variable, coding, conventions, decode_cf
from xarray.coding.times import (
cftime_to_nptime,
decode_cf_datetime,
@@ -479,27 +479,36 @@ def test_decoded_cf_datetime_array_2d():
assert_array_equal(np.asarray(result), expected)


FREQUENCIES_TO_ENCODING_UNITS = {
"N": "nanoseconds",
"U": "microseconds",
"L": "milliseconds",
"S": "seconds",
"T": "minutes",
"H": "hours",
"D": "days",
}


@pytest.mark.parametrize(("freq", "units"), FREQUENCIES_TO_ENCODING_UNITS.items())
def test_infer_datetime_units(freq, units):
dates = pd.date_range("2000", periods=2, freq=freq)
expected = f"{units} since 2000-01-01 00:00:00"
assert expected == coding.times.infer_datetime_units(dates)


@pytest.mark.parametrize(
["dates", "expected"],
[
(pd.date_range("1900-01-01", periods=5), "days since 1900-01-01 00:00:00"),
(
pd.date_range("1900-01-01 12:00:00", freq="H", periods=2),
"hours since 1900-01-01 12:00:00",
),
(
pd.to_datetime(["1900-01-01", "1900-01-02", "NaT"]),
"days since 1900-01-01 00:00:00",
),
(
pd.to_datetime(["1900-01-01", "1900-01-02T00:00:00.005"]),
"seconds since 1900-01-01 00:00:00",
),
(pd.to_datetime(["NaT", "1900-01-01"]), "days since 1900-01-01 00:00:00"),
(pd.to_datetime(["NaT"]), "days since 1970-01-01 00:00:00"),
],
)
def test_infer_datetime_units(dates, expected):
def test_infer_datetime_units_with_NaT(dates, expected):
assert expected == coding.times.infer_datetime_units(dates)


@@ -535,6 +544,7 @@ def test_infer_cftime_datetime_units(calendar, date_args, expected):
("1h", "hours", np.int64(1)),
("1ms", "milliseconds", np.int64(1)),
("1us", "microseconds", np.int64(1)),
("1ns", "nanoseconds", np.int64(1)),
(["NaT", "0s", "1s"], None, [np.nan, 0, 1]),
(["30m", "60m"], "hours", [0.5, 1.0]),
("NaT", "days", np.nan),
@@ -958,3 +968,30 @@ def test_decode_ambiguous_time_warns(calendar):
assert not record

np.testing.assert_array_equal(result, expected)


@pytest.mark.parametrize("encoding_units", FREQUENCIES_TO_ENCODING_UNITS.values())
@pytest.mark.parametrize("freq", FREQUENCIES_TO_ENCODING_UNITS.keys())
def test_encode_cf_datetime_defaults_to_correct_dtype(encoding_units, freq):
times = pd.date_range("2000", periods=3, freq=freq)
units = f"{encoding_units} since 2000-01-01"
encoded, _, _ = coding.times.encode_cf_datetime(times, units)

numpy_timeunit = coding.times._netcdf_to_numpy_timeunit(encoding_units)
encoding_units_as_timedelta = np.timedelta64(1, numpy_timeunit)
if pd.to_timedelta(1, freq) >= encoding_units_as_timedelta:
assert encoded.dtype == np.int64
else:
assert encoded.dtype == np.float64
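This branching reflects that an encoding unit coarser than the data's spacing cannot represent the offsets as integers; a hypothetical illustration using the same helper (assumes this PR's version of ``encode_cf_datetime``):

```python
import pandas as pd
from xarray import coding

times = pd.date_range("2000", periods=3, freq="T")  # minute-spaced

# Requested unit coarser than the spacing -> float64 is unavoidable.
encoded, _, _ = coding.times.encode_cf_datetime(times, "hours since 2000-01-01")
print(encoded.dtype)  # float64

# Requested unit at least as fine as the spacing -> int64 is used.
encoded, _, _ = coding.times.encode_cf_datetime(times, "minutes since 2000-01-01")
print(encoded.dtype)  # int64
```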


@pytest.mark.parametrize("freq", FREQUENCIES_TO_ENCODING_UNITS.keys())
def test_encode_decode_roundtrip(freq):
# See GH 4045. Prior to GH 4684 this test would fail for frequencies of
# "S", "L", "U", and "N".
initial_time = pd.date_range("1678-01-01", periods=1)
times = initial_time.append(pd.date_range("1968", periods=2, freq=freq))
variable = Variable(["time"], times)
encoded = conventions.encode_cf_variable(variable)
decoded = conventions.decode_cf_variable("time", encoded)
assert_equal(variable, decoded)