Skip to content

Commit 45d88fc

Browse files
Enable pandas-style rounding of cftime.datetime objects (#3792)
* Initial progress on implementing cftime floor/ceil/round * Improve tests and docstrings * Add tests of rounding cftime datetimes via dt accessor * Add documentation * docstring edits * Test rounding raises error with non-fixed frequency * black * typo * A couple cleanup items: - Fix floating point issue in asi8 and add tests - Ensure dask only computes once when using the rounding accessors * black
1 parent 016a77d commit 45d88fc

File tree

6 files changed

+359
-9
lines changed

6 files changed

+359
-9
lines changed

doc/weather-climate.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,14 @@ For data indexed by a :py:class:`~xarray.CFTimeIndex` xarray currently supports:
105105
da.time.dt.dayofyear
106106
da.time.dt.dayofweek
107107
108+
- Rounding of datetimes to fixed frequencies via the ``dt`` accessor:
109+
110+
.. ipython:: python
111+
112+
da.time.dt.ceil('3D')
113+
da.time.dt.floor('5D')
114+
da.time.dt.round('2D')
115+
108116
- Group-by operations based on datetime accessor attributes (e.g. by month of
109117
the year):
110118

doc/whats-new.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ Breaking changes
2525
New Features
2626
~~~~~~~~~~~~
2727

28+
- Added support for :py:class:`pandas.DatetimeIndex`-style rounding of
29+
``cftime.datetime`` objects directly via a :py:class:`CFTimeIndex` or via the
30+
:py:class:`~core.accessor_dt.DatetimeAccessor`.
31+
By `Spencer Clark <https://github.com/spencerkclark>`_.
2832
- Support new h5netcdf backend keyword `phony_dims` (available from h5netcdf
2933
v0.8.0) for :py:class:`~xarray.backends.H5NetCDFStore`.
3034
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.

xarray/coding/cftimeindex.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,83 @@ def strftime(self, date_format):
528528
"""
529529
return pd.Index([date.strftime(date_format) for date in self._data])
530530

531+
@property
def asi8(self):
    """Convert to integers with units of microseconds since 1970-01-01.

    The epoch is 1970-01-01 expressed in the index's own date type
    (``self.date_type``), and one value is produced per date in the index.

    Returns
    -------
    np.ndarray
        Array of dtype ``int64`` holding, for each date, the number of
        microseconds between the epoch and that date.
    """
    from ..core.resample_cftime import exact_cftime_datetime_difference

    epoch = self.date_type(1970, 1, 1)
    return np.array(
        [
            _total_microseconds(exact_cftime_datetime_difference(epoch, date))
            for date in self.values
        ],
        # Pin the dtype: without it an empty index yields a float64 array,
        # and so does any float returned by the microsecond helper.
        dtype=np.int64,
    )
543+
544+
def _round_via_method(self, freq, method):
    """Round the dates of the index to multiples of ``freq``.

    Parameters
    ----------
    freq : str or CFTimeOffset
        Frequency to round to; it is converted with ``to_offset`` and must
        resolve to one of the ``CFTIME_TICKS`` offset types.
    method : callable
        Rounding function called as ``method(values, unit)``, where
        ``values`` is ``self.asi8`` and ``unit`` is the length of ``freq``
        in microseconds.

    Returns
    -------
    CFTimeIndex
        Index built from the rounded values, with the same date type and
        name as this index.

    Raises
    ------
    ValueError
        If ``freq`` does not correspond to a fixed frequency.
    """
    from .cftime_offsets import CFTIME_TICKS, to_offset

    offset = to_offset(freq)
    if isinstance(offset, CFTIME_TICKS):
        step = _total_microseconds(offset.as_timedelta())
        return _cftimeindex_from_i8(
            method(self.asi8, step), self.date_type, self.name
        )
    raise ValueError(f"{offset} is a non-fixed frequency")
556+
557+
def floor(self, freq):
    """Round each date in the index down to a fixed frequency.

    Parameters
    ----------
    freq : str or CFTimeOffset
        The frequency level to round the index to. Only fixed frequencies
        such as 'S' (second) are accepted; frequencies such as 'ME' (month
        end) are not. See `frequency aliases
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_
        for a list of possible values.

    Returns
    -------
    CFTimeIndex
    """
    rounding_function = _floor_int
    return self._round_via_method(freq, rounding_function)
573+
574+
def ceil(self, freq):
    """Round each date in the index up to a fixed frequency.

    Parameters
    ----------
    freq : str or CFTimeOffset
        The frequency level to round the index to. Only fixed frequencies
        such as 'S' (second) are accepted; frequencies such as 'ME' (month
        end) are not. See `frequency aliases
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_
        for a list of possible values.

    Returns
    -------
    CFTimeIndex
    """
    rounding_function = _ceil_int
    return self._round_via_method(freq, rounding_function)
590+
591+
def round(self, freq):
    """Round each date in the index to the nearest multiple of a fixed frequency.

    Values are rounded with the module's round-half-to-even helper.

    Parameters
    ----------
    freq : str or CFTimeOffset
        The frequency level to round the index to. Only fixed frequencies
        such as 'S' (second) are accepted; frequencies such as 'ME' (month
        end) are not. See `frequency aliases
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_
        for a list of possible values.

    Returns
    -------
    CFTimeIndex
    """
    rounding_function = _round_to_nearest_half_even
    return self._round_via_method(freq, rounding_function)
607+
531608

532609
def _parse_iso8601_without_reso(date_type, datetime_str):
533610
date, _ = _parse_iso8601_with_reso(date_type, datetime_str)
@@ -554,3 +631,61 @@ def _parse_array_of_cftime_strings(strings, date_type):
554631
return np.array(
555632
[_parse_iso8601_without_reso(date_type, s) for s in strings.ravel()]
556633
).reshape(strings.shape)
634+
635+
636+
def _cftimeindex_from_i8(values, date_type, name):
    """Build a CFTimeIndex from microsecond offsets relative to 1970-01-01.

    Parameters
    ----------
    values : np.array
        Integers representing microseconds since 1970-01-01.
    date_type : cftime.datetime
        Type of date for the index; it is also used to construct the epoch.
    name : str
        Name of the index.

    Returns
    -------
    CFTimeIndex
    """
    origin = date_type(1970, 1, 1)
    # Each value is coerced with int() before it is handed to timedelta.
    offsets = (timedelta(microseconds=int(count)) for count in values)
    dates = np.array([origin + offset for offset in offsets])
    return CFTimeIndex(dates, name=name)
655+
656+
657+
def _total_microseconds(delta):
    """Compute the total number of microseconds of a datetime.timedelta.

    Parameters
    ----------
    delta : datetime.timedelta
        Input timedelta.

    Returns
    -------
    int
        Exact number of microseconds in ``delta`` (negative for negative
        timedeltas).
    """
    # Floor division of two timedeltas yields an exact int. True division
    # yields a float, which cannot represent every microsecond count above
    # 2**53 (roughly 285 years).
    return delta // timedelta(microseconds=1)
670+
671+
672+
def _floor_int(values, unit):
    """Round ``values`` down to a multiple of ``unit`` (copied from pandas).

    Parameters
    ----------
    values : np.ndarray
        Values to round.
    unit : int
        Spacing of the multiples to round to.

    Returns
    -------
    np.ndarray
        ``values`` minus its remainder modulo ``unit``.
    """
    excess = np.mod(values, unit)
    return values - excess
675+
676+
677+
def _ceil_int(values, unit):
    """Round ``values`` up to a multiple of ``unit`` (copied from pandas).

    Parameters
    ----------
    values : np.ndarray
        Values to round.
    unit : int
        Spacing of the multiples to round to.

    Returns
    -------
    np.ndarray
        ``values`` plus the remainder of ``-values`` modulo ``unit``.
    """
    shortfall = np.mod(-values, unit)
    return values + shortfall
680+
681+
682+
def _round_to_nearest_half_even(values, unit):
    """Round ``values`` to the nearest multiple of ``unit``, ties to even.

    Adapted from pandas. A value exactly halfway between two multiples is
    rounded to the multiple whose quotient by ``unit`` is even.

    Parameters
    ----------
    values : np.ndarray or scalar
        Integer values to round. The input is not modified.
    unit : int
        Spacing of the multiples to round to.

    Returns
    -------
    np.ndarray or numpy scalar
        Rounded values, with the same shape as ``values``.
    """
    if unit % 2:
        # With an odd unit no integer lies exactly halfway between two
        # multiples, so shifting down by unit // 2 and rounding up is enough.
        return _ceil_int(values - unit // 2, unit)
    half = unit // 2
    quotient, remainder = np.divmod(values, unit)
    round_up = np.logical_or(
        remainder > half,
        # Exact tie: round up only when the lower quotient is odd.
        np.logical_and(remainder == half, quotient % 2),
    )
    # Adding the boolean mask is equivalent to ``quotient[mask] += 1`` but
    # also works for scalar input, which does not support item assignment.
    return (quotient + round_up) * unit

xarray/core/accessor_dt.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -78,20 +78,27 @@ def _get_date_field(values, name, dtype):
7878
return access_method(values, name)
7979

8080

81-
def _round_series(values, name, freq):
82-
"""Coerce an array of datetime-like values to a pandas Series and
83-
apply requested rounding
81+
def _round_through_series_or_index(values, name, freq):
82+
"""Coerce an array of datetime-like values to a pandas Series or xarray
83+
CFTimeIndex and apply requested rounding
8484
"""
85-
values_as_series = pd.Series(values.ravel())
86-
method = getattr(values_as_series.dt, name)
85+
from ..coding.cftimeindex import CFTimeIndex
86+
87+
if is_np_datetime_like(values.dtype):
88+
values_as_series = pd.Series(values.ravel())
89+
method = getattr(values_as_series.dt, name)
90+
else:
91+
values_as_cftimeindex = CFTimeIndex(values.ravel())
92+
method = getattr(values_as_cftimeindex, name)
93+
8794
field_values = method(freq=freq).values
8895

8996
return field_values.reshape(values.shape)
9097

9198

9299
def _round_field(values, name, freq):
93-
"""Indirectly access pandas rounding functions by wrapping data
94-
as a Series and calling through `.dt` attribute.
100+
"""Indirectly access rounding functions by wrapping data
101+
as a Series or CFTimeIndex
95102
96103
Parameters
97104
----------
@@ -110,9 +117,12 @@ def _round_field(values, name, freq):
110117
if isinstance(values, dask_array_type):
111118
from dask.array import map_blocks
112119

113-
return map_blocks(_round_series, values, name, freq=freq, dtype=np.datetime64)
120+
dtype = np.datetime64 if is_np_datetime_like(values.dtype) else np.dtype("O")
121+
return map_blocks(
122+
_round_through_series_or_index, values, name, freq=freq, dtype=dtype
123+
)
114124
else:
115-
return _round_series(values, name, freq)
125+
return _round_through_series_or_index(values, name, freq)
116126

117127

118128
def _strftime_through_cftimeindex(values, date_format):

xarray/tests/test_accessor_dt.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from . import (
88
assert_array_equal,
99
assert_equal,
10+
assert_identical,
1011
raises_regex,
1112
requires_cftime,
1213
requires_dask,
@@ -435,3 +436,106 @@ def test_seasons(cftime_date_type):
435436
seasons = xr.DataArray(seasons)
436437

437438
assert_array_equal(seasons.values, dates.dt.season.values)
439+
440+
441+
@pytest.fixture
def cftime_rounding_dataarray(cftime_date_type):
    """Return a 2x2 DataArray of dates used by the rounding accessor tests.

    The dates fall at hours 1, 15 and 23 of day 1 and at hour 1 of day 2,
    all in month 1 of year 1 of the calendar given by ``cftime_date_type``.
    """
    first_row = [cftime_date_type(1, 1, 1, 1), cftime_date_type(1, 1, 1, 15)]
    second_row = [cftime_date_type(1, 1, 1, 23), cftime_date_type(1, 1, 2, 1)]
    return xr.DataArray([first_row, second_row])
449+
450+
451+
@requires_cftime
@requires_dask
@pytest.mark.parametrize("use_dask", [False, True])
def test_cftime_floor_accessor(cftime_rounding_dataarray, cftime_date_type, use_dask):
    """Check ``dt.floor("D")`` on cftime dates, both eagerly and with dask."""
    import dask.array as da

    def midnight(day):
        # Hour 0 of the given day in month 1 of year 1.
        return cftime_date_type(1, 1, day, 0)

    freq = "D"
    expected = xr.DataArray(
        [[midnight(1), midnight(1)], [midnight(1), midnight(2)]],
        name="floor",
    )

    if not use_dask:
        result = cftime_rounding_dataarray.dt.floor(freq)
    else:
        chunks = {"dim_0": 1}
        # One compute is tolerated: for object-dtype data a single value of
        # the array is currently inspected to check that it is a
        # cftime.datetime (otherwise the dt accessor raises an error).
        with raise_if_dask_computes(max_computes=1):
            result = cftime_rounding_dataarray.chunk(chunks).dt.floor(freq)
        expected = expected.chunk(chunks)
        assert isinstance(result.data, da.Array)
        assert result.chunks == expected.chunks

    assert_identical(result, expected)
480+
481+
482+
@requires_cftime
@requires_dask
@pytest.mark.parametrize("use_dask", [False, True])
def test_cftime_ceil_accessor(cftime_rounding_dataarray, cftime_date_type, use_dask):
    """Check ``dt.ceil("D")`` on cftime dates, both eagerly and with dask."""
    import dask.array as da

    def midnight(day):
        # Hour 0 of the given day in month 1 of year 1.
        return cftime_date_type(1, 1, day, 0)

    freq = "D"
    expected = xr.DataArray(
        [[midnight(2), midnight(2)], [midnight(2), midnight(3)]],
        name="ceil",
    )

    if not use_dask:
        result = cftime_rounding_dataarray.dt.ceil(freq)
    else:
        chunks = {"dim_0": 1}
        # One compute is tolerated: for object-dtype data a single value of
        # the array is currently inspected to check that it is a
        # cftime.datetime (otherwise the dt accessor raises an error).
        with raise_if_dask_computes(max_computes=1):
            result = cftime_rounding_dataarray.chunk(chunks).dt.ceil(freq)
        expected = expected.chunk(chunks)
        assert isinstance(result.data, da.Array)
        assert result.chunks == expected.chunks

    assert_identical(result, expected)
511+
512+
513+
@requires_cftime
@requires_dask
@pytest.mark.parametrize("use_dask", [False, True])
def test_cftime_round_accessor(cftime_rounding_dataarray, cftime_date_type, use_dask):
    """Check ``dt.round("D")`` on cftime dates, both eagerly and with dask."""
    import dask.array as da

    def midnight(day):
        # Hour 0 of the given day in month 1 of year 1.
        return cftime_date_type(1, 1, day, 0)

    freq = "D"
    expected = xr.DataArray(
        [[midnight(1), midnight(2)], [midnight(2), midnight(2)]],
        name="round",
    )

    if not use_dask:
        result = cftime_rounding_dataarray.dt.round(freq)
    else:
        chunks = {"dim_0": 1}
        # One compute is tolerated: for object-dtype data a single value of
        # the array is currently inspected to check that it is a
        # cftime.datetime (otherwise the dt accessor raises an error).
        with raise_if_dask_computes(max_computes=1):
            result = cftime_rounding_dataarray.chunk(chunks).dt.round(freq)
        expected = expected.chunk(chunks)
        assert isinstance(result.data, da.Array)
        assert result.chunks == expected.chunks

    assert_identical(result, expected)

0 commit comments

Comments
 (0)