diff --git a/doc/api-hidden.rst b/doc/api-hidden.rst index a6681715a3e..8ed9e47be01 100644 --- a/doc/api-hidden.rst +++ b/doc/api-hidden.rst @@ -77,6 +77,7 @@ core.accessor_dt.DatetimeAccessor.floor core.accessor_dt.DatetimeAccessor.round core.accessor_dt.DatetimeAccessor.strftime + core.accessor_dt.DatetimeAccessor.calendar core.accessor_dt.DatetimeAccessor.date core.accessor_dt.DatetimeAccessor.day core.accessor_dt.DatetimeAccessor.dayofweek diff --git a/doc/api.rst b/doc/api.rst index ef2694ea661..b552bc6b4d2 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -109,6 +109,8 @@ Dataset contents Dataset.drop_dims Dataset.set_coords Dataset.reset_coords + Dataset.convert_calendar + Dataset.interp_calendar Dataset.get_index Comparisons @@ -308,6 +310,8 @@ DataArray contents DataArray.drop_duplicates DataArray.reset_coords DataArray.copy + DataArray.convert_calendar + DataArray.interp_calendar DataArray.get_index DataArray.astype DataArray.item @@ -526,6 +530,7 @@ Datetimelike properties DataArray.dt.season DataArray.dt.time DataArray.dt.date + DataArray.dt.calendar DataArray.dt.is_month_start DataArray.dt.is_month_end DataArray.dt.is_quarter_end @@ -1064,6 +1069,8 @@ Creating custom indexes :toctree: generated/ cftime_range + date_range + date_range_like Faceting -------- diff --git a/doc/user-guide/weather-climate.rst b/doc/user-guide/weather-climate.rst index e20bd510df1..893e7b50429 100644 --- a/doc/user-guide/weather-climate.rst +++ b/doc/user-guide/weather-climate.rst @@ -127,6 +127,23 @@ using the same formatting as the standard `datetime.strftime`_ convention . dates.strftime("%c") da["time"].dt.strftime("%Y%m%d") +Conversion between non-standard calendar and to/from pandas DatetimeIndexes is +facilitated with the :py:meth:`xarray.Dataset.convert_calendar` method (also available as +:py:meth:`xarray.DataArray.convert_calendar`). Here, like elsewhere in xarray, the ``use_cftime`` +argument controls which datetime backend is used in the output. The default (``None``) is to +use `pandas` when possible, i.e. when the calendar is standard and dates are within 1678 and 2262. + +.. ipython:: python + + dates = xr.cftime_range(start="2001", periods=24, freq="MS", calendar="noleap") + da_nl = xr.DataArray(np.arange(24), coords=[dates], dims=["time"], name="foo") + da_std = da.convert_calendar("standard", use_cftime=True) + +The data is unchanged, only the timestamps are modified. Further options are implemented +for the special ``"360_day"`` calendar and for handling missing dates. There is also +:py:meth:`xarray.Dataset.interp_calendar` (and :py:meth:`xarray.DataArray.interp_calendar`) +for `interpolating` data between calendars. + For data indexed by a :py:class:`~xarray.CFTimeIndex` xarray currently supports: - `Partial datetime string indexing`_: @@ -150,7 +167,8 @@ For data indexed by a :py:class:`~xarray.CFTimeIndex` xarray currently supports: - Access of basic datetime components via the ``dt`` accessor (in this case just "year", "month", "day", "hour", "minute", "second", "microsecond", - "season", "dayofyear", "dayofweek", and "days_in_month"): + "season", "dayofyear", "dayofweek", and "days_in_month") with the addition + of "calendar", absent from pandas: .. ipython:: python @@ -160,6 +178,7 @@ For data indexed by a :py:class:`~xarray.CFTimeIndex` xarray currently supports: da.time.dt.dayofyear da.time.dt.dayofweek da.time.dt.days_in_month + da.time.dt.calendar - Rounding of datetimes to fixed frequencies via the ``dt`` accessor: @@ -214,30 +233,6 @@ For data indexed by a :py:class:`~xarray.CFTimeIndex` xarray currently supports: da.resample(time="81T", closed="right", label="right", base=3).mean() -.. note:: - - - For some use-cases it may still be useful to convert from - a :py:class:`~xarray.CFTimeIndex` to a :py:class:`pandas.DatetimeIndex`, - despite the difference in calendar types. The recommended way of doing this - is to use the built-in :py:meth:`~xarray.CFTimeIndex.to_datetimeindex` - method: - - .. ipython:: python - :okwarning: - - modern_times = xr.cftime_range("2000", periods=24, freq="MS", calendar="noleap") - da = xr.DataArray(range(24), [("time", modern_times)]) - da - datetimeindex = da.indexes["time"].to_datetimeindex() - da["time"] = datetimeindex - - However in this case one should use caution to only perform operations which - do not depend on differences between dates (e.g. differentiation, - interpolation, or upsampling with resample), as these could introduce subtle - and silent errors due to the difference in calendar types between the dates - encoded in your data and the dates stored in memory. - .. _Timestamp-valid range: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations .. _ISO 8601 standard: https://en.wikipedia.org/wiki/ISO_8601 .. _partial datetime string indexing: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#partial-string-indexing diff --git a/doc/whats-new.rst b/doc/whats-new.rst index bd6097d61fe..5c09ca47db5 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -153,6 +153,8 @@ New Features - Added ``storage_options`` argument to :py:meth:`to_zarr` (:issue:`5601`, :pull:`5615`). By `Ray Bell `_, `Zachary Blackwood `_ and `Nathan Lis `_. +- Added calendar utilities :py:func:`DataArray.convert_calendar`, :py:func:`DataArray.interp_calendar`, :py:func:`date_range`, :py:func:`date_range_like` and :py:attr:`DataArray.dt.calendar` (:issue:`5155`, :pull:`5233`). + By `Pascal Bourgault `_. - Histogram plots are set with a title displaying the scalar coords if any, similarly to the other plots (:issue:`5791`, :pull:`5792`). By `Maxime Liquet `_. - Slice plots display the coords units in the same way as x/y/colorbar labels (:pull:`5847`). diff --git a/xarray/__init__.py b/xarray/__init__.py index 81ab9f388a8..aa9739d3d35 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -9,7 +9,7 @@ ) from .backends.rasterio_ import open_rasterio from .backends.zarr import open_zarr -from .coding.cftime_offsets import cftime_range +from .coding.cftime_offsets import cftime_range, date_range, date_range_like from .coding.cftimeindex import CFTimeIndex from .coding.frequencies import infer_freq from .conventions import SerializationWarning, decode_cf @@ -65,6 +65,8 @@ "combine_by_coords", "combine_nested", "concat", + "date_range", + "date_range_like", "decode_cf", "dot", "cov", diff --git a/xarray/coding/calendar_ops.py b/xarray/coding/calendar_ops.py new file mode 100644 index 00000000000..7b973c8d7ab --- /dev/null +++ b/xarray/coding/calendar_ops.py @@ -0,0 +1,341 @@ +import numpy as np +import pandas as pd + +from ..core.common import _contains_datetime_like_objects, is_np_datetime_like +from .cftime_offsets import date_range_like, get_date_type +from .cftimeindex import CFTimeIndex +from .times import _should_cftime_be_used, convert_times + +try: + import cftime +except ImportError: + cftime = None + + +_CALENDARS_WITHOUT_YEAR_ZERO = [ + "gregorian", + "proleptic_gregorian", + "julian", + "standard", +] + + +def _days_in_year(year, calendar, use_cftime=True): + """Return the number of days in the input year according to the input calendar.""" + date_type = get_date_type(calendar, use_cftime=use_cftime) + if year == -1 and calendar in _CALENDARS_WITHOUT_YEAR_ZERO: + difference = date_type(year + 2, 1, 1) - date_type(year, 1, 1) + else: + difference = date_type(year + 1, 1, 1) - date_type(year, 1, 1) + return difference.days + + +def convert_calendar( + obj, + calendar, + dim="time", + align_on=None, + missing=None, + use_cftime=None, +): + """Transform a time-indexed Dataset or DataArray to one that uses another calendar. + + This function only converts the individual timestamps; it does not modify any + data except in dropping invalid/surplus dates, or inserting values for missing dates. + + If the source and target calendars are both from a standard type, only the + type of the time array is modified. When converting to a calendar with a + leap year from to a calendar without a leap year, the 29th of February will + be removed from the array. In the other direction the 29th of February will + be missing in the output, unless `missing` is specified, in which case that + value is inserted. For conversions involving the `360_day` calendar, see Notes. + + This method is safe to use with sub-daily data as it doesn't touch the time + part of the timestamps. + + Parameters + ---------- + obj : DataArray or Dataset + Input DataArray or Dataset with a time coordinate of a valid dtype + (:py:class:`numpy.datetime64` or :py:class:`cftime.datetime`). + calendar : str + The target calendar name. + dim : str + Name of the time coordinate in the input DataArray or Dataset. + align_on : {None, 'date', 'year'} + Must be specified when either the source or target is a `"360_day"` + calendar; ignored otherwise. See Notes. + missing : any, optional + By default, i.e. if the value is None, this method will simply attempt + to convert the dates in the source calendar to the same dates in the + target calendar, and drop any of those that are not possible to + represent. If a value is provided, a new time coordinate will be + created in the target calendar with the same frequency as the original + time coordinate; for any dates that are not present in the source, the + data will be filled with this value. Note that using this mode requires + that the source data have an inferable frequency; for more information + see :py:func:`xarray.infer_freq`. For certain frequency, source, and + target calendar combinations, this could result in many missing values, see notes. + use_cftime : bool, optional + Whether to use cftime objects in the output, only used if `calendar` is + one of {"proleptic_gregorian", "gregorian" or "standard"}. + If True, the new time axis uses cftime objects. + If None (default), it uses :py:class:`numpy.datetime64` values if the date + range permits it, and :py:class:`cftime.datetime` objects if not. + If False, it uses :py:class:`numpy.datetime64` or fails. + + Returns + ------- + Copy of source with the time coordinate converted to the target calendar. + If `missing` was None (default), invalid dates in the new calendar are + dropped, but missing dates are not inserted. + If `missing` was given, the new data is reindexed to have a time axis + with the same frequency as the source, but in the new calendar; any + missing datapoints are filled with `missing`. + + Notes + ----- + Passing a value to `missing` is only usable if the source's time coordinate as an + inferrable frequencies (see :py:func:`~xarray.infer_freq`) and is only appropriate + if the target coordinate, generated from this frequency, has dates equivalent to the + source. It is usually **not** appropriate to use this mode with: + + - Period-end frequencies: 'A', 'Y', 'Q' or 'M', in opposition to 'AS' 'YS', 'QS' and 'MS' + - Sub-monthly frequencies that do not divide a day evenly: 'W', 'nD' where `n != 1` + or 'mH' where 24 % m != 0). + + If one of the source or target calendars is `"360_day"`, `align_on` must + be specified and two options are offered. + + "year" + The dates are translated according to their relative position in the year, + ignoring their original month and day information, meaning that the + missing/surplus days are added/removed at regular intervals. + + From a `360_day` to a standard calendar, the output will be missing the + following dates (day of year in parentheses): + To a leap year: + January 31st (31), March 31st (91), June 1st (153), July 31st (213), + September 31st (275) and November 30th (335). + To a non-leap year: + February 6th (36), April 19th (109), July 2nd (183), + September 12th (255), November 25th (329). + + From a standard calendar to a `"360_day"`, the following dates in the + source array will be dropped: + From a leap year: + January 31st (31), April 1st (92), June 1st (153), August 1st (214), + September 31st (275), December 1st (336) + From a non-leap year: + February 6th (37), April 20th (110), July 2nd (183), + September 13th (256), November 25th (329) + + This option is best used on daily and subdaily data. + + "date" + The month/day information is conserved and invalid dates are dropped + from the output. This means that when converting from a `"360_day"` to a + standard calendar, all 31sts (Jan, March, May, July, August, October and + December) will be missing as there is no equivalent dates in the + `"360_day"` calendar and the 29th (on non-leap years) and 30th of February + will be dropped as there are no equivalent dates in a standard calendar. + + This option is best used with data on a frequency coarser than daily. + """ + from ..core.dataarray import DataArray + + time = obj[dim] + if not _contains_datetime_like_objects(time): + raise ValueError(f"Coordinate {dim} must contain datetime objects.") + + use_cftime = _should_cftime_be_used(time, calendar, use_cftime) + + source_calendar = time.dt.calendar + # Do nothing if request calendar is the same as the source + # AND source is np XOR use_cftime + if source_calendar == calendar and is_np_datetime_like(time.dtype) ^ use_cftime: + return obj + + if (time.dt.year == 0).any() and calendar in _CALENDARS_WITHOUT_YEAR_ZERO: + raise ValueError( + f"Source time coordinate contains dates with year 0, which is not supported by target calendar {calendar}." + ) + + if (source_calendar == "360_day" or calendar == "360_day") and align_on is None: + raise ValueError( + "Argument `align_on` must be specified with either 'date' or " + "'year' when converting to or from a '360_day' calendar." + ) + + if source_calendar != "360_day" and calendar != "360_day": + align_on = "date" + + out = obj.copy() + + if align_on == "year": + # Special case for conversion involving 360_day calendar + # Instead of translating dates directly, this tries to keep the position within a year similar. + + new_doy = time.groupby(f"{dim}.year").map( + _interpolate_day_of_year, target_calendar=calendar, use_cftime=use_cftime + ) + + # Convert the source datetimes, but override the day of year with our new day of years. + out[dim] = DataArray( + [ + _convert_to_new_calendar_with_new_day_of_year( + date, newdoy, calendar, use_cftime + ) + for date, newdoy in zip(time.variable._data.array, new_doy) + ], + dims=(dim,), + name=dim, + ) + # Remove duplicate timestamps, happens when reducing the number of days + out = out.isel({dim: np.unique(out[dim], return_index=True)[1]}) + elif align_on == "date": + new_times = convert_times( + time.data, + get_date_type(calendar, use_cftime=use_cftime), + raise_on_invalid=False, + ) + out[dim] = new_times + + # Remove NaN that where put on invalid dates in target calendar + out = out.where(out[dim].notnull(), drop=True) + + if missing is not None: + time_target = date_range_like(time, calendar=calendar, use_cftime=use_cftime) + out = out.reindex({dim: time_target}, fill_value=missing) + + # Copy attrs but remove `calendar` if still present. + out[dim].attrs.update(time.attrs) + out[dim].attrs.pop("calendar", None) + return out + + +def _interpolate_day_of_year(time, target_calendar, use_cftime): + """Returns the nearest day in the target calendar of the corresponding + "decimal year" in the source calendar. + """ + year = int(time.dt.year[0]) + source_calendar = time.dt.calendar + return np.round( + _days_in_year(year, target_calendar, use_cftime) + * time.dt.dayofyear + / _days_in_year(year, source_calendar, use_cftime) + ).astype(int) + + +def _convert_to_new_calendar_with_new_day_of_year( + date, day_of_year, calendar, use_cftime +): + """Convert a datetime object to another calendar with a new day of year. + + Redefines the day of year (and thus ignores the month and day information + from the source datetime). + Nanosecond information is lost as cftime.datetime doesn't support it. + """ + new_date = cftime.num2date( + day_of_year - 1, + f"days since {date.year}-01-01", + calendar=calendar if use_cftime else "standard", + ) + try: + return get_date_type(calendar, use_cftime)( + date.year, + new_date.month, + new_date.day, + date.hour, + date.minute, + date.second, + date.microsecond, + ) + except ValueError: + return np.nan + + +def _datetime_to_decimal_year(times, dim="time", calendar=None): + """Convert a datetime DataArray to decimal years according to its calendar or the given one. + + The decimal year of a timestamp is its year plus its sub-year component + converted to the fraction of its year. + Ex: '2000-03-01 12:00' is 2000.1653 in a standard calendar, + 2000.16301 in a "noleap" or 2000.16806 in a "360_day". + """ + from ..core.dataarray import DataArray + + calendar = calendar or times.dt.calendar + + if is_np_datetime_like(times.dtype): + times = times.copy(data=convert_times(times.values, get_date_type("standard"))) + + def _make_index(time): + year = int(time.dt.year[0]) + doys = cftime.date2num(time, f"days since {year:04d}-01-01", calendar=calendar) + return DataArray( + year + doys / _days_in_year(year, calendar), + dims=(dim,), + coords=time.coords, + name=dim, + ) + + return times.groupby(f"{dim}.year").map(_make_index) + + +def interp_calendar(source, target, dim="time"): + """Interpolates a DataArray or Dataset indexed by a time coordinate to + another calendar based on decimal year measure. + + Each timestamp in `source` and `target` are first converted to their decimal + year equivalent then `source` is interpolated on the target coordinate. + The decimal year of a timestamp is its year plus its sub-year component + converted to the fraction of its year. For example "2000-03-01 12:00" is + 2000.1653 in a standard calendar or 2000.16301 in a `"noleap"` calendar. + + This method should only be used when the time (HH:MM:SS) information of + time coordinate is not important. + + Parameters + ---------- + source: DataArray or Dataset + The source data to interpolate; must have a time coordinate of a valid + dtype (:py:class:`numpy.datetime64` or :py:class:`cftime.datetime` objects) + target: DataArray, DatetimeIndex, or CFTimeIndex + The target time coordinate of a valid dtype (np.datetime64 or cftime objects) + dim : str + The time coordinate name. + + Return + ------ + DataArray or Dataset + The source interpolated on the decimal years of target, + """ + from ..core.dataarray import DataArray + + if isinstance(target, (pd.DatetimeIndex, CFTimeIndex)): + target = DataArray(target, dims=(dim,), name=dim) + + if not _contains_datetime_like_objects( + source[dim] + ) or not _contains_datetime_like_objects(target): + raise ValueError( + f"Both 'source.{dim}' and 'target' must contain datetime objects." + ) + + source_calendar = source[dim].dt.calendar + target_calendar = target.dt.calendar + + if ( + source[dim].time.dt.year == 0 + ).any() and target_calendar in _CALENDARS_WITHOUT_YEAR_ZERO: + raise ValueError( + f"Source time coordinate contains dates with year 0, which is not supported by target calendar {target_calendar}." + ) + + out = source.copy() + out[dim] = _datetime_to_decimal_year(source[dim], dim=dim, calendar=source_calendar) + target_idx = _datetime_to_decimal_year(target, dim=dim, calendar=target_calendar) + out = out.interp(**{dim: target_idx}) + out[dim] = target + return out diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index 729f15bbd50..2db6d4e8097 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -41,15 +41,22 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import re -from datetime import timedelta +from datetime import datetime, timedelta from functools import partial from typing import ClassVar, Optional import numpy as np +import pandas as pd +from ..core.common import _contains_datetime_like_objects, is_np_datetime_like from ..core.pdcompat import count_not_none from .cftimeindex import CFTimeIndex, _parse_iso8601_with_reso -from .times import format_cftime_datetime +from .times import ( + _is_standard_calendar, + _should_cftime_be_used, + convert_time_or_go_back, + format_cftime_datetime, +) try: import cftime @@ -57,11 +64,14 @@ cftime = None -def get_date_type(calendar): +def get_date_type(calendar, use_cftime=True): """Return the cftime date type for a given calendar name.""" if cftime is None: raise ImportError("cftime is required for dates with non-standard calendars") else: + if _is_standard_calendar(calendar) and not use_cftime: + return pd.Timestamp + calendars = { "noleap": cftime.DatetimeNoLeap, "360_day": cftime.Datetime360Day, @@ -700,6 +710,8 @@ def to_cftime_datetime(date_str_or_date, calendar=None): return date elif isinstance(date_str_or_date, cftime.datetime): return date_str_or_date + elif isinstance(date_str_or_date, (datetime, pd.Timestamp)): + return cftime.DatetimeProlepticGregorian(*date_str_or_date.timetuple()) else: raise TypeError( "date_str_or_date must be a string or a " @@ -1009,3 +1021,178 @@ def cftime_range( dates = dates[:-1] return CFTimeIndex(dates, name=name) + + +def date_range( + start=None, + end=None, + periods=None, + freq="D", + tz=None, + normalize=False, + name=None, + closed=None, + calendar="standard", + use_cftime=None, +): + """Return a fixed frequency datetime index. + + The type (:py:class:`xarray.CFTimeIndex` or :py:class:`pandas.DatetimeIndex`) + of the returned index depends on the requested calendar and on `use_cftime`. + + Parameters + ---------- + start : str or datetime-like, optional + Left bound for generating dates. + end : str or datetime-like, optional + Right bound for generating dates. + periods : int, optional + Number of periods to generate. + freq : str or None, default: "D" + Frequency strings can have multiples, e.g. "5H". + tz : str or tzinfo, optional + Time zone name for returning localized DatetimeIndex, for example + 'Asia/Hong_Kong'. By default, the resulting DatetimeIndex is + timezone-naive. Only valid with pandas DatetimeIndex. + normalize : bool, default: False + Normalize start/end dates to midnight before generating date range. + name : str, default: None + Name of the resulting index + closed : {"left", "right"} or None, default: None + Make the interval closed with respect to the given frequency to the + "left", "right", or both sides (None). + calendar : str, default: "standard" + Calendar type for the datetimes. + use_cftime : boolean, optional + If True, always return a CFTimeIndex. + If False, return a pd.DatetimeIndex if possible or raise a ValueError. + If None (default), return a pd.DatetimeIndex if possible, + otherwise return a CFTimeIndex. Defaults to False if `tz` is not None. + + Returns + ------- + CFTimeIndex or pd.DatetimeIndex + + See also + -------- + pandas.date_range + cftime_range + date_range_like + """ + from .times import _is_standard_calendar + + if tz is not None: + use_cftime = False + + if _is_standard_calendar(calendar) and use_cftime is not True: + try: + return pd.date_range( + start=start, + end=end, + periods=periods, + freq=freq, + tz=tz, + normalize=normalize, + name=name, + closed=closed, + ) + except pd.errors.OutOfBoundsDatetime as err: + if use_cftime is False: + raise ValueError( + "Date range is invalid for pandas DatetimeIndex, try using `use_cftime=True`." + ) from err + elif use_cftime is False: + raise ValueError( + f"Invalid calendar {calendar} for pandas DatetimeIndex, try using `use_cftime=True`." + ) + + return cftime_range( + start=start, + end=end, + periods=periods, + freq=freq, + normalize=normalize, + name=name, + closed=closed, + calendar=calendar, + ) + + +def date_range_like(source, calendar, use_cftime=None): + """Generate a datetime array with the same frequency, start and end as + another one, but in a different calendar. + + Parameters + ---------- + source : DataArray, CFTimeIndex, or pd.DatetimeIndex + 1D datetime array + calendar : str + New calendar name. + use_cftime : bool, optional + If True, the output uses :py:class:`cftime.datetime` objects. + If None (default), :py:class:`numpy.datetime64` values are used if possible. + If False, :py:class:`numpy.datetime64` values are used or an error is raised. + + Returns + ------- + DataArray + 1D datetime coordinate with the same start, end and frequency as the + source, but in the new calendar. The start date is assumed to exist in + the target calendar. If the end date doesn't exist, the code tries 1 + and 2 calendar days before. There is a special case when the source time + series is daily or coarser and the end of the input range is on the + last day of the month. Then the output range will also end on the last + day of the month in the new calendar. + """ + from ..core.dataarray import DataArray + from .frequencies import infer_freq + + if not isinstance(source, (pd.DatetimeIndex, CFTimeIndex)) and ( + isinstance(source, DataArray) + and (source.ndim != 1) + or not _contains_datetime_like_objects(source) + ): + raise ValueError( + "'source' must be a 1D array of datetime objects for inferring its range." + ) + + freq = infer_freq(source) + if freq is None: + raise ValueError( + "`date_range_like` was unable to generate a range as the source frequency was not inferrable." + ) + + use_cftime = _should_cftime_be_used(source, calendar, use_cftime) + + source_start = source.values.min() + source_end = source.values.max() + if is_np_datetime_like(source.dtype): + # We want to use datetime fields (datetime64 object don't have them) + source_calendar = "standard" + source_start = pd.Timestamp(source_start) + source_end = pd.Timestamp(source_end) + else: + if isinstance(source, CFTimeIndex): + source_calendar = source.calendar + else: # DataArray + source_calendar = source.dt.calendar + + if calendar == source_calendar and is_np_datetime_like(source.dtype) ^ use_cftime: + return source + + date_type = get_date_type(calendar, use_cftime) + start = convert_time_or_go_back(source_start, date_type) + end = convert_time_or_go_back(source_end, date_type) + + # For the cases where the source ends on the end of the month, we expect the same in the new calendar. + if source_end.day == source_end.daysinmonth and isinstance( + to_offset(freq), (YearEnd, QuarterEnd, MonthEnd, Day) + ): + end = end.replace(day=end.daysinmonth) + + return date_range( + start=start.isoformat(), + end=end.isoformat(), + freq=freq, + calendar=calendar, + ) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 7d532f8fc38..c89b0c100cd 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -8,8 +8,9 @@ from pandas.errors import OutOfBoundsDatetime from ..core import indexing -from ..core.common import contains_cftime_datetimes +from ..core.common import contains_cftime_datetimes, is_np_datetime_like from ..core.formatting import first_n_items, format_timestamp, last_item +from ..core.pycompat import is_duck_dask_array from ..core.variable import Variable from .variables import ( SerializationWarning, @@ -76,6 +77,26 @@ def _is_standard_calendar(calendar): return calendar.lower() in _STANDARD_CALENDARS +def _is_numpy_compatible_time_range(times): + if is_np_datetime_like(times.dtype): + return True + # times array contains cftime objects + times = np.asarray(times) + tmin = times.min() + tmax = times.max() + try: + convert_time_or_go_back(tmin, pd.Timestamp) + convert_time_or_go_back(tmax, pd.Timestamp) + except pd.errors.OutOfBoundsDatetime: + return False + except ValueError as err: + if err.args[0] == "year 0 is out of range": + return False + raise + else: + return True + + def _netcdf_to_numpy_timeunit(units): units = units.lower() if not units.endswith("s"): @@ -322,10 +343,21 @@ def _infer_time_units_from_diff(unique_timedeltas): def infer_calendar_name(dates): """Given an array of datetimes, infer the CF calendar name""" - if np.asarray(dates).dtype == "datetime64[ns]": + if is_np_datetime_like(dates.dtype): return "proleptic_gregorian" - else: - return np.asarray(dates).ravel()[0].calendar + elif dates.dtype == np.dtype("O") and dates.size > 0: + # Logic copied from core.common.contains_cftime_datetimes. + if cftime is not None: + sample = dates.ravel()[0] + if is_duck_dask_array(sample): + sample = sample.compute() + if isinstance(sample, np.ndarray): + sample = sample.item() + if isinstance(sample, cftime.datetime): + return sample.calendar + + # Error raise if dtype is neither datetime or "O", if cftime is not importable, and if element of 'O' dtype is not cftime. + raise ValueError("Array does not contain datetime objects.") def infer_datetime_units(dates): @@ -373,9 +405,12 @@ def infer_timedelta_units(deltas): return _infer_time_units_from_diff(unique_timedeltas) -def cftime_to_nptime(times): +def cftime_to_nptime(times, raise_on_invalid=True): """Given an array of cftime.datetime objects, return an array of - numpy.datetime64 objects of the same size""" + numpy.datetime64 objects of the same size + + If raise_on_invalid is True (default), invalid dates trigger a ValueError. + Otherwise, the invalid element is replaced by np.NaT.""" times = np.asarray(times) new = np.empty(times.shape, dtype="M8[ns]") for i, t in np.ndenumerate(times): @@ -388,14 +423,125 @@ def cftime_to_nptime(times): t.year, t.month, t.day, t.hour, t.minute, t.second, t.microsecond ) except ValueError as e: - raise ValueError( - "Cannot convert date {} to a date in the " - "standard calendar. Reason: {}.".format(t, e) - ) + if raise_on_invalid: + raise ValueError( + "Cannot convert date {} to a date in the " + "standard calendar. Reason: {}.".format(t, e) + ) + else: + dt = "NaT" new[i] = np.datetime64(dt) return new +def convert_times(times, date_type, raise_on_invalid=True): + """Given an array of datetimes, return the same dates in another cftime or numpy date type. + + Useful to convert between calendars in numpy and cftime or between cftime calendars. + + If raise_on_valid is True (default), invalid dates trigger a ValueError. + Otherwise, the invalid element is replaced by np.NaN for cftime types and np.NaT for np.datetime64. + """ + if date_type in (pd.Timestamp, np.datetime64) and not is_np_datetime_like( + times.dtype + ): + return cftime_to_nptime(times, raise_on_invalid=raise_on_invalid) + if is_np_datetime_like(times.dtype): + # Convert datetime64 objects to Timestamps since those have year, month, day, etc. attributes + times = pd.DatetimeIndex(times) + new = np.empty(times.shape, dtype="O") + for i, t in enumerate(times): + try: + dt = date_type( + t.year, t.month, t.day, t.hour, t.minute, t.second, t.microsecond + ) + except ValueError as e: + if raise_on_invalid: + raise ValueError( + "Cannot convert date {} to a date in the " + "{} calendar. Reason: {}.".format( + t, date_type(2000, 1, 1).calendar, e + ) + ) + else: + dt = np.NaN + + new[i] = dt + return new + + +def convert_time_or_go_back(date, date_type): + """Convert a single date to a new date_type (cftime.datetime or pd.Timestamp). + + If the new date is invalid, it goes back a day and tries again. If it is still + invalid, goes back a second day. + + This is meant to convert end-of-month dates into a new calendar. + """ + try: + return date_type( + date.year, + date.month, + date.day, + date.hour, + date.minute, + date.second, + date.microsecond, + ) + except OutOfBoundsDatetime: + raise + except ValueError: + # Day is invalid, happens at the end of months, try again the day before + try: + return date_type( + date.year, + date.month, + date.day - 1, + date.hour, + date.minute, + date.second, + date.microsecond, + ) + except ValueError: + # Still invalid, happens for 360_day to non-leap february. Try again 2 days before date. + return date_type( + date.year, + date.month, + date.day - 2, + date.hour, + date.minute, + date.second, + date.microsecond, + ) + + +def _should_cftime_be_used(source, target_calendar, use_cftime): + """Return whether conversion of the source to the target calendar should + result in a cftime-backed array. + + Source is a 1D datetime array, target_cal a string (calendar name) and + use_cftime is a boolean or None. If use_cftime is None, this returns True + if the source's range and target calendar are convertible to np.datetime64 objects. + """ + # Arguments Checks for target + if use_cftime is not True: + if _is_standard_calendar(target_calendar): + if _is_numpy_compatible_time_range(source): + # Conversion is possible with pandas, force False if it was None + use_cftime = False + elif use_cftime is False: + raise ValueError( + "Source time range is not valid for numpy datetimes. Try using `use_cftime=True`." + ) + elif use_cftime is False: + raise ValueError( + f"Calendar '{target_calendar}' is only valid with cftime. Try using `use_cftime=True`." + ) + else: + use_cftime = True + return use_cftime + + def _cleanup_netcdf_time_units(units): delta, ref_date = _unpack_netcdf_time_units(units) try: diff --git a/xarray/core/accessor_dt.py b/xarray/core/accessor_dt.py index 2cdd467bdf3..2a7b6200d3b 100644 --- a/xarray/core/accessor_dt.py +++ b/xarray/core/accessor_dt.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd +from ..coding.times import infer_calendar_name from .common import ( _contains_datetime_like_objects, is_np_datetime_like, @@ -440,6 +441,15 @@ def weekofyear(self): "is_leap_year", "Boolean indicator if the date belongs to a leap year.", bool ) + @property + def calendar(self): + """The name of the calendar of the dates. + + Only relevant for arrays of :py:class:`cftime.datetime` objects, + returns "proleptic_gregorian" for arrays of :py:class:`numpy.datetime64` values. + """ + return infer_calendar_name(self._obj.data) + class TimedeltaAccessor(Properties): """Access Timedelta fields for DataArrays with Timedelta-like dtypes. diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 05d06400f2e..0a99d898c3a 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -21,6 +21,8 @@ import numpy as np import pandas as pd +from ..coding.calendar_ops import convert_calendar, interp_calendar +from ..coding.cftimeindex import CFTimeIndex from ..plot.plot import _PlotMethods from ..plot.utils import _get_units_from_attrs from . import ( @@ -4656,6 +4658,160 @@ def drop_duplicates( indexes = {dim: ~self.get_index(dim).duplicated(keep=keep)} return self.isel(indexes) + def convert_calendar( + self, + calendar: str, + dim: str = "time", + align_on: Optional[str] = None, + missing: Optional[Any] = None, + use_cftime: Optional[bool] = None, + ) -> "DataArray": + """Convert the DataArray to another calendar. + + Only converts the individual timestamps, does not modify any data except + in dropping invalid/surplus dates or inserting missing dates. + + If the source and target calendars are either no_leap, all_leap or a + standard type, only the type of the time array is modified. + When converting to a leap year from a non-leap year, the 29th of February + is removed from the array. In the other direction the 29th of February + will be missing in the output, unless `missing` is specified, + in which case that value is inserted. + + For conversions involving `360_day` calendars, see Notes. + + This method is safe to use with sub-daily data as it doesn't touch the + time part of the timestamps. + + Parameters + --------- + calendar : str + The target calendar name. + dim : str + Name of the time coordinate. + align_on : {None, 'date', 'year'} + Must be specified when either source or target is a `360_day` calendar, + ignored otherwise. See Notes. + missing : Optional[any] + By default, i.e. if the value is None, this method will simply attempt + to convert the dates in the source calendar to the same dates in the + target calendar, and drop any of those that are not possible to + represent. If a value is provided, a new time coordinate will be + created in the target calendar with the same frequency as the original + time coordinate; for any dates that are not present in the source, the + data will be filled with this value. Note that using this mode requires + that the source data have an inferable frequency; for more information + see :py:func:`xarray.infer_freq`. For certain frequency, source, and + target calendar combinations, this could result in many missing values, see notes. + use_cftime : boolean, optional + Whether to use cftime objects in the output, only used if `calendar` + is one of {"proleptic_gregorian", "gregorian" or "standard"}. + If True, the new time axis uses cftime objects. + If None (default), it uses :py:class:`numpy.datetime64` values if the + date range permits it, and :py:class:`cftime.datetime` objects if not. + If False, it uses :py:class:`numpy.datetime64` or fails. + + Returns + ------- + DataArray + Copy of the dataarray with the time coordinate converted to the + target calendar. If 'missing' was None (default), invalid dates in + the new calendar are dropped, but missing dates are not inserted. + If `missing` was given, the new data is reindexed to have a time axis + with the same frequency as the source, but in the new calendar; any + missing datapoints are filled with `missing`. + + Notes + ----- + Passing a value to `missing` is only usable if the source's time coordinate as an + inferrable frequencies (see :py:func:`~xarray.infer_freq`) and is only appropriate + if the target coordinate, generated from this frequency, has dates equivalent to the + source. It is usually **not** appropriate to use this mode with: + + - Period-end frequencies : 'A', 'Y', 'Q' or 'M', in opposition to 'AS' 'YS', 'QS' and 'MS' + - Sub-monthly frequencies that do not divide a day evenly : 'W', 'nD' where `N != 1` + or 'mH' where 24 % m != 0). + + If one of the source or target calendars is `"360_day"`, `align_on` must + be specified and two options are offered. + + - "year" + The dates are translated according to their relative position in the year, + ignoring their original month and day information, meaning that the + missing/surplus days are added/removed at regular intervals. + + From a `360_day` to a standard calendar, the output will be missing the + following dates (day of year in parentheses): + + To a leap year: + January 31st (31), March 31st (91), June 1st (153), July 31st (213), + September 31st (275) and November 30th (335). + To a non-leap year: + February 6th (36), April 19th (109), July 2nd (183), + September 12th (255), November 25th (329). + + From a standard calendar to a `"360_day"`, the following dates in the + source array will be dropped: + + From a leap year: + January 31st (31), April 1st (92), June 1st (153), August 1st (214), + September 31st (275), December 1st (336) + From a non-leap year: + February 6th (37), April 20th (110), July 2nd (183), + September 13th (256), November 25th (329) + + This option is best used on daily and subdaily data. + + - "date" + The month/day information is conserved and invalid dates are dropped + from the output. This means that when converting from a `"360_day"` to a + standard calendar, all 31st (Jan, March, May, July, August, October and + December) will be missing as there is no equivalent dates in the + `"360_day"` calendar and the 29th (on non-leap years) and 30th of February + will be dropped as there are no equivalent dates in a standard calendar. + + This option is best used with data on a frequency coarser than daily. + """ + return convert_calendar( + self, + calendar, + dim=dim, + align_on=align_on, + missing=missing, + use_cftime=use_cftime, + ) + + def interp_calendar( + self, + target: Union[pd.DatetimeIndex, CFTimeIndex, "DataArray"], + dim: str = "time", + ) -> "DataArray": + """Interpolates the DataArray to another calendar based on decimal year measure. + + Each timestamp in `source` and `target` are first converted to their decimal + year equivalent then `source` is interpolated on the target coordinate. + The decimal year of a timestamp is its year plus its sub-year component + converted to the fraction of its year. For example "2000-03-01 12:00" is + 2000.1653 in a standard calendar or 2000.16301 in a `"noleap"` calendar. + + This method should only be used when the time (HH:MM:SS) information of + time coordinate is not important. + + Parameters + ---------- + target: DataArray or DatetimeIndex or CFTimeIndex + The target time coordinate of a valid dtype + (np.datetime64 or cftime objects) + dim : str + The time coordinate name. + + Return + ------ + DataArray + The source interpolated on the decimal years of target, + """ + return interp_calendar(self, target, dim=dim) + # this needs to be at the end, or mypy will confuse with `str` # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names str = utils.UncachedAccessor(StringAccessor) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 3fbc6154c5d..360ffd42d54 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -35,7 +35,8 @@ import xarray as xr -from ..coding.cftimeindex import _parse_array_of_cftime_strings +from ..coding.calendar_ops import convert_calendar, interp_calendar +from ..coding.cftimeindex import CFTimeIndex, _parse_array_of_cftime_strings from ..plot.dataset_plot import _Dataset_PlotMethods from . import ( alignment, @@ -7731,3 +7732,157 @@ def _wrapper(Y, *coords_, **kwargs): result.attrs = self.attrs.copy() return result + + def convert_calendar( + self, + calendar: str, + dim: str = "time", + align_on: Optional[str] = None, + missing: Optional[Any] = None, + use_cftime: Optional[bool] = None, + ) -> "Dataset": + """Convert the Dataset to another calendar. + + Only converts the individual timestamps, does not modify any data except + in dropping invalid/surplus dates or inserting missing dates. + + If the source and target calendars are either no_leap, all_leap or a + standard type, only the type of the time array is modified. + When converting to a leap year from a non-leap year, the 29th of February + is removed from the array. In the other direction the 29th of February + will be missing in the output, unless `missing` is specified, + in which case that value is inserted. + + For conversions involving `360_day` calendars, see Notes. + + This method is safe to use with sub-daily data as it doesn't touch the + time part of the timestamps. + + Parameters + --------- + calendar : str + The target calendar name. + dim : str + Name of the time coordinate. + align_on : {None, 'date', 'year'} + Must be specified when either source or target is a `360_day` calendar, + ignored otherwise. See Notes. + missing : Optional[any] + By default, i.e. if the value is None, this method will simply attempt + to convert the dates in the source calendar to the same dates in the + target calendar, and drop any of those that are not possible to + represent. If a value is provided, a new time coordinate will be + created in the target calendar with the same frequency as the original + time coordinate; for any dates that are not present in the source, the + data will be filled with this value. Note that using this mode requires + that the source data have an inferable frequency; for more information + see :py:func:`xarray.infer_freq`. For certain frequency, source, and + target calendar combinations, this could result in many missing values, see notes. + use_cftime : boolean, optional + Whether to use cftime objects in the output, only used if `calendar` + is one of {"proleptic_gregorian", "gregorian" or "standard"}. + If True, the new time axis uses cftime objects. + If None (default), it uses :py:class:`numpy.datetime64` values if the + date range permits it, and :py:class:`cftime.datetime` objects if not. + If False, it uses :py:class:`numpy.datetime64` or fails. + + Returns + ------- + Dataset + Copy of the dataarray with the time coordinate converted to the + target calendar. If 'missing' was None (default), invalid dates in + the new calendar are dropped, but missing dates are not inserted. + If `missing` was given, the new data is reindexed to have a time axis + with the same frequency as the source, but in the new calendar; any + missing datapoints are filled with `missing`. + + Notes + ----- + Passing a value to `missing` is only usable if the source's time coordinate as an + inferrable frequencies (see :py:func:`~xarray.infer_freq`) and is only appropriate + if the target coordinate, generated from this frequency, has dates equivalent to the + source. It is usually **not** appropriate to use this mode with: + + - Period-end frequencies : 'A', 'Y', 'Q' or 'M', in opposition to 'AS' 'YS', 'QS' and 'MS' + - Sub-monthly frequencies that do not divide a day evenly : 'W', 'nD' where `N != 1` + or 'mH' where 24 % m != 0). + + If one of the source or target calendars is `"360_day"`, `align_on` must + be specified and two options are offered. + + - "year" + The dates are translated according to their relative position in the year, + ignoring their original month and day information, meaning that the + missing/surplus days are added/removed at regular intervals. + + From a `360_day` to a standard calendar, the output will be missing the + following dates (day of year in parentheses): + + To a leap year: + January 31st (31), March 31st (91), June 1st (153), July 31st (213), + September 31st (275) and November 30th (335). + To a non-leap year: + February 6th (36), April 19th (109), July 2nd (183), + September 12th (255), November 25th (329). + + From a standard calendar to a `"360_day"`, the following dates in the + source array will be dropped: + + From a leap year: + January 31st (31), April 1st (92), June 1st (153), August 1st (214), + September 31st (275), December 1st (336) + From a non-leap year: + February 6th (37), April 20th (110), July 2nd (183), + September 13th (256), November 25th (329) + + This option is best used on daily and subdaily data. + + - "date" + The month/day information is conserved and invalid dates are dropped + from the output. This means that when converting from a `"360_day"` to a + standard calendar, all 31st (Jan, March, May, July, August, October and + December) will be missing as there is no equivalent dates in the + `"360_day"` calendar and the 29th (on non-leap years) and 30th of February + will be dropped as there are no equivalent dates in a standard calendar. + + This option is best used with data on a frequency coarser than daily. + """ + return convert_calendar( + self, + calendar, + dim=dim, + align_on=align_on, + missing=missing, + use_cftime=use_cftime, + ) + + def interp_calendar( + self, + target: Union[pd.DatetimeIndex, CFTimeIndex, "DataArray"], + dim: str = "time", + ) -> "Dataset": + """Interpolates the Dataset to another calendar based on decimal year measure. + + Each timestamp in `source` and `target` are first converted to their decimal + year equivalent then `source` is interpolated on the target coordinate. + The decimal year of a timestamp is its year plus its sub-year component + converted to the fraction of its year. For example "2000-03-01 12:00" is + 2000.1653 in a standard calendar or 2000.16301 in a `"noleap"` calendar. + + This method should only be used when the time (HH:MM:SS) information of + time coordinate is not important. + + Parameters + ---------- + target: DataArray or DatetimeIndex or CFTimeIndex + The target time coordinate of a valid dtype + (np.datetime64 or cftime objects) + dim : str + The time coordinate name. + + Return + ------ + DataArray + The source interpolated on the decimal years of target, + """ + return interp_calendar(self, target, dim=dim) diff --git a/xarray/tests/test_accessor_dt.py b/xarray/tests/test_accessor_dt.py index bdfe25b3c5c..b471bd2e267 100644 --- a/xarray/tests/test_accessor_dt.py +++ b/xarray/tests/test_accessor_dt.py @@ -105,6 +105,10 @@ def test_isocalendar(self, field, pandas_field) -> None: actual = self.data.time.dt.isocalendar()[field] assert_equal(expected, actual) + def test_calendar(self) -> None: + cal = self.data.time.dt.calendar + assert cal == "proleptic_gregorian" + def test_strftime(self) -> None: assert ( "2000-01-01 01:00:00" == self.data.time.dt.strftime("%Y-%m-%d %H:%M:%S")[1] @@ -409,6 +413,52 @@ def test_field_access(data, field) -> None: assert_equal(result, expected) +@requires_cftime +def test_calendar_cftime(data) -> None: + expected = data.time.values[0].calendar + assert data.time.dt.calendar == expected + + +@requires_cftime +def test_calendar_cftime_2D(data) -> None: + # 2D np datetime: + data = xr.DataArray( + np.random.randint(1, 1000000, size=(4, 5)).astype(" None: + import dask.array as da + + # 3D lazy dask - np + data = xr.DataArray( + da.random.random_integers(1, 1000000, size=(4, 5, 6)).astype(" None: + from cftime import num2date + + # 3D lazy dask + data = xr.DataArray( + num2date( + np.random.randint(1, 1000000, size=(4, 5, 6)), + "hours since 1970-01-01T00:00", + calendar="noleap", + ), + dims=("x", "y", "z"), + ).chunk() + with raise_if_dask_computes(max_computes=2): + assert data.dt.calendar == "noleap" + + @requires_cftime def test_isocalendar_cftime(data) -> None: diff --git a/xarray/tests/test_calendar_ops.py b/xarray/tests/test_calendar_ops.py new file mode 100644 index 00000000000..8d1ddcf4689 --- /dev/null +++ b/xarray/tests/test_calendar_ops.py @@ -0,0 +1,246 @@ +import numpy as np +import pytest + +from xarray import DataArray, infer_freq +from xarray.coding.calendar_ops import convert_calendar, interp_calendar +from xarray.coding.cftime_offsets import date_range +from xarray.testing import assert_identical + +from . import requires_cftime + +cftime = pytest.importorskip("cftime") + + +@pytest.mark.parametrize( + "source, target, use_cftime, freq", + [ + ("standard", "noleap", None, "D"), + ("noleap", "proleptic_gregorian", True, "D"), + ("noleap", "all_leap", None, "D"), + ("all_leap", "proleptic_gregorian", False, "4H"), + ], +) +def test_convert_calendar(source, target, use_cftime, freq): + src = DataArray( + date_range("2004-01-01", "2004-12-31", freq=freq, calendar=source), + dims=("time",), + name="time", + ) + da_src = DataArray( + np.linspace(0, 1, src.size), dims=("time",), coords={"time": src} + ) + + conv = convert_calendar(da_src, target, use_cftime=use_cftime) + + assert conv.time.dt.calendar == target + + if source != "noleap": + expected_times = date_range( + "2004-01-01", + "2004-12-31", + freq=freq, + use_cftime=use_cftime, + calendar=target, + ) + else: + expected_times_pre_leap = date_range( + "2004-01-01", + "2004-02-28", + freq=freq, + use_cftime=use_cftime, + calendar=target, + ) + expected_times_post_leap = date_range( + "2004-03-01", + "2004-12-31", + freq=freq, + use_cftime=use_cftime, + calendar=target, + ) + expected_times = expected_times_pre_leap.append(expected_times_post_leap) + np.testing.assert_array_equal(conv.time, expected_times) + + +@pytest.mark.parametrize( + "source,target,freq", + [ + ("standard", "360_day", "D"), + ("360_day", "proleptic_gregorian", "D"), + ("proleptic_gregorian", "360_day", "4H"), + ], +) +@pytest.mark.parametrize("align_on", ["date", "year"]) +def test_convert_calendar_360_days(source, target, freq, align_on): + src = DataArray( + date_range("2004-01-01", "2004-12-30", freq=freq, calendar=source), + dims=("time",), + name="time", + ) + da_src = DataArray( + np.linspace(0, 1, src.size), dims=("time",), coords={"time": src} + ) + + conv = convert_calendar(da_src, target, align_on=align_on) + + assert conv.time.dt.calendar == target + + if align_on == "date": + np.testing.assert_array_equal( + conv.time.resample(time="M").last().dt.day, + [30, 29, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30], + ) + elif target == "360_day": + np.testing.assert_array_equal( + conv.time.resample(time="M").last().dt.day, + [30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29], + ) + else: + np.testing.assert_array_equal( + conv.time.resample(time="M").last().dt.day, + [30, 29, 30, 30, 31, 30, 30, 31, 30, 31, 29, 31], + ) + if source == "360_day" and align_on == "year": + assert conv.size == 360 if freq == "D" else 360 * 4 + else: + assert conv.size == 359 if freq == "D" else 359 * 4 + + +@requires_cftime +@pytest.mark.parametrize( + "source,target,freq", + [ + ("standard", "noleap", "D"), + ("noleap", "proleptic_gregorian", "4H"), + ("noleap", "all_leap", "M"), + ("360_day", "noleap", "D"), + ("noleap", "360_day", "D"), + ], +) +def test_convert_calendar_missing(source, target, freq): + src = DataArray( + date_range( + "2004-01-01", + "2004-12-31" if source != "360_day" else "2004-12-30", + freq=freq, + calendar=source, + ), + dims=("time",), + name="time", + ) + da_src = DataArray( + np.linspace(0, 1, src.size), dims=("time",), coords={"time": src} + ) + out = convert_calendar(da_src, target, missing=np.nan, align_on="date") + assert infer_freq(out.time) == freq + + expected = date_range( + "2004-01-01", + "2004-12-31" if target != "360_day" else "2004-12-30", + freq=freq, + calendar=target, + ) + np.testing.assert_array_equal(out.time, expected) + + if freq != "M": + out_without_missing = convert_calendar(da_src, target, align_on="date") + expected_nan = out.isel(time=~out.time.isin(out_without_missing.time)) + assert expected_nan.isnull().all() + + expected_not_nan = out.sel(time=out_without_missing.time) + assert_identical(expected_not_nan, out_without_missing) + + +@requires_cftime +def test_convert_calendar_errors(): + src_nl = DataArray( + date_range("0000-01-01", "0000-12-31", freq="D", calendar="noleap"), + dims=("time",), + name="time", + ) + # no align_on for conversion to 360_day + with pytest.raises(ValueError, match="Argument `align_on` must be specified"): + convert_calendar(src_nl, "360_day") + + # Standard doesn't suuport year 0 + with pytest.raises( + ValueError, match="Source time coordinate contains dates with year 0" + ): + convert_calendar(src_nl, "standard") + + # no align_on for conversion from 360 day + src_360 = convert_calendar(src_nl, "360_day", align_on="year") + with pytest.raises(ValueError, match="Argument `align_on` must be specified"): + convert_calendar(src_360, "noleap") + + # Datetime objects + da = DataArray([0, 1, 2], dims=("x",), name="x") + with pytest.raises(ValueError, match="Coordinate x must contain datetime objects."): + convert_calendar(da, "standard", dim="x") + + +def test_convert_calendar_same_calendar(): + src = DataArray( + date_range("2000-01-01", periods=12, freq="6H", use_cftime=False), + dims=("time",), + name="time", + ) + out = convert_calendar(src, "proleptic_gregorian") + assert src is out + + +@pytest.mark.parametrize( + "source,target", + [ + ("standard", "noleap"), + ("noleap", "proleptic_gregorian"), + ("standard", "360_day"), + ("360_day", "proleptic_gregorian"), + ("noleap", "all_leap"), + ("360_day", "noleap"), + ], +) +def test_interp_calendar(source, target): + src = DataArray( + date_range("2004-01-01", "2004-07-30", freq="D", calendar=source), + dims=("time",), + name="time", + ) + tgt = DataArray( + date_range("2004-01-01", "2004-07-30", freq="D", calendar=target), + dims=("time",), + name="time", + ) + da_src = DataArray( + np.linspace(0, 1, src.size), dims=("time",), coords={"time": src} + ) + conv = interp_calendar(da_src, tgt) + + assert_identical(tgt.time, conv.time) + + np.testing.assert_almost_equal(conv.max(), 1, 2) + assert conv.min() == 0 + + +@requires_cftime +def test_interp_calendar_errors(): + src_nl = DataArray( + [1] * 100, + dims=("time",), + coords={ + "time": date_range("0000-01-01", periods=100, freq="MS", calendar="noleap") + }, + ) + tgt_360 = date_range("0001-01-01", "0001-12-30", freq="MS", calendar="standard") + + with pytest.raises( + ValueError, match="Source time coordinate contains dates with year 0" + ): + interp_calendar(src_nl, tgt_360) + + da1 = DataArray([0, 1, 2], dims=("x",), name="x") + da2 = da1 + 1 + + with pytest.raises( + ValueError, match="Both 'source.x' and 'target' must contain datetime objects." + ): + interp_calendar(da1, da2, dim="x") diff --git a/xarray/tests/test_cftime_offsets.py b/xarray/tests/test_cftime_offsets.py index 6d2d9907627..061c1420aba 100644 --- a/xarray/tests/test_cftime_offsets.py +++ b/xarray/tests/test_cftime_offsets.py @@ -22,11 +22,16 @@ YearEnd, _days_in_month, cftime_range, + date_range, + date_range_like, get_date_type, to_cftime_datetime, to_offset, ) -from xarray.tests import _CFTIME_CALENDARS +from xarray.coding.frequencies import infer_freq +from xarray.core.dataarray import DataArray + +from . import _CFTIME_CALENDARS, requires_cftime cftime = pytest.importorskip("cftime") @@ -1217,3 +1222,108 @@ def test_cftime_range_standard_calendar_refers_to_gregorian(): (result,) = cftime_range("2000", periods=1) assert isinstance(result, DatetimeGregorian) + + +@pytest.mark.parametrize( + "start,calendar,use_cftime,expected_type", + [ + ("1990-01-01", "standard", None, pd.DatetimeIndex), + ("1990-01-01", "proleptic_gregorian", True, CFTimeIndex), + ("1990-01-01", "noleap", None, CFTimeIndex), + ("1990-01-01", "gregorian", False, pd.DatetimeIndex), + ("1400-01-01", "standard", None, CFTimeIndex), + ("3400-01-01", "standard", None, CFTimeIndex), + ], +) +def test_date_range(start, calendar, use_cftime, expected_type): + dr = date_range( + start, periods=14, freq="D", calendar=calendar, use_cftime=use_cftime + ) + + assert isinstance(dr, expected_type) + + +def test_date_range_errors(): + with pytest.raises(ValueError, match="Date range is invalid"): + date_range( + "1400-01-01", periods=1, freq="D", calendar="standard", use_cftime=False + ) + + with pytest.raises(ValueError, match="Date range is invalid"): + date_range( + "2480-01-01", + periods=1, + freq="D", + calendar="proleptic_gregorian", + use_cftime=False, + ) + + with pytest.raises(ValueError, match="Invalid calendar "): + date_range( + "1900-01-01", periods=1, freq="D", calendar="noleap", use_cftime=False + ) + + +@requires_cftime +@pytest.mark.parametrize( + "start,freq,cal_src,cal_tgt,use_cftime,exp0,exp_pd", + [ + ("2020-02-01", "4M", "standard", "noleap", None, "2020-02-28", False), + ("2020-02-01", "M", "noleap", "gregorian", True, "2020-02-29", True), + ("2020-02-28", "3H", "all_leap", "gregorian", False, "2020-02-28", True), + ("2020-03-30", "M", "360_day", "gregorian", False, "2020-03-31", True), + ("2020-03-31", "M", "gregorian", "360_day", None, "2020-03-30", False), + ], +) +def test_date_range_like(start, freq, cal_src, cal_tgt, use_cftime, exp0, exp_pd): + source = date_range(start, periods=12, freq=freq, calendar=cal_src) + + out = date_range_like(source, cal_tgt, use_cftime=use_cftime) + + assert len(out) == 12 + assert infer_freq(out) == freq + + assert out[0].isoformat().startswith(exp0) + + if exp_pd: + assert isinstance(out, pd.DatetimeIndex) + else: + assert isinstance(out, CFTimeIndex) + assert out.calendar == cal_tgt + + +def test_date_range_like_same_calendar(): + src = date_range("2000-01-01", periods=12, freq="6H", use_cftime=False) + out = date_range_like(src, "standard", use_cftime=False) + assert src is out + + +def test_date_range_like_errors(): + src = date_range("1899-02-03", periods=20, freq="D", use_cftime=False) + src = src[np.arange(20) != 10] # Remove 1 day so the frequency is not inferrable. + + with pytest.raises( + ValueError, + match="`date_range_like` was unable to generate a range as the source frequency was not inferrable.", + ): + date_range_like(src, "gregorian") + + src = DataArray( + np.array( + [["1999-01-01", "1999-01-02"], ["1999-01-03", "1999-01-04"]], + dtype=np.datetime64, + ), + dims=("x", "y"), + ) + with pytest.raises( + ValueError, + match="'source' must be a 1D array of datetime objects for inferring its range.", + ): + date_range_like(src, "noleap") + + da = DataArray([1, 2, 3, 4], dims=("time",)) + with pytest.raises( + ValueError, + match="'source' must be a 1D array of datetime objects for inferring its range.", + ): + date_range_like(da, "noleap") diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 930677f75f4..2e19ddb3a75 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -18,6 +18,7 @@ ) from xarray.coding.times import ( _encode_datetime_with_cftime, + _should_cftime_be_used, cftime_to_nptime, decode_cf_datetime, encode_cf_datetime, @@ -1107,3 +1108,21 @@ def test_decode_encode_roundtrip_with_non_lowercase_letters(calendar) -> None: # original form throughout the roundtripping process, uppercase letters and # all. assert_identical(variable, encoded) + + +@requires_cftime +def test_should_cftime_be_used_source_outside_range(): + src = cftime_range("1000-01-01", periods=100, freq="MS", calendar="noleap") + with pytest.raises( + ValueError, match="Source time range is not valid for numpy datetimes." + ): + _should_cftime_be_used(src, "standard", False) + + +@requires_cftime +def test_should_cftime_be_used_target_not_npable(): + src = cftime_range("2000-01-01", periods=100, freq="MS", calendar="noleap") + with pytest.raises( + ValueError, match="Calendar 'noleap' is only valid with cftime." + ): + _should_cftime_be_used(src, "noleap", False)