diff --git a/.travis.yml b/.travis.yml index 9a39105a96d..259b3b548e2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -93,9 +93,9 @@ install: - python xarray/util/print_versions.py script: + - git diff upstream/master xarray/**/*py | flake8 --diff --exit-zero || true - python -OO -c "import xarray" - py.test xarray --cov=xarray --cov-config ci/.coveragerc --cov-report term-missing --verbose $EXTRA_FLAGS - - git diff upstream/master **/*py | flake8 --diff --exit-zero || true after_success: - coveralls diff --git a/asv_bench/benchmarks/dataarray_missing.py b/asv_bench/benchmarks/dataarray_missing.py new file mode 100644 index 00000000000..c6aa8f428bd --- /dev/null +++ b/asv_bench/benchmarks/dataarray_missing.py @@ -0,0 +1,73 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import pandas as pd + +try: + import dask +except ImportError: + pass + +import xarray as xr + +from . import randn, requires_dask + + +def make_bench_data(shape, frac_nan, chunks): + vals = randn(shape, frac_nan) + coords = {'time': pd.date_range('2000-01-01', freq='D', + periods=shape[0])} + da = xr.DataArray(vals, dims=('time', 'x', 'y'), coords=coords) + + if chunks is not None: + da = da.chunk(chunks) + + return da + + +def time_interpolate_na(shape, chunks, method, limit): + if chunks is not None: + requires_dask() + da = make_bench_data(shape, 0.1, chunks=chunks) + actual = da.interpolate_na(dim='time', method='linear', limit=limit) + + if chunks is not None: + actual = actual.compute() + + +time_interpolate_na.param_names = ['shape', 'chunks', 'method', 'limit'] +time_interpolate_na.params = ([(3650, 200, 400), (100, 25, 25)], + [None, {'x': 25, 'y': 25}], + ['linear', 'spline', 'quadratic', 'cubic'], + [None, 3]) + + +def time_ffill(shape, chunks, limit): + + da = make_bench_data(shape, 0.1, chunks=chunks) + actual = da.ffill(dim='time', limit=limit) + + if chunks is not None: + actual = actual.compute() + + +time_ffill.param_names = ['shape', 'chunks', 'limit'] +time_ffill.params = ([(3650, 200, 400), (100, 25, 25)], + [None, {'x': 25, 'y': 25}], + [None, 3]) + + +def time_bfill(shape, chunks, limit): + + da = make_bench_data(shape, 0.1, chunks=chunks) + actual = da.bfill(dim='time', limit=limit) + + if chunks is not None: + actual = actual.compute() + + +time_bfill.param_names = ['shape', 'chunks', 'limit'] +time_bfill.params = ([(3650, 200, 400), (100, 25, 25)], + [None, {'x': 25, 'y': 25}], + [None, 3]) diff --git a/doc/api.rst b/doc/api.rst index 98ce3dc7b33..10386fe3a9b 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -148,6 +148,9 @@ Computation :py:attr:`~Dataset.count` :py:attr:`~Dataset.dropna` :py:attr:`~Dataset.fillna` +:py:attr:`~Dataset.ffill` +:py:attr:`~Dataset.bfill` +:py:attr:`~Dataset.interpolate_na` :py:attr:`~Dataset.where` **ndarray methods**: @@ -299,6 +302,9 @@ Computation :py:attr:`~DataArray.count` :py:attr:`~DataArray.dropna` :py:attr:`~DataArray.fillna` +:py:attr:`~DataArray.ffill` +:py:attr:`~DataArray.bfill` +:py:attr:`~DataArray.interpolate_na` :py:attr:`~DataArray.where` **ndarray methods**: diff --git a/doc/computation.rst b/doc/computation.rst index 087cca64e15..420b97923d7 100644 --- a/doc/computation.rst +++ b/doc/computation.rst @@ -59,8 +59,9 @@ Missing values xarray objects borrow the :py:meth:`~xarray.DataArray.isnull`, :py:meth:`~xarray.DataArray.notnull`, :py:meth:`~xarray.DataArray.count`, -:py:meth:`~xarray.DataArray.dropna` and :py:meth:`~xarray.DataArray.fillna` methods -for working with missing data from pandas: +:py:meth:`~xarray.DataArray.dropna`, :py:meth:`~xarray.DataArray.fillna`, +:py:meth:`~xarray.DataArray.ffill`, and :py:meth:`~xarray.DataArray.bfill` +methods for working with missing data from pandas: .. ipython:: python @@ -70,10 +71,25 @@ for working with missing data from pandas: x.count() x.dropna(dim='x') x.fillna(-1) + x.ffill() + x.bfill() Like pandas, xarray uses the float value ``np.nan`` (not-a-number) to represent missing values. +xarray objects also have an :py:meth:`~xarray.DataArray.interpolate_na` method +for filling missing values via 1D interpolation. + +.. ipython:: python + + x = xr.DataArray([0, 1, np.nan, np.nan, 2], dims=['x'], + coords={'xx': xr.Variable('x', [0, 1, 1.1, 1.9, 3])}) + x.interpolate_na(dim='x', method='linear', use_coordinate='xx') + +Note that xarray slightly diverges from the pandas ``interpolate`` syntax by +providing the ``use_coordinate`` keyword which facilitates a clear specification +of which values to use as the index in the interpolation. + Aggregation =========== diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 03625764bdd..a522fe60b2d 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1228,6 +1228,97 @@ def fillna(self, value): out = ops.fillna(self, value) return out + def interpolate_na(self, dim=None, method='linear', limit=None, + use_coordinate=True, + **kwargs): + """Interpolate values according to different methods. + + Parameters + ---------- + dim : str + Specifies the dimension along which to interpolate. + method : {'linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', + 'polynomial', 'barycentric', 'krog', 'pchip', + 'spline', 'akima'}, optional + String indicating which method to use for interpolation: + + - 'linear': linear interpolation (Default). Additional keyword + arguments are passed to ``numpy.interp`` + - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', + 'polynomial': are passed to ``scipy.interpolate.interp1d``. If + method=='polynomial', the ``order`` keyword argument must also be + provided. + - 'barycentric', 'krog', 'pchip', 'spline', and `akima`: use their + respective``scipy.interpolate`` classes. + use_coordinate : boolean or str, default True + Specifies which index to use as the x values in the interpolation + formulated as `y = f(x)`. If False, values are treated as if + eqaully-spaced along `dim`. If True, the IndexVariable `dim` is + used. If use_coordinate is a string, it specifies the name of a + coordinate variariable to use as the index. + limit : int, default None + Maximum number of consecutive NaNs to fill. Must be greater than 0 + or None for no limit. + + Returns + ------- + DataArray + + See also + -------- + numpy.interp + scipy.interpolate + """ + from .missing import interp_na + return interp_na(self, dim=dim, method=method, limit=limit, + use_coordinate=use_coordinate, **kwargs) + + def ffill(self, dim, limit=None): + '''Fill NaN values by propogating values forward + + *Requires bottleneck.* + + Parameters + ---------- + dim : str + Specifies the dimension along which to propagate values when + filling. + limit : int, default None + The maximum number of consecutive NaN values to forward fill. In + other words, if there is a gap with more than this number of + consecutive NaNs, it will only be partially filled. Must be greater + than 0 or None for no limit. + + Returns + ------- + DataArray + ''' + from .missing import ffill + return ffill(self, dim, limit=limit) + + def bfill(self, dim, limit=None): + '''Fill NaN values by propogating values backward + + *Requires bottleneck.* + + Parameters + ---------- + dim : str + Specifies the dimension along which to propagate values when + filling. + limit : int, default None + The maximum number of consecutive NaN values to backward fill. In + other words, if there is a gap with more than this number of + consecutive NaNs, it will only be partially filled. Must be greater + than 0 or None for no limit. + + Returns + ------- + DataArray + ''' + from .missing import bfill + return bfill(self, dim, limit=limit) + def combine_first(self, other): """Combine two DataArray objects, with union of coordinates. @@ -1935,10 +2026,10 @@ def sortby(self, variables, ascending=True): sorted: DataArray A new dataarray where all the specified dims are sorted by dim labels. - + Examples -------- - + >>> da = xr.DataArray(np.random.rand(5), ... coords=[pd.date_range('1/1/2000', periods=5)], ... dims='time') @@ -1952,7 +2043,7 @@ def sortby(self, variables, ascending=True): array([ 0.26532 , 0.270962, 0.552878, 0.615637, 0.965471]) Coordinates: - * time (time) datetime64[ns] 2000-01-03 2000-01-04 2000-01-05 ... + * time (time) datetime64[ns] 2000-01-03 2000-01-04 2000-01-05 ... """ ds = self._to_temp_dataset().sortby(variables, ascending=ascending) return self._from_temp_dataset(ds) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d77336a2b5c..58847bb0086 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2410,6 +2410,105 @@ def fillna(self, value): out = ops.fillna(self, value) return out + def interpolate_na(self, dim=None, method='linear', limit=None, + use_coordinate=True, + **kwargs): + """Interpolate values according to different methods. + + Parameters + ---------- + dim : str + Specifies the dimension along which to interpolate. + method : {'linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', + 'polynomial', 'barycentric', 'krog', 'pchip', + 'spline'}, optional + String indicating which method to use for interpolation: + + - 'linear': linear interpolation (Default). Additional keyword + arguments are passed to ``numpy.interp`` + - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', + 'polynomial': are passed to ``scipy.interpolate.interp1d``. If + method=='polynomial', the ``order`` keyword argument must also be + provided. + - 'barycentric', 'krog', 'pchip', 'spline': use their respective + ``scipy.interpolate`` classes. + use_coordinate : boolean or str, default True + Specifies which index to use as the x values in the interpolation + formulated as `y = f(x)`. If False, values are treated as if + eqaully-spaced along `dim`. If True, the IndexVariable `dim` is + used. If use_coordinate is a string, it specifies the name of a + coordinate variariable to use as the index. + limit : int, default None + Maximum number of consecutive NaNs to fill. Must be greater than 0 + or None for no limit. + + Returns + ------- + Dataset + + See also + -------- + numpy.interp + scipy.interpolate + """ + from .missing import interp_na, _apply_over_vars_with_dim + + new = _apply_over_vars_with_dim(interp_na, self, dim=dim, + method=method, limit=limit, + use_coordinate=use_coordinate, + **kwargs) + return new + + def ffill(self, dim, limit=None): + '''Fill NaN values by propogating values forward + + *Requires bottleneck.* + + Parameters + ---------- + dim : str + Specifies the dimension along which to propagate values when + filling. + limit : int, default None + The maximum number of consecutive NaN values to forward fill. In + other words, if there is a gap with more than this number of + consecutive NaNs, it will only be partially filled. Must be greater + than 0 or None for no limit. + + Returns + ------- + Dataset + ''' + from .missing import ffill, _apply_over_vars_with_dim + + new = _apply_over_vars_with_dim(ffill, self, dim=dim, limit=limit) + return new + + def bfill(self, dim, limit=None): + '''Fill NaN values by propogating values backward + + *Requires bottleneck.* + + Parameters + ---------- + dim : str + Specifies the dimension along which to propagate values when + filling. + limit : int, default None + The maximum number of consecutive NaN values to backward fill. In + other words, if there is a gap with more than this number of + consecutive NaNs, it will only be partially filled. Must be greater + than 0 or None for no limit. + + Returns + ------- + Dataset + ''' + from .missing import bfill, _apply_over_vars_with_dim + + new = _apply_over_vars_with_dim(bfill, self, dim=dim, limit=limit) + return new + def combine_first(self, other): """Combine two Datasets, default to data_vars of self. diff --git a/xarray/core/missing.py b/xarray/core/missing.py new file mode 100644 index 00000000000..cbf0df958d5 --- /dev/null +++ b/xarray/core/missing.py @@ -0,0 +1,329 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import Iterable +from functools import partial + +import numpy as np +import pandas as pd + + +from .pycompat import iteritems +from .computation import apply_ufunc +from .utils import is_scalar +from .npcompat import flip + + +class BaseInterpolator(object): + '''gerneric interpolator class for normalizing interpolation methods''' + cons_kwargs = {} + call_kwargs = {} + f = None + method = None + + def __init__(self, xi, yi, method=None, **kwargs): + self.method = method + self.call_kwargs = kwargs + + def __call__(self, x): + return self.f(x, **self.call_kwargs) + + def __repr__(self): + return "{type}: method={method}".format(type=self.__class__.__name__, + method=self.method) + + +class NumpyInterpolator(BaseInterpolator): + '''One-dimensional linear interpolation. + + See Also + -------- + numpy.interp + ''' + def __init__(self, xi, yi, method='linear', fill_value=None, **kwargs): + + if method != 'linear': + raise ValueError( + 'only method `linear` is valid for the NumpyInterpolator') + + self.method = method + self.f = np.interp + self.cons_kwargs = kwargs + self.call_kwargs = {'period': self.cons_kwargs.pop('period', None)} + + self._xi = xi + self._yi = yi + + if self.cons_kwargs: + raise ValueError( + 'recieved invalid kwargs: %r' % self.cons_kwargs.keys()) + + if fill_value is None: + self._left = np.nan + self._right = np.nan + elif isinstance(fill_value, Iterable) and len(fill_value) == 2: + self._left = fill_value[0] + self._right = fill_value[1] + elif is_scalar(fill_value): + self._left = fill_value + self._right = fill_value + else: + raise ValueError('%s is not a valid fill_value' % fill_value) + + def __call__(self, x): + return self.f(x, self._xi, self._yi, left=self._left, + right=self._right, **self.call_kwargs) + + +class ScipyInterpolator(BaseInterpolator): + '''Interpolate a 1-D function using Scipy interp1d + + See Also + -------- + scipy.interpolate.interp1d + ''' + def __init__(self, xi, yi, method=None, fill_value=None, + assume_sorted=True, copy=False, bounds_error=False, **kwargs): + from scipy.interpolate import interp1d + + if method is None: + raise ValueError('method is a required argument, please supply a ' + 'valid scipy.inter1d method (kind)') + + if method == 'polynomial': + method = kwargs.pop('order', None) + if method is None: + raise ValueError('order is required when method=polynomial') + + self.method = method + + self.cons_kwargs = kwargs + self.call_kwargs = {} + + if fill_value is None and method == 'linear': + fill_value = kwargs.pop('fill_value', (np.nan, np.nan)) + elif fill_value is None: + fill_value = np.nan + + self.f = interp1d(xi, yi, kind=self.method, fill_value=fill_value, + bounds_error=False, assume_sorted=assume_sorted, + copy=copy, **self.cons_kwargs) + + +class SplineInterpolator(BaseInterpolator): + '''One-dimensional smoothing spline fit to a given set of data points. + + See Also + -------- + scipy.interpolate.UnivariateSpline + ''' + def __init__(self, xi, yi, method='spline', fill_value=None, order=3, + **kwargs): + from scipy.interpolate import UnivariateSpline + + if method != 'spline': + raise ValueError( + 'only method `spline` is valid for the SplineInterpolator') + + self.method = method + self.cons_kwargs = kwargs + self.call_kwargs['nu'] = kwargs.pop('nu', 0) + self.call_kwargs['ext'] = kwargs.pop('ext', None) + + if fill_value is not None: + raise ValueError('SplineInterpolator does not support fill_value') + + self.f = UnivariateSpline(xi, yi, k=order, **self.cons_kwargs) + + +def _apply_over_vars_with_dim(func, self, dim=None, **kwargs): + '''wrapper for datasets''' + + ds = type(self)(coords=self.coords, attrs=self.attrs) + + for name, var in iteritems(self.data_vars): + if dim in var.dims: + ds[name] = func(var, dim=dim, **kwargs) + else: + ds[name] = var + + return ds + + +def get_clean_interp_index(arr, dim, use_coordinate=True, **kwargs): + '''get index to use for x values in interpolation. + + If use_coordinate is True, the coordinate that shares the name of the + dimension along which interpolation is being performed will be used as the + x values. + + If use_coordinate is False, the x values are set as an equally spaced + sequence. + ''' + if use_coordinate: + if use_coordinate is True: + index = arr.get_index(dim) + else: + index = arr.coords[use_coordinate] + if index.ndim != 1: + raise ValueError( + 'Coordinates used for interpolation must be 1D, ' + '%s is %dD.' % (use_coordinate, index.ndim)) + + # raise if index cannot be cast to a float (e.g. MultiIndex) + try: + index = index.values.astype(np.float64) + except (TypeError, ValueError): + # pandas raises a TypeError + # xarray/nuppy raise a ValueError + raise TypeError('Index must be castable to float64 to support' + 'interpolation, got: %s' % type(index)) + # check index sorting now so we can skip it later + if not (np.diff(index) > 0).all(): + raise ValueError("Index must be monotonicly increasing") + else: + axis = arr.get_axis_num(dim) + index = np.arange(arr.shape[axis], dtype=np.float64) + + return index + + +def interp_na(self, dim=None, use_coordinate=True, method='linear', limit=None, + **kwargs): + '''Interpolate values according to different methods.''' + + if dim is None: + raise NotImplementedError('dim is a required argument') + + if limit is not None: + valids = _get_valid_fill_mask(self, dim, limit) + + # method + index = get_clean_interp_index(self, dim, use_coordinate=use_coordinate, + **kwargs) + interpolator = _get_interpolator(method, **kwargs) + + arr = apply_ufunc(interpolator, index, self, + input_core_dims=[[dim], [dim]], + output_core_dims=[[dim]], + output_dtypes=[self.dtype], + dask='parallelized', + vectorize=True, + keep_attrs=True).transpose(*self.dims) + + if limit is not None: + arr = arr.where(valids) + + return arr + + +def wrap_interpolator(interpolator, x, y, **kwargs): + '''helper function to apply interpolation along 1 dimension''' + # it would be nice if this wasn't necessary, works around: + # "ValueError: assignment destination is read-only" in assignment below + out = y.copy() + + nans = pd.isnull(y) + nonans = ~nans + + # fast track for no-nans and all-nans cases + n_nans = nans.sum() + if n_nans == 0 or n_nans == len(y): + return y + + f = interpolator(x[nonans], y[nonans], **kwargs) + out[nans] = f(x[nans]) + return out + + +def _bfill(arr, n=None, axis=-1): + '''inverse of ffill''' + import bottleneck as bn + + arr = flip(arr, axis=axis) + + # fill + arr = bn.push(arr, axis=axis, n=n) + + # reverse back to original + return flip(arr, axis=axis) + + +def ffill(arr, dim=None, limit=None): + '''forward fill missing values''' + import bottleneck as bn + + axis = arr.get_axis_num(dim) + + # work around for bottleneck 178 + _limit = limit if limit is not None else arr.shape[axis] + + return apply_ufunc(bn.push, arr, + dask='parallelized', + keep_attrs=True, + output_dtypes=[arr.dtype], + kwargs=dict(n=_limit, axis=axis)).transpose(*arr.dims) + + +def bfill(arr, dim=None, limit=None): + '''backfill missing values''' + axis = arr.get_axis_num(dim) + + # work around for bottleneck 178 + _limit = limit if limit is not None else arr.shape[axis] + + return apply_ufunc(_bfill, arr, + dask='parallelized', + keep_attrs=True, + output_dtypes=[arr.dtype], + kwargs=dict(n=_limit, axis=axis)).transpose(*arr.dims) + + +def _get_interpolator(method, **kwargs): + '''helper function to select the appropriate interpolator class + + returns a partial of wrap_interpolator + ''' + interp1d_methods = ['linear', 'nearest', 'zero', 'slinear', 'quadratic', + 'cubic', 'polynomial'] + valid_methods = interp1d_methods + ['barycentric', 'krog', 'pchip', + 'spline', 'akima'] + + if (method == 'linear' and not + kwargs.get('fill_value', None) == 'extrapolate'): + kwargs.update(method=method) + interp_class = NumpyInterpolator + elif method in valid_methods: + try: + from scipy import interpolate + except ImportError: + raise ImportError( + 'Interpolation with method `%s` requires scipy' % method) + if method in interp1d_methods: + kwargs.update(method=method) + interp_class = ScipyInterpolator + elif method == 'barycentric': + interp_class = interpolate.BarycentricInterpolator + elif method == 'krog': + interp_class = interpolate.KroghInterpolator + elif method == 'pchip': + interp_class = interpolate.PchipInterpolator + elif method == 'spline': + kwargs.update(method=method) + interp_class = SplineInterpolator + elif method == 'akima': + interp_class = interpolate.Akima1DInterpolator + else: + raise ValueError('%s is not a valid scipy interpolator' % method) + else: + raise ValueError('%s is not a valid interpolator' % method) + + return partial(wrap_interpolator, interp_class, **kwargs) + + +def _get_valid_fill_mask(arr, dim, limit): + '''helper function to determine values that can be filled when limit is not + None''' + kw = {dim: limit + 1} + return arr.isnull().rolling(min_periods=1, **kw).sum() <= limit diff --git a/xarray/core/npcompat.py b/xarray/core/npcompat.py index fc080c63e7e..fa01e37e94f 100644 --- a/xarray/core/npcompat.py +++ b/xarray/core/npcompat.py @@ -4,7 +4,7 @@ import numpy as np try: - from numpy import nancumsum, nancumprod + from numpy import nancumsum, nancumprod, flip except ImportError: # pragma: no cover # Code copied from newer versions of NumPy (v1.12). # Used under the terms of NumPy's license, see licenses/NUMPY_LICENSE. @@ -174,3 +174,74 @@ def nancumprod(a, axis=None, dtype=None, out=None): """ a, mask = _replace_nan(a, 1) return np.cumprod(a, axis=axis, dtype=dtype, out=out) + + def flip(m, axis): + """ + Reverse the order of elements in an array along the given axis. + + The shape of the array is preserved, but the elements are reordered. + + .. versionadded:: 1.12.0 + + Parameters + ---------- + m : array_like + Input array. + axis : integer + Axis in array, which entries are reversed. + + + Returns + ------- + out : array_like + A view of `m` with the entries of axis reversed. Since a view is + returned, this operation is done in constant time. + + See Also + -------- + flipud : Flip an array vertically (axis=0). + fliplr : Flip an array horizontally (axis=1). + + Notes + ----- + flip(m, 0) is equivalent to flipud(m). + flip(m, 1) is equivalent to fliplr(m). + flip(m, n) corresponds to ``m[...,::-1,...]`` with ``::-1`` at position n. + + Examples + -------- + >>> A = np.arange(8).reshape((2,2,2)) + >>> A + array([[[0, 1], + [2, 3]], + + [[4, 5], + [6, 7]]]) + + >>> flip(A, 0) + array([[[4, 5], + [6, 7]], + + [[0, 1], + [2, 3]]]) + + >>> flip(A, 1) + array([[[2, 3], + [0, 1]], + + [[6, 7], + [4, 5]]]) + + >>> A = np.random.randn(3,4,5) + >>> np.all(flip(A,2) == A[:,:,::-1,...]) + True + """ + if not hasattr(m, 'ndim'): + m = np.asarray(m) + indexer = [slice(None)] * m.ndim + try: + indexer[axis] = slice(None, None, -1) + except IndexError: + raise ValueError("axis=%i is invalid for the %i-dimensional " + "input array" % (axis, m.ndim)) + return m[tuple(indexer)] diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 235c6e9e410..9afe6f43850 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -72,6 +72,7 @@ def _importorskip(modname, minversion=None): has_rasterio, requires_rasterio = _importorskip('rasterio') has_pathlib, requires_pathlib = _importorskip('pathlib') has_zarr, requires_zarr = _importorskip('zarr', minversion='2.2.0') +has_np112, requires_np112 = _importorskip('numpy', minversion='1.12.0') # some special cases has_scipy_or_netCDF4 = has_scipy or has_netCDF4 diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py new file mode 100644 index 00000000000..209e8de8663 --- /dev/null +++ b/xarray/tests/test_missing.py @@ -0,0 +1,457 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import numpy as np +import pandas as pd +import pytest +import itertools + +import xarray as xr + +from xarray.core.missing import (NumpyInterpolator, ScipyInterpolator, + SplineInterpolator) +from xarray.core.pycompat import dask_array_type + +from xarray.tests import (assert_equal, assert_array_equal, raises_regex, + requires_scipy, requires_bottleneck, requires_dask, + requires_np112) + + +@pytest.fixture +def da(): + return xr.DataArray([0, np.nan, 1, 2, np.nan, 3, 4, 5, np.nan, 6, 7], + dims='time') + + +@pytest.fixture +def ds(): + ds = xr.Dataset() + ds['var1'] = xr.DataArray([0, np.nan, 1, 2, np.nan, 3, 4, 5, np.nan, 6, 7], + dims='time') + ds['var2'] = xr.DataArray([10, np.nan, 11, 12, np.nan, 13, 14, 15, np.nan, + 16, 17], dims='x') + return ds + + +def make_interpolate_example_data(shape, frac_nan, seed=12345, + non_uniform=False): + rs = np.random.RandomState(seed) + vals = rs.normal(size=shape) + if frac_nan == 1: + vals[:] = np.nan + elif frac_nan == 0: + pass + else: + n_missing = int(vals.size * frac_nan) + + ys = np.arange(shape[0]) + xs = np.arange(shape[1]) + if n_missing: + np.random.shuffle(ys) + ys = ys[:n_missing] + + np.random.shuffle(xs) + xs = xs[:n_missing] + + vals[ys, xs] = np.nan + + if non_uniform: + # construct a datetime index that has irregular spacing + deltas = pd.TimedeltaIndex(unit='d', data=rs.normal(size=shape[0], + scale=10)) + coords = {'time': (pd.Timestamp('2000-01-01') + deltas).sort_values()} + else: + coords = {'time': pd.date_range('2000-01-01', freq='D', + periods=shape[0])} + da = xr.DataArray(vals, dims=('time', 'x'), coords=coords) + df = da.to_pandas() + + return da, df + + +@requires_np112 +@requires_scipy +def test_interpolate_pd_compat(): + shapes = [(8, 8), (1, 20), (20, 1), (100, 100)] + frac_nans = [0, 0.5, 1] + methods = ['linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic'] + + for (shape, frac_nan, method) in itertools.product(shapes, frac_nans, + methods): + + da, df = make_interpolate_example_data(shape, frac_nan) + + for dim in ['time', 'x']: + actual = da.interpolate_na(method=method, dim=dim, + fill_value=np.nan) + expected = df.interpolate(method=method, axis=da.get_axis_num(dim), + fill_value=(np.nan, np.nan)) + # Note, Pandas does some odd things with the left/right fill_value + # for the linear methods. This next line inforces the xarray + # fill_value convention on the pandas output. Therefore, this test + # only checks that interpolated values are the same (not nans) + expected.values[pd.isnull(actual.values)] = np.nan + + np.testing.assert_allclose(actual.values, expected.values) + + +@requires_np112 +@requires_scipy +def test_scipy_methods_function(): + for method in ['barycentric', 'krog', 'pchip', 'spline', 'akima']: + kwargs = {} + # Note: Pandas does some wacky things with these methods and the full + # integration tests wont work. + da, _ = make_interpolate_example_data((25, 25), 0.4, non_uniform=True) + actual = da.interpolate_na(method=method, dim='time', **kwargs) + assert (da.count('time') <= actual.count('time')).all() + + +@requires_np112 +@requires_scipy +def test_interpolate_pd_compat_non_uniform_index(): + shapes = [(8, 8), (1, 20), (20, 1), (100, 100)] + frac_nans = [0, 0.5, 1] + methods = ['time', 'index', 'values'] + + for (shape, frac_nan, method) in itertools.product(shapes, frac_nans, + methods): + + da, df = make_interpolate_example_data(shape, frac_nan, + non_uniform=True) + for dim in ['time', 'x']: + if method == 'time' and dim != 'time': + continue + actual = da.interpolate_na(method='linear', dim=dim, + use_coordinate=True, fill_value=np.nan) + expected = df.interpolate(method=method, axis=da.get_axis_num(dim), + fill_value=np.nan) + + # Note, Pandas does some odd things with the left/right fill_value + # for the linear methods. This next line inforces the xarray + # fill_value convention on the pandas output. Therefore, this test + # only checks that interpolated values are the same (not nans) + expected.values[pd.isnull(actual.values)] = np.nan + + np.testing.assert_allclose(actual.values, expected.values) + + +@requires_np112 +@requires_scipy +def test_interpolate_pd_compat_polynomial(): + shapes = [(8, 8), (1, 20), (20, 1), (100, 100)] + frac_nans = [0, 0.5, 1] + orders = [1, 2, 3] + + for (shape, frac_nan, order) in itertools.product(shapes, frac_nans, + orders): + + da, df = make_interpolate_example_data(shape, frac_nan) + + for dim in ['time', 'x']: + actual = da.interpolate_na(method='polynomial', order=order, + dim=dim, use_coordinate=False) + expected = df.interpolate(method='polynomial', order=order, + axis=da.get_axis_num(dim)) + np.testing.assert_allclose(actual.values, expected.values) + + +@requires_np112 +@requires_scipy +def test_interpolate_unsorted_index_raises(): + vals = np.array([1, 2, 3], dtype=np.float64) + expected = xr.DataArray(vals, dims='x', coords={'x': [2, 1, 3]}) + with raises_regex(ValueError, 'Index must be monotonicly increasing'): + expected.interpolate_na(dim='x', method='index') + + +def test_interpolate_no_dim_raises(): + da = xr.DataArray(np.array([1, 2, np.nan, 5], dtype=np.float64), dims='x') + with raises_regex(NotImplementedError, 'dim is a required argument'): + da.interpolate_na(method='linear') + + +def test_interpolate_invalid_interpolator_raises(): + da = xr.DataArray(np.array([1, 2, np.nan, 5], dtype=np.float64), dims='x') + with raises_regex(ValueError, 'not a valid'): + da.interpolate_na(dim='x', method='foo') + + +def test_interpolate_multiindex_raises(): + data = np.random.randn(2, 3) + data[1, 1] = np.nan + da = xr.DataArray(data, coords=[('x', ['a', 'b']), ('y', [0, 1, 2])]) + das = da.stack(z=('x', 'y')) + with raises_regex(TypeError, 'Index must be castable to float64'): + das.interpolate_na(dim='z') + + +def test_interpolate_2d_coord_raises(): + coords = {'x': xr.Variable(('a', 'b'), np.arange(6).reshape(2, 3)), + 'y': xr.Variable(('a', 'b'), np.arange(6).reshape(2, 3)) * 2} + + data = np.random.randn(2, 3) + data[1, 1] = np.nan + da = xr.DataArray(data, dims=('a', 'b'), coords=coords) + with raises_regex(ValueError, 'interpolation must be 1D'): + da.interpolate_na(dim='a', use_coordinate='x') + + +@requires_np112 +@requires_scipy +def test_interpolate_kwargs(): + da = xr.DataArray(np.array([4, 5, np.nan], dtype=np.float64), dims='x') + expected = xr.DataArray(np.array([4, 5, 6], dtype=np.float64), dims='x') + actual = da.interpolate_na(dim='x', fill_value='extrapolate') + assert_equal(actual, expected) + + expected = xr.DataArray(np.array([4, 5, -999], dtype=np.float64), dims='x') + actual = da.interpolate_na(dim='x', fill_value=-999) + assert_equal(actual, expected) + + +@requires_np112 +def test_interpolate(): + + vals = np.array([1, 2, 3, 4, 5, 6], dtype=np.float64) + expected = xr.DataArray(vals, dims='x') + mvals = vals.copy() + mvals[2] = np.nan + missing = xr.DataArray(mvals, dims='x') + + actual = missing.interpolate_na(dim='x') + + assert_equal(actual, expected) + + +@requires_np112 +def test_interpolate_nonans(): + + vals = np.array([1, 2, 3, 4, 5, 6], dtype=np.float64) + expected = xr.DataArray(vals, dims='x') + actual = expected.interpolate_na(dim='x') + assert_equal(actual, expected) + + +@requires_np112 +@requires_scipy +def test_interpolate_allnans(): + vals = np.full(6, np.nan, dtype=np.float64) + expected = xr.DataArray(vals, dims='x') + actual = expected.interpolate_na(dim='x') + + assert_equal(actual, expected) + + +@requires_np112 +@requires_bottleneck +def test_interpolate_limits(): + da = xr.DataArray(np.array([1, 2, np.nan, np.nan, np.nan, 6], + dtype=np.float64), dims='x') + + actual = da.interpolate_na(dim='x', limit=None) + assert actual.isnull().sum() == 0 + + actual = da.interpolate_na(dim='x', limit=2) + expected = xr.DataArray(np.array([1, 2, 3, 4, np.nan, 6], + dtype=np.float64), dims='x') + + assert_equal(actual, expected) + + +@requires_np112 +@requires_scipy +def test_interpolate_methods(): + for method in ['linear', 'nearest', 'zero', 'slinear', 'quadratic', + 'cubic']: + kwargs = {} + da = xr.DataArray(np.array([0, 1, 2, np.nan, np.nan, np.nan, 6, 7, 8], + dtype=np.float64), dims='x') + actual = da.interpolate_na('x', method=method, **kwargs) + assert actual.isnull().sum() == 0 + + actual = da.interpolate_na('x', method=method, limit=2, **kwargs) + assert actual.isnull().sum() == 1 + + +@requires_scipy +@requires_np112 +def test_interpolators(): + for method, interpolator in [('linear', NumpyInterpolator), + ('linear', ScipyInterpolator), + ('spline', SplineInterpolator)]: + xi = np.array([-1, 0, 1, 2, 5], dtype=np.float64) + yi = np.array([-10, 0, 10, 20, 50], dtype=np.float64) + x = np.array([3, 4], dtype=np.float64) + + f = interpolator(xi, yi, method=method) + out = f(x) + assert pd.isnull(out).sum() == 0 + + +@requires_np112 +def test_interpolate_use_coordinate(): + xc = xr.Variable('x', [100, 200, 300, 400, 500, 600]) + da = xr.DataArray(np.array([1, 2, np.nan, np.nan, np.nan, 6], + dtype=np.float64), + dims='x', coords={'xc': xc}) + + # use_coordinate == False is same as using the default index + actual = da.interpolate_na(dim='x', use_coordinate=False) + expected = da.interpolate_na(dim='x') + assert_equal(actual, expected) + + # possible to specify non index coordinate + actual = da.interpolate_na(dim='x', use_coordinate='xc') + expected = da.interpolate_na(dim='x') + assert_equal(actual, expected) + + # possible to specify index coordinate by name + actual = da.interpolate_na(dim='x', use_coordinate='x') + expected = da.interpolate_na(dim='x') + assert_equal(actual, expected) + + +@requires_np112 +@requires_dask +def test_interpolate_dask(): + da, _ = make_interpolate_example_data((40, 40), 0.5) + da = da.chunk({'x': 5}) + actual = da.interpolate_na('time') + expected = da.load().interpolate_na('time') + assert isinstance(actual.data, dask_array_type) + assert_equal(actual.compute(), expected) + + # with limit + da = da.chunk({'x': 5}) + actual = da.interpolate_na('time', limit=3) + expected = da.load().interpolate_na('time', limit=3) + assert isinstance(actual.data, dask_array_type) + assert_equal(actual, expected) + + +@requires_np112 +@requires_dask +def test_interpolate_dask_raises_for_invalid_chunk_dim(): + da, _ = make_interpolate_example_data((40, 40), 0.5) + da = da.chunk({'time': 5}) + with raises_regex(ValueError, "dask='parallelized' consists of multiple"): + da.interpolate_na('time') + + +@requires_np112 +@requires_bottleneck +def test_ffill(): + da = xr.DataArray(np.array([4, 5, np.nan], dtype=np.float64), dims='x') + expected = xr.DataArray(np.array([4, 5, 5], dtype=np.float64), dims='x') + actual = da.ffill('x') + assert_equal(actual, expected) + + +@requires_np112 +@requires_bottleneck +@requires_dask +def test_ffill_dask(): + da, _ = make_interpolate_example_data((40, 40), 0.5) + da = da.chunk({'x': 5}) + actual = da.ffill('time') + expected = da.load().ffill('time') + assert isinstance(actual.data, dask_array_type) + assert_equal(actual, expected) + + # with limit + da = da.chunk({'x': 5}) + actual = da.ffill('time', limit=3) + expected = da.load().ffill('time', limit=3) + assert isinstance(actual.data, dask_array_type) + assert_equal(actual, expected) + + +@requires_bottleneck +@requires_dask +def test_bfill_dask(): + da, _ = make_interpolate_example_data((40, 40), 0.5) + da = da.chunk({'x': 5}) + actual = da.bfill('time') + expected = da.load().bfill('time') + assert isinstance(actual.data, dask_array_type) + assert_equal(actual, expected) + + # with limit + da = da.chunk({'x': 5}) + actual = da.bfill('time', limit=3) + expected = da.load().bfill('time', limit=3) + assert isinstance(actual.data, dask_array_type) + assert_equal(actual, expected) + + +@requires_bottleneck +@requires_np112 +def test_ffill_bfill_nonans(): + + vals = np.array([1, 2, 3, 4, 5, 6], dtype=np.float64) + expected = xr.DataArray(vals, dims='x') + + actual = expected.ffill(dim='x') + assert_equal(actual, expected) + + actual = expected.bfill(dim='x') + assert_equal(actual, expected) + + +@requires_bottleneck +@requires_np112 +def test_ffill_bfill_allnans(): + + vals = np.full(6, np.nan, dtype=np.float64) + expected = xr.DataArray(vals, dims='x') + + actual = expected.ffill(dim='x') + assert_equal(actual, expected) + + actual = expected.bfill(dim='x') + assert_equal(actual, expected) + + +@requires_bottleneck +@requires_np112 +def test_ffill_functions(da): + result = da.ffill('time') + assert result.isnull().sum() == 0 + + +@requires_bottleneck +@requires_np112 +def test_ffill_limit(): + da = xr.DataArray( + [0, np.nan, np.nan, np.nan, np.nan, 3, 4, 5, np.nan, 6, 7], + dims='time') + result = da.ffill('time') + expected = xr.DataArray([0, 0, 0, 0, 0, 3, 4, 5, 5, 6, 7], dims='time') + assert_array_equal(result, expected) + + result = da.ffill('time', limit=1) + expected = xr.DataArray( + [0, 0, np.nan, np.nan, np.nan, 3, 4, 5, 5, 6, 7], dims='time') + + +@requires_np112 +def test_interpolate_dataset(ds): + actual = ds.interpolate_na(dim='time') + # no missing values in var1 + assert actual['var1'].count('time') == actual.dims['time'] + + # var2 should be the same as it was + assert_array_equal(actual['var2'], ds['var2']) + + +@requires_bottleneck +@requires_np112 +def test_ffill_dataset(ds): + ds.ffill(dim='time') + + +@requires_bottleneck +@requires_np112 +def test_bfill_dataset(ds): + ds.ffill(dim='time')