pydata · shoyer · Dec 18, 2017 · Nov 30, 2017 · Dec 8, 2017 · fujiisoup
diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py
@@ -3,12 +3,14 @@
 from __future__ import print_function
 from datetime import timedelta
 from collections import defaultdict, Hashable
+import functools
 import operator
 import numpy as np
 import pandas as pd
 
 from . import nputils
 from . import utils
+from . import duck_array_ops
 from .pycompat import (iteritems, range, integer_types, dask_array_type,
                        suppress)
 from .utils import is_dict_like
@@ -581,27 +583,23 @@ def as_indexable(array):
     raise TypeError('Invalid array type: {}'.format(type(array)))
 
 
-def _outer_to_numpy_indexer(key, shape):
-    """Convert an OuterIndexer into an indexer for NumPy.
+def _outer_to_vectorized_indexer(key, shape):
+    """Convert an OuterIndexer into an vectorized indexer.
 
     Parameters
     ----------
     key : tuple
-        Outer indexing tuple to convert.
+        Tuple from an OuterIndexer to convert.
     shape : tuple
         Shape of the array subject to the indexing.
 
     Returns
     -------
     tuple
-        Base tuple suitable for use to index a NumPy array.
+        Tuple suitable for use to index a NumPy array with vectorized indexing.
+        Each element is an integer or array: broadcasting them together gives
+        the shape of the result.
     """
-    if len([k for k in key if not isinstance(k, slice)]) <= 1:
-        # If there is only one vector and all others are slice,
-        # it can be safely used in mixed basic/advanced indexing.
-        # Boolean index should already be converted to integer array.
-        return tuple(key)
-
     n_dim = len([k for k in key if not isinstance(k, integer_types)])
     i_dim = 0
     new_key = []
@@ -619,6 +617,149 @@ def _outer_to_numpy_indexer(key, shape):
     return tuple(new_key)
 
 
+def _outer_to_numpy_indexer(key, shape):
+    """Convert an OuterIndexer into an indexer for NumPy.
+
+    Parameters
+    ----------
+    key : tuple
+        Tuple from an OuterIndexer to convert.
+    shape : tuple
+        Shape of the array subject to the indexing.
+
+    Returns
+    -------
+    tuple
+        Tuple suitable for use to index a NumPy array.
+    """
+    if len([k for k in key if not isinstance(k, slice)]) <= 1:
+        # If there is only one vector and all others are slice,
+        # it can be safely used in mixed basic/advanced indexing.
+        # Boolean index should already be converted to integer array.
+        return tuple(key)
+    else:
+        return _outer_to_vectorized_indexer(key, shape)
+
+
+def _dask_array_with_chunks_hint(array, chunks):
+    """Create a dask array using the chunks hint for dimensions of size > 1."""
+    import dask.array as da
+    if len(chunks) < array.ndim:
+        raise ValueError('not enough chunks in hint')
+    new_chunks = []
+    for chunk, size in zip(chunks, array.shape):
+        new_chunks.append(chunk if size > 1 else (1,))
+    return da.from_array(array, new_chunks)
+
+
+def _logical_any(args):
+    return functools.reduce(operator.or_, args)
+
+
+def _masked_result_drop_slice(key, chunks_hint=None):
+    key = (k for k in key if not isinstance(k, slice))
+    if chunks_hint is not None:
+        key = [_dask_array_with_chunks_hint(k, chunks_hint)
+               if isinstance(k, np.ndarray) else k
+               for k in key]
+    return _logical_any(k == -1 for k in key)
+
+
+def create_mask(indexer, shape, chunks_hint=None):
+    """Create a mask for indexing with a fill-value.
+
+    Parameters
+    ----------
+    indexer : ExplicitIndexer
+        Indexer with -1 in integer or ndarray value to indicate locations in
+        the result that should be masked.
+    shape : tuple
+        Shape of the array being indexed.
+    chunks_hint : tuple, optional
+        Optional tuple indicating desired chunks for the result. If provided,
+        used as a hint for chunks on the resulting dask. Must have a hint for
+        each dimension on the result array.
+
+    Returns
+    -------
+    mask : bool, np.ndarray or dask.array.Array with dtype=bool
+        Dask array if chunks_hint is provided, otherwise a NumPy array. Has the
+        same shape as the indexing result.
+    """
+    if isinstance(indexer, OuterIndexer):
+        key = _outer_to_vectorized_indexer(indexer.tuple, shape)
+        assert not any(isinstance(k, slice) for k in key)
+        mask = _masked_result_drop_slice(key, chunks_hint)
+
+    elif isinstance(indexer, VectorizedIndexer):
+        key = indexer.tuple
+        base_mask = _masked_result_drop_slice(key, chunks_hint)
+        slice_shape = tuple(np.arange(*k.indices(size)).size
+                            for k, size in zip(key, shape)
+                            if isinstance(k, slice))
+        expanded_mask = base_mask[
+            (Ellipsis,) + (np.newaxis,) * len(slice_shape)]
+        mask = duck_array_ops.broadcast_to(
+            expanded_mask, base_mask.shape + slice_shape)
+
+    elif isinstance(indexer, BasicIndexer):
+        mask = any(k == -1 for k in indexer.tuple)
+
+    else:
+        raise TypeError('unexpected key type: {}'.format(type(indexer)))
+
+    return mask
+
+
+def _posify_mask_subindexer(index):
+    """Convert masked indices in a flat array to the nearest unmasked index.
+
+    Parameters
+    ----------
+    index : np.ndarray
+        One dimensional ndarray with dtype=int.
+
+    Returns
+    -------
+    np.ndarray
+        One dimensional ndarray with all values equal to -1 replaced by an
+        adjacent non-masked element.
+    """
+    masked = index == -1
+    unmasked_locs = np.flatnonzero(~masked)
+    if not unmasked_locs.size:
+        # indexing unmasked_locs is invalid
+        return np.zeros_like(index)
+    masked_locs = np.flatnonzero(masked)
+    prev_value = np.maximum(0, np.searchsorted(unmasked_locs, masked_locs) - 1)
+    new_index = index.copy()
+    new_index[masked_locs] = index[unmasked_locs[prev_value]]
+    return new_index
+
+
+def posify_mask_indexer(indexer):
+    """Convert masked values (-1) in an indexer to nearest unmasked values.
+
+    This routine is useful for dask, where it can be much faster to index
+    adjacent points than arbitrary points from the end of an array.
+
+    Parameters
+    ----------
+    indexer : ExplicitIndexer
+        Input indexer.
+
+    Returns
+    -------
+    ExplicitIndexer
+        Same type of input, with all values in ndarray keys equal to -1
+        replaced by an adjacent non-masked element.
+    """
+    key = tuple(_posify_mask_subindexer(k.ravel()).reshape(k.shape)
+                if isinstance(k, np.ndarray) else k
+                for k in indexer.tuple)
+    return type(indexer)(key)
+
+
 class NumpyIndexingAdapter(ExplicitlyIndexedNDArrayMixin):
     """Wrap a NumPy array to use explicit indexing."""
 

diff --git a/xarray/core/variable.py b/xarray/core/variable.py
@@ -307,10 +307,6 @@ def data(self, data):
                 "replacement data must match the Variable's shape")
         self._data = data
 
-    @property
-    def _indexable_data(self):
-        return as_indexable(self._data)
-
     def load(self, **kwargs):
         """Manually trigger loading of this variable's data from disk or a
         remote source into memory and return this variable.
@@ -622,13 +618,56 @@ def __getitem__(self, key):
         If you really want to do indexing like `x[x > 0]`, manipulate the numpy
         array `x.values` directly.
         """
-        dims, index_tuple, new_order = self._broadcast_indexes(key)
-        data = self._indexable_data[index_tuple]
+        dims, indexer, new_order = self._broadcast_indexes(key)
+        data = as_indexable(self._data)[indexer]
         if new_order:
             data = np.moveaxis(data, range(len(new_order)), new_order)
+        return self._finalize_indexing_result(dims, data)
+
+    def _finalize_indexing_result(self, dims, data):
+        """Used by IndexVariable to return IndexVariable objects when possible.
+        """
         return type(self)(dims, data, self._attrs, self._encoding,
                           fastpath=True)
 
+    def _getitem_with_mask(self, key, fill_value=dtypes.NA):
+        """Index this Variable with -1 remapped to fill_value."""
+        # TODO(shoyer): expose this method in public API somewhere (isel?) and
+        # use it for reindex.
+        # TODO(shoyer): add a sanity check that all other integers are
+        # non-negative
+        # TODO(shoyer): add an optimization, remapping -1 to an adjacent value
+        # that is actually indexed rather than mapping it to the last value
+        # along each axis.
+
+        if fill_value is dtypes.NA:
+            fill_value = dtypes.get_fill_value(self.dtype)
+
+        dims, indexer, new_order = self._broadcast_indexes(key)
+
+        if self.size:
+            if isinstance(self._data, dask_array_type):
+                # dask's indexing is faster this way; also vindex does not
+                # support negative indices yet:
+                # https://github.com/dask/dask/pull/2967
+                actual_indexer = indexing.posify_mask_indexer(indexer)
+            else:
+                actual_indexer = indexer
+
+            data = as_indexable(self._data)[actual_indexer]
+            chunks_hint = getattr(data, 'chunks', None)
+            mask = indexing.create_mask(indexer, self.shape, chunks_hint)
+            data = duck_array_ops.where(mask, fill_value, data)
+        else:
+            # array cannot be indexed along dimensions of size 0, so just
+            # build the mask directly instead.
+            mask = indexing.create_mask(indexer, self.shape)
+            data = np.broadcast_to(fill_value, getattr(mask, 'shape', ()))
+
+        if new_order:
+            data = np.moveaxis(data, range(len(new_order)), new_order)
+        return self._finalize_indexing_result(dims, data)
+
     def __setitem__(self, key, value):
         """__setitem__ is overloaded to access the underlying numpy values with
         orthogonal indexing.
@@ -652,7 +691,8 @@ def __setitem__(self, key, value):
                           (Ellipsis,)]
             value = np.moveaxis(value, new_order, range(len(new_order)))
 
-        self._indexable_data[index_tuple] = value
+        indexable = as_indexable(self._data)
+        indexable[index_tuple] = value
 
     @property
     def attrs(self):
@@ -1463,14 +1503,12 @@ def chunk(self, chunks=None, name=None, lock=False):
         # Dummy - do not chunk. This method is invoked e.g. by Dataset.chunk()
         return self.copy(deep=False)
 
-    def __getitem__(self, key):
-        dims, index_tuple, new_order = self._broadcast_indexes(key)
-        values = self._indexable_data[index_tuple]
-        if getattr(values, 'ndim', 0) != 1:
+    def _finalize_indexing_result(self, dims, data):
+        if getattr(data, 'ndim', 0) != 1:
             # returns Variable rather than IndexVariable if multi-dimensional
-            return Variable(dims, values, self._attrs, self._encoding)
+            return Variable(dims, data, self._attrs, self._encoding)
         else:
-            return type(self)(dims, values, self._attrs,
+            return type(self)(dims, data, self._attrs,
                               self._encoding, fastpath=True)
 
     def __setitem__(self, key, value):

diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py
@@ -385,3 +385,79 @@ def nonzero(x):
         actual = indexing._outer_to_numpy_indexer(outer_index, v.shape)
         actual_data = v.data[actual]
         np.testing.assert_array_equal(actual_data, expected_data)
+
+
+def test_create_mask_outer_indexer():
+    indexer = indexing.OuterIndexer((np.array([0, -1, 2]),))
+    expected = np.array([False, True, False])
+    actual = indexing.create_mask(indexer, (5,))
+    np.testing.assert_array_equal(expected, actual)
+
+    indexer = indexing.OuterIndexer((1, slice(2), np.array([0, -1, 2]),))
+    expected = np.array(2 * [[False, True, False]])
+    actual = indexing.create_mask(indexer, (5, 5, 5,))
+    np.testing.assert_array_equal(expected, actual)
+
+
+def test_create_mask_vectorized_indexer():
+    indexer = indexing.VectorizedIndexer(
+        (np.array([0, -1, 2]), np.array([0, 1, -1])))
+    expected = np.array([False, True, True])
+    actual = indexing.create_mask(indexer, (5,))
+    np.testing.assert_array_equal(expected, actual)
+
+    indexer = indexing.VectorizedIndexer(
+        (np.array([0, -1, 2]), slice(None), np.array([0, 1, -1])))
+    expected = np.array([[False, True, True]] * 2).T
+    actual = indexing.create_mask(indexer, (5, 2))
+    np.testing.assert_array_equal(expected, actual)
+
+
+def test_create_mask_basic_indexer():
+    indexer = indexing.BasicIndexer((-1,))
+    actual = indexing.create_mask(indexer, (3,))
+    np.testing.assert_array_equal(True, actual)
+
+    indexer = indexing.BasicIndexer((0,))
+    actual = indexing.create_mask(indexer, (3,))
+    np.testing.assert_array_equal(False, actual)
+
+
+def test_create_mask_dask():
+    da = pytest.importorskip('dask.array')
+
+    indexer = indexing.OuterIndexer((1, slice(2), np.array([0, -1, 2]),))
+    expected = np.array(2 * [[False, True, False]])
+    actual = indexing.create_mask(indexer, (5, 5, 5,),
+                                  chunks_hint=((1, 1), (2, 1)))
+    assert actual.chunks == ((1, 1), (2, 1))
+    np.testing.assert_array_equal(expected, actual)
+
+    indexer = indexing.VectorizedIndexer(
+        (np.array([0, -1, 2]), slice(None), np.array([0, 1, -1])))
+    expected = np.array([[False, True, True]] * 2).T
+    actual = indexing.create_mask(indexer, (5, 2), chunks_hint=((3,), (2,)))
+    assert isinstance(actual, da.Array)
+    np.testing.assert_array_equal(expected, actual)
+
+    with pytest.raises(ValueError):
+        indexing.create_mask(indexer, (5, 2), chunks_hint=())
+
+
+def test_create_mask_error():
+    with raises_regex(TypeError, 'unexpected key type'):
+        indexing.create_mask((1, 2), (3, 4))
+
+
+@pytest.mark.parametrize('indices, expected', [
+    (np.arange(5), np.arange(5)),
+    (np.array([0, -1, -1]), np.array([0, 0, 0])),
+    (np.array([-1, 1, -1]), np.array([1, 1, 1])),
+    (np.array([-1, -1, 2]), np.array([2, 2, 2])),
+    (np.array([-1]), np.array([0])),
+    (np.array([0, -1, 1, -1, -1]), np.array([0, 0, 1, 1, 1])),
+    (np.array([0, -1, -1, -1, 1]), np.array([0, 0, 0, 0, 1])),
+])
+def test_posify_mask_subindexer(indices, expected):
+    actual = indexing._posify_mask_subindexer(indices)
+    np.testing.assert_array_equal(expected, actual)