Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 151 additions & 10 deletions xarray/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
from __future__ import print_function
from datetime import timedelta
from collections import defaultdict, Hashable
import functools
import operator
import numpy as np
import pandas as pd

from . import nputils
from . import utils
from . import duck_array_ops
from .pycompat import (iteritems, range, integer_types, dask_array_type,
suppress)
from .utils import is_dict_like
Expand Down Expand Up @@ -581,27 +583,23 @@ def as_indexable(array):
raise TypeError('Invalid array type: {}'.format(type(array)))


def _outer_to_numpy_indexer(key, shape):
"""Convert an OuterIndexer into an indexer for NumPy.
def _outer_to_vectorized_indexer(key, shape):
"""Convert an OuterIndexer into an vectorized indexer.

Parameters
----------
key : tuple
Outer indexing tuple to convert.
Tuple from an OuterIndexer to convert.
shape : tuple
Shape of the array subject to the indexing.

Returns
-------
tuple
Base tuple suitable for use to index a NumPy array.
Tuple suitable for use to index a NumPy array with vectorized indexing.
Each element is an integer or array: broadcasting them together gives
the shape of the result.
"""
if len([k for k in key if not isinstance(k, slice)]) <= 1:
# If there is only one vector and all others are slice,
# it can be safely used in mixed basic/advanced indexing.
# Boolean index should already be converted to integer array.
return tuple(key)

n_dim = len([k for k in key if not isinstance(k, integer_types)])
i_dim = 0
new_key = []
Expand All @@ -619,6 +617,149 @@ def _outer_to_numpy_indexer(key, shape):
return tuple(new_key)


def _outer_to_numpy_indexer(key, shape):
"""Convert an OuterIndexer into an indexer for NumPy.

Parameters
----------
key : tuple
Tuple from an OuterIndexer to convert.
shape : tuple
Shape of the array subject to the indexing.

Returns
-------
tuple
Tuple suitable for use to index a NumPy array.
"""
if len([k for k in key if not isinstance(k, slice)]) <= 1:
# If there is only one vector and all others are slice,
# it can be safely used in mixed basic/advanced indexing.
# Boolean index should already be converted to integer array.
return tuple(key)
else:
return _outer_to_vectorized_indexer(key, shape)


def _dask_array_with_chunks_hint(array, chunks):
"""Create a dask array using the chunks hint for dimensions of size > 1."""
import dask.array as da
if len(chunks) < array.ndim:
raise ValueError('not enough chunks in hint')
new_chunks = []
for chunk, size in zip(chunks, array.shape):
new_chunks.append(chunk if size > 1 else (1,))
return da.from_array(array, new_chunks)


def _logical_any(args):
return functools.reduce(operator.or_, args)


def _masked_result_drop_slice(key, chunks_hint=None):
key = (k for k in key if not isinstance(k, slice))
if chunks_hint is not None:
key = [_dask_array_with_chunks_hint(k, chunks_hint)
if isinstance(k, np.ndarray) else k
for k in key]
return _logical_any(k == -1 for k in key)


def create_mask(indexer, shape, chunks_hint=None):
"""Create a mask for indexing with a fill-value.

Parameters
----------
indexer : ExplicitIndexer
Indexer with -1 in integer or ndarray value to indicate locations in
the result that should be masked.
shape : tuple
Shape of the array being indexed.
chunks_hint : tuple, optional
Optional tuple indicating desired chunks for the result. If provided,
used as a hint for chunks on the resulting dask. Must have a hint for
each dimension on the result array.

Returns
-------
mask : bool, np.ndarray or dask.array.Array with dtype=bool
Dask array if chunks_hint is provided, otherwise a NumPy array. Has the
same shape as the indexing result.
"""
if isinstance(indexer, OuterIndexer):
key = _outer_to_vectorized_indexer(indexer.tuple, shape)
assert not any(isinstance(k, slice) for k in key)
mask = _masked_result_drop_slice(key, chunks_hint)

elif isinstance(indexer, VectorizedIndexer):
key = indexer.tuple
base_mask = _masked_result_drop_slice(key, chunks_hint)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we take care of more advanced indexers, such as 2d-array or multiple arrays to be broadcasted?
We may leave it for future, but I think this PR would be a good place to add the full feature.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this already works with advanced indexers, though likely I need more test coverage :)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I misunderstood. Yes, this seems already working.

slice_shape = tuple(np.arange(*k.indices(size)).size
for k, size in zip(key, shape)
if isinstance(k, slice))
expanded_mask = base_mask[
(Ellipsis,) + (np.newaxis,) * len(slice_shape)]
mask = duck_array_ops.broadcast_to(
expanded_mask, base_mask.shape + slice_shape)

elif isinstance(indexer, BasicIndexer):
mask = any(k == -1 for k in indexer.tuple)

else:
raise TypeError('unexpected key type: {}'.format(type(indexer)))

return mask


def _posify_mask_subindexer(index):
"""Convert masked indices in a flat array to the nearest unmasked index.

Parameters
----------
index : np.ndarray
One dimensional ndarray with dtype=int.

Returns
-------
np.ndarray
One dimensional ndarray with all values equal to -1 replaced by an
adjacent non-masked element.
"""
masked = index == -1
unmasked_locs = np.flatnonzero(~masked)
if not unmasked_locs.size:
# indexing unmasked_locs is invalid
return np.zeros_like(index)
masked_locs = np.flatnonzero(masked)
prev_value = np.maximum(0, np.searchsorted(unmasked_locs, masked_locs) - 1)
new_index = index.copy()
new_index[masked_locs] = index[unmasked_locs[prev_value]]
return new_index


def posify_mask_indexer(indexer):
"""Convert masked values (-1) in an indexer to nearest unmasked values.

This routine is useful for dask, where it can be much faster to index
adjacent points than arbitrary points from the end of an array.

Parameters
----------
indexer : ExplicitIndexer
Input indexer.

Returns
-------
ExplicitIndexer
Same type of input, with all values in ndarray keys equal to -1
replaced by an adjacent non-masked element.
"""
key = tuple(_posify_mask_subindexer(k.ravel()).reshape(k.shape)
if isinstance(k, np.ndarray) else k
for k in indexer.tuple)
return type(indexer)(key)


class NumpyIndexingAdapter(ExplicitlyIndexedNDArrayMixin):
"""Wrap a NumPy array to use explicit indexing."""

Expand Down
64 changes: 51 additions & 13 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,10 +307,6 @@ def data(self, data):
"replacement data must match the Variable's shape")
self._data = data

@property
def _indexable_data(self):
return as_indexable(self._data)

def load(self, **kwargs):
"""Manually trigger loading of this variable's data from disk or a
remote source into memory and return this variable.
Expand Down Expand Up @@ -622,13 +618,56 @@ def __getitem__(self, key):
If you really want to do indexing like `x[x > 0]`, manipulate the numpy
array `x.values` directly.
"""
dims, index_tuple, new_order = self._broadcast_indexes(key)
data = self._indexable_data[index_tuple]
dims, indexer, new_order = self._broadcast_indexes(key)
data = as_indexable(self._data)[indexer]
if new_order:
data = np.moveaxis(data, range(len(new_order)), new_order)
return self._finalize_indexing_result(dims, data)

def _finalize_indexing_result(self, dims, data):
"""Used by IndexVariable to return IndexVariable objects when possible.
"""
return type(self)(dims, data, self._attrs, self._encoding,
fastpath=True)

def _getitem_with_mask(self, key, fill_value=dtypes.NA):
"""Index this Variable with -1 remapped to fill_value."""
# TODO(shoyer): expose this method in public API somewhere (isel?) and
# use it for reindex.
# TODO(shoyer): add a sanity check that all other integers are
# non-negative
# TODO(shoyer): add an optimization, remapping -1 to an adjacent value
# that is actually indexed rather than mapping it to the last value
# along each axis.

if fill_value is dtypes.NA:
fill_value = dtypes.get_fill_value(self.dtype)

dims, indexer, new_order = self._broadcast_indexes(key)

if self.size:
if isinstance(self._data, dask_array_type):
# dask's indexing is faster this way; also vindex does not
# support negative indices yet:
# https://github.com/dask/dask/pull/2967
actual_indexer = indexing.posify_mask_indexer(indexer)
else:
actual_indexer = indexer

data = as_indexable(self._data)[actual_indexer]
chunks_hint = getattr(data, 'chunks', None)
mask = indexing.create_mask(indexer, self.shape, chunks_hint)
data = duck_array_ops.where(mask, fill_value, data)
else:
# array cannot be indexed along dimensions of size 0, so just
# build the mask directly instead.
mask = indexing.create_mask(indexer, self.shape)
data = np.broadcast_to(fill_value, getattr(mask, 'shape', ()))

if new_order:
data = np.moveaxis(data, range(len(new_order)), new_order)
return self._finalize_indexing_result(dims, data)

def __setitem__(self, key, value):
"""__setitem__ is overloaded to access the underlying numpy values with
orthogonal indexing.
Expand All @@ -652,7 +691,8 @@ def __setitem__(self, key, value):
(Ellipsis,)]
value = np.moveaxis(value, new_order, range(len(new_order)))

self._indexable_data[index_tuple] = value
indexable = as_indexable(self._data)
indexable[index_tuple] = value

@property
def attrs(self):
Expand Down Expand Up @@ -1463,14 +1503,12 @@ def chunk(self, chunks=None, name=None, lock=False):
# Dummy - do not chunk. This method is invoked e.g. by Dataset.chunk()
return self.copy(deep=False)

def __getitem__(self, key):
dims, index_tuple, new_order = self._broadcast_indexes(key)
values = self._indexable_data[index_tuple]
if getattr(values, 'ndim', 0) != 1:
def _finalize_indexing_result(self, dims, data):
if getattr(data, 'ndim', 0) != 1:
# returns Variable rather than IndexVariable if multi-dimensional
return Variable(dims, values, self._attrs, self._encoding)
return Variable(dims, data, self._attrs, self._encoding)
else:
return type(self)(dims, values, self._attrs,
return type(self)(dims, data, self._attrs,
self._encoding, fastpath=True)

def __setitem__(self, key, value):
Expand Down
76 changes: 76 additions & 0 deletions xarray/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,3 +385,79 @@ def nonzero(x):
actual = indexing._outer_to_numpy_indexer(outer_index, v.shape)
actual_data = v.data[actual]
np.testing.assert_array_equal(actual_data, expected_data)


def test_create_mask_outer_indexer():
indexer = indexing.OuterIndexer((np.array([0, -1, 2]),))
expected = np.array([False, True, False])
actual = indexing.create_mask(indexer, (5,))
np.testing.assert_array_equal(expected, actual)

indexer = indexing.OuterIndexer((1, slice(2), np.array([0, -1, 2]),))
expected = np.array(2 * [[False, True, False]])
actual = indexing.create_mask(indexer, (5, 5, 5,))
np.testing.assert_array_equal(expected, actual)


def test_create_mask_vectorized_indexer():
indexer = indexing.VectorizedIndexer(
(np.array([0, -1, 2]), np.array([0, 1, -1])))
expected = np.array([False, True, True])
actual = indexing.create_mask(indexer, (5,))
np.testing.assert_array_equal(expected, actual)

indexer = indexing.VectorizedIndexer(
(np.array([0, -1, 2]), slice(None), np.array([0, 1, -1])))
expected = np.array([[False, True, True]] * 2).T
actual = indexing.create_mask(indexer, (5, 2))
np.testing.assert_array_equal(expected, actual)


def test_create_mask_basic_indexer():
indexer = indexing.BasicIndexer((-1,))
actual = indexing.create_mask(indexer, (3,))
np.testing.assert_array_equal(True, actual)

indexer = indexing.BasicIndexer((0,))
actual = indexing.create_mask(indexer, (3,))
np.testing.assert_array_equal(False, actual)


def test_create_mask_dask():
da = pytest.importorskip('dask.array')

indexer = indexing.OuterIndexer((1, slice(2), np.array([0, -1, 2]),))
expected = np.array(2 * [[False, True, False]])
actual = indexing.create_mask(indexer, (5, 5, 5,),
chunks_hint=((1, 1), (2, 1)))
assert actual.chunks == ((1, 1), (2, 1))
np.testing.assert_array_equal(expected, actual)

indexer = indexing.VectorizedIndexer(
(np.array([0, -1, 2]), slice(None), np.array([0, 1, -1])))
expected = np.array([[False, True, True]] * 2).T
actual = indexing.create_mask(indexer, (5, 2), chunks_hint=((3,), (2,)))
assert isinstance(actual, da.Array)
np.testing.assert_array_equal(expected, actual)

with pytest.raises(ValueError):
indexing.create_mask(indexer, (5, 2), chunks_hint=())


def test_create_mask_error():
with raises_regex(TypeError, 'unexpected key type'):
indexing.create_mask((1, 2), (3, 4))


@pytest.mark.parametrize('indices, expected', [
(np.arange(5), np.arange(5)),
(np.array([0, -1, -1]), np.array([0, 0, 0])),
(np.array([-1, 1, -1]), np.array([1, 1, 1])),
(np.array([-1, -1, 2]), np.array([2, 2, 2])),
(np.array([-1]), np.array([0])),
(np.array([0, -1, 1, -1, -1]), np.array([0, 0, 1, 1, 1])),
(np.array([0, -1, -1, -1, 1]), np.array([0, 0, 0, 0, 1])),
])
def test_posify_mask_subindexer(indices, expected):
actual = indexing._posify_mask_subindexer(indices)
np.testing.assert_array_equal(expected, actual)
Loading