From 4cd2c4b41312d6dea2ca57a0fbd23a46bc6d2db0 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Mon, 5 Dec 2016 11:57:50 +0100 Subject: [PATCH 1/3] Add drop=True argument to isel, sel and squeeze Fixes GH242 This is useful for getting rid of extraneous scalar variables that arise from indexing, and in particular will resolve an issue for optional indexes: https://github.com/pydata/xarray/pull/1017#issuecomment-260777664 --- doc/whats-new.rst | 6 ++++++ xarray/core/common.py | 38 ++++++++++++++++++++--------------- xarray/core/dataarray.py | 9 +++++---- xarray/core/dataset.py | 20 +++++++++++++----- xarray/core/variable.py | 26 ++++++++++++++++++++++-- xarray/test/test_dataarray.py | 30 +++++++++++++++++++++++++++ xarray/test/test_dataset.py | 30 +++++++++++++++++++++++++++ 7 files changed, 132 insertions(+), 27 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 2f3a3d4eae4..9e65f4ce21a 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -101,6 +101,12 @@ Enhancements providing consistent access to dimension length on both ``Dataset`` and ``DataArray`` (:issue:`921`). By `Stephan Hoyer `_. +- New keyword argument ``drop=True`` for :py:meth:`~DataArray.sel`, + :py:meth:`~DataArray.isel` and :py:meth:`~DataArray.squeeze` for dropping + scalar coordinates that arise from indexing + (:issue:`242`). + By `Stephan Hoyer `_. + - New top-level functions :py:func:`~xarray.full_like`, :py:func:`~xarray.zeros_like`, and :py:func:`~xarray.ones_like` By `Guido Imperiale `_. diff --git a/xarray/core/common.py b/xarray/core/common.py index 280aaf03c75..0e743a9234f 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -249,10 +249,24 @@ def __dir__(self): return sorted(set(dir(type(self)) + extra_attrs)) -class SharedMethodsMixin(object): - """Shared methods for Dataset, DataArray and Variable.""" +def get_squeeze_dims(xarray_obj, dim): + """Get a list of dimensions to squeeze out. 
+ """ + if dim is None: + dim = [d for d, s in xarray_obj.sizes.items() if s == 1] + else: + if isinstance(dim, basestring): + dim = [dim] + if any(xarray_obj.sizes[k] > 1 for k in dim): + raise ValueError('cannot select a dimension to squeeze out ' + 'which has length greater than one') + return dim + + +class BaseDataObject(AttrAccessMixin): + """Shared base class for Dataset and DataArray.""" - def squeeze(self, dim=None): + def squeeze(self, dim=None, drop=False): """Return a new object with squeezed data. Parameters @@ -261,6 +275,9 @@ def squeeze(self, dim=None): Selects a subset of the length one dimensions. If a dimension is selected with length greater than one, an error is raised. If None, all length one dimensions are squeezed. + drop : bool, optional + If ``drop=True``, drop squeezed coordinates instead of making them + scalar. Returns ------- @@ -272,19 +289,8 @@ def squeeze(self, dim=None): -------- numpy.squeeze """ - if dim is None: - dim = [d for d, s in self.sizes.items() if s == 1] - else: - if isinstance(dim, basestring): - dim = [dim] - if any(self.sizes[k] > 1 for k in dim): - raise ValueError('cannot select a dimension to squeeze out ' - 'which has length greater than one') - return self.isel(**{d: 0 for d in dim}) - - -class BaseDataObject(SharedMethodsMixin, AttrAccessMixin): - """Shared base class for Dataset and DataArray.""" + dims = get_squeeze_dims(self, dim) + return self.isel(drop=drop, **{d: 0 for d in dims}) def _calc_assign_results(self, kwargs): results = SortedKeysDict() diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 7838ade7ae1..f6e7c872f4b 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -640,7 +640,7 @@ def chunk(self, chunks=None): ds = self._to_temp_dataset().chunk(chunks) return self._from_temp_dataset(ds) - def isel(self, **indexers): + def isel(self, drop=False, **indexers): """Return a new DataArray whose dataset is given by integer indexing along the specified 
dimension(s). @@ -649,10 +649,10 @@ def isel(self, **indexers): Dataset.isel DataArray.sel """ - ds = self._to_temp_dataset().isel(**indexers) + ds = self._to_temp_dataset().isel(drop=drop, **indexers) return self._from_temp_dataset(ds) - def sel(self, method=None, tolerance=None, **indexers): + def sel(self, method=None, tolerance=None, drop=False, **indexers): """Return a new DataArray whose dataset is given by selecting index labels along the specified dimension(s). @@ -664,7 +664,8 @@ def sel(self, method=None, tolerance=None, **indexers): pos_indexers, new_indexes = indexing.remap_label_indexers( self, indexers, method=method, tolerance=tolerance ) - return self.isel(**pos_indexers)._replace_indexes(new_indexes) + result = self.isel(drop=drop, **pos_indexers) + return result._replace_indexes(new_indexes) def isel_points(self, dim='points', **indexers): """Return a new DataArray whose dataset is given by pointwise integer diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index b3bde8f7377..917fce2e680 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -892,7 +892,7 @@ def maybe_chunk(name, var, chunks): for k, v in self.variables.items()]) return self._replace_vars_and_dims(variables) - def isel(self, **indexers): + def isel(self, drop=False, **indexers): """Returns a new dataset with each array indexed along the specified dimension(s). @@ -902,6 +902,9 @@ def isel(self, **indexers): Parameters ---------- + drop : bool, optional + If ``drop=True``, drop coordinate variables indexed by integers + instead of making them scalar. **indexers : {dim: indexer, ...} Keyword arguments with names matching dimensions and values given by integers, slice objects or arrays. 
@@ -935,10 +938,13 @@ def isel(self, **indexers): variables = OrderedDict() for name, var in iteritems(self._variables): var_indexers = dict((k, v) for k, v in indexers if k in var.dims) - variables[name] = var.isel(**var_indexers) - return self._replace_vars_and_dims(variables) + new_var = var.isel(**var_indexers) + if not (drop and name in var_indexers): + variables[name] = new_var + coord_names = set(self._coord_names) & set(variables) + return self._replace_vars_and_dims(variables, coord_names=coord_names) - def sel(self, method=None, tolerance=None, **indexers): + def sel(self, method=None, tolerance=None, drop=False, **indexers): """Returns a new dataset with each array indexed by tick labels along the specified dimension(s). @@ -969,6 +975,9 @@ def sel(self, method=None, tolerance=None, **indexers): matches. The values of the index at the matching locations most satisfy the equation ``abs(index[indexer] - target) <= tolerance``. Requires pandas>=0.17. + drop : bool, optional + If ``drop=True``, drop coordinate variables in `indexers` instead + of making them scalar. **indexers : {dim: indexer, ...} Keyword arguments with names matching dimensions and values given by scalars, slices or arrays of tick labels. 
For dimensions with @@ -994,7 +1003,8 @@ def sel(self, method=None, tolerance=None, **indexers): pos_indexers, new_indexes = indexing.remap_label_indexers( self, indexers, method=method, tolerance=tolerance ) - return self.isel(**pos_indexers)._replace_indexes(new_indexes) + result = self.isel(drop=drop, **pos_indexers) + return result._replace_indexes(new_indexes) def isel_points(self, dim='points', **indexers): """Returns a new dataset with each array indexed pointwise along the diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 97a75019c25..9501a32ebcc 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -196,8 +196,7 @@ def _as_array_or_item(data): return data -class Variable(common.AbstractArray, common.SharedMethodsMixin, - utils.NdimSizeLenMixin): +class Variable(common.AbstractArray, utils.NdimSizeLenMixin): """A netcdf-like variable consisting of dimensions, data and attributes which describe a single Array. A single Variable object is not fully @@ -553,6 +552,29 @@ def isel(self, **indexers): key[i] = indexers[dim] return self[tuple(key)] + def squeeze(self, dim=None): + """Return a new object with squeezed data. + + Parameters + ---------- + dim : None or str or tuple of str, optional + Selects a subset of the length one dimensions. If a dimension is + selected with length greater than one, an error is raised. If + None, all length one dimensions are squeezed. + + Returns + ------- + squeezed : same type as caller + This object, but with all or a subset of the dimensions of + length 1 removed. 
+ + See Also + -------- + numpy.squeeze + """ + dims = common.get_squeeze_dims(self, dim) + return self.isel(**{d: 0 for d in dims}) + def _shift_one_dim(self, dim, count): axis = self.get_axis_num(dim) diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py index 9c2c7a1fa32..195c9dfc550 100644 --- a/xarray/test/test_dataarray.py +++ b/xarray/test/test_dataarray.py @@ -498,6 +498,26 @@ def test_sel_method(self): with self.assertRaisesRegexp(NotImplementedError, 'tolerance'): data.sel(x=[0.9, 1.9], method='backfill', tolerance=1) + def test_sel_drop(self): + data = DataArray([1, 2, 3], [('x', [0, 1, 2])]) + expected = DataArray(1) + selected = data.sel(x=0, drop=True) + self.assertDataArrayIdentical(expected, selected) + + expected = DataArray(1, {'x': 0}) + selected = data.sel(x=0, drop=False) + self.assertDataArrayIdentical(expected, selected) + + def test_isel_drop(self): + data = DataArray([1, 2, 3], [('x', [0, 1, 2])]) + expected = DataArray(1) + selected = data.isel(x=0, drop=True) + self.assertDataArrayIdentical(expected, selected) + + expected = DataArray(1, {'x': 0}) + selected = data.isel(x=0, drop=False) + self.assertDataArrayIdentical(expected, selected) + def test_isel_points(self): shape = (10, 5, 6) np_array = np.random.random(shape) @@ -1064,6 +1084,16 @@ def test_transpose(self): def test_squeeze(self): self.assertVariableEqual(self.dv.variable.squeeze(), self.dv.squeeze()) + def test_squeeze_drop(self): + array = DataArray([1], [('x', [0])]) + expected = DataArray(1) + actual = array.squeeze(drop=True) + self.assertDataArrayIdentical(expected, actual) + + expected = DataArray(1, {'x': 0}) + actual = array.squeeze(drop=False) + self.assertDataArrayIdentical(expected, actual) + def test_drop_coordinates(self): expected = DataArray(np.random.randn(2, 3), dims=['x', 'y']) arr = expected.copy() diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index 229b815633e..8ff454b58cf 100644 --- 
a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -815,6 +815,26 @@ def test_sel(self): self.assertDatasetEqual(data.isel(td=slice(1, 3)), data.sel(td=slice('1 days', '2 days'))) + def test_sel_drop(self): + data = Dataset({'foo': ('x', [1, 2, 3])}, {'x': [0, 1, 2]}) + expected = Dataset({'foo': 1}) + selected = data.sel(x=0, drop=True) + self.assertDatasetIdentical(expected, selected) + + expected = Dataset({'foo': 1}, {'x': 0}) + selected = data.sel(x=0, drop=False) + self.assertDatasetIdentical(expected, selected) + + def test_isel_drop(self): + data = Dataset({'foo': ('x', [1, 2, 3])}, {'x': [0, 1, 2]}) + expected = Dataset({'foo': 1}) + selected = data.isel(x=0, drop=True) + self.assertDatasetIdentical(expected, selected) + + expected = Dataset({'foo': 1}, {'x': 0}) + selected = data.isel(x=0, drop=False) + self.assertDatasetIdentical(expected, selected) + def test_isel_points(self): data = create_test_data() @@ -1750,6 +1770,16 @@ def get_args(v): with self.assertRaisesRegexp(ValueError, 'cannot select a dimension'): data.squeeze('y') + def test_squeeze_drop(self): + data = Dataset({'foo': ('x', [1])}, {'x': [0]}) + expected = Dataset({'foo': 1}) + selected = data.squeeze(drop=True) + self.assertDatasetIdentical(expected, selected) + + expected = Dataset({'foo': 1}, {'x': 0}) + selected = data.squeeze(drop=False) + self.assertDatasetIdentical(expected, selected) + def test_groupby(self): data = Dataset({'z': (['x', 'y'], np.random.randn(3, 5))}, {'x': ('x', list('abc')), From 7565886ff009069a75664ee3856d9c30030f6b1d Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Tue, 6 Dec 2016 15:37:15 +0100 Subject: [PATCH 2/3] More tests for Dataset.squeeze(drop=True) --- xarray/test/test_dataset.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index 8ff454b58cf..db7df46f927 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -1780,6 +1780,19 @@ def 
test_squeeze_drop(self): selected = data.squeeze(drop=False) self.assertDatasetIdentical(expected, selected) + data = Dataset({'foo': (('x', 'y'), [[1]])}, {'x': [0], 'y': [0]}) + expected = Dataset({'foo': 1}) + selected = data.squeeze(drop=True) + self.assertDatasetIdentical(expected, selected) + + expected = Dataset({'foo': ('x', [1])}, {'x': [0]}) + selected = data.squeeze(dim='y', drop=True) + self.assertDatasetIdentical(expected, selected) + + data = Dataset({'foo': (('x',), [])}, {'x': []}) + selected = data.squeeze(drop=True) + self.assertDatasetIdentical(data, selected) + def test_groupby(self): data = Dataset({'z': (['x', 'y'], np.random.randn(3, 5))}, {'x': ('x', list('abc')), From 881713a1dbb745c5dfc44d0cacb07fdde099568b Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 15 Dec 2016 19:12:44 -0800 Subject: [PATCH 3/3] Add two more tests, for drop=True without coords --- xarray/test/test_dataarray.py | 5 +++++ xarray/test/test_dataset.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py index ba04b06b95d..bd6c5ccff92 100644 --- a/xarray/test/test_dataarray.py +++ b/xarray/test/test_dataarray.py @@ -536,6 +536,11 @@ def test_sel_drop(self): selected = data.sel(x=0, drop=False) self.assertDataArrayIdentical(expected, selected) + data = DataArray([1, 2, 3], dims=['x']) + expected = DataArray(1) + selected = data.sel(x=0, drop=True) + self.assertDataArrayIdentical(expected, selected) + def test_isel_drop(self): data = DataArray([1, 2, 3], [('x', [0, 1, 2])]) expected = DataArray(1) diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index 5d18c21d40a..1a970fe718d 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -834,6 +834,11 @@ def test_sel_drop(self): selected = data.sel(x=0, drop=False) self.assertDatasetIdentical(expected, selected) + data = Dataset({'foo': ('x', [1, 2, 3])}) + expected = Dataset({'foo': 1}) + selected = data.sel(x=0, 
drop=True) + self.assertDatasetIdentical(expected, selected) + def test_isel_drop(self): data = Dataset({'foo': ('x', [1, 2, 3])}, {'x': [0, 1, 2]}) expected = Dataset({'foo': 1})