diff --git a/src/xray/__init__.py b/src/xray/__init__.py index 925c5660880..ec76a3b8ad7 100644 --- a/src/xray/__init__.py +++ b/src/xray/__init__.py @@ -1,4 +1,4 @@ -from .xarray import XArray, broadcast_xarrays +from .xarray import as_xarray, XArray, CoordXArray, broadcast_xarrays from .dataset import Dataset, open_dataset from .dataset_array import DatasetArray, align from .utils import (orthogonal_indexer, decode_cf_datetime, encode_cf_datetime, diff --git a/src/xray/common.py b/src/xray/common.py index ee66e4deef8..ccc4adf5c04 100644 --- a/src/xray/common.py +++ b/src/xray/common.py @@ -15,25 +15,6 @@ def func(self, dimension=cls._reduce_dimension_default, class AbstractArray(ImplementsReduce): - @property - def dtype(self): - return self._data.dtype - - @property - def shape(self): - return self._data.shape - - @property - def size(self): - return self._data.size - - @property - def ndim(self): - return self._data.ndim - - def __len__(self): - return len(self._data) - def __nonzero__(self): return bool(self.data) diff --git a/src/xray/conventions.py b/src/xray/conventions.py index 36fbb5027cb..4d0017d9a49 100644 --- a/src/xray/conventions.py +++ b/src/xray/conventions.py @@ -261,14 +261,11 @@ def encode_cf_variable(array): attributes['units'] = units attributes['calendar'] = calendar elif data.dtype == np.dtype('O'): - # Unfortunately, pandas.Index arrays often have dtype=object even if - # they were created from an array with a sensible datatype (e.g., - # pandas.Float64Index always has dtype=object for some reason). Because - # we allow for doing math with coordinates, these object arrays can - # propagate onward to other variables, which is why we don't only apply - # this check to XArrays with data that is a pandas.Index. - # Accordingly, we convert object arrays to the type of their first - # variable. 
+ # Occasionally, one will end up with variables with dtype=object + # (likely because they were created from pandas objects which don't + # maintain dtype carefully). This code makes a best effort attempt to + # encode them into a dtype that NETCDF can handle by inspecting the + # dtype of the first element. dtype = np.array(data.reshape(-1)[0]).dtype # N.B. the "astype" call below will fail if data cannot be cast to the # type of its first element (which is probably the only sensible thing diff --git a/src/xray/dataset.py b/src/xray/dataset.py index 41a00fdf7e0..b86c486b0c2 100644 --- a/src/xray/dataset.py +++ b/src/xray/dataset.py @@ -53,7 +53,8 @@ class _VariablesDict(OrderedDict): """ def _datetimeindices(self): return [k for k, v in self.iteritems() - if isinstance(v._data, pd.DatetimeIndex)] + if np.issubdtype(v.dtype, np.datetime64) + and isinstance(v.index, pd.DatetimeIndex)] @property def virtual(self): @@ -76,10 +77,10 @@ def _get_virtual_variable(self, key): if ref_var in self._datetimeindices(): if suffix == 'season': # seasons = np.array(['DJF', 'MAM', 'JJA', 'SON']) - month = self[ref_var].data.month + month = self[ref_var].index.month data = (month // 3) % 4 + 1 else: - data = getattr(self[ref_var].data, suffix) + data = getattr(self[ref_var].index, suffix) return xarray.XArray(self[ref_var].dimensions, data) raise KeyError('virtual variable %r not found' % key) @@ -130,14 +131,15 @@ def __init__(self, variables=None, attributes=None, decode_cf=False): def _as_variable(self, name, var, decode_cf=False): if isinstance(var, DatasetArray): - var = var.array - if not isinstance(var, xarray.XArray): + var = xarray.as_xarray(var) + elif not isinstance(var, xarray.XArray): try: var = xarray.XArray(*var) except TypeError: raise TypeError('Dataset variables must be of type ' 'DatasetArray or XArray, or a sequence of the ' - 'form (dimensions, data[, attributes])') + 'form (dimensions, data[, attributes, ' + 'encoding])') # this will unmask and rescale the data 
as well as convert # time variables to datetime indices. if decode_cf: @@ -147,9 +149,7 @@ def _as_variable(self, name, var, decode_cf=False): if var.ndim != 1: raise ValueError('a coordinate variable must be defined with ' '1-dimensional data') - # create a new XArray object on which to modify the data - var = xarray.XArray(var.dimensions, pd.Index(var.data), - var.attributes, encoding=var.encoding) + var = var.to_coord() return var def set_variables(self, variables, decode_cf=False): @@ -487,7 +487,7 @@ def labeled_by(self, **indexers): Dataset.indexed_by Array.indexed_by """ - return self.indexed_by(**remap_loc_indexers(self.variables, indexers)) + return self.indexed_by(**remap_loc_indexers(self, indexers)) def renamed(self, name_dict): """Returns a new object with renamed variables and dimensions. @@ -625,7 +625,8 @@ def unselect(self, *names): New dataset based on this dataset. Only the named variables are removed. """ - if any(k not in self.variables for k in names): + if any(k not in self.variables and k not in self.virtual_variables + for k in names): raise ValueError('One or more of the specified variable ' 'names does not exist on this dataset') drop = set(names) diff --git a/src/xray/dataset_array.py b/src/xray/dataset_array.py index 673dd123040..6ae264ea572 100644 --- a/src/xray/dataset_array.py +++ b/src/xray/dataset_array.py @@ -66,28 +66,50 @@ def __init__(self, dataset, focus): self.focus = focus @property - def array(self): + def variable(self): return self.dataset.variables[self.focus] - @array.setter - def array(self, value): + @variable.setter + def variable(self, value): self.dataset[self.focus] = value - # _data is necessary for AbstractArray @property - def _data(self): - return self.array._data + def dtype(self): + return self.variable.dtype + + @property + def shape(self): + return self.variable.shape + + @property + def size(self): + return self.variable.size + + @property + def ndim(self): + return self.variable.ndim + + def 
__len__(self): + return len(self.variable) @property def data(self): - """The array's data as a numpy.ndarray""" - return self.array.data + """The variable's data as a numpy.ndarray""" + return self.variable.data @data.setter def data(self, value): - self.array.data = value + self.variable.data = value + + @property + def index(self): + """The variable's data as a pandas.Index""" + return self.variable.index + + def is_coord(self): + return isinstance(self.variable, xarray.CoordXArray) @property def dimensions(self): - return self.array.dimensions + return self.variable.dimensions def _key_to_indexers(self, key): return OrderedDict( @@ -107,7 +129,7 @@ def __setitem__(self, key, value): self.dataset[key] = value else: # orthogonal array indexing - self.array[key] = value + self.variable[key] = value def __delitem__(self, key): del self.dataset[key] @@ -127,11 +149,11 @@ def __iter__(self): @property def attributes(self): - return self.array.attributes + return self.variable.attributes @property def encoding(self): - return self.array.encoding + return self.variable.encoding @property def variables(self): @@ -175,10 +197,11 @@ def indexed_by(self, **indexers): Dataset.indexed_by """ ds = self.dataset.indexed_by(**indexers) - if self.focus not in ds: + if self.focus not in ds and self.focus in self.dataset: # always keep focus variable in the dataset, even if it was # unselected because indexing made it a scaler - ds[self.focus] = self.array.indexed_by(**indexers) + # don't add back in virtual variables (not found in the dataset) + ds[self.focus] = self.variable.indexed_by(**indexers) return type(self)(ds, self.focus) def labeled_by(self, **indexers): @@ -236,13 +259,8 @@ def refocus(self, new_var, name=None): If `new_var` is a dataset array, its contents will be merged in. 
""" if not hasattr(new_var, 'dimensions'): - new_var = type(self.array)(self.array.dimensions, new_var) - if self.focus not in self.dimensions: - # only unselect the focus from the dataset if it isn't a coordinate - # variable - ds = self.unselected() - else: - ds = self.dataset + new_var = type(self.variable)(self.variable.dimensions, new_var) + ds = self.dataset.copy() if self.is_coord() else self.unselected() if name is None: name = self.focus + '_' ds[name] = new_var @@ -301,7 +319,7 @@ def transpose(self, *dimensions): numpy.transpose Array.transpose """ - return self.refocus(self.array.transpose(*dimensions), self.focus) + return self.refocus(self.variable.transpose(*dimensions), self.focus) def squeeze(self, dimension=None): """Return a new DatasetArray object with squeezed data. @@ -361,7 +379,7 @@ def reduce(self, func, dimension=None, axis=None, **kwargs): DatasetArray with this object's array replaced with an array with summarized data and the indicated dimension(s) removed. """ - var = self.array.reduce(func, dimension, axis, **kwargs) + var = self.variable.reduce(func, dimension, axis, **kwargs) drop = set(self.dimensions) - set(var.dimensions) # For now, take an aggressive strategy of removing all variables # associated with any dropped dimensions @@ -495,13 +513,13 @@ def to_series(self): return pd.Series(self.data.reshape(-1), index=index, name=self.focus) def __array_wrap__(self, obj, context=None): - return self.refocus(self.array.__array_wrap__(obj, context)) + return self.refocus(self.variable.__array_wrap__(obj, context)) @staticmethod def _unary_op(f): @functools.wraps(f) def func(self, *args, **kwargs): - return self.refocus(f(self.array, *args, **kwargs), + return self.refocus(f(self.variable, *args, **kwargs), self.focus + '_' + f.__name__) return func @@ -520,15 +538,15 @@ def func(self, other): # TODO: automatically group by other variable dimensions to allow # for broadcasting dimensions like 'dayofyear' against 'time' 
self._check_coordinates_compat(other) - ds = self.unselected() + ds = self.dataset.copy() if self.is_coord() else self.unselected() if hasattr(other, 'unselected'): ds.merge(other.unselected(), inplace=True) - other_array = getattr(other, 'array', other) + other_array = getattr(other, 'variable', other) other_focus = getattr(other, 'focus', 'other') focus = self.focus + '_' + f.__name__ + '_' + other_focus - ds[focus] = (f(self.array, other_array) + ds[focus] = (f(self.variable, other_array) if not reflexive - else f(other_array, self.array)) + else f(other_array, self.variable)) return type(self)(ds, focus) return func @@ -537,8 +555,8 @@ def _inplace_binary_op(f): @functools.wraps(f) def func(self, other): self._check_coordinates_compat(other) - other_array = getattr(other, 'array', other) - self.array = f(self.array, other_array) + other_array = getattr(other, 'variable', other) + self.variable = f(self.variable, other_array) if hasattr(other, 'unselected'): self.dataset.merge(other.unselected(), inplace=True) return self @@ -555,8 +573,9 @@ def align(array1, array2): # TODO: automatically align when doing math with arrays, or better yet # calculate the union of the indices and fill in the mis-aligned data with # NaN. 
- overlapping_coords = {k: (array1.coordinates[k].data - & array2.coordinates[k].data) + # TODO: generalize this function to any number of arguments + overlapping_coords = {k: (array1.coordinates[k].index + & array2.coordinates[k].index) for k in array1.coordinates if k in array2.coordinates} return tuple(ar.labeled_by(**overlapping_coords) diff --git a/src/xray/utils.py b/src/xray/utils.py index c5bdf3e75fc..23200c246f4 100644 --- a/src/xray/utils.py +++ b/src/xray/utils.py @@ -6,6 +6,8 @@ import numpy as np import pandas as pd +import xarray + def expanded_indexer(key, ndim): """Given a key for indexing an ndarray, return an equivalent key which is a @@ -86,12 +88,12 @@ def all_full_slices(key_index): def remap_loc_indexers(indices, indexers): - """Given mappings of indices and label based indexers, return equivalent - location based indexers. + """Given mappings of XArray indices and label based indexers, return + equivalent location based indexers. """ new_indexers = OrderedDict() for dim, loc in indexers.iteritems(): - index = indices[dim].data + index = indices[dim].index if isinstance(loc, slice): indexer = index.slice_indexer(loc.start, loc.stop, loc.step) else: @@ -201,11 +203,12 @@ def encode_cf_datetime(dates, units=None, calendar=None): and np.issubdtype(dates.dtype, np.datetime64)): # for now, don't bother doing any trickery like decode_cf_datetime to # convert dates to numbers faster - dates = dates.astype(datetime) + # TODO: don't use pandas.DatetimeIndex to do the conversion + dates = pd.Index(dates.reshape(-1)).to_pydatetime().reshape(dates.shape) if hasattr(dates, 'ndim') and dates.ndim == 0: - # unpack dates because date2num doesn't like 0-dimensional arguments - dates = dates[()] + # date2num doesn't like 0-dimensional arguments + dates = dates.item() num = nc4.date2num(dates, units, calendar) return (num, units, calendar) @@ -235,33 +238,40 @@ def xarray_equal(v1, v2, rtol=1e-05, atol=1e-08): This function is necessary because `v1 == v2` for 
XArrays and DatasetArrays does element-wise comparisions (like numpy.ndarrays). """ + v1, v2 = map(xarray.as_xarray, [v1, v2]) if (v1.dimensions == v2.dimensions - and dict_equal(v1.attributes, v2.attributes)): - try: + and dict_equal(v1.attributes, v2.attributes)): + if v1._data is v2._data: # if _data is identical, skip checking arrays by value - if v1._data is v2._data: - return True - except AttributeError: - # _data is not part of the public interface, so it's okay if its - # missing - pass - - def is_floating(arr): - return np.issubdtype(arr.dtype, float) - - data1 = v1.data - data2 = v2.data - if hasattr(data1, 'equals'): - # handle pandas.Index objects - return data1.equals(data2) - elif is_floating(data1) or is_floating(data2): - return allclose_or_equiv(data1, data2, rtol=rtol, atol=atol) + return True else: - return np.array_equal(data1, data2) + def is_floating(arr): + return np.issubdtype(arr.dtype, float) + + data1 = v1.data + data2 = v2.data + if is_floating(data1) or is_floating(data2): + return allclose_or_equiv(data1, data2, rtol=rtol, atol=atol) + else: + return np.array_equal(data1, data2) else: return False +def safe_cast_to_index(array): + """Given an array, safely cast it to a pandas.Index + + Unlike pandas.Index, if the array has dtype=object or dtype=timedelta64, + this function will not attempt to do automatic type conversion but will + always return an index with dtype=object. + """ + kwargs = {} + if isinstance(array, np.ndarray): + if array.dtype == object or array.dtype == np.timedelta64: + kwargs['dtype'] = object + return pd.Index(array, **kwargs) + + def update_safety_check(first_dict, second_dict, compat=operator.eq): """Check the safety of updating one dictionary with another. 
diff --git a/src/xray/xarray.py b/src/xray/xarray.py index ed31775a43f..78cc1068ebb 100644 --- a/src/xray/xarray.py +++ b/src/xray/xarray.py @@ -1,5 +1,6 @@ import functools import numpy as np +import pandas as pd from itertools import izip from collections import OrderedDict @@ -14,6 +15,25 @@ from common import AbstractArray +def as_xarray(array): + """Convert an object into an XArray + + If the object is a DatasetArray or already an XArray, the existing XArray + object is returned. Otherwise, the object is converted into a new XArray + based on its 'dimensions' and 'data' attributes. + """ + # TODO: consider extending this method to automatically handle Iris and + # pandas objects. + if hasattr(array, 'variable'): + # extract the focus XArray from DatasetArrays + array = array.variable + if not isinstance(array, XArray): + array = XArray(array.dimensions, array.data, + getattr(array, 'attributes', None), + getattr(array, 'encoding', None)) + return array + + def _as_compatible_data(data): """If data does not have the necessary attributes to be the private _data attribute, convert it to a np.ndarray and raise an warning @@ -21,7 +41,8 @@ def _as_compatible_data(data): # don't check for __len__ or __iter__ so as not to warn if data is a numpy # numeric type like np.float32 required = ['dtype', 'shape', 'size', 'ndim'] - if not all(hasattr(data, attr) for attr in required): + if (not all(hasattr(data, attr) for attr in required) + or isinstance(data, np.string_)): data = np.asarray(data) elif isinstance(data, AbstractArray): # we don't want nested Array objects @@ -77,28 +98,73 @@ def __init__(self, dims, data, attributes=None, encoding=None, self.encoding = dict({} if encoding is None else encoding) self._indexing_mode = indexing_mode + @property + def dtype(self): + return self._data.dtype + + @property + def shape(self): + return self._data.shape + + @property + def size(self): + return self._data.size + + @property + def ndim(self): + return self._data.ndim + + 
def __len__(self): + return len(self._data) + + def _data_as_ndarray(self): + if isinstance(self._data, pd.Index): + # pandas does automatic type conversion when an index is accessed + # like index[...], so use index.values instead + data = self._data.values + else: + data = np.asarray(self._data[...]) + return data + @property def data(self): """The variable's data as a numpy.ndarray""" - if not isinstance(self._data, (np.ndarray, np.string_)): - self._data = np.asarray(self._data[...]) - self._indexing_mode = 'numpy' + self._data = self._data_as_ndarray() + self._indexing_mode = 'numpy' data = self._data if data.ndim == 0 and data.dtype.kind == 'O': # unpack 0d object arrays to be consistent with numpy - data = data[()] + data = data.item() return data @data.setter def data(self, value): - # allow any array to support pandas.Index objects - value = np.asanyarray(value) + value = np.asarray(value) if value.shape != self.shape: - raise ValueError("replacement data must match the Array's " - "shape") + raise ValueError("replacement data must match the XArray's shape") self._data = value self._indexing_mode = 'numpy' + @property + def index(self): + """The variable's data as a pandas.Index""" + if self.ndim != 1: + raise ValueError('can only access 1-d arrays as an index') + if isinstance(self._data, pd.Index): + index = self._data + else: + # TODO: add some logic to set dtype=object in some cases where + # pandas won't otherwise create a faithful index (e.g., for + # dtype=np.timedelta64 and arrays of datetime objects) + index = utils.safe_cast_to_index(self.data) + return index + + def to_coord(self): + """Return this array as a CoordXArray""" + return CoordXArray(self.dimensions, self._data, self.attributes, + encoding=self.encoding, + indexing_mode=self._indexing_mode, dtype=self.dtype) + @property def dimensions(self): """Tuple of dimension names with which this array is associated. 
@@ -124,6 +190,17 @@ def _convert_indexer(self, key, indexing_mode=None): key = utils.orthogonal_indexer(key, self.shape) return key + def _get_data(self, key): + """Internal method for getting data from _data, given a key already + converted to a suitable type (via _convert_indexer)""" + if len(key) == 1: + # unpack key so it can index a pandas.Index object (pandas.Index + # objects don't like tuples) + key, = key + # do integer based indexing if supported by _data (i.e., if _data is + # a pandas object) + return getattr(self._data, 'iloc', self._data)[key] + def __getitem__(self, key): """Return a new Array object whose contents are consistent with getting the provided key from the underlying data. @@ -144,21 +221,14 @@ def __getitem__(self, key): key = self._convert_indexer(key) dimensions = [dim for k, dim in zip(key, self.dimensions) if not isinstance(k, int)] - if len(key) == 1: - # unpack key so it can index a pandas.Index object (pandas.Index - # objects don't like tuples) - key, = key - # do location based indexing if supported by _data - new_data = getattr(self._data, 'iloc', self._data)[key] + data = self._get_data(key) # orthogonal indexing should ensure the dimensionality is consistent - if hasattr(new_data, 'ndim'): - assert new_data.ndim == len(dimensions) + if hasattr(data, 'ndim'): + assert data.ndim == len(dimensions) else: assert len(dimensions) == 0 - # return a variable with the same indexing_mode, because data should - # still be the same type as _data - return type(self)(dimensions, new_data, self.attributes, - self.encoding, self._indexing_mode) + # don't keep indexing_mode, because data should now be an ndarray + return type(self)(dimensions, data, self.attributes, self.encoding) def __setitem__(self, key, value): """__setitem__ is overloaded to access the underlying numpy data with @@ -186,7 +256,7 @@ def copy(self): def _copy(self, deepcopy=False): # np.array always makes a copy - data = np.array(self._data) if deepcopy else self.data + 
data = np.array(self.data) if deepcopy else self.data # note: # dimensions is already an immutable tuple # attributes will be copied when the new Array is created @@ -527,14 +597,14 @@ def concat(cls, variables, dimension='stacked_dimension', return concatenated def __array_wrap__(self, obj, context=None): - return type(self)(self.dimensions, obj, self.attributes) + return XArray(self.dimensions, obj, self.attributes) @staticmethod def _unary_op(f): @functools.wraps(f) def func(self, *args, **kwargs): - return type(self)(self.dimensions, f(self.data, *args, **kwargs), - _math_safe_attributes(self.attributes)) + return XArray(self.dimensions, f(self.data, *args, **kwargs), + _math_safe_attributes(self.attributes)) return func @staticmethod @@ -552,7 +622,7 @@ def func(self, other): if hasattr(other, 'attributes'): new_attr = utils.ordered_dict_intersection( new_attr, _math_safe_attributes(other.attributes)) - return type(self)(dims, new_data, new_attr) + return XArray(dims, new_data, new_attr) return func @staticmethod @@ -573,13 +643,76 @@ def func(self, other): ops.inject_special_operations(XArray) +class CoordXArray(XArray): + """Subclass of XArray which caches its data as a pandas.Index instead of + a numpy.ndarray + + CoordXArrays must always be 1-dimensional. + """ + def __init__(self, dims, data, attributes=None, encoding=None, + indexing_mode='numpy', dtype=None): + """ + Parameters + ---------- + dtype : np.dtype, optional + Numpy dtype for the values in data. It is useful to keep track of + this separately because data converted into a pandas.Index does not + necessarily faithfully maintain the data type (many types are + converted into object arrays). 
+ """ + super(CoordXArray, self).__init__(dims, data, attributes, encoding, + indexing_mode) + if self.ndim != 1: + raise ValueError('%s objects must be 1-dimensional' % + type(self).__name__) + if dtype is None: + dtype = self._data.dtype + self._dtype = dtype + + @property + def dtype(self): + return self._dtype + + @property + def data(self): + """The variable's data as a numpy.ndarray""" + data = self._data_as_ndarray().astype(self.dtype) + if not isinstance(self._data, pd.Index): + # always cache data as a pandas index + self._data = utils.safe_cast_to_index(data) + self._indexing_mode = 'numpy' + return data + + @data.setter + def data(self, value): + raise TypeError('%s data cannot be modified' % type(self).__name__) + + def __getitem__(self, key): + data = self._get_data(self._convert_indexer(key)) + if not hasattr(data, 'ndim') or data.ndim == 0: + data = np.asarray(data).astype(self.dtype) + return XArray((), data, self.attributes, self.encoding) + else: + return type(self)(self.dimensions, data, self.attributes, + self.encoding, dtype=self.dtype) + + def __setitem__(self, key, value): + raise TypeError('%s data cannot be modified' % type(self).__name__) + + def _copy(self, deepcopy=False): + # there is no need to copy the index data here since pandas.Index + # objects are immutable + return type(self)(self.dimensions, self.index, self.attributes, + self.encoding, dtype=self.dtype) + + def _math_safe_attributes(attributes): return OrderedDict((k, v) for k, v in attributes.iteritems() if k not in ['units']) def broadcast_xarrays(first, second): - """Given two XArrays, return two AXrrays with matching dimensions and numpy + """Given two XArrays, return two XArrays with matching dimensions and numpy broadcast compatible data. 
Parameters @@ -620,7 +753,7 @@ def broadcast_xarrays(first, second): # adding second's dimensions at the end first_data = first.data[(Ellipsis,) + (None,) * len(second_only_dims)] new_first = XArray(dimensions, first_data, first.attributes, - first.encoding) + first.encoding) # expand and reorder second_data so the dimensions line up first_only_dims = [d for d in dimensions if d not in second.dimensions] second_dims = list(second.dimensions) + first_only_dims diff --git a/test/test_dataset.py b/test/test_dataset.py index 0fbdde32f7f..cd3c6154b0c 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -82,7 +82,7 @@ def test_coordinate(self): attributes = {'foo': 'bar'} a['x'] = ('x', vec, attributes) self.assertTrue('x' in a.coordinates) - self.assertIsInstance(a.coordinates['x'].data, pd.Index) + self.assertIsInstance(a.coordinates['x'].index, pd.Index) self.assertXArrayEqual(a.coordinates['x'], a.variables['x']) b = Dataset() b['x'] = ('x', vec, attributes) @@ -300,15 +300,32 @@ def test_merge(self): def test_getitem(self): data = create_test_data() - data['time'] = ('time', pd.date_range('2000-01-01', periods=20)) self.assertIsInstance(data['var1'], DatasetArray) self.assertXArrayEqual(data['var1'], data.variables['var1']) self.assertIs(data['var1'].dataset, data) + + def test_virtual_variables(self): # access virtual variables + data = create_test_data() self.assertXArrayEqual(data['time.dayofyear'], XArray('time', 1 + np.arange(20))) self.assertArrayEqual(data['time.month'].data, - data.variables['time'].data.month) + data.variables['time'].index.month) + # test accessing a decoded virtual variable + data.set_variables({'time2': ('time', np.arange(20), + {'units': 'days since 2000-01-01'})}, + decode_cf=True) + self.assertXArrayEqual(data['time2.dayofyear'], + XArray('time', 1 + np.arange(20))) + # test virtual variable math + self.assertArrayEqual(data['time.dayofyear'] + 1, 2 + np.arange(20)) + self.assertArrayEqual(data['time2.dayofyear'] + 1, 2 + 
np.arange(20)) + self.assertArrayEqual(np.sin(data['time.dayofyear']), + np.sin(1 + np.arange(20))) + # test slicing the virtual variable -- it should still be virtual + actual = data['time.dayofyear'][:10].dataset + expected = data.indexed_by(time=slice(10)) + self.assertDatasetEqual(expected, actual) def test_setitem(self): # assign a variable @@ -320,7 +337,7 @@ def test_setitem(self): self.assertEqual(data1, data2) # assign a dataset array dv = 2 * data2['A'] - data1['B'] = dv.array + data1['B'] = dv.variable data2['B'] = dv self.assertEqual(data1, data2) # assign an array @@ -370,9 +387,9 @@ def test_concat(self): data0, data1 = deepcopy(split_data) data1['foo'] = ('bar', np.random.randn(10)) Dataset.concat([data0, data1], 'dim1') - with self.assertRaisesRegexp(ValueError, 'unsafe to merge datasets'): + with self.assertRaisesRegexp(ValueError, 'not equal across datasets'): data0, data1 = deepcopy(split_data) - data1['dim2'] *= 2 + data1['dim2'] = 2 * data1['dim2'] Dataset.concat([data0, data1], 'dim1') def test_to_dataframe(self): @@ -471,7 +488,7 @@ def create_encoded_masked_and_scaled_data(): return Dataset({'x': XArray('t', [-1, -1, 0, 1, 2], attributes)}) -class DatasetIOCases(object): +class DatasetIOTestCases(object): def get_store(self): raise NotImplementedError @@ -510,8 +527,20 @@ def test_roundtrip_example_1_netcdf(self): actual = self.roundtrip(expected) self.assertDatasetEqual(expected, actual) + def test_orthogonal_indexing(self): + in_memory = create_test_data() + on_disk = self.roundtrip(in_memory) + indexers = {'dim1': range(3), 'dim2': range(4), 'dim3': range(5)} + expected = in_memory.indexed_by(**indexers) + actual = on_disk.indexed_by(**indexers) + self.assertDatasetEqual(expected, actual) + # do it twice, to make sure we're switched from orthogonal -> numpy + # when we cached the values + actual = on_disk.indexed_by(**indexers) + self.assertDatasetEqual(expected, actual) + -class NetCDF4DataTest(DatasetIOCases, TestCase): +class 
NetCDF4DataTest(DatasetIOTestCases, TestCase): def get_store(self): f, self.tmp_file = tempfile.mkstemp(suffix='.nc') os.close(f) @@ -639,10 +668,10 @@ def test_0dimensional_variable(self): def test_lazy_decode(self): data = self.roundtrip(create_test_data(), decode_cf=True) - self.assertIsInstance(data['var1']._data, nc4.Variable) + self.assertIsInstance(data['var1'].variable._data, nc4.Variable) -class ScipyDataTest(DatasetIOCases, TestCase): +class ScipyDataTest(DatasetIOTestCases, TestCase): def get_store(self): fobj = StringIO() return backends.ScipyDataStore(fobj, 'w') diff --git a/test/test_dataset_array.py b/test/test_dataset_array.py index 6dc9bd564a2..290048588ca 100644 --- a/test/test_dataset_array.py +++ b/test/test_dataset_array.py @@ -23,7 +23,7 @@ def setUp(self): def test_properties(self): self.assertIs(self.dv.dataset, self.ds) self.assertEqual(self.dv.focus, 'foo') - self.assertXArrayEqual(self.dv.array, self.v) + self.assertXArrayEqual(self.dv.variable, self.v) self.assertArrayEqual(self.dv.data, self.v.data) for attr in ['dimensions', 'dtype', 'shape', 'size', 'ndim', 'attributes']: @@ -43,18 +43,18 @@ def test_items(self): self.assertDSArrayEqual(DatasetArray(self.ds, 'y'), y) # integer indexing I = ReturnItem() - for i in [I[:], I[...], I[x.data], I[x.array], I[x], I[x, y], - I[x.data > -1], I[x.array > -1], I[x > -1], + for i in [I[:], I[...], I[x.data], I[x.variable], I[x], I[x, y], + I[x.data > -1], I[x.variable > -1], I[x > -1], I[x > -1, y > -1]]: self.assertXArrayEqual(self.dv, self.dv[i]) for i in [I[0], I[:, 0], I[:3, :2], - I[x.data[:3]], I[x.array[:3]], I[x[:3]], I[x[:3], y[:4]], - I[x.data > 3], I[x.array > 3], I[x > 3], I[x > 3, y > 3]]: + I[x.data[:3]], I[x.variable[:3]], I[x[:3]], I[x[:3], y[:4]], + I[x.data > 3], I[x.variable > 3], I[x > 3], I[x > 3, y > 3]]: self.assertXArrayEqual(self.v[i], self.dv[i]) # make sure we always keep the array around, even if it's a scalar - self.assertXArrayEqual(self.dv[0, 0], self.dv.array[0, 
0]) + self.assertXArrayEqual(self.dv[0, 0], self.dv.variable[0, 0]) self.assertEqual(self.dv[0, 0].dataset, - Dataset({'foo': self.dv.array[0, 0]})) + Dataset({'foo': self.dv.variable[0, 0]})) def test_indexed_by(self): self.assertEqual(self.dv[0].dataset, self.ds.indexed_by(x=0)) @@ -134,15 +134,22 @@ def test_math(self): with self.assertRaisesRegexp(ValueError, 'not aligned'): b + a + def test_coord_math(self): + ds = Dataset({'x': ('x', 1 + np.arange(3))}) + expected = ds.copy() + expected['x2'] = ('x', np.arange(3)) + actual = ds['x'] - 1 + self.assertDSArrayEquiv(expected['x2'], actual) + def test_item_math(self): self.ds['x'] = ('x', np.array(list('abcdefghij'))) self.assertXArrayEqual(self.dv + self.dv[0, 0], - self.dv + self.dv[0, 0].data) + self.dv + self.dv[0, 0].data) new_data = self.x[0][None, :] + self.x[:, 0][:, None] self.assertXArrayEqual(self.dv[:, 0] + self.dv[0], - XArray(['x', 'y'], new_data)) + XArray(['x', 'y'], new_data)) self.assertXArrayEqual(self.dv[0] + self.dv[:, 0], - XArray(['y', 'x'], new_data.T)) + XArray(['y', 'x'], new_data.T)) def test_inplace_math(self): x = self.x @@ -151,15 +158,16 @@ def test_inplace_math(self): b = a b += 1 self.assertIs(b, a) - self.assertIs(b.array, v) + self.assertIs(b.variable, v) self.assertIs(b.data, x) self.assertIs(b.dataset, self.ds) def test_transpose(self): - self.assertXArrayEqual(self.dv.array.transpose(), self.dv.transpose()) + self.assertXArrayEqual(self.dv.variable.transpose(), + self.dv.transpose()) def test_squeeze(self): - self.assertXArrayEqual(self.dv.array.squeeze(), self.dv.squeeze()) + self.assertXArrayEqual(self.dv.variable.squeeze(), self.dv.squeeze()) def test_reduce(self): self.assertXArrayEqual(self.dv.reduce(np.mean, 'x'), @@ -178,7 +186,7 @@ def test_groupby_iter(self): def test_groupby(self): agg_var = XArray(['y'], np.array(['a'] * 9 + ['c'] + ['b'] * 10)) self.dv['abc'] = agg_var - self.dv['y'] = 20 + 100 * self.ds['y'].array + self.dv['y'] = 20 + 100 * self.ds['y'].variable 
identity = lambda x: x for g in ['x', 'y']: @@ -227,8 +235,8 @@ def test_concat(self): # from xarrays: self.assertXArrayEqual(XArray(['w', 'x', 'y'], np.array([foo.data, bar.data])), - DatasetArray.concat([foo.array, - bar.array], 'w')) + DatasetArray.concat([foo.variable, + bar.variable], 'w')) # from iteration: stacked = DatasetArray.concat((v for _, v in foo.groupby('x')), self.ds['x']) diff --git a/test/test_utils.py b/test/test_utils.py index 22a88cde0fe..0401d9cff15 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -113,6 +113,21 @@ def test_guess_time_units(self): self.assertEquals(expected, utils.guess_time_units(dates)) +class TestSafeCastToIndex(TestCase): + def test(self): + dates = pd.date_range('2000-01-01', periods=10) + x = np.arange(5) + timedeltas = x * np.timedelta64(1, 'D') + for expected, array in [ + (dates, dates.values), + (pd.Index(x, dtype=object), x.astype(object)), + (pd.Index(timedeltas, dtype=object), timedeltas), + ]: + actual = utils.safe_cast_to_index(array) + self.assertArrayEqual(expected, actual) + self.assertEqual(expected.dtype, actual.dtype) + + class TestDictionaries(TestCase): def setUp(self): self.x = {'a': 'A', 'b': 'B'} diff --git a/test/test_xarray.py b/test/test_xarray.py index ab60c336cd7..4bbbbbb943d 100644 --- a/test/test_xarray.py +++ b/test/test_xarray.py @@ -2,25 +2,25 @@ from datetime import datetime import numpy as np +import pandas as pd -from xray import XArray +from xray import XArray, CoordXArray from . 
import TestCase -class TestXArray(TestCase): - def setUp(self): - self.d = np.random.random((10, 3)).astype(np.float64) - - def test_data(self): - v = XArray(['time', 'x'], self.d, indexing_mode='not-supported') - self.assertIs(v.data, self.d) - with self.assertRaises(ValueError): - # wrong size - v.data = np.random.random(5) - d2 = np.random.random((10, 3)) - v.data = d2 - self.assertIs(v.data, d2) - self.assertEqual(v._indexing_mode, 'numpy') +class XArraySubclassTestCases(object): + def test_properties(self): + data = 0.5 * np.arange(10) + v = XArray(['time'], data, {'foo': 'bar'}) + self.assertEqual(v.dimensions, ('time',)) + self.assertArrayEqual(v.data, data) + self.assertTrue(pd.Index(data).equals(v.index)) + self.assertEqual(v.dtype, float) + self.assertEqual(v.shape, (10,)) + self.assertEqual(v.size, 10) + self.assertEqual(v.ndim, 1) + self.assertEqual(len(v), 10) + self.assertEqual(v.attributes, {'foo': u'bar'}) def test_0d_data(self): d = datetime(2000, 1, 1) @@ -29,7 +29,7 @@ def test_0d_data(self): ('foo', np.string_), (d, None), (np.datetime64(d), np.datetime64)]: - x = XArray(['x'], [value]) + x = self.cls(['x'], [value]) # check array properties self.assertEqual(x[0].shape, ()) self.assertEqual(x[0].ndim, 0) @@ -46,6 +46,121 @@ def test_0d_data(self): self.assertTrue(np.issubdtype(x.data[0].dtype, dtype)) self.assertTrue(np.issubdtype(x[0].data.dtype, dtype)) + def test_pandas_data(self): + v = self.cls(['x'], pd.Series([0, 1, 2], index=[3, 2, 1])) + self.assertXArrayEqual(v, v[[0, 1, 2]]) + v = self.cls(['x'], pd.Index([0, 1, 2])) + self.assertEqual(v[0].data, v.data[0]) + + def test_1d_math(self): + x = 1.0 * np.arange(5) + y = np.ones(5) + v = self.cls(['x'], x) + # unary ops + self.assertXArrayEqual(v, +v) + self.assertXArrayEqual(v, abs(v)) + self.assertArrayEqual((-v).data, -x) + # bianry ops with numbers + self.assertXArrayEqual(v, v + 0) + self.assertXArrayEqual(v, 0 + v) + self.assertXArrayEqual(v, v * 1) + self.assertArrayEqual((v > 
2).data, x > 2) + self.assertArrayEqual((0 == v).data, 0 == x) + self.assertArrayEqual((v - 1).data, x - 1) + self.assertArrayEqual((1 - v).data, 1 - x) + # binary ops with numpy arrays + self.assertArrayEqual((v * x).data, x ** 2) + self.assertArrayEqual((x * v).data, x ** 2) + self.assertArrayEqual(v - y, v - 1) + self.assertArrayEqual(y - v, 1 - v) + # verify math-safe attributes + v2 = self.cls(['x'], x, {'units': 'meters'}) + self.assertXArrayEqual(v, +v2) + v3 = self.cls(['x'], x, {'something': 'else'}) + self.assertXArrayEqual(v3, +v3) + # binary ops with all variables + self.assertArrayEqual(v + v, 2 * v) + w = self.cls(['x'], y, {'foo': 'bar'}) + self.assertXArrayEqual(v + w, self.cls(['x'], x + y)) + self.assertArrayEqual((v * w).data, x * y) + # something complicated + self.assertArrayEqual((v ** 2 * w - 1 + x).data, x ** 2 * y - 1 + x) + # make sure dtype is preserved (for CoordXArrays) + self.assertEqual(float, (+v).dtype) + self.assertEqual(float, (+v).data.dtype) + self.assertEqual(float, (0 + v).dtype) + self.assertEqual(float, (0 + v).data.dtype) + # check types of returned data + self.assertIsInstance(+v, XArray) + self.assertNotIsInstance(+v, CoordXArray) + self.assertIsInstance(0 + v, XArray) + self.assertNotIsInstance(0 + v, CoordXArray) + + def test_array_interface(self): + x = np.arange(5) + v = self.cls(['x'], x) + self.assertArrayEqual(np.asarray(v), x) + # test patched in methods + self.assertArrayEqual(v.take([2, 3]), x.take([2, 3])) + self.assertXArrayEqual(v.argsort(), v) + self.assertXArrayEqual(v.clip(2, 3), self.cls('x', x.clip(2, 3))) + # test ufuncs + self.assertXArrayEqual(np.sin(v), self.cls(['x'], np.sin(x))) + self.assertIsInstance(np.sin(v), XArray) + self.assertNotIsInstance(np.sin(v), CoordXArray) + + def test_concat(self): + x = np.arange(5) + y = np.ones(5) + v = self.cls(['a'], x) + w = self.cls(['a'], y) + self.assertXArrayEqual(XArray(['b', 'a'], np.array([x, y])), + XArray.concat([v, w], 'b')) + 
self.assertXArrayEqual(XArray(['b', 'a'], np.array([x, y])), + XArray.concat((v, w), 'b')) + self.assertXArrayEqual(XArray(['b', 'a'], np.array([x, y])), + XArray.concat((v, w), 'b', length=2)) + with self.assertRaisesRegexp(ValueError, 'actual length'): + XArray.concat([v, w], 'b', length=1) + with self.assertRaisesRegexp(ValueError, 'actual length'): + XArray.concat([v, w, w], 'b', length=4) + with self.assertRaisesRegexp(ValueError, 'inconsistent dimensions'): + XArray.concat([v, XArray(['c'], y)], 'b') + # test concatenating along a dimension + v = XArray(['time', 'x'], np.random.random((10, 8))) + self.assertXArrayEqual(v, XArray.concat([v[:5], v[5:]], 'time')) + self.assertXArrayEqual(v, XArray.concat([v[:5], v[5], v[6:]], 'time')) + self.assertXArrayEqual(v, XArray.concat([v[0], v[1:]], 'time')) + # test dimension order + self.assertXArrayEqual(v, XArray.concat([v[:, :5], v[:, 5:]], 'x')) + self.assertXArrayEqual(v.transpose(), + XArray.concat([v[:, 0], v[:, 1:]], 'x')) + + def test_copy(self): + v = self.cls('x', 0.5 * np.arange(10)) + w = v.copy() + self.assertIs(type(v), type(w)) + self.assertXArrayEqual(v, w) + self.assertEqual(v.dtype, w.dtype) + + +class TestXArray(TestCase, XArraySubclassTestCases): + cls = XArray + + def setUp(self): + self.d = np.random.random((10, 3)).astype(np.float64) + + def test_data(self): + v = XArray(['time', 'x'], self.d, indexing_mode='not-supported') + self.assertIs(v.data, self.d) + with self.assertRaises(ValueError): + # wrong size + v.data = np.random.random(5) + d2 = np.random.random((10, 3)) + v.data = d2 + self.assertIs(v.data, d2) + self.assertEqual(v._indexing_mode, 'numpy') + def test_array_equality(self): d = np.random.rand(10, 3) v1 = XArray(('dim1', 'dim2'), data=d, @@ -63,16 +178,6 @@ def test_array_equality(self): self.assertXArrayNotEqual(v1, v4) self.assertXArrayNotEqual(v1, v5) - def test_properties(self): - v = XArray(['time', 'x'], self.d, {'foo': 'bar'}) - self.assertEqual(v.dimensions, ('time', 'x')) 
- self.assertEqual(v.dtype, float) - self.assertEqual(v.shape, (10, 3)) - self.assertEqual(v.size, 30) - self.assertEqual(v.ndim, 2) - self.assertEqual(len(v), 10) - self.assertEqual(v.attributes, {'foo': u'bar'}) - def test_repr(self): v = XArray(['time', 'x'], self.d) self.assertEqual('', @@ -141,40 +246,6 @@ def test_squeeze(self): with self.assertRaisesRegexp(ValueError, 'cannot select a dimension'): v.squeeze('y') - def test_1d_math(self): - x = np.arange(5) - y = np.ones(5) - v = XArray(['x'], x) - # unary ops - self.assertXArrayEqual(v, +v) - self.assertXArrayEqual(v, abs(v)) - self.assertArrayEqual((-v).data, -x) - # bianry ops with numbers - self.assertXArrayEqual(v, v + 0) - self.assertXArrayEqual(v, 0 + v) - self.assertXArrayEqual(v, v * 1) - self.assertArrayEqual((v > 2).data, x > 2) - self.assertArrayEqual((0 == v).data, 0 == x) - self.assertArrayEqual((v - 1).data, x - 1) - self.assertArrayEqual((1 - v).data, 1 - x) - # binary ops with numpy arrays - self.assertArrayEqual((v * x).data, x ** 2) - self.assertArrayEqual((x * v).data, x ** 2) - self.assertArrayEqual(v - y, v - 1) - self.assertArrayEqual(y - v, 1 - v) - # verify math-safe attributes - v2 = XArray(['x'], x, {'units': 'meters'}) - self.assertXArrayEqual(v, +v2) - v3 = XArray(['x'], x, {'something': 'else'}) - self.assertXArrayEqual(v3, +v3) - # binary ops with all variables - self.assertArrayEqual(v + v, 2 * v) - w = XArray(['x'], y, {'foo': 'bar'}) - self.assertXArrayEqual(v + w, XArray(['x'], x + y)) - self.assertArrayEqual((v * w).data, x * y) - # something complicated - self.assertArrayEqual((v ** 2 * w - 1 + x).data, x ** 2 * y - 1 + x) - def test_broadcasting_math(self): x = np.random.randn(2, 3) v = XArray(['a', 'b'], x) @@ -223,17 +294,6 @@ def test_inplace_math(self): self.assertIs(v.data, x) self.assertArrayEqual(v.data, np.arange(5) + 1) - def test_array_interface(self): - x = np.arange(5) - v = XArray(['x'], x) - self.assertArrayEqual(np.asarray(v), x) - # test patched in methods 
- self.assertArrayEqual(v.take([2, 3]), x.take([2, 3])) - self.assertXArrayEqual(v.argsort(), v) - self.assertXArrayEqual(v.clip(2, 3), XArray('x', x.clip(2, 3))) - # test ufuncs - self.assertXArrayEqual(np.sin(v), XArray(['x'], np.sin(x))) - def test_reduce(self): v = XArray(['time', 'x'], self.d) # intentionally test with an operation for which order matters @@ -277,29 +337,22 @@ def test_groupby(self): self.assertXArrayEqual(ke, ka) self.assertXArrayEqual(ve, va) - def test_concat(self): - x = np.arange(5) - y = np.ones(5) - v = XArray(['a'], x) - w = XArray(['a'], y) - self.assertXArrayEqual(XArray(['b', 'a'], np.array([x, y])), - XArray.concat([v, w], 'b')) - self.assertXArrayEqual(XArray(['b', 'a'], np.array([x, y])), - XArray.concat((v, w), 'b')) - self.assertXArrayEqual(XArray(['b', 'a'], np.array([x, y])), - XArray.concat((v, w), 'b', length=2)) - with self.assertRaisesRegexp(ValueError, 'actual length'): - XArray.concat([v, w], 'b', length=1) - with self.assertRaisesRegexp(ValueError, 'actual length'): - XArray.concat([v, w, w], 'b', length=4) - with self.assertRaisesRegexp(ValueError, 'inconsistent dimensions'): - XArray.concat([v, XArray(['c'], y)], 'b') - # test concatenating along a dimension - v = XArray(['time', 'x'], np.random.random((10, 8))) - self.assertXArrayEqual(v, XArray.concat([v[:5], v[5:]], 'time')) - self.assertXArrayEqual(v, XArray.concat([v[:5], v[5], v[6:]], 'time')) - self.assertXArrayEqual(v, XArray.concat([v[0], v[1:]], 'time')) - # test dimension order - self.assertXArrayEqual(v, XArray.concat([v[:, :5], v[:, 5:]], 'x')) - self.assertXArrayEqual(v.transpose(), - XArray.concat([v[:, 0], v[:, 1:]], 'x')) + +class TestCoordXArray(TestCase, XArraySubclassTestCases): + cls = CoordXArray + + def test_init(self): + with self.assertRaisesRegexp(ValueError, 'must be 1-dimensional'): + CoordXArray((), 0) + + def test_data(self): + x = CoordXArray('x', [0, 1, 2], dtype=float) + # data should be initially saved as an ndarray + 
self.assertIs(type(x._data), np.ndarray) + self.assertEqual(float, x.dtype) + self.assertArrayEqual(np.arange(3), x) + self.assertEqual(float, x.data.dtype) + # after inspecting x.data, the CoordXArray will be saved as an Index + self.assertIsInstance(x._data, pd.Index) + with self.assertRaisesRegexp(TypeError, 'cannot be modified'): + x[:] = 0