diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b55d59077fc..8c61d71bf55 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -24,6 +24,10 @@ Enhancements - More attributes available in :py:attr:`~xarray.Dataset.attrs` dictionary when raster files are opened with :py:func:`~xarray.open_rasterio`. By `Greg Brener `_ +- Support for NetCDF files using an ``_Unsigned`` attribute to indicate that + a signed integer data type should be interpreted as unsigned bytes + (:issue:`1444`). + By `Eric Bruning `_. Bug fixes ~~~~~~~~~ diff --git a/xarray/backends/pynio_.py b/xarray/backends/pynio_.py index 449971a9145..f70c286a990 100644 --- a/xarray/backends/pynio_.py +++ b/xarray/backends/pynio_.py @@ -42,6 +42,9 @@ def __init__(self, filename, mode='r', autoclose=False): import Nio opener = functools.partial(Nio.open_file, filename, mode=mode) self.ds = opener() + # xarray provides its own support for FillValue, + # so turn off PyNIO's support for the same. + self.ds.set_option('MaskedArrayMode', 'MaskedNever') self._autoclose = autoclose self._isopen = True self._opener = opener diff --git a/xarray/conventions.py b/xarray/conventions.py index d39ae20925a..fac70ac4615 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -534,6 +534,34 @@ def __getitem__(self, key): return np.asarray(self.array[key], dtype=self.dtype) +class UnsignedIntTypeArray(utils.NDArrayMixin): + """Decode arrays on the fly from signed integer to unsigned + integer. Typically used when _Unsigned is set as a netCDF + attribute on a signed integer variable. 
+ + >>> sb = np.asarray([0, 1, 127, -128, -1], dtype='i1') + + >>> sb.dtype + dtype('int8') + + >>> UnsignedIntTypeArray(sb).dtype + dtype('uint8') + + >>> UnsignedIntTypeArray(sb)[:] + array([ 0, 1, 127, 128, 255], dtype=uint8) + """ + def __init__(self, array): + self.array = array + self.unsigned_dtype = np.dtype('u%s' % array.dtype.itemsize) + + @property + def dtype(self): + return self.unsigned_dtype + + def __getitem__(self, key): + return np.asarray(self.array[key], dtype=self.dtype) + + def string_to_char(arr): """Like netCDF4.stringtochar, but faster and more flexible. """ @@ -637,6 +665,14 @@ def maybe_encode_dtype(var, name=None): 'any _FillValue to use for NaNs' % name, RuntimeWarning, stacklevel=3) data = duck_array_ops.around(data)[...] + if encoding.get('_Unsigned', False): + signed_dtype = 'i%s' % dtype.itemsize + if '_FillValue' in var.attrs: + old_fill = np.asarray(attrs['_FillValue']) + new_fill = old_fill.astype(signed_dtype) + attrs['_FillValue'] = new_fill + data = data.astype(signed_dtype) + pop_to(encoding, attrs, '_Unsigned') if dtype == 'S1' and data.dtype != 'S1': data = string_to_char(np.asarray(data, 'S')) dims = dims + ('string%s' % data.shape[-1],) @@ -761,7 +797,8 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, example: ['h', 'e', 'l', 'l', 'o'] -> 'hello' mask_and_scale: bool Lazily scale (using scale_factor and add_offset) and mask - (using _FillValue). + (using _FillValue). If the _Unsigned attribute is present + treat integer arrays as unsigned. decode_times : bool Decode cf times ('hours since 2000-01-01') to np.datetime64. 
decode_endianness : bool @@ -786,6 +823,16 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, dimensions = dimensions[:-1] data = CharToStringArray(data) + pop_to(attributes, encoding, '_Unsigned') + is_unsigned = encoding.get('_Unsigned', False) + if is_unsigned and mask_and_scale: + if data.dtype.kind == 'i': + data = UnsignedIntTypeArray(data) + else: + warnings.warn("variable has _Unsigned attribute but is not " + "of integer type. Ignoring attribute.", + RuntimeWarning, stacklevel=3) + if mask_and_scale: if 'missing_value' in attributes: # missing_value is deprecated, but we still want to support it as @@ -800,7 +847,6 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, "and decoding explicitly using " "xarray.conventions.decode_cf(ds)") attributes['_FillValue'] = attributes.pop('missing_value') - fill_value = np.array(pop_to(attributes, encoding, '_FillValue')) if fill_value.size > 1: warnings.warn("variable has multiple fill values {0}, decoding " @@ -808,12 +854,19 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, RuntimeWarning, stacklevel=3) scale_factor = pop_to(attributes, encoding, 'scale_factor') add_offset = pop_to(attributes, encoding, 'add_offset') - if ((fill_value is not None and not np.any(pd.isnull(fill_value))) or - scale_factor is not None or add_offset is not None): + has_fill = (fill_value is not None and + not np.any(pd.isnull(fill_value))) + if (has_fill or scale_factor is not None or add_offset is not None): if fill_value.dtype.kind in ['U', 'S']: dtype = object else: dtype = float + # According to the CF spec, the fill value is of the same + # type as its variable, i.e. its storage format on disk. + # This handles the case where the fill_value also needs to be + # converted to its unsigned value. 
+ if has_fill: + fill_value = np.asarray(fill_value, dtype=data.dtype) data = MaskedAndScaledArray(data, fill_value, scale_factor, add_offset, dtype) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3fa6fff9f4b..f5bd615cfe2 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -62,6 +62,23 @@ def create_encoded_masked_and_scaled_data(): return Dataset({'x': ('t', [-1, -1, 0, 1, 2], attributes)}) +def create_unsigned_masked_scaled_data(): + encoding = {'_FillValue': 255, '_Unsigned': 'true', 'dtype': 'i1', + 'add_offset': 10, 'scale_factor': np.float32(0.1)} + x = np.array([10.0, 10.1, 22.7, 22.8, np.nan]) + return Dataset({'x': ('t', x, {}, encoding)}) + + +def create_encoded_unsigned_masked_scaled_data(): + # These are values as written to the file: the _FillValue will + # be represented in the signed form. + attributes = {'_FillValue': -1, '_Unsigned': 'true', + 'add_offset': 10, 'scale_factor': np.float32(0.1)} + # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned + sb = np.asarray([0, 1, 127, -128, -1], dtype='i1') + return Dataset({'x': ('t', sb, attributes)}) + + def create_boolean_data(): attributes = {'units': '-'} return Dataset({'x': ('t', [True, False, False, True], attributes)}) @@ -360,16 +377,54 @@ def test_roundtrip_strings_with_fill_value(self): with self.roundtrip(original) as actual: self.assertDatasetIdentical(expected, actual) + def test_unsigned_roundtrip_mask_and_scale(self): + decoded = create_unsigned_masked_scaled_data() + encoded = create_encoded_unsigned_masked_scaled_data() + with self.roundtrip(decoded) as actual: + for k in decoded.variables: + self.assertEqual(decoded.variables[k].dtype, + actual.variables[k].dtype) + self.assertDatasetAllClose(decoded, actual) + with self.roundtrip(decoded, + open_kwargs=dict(decode_cf=False)) as actual: + for k in encoded.variables: + self.assertEqual(encoded.variables[k].dtype, + actual.variables[k].dtype) + 
self.assertDatasetAllClose(encoded, actual) + with self.roundtrip(encoded, + open_kwargs=dict(decode_cf=False)) as actual: + for k in encoded.variables: + self.assertEqual(encoded.variables[k].dtype, + actual.variables[k].dtype) + self.assertDatasetAllClose(encoded, actual) + # make sure roundtrip encoding didn't change the + # original dataset. + self.assertDatasetIdentical( + encoded, create_encoded_unsigned_masked_scaled_data()) + with self.roundtrip(encoded) as actual: + for k in decoded.variables: + self.assertEqual(decoded.variables[k].dtype, + actual.variables[k].dtype) + self.assertDatasetAllClose(decoded, actual) + with self.roundtrip(encoded, + open_kwargs=dict(decode_cf=False)) as actual: + for k in encoded.variables: + self.assertEqual(encoded.variables[k].dtype, + actual.variables[k].dtype) + self.assertDatasetAllClose(encoded, actual) + def test_roundtrip_mask_and_scale(self): decoded = create_masked_and_scaled_data() encoded = create_encoded_masked_and_scaled_data() with self.roundtrip(decoded) as actual: self.assertDatasetAllClose(decoded, actual) - with self.roundtrip(decoded, open_kwargs=dict(decode_cf=False)) as actual: + with self.roundtrip(decoded, + open_kwargs=dict(decode_cf=False)) as actual: # TODO: this assumes that all roundtrips will first # encode. Is that something we want to test for? self.assertDatasetAllClose(encoded, actual) - with self.roundtrip(encoded, open_kwargs=dict(decode_cf=False)) as actual: + with self.roundtrip(encoded, + open_kwargs=dict(decode_cf=False)) as actual: self.assertDatasetAllClose(encoded, actual) # make sure roundtrip encoding didn't change the # original dataset. 
@@ -377,7 +432,8 @@ def test_roundtrip_mask_and_scale(self): create_encoded_masked_and_scaled_data()) with self.roundtrip(encoded) as actual: self.assertDatasetAllClose(decoded, actual) - with self.roundtrip(encoded, open_kwargs=dict(decode_cf=False)) as actual: + with self.roundtrip(encoded, + open_kwargs=dict(decode_cf=False)) as actual: self.assertDatasetAllClose(encoded, actual) def test_coordinates_encoding(self): diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 6c9d791660d..a6230761b86 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -108,6 +108,15 @@ def test_string_to_char(self): self.assertArrayEqual(actual, expected) +class TestUnsignedIntTypeArray(TestCase): + def test_unsignedinttype_array(self): + sb = np.asarray([0, 1, 127, -128, -1], dtype='i1') + ub = conventions.UnsignedIntTypeArray(sb) + self.assertEqual(ub.dtype, np.dtype('u1')) + self.assertArrayEqual(ub, np.array([0, 1, 127, 128, 255], + dtype=np.dtype('u1'))) + + class TestBoolTypeArray(TestCase): def test_booltype_array(self): x = np.array([1, 0, 1, 1, 0], dtype='i1')