@@ -262,6 +262,44 @@ def _is_time_like(units):
262262 return any (tstr == units for tstr in time_strings )
263263
264264
def _check_fill_values(attrs, name, dtype):
    """Check _FillValue and missing_value if available.

    Pops ``missing_value`` and ``_FillValue`` out of ``attrs``.

    Parameters
    ----------
    attrs : dict
        Variable attributes; fill-value attributes are removed in place.
    name : hashable
        Variable name, used only for warning messages.
    dtype : np.dtype
        Dtype of the variable's data.

    Returns
    -------
    raw_fill_dict : dict
        Raw fill values keyed by attribute name.
    encoded_fill_values : set
        Non-null encoded fill values gathered from both attributes.

    Warns
    -----
    SerializationWarning
        If a non-conforming fill value is dropped, or if multiple fill
        values are defined.
    """
    raw_fill_dict = {}
    # Move both fill-value attributes out of attrs so they are handled once.
    for attr in ("missing_value", "_FillValue"):
        pop_to(attrs, raw_fill_dict, attr, name=name)

    encoded_fill_values = set()
    # Iterate over a snapshot of the keys since entries may be deleted below.
    for k in list(raw_fill_dict):
        v = raw_fill_dict[k]
        kfill = {fv for fv in np.ravel(v) if not pd.isnull(fv)}
        if not kfill and np.issubdtype(dtype, np.integer):
            # An all-null fill value cannot be represented by an integer
            # dtype, so drop the attribute entirely and warn the user.
            warnings.warn(
                f"variable {name!r} has non-conforming {k!r} "
                f"{v!r} defined, dropping {k!r} entirely.",
                SerializationWarning,
                stacklevel=3,
            )
            del raw_fill_dict[k]
        else:
            encoded_fill_values |= kfill

    if len(encoded_fill_values) > 1:
        warnings.warn(
            f"variable {name!r} has multiple fill values "
            f"{encoded_fill_values} defined, decoding all values to NaN.",
            SerializationWarning,
            stacklevel=3,
        )

    return raw_fill_dict, encoded_fill_values
302+
265303class CFMaskCoder (VariableCoder ):
266304 """Mask or unmask fill values according to CF conventions."""
267305
@@ -283,67 +321,56 @@ def encode(self, variable: Variable, name: T_Name = None):
283321 f"Variable { name !r} has conflicting _FillValue ({ fv } ) and missing_value ({ mv } ). Cannot encode data."
284322 )
285323
286- # special case DateTime to properly handle NaT
287- is_time_like = _is_time_like (attrs .get ("units" ))
288-
289324 if fv_exists :
290325 # Ensure _FillValue is cast to same dtype as data's
291326 encoding ["_FillValue" ] = dtype .type (fv )
292327 fill_value = pop_to (encoding , attrs , "_FillValue" , name = name )
293- if not pd .isnull (fill_value ):
294- if is_time_like and data .dtype .kind in "iu" :
295- data = duck_array_ops .where (
296- data != np .iinfo (np .int64 ).min , data , fill_value
297- )
298- else :
299- data = duck_array_ops .fillna (data , fill_value )
300328
301329 if mv_exists :
302- # Ensure missing_value is cast to same dtype as data's
303- encoding ["missing_value" ] = dtype .type (mv )
330+ # try to use _FillValue, if it exists, to align both values
331+ # or use missing_value and ensure it's cast to same dtype as data's
332+ encoding ["missing_value" ] = attrs .get ("_FillValue" , dtype .type (mv ))
304333 fill_value = pop_to (encoding , attrs , "missing_value" , name = name )
305- if not pd .isnull (fill_value ) and not fv_exists :
306- if is_time_like and data .dtype .kind in "iu" :
307- data = duck_array_ops .where (
308- data != np .iinfo (np .int64 ).min , data , fill_value
309- )
310- else :
311- data = duck_array_ops .fillna (data , fill_value )
334+
335+ # apply fillna
336+ if not pd .isnull (fill_value ):
337+ # special case DateTime to properly handle NaT
338+ if _is_time_like (attrs .get ("units" )) and data .dtype .kind in "iu" :
339+ data = duck_array_ops .where (
340+ data != np .iinfo (np .int64 ).min , data , fill_value
341+ )
342+ else :
343+ data = duck_array_ops .fillna (data , fill_value )
312344
313345 return Variable (dims , data , attrs , encoding , fastpath = True )
314346
315347 def decode (self , variable : Variable , name : T_Name = None ):
316- dims , data , attrs , encoding = unpack_for_decoding (variable )
317-
318- raw_fill_values = [
319- pop_to (attrs , encoding , attr , name = name )
320- for attr in ("missing_value" , "_FillValue" )
321- ]
322- if raw_fill_values :
323- encoded_fill_values = {
324- fv
325- for option in raw_fill_values
326- for fv in np .ravel (option )
327- if not pd .isnull (fv )
328- }
329-
330- if len (encoded_fill_values ) > 1 :
331- warnings .warn (
332- "variable {!r} has multiple fill values {}, "
333- "decoding all values to NaN." .format (name , encoded_fill_values ),
334- SerializationWarning ,
335- stacklevel = 3 ,
336- )
348+ raw_fill_dict , encoded_fill_values = _check_fill_values (
349+ variable .attrs , name , variable .dtype
350+ )
337351
338- # special case DateTime to properly handle NaT
339- dtype : np .typing .DTypeLike
340- decoded_fill_value : Any
341- if _is_time_like (attrs .get ("units" )) and data .dtype .kind in "iu" :
342- dtype , decoded_fill_value = np .int64 , np .iinfo (np .int64 ).min
343- else :
344- dtype , decoded_fill_value = dtypes .maybe_promote (data .dtype )
352+ if raw_fill_dict :
353+ dims , data , attrs , encoding = unpack_for_decoding (variable )
354+ [
355+ safe_setitem (encoding , attr , value , name = name )
356+ for attr , value in raw_fill_dict .items ()
357+ ]
345358
346359 if encoded_fill_values :
360+ # special case DateTime to properly handle NaT
361+ dtype : np .typing .DTypeLike
362+ decoded_fill_value : Any
363+ if _is_time_like (attrs .get ("units" )) and data .dtype .kind in "iu" :
364+ dtype , decoded_fill_value = np .int64 , np .iinfo (np .int64 ).min
365+ else :
366+ if "scale_factor" not in attrs and "add_offset" not in attrs :
367+ dtype , decoded_fill_value = dtypes .maybe_promote (data .dtype )
368+ else :
369+ dtype , decoded_fill_value = (
370+ _choose_float_dtype (data .dtype , attrs ),
371+ np .nan ,
372+ )
373+
347374 transform = partial (
348375 _apply_mask ,
349376 encoded_fill_values = encoded_fill_values ,
@@ -366,20 +393,51 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTyp
366393 return data
367394
368395
369- def _choose_float_dtype (dtype : np .dtype , has_offset : bool ) -> type [np .floating [Any ]]:
396+ def _choose_float_dtype (
397+ dtype : np .dtype , mapping : MutableMapping
398+ ) -> type [np .floating [Any ]]:
370399 """Return a float dtype that can losslessly represent `dtype` values."""
371- # Keep float32 as-is. Upcast half-precision to single-precision,
400+ # check scale/offset first to derive wanted float dtype
401+ # see https://github.com/pydata/xarray/issues/5597#issuecomment-879561954
402+ scale_factor = mapping .get ("scale_factor" )
403+ add_offset = mapping .get ("add_offset" )
404+ if scale_factor is not None or add_offset is not None :
405+ # get the type from scale_factor/add_offset to determine
406+ # the needed floating point type
407+ if scale_factor is not None :
408+ scale_type = np .dtype (type (scale_factor ))
409+ if add_offset is not None :
410+ offset_type = np .dtype (type (add_offset ))
411+ # CF conforming, both scale_factor and add-offset are given and
412+ # of same floating point type (float32/64)
413+ if (
414+ add_offset is not None
415+ and scale_factor is not None
416+ and offset_type == scale_type
417+ and scale_type in [np .float32 , np .float64 ]
418+ ):
419+ # in case of int32 -> we need upcast to float64
420+ # due to precision issues
421+ if dtype .itemsize == 4 and np .issubdtype (dtype , np .integer ):
422+ return np .float64
423+ return scale_type .type
424+ # Not CF conforming and add_offset given:
425+ # A scale factor is entirely safe (vanishing into the mantissa),
426+ # but a large integer offset could lead to loss of precision.
427+ # Sensitivity analysis can be tricky, so we just use a float64
428+ # if there's any offset at all - better unoptimised than wrong!
429+ if add_offset is not None :
430+ return np .float64
431+ # return dtype depending on given scale_factor
432+ return scale_type .type
433+ # If no scale_factor or add_offset is given, use some general rules.
434+ # Keep float32 as-is. Upcast half-precision to single-precision,
372435 # because float16 is "intended for storage but not computation"
373436 if dtype .itemsize <= 4 and np .issubdtype (dtype , np .floating ):
374437 return np .float32
375438 # float32 can exactly represent all integers up to 24 bits
376439 if dtype .itemsize <= 2 and np .issubdtype (dtype , np .integer ):
377- # A scale factor is entirely safe (vanishing into the mantissa),
378- # but a large integer offset could lead to loss of precision.
379- # Sensitivity analysis can be tricky, so we just use a float64
380- # if there's any offset at all - better unoptimised than wrong!
381- if not has_offset :
382- return np .float32
440+ return np .float32
383441 # For all other types and circumstances, we just use float64.
384442 # (safe because eg. complex numbers are not supported in NetCDF)
385443 return np .float64
@@ -396,7 +454,12 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
396454 dims , data , attrs , encoding = unpack_for_encoding (variable )
397455
398456 if "scale_factor" in encoding or "add_offset" in encoding :
399- dtype = _choose_float_dtype (data .dtype , "add_offset" in encoding )
457+ # if we have a _FillValue/masked_value we do not want to cast now
458+ # but leave that to CFMaskCoder
459+ dtype = data .dtype
460+ if "_FillValue" not in encoding and "missing_value" not in encoding :
461+ dtype = _choose_float_dtype (data .dtype , encoding )
462+ # but still we need a copy to prevent changing original data
400463 data = duck_array_ops .astype (data , dtype = dtype , copy = True )
401464 if "add_offset" in encoding :
402465 data -= pop_to (encoding , attrs , "add_offset" , name = name )
@@ -412,11 +475,17 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
412475
413476 scale_factor = pop_to (attrs , encoding , "scale_factor" , name = name )
414477 add_offset = pop_to (attrs , encoding , "add_offset" , name = name )
415- dtype = _choose_float_dtype (data .dtype , "add_offset" in encoding )
416478 if np .ndim (scale_factor ) > 0 :
417479 scale_factor = np .asarray (scale_factor ).item ()
418480 if np .ndim (add_offset ) > 0 :
419481 add_offset = np .asarray (add_offset ).item ()
482+ # if we have a _FillValue/masked_value we already have the wanted
483+ # floating point dtype here (via CFMaskCoder), so no check is necessary
484+ # only check in other cases
485+ dtype = data .dtype
486+ if "_FillValue" not in encoding and "missing_value" not in encoding :
487+ dtype = _choose_float_dtype (dtype , encoding )
488+
420489 transform = partial (
421490 _scale_offset_decoding ,
422491 scale_factor = scale_factor ,
0 commit comments