from .. import backends, conventions
from .common import ArrayWriter
+from ..core import indexing
from ..core.combine import auto_combine
from ..core.utils import close_on_error, is_remote_uri
from ..core.pycompat import basestring

DATAARRAY_NAME = '__xarray_dataarray_name__'
DATAARRAY_VARIABLE = '__xarray_dataarray_variable__'

+

def _get_default_engine(path, allow_remote=False):
    if allow_remote and is_remote_uri(path):  # pragma: no cover
        try:
@@ -46,6 +48,13 @@ def _get_default_engine(path, allow_remote=False):
    return engine


+def _normalize_path(path):
+    if is_remote_uri(path):
+        return path
+    else:
+        return os.path.abspath(os.path.expanduser(path))
+
+
_global_lock = threading.Lock()

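The helper above keeps remote URIs untouched and expands local paths. As a rough standalone sketch of that behavior (the simplified `_looks_remote` stand-in below is an assumption, not xarray's actual `is_remote_uri` check):

import os
import re

def _looks_remote(path):
    # simplified stand-in for xarray's is_remote_uri()
    return bool(re.match(r'^https?://', path))

def normalize_path(path):
    # remote URIs pass through unchanged; local paths get '~' expanded
    # and are made absolute, as in _normalize_path above
    if _looks_remote(path):
        return path
    return os.path.abspath(os.path.expanduser(path))

print(normalize_path('http://test.opendap.org/dap/data.nc'))  # unchanged
print(normalize_path('~/data.nc'))  # e.g. '/home/user/data.nc'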
@@ -117,10 +126,20 @@ def check_attr(name, value):
            check_attr(k, v)


+def _protect_dataset_variables_inplace(dataset, cache):
+    for name, variable in dataset.variables.items():
+        if name not in variable.dims:
+            # no need to protect IndexVariable objects
+            data = indexing.CopyOnWriteArray(variable._data)
+            if cache:
+                data = indexing.MemoryCachedArray(data)
+            variable.data = data
+
+
def open_dataset(filename_or_obj, group=None, decode_cf=True,
                 mask_and_scale=True, decode_times=True,
                 concat_characters=True, decode_coords=True, engine=None,
-                chunks=None, lock=None, drop_variables=None):
+                chunks=None, lock=None, cache=None, drop_variables=None):
    """Load and decode a dataset from a file or file-like object.

    Parameters
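For orientation, a minimal sketch of what "protecting" a variable's data means here. The two toy wrappers below only illustrate the copy-on-write and in-memory-caching ideas; they are not xarray's actual CopyOnWriteArray/MemoryCachedArray classes:

import numpy as np

class ToyCopyOnWrite:
    """Serve reads from the wrapped array; copy it before the first write."""
    def __init__(self, array):
        self.array = array
        self._copied = False

    def __getitem__(self, key):
        return self.array[key]

    def __setitem__(self, key, value):
        if not self._copied:
            self.array = np.array(self.array)  # detach from the datastore
            self._copied = True
        self.array[key] = value

class ToyMemoryCached:
    """Load the wrapped array once, then answer further reads from memory."""
    def __init__(self, array):
        self.array = array
        self._cache = None

    def __getitem__(self, key):
        if self._cache is None:
            self._cache = np.asarray(self.array)
        return self._cache[key]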
@@ -162,14 +181,22 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True,
        'netcdf4'.
    chunks : int or dict, optional
        If chunks is provided, it is used to load the new dataset into dask
-        arrays. This is an experimental feature; see the documentation for more
-        details.
+        arrays. ``chunks={}`` loads the dataset with dask using a single
+        chunk for all arrays. This is an experimental feature; see the
+        documentation for more details.
    lock : False, True or threading.Lock, optional
        If chunks is provided, this argument is passed on to
        :py:func:`dask.array.from_array`. By default, a per-variable lock is
        used when reading data from netCDF files with the netcdf4 and h5netcdf
        engines to avoid issues with concurrent access when using dask's
        multithreaded backend.
+    cache : bool, optional
+        If True, cache data loaded from the underlying datastore in memory as
+        NumPy arrays when accessed to avoid reading from the underlying data-
+        store multiple times. Defaults to True unless you specify the `chunks`
+        argument to use dask, in which case it defaults to False. Does not
+        change the behavior of coordinates corresponding to dimensions, which
+        always load their data from disk into a ``pandas.Index``.
    drop_variables: string or iterable, optional
        A variable or list of variables to exclude from being parsed from the
        dataset. This may be useful to drop variables with problems or
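Following the parameter descriptions above, a hedged usage sketch of how ``chunks`` and ``cache`` are expected to interact (the file name 'example.nc' is just a placeholder):

import xarray as xr

# default: variables are cached in memory as NumPy arrays once accessed
ds = xr.open_dataset('example.nc')

# chunks={} loads every variable as a single-chunk dask array; because
# chunks is given, cache defaults to False and dask manages the data
ds_lazy = xr.open_dataset('example.nc', chunks={})

# reread from the datastore on every access, e.g. for very large files
ds_nocache = xr.open_dataset('example.nc', cache=False)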
@@ -190,12 +217,17 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True,
        concat_characters = False
        decode_coords = False

+    if cache is None:
+        cache = chunks is None
+
    def maybe_decode_store(store, lock=False):
        ds = conventions.decode_cf(
            store, mask_and_scale=mask_and_scale, decode_times=decode_times,
            concat_characters=concat_characters, decode_coords=decode_coords,
            drop_variables=drop_variables)

+        _protect_dataset_variables_inplace(ds, cache)
+
        if chunks is not None:
            try:
                from dask.base import tokenize
@@ -226,6 +258,17 @@ def maybe_decode_store(store, lock=False):
    if isinstance(filename_or_obj, backends.AbstractDataStore):
        store = filename_or_obj
    elif isinstance(filename_or_obj, basestring):
+
+        if (isinstance(filename_or_obj, bytes) and
+                filename_or_obj.startswith(b'\x89HDF')):
+            raise ValueError('cannot read netCDF4/HDF5 file images')
+        elif (isinstance(filename_or_obj, bytes) and
+                filename_or_obj.startswith(b'CDF')):
+            # netCDF3 file images are handled by scipy
+            pass
+        elif isinstance(filename_or_obj, basestring):
+            filename_or_obj = _normalize_path(filename_or_obj)
+
        if filename_or_obj.endswith('.gz'):
            if engine is not None and engine != 'scipy':
                raise ValueError('can only read gzipped netCDF files with '
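The byte-string branch above keys off standard file signatures: HDF5 (and therefore netCDF4) files begin with b'\x89HDF\r\n\x1a\n', while classic netCDF3 files begin with b'CDF'. A small illustrative check (the sample bytes below are made up):

def classify_file_image(buf):
    # netCDF4 files are HDF5 files, which open_dataset rejects as in-memory images
    if buf.startswith(b'\x89HDF'):
        return 'netCDF4/HDF5 image'
    # classic netCDF3 images are handed to the scipy backend
    elif buf.startswith(b'CDF'):
        return 'netCDF3 image'
    return 'not a recognized netCDF file image'

print(classify_file_image(b'CDF\x01' + b'\x00' * 28))   # netCDF3
print(classify_file_image(b'\x89HDF\r\n\x1a\n'))        # netCDF4/HDF5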
@@ -274,7 +317,7 @@ def maybe_decode_store(store, lock=False):
def open_dataarray(filename_or_obj, group=None, decode_cf=True,
                   mask_and_scale=True, decode_times=True,
                   concat_characters=True, decode_coords=True, engine=None,
-                   chunks=None, lock=None, drop_variables=None):
+                   chunks=None, lock=None, cache=None, drop_variables=None):
    """
    Opens a DataArray from a netCDF file containing a single data variable.

@@ -328,6 +371,13 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True,
        used when reading data from netCDF files with the netcdf4 and h5netcdf
        engines to avoid issues with concurrent access when using dask's
        multithreaded backend.
+    cache : bool, optional
+        If True, cache data loaded from the underlying datastore in memory as
+        NumPy arrays when accessed to avoid reading from the underlying data-
+        store multiple times. Defaults to True unless you specify the `chunks`
+        argument to use dask, in which case it defaults to False. Does not
+        change the behavior of coordinates corresponding to dimensions, which
+        always load their data from disk into a ``pandas.Index``.
    drop_variables: string or iterable, optional
        A variable or list of variables to exclude from being parsed from the
        dataset. This may be useful to drop variables with problems or
@@ -349,7 +399,7 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True,
    dataset = open_dataset(filename_or_obj, group, decode_cf,
                           mask_and_scale, decode_times,
                           concat_characters, decode_coords, engine,
-                           chunks, lock, drop_variables)
+                           chunks, lock, cache, drop_variables)

    if len(dataset.data_vars) != 1:
        raise ValueError('Given file dataset contains more than one data '
@@ -494,8 +544,10 @@ def to_netcdf(dataset, path=None, mode='w', format=None, group=None,
            raise ValueError('invalid engine for creating bytes with '
                             'to_netcdf: %r. Only the default engine '
                             "or engine='scipy' is supported" % engine)
-    elif engine is None:
-        engine = _get_default_engine(path)
+    else:
+        if engine is None:
+            engine = _get_default_engine(path)
+        path = _normalize_path(path)

    # validate Dataset keys, DataArray names, and attr keys/values
    _validate_dataset_names(dataset)
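A short usage sketch of the effect on to_netcdf (the dataset and output path are hypothetical): with the restructured branch, path normalization now also applies when writing to a file path.

import xarray as xr

ds = xr.Dataset({'x': ('points', [1, 2, 3])})

# '~' is expanded and the path made absolute before the default engine
# is chosen, so the file lands under the user's home directory
ds.to_netcdf('~/output.nc')

# path=None still returns the serialized bytes and skips normalization
raw = ds.to_netcdf()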