
Commit 27132fb

guziy authored and Joe Hamman committed
data_vars option added to open_mfdataset (#1580)
* add data_vars option to open_mfdataset
* use single quotes
* fix the 'line too long' warning from flake8
* document the data_vars keyword for open_mfdataset
* improve the data_vars record in whats-new
* update my name in whats-new.rst
* start writing the test for the data_vars keyword
* use the data_vars keyword in combine
* address flake8 warnings for test_backend.py
* ignore flake8 warnings concerning whats-new.rst
* fix function reference in whats-new.rst
* open_mfdataset does not accept dim keyword argument
* use single quotes for strings in the added tests
* refactor data_vars related tests
* use with for opening mfdataset in data_vars related tests
* add @requires_scipy_or_netCDF4 to the data_vars test class
* address flake8 warnings about long lines in the data_vars related tests
* close opened datasets in case of a ValueError in open_mfdataset; seems important for Windows
* fix line too long warnings from flake8
* refactor tests and open_mfdataset to address comments
* refactor tests for data_vars keyword in open_mfdataset
* refactor to address flake8 warnings
* add another example of data_vars usage in open_mfdataset
* add coords keyword to open_mfdataset
* add memory and performance related observations to whats-new and modify code snippets to use single quotes for consistency
* fix a grammar mistake
* quote variable names referenced in the text
* add tests for the coords keyword in open_mfdataset, along with similar tests for the data_vars keyword
* split a test into 2 to simplify; introduce a context manager for setting up test inputs in OpenMFDatasetWithDataVarsAndCoordsKwTest
1 parent 57ccf42 commit 27132fb

File tree

4 files changed: +204 -10 lines

doc/whats-new.rst

Lines changed: 28 additions & 0 deletions
@@ -74,6 +74,34 @@ Backward Incompatible Changes
 
 Enhancements
 ~~~~~~~~~~~~
+
+- Support for ``data_vars`` and ``coords`` keywords added to
+  :py:func:`~xarray.open_mfdataset`
+  (:issue:`438`):
+
+  .. ipython::
+    :verbatim:
+
+    # allows opening multiple files as
+    ds = xarray.open_mfdataset(paths, chunks={'time': 100}, data_vars='minimal')
+    # instead of
+    ds = xarray.concat([xarray.open_dataset(p, chunks={'time': 100}) for p in paths], data_vars='minimal', dim='time')
+    # in the cases when the files contain the same coordinate variables that should not be concatenated (e.g. lon, lat)
+
+    # with 'minimal', the time dimension is not added to spatial coordinates
+    In [1]: ds = xarray.open_mfdataset('daymet_v3_tmin_*', data_vars='all')
+
+    In [2]: ds['lon'].shape
+
+    Out[2]: (13505, 808, 782)
+
+    In [3]: ds = xarray.open_mfdataset('daymet_v3_tmin_*', data_vars='minimal')
+
+    In [4]: ds['lon'].shape
+
+    Out[4]: (808, 782)
+
+    # I also noticed that memory-intensive applications use much less memory and run faster when ``data_vars='minimal'`` is used
+
+  By `Oleksandr Huziy <https://github.com/guziy>`_.
 
 - Support for `pathlib.Path` objects added to
   :py:func:`~xarray.open_dataset`, :py:func:`~xarray.open_mfdataset`,
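Not part of the commit, but a minimal runnable sketch of the behaviour the changelog entry above describes. The file names ('part0.nc', 'part1.nc') and variable names are invented for illustration, and it assumes an xarray build that includes this change, plus netCDF4 and dask:

import numpy as np
import xarray as xr

# two files that share a static 'lon' variable and are split along 'time'
for i, fname in enumerate(['part0.nc', 'part1.nc']):
    part = xr.Dataset(
        data_vars={'tmin': (('time', 'x'), np.random.randn(5, 4)),
                   'lon': ('x', np.linspace(0., 3., 4))},
        coords={'time': np.arange(5 * i, 5 * (i + 1)), 'x': np.arange(4)})
    part.to_netcdf(fname)

# data_vars='all' concatenates 'lon' as well, adding a 'time' dimension
ds_all = xr.open_mfdataset(['part0.nc', 'part1.nc'], data_vars='all')
print(ds_all['lon'].shape)      # (10, 4)

# data_vars='minimal' leaves the static 'lon' variable untouched
ds_min = xr.open_mfdataset(['part0.nc', 'part1.nc'], data_vars='minimal')
print(ds_min['lon'].shape)      # (4,)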

xarray/backends/api.py

Lines changed: 41 additions & 6 deletions
@@ -431,7 +431,7 @@ def close(self):
 
 def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
                    compat='no_conflicts', preprocess=None, engine=None,
-                   lock=None, **kwargs):
+                   lock=None, data_vars='all', coords='different', **kwargs):
     """Open multiple files as a single dataset.
 
     Requires dask to be installed. Attributes from the first dataset file
@@ -487,6 +487,32 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
         default, a per-variable lock is used when reading data from netCDF
         files with the netcdf4 and h5netcdf engines to avoid issues with
         concurrent access when using dask's multithreaded backend.
+    data_vars : {'minimal', 'different', 'all' or list of str}, optional
+        These data variables will be concatenated together:
+          * 'minimal': Only data variables in which the dimension already
+            appears are included.
+          * 'different': Data variables which are not equal (ignoring
+            attributes) across all datasets are also concatenated (as well as
+            all for which the dimension already appears). Beware: this option
+            may load the data payload of data variables into memory if they
+            are not already loaded.
+          * 'all': All data variables will be concatenated.
+          * list of str: The listed data variables will be concatenated, in
+            addition to the 'minimal' data variables.
+    coords : {'minimal', 'different', 'all' or list of str}, optional
+        These coordinate variables will be concatenated together:
+          * 'minimal': Only coordinates in which the dimension already
+            appears are included.
+          * 'different': Coordinates which are not equal (ignoring
+            attributes) across all datasets are also concatenated (as well
+            as all for which the dimension already appears). Beware: this
+            option may load the data payload of coordinate variables into
+            memory if they are not already loaded.
+          * 'all': All coordinate variables will be concatenated, except
+            those corresponding to other dimensions.
+          * list of str: The listed coordinate variables will be
+            concatenated, in addition to the 'minimal' coordinates.
+
     **kwargs : optional
         Additional arguments passed on to :py:func:`xarray.open_dataset`.
 
@@ -516,13 +542,22 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
     if preprocess is not None:
         datasets = [preprocess(ds) for ds in datasets]
 
-    if concat_dim is _CONCAT_DIM_DEFAULT:
-        combined = auto_combine(datasets, compat=compat)
-    else:
-        combined = auto_combine(datasets, concat_dim=concat_dim, compat=compat)
+    # close datasets in case of a ValueError
+    try:
+        if concat_dim is _CONCAT_DIM_DEFAULT:
+            combined = auto_combine(datasets, compat=compat,
+                                    data_vars=data_vars, coords=coords)
+        else:
+            combined = auto_combine(datasets, concat_dim=concat_dim,
+                                    compat=compat,
+                                    data_vars=data_vars, coords=coords)
+    except ValueError:
+        for ds in datasets:
+            ds.close()
+        raise
+
     combined._file_obj = _MultiFileCloser(file_objs)
     combined.attrs = datasets[0].attrs
-
     return combined
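The try/except added above exists so that files already opened by open_mfdataset are closed again if auto_combine rejects its inputs; per the commit message this matters on Windows, where open handles block file deletion. A generic sketch of the same cleanup pattern (the helper name combine_or_close is invented, not xarray API):

import xarray as xr

def combine_or_close(paths, **concat_kwargs):
    # open everything up front, as open_mfdataset does
    datasets = [xr.open_dataset(p) for p in paths]
    try:
        # an invalid data_vars/coords value surfaces here as a ValueError
        combined = xr.concat(datasets, dim='time', **concat_kwargs)
    except ValueError:
        # release all file handles before propagating the error
        for ds in datasets:
            ds.close()
        raise
    return combined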

xarray/core/combine.py

Lines changed: 11 additions & 4 deletions
@@ -340,7 +340,7 @@ def _dataarray_concat(arrays, dim, data_vars, coords, compat,
     return arrays[0]._from_temp_dataset(ds, name)
 
 
-def _auto_concat(datasets, dim=None):
+def _auto_concat(datasets, dim=None, data_vars='all', coords='different'):
     if len(datasets) == 1:
         return datasets[0]
     else:
@@ -362,15 +362,16 @@ def _auto_concat(datasets, dim=None):
                              'supply the ``concat_dim`` argument '
                              'explicitly')
         dim, = concat_dims
-        return concat(datasets, dim=dim)
+        return concat(datasets, dim=dim, data_vars=data_vars, coords=coords)
 
 
 _CONCAT_DIM_DEFAULT = '__infer_concat_dim__'
 
 
 def auto_combine(datasets,
                  concat_dim=_CONCAT_DIM_DEFAULT,
-                 compat='no_conflicts'):
+                 compat='no_conflicts',
+                 data_vars='all', coords='different'):
     """Attempt to auto-magically combine the given datasets into one.
 
     This method attempts to combine a list of datasets into a single entity by
@@ -411,6 +412,10 @@ def auto_combine(datasets,
     - 'no_conflicts': only values which are not null in both datasets
       must be equal. The returned dataset then contains the combination
      of all non-null values.
+    data_vars : {'minimal', 'different', 'all' or list of str}, optional
+        Details are in the documentation of concat
+    coords : {'minimal', 'different', 'all' or list of str}, optional
+        Details are in the documentation of concat
 
     Returns
     -------
@@ -426,7 +431,9 @@ def auto_combine(datasets,
         dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim
         grouped = itertoolz.groupby(lambda ds: tuple(sorted(ds.data_vars)),
                                     datasets).values()
-        concatenated = [_auto_concat(ds, dim=dim) for ds in grouped]
+        concatenated = [_auto_concat(ds, dim=dim,
+                                     data_vars=data_vars, coords=coords)
+                        for ds in grouped]
     else:
         concatenated = datasets
     merged = merge(concatenated, compat=compat)
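The groupby call above is where the new keywords take effect: auto_combine buckets the datasets by their data-variable names, concatenates each bucket with the caller's data_vars/coords settings, and merges the results. A plain-Python sketch of just the grouping step (a stand-in for toolz's itertoolz.groupby; the helper name is invented):

from collections import defaultdict

def group_datasets_by_vars(datasets):
    # same key as the lambda passed to itertoolz.groupby above
    groups = defaultdict(list)
    for ds in datasets:
        groups[tuple(sorted(ds.data_vars))].append(ds)
    # each bucket is then passed to _auto_concat(..., data_vars=..., coords=...)
    return list(groups.values())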

xarray/tests/test_backends.py

Lines changed: 124 additions & 0 deletions
@@ -1268,6 +1268,130 @@ def test_4_open_large_num_files_h5netcdf(self):
         self.validate_open_mfdataset_large_num_files(engine=['h5netcdf'])
 
 
+@requires_scipy_or_netCDF4
+class OpenMFDatasetWithDataVarsAndCoordsKwTest(TestCase):
+    coord_name = 'lon'
+    var_name = 'v1'
+
+    @contextlib.contextmanager
+    def setup_files_and_datasets(self):
+        ds1, ds2 = self.gen_datasets_with_common_coord_and_time()
+        with create_tmp_file() as tmpfile1:
+            with create_tmp_file() as tmpfile2:
+
+                # save data to the temporary files
+                ds1.to_netcdf(tmpfile1)
+                ds2.to_netcdf(tmpfile2)
+
+                yield [tmpfile1, tmpfile2], [ds1, ds2]
+
+    def gen_datasets_with_common_coord_and_time(self):
+        # create coordinate data
+        nx = 10
+        nt = 10
+        x = np.arange(nx)
+        t1 = np.arange(nt)
+        t2 = np.arange(nt, 2 * nt, 1)
+
+        v1 = np.random.randn(nt, nx)
+        v2 = np.random.randn(nt, nx)
+
+        ds1 = Dataset(data_vars={self.var_name: (['t', 'x'], v1),
+                                 self.coord_name: ('x', 2 * x)},
+                      coords={
+                          't': (['t', ], t1),
+                          'x': (['x', ], x)
+                      })
+
+        ds2 = Dataset(data_vars={self.var_name: (['t', 'x'], v2),
+                                 self.coord_name: ('x', 2 * x)},
+                      coords={
+                          't': (['t', ], t2),
+                          'x': (['x', ], x)
+                      })
+
+        return ds1, ds2
+
+    def test_open_mfdataset_does_same_as_concat(self):
+        options = ['all', 'minimal', 'different', ]
+
+        with self.setup_files_and_datasets() as (files, [ds1, ds2]):
+            for opt in options:
+                with open_mfdataset(files, data_vars=opt) as ds:
+                    kwargs = dict(data_vars=opt, dim='t')
+                    ds_expect = xr.concat([ds1, ds2], **kwargs)
+                    self.assertDatasetIdentical(ds, ds_expect)
+
+                with open_mfdataset(files, coords=opt) as ds:
+                    kwargs = dict(coords=opt, dim='t')
+                    ds_expect = xr.concat([ds1, ds2], **kwargs)
+                    self.assertDatasetIdentical(ds, ds_expect)
+
+    def test_common_coord_when_datavars_all(self):
+        opt = 'all'
+
+        with self.setup_files_and_datasets() as (files, [ds1, ds2]):
+            # open the files with the data_vars option
+            with open_mfdataset(files, data_vars=opt) as ds:
+
+                coord_shape = ds[self.coord_name].shape
+                coord_shape1 = ds1[self.coord_name].shape
+                coord_shape2 = ds2[self.coord_name].shape
+
+                var_shape = ds[self.var_name].shape
+
+                # shape pairs to be compared
+                shape_pairs = [
+                    (var_shape, coord_shape),
+                    (coord_shape1, coord_shape),
+                    (coord_shape2, coord_shape)
+                ]
+                # tests to be applied to respective pairs
+                tests = [self.assertEqual,
+                         self.assertNotEqual, self.assertNotEqual]
+
+                for a_test, a_shape_pair in zip(tests, shape_pairs):
+                    a_test(*a_shape_pair)
+
+    def test_common_coord_when_datavars_minimal(self):
+        opt = 'minimal'
+
+        with self.setup_files_and_datasets() as (files, [ds1, ds2]):
+            # open the files using the data_vars option
+            with open_mfdataset(files, data_vars=opt) as ds:
+
+                coord_shape = ds[self.coord_name].shape
+                coord_shape1 = ds1[self.coord_name].shape
+                coord_shape2 = ds2[self.coord_name].shape
+
+                var_shape = ds[self.var_name].shape
+
+                # shape pairs to be compared
+                shape_pairs = [
+                    (var_shape, coord_shape),
+                    (coord_shape1, coord_shape),
+                    (coord_shape2, coord_shape)
+                ]
+                # tests to be applied to respective pairs
+                tests = [self.assertNotEqual,
+                         self.assertEqual, self.assertEqual]
+
+                for a_test, a_shape_pair in zip(tests, shape_pairs):
+                    a_test(*a_shape_pair)
+
+    def test_invalid_data_vars_value_should_fail(self):
+
+        with self.setup_files_and_datasets() as (files, _):
+            with self.assertRaises(ValueError):
+                with open_mfdataset(files, data_vars='minimum'):
+                    pass
+
+            # test invalid coords parameter
+            with self.assertRaises(ValueError):
+                with open_mfdataset(files, coords='minimum'):
+                    pass
+
+
 @requires_dask
 @requires_scipy
 @requires_netCDF4
