
Commit 27132fb

guziy authored and Joe Hamman committed
data_vars option added to open_mfdataset (#1580)
* add data_vars option to open_mfdataset
* use single quotes
* fix the 'line too long' warning from flake8
* document the data_vars keyword for open_mfdataset
* improve the data_vars record in whats-new
* update my name in whats-new.rst
* start writing the test for the data_vars keyword
* use the data_vars keyword in combine
* address flake8 warnings for test_backend.py
* ignore flake8 warnings concerning whats-new.rst
* fix function reference in whats-new.rst
* open_mfdataset does not accept dim keyword argument
* use single quotes for strings in the added tests
* refactor data_vars related tests
* use with for opening mfdataset in data_vars related tests
* add @requires_scipy_or_netCDF4 to the data_vars test class
* address flake8 warnings about long lines in the data_vars related tests
* close opened datasets in case of a ValueError in open_mfdataset; seems important for Windows
* fix line too long warnings from flake8
* refactor tests and open_mfdataset to address comments
* refactor tests for data_vars keyword in open_mfdataset
* refactor to address flake8 warnings
* add another example of data_vars usage in open_mfdataset
* add coords keyword to open_mfdataset
* add memory and performance related observations to whats-new and modify code snippets to use single quotes for consistency
* fix a grammar mistake
* quote variable names referenced in the text
* add tests for the coords keyword in open_mfdataset, along with similar tests for the data_vars keyword
* split a test into 2 to simplify; introduce a context manager for setting up test inputs in OpenMFDatasetWithDataVarsAndCoordsKwTest
1 parent 57ccf42 commit 27132fb

File tree

4 files changed: +204 -10 lines

doc/whats-new.rst

Lines changed: 28 additions & 0 deletions
@@ -74,6 +74,34 @@ Backward Incompatible Changes
 
 Enhancements
 ~~~~~~~~~~~~
+
+- Support for ``data_vars`` and ``coords`` keywords added to
+  :py:func:`~xarray.open_mfdataset`
+  (:issue:`438`):
+
+  .. ipython::
+    :verbatim:
+
+    # allows opening multiple files as
+    ds = xarray.open_mfdataset(paths, chunks={'time': 100}, data_vars='minimal')
+    # instead of
+    ds = xarray.concat([xarray.open_dataset(p, chunks={'time': 100}) for p in paths], data_vars='minimal', dim='time')
+    # in the cases when the files contain the same coordinate variables that should not be concatenated (e.g. lon, lat)
+
+    # with 'minimal', the time dimension is not added to spatial coordinates
+    In [1]: ds = xarray.open_mfdataset('daymet_v3_tmin_*', data_vars='all')
+
+    In [2]: ds['lon'].shape
+
+    Out[2]: (13505, 808, 782)
+
+    In [3]: ds = xarray.open_mfdataset('daymet_v3_tmin_*', data_vars='minimal')
+
+    In [4]: ds['lon'].shape
+
+    Out[4]: (808, 782)
+
+    # I also noticed that memory-intensive applications use much less memory and run faster when ``data_vars='minimal'`` is used
+
+  By `Oleksandr Huziy <https://github.com/guziy>`_.
 
 - Support for `pathlib.Path` objects added to
   :py:func:`~xarray.open_dataset`, :py:func:`~xarray.open_mfdataset`,
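Not part of the commit, but a minimal runnable sketch of the behaviour the changelog entry above describes. The file names ('part0.nc', 'part1.nc') and variable names are invented for illustration, and it assumes an xarray build that includes this change, plus netCDF4 and dask:

import numpy as np
import xarray as xr

# two files that share a static 'lon' variable and are split along 'time'
for i, fname in enumerate(['part0.nc', 'part1.nc']):
    part = xr.Dataset(
        data_vars={'tmin': (('time', 'x'), np.random.randn(5, 4)),
                   'lon': ('x', np.linspace(0., 3., 4))},
        coords={'time': np.arange(5 * i, 5 * (i + 1)), 'x': np.arange(4)})
    part.to_netcdf(fname)

# data_vars='all' concatenates 'lon' as well, adding a 'time' dimension
ds_all = xr.open_mfdataset(['part0.nc', 'part1.nc'], data_vars='all')
print(ds_all['lon'].shape)      # (10, 4)

# data_vars='minimal' leaves the static 'lon' variable untouched
ds_min = xr.open_mfdataset(['part0.nc', 'part1.nc'], data_vars='minimal')
print(ds_min['lon'].shape)      # (4,)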

xarray/backends/api.py

Lines changed: 41 additions & 6 deletions
@@ -431,7 +431,7 @@ def close(self):
 
 def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
                    compat='no_conflicts', preprocess=None, engine=None,
-                   lock=None, **kwargs):
+                   lock=None, data_vars='all', coords='different', **kwargs):
     """Open multiple files as a single dataset.
 
     Requires dask to be installed. Attributes from the first dataset file
@@ -487,6 +487,32 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
         default, a per-variable lock is used when reading data from netCDF
         files with the netcdf4 and h5netcdf engines to avoid issues with
         concurrent access when using dask's multithreaded backend.
+    data_vars : {'minimal', 'different', 'all' or list of str}, optional
+        These data variables will be concatenated together:
+          * 'minimal': Only data variables in which the dimension already
+            appears are included.
+          * 'different': Data variables which are not equal (ignoring
+            attributes) across all datasets are also concatenated (as well as
+            all for which the dimension already appears). Beware: this option
+            may load the data payload of data variables into memory if they
+            are not already loaded.
+          * 'all': All data variables will be concatenated.
+          * list of str: The listed data variables will be concatenated, in
+            addition to the 'minimal' data variables.
+    coords : {'minimal', 'different', 'all' or list of str}, optional
+        These coordinate variables will be concatenated together:
+          * 'minimal': Only coordinates in which the dimension already
+            appears are included.
+          * 'different': Coordinates which are not equal (ignoring
+            attributes) across all datasets are also concatenated (as well
+            as all for which the dimension already appears). Beware: this
+            option may load the data payload of coordinate variables into
+            memory if they are not already loaded.
+          * 'all': All coordinate variables will be concatenated, except
+            those corresponding to other dimensions.
+          * list of str: The listed coordinate variables will be
+            concatenated, in addition to the 'minimal' coordinates.
+
     **kwargs : optional
         Additional arguments passed on to :py:func:`xarray.open_dataset`.
 
@@ -516,13 +542,22 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
     if preprocess is not None:
         datasets = [preprocess(ds) for ds in datasets]
 
-    if concat_dim is _CONCAT_DIM_DEFAULT:
-        combined = auto_combine(datasets, compat=compat)
-    else:
-        combined = auto_combine(datasets, concat_dim=concat_dim, compat=compat)
+    # close datasets in case of a ValueError
+    try:
+        if concat_dim is _CONCAT_DIM_DEFAULT:
+            combined = auto_combine(datasets, compat=compat,
+                                    data_vars=data_vars, coords=coords)
+        else:
+            combined = auto_combine(datasets, concat_dim=concat_dim,
+                                    compat=compat,
+                                    data_vars=data_vars, coords=coords)
+    except ValueError:
+        for ds in datasets:
+            ds.close()
+        raise
+
     combined._file_obj = _MultiFileCloser(file_objs)
     combined.attrs = datasets[0].attrs
-
     return combined
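The try/except added above exists so that files already opened by open_mfdataset are closed again if auto_combine rejects its inputs; per the commit message this matters on Windows, where open handles block file deletion. A generic sketch of the same cleanup pattern (the helper name combine_or_close is invented, not xarray API):

import xarray as xr

def combine_or_close(paths, **concat_kwargs):
    # open everything up front, as open_mfdataset does
    datasets = [xr.open_dataset(p) for p in paths]
    try:
        # an invalid data_vars/coords value surfaces here as a ValueError
        combined = xr.concat(datasets, dim='time', **concat_kwargs)
    except ValueError:
        # release all file handles before propagating the error
        for ds in datasets:
            ds.close()
        raise
    return combined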

xarray/core/combine.py

Lines changed: 11 additions & 4 deletions
@@ -340,7 +340,7 @@ def _dataarray_concat(arrays, dim, data_vars, coords, compat,
     return arrays[0]._from_temp_dataset(ds, name)
 
 
-def _auto_concat(datasets, dim=None):
+def _auto_concat(datasets, dim=None, data_vars='all', coords='different'):
     if len(datasets) == 1:
         return datasets[0]
     else:
@@ -362,15 +362,16 @@ def _auto_concat(datasets, dim=None):
                              'supply the ``concat_dim`` argument '
                              'explicitly')
         dim, = concat_dims
-        return concat(datasets, dim=dim)
+        return concat(datasets, dim=dim, data_vars=data_vars, coords=coords)
 
 
 _CONCAT_DIM_DEFAULT = '__infer_concat_dim__'
 
 
 def auto_combine(datasets,
                  concat_dim=_CONCAT_DIM_DEFAULT,
-                 compat='no_conflicts'):
+                 compat='no_conflicts',
+                 data_vars='all', coords='different'):
     """Attempt to auto-magically combine the given datasets into one.
 
     This method attempts to combine a list of datasets into a single entity by
@@ -411,6 +412,10 @@ def auto_combine(datasets,
     - 'no_conflicts': only values which are not null in both datasets
       must be equal. The returned dataset then contains the combination
      of all non-null values.
+    data_vars : {'minimal', 'different', 'all' or list of str}, optional
+        Details are in the documentation of concat
+    coords : {'minimal', 'different', 'all' or list of str}, optional
+        Details are in the documentation of concat
 
     Returns
     -------
@@ -426,7 +431,9 @@ def auto_combine(datasets,
         dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim
         grouped = itertoolz.groupby(lambda ds: tuple(sorted(ds.data_vars)),
                                     datasets).values()
-        concatenated = [_auto_concat(ds, dim=dim) for ds in grouped]
+        concatenated = [_auto_concat(ds, dim=dim,
+                                     data_vars=data_vars, coords=coords)
+                        for ds in grouped]
     else:
         concatenated = datasets
     merged = merge(concatenated, compat=compat)
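The groupby call above is where the new keywords take effect: auto_combine buckets the datasets by their data-variable names, concatenates each bucket with the caller's data_vars/coords settings, and merges the results. A plain-Python sketch of just the grouping step (a stand-in for toolz's itertoolz.groupby; the helper name is invented):

from collections import defaultdict

def group_datasets_by_vars(datasets):
    # same key as the lambda passed to itertoolz.groupby above
    groups = defaultdict(list)
    for ds in datasets:
        groups[tuple(sorted(ds.data_vars))].append(ds)
    # each bucket is then passed to _auto_concat(..., data_vars=..., coords=...)
    return list(groups.values())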

xarray/tests/test_backends.py

Lines changed: 124 additions & 0 deletions
@@ -1268,6 +1268,130 @@ def test_4_open_large_num_files_h5netcdf(self):
         self.validate_open_mfdataset_large_num_files(engine=['h5netcdf'])
 
 
+@requires_scipy_or_netCDF4
+class OpenMFDatasetWithDataVarsAndCoordsKwTest(TestCase):
+    coord_name = 'lon'
+    var_name = 'v1'
+
+    @contextlib.contextmanager
+    def setup_files_and_datasets(self):
+        ds1, ds2 = self.gen_datasets_with_common_coord_and_time()
+        with create_tmp_file() as tmpfile1:
+            with create_tmp_file() as tmpfile2:
+
+                # save data to the temporary files
+                ds1.to_netcdf(tmpfile1)
+                ds2.to_netcdf(tmpfile2)
+
+                yield [tmpfile1, tmpfile2], [ds1, ds2]
+
+    def gen_datasets_with_common_coord_and_time(self):
+        # create coordinate data
+        nx = 10
+        nt = 10
+        x = np.arange(nx)
+        t1 = np.arange(nt)
+        t2 = np.arange(nt, 2 * nt, 1)
+
+        v1 = np.random.randn(nt, nx)
+        v2 = np.random.randn(nt, nx)
+
+        ds1 = Dataset(data_vars={self.var_name: (['t', 'x'], v1),
+                                 self.coord_name: ('x', 2 * x)},
+                      coords={
+                          't': (['t', ], t1),
+                          'x': (['x', ], x)
+                      })
+
+        ds2 = Dataset(data_vars={self.var_name: (['t', 'x'], v2),
+                                 self.coord_name: ('x', 2 * x)},
+                      coords={
+                          't': (['t', ], t2),
+                          'x': (['x', ], x)
+                      })
+
+        return ds1, ds2
+
+    def test_open_mfdataset_does_same_as_concat(self):
+        options = ['all', 'minimal', 'different', ]
+
+        with self.setup_files_and_datasets() as (files, [ds1, ds2]):
+            for opt in options:
+                with open_mfdataset(files, data_vars=opt) as ds:
+                    kwargs = dict(data_vars=opt, dim='t')
+                    ds_expect = xr.concat([ds1, ds2], **kwargs)
+                    self.assertDatasetIdentical(ds, ds_expect)
+
+                with open_mfdataset(files, coords=opt) as ds:
+                    kwargs = dict(coords=opt, dim='t')
+                    ds_expect = xr.concat([ds1, ds2], **kwargs)
+                    self.assertDatasetIdentical(ds, ds_expect)
+
+    def test_common_coord_when_datavars_all(self):
+        opt = 'all'
+
+        with self.setup_files_and_datasets() as (files, [ds1, ds2]):
+            # open the files with the data_vars option
+            with open_mfdataset(files, data_vars=opt) as ds:
+
+                coord_shape = ds[self.coord_name].shape
+                coord_shape1 = ds1[self.coord_name].shape
+                coord_shape2 = ds2[self.coord_name].shape
+
+                var_shape = ds[self.var_name].shape
+
+                # shape pairs to be compared
+                shape_pairs = [
+                    (var_shape, coord_shape),
+                    (coord_shape1, coord_shape),
+                    (coord_shape2, coord_shape)
+                ]
+                # tests to be applied to respective pairs
+                tests = [self.assertEqual,
+                         self.assertNotEqual, self.assertNotEqual]
+
+                for a_test, a_shape_pair in zip(tests, shape_pairs):
+                    a_test(*a_shape_pair)
+
+    def test_common_coord_when_datavars_minimal(self):
+        opt = 'minimal'
+
+        with self.setup_files_and_datasets() as (files, [ds1, ds2]):
+            # open the files using the data_vars option
+            with open_mfdataset(files, data_vars=opt) as ds:
+
+                coord_shape = ds[self.coord_name].shape
+                coord_shape1 = ds1[self.coord_name].shape
+                coord_shape2 = ds2[self.coord_name].shape
+
+                var_shape = ds[self.var_name].shape
+
+                # shape pairs to be compared
+                shape_pairs = [
+                    (var_shape, coord_shape),
+                    (coord_shape1, coord_shape),
+                    (coord_shape2, coord_shape)
+                ]
+                # tests to be applied to respective pairs
+                tests = [self.assertNotEqual,
+                         self.assertEqual, self.assertEqual]
+
+                for a_test, a_shape_pair in zip(tests, shape_pairs):
+                    a_test(*a_shape_pair)
+
+    def test_invalid_data_vars_value_should_fail(self):
+
+        with self.setup_files_and_datasets() as (files, _):
+            with self.assertRaises(ValueError):
+                with open_mfdataset(files, data_vars='minimum'):
+                    pass
+
+            # test invalid coords parameter
+            with self.assertRaises(ValueError):
+                with open_mfdataset(files, coords='minimum'):
+                    pass
+
+
 @requires_dask
 @requires_scipy
 @requires_netCDF4
