diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 6eeb55c23cb..9759a152a4c 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -25,6 +25,11 @@ Breaking changes
 
 New Features
 ~~~~~~~~~~~~
+- Support using an existing, opened h5netcdf ``File`` with
+  :py:class:`~xarray.backends.H5NetCDFStore`. This permits creating an
+  :py:class:`~xarray.Dataset` from an h5netcdf ``File`` that has been opened
+  using other means (:issue:`3618`).
+  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
 - Implement :py:func:`median` and :py:func:`nanmedian` for dask arrays. This works by rechunking
   to a single chunk along all reduction axes. (:issue:`2999`).
   By `Deepak Cherian <https://github.com/dcherian>`_.
diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index eea1fb9ddce..0990fc46219 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -503,7 +503,7 @@ def maybe_decode_store(store, lock=False):
         elif engine == "pydap":
             store = backends.PydapDataStore.open(filename_or_obj, **backend_kwargs)
         elif engine == "h5netcdf":
-            store = backends.H5NetCDFStore(
+            store = backends.H5NetCDFStore.open(
                 filename_or_obj, group=group, lock=lock, **backend_kwargs
             )
         elif engine == "pynio":
@@ -527,7 +527,7 @@ def maybe_decode_store(store, lock=False):
         if engine == "scipy":
             store = backends.ScipyDataStore(filename_or_obj, **backend_kwargs)
         elif engine == "h5netcdf":
-            store = backends.H5NetCDFStore(
+            store = backends.H5NetCDFStore.open(
                 filename_or_obj, group=group, lock=lock, **backend_kwargs
             )
 
@@ -981,7 +981,7 @@ def open_mfdataset(
 WRITEABLE_STORES: Dict[str, Callable] = {
     "netcdf4": backends.NetCDF4DataStore.open,
     "scipy": backends.ScipyDataStore,
-    "h5netcdf": backends.H5NetCDFStore,
+    "h5netcdf": backends.H5NetCDFStore.open,
 }
 
 
diff --git a/xarray/backends/common.py b/xarray/backends/common.py
index 9786e0a0203..fa3ee19f542 100644
--- a/xarray/backends/common.py
+++ b/xarray/backends/common.py
@@ -34,7 +34,7 @@ def find_root_and_group(ds):
     """Find the root and group name of a netCDF4/h5netcdf dataset."""
     hierarchy = ()
     while ds.parent is not None:
-        hierarchy = (ds.name,) + hierarchy
+        hierarchy = (ds.name.split("/")[-1],) + hierarchy
         ds = ds.parent
     group = "/" + "/".join(hierarchy)
     return ds, group
diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py
index 51ed512f98b..203213afe82 100644
--- a/xarray/backends/h5netcdf_.py
+++ b/xarray/backends/h5netcdf_.py
@@ -4,9 +4,9 @@
 
 from .. import Variable
 from ..core import indexing
-from ..core.utils import FrozenDict
-from .common import WritableCFDataStore
-from .file_manager import CachingFileManager
+from ..core.utils import FrozenDict, is_remote_uri
+from .common import WritableCFDataStore, find_root_and_group
+from .file_manager import CachingFileManager, DummyFileManager
 from .locks import HDF5_LOCK, combine_locks, ensure_lock, get_write_lock
 from .netCDF4_ import (
     BaseNetCDF4Array,
@@ -69,8 +69,47 @@ class H5NetCDFStore(WritableCFDataStore):
     """Store for reading and writing data via h5netcdf
     """
 
-    def __init__(
-        self,
+    __slots__ = (
+        "autoclose",
+        "format",
+        "is_remote",
+        "lock",
+        "_filename",
+        "_group",
+        "_manager",
+        "_mode",
+    )
+
+    def __init__(self, manager, group=None, mode=None, lock=HDF5_LOCK, autoclose=False):
+
+        import h5netcdf
+
+        if isinstance(manager, (h5netcdf.File, h5netcdf.Group)):
+            if group is None:
+                root, group = find_root_and_group(manager)
+            else:
+                if type(manager) is not h5netcdf.File:
+                    raise ValueError(
+                        "must supply a h5netcdf.File if the group "
+                        "argument is provided"
+                    )
+                root = manager
+            manager = DummyFileManager(root)
+
+        self._manager = manager
+        self._group = group
+        self._mode = mode
+        self.format = None
+        # todo: utilizing find_root_and_group seems a bit clunky
+        #  making filename available on h5netcdf.Group seems better
+        self._filename = find_root_and_group(self.ds)[0].filename
+        self.is_remote = is_remote_uri(self._filename)
+        self.lock = ensure_lock(lock)
+        self.autoclose = autoclose
+
+    @classmethod
+    def open(
+        cls,
         filename,
         mode="r",
         format=None,
@@ -86,22 +125,14 @@ def __init__(
 
         kwargs = {"invalid_netcdf": invalid_netcdf}
 
-        self._manager = CachingFileManager(
-            h5netcdf.File, filename, mode=mode, kwargs=kwargs
-        )
-
         if lock is None:
             if mode == "r":
                 lock = HDF5_LOCK
             else:
                 lock = combine_locks([HDF5_LOCK, get_write_lock(filename)])
 
-        self._group = group
-        self.format = format
-        self._filename = filename
-        self._mode = mode
-        self.lock = ensure_lock(lock)
-        self.autoclose = autoclose
+        manager = CachingFileManager(h5netcdf.File, filename, mode=mode, kwargs=kwargs)
+        return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose)
 
     def _acquire(self, needs_lock=True):
         with self._manager.acquire_context(needs_lock) as root:
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 4fccdf2dd6c..0436ae9d244 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -2182,7 +2182,7 @@ class TestH5NetCDFData(NetCDF4Base):
     @contextlib.contextmanager
     def create_store(self):
         with create_tmp_file() as tmp_file:
-            yield backends.H5NetCDFStore(tmp_file, "w")
+            yield backends.H5NetCDFStore.open(tmp_file, "w")
 
     @pytest.mark.filterwarnings("ignore:complex dtypes are supported by h5py")
     @pytest.mark.parametrize(
@@ -2345,6 +2345,27 @@ def test_dump_encodings_h5py(self):
         assert actual.x.encoding["compression"] == "lzf"
         assert actual.x.encoding["compression_opts"] is None
 
+    def test_already_open_dataset_group(self):
+        import h5netcdf
+
+        with create_tmp_file() as tmp_file:
+            with nc4.Dataset(tmp_file, mode="w") as nc:
+                group = nc.createGroup("g")
+                v = group.createVariable("x", "int")
+                v[...] = 42
+
+            h5 = h5netcdf.File(tmp_file, mode="r")
+            store = backends.H5NetCDFStore(h5["g"])
+            with open_dataset(store) as ds:
+                expected = Dataset({"x": ((), 42)})
+                assert_identical(expected, ds)
+
+            h5 = h5netcdf.File(tmp_file, mode="r")
+            store = backends.H5NetCDFStore(h5, group="g")
+            with open_dataset(store) as ds:
+                expected = Dataset({"x": ((), 42)})
+                assert_identical(expected, ds)
+
 
 @requires_h5netcdf
 class TestH5NetCDFFileObject(TestH5NetCDFData):
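
Not part of the patch — a minimal usage sketch of the entry points this diff introduces, mirroring the test above. It assumes xarray with this change applied; the filename ``example.nc`` and group ``g`` are placeholders:

```python
import h5netcdf
import xarray as xr
from xarray.backends import H5NetCDFStore

# A file that was opened "using other means", e.g. with h5netcdf/h5py
# options that open_dataset does not expose.
h5 = h5netcdf.File("example.nc", mode="r")

# Wrap the already-open file, naming the group explicitly...
store = H5NetCDFStore(h5, group="g")
# ...or pass the h5netcdf.Group itself and let find_root_and_group
# recover the group path:
# store = H5NetCDFStore(h5["g"])

with xr.open_dataset(store) as ds:
    print(ds)

# Opening by filename now goes through the classmethod rather than
# __init__, mirroring NetCDF4DataStore.open in WRITEABLE_STORES:
store = H5NetCDFStore.open("example.nc", mode="r")
```

The split keeps ``CachingFileManager`` on the path-based route (``H5NetCDFStore.open``), while ``DummyFileManager`` adapts a caller-owned ``File``/``Group``, so the store never tries to reopen a handle it does not manage.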