Skip to content

Commit 0512da1

Browse files
New inline_array kwarg for open_dataset (#6566)
* added inline_array kwarg * remove cheeky print statements * Remove another rogue print statement * bump dask dependency * update multiple dependencies based on min-deps-check.py * update environment to match #6559 * Update h5py in ci/requirements/min-all-deps.yml * Update ci/requirements/min-all-deps.yml * remove pynio from test env * Update ci/requirements/min-all-deps.yml * promote inline_array kwarg to be top-level kwarg * whatsnew * add test * Remove repeated docstring entry Co-authored-by: Deepak Cherian <[email protected]> * Remove repeated docstring entry Co-authored-by: Deepak Cherian <[email protected]> * hyperlink to dask functions Co-authored-by: Deepak Cherian <[email protected]>
1 parent cad4474 commit 0512da1

File tree

6 files changed

+82
-4
lines changed

6 files changed

+82
-4
lines changed

doc/whats-new.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ New Features
4141
- Allow passing chunks in ``**kwargs`` form to :py:meth:`Dataset.chunk`, :py:meth:`DataArray.chunk`, and
4242
:py:meth:`Variable.chunk`. (:pull:`6471`)
4343
By `Tom Nicholas <https://github.com/TomNicholas>`_.
44+
- Expose `inline_array` kwarg from `dask.array.from_array` in :py:func:`open_dataset`, :py:meth:`Dataset.chunk`,
45+
:py:meth:`DataArray.chunk`, and :py:meth:`Variable.chunk`. (:pull:`6471`)
46+
By `Tom Nicholas <https://github.com/TomNicholas>`_.
4447
- :py:meth:`xr.polyval` now supports :py:class:`Dataset` and :py:class:`DataArray` args of any shape,
4548
is faster and requires less memory. (:pull:`6548`)
4649
By `Michael Niklas <https://github.com/headtr1ck>`_.

xarray/backends/api.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,7 @@ def _chunk_ds(
274274
engine,
275275
chunks,
276276
overwrite_encoded_chunks,
277+
inline_array,
277278
**extra_tokens,
278279
):
279280
from dask.base import tokenize
@@ -292,6 +293,7 @@ def _chunk_ds(
292293
overwrite_encoded_chunks=overwrite_encoded_chunks,
293294
name_prefix=name_prefix,
294295
token=token,
296+
inline_array=inline_array,
295297
)
296298
return backend_ds._replace(variables)
297299

@@ -303,6 +305,7 @@ def _dataset_from_backend_dataset(
303305
chunks,
304306
cache,
305307
overwrite_encoded_chunks,
308+
inline_array,
306309
**extra_tokens,
307310
):
308311
if not isinstance(chunks, (int, dict)) and chunks not in {None, "auto"}:
@@ -320,6 +323,7 @@ def _dataset_from_backend_dataset(
320323
engine,
321324
chunks,
322325
overwrite_encoded_chunks,
326+
inline_array,
323327
**extra_tokens,
324328
)
325329

@@ -346,6 +350,7 @@ def open_dataset(
346350
concat_characters=None,
347351
decode_coords=None,
348352
drop_variables=None,
353+
inline_array=False,
349354
backend_kwargs=None,
350355
**kwargs,
351356
):
@@ -430,6 +435,12 @@ def open_dataset(
430435
A variable or list of variables to exclude from being parsed from the
431436
dataset. This may be useful to drop variables with problems or
432437
inconsistent values.
438+
inline_array: bool, optional
439+
How to include the array in the dask task graph.
440+
By default(``inline_array=False``) the array is included in a task by
441+
itself, and each chunk refers to that task by its key. With
442+
``inline_array=True``, Dask will instead inline the array directly
443+
in the values of the task graph. See :py:func:`dask.array.from_array`.
433444
backend_kwargs: dict
434445
Additional keyword arguments passed on to the engine open function,
435446
equivalent to `**kwargs`.
@@ -505,6 +516,7 @@ def open_dataset(
505516
chunks,
506517
cache,
507518
overwrite_encoded_chunks,
519+
inline_array,
508520
drop_variables=drop_variables,
509521
**decoders,
510522
**kwargs,
@@ -526,6 +538,7 @@ def open_dataarray(
526538
concat_characters=None,
527539
decode_coords=None,
528540
drop_variables=None,
541+
inline_array=False,
529542
backend_kwargs=None,
530543
**kwargs,
531544
):
@@ -613,6 +626,12 @@ def open_dataarray(
613626
A variable or list of variables to exclude from being parsed from the
614627
dataset. This may be useful to drop variables with problems or
615628
inconsistent values.
629+
inline_array: bool, optional
630+
How to include the array in the dask task graph.
631+
By default(``inline_array=False``) the array is included in a task by
632+
itself, and each chunk refers to that task by its key. With
633+
``inline_array=True``, Dask will instead inline the array directly
634+
in the values of the task graph. See :py:func:`dask.array.from_array`.
616635
backend_kwargs: dict
617636
Additional keyword arguments passed on to the engine open function,
618637
equivalent to `**kwargs`.
@@ -660,6 +679,7 @@ def open_dataarray(
660679
chunks=chunks,
661680
cache=cache,
662681
drop_variables=drop_variables,
682+
inline_array=inline_array,
663683
backend_kwargs=backend_kwargs,
664684
use_cftime=use_cftime,
665685
decode_timedelta=decode_timedelta,

xarray/core/dataarray.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1114,6 +1114,7 @@ def chunk(
11141114
name_prefix: str = "xarray-",
11151115
token: str = None,
11161116
lock: bool = False,
1117+
inline_array: bool = False,
11171118
**chunks_kwargs: Any,
11181119
) -> DataArray:
11191120
"""Coerce this array's data into a dask arrays with the given chunks.
@@ -1138,13 +1139,23 @@ def chunk(
11381139
lock : optional
11391140
Passed on to :py:func:`dask.array.from_array`, if the array is not
11401141
already as dask array.
1142+
inline_array: optional
1143+
Passed on to :py:func:`dask.array.from_array`, if the array is not
1144+
already as dask array.
11411145
**chunks_kwargs : {dim: chunks, ...}, optional
11421146
The keyword arguments form of ``chunks``.
11431147
One of chunks or chunks_kwargs must be provided.
11441148
11451149
Returns
11461150
-------
11471151
chunked : xarray.DataArray
1152+
1153+
See Also
1154+
--------
1155+
DataArray.chunks
1156+
DataArray.chunksizes
1157+
xarray.unify_chunks
1158+
dask.array.from_array
11481159
"""
11491160
if chunks is None:
11501161
warnings.warn(
@@ -1163,7 +1174,11 @@ def chunk(
11631174
chunks = either_dict_or_kwargs(chunks, chunks_kwargs, "chunk")
11641175

11651176
ds = self._to_temp_dataset().chunk(
1166-
chunks, name_prefix=name_prefix, token=token, lock=lock
1177+
chunks,
1178+
name_prefix=name_prefix,
1179+
token=token,
1180+
lock=lock,
1181+
inline_array=inline_array,
11671182
)
11681183
return self._from_temp_dataset(ds)
11691184

xarray/core/dataset.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ def _maybe_chunk(
240240
lock=None,
241241
name_prefix="xarray-",
242242
overwrite_encoded_chunks=False,
243+
inline_array=False,
243244
):
244245
from dask.base import tokenize
245246

@@ -251,7 +252,7 @@ def _maybe_chunk(
251252
# subtle bugs result otherwise. see GH3350
252253
token2 = tokenize(name, token if token else var._data, chunks)
253254
name2 = f"{name_prefix}{name}-{token2}"
254-
var = var.chunk(chunks, name=name2, lock=lock)
255+
var = var.chunk(chunks, name=name2, lock=lock, inline_array=inline_array)
255256

256257
if overwrite_encoded_chunks and var.chunks is not None:
257258
var.encoding["chunks"] = tuple(x[0] for x in var.chunks)
@@ -1995,6 +1996,7 @@ def chunk(
19951996
name_prefix: str = "xarray-",
19961997
token: str = None,
19971998
lock: bool = False,
1999+
inline_array: bool = False,
19982000
**chunks_kwargs: Any,
19992001
) -> Dataset:
20002002
"""Coerce all arrays in this dataset into dask arrays with the given
@@ -2019,6 +2021,9 @@ def chunk(
20192021
lock : optional
20202022
Passed on to :py:func:`dask.array.from_array`, if the array is not
20212023
already as dask array.
2024+
inline_array: optional
2025+
Passed on to :py:func:`dask.array.from_array`, if the array is not
2026+
already as dask array.
20222027
**chunks_kwargs : {dim: chunks, ...}, optional
20232028
The keyword arguments form of ``chunks``.
20242029
One of chunks or chunks_kwargs must be provided
@@ -2032,6 +2037,7 @@ def chunk(
20322037
Dataset.chunks
20332038
Dataset.chunksizes
20342039
xarray.unify_chunks
2040+
dask.array.from_array
20352041
"""
20362042
if chunks is None and chunks_kwargs is None:
20372043
warnings.warn(

xarray/core/variable.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1023,6 +1023,7 @@ def chunk(
10231023
) = {},
10241024
name: str = None,
10251025
lock: bool = False,
1026+
inline_array: bool = False,
10261027
**chunks_kwargs: Any,
10271028
) -> Variable:
10281029
"""Coerce this array's data into a dask array with the given chunks.
@@ -1046,13 +1047,23 @@ def chunk(
10461047
lock : optional
10471048
Passed on to :py:func:`dask.array.from_array`, if the array is not
10481049
already as dask array.
1050+
inline_array: optional
1051+
Passed on to :py:func:`dask.array.from_array`, if the array is not
1052+
already as dask array.
10491053
**chunks_kwargs : {dim: chunks, ...}, optional
10501054
The keyword arguments form of ``chunks``.
10511055
One of chunks or chunks_kwargs must be provided.
10521056
10531057
Returns
10541058
-------
10551059
chunked : xarray.Variable
1060+
1061+
See Also
1062+
--------
1063+
Variable.chunks
1064+
Variable.chunksizes
1065+
xarray.unify_chunks
1066+
dask.array.from_array
10561067
"""
10571068
import dask.array as da
10581069

@@ -1098,7 +1109,9 @@ def chunk(
10981109
if utils.is_dict_like(chunks):
10991110
chunks = tuple(chunks.get(n, s) for n, s in enumerate(self.shape))
11001111

1101-
data = da.from_array(data, chunks, name=name, lock=lock, **kwargs)
1112+
data = da.from_array(
1113+
data, chunks, name=name, lock=lock, inline_array=inline_array, **kwargs
1114+
)
11021115

11031116
return self._replace(data=data)
11041117

@@ -2710,7 +2723,7 @@ def values(self, values):
27102723
f"Please use DataArray.assign_coords, Dataset.assign_coords or Dataset.assign as appropriate."
27112724
)
27122725

2713-
def chunk(self, chunks={}, name=None, lock=False):
2726+
def chunk(self, chunks={}, name=None, lock=False, inline_array=False):
27142727
# Dummy - do not chunk. This method is invoked e.g. by Dataset.chunk()
27152728
return self.copy(deep=False)
27162729

xarray/tests/test_backends.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3840,6 +3840,27 @@ def test_load_dataarray(self):
38403840
# load_dataarray
38413841
ds.to_netcdf(tmp)
38423842

3843+
@pytest.mark.skipif(
3844+
ON_WINDOWS,
3845+
reason="counting number of tasks in graph fails on windows for some reason",
3846+
)
3847+
def test_inline_array(self):
3848+
with create_tmp_file() as tmp:
3849+
original = Dataset({"foo": ("x", np.random.randn(10))})
3850+
original.to_netcdf(tmp)
3851+
chunks = {"time": 10}
3852+
3853+
def num_graph_nodes(obj):
3854+
return len(obj.__dask_graph__())
3855+
3856+
not_inlined = open_dataset(tmp, inline_array=False, chunks=chunks)
3857+
inlined = open_dataset(tmp, inline_array=True, chunks=chunks)
3858+
assert num_graph_nodes(inlined) < num_graph_nodes(not_inlined)
3859+
3860+
not_inlined = open_dataarray(tmp, inline_array=False, chunks=chunks)
3861+
inlined = open_dataarray(tmp, inline_array=True, chunks=chunks)
3862+
assert num_graph_nodes(inlined) < num_graph_nodes(not_inlined)
3863+
38433864

38443865
@requires_scipy_or_netCDF4
38453866
@requires_pydap

0 commit comments

Comments
 (0)