Skip to content

Commit 24f9292

Browse files
dcherian and crusaderky
authored
Make dask names change when chunking Variables by different amounts. (#3584)
* Make dask tokens change when chunking Variables by different amounts. When rechunking by the current chunk size, the dask token should not change. Add a __dask_tokenize__ method for ReprObject so that this behaviour is present when DataArrays are converted to temporary Datasets and back. Co-Authored-By: crusaderky <[email protected]> Co-authored-by: crusaderky <[email protected]>
1 parent ef6e6a7 commit 24f9292

File tree

6 files changed

+46
-11
lines changed

6 files changed

+46
-11
lines changed

doc/whats-new.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ New Features
4747

4848
Bug fixes
4949
~~~~~~~~~
50+
5051
- Fix :py:meth:`xarray.combine_by_coords` to allow for combining incomplete
5152
hypercubes of Datasets (:issue:`3648`). By `Ian Bolliger
5253
<https://github.com/bolliger32>`_.
@@ -91,6 +92,9 @@ Documentation
9192

9293
Internal Changes
9394
~~~~~~~~~~~~~~~~
95+
- Make sure dask names change when rechunking by different chunk sizes. Conversely, make sure they
96+
stay the same when rechunking by the same chunk size. (:issue:`3350`)
97+
By `Deepak Cherian <https://github.com/dcherian>`_.
9498
- 2x to 5x speed boost (on small arrays) for :py:meth:`Dataset.isel`,
9599
:py:meth:`DataArray.isel`, and :py:meth:`DataArray.__getitem__` when indexing by int,
96100
slice, list of int, scalar ndarray, or 1-dimensional ndarray.

xarray/core/dataset.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1754,7 +1754,10 @@ def maybe_chunk(name, var, chunks):
17541754
if not chunks:
17551755
chunks = None
17561756
if var.ndim > 0:
1757-
token2 = tokenize(name, token if token else var._data)
1757+
# when rechunking by different amounts, make sure dask names change
1758+
# by providing chunks as an input to tokenize.
1759+
# subtle bugs result otherwise. see GH3350
1760+
token2 = tokenize(name, token if token else var._data, chunks)
17581761
name2 = f"{name_prefix}{name}-{token2}"
17591762
return var.chunk(chunks, name=name2, lock=lock)
17601763
else:

xarray/core/utils.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -547,7 +547,12 @@ def __eq__(self, other) -> bool:
547547
return False
548548

549549
def __hash__(self) -> int:
550-
return hash((ReprObject, self._value))
550+
return hash((type(self), self._value))
551+
552+
def __dask_tokenize__(self):
553+
from dask.base import normalize_token
554+
555+
return normalize_token((type(self), self._value))
551556

552557

553558
@contextlib.contextmanager

xarray/tests/test_dask.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1083,7 +1083,7 @@ def func(obj):
10831083
actual = xr.map_blocks(func, obj)
10841084
expected = func(obj)
10851085
assert_chunks_equal(expected.chunk(), actual)
1086-
xr.testing.assert_identical(actual.compute(), expected.compute())
1086+
assert_identical(actual, expected)
10871087

10881088

10891089
@pytest.mark.parametrize("obj", [make_da(), make_ds()])
@@ -1092,7 +1092,7 @@ def test_map_blocks_convert_args_to_list(obj):
10921092
with raise_if_dask_computes():
10931093
actual = xr.map_blocks(operator.add, obj, [10])
10941094
assert_chunks_equal(expected.chunk(), actual)
1095-
xr.testing.assert_identical(actual.compute(), expected.compute())
1095+
assert_identical(actual, expected)
10961096

10971097

10981098
@pytest.mark.parametrize("obj", [make_da(), make_ds()])
@@ -1107,7 +1107,7 @@ def add_attrs(obj):
11071107
with raise_if_dask_computes():
11081108
actual = xr.map_blocks(add_attrs, obj)
11091109

1110-
xr.testing.assert_identical(actual.compute(), expected.compute())
1110+
assert_identical(actual, expected)
11111111

11121112

11131113
def test_map_blocks_change_name(map_da):
@@ -1120,7 +1120,7 @@ def change_name(obj):
11201120
with raise_if_dask_computes():
11211121
actual = xr.map_blocks(change_name, map_da)
11221122

1123-
xr.testing.assert_identical(actual.compute(), expected.compute())
1123+
assert_identical(actual, expected)
11241124

11251125

11261126
@pytest.mark.parametrize("obj", [make_da(), make_ds()])
@@ -1129,15 +1129,15 @@ def test_map_blocks_kwargs(obj):
11291129
with raise_if_dask_computes():
11301130
actual = xr.map_blocks(xr.full_like, obj, kwargs=dict(fill_value=np.nan))
11311131
assert_chunks_equal(expected.chunk(), actual)
1132-
xr.testing.assert_identical(actual.compute(), expected.compute())
1132+
assert_identical(actual, expected)
11331133

11341134

11351135
def test_map_blocks_to_array(map_ds):
11361136
with raise_if_dask_computes():
11371137
actual = xr.map_blocks(lambda x: x.to_array(), map_ds)
11381138

11391139
# to_array does not preserve name, so cannot use assert_identical
1140-
assert_equal(actual.compute(), map_ds.to_array().compute())
1140+
assert_equal(actual, map_ds.to_array())
11411141

11421142

11431143
@pytest.mark.parametrize(
@@ -1156,7 +1156,7 @@ def test_map_blocks_da_transformations(func, map_da):
11561156
with raise_if_dask_computes():
11571157
actual = xr.map_blocks(func, map_da)
11581158

1159-
assert_identical(actual.compute(), func(map_da).compute())
1159+
assert_identical(actual, func(map_da))
11601160

11611161

11621162
@pytest.mark.parametrize(
@@ -1175,7 +1175,7 @@ def test_map_blocks_ds_transformations(func, map_ds):
11751175
with raise_if_dask_computes():
11761176
actual = xr.map_blocks(func, map_ds)
11771177

1178-
assert_identical(actual.compute(), func(map_ds).compute())
1178+
assert_identical(actual, func(map_ds))
11791179

11801180

11811181
@pytest.mark.parametrize("obj", [make_da(), make_ds()])
@@ -1188,7 +1188,7 @@ def func(obj):
11881188
expected = xr.map_blocks(func, obj)
11891189
actual = obj.map_blocks(func)
11901190

1191-
assert_identical(expected.compute(), actual.compute())
1191+
assert_identical(expected, actual)
11921192

11931193

11941194
def test_map_blocks_hlg_layers():

xarray/tests/test_dataarray.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -752,12 +752,19 @@ def test_chunk(self):
752752

753753
blocked = unblocked.chunk()
754754
assert blocked.chunks == ((3,), (4,))
755+
first_dask_name = blocked.data.name
755756

756757
blocked = unblocked.chunk(chunks=((2, 1), (2, 2)))
757758
assert blocked.chunks == ((2, 1), (2, 2))
759+
assert blocked.data.name != first_dask_name
758760

759761
blocked = unblocked.chunk(chunks=(3, 3))
760762
assert blocked.chunks == ((3,), (3, 1))
763+
assert blocked.data.name != first_dask_name
764+
765+
# name doesn't change when rechunking by same amount
766+
# this fails if ReprObject doesn't have __dask_tokenize__ defined
767+
assert unblocked.chunk(2).data.name == unblocked.chunk(2).data.name
761768

762769
assert blocked.load().chunks is None
763770

xarray/tests/test_dataset.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -936,19 +936,35 @@ def test_chunk(self):
936936
expected_chunks = {"dim1": (8,), "dim2": (9,), "dim3": (10,)}
937937
assert reblocked.chunks == expected_chunks
938938

939+
def get_dask_names(ds):
940+
return {k: v.data.name for k, v in ds.items()}
941+
942+
orig_dask_names = get_dask_names(reblocked)
943+
939944
reblocked = data.chunk({"time": 5, "dim1": 5, "dim2": 5, "dim3": 5})
940945
# time is not a dim in any of the data_vars, so it
941946
# doesn't get chunked
942947
expected_chunks = {"dim1": (5, 3), "dim2": (5, 4), "dim3": (5, 5)}
943948
assert reblocked.chunks == expected_chunks
944949

950+
# make sure dask names change when rechunking by different amounts
951+
# regression test for GH3350
952+
new_dask_names = get_dask_names(reblocked)
953+
for k, v in new_dask_names.items():
954+
assert v != orig_dask_names[k]
955+
945956
reblocked = data.chunk(expected_chunks)
946957
assert reblocked.chunks == expected_chunks
947958

948959
# reblock on already blocked data
960+
orig_dask_names = get_dask_names(reblocked)
949961
reblocked = reblocked.chunk(expected_chunks)
962+
new_dask_names = get_dask_names(reblocked)
950963
assert reblocked.chunks == expected_chunks
951964
assert_identical(reblocked, data)
965+
# rechunking with same chunk sizes should not change names
966+
for k, v in new_dask_names.items():
967+
assert v == orig_dask_names[k]
952968

953969
with raises_regex(ValueError, "some chunks"):
954970
data.chunk({"foo": 10})

0 commit comments

Comments
 (0)