Merge branch 'main' into topk

dcherian · dcherian · commit 489c8434ed8b · 2025-03-17T19:15:57.000-06:00
* main:
  Fix upstream-dev tests (#421)
  Bump codecov/codecov-action from 5.1.2 to 5.3.1 (#420)
  optimize cohorts yet again (#419)
diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml
@@ -77,7 +77,7 @@ jobs:
           --ignore flox/tests \
           --cov=./ --cov-report=xml
       - name: Upload code coverage to Codecov
-        uses: codecov/codecov-action@v5.1.2
+        uses: codecov/codecov-action@v5.3.1
         with:
           file: ./coverage.xml
           flags: unittests
@@ -132,7 +132,7 @@ jobs:
           python -m mypy --install-types --non-interactive --cache-dir=.mypy_cache/ --cobertura-xml-report mypy_report
 
       - name: Upload mypy coverage to Codecov
-        uses: codecov/codecov-action@v5.1.2
+        uses: codecov/codecov-action@v5.3.1
         with:
           file: mypy_report/cobertura.xml
           flags: mypy
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -76,7 +76,7 @@ jobs:
           python -c "import xarray; xarray.show_versions()"
           pytest --durations=20 --durations-min=0.5 -n auto --cov=./ --cov-report=xml --hypothesis-profile ci --log-disable=flox
       - name: Upload code coverage to Codecov
-        uses: codecov/codecov-action@v5.1.2
+        uses: codecov/codecov-action@v5.3.1
         with:
           file: ./coverage.xml
           flags: unittests
diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml
@@ -93,7 +93,8 @@ jobs:
         id: status
         run: |
           pytest -rf -n auto --cov=./ --cov-report=xml \
-            --report-log output-${{ matrix.python-version }}-log.jsonl
+            --report-log output-${{ matrix.python-version }}-log.jsonl \
+             --hypothesis-profile ci
       - name: Generate and publish the report
         if: |
           failure()
diff --git a/ci/upstream-dev-env.yml b/ci/upstream-dev-env.yml
@@ -2,9 +2,11 @@ name: flox-tests
 channels:
   - conda-forge
 dependencies:
+  - asv_runner # for test_asv
   - cachey
   - codecov
   - pooch
+  - hypothesis
   - toolz
   # - numpy
   # - pandas
diff --git a/flox/core.py b/flox/core.py
@@ -1470,7 +1470,7 @@ def _reduce_blockwise(
     return result
 
 
-def _normalize_indexes(array: DaskArray, flatblocks, blkshape) -> tuple:
+def _normalize_indexes(ndim: int, flatblocks: Sequence[int], blkshape: tuple[int, ...]) -> tuple:
     """
     .blocks accessor can only accept one iterable at a time,
     but can handle multiple slices.
@@ -1488,20 +1488,23 @@ def _normalize_indexes(array: DaskArray, flatblocks, blkshape) -> tuple:
         if i.ndim == 0:
             normalized.append(i.item())
         else:
-            if np.array_equal(i, np.arange(blkshape[ax])):
+            if len(i) == blkshape[ax] and np.array_equal(i, np.arange(blkshape[ax])):
                 normalized.append(slice(None))
-            elif np.array_equal(i, np.arange(i[0], i[-1] + 1)):
-                normalized.append(slice(i[0], i[-1] + 1))
+            elif _issorted(i) and np.array_equal(i, np.arange(i[0], i[-1] + 1)):
+                start = None if i[0] == 0 else i[0]
+                stop = i[-1] + 1
+                stop = None if stop == blkshape[ax] else stop
+                normalized.append(slice(start, stop))
             else:
                 normalized.append(list(i))
-    full_normalized = (slice(None),) * (array.ndim - len(normalized)) + tuple(normalized)
+    full_normalized = (slice(None),) * (ndim - len(normalized)) + tuple(normalized)
 
     # has no iterables
     noiter = list(i if not hasattr(i, "__len__") else slice(None) for i in full_normalized)
     # has all iterables
     alliter = {ax: i for ax, i in enumerate(full_normalized) if hasattr(i, "__len__")}
 
-    mesh = dict(zip(alliter.keys(), np.ix_(*alliter.values())))
+    mesh = dict(zip(alliter.keys(), np.ix_(*alliter.values())))  # type: ignore[arg-type, var-annotated]
 
     full_tuple = tuple(i if ax not in mesh else mesh[ax] for ax, i in enumerate(noiter))
 
@@ -1528,7 +1531,6 @@ def subset_to_blocks(
     -------
     dask.array
     """
-    from dask.array.slicing import normalize_index
     from dask.base import tokenize
 
     if blkshape is None:
@@ -1537,10 +1539,9 @@ def subset_to_blocks(
     if chunks_as_array is None:
         chunks_as_array = tuple(np.array(c) for c in array.chunks)
 
-    index = _normalize_indexes(array, flatblocks, blkshape)
+    index = _normalize_indexes(array.ndim, flatblocks, blkshape)
 
     # These rest is copied from dask.array.core.py with slight modifications
-    index = normalize_index(index, array.numblocks)
     index = tuple(slice(k, k + 1) if isinstance(k, Integral) else k for k in index)
 
     name = "groupby-cohort-" + tokenize(array, index)
diff --git a/flox/dask_array_ops.py b/flox/dask_array_ops.py
@@ -1,6 +1,6 @@
 import builtins
 import math
-from functools import partial
+from functools import lru_cache, partial
 from itertools import product
 from numbers import Integral
 
@@ -84,14 +84,8 @@ def partial_reduce(
     axis: tuple[int, ...],
     block_index: int | None = None,
 ):
-    numblocks = tuple(len(c) for c in chunks)
-    ndim = len(numblocks)
-    parts = [list(partition_all(split_every.get(i, 1), range(n))) for (i, n) in enumerate(numblocks)]
-    keys = product(*map(range, map(len, parts)))
-    out_chunks = [
-        tuple(1 for p in partition_all(split_every[i], c)) if i in split_every else c
-        for (i, c) in enumerate(chunks)
-    ]
+    ndim = len(chunks)
+    keys, parts, out_chunks = get_parts(tuple(split_every.items()), chunks)
     for k, p in zip(keys, product(*parts)):
         free = {i: j[0] for (i, j) in enumerate(p) if len(j) == 1 and i not in split_every}
         dummy = dict(i for i in enumerate(p) if i[0] in split_every)
@@ -101,3 +95,17 @@ def partial_reduce(
             k = (*k[:-1], block_index)
         dsk[(name,) + k] = (func, g)
     return dsk, out_chunks
+
+
+@lru_cache  # memoized, so both arguments must be hashable
+def get_parts(split_every_items, chunks):  # split_every_items: (axis, n) pairs, e.g. tuple(split_every.items())
+    numblocks = tuple(len(c) for c in chunks)  # number of blocks along each axis
+    split_every = dict(split_every_items)  # rebuild the mapping; it arrives as item pairs because a dict is unhashable
+
+    parts = [list(partition_all(split_every.get(i, 1), range(n))) for (i, n) in enumerate(numblocks)]
+    keys = tuple(product(*map(range, map(len, parts))))  # materialized: a lazy product would be exhausted after the first cached use
+    out_chunks = tuple(  # axes in split_every get one size-1 chunk per partition_all group; other axes keep their chunks
+        tuple(1 for p in partition_all(split_every[i], c)) if i in split_every else c
+        for (i, c) in enumerate(chunks)
+    )
+    return keys, parts, out_chunks  # NOTE(review): parts is a list shared across cache hits; callers must not mutate it
diff --git a/tests/test_core.py b/tests/test_core.py
@@ -5,7 +5,7 @@
 import warnings
 from collections.abc import Callable
 from functools import partial, reduce
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 from unittest.mock import MagicMock, patch
 
 import numpy as np
@@ -1551,7 +1551,7 @@ def test_normalize_block_indexing_1d(flatblocks, expected):
     nblocks = 5
     array = dask.array.ones((nblocks,), chunks=(1,))
     expected = tuple(np.array(i) if isinstance(i, list) else i for i in expected)
-    actual = _normalize_indexes(array, flatblocks, array.blocks.shape)
+    actual = _normalize_indexes(array.ndim, flatblocks, array.blocks.shape)
     assert_equal_tuple(expected, actual)
 
 
@@ -1563,17 +1563,17 @@ def test_normalize_block_indexing_1d(flatblocks, expected):
         ((1, 2, 3), (0, slice(1, 4))),
         ((1, 3), (0, [1, 3])),
         ((0, 1, 3), (0, [0, 1, 3])),
-        (tuple(range(10)), (slice(0, 2), slice(None))),
-        ((0, 1, 3, 5, 6, 8), (slice(0, 2), [0, 1, 3])),
+        (tuple(range(10)), (slice(None, 2), slice(None))),
+        ((0, 1, 3, 5, 6, 8), (slice(None, 2), [0, 1, 3])),
         ((0, 3, 4, 5, 6, 8, 24), np.ix_([0, 1, 4], [0, 1, 3, 4])),
     ),
 )
-def test_normalize_block_indexing_2d(flatblocks, expected):
+def test_normalize_block_indexing_2d(flatblocks: tuple[int, ...], expected: tuple[Any, ...]) -> None:
     nblocks = 5
     ndim = 2
     array = dask.array.ones((nblocks,) * ndim, chunks=(1,) * ndim)
     expected = tuple(np.array(i) if isinstance(i, list) else i for i in expected)
-    actual = _normalize_indexes(array, flatblocks, array.blocks.shape)
+    actual = _normalize_indexes(array.ndim, flatblocks, array.blocks.shape)
     assert_equal_tuple(expected, actual)