Skip to content

Commit a4c54a3

Browse files
dcherian, Illviljan, pre-commit-ci[bot]
authored
Fix groupby_bins when labels are specified (#7769)
* Fix groupby_bins when labels are specified

  Closes #7766

* Apply suggestions from code review

  Co-authored-by: Illviljan <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

* fixup whats-new

---------

Co-authored-by: Illviljan <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent c75ac8b commit a4c54a3

File tree

3 files changed

+64
-16
lines changed

3 files changed

+64
-16
lines changed

doc/whats-new.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ Deprecations
3535
Bug fixes
3636
~~~~~~~~~
3737

38+
- Fix binning in ``groupby_bins`` when ``labels`` is specified (:issue:`7766`).
39+
By `Deepak Cherian <https://github.com/dcherian>`_.
40+
3841

3942
Documentation
4043
~~~~~~~~~~~~~

xarray/core/groupby.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -338,7 +338,8 @@ def _factorize_bins(
338338
if (codes == -1).all():
339339
raise ValueError(f"None of the data falls within bins with edges {bins!r}")
340340
full_index = binned.categories
341-
unique_values = np.sort(binned.unique().dropna())
341+
uniques = np.sort(pd.unique(codes))
342+
unique_values = full_index[uniques[uniques != -1]]
342343
group_indices = [g for g in _codes_to_groups(codes, len(full_index)) if g]
343344

344345
if len(group_indices) == 0:

xarray/tests/test_groupby.py

Lines changed: 59 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1373,7 +1373,22 @@ def test_groupby_multidim_map(self):
13731373

13741374
@pytest.mark.parametrize("use_flox", [True, False])
13751375
@pytest.mark.parametrize("coords", [np.arange(4), np.arange(4)[::-1], [2, 0, 3, 1]])
1376-
def test_groupby_bins(self, coords: np.typing.ArrayLike, use_flox: bool) -> None:
1376+
@pytest.mark.parametrize(
1377+
"cut_kwargs",
1378+
(
1379+
{"labels": None, "include_lowest": True},
1380+
{"labels": None, "include_lowest": False},
1381+
{"labels": ["a", "b"]},
1382+
{"labels": [1.2, 3.5]},
1383+
{"labels": ["b", "a"]},
1384+
),
1385+
)
1386+
def test_groupby_bins(
1387+
self,
1388+
coords: np.typing.ArrayLike,
1389+
use_flox: bool,
1390+
cut_kwargs: dict,
1391+
) -> None:
13771392
array = DataArray(
13781393
np.arange(4), dims="dim_0", coords={"dim_0": coords}, name="a"
13791394
)
@@ -1384,11 +1399,10 @@ def test_groupby_bins(self, coords: np.typing.ArrayLike, use_flox: bool) -> None
13841399
bins = [0, 1.5, 5]
13851400

13861401
df = array.to_dataframe()
1387-
df["dim_0_bins"] = pd.cut(array["dim_0"], bins)
1402+
df["dim_0_bins"] = pd.cut(array["dim_0"], bins, **cut_kwargs)
13881403

13891404
expected_df = df.groupby("dim_0_bins").sum()
13901405
# TODO: can't convert df with IntervalIndex to Xarray
1391-
13921406
expected = (
13931407
expected_df.reset_index(drop=True)
13941408
.to_xarray()
@@ -1397,25 +1411,55 @@ def test_groupby_bins(self, coords: np.typing.ArrayLike, use_flox: bool) -> None
13971411
)
13981412

13991413
with xr.set_options(use_flox=use_flox):
1400-
actual = array.groupby_bins("dim_0", bins=bins).sum()
1414+
actual = array.groupby_bins("dim_0", bins=bins, **cut_kwargs).sum()
14011415
assert_identical(expected, actual)
14021416

1403-
actual = array.groupby_bins("dim_0", bins=bins, labels=[1.2, 3.5]).sum()
1404-
assert_identical(expected.assign_coords(dim_0_bins=[1.2, 3.5]), actual)
1405-
1406-
actual = array.groupby_bins("dim_0", bins=bins).map(lambda x: x.sum())
1417+
actual = array.groupby_bins("dim_0", bins=bins, **cut_kwargs).map(
1418+
lambda x: x.sum()
1419+
)
14071420
assert_identical(expected, actual)
14081421

14091422
# make sure original array dims are unchanged
14101423
assert len(array.dim_0) == 4
14111424

1412-
da = xr.DataArray(np.ones((2, 3, 4)))
1413-
bins = [-1, 0, 1, 2]
1414-
with xr.set_options(use_flox=False):
1415-
actual = da.groupby_bins("dim_0", bins).mean(...)
1416-
with xr.set_options(use_flox=True):
1417-
expected = da.groupby_bins("dim_0", bins).mean(...)
1418-
assert_allclose(actual, expected)
1425+
def test_groupby_bins_ellipsis(self):
1426+
da = xr.DataArray(np.ones((2, 3, 4)))
1427+
bins = [-1, 0, 1, 2]
1428+
with xr.set_options(use_flox=False):
1429+
actual = da.groupby_bins("dim_0", bins).mean(...)
1430+
with xr.set_options(use_flox=True):
1431+
expected = da.groupby_bins("dim_0", bins).mean(...)
1432+
assert_allclose(actual, expected)
1433+
1434+
@pytest.mark.parametrize("use_flox", [True, False])
1435+
def test_groupby_bins_gives_correct_subset(self, use_flox: bool) -> None:
1436+
# GH7766
1437+
rng = np.random.default_rng(42)
1438+
coords = rng.normal(5, 5, 1000)
1439+
bins = np.logspace(-4, 1, 10)
1440+
labels = [
1441+
"one",
1442+
"two",
1443+
"three",
1444+
"four",
1445+
"five",
1446+
"six",
1447+
"seven",
1448+
"eight",
1449+
"nine",
1450+
]
1451+
# xArray
1452+
# Make a mock dataarray
1453+
darr = xr.DataArray(coords, coords=[coords], dims=["coords"])
1454+
expected = xr.DataArray(
1455+
[np.nan, np.nan, 1, 1, 1, 8, 31, 104, 542],
1456+
dims="coords_bins",
1457+
coords={"coords_bins": labels},
1458+
)
1459+
gb = darr.groupby_bins("coords", bins, labels=labels)
1460+
with xr.set_options(use_flox=use_flox):
1461+
actual = gb.count()
1462+
assert_identical(actual, expected)
14191463

14201464
def test_groupby_bins_empty(self):
14211465
array = DataArray(np.arange(4), [("x", range(4))])

0 commit comments

Comments
 (0)