Suggest the correct name when no key matches in the dataset (#9943)

Illviljan · web-flow · commit 70997ef02135 · 2025-01-17T19:41:34.000+01:00
* Add "did you mean" function

* improve error for wrong key in dataset

* Prioritize best guess

* increase number of valid suggestions to match previous idea

* Update dataset.py

* Update utils.py

* Update whats-new.rst

* Update whats-new.rst
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -52,6 +52,8 @@ New Features
 ~~~~~~~~~~~~
 - Relax nanosecond datetime restriction in CF time decoding (:issue:`7493`, :pull:`9618`).
   By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_ and `Spencer Clark <https://github.com/spencerkclark>`_.
+- Improve the error message raised when no key matches the available variables in a dataset. (:pull:`9943`)
+  By `Jimmy Westling <https://github.com/illviljan>`_.
 
 Breaking changes
 ~~~~~~~~~~~~~~~~
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -1610,7 +1610,14 @@ def __getitem__(
             try:
                 return self._construct_dataarray(key)
             except KeyError as e:
-                message = f"No variable named {key!r}. Variables on the dataset include {shorten_list_repr(list(self.variables.keys()), max_items=10)}"
+                message = f"No variable named {key!r}."
+
+                best_guess = utils.did_you_mean(key, self.variables.keys())
+                if best_guess:
+                    message += f" {best_guess}"
+                else:
+                    message += f" Variables on the dataset include {shorten_list_repr(list(self.variables.keys()), max_items=10)}"
+
                 # If someone attempts `ds['foo' , 'bar']` instead of `ds[['foo', 'bar']]`
                 if isinstance(key, tuple):
                     message += f"\nHint: use a list to select multiple variables, for example `ds[{list(key)}]`"
diff --git a/xarray/core/utils.py b/xarray/core/utils.py
@@ -37,6 +37,7 @@
 from __future__ import annotations
 
 import contextlib
+import difflib
 import functools
 import importlib
 import inspect
@@ -114,6 +115,47 @@ def wrapper(*args, **kwargs):
     return wrapper
 
 
+def did_you_mean(
+    word: Hashable, possibilities: Iterable[Hashable], *, n: int = 10
+) -> str:
+    """
+    Suggest a few correct words based on a list of possibilities.
+
+    Both ``word`` and every candidate are converted with ``str`` before
+    being compared with ``difflib.get_close_matches``; the suggestions are
+    reported as the original (unconverted) objects.
+
+    Parameters
+    ----------
+    word : Hashable
+        Word to compare to a list of possibilities.
+    possibilities : Iterable of Hashable
+        The iterable of Hashable that contains the correct values.
+    n : int, default: 10
+        Maximum number of suggestions to show.
+
+    Returns
+    -------
+    str
+        ``"Did you mean one of (...)?"`` listing the close matches as a
+        tuple, or an empty string when no candidate is close enough.
+
+    Examples
+    --------
+    >>> did_you_mean("bluch", ("blech", "gray_r", 1, None, (2, 56)))
+    "Did you mean one of ('blech',)?"
+    >>> did_you_mean("none", ("blech", "gray_r", 1, None, (2, 56)))
+    'Did you mean one of (None,)?'
+
+    See Also
+    --------
+    https://en.wikipedia.org/wiki/String_metric
+    """
+    # Convert all values to string, get_close_matches doesn't handle all hashables.
+    # NOTE: candidates that share the same str() (e.g. 1 and "1") end up under a
+    # single key here, so only the last of them can be suggested.
+    possibilites_str: dict[str, Hashable] = {str(k): k for k in possibilities}
+
+    # Stays empty when nothing is close enough, which callers can test for truthiness.
+    msg = ""
+    if len(
+        best_str := difflib.get_close_matches(
+            str(word), list(possibilites_str.keys()), n=n
+        )
+    ):
+        # Map the matched strings back to the original (unconverted) objects.
+        best = tuple(possibilites_str[k] for k in best_str)
+        msg = f"Did you mean one of {best}?"
+
+    return msg
+
+
 def get_valid_numpy_dtype(array: np.ndarray | pd.Index) -> np.dtype:
     """Return a numpy compatible dtype from either
     a numpy array or a pandas.Index.