From 6b645e6f702e1a697f21939e7af2adbb36a89d2f Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Sat, 22 Feb 2020 09:37:37 -0600 Subject: [PATCH] Backport PR #31591: ENH: Enable indexing with nullable Boolean --- doc/source/user_guide/boolean.rst | 10 +-- doc/source/user_guide/indexing.rst | 12 ++- doc/source/whatsnew/v1.0.2.rst | 29 ++++++- pandas/core/arrays/datetimelike.py | 4 +- pandas/core/common.py | 7 +- pandas/core/indexers.py | 14 ++-- pandas/core/indexing.py | 7 +- .../tests/arrays/categorical/test_indexing.py | 9 +- pandas/tests/extension/base/getitem.py | 20 +++-- pandas/tests/extension/base/setitem.py | 84 +++++++++++++++++++ pandas/tests/extension/test_numpy.py | 42 ++++++++++ pandas/tests/indexing/test_check_indexer.py | 12 +-- pandas/tests/indexing/test_na_indexing.py | 27 ++++-- pandas/tests/series/indexing/test_boolean.py | 2 +- 14 files changed, 227 insertions(+), 52 deletions(-) diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst index 5276bc6142206..95f1931419d04 100644 --- a/doc/source/user_guide/boolean.rst +++ b/doc/source/user_guide/boolean.rst @@ -20,8 +20,9 @@ Nullable Boolean Data Type Indexing with NA values ----------------------- -pandas does not allow indexing with NA values. Attempting to do so -will raise a ``ValueError``. +pandas allows indexing with ``NA`` values in a boolean array, which are treated as ``False``. + +.. versionchanged:: 1.0.2 .. ipython:: python :okexcept: @@ -30,12 +31,11 @@ will raise a ``ValueError``. mask = pd.array([True, False, pd.NA], dtype="boolean") s[mask] -The missing values will need to be explicitly filled with True or False prior -to using the array as a mask. +If you would prefer to keep the ``NA`` values you can manually fill them with ``fillna(True)``. .. ipython:: python - s[mask.fillna(False)] + s[mask.fillna(True)] .. _boolean.kleene: diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index a8cdf4a61073d..2bd3ff626f2e1 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -59,7 +59,7 @@ of multi-axis indexing. slices, **both** the start and the stop are included, when present in the index! See :ref:`Slicing with labels ` and :ref:`Endpoints are inclusive `.) - * A boolean array + * A boolean array (any ``NA`` values will be treated as ``False``). * A ``callable`` function with one argument (the calling Series or DataFrame) and that returns valid output for indexing (one of the above). @@ -75,7 +75,7 @@ of multi-axis indexing. * An integer e.g. ``5``. * A list or array of integers ``[4, 3, 0]``. * A slice object with ints ``1:7``. - * A boolean array. + * A boolean array (any ``NA`` values will be treated as ``False``). * A ``callable`` function with one argument (the calling Series or DataFrame) and that returns valid output for indexing (one of the above). @@ -374,6 +374,14 @@ For getting values with a boolean array: df1.loc['a'] > 0 df1.loc[:, df1.loc['a'] > 0] +NA values in a boolean array propogate as ``False``: + +.. versionchanged:: 1.0.2 + + mask = pd.array([True, False, True, False, pd.NA, False], dtype="boolean") + mask + df1[mask] + For getting a value explicitly: .. ipython:: python diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index 07afe60c9c22a..affe019d0ac86 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -26,6 +26,33 @@ Fixed regressions .. --------------------------------------------------------------------------- +Indexing with Nullable Boolean Arrays +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Previously indexing with a nullable Boolean array containing ``NA`` would raise a ``ValueError``, however this is now permitted with ``NA`` being treated as ``False``. (:issue:`31503`) + +.. ipython:: python + + s = pd.Series([1, 2, 3, 4]) + mask = pd.array([True, True, False, None], dtype="boolean") + s + mask + +*pandas 1.0.0-1.0.1* + +.. code-block:: python + + >>> s[mask] + Traceback (most recent call last): + ... + ValueError: cannot mask with array containing NA / NaN values + +*pandas 1.0.2* + +.. ipython:: python + + s[mask] + .. _whatsnew_102.bug_fixes: Bug fixes @@ -45,8 +72,6 @@ Bug fixes - Using ``pd.NA`` with :meth:`DataFrame.to_json` now correctly outputs a null value instead of an empty object (:issue:`31615`) - Fixed bug in parquet roundtrip with nullable unsigned integer dtypes (:issue:`31896`). - - **Experimental dtypes** - Fix bug in :meth:`DataFrame.convert_dtypes` for columns that were already using the ``"string"`` dtype (:issue:`31731`). diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index e8d5890d2564f..ec954e5721f1d 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -520,7 +520,9 @@ def __getitem__(self, key): if com.is_bool_indexer(key): # first convert to boolean, because check_array_indexer doesn't # allow object dtype - key = np.asarray(key, dtype=bool) + if is_object_dtype(key): + key = np.asarray(key, dtype=bool) + key = check_array_indexer(self, key) if key.all(): key = slice(0, None, None) diff --git a/pandas/core/common.py b/pandas/core/common.py index d8b082e7c0f79..673e223b835d2 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -124,7 +124,6 @@ def is_bool_indexer(key: Any) -> bool: check_array_indexer : Check that `key` is a valid array to index, and convert to an ndarray. """ - na_msg = "cannot mask with array containing NA / NaN values" if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or ( is_array_like(key) and is_extension_array_dtype(key.dtype) ): @@ -132,16 +131,12 @@ def is_bool_indexer(key: Any) -> bool: key = np.asarray(values_from_object(key)) if not lib.is_bool_array(key): + na_msg = "Cannot mask with non-boolean array containing NA / NaN values" if isna(key).any(): raise ValueError(na_msg) return False return True elif is_bool_dtype(key.dtype): - # an ndarray with bool-dtype by definition has no missing values. - # So we only need to check for NAs in ExtensionArrays - if is_extension_array_dtype(key.dtype): - if np.any(key.isna()): - raise ValueError(na_msg) return True elif isinstance(key, list): try: diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index fe475527f4596..e9bdc99cef3ed 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -10,6 +10,7 @@ from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, + is_extension_array_dtype, is_integer_dtype, is_list_like, ) @@ -333,14 +334,11 @@ def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any: ... IndexError: Boolean index has wrong length: 3 instead of 2. - A ValueError is raised when the mask cannot be converted to - a bool-dtype ndarray. + NA values in a boolean array are treated as False. >>> mask = pd.array([True, pd.NA]) >>> pd.api.indexers.check_array_indexer(arr, mask) - Traceback (most recent call last): - ... - ValueError: Cannot mask with a boolean indexer containing NA values + array([ True, False]) A numpy boolean mask will get passed through (if the length is correct): @@ -392,10 +390,10 @@ def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any: dtype = indexer.dtype if is_bool_dtype(dtype): - try: + if is_extension_array_dtype(dtype): + indexer = indexer.to_numpy(dtype=bool, na_value=False) + else: indexer = np.asarray(indexer, dtype=bool) - except ValueError: - raise ValueError("Cannot mask with a boolean indexer containing NA values") # GH26658 if len(indexer) != len(array): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 35909e1693b3f..7a67280f9c82d 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -13,6 +13,7 @@ is_iterator, is_list_like, is_numeric_dtype, + is_object_dtype, is_scalar, is_sequence, ) @@ -2319,10 +2320,12 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: "the indexed object do not match)." ) result = result.astype(bool)._values - else: - # key might be sparse / object-dtype bool, check_array_indexer needs bool array + elif is_object_dtype(key): + # key might be object-dtype bool, check_array_indexer needs bool array result = np.asarray(result, dtype=bool) result = check_array_indexer(index, result) + else: + result = check_array_indexer(index, result) return result diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 85d5a6a3dc3ac..3d9469c252914 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -240,14 +240,17 @@ def test_mask_with_boolean(index): @pytest.mark.parametrize("index", [True, False]) -def test_mask_with_boolean_raises(index): +def test_mask_with_boolean_na_treated_as_false(index): + # https://github.com/pandas-dev/pandas/issues/31503 s = Series(range(3)) idx = Categorical([True, False, None]) if index: idx = CategoricalIndex(idx) - with pytest.raises(ValueError, match="NA / NaN"): - s[idx] + result = s[idx] + expected = s[idx.fillna(False)] + + tm.assert_series_equal(result, expected) @pytest.fixture diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 8615a8df22dcc..b08a64cc076b6 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -158,21 +158,23 @@ def test_getitem_boolean_array_mask(self, data): result = pd.Series(data)[mask] self.assert_series_equal(result, expected) - def test_getitem_boolean_array_mask_raises(self, data): + def test_getitem_boolean_na_treated_as_false(self, data): + # https://github.com/pandas-dev/pandas/issues/31503 mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") mask[:2] = pd.NA + mask[2:4] = True - msg = ( - "Cannot mask with a boolean indexer containing NA values|" - "cannot mask with array containing NA / NaN values" - ) - with pytest.raises(ValueError, match=msg): - data[mask] + result = data[mask] + expected = data[mask.fillna(False)] + + self.assert_extension_array_equal(result, expected) s = pd.Series(data) - with pytest.raises(ValueError): - s[mask] + result = s[mask] + expected = s[mask.fillna(False)] + + self.assert_series_equal(result, expected) @pytest.mark.parametrize( "idx", diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 590bcd586900a..2b46f8ec3d1c3 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -93,6 +93,90 @@ def test_setitem_iloc_scalar_multiple_homogoneous(self, data): df.iloc[10, 1] = data[1] assert df.loc[10, "B"] == data[1] + @pytest.mark.parametrize( + "mask", + [ + np.array([True, True, True, False, False]), + pd.array([True, True, True, False, False], dtype="boolean"), + pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"), + ], + ids=["numpy-array", "boolean-array", "boolean-array-na"], + ) + def test_setitem_mask(self, data, mask, box_in_series): + arr = data[:5].copy() + expected = arr.take([0, 0, 0, 3, 4]) + if box_in_series: + arr = pd.Series(arr) + expected = pd.Series(expected) + arr[mask] = data[0] + self.assert_equal(expected, arr) + + def test_setitem_mask_raises(self, data, box_in_series): + # wrong length + mask = np.array([True, False]) + + if box_in_series: + data = pd.Series(data) + + with pytest.raises(IndexError, match="wrong length"): + data[mask] = data[0] + + mask = pd.array(mask, dtype="boolean") + with pytest.raises(IndexError, match="wrong length"): + data[mask] = data[0] + + def test_setitem_mask_boolean_array_with_na(self, data, box_in_series): + mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") + mask[:3] = True + mask[3:5] = pd.NA + + if box_in_series: + data = pd.Series(data) + + data[mask] = data[0] + + assert (data[:3] == data[0]).all() + + @pytest.mark.parametrize( + "idx", + [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])], + ids=["list", "integer-array", "numpy-array"], + ) + def test_setitem_integer_array(self, data, idx, box_in_series): + arr = data[:5].copy() + expected = data.take([0, 0, 0, 3, 4]) + + if box_in_series: + arr = pd.Series(arr) + expected = pd.Series(expected) + + arr[idx] = arr[0] + self.assert_equal(arr, expected) + + @pytest.mark.parametrize( + "idx, box_in_series", + [ + ([0, 1, 2, pd.NA], False), + pytest.param( + [0, 1, 2, pd.NA], True, marks=pytest.mark.xfail(reason="GH-31948") + ), + (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), + (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), + ], + ids=["list-False", "list-True", "integer-array-False", "integer-array-True"], + ) + def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series): + arr = data.copy() + + # TODO(xfail) this raises KeyError about labels not found (it tries label-based) + # for list of labels with Series + if box_in_series: + arr = pd.Series(data, index=[tm.rands(4) for _ in range(len(data))]) + + msg = "Cannot index with an integer indexer containing NA values" + with pytest.raises(ValueError, match=msg): + arr[idx] = arr[0] + @pytest.mark.parametrize("as_callable", [True, False]) @pytest.mark.parametrize("setter", ["loc", None]) def test_setitem_mask_aligned(self, data, as_callable, setter): diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 76573242a2506..61c5925383f88 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -396,6 +396,48 @@ def test_setitem_scalar_key_sequence_raise(self, data): # Failed: DID NOT RAISE super().test_setitem_scalar_key_sequence_raise(data) + # TODO: there is some issue with PandasArray, therefore, + # skip the setitem test for now, and fix it later (GH 31446) + + @skip_nested + @pytest.mark.parametrize( + "mask", + [ + np.array([True, True, True, False, False]), + pd.array([True, True, True, False, False], dtype="boolean"), + ], + ids=["numpy-array", "boolean-array"], + ) + def test_setitem_mask(self, data, mask, box_in_series): + super().test_setitem_mask(data, mask, box_in_series) + + @skip_nested + def test_setitem_mask_raises(self, data, box_in_series): + super().test_setitem_mask_raises(data, box_in_series) + + @skip_nested + @pytest.mark.parametrize( + "idx", + [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])], + ids=["list", "integer-array", "numpy-array"], + ) + def test_setitem_integer_array(self, data, idx, box_in_series): + super().test_setitem_integer_array(data, idx, box_in_series) + + @skip_nested + @pytest.mark.parametrize( + "idx, box_in_series", + [ + ([0, 1, 2, pd.NA], False), + pytest.param([0, 1, 2, pd.NA], True, marks=pytest.mark.xfail), + (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), + (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), + ], + ids=["list-False", "list-True", "integer-array-False", "integer-array-True"], + ) + def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series): + super().test_setitem_integer_with_missing_raises(data, idx, box_in_series) + @skip_nested def test_setitem_slice(self, data, box_in_series): super().test_setitem_slice(data, box_in_series) diff --git a/pandas/tests/indexing/test_check_indexer.py b/pandas/tests/indexing/test_check_indexer.py index 82f8c12229824..69d4065234d93 100644 --- a/pandas/tests/indexing/test_check_indexer.py +++ b/pandas/tests/indexing/test_check_indexer.py @@ -34,12 +34,14 @@ def test_valid_input(indexer, expected): @pytest.mark.parametrize( "indexer", [[True, False, None], pd.array([True, False, None], dtype="boolean")], ) -def test_bool_raise_missing_values(indexer): - array = np.array([1, 2, 3]) +def test_boolean_na_returns_indexer(indexer): + # https://github.com/pandas-dev/pandas/issues/31503 + arr = np.array([1, 2, 3]) - msg = "Cannot mask with a boolean indexer containing NA values" - with pytest.raises(ValueError, match=msg): - check_array_indexer(array, indexer) + result = check_array_indexer(arr, indexer) + expected = np.array([True, False, False], dtype=bool) + + tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/indexing/test_na_indexing.py b/pandas/tests/indexing/test_na_indexing.py index befe4fee8ecf8..345ca30ec77eb 100644 --- a/pandas/tests/indexing/test_na_indexing.py +++ b/pandas/tests/indexing/test_na_indexing.py @@ -62,18 +62,29 @@ def test_series_mask_boolean(values, dtype, mask, box_mask, frame): @pytest.mark.parametrize("frame", [True, False]) -def test_indexing_with_na_raises(frame): +def test_na_treated_as_false(frame): + # https://github.com/pandas-dev/pandas/issues/31503 s = pd.Series([1, 2, 3], name="name") if frame: s = s.to_frame() + mask = pd.array([True, False, None], dtype="boolean") - match = "cannot mask with array containing NA / NaN values" - with pytest.raises(ValueError, match=match): - s[mask] - with pytest.raises(ValueError, match=match): - s.loc[mask] + result = s[mask] + expected = s[mask.fillna(False)] + + result_loc = s.loc[mask] + expected_loc = s.loc[mask.fillna(False)] - with pytest.raises(ValueError, match=match): - s.iloc[mask] + result_iloc = s.iloc[mask] + expected_iloc = s.iloc[mask.fillna(False)] + + if frame: + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result_loc, expected_loc) + tm.assert_frame_equal(result_iloc, expected_iloc) + else: + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result_loc, expected_loc) + tm.assert_series_equal(result_iloc, expected_iloc) diff --git a/pandas/tests/series/indexing/test_boolean.py b/pandas/tests/series/indexing/test_boolean.py index d75efcf52c271..232c9cbc1541c 100644 --- a/pandas/tests/series/indexing/test_boolean.py +++ b/pandas/tests/series/indexing/test_boolean.py @@ -75,7 +75,7 @@ def test_getitem_boolean_object(string_series): # nans raise exception omask[5:10] = np.nan - msg = "cannot mask with array containing NA / NaN values" + msg = "Cannot mask with non-boolean array containing NA / NaN values" with pytest.raises(ValueError, match=msg): s[omask] with pytest.raises(ValueError, match=msg):