diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index e2ba35c1ad7f9..2f70d4e5946a0 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -485,6 +485,7 @@ ExtensionType Changes - ``ExtensionArray`` has gained the abstract methods ``.dropna()`` (:issue:`21185`) - ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. ``decimal`` would instantiate a registered ``DecimalDtype``; furthermore the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`) +- An ``ExtensionArray`` with a boolean dtype now works correctly as a boolean indexer. :func:`pandas.api.types.is_bool_dtype` now properly considers such arrays boolean (:issue:`22326`) - Added ``ExtensionDtype._is_numeric`` for controlling whether an extension dtype is considered numeric (:issue:`22290`). - The ``ExtensionArray`` constructor, ``_from_sequence`` now take the keyword arg ``copy=False`` (:issue:`21185`) - Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`) @@ -616,7 +617,8 @@ Categorical ^^^^^^^^^^^ - Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of ``.from_codes([1.1, 2.0])``. -- Constructing a :class:`pd.CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`). +- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`) +- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`). 
Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/common.py b/pandas/core/common.py index a6b05daf1d85d..14e47936e1b50 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -15,7 +15,9 @@ from pandas import compat from pandas.compat import iteritems, PY36, OrderedDict from pandas.core.dtypes.generic import ABCSeries, ABCIndex, ABCIndexClass -from pandas.core.dtypes.common import is_integer +from pandas.core.dtypes.common import ( + is_integer, is_bool_dtype, is_extension_array_dtype, is_array_like +) from pandas.core.dtypes.inference import _iterable_not_string from pandas.core.dtypes.missing import isna, isnull, notnull # noqa from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike @@ -100,17 +102,45 @@ def maybe_box_datetimelike(value): def is_bool_indexer(key): - if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)): + # type: (Any) -> bool + """ + Check whether `key` is a valid boolean indexer. + + Parameters + ---------- + key : Any + Only list-likes may be considered boolean indexers. + All other types are not considered a boolean indexer. + For array-like input, boolean ndarrays or ExtensionArrays + with ``_is_boolean`` set are considered boolean indexers. + + Returns + ------- + bool + + Raises + ------ + ValueError + When the array is an object-dtype ndarray or ExtensionArray + and contains missing values. + """ + na_msg = 'cannot index with vector containing NA / NaN values' + if (isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or + (is_array_like(key) and is_extension_array_dtype(key.dtype))): if key.dtype == np.object_: key = np.asarray(values_from_object(key)) if not lib.is_bool_array(key): if isna(key).any(): - raise ValueError('cannot index with vector containing ' - 'NA / NaN values') + raise ValueError(na_msg) return False return True - elif key.dtype == np.bool_: + elif is_bool_dtype(key.dtype): + # an ndarray with bool-dtype by definition has no missing values. 
+ # So we only need to check for NAs in ExtensionArrays + if is_extension_array_dtype(key.dtype): + if np.any(key.isna()): + raise ValueError(na_msg) return True elif isinstance(key, list): try: diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 7dcdf878231f1..a552251ebbafa 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -106,6 +106,25 @@ def _is_numeric(self): """ return False + @property + def _is_boolean(self): + # type: () -> bool + """ + Whether this dtype should be considered boolean. + + By default, ExtensionDtypes are assumed to be non-boolean. + Setting this to True will affect the behavior of several places, + e.g. + + * is_bool_dtype + * boolean indexing + + Returns + ------- + bool + """ + return False + class ExtensionDtype(_DtypeOpsMixin): """A custom data type, to be paired with an ExtensionArray. @@ -125,6 +144,7 @@ class ExtensionDtype(_DtypeOpsMixin): pandas operations * _is_numeric + * _is_boolean Optionally one can override construct_array_type for construction with the name of this dtype via the Registry. See diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index f6e7e87f1043b..e2b9e246aee50 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1619,6 +1619,11 @@ def is_bool_dtype(arr_or_dtype): ------- boolean : Whether or not the array or dtype is of a boolean dtype. + Notes + ----- + An ExtensionArray is considered boolean when the ``_is_boolean`` + attribute is set to True. 
+ Examples -------- >>> is_bool_dtype(str) @@ -1635,6 +1640,8 @@ def is_bool_dtype(arr_or_dtype): False >>> is_bool_dtype(np.array([True, False])) True + >>> is_bool_dtype(pd.Categorical([True, False])) + True """ if arr_or_dtype is None: @@ -1645,6 +1652,13 @@ def is_bool_dtype(arr_or_dtype): # this isn't even a dtype return False + if isinstance(arr_or_dtype, (ABCCategorical, ABCCategoricalIndex)): + arr_or_dtype = arr_or_dtype.dtype + + if isinstance(arr_or_dtype, CategoricalDtype): + arr_or_dtype = arr_or_dtype.categories + # now we use the special definition for Index + if isinstance(arr_or_dtype, ABCIndexClass): # TODO(jreback) @@ -1653,6 +1667,9 @@ def is_bool_dtype(arr_or_dtype): # guess this return (arr_or_dtype.is_object and arr_or_dtype.inferred_type == 'boolean') + elif is_extension_array_dtype(arr_or_dtype): + dtype = getattr(arr_or_dtype, 'dtype', arr_or_dtype) + return dtype._is_boolean return issubclass(tipo, np.bool_) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 4fd77e41a1c67..d879ded4f0f09 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -462,6 +462,12 @@ def ordered(self): """Whether the categories have an ordered relationship""" return self._ordered + @property + def _is_boolean(self): + from pandas.core.dtypes.common import is_bool_dtype + + return is_bool_dtype(self.categories) + class DatetimeTZDtypeType(type): """ diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index b54ac2835bee3..d23da1565a952 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -5,7 +5,8 @@ import numpy as np import pandas.util.testing as tm -from pandas import Categorical, Index, CategoricalIndex, PeriodIndex +from pandas import Categorical, Index, CategoricalIndex, PeriodIndex, Series +import pandas.core.common as com from pandas.tests.arrays.categorical.common import 
TestCategorical @@ -121,3 +122,27 @@ def test_get_indexer_non_unique(self, idx_values, key_values, key_class): tm.assert_numpy_array_equal(expected, result) tm.assert_numpy_array_equal(exp_miss, res_miss) + + +@pytest.mark.parametrize("index", [True, False]) +def test_mask_with_boolean(index): + s = Series(range(3)) + idx = Categorical([True, False, True]) + if index: + idx = CategoricalIndex(idx) + + assert com.is_bool_indexer(idx) + result = s[idx] + expected = s[idx.astype('object')] + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("index", [True, False]) +def test_mask_with_boolean_raises(index): + s = Series(range(3)) + idx = Categorical([True, False, None]) + if index: + idx = CategoricalIndex(idx) + + with tm.assert_raises_regex(ValueError, 'NA / NaN'): + s[idx] diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 55c841ba1fc46..e3d14497a38f9 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -17,7 +17,7 @@ is_dtype_equal, is_datetime64_ns_dtype, is_datetime64_dtype, is_interval_dtype, is_datetime64_any_dtype, is_string_dtype, - _coerce_to_dtype) + _coerce_to_dtype, is_bool_dtype) import pandas.util.testing as tm @@ -126,6 +126,18 @@ def test_tuple_categories(self): result = CategoricalDtype(categories) assert all(result.categories == categories) + @pytest.mark.parametrize("categories, expected", [ + ([True, False], True), + ([True, False, None], True), + ([True, False, "a", "b'"], False), + ([0, 1], False), + ]) + def test_is_boolean(self, categories, expected): + cat = Categorical(categories) + assert cat.dtype._is_boolean is expected + assert is_bool_dtype(cat) is expected + assert is_bool_dtype(cat.dtype) is expected + class TestDatetimeTZDtype(Base): diff --git a/pandas/tests/extension/arrow/__init__.py b/pandas/tests/extension/arrow/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/extension/arrow/bool.py 
b/pandas/tests/extension/arrow/bool.py new file mode 100644 index 0000000000000..a9da25cdd2755 --- /dev/null +++ b/pandas/tests/extension/arrow/bool.py @@ -0,0 +1,112 @@ +"""Rudimentary Apache Arrow-backed ExtensionArray. + +At the moment, just a boolean array / type is implemented. +Eventually, we'll want to parametrize the type and support +multiple dtypes. Not all methods are implemented yet, and the +current implementation is not efficient. +""" +import copy +import itertools + +import numpy as np +import pyarrow as pa +import pandas as pd +from pandas.api.extensions import ( + ExtensionDtype, ExtensionArray, take, register_extension_dtype +) + + +@register_extension_dtype +class ArrowBoolDtype(ExtensionDtype): + + type = np.bool_ + kind = 'b' + name = 'arrow_bool' + na_value = pa.NULL + + @classmethod + def construct_from_string(cls, string): + if string == cls.name: + return cls() + else: + raise TypeError("Cannot construct a '{}' from " + "'{}'".format(cls, string)) + + @classmethod + def construct_array_type(cls): + return ArrowBoolArray + + @property + def _is_boolean(self): + # Property, to match the ExtensionDtype._is_boolean it overrides. + return True + + +class ArrowBoolArray(ExtensionArray): + def __init__(self, values): + if not isinstance(values, pa.ChunkedArray): + raise ValueError + + assert values.type == pa.bool_() + self._data = values + self._dtype = ArrowBoolDtype() + + def __repr__(self): + return "ArrowBoolArray({})".format(repr(self._data)) + + @classmethod + def from_scalars(cls, values): + arr = pa.chunked_array([pa.array(np.asarray(values))]) + return cls(arr) + + @classmethod + def from_array(cls, arr): + assert isinstance(arr, pa.Array) + return cls(pa.chunked_array([arr])) + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy=False): + return cls.from_scalars(scalars) + + def __getitem__(self, item): + return self._data.to_pandas()[item] + + def __len__(self): + return len(self._data) + + @property + def dtype(self): + return self._dtype + + @property + def nbytes(self): + return sum(x.size for
chunk in self._data.chunks + for x in chunk.buffers() + if x is not None) + + def isna(self): + return pd.isna(self._data.to_pandas()) + + def take(self, indices, allow_fill=False, fill_value=None): + data = self._data.to_pandas() + + if allow_fill and fill_value is None: + fill_value = self.dtype.na_value + + result = take(data, indices, fill_value=fill_value, + allow_fill=allow_fill) + return self._from_sequence(result, dtype=self.dtype) + + def copy(self, deep=False): + # Wrap the copied data so callers get an ArrowBoolArray back. + if deep: + return type(self)(copy.deepcopy(self._data)) + else: + return type(self)(copy.copy(self._data)) + + @classmethod + def _concat_same_type(cls, to_concat): + chunks = list(itertools.chain.from_iterable(x._data.chunks + for x in to_concat)) + arr = pa.chunked_array(chunks) + return cls(arr) diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py new file mode 100644 index 0000000000000..e1afedcade3ff --- /dev/null +++ b/pandas/tests/extension/arrow/test_bool.py @@ -0,0 +1,48 @@ +import numpy as np +import pytest +import pandas as pd +import pandas.util.testing as tm +from pandas.tests.extension import base + +pytest.importorskip('pyarrow', minversion="0.10.0") + +from .bool import ArrowBoolDtype, ArrowBoolArray + + +@pytest.fixture +def dtype(): + return ArrowBoolDtype() + + +@pytest.fixture +def data(): + return ArrowBoolArray.from_scalars(np.random.randint(0, 2, size=100, + dtype=bool)) + + +class BaseArrowTests(object): + pass + + +class TestDtype(BaseArrowTests, base.BaseDtypeTests): + def test_array_type_with_arg(self, data, dtype): + pytest.skip("GH-22666") + + +class TestInterface(BaseArrowTests, base.BaseInterfaceTests): + def test_repr(self, data): + pytest.skip("TODO") + + +class TestConstructors(BaseArrowTests, base.BaseConstructorsTests): + def test_from_dtype(self, data): + pytest.skip("GH-22666") + + +def test_is_bool_dtype(data): + assert pd.api.types.is_bool_dtype(data) + assert pd.core.common.is_bool_indexer(data) + s = pd.Series(range(len(data))) + result =
s[data] + expected = s[np.asarray(data)] + tm.assert_series_equal(result, expected)