diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index adfa382b66514..e6068fc4e94b1 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1117,7 +1117,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`) - Performance improvement for :meth:`Series.replace` with categorical dtype (:issue:`49404`) - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) -- Performance improvement for indexing operations with nullable dtypes (:issue:`49420`) +- Performance improvement for indexing operations with nullable and arrow dtypes (:issue:`49420`, :issue:`51316`) - Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`, :issue:`49178`) - Performance improvement for :func:`api.types.infer_dtype` (:issue:`51054`) - Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 2a6c26d595548..1b42ad1c0fda7 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -1141,12 +1141,26 @@ cdef class ExtensionEngine(SharedEngine): cdef class MaskedIndexEngine(IndexEngine): def __init__(self, object values): - super().__init__(values._data) - self.mask = values._mask + super().__init__(self._get_data(values)) + self.mask = self._get_mask(values) + + def _get_data(self, object values) -> np.ndarray: + if hasattr(values, "_mask"): + return values._data + # We are an ArrowExtensionArray + # Set 1 as na_value to avoid ending up with NA and an object array + # TODO: Remove when arrow engine is implemented + return values.to_numpy(na_value=1, dtype=values.dtype.numpy_dtype) + + def _get_mask(self, object values) -> np.ndarray: + if hasattr(values, "_mask"): + return values._mask + # We are an ArrowExtensionArray + return values.isna() def get_indexer(self, object values) -> np.ndarray: self._ensure_mapping_populated() - return self.mapping.lookup(values._data, values._mask) + return self.mapping.lookup(self._get_data(values), self._get_mask(values)) def get_indexer_non_unique(self, object targets): """ @@ -1171,8 +1185,8 @@ cdef class MaskedIndexEngine(IndexEngine): Py_ssize_t count = 0, count_missing = 0 Py_ssize_t i, j, n, n_t, n_alloc, start, end, na_idx - target_vals = targets._data - target_mask = targets._mask + target_vals = self._get_data(targets) + target_mask = self._get_mask(targets) values = self.values assert not values.dtype == object # go through object path instead diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 5e67a2c92cccb..894dc75e351a4 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -11,6 +11,7 @@ TypeVar, cast, ) +import warnings import numpy as np @@ -890,7 +891,10 @@ def to_numpy( mask = ~self.isna() result[mask] = np.asarray(self[mask]._data) else: - result = np.asarray(self._data, dtype=dtype) + with warnings.catch_warnings(): + # int dtype with NA raises Warning + warnings.filterwarnings("ignore", category=RuntimeWarning) + result = np.asarray(self._data, dtype=dtype) if copy or self._hasna: result = result.copy() if self._hasna: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 424cbf12c99cc..bd631c0c0d948 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -221,6 +221,19 @@ "Int16": libindex.MaskedInt16Engine, "Int8": libindex.MaskedInt8Engine, "boolean": libindex.MaskedBoolEngine, + "double[pyarrow]": libindex.MaskedFloat64Engine, + "float64[pyarrow]": libindex.MaskedFloat64Engine, + "float32[pyarrow]": libindex.MaskedFloat32Engine, + "float[pyarrow]": libindex.MaskedFloat32Engine, + "uint64[pyarrow]": libindex.MaskedUInt64Engine, + "uint32[pyarrow]": libindex.MaskedUInt32Engine, + "uint16[pyarrow]": libindex.MaskedUInt16Engine, + "uint8[pyarrow]": libindex.MaskedUInt8Engine, + "int64[pyarrow]": libindex.MaskedInt64Engine, + "int32[pyarrow]": libindex.MaskedInt32Engine, + "int16[pyarrow]": libindex.MaskedInt16Engine, + "int8[pyarrow]": libindex.MaskedInt8Engine, + "bool[pyarrow]": libindex.MaskedBoolEngine, } @@ -796,7 +809,7 @@ def _engine( # For base class (object dtype) we get ObjectEngine target_values = self._get_engine_target() if isinstance(target_values, ExtensionArray): - if isinstance(target_values, BaseMaskedArray): + if isinstance(target_values, (BaseMaskedArray, ArrowExtensionArray)): return _masked_engines[target_values.dtype.name](target_values) elif self._engine_type is libindex.ObjectEngine: return libindex.ExtensionEngine(target_values) @@ -4932,6 +4945,10 @@ def _get_engine_target(self) -> ArrayLike: type(self) is Index and isinstance(self._values, ExtensionArray) and not isinstance(self._values, BaseMaskedArray) + and not ( + isinstance(self._values, ArrowExtensionArray) + and is_numeric_dtype(self.dtype) + ) ): # TODO(ExtensionIndex): remove special-case, just use self._values return self._values.astype(object) diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py index eec1df8b44f33..c99e912ce4c0f 100644 --- a/pandas/tests/indexes/numeric/test_indexing.py +++ b/pandas/tests/indexes/numeric/test_indexing.py @@ -317,26 +317,26 @@ def test_get_indexer_uint64(self, index_large): tm.assert_numpy_array_equal(indexer, expected) @pytest.mark.parametrize("val, val2", [(4, 5), (4, 4), (4, NA), (NA, NA)]) - def test_get_loc_masked(self, val, val2, any_numeric_ea_dtype): + def test_get_loc_masked(self, val, val2, any_numeric_ea_and_arrow_dtype): # GH#39133 - idx = Index([1, 2, 3, val, val2], dtype=any_numeric_ea_dtype) + idx = Index([1, 2, 3, val, val2], dtype=any_numeric_ea_and_arrow_dtype) result = idx.get_loc(2) assert result == 1 with pytest.raises(KeyError, match="9"): idx.get_loc(9) - def test_get_loc_masked_na(self, any_numeric_ea_dtype): + def test_get_loc_masked_na(self, any_numeric_ea_and_arrow_dtype): # GH#39133 - idx = Index([1, 2, NA], dtype=any_numeric_ea_dtype) + idx = Index([1, 2, NA], dtype=any_numeric_ea_and_arrow_dtype) result = idx.get_loc(NA) assert result == 2 - idx = Index([1, 2, NA, NA], dtype=any_numeric_ea_dtype) + idx = Index([1, 2, NA, NA], dtype=any_numeric_ea_and_arrow_dtype) result = idx.get_loc(NA) tm.assert_numpy_array_equal(result, np.array([False, False, True, True])) - idx = Index([1, 2, 3], dtype=any_numeric_ea_dtype) + idx = Index([1, 2, 3], dtype=any_numeric_ea_and_arrow_dtype) with pytest.raises(KeyError, match="NA"): idx.get_loc(NA) @@ -371,16 +371,19 @@ def test_get_loc_masked_na_and_nan(self): idx.get_loc(NA) @pytest.mark.parametrize("val", [4, 2]) - def test_get_indexer_masked_na(self, any_numeric_ea_dtype, val): + def test_get_indexer_masked_na(self, any_numeric_ea_and_arrow_dtype, val): # GH#39133 - idx = Index([1, 2, NA, 3, val], dtype=any_numeric_ea_dtype) + idx = Index([1, 2, NA, 3, val], dtype=any_numeric_ea_and_arrow_dtype) result = idx.get_indexer_for([1, NA, 5]) expected = np.array([0, 2, -1]) tm.assert_numpy_array_equal(result, expected, check_dtype=False) - def test_get_indexer_masked_na_boolean(self): + @pytest.mark.parametrize("dtype", ["boolean", "bool[pyarrow]"]) + def test_get_indexer_masked_na_boolean(self, dtype): # GH#39133 - idx = Index([True, False, NA], dtype="boolean") + if dtype == "bool[pyarrow]": + pytest.importorskip("pyarrow") + idx = Index([True, False, NA], dtype=dtype) result = idx.get_loc(False) assert result == 1 result = idx.get_loc(NA)