diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py
index cb949637ea745..924040ff0648b 100644
--- a/asv_bench/benchmarks/array.py
+++ b/asv_bench/benchmarks/array.py
@@ -92,3 +92,41 @@ def time_setitem_slice(self, multiple_chunks):
 
     def time_tolist(self, multiple_chunks):
         self.array.tolist()
+
+
+class ArrowExtensionArray:
+
+    params = [
+        [
+            "boolean[pyarrow]",
+            "float64[pyarrow]",
+            "int64[pyarrow]",
+            "string[pyarrow]",
+            "timestamp[ns][pyarrow]",
+        ],
+        [False, True],
+    ]
+    param_names = ["dtype", "hasna"]
+
+    def setup(self, dtype, hasna):
+        N = 100_000
+        if dtype == "boolean[pyarrow]":
+            data = np.random.choice([True, False], N, replace=True)
+        elif dtype == "float64[pyarrow]":
+            data = np.random.randn(N)
+        elif dtype == "int64[pyarrow]":
+            data = np.arange(N)
+        elif dtype == "string[pyarrow]":
+            data = tm.rands_array(10, N)
+        elif dtype == "timestamp[ns][pyarrow]":
+            data = pd.date_range("2000-01-01", freq="s", periods=N)
+        else:
+            raise NotImplementedError
+
+        arr = pd.array(data, dtype=dtype)
+        if hasna:
+            arr[::2] = pd.NA
+        self.arr = arr
+
+    def time_to_numpy(self, dtype, hasna):
+        self.arr.to_numpy()
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 215e9c2a85bba..b828d18d1d700 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -748,6 +748,7 @@ Performance improvements
 - Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`)
 - Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`)
 - Performance improvement in :meth:`~arrays.ArrowExtensionArray.factorize` (:issue:`49177`)
+- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`49973`)
 - Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
 - Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)
 - Performance improvement in ``var`` for nullable dtypes (:issue:`48379`).
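The benchmark above exercises `ArrowExtensionArray.to_numpy` across pyarrow-backed dtypes with and without missing values. For a quick local check outside asv, the same conversion can be timed directly; a minimal sketch, assuming pandas >= 2.0 with pyarrow installed (numbers will vary by machine):

```python
import numpy as np
import pandas as pd
from timeit import timeit

N = 100_000

# Mirror the benchmark setup for one parametrisation: int64[pyarrow] with NAs.
arr = pd.array(np.arange(N), dtype="int64[pyarrow]")
arr[::2] = pd.NA

# Time the conversion that time_to_numpy() measures.
print(timeit(arr.to_numpy, number=100))
```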
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 254ff8894b36c..d698c5eb11751 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -9,11 +9,13 @@
 
 import numpy as np
 
+from pandas._libs import lib
 from pandas._typing import (
     ArrayLike,
     Dtype,
     FillnaOptions,
     Iterator,
+    NpDtype,
     PositionalIndexer,
     SortKind,
     TakeIndexer,
@@ -31,6 +33,7 @@
     is_bool_dtype,
     is_integer,
     is_integer_dtype,
+    is_object_dtype,
     is_scalar,
 )
 from pandas.core.dtypes.missing import isna
@@ -351,6 +354,10 @@ def __arrow_array__(self, type=None):
         """Convert myself to a pyarrow ChunkedArray."""
         return self._data
 
+    def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
+        """Correctly construct numpy arrays when passed to `np.asarray()`."""
+        return self.to_numpy(dtype=dtype)
+
     def __invert__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
         return type(self)(pc.invert(self._data))
 
@@ -749,6 +756,33 @@ def take(
             indices_array[indices_array < 0] += len(self._data)
         return type(self)(self._data.take(indices_array))
 
+    @doc(ExtensionArray.to_numpy)
+    def to_numpy(
+        self,
+        dtype: npt.DTypeLike | None = None,
+        copy: bool = False,
+        na_value: object = lib.no_default,
+    ) -> np.ndarray:
+        if dtype is None and self._hasna:
+            dtype = object
+        if na_value is lib.no_default:
+            na_value = self.dtype.na_value
+
+        pa_type = self._data.type
+        if (
+            is_object_dtype(dtype)
+            or pa.types.is_timestamp(pa_type)
+            or pa.types.is_duration(pa_type)
+        ):
+            result = np.array(list(self), dtype=dtype)
+        else:
+            result = np.asarray(self._data, dtype=dtype)
+            if copy or self._hasna:
+                result = result.copy()
+        if self._hasna:
+            result[self.isna()] = na_value
+        return result
+
     def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
         """
         Compute the ArrowExtensionArray of unique values.
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index b8b1d64d7a093..c79e2f752c5a8 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -12,7 +12,6 @@
 )
 from pandas._typing import (
     Dtype,
-    NpDtype,
     Scalar,
     npt,
 )
@@ -151,31 +150,6 @@ def dtype(self) -> StringDtype:  # type: ignore[override]
         """
         return self._dtype
 
-    def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
-        """Correctly construct numpy arrays when passed to `np.asarray()`."""
-        return self.to_numpy(dtype=dtype)
-
-    def to_numpy(
-        self,
-        dtype: npt.DTypeLike | None = None,
-        copy: bool = False,
-        na_value=lib.no_default,
-    ) -> np.ndarray:
-        """
-        Convert to a NumPy ndarray.
- """ - # TODO: copy argument is ignored - - result = np.array(self._data, dtype=dtype) - if self._data.null_count > 0: - if na_value is lib.no_default: - if dtype and np.issubdtype(dtype, np.floating): - return result - na_value = self._dtype.na_value - mask = self.isna() - result[mask] = na_value - return result - def insert(self, loc: int, item) -> ArrowStringArray: if not isinstance(item, str) and item is not libmissing.NA: raise TypeError("Scalar must be NA or str") @@ -219,10 +193,11 @@ def astype(self, dtype, copy: bool = True): if copy: return self.copy() return self - elif isinstance(dtype, NumericDtype): data = self._data.cast(pa.from_numpy_dtype(dtype.numpy_dtype)) return dtype.__from_arrow__(data) + elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating): + return self.to_numpy(dtype=dtype, na_value=np.nan) return super().astype(dtype, copy=copy) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 557cdd96bf00c..c36b129f919e8 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1421,3 +1421,20 @@ def test_astype_from_non_pyarrow(data): assert not isinstance(pd_array.dtype, ArrowDtype) assert isinstance(result.dtype, ArrowDtype) tm.assert_extension_array_equal(result, data) + + +def test_to_numpy_with_defaults(data): + # GH49973 + result = data.to_numpy() + + pa_type = data._data.type + if pa.types.is_duration(pa_type) or pa.types.is_timestamp(pa_type): + expected = np.array(list(data)) + else: + expected = np.array(data._data) + + if data._hasna: + expected = expected.astype(object) + expected[pd.isna(data)] = pd.NA + + tm.assert_numpy_array_equal(result, expected)