diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index dc86352082cca..a0dd52e9f17e4 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -382,4 +382,23 @@ def time_iter(self, dtype): pass +class ToNumpy: + def setup(self): + N = 1_000_000 + self.ser = Series( + np.random.randn( + N, + ) + ) + + def time_to_numpy(self): + self.ser.to_numpy() + + def time_to_numpy_double_copy(self): + self.ser.to_numpy(dtype="float64", copy=True) + + def time_to_numpy_copy(self): + self.ser.to_numpy(copy=True) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index b1387e9717079..d51885b7ad867 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -775,6 +775,7 @@ Performance improvements - Performance improvement when iterating over pyarrow and nullable dtypes (:issue:`49825`, :issue:`49851`) - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`) - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`) +- Performance improvement in :meth:`Series.to_numpy` if ``copy=True`` by avoiding copying twice (:issue:`24345`) - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`) - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``observed=False`` (:issue:`49596`) - Performance improvement in :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default). Now the index will be a :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49745`) @@ -849,6 +850,7 @@ Conversion - Bug where any :class:`ExtensionDtype` subclass with ``kind="M"`` would be interpreted as a timezone type (:issue:`34986`) - Bug in :class:`.arrays.ArrowExtensionArray` that would raise ``NotImplementedError`` when passed a sequence of strings or binary (:issue:`49172`) - Bug in :meth:`Series.astype` raising ``pyarrow.ArrowInvalid`` when converting from a non-pyarrow string dtype to a pyarrow numeric type (:issue:`50430`) +- Bug in :meth:`Series.to_numpy` converting to NumPy array before applying ``na_value`` (:issue:`48951`) - Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`) - Bug in :meth:`TimedeltaArray.astype` raising ``TypeError`` when converting to a pyarrow duration type (:issue:`49795`) - diff --git a/pandas/core/base.py b/pandas/core/base.py index e5e0ac4e121ae..23121b7075fe1 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -531,12 +531,19 @@ def to_numpy( f"to_numpy() got an unexpected keyword argument '{bad_keys}'" ) - result = np.asarray(self._values, dtype=dtype) - # TODO(GH-24345): Avoid potential double copy - if copy or na_value is not lib.no_default: - result = result.copy() - if na_value is not lib.no_default: - result[np.asanyarray(self.isna())] = na_value + if na_value is not lib.no_default: + values = self._values.copy() + values[np.asanyarray(self.isna())] = na_value + else: + values = self._values + + result = np.asarray(values, dtype=dtype) + + if copy and na_value is lib.no_default: + if np.shares_memory(self._values[:2], result[:2]): + # Take slices to improve performance of check + result = result.copy() + return result @final diff --git a/pandas/tests/series/methods/test_to_numpy.py b/pandas/tests/series/methods/test_to_numpy.py new file mode 100644 index 0000000000000..487489e8c0b0c --- /dev/null +++ b/pandas/tests/series/methods/test_to_numpy.py @@ -0,0 +1,17 @@ +import numpy as np +import pytest + +from pandas import ( + NA, + Series, +) +import pandas._testing as tm + + +@pytest.mark.parametrize("dtype", ["int64", "float64"]) +def test_to_numpy_na_value(dtype): + # GH#48951 + ser = Series([1, 2, NA, 4]) + result = ser.to_numpy(dtype=dtype, na_value=0) + expected = np.array([1, 2, 0, 4], dtype=dtype) + tm.assert_numpy_array_equal(result, expected)