ENH: Add na_value argument to DataFrame.to_numpy (#33857)

dsaxton · web-flow · commit 4bd69050fc4d · 2020-05-13T11:51:28.000-04:00
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -224,6 +224,7 @@ Other enhancements
   such as ``dict`` and ``list``, mirroring the behavior of :meth:`DataFrame.update` (:issue:`33215`)
 - :meth:`~pandas.core.groupby.GroupBy.transform` and :meth:`~pandas.core.groupby.GroupBy.aggregate` has gained ``engine`` and ``engine_kwargs`` arguments that supports executing functions with ``Numba`` (:issue:`32854`, :issue:`33388`)
 - :meth:`~pandas.core.resample.Resampler.interpolate` now supports SciPy interpolation method :class:`scipy.interpolate.CubicSpline` as method ``cubicspline`` (:issue:`33670`)
+- :meth:`DataFrame.to_numpy` now supports the ``na_value`` keyword to control the NA sentinel in the output array (:issue:`33820`)
 - The ``ExtensionArray`` class has now an :meth:`~pandas.arrays.ExtensionArray.equals`
   method, similarly to :meth:`Series.equals` (:issue:`27081`).
 - The minimum suppported dta version has increased to 105 in :meth:`~pandas.io.stata.read_stata` and :class:`~pandas.io.stata.StataReader`  (:issue:`26667`).
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1280,7 +1280,9 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFra
 
         return cls(data, index=index, columns=columns, dtype=dtype)
 
-    def to_numpy(self, dtype=None, copy: bool = False) -> np.ndarray:
+    def to_numpy(
+        self, dtype=None, copy: bool = False, na_value=lib.no_default
+    ) -> np.ndarray:
         """
         Convert the DataFrame to a NumPy array.
 
@@ -1301,6 +1303,11 @@ def to_numpy(self, dtype=None, copy: bool = False) -> np.ndarray:
             another array. Note that ``copy=False`` does not *ensure* that
             ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that
             a copy is made, even if not strictly necessary.
+        na_value : Any, optional
+            The value to use for missing values. The default value depends
+            on `dtype` and the dtypes of the DataFrame columns.
+
+            .. versionadded:: 1.1.0
 
         Returns
         -------
@@ -1332,7 +1339,10 @@ def to_numpy(self, dtype=None, copy: bool = False) -> np.ndarray:
         array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
                [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
         """
-        result = np.array(self.values, dtype=dtype, copy=copy)
+        result = self._mgr.as_array(
+            transpose=self._AXIS_REVERSED, dtype=dtype, copy=copy, na_value=na_value
+        )
+
         return result
 
     def to_dict(self, orient="dict", into=dict):
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -781,14 +781,28 @@ def copy_func(ax):
         res.axes = new_axes
         return res
 
-    def as_array(self, transpose: bool = False) -> np.ndarray:
+    def as_array(
+        self,
+        transpose: bool = False,
+        dtype=None,
+        copy: bool = False,
+        na_value=lib.no_default,
+    ) -> np.ndarray:
         """
         Convert the blockmanager data into an numpy array.
 
         Parameters
         ----------
         transpose : bool, default False
-            If True, transpose the return array,
+            If True, transpose the return array.
+        dtype : object, default None
+            Data type of the return array.
+        copy : bool, default False
+            If True then guarantee that a copy is returned. A value of
+            False does not guarantee that the underlying data is not
+            copied.
+        na_value : object, default lib.no_default
+            Value to be used as the missing value sentinel.
 
         Returns
         -------
@@ -798,24 +812,41 @@ def as_array(self, transpose: bool = False) -> np.ndarray:
             arr = np.empty(self.shape, dtype=float)
             return arr.transpose() if transpose else arr
 
-        if self._is_single_block and self.blocks[0].is_datetimetz:
-            # TODO(Block.get_values): Make DatetimeTZBlock.get_values
-            # always be object dtype. Some callers seem to want the
-            # DatetimeArray (previously DTI)
-            arr = self.blocks[0].get_values(dtype=object)
+        # We want to copy when na_value is provided to avoid
+        # mutating the original object
+        copy = copy or na_value is not lib.no_default
+
+        if self._is_single_block and self.blocks[0].is_extension:
+            # Avoid implicit conversion of extension blocks to object
+            arr = (
+                self.blocks[0]
+                .values.to_numpy(dtype=dtype, na_value=na_value)
+                .reshape(self.blocks[0].shape)
+            )
         elif self._is_single_block or not self.is_mixed_type:
             arr = np.asarray(self.blocks[0].get_values())
+            if dtype:
+                arr = arr.astype(dtype, copy=False)
         else:
-            arr = self._interleave()
+            arr = self._interleave(dtype=dtype, na_value=na_value)
+            # The underlying data was copied within _interleave
+            copy = False
+
+        if copy:
+            arr = arr.copy()
+
+        if na_value is not lib.no_default:
+            arr[isna(arr)] = na_value
 
         return arr.transpose() if transpose else arr
 
-    def _interleave(self) -> np.ndarray:
+    def _interleave(self, dtype=None, na_value=lib.no_default) -> np.ndarray:
         """
         Return ndarray from blocks with specified item order
         Items must be contained in the blocks
         """
-        dtype = _interleaved_dtype(self.blocks)
+        if not dtype:
+            dtype = _interleaved_dtype(self.blocks)
 
         # TODO: https://github.com/pandas-dev/pandas/issues/22791
         # Give EAs some input on what happens here. Sparse needs this.
@@ -830,7 +861,12 @@ def _interleave(self) -> np.ndarray:
 
         for blk in self.blocks:
             rl = blk.mgr_locs
-            result[rl.indexer] = blk.get_values(dtype)
+            if blk.is_extension:
+                # Avoid implicit conversion of extension blocks to object
+                arr = blk.values.to_numpy(dtype=dtype, na_value=na_value)
+            else:
+                arr = blk.get_values(dtype)
+            result[rl.indexer] = arr
             itemmask[rl.indexer] = 1
 
         if not itemmask.all():
diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py
@@ -407,3 +407,48 @@ def test_to_numpy_kwargs_raises():
     s = pd.Series([1, 2, 3], dtype="Int64")
     with pytest.raises(TypeError, match=msg):
         s.to_numpy(foo=True)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        {"a": [1, 2, 3], "b": [1, 2, None]},
+        {"a": np.array([1, 2, 3]), "b": np.array([1, 2, np.nan])},
+        {"a": pd.array([1, 2, 3]), "b": pd.array([1, 2, None])},
+    ],
+)
+@pytest.mark.parametrize("dtype, na_value", [(float, np.nan), (object, None)])
+def test_to_numpy_dataframe_na_value(data, dtype, na_value):
+    # https://github.com/pandas-dev/pandas/issues/33820
+    df = pd.DataFrame(data)
+    result = df.to_numpy(dtype=dtype, na_value=na_value)
+    expected = np.array([[1, 1], [2, 2], [3, na_value]], dtype=dtype)
+    tm.assert_numpy_array_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "data, expected",
+    [
+        (
+            {"a": pd.array([1, 2, None])},
+            np.array([[1.0], [2.0], [np.nan]], dtype=float),
+        ),
+        (
+            {"a": [1, 2, 3], "b": [1, 2, 3]},
+            np.array([[1, 1], [2, 2], [3, 3]], dtype=float),
+        ),
+    ],
+)
+def test_to_numpy_dataframe_single_block(data, expected):
+    # https://github.com/pandas-dev/pandas/issues/33820
+    df = pd.DataFrame(data)
+    result = df.to_numpy(dtype=float, na_value=np.nan)
+    tm.assert_numpy_array_equal(result, expected)
+
+
+def test_to_numpy_dataframe_single_block_no_mutate():
+    # https://github.com/pandas-dev/pandas/issues/33820
+    result = pd.DataFrame(np.array([1.0, 2.0, np.nan]))
+    expected = pd.DataFrame(np.array([1.0, 2.0, np.nan]))
+    result.to_numpy(na_value=0.0)
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py
@@ -365,7 +365,7 @@ def test_to_numpy_copy(self):
         df = pd.DataFrame(arr)
         assert df.values.base is arr
         assert df.to_numpy(copy=False).base is arr
-        assert df.to_numpy(copy=True).base is None
+        assert df.to_numpy(copy=True).base is not arr
 
     def test_swapaxes(self):
         df = DataFrame(np.random.randn(10, 5))