Skip to content

Commit 4bd6905

Browse files
authored
ENH: Add na_value argument to DataFrame.to_numpy (#33857)
1 parent dfaa507 commit 4bd6905

File tree

5 files changed

+106
-14
lines changed

5 files changed

+106
-14
lines changed

doc/source/whatsnew/v1.1.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@ Other enhancements
224224
such as ``dict`` and ``list``, mirroring the behavior of :meth:`DataFrame.update` (:issue:`33215`)
225225
- :meth:`~pandas.core.groupby.GroupBy.transform` and :meth:`~pandas.core.groupby.GroupBy.aggregate` have gained ``engine`` and ``engine_kwargs`` arguments that support executing functions with ``Numba`` (:issue:`32854`, :issue:`33388`)
226226
- :meth:`~pandas.core.resample.Resampler.interpolate` now supports SciPy interpolation method :class:`scipy.interpolate.CubicSpline` as method ``cubicspline`` (:issue:`33670`)
227+
- :meth:`DataFrame.to_numpy` now supports the ``na_value`` keyword to control the NA sentinel in the output array (:issue:`33820`)
227228
- The ``ExtensionArray`` class now has an :meth:`~pandas.arrays.ExtensionArray.equals`
228229
method, similarly to :meth:`Series.equals` (:issue:`27081`).
229230
- The minimum supported dta version has increased to 105 in :meth:`~pandas.io.stata.read_stata` and :class:`~pandas.io.stata.StataReader` (:issue:`26667`).

pandas/core/frame.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,7 +1280,9 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFra
12801280

12811281
return cls(data, index=index, columns=columns, dtype=dtype)
12821282

1283-
def to_numpy(self, dtype=None, copy: bool = False) -> np.ndarray:
1283+
def to_numpy(
1284+
self, dtype=None, copy: bool = False, na_value=lib.no_default
1285+
) -> np.ndarray:
12841286
"""
12851287
Convert the DataFrame to a NumPy array.
12861288
@@ -1301,6 +1303,11 @@ def to_numpy(self, dtype=None, copy: bool = False) -> np.ndarray:
13011303
another array. Note that ``copy=False`` does not *ensure* that
13021304
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
13031305
a copy is made, even if not strictly necessary.
1306+
na_value : Any, optional
1307+
The value to use for missing values. The default value depends
1308+
on `dtype` and the dtypes of the DataFrame columns.
1309+
1310+
.. versionadded:: 1.1.0
13041311
13051312
Returns
13061313
-------
@@ -1332,7 +1339,10 @@ def to_numpy(self, dtype=None, copy: bool = False) -> np.ndarray:
13321339
array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
13331340
[2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
13341341
"""
1335-
result = np.array(self.values, dtype=dtype, copy=copy)
1342+
result = self._mgr.as_array(
1343+
transpose=self._AXIS_REVERSED, dtype=dtype, copy=copy, na_value=na_value
1344+
)
1345+
13361346
return result
13371347

13381348
def to_dict(self, orient="dict", into=dict):

pandas/core/internals/managers.py

Lines changed: 47 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -781,14 +781,28 @@ def copy_func(ax):
781781
res.axes = new_axes
782782
return res
783783

784-
def as_array(self, transpose: bool = False) -> np.ndarray:
784+
def as_array(
785+
self,
786+
transpose: bool = False,
787+
dtype=None,
788+
copy: bool = False,
789+
na_value=lib.no_default,
790+
) -> np.ndarray:
785791
"""
786792
Convert the blockmanager data into a numpy array.
787793
788794
Parameters
789795
----------
790796
transpose : bool, default False
791-
If True, transpose the return array,
797+
If True, transpose the return array.
798+
dtype : object, default None
799+
Data type of the return array.
800+
copy : bool, default False
801+
If True then guarantee that a copy is returned. A value of
802+
False does not guarantee that the underlying data is not
803+
copied.
804+
na_value : object, default lib.no_default
805+
Value to be used as the missing value sentinel.
792806
793807
Returns
794808
-------
@@ -798,24 +812,41 @@ def as_array(self, transpose: bool = False) -> np.ndarray:
798812
arr = np.empty(self.shape, dtype=float)
799813
return arr.transpose() if transpose else arr
800814

801-
if self._is_single_block and self.blocks[0].is_datetimetz:
802-
# TODO(Block.get_values): Make DatetimeTZBlock.get_values
803-
# always be object dtype. Some callers seem to want the
804-
# DatetimeArray (previously DTI)
805-
arr = self.blocks[0].get_values(dtype=object)
815+
# We want to copy when na_value is provided to avoid
816+
# mutating the original object
817+
copy = copy or na_value is not lib.no_default
818+
819+
if self._is_single_block and self.blocks[0].is_extension:
820+
# Avoid implicit conversion of extension blocks to object
821+
arr = (
822+
self.blocks[0]
823+
.values.to_numpy(dtype=dtype, na_value=na_value)
824+
.reshape(self.blocks[0].shape)
825+
)
806826
elif self._is_single_block or not self.is_mixed_type:
807827
arr = np.asarray(self.blocks[0].get_values())
828+
if dtype:
829+
arr = arr.astype(dtype, copy=False)
808830
else:
809-
arr = self._interleave()
831+
arr = self._interleave(dtype=dtype, na_value=na_value)
832+
# The underlying data was copied within _interleave
833+
copy = False
834+
835+
if copy:
836+
arr = arr.copy()
837+
838+
if na_value is not lib.no_default:
839+
arr[isna(arr)] = na_value
810840

811841
return arr.transpose() if transpose else arr
812842

813-
def _interleave(self) -> np.ndarray:
843+
def _interleave(self, dtype=None, na_value=lib.no_default) -> np.ndarray:
814844
"""
815845
Return ndarray from blocks with specified item order
816846
Items must be contained in the blocks
817847
"""
818-
dtype = _interleaved_dtype(self.blocks)
848+
if not dtype:
849+
dtype = _interleaved_dtype(self.blocks)
819850

820851
# TODO: https://github.com/pandas-dev/pandas/issues/22791
821852
# Give EAs some input on what happens here. Sparse needs this.
@@ -830,7 +861,12 @@ def _interleave(self) -> np.ndarray:
830861

831862
for blk in self.blocks:
832863
rl = blk.mgr_locs
833-
result[rl.indexer] = blk.get_values(dtype)
864+
if blk.is_extension:
865+
# Avoid implicit conversion of extension blocks to object
866+
arr = blk.values.to_numpy(dtype=dtype, na_value=na_value)
867+
else:
868+
arr = blk.get_values(dtype)
869+
result[rl.indexer] = arr
834870
itemmask[rl.indexer] = 1
835871

836872
if not itemmask.all():

pandas/tests/base/test_conversion.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -407,3 +407,48 @@ def test_to_numpy_kwargs_raises():
407407
s = pd.Series([1, 2, 3], dtype="Int64")
408408
with pytest.raises(TypeError, match=msg):
409409
s.to_numpy(foo=True)
410+
411+
412+
@pytest.mark.parametrize(
413+
"data",
414+
[
415+
{"a": [1, 2, 3], "b": [1, 2, None]},
416+
{"a": np.array([1, 2, 3]), "b": np.array([1, 2, np.nan])},
417+
{"a": pd.array([1, 2, 3]), "b": pd.array([1, 2, None])},
418+
],
419+
)
420+
@pytest.mark.parametrize("dtype, na_value", [(float, np.nan), (object, None)])
421+
def test_to_numpy_dataframe_na_value(data, dtype, na_value):
422+
# https://github.com/pandas-dev/pandas/issues/33820
423+
df = pd.DataFrame(data)
424+
result = df.to_numpy(dtype=dtype, na_value=na_value)
425+
expected = np.array([[1, 1], [2, 2], [3, na_value]], dtype=dtype)
426+
tm.assert_numpy_array_equal(result, expected)
427+
428+
429+
@pytest.mark.parametrize(
430+
"data, expected",
431+
[
432+
(
433+
{"a": pd.array([1, 2, None])},
434+
np.array([[1.0], [2.0], [np.nan]], dtype=float),
435+
),
436+
(
437+
{"a": [1, 2, 3], "b": [1, 2, 3]},
438+
np.array([[1, 1], [2, 2], [3, 3]], dtype=float),
439+
),
440+
],
441+
)
442+
def test_to_numpy_dataframe_single_block(data, expected):
443+
# https://github.com/pandas-dev/pandas/issues/33820
444+
df = pd.DataFrame(data)
445+
result = df.to_numpy(dtype=float, na_value=np.nan)
446+
tm.assert_numpy_array_equal(result, expected)
447+
448+
449+
def test_to_numpy_dataframe_single_block_no_mutate():
450+
# https://github.com/pandas-dev/pandas/issues/33820
451+
result = pd.DataFrame(np.array([1.0, 2.0, np.nan]))
452+
expected = pd.DataFrame(np.array([1.0, 2.0, np.nan]))
453+
result.to_numpy(na_value=0.0)
454+
tm.assert_frame_equal(result, expected)

pandas/tests/frame/test_api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,7 @@ def test_to_numpy_copy(self):
365365
df = pd.DataFrame(arr)
366366
assert df.values.base is arr
367367
assert df.to_numpy(copy=False).base is arr
368-
assert df.to_numpy(copy=True).base is None
368+
assert df.to_numpy(copy=True).base is not arr
369369

370370
def test_swapaxes(self):
371371
df = DataFrame(np.random.randn(10, 5))

0 commit comments

Comments
 (0)