-
-
Notifications
You must be signed in to change notification settings - Fork 19k
API / CoW: Copy NumPy arrays by default in DataFrame constructor #51731
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
563257e
f3161a3
3a95311
17cf5ae
07aa26d
8e84d85
f3ccf0f
5cdc6ad
3e384ea
49ee53f
fcc7be2
9223836
a474bf5
d5a0268
0be7fc6
293f8a5
3376d06
265d9e3
9ac1bae
db92ce4
be9cb04
4bf3ee8
e2eceec
65965c6
ecb756c
8e837d9
177fbbc
2bbff3b
5bef4ba
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,8 @@ | |
import numpy as np | ||
from numpy import ma | ||
|
||
from pandas._config import using_copy_on_write | ||
|
||
from pandas._libs import lib | ||
from pandas._libs.tslibs.period import Period | ||
from pandas._typing import ( | ||
|
@@ -762,6 +764,9 @@ def _try_cast( | |
|
||
subarr = maybe_cast_to_integer_array(arr, dtype) | ||
else: | ||
subarr = np.array(arr, dtype=dtype, copy=copy) | ||
if using_copy_on_write(): | ||
subarr = np.array(arr, dtype=dtype, copy=copy, order="F") | ||
|
||
else: | ||
subarr = np.array(arr, dtype=dtype, copy=copy) | ||
|
||
return subarr |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -685,6 +685,8 @@ def __init__( | |
# INFO(ArrayManager) by default copy the 2D input array to get | ||
# contiguous 1D arrays | ||
copy = True | ||
elif using_copy_on_write() and isinstance(data, np.ndarray): | ||
|
||
copy = True | ||
else: | ||
copy = False | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,6 +14,8 @@ | |
import numpy as np | ||
from numpy import ma | ||
|
||
from pandas._config import using_copy_on_write | ||
|
||
from pandas._libs import lib | ||
from pandas._typing import ( | ||
ArrayLike, | ||
|
@@ -289,6 +291,15 @@ def ndarray_to_mgr( | |
if values.ndim == 1: | ||
values = values.reshape(-1, 1) | ||
|
||
elif ( | ||
using_copy_on_write() | ||
and isinstance(values, np.ndarray) | ||
and (dtype is None or is_dtype_equal(values.dtype, dtype)) | ||
and copy_on_sanitize | ||
): | ||
values = np.array(values, order="F", copy=copy_on_sanitize) | ||
|
||
values = _ensure_2d(values) | ||
|
||
elif isinstance(values, (np.ndarray, ExtensionArray, ABCSeries, Index)): | ||
# drop subclass info | ||
values = np.array(values, copy=copy_on_sanitize) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -57,7 +57,10 @@ def test_fillna_on_column_view(self, using_copy_on_write): | |
|
||
# i.e. we didn't create a new 49-column block | ||
assert len(df._mgr.arrays) == 1 | ||
assert np.shares_memory(df.values, arr) | ||
if using_copy_on_write: | ||
assert not np.shares_memory(df.values, arr) | ||
|
||
else: | ||
assert np.shares_memory(df.values, arr) | ||
|
||
def test_fillna_datetime(self, datetime_frame): | ||
tf = datetime_frame | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,11 +23,15 @@ def test_to_numpy_dtype(self): | |
tm.assert_numpy_array_equal(result, expected) | ||
|
||
@td.skip_array_manager_invalid_test | ||
def test_to_numpy_copy(self): | ||
def test_to_numpy_copy(self, using_copy_on_write): | ||
arr = np.random.randn(4, 3) | ||
df = DataFrame(arr) | ||
assert df.values.base is arr | ||
assert df.to_numpy(copy=False).base is arr | ||
if using_copy_on_write: | ||
assert df.values.base is not arr | ||
assert df.to_numpy(copy=False).base is not arr | ||
|
||
else: | ||
assert df.values.base is arr | ||
assert df.to_numpy(copy=False).base is arr | ||
assert df.to_numpy(copy=True).base is not arr | ||
|
||
def test_to_numpy_mixed_dtype_to_str(self): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -313,11 +313,14 @@ def test_1d_object_array_does_not_copy(self): | |
assert np.shares_memory(df.values, arr) | ||
|
||
@td.skip_array_manager_invalid_test | ||
def test_2d_object_array_does_not_copy(self): | ||
def test_2d_object_array_does_not_copy(self, using_copy_on_write): | ||
# https://github.com/pandas-dev/pandas/issues/39272 | ||
arr = np.array([["a", "b"], ["c", "d"]], dtype="object") | ||
df = DataFrame(arr) | ||
assert np.shares_memory(df.values, arr) | ||
if using_copy_on_write: | ||
assert not np.shares_memory(df.values, arr) | ||
|
||
else: | ||
assert np.shares_memory(df.values, arr) | ||
|
||
def test_constructor_dtype_list_data(self): | ||
df = DataFrame([[1, "2"], [None, "a"]], dtype=object) | ||
|
@@ -2107,13 +2110,18 @@ def test_constructor_frame_shallow_copy(self, float_frame): | |
cop.index = np.arange(len(cop)) | ||
tm.assert_frame_equal(float_frame, orig) | ||
|
||
def test_constructor_ndarray_copy(self, float_frame, using_array_manager): | ||
def test_constructor_ndarray_copy( | ||
self, float_frame, using_array_manager, using_copy_on_write | ||
): | ||
if not using_array_manager: | ||
arr = float_frame.values.copy() | ||
df = DataFrame(arr) | ||
|
||
arr[5] = 5 | ||
assert (df.values[5] == 5).all() | ||
if using_copy_on_write: | ||
assert not (df.values[5] == 5).all() | ||
else: | ||
assert (df.values[5] == 5).all() | ||
|
||
df = DataFrame(arr, copy=True) | ||
arr[6] = 6 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the False "needed" here (did it otherwise give failures), or just for efficiency since this is an example case where we know the array is not owned by anyone else?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Copying here causes one test to fail, and the failure itself is very weird. I haven't looked closer yet, but the test becomes useless as soon as your read_only PR is merged.
I still want to understand what's off there, though.