
API: avoid silent consolidation #49456


Merged (7 commits) on Nov 22, 2022
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
@@ -348,6 +348,7 @@ Other API changes
- Changed behavior of :class:`Index` constructor with sequence containing at least one ``NaT`` and everything else either ``None`` or ``NaN`` to infer ``datetime64[ns]`` dtype instead of ``object``, matching :class:`Series` behavior (:issue:`49340`)
- :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default) will now set the index on the returned :class:`DataFrame` to a :class:`RangeIndex` instead of a :class:`Int64Index` (:issue:`49745`)
- Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`)
- :meth:`DataFrame.values`, :meth:`DataFrame.to_numpy`, :meth:`DataFrame.xs`, :meth:`DataFrame.reindex`, :meth:`DataFrame.fillna`, and :meth:`DataFrame.replace` no longer silently consolidate the underlying arrays; do ``df = df.copy()`` to ensure consolidation (:issue:`49356`)
-

.. ---------------------------------------------------------------------------
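
As an illustration of the whatsnew entry above, here is a minimal sketch of the new behaviour (it peeks at the internal `_mgr` attribute only to make the consolidation state visible, and assumes the default block-based manager without copy-on-write; not part of this PR):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"A": np.arange(3.0), "B": np.arange(3.0)})
df["C"] = 7.0  # adding a column creates a second block, leaving df fragmented

_ = df.values  # previously this consolidated df in place as a side effect
print(df._mgr.is_consolidated())  # now False: no silent consolidation

df = df.copy()  # opt in explicitly, as the whatsnew note suggests
print(df._mgr.is_consolidated())  # True
```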
9 changes: 3 additions & 6 deletions pandas/core/frame.py
@@ -165,6 +165,7 @@
)
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.construction import (
ensure_wrapped_if_datetimelike,
extract_array,
sanitize_array,
sanitize_masked_array,
@@ -960,20 +961,18 @@ def _values( # type: ignore[override]
"""
Analogue to ._values that may return a 2D ExtensionArray.
"""
self._consolidate_inplace()

mgr = self._mgr

if isinstance(mgr, ArrayManager):
if len(mgr.arrays) == 1 and not is_1d_only_ea_dtype(mgr.arrays[0].dtype):
# error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]"
# has no attribute "reshape"
return mgr.arrays[0].reshape(-1, 1) # type: ignore[union-attr]
return self.values
return ensure_wrapped_if_datetimelike(self.values)

blocks = mgr.blocks
if len(blocks) != 1:
return self.values
return ensure_wrapped_if_datetimelike(self.values)

arr = blocks[0].values
if arr.ndim == 1:
@@ -1804,7 +1803,6 @@ def to_numpy(
array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
[2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
"""
self._consolidate_inplace()
if dtype is not None:
dtype = np.dtype(dtype)
result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value)
@@ -11291,7 +11289,6 @@ def values(self) -> np.ndarray:
['lion', 80.5, 1],
['monkey', nan, None]], dtype=object)
"""
self._consolidate_inplace()
return self._mgr.as_array()

@overload
9 changes: 0 additions & 9 deletions pandas/core/generic.py
@@ -3777,7 +3777,6 @@ def _take(

See the docstring of `take` for full explanation of the parameters.
"""
self._consolidate_inplace()

new_data = self._mgr.take(
indices,
@@ -3934,8 +3933,6 @@ class animal locomotion
else:
index = self.index

self._consolidate_inplace()

if isinstance(index, MultiIndex):
loc, new_index = index._get_loc_level(key, level=0)
if not drop_level:
@@ -5190,8 +5187,6 @@ def reindex(self: NDFrameT, *args, **kwargs) -> NDFrameT:
f'argument "{list(kwargs.keys())[0]}"'
)

self._consolidate_inplace()

# if all axes that are requested to reindex are equal, then only copy
# if indicated must have index names equal here as well as values
if all(
@@ -6730,8 +6725,6 @@ def fillna(
inplace = validate_bool_kwarg(inplace, "inplace")
value, method = validate_fillna_kwargs(value, method)

self._consolidate_inplace()
Member:

Maybe it was more performant to consolidate a fragmented frame before fillna or replace?

Nonetheless, I agree with the notion of "avoiding side effects" when calling a method

Member:

Any performance differences in the methods where consolidation no longer happens?

Member Author:

I think the main one is going to be in DataFrame.values, where subsequent calls could be faster under the status quo if the frame can be fully consolidated.
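
A rough way to see the trade-off being discussed, as a sketch (the fragmented-frame construction and timings are illustrative assumptions, not part of this PR):

```python
import time

import numpy as np
import pandas as pd

# Build a deliberately fragmented frame: each column insertion adds a block.
df = pd.DataFrame({"c0": np.arange(10_000.0)})
for i in range(1, 50):
    df[f"c{i}"] = np.arange(10_000.0)

def time_values(frame, repeats=20):
    start = time.perf_counter()
    for _ in range(repeats):
        _ = frame.values  # each call concatenates the blocks into a fresh ndarray
    return time.perf_counter() - start

print("fragmented:  ", time_values(df))
print("consolidated:", time_values(df.copy()))  # .copy() consolidates up front
# Under the old behaviour the first .values call consolidated in place, so
# repeated calls on the same frame were as fast as the consolidated case.
```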


# set the default here, so functions examining the signature
# can detect if something was set (e.g. in groupby) (GH9221)
if axis is None:
@@ -7049,8 +7042,6 @@ def replace(
if not is_bool(regex) and to_replace is not None:
raise ValueError("'to_replace' must be 'None' if 'regex' is not a bool")

self._consolidate_inplace()

if value is lib.no_default or method is not lib.no_default:
# GH#36984 if the user explicitly passes value=None we want to
# respect that. We have the corner case where the user explicitly
6 changes: 1 addition & 5 deletions pandas/tests/frame/methods/test_values.py
@@ -256,11 +256,7 @@ def test_private_values_dt64tz_multicol(self):
df2 = df - df
tm.assert_equal(df2._values, tda)

def test_private_values_dt64_multiblock(self, using_array_manager, request):
if using_array_manager:
mark = pytest.mark.xfail(reason="returns ndarray")
request.node.add_marker(mark)

def test_private_values_dt64_multiblock(self):
dta = date_range("2000", periods=8)._data

df = DataFrame({"A": dta[:4]}, copy=False)
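
The rest of this test is collapsed above; roughly, the behaviour it exercises looks like the following sketch (the exact assertions in the real test may differ, and the shape/type checks here are my own assumption):

```python
import pandas as pd

dta = pd.date_range("2000", periods=8)._data  # the underlying DatetimeArray

# Two datetime64 columns stored as two separate blocks (or two arrays under
# the ArrayManager), so no single consolidated block exists.
df = pd.DataFrame({"A": dta[:4]}, copy=False)
df["B"] = dta[4:]

result = df._values
# With ensure_wrapped_if_datetimelike, the private 2D values come back as a
# datetime-like ExtensionArray rather than a raw datetime64[ns] ndarray, on
# both managers, which is why the ArrayManager xfail could be dropped.
assert isinstance(result, pd.arrays.DatetimeArray)
assert result.shape == (4, 2)
```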
12 changes: 3 additions & 9 deletions pandas/tests/frame/test_block_internals.py
@@ -85,12 +85,6 @@ def test_consolidate_inplace(self, float_frame):
for letter in range(ord("A"), ord("Z")):
float_frame[chr(letter)] = chr(letter)

def test_values_consolidate(self, float_frame):
float_frame["E"] = 7.0
assert not float_frame._mgr.is_consolidated()
_ = float_frame.values
assert float_frame._mgr.is_consolidated()

def test_modify_values(self, float_frame):
float_frame.values[5] = 5
assert (float_frame.values[5] == 5).all()
@@ -99,10 +93,10 @@ def test_modify_values(self, float_frame):
float_frame["E"] = 7.0
col = float_frame["E"]
float_frame.values[6] = 6
assert (float_frame.values[6] == 6).all()
# as of 2.0 .values does not consolidate, so subsequent calls to .values
# do not share data
assert not (float_frame.values[6] == 6).all()

# check that item_cache was cleared
assert float_frame["E"] is not col
assert (col == 7).all()
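
For context, the changed assertion above corresponds to this user-visible behaviour (a sketch assuming the default block manager without copy-on-write; memory sharing in the single-block case is an implementation detail, not a guarantee):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"A": np.zeros(10), "B": np.zeros(10)})

# One consolidated float64 block: .values can share memory with it, so an
# in-place write to the returned array shows up in the frame.
df.values[5] = 5
print((df.loc[5] == 5).all())  # True

# Adding a column fragments the frame into two blocks.  .values must now
# concatenate them into a fresh ndarray on every call, and it no longer
# consolidates as a side effect, so the write below is simply lost.
df["E"] = 7.0
df.values[6] = 6
print((df.loc[6] == 6).any())  # False under the new behaviour
```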

def test_boolean_set_uncons(self, float_frame):