Skip to content

ENH: Optimize replace to avoid copying when not necessary #50918

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Feb 26, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 65 additions & 7 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
cast,
final,
)
import weakref

import numpy as np

Expand Down Expand Up @@ -152,6 +153,7 @@ class Block(PandasObject):
is_extension = False
_can_consolidate = True
_validate_ndim = True
_ref = None

@final
@cache_readonly
Expand Down Expand Up @@ -523,6 +525,8 @@ def replace(
inplace: bool = False,
# mask may be pre-computed if we're called from replace_list
mask: npt.NDArray[np.bool_] | None = None,
original_blocks: list[Block] = [],
using_copy_on_write: bool = False,
) -> list[Block]:
"""
replace the to_replace value with value, possible to create new
Expand All @@ -549,17 +553,40 @@ def replace(
# replacing it is a no-op.
# Note: If to_replace were a list, NDFrame.replace would call
# replace_list instead of replace.
return [self] if inplace else [self.copy()]
if using_copy_on_write and original_blocks:
result = self.copy(deep=False)
result._ref = result._ref = weakref.ref(
original_blocks[self.mgr_locs.as_array[0]]
)
return [result]
else:
return [self] if inplace else [self.copy()]

if mask is None:
mask = missing.mask_missing(values, to_replace)
if not mask.any():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we pass an axis (I think axis=0 should work, though we need to be careful with 1-D values/mask), then we should have enough info to split the block.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep agree, but right now this is not that high on my list of priorities. With the ref tracking how it is right now, this only defers the copy a little bit

# Note: we get here with test_replace_extension_other incorrectly
# bc _can_hold_element is incorrect.
return [self] if inplace else [self.copy()]
if using_copy_on_write and original_blocks:
result = self.copy(deep=False)
result._ref = result._ref = weakref.ref(
original_blocks[self.mgr_locs.as_array[0]]
)
return [result]
else:
return [self] if inplace else [self.copy()]

elif self._can_hold_element(value):
blk = self if inplace else self.copy()
# TODO(CoW): Maybe split here as well into columns where mask has True
# and rest?
if using_copy_on_write:
if original_blocks:
blk = self.copy()
else:
# In case we made a copy before, e.g. coerce to target dtype
blk = self
else:
blk = self if inplace else self.copy()
putmask_inplace(blk.values, mask, value)
if not (self.is_object and value is None):
# if the user *explicitly* gave None, we keep None, otherwise
Expand All @@ -584,6 +611,13 @@ def replace(
else:
# split so that we only upcast where necessary
blocks = []
if original_blocks:
_original_blocks = [original_blocks[self.mgr_locs.as_array[0]]] * len(
self
)
else:
_original_blocks = []

for i, nb in enumerate(self._split()):
blocks.extend(
type(self).replace(
Expand All @@ -592,6 +626,8 @@ def replace(
value=value,
inplace=True,
mask=mask[i : i + 1],
original_blocks=[self] * (self.mgr_locs.as_array.max() + 1),
using_copy_on_write=using_copy_on_write,
)
)
return blocks
Expand Down Expand Up @@ -645,6 +681,8 @@ def replace_list(
dest_list: Sequence[Any],
inplace: bool = False,
regex: bool = False,
original_blocks: list[Block] = [],
using_copy_on_write: bool = False,
) -> list[Block]:
"""
See BlockManager.replace_list docstring.
Expand All @@ -657,7 +695,14 @@ def replace_list(
]
if not len(pairs):
# shortcut, nothing to replace
return [self] if inplace else [self.copy()]
if using_copy_on_write and original_blocks:
nb = self.copy(deep=False)
nb._ref = nb._ref = weakref.ref(
original_blocks[self.mgr_locs.as_array[0]]
)
return [nb]
else:
return [self] if inplace else [self.copy()]

src_len = len(pairs) - 1

Expand All @@ -678,7 +723,11 @@ def replace_list(
# ndarray]"
masks = [extract_bool_array(x) for x in masks] # type: ignore[arg-type]

rb = [self if inplace else self.copy()]
if using_copy_on_write:
# TODO(CoW): Optimize to avoid as many copies as possible
rb = [self.copy()]
else:
rb = [self if inplace else self.copy()]
for i, (src, dest) in enumerate(pairs):
convert = i == src_len # only convert once at the end
new_rb: list[Block] = []
Expand All @@ -701,8 +750,10 @@ def replace_list(
to_replace=src,
value=dest,
mask=m, # type: ignore[arg-type]
inplace=inplace,
inplace=True, # We already made a copy if inplace=False
regex=regex,
# TODO(CoW): Optimize to avoid as many copies as possible
using_copy_on_write=False,
)
if convert and blk.is_object and not all(x is None for x in dest_list):
# GH#44498 avoid unwanted cast-back
Expand All @@ -719,6 +770,8 @@ def _replace_coerce(
mask: npt.NDArray[np.bool_],
inplace: bool = True,
regex: bool = False,
original_blocks: list[Block] = [],
using_copy_on_write: bool = False,
) -> list[Block]:
"""
Replace value corresponding to the given boolean array with another
Expand Down Expand Up @@ -760,7 +813,12 @@ def _replace_coerce(
return [nb]
return [self] if inplace else [self.copy()]
return self.replace(
to_replace=to_replace, value=value, inplace=inplace, mask=mask
to_replace=to_replace,
value=value,
inplace=inplace,
mask=mask,
original_blocks=original_blocks,
using_copy_on_write=using_copy_on_write,
)

# ---------------------------------------------------------------------
Expand Down
35 changes: 31 additions & 4 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,10 +449,36 @@ def replace(self: T, to_replace, value, inplace: bool) -> T:
# NDFrame.replace ensures the not-is_list_likes here
assert not is_list_like(to_replace)
assert not is_list_like(value)
return self.apply(
"replace", to_replace=to_replace, value=value, inplace=inplace
return self._call_function_and_update_refs(
"replace",
to_replace=to_replace,
value=value,
inplace=inplace,
)

def _call_function_and_update_refs(self, func, **kwargs):
    """
    Apply ``func`` through ``self.apply`` and record block references.

    When ``using_copy_on_write()`` is true, a list of this manager's blocks
    is built (``self.blocks[0]`` repeated ``self.shape[0]`` times for a
    single-block manager, otherwise one entry per element of
    ``self.blknos``) and forwarded as ``original_blocks``. Otherwise an
    empty list is forwarded.

    Parameters
    ----------
    func
        Forwarded unchanged as the first argument of ``self.apply``.
    **kwargs
        Extra keyword arguments forwarded to ``self.apply``.

    Returns
    -------
    The manager returned by ``self.apply``. If any of its blocks has a
    non-None ``_ref`` attribute, its ``refs`` is set to the list of those
    attributes and its ``parent`` is set to ``self``.
    """
    # Defaults for the case where ``using_copy_on_write()`` is false.
    use_cow = False
    original_blocks = []

    if using_copy_on_write():
        use_cow = True
        if self.is_single_block:
            original_blocks = [self.blocks[0]] * self.shape[0]
        else:
            original_blocks = [self.blocks[blkno] for blkno in self.blknos]

    result = self.apply(
        func,
        **kwargs,
        original_blocks=original_blocks,
        using_copy_on_write=use_cow,
    )

    # Only attach reference bookkeeping when at least one block carries a ref.
    block_refs = [getattr(block, "_ref", None) for block in result.blocks]
    if not all(ref is None for ref in block_refs):
        result.refs = block_refs
        result.parent = self
    return result

def replace_regex(self, **kwargs):
return self.apply("_replace_regex", **kwargs)

Expand All @@ -466,14 +492,15 @@ def replace_list(
"""do a list replace"""
inplace = validate_bool_kwarg(inplace, "inplace")

bm = self.apply(
bm = self._call_function_and_update_refs(
"replace_list",
src_list=src_list,
dest_list=dest_list,
inplace=inplace,
regex=regex,
)
bm._consolidate_inplace()
if not using_copy_on_write():
bm._consolidate_inplace()
return bm

def to_native_types(self: T, **kwargs) -> T:
Expand Down
65 changes: 65 additions & 0 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -1102,6 +1102,71 @@ def test_replace(using_copy_on_write, replace_kwargs):
tm.assert_frame_equal(df, df_orig)


def test_replace_mask_all_false_second_block(using_copy_on_write):
    """Check memory sharing of columns "c" and "a" after a float replace."""
    df = DataFrame({"a": [1.5, 2, 3], "b": 100.5, "c": 1, "d": 2})
    df_orig = df.copy()

    df2 = df.replace(to_replace=1.5, value=55.5)

    def shares(col):
        # True when ``df`` and ``df2`` share memory for column ``col``.
        return np.shares_memory(get_array(df, col), get_array(df2, col))

    # Column "c" is expected to be shared only under copy-on-write;
    # column "a" is expected not to be shared in either mode.
    assert shares("c") == bool(using_copy_on_write)
    assert not shares("a")

    df2.loc[0, "c"] = 0.5
    tm.assert_frame_equal(df, df_orig)  # Original is unchanged

    if using_copy_on_write:
        assert not shares("c")
        # TODO: This should split and not copy the whole block
        # assert np.shares_memory(get_array(df, "d"), get_array(df2, "d"))


def test_replace_coerce_single_column(using_copy_on_write, using_array_manager):
    """Check memory sharing when a float is replaced by a string value."""
    df = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
    df_orig = df.copy()

    df2 = df.replace(to_replace=1.5, value="a")

    def shares(col):
        # True when ``df`` and ``df2`` share memory for column ``col``.
        return np.shares_memory(get_array(df, col), get_array(df2, col))

    # The same expectations hold under copy-on-write and for the
    # non-array-manager case; nothing is asserted otherwise.
    if using_copy_on_write or not using_array_manager:
        assert shares("b")
        assert not shares("a")

    if using_copy_on_write:
        df2.loc[0, "b"] = 0.5
        tm.assert_frame_equal(df, df_orig)  # Original is unchanged
        assert not shares("b")


@pytest.mark.parametrize("to_replace", ["xxx", ["xxx"]])
def test_replace_to_replace_wrong_dtype(using_copy_on_write, to_replace):
    """Check memory sharing when ``to_replace`` is a string on float columns."""
    df = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
    df_orig = df.copy()

    df2 = df.replace(to_replace=to_replace, value=1.5)

    def shares(col):
        # True when ``df`` and ``df2`` share memory for column ``col``.
        return np.shares_memory(get_array(df, col), get_array(df2, col))

    # Both columns are expected to be shared under copy-on-write and
    # not shared otherwise.
    for col in ("b", "a"):
        assert shares(col) == bool(using_copy_on_write)

    df2.loc[0, "b"] = 0.5
    tm.assert_frame_equal(df, df_orig)  # Original is unchanged

    if using_copy_on_write:
        assert not shares("b")


def test_putmask(using_copy_on_write):
df = DataFrame({"a": [1, 2], "b": 1, "c": 2})
view = df[:]
Expand Down