-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: Optimize replace to avoid copying when not necessary #50918
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
9b46125
773b6ea
e8f2c3f
16dbe9a
9217d46
2f8d719
fdd2f66
af83e7a
b52ec89
ce29442
5b9ddce
5467d2a
703959d
8aebfaf
bcde5c2
3057c4d
68f1d37
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,6 +11,7 @@ | |
cast, | ||
final, | ||
) | ||
import weakref | ||
|
||
import numpy as np | ||
|
||
|
@@ -152,6 +153,7 @@ class Block(PandasObject): | |
is_extension = False | ||
_can_consolidate = True | ||
_validate_ndim = True | ||
_ref = None | ||
|
||
@final | ||
@cache_readonly | ||
|
@@ -523,6 +525,8 @@ def replace( | |
inplace: bool = False, | ||
# mask may be pre-computed if we're called from replace_list | ||
mask: npt.NDArray[np.bool_] | None = None, | ||
original_blocks: list[Block] = [], | ||
using_copy_on_write: bool = False, | ||
) -> list[Block]: | ||
""" | ||
replace the to_replace value with value, possible to create new | ||
|
@@ -549,17 +553,40 @@ def replace( | |
# replacing it is a no-op. | ||
# Note: If to_replace were a list, NDFrame.replace would call | ||
# replace_list instead of replace. | ||
return [self] if inplace else [self.copy()] | ||
if using_copy_on_write and original_blocks: | ||
result = self.copy(deep=False) | ||
result._ref = result._ref = weakref.ref( | ||
original_blocks[self.mgr_locs.as_array[0]] | ||
) | ||
return [result] | ||
else: | ||
return [self] if inplace else [self.copy()] | ||
|
||
if mask is None: | ||
mask = missing.mask_missing(values, to_replace) | ||
if not mask.any(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we pass an axis(I think There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep agree, but right now this is not that high on my list of priorities. With the ref tracking how it is right now, this only defers the copy a little bit |
||
# Note: we get here with test_replace_extension_other incorrectly | ||
# bc _can_hold_element is incorrect. | ||
return [self] if inplace else [self.copy()] | ||
if using_copy_on_write and original_blocks: | ||
result = self.copy(deep=False) | ||
result._ref = result._ref = weakref.ref( | ||
original_blocks[self.mgr_locs.as_array[0]] | ||
) | ||
return [result] | ||
else: | ||
return [self] if inplace else [self.copy()] | ||
|
||
elif self._can_hold_element(value): | ||
blk = self if inplace else self.copy() | ||
# TODO(CoW): Maybe split here as well into columns where mask has True | ||
# and rest? | ||
if using_copy_on_write: | ||
if original_blocks: | ||
blk = self.copy() | ||
else: | ||
# In case we made a copy before, e.g. coerce to target dtype | ||
blk = self | ||
else: | ||
blk = self if inplace else self.copy() | ||
putmask_inplace(blk.values, mask, value) | ||
if not (self.is_object and value is None): | ||
# if the user *explicitly* gave None, we keep None, otherwise | ||
|
@@ -584,6 +611,13 @@ def replace( | |
else: | ||
# split so that we only upcast where necessary | ||
blocks = [] | ||
if original_blocks: | ||
_original_blocks = [original_blocks[self.mgr_locs.as_array[0]]] * len( | ||
self | ||
) | ||
else: | ||
_original_blocks = [] | ||
|
||
for i, nb in enumerate(self._split()): | ||
blocks.extend( | ||
type(self).replace( | ||
|
@@ -592,6 +626,8 @@ def replace( | |
value=value, | ||
inplace=True, | ||
mask=mask[i : i + 1], | ||
original_blocks=[self] * (self.mgr_locs.as_array.max() + 1), | ||
using_copy_on_write=using_copy_on_write, | ||
) | ||
) | ||
return blocks | ||
|
@@ -645,6 +681,8 @@ def replace_list( | |
dest_list: Sequence[Any], | ||
inplace: bool = False, | ||
regex: bool = False, | ||
original_blocks: list[Block] = [], | ||
using_copy_on_write: bool = False, | ||
) -> list[Block]: | ||
""" | ||
See BlockManager.replace_list docstring. | ||
|
@@ -657,7 +695,14 @@ def replace_list( | |
] | ||
if not len(pairs): | ||
# shortcut, nothing to replace | ||
return [self] if inplace else [self.copy()] | ||
if using_copy_on_write and original_blocks: | ||
nb = self.copy(deep=False) | ||
nb._ref = nb._ref = weakref.ref( | ||
phofl marked this conversation as resolved.
Show resolved
Hide resolved
|
||
original_blocks[self.mgr_locs.as_array[0]] | ||
) | ||
return [nb] | ||
else: | ||
return [self] if inplace else [self.copy()] | ||
|
||
src_len = len(pairs) - 1 | ||
|
||
|
@@ -678,7 +723,11 @@ def replace_list( | |
# ndarray]" | ||
masks = [extract_bool_array(x) for x in masks] # type: ignore[arg-type] | ||
|
||
rb = [self if inplace else self.copy()] | ||
if using_copy_on_write: | ||
# TODO(CoW): Optimize to avoid as many copies as possible | ||
rb = [self.copy()] | ||
else: | ||
rb = [self if inplace else self.copy()] | ||
for i, (src, dest) in enumerate(pairs): | ||
convert = i == src_len # only convert once at the end | ||
new_rb: list[Block] = [] | ||
|
@@ -701,8 +750,10 @@ def replace_list( | |
to_replace=src, | ||
value=dest, | ||
mask=m, # type: ignore[arg-type] | ||
inplace=inplace, | ||
inplace=True, # We already made a copy if inplace=False | ||
regex=regex, | ||
# TODO(CoW): Optimize to avoid as many copies as possible | ||
using_copy_on_write=False, | ||
) | ||
if convert and blk.is_object and not all(x is None for x in dest_list): | ||
# GH#44498 avoid unwanted cast-back | ||
|
@@ -719,6 +770,8 @@ def _replace_coerce( | |
mask: npt.NDArray[np.bool_], | ||
inplace: bool = True, | ||
regex: bool = False, | ||
original_blocks: list[Block] = [], | ||
using_copy_on_write: bool = False, | ||
) -> list[Block]: | ||
""" | ||
Replace value corresponding to the given boolean array with another | ||
|
@@ -760,7 +813,12 @@ def _replace_coerce( | |
return [nb] | ||
return [self] if inplace else [self.copy()] | ||
return self.replace( | ||
to_replace=to_replace, value=value, inplace=inplace, mask=mask | ||
to_replace=to_replace, | ||
value=value, | ||
inplace=inplace, | ||
mask=mask, | ||
original_blocks=original_blocks, | ||
using_copy_on_write=using_copy_on_write, | ||
) | ||
|
||
# --------------------------------------------------------------------- | ||
|
Uh oh!
There was an error while loading. Please reload this page.