diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 9b435aa31bd16..35a7855b8240f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -552,6 +552,7 @@ def replace( inplace: bool = False, # mask may be pre-computed if we're called from replace_list mask: npt.NDArray[np.bool_] | None = None, + using_cow: bool = False, ) -> list[Block]: """ replace the to_replace value with value, possible to create new @@ -566,7 +567,12 @@ def replace( if isinstance(values, Categorical): # TODO: avoid special-casing # GH49404 - blk = self if inplace else self.copy() + if using_cow and (self.refs.has_reference() or not inplace): + blk = self.copy() + elif using_cow: + blk = self.copy(deep=False) + else: + blk = self if inplace else self.copy() values = cast(Categorical, blk.values) values._replace(to_replace=to_replace, value=value, inplace=True) return [blk] @@ -576,22 +582,36 @@ def replace( # replacing it is a no-op. # Note: If to_replace were a list, NDFrame.replace would call # replace_list instead of replace. - return [self] if inplace else [self.copy()] + if using_cow: + return [self.copy(deep=False)] + else: + return [self] if inplace else [self.copy()] if mask is None: mask = missing.mask_missing(values, to_replace) if not mask.any(): # Note: we get here with test_replace_extension_other incorrectly # bc _can_hold_element is incorrect. - return [self] if inplace else [self.copy()] + if using_cow: + return [self.copy(deep=False)] + else: + return [self] if inplace else [self.copy()] elif self._can_hold_element(value): - blk = self if inplace else self.copy() + # TODO(CoW): Maybe split here as well into columns where mask has True + # and rest? + if using_cow: + if inplace: + blk = self.copy(deep=self.refs.has_reference()) + else: + blk = self.copy() + else: + blk = self if inplace else self.copy() putmask_inplace(blk.values, mask, value) if not (self.is_object and value is None): # if the user *explicitly* gave None, we keep None, otherwise # may downcast to NaN - blocks = blk.convert(copy=False) + blocks = blk.convert(copy=False, using_cow=using_cow) else: blocks = [blk] return blocks @@ -619,6 +639,7 @@ def replace( value=value, inplace=True, mask=mask[i : i + 1], + using_cow=using_cow, ) ) return blocks @@ -797,7 +818,10 @@ def _replace_coerce( return [nb] return [self] if inplace else [self.copy()] return self.replace( - to_replace=to_replace, value=value, inplace=inplace, mask=mask + to_replace=to_replace, + value=value, + inplace=inplace, + mask=mask, ) # --------------------------------------------------------------------- diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index a619d824f1916..1a18bb6519f77 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -459,7 +459,11 @@ def replace(self: T, to_replace, value, inplace: bool) -> T: assert not is_list_like(to_replace) assert not is_list_like(value) return self.apply( - "replace", to_replace=to_replace, value=value, inplace=inplace + "replace", + to_replace=to_replace, + value=value, + inplace=inplace, + using_cow=using_copy_on_write(), ) def replace_regex(self, **kwargs): diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index c88210dec3c09..7042d6e4f9478 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1210,44 +1210,6 @@ def test_items(using_copy_on_write): assert df.loc[0, name] == 0 -@pytest.mark.parametrize( - "replace_kwargs", - [ - {"to_replace": {"a": 1, "b": 4}, "value": -1}, - # Test CoW splits blocks to avoid copying unchanged columns - {"to_replace": {"a": 1}, "value": -1}, - {"to_replace": {"b": 4}, "value": -1}, - {"to_replace": {"b": {4: 1}}}, - # TODO: Add these in a further optimization - # We would need to see which columns got replaced in the mask - # which could be expensive - # {"to_replace": {"b": 1}}, - # 1 - ], -) -def test_replace(using_copy_on_write, replace_kwargs): - df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]}) - df_orig = df.copy() - - df_replaced = df.replace(**replace_kwargs) - - if using_copy_on_write: - if (df_replaced["b"] == df["b"]).all(): - assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b")) - assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) - - # mutating squeezed df triggers a copy-on-write for that column/block - df_replaced.loc[0, "c"] = -1 - if using_copy_on_write: - assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) - - if "a" in replace_kwargs["to_replace"]: - arr = get_array(df_replaced, "a") - df_replaced.loc[0, "a"] = 100 - assert np.shares_memory(get_array(df_replaced, "a"), arr) - tm.assert_frame_equal(df, df_orig) - - @pytest.mark.parametrize("dtype", ["int64", "Int64"]) def test_putmask(using_copy_on_write, dtype): df = DataFrame({"a": [1, 2], "b": 1, "c": 2}, dtype=dtype) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index de7278dca06ff..7cd197541ac33 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -9,34 +9,194 @@ from pandas.tests.copy_view.util import get_array -def test_replace_categorical_inplace_reference(using_copy_on_write): - df = DataFrame({"a": Categorical([1, 2, 3])}) +@pytest.mark.parametrize( + "replace_kwargs", + [ + {"to_replace": {"a": 1, "b": 4}, "value": -1}, + # Test CoW splits blocks to avoid copying unchanged columns + {"to_replace": {"a": 1}, "value": -1}, + {"to_replace": {"b": 4}, "value": -1}, + {"to_replace": {"b": {4: 1}}}, + # TODO: Add these in a further optimization + # We would need to see which columns got replaced in the mask + # which could be expensive + # {"to_replace": {"b": 1}}, + # 1 + ], +) +def test_replace(using_copy_on_write, replace_kwargs): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]}) + df_orig = df.copy() + + df_replaced = df.replace(**replace_kwargs) + + if using_copy_on_write: + if (df_replaced["b"] == df["b"]).all(): + assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) + + # mutating squeezed df triggers a copy-on-write for that column/block + df_replaced.loc[0, "c"] = -1 + if using_copy_on_write: + assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) + + if "a" in replace_kwargs["to_replace"]: + arr = get_array(df_replaced, "a") + df_replaced.loc[0, "a"] = 100 + assert np.shares_memory(get_array(df_replaced, "a"), arr) + tm.assert_frame_equal(df, df_orig) + + +def test_replace_mask_all_false_second_block(using_copy_on_write): + df = DataFrame({"a": [1.5, 2, 3], "b": 100.5, "c": 1, "d": 2}) + df_orig = df.copy() + + df2 = df.replace(to_replace=1.5, value=55.5) + + if using_copy_on_write: + # TODO: Block splitting would allow us to avoid copying b + assert np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + else: + assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + df2.loc[0, "c"] = 1 + tm.assert_frame_equal(df, df_orig) # Original is unchanged + + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c")) + # TODO: This should split and not copy the whole block + # assert np.shares_memory(get_array(df, "d"), get_array(df2, "d")) + + +def test_replace_coerce_single_column(using_copy_on_write, using_array_manager): + df = DataFrame({"a": [1.5, 2, 3], "b": 100.5}) df_orig = df.copy() + + df2 = df.replace(to_replace=1.5, value="a") + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + elif not using_array_manager: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + if using_copy_on_write: + df2.loc[0, "b"] = 0.5 + tm.assert_frame_equal(df, df_orig) # Original is unchanged + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + + +def test_replace_to_replace_wrong_dtype(using_copy_on_write): + df = DataFrame({"a": [1.5, 2, 3], "b": 100.5}) + df_orig = df.copy() + + df2 = df.replace(to_replace="xxx", value=1.5) + + if using_copy_on_write: + assert np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + else: + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a")) + + df2.loc[0, "b"] = 0.5 + tm.assert_frame_equal(df, df_orig) # Original is unchanged + + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b")) + + +def test_replace_inplace(using_copy_on_write): + df = DataFrame({"a": [1.5, 2, 3]}) + arr_a = get_array(df, "a") + df.replace(to_replace=1.5, value=15.5, inplace=True) + + assert np.shares_memory(get_array(df, "a"), arr_a) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + + +@pytest.mark.parametrize("to_replace", [1.5, [1.5]]) +def test_replace_inplace_reference(using_copy_on_write, to_replace): + df = DataFrame({"a": [1.5, 2, 3]}) arr_a = get_array(df, "a") view = df[:] - df.replace(to_replace=[1], value=2, inplace=True) + df.replace(to_replace=to_replace, value=15.5, inplace=True) if using_copy_on_write: - assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes) + assert not np.shares_memory(get_array(df, "a"), arr_a) assert df._mgr._has_no_reference(0) assert view._mgr._has_no_reference(0) - tm.assert_frame_equal(view, df_orig) else: - assert np.shares_memory(get_array(df, "a").codes, arr_a.codes) + assert np.shares_memory(get_array(df, "a"), arr_a) -def test_replace_inplace_reference(using_copy_on_write): +@pytest.mark.parametrize("to_replace", ["a", 100.5]) +def test_replace_inplace_reference_no_op(using_copy_on_write, to_replace): df = DataFrame({"a": [1.5, 2, 3]}) arr_a = get_array(df, "a") view = df[:] - df.replace(to_replace=[1.5], value=15.5, inplace=True) + df.replace(to_replace=to_replace, value=15.5, inplace=True) + assert np.shares_memory(get_array(df, "a"), arr_a) if using_copy_on_write: - assert not np.shares_memory(get_array(df, "a"), arr_a) + assert not df._mgr._has_no_reference(0) + assert not view._mgr._has_no_reference(0) + + +@pytest.mark.parametrize("to_replace", [1, [1]]) +@pytest.mark.parametrize("val", [1, 1.5]) +def test_replace_categorical_inplace_reference(using_copy_on_write, val, to_replace): + df = DataFrame({"a": Categorical([1, 2, 3])}) + df_orig = df.copy() + arr_a = get_array(df, "a") + view = df[:] + df.replace(to_replace=to_replace, value=val, inplace=True) + + if using_copy_on_write: + assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes) assert df._mgr._has_no_reference(0) assert view._mgr._has_no_reference(0) + tm.assert_frame_equal(view, df_orig) else: - assert np.shares_memory(get_array(df, "a"), arr_a) + assert np.shares_memory(get_array(df, "a").codes, arr_a.codes) + + +@pytest.mark.parametrize("val", [1, 1.5]) +def test_replace_categorical_inplace(using_copy_on_write, val): + df = DataFrame({"a": Categorical([1, 2, 3])}) + arr_a = get_array(df, "a") + df.replace(to_replace=1, value=val, inplace=True) + + assert np.shares_memory(get_array(df, "a").codes, arr_a.codes) + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + + expected = DataFrame({"a": Categorical([val, 2, 3])}) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize("val", [1, 1.5]) +def test_replace_categorical(using_copy_on_write, val): + df = DataFrame({"a": Categorical([1, 2, 3])}) + df_orig = df.copy() + df2 = df.replace(to_replace=1, value=val) + + if using_copy_on_write: + assert df._mgr._has_no_reference(0) + assert df2._mgr._has_no_reference(0) + assert not np.shares_memory(get_array(df, "a").codes, get_array(df2, "a").codes) + tm.assert_frame_equal(df, df_orig) + + arr_a = get_array(df2, "a").codes + df2.iloc[0, 0] = 2.0 + assert np.shares_memory(get_array(df2, "a").codes, arr_a) @pytest.mark.parametrize("method", ["where", "mask"])