pandas-dev · lithomas1 · Feb 26, 2023 · Jan 20, 2023 · Jan 20, 2023 · Jan 20, 2023
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -11,6 +11,7 @@
     cast,
     final,
 )
+import weakref
 
 import numpy as np
 
@@ -152,6 +153,7 @@ class Block(PandasObject):
     is_extension = False
     _can_consolidate = True
     _validate_ndim = True
+    _ref = None
 
     @final
     @cache_readonly
@@ -523,6 +525,8 @@ def replace(
         inplace: bool = False,
         # mask may be pre-computed if we're called from replace_list
         mask: npt.NDArray[np.bool_] | None = None,
+        original_blocks: list[Block] | None = None,
+        using_copy_on_write: bool = False,
     ) -> list[Block]:
         """
         replace the to_replace value with value, possible to create new
@@ -549,17 +553,40 @@ def replace(
             #  replacing it is a no-op.
             # Note: If to_replace were a list, NDFrame.replace would call
             #  replace_list instead of replace.
-            return [self] if inplace else [self.copy()]
+            if using_copy_on_write and original_blocks:
+                # Nothing to replace: hand back a shallow copy and remember, via a weak
+                # reference, which of the original blocks it came from.
+                result = self.copy(deep=False)
+                result._ref = weakref.ref(original_blocks[self.mgr_locs.as_array[0]])
+                return [result]
+            else:
+                return [self] if inplace else [self.copy()]
 
         if mask is None:
             mask = missing.mask_missing(values, to_replace)
         if not mask.any():
             # Note: we get here with test_replace_extension_other incorrectly
             #  bc _can_hold_element is incorrect.
-            return [self] if inplace else [self.copy()]
+            if using_copy_on_write and original_blocks:
+                # Nothing to replace: hand back a shallow copy and remember, via a weak
+                # reference, which of the original blocks it came from.
+                result = self.copy(deep=False)
+                result._ref = weakref.ref(original_blocks[self.mgr_locs.as_array[0]])
+                return [result]
+            else:
+                return [self] if inplace else [self.copy()]
 
         elif self._can_hold_element(value):
-            blk = self if inplace else self.copy()
+            # TODO(CoW): Maybe split here as well into columns where mask has True
+            # and rest?
+            if using_copy_on_write:
+                if original_blocks:
+                    blk = self.copy()
+                else:
+                    # In case we made a copy before, e.g. coerce to target dtype
+                    blk = self
+            else:
+                blk = self if inplace else self.copy()
             putmask_inplace(blk.values, mask, value)
             if not (self.is_object and value is None):
                 # if the user *explicitly* gave None, we keep None, otherwise
@@ -584,6 +611,13 @@ def replace(
         else:
             # split so that we only upcast where necessary
             blocks = []
+            if original_blocks:
+                _original_blocks = [original_blocks[self.mgr_locs.as_array[0]]] * len(
+                    self
+                )
+            else:
+                _original_blocks = []  # NOTE(review): never read in the visible hunks
+
             for i, nb in enumerate(self._split()):
                 blocks.extend(
                     type(self).replace(
@@ -592,6 +626,8 @@ def replace(
                         value=value,
                         inplace=True,
                         mask=mask[i : i + 1],
+                        original_blocks=[self] * (self.mgr_locs.as_array.max() + 1),
+                        using_copy_on_write=using_copy_on_write,
                     )
                 )
             return blocks
@@ -645,6 +681,8 @@ def replace_list(
         dest_list: Sequence[Any],
         inplace: bool = False,
         regex: bool = False,
+        original_blocks: list[Block] | None = None,
+        using_copy_on_write: bool = False,
     ) -> list[Block]:
         """
         See BlockManager.replace_list docstring.
@@ -657,7 +695,14 @@ def replace_list(
         ]
         if not len(pairs):
             # shortcut, nothing to replace
-            return [self] if inplace else [self.copy()]
+            if using_copy_on_write and original_blocks:
+                # Nothing to replace: hand back a shallow copy and remember, via a weak
+                # reference, which of the original blocks it came from.
+                nb = self.copy(deep=False)
+                nb._ref = weakref.ref(original_blocks[self.mgr_locs.as_array[0]])
+                return [nb]
+            else:
+                return [self] if inplace else [self.copy()]
 
         src_len = len(pairs) - 1
 
@@ -678,7 +723,11 @@ def replace_list(
         # ndarray]"
         masks = [extract_bool_array(x) for x in masks]  # type: ignore[arg-type]
 
-        rb = [self if inplace else self.copy()]
+        if using_copy_on_write:
+            # TODO(CoW): Optimize to avoid as many copies as possible
+            rb = [self.copy()]
+        else:
+            rb = [self if inplace else self.copy()]
         for i, (src, dest) in enumerate(pairs):
             convert = i == src_len  # only convert once at the end
             new_rb: list[Block] = []
@@ -701,8 +750,10 @@ def replace_list(
                     to_replace=src,
                     value=dest,
                     mask=m,  # type: ignore[arg-type]
-                    inplace=inplace,
+                    inplace=True,  # We already made a copy if inplace=False
                     regex=regex,
+                    # TODO(CoW): Optimize to avoid as many copies as possible
+                    using_copy_on_write=False,
                 )
                 if convert and blk.is_object and not all(x is None for x in dest_list):
                     # GH#44498 avoid unwanted cast-back
@@ -719,6 +770,8 @@ def _replace_coerce(
         mask: npt.NDArray[np.bool_],
         inplace: bool = True,
         regex: bool = False,
+        original_blocks: list[Block] | None = None,
+        using_copy_on_write: bool = False,
     ) -> list[Block]:
         """
         Replace value corresponding to the given boolean array with another
@@ -760,7 +813,12 @@ def _replace_coerce(
                     return [nb]
                 return [self] if inplace else [self.copy()]
             return self.replace(
-                to_replace=to_replace, value=value, inplace=inplace, mask=mask
+                to_replace=to_replace,
+                value=value,
+                inplace=inplace,
+                mask=mask,
+                original_blocks=original_blocks,
+                using_copy_on_write=using_copy_on_write,
             )
 
     # ---------------------------------------------------------------------

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -449,10 +449,36 @@ def replace(self: T, to_replace, value, inplace: bool) -> T:
         # NDFrame.replace ensures the not-is_list_likes here
         assert not is_list_like(to_replace)
         assert not is_list_like(value)
-        return self.apply(
-            "replace", to_replace=to_replace, value=value, inplace=inplace
+        return self._call_function_and_update_refs(
+            "replace",
+            to_replace=to_replace,
+            value=value,
+            inplace=inplace,
         )
 
+    def _call_function_and_update_refs(self, func, **kwargs):
+        """Run ``func`` via ``apply`` and set refs/parent when a block has ``_ref``."""
+        cow_enabled = bool(using_copy_on_write())
+        if not cow_enabled:
+            source_blocks = []
+        elif self.is_single_block:
+            source_blocks = [self.blocks[0]] * self.shape[0]
+        else:
+            source_blocks = [self.blocks[i] for i in self.blknos]
+
+        result = self.apply(
+            func,
+            **kwargs,
+            original_blocks=source_blocks,
+            using_copy_on_write=cow_enabled,
+        )
+        # Only attach refs/parent when at least one returned block carries a ref.
+        block_refs = [getattr(blk, "_ref", None) for blk in result.blocks]
+        if not all(ref is None for ref in block_refs):
+            result.refs = block_refs
+            result.parent = self
+        return result
+
     def replace_regex(self, **kwargs):
         return self.apply("_replace_regex", **kwargs)
 
@@ -466,14 +492,15 @@ def replace_list(
         """do a list replace"""
         inplace = validate_bool_kwarg(inplace, "inplace")
 
-        bm = self.apply(
+        bm = self._call_function_and_update_refs(
             "replace_list",
             src_list=src_list,
             dest_list=dest_list,
             inplace=inplace,
             regex=regex,
         )
-        bm._consolidate_inplace()
+        if not using_copy_on_write():
+            bm._consolidate_inplace()
         return bm
 
     def to_native_types(self: T, **kwargs) -> T:

diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py
@@ -1102,6 +1102,71 @@ def test_replace(using_copy_on_write, replace_kwargs):
     tm.assert_frame_equal(df, df_orig)
 
 
+def test_replace_mask_all_false_second_block(using_copy_on_write):
+    # 1.5 only occurs in column "a"; columns "c" and "d" contain no match.
+    df = DataFrame({"a": [1.5, 2, 3], "b": 100.5, "c": 1, "d": 2})
+    df_orig = df.copy()
+    df2 = df.replace(to_replace=1.5, value=55.5)
+
+    c_shared = np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
+    a_shared = np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
+    # "c" is expected to be shared only under CoW; "a" is never shared.
+    assert c_shared == bool(using_copy_on_write)
+    assert not a_shared
+
+    # Writing to df2 must never leak into the original.
+    df2.loc[0, "c"] = 0.5
+    tm.assert_frame_equal(df, df_orig)
+
+    if not using_copy_on_write:
+        return
+    assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
+    # TODO: This should split and not copy the whole block
+    # assert np.shares_memory(get_array(df, "d"), get_array(df2, "d"))
+
+
+def test_replace_coerce_single_column(using_copy_on_write, using_array_manager):
+    # 1.5 is replaced by a string; only column "a" contains a match.
+    df = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
+    df_orig = df.copy()
+    df2 = df.replace(to_replace=1.5, value="a")
+
+    if using_copy_on_write or not using_array_manager:
+        # Same expectation for CoW and for the non-CoW, non-array-manager case.
+        assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
+        assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
+
+    if not using_copy_on_write:
+        return
+
+    # After writing to df2, "b" must no longer be shared and df stays unchanged.
+    df2.loc[0, "b"] = 0.5
+    tm.assert_frame_equal(df, df_orig)
+    assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
+
+
+@pytest.mark.parametrize("to_replace", ["xxx", ["xxx"]])
+def test_replace_to_replace_wrong_dtype(using_copy_on_write, to_replace):
+    # Neither float column contains the string "xxx".
+    df = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
+    df_orig = df.copy()
+    df2 = df.replace(to_replace=to_replace, value=1.5)
+
+    # Both columns are expected to be shared with the original only under CoW.
+    expect_shared = bool(using_copy_on_write)
+    for col in ("b", "a"):
+        shared = np.shares_memory(get_array(df, col), get_array(df2, col))
+        assert shared == expect_shared
+
+    # Writing to df2 must never leak into the original.
+    df2.loc[0, "b"] = 0.5
+    tm.assert_frame_equal(df, df_orig)
+
+    if using_copy_on_write:
+        # After the write above, column "b" must no longer be shared.
+        assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
+
+
 def test_putmask(using_copy_on_write):
     df = DataFrame({"a": [1, 2], "b": 1, "c": 2})
     view = df[:]