diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0e6356044c6db..5d2a0fe66cc1d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9612,7 +9612,8 @@ def _where( # align the cond to same shape as myself cond = common.apply_if_callable(cond, self) if isinstance(cond, NDFrame): - cond, _ = cond.align(self, join="right", broadcast_axis=1, copy=False) + # CoW: Make sure reference is not kept alive + cond = cond.align(self, join="right", broadcast_axis=1, copy=False)[0] else: if not hasattr(cond, "shape"): cond = np.asanyarray(cond) @@ -9648,14 +9649,15 @@ def _where( # align with me if other.ndim <= self.ndim: - _, other = self.align( + # CoW: Make sure reference is not kept alive + other = self.align( other, join="left", axis=axis, level=level, fill_value=None, copy=False, - ) + )[1] # if we are NOT aligned, raise as we cannot where index if axis is None and not other._indexed_same(self): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 401b9e60308be..574e633bc4ba3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -946,7 +946,7 @@ def _unstack( # --------------------------------------------------------------------- - def setitem(self, indexer, value) -> Block: + def setitem(self, indexer, value, using_cow: bool = False) -> Block: """ Attempt self.values[indexer] = value, possibly creating a new array. @@ -956,6 +956,8 @@ def setitem(self, indexer, value) -> Block: The subset of self.values to set value : object The value being set + using_cow: bool, default False + Signaling if CoW is used. Returns ------- @@ -991,10 +993,17 @@ def setitem(self, indexer, value) -> Block: # checking lib.is_scalar here fails on # test_iloc_setitem_custom_object casted = setitem_datetimelike_compat(values, len(vi), casted) + + if using_cow and self.refs.has_reference(): + values = values.copy() + self = self.make_block_same_class( + values.T if values.ndim == 2 else values + ) + values[indexer] = casted return self - def putmask(self, mask, new) -> list[Block]: + def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: """ putmask the data to the block; it is possible that we may create a new dtype of block @@ -1022,11 +1031,21 @@ def putmask(self, mask, new) -> list[Block]: new = extract_array(new, extract_numpy=True) if noop: + if using_cow: + return [self.copy(deep=False)] return [self] try: casted = np_can_hold_element(values.dtype, new) + + if using_cow and self.refs.has_reference(): + # Do this here to avoid copying twice + values = values.copy() + self = self.make_block_same_class(values) + putmask_without_repeat(values.T, mask, casted) + if using_cow: + return [self.copy(deep=False)] return [self] except LossySetitemError: @@ -1038,7 +1057,7 @@ def putmask(self, mask, new) -> list[Block]: return self.coerce_to_target_dtype(new).putmask(mask, new) else: indexer = mask.nonzero()[0] - nb = self.setitem(indexer, new[indexer]) + nb = self.setitem(indexer, new[indexer], using_cow=using_cow) return [nb] else: @@ -1053,7 +1072,7 @@ def putmask(self, mask, new) -> list[Block]: n = new[:, i : i + 1] submask = orig_mask[:, i : i + 1] - rbs = nb.putmask(submask, n) + rbs = nb.putmask(submask, n, using_cow=using_cow) res_blocks.extend(rbs) return res_blocks @@ -1448,7 +1467,7 @@ class EABackedBlock(Block): values: ExtensionArray - def setitem(self, indexer, value): + def setitem(self, indexer, value, using_cow: bool = False): """ Attempt self.values[indexer] = value, possibly creating a new array. @@ -1461,6 +1480,8 @@ def setitem(self, indexer, value): The subset of self.values to set value : object The value being set + using_cow: bool, default False + Signaling if CoW is used. Returns ------- @@ -1567,7 +1588,7 @@ def where(self, other, cond, _downcast: str | bool = "infer") -> list[Block]: nb = self.make_block_same_class(res_values) return [nb] - def putmask(self, mask, new) -> list[Block]: + def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: """ See Block.putmask.__doc__ """ @@ -1585,8 +1606,16 @@ def putmask(self, mask, new) -> list[Block]: mask = self._maybe_squeeze_arg(mask) if not mask.any(): + if using_cow: + return [self.copy(deep=False)] return [self] + if using_cow and self.refs.has_reference(): + values = values.copy() + self = self.make_block_same_class( # type: ignore[assignment] + values.T if values.ndim == 2 else values + ) + try: # Caller is responsible for ensuring matching lengths values._putmask(mask, new) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 1439c174b4f43..791f282517e33 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -362,14 +362,6 @@ def setitem(self: T, indexer, value) -> T: return self.apply("setitem", indexer=indexer, value=value) def putmask(self, mask, new, align: bool = True): - if using_copy_on_write() and any( - not self._has_no_reference_block(i) for i in range(len(self.blocks)) - ): - # some reference -> copy full dataframe - # TODO(CoW) this could be optimized to only copy the blocks that would - # get modified - self = self.copy() - if align: align_keys = ["new", "mask"] else: @@ -381,6 +373,7 @@ def putmask(self, mask, new, align: bool = True): align_keys=align_keys, mask=mask, new=new, + using_cow=using_copy_on_write(), ) def diff(self: T, n: int, axis: AxisInt) -> T: diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 6b54345723118..bd64d52ec157c 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1237,8 +1237,9 @@ def test_replace(using_copy_on_write, replace_kwargs): tm.assert_frame_equal(df, df_orig) -def test_putmask(using_copy_on_write): - df = DataFrame({"a": [1, 2], "b": 1, "c": 2}) +@pytest.mark.parametrize("dtype", ["int64", "Int64"]) +def test_putmask(using_copy_on_write, dtype): + df = DataFrame({"a": [1, 2], "b": 1, "c": 2}, dtype=dtype) view = df[:] df_orig = df.copy() df[df == df] = 5 @@ -1252,6 +1253,51 @@ def test_putmask(using_copy_on_write): assert view.iloc[0, 0] == 5 +@pytest.mark.parametrize("dtype", ["int64", "Int64"]) +def test_putmask_no_reference(using_copy_on_write, dtype): + df = DataFrame({"a": [1, 2], "b": 1, "c": 2}, dtype=dtype) + arr_a = get_array(df, "a") + df[df == df] = 5 + + if using_copy_on_write: + assert np.shares_memory(arr_a, get_array(df, "a")) + + +@pytest.mark.parametrize("dtype", ["float64", "Float64"]) +def test_putmask_aligns_rhs_no_reference(using_copy_on_write, dtype): + df = DataFrame({"a": [1.5, 2], "b": 1.5}, dtype=dtype) + arr_a = get_array(df, "a") + df[df == df] = DataFrame({"a": [5.5, 5]}) + + if using_copy_on_write: + assert np.shares_memory(arr_a, get_array(df, "a")) + + +@pytest.mark.parametrize("val, exp", [(5.5, True), (5, False)]) +def test_putmask_dont_copy_some_blocks(using_copy_on_write, val, exp): + df = DataFrame({"a": [1, 2], "b": 1, "c": 1.5}) + view = df[:] + df_orig = df.copy() + indexer = DataFrame( + [[True, False, False], [True, False, False]], columns=list("abc") + ) + df[indexer] = val + + if using_copy_on_write: + assert not np.shares_memory(get_array(view, "a"), get_array(df, "a")) + # TODO(CoW): Could split blocks to avoid copying the whole block + assert np.shares_memory(get_array(view, "b"), get_array(df, "b")) is exp + assert np.shares_memory(get_array(view, "c"), get_array(df, "c")) + assert df._mgr._has_no_reference(1) is not exp + assert not df._mgr._has_no_reference(2) + tm.assert_frame_equal(view, df_orig) + elif val == 5: + # Without CoW the original will be modified, the other case upcasts, e.g. copy + assert np.shares_memory(get_array(view, "a"), get_array(df, "a")) + assert np.shares_memory(get_array(view, "c"), get_array(df, "c")) + assert view.iloc[0, 0] == 5 + + def test_asfreq_noop(using_copy_on_write): df = DataFrame( {"a": [0.0, None, 2.0, 3.0]},