From ea0bff8f9259ddae6591bec330077ed5e7a6c7c5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 6 May 2022 14:09:32 +0200 Subject: [PATCH 01/27] API: New copy / view semantics using Copy-on-Write --- pandas/_libs/internals.pyx | 13 +- pandas/conftest.py | 8 + pandas/core/config_init.py | 17 + pandas/core/frame.py | 22 +- pandas/core/generic.py | 13 +- pandas/core/indexing.py | 13 + pandas/core/internals/managers.py | 184 +++++- pandas/core/internals/ops.py | 2 +- pandas/core/series.py | 9 +- pandas/tests/frame/indexing/test_indexing.py | 47 +- .../multiindex/test_chaining_and_caching.py | 17 +- .../tests/indexing/multiindex/test_partial.py | 16 +- .../tests/indexing/multiindex/test_setitem.py | 39 +- .../indexing/test_chaining_and_caching.py | 195 ++++-- pandas/tests/indexing/test_copy_on_write.py | 595 ++++++++++++++++++ pandas/tests/indexing/test_iat.py | 5 +- pandas/tests/indexing/test_iloc.py | 43 +- pandas/tests/indexing/test_loc.py | 35 +- 18 files changed, 1125 insertions(+), 148 deletions(-) create mode 100644 pandas/tests/indexing/test_copy_on_write.py diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index be71fe53d35db..a0a5ae96f517a 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -1,4 +1,5 @@ from collections import defaultdict +import weakref cimport cython from cpython.slice cimport PySlice_GetIndicesEx @@ -672,8 +673,9 @@ cdef class BlockManager: public list axes public bint _known_consolidated, _is_consolidated public ndarray _blknos, _blklocs + public list refs - def __cinit__(self, blocks=None, axes=None, verify_integrity=True): + def __cinit__(self, blocks=None, axes=None, refs=None, verify_integrity=True): # None as defaults for unpickling GH#42345 if blocks is None: # This adds 1-2 microseconds to DataFrame(np.array([])) @@ -685,6 +687,7 @@ cdef class BlockManager: self.blocks = blocks self.axes = axes.copy() # copy to make sure we are not remotely-mutable + self.refs = refs # Populate known_consolidate, blknos, and blklocs lazily self._known_consolidated = False @@ -793,12 +796,14 @@ cdef class BlockManager: ndarray blknos, blklocs nbs = [] + nrefs = [] for blk in self.blocks: nb = blk.getitem_block_index(slobj) nbs.append(nb) + nrefs.append(weakref.ref(blk)) new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)] - mgr = type(self)(tuple(nbs), new_axes, verify_integrity=False) + mgr = type(self)(tuple(nbs), new_axes, nrefs, verify_integrity=False) # We can avoid having to rebuild blklocs/blknos blklocs = self._blklocs @@ -811,7 +816,7 @@ cdef class BlockManager: def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: if axis == 0: - new_blocks = self._slice_take_blocks_ax0(slobj) + new_blocks, new_refs = self._slice_take_blocks_ax0(slobj) elif axis == 1: return self._get_index_slice(slobj) else: @@ -820,4 +825,4 @@ cdef class BlockManager: new_axes = list(self.axes) new_axes[axis] = new_axes[axis]._getitem_slice(slobj) - return type(self)(tuple(new_blocks), new_axes, verify_integrity=False) + return type(self)(tuple(new_blocks), new_axes, new_refs, verify_integrity=False) diff --git a/pandas/conftest.py b/pandas/conftest.py index 9d98478010c97..a35a1bb184551 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1796,3 +1796,11 @@ def using_array_manager(): Fixture to check if the array manager is being used. """ return pd.options.mode.data_manager == "array" + + +@pytest.fixture +def using_copy_on_write(): + """ + Fixture to check if the array manager is being used. + """ + return pd.options.mode.copy_on_write diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index dd106b6dbb63c..7aa963ed23fda 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -527,6 +527,23 @@ def use_inf_as_na_cb(key): ) +# TODO better name? +copy_on_write_doc = """ +: bool + Use new copy-view behaviour using Copy-on-Write +""" + + +with cf.config_prefix("mode"): + cf.register_option( + "copy_on_write", + # TODO turn to False before merging + True, + copy_on_write_doc, + validator=is_bool, + ) + + # user warnings chained_assignment = """ : string diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 84ea8df0b9b20..2da4523b96909 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3924,6 +3924,16 @@ def _set_value( Sets whether or not index/col interpreted as indexers """ try: + if get_option("mode.copy_on_write"): + if not takeable: + icol = self.columns.get_loc(col) + index = self.index.get_loc(index) + self._mgr.column_setitem(icol, index, value) + else: + self._mgr.column_setitem(col, index, value) + self._clear_item_cache() + return + if takeable: series = self._ixs(col, axis=1) loc = index @@ -4853,7 +4863,7 @@ def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): "labels", [ ("method", None), - ("copy", True), + ("copy", None), ("level", None), ("fill_value", np.nan), ("limit", None), @@ -5078,7 +5088,7 @@ def rename( index: Renamer | None = ..., columns: Renamer | None = ..., axis: Axis | None = ..., - copy: bool = ..., + copy: bool | None = ..., inplace: Literal[True], level: Level | None = ..., errors: IgnoreRaise = ..., @@ -5093,7 +5103,7 @@ def rename( index: Renamer | None = ..., columns: Renamer | None = ..., axis: Axis | None = ..., - copy: bool = ..., + copy: bool | None = ..., inplace: Literal[False] = ..., level: Level | None = ..., errors: IgnoreRaise = ..., @@ -5108,7 +5118,7 @@ def rename( index: Renamer | None = ..., columns: Renamer | None = ..., axis: Axis | None = ..., - copy: bool = ..., + copy: bool | None = ..., inplace: bool = ..., level: Level | None = ..., errors: IgnoreRaise = ..., @@ -5122,7 +5132,7 @@ def rename( index: Renamer | None = None, columns: Renamer | None = None, axis: Axis | None = None, - copy: bool = True, + copy: bool | None = None, inplace: bool = False, level: Level | None = None, errors: IgnoreRaise = "ignore", @@ -6015,7 +6025,7 @@ class max type if inplace: new_obj = self else: - new_obj = self.copy() + new_obj = self.copy(deep=None) if allow_duplicates is not lib.no_default: allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates") diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e1459a66a0f12..e2d171a2d6b66 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -979,7 +979,7 @@ def _rename( index: Renamer | None = None, columns: Renamer | None = None, axis: Axis | None = None, - copy: bool_t = True, + copy: bool_t | None = None, inplace: bool_t = False, level: Level | None = None, errors: str = "ignore", @@ -3905,6 +3905,9 @@ def _check_setitem_copy(self, t="setting", force=False): df.iloc[0:5]['group'] = 'a' """ + if config.get_option("mode.copy_on_write"): + return + # return early if the check is not needed if not (force or self._is_copy): return @@ -4950,7 +4953,7 @@ def reindex(self: NDFrameT, *args, **kwargs) -> NDFrameT: axes, kwargs = self._construct_axes_from_arguments(args, kwargs) method = missing.clean_reindex_fill_method(kwargs.pop("method", None)) level = kwargs.pop("level", None) - copy = kwargs.pop("copy", True) + copy = kwargs.pop("copy", None) limit = kwargs.pop("limit", None) tolerance = kwargs.pop("tolerance", None) fill_value = kwargs.pop("fill_value", None) @@ -4975,9 +4978,7 @@ def reindex(self: NDFrameT, *args, **kwargs) -> NDFrameT: for axis, ax in axes.items() if ax is not None ): - if copy: - return self.copy() - return self + return self.copy(deep=copy) # check if we are a multi reindex if self._needs_reindex_multi(axes, method, level): @@ -5954,7 +5955,7 @@ def astype( return cast(NDFrameT, result) @final - def copy(self: NDFrameT, deep: bool_t = True) -> NDFrameT: + def copy(self: NDFrameT, deep: bool_t | None = True) -> NDFrameT: """ Make a copy of this object's indices and data. diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ddae58fd46bb0..36c2b97387886 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -11,6 +11,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs.indexing import NDFrameIndexerBase from pandas._libs.lib import item_from_zerodim from pandas.errors import ( @@ -1948,6 +1950,17 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): """ pi = plane_indexer + if get_option("mode.copy_on_write"): + # CoW: in this case we cannot rely on getting the column as a + # Series to mutate, but need to operated on the mgr directly + if com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj)): + arr = self.obj._sanitize_column(value) + self.obj._mgr.iset(loc, arr) + else: + self.obj._mgr.column_setitem(loc, plane_indexer, value) + self.obj._clear_item_cache() + return + ser = self.obj._ixs(loc, axis=1) # perform the equivalent of a setitem on the info axis diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index ded525cd099fc..08c3bca971f83 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -10,9 +10,12 @@ cast, ) import warnings +import weakref import numpy as np +from pandas._config import get_option + from pandas._libs import ( algos as libalgos, internals as libinternals, @@ -141,16 +144,22 @@ class BaseBlockManager(DataManager): _blklocs: npt.NDArray[np.intp] blocks: tuple[Block, ...] axes: list[Index] + refs: list[weakref.ref | None] | None ndim: int _known_consolidated: bool _is_consolidated: bool - def __init__(self, blocks, axes, verify_integrity: bool = True) -> None: + def __init__(self, blocks, axes, refs=None, verify_integrity: bool = True) -> None: raise NotImplementedError @classmethod - def from_blocks(cls: type_t[T], blocks: list[Block], axes: list[Index]) -> T: + def from_blocks( + cls: type_t[T], + blocks: list[Block], + axes: list[Index], + refs: list[weakref.ref | None] | None, + ) -> T: raise NotImplementedError @property @@ -223,6 +232,25 @@ def is_single_block(self) -> bool: def items(self) -> Index: return self.axes[0] + def _has_no_reference(self, i: int) -> bool: + """ + Check for column `i` if has references. + (whether it references another array or is itself being referenced) + Returns True if the columns has no references. + """ + blkno = self.blknos[i] + # TODO include `or self.refs[blkno]() is None` ? + return ( + self.refs is None or self.refs[blkno] is None + ) and weakref.getweakrefcount(self.blocks[blkno]) == 0 + + def _clear_reference(self, blkno: int) -> None: + """ + Clear any reference for column `i`. + """ + if self.refs is not None: + self.refs[blkno] = None + def get_dtypes(self): dtypes = np.array([blk.dtype for blk in self.blocks]) return dtypes.take(self.blknos) @@ -337,9 +365,22 @@ def setitem(self: T, indexer, value) -> T: if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim: raise ValueError(f"Cannot set values with ndim > {self.ndim}") + if not self._has_no_reference(0): + # if being referenced -> perform Copy-on-Write and clear the reference + # self.blocks = tuple([self.blocks[0].copy()]) + self = self.copy() + # self._clear_reference(blkno) + return self.apply("setitem", indexer=indexer, value=value) def putmask(self, mask, new, align: bool = True): + if self.refs is not None and not all(ref is None for ref in self.refs): + # some reference -> copy full dataframe + # TODO this could be optimized to only copy the blocks that would + # get modified + # self.blocks = tuple([blk.copy() for blk in self.blocks]) + # self.refs = None + self = self.copy() if align: align_keys = ["new", "mask"] @@ -544,14 +585,22 @@ def copy(self: T, deep=True) -> T: Parameters ---------- - deep : bool or string, default True - If False, return shallow copy (do not copy data) + deep : bool, string or None, default True + If False or None, return a shallow copy (do not copy data) If 'all', copy data and a deep copy of the index Returns ------- BlockManager """ + if deep is None: + if get_option("mode.copy_on_write"): + # use shallow copy + deep = False + else: + # preserve deep copy for BlockManager with copy=None + deep = True + # this preserves the notion of view copying of axes if deep: # hit in e.g. tests.io.json.test_pandas @@ -564,8 +613,13 @@ def copy_func(ax): new_axes = list(self.axes) res = self.apply("copy", deep=deep) + if deep: + new_refs = None + else: + new_refs = [weakref.ref(blk) for blk in self.blocks] res.axes = new_axes + res.refs = new_refs if self.ndim > 1: # Avoid needing to re-compute these @@ -601,7 +655,7 @@ def reindex_indexer( axis: int, fill_value=None, allow_dups: bool = False, - copy: bool = True, + copy: bool | None = True, only_slice: bool = False, *, use_na_proxy: bool = False, @@ -614,7 +668,8 @@ def reindex_indexer( axis : int fill_value : object, default None allow_dups : bool, default False - copy : bool, default True + copy : bool or None, default True + If None, regard as False to get shallow copy. only_slice : bool, default False Whether to take views, not copies, along columns. use_na_proxy : bool, default False @@ -622,6 +677,14 @@ def reindex_indexer( pandas-indexer with -1's only. """ + if copy is None: + if get_option("mode.copy_on_write"): + # use shallow copy + copy = False + else: + # preserve deep copy for BlockManager with copy=None + copy = True + if indexer is None: if new_axis is self.axes[axis] and not copy: return self @@ -639,7 +702,7 @@ def reindex_indexer( raise IndexError("Requested axis not found in manager") if axis == 0: - new_blocks = self._slice_take_blocks_ax0( + new_blocks, new_refs = self._slice_take_blocks_ax0( indexer, fill_value=fill_value, only_slice=only_slice, @@ -656,11 +719,12 @@ def reindex_indexer( ) for blk in self.blocks ] + new_refs = None new_axes = list(self.axes) new_axes[axis] = new_axis - new_mgr = type(self).from_blocks(new_blocks, new_axes) + new_mgr = type(self).from_blocks(new_blocks, new_axes, new_refs) if axis == 1: # We can avoid the need to rebuild these new_mgr._blknos = self.blknos.copy() @@ -707,9 +771,11 @@ def _slice_take_blocks_ax0( # GH#32959 EABlock would fail since we can't make 0-width # TODO(EA2D): special casing unnecessary with 2D EAs if sllen == 0: - return [] + return [], [] bp = BlockPlacement(slice(0, sllen)) - return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)] + return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)], [ + weakref.ref(blk) + ] elif not allow_fill or self.ndim == 1: if allow_fill and fill_value is None: fill_value = blk.fill_value @@ -725,7 +791,7 @@ def _slice_take_blocks_ax0( ] # We have # all(np.shares_memory(nb.values, blk.values) for nb in blocks) - return blocks + return blocks, [weakref.ref(blk)] * len(blocks) else: bp = BlockPlacement(slice(0, sllen)) return [ @@ -735,7 +801,7 @@ def _slice_take_blocks_ax0( new_mgr_locs=bp, fill_value=fill_value, ) - ] + ], [None] if sl_type == "slice": blknos = self.blknos[slobj] @@ -751,6 +817,7 @@ def _slice_take_blocks_ax0( # When filling blknos, make sure blknos is updated before appending to # blocks list, that way new blkno is exactly len(blocks). blocks = [] + refs = [] group = not only_slice for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group): if blkno == -1: @@ -763,6 +830,7 @@ def _slice_take_blocks_ax0( use_na_proxy=use_na_proxy, ) ) + refs.append(None) else: blk = self.blocks[blkno] @@ -776,18 +844,20 @@ def _slice_take_blocks_ax0( newblk = blk.copy(deep=False) newblk.mgr_locs = BlockPlacement(slice(mgr_loc, mgr_loc + 1)) blocks.append(newblk) + refs.append(weakref.ref(blk)) else: # GH#32779 to avoid the performance penalty of copying, # we may try to only slice taker = blklocs[mgr_locs.indexer] max_len = max(len(mgr_locs), taker.max() + 1) - if only_slice: + if only_slice or get_option("mode.copy_on_write"): taker = lib.maybe_indices_to_slice(taker, max_len) if isinstance(taker, slice): nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs) blocks.append(nb) + refs.append(weakref.ref(blk)) elif only_slice: # GH#33597 slice instead of take, so we get # views instead of copies @@ -797,11 +867,13 @@ def _slice_take_blocks_ax0( nb = blk.getitem_block_columns(slc, new_mgr_locs=bp) # We have np.shares_memory(nb.values, blk.values) blocks.append(nb) + refs.append(weakref.ref(blk)) else: nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) blocks.append(nb) + refs.append(None) - return blocks + return blocks, refs def _make_na_block( self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False @@ -868,6 +940,7 @@ def take( indexer=indexer, axis=axis, allow_dups=True, + copy=None, ) @@ -885,6 +958,7 @@ def __init__( self, blocks: Sequence[Block], axes: Sequence[Index], + refs: list[weakref.ref | None] | None = None, verify_integrity: bool = True, ) -> None: @@ -934,13 +1008,24 @@ def _verify_integrity(self) -> None: f"block items\n# manager items: {len(self.items)}, # " f"tot_items: {tot_items}" ) + if self.refs is not None: + if len(self.refs) != len(self.blocks): + raise ValueError( + "Number of passed refs must equal the number of blocks: " + f"{len(self.refs)} refs vs {len(self.blocks)} blocks." + ) @classmethod - def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> BlockManager: + def from_blocks( + cls, + blocks: list[Block], + axes: list[Index], + refs: list[weakref.ref | None] | None = None, + ) -> BlockManager: """ Constructor for BlockManager and SingleBlockManager with same signature. """ - return cls(blocks, axes, verify_integrity=False) + return cls(blocks, axes, refs, verify_integrity=False) # ---------------------------------------------------------------- # Indexing @@ -988,7 +1073,7 @@ def iget(self, i: int) -> SingleBlockManager: # shortcut for select a single-dim from a 2-dim BM bp = BlockPlacement(slice(0, len(values))) nb = type(block)(values, placement=bp, ndim=1) - return SingleBlockManager(nb, self.axes[1]) + return SingleBlockManager(nb, self.axes[1], [weakref.ref(block)]) def iget_values(self, i: int) -> ArrayLike: """ @@ -1273,10 +1358,23 @@ def idelete(self, indexer) -> BlockManager: is_deleted[indexer] = True taker = (~is_deleted).nonzero()[0] - nbs = self._slice_take_blocks_ax0(taker, only_slice=True) + nbs, new_refs = self._slice_take_blocks_ax0(taker, only_slice=True) new_columns = self.items[~is_deleted] axes = [new_columns, self.axes[1]] - return type(self)(tuple(nbs), axes, verify_integrity=False) + return type(self)(tuple(nbs), axes, new_refs, verify_integrity=False) + + def column_setitem(self, loc: int, idx: int | slice | np.ndarray, value): + if not self._has_no_reference(loc): + # otherwise perform Copy-on-Write and clear the reference + blkno = self.blknos[loc] + blocks = list(self.blocks) + blocks[blkno] = blocks[blkno].copy() + self.blocks = tuple(blocks) + self._clear_reference(blkno) + + col_mgr = self.iget(loc) + new_mgr = col_mgr.setitem((idx,), value) + self.iset(loc, new_mgr._block.values, inplace=True) # ---------------------------------------------------------------- # Block-wise Operation @@ -1674,6 +1772,7 @@ def __init__( self, block: Block, axis: Index, + refs: list[weakref.ref | None] | None = None, verify_integrity: bool = False, fastpath=lib.no_default, ) -> None: @@ -1691,15 +1790,23 @@ def __init__( self.axes = [axis] self.blocks = (block,) + self.refs = refs @classmethod - def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> SingleBlockManager: + def from_blocks( + cls, + blocks: list[Block], + axes: list[Index], + refs: list[weakref.ref | None] | None = None, + ) -> SingleBlockManager: """ Constructor for BlockManager and SingleBlockManager with same signature. """ assert len(blocks) == 1 assert len(axes) == 1 - return cls(blocks[0], axes[0], verify_integrity=False) + if refs is not None: + assert len(refs) == 1 + return cls(blocks[0], axes[0], refs, verify_integrity=False) @classmethod def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager: @@ -1720,6 +1827,16 @@ def to_2d_mgr(self, columns: Index) -> BlockManager: axes = [columns, self.axes[0]] return BlockManager([new_blk], axes=axes, verify_integrity=False) + def _has_no_reference(self, i: int = 0) -> bool: + """ + Check for column `i` if has references. + (whether it references another array or is itself being referenced) + Returns True if the columns has no references. + """ + return (self.refs is None or self.refs[0] is None) and weakref.getweakrefcount( + self.blocks[0] + ) == 0 + def __getstate__(self): block_values = [b.values for b in self.blocks] block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks] @@ -1788,7 +1905,9 @@ def getitem_mgr(self, indexer: slice | npt.NDArray[np.bool_]) -> SingleBlockMana block = type(blk)(array, placement=bp, ndim=1) new_idx = self.index[indexer] - return type(self)(block, new_idx) + # TODO in theory only need to track reference if new_array is a view + ref = weakref.ref(blk) + return type(self)(block, new_idx, [ref]) def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager: # Assertion disabled for performance @@ -1801,7 +1920,7 @@ def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager: bp = BlockPlacement(slice(0, len(array))) block = type(blk)(array, placement=bp, ndim=1) new_index = self.index._getitem_slice(slobj) - return type(self)(block, new_index) + return type(self)(block, new_index, [weakref.ref(blk)]) @property def index(self) -> Index: @@ -1837,6 +1956,23 @@ def get_numeric_data(self, copy: bool = False): def _can_hold_na(self) -> bool: return self._block._can_hold_na + def setitem_inplace(self, indexer, value) -> None: + """ + Set values with indexer. + + For Single[Block/Array]Manager, this backs s[indexer] = value + + This is an inplace version of `setitem()`, mutating the manager/values + in place, not returning a new Manager (and Block), and thus never changing + the dtype. + """ + if not self._has_no_reference(0): + self.blocks = (self._block.copy(),) + self.refs = None + self._cache.clear() + + super().setitem_inplace(indexer, value) + def idelete(self, indexer) -> SingleBlockManager: """ Delete single location from SingleBlockManager. @@ -1847,6 +1983,8 @@ def idelete(self, indexer) -> SingleBlockManager: self.blocks = (nb,) self.axes[0] = self.axes[0].delete(indexer) self._cache.clear() + # clear reference since delete always results in a new array + self.refs = None return self def fast_xs(self, loc): diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index 1160d3b2a8e3a..5febb302a9de9 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -36,7 +36,7 @@ def _iter_block_pairs( left_ea = blk_vals.ndim == 1 - rblks = right._slice_take_blocks_ax0(locs.indexer, only_slice=True) + rblks, _ = right._slice_take_blocks_ax0(locs.indexer, only_slice=True) # Assertions are disabled for performance, but should hold: # if left_ea: diff --git a/pandas/core/series.py b/pandas/core/series.py index 1d3509cac0edd..96a5101e17c1e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1287,7 +1287,14 @@ def _maybe_update_cacher( # a copy if ref is None: del self._cacher - elif len(self) == len(ref) and self.name in ref.columns: + # for CoW, we never want to update the parent DataFrame cache + # if the Series changed, and always pop the cached item + # TODO replace False with check for option + elif ( + not get_option("mode.copy_on_write") + and len(self) == len(ref) + and self.name in ref.columns + ): # GH#42530 self.name must be in ref.columns # to ensure column still in dataframe # otherwise, either self or ref has swapped in new arrays diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index c86150d10fde4..0cd2bae1c00a3 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -267,7 +267,7 @@ def test_setattr_column(self): df.foobar = 5 assert (df.foobar == 5).all() - def test_setitem(self, float_frame): + def test_setitem(self, float_frame, using_copy_on_write): # not sure what else to do here series = float_frame["A"][::2] float_frame["col5"] = series @@ -303,8 +303,12 @@ def test_setitem(self, float_frame): smaller = float_frame[:2] msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_copy_on_write: + # With CoW, adding a new column doesn't raise a warning smaller["col10"] = ["1", "2"] + else: + with pytest.raises(com.SettingWithCopyError, match=msg): + smaller["col10"] = ["1", "2"] assert smaller["col10"].dtype == np.object_ assert (smaller["col10"] == ["1", "2"]).all() @@ -534,22 +538,29 @@ def test_getitem_setitem_integer_slice_keyerrors(self): df2.loc[3:11] = 0 @td.skip_array_manager_invalid_test # already covered in test_iloc_col_slice_view - def test_fancy_getitem_slice_mixed(self, float_frame, float_string_frame): + def test_fancy_getitem_slice_mixed( + self, float_frame, float_string_frame, using_copy_on_write + ): sliced = float_string_frame.iloc[:, -3:] assert sliced["D"].dtype == np.float64 # get view with single block # setting it triggers setting with copy + original = float_frame.copy() sliced = float_frame.iloc[:, -3:] assert np.shares_memory(sliced["C"]._values, float_frame["C"]._values) msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): - sliced.loc[:, "C"] = 4.0 + if not using_copy_on_write: + with pytest.raises(com.SettingWithCopyError, match=msg): + sliced.loc[:, "C"] = 4.0 - assert (float_frame["C"] == 4).all() + assert (float_frame["C"] == 4).all() + else: + sliced.loc[:, "C"] = 4.0 + tm.assert_frame_equal(float_frame, original) def test_getitem_setitem_non_ix_labels(self): df = tm.makeTimeDataFrame() @@ -989,7 +1000,7 @@ def test_iloc_row(self): expected = df.reindex(df.index[[1, 2, 4, 6]]) tm.assert_frame_equal(result, expected) - def test_iloc_row_slice_view(self, using_array_manager): + def test_iloc_row_slice_view(self, using_array_manager, using_copy_on_write): df = DataFrame(np.random.randn(10, 4), index=range(0, 20, 2)) original = df.copy() @@ -999,14 +1010,17 @@ def test_iloc_row_slice_view(self, using_array_manager): assert np.shares_memory(df[2], subset[2]) + exp_col = original[2].copy() msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_copy_on_write: subset.loc[:, 2] = 0.0 + else: + with pytest.raises(com.SettingWithCopyError, match=msg): + subset.loc[:, 2] = 0.0 - exp_col = original[2].copy() - # TODO(ArrayManager) verify it is expected that the original didn't change - if not using_array_manager: - exp_col._values[4:8] = 0.0 + # TODO(ArrayManager) verify it is expected that the original didn't change + if not using_array_manager: + exp_col._values[4:8] = 0.0 tm.assert_series_equal(df[2], exp_col) def test_iloc_col(self): @@ -1031,14 +1045,13 @@ def test_iloc_col(self): expected = df.reindex(columns=df.columns[[1, 2, 4, 6]]) tm.assert_frame_equal(result, expected) - def test_iloc_col_slice_view(self, using_array_manager): + def test_iloc_col_slice_view(self, using_array_manager, using_copy_on_write): df = DataFrame(np.random.randn(4, 10), columns=range(0, 20, 2)) original = df.copy() subset = df.iloc[:, slice(4, 8)] - if not using_array_manager: + if not using_array_manager and not using_copy_on_write: # verify slice is view - assert np.shares_memory(df[8]._values, subset[8]._values) # and that we are setting a copy @@ -1048,7 +1061,9 @@ def test_iloc_col_slice_view(self, using_array_manager): assert (df[8] == 0).all() else: - # TODO(ArrayManager) verify this is the desired behaviour + if using_copy_on_write: + # verify slice is view + assert np.shares_memory(df[8]._values, subset[8]._values) subset[8] = 0.0 # subset changed assert (subset[8] == 0).all() diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py index 6ccd44e698a8a..e517ce394e73d 100644 --- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py +++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py @@ -12,7 +12,7 @@ import pandas.core.common as com -def test_detect_chained_assignment(): +def test_detect_chained_assignment(using_copy_on_write): # Inplace ops, originally from: # https://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug a = [12, 23] @@ -29,17 +29,21 @@ def test_detect_chained_assignment(): multiind = MultiIndex.from_tuples(tuples, names=["part", "side"]) zed = DataFrame(events, index=["a", "b"], columns=multiind) - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_copy_on_write: zed["eyes"]["right"].fillna(value=555, inplace=True) + else: + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): + zed["eyes"]["right"].fillna(value=555, inplace=True) @td.skip_array_manager_invalid_test # with ArrayManager df.loc[0] is not a view -def test_cache_updating(): +def test_cache_updating(using_copy_on_write): # 5216 # make sure that we don't try to set a dead cache a = np.random.rand(10, 3) df = DataFrame(a, columns=["x", "y", "z"]) + df_original = df.copy() tuples = [(i, j) for i in range(5) for j in range(2)] index = MultiIndex.from_tuples(tuples) df.index = index @@ -48,7 +52,10 @@ def test_cache_updating(): # but actually works, since everything is a view df.loc[0]["z"].iloc[0] = 1.0 result = df.loc[(0, 0), "z"] - assert result == 1 + if using_copy_on_write: + assert result == df_original.loc[0, "z"] + else: + assert result == 1 # correct setting df.loc[(0, 0), "z"] = 2 diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py index 9d5e65e692fdc..cface630c6647 100644 --- a/pandas/tests/indexing/multiindex/test_partial.py +++ b/pandas/tests/indexing/multiindex/test_partial.py @@ -122,26 +122,32 @@ def test_getitem_partial_column_select(self): # TODO(ArrayManager) rewrite test to not use .values # exp.loc[2000, 4].values[:] select multiple columns -> .values is not a view @td.skip_array_manager_invalid_test - def test_partial_set(self, multiindex_year_month_day_dataframe_random_data): + def test_partial_set( + self, multiindex_year_month_day_dataframe_random_data, using_copy_on_write + ): # GH #397 ymd = multiindex_year_month_day_dataframe_random_data df = ymd.copy() exp = ymd.copy() df.loc[2000, 4] = 0 - exp.loc[2000, 4].values[:] = 0 + exp.iloc[65:85] = 0 tm.assert_frame_equal(df, exp) df["A"].loc[2000, 4] = 1 - exp["A"].loc[2000, 4].values[:] = 1 + if not using_copy_on_write: + exp["A"].loc[2000, 4].values[:] = 1 tm.assert_frame_equal(df, exp) df.loc[2000] = 5 - exp.loc[2000].values[:] = 5 + exp.iloc[:100] = 5 tm.assert_frame_equal(df, exp) # this works...for now df["A"].iloc[14] = 5 - assert df["A"].iloc[14] == 5 + if using_copy_on_write: + df["A"].iloc[14] == exp["A"].iloc[14] + else: + assert df["A"].iloc[14] == 5 @pytest.mark.parametrize("dtype", [int, float]) def test_getitem_intkey_leading_level( diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index a9af83fa632f2..e2ef86fe13b98 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -196,7 +196,7 @@ def test_multiindex_assignment(self): df.loc[4, "d"] = arr tm.assert_series_equal(df.loc[4, "d"], Series(arr, index=[8, 10], name="d")) - def test_multiindex_assignment_single_dtype(self, using_array_manager): + def test_multiindex_assignment_single_dtype(self, using_copy_on_write): # GH3777 part 2b # single dtype arr = np.array([0.0, 1.0]) @@ -216,7 +216,8 @@ def test_multiindex_assignment_single_dtype(self, using_array_manager): tm.assert_series_equal(result, exp) # extra check for inplace-ness - tm.assert_numpy_array_equal(view, exp.values) + if not using_copy_on_write: + tm.assert_numpy_array_equal(view, exp.values) # arr + 0.5 cannot be cast losslessly to int, so we upcast df.loc[4, "c"] = arr + 0.5 @@ -405,16 +406,23 @@ def test_setitem_change_dtype(self, multiindex_dataframe_random_data): reindexed = dft.reindex(columns=[("foo", "two")]) tm.assert_series_equal(reindexed["foo", "two"], s > s.median()) - def test_set_column_scalar_with_loc(self, multiindex_dataframe_random_data): + def test_set_column_scalar_with_loc( + self, multiindex_dataframe_random_data, using_copy_on_write + ): frame = multiindex_dataframe_random_data subset = frame.index[[1, 4, 5]] frame.loc[subset] = 99 assert (frame.loc[subset].values == 99).all() + frame_original = frame.copy() col = frame["B"] col[subset] = 97 - assert (frame.loc[subset, "B"] == 97).all() + if using_copy_on_write: + # chained setitem doesn't work with CoW + tm.assert_frame_equal(frame, frame_original) + else: + assert (frame.loc[subset, "B"] == 97).all() def test_nonunique_assignment_1750(self): df = DataFrame( @@ -487,21 +495,32 @@ def test_frame_setitem_view_direct(multiindex_dataframe_random_data): assert (df["foo"].values == 0).all() -def test_frame_setitem_copy_raises(multiindex_dataframe_random_data): +def test_frame_setitem_copy_raises( + multiindex_dataframe_random_data, using_copy_on_write +): # will raise/warn as its chained assignment df = multiindex_dataframe_random_data.T - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_copy_on_write: + # TODO(CoW) it would be nice if this could still warn/raise df["foo"]["one"] = 2 + else: + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): + df["foo"]["one"] = 2 -def test_frame_setitem_copy_no_write(multiindex_dataframe_random_data): +def test_frame_setitem_copy_no_write( + multiindex_dataframe_random_data, using_copy_on_write +): frame = multiindex_dataframe_random_data.T expected = frame df = frame.copy() - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_copy_on_write: df["foo"]["one"] = 2 + else: + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): + df["foo"]["one"] = 2 result = df tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index c8837a617bd9a..3db3ca07c3d99 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -29,7 +29,7 @@ def random_text(nobs=100): class TestCaching: - def test_slice_consolidate_invalidate_item_cache(self): + def test_slice_consolidate_invalidate_item_cache(self, using_copy_on_write): # this is chained assignment, but will 'work' with option_context("chained_assignment", None): @@ -49,7 +49,11 @@ def test_slice_consolidate_invalidate_item_cache(self): # Assignment to wrong series df["bb"].iloc[0] = 0.17 df._clear_item_cache() - tm.assert_almost_equal(df["bb"][0], 0.17) + if not using_copy_on_write: + tm.assert_almost_equal(df["bb"][0], 0.17) + else: + # with ArrayManager, parent is not mutated with chained assignment + tm.assert_almost_equal(df["bb"][0], 2.2) @pytest.mark.parametrize("do_ref", [True, False]) def test_setitem_cache_updating(self, do_ref): @@ -68,7 +72,7 @@ def test_setitem_cache_updating(self, do_ref): assert df.loc[0, "c"] == 0.0 assert df.loc[7, "c"] == 1.0 - def test_setitem_cache_updating_slices(self): + def test_setitem_cache_updating_slices(self, using_copy_on_write): # GH 7084 # not updating cache on series setting with slices expected = DataFrame( @@ -89,12 +93,17 @@ def test_setitem_cache_updating_slices(self): # try via a chain indexing # this actually works out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014")) + out_original = out.copy() for ix, row in df.iterrows(): v = out[row["C"]][six:eix] + row["D"] out[row["C"]][six:eix] = v - tm.assert_frame_equal(out, expected) - tm.assert_series_equal(out["A"], expected["A"]) + if not using_copy_on_write: + tm.assert_frame_equal(out, expected) + tm.assert_series_equal(out["A"], expected["A"]) + else: + tm.assert_frame_equal(out, out_original) + tm.assert_series_equal(out["A"], out_original["A"]) out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014")) for ix, row in df.iterrows(): @@ -120,7 +129,7 @@ def test_altering_series_clears_parent_cache(self): class TestChaining: - def test_setitem_chained_setfault(self): + def test_setitem_chained_setfault(self, using_copy_on_write): # GH6026 data = ["right", "left", "left", "left", "right", "left", "timeout"] @@ -129,24 +138,38 @@ def test_setitem_chained_setfault(self): df = DataFrame({"response": np.array(data)}) mask = df.response == "timeout" df.response[mask] = "none" - tm.assert_frame_equal(df, DataFrame({"response": mdata})) + if using_copy_on_write: + tm.assert_frame_equal(df, DataFrame({"response": data})) + else: + tm.assert_frame_equal(df, DataFrame({"response": mdata})) recarray = np.rec.fromarrays([data], names=["response"]) df = DataFrame(recarray) mask = df.response == "timeout" df.response[mask] = "none" - tm.assert_frame_equal(df, DataFrame({"response": mdata})) + if using_copy_on_write: + tm.assert_frame_equal(df, DataFrame({"response": data})) + else: + tm.assert_frame_equal(df, DataFrame({"response": mdata})) df = DataFrame({"response": data, "response1": data}) + df_original = df.copy() mask = df.response == "timeout" df.response[mask] = "none" - tm.assert_frame_equal(df, DataFrame({"response": mdata, "response1": data})) + if using_copy_on_write: + tm.assert_frame_equal(df, df_original) + else: + tm.assert_frame_equal(df, DataFrame({"response": mdata, "response1": data})) # GH 6056 expected = DataFrame({"A": [np.nan, "bar", "bah", "foo", "bar"]}) df = DataFrame({"A": np.array(["foo", "bar", "bah", "foo", "bar"])}) df["A"].iloc[0] = np.nan result = df.head() + if using_copy_on_write: + expected = DataFrame({"A": ["foo", "bar", "bah", "foo", "bar"]}) + else: + expected = DataFrame({"A": [np.nan, "bar", "bah", "foo", "bar"]}) tm.assert_frame_equal(result, expected) df = DataFrame({"A": np.array(["foo", "bar", "bah", "foo", "bar"])}) @@ -155,7 +178,7 @@ def test_setitem_chained_setfault(self): tm.assert_frame_equal(result, expected) @pytest.mark.arm_slow - def test_detect_chained_assignment(self): + def test_detect_chained_assignment(self, using_copy_on_write): with option_context("chained_assignment", "raise"): # work with the chain @@ -163,14 +186,20 @@ def test_detect_chained_assignment(self): df = DataFrame( np.arange(4).reshape(2, 2), columns=list("AB"), dtype="int64" ) + df_original = df.copy() assert df._is_copy is None df["A"][0] = -5 df["A"][1] = -6 - tm.assert_frame_equal(df, expected) + if using_copy_on_write: + tm.assert_frame_equal(df, df_original) + else: + tm.assert_frame_equal(df, expected) @pytest.mark.arm_slow - def test_detect_chained_assignment_raises(self, using_array_manager): + def test_detect_chained_assignment_raises( + self, using_array_manager, using_copy_on_write + ): # test with the chaining df = DataFrame( @@ -179,9 +208,14 @@ def test_detect_chained_assignment_raises(self, using_array_manager): "B": np.array(np.arange(2, 4), dtype=np.float64), } ) + df_original = df.copy() assert df._is_copy is None - if not using_array_manager: + if using_copy_on_write: + df["A"][0] = -5 + df["A"][1] = -6 + tm.assert_frame_equal(df, df_original) + elif not using_array_manager: with pytest.raises(com.SettingWithCopyError, match=msg): df["A"][0] = -5 @@ -189,7 +223,6 @@ def test_detect_chained_assignment_raises(self, using_array_manager): df["A"][1] = np.nan assert df["A"]._is_copy is None - else: # INFO(ArrayManager) for ArrayManager it doesn't matter that it's # a mixed dataframe @@ -200,7 +233,7 @@ def test_detect_chained_assignment_raises(self, using_array_manager): tm.assert_frame_equal(df, expected) @pytest.mark.arm_slow - def test_detect_chained_assignment_fails(self): + def test_detect_chained_assignment_fails(self, using_copy_on_write): # Using a copy (the chain), fails df = DataFrame( @@ -210,11 +243,15 @@ def test_detect_chained_assignment_fails(self): } ) - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_copy_on_write: + # TODO(CoW) can we still warn here? df.loc[0]["A"] = -5 + else: + with pytest.raises(com.SettingWithCopyError, match=msg): + df.loc[0]["A"] = -5 @pytest.mark.arm_slow - def test_detect_chained_assignment_doc_example(self): + def test_detect_chained_assignment_doc_example(self, using_copy_on_write): # Doc example df = DataFrame( @@ -225,30 +262,43 @@ def test_detect_chained_assignment_doc_example(self): ) assert df._is_copy is None - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_copy_on_write: + # TODO(CoW) can we still warn here? indexer = df.a.str.startswith("o") df[indexer]["c"] = 42 + else: + with pytest.raises(com.SettingWithCopyError, match=msg): + indexer = df.a.str.startswith("o") + df[indexer]["c"] = 42 @pytest.mark.arm_slow - def test_detect_chained_assignment_object_dtype(self, using_array_manager): + def test_detect_chained_assignment_object_dtype( + self, using_array_manager, using_copy_on_write + ): expected = DataFrame({"A": [111, "bbb", "ccc"], "B": [1, 2, 3]}) df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) + df_original = df.copy() - with pytest.raises(com.SettingWithCopyError, match=msg): - df.loc[0]["A"] = 111 + if not using_copy_on_write: + with pytest.raises(com.SettingWithCopyError, match=msg): + df.loc[0]["A"] = 111 - if not using_array_manager: + if using_copy_on_write: + # TODO(CoW) can we still warn here? + df["A"][0] = 111 + tm.assert_frame_equal(df, df_original) + elif not using_array_manager: with pytest.raises(com.SettingWithCopyError, match=msg): df["A"][0] = 111 df.loc[0, "A"] = 111 + tm.assert_frame_equal(df, expected) else: # INFO(ArrayManager) for ArrayManager it doesn't matter that it's # a mixed dataframe df["A"][0] = 111 - - tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) @pytest.mark.arm_slow def test_detect_chained_assignment_is_copy_pickle(self): @@ -296,8 +346,9 @@ def test_detect_chained_assignment_implicit_take(self): df["letters"] = df["letters"].apply(str.lower) @pytest.mark.arm_slow - def test_detect_chained_assignment_implicit_take2(self): - + def test_detect_chained_assignment_implicit_take2(self, using_copy_on_write): + if using_copy_on_write: + pytest.skip("_is_copy is not always set for CoW") # Implicitly take 2 df = random_text(100000) indexer = df.letters.apply(lambda x: len(x) > 10) @@ -353,18 +404,26 @@ def test_detect_chained_assignment_false_positives(self): str(df) @pytest.mark.arm_slow - def test_detect_chained_assignment_undefined_column(self): + def test_detect_chained_assignment_undefined_column(self, using_copy_on_write): # from SO: # https://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc df = DataFrame(np.arange(0, 9), columns=["count"]) df["group"] = "b" + df_original = df.copy() - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_copy_on_write: + # TODO(CoW) can we still warn here? df.iloc[0:5]["group"] = "a" + tm.assert_frame_equal(df, df_original) + else: + with pytest.raises(com.SettingWithCopyError, match=msg): + df.iloc[0:5]["group"] = "a" @pytest.mark.arm_slow - def test_detect_chained_assignment_changing_dtype(self, using_array_manager): + def test_detect_chained_assignment_changing_dtype( + self, using_array_manager, using_copy_on_write + ): # Mixed type setting but same dtype & changing dtype df = DataFrame( @@ -375,32 +434,45 @@ def test_detect_chained_assignment_changing_dtype(self, using_array_manager): "D": ["a", "b", "c", "d", "e"], } ) + df_original = df.copy() - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_copy_on_write: df.loc[2]["D"] = "foo" - - with pytest.raises(com.SettingWithCopyError, match=msg): df.loc[2]["C"] = "foo" + df["C"][2] = "foo" + tm.assert_frame_equal(df, df_original) - if not using_array_manager: + if not using_copy_on_write: with pytest.raises(com.SettingWithCopyError, match=msg): + df.loc[2]["D"] = "foo" + + with pytest.raises(com.SettingWithCopyError, match=msg): + df.loc[2]["C"] = "foo" + + if not using_array_manager: + with pytest.raises(com.SettingWithCopyError, match=msg): + df["C"][2] = "foo" + else: + # INFO(ArrayManager) for ArrayManager it doesn't matter if it's + # changing the dtype or not df["C"][2] = "foo" - else: - # INFO(ArrayManager) for ArrayManager it doesn't matter if it's - # changing the dtype or not - df["C"][2] = "foo" - assert df.loc[2, "C"] == "foo" + assert df.loc[2, "C"] == "foo" - def test_setting_with_copy_bug(self): + def test_setting_with_copy_bug(self, using_copy_on_write): # operating on a copy df = DataFrame( {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} ) + df_original = df.copy() mask = pd.isna(df.c) - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_copy_on_write: df[["c"]][mask] = df[["b"]][mask] + tm.assert_frame_equal(df, df_original) + else: + with pytest.raises(com.SettingWithCopyError, match=msg): + df[["c"]][mask] = df[["b"]][mask] def test_setting_with_copy_bug_no_warning(self): # invalid warning as we are returning a new object @@ -411,8 +483,12 @@ def test_setting_with_copy_bug_no_warning(self): # this should not raise df2["y"] = ["g", "h", "i"] - def test_detect_chained_assignment_warnings_errors(self): + def test_detect_chained_assignment_warnings_errors(self, using_copy_on_write): df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) + if using_copy_on_write: + df.loc[0]["A"] = 111 + return + with option_context("chained_assignment", "warn"): with tm.assert_produces_warning(com.SettingWithCopyWarning): df.loc[0]["A"] = 111 @@ -421,28 +497,44 @@ def test_detect_chained_assignment_warnings_errors(self): with pytest.raises(com.SettingWithCopyError, match=msg): df.loc[0]["A"] = 111 - def test_detect_chained_assignment_warnings_filter_and_dupe_cols(self): + def test_detect_chained_assignment_warnings_filter_and_dupe_cols( + self, using_copy_on_write + ): # xref gh-13017. with option_context("chained_assignment", "warn"): df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, -9]], columns=["a", "a", "c"]) + df_original = df.copy() - with tm.assert_produces_warning(com.SettingWithCopyWarning): + warn = None if using_copy_on_write else com.SettingWithCopyWarning + with tm.assert_produces_warning(warn): df.c.loc[df.c > 0] = None expected = DataFrame( [[1, 2, 3], [4, 5, 6], [7, 8, -9]], columns=["a", "a", "c"] ) - tm.assert_frame_equal(df, expected) + if using_copy_on_write: + tm.assert_frame_equal(df, df_original) + else: + tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("rhs", [3, DataFrame({0: [1, 2, 3, 4]})]) - def test_detect_chained_assignment_warning_stacklevel(self, rhs): + def test_detect_chained_assignment_warning_stacklevel( + self, rhs, using_copy_on_write + ): # GH#42570 df = DataFrame(np.arange(25).reshape(5, 5)) + df_original = df.copy() chained = df.loc[:3] with option_context("chained_assignment", "warn"): - with tm.assert_produces_warning(com.SettingWithCopyWarning) as t: - chained[2] = rhs - assert t[0].filename == __file__ + if not using_copy_on_write: + with tm.assert_produces_warning(com.SettingWithCopyWarning) as t: + chained[2] = rhs + assert t[0].filename == __file__ + else: + # INFO(CoW) no warning, and original dataframe not changed + with tm.assert_produces_warning(None): + chained[2] = rhs + tm.assert_frame_equal(df, df_original) # TODO(ArrayManager) fast_xs with array-like scalars is not yet working @td.skip_array_manager_not_yet_implemented @@ -493,7 +585,7 @@ def test_cache_updating2(self): expected = Series([0, 0, 0, 2, 0], name="f") tm.assert_series_equal(df.f, expected) - def test_iloc_setitem_chained_assignment(self): + def test_iloc_setitem_chained_assignment(self, using_copy_on_write): # GH#3970 with option_context("chained_assignment", None): df = DataFrame({"aa": range(5), "bb": [2.2] * 5}) @@ -507,7 +599,10 @@ def test_iloc_setitem_chained_assignment(self): df.iloc[ck] df["bb"].iloc[0] = 0.15 - assert df["bb"].iloc[0] == 0.15 + if not using_copy_on_write: + assert df["bb"].iloc[0] == 0.15 + else: + assert df["bb"].iloc[0] == 2.2 def test_getitem_loc_assignment_slice_state(self): # GH 13569 diff --git a/pandas/tests/indexing/test_copy_on_write.py b/pandas/tests/indexing/test_copy_on_write.py new file mode 100644 index 0000000000000..61dc762856fe2 --- /dev/null +++ b/pandas/tests/indexing/test_copy_on_write.py @@ -0,0 +1,595 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +import pandas.core.common as com + + +def test_copy(using_copy_on_write): + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_copy = df.copy() + + # the deep copy doesn't share memory + assert not np.may_share_memory(df_copy["a"].values, df["a"].values) + if using_copy_on_write: + assert df_copy._mgr.refs is None + + # mutating copy doesn't mutate original + df_copy.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 1 + + # copy of df + copy of subset + # normal copy -> refs are removed, no mutation of parent + # deep=None -> refs are still generated / kept + # copy=False -> refs are kept? But if we then edit it + + # df = ... + # subset = df[1:3] + # subset_shallow_copy = subset.copy(deep=False) + # -> expected behaviour: mutating subset_shallow_copy should mutate subset + # but not mutate parent df + # - if we keep refs -> we copy on setitem -> subset is not mutated + # - if we remove refs -> we don't copy on setitem, but then also parent df + # is mutated + # -> disallow taking a shallow copy of a DataFrame that referencing other arrays? + + +def test_copy_shallow(using_copy_on_write): + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_copy = df.copy(deep=False) + + # the shallow copy still shares memory + assert np.may_share_memory(df_copy["a"].values, df["a"].values) + if using_copy_on_write: + assert df_copy._mgr.refs is not None + + if using_copy_on_write: + # mutating shallow copy doesn't mutate original + df_copy.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 1 + # mutating triggered a copy-on-write -> no longer shares memory + assert not np.may_share_memory(df_copy["a"].values, df["a"].values) + # but still shares memory for the other columns/blocks + assert np.may_share_memory(df_copy["c"].values, df["c"].values) + else: + # mutating shallow copy does mutate original + df_copy.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 0 + # and still shares memory + assert np.may_share_memory(df_copy["a"].values, df["a"].values) + + +# ----------------------------------------------------------------------------- +# DataFrame methods returning new DataFrame using shallow copy + + +def test_reset_index(using_copy_on_write): + # Case: resetting the index (i.e. adding a new column) + mutating the + # resulting dataframe + df = pd.DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}, index=[10, 11, 12] + ) + df_orig = df.copy() + df2 = df.reset_index() + + if using_copy_on_write: + # still shares memory (df2 is a shallow copy) + assert np.may_share_memory(df2["b"].values, df["b"].values) + assert np.may_share_memory(df2["c"].values, df["c"].values) + # mutating df2 triggers a copy-on-write for that column / block + df2.iloc[0, 2] = 0 + assert not np.may_share_memory(df2["b"].values, df["b"].values) + if using_copy_on_write: + assert np.may_share_memory(df2["c"].values, df["c"].values) + tm.assert_frame_equal(df, df_orig) + + +def test_rename_columns(using_copy_on_write): + # Case: renaming columns returns a new dataframe + # + afterwards modifying the result + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.rename(columns=str.upper) + + if using_copy_on_write: + assert np.may_share_memory(df2["A"].values, df["a"].values) + df2.iloc[0, 0] = 0 + assert not np.may_share_memory(df2["A"].values, df["a"].values) + if using_copy_on_write: + assert np.may_share_memory(df2["C"].values, df["c"].values) + expected = pd.DataFrame({"A": [0, 2, 3], "B": [4, 5, 6], "C": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(df2, expected) + tm.assert_frame_equal(df, df_orig) + + +def test_rename_columns_modify_parent(using_array_manager, using_copy_on_write): + # Case: renaming columns returns a new dataframe + # + afterwards modifying the original (parent) dataframe + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df2 = df.rename(columns=str.upper) + df2_orig = df2.copy() + + if using_copy_on_write: + assert np.may_share_memory(df2["A"].values, df["a"].values) + else: + assert not np.may_share_memory(df2["A"].values, df["a"].values) + df.iloc[0, 0] = 0 + assert not np.may_share_memory(df2["A"].values, df["a"].values) + if using_array_manager: + assert np.may_share_memory(df2["C"].values, df["c"].values) + expected = pd.DataFrame({"a": [0, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df2, df2_orig) + + +def test_reindex_columns(using_copy_on_write): + # Case: reindexing the column returns a new dataframe + # + afterwards modifying the result + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.reindex(columns=["a", "c"]) + + if using_copy_on_write: + # still shares memory (df2 is a shallow copy) + assert np.may_share_memory(df2["a"].values, df["a"].values) + else: + assert not np.may_share_memory(df2["a"].values, df["a"].values) + # mutating df2 triggers a copy-on-write for that column + df2.iloc[0, 0] = 0 + assert not np.may_share_memory(df2["a"].values, df["a"].values) + if using_copy_on_write: + assert np.may_share_memory(df2["c"].values, df["c"].values) + tm.assert_frame_equal(df, df_orig) + + +# # ----------------------------------------------------------------------------- +# # Indexing operations taking subset + modifying the subset/parent + + +def test_subset_column_selection(using_copy_on_write): + # Case: taking a subset of the columns of a DataFrame + # + afterwards modifying the subset + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + subset = df[["a", "c"]] + + if using_copy_on_write: + # the subset shares memory ... + assert np.may_share_memory(subset["a"].values, df["a"].values) + # ... but uses CoW when being modified + subset.iloc[0, 0] = 0 + else: + assert not np.may_share_memory(subset["a"].values, df["a"].values) + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset.iloc[0, 0] = 0 + + assert not np.may_share_memory(subset["a"].values, df["a"].values) + + expected = pd.DataFrame({"a": [0, 2, 3], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(subset, expected) + tm.assert_frame_equal(df, df_orig) + + +def test_subset_column_selection_modify_parent(using_copy_on_write): + # Case: taking a subset of the columns of a DataFrame + # + afterwards modifying the parent + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + + subset = df[["a", "c"]] + if using_copy_on_write: + # the subset shares memory ... + assert np.may_share_memory(subset["a"].values, df["a"].values) + # ... but parent uses CoW parent when it is modified + df.iloc[0, 0] = 0 + + assert not np.may_share_memory(subset["a"].values, df["a"].values) + if using_copy_on_write: + # different column/block still shares memory + assert np.may_share_memory(subset["c"].values, df["c"].values) + + expected = pd.DataFrame({"a": [1, 2, 3], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(subset, expected) + + +def test_subset_row_slice(using_copy_on_write): + # Case: taking a subset of the rows of a DataFrame using a slice + # + afterwards modifying the subset + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + subset = df[1:3] + subset._mgr._verify_integrity() + + assert np.may_share_memory(subset["a"].values, df["a"].values) + + if using_copy_on_write: + subset.iloc[0, 0] = 0 + assert not np.may_share_memory(subset["a"].values, df["a"].values) + + else: + # INFO this no longer raise warning since pandas 1.4 + # with pd.option_context("chained_assignment", "warn"): + # with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset.iloc[0, 0] = 0 + + expected = pd.DataFrame( + {"a": [0, 3], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3) + ) + tm.assert_frame_equal(subset, expected) + if using_copy_on_write: + # original parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) + else: + # original parent dataframe is actually updated + df_orig.iloc[1, 0] = 0 + tm.assert_frame_equal(df, df_orig) + + +def test_subset_column_slice(using_copy_on_write): + # Case: taking a subset of the columns of a DataFrame using a slice + # + afterwards modifying the subset + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + subset = df.iloc[:, 1:] + subset._mgr._verify_integrity() + + if using_copy_on_write: + assert np.may_share_memory(subset["b"].values, df["b"].values) + + subset.iloc[0, 0] = 0 + assert not np.may_share_memory(subset["b"].values, df["b"].values) + + else: + subset.iloc[0, 0] = 0 + + expected = pd.DataFrame({"b": [0, 5, 6], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(subset, expected) + # original parent dataframe is not modified (also not for BlockManager case) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "indexer", + [slice(0, 2), np.array([True, True, False]), np.array([0, 1])], + ids=["slice", "mask", "array"], +) +def test_subset_set_with_row_indexer(indexer_si, indexer, using_copy_on_write): + # Case: setting values with a row indexer on a viewing subset + # subset[indexer] = value and subset.iloc[indexer] = value + df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}) + df_orig = df.copy() + subset = df[1:4] + + if ( + indexer_si is tm.setitem + and isinstance(indexer, np.ndarray) + and indexer.dtype == "int" + ): + pytest.skip("setitem with labels selects on columns") + + if using_copy_on_write: + indexer_si(subset)[indexer] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + indexer_si(subset)[indexer] = 0 + + expected = pd.DataFrame( + {"a": [0, 0, 4], "b": [0, 0, 7], "c": [0.0, 0.0, 0.4]}, index=range(1, 4) + ) + tm.assert_frame_equal(subset, expected) + if using_copy_on_write: + # original parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) + else: + # original parent dataframe is actually updated + df_orig[1:3] = 0 + tm.assert_frame_equal(df, df_orig) + + +def test_subset_set_with_mask(using_copy_on_write): + # Case: setting values with a mask on a viewing subset: subset[mask] = value + df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}) + df_orig = df.copy() + subset = df[1:4] + + mask = subset > 3 + + if using_copy_on_write: + subset[mask] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset[mask] = 0 + + expected = pd.DataFrame( + {"a": [2, 3, 0], "b": [0, 0, 0], "c": [0.20, 0.3, 0.4]}, index=range(1, 4) + ) + tm.assert_frame_equal(subset, expected) + if using_copy_on_write: + # original parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) + else: + # original parent dataframe is actually updated + df_orig.loc[3, "a"] = 0 + df_orig.loc[1:3, "b"] = 0 + tm.assert_frame_equal(df, df_orig) + + +def test_subset_set_column(using_copy_on_write): + # Case: setting a single column on a viewing subset -> subset[col] = value + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + subset = df[1:3] + + if using_copy_on_write: + subset["a"] = np.array([10, 11], dtype="int64") + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset["a"] = np.array([10, 11], dtype="int64") + + expected = pd.DataFrame( + {"a": [10, 11], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3) + ) + tm.assert_frame_equal(subset, expected) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] +) +def test_subset_set_columns(using_copy_on_write, dtype): + # Case: setting multiple columns on a viewing subset + # -> subset[[col1, col2]] = value + df = pd.DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} + ) + df_orig = df.copy() + subset = df[1:3] + + if using_copy_on_write: + subset[["a", "c"]] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset[["a", "c"]] = 0 + + expected = pd.DataFrame({"a": [0, 0], "b": [5, 6], "c": [0, 0]}, index=range(1, 3)) + tm.assert_frame_equal(subset, expected) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "indexer", + [slice("a", "b"), np.array([True, True, False]), ["a", "b"]], + ids=["slice", "mask", "array"], +) +def test_subset_set_with_column_indexer(indexer, using_copy_on_write): + # Case: setting multiple columns with a column indexer on a viewing subset + # -> subset.loc[:, [col1, col2]] = value + df = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]}) + df_orig = df.copy() + subset = df[1:3] + + if using_copy_on_write: + subset.loc[:, indexer] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset.loc[:, indexer] = 0 + + expected = pd.DataFrame( + {"a": [0, 0], "b": [0.0, 0.0], "c": [5, 6]}, index=range(1, 3) + ) + # TODO full row slice .loc[:, idx] update inplace instead of overwrite? + expected["b"] = expected["b"].astype("int64") + tm.assert_frame_equal(subset, expected) + if using_copy_on_write: + tm.assert_frame_equal(df, df_orig) + else: + # In the mixed case with BlockManager, only one of the two columns is + # mutated in the parent frame .. + df_orig.loc[1:2, ["a"]] = 0 + tm.assert_frame_equal(df, df_orig) + + +# TODO add more tests modifying the parent + + +# ----------------------------------------------------------------------------- +# Series -- Indexing operations taking subset + modifying the subset/parent + + +def test_series_getitem_slice(using_copy_on_write): + # Case: taking a slice of a Series + afterwards modifying the subset + s = pd.Series([1, 2, 3], index=["a", "b", "c"]) + s_orig = s.copy() + + subset = s[:] + assert np.may_share_memory(subset.values, s.values) + + subset.iloc[0] = 0 + + if using_copy_on_write: + assert not np.may_share_memory(subset.values, s.values) + + expected = pd.Series([0, 2, 3], index=["a", "b", "c"]) + tm.assert_series_equal(subset, expected) + + if using_copy_on_write: + # original parent series is not modified (CoW) + tm.assert_series_equal(s, s_orig) + else: + # original parent series is actually updated + assert s.iloc[0] == 0 + + +@pytest.mark.parametrize( + "indexer", + [slice(0, 2), np.array([True, True, False]), np.array([0, 1])], + ids=["slice", "mask", "array"], +) +def test_series_subset_set_with_indexer(indexer_si, indexer, using_copy_on_write): + # Case: setting values in a viewing Series with an indexer + s = pd.Series([1, 2, 3], index=["a", "b", "c"]) + s_orig = s.copy() + subset = s[:] + + # if ( + # indexer_si is tm.setitem + # and isinstance(indexer, np.ndarray) + # and indexer.dtype == "int" + # ): + # pytest.skip("setitem with labels selects on columns") + + expected = pd.Series([0, 0, 3], index=["a", "b", "c"]) + indexer_si(subset)[indexer] = 0 + tm.assert_series_equal(subset, expected) + + if using_copy_on_write: + tm.assert_series_equal(s, s_orig) + else: + tm.assert_series_equal(s, expected) + + expected = pd.DataFrame( + {"a": [0, 0, 4], "b": [0, 0, 7], "c": [0.0, 0.0, 0.4]}, index=range(1, 4) + ) + + +# ----------------------------------------------------------------------------- +# del operator + + +def test_del_frame(using_copy_on_write): + # Case: deleting a column with `del` on a viewing child dataframe should + # not modify parent + update the references + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df[:] + + assert np.may_share_memory(df["a"].values, df2["a"].values) + + del df2["b"] + + assert np.may_share_memory(df["a"].values, df2["a"].values) + tm.assert_frame_equal(df, df_orig) + tm.assert_frame_equal(df2, df_orig[["a", "c"]]) + df2._mgr._verify_integrity() + + # TODO in theory modifying column "b" of the parent wouldn't need a CoW + # but the weakref is still alive and so we still perform CoW + + if using_copy_on_write: + # modifying child after deleting a column still doesn't update parent + df2.loc[0, "a"] = 100 + tm.assert_frame_equal(df, df_orig) + + +def test_del_series(): + s = pd.Series([1, 2, 3], index=["a", "b", "c"]) + s_orig = s.copy() + s2 = s[:] + + assert np.may_share_memory(s.values, s2.values) + + del s2["a"] + + assert not np.may_share_memory(s.values, s2.values) + tm.assert_series_equal(s, s_orig) + tm.assert_series_equal(s2, s_orig[["b", "c"]]) + + # modifying s2 doesn't need copy on write (due to `del`, s2 is backed by new array) + values = s2.values + s2.loc["b"] = 100 + assert values[0] == 100 + + +# ----------------------------------------------------------------------------- +# Accessing column as Series + + +def test_column_as_series(using_copy_on_write): + # Case: selecting a single column now also uses Copy-on-Write + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + s = df["a"] + + assert np.may_share_memory(s.values, df["a"].values) + + if using_copy_on_write: + s[0] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + s[0] = 0 + + expected = pd.Series([0, 2, 3], name="a") + tm.assert_series_equal(s, expected) + if using_copy_on_write: + # assert not np.may_share_memory(s.values, df["a"].values) + tm.assert_frame_equal(df, df_orig) + # ensure cached series on getitem is not the changed series + tm.assert_series_equal(df["a"], df_orig["a"]) + else: + df_orig.iloc[0, 0] = 0 + tm.assert_frame_equal(df, df_orig) + + +def test_column_as_series_set_with_upcast(using_copy_on_write): + # Case: selecting a single column now also uses Copy-on-Write -> when + # setting a value causes an upcast, we don't need to update the parent + # DataFrame through the cache mechanism + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + s = df["a"] + if using_copy_on_write: + s[0] = "foo" + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + s[0] = "foo" + + expected = pd.Series(["foo", 2, 3], dtype=object, name="a") + tm.assert_series_equal(s, expected) + if using_copy_on_write: + tm.assert_frame_equal(df, df_orig) + # ensure cached series on getitem is not the changed series + tm.assert_series_equal(df["a"], df_orig["a"]) + else: + df_orig["a"] = expected + tm.assert_frame_equal(df, df_orig) + + +# TODO add tests for other indexing methods on the Series + + +def test_dataframe_add_column_from_series(): + # Case: adding a new column to a DataFrame from an existing column/series + # -> always already takes a copy on assignment + # (no change in behaviour here) + # TODO can we achieve the same behaviour with Copy-on-Write? + df = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + + s = pd.Series([10, 11, 12]) + df["new"] = s + assert not np.may_share_memory(df["new"].values, s.values) + + # editing series -> doesn't modify column in frame + s[0] = 0 + expected = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "new": [10, 11, 12]}) + tm.assert_frame_equal(df, expected) + + # editing column in frame -> doesn't modify series + df.loc[2, "new"] = 100 + expected = pd.Series([0, 11, 12]) + tm.assert_series_equal(s, expected) + + +# TODO add tests for constructors diff --git a/pandas/tests/indexing/test_iat.py b/pandas/tests/indexing/test_iat.py index 44bd51ee1b7d1..916303884df88 100644 --- a/pandas/tests/indexing/test_iat.py +++ b/pandas/tests/indexing/test_iat.py @@ -31,7 +31,7 @@ def test_iat_getitem_series_with_period_index(): assert expected == result -def test_iat_setitem_item_cache_cleared(indexer_ial): +def test_iat_setitem_item_cache_cleared(indexer_ial, using_copy_on_write): # GH#45684 data = {"x": np.arange(8, dtype=np.int64), "y": np.int64(0)} df = DataFrame(data).copy() @@ -44,5 +44,6 @@ def test_iat_setitem_item_cache_cleared(indexer_ial): indexer_ial(df)[7, 1] = 1234 assert df.iat[7, 1] == 1234 - assert ser.iloc[-1] == 1234 + if not using_copy_on_write: + assert ser.iloc[-1] == 1234 assert df.iloc[-1, -1] == 1234 diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 426192ab46914..29abc0ffce727 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -68,7 +68,9 @@ class TestiLocBaseIndependent: ], ) @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) - def test_iloc_setitem_fullcol_categorical(self, indexer, key, using_array_manager): + def test_iloc_setitem_fullcol_categorical( + self, indexer, key, using_array_manager, using_copy_on_write + ): frame = DataFrame({0: range(3)}, dtype=object) cat = Categorical(["alpha", "beta", "gamma"]) @@ -98,11 +100,19 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key, using_array_manage assert cat[0] != "gamma" # TODO with mixed dataframe ("split" path), we always overwrite the column + if using_copy_on_write and indexer == tm.loc: + # TODO(ArrayManager) with loc, slice(3) gets converted into slice(0, 3) + # which is then considered as "full" slice and does overwrite. For iloc + # this conversion is not done, and so it doesn't overwrite + overwrite = overwrite or (isinstance(key, slice) and key == slice(3)) + frame = DataFrame({0: np.array([0, 1, 2], dtype=object), 1: range(3)}) df = frame.copy() orig_vals = df.values indexer(df)[key, 0] = cat expected = DataFrame({0: cat, 1: range(3)}) + if using_copy_on_write and not overwrite: + expected[0] = expected[0].astype(object) tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("box", [array, Series]) @@ -115,7 +125,7 @@ def test_iloc_setitem_ea_inplace(self, frame_or_series, box): if frame_or_series is Series: values = obj.values else: - values = obj[0].values + values = obj._mgr.blocks[0].values if frame_or_series is Series: obj.iloc[:2] = box(arr[2:]) @@ -835,7 +845,9 @@ def test_iloc_empty_list_indexer_is_ok(self): df.iloc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True ) - def test_identity_slice_returns_new_object(self, using_array_manager, request): + def test_identity_slice_returns_new_object( + self, using_array_manager, using_copy_on_write, request + ): # GH13873 if using_array_manager: mark = pytest.mark.xfail( @@ -860,7 +872,11 @@ def test_identity_slice_returns_new_object(self, using_array_manager, request): # should also be a shallow copy original_series[:3] = [7, 8, 9] - assert all(sliced_series[:3] == [7, 8, 9]) + if using_copy_on_write: + # shallow copy not updated (CoW) + assert all(sliced_series[:3] == [1, 2, 3]) + else: + assert all(sliced_series[:3] == [7, 8, 9]) def test_indexing_zerodim_np_array(self): # GH24919 @@ -876,18 +892,21 @@ def test_series_indexing_zerodim_np_array(self): assert result == 1 @td.skip_array_manager_not_yet_implemented - def test_iloc_setitem_categorical_updates_inplace(self): + def test_iloc_setitem_categorical_updates_inplace(self, using_copy_on_write): # Mixed dtype ensures we go through take_split_path in setitem_with_indexer cat = Categorical(["A", "B", "C"]) + cat_original = cat.copy() df = DataFrame({1: cat, 2: [1, 2, 3]}, copy=False) assert tm.shares_memory(df[1], cat) # This should modify our original values in-place df.iloc[:, 0] = cat[::-1] - assert tm.shares_memory(df[1], cat) - - expected = Categorical(["C", "B", "A"], categories=["A", "B", "C"]) + if not using_copy_on_write: + assert tm.shares_memory(df[1], cat) + expected = Categorical(["C", "B", "A"], categories=["A", "B", "C"]) + else: + expected = cat_original tm.assert_categorical_equal(cat, expected) def test_iloc_with_boolean_operation(self): @@ -1365,8 +1384,9 @@ def test_frame_iloc_setitem_callable(self): class TestILocSeries: - def test_iloc(self): + def test_iloc(self, using_copy_on_write): ser = Series(np.random.randn(10), index=list(range(0, 20, 2))) + ser_original = ser.copy() for i in range(len(ser)): result = ser.iloc[i] @@ -1382,7 +1402,10 @@ def test_iloc(self): with tm.assert_produces_warning(None): # GH#45324 make sure we aren't giving a spurious FutureWarning result[:] = 0 - assert (ser.iloc[1:3] == 0).all() + if using_copy_on_write: + tm.assert_series_equal(ser, ser_original) + else: + assert (ser.iloc[1:3] == 0).all() # list of integers result = ser.iloc[[0, 2, 3, 4, 5]] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 3f8e4401808b7..e19002b6a8701 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -53,7 +53,7 @@ def test_not_change_nan_loc(series, new_series, expected_ser): # GH 28403 df = DataFrame({"A": series}) - df["A"].loc[:] = new_series + df.loc[:, "A"] = new_series expected = DataFrame({"A": expected_ser}) tm.assert_frame_equal(df.isna(), expected) tm.assert_frame_equal(df.notna(), ~expected) @@ -703,22 +703,30 @@ def test_loc_setitem_frame_with_reindex(self): expected = DataFrame({"A": ser}) tm.assert_frame_equal(df, expected) - def test_loc_setitem_frame_with_reindex_mixed(self): + def test_loc_setitem_frame_with_reindex_mixed(self, using_copy_on_write): # GH#40480 df = DataFrame(index=[3, 5, 4], columns=["A", "B"], dtype=float) df["B"] = "string" df.loc[[4, 3, 5], "A"] = np.array([1, 2, 3], dtype="int64") - ser = Series([2, 3, 1], index=[3, 5, 4], dtype="int64") + ser = Series([2, 3, 1], index=[3, 5, 4], dtype=float) + if not using_copy_on_write: + # For default BlockManager case, this takes the "split" path, + # which still overwrites the column + ser = Series([2, 3, 1], index=[3, 5, 4], dtype="int64") expected = DataFrame({"A": ser}) expected["B"] = "string" tm.assert_frame_equal(df, expected) - def test_loc_setitem_frame_with_inverted_slice(self): + def test_loc_setitem_frame_with_inverted_slice(self, using_copy_on_write): # GH#40480 df = DataFrame(index=[1, 2, 3], columns=["A", "B"], dtype=float) df["B"] = "string" df.loc[slice(3, 0, -1), "A"] = np.array([1, 2, 3], dtype="int64") - expected = DataFrame({"A": [3, 2, 1], "B": "string"}, index=[1, 2, 3]) + expected = DataFrame({"A": [3.0, 2.0, 1.0], "B": "string"}, index=[1, 2, 3]) + if not using_copy_on_write: + # For default BlockManager case, this takes the "split" path, + # which still overwrites the column + expected["A"] = expected["A"].astype("int64") tm.assert_frame_equal(df, expected) def test_loc_setitem_empty_frame(self): @@ -1042,7 +1050,9 @@ def test_loc_empty_list_indexer_is_ok(self): df.loc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True ) - def test_identity_slice_returns_new_object(self, using_array_manager, request): + def test_identity_slice_returns_new_object( + self, using_array_manager, request, using_copy_on_write + ): # GH13873 if using_array_manager: mark = pytest.mark.xfail( @@ -1074,7 +1084,10 @@ def test_identity_slice_returns_new_object(self, using_array_manager, request): assert original_series[:] is not original_series original_series[:3] = [7, 8, 9] - assert all(sliced_series[:3] == [7, 8, 9]) + if using_copy_on_write: + assert all(sliced_series[:3] == [1, 2, 3]) + else: + assert all(sliced_series[:3] == [7, 8, 9]) @pytest.mark.xfail(reason="accidental fix reverted - GH37497") def test_loc_copy_vs_view(self): @@ -2479,7 +2492,7 @@ def test_loc_setitem_boolean_and_column(self, float_frame): tm.assert_frame_equal(float_frame, expected) - def test_loc_setitem_ndframe_values_alignment(self): + def test_loc_setitem_ndframe_values_alignment(self, using_copy_on_write): # GH#45501 df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df.loc[[False, False, True], ["a"]] = DataFrame( @@ -2500,9 +2513,13 @@ def test_loc_setitem_ndframe_values_alignment(self): tm.assert_frame_equal(df, expected) df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df_orig = df.copy() ser = df["a"] ser.loc[[False, False, True]] = Series([10, 11, 12], index=[2, 1, 0]) - tm.assert_frame_equal(df, expected) + if using_copy_on_write: + tm.assert_frame_equal(df, df_orig) + else: + tm.assert_frame_equal(df, expected) class TestLocListlike: From 74981f9a56f94371d3fa750e12cce7c90a70aae1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 7 May 2022 15:42:03 +0200 Subject: [PATCH 02/27] fix more tests --- pandas/compat/pickle_compat.py | 2 +- pandas/io/stata.py | 3 +- pandas/tests/apply/test_frame_apply.py | 16 +++-- pandas/tests/extension/base/setitem.py | 3 +- pandas/tests/frame/indexing/test_setitem.py | 20 ++++--- pandas/tests/frame/indexing/test_xs.py | 58 ++++++++++++++----- .../tests/frame/methods/test_combine_first.py | 2 +- pandas/tests/frame/methods/test_cov_corr.py | 4 +- .../tests/frame/methods/test_interpolate.py | 5 +- pandas/tests/frame/methods/test_update.py | 7 ++- pandas/tests/frame/test_api.py | 18 +++++- pandas/tests/frame/test_arithmetic.py | 3 +- pandas/tests/frame/test_block_internals.py | 14 +++-- pandas/tests/frame/test_constructors.py | 29 +++++++--- pandas/tests/groupby/test_apply_mutate.py | 2 +- .../indexes/period/test_partial_slicing.py | 8 ++- .../series/accessors/test_dt_accessor.py | 8 ++- pandas/tests/series/indexing/test_indexing.py | 24 +++++--- pandas/tests/series/methods/test_copy.py | 32 +++++++--- pandas/tests/series/methods/test_update.py | 12 ++-- 20 files changed, 190 insertions(+), 80 deletions(-) diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 2333324a7e22d..90b537fddeba5 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -221,7 +221,7 @@ def load_newobj(self): arr = np.array([], dtype="m8[ns]") obj = cls.__new__(cls, arr, arr.dtype) elif cls is BlockManager and not args: - obj = cls.__new__(cls, (), [], False) + obj = cls.__new__(cls, (), [], None, False) else: obj = cls.__new__(cls, *args) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d9912f2480e07..f6623c24875ea 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -331,8 +331,7 @@ def convert_delta_safe(base, deltas, unit) -> Series: has_bad_values = False if bad_locs.any(): has_bad_values = True - data_col = Series(dates) - data_col[bad_locs] = 1.0 # Replace with NaT + dates[bad_locs] = 1.0 # Replace with NaT dates = dates.astype(np.int64) if fmt.startswith(("%tc", "tc")): # Delta ms relative to base diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index ef7ab4a469865..fce5d25829b27 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -1440,7 +1440,7 @@ def test_apply_dtype(col): tm.assert_series_equal(result, expected) -def test_apply_mutating(using_array_manager): +def test_apply_mutating(using_array_manager, using_copy_on_write): # GH#35462 case where applied func pins a new BlockManager to a row df = DataFrame({"a": range(100), "b": range(100, 200)}) df_orig = df.copy() @@ -1451,18 +1451,22 @@ def func(row): assert row._mgr is not mgr return row - expected = df.copy() - expected["a"] += 1 + if using_copy_on_write: + expected = df.copy() + expected["a"] += 1 + else: + expected = df_orig result = df.apply(func, axis=1) tm.assert_frame_equal(result, expected) - if not using_array_manager: + if using_copy_on_write or using_array_manager: + # INFO(CoW) With copy on write, mutating a viewing row doesn't mutate the parent # INFO(ArrayManager) With BlockManager, the row is a view and mutated in place, # with ArrayManager the row is not a view, and thus not mutated in place - tm.assert_frame_equal(df, result) - else: tm.assert_frame_equal(df, df_orig) + else: + tm.assert_frame_equal(df, result) def test_apply_empty_list_reduce(): diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index c2db54d832195..97abd85d195ef 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -364,6 +364,7 @@ def test_setitem_frame_2d_values(self, data): # Avoiding using_array_manager fixture # https://github.com/pandas-dev/pandas/pull/44514#discussion_r754002410 using_array_manager = isinstance(df._mgr, pd.core.internals.ArrayManager) + using_copy_on_write = pd.options.mode.copy_on_write blk_data = df._mgr.arrays[0] @@ -377,7 +378,7 @@ def test_setitem_frame_2d_values(self, data): df.iloc[:] = df.values self.assert_frame_equal(df, orig) - if not using_array_manager: + if not using_array_manager and not using_copy_on_write: # GH#33457 Check that this setting occurred in-place # FIXME(ArrayManager): this should work there too assert df._mgr.arrays[0] is blk_data diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index f1e7b18a73173..26ad7fc616cdc 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -743,7 +743,7 @@ def test_setitem_object_array_of_tzaware_datetimes(self, idx, expected): class TestDataFrameSetItemWithExpansion: - def test_setitem_listlike_views(self): + def test_setitem_listlike_views(self, using_copy_on_write): # GH#38148 df = DataFrame({"a": [1, 2, 3], "b": [4, 4, 6]}) @@ -756,7 +756,10 @@ def test_setitem_listlike_views(self): # edit in place the first column to check view semantics df.iloc[0, 0] = 100 - expected = Series([100, 2, 3], name="a") + if using_copy_on_write: + expected = Series([1, 2, 3], name="a") + else: + expected = Series([100, 2, 3], name="a") tm.assert_series_equal(ser, expected) def test_setitem_string_column_numpy_dtype_raising(self): @@ -1054,7 +1057,9 @@ def test_setitem_always_copy(self, float_frame): assert notna(s[5:10]).all() @pytest.mark.parametrize("consolidate", [True, False]) - def test_setitem_partial_column_inplace(self, consolidate, using_array_manager): + def test_setitem_partial_column_inplace( + self, consolidate, using_array_manager, using_copy_on_write + ): # This setting should be in-place, regardless of whether frame is # single-block or multi-block # GH#304 this used to be incorrectly not-inplace, in which case @@ -1079,10 +1084,11 @@ def test_setitem_partial_column_inplace(self, consolidate, using_array_manager): tm.assert_series_equal(df["z"], expected) # check setting occurred in-place - tm.assert_numpy_array_equal(zvals, expected.values) - assert np.shares_memory(zvals, df["z"]._values) - if not consolidate: - assert df["z"]._values is zvals + if not using_copy_on_write: + tm.assert_numpy_array_equal(zvals, expected.values) + assert np.shares_memory(zvals, df["z"]._values) + if not consolidate: + assert df["z"]._values is zvals def test_setitem_duplicate_columns_not_inplace(self): # GH#39510 diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index c6938abb57d64..c4967b3f2c6d7 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -110,13 +110,17 @@ def test_xs_keep_level(self): result = df.xs([2008, "sat"], level=["year", "day"], drop_level=False) tm.assert_frame_equal(result, expected) - def test_xs_view(self, using_array_manager): + def test_xs_view(self, using_array_manager, using_copy_on_write): # in 0.14 this will return a view if possible a copy otherwise, but # this is numpy dependent dm = DataFrame(np.arange(20.0).reshape(4, 5), index=range(4), columns=range(5)) + df_orig = dm.copy() - if using_array_manager: + if using_copy_on_write: + dm.xs(2)[:] = 20 + tm.assert_frame_equal(dm, df_orig) + elif using_array_manager: # INFO(ArrayManager) with ArrayManager getting a row as a view is # not possible msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" @@ -175,27 +179,41 @@ def test_xs_level_eq_2(self): result = df.xs("c", level=2) tm.assert_frame_equal(result, expected) - def test_xs_setting_with_copy_error(self, multiindex_dataframe_random_data): + def test_xs_setting_with_copy_error( + self, multiindex_dataframe_random_data, using_copy_on_write + ): # this is a copy in 0.14 df = multiindex_dataframe_random_data + df_orig = df.copy() result = df.xs("two", level="second") - # setting this will give a SettingWithCopyError - # as we are trying to write a view - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_copy_on_write: result[:] = 10 + else: + # setting this will give a SettingWithCopyError + # as we are trying to write a view + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): + result[:] = 10 + tm.assert_frame_equal(df, df_orig) - def test_xs_setting_with_copy_error_multiple(self, four_level_index_dataframe): + def test_xs_setting_with_copy_error_multiple( + self, four_level_index_dataframe, using_copy_on_write + ): # this is a copy in 0.14 df = four_level_index_dataframe + df_orig = df.copy() result = df.xs(("a", 4), level=["one", "four"]) - # setting this will give a SettingWithCopyError - # as we are trying to write a view - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_copy_on_write: result[:] = 10 + else: + # setting this will give a SettingWithCopyError + # as we are trying to write a view + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): + result[:] = 10 + tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize("key, level", [("one", "second"), (["one"], ["second"])]) def test_xs_with_duplicates(self, key, level, multiindex_dataframe_random_data): @@ -358,15 +376,20 @@ def test_xs_droplevel_false(self): expected = DataFrame({"a": [1]}) tm.assert_frame_equal(result, expected) - def test_xs_droplevel_false_view(self, using_array_manager): + def test_xs_droplevel_false_view(self, using_array_manager, using_copy_on_write): # GH#37832 df = DataFrame([[1, 2, 3]], columns=Index(["a", "b", "c"])) result = df.xs("a", axis=1, drop_level=False) # check that result still views the same data as df assert np.shares_memory(result.iloc[:, 0]._values, df.iloc[:, 0]._values) - # modifying original df also modifies result when having a single block + df.iloc[0, 0] = 2 - expected = DataFrame({"a": [2]}) + if using_copy_on_write: + # with copy on write the subset is never modified + expected = DataFrame({"a": [1]}) + else: + # modifying original df also modifies result when having a single block + expected = DataFrame({"a": [2]}) tm.assert_frame_equal(result, expected) # with mixed dataframe, modifying the parent doesn't modify result @@ -374,7 +397,10 @@ def test_xs_droplevel_false_view(self, using_array_manager): df = DataFrame([[1, 2.5, "a"]], columns=Index(["a", "b", "c"])) result = df.xs("a", axis=1, drop_level=False) df.iloc[0, 0] = 2 - if using_array_manager: + if using_copy_on_write: + # with copy on write the subset is never modified + expected = DataFrame({"a": [1]}) + elif using_array_manager: # Here the behavior is consistent expected = DataFrame({"a": [2]}) else: diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index daddca7891b93..47ebca0b9bf5c 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -66,7 +66,7 @@ def test_combine_first(self, float_frame): assert (combined["A"][:10] == 1).all() # reverse overlap - tail["A"][:10] = 0 + tail.iloc[:10, tail.columns.get_loc("A")] = 0 combined = tail.combine_first(head) assert (combined["A"][:10] == 0).all() diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 2f0a4195d2f74..ee9af3f436943 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -27,8 +27,8 @@ def test_cov(self, float_frame, float_string_frame): # with NAs frame = float_frame.copy() - frame["A"][:5] = np.nan - frame["B"][5:10] = np.nan + frame.iloc[:5, frame.columns.get_loc("A")] = np.nan + frame.iloc[5:10, frame.columns.get_loc("B")] = np.nan result = frame.cov(min_periods=len(frame) - 8) expected = frame.cov() expected.loc["A", "B"] = np.nan diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 98f9d2670074d..7d6cf43c530a7 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -303,7 +303,10 @@ def test_interp_raise_on_all_object_dtype(self): with pytest.raises(TypeError, match=msg): df.interpolate() - def test_interp_inplace(self): + def test_interp_inplace(self, using_copy_on_write): + # TODO(CoW) inplace keyword (it is still mutating the parent) + if using_copy_on_write: + pytest.skip("CoW: inplace keyword not yet handled") df = DataFrame({"a": [1.0, 2.0, np.nan, 4.0]}) expected = DataFrame({"a": [1.0, 2.0, 3.0, 4.0]}) result = df.copy() diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 408113e9bc417..32961a9d7daf8 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -138,11 +138,14 @@ def test_update_datetime_tz(self): expected = DataFrame([pd.Timestamp("2019", tz="UTC")]) tm.assert_frame_equal(result, expected) - def test_update_with_different_dtype(self): + def test_update_with_different_dtype(self, using_copy_on_write): # GH#3217 df = DataFrame({"a": [1, 3], "b": [np.nan, 2]}) df["c"] = np.nan - df["c"].update(Series(["foo"], index=[0])) + if using_copy_on_write: + df.update({"c": Series(["foo"], index=[0])}) + else: + df["c"].update(Series(["foo"], index=[0])) expected = DataFrame({"a": [1, 3], "b": [np.nan, 2], "c": ["foo", np.nan]}) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 09ea9ae04320b..bc6c676568f73 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -322,7 +322,9 @@ def test_attrs(self): assert result.attrs == {"version": 1} @pytest.mark.parametrize("allows_duplicate_labels", [True, False, None]) - def test_set_flags(self, allows_duplicate_labels, frame_or_series): + def test_set_flags( + self, allows_duplicate_labels, frame_or_series, using_copy_on_write + ): obj = DataFrame({"A": [1, 2]}) key = (0, 0) if frame_or_series is Series: @@ -344,15 +346,25 @@ def test_set_flags(self, allows_duplicate_labels, frame_or_series): assert obj.flags.allows_duplicate_labels is True # But we didn't copy data + if frame_or_series is Series: + assert np.may_share_memory(obj.values, result.values) + else: + assert np.may_share_memory(obj["A"].values, result["A"].values) + result.iloc[key] = 0 - assert obj.iloc[key] == 0 + if using_copy_on_write: + assert obj.iloc[key] == 1 + else: + assert obj.iloc[key] == 0 + # set back to 1 for test below + result.iloc[key] = 1 # Now we do copy. result = obj.set_flags( copy=True, allows_duplicate_labels=allows_duplicate_labels ) result.iloc[key] = 10 - assert obj.iloc[key] == 0 + assert obj.iloc[key] == 1 def test_constructor_expanddim(self): # GH#33628 accessing _constructor_expanddim should not raise NotImplementedError diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 7c33242192d2e..3e22734992d23 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1333,7 +1333,8 @@ def test_combineFrame(self, float_frame, mixed_float_frame, mixed_int_frame): frame_copy = float_frame.reindex(float_frame.index[::2]) del frame_copy["D"] - frame_copy["C"][:5] = np.nan + # adding NAs to first 5 values of column "C" + frame_copy.loc[: frame_copy.index[4], "C"] = np.nan added = float_frame + frame_copy diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 8aa0e980b01c4..fc46047714f13 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -334,7 +334,7 @@ def test_is_mixed_type(self, float_frame, float_string_frame): assert not float_frame._is_mixed_type assert float_string_frame._is_mixed_type - def test_stale_cached_series_bug_473(self): + def test_stale_cached_series_bug_473(self, using_copy_on_write): # this is chained, but ok with option_context("chained_assignment", None): @@ -349,9 +349,12 @@ def test_stale_cached_series_bug_473(self): repr(Y) result = Y.sum() # noqa exp = Y["g"].sum() # noqa - assert pd.isna(Y["g"]["c"]) + if using_copy_on_write: + assert not pd.isna(Y["g"]["c"]) + else: + assert pd.isna(Y["g"]["c"]) - def test_strange_column_corruption_issue(self): + def test_strange_column_corruption_issue(self, using_copy_on_write): # TODO(wesm): Unclear how exactly this is related to internal matters df = DataFrame(index=[0, 1]) df[0] = np.nan @@ -363,7 +366,10 @@ def test_strange_column_corruption_issue(self): if col not in wasCol: wasCol[col] = 1 df[col] = np.nan - df[col][dt] = i + if using_copy_on_write: + df.loc[dt, col] = i + else: + df[col][dt] = i myid = 100 diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 82c7117cc00c6..e8415fc63c51d 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -271,15 +271,22 @@ def test_constructor_dtype_copy(self): new_df["col1"] = 200.0 assert orig_df["col1"][0] == 1.0 - def test_constructor_dtype_nocast_view_dataframe(self): + def test_constructor_dtype_nocast_view_dataframe(self, using_copy_on_write): df = DataFrame([[1, 2]]) should_be_view = DataFrame(df, dtype=df[0].dtype) - should_be_view[0][0] = 99 - assert df.values[0, 0] == 99 + if using_copy_on_write: + # INFO(CoW) doesn't mutate original + should_be_view.iloc[0, 0] = 99 + assert df.values[0, 0] == 1 + else: + should_be_view[0][0] = 99 + assert df.values[0, 0] == 99 - def test_constructor_dtype_nocast_view_2d_array(self, using_array_manager): + def test_constructor_dtype_nocast_view_2d_array( + self, using_array_manager, using_copy_on_write + ): df = DataFrame([[1, 2], [3, 4]], dtype="int64") - if not using_array_manager: + if not using_array_manager and not using_copy_on_write: should_be_view = DataFrame(df.values, dtype=df[0].dtype) should_be_view[0][0] = 97 assert df.values[0, 0] == 97 @@ -2502,7 +2509,13 @@ def test_constructor_list_str_na(self, string_dtype): @pytest.mark.parametrize("copy", [False, True]) def test_dict_nocopy( - self, request, copy, any_numeric_ea_dtype, any_numpy_dtype, using_array_manager + self, + request, + copy, + any_numeric_ea_dtype, + any_numpy_dtype, + using_array_manager, + using_copy_on_write, ): if ( using_array_manager @@ -2575,7 +2588,7 @@ def check_views(c_only: bool = False): # view, so we have to check in the other direction df.iloc[:, 2] = pd.array([45, 46], dtype=c.dtype) assert df.dtypes.iloc[2] == c.dtype - if not copy: + if not copy and not using_copy_on_write: check_views(True) if copy: @@ -2587,7 +2600,7 @@ def check_views(c_only: bool = False): assert b[0] == b.dtype.type(3) # FIXME(GH#35417): enable after GH#35417 assert c[0] == c_orig[0] # i.e. df.iloc[0, 2]=45 did *not* update c - else: + elif not using_copy_on_write: # TODO: we can call check_views if we stop consolidating # in setitem_with_indexer assert c[0] == 45 # i.e. df.iloc[0, 2]=45 *did* update c diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py index 36e117cf03353..d1f25aabe31a2 100644 --- a/pandas/tests/groupby/test_apply_mutate.py +++ b/pandas/tests/groupby/test_apply_mutate.py @@ -72,7 +72,7 @@ def test_apply_function_with_indexing(): ) def fn(x): - x.col2[x.index[-1]] = 0 + x.loc[x.index[-1], "col2"] = 0 return x.col2 result = df.groupby(["col1"], as_index=False).apply(fn) diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py index 72e7e458b4e1f..2cf1cf0f15652 100644 --- a/pandas/tests/indexes/period/test_partial_slicing.py +++ b/pandas/tests/indexes/period/test_partial_slicing.py @@ -12,16 +12,20 @@ class TestPeriodIndex: - def test_getitem_periodindex_duplicates_string_slice(self): + def test_getitem_periodindex_duplicates_string_slice(self, using_copy_on_write): # monotonic idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="A-JUN") ts = Series(np.random.randn(len(idx)), index=idx) + original = ts.copy() result = ts["2007"] expected = ts[1:3] tm.assert_series_equal(result, expected) result[:] = 1 - assert (ts[1:3] == 1).all() + if using_copy_on_write: + tm.assert_series_equal(ts, original) + else: + assert (ts[1:3] == 1).all() # not monotonic idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq="A-JUN") diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 41b5e55e75213..5e21d7b256e39 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -279,7 +279,7 @@ def test_dt_accessor_ambiguous_freq_conversions(self): expected = Series(exp_values, name="xxx") tm.assert_series_equal(ser, expected) - def test_dt_accessor_not_writeable(self): + def test_dt_accessor_not_writeable(self, using_copy_on_write): # no setting allowed ser = Series(date_range("20130101", periods=5, freq="D"), name="xxx") with pytest.raises(ValueError, match="modifications"): @@ -288,8 +288,12 @@ def test_dt_accessor_not_writeable(self): # trying to set a copy msg = "modifications to a property of a datetimelike.+not supported" with pd.option_context("chained_assignment", "raise"): - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_copy_on_write: + # TODO(CoW) it would be nice to keep a warning/error for this case ser.dt.hour[0] = 5 + else: + with pytest.raises(com.SettingWithCopyError, match=msg): + ser.dt.hour[0] = 5 @pytest.mark.parametrize( "method, dates", diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 3a8e14576a55d..3b045fa9eedf0 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -206,7 +206,8 @@ def test_basic_getitem_setitem_corner(datetime_series): datetime_series[[5, slice(None, None)]] = 2 -def test_slice(string_series, object_series): +def test_slice(string_series, object_series, using_copy_on_write): + original = string_series.copy() numSlice = string_series[10:20] numSliceEnd = string_series[-10:] objSlice = object_series[10:20] @@ -224,7 +225,11 @@ def test_slice(string_series, object_series): sl = string_series[10:20] sl[:] = 0 - assert (string_series[10:20] == 0).all() + if using_copy_on_write: + # Doesn't modify parent (CoW) + tm.assert_series_equal(string_series, original) + else: + assert (string_series[10:20] == 0).all() def test_timedelta_assignment(): @@ -241,21 +246,24 @@ def test_timedelta_assignment(): tm.assert_series_equal(s, expected) -def test_underlying_data_conversion(): +def test_underlying_data_conversion(using_copy_on_write): # GH 4080 df = DataFrame({c: [1, 2, 3] for c in ["a", "b", "c"]}) return_value = df.set_index(["a", "b", "c"], inplace=True) assert return_value is None s = Series([1], index=[(2, 2, 2)]) df["val"] = 0 + df_original = df.copy() df df["val"].update(s) - expected = DataFrame( - {"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3], "val": [0, 1, 0]} - ) - return_value = expected.set_index(["a", "b", "c"], inplace=True) - assert return_value is None + if using_copy_on_write: + expected = df_original + else: + expected = DataFrame( + {"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3], "val": [0, 1, 0]} + ) + expected.set_index(["a", "b", "c"], inplace=True) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/series/methods/test_copy.py b/pandas/tests/series/methods/test_copy.py index 8aa5c14812dc0..d681c0d02e0a2 100644 --- a/pandas/tests/series/methods/test_copy.py +++ b/pandas/tests/series/methods/test_copy.py @@ -9,20 +9,28 @@ class TestCopy: - @pytest.mark.parametrize("deep", [None, False, True]) - def test_copy(self, deep): + @pytest.mark.parametrize("deep", ["default", None, False, True]) + def test_copy(self, deep, using_copy_on_write): ser = Series(np.arange(10), dtype="float64") # default deep is True - if deep is None: + if deep == "default": ser2 = ser.copy() else: ser2 = ser.copy(deep=deep) + if using_copy_on_write: + # INFO(CoW) a shallow copy doesn't yet copy the data + # but parent will not be modified (CoW) + if deep is None or deep is False: + assert np.may_share_memory(ser.values, ser2.values) + else: + assert not np.may_share_memory(ser.values, ser2.values) + ser2[::2] = np.NaN - if deep is None or deep is True: + if deep is not False or using_copy_on_write: # Did not modify original Series assert np.isnan(ser2[0]) assert not np.isnan(ser[0]) @@ -31,8 +39,8 @@ def test_copy(self, deep): assert np.isnan(ser2[0]) assert np.isnan(ser[0]) - @pytest.mark.parametrize("deep", [None, False, True]) - def test_copy_tzaware(self, deep): + @pytest.mark.parametrize("deep", ["default", None, False, True]) + def test_copy_tzaware(self, deep, using_copy_on_write): # GH#11794 # copy of tz-aware expected = Series([Timestamp("2012/01/01", tz="UTC")]) @@ -40,15 +48,23 @@ def test_copy_tzaware(self, deep): ser = Series([Timestamp("2012/01/01", tz="UTC")]) - if deep is None: + if deep == "default": ser2 = ser.copy() else: ser2 = ser.copy(deep=deep) + if using_copy_on_write: + # INFO(CoW) a shallow copy doesn't yet copy the data + # but parent will not be modified (CoW) + if deep is None or deep is False: + assert np.may_share_memory(ser.values, ser2.values) + else: + assert not np.may_share_memory(ser.values, ser2.values) + ser2[0] = Timestamp("1999/01/01", tz="UTC") # default deep is True - if deep is None or deep is True: + if deep is not False or using_copy_on_write: # Did not modify original Series tm.assert_series_equal(ser2, expected2) tm.assert_series_equal(ser, expected) diff --git a/pandas/tests/series/methods/test_update.py b/pandas/tests/series/methods/test_update.py index d9d6641d54237..6403fcf76122a 100644 --- a/pandas/tests/series/methods/test_update.py +++ b/pandas/tests/series/methods/test_update.py @@ -14,7 +14,7 @@ class TestUpdate: - def test_update(self): + def test_update(self, using_copy_on_write): s = Series([1.5, np.nan, 3.0, 4.0, np.nan]) s2 = Series([np.nan, 3.5, np.nan, 5.0]) s.update(s2) @@ -25,11 +25,15 @@ def test_update(self): # GH 3217 df = DataFrame([{"a": 1}, {"a": 3, "b": 2}]) df["c"] = np.nan + df_orig = df.copy() df["c"].update(Series(["foo"], index=[0])) - expected = DataFrame( - [[1, np.nan, "foo"], [3, 2.0, np.nan]], columns=["a", "b", "c"] - ) + if using_copy_on_write: + expected = df_orig + else: + expected = DataFrame( + [[1, np.nan, "foo"], [3, 2.0, np.nan]], columns=["a", "b", "c"] + ) tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( From 36faac9b3f8f35a40edde3fa0e1ff336b2b8838f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 May 2022 11:26:21 +0200 Subject: [PATCH 03/27] Handle CoW in BM.iset --- pandas/core/internals/blocks.py | 11 +++- pandas/core/internals/managers.py | 45 +++++++++++++--- pandas/tests/indexing/test_copy_on_write.py | 58 +++++++++++++++++++++ pandas/tests/indexing/test_iloc.py | 6 ++- pandas/tests/indexing/test_loc.py | 6 ++- 5 files changed, 116 insertions(+), 10 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index c8430a9266ea5..70b0f873bdbe8 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -829,10 +829,13 @@ def _slice( return self.values[slicer] - def set_inplace(self, locs, values: ArrayLike) -> None: + def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None: """ Modify block values in-place with new item value. + If copy=True, first copy the underlying values in place before modifying + (for Copy-on-Write). + Notes ----- `set_inplace` never creates a new array or new Block, whereas `setitem` @@ -840,6 +843,8 @@ def set_inplace(self, locs, values: ArrayLike) -> None: Caller is responsible for checking values.dtype == self.dtype. """ + if copy: + self.values = self.values.copy() self.values[locs] = values def take_nd( @@ -1653,9 +1658,11 @@ def iget(self, i: int | tuple[int, int] | tuple[slice, int]): raise IndexError(f"{self} only contains one item") return self.values - def set_inplace(self, locs, values: ArrayLike) -> None: + def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None: # When an ndarray, we should have locs.tolist() == [0] # When a BlockPlacement we should have list(locs) == [0] + if copy: + self.values = self.values.copy() self.values[:] = values def _maybe_squeeze_arg(self, arg): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 08c3bca971f83..08585d3e7c719 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -239,12 +239,20 @@ def _has_no_reference(self, i: int) -> bool: Returns True if the columns has no references. """ blkno = self.blknos[i] + return self._has_no_reference_block(blkno) + + def _has_no_reference_block(self, blkno: int) -> bool: + """ + Check for column `i` if has references. + (whether it references another array or is itself being referenced) + Returns True if the columns has no references. + """ # TODO include `or self.refs[blkno]() is None` ? return ( self.refs is None or self.refs[blkno] is None ) and weakref.getweakrefcount(self.blocks[blkno]) == 0 - def _clear_reference(self, blkno: int) -> None: + def _clear_reference_block(self, blkno: int) -> None: """ Clear any reference for column `i`. """ @@ -369,7 +377,7 @@ def setitem(self: T, indexer, value) -> T: # if being referenced -> perform Copy-on-Write and clear the reference # self.blocks = tuple([self.blocks[0].copy()]) self = self.copy() - # self._clear_reference(blkno) + # self._clear_reference_block(blkno) return self.apply("setitem", indexer=indexer, value=value) @@ -1179,7 +1187,12 @@ def value_getitem(placement): blk = self.blocks[blkno_l] blk_locs = blklocs[val_locs.indexer] if inplace and blk.should_store(value): - blk.set_inplace(blk_locs, value_getitem(val_locs)) + # Updating inplace -> check if we need to do Copy-on-Write + if not self._has_no_reference_block(blkno_l): + blk.set_inplace(blk_locs, value_getitem(val_locs), copy=True) + self._clear_reference_block(blkno_l) + else: + blk.set_inplace(blk_locs, value_getitem(val_locs)) else: unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) unfit_val_locs.append(val_locs) @@ -1194,9 +1207,11 @@ def value_getitem(placement): ) self.blocks = blocks_tup self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) + # blk.delete gives a copy, so we can remove a possible reference + self._clear_reference_block(blkno_l) if len(removed_blknos): - # Remove blocks & update blknos accordingly + # Remove blocks & update blknos and refs accordingly is_deleted = np.zeros(self.nblocks, dtype=np.bool_) is_deleted[removed_blknos] = True @@ -1207,6 +1222,12 @@ def value_getitem(placement): self.blocks = tuple( blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos) ) + if self.refs is not None: + self.refs = [ + ref + for i, ref in enumerate(self.refs) + if i not in set(removed_blknos) + ] if unfit_val_locs: unfit_idxr = np.concatenate(unfit_mgr_locs) @@ -1243,6 +1264,10 @@ def value_getitem(placement): self._blklocs[unfit_idxr] = np.arange(unfit_count) self.blocks += tuple(new_blocks) + # TODO(CoW) is this always correct to assume that the new_blocks + # are not referencing anything else? + if self.refs is not None: + self.refs = list(self.refs) + [None] * len(new_blocks) # Newly created block's dtype may already be present. self._known_consolidated = False @@ -1260,14 +1285,20 @@ def _iset_single( # Caller is responsible for verifying value.shape if inplace and blk.should_store(value): + copy = False + if not self._has_no_reference_block(blkno): + # perform Copy-on-Write and clear the reference + copy = True + self._clear_reference_block(blkno) iloc = self.blklocs[loc] - blk.set_inplace(slice(iloc, iloc + 1), value) + blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy) return nb = new_block_2d(value, placement=blk._mgr_locs) old_blocks = self.blocks new_blocks = old_blocks[:blkno] + (nb,) + old_blocks[blkno + 1 :] self.blocks = new_blocks + self._clear_reference_block(blkno) return def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: @@ -1280,6 +1311,8 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: item : hashable value : np.ndarray or ExtensionArray """ + # TODO handle CoW (insert into refs as well) + # insert to the axis; this could possibly raise a TypeError new_axis = self.items.insert(loc, item) @@ -1370,7 +1403,7 @@ def column_setitem(self, loc: int, idx: int | slice | np.ndarray, value): blocks = list(self.blocks) blocks[blkno] = blocks[blkno].copy() self.blocks = tuple(blocks) - self._clear_reference(blkno) + self._clear_reference_block(blkno) col_mgr = self.iget(loc) new_mgr = col_mgr.setitem((idx,), value) diff --git a/pandas/tests/indexing/test_copy_on_write.py b/pandas/tests/indexing/test_copy_on_write.py index 61dc762856fe2..cb84f80c2efa9 100644 --- a/pandas/tests/indexing/test_copy_on_write.py +++ b/pandas/tests/indexing/test_copy_on_write.py @@ -215,6 +215,8 @@ def test_subset_row_slice(using_copy_on_write): # with tm.assert_produces_warning(com.SettingWithCopyWarning): subset.iloc[0, 0] = 0 + subset._mgr._verify_integrity() + expected = pd.DataFrame( {"a": [0, 3], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3) ) @@ -333,6 +335,7 @@ def test_subset_set_column(using_copy_on_write): with tm.assert_produces_warning(com.SettingWithCopyWarning): subset["a"] = np.array([10, 11], dtype="int64") + subset._mgr._verify_integrity() expected = pd.DataFrame( {"a": [10, 11], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3) ) @@ -340,6 +343,56 @@ def test_subset_set_column(using_copy_on_write): tm.assert_frame_equal(df, df_orig) +@pytest.mark.parametrize( + "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] +) +def test_subset_set_column_with_loc(using_copy_on_write, dtype): + # Case: setting a single column with loc on a viewing subset + # -> subset.loc[:, col] = value + df = pd.DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} + ) + df_orig = df.copy() + subset = df[1:3] + + if using_copy_on_write: + subset.loc[:, "a"] = np.array([10, 11], dtype="int64") + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset.loc[:, "a"] = np.array([10, 11], dtype="int64") + + subset._mgr._verify_integrity() + expected = pd.DataFrame( + {"a": [10, 11], "b": [5, 6], "c": np.array([8, 9], dtype=dtype)}, + index=range(1, 3), + ) + tm.assert_frame_equal(subset, expected) + tm.assert_frame_equal(df, df_orig) + + +def test_subset_set_column_with_loc2(using_copy_on_write): + # Case: setting a single column with loc on a viewing subset + # -> subset.loc[:, col] = value + # separate test for case of DataFrame of a single column -> takes a separate + # code path + df = pd.DataFrame({"a": [1, 2, 3]}) + df_orig = df.copy() + subset = df[1:3] + + if using_copy_on_write: + subset.loc[:, "a"] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset.loc[:, "a"] = 0 + + subset._mgr._verify_integrity() + expected = pd.DataFrame({"a": [0, 0]}, index=range(1, 3)) + tm.assert_frame_equal(subset, expected) + tm.assert_frame_equal(df, df_orig) + + @pytest.mark.parametrize( "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] ) @@ -359,6 +412,10 @@ def test_subset_set_columns(using_copy_on_write, dtype): with tm.assert_produces_warning(com.SettingWithCopyWarning): subset[["a", "c"]] = 0 + subset._mgr._verify_integrity() + if using_copy_on_write: + # first and third column should certainly have no references anymore + assert all(subset._mgr._has_no_reference(i) for i in [0, 2]) expected = pd.DataFrame({"a": [0, 0], "b": [5, 6], "c": [0, 0]}, index=range(1, 3)) tm.assert_frame_equal(subset, expected) tm.assert_frame_equal(df, df_orig) @@ -383,6 +440,7 @@ def test_subset_set_with_column_indexer(indexer, using_copy_on_write): with tm.assert_produces_warning(com.SettingWithCopyWarning): subset.loc[:, indexer] = 0 + subset._mgr._verify_integrity() expected = pd.DataFrame( {"a": [0, 0], "b": [0.0, 0.0], "c": [5, 6]}, index=range(1, 3) ) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 29abc0ffce727..9df56fbde513c 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -863,8 +863,12 @@ def test_identity_slice_returns_new_object( assert np.shares_memory(original_df["a"], sliced_df["a"]) # Setting using .loc[:, "a"] sets inplace so alters both sliced and orig + # depending on CoW original_df.loc[:, "a"] = [4, 4, 4] - assert (sliced_df["a"] == 4).all() + if using_copy_on_write: + assert (sliced_df["a"] == [1, 2, 3]).all() + else: + assert (sliced_df["a"] == 4).all() original_series = Series([1, 2, 3, 4, 5, 6]) sliced_series = original_series.iloc[:] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index e19002b6a8701..b90bb0b2d2e92 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1069,8 +1069,12 @@ def test_identity_slice_returns_new_object( assert np.shares_memory(original_df["a"]._values, sliced_df["a"]._values) # Setting using .loc[:, "a"] sets inplace so alters both sliced and orig + # depending on CoW original_df.loc[:, "a"] = [4, 4, 4] - assert (sliced_df["a"] == 4).all() + if using_copy_on_write: + assert (sliced_df["a"] == [1, 2, 3]).all() + else: + assert (sliced_df["a"] == 4).all() # These should not return copies assert original_df is original_df.loc[:, :] From ffa24a44df4b0d03cc815bc70d8e3945b1701431 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 May 2022 11:50:56 +0200 Subject: [PATCH 04/27] Handle CoW in xs --- pandas/core/frame.py | 5 +++++ pandas/core/generic.py | 5 +++++ pandas/tests/apply/test_frame_apply.py | 7 ++----- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2da4523b96909..c5ea4738fba2c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -30,6 +30,7 @@ overload, ) import warnings +import weakref import numpy as np import numpy.ma as ma @@ -3495,6 +3496,10 @@ def _ixs(self, i: int, axis: int = 0): dtype=new_values.dtype, ).__finalize__(self) result._set_is_copy(self, copy=copy) + # TODO(CoW) cleaner solution (eg let fast_xs return a SingleBM?) + if not copy: + # assert len(self._mgr.blocks) == 1 + result._mgr.refs = [weakref.ref(self._mgr.blocks[0])] return result # icol diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e2d171a2d6b66..135c5446156de 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3823,6 +3823,11 @@ class animal locomotion name=self.index[loc], dtype=new_values.dtype, ).__finalize__(self) + # TODO(CoW) cleaner solution (eg let fast_xs return a SingleBM?) + if len(self._mgr.blocks) == 1: + # in the case of a single block, new_values is a view + result._mgr.refs = [weakref.ref(self._mgr.blocks[0])] + elif is_scalar(loc): result = self.iloc[:, slice(loc, loc + 1)] elif axis == 1: diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index fce5d25829b27..9bdd021cac7b9 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -1451,11 +1451,8 @@ def func(row): assert row._mgr is not mgr return row - if using_copy_on_write: - expected = df.copy() - expected["a"] += 1 - else: - expected = df_orig + expected = df.copy() + expected["a"] += 1 result = df.apply(func, axis=1) From 8c00bbd3e46973d9939b5ce4fea32579876dbe38 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 May 2022 12:46:07 +0200 Subject: [PATCH 05/27] add bunch of todo comments and usage warnings --- pandas/core/frame.py | 6 +++++- pandas/core/indexing.py | 5 ++++- pandas/core/internals/managers.py | 23 ++++++++++++++++++++++- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c5ea4738fba2c..b7a1713f9795f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3518,6 +3518,7 @@ def _get_column_array(self, i: int) -> ArrayLike: Get the values of the i'th column (ndarray or ExtensionArray, as stored in the Block) """ + # TODO(CoW) check if all use case are OK (i.e. don't need CoW handling) return self._mgr.iget_values(i) def _iter_column_arrays(self) -> Iterator[ArrayLike]: @@ -3929,7 +3930,10 @@ def _set_value( Sets whether or not index/col interpreted as indexers """ try: - if get_option("mode.copy_on_write"): + if ( + get_option("mode.copy_on_write") + and get_option("mode.data_manager") == "block" + ): if not takeable: icol = self.columns.get_loc(col) index = self.index.get_loc(index) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 36c2b97387886..8386c995a2330 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1950,7 +1950,10 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): """ pi = plane_indexer - if get_option("mode.copy_on_write"): + if ( + get_option("mode.copy_on_write") + and get_option("mode.data_manager") == "block" + ): # CoW: in this case we cannot rely on getting the column as a # Series to mutate, but need to operated on the mgr directly if com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj)): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 08585d3e7c719..ec1c37149cd52 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -271,6 +271,9 @@ def arrays(self) -> list[ArrayLike]: Only for compatibility with ArrayManager for testing convenience. Not to be used in actual code, and return value is not the same as the ArrayManager method (list of 1D arrays vs iterator of 2D ndarrays / 1D EAs). + + Warning! The returned arrays don't handle Copy-on-Write, so this should + be used with caution (only in read-mode). """ return [blk.values for blk in self.blocks] @@ -556,6 +559,7 @@ def _combine( self: T, blocks: list[Block], copy: bool = True, index: Index | None = None ) -> T: """return a new manager with the blocks""" + # TODO(CoW) handle setting refs if len(blocks) == 0: if self.ndim == 2: # retain our own Index dtype @@ -1042,6 +1046,9 @@ def fast_xs(self, loc: int) -> ArrayLike: """ Return the array corresponding to `frame.iloc[loc]`. + Warning! The returned array is a view in case of a single block, + but doesn't handle Copy-on-Write, so this should be used with caution. + Parameters ---------- loc : int @@ -1086,7 +1093,11 @@ def iget(self, i: int) -> SingleBlockManager: def iget_values(self, i: int) -> ArrayLike: """ Return the data for column i as the values (ndarray or ExtensionArray). + + Warning! The returned array is a view but doesn't handle Copy-on-Write, + so this should be used with caution. """ + # TODO(CoW) making the arrays read-only might make this safer to use? block = self.blocks[self.blknos[i]] values = block.iget(self.blklocs[i]) return values @@ -1096,6 +1107,9 @@ def column_arrays(self) -> list[np.ndarray]: """ Used in the JSON C code to access column arrays. This optimizes compared to using `iget_values` by converting each + + Warning! This doesn't handle Copy-on-Write, so should be used with + caution (current use case of consuming this in the JSON code is fine). """ # This is an optimized equivalent to # result = [self.iget_values(i) for i in range(len(self.items))] @@ -1311,7 +1325,7 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: item : hashable value : np.ndarray or ExtensionArray """ - # TODO handle CoW (insert into refs as well) + # TODO(CoW) handle CoW (insert into refs as well) # insert to the axis; this could possibly raise a TypeError new_axis = self.items.insert(loc, item) @@ -1662,6 +1676,7 @@ def as_array( ------- arr : ndarray """ + # TODO(CoW) handle case where resulting array is a view if len(self.blocks) == 0: arr = np.empty(self.shape, dtype=float) return arr.transpose() @@ -1781,6 +1796,7 @@ def _consolidate_check(self) -> None: self._known_consolidated = True def _consolidate_inplace(self) -> None: + # TODO(CoW) correctly handle passing through refs # In general, _consolidate_inplace should only be called via # DataFrame._consolidate_inplace, otherwise we will fail to invalidate # the DataFrame's _item_cache. The exception is for newly-created @@ -1853,6 +1869,7 @@ def to_2d_mgr(self, columns: Index) -> BlockManager: """ Manager analogue of Series.to_frame """ + # TODO(CoW) pass ref? blk = self.blocks[0] arr = ensure_block_shape(blk.values, ndim=2) bp = BlockPlacement(0) @@ -1979,6 +1996,7 @@ def array_values(self): return self._block.array_values def get_numeric_data(self, copy: bool = False): + # TODO(CoW) set refs? if self._block.is_numeric: if copy: return self.copy() @@ -2034,6 +2052,9 @@ def set_values(self, values: ArrayLike): Use at your own risk! This does not check if the passed values are valid for the current Block/SingleBlockManager (length, dtype, etc). """ + # TODO(CoW) do we need to handle copy on write here? Currently this is + # only used for FrameColumnApply.series_generator (what if apply is + # mutating inplace?) self.blocks[0].values = values self.blocks[0]._mgr_locs = BlockPlacement(slice(len(values))) From af5a02ca09527a7fd66e7ba0eddad7ff7d441513 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 May 2022 14:50:10 +0200 Subject: [PATCH 06/27] Insert None ref in BM.insert --- pandas/core/internals/managers.py | 5 +++-- pandas/io/stata.py | 1 - pandas/tests/frame/methods/test_rename.py | 7 +++++-- pandas/tests/frame/methods/test_to_dict_of_blocks.py | 9 ++++++--- pandas/tests/indexing/test_copy_on_write.py | 1 + 5 files changed, 15 insertions(+), 8 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index ec1c37149cd52..bb36d0ce6d56a 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1325,8 +1325,6 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: item : hashable value : np.ndarray or ExtensionArray """ - # TODO(CoW) handle CoW (insert into refs as well) - # insert to the axis; this could possibly raise a TypeError new_axis = self.items.insert(loc, item) @@ -1352,6 +1350,9 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: self.axes[0] = new_axis self.blocks += (block,) + # TODO(CoW) do we always "own" the passed `value`? + if self.refs is not None: + self.refs += [None] self._known_consolidated = False diff --git a/pandas/io/stata.py b/pandas/io/stata.py index f6623c24875ea..621563e03d1d5 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2506,7 +2506,6 @@ def _set_formats_and_types(self, dtypes: Series) -> None: def _prepare_pandas(self, data: DataFrame) -> None: # NOTE: we might need a different API / class for pandas objects so # we can set different semantics - handle this with a PR to pandas.io - data = data.copy() if self._write_index: diff --git a/pandas/tests/frame/methods/test_rename.py b/pandas/tests/frame/methods/test_rename.py index 33fb191027c27..3fd3f365167ab 100644 --- a/pandas/tests/frame/methods/test_rename.py +++ b/pandas/tests/frame/methods/test_rename.py @@ -171,13 +171,16 @@ def test_rename_multiindex(self): tm.assert_index_equal(renamed.index, new_index) @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) setitem copy/view - def test_rename_nocopy(self, float_frame): + def test_rename_nocopy(self, float_frame, using_copy_on_write): renamed = float_frame.rename(columns={"C": "foo"}, copy=False) assert np.shares_memory(renamed["foo"]._values, float_frame["C"]._values) renamed.loc[:, "foo"] = 1.0 - assert (float_frame["C"] == 1.0).all() + if using_copy_on_write: + assert not (float_frame["C"] == 1.0).all() + else: + assert (float_frame["C"] == 1.0).all() def test_rename_inplace(self, float_frame): float_frame.rename(columns={"C": "foo"}) diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/pandas/tests/frame/methods/test_to_dict_of_blocks.py index c81bed9d93cc4..eb9b78610a112 100644 --- a/pandas/tests/frame/methods/test_to_dict_of_blocks.py +++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py @@ -27,7 +27,7 @@ def test_copy_blocks(self, float_frame): # make sure we did not change the original DataFrame assert not _df[column].equals(df[column]) - def test_no_copy_blocks(self, float_frame): + def test_no_copy_blocks(self, float_frame, using_copy_on_write): # GH#9607 df = DataFrame(float_frame, copy=True) column = df.columns[0] @@ -38,8 +38,11 @@ def test_no_copy_blocks(self, float_frame): if column in _df: _df.loc[:, column] = _df[column] + 1 - # make sure we did change the original DataFrame - assert _df[column].equals(df[column]) + if not using_copy_on_write: + # make sure we did change the original DataFrame + assert _df[column].equals(df[column]) + else: + assert not _df[column].equals(df[column]) def test_to_dict_of_blocks_item_cache(): diff --git a/pandas/tests/indexing/test_copy_on_write.py b/pandas/tests/indexing/test_copy_on_write.py index cb84f80c2efa9..8b5f74ce968fb 100644 --- a/pandas/tests/indexing/test_copy_on_write.py +++ b/pandas/tests/indexing/test_copy_on_write.py @@ -72,6 +72,7 @@ def test_reset_index(using_copy_on_write): ) df_orig = df.copy() df2 = df.reset_index() + df2._mgr._verify_integrity() if using_copy_on_write: # still shares memory (df2 is a shallow copy) From a64f310b5204fe82829fd33846136379fe795528 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 May 2022 23:24:50 +0200 Subject: [PATCH 07/27] Ensure to not assume copy_on_write is set in case of ArrayManager --- pandas/conftest.py | 2 +- pandas/core/frame.py | 2 +- pandas/core/generic.py | 7 +++++-- pandas/core/series.py | 5 ++++- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index a35a1bb184551..a34413acf5240 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1803,4 +1803,4 @@ def using_copy_on_write(): """ Fixture to check if the array manager is being used. """ - return pd.options.mode.copy_on_write + return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b7a1713f9795f..eb2b95c87440a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3497,7 +3497,7 @@ def _ixs(self, i: int, axis: int = 0): ).__finalize__(self) result._set_is_copy(self, copy=copy) # TODO(CoW) cleaner solution (eg let fast_xs return a SingleBM?) - if not copy: + if not copy and isinstance(self._mgr, BlockManager): # assert len(self._mgr.blocks) == 1 result._mgr.refs = [weakref.ref(self._mgr.blocks[0])] return result diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 135c5446156de..c67eb4176da42 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3824,7 +3824,7 @@ class animal locomotion dtype=new_values.dtype, ).__finalize__(self) # TODO(CoW) cleaner solution (eg let fast_xs return a SingleBM?) - if len(self._mgr.blocks) == 1: + if isinstance(self._mgr, BlockManager) and len(self._mgr.blocks) == 1: # in the case of a single block, new_values is a view result._mgr.refs = [weakref.ref(self._mgr.blocks[0])] @@ -3910,7 +3910,10 @@ def _check_setitem_copy(self, t="setting", force=False): df.iloc[0:5]['group'] = 'a' """ - if config.get_option("mode.copy_on_write"): + if ( + config.get_option("mode.copy_on_write") + and config.get_option("mode.data_manager") == "block" + ): return # return early if the check is not needed diff --git a/pandas/core/series.py b/pandas/core/series.py index 96a5101e17c1e..13710e0946804 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1291,7 +1291,10 @@ def _maybe_update_cacher( # if the Series changed, and always pop the cached item # TODO replace False with check for option elif ( - not get_option("mode.copy_on_write") + not ( + get_option("mode.copy_on_write") + and get_option("mode.data_manager") == "block" + ) and len(self) == len(ref) and self.name in ref.columns ): From 977cbb8121ea22a7f76072a6a94f095c442e044d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 10 May 2022 21:13:38 +0200 Subject: [PATCH 08/27] Handle refs in BM._combine / test CoW in select_dtypes --- pandas/core/internals/managers.py | 20 ++++++++++++------- pandas/tests/indexing/test_copy_on_write.py | 22 +++++++++++++++++++++ 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index bb36d0ce6d56a..69d63f6798354 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -247,7 +247,7 @@ def _has_no_reference_block(self, blkno: int) -> bool: (whether it references another array or is itself being referenced) Returns True if the columns has no references. """ - # TODO include `or self.refs[blkno]() is None` ? + # TODO(CoW) include `or self.refs[blkno]() is None` ? return ( self.refs is None or self.refs[blkno] is None ) and weakref.getweakrefcount(self.blocks[blkno]) == 0 @@ -387,7 +387,7 @@ def setitem(self: T, indexer, value) -> T: def putmask(self, mask, new, align: bool = True): if self.refs is not None and not all(ref is None for ref in self.refs): # some reference -> copy full dataframe - # TODO this could be optimized to only copy the blocks that would + # TODO(CoW) this could be optimized to only copy the blocks that would # get modified # self.blocks = tuple([blk.copy() for blk in self.blocks]) # self.refs = None @@ -575,17 +575,23 @@ def _combine( inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) new_blocks: list[Block] = [] + # TODO(CoW) we could optimize here if we know that the passed blocks + # are fully "owned" (eg created from an operation, not coming from + # an existing manager) + new_refs: list[weakref.ref | None] | None = None if copy else [] for b in blocks: - b = b.copy(deep=copy) - b.mgr_locs = BlockPlacement(inv_indexer[b.mgr_locs.indexer]) - new_blocks.append(b) + nb = b.copy(deep=copy) + nb.mgr_locs = BlockPlacement(inv_indexer[nb.mgr_locs.indexer]) + new_blocks.append(nb) + if not copy: + new_refs.append(weakref.ref(b)) axes = list(self.axes) if index is not None: axes[-1] = index axes[0] = self.items.take(indexer) - return type(self).from_blocks(new_blocks, axes) + return type(self).from_blocks(new_blocks, axes, new_refs) @property def nblocks(self) -> int: @@ -1956,7 +1962,7 @@ def getitem_mgr(self, indexer: slice | npt.NDArray[np.bool_]) -> SingleBlockMana block = type(blk)(array, placement=bp, ndim=1) new_idx = self.index[indexer] - # TODO in theory only need to track reference if new_array is a view + # TODO(CoW) in theory only need to track reference if new_array is a view ref = weakref.ref(blk) return type(self)(block, new_idx, [ref]) diff --git a/pandas/tests/indexing/test_copy_on_write.py b/pandas/tests/indexing/test_copy_on_write.py index 8b5f74ce968fb..a88467dd9c145 100644 --- a/pandas/tests/indexing/test_copy_on_write.py +++ b/pandas/tests/indexing/test_copy_on_write.py @@ -144,6 +144,28 @@ def test_reindex_columns(using_copy_on_write): tm.assert_frame_equal(df, df_orig) +def test_select_dtypes(using_copy_on_write): + # Case: selecting columns using `select_dtypes()` returns a new dataframe + # + afterwards modifying the result + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.select_dtypes("int64") + df2._mgr._verify_integrity() + + # currently this always returns a "view" + assert np.may_share_memory(df2["a"].values, df["a"].values) + + # mutating df2 triggers a copy-on-write for that column/block + df2.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.may_share_memory(df2["a"].values, df["a"].values) + tm.assert_frame_equal(df, df_orig) + else: + # but currently select_dtypes() actually returns a view -> mutates parent + df_orig.iloc[0, 0] = 0 + tm.assert_frame_equal(df, df_orig) + + # # ----------------------------------------------------------------------------- # # Indexing operations taking subset + modifying the subset/parent From cc367f51e755952acf76a79bc3a8cb83564a791e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 10 May 2022 21:25:50 +0200 Subject: [PATCH 09/27] handle get_numeric_data for single block manager --- pandas/core/internals/managers.py | 8 ++------ pandas/tests/series/methods/test_get_numeric_data.py | 9 ++++++++- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 69d63f6798354..841bc7e7d7a69 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -559,7 +559,6 @@ def _combine( self: T, blocks: list[Block], copy: bool = True, index: Index | None = None ) -> T: """return a new manager with the blocks""" - # TODO(CoW) handle setting refs if len(blocks) == 0: if self.ndim == 2: # retain our own Index dtype @@ -1876,7 +1875,7 @@ def to_2d_mgr(self, columns: Index) -> BlockManager: """ Manager analogue of Series.to_frame """ - # TODO(CoW) pass ref? + # TODO(CoW) pass ref to ensure CoW for to_frame blk = self.blocks[0] arr = ensure_block_shape(blk.values, ndim=2) bp = BlockPlacement(0) @@ -2003,11 +2002,8 @@ def array_values(self): return self._block.array_values def get_numeric_data(self, copy: bool = False): - # TODO(CoW) set refs? if self._block.is_numeric: - if copy: - return self.copy() - return self + return self.copy(deep=copy) return self.make_empty() @property diff --git a/pandas/tests/series/methods/test_get_numeric_data.py b/pandas/tests/series/methods/test_get_numeric_data.py index e386f4b5b1dec..60dd64d7e1948 100644 --- a/pandas/tests/series/methods/test_get_numeric_data.py +++ b/pandas/tests/series/methods/test_get_numeric_data.py @@ -7,13 +7,20 @@ class TestGetNumericData: - def test_get_numeric_data_preserve_dtype(self): + def test_get_numeric_data_preserve_dtype(self, using_copy_on_write): # get the numeric data obj = Series([1, 2, 3]) result = obj._get_numeric_data() tm.assert_series_equal(result, obj) + # returned object is a shallow copy + result.iloc[0] = 0 + if using_copy_on_write: + assert obj.iloc[0] == 1 + else: + assert obj.iloc[0] == 0 + obj = Series([1, "2", 3.0]) result = obj._get_numeric_data() expected = Series([], dtype=object, index=Index([], dtype=object)) From 53a8273ef487a05b232c9917d288923a3ce2f644 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 11 May 2022 12:25:03 +0200 Subject: [PATCH 10/27] fix test_internals (get_numeric_data now uses CoW) --- pandas/tests/internals/test_internals.py | 60 +++++++++++++++++------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 3c90eee5be999..b30b27f5bae1a 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -744,7 +744,7 @@ def test_reindex_items(self): mgr.iget(3).internal_values(), reindexed.iget(3).internal_values() ) - def test_get_numeric_data(self): + def test_get_numeric_data(self, using_copy_on_write): mgr = create_mgr( "int: int; float: float; complex: complex;" "str: object; bool: bool; obj: object; dt: datetime", @@ -765,10 +765,16 @@ def test_get_numeric_data(self): np.array([100.0, 200.0, 300.0]), inplace=True, ) - tm.assert_almost_equal( - mgr.iget(mgr.items.get_loc("float")).internal_values(), - np.array([100.0, 200.0, 300.0]), - ) + if using_copy_on_write: + tm.assert_almost_equal( + mgr.iget(mgr.items.get_loc("float")).internal_values(), + np.array([1.0, 1.0, 1.0]), + ) + else: + tm.assert_almost_equal( + mgr.iget(mgr.items.get_loc("float")).internal_values(), + np.array([100.0, 200.0, 300.0]), + ) numeric2 = mgr.get_numeric_data(copy=True) tm.assert_index_equal(numeric.items, Index(["int", "float", "complex", "bool"])) @@ -777,12 +783,18 @@ def test_get_numeric_data(self): np.array([1000.0, 2000.0, 3000.0]), inplace=True, ) - tm.assert_almost_equal( - mgr.iget(mgr.items.get_loc("float")).internal_values(), - np.array([100.0, 200.0, 300.0]), - ) + if using_copy_on_write: + tm.assert_almost_equal( + mgr.iget(mgr.items.get_loc("float")).internal_values(), + np.array([1.0, 1.0, 1.0]), + ) + else: + tm.assert_almost_equal( + mgr.iget(mgr.items.get_loc("float")).internal_values(), + np.array([100.0, 200.0, 300.0]), + ) - def test_get_bool_data(self): + def test_get_bool_data(self, using_copy_on_write): msg = "object-dtype columns with all-bool values" mgr = create_mgr( "int: int; float: float; complex: complex;" @@ -800,19 +812,31 @@ def test_get_bool_data(self): ) bools.iset(0, np.array([True, False, True]), inplace=True) - tm.assert_numpy_array_equal( - mgr.iget(mgr.items.get_loc("bool")).internal_values(), - np.array([True, False, True]), - ) + if using_copy_on_write: + tm.assert_numpy_array_equal( + mgr.iget(mgr.items.get_loc("bool")).internal_values(), + np.array([True, True, True]), + ) + else: + tm.assert_numpy_array_equal( + mgr.iget(mgr.items.get_loc("bool")).internal_values(), + np.array([True, False, True]), + ) # Check sharing with tm.assert_produces_warning(FutureWarning, match=msg): bools2 = mgr.get_bool_data(copy=True) bools2.iset(0, np.array([False, True, False])) - tm.assert_numpy_array_equal( - mgr.iget(mgr.items.get_loc("bool")).internal_values(), - np.array([True, False, True]), - ) + if using_copy_on_write: + tm.assert_numpy_array_equal( + mgr.iget(mgr.items.get_loc("bool")).internal_values(), + np.array([True, True, True]), + ) + else: + tm.assert_numpy_array_equal( + mgr.iget(mgr.items.get_loc("bool")).internal_values(), + np.array([True, False, True]), + ) def test_unicode_repr_doesnt_raise(self): repr(create_mgr("b,\u05d0: object")) From d3d26f46af48b9240374a5f80cd43519c889822b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 20 May 2022 14:56:11 +0200 Subject: [PATCH 11/27] handle refs in consolidation --- pandas/core/internals/managers.py | 42 +++++++++++++++++---- pandas/tests/indexing/test_copy_on_write.py | 37 ++++++++++++++++++ 2 files changed, 71 insertions(+), 8 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 841bc7e7d7a69..cd264887dcc99 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -660,7 +660,7 @@ def consolidate(self: T) -> T: if self.is_consolidated(): return self - bm = type(self)(self.blocks, self.axes, verify_integrity=False) + bm = type(self)(self.blocks, self.axes, self.refs, verify_integrity=False) bm._is_consolidated = False bm._consolidate_inplace() return bm @@ -1808,7 +1808,10 @@ def _consolidate_inplace(self) -> None: # the DataFrame's _item_cache. The exception is for newly-created # BlockManager objects not yet attached to a DataFrame. if not self.is_consolidated(): - self.blocks = tuple(_consolidate(self.blocks)) + if self.refs is None: + self.blocks = _consolidate(self.blocks) + else: + self.blocks, self.refs = _consolidate_with_refs(self.blocks, self.refs) self._is_consolidated = True self._known_consolidated = True self._rebuild_blknos_and_blklocs() @@ -2251,19 +2254,42 @@ def _consolidate(blocks: tuple[Block, ...]) -> list[Block]: new_blocks: list[Block] = [] for (_can_consolidate, dtype), group_blocks in grouper: - merged_blocks = _merge_blocks( + merged_blocks, _ = _merge_blocks( list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate ) new_blocks = extend_blocks(merged_blocks, new_blocks) - return new_blocks + return tuple(new_blocks) + + +def _consolidate_with_refs(blocks: tuple[Block, ...], refs) -> list[Block]: + """ + Merge blocks having same dtype, exclude non-consolidating blocks, handling + refs + """ + gkey = lambda x: x[0]._consolidate_key + grouper = itertools.groupby(sorted(zip(blocks, refs), key=gkey), gkey) + + new_blocks: list[Block] = [] + new_refs: list[weakref.ref | None] = [] + for (_can_consolidate, dtype), group_blocks_refs in grouper: + group_blocks, group_refs = list(zip(*list(group_blocks_refs))) + merged_blocks, consolidated = _merge_blocks( + list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate + ) + new_blocks = extend_blocks(merged_blocks, new_blocks) + if consolidated: + new_refs.extend([None]) + else: + new_refs.extend(group_refs) + return tuple(new_blocks), new_refs def _merge_blocks( blocks: list[Block], dtype: DtypeObj, can_consolidate: bool -) -> list[Block]: +) -> tuple[list[Block], bool]: if len(blocks) == 1: - return blocks + return blocks, False if can_consolidate: @@ -2289,10 +2315,10 @@ def _merge_blocks( new_mgr_locs = new_mgr_locs[argsort] bp = BlockPlacement(new_mgr_locs) - return [new_block_2d(new_values, placement=bp)] + return [new_block_2d(new_values, placement=bp)], True # can't consolidate --> no merge - return blocks + return blocks, False def _fast_count_smallints(arr: npt.NDArray[np.intp]): diff --git a/pandas/tests/indexing/test_copy_on_write.py b/pandas/tests/indexing/test_copy_on_write.py index a88467dd9c145..7f8721623ba60 100644 --- a/pandas/tests/indexing/test_copy_on_write.py +++ b/pandas/tests/indexing/test_copy_on_write.py @@ -674,3 +674,40 @@ def test_dataframe_add_column_from_series(): # TODO add tests for constructors + + +def test_consolidate(using_copy_on_write): + + # create unconsolidated DataFrame + df = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + df["c"] = [4, 5, 6] + + # take a viewing subset + subset = df[:] + + # each block of subset references a block of df + assert subset._mgr.refs is not None and all( + ref is not None for ref in subset._mgr.refs + ) + + # consolidate the two int64 blocks + subset._consolidate_inplace() + + # the float64 block still references the parent one because it still a view + assert subset._mgr.refs[0] is not None + # equivalent of assert np.shares_memory(df["b"].values, subset["b"].values) + # but avoids caching df["b"] + assert np.shares_memory(df._get_column_array(1), subset["b"].values) + + # the new consolidated int64 block does not reference another + assert subset._mgr.refs[1] is None + + # the parent dataframe now also only is linked for the float column + assert df._mgr._has_no_reference(0) + assert not df._mgr._has_no_reference(1) + assert df._mgr._has_no_reference(2) + + # and modifying subset still doesn't modify parent + subset.iloc[0, 1] = 0.0 + assert df._mgr._has_no_reference(1) + assert df.loc[0, "b"] == 0.1 From e431bc48cf5af2decdcad6d5aa325d1a11982c84 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 20 May 2022 15:09:35 +0200 Subject: [PATCH 12/27] fix deep=None for ArrayManager --- pandas/core/internals/array_manager.py | 5 +++++ pandas/tests/indexing/test_iloc.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 7c877af0585e7..d2c90399a7e87 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -525,6 +525,11 @@ def copy(self: T, deep=True) -> T: ------- BlockManager """ + if deep is None: + # ArrayManager does not yet support CoW, so deep=None always means + # deep=True for now + deep = True + # this preserves the notion of view copying of axes if deep: # hit in e.g. tests.io.json.test_pandas diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index d244d7a1dcd93..0dcf7e98b9003 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -125,7 +125,7 @@ def test_iloc_setitem_ea_inplace(self, frame_or_series, box): if frame_or_series is Series: values = obj.values else: - values = obj._mgr.blocks[0].values + values = obj._mgr.arrays[0] if frame_or_series is Series: obj.iloc[:2] = box(arr[2:]) From a2d8fde407ba19608ff4009a9c92fbd231b46820 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 20 May 2022 15:12:43 +0200 Subject: [PATCH 13/27] update copy/view tests from other PR --- pandas/core/internals/array_manager.py | 5 + .../test_indexing.py} | 403 +++++------------- pandas/tests/copy_view/test_internals.py | 43 ++ pandas/tests/copy_view/test_methods.py | 149 +++++++ 4 files changed, 311 insertions(+), 289 deletions(-) rename pandas/tests/{indexing/test_copy_on_write.py => copy_view/test_indexing.py} (52%) create mode 100644 pandas/tests/copy_view/test_internals.py create mode 100644 pandas/tests/copy_view/test_methods.py diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index d2c90399a7e87..42d1dda8ad772 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -594,6 +594,11 @@ def _reindex_indexer( pandas-indexer with -1's only. """ + if copy is None: + # ArrayManager does not yet support CoW, so deep=None always means + # deep=True for now + copy = True + if indexer is None: if new_axis is self._axes[axis] and not copy: return self diff --git a/pandas/tests/indexing/test_copy_on_write.py b/pandas/tests/copy_view/test_indexing.py similarity index 52% rename from pandas/tests/indexing/test_copy_on_write.py rename to pandas/tests/copy_view/test_indexing.py index 7f8721623ba60..7e32c80348267 100644 --- a/pandas/tests/indexing/test_copy_on_write.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -2,196 +2,40 @@ import pytest import pandas as pd +from pandas import ( + DataFrame, + Series, +) import pandas._testing as tm import pandas.core.common as com - -def test_copy(using_copy_on_write): - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) - df_copy = df.copy() - - # the deep copy doesn't share memory - assert not np.may_share_memory(df_copy["a"].values, df["a"].values) - if using_copy_on_write: - assert df_copy._mgr.refs is None - - # mutating copy doesn't mutate original - df_copy.iloc[0, 0] = 0 - assert df.iloc[0, 0] == 1 - - # copy of df + copy of subset - # normal copy -> refs are removed, no mutation of parent - # deep=None -> refs are still generated / kept - # copy=False -> refs are kept? But if we then edit it - - # df = ... - # subset = df[1:3] - # subset_shallow_copy = subset.copy(deep=False) - # -> expected behaviour: mutating subset_shallow_copy should mutate subset - # but not mutate parent df - # - if we keep refs -> we copy on setitem -> subset is not mutated - # - if we remove refs -> we don't copy on setitem, but then also parent df - # is mutated - # -> disallow taking a shallow copy of a DataFrame that referencing other arrays? - - -def test_copy_shallow(using_copy_on_write): - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) - df_copy = df.copy(deep=False) - - # the shallow copy still shares memory - assert np.may_share_memory(df_copy["a"].values, df["a"].values) - if using_copy_on_write: - assert df_copy._mgr.refs is not None - - if using_copy_on_write: - # mutating shallow copy doesn't mutate original - df_copy.iloc[0, 0] = 0 - assert df.iloc[0, 0] == 1 - # mutating triggered a copy-on-write -> no longer shares memory - assert not np.may_share_memory(df_copy["a"].values, df["a"].values) - # but still shares memory for the other columns/blocks - assert np.may_share_memory(df_copy["c"].values, df["c"].values) - else: - # mutating shallow copy does mutate original - df_copy.iloc[0, 0] = 0 - assert df.iloc[0, 0] == 0 - # and still shares memory - assert np.may_share_memory(df_copy["a"].values, df["a"].values) - - # ----------------------------------------------------------------------------- -# DataFrame methods returning new DataFrame using shallow copy - - -def test_reset_index(using_copy_on_write): - # Case: resetting the index (i.e. adding a new column) + mutating the - # resulting dataframe - df = pd.DataFrame( - {"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}, index=[10, 11, 12] - ) - df_orig = df.copy() - df2 = df.reset_index() - df2._mgr._verify_integrity() - - if using_copy_on_write: - # still shares memory (df2 is a shallow copy) - assert np.may_share_memory(df2["b"].values, df["b"].values) - assert np.may_share_memory(df2["c"].values, df["c"].values) - # mutating df2 triggers a copy-on-write for that column / block - df2.iloc[0, 2] = 0 - assert not np.may_share_memory(df2["b"].values, df["b"].values) - if using_copy_on_write: - assert np.may_share_memory(df2["c"].values, df["c"].values) - tm.assert_frame_equal(df, df_orig) - - -def test_rename_columns(using_copy_on_write): - # Case: renaming columns returns a new dataframe - # + afterwards modifying the result - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) - df_orig = df.copy() - df2 = df.rename(columns=str.upper) - - if using_copy_on_write: - assert np.may_share_memory(df2["A"].values, df["a"].values) - df2.iloc[0, 0] = 0 - assert not np.may_share_memory(df2["A"].values, df["a"].values) - if using_copy_on_write: - assert np.may_share_memory(df2["C"].values, df["c"].values) - expected = pd.DataFrame({"A": [0, 2, 3], "B": [4, 5, 6], "C": [0.1, 0.2, 0.3]}) - tm.assert_frame_equal(df2, expected) - tm.assert_frame_equal(df, df_orig) - - -def test_rename_columns_modify_parent(using_array_manager, using_copy_on_write): - # Case: renaming columns returns a new dataframe - # + afterwards modifying the original (parent) dataframe - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) - df2 = df.rename(columns=str.upper) - df2_orig = df2.copy() - - if using_copy_on_write: - assert np.may_share_memory(df2["A"].values, df["a"].values) - else: - assert not np.may_share_memory(df2["A"].values, df["a"].values) - df.iloc[0, 0] = 0 - assert not np.may_share_memory(df2["A"].values, df["a"].values) - if using_array_manager: - assert np.may_share_memory(df2["C"].values, df["c"].values) - expected = pd.DataFrame({"a": [0, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) - tm.assert_frame_equal(df, expected) - tm.assert_frame_equal(df2, df2_orig) - - -def test_reindex_columns(using_copy_on_write): - # Case: reindexing the column returns a new dataframe - # + afterwards modifying the result - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) - df_orig = df.copy() - df2 = df.reindex(columns=["a", "c"]) - - if using_copy_on_write: - # still shares memory (df2 is a shallow copy) - assert np.may_share_memory(df2["a"].values, df["a"].values) - else: - assert not np.may_share_memory(df2["a"].values, df["a"].values) - # mutating df2 triggers a copy-on-write for that column - df2.iloc[0, 0] = 0 - assert not np.may_share_memory(df2["a"].values, df["a"].values) - if using_copy_on_write: - assert np.may_share_memory(df2["c"].values, df["c"].values) - tm.assert_frame_equal(df, df_orig) - - -def test_select_dtypes(using_copy_on_write): - # Case: selecting columns using `select_dtypes()` returns a new dataframe - # + afterwards modifying the result - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) - df_orig = df.copy() - df2 = df.select_dtypes("int64") - df2._mgr._verify_integrity() - - # currently this always returns a "view" - assert np.may_share_memory(df2["a"].values, df["a"].values) - - # mutating df2 triggers a copy-on-write for that column/block - df2.iloc[0, 0] = 0 - if using_copy_on_write: - assert not np.may_share_memory(df2["a"].values, df["a"].values) - tm.assert_frame_equal(df, df_orig) - else: - # but currently select_dtypes() actually returns a view -> mutates parent - df_orig.iloc[0, 0] = 0 - tm.assert_frame_equal(df, df_orig) - - -# # ----------------------------------------------------------------------------- -# # Indexing operations taking subset + modifying the subset/parent +# Indexing operations taking subset + modifying the subset/parent def test_subset_column_selection(using_copy_on_write): # Case: taking a subset of the columns of a DataFrame # + afterwards modifying the subset - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() subset = df[["a", "c"]] if using_copy_on_write: # the subset shares memory ... - assert np.may_share_memory(subset["a"].values, df["a"].values) + assert np.shares_memory(subset["a"].values, df["a"].values) # ... but uses CoW when being modified subset.iloc[0, 0] = 0 else: - assert not np.may_share_memory(subset["a"].values, df["a"].values) - with pd.option_context("chained_assignment", "warn"): - with tm.assert_produces_warning(com.SettingWithCopyWarning): - subset.iloc[0, 0] = 0 + assert not np.shares_memory(subset["a"].values, df["a"].values) + # INFO this no longer raise warning since pandas 1.4 + # with pd.option_context("chained_assignment", "warn"): + # with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset.iloc[0, 0] = 0 - assert not np.may_share_memory(subset["a"].values, df["a"].values) + assert not np.shares_memory(subset["a"].values, df["a"].values) - expected = pd.DataFrame({"a": [0, 2, 3], "c": [0.1, 0.2, 0.3]}) + expected = DataFrame({"a": [0, 2, 3], "c": [0.1, 0.2, 0.3]}) tm.assert_frame_equal(subset, expected) tm.assert_frame_equal(df, df_orig) @@ -199,38 +43,38 @@ def test_subset_column_selection(using_copy_on_write): def test_subset_column_selection_modify_parent(using_copy_on_write): # Case: taking a subset of the columns of a DataFrame # + afterwards modifying the parent - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) subset = df[["a", "c"]] if using_copy_on_write: # the subset shares memory ... - assert np.may_share_memory(subset["a"].values, df["a"].values) + assert np.shares_memory(subset["a"].values, df["a"].values) # ... but parent uses CoW parent when it is modified df.iloc[0, 0] = 0 - assert not np.may_share_memory(subset["a"].values, df["a"].values) + assert not np.shares_memory(subset["a"].values, df["a"].values) if using_copy_on_write: # different column/block still shares memory - assert np.may_share_memory(subset["c"].values, df["c"].values) + assert np.shares_memory(subset["c"].values, df["c"].values) - expected = pd.DataFrame({"a": [1, 2, 3], "c": [0.1, 0.2, 0.3]}) + expected = DataFrame({"a": [1, 2, 3], "c": [0.1, 0.2, 0.3]}) tm.assert_frame_equal(subset, expected) def test_subset_row_slice(using_copy_on_write): # Case: taking a subset of the rows of a DataFrame using a slice # + afterwards modifying the subset - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() subset = df[1:3] subset._mgr._verify_integrity() - assert np.may_share_memory(subset["a"].values, df["a"].values) + assert np.shares_memory(subset["a"].values, df["a"].values) if using_copy_on_write: subset.iloc[0, 0] = 0 - assert not np.may_share_memory(subset["a"].values, df["a"].values) + assert not np.shares_memory(subset["a"].values, df["a"].values) else: # INFO this no longer raise warning since pandas 1.4 @@ -240,9 +84,7 @@ def test_subset_row_slice(using_copy_on_write): subset._mgr._verify_integrity() - expected = pd.DataFrame( - {"a": [0, 3], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3) - ) + expected = DataFrame({"a": [0, 3], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3)) tm.assert_frame_equal(subset, expected) if using_copy_on_write: # original parent dataframe is not modified (CoW) @@ -253,28 +95,43 @@ def test_subset_row_slice(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -def test_subset_column_slice(using_copy_on_write): +@pytest.mark.parametrize( + "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] +) +def test_subset_column_slice(using_copy_on_write, using_array_manager, dtype): # Case: taking a subset of the columns of a DataFrame using a slice # + afterwards modifying the subset - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + single_block = (dtype == "int64") and not using_array_manager + df = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} + ) df_orig = df.copy() subset = df.iloc[:, 1:] subset._mgr._verify_integrity() if using_copy_on_write: - assert np.may_share_memory(subset["b"].values, df["b"].values) + assert np.shares_memory(subset["b"].values, df["b"].values) subset.iloc[0, 0] = 0 - assert not np.may_share_memory(subset["b"].values, df["b"].values) + assert not np.shares_memory(subset["b"].values, df["b"].values) else: - subset.iloc[0, 0] = 0 + # we only get a warning in case of a single block + warn = com.SettingWithCopyWarning if single_block else None + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(warn): + subset.iloc[0, 0] = 0 - expected = pd.DataFrame({"b": [0, 5, 6], "c": [0.1, 0.2, 0.3]}) + expected = DataFrame({"b": [0, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}) tm.assert_frame_equal(subset, expected) - # original parent dataframe is not modified (also not for BlockManager case) - tm.assert_frame_equal(df, df_orig) + # original parent dataframe is not modified (also not for BlockManager case, + # except for single block) + if not using_copy_on_write and (using_array_manager or single_block): + df_orig.iloc[0, 1] = 0 + tm.assert_frame_equal(df, df_orig) + else: + tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize( @@ -285,7 +142,7 @@ def test_subset_column_slice(using_copy_on_write): def test_subset_set_with_row_indexer(indexer_si, indexer, using_copy_on_write): # Case: setting values with a row indexer on a viewing subset # subset[indexer] = value and subset.iloc[indexer] = value - df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}) + df = DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}) df_orig = df.copy() subset = df[1:4] @@ -299,11 +156,13 @@ def test_subset_set_with_row_indexer(indexer_si, indexer, using_copy_on_write): if using_copy_on_write: indexer_si(subset)[indexer] = 0 else: + # INFO iloc no longer raises warning since pandas 1.4 + warn = com.SettingWithCopyWarning if indexer_si is tm.setitem else None with pd.option_context("chained_assignment", "warn"): - with tm.assert_produces_warning(com.SettingWithCopyWarning): + with tm.assert_produces_warning(warn): indexer_si(subset)[indexer] = 0 - expected = pd.DataFrame( + expected = DataFrame( {"a": [0, 0, 4], "b": [0, 0, 7], "c": [0.0, 0.0, 0.4]}, index=range(1, 4) ) tm.assert_frame_equal(subset, expected) @@ -318,7 +177,7 @@ def test_subset_set_with_row_indexer(indexer_si, indexer, using_copy_on_write): def test_subset_set_with_mask(using_copy_on_write): # Case: setting values with a mask on a viewing subset: subset[mask] = value - df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}) + df = DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}) df_orig = df.copy() subset = df[1:4] @@ -331,7 +190,7 @@ def test_subset_set_with_mask(using_copy_on_write): with tm.assert_produces_warning(com.SettingWithCopyWarning): subset[mask] = 0 - expected = pd.DataFrame( + expected = DataFrame( {"a": [2, 3, 0], "b": [0, 0, 0], "c": [0.20, 0.3, 0.4]}, index=range(1, 4) ) tm.assert_frame_equal(subset, expected) @@ -347,7 +206,7 @@ def test_subset_set_with_mask(using_copy_on_write): def test_subset_set_column(using_copy_on_write): # Case: setting a single column on a viewing subset -> subset[col] = value - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() subset = df[1:3] @@ -359,7 +218,7 @@ def test_subset_set_column(using_copy_on_write): subset["a"] = np.array([10, 11], dtype="int64") subset._mgr._verify_integrity() - expected = pd.DataFrame( + expected = DataFrame( {"a": [10, 11], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3) ) tm.assert_frame_equal(subset, expected) @@ -369,10 +228,10 @@ def test_subset_set_column(using_copy_on_write): @pytest.mark.parametrize( "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] ) -def test_subset_set_column_with_loc(using_copy_on_write, dtype): +def test_subset_set_column_with_loc(using_copy_on_write, using_array_manager, dtype): # Case: setting a single column with loc on a viewing subset # -> subset.loc[:, col] = value - df = pd.DataFrame( + df = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} ) df_orig = df.copy() @@ -386,20 +245,26 @@ def test_subset_set_column_with_loc(using_copy_on_write, dtype): subset.loc[:, "a"] = np.array([10, 11], dtype="int64") subset._mgr._verify_integrity() - expected = pd.DataFrame( + expected = DataFrame( {"a": [10, 11], "b": [5, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3), ) tm.assert_frame_equal(subset, expected) - tm.assert_frame_equal(df, df_orig) + if using_copy_on_write or using_array_manager: + # original parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) + else: + # original parent dataframe is actually updated + df_orig.loc[1:3, "a"] = np.array([10, 11], dtype="int64") + tm.assert_frame_equal(df, df_orig) -def test_subset_set_column_with_loc2(using_copy_on_write): +def test_subset_set_column_with_loc2(using_copy_on_write, using_array_manager): # Case: setting a single column with loc on a viewing subset # -> subset.loc[:, col] = value # separate test for case of DataFrame of a single column -> takes a separate # code path - df = pd.DataFrame({"a": [1, 2, 3]}) + df = DataFrame({"a": [1, 2, 3]}) df_orig = df.copy() subset = df[1:3] @@ -411,9 +276,15 @@ def test_subset_set_column_with_loc2(using_copy_on_write): subset.loc[:, "a"] = 0 subset._mgr._verify_integrity() - expected = pd.DataFrame({"a": [0, 0]}, index=range(1, 3)) + expected = DataFrame({"a": [0, 0]}, index=range(1, 3)) tm.assert_frame_equal(subset, expected) - tm.assert_frame_equal(df, df_orig) + if using_copy_on_write or using_array_manager: + # original parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) + else: + # original parent dataframe is actually updated + df_orig.loc[1:3, "a"] = 0 + tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize( @@ -422,7 +293,7 @@ def test_subset_set_column_with_loc2(using_copy_on_write): def test_subset_set_columns(using_copy_on_write, dtype): # Case: setting multiple columns on a viewing subset # -> subset[[col1, col2]] = value - df = pd.DataFrame( + df = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} ) df_orig = df.copy() @@ -439,7 +310,7 @@ def test_subset_set_columns(using_copy_on_write, dtype): if using_copy_on_write: # first and third column should certainly have no references anymore assert all(subset._mgr._has_no_reference(i) for i in [0, 2]) - expected = pd.DataFrame({"a": [0, 0], "b": [5, 6], "c": [0, 0]}, index=range(1, 3)) + expected = DataFrame({"a": [0, 0], "b": [5, 6], "c": [0, 0]}, index=range(1, 3)) tm.assert_frame_equal(subset, expected) tm.assert_frame_equal(df, df_orig) @@ -449,10 +320,12 @@ def test_subset_set_columns(using_copy_on_write, dtype): [slice("a", "b"), np.array([True, True, False]), ["a", "b"]], ids=["slice", "mask", "array"], ) -def test_subset_set_with_column_indexer(indexer, using_copy_on_write): +def test_subset_set_with_column_indexer( + indexer, using_copy_on_write, using_array_manager +): # Case: setting multiple columns with a column indexer on a viewing subset # -> subset.loc[:, [col1, col2]] = value - df = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]}) + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]}) df_orig = df.copy() subset = df[1:3] @@ -464,13 +337,11 @@ def test_subset_set_with_column_indexer(indexer, using_copy_on_write): subset.loc[:, indexer] = 0 subset._mgr._verify_integrity() - expected = pd.DataFrame( - {"a": [0, 0], "b": [0.0, 0.0], "c": [5, 6]}, index=range(1, 3) - ) + expected = DataFrame({"a": [0, 0], "b": [0.0, 0.0], "c": [5, 6]}, index=range(1, 3)) # TODO full row slice .loc[:, idx] update inplace instead of overwrite? expected["b"] = expected["b"].astype("int64") tm.assert_frame_equal(subset, expected) - if using_copy_on_write: + if using_copy_on_write or using_array_manager: tm.assert_frame_equal(df, df_orig) else: # In the mixed case with BlockManager, only one of the two columns is @@ -488,18 +359,18 @@ def test_subset_set_with_column_indexer(indexer, using_copy_on_write): def test_series_getitem_slice(using_copy_on_write): # Case: taking a slice of a Series + afterwards modifying the subset - s = pd.Series([1, 2, 3], index=["a", "b", "c"]) + s = Series([1, 2, 3], index=["a", "b", "c"]) s_orig = s.copy() subset = s[:] - assert np.may_share_memory(subset.values, s.values) + assert np.shares_memory(subset.values, s.values) subset.iloc[0] = 0 if using_copy_on_write: - assert not np.may_share_memory(subset.values, s.values) + assert not np.shares_memory(subset.values, s.values) - expected = pd.Series([0, 2, 3], index=["a", "b", "c"]) + expected = Series([0, 2, 3], index=["a", "b", "c"]) tm.assert_series_equal(subset, expected) if using_copy_on_write: @@ -517,19 +388,12 @@ def test_series_getitem_slice(using_copy_on_write): ) def test_series_subset_set_with_indexer(indexer_si, indexer, using_copy_on_write): # Case: setting values in a viewing Series with an indexer - s = pd.Series([1, 2, 3], index=["a", "b", "c"]) + s = Series([1, 2, 3], index=["a", "b", "c"]) s_orig = s.copy() subset = s[:] - # if ( - # indexer_si is tm.setitem - # and isinstance(indexer, np.ndarray) - # and indexer.dtype == "int" - # ): - # pytest.skip("setitem with labels selects on columns") - - expected = pd.Series([0, 0, 3], index=["a", "b", "c"]) indexer_si(subset)[indexer] = 0 + expected = Series([0, 0, 3], index=["a", "b", "c"]) tm.assert_series_equal(subset, expected) if using_copy_on_write: @@ -537,10 +401,6 @@ def test_series_subset_set_with_indexer(indexer_si, indexer, using_copy_on_write else: tm.assert_series_equal(s, expected) - expected = pd.DataFrame( - {"a": [0, 0, 4], "b": [0, 0, 7], "c": [0.0, 0.0, 0.4]}, index=range(1, 4) - ) - # ----------------------------------------------------------------------------- # del operator @@ -549,15 +409,15 @@ def test_series_subset_set_with_indexer(indexer_si, indexer, using_copy_on_write def test_del_frame(using_copy_on_write): # Case: deleting a column with `del` on a viewing child dataframe should # not modify parent + update the references - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() df2 = df[:] - assert np.may_share_memory(df["a"].values, df2["a"].values) + assert np.shares_memory(df["a"].values, df2["a"].values) del df2["b"] - assert np.may_share_memory(df["a"].values, df2["a"].values) + assert np.shares_memory(df["a"].values, df2["a"].values) tm.assert_frame_equal(df, df_orig) tm.assert_frame_equal(df2, df_orig[["a", "c"]]) df2._mgr._verify_integrity() @@ -565,22 +425,24 @@ def test_del_frame(using_copy_on_write): # TODO in theory modifying column "b" of the parent wouldn't need a CoW # but the weakref is still alive and so we still perform CoW + df2.loc[0, "a"] = 100 if using_copy_on_write: # modifying child after deleting a column still doesn't update parent - df2.loc[0, "a"] = 100 tm.assert_frame_equal(df, df_orig) + else: + assert df.loc[0, "a"] == 100 def test_del_series(): - s = pd.Series([1, 2, 3], index=["a", "b", "c"]) + s = Series([1, 2, 3], index=["a", "b", "c"]) s_orig = s.copy() s2 = s[:] - assert np.may_share_memory(s.values, s2.values) + assert np.shares_memory(s.values, s2.values) del s2["a"] - assert not np.may_share_memory(s.values, s2.values) + assert not np.shares_memory(s.values, s2.values) tm.assert_series_equal(s, s_orig) tm.assert_series_equal(s2, s_orig[["b", "c"]]) @@ -594,26 +456,26 @@ def test_del_series(): # Accessing column as Series -def test_column_as_series(using_copy_on_write): +def test_column_as_series(using_copy_on_write, using_array_manager): # Case: selecting a single column now also uses Copy-on-Write - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() s = df["a"] - assert np.may_share_memory(s.values, df["a"].values) + assert np.shares_memory(s.values, df["a"].values) - if using_copy_on_write: + if using_copy_on_write or using_array_manager: s[0] = 0 else: with pd.option_context("chained_assignment", "warn"): with tm.assert_produces_warning(com.SettingWithCopyWarning): s[0] = 0 - expected = pd.Series([0, 2, 3], name="a") + expected = Series([0, 2, 3], name="a") tm.assert_series_equal(s, expected) if using_copy_on_write: - # assert not np.may_share_memory(s.values, df["a"].values) + # assert not np.shares_memory(s.values, df["a"].values) tm.assert_frame_equal(df, df_orig) # ensure cached series on getitem is not the changed series tm.assert_series_equal(df["a"], df_orig["a"]) @@ -622,22 +484,22 @@ def test_column_as_series(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -def test_column_as_series_set_with_upcast(using_copy_on_write): +def test_column_as_series_set_with_upcast(using_copy_on_write, using_array_manager): # Case: selecting a single column now also uses Copy-on-Write -> when # setting a value causes an upcast, we don't need to update the parent # DataFrame through the cache mechanism - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() s = df["a"] - if using_copy_on_write: + if using_copy_on_write or using_array_manager: s[0] = "foo" else: with pd.option_context("chained_assignment", "warn"): with tm.assert_produces_warning(com.SettingWithCopyWarning): s[0] = "foo" - expected = pd.Series(["foo", 2, 3], dtype=object, name="a") + expected = Series(["foo", 2, 3], dtype=object, name="a") tm.assert_series_equal(s, expected) if using_copy_on_write: tm.assert_frame_equal(df, df_orig) @@ -656,58 +518,21 @@ def test_dataframe_add_column_from_series(): # -> always already takes a copy on assignment # (no change in behaviour here) # TODO can we achieve the same behaviour with Copy-on-Write? - df = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) - s = pd.Series([10, 11, 12]) + s = Series([10, 11, 12]) df["new"] = s - assert not np.may_share_memory(df["new"].values, s.values) + assert not np.shares_memory(df["new"].values, s.values) # editing series -> doesn't modify column in frame s[0] = 0 - expected = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "new": [10, 11, 12]}) + expected = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "new": [10, 11, 12]}) tm.assert_frame_equal(df, expected) # editing column in frame -> doesn't modify series df.loc[2, "new"] = 100 - expected = pd.Series([0, 11, 12]) - tm.assert_series_equal(s, expected) + expected_s = Series([0, 11, 12]) + tm.assert_series_equal(s, expected_s) # TODO add tests for constructors - - -def test_consolidate(using_copy_on_write): - - # create unconsolidated DataFrame - df = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) - df["c"] = [4, 5, 6] - - # take a viewing subset - subset = df[:] - - # each block of subset references a block of df - assert subset._mgr.refs is not None and all( - ref is not None for ref in subset._mgr.refs - ) - - # consolidate the two int64 blocks - subset._consolidate_inplace() - - # the float64 block still references the parent one because it still a view - assert subset._mgr.refs[0] is not None - # equivalent of assert np.shares_memory(df["b"].values, subset["b"].values) - # but avoids caching df["b"] - assert np.shares_memory(df._get_column_array(1), subset["b"].values) - - # the new consolidated int64 block does not reference another - assert subset._mgr.refs[1] is None - - # the parent dataframe now also only is linked for the float column - assert df._mgr._has_no_reference(0) - assert not df._mgr._has_no_reference(1) - assert df._mgr._has_no_reference(2) - - # and modifying subset still doesn't modify parent - subset.iloc[0, 1] = 0.0 - assert df._mgr._has_no_reference(1) - assert df.loc[0, "b"] == 0.1 diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py new file mode 100644 index 0000000000000..066b6eab5ef57 --- /dev/null +++ b/pandas/tests/copy_view/test_internals.py @@ -0,0 +1,43 @@ +import numpy as np + +import pandas.util._test_decorators as td + +from pandas import DataFrame + + +@td.skip_array_manager_invalid_test +def test_consolidate(): + + # create unconsolidated DataFrame + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + df["c"] = [4, 5, 6] + + # take a viewing subset + subset = df[:] + + # each block of subset references a block of df + assert subset._mgr.refs is not None and all( + ref is not None for ref in subset._mgr.refs + ) + + # consolidate the two int64 blocks + subset._consolidate_inplace() + + # the float64 block still references the parent one because it still a view + assert subset._mgr.refs[0] is not None + # equivalent of assert np.shares_memory(df["b"].values, subset["b"].values) + # but avoids caching df["b"] + assert np.shares_memory(df._get_column_array(1), subset["b"].values) + + # the new consolidated int64 block does not reference another + assert subset._mgr.refs[1] is None + + # the parent dataframe now also only is linked for the float column + assert df._mgr._has_no_reference(0) + assert not df._mgr._has_no_reference(1) + assert df._mgr._has_no_reference(2) + + # and modifying subset still doesn't modify parent + subset.iloc[0, 1] = 0.0 + assert df._mgr._has_no_reference(1) + assert df.loc[0, "b"] == 0.1 diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py new file mode 100644 index 0000000000000..1c65a23401050 --- /dev/null +++ b/pandas/tests/copy_view/test_methods.py @@ -0,0 +1,149 @@ +import numpy as np + +from pandas import DataFrame +import pandas._testing as tm + + +def test_copy(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_copy = df.copy() + + # the deep copy doesn't share memory + assert not np.shares_memory(df_copy["a"].values, df["a"].values) + if using_copy_on_write: + assert df_copy._mgr.refs is None # type: ignore[union-attr] + + # mutating copy doesn't mutate original + df_copy.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 1 + + +def test_copy_shallow(using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_copy = df.copy(deep=False) + + # the shallow copy still shares memory + assert np.shares_memory(df_copy["a"].values, df["a"].values) + if using_copy_on_write: + assert df_copy._mgr.refs is not None # type: ignore[union-attr] + + if using_copy_on_write: + # mutating shallow copy doesn't mutate original + df_copy.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 1 + # mutating triggered a copy-on-write -> no longer shares memory + assert not np.shares_memory(df_copy["a"].values, df["a"].values) + # but still shares memory for the other columns/blocks + assert np.shares_memory(df_copy["c"].values, df["c"].values) + else: + # mutating shallow copy does mutate original + df_copy.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 0 + # and still shares memory + assert np.shares_memory(df_copy["a"].values, df["a"].values) + + +# ----------------------------------------------------------------------------- +# DataFrame methods returning new DataFrame using shallow copy + + +def test_reset_index(using_copy_on_write): + # Case: resetting the index (i.e. adding a new column) + mutating the + # resulting dataframe + df = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}, index=[10, 11, 12] + ) + df_orig = df.copy() + df2 = df.reset_index() + df2._mgr._verify_integrity() + + if using_copy_on_write: + # still shares memory (df2 is a shallow copy) + assert np.shares_memory(df2["b"].values, df["b"].values) + assert np.shares_memory(df2["c"].values, df["c"].values) + # mutating df2 triggers a copy-on-write for that column / block + df2.iloc[0, 2] = 0 + assert not np.shares_memory(df2["b"].values, df["b"].values) + if using_copy_on_write: + assert np.shares_memory(df2["c"].values, df["c"].values) + tm.assert_frame_equal(df, df_orig) + + +def test_rename_columns(using_copy_on_write): + # Case: renaming columns returns a new dataframe + # + afterwards modifying the result + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.rename(columns=str.upper) + + if using_copy_on_write: + assert np.shares_memory(df2["A"].values, df["a"].values) + df2.iloc[0, 0] = 0 + assert not np.shares_memory(df2["A"].values, df["a"].values) + if using_copy_on_write: + assert np.shares_memory(df2["C"].values, df["c"].values) + expected = DataFrame({"A": [0, 2, 3], "B": [4, 5, 6], "C": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(df2, expected) + tm.assert_frame_equal(df, df_orig) + + +def test_rename_columns_modify_parent(using_copy_on_write): + # Case: renaming columns returns a new dataframe + # + afterwards modifying the original (parent) dataframe + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df2 = df.rename(columns=str.upper) + df2_orig = df2.copy() + + if using_copy_on_write: + assert np.shares_memory(df2["A"].values, df["a"].values) + else: + assert not np.shares_memory(df2["A"].values, df["a"].values) + df.iloc[0, 0] = 0 + assert not np.shares_memory(df2["A"].values, df["a"].values) + if using_copy_on_write: + assert np.shares_memory(df2["C"].values, df["c"].values) + expected = DataFrame({"a": [0, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df2, df2_orig) + + +def test_reindex_columns(using_copy_on_write): + # Case: reindexing the column returns a new dataframe + # + afterwards modifying the result + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.reindex(columns=["a", "c"]) + + if using_copy_on_write: + # still shares memory (df2 is a shallow copy) + assert np.shares_memory(df2["a"].values, df["a"].values) + else: + assert not np.shares_memory(df2["a"].values, df["a"].values) + # mutating df2 triggers a copy-on-write for that column + df2.iloc[0, 0] = 0 + assert not np.shares_memory(df2["a"].values, df["a"].values) + if using_copy_on_write: + assert np.shares_memory(df2["c"].values, df["c"].values) + tm.assert_frame_equal(df, df_orig) + + +def test_select_dtypes(using_copy_on_write): + # Case: selecting columns using `select_dtypes()` returns a new dataframe + # + afterwards modifying the result + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.select_dtypes("int64") + df2._mgr._verify_integrity() + + # currently this always returns a "view" + assert np.may_share_memory(df2["a"].values, df["a"].values) + + # mutating df2 triggers a copy-on-write for that column/block + df2.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.may_share_memory(df2["a"].values, df["a"].values) + tm.assert_frame_equal(df, df_orig) + else: + # but currently select_dtypes() actually returns a view -> mutates parent + df_orig.iloc[0, 0] = 0 + tm.assert_frame_equal(df, df_orig) From b2a142830368b5c14056528ffabbe971d09bf479 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 23 May 2022 14:47:04 +0200 Subject: [PATCH 14/27] clean-up fast_xs workarounds now it returns a SingleBlockManager --- pandas/core/frame.py | 5 ----- pandas/core/generic.py | 5 ----- pandas/core/internals/managers.py | 4 +++- 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c3e60301eaca9..ef46c30d269a8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -30,7 +30,6 @@ overload, ) import warnings -import weakref import numpy as np import numpy.ma as ma @@ -3493,10 +3492,6 @@ def _ixs(self, i: int, axis: int = 0): self ) result._set_is_copy(self, copy=copy) - # TODO(CoW) cleaner solution (eg let fast_xs return a SingleBM?) - if not copy and isinstance(self._mgr, BlockManager): - # assert len(self._mgr.blocks) == 1 - result._mgr.refs = [weakref.ref(self._mgr.blocks[0])] return result # icol diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f81ead6886e2d..181edb749db09 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3820,11 +3820,6 @@ class animal locomotion result = self._constructor_sliced( new_mgr, name=self.index[loc] ).__finalize__(self) - # TODO(CoW) cleaner solution (eg let fast_xs return a SingleBM?) - if isinstance(self._mgr, BlockManager) and len(self._mgr.blocks) == 1: - # in the case of a single block, new_values is a view - result._mgr.refs = [weakref.ref(self._mgr.blocks[0])] - elif is_scalar(loc): result = self.iloc[:, slice(loc, loc + 1)] elif axis == 1: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index d69dd634fffb7..b3c04311aeff9 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1065,7 +1065,9 @@ def fast_xs(self, loc: int) -> SingleBlockManager: if len(self.blocks) == 1: result = self.blocks[0].iget((slice(None), loc)) block = new_block(result, placement=slice(0, len(result)), ndim=1) - return SingleBlockManager(block, self.axes[0]) + # in the case of a single block, the new block is a view + ref = weakref.ref(self.blocks[0]) + return SingleBlockManager(block, self.axes[0], [ref]) dtype = interleaved_dtype([blk.dtype for blk in self.blocks]) From 4b1ccf6493b9f82e43ce3feed82bdcbcdb28c50a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 23 May 2022 14:58:19 +0200 Subject: [PATCH 15/27] tracks refs in to_frame --- pandas/core/internals/managers.py | 3 ++- pandas/tests/copy_view/test_methods.py | 31 +++++++++++++++++++++++--- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b3c04311aeff9..04848b3c47431 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1889,7 +1889,8 @@ def to_2d_mgr(self, columns: Index) -> BlockManager: bp = BlockPlacement(0) new_blk = type(blk)(arr, placement=bp, ndim=2) axes = [columns, self.axes[0]] - return BlockManager([new_blk], axes=axes, verify_integrity=False) + refs = [weakref.ref(blk)] + return BlockManager([new_blk], axes=axes, refs=refs, verify_integrity=False) def _has_no_reference(self, i: int = 0) -> bool: """ diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 1c65a23401050..6b4d8accb9b96 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1,6 +1,9 @@ import numpy as np -from pandas import DataFrame +from pandas import ( + DataFrame, + Series, +) import pandas._testing as tm @@ -136,14 +139,36 @@ def test_select_dtypes(using_copy_on_write): df2._mgr._verify_integrity() # currently this always returns a "view" - assert np.may_share_memory(df2["a"].values, df["a"].values) + assert np.shares_memory(df2["a"].values, df["a"].values) # mutating df2 triggers a copy-on-write for that column/block df2.iloc[0, 0] = 0 if using_copy_on_write: - assert not np.may_share_memory(df2["a"].values, df["a"].values) + assert not np.shares_memory(df2["a"].values, df["a"].values) tm.assert_frame_equal(df, df_orig) else: # but currently select_dtypes() actually returns a view -> mutates parent df_orig.iloc[0, 0] = 0 tm.assert_frame_equal(df, df_orig) + + +def test_to_frame(using_copy_on_write): + # Case: converting a Series to a DataFrame with to_frame + ser = Series([1, 2, 3]) + ser_orig = ser.copy() + + df = ser.to_frame() + + # currently this always returns a "view" + assert np.shares_memory(ser.values, df._get_column_array(0)) + + df.iloc[0, 0] = 0 + + if using_copy_on_write: + # mutating df triggers a copy-on-write for that column + assert not np.shares_memory(ser.values, df[0].values) + tm.assert_series_equal(ser, ser_orig) + else: + # but currently select_dtypes() actually returns a view -> mutates parent + ser_orig.iloc[0] = 0 + tm.assert_series_equal(ser, ser_orig) From 9339f15f9999764b5e6e043a54272ad43e6f44a9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 16 Jul 2022 11:32:42 +0200 Subject: [PATCH 16/27] fixup after updata main and column_setitem + iloc inplace setitem changes (gh-45333) --- pandas/core/frame.py | 13 ------------- pandas/tests/copy_view/test_indexing.py | 8 ++++---- pandas/tests/copy_view/test_setitem.py | 10 ++++++---- pandas/tests/frame/indexing/test_setitem.py | 2 +- pandas/tests/indexing/test_iloc.py | 9 --------- pandas/tests/indexing/test_loc.py | 16 ++++------------ 6 files changed, 15 insertions(+), 43 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9faf8b3017482..1236a7222b502 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3941,19 +3941,6 @@ def _set_value( Sets whether or not index/col interpreted as indexers """ try: - if ( - get_option("mode.copy_on_write") - and get_option("mode.data_manager") == "block" - ): - if not takeable: - icol = self.columns.get_loc(col) - index = self.index.get_loc(index) - self._mgr.column_setitem(icol, index, value) - else: - self._mgr.column_setitem(col, index, value) - self._clear_item_cache() - return - if takeable: icol = col iindex = cast(int, index) diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 6eb7237c4f41c..d917a3c79aa97 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -150,7 +150,7 @@ def test_subset_column_slice(using_copy_on_write, using_array_manager, dtype): ids=["slice", "mask", "array"], ) def test_subset_loc_rows_columns( - dtype, row_indexer, column_indexer, using_array_manager + dtype, row_indexer, column_indexer, using_array_manager, using_copy_on_write ): # Case: taking a subset of the rows+columns of a DataFrame using .loc # + afterwards modifying the subset @@ -177,7 +177,7 @@ def test_subset_loc_rows_columns( if ( isinstance(row_indexer, slice) and isinstance(column_indexer, slice) - and (using_array_manager or dtype == "int64") + and (using_array_manager or (dtype == "int64" and not using_copy_on_write)) ): df_orig.iloc[1, 1] = 0 tm.assert_frame_equal(df, df_orig) @@ -197,7 +197,7 @@ def test_subset_loc_rows_columns( ids=["slice", "mask", "array"], ) def test_subset_iloc_rows_columns( - dtype, row_indexer, column_indexer, using_array_manager + dtype, row_indexer, column_indexer, using_array_manager, using_copy_on_write ): # Case: taking a subset of the rows+columns of a DataFrame using .iloc # + afterwards modifying the subset @@ -224,7 +224,7 @@ def test_subset_iloc_rows_columns( if ( isinstance(row_indexer, slice) and isinstance(column_indexer, slice) - and (using_array_manager or dtype == "int64") + and (using_array_manager or (dtype == "int64" and not using_copy_on_write)) ): df_orig.iloc[1, 1] = 0 tm.assert_frame_equal(df, df_orig) diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py index dd42983179806..9e0d350dde0de 100644 --- a/pandas/tests/copy_view/test_setitem.py +++ b/pandas/tests/copy_view/test_setitem.py @@ -34,8 +34,9 @@ def test_set_column_with_series(using_copy_on_write): df["c"] = ser if using_copy_on_write: - # with CoW we can delay the copy - assert np.shares_memory(df["c"].values, ser.values) + # TODO(CoW) with CoW we can delay the copy + # assert np.shares_memory(df["c"].values, ser.values) + assert not np.shares_memory(df["c"].values, ser.values) else: # the series data is copied assert not np.shares_memory(df["c"].values, ser.values) @@ -78,8 +79,9 @@ def test_set_columns_with_dataframe(using_copy_on_write): df[["c", "d"]] = df2 if using_copy_on_write: - # with CoW we can delay the copy - assert np.shares_memory(df["c"].values, df2["c"].values) + # TODO(CoW) with CoW we can delay the copy + # assert np.shares_memory(df["c"].values, df2["c"].values) + assert not np.shares_memory(df["c"].values, df2["c"].values) else: # the data is copied assert not np.shares_memory(df["c"].values, df2["c"].values) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index ea320906ec37d..627e85a2b2f0c 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -776,7 +776,7 @@ def test_setitem_string_column_numpy_dtype_raising(self): expected = DataFrame([[1, 2, 5], [3, 4, 6]], columns=[0, 1, "0 - Name"]) tm.assert_frame_equal(df, expected) - def test_setitem_empty_df_duplicate_columns(self): + def test_setitem_empty_df_duplicate_columns(self, using_copy_on_write): # GH#38521 df = DataFrame(columns=["a", "b", "b"], dtype="float64") msg = "will attempt to set the values inplace instead" diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 0b818a83df8e4..2301a326e464d 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -107,21 +107,12 @@ def test_iloc_setitem_fullcol_categorical( df.iloc[0, 0] = "gamma" assert cat[0] != "gamma" - # TODO with mixed dataframe ("split" path), we always overwrite the column - if using_copy_on_write and indexer == tm.loc: - # TODO(ArrayManager) with loc, slice(3) gets converted into slice(0, 3) - # which is then considered as "full" slice and does overwrite. For iloc - # this conversion is not done, and so it doesn't overwrite - overwrite = overwrite or (isinstance(key, slice) and key == slice(3)) - frame = DataFrame({0: np.array([0, 1, 2], dtype=object), 1: range(3)}) df = frame.copy() orig_vals = df.values with tm.assert_produces_warning(FutureWarning, match=msg): indexer(df)[key, 0] = cat expected = DataFrame({0: cat, 1: range(3)}) - if using_copy_on_write and not overwrite: - expected[0] = expected[0].astype(object) tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("box", [array, Series]) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 20aad25a3381d..9f2ede7ecc805 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -716,34 +716,26 @@ def test_loc_setitem_frame_with_reindex(self): expected = DataFrame({"A": ser}) tm.assert_frame_equal(df, expected) - def test_loc_setitem_frame_with_reindex_mixed(self, using_copy_on_write): + def test_loc_setitem_frame_with_reindex_mixed(self): # GH#40480 df = DataFrame(index=[3, 5, 4], columns=["A", "B"], dtype=float) df["B"] = "string" msg = "will attempt to set the values inplace instead" with tm.assert_produces_warning(FutureWarning, match=msg): df.loc[[4, 3, 5], "A"] = np.array([1, 2, 3], dtype="int64") - ser = Series([2, 3, 1], index=[3, 5, 4], dtype=float) - if not using_copy_on_write: - # For default BlockManager case, this takes the "split" path, - # which still overwrites the column - ser = Series([2, 3, 1], index=[3, 5, 4], dtype="int64") + ser = Series([2, 3, 1], index=[3, 5, 4], dtype="int64") expected = DataFrame({"A": ser}) expected["B"] = "string" tm.assert_frame_equal(df, expected) - def test_loc_setitem_frame_with_inverted_slice(self, using_copy_on_write): + def test_loc_setitem_frame_with_inverted_slice(self): # GH#40480 df = DataFrame(index=[1, 2, 3], columns=["A", "B"], dtype=float) df["B"] = "string" msg = "will attempt to set the values inplace instead" with tm.assert_produces_warning(FutureWarning, match=msg): df.loc[slice(3, 0, -1), "A"] = np.array([1, 2, 3], dtype="int64") - expected = DataFrame({"A": [3.0, 2.0, 1.0], "B": "string"}, index=[1, 2, 3]) - if not using_copy_on_write: - # For default BlockManager case, this takes the "split" path, - # which still overwrites the column - expected["A"] = expected["A"].astype("int64") + expected = DataFrame({"A": [3, 2, 1], "B": "string"}, index=[1, 2, 3]) tm.assert_frame_equal(df, expected) def test_loc_setitem_empty_frame(self): From 085d15b65d8215bccdb3b2f76b257bf871bd6402 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 16 Jul 2022 21:50:24 +0200 Subject: [PATCH 17/27] fix inplace fillna + fixup new tests --- pandas/core/internals/managers.py | 6 ++++++ pandas/tests/frame/indexing/test_getitem.py | 10 ++++++++-- pandas/tests/frame/methods/test_fillna.py | 15 +++++++++++---- pandas/tests/frame/methods/test_rename.py | 5 +++-- pandas/tests/frame/methods/test_update.py | 8 ++++++-- pandas/tests/frame/test_block_internals.py | 7 ++++--- pandas/tests/reshape/test_from_dummies.py | 4 ++-- 7 files changed, 40 insertions(+), 15 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 260302e9d4f9c..8e99f7bea7924 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -429,6 +429,12 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T: if limit is not None: # Do this validation even if we go through one of the no-op paths limit = libalgos.validate_limit(None, limit=limit) + if inplace: + # TODO(CoW) can be optimized to only copy those blocks that have refs + if any( + not self._has_no_reference_block(i) for i in range(len(self.blocks)) + ): + self = self.copy() return self.apply( "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py index 7994c56f8d68b..a98fa52e1009d 100644 --- a/pandas/tests/frame/indexing/test_getitem.py +++ b/pandas/tests/frame/indexing/test_getitem.py @@ -357,12 +357,18 @@ def test_getitem_empty_frame_with_boolean(self): df2 = df[df > 0] tm.assert_frame_equal(df, df2) - def test_getitem_returns_view_when_column_is_unique_in_df(self): + def test_getitem_returns_view_when_column_is_unique_in_df( + self, using_copy_on_write + ): # GH#45316 df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) + df_orig = df.copy() view = df["b"] view.loc[:] = 100 - expected = DataFrame([[1, 2, 100], [4, 5, 100]], columns=["a", "a", "b"]) + if using_copy_on_write: + expected = df_orig + else: + expected = DataFrame([[1, 2, 100], [4, 5, 100]], columns=["a", "a", "b"]) tm.assert_frame_equal(df, expected) def test_getitem_frozenset_unique_in_column(self): diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index d86c1b2aedcac..20e59ed72666a 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -20,13 +20,16 @@ class TestFillNA: @td.skip_array_manager_not_yet_implemented - def test_fillna_on_column_view(self): + def test_fillna_on_column_view(self, using_copy_on_write): # GH#46149 avoid unnecessary copies arr = np.full((40, 50), np.nan) df = DataFrame(arr) df[0].fillna(-1, inplace=True) - assert (arr[:, 0] == -1).all() + if using_copy_on_write: + assert np.isnan(arr[:, 0]).all() + else: + assert (arr[:, 0] == -1).all() # i.e. we didn't create a new 49-column block assert len(df._mgr.arrays) == 1 @@ -676,14 +679,18 @@ def test_fillna_inplace_with_columns_limit_and_value(self): @td.skip_array_manager_invalid_test @pytest.mark.parametrize("val", [-1, {"x": -1, "y": -1}]) - def test_inplace_dict_update_view(self, val): + def test_inplace_dict_update_view(self, val, using_copy_on_write): # GH#47188 df = DataFrame({"x": [np.nan, 2], "y": [np.nan, 2]}) + df_orig = df.copy() result_view = df[:] df.fillna(val, inplace=True) expected = DataFrame({"x": [-1, 2.0], "y": [-1.0, 2]}) tm.assert_frame_equal(df, expected) - tm.assert_frame_equal(result_view, expected) + if using_copy_on_write: + tm.assert_frame_equal(result_view, df_orig) + else: + tm.assert_frame_equal(result_view, expected) def test_single_block_df_with_horizontal_axis(self): # GH 47713 diff --git a/pandas/tests/frame/methods/test_rename.py b/pandas/tests/frame/methods/test_rename.py index 574aef0200b12..f4443953a0d52 100644 --- a/pandas/tests/frame/methods/test_rename.py +++ b/pandas/tests/frame/methods/test_rename.py @@ -176,7 +176,9 @@ def test_rename_nocopy(self, float_frame, using_copy_on_write): assert np.shares_memory(renamed["foo"]._values, float_frame["C"]._values) - with tm.assert_produces_warning(None): + # TODO(CoW) this also shouldn't warn in case of CoW, but the heuristic + # checking if the array shares memory doesn't work if CoW happened + with tm.assert_produces_warning(FutureWarning if using_copy_on_write else None): # This loc setitem already happens inplace, so no warning # that this will change in the future renamed.loc[:, "foo"] = 1.0 @@ -184,7 +186,6 @@ def test_rename_nocopy(self, float_frame, using_copy_on_write): assert not (float_frame["C"] == 1.0).all() else: assert (float_frame["C"] == 1.0).all() - assert (float_frame["C"] == 1.0).all() def test_rename_inplace(self, float_frame): float_frame.rename(columns={"C": "foo"}) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index fd939e91d82d2..a35530100a425 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -153,12 +153,16 @@ def test_update_with_different_dtype(self, using_copy_on_write): tm.assert_frame_equal(df, expected) @td.skip_array_manager_invalid_test - def test_update_modify_view(self): + def test_update_modify_view(self, using_copy_on_write): # GH#47188 df = DataFrame({"A": ["1", np.nan], "B": ["100", np.nan]}) df2 = DataFrame({"A": ["a", "x"], "B": ["100", "200"]}) + df2_orig = df2.copy() result_view = df2[:] df2.update(df) expected = DataFrame({"A": ["1", "x"], "B": ["100", "200"]}) tm.assert_frame_equal(df2, expected) - tm.assert_frame_equal(result_view, expected) + if using_copy_on_write: + tm.assert_frame_equal(result_view, df2_orig) + else: + tm.assert_frame_equal(result_view, expected) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index fc46047714f13..46c712cf4d458 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -402,7 +402,7 @@ def test_add_column_with_pandas_array(self): tm.assert_frame_equal(df, df2) -def test_update_inplace_sets_valid_block_values(): +def test_update_inplace_sets_valid_block_values(using_copy_on_write): # https://github.com/pandas-dev/pandas/issues/33457 df = DataFrame({"a": Series([1, 2, None], dtype="category")}) @@ -412,8 +412,9 @@ def test_update_inplace_sets_valid_block_values(): # check we haven't put a Series into any block.values assert isinstance(df._mgr.blocks[0].values, Categorical) - # smoketest for OP bug from GH#35731 - assert df.isnull().sum().sum() == 0 + if not using_copy_on_write: + # smoketest for OP bug from GH#35731 + assert df.isnull().sum().sum() == 0 def test_nonconsolidated_item_cache_take(): diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index c52331e54f95e..ab80473725288 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -164,7 +164,7 @@ def test_error_with_prefix_default_category_dict_not_complete( def test_error_with_prefix_contains_nan(dummies_basic): - dummies_basic["col2_c"][2] = np.nan + dummies_basic.loc[2, "col2_c"] = np.nan with pytest.raises( ValueError, match=r"Dummy DataFrame contains NA value in column: 'col2_c'" ): @@ -172,7 +172,7 @@ def test_error_with_prefix_contains_nan(dummies_basic): def test_error_with_prefix_contains_non_dummies(dummies_basic): - dummies_basic["col2_c"][2] = "str" + dummies_basic.loc[2, "col2_c"] = "str" with pytest.raises(TypeError, match=r"Passed DataFrame contains non-dummy data"): from_dummies(dummies_basic, sep="_") From 773f03fde07fcb4bf5a7c4b444d2c5567443ff7f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 21 Jul 2022 09:46:22 +0200 Subject: [PATCH 18/27] address comments + update some todo comments --- pandas/core/frame.py | 7 ++++++- pandas/core/internals/managers.py | 23 +++++++++-------------- pandas/core/series.py | 1 - pandas/io/stata.py | 1 + pandas/tests/indexing/test_iloc.py | 5 ++--- 5 files changed, 18 insertions(+), 19 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 05d6f8a0620a2..1bacdc5dd374d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3680,14 +3680,19 @@ def _get_column_array(self, i: int) -> ArrayLike: """ Get the values of the i'th column (ndarray or ExtensionArray, as stored in the Block) + + Warning! The returned array is a view but doesn't handle Copy-on-Write, + so this should be used with caution (for read-only purposes). """ - # TODO(CoW) check if all use case are OK (i.e. don't need CoW handling) return self._mgr.iget_values(i) def _iter_column_arrays(self) -> Iterator[ArrayLike]: """ Iterate over the arrays of all columns in order. This returns the values as stored in the Block (ndarray or ExtensionArray). + + Warning! The returned array is a view but doesn't handle Copy-on-Write, + so this should be used with caution (for read-only purposes). """ for i in range(len(self.columns)): yield self._get_column_array(i) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 8e99f7bea7924..24bbff6fc1885 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -238,18 +238,18 @@ def items(self) -> Index: def _has_no_reference(self, i: int) -> bool: """ - Check for column `i` if has references. + Check for column `i` if it has references. (whether it references another array or is itself being referenced) - Returns True if the columns has no references. + Returns True if the column has no references. """ blkno = self.blknos[i] return self._has_no_reference_block(blkno) def _has_no_reference_block(self, blkno: int) -> bool: """ - Check for column `i` if has references. + Check for block `i` if it has references. (whether it references another array or is itself being referenced) - Returns True if the columns has no references. + Returns True if the block has no references. """ # TODO(CoW) include `or self.refs[blkno]() is None` ? return ( @@ -393,8 +393,6 @@ def putmask(self, mask, new, align: bool = True): # some reference -> copy full dataframe # TODO(CoW) this could be optimized to only copy the blocks that would # get modified - # self.blocks = tuple([blk.copy() for blk in self.blocks]) - # self.refs = None self = self.copy() if align: @@ -1037,9 +1035,11 @@ def _verify_integrity(self) -> None: ) if self.refs is not None: if len(self.refs) != len(self.blocks): - raise ValueError( + raise AssertionError( "Number of passed refs must equal the number of blocks: " f"{len(self.refs)} refs vs {len(self.blocks)} blocks." + "\nIf you see this error, please report a bug at " + "https://github.com/pandas-dev/pandas/issues" ) @classmethod @@ -1061,9 +1061,6 @@ def fast_xs(self, loc: int) -> SingleBlockManager: """ Return the array corresponding to `frame.iloc[loc]`. - Warning! The returned array is a view in case of a single block, - but doesn't handle Copy-on-Write, so this should be used with caution. - Parameters ---------- loc : int @@ -1823,7 +1820,6 @@ def _consolidate_check(self) -> None: self._known_consolidated = True def _consolidate_inplace(self) -> None: - # TODO(CoW) correctly handle passing through refs # In general, _consolidate_inplace should only be called via # DataFrame._consolidate_inplace, otherwise we will fail to invalidate # the DataFrame's _item_cache. The exception is for newly-created @@ -1902,7 +1898,6 @@ def to_2d_mgr(self, columns: Index) -> BlockManager: """ Manager analogue of Series.to_frame """ - # TODO(CoW) pass ref to ensure CoW for to_frame blk = self.blocks[0] arr = ensure_block_shape(blk.values, ndim=2) bp = BlockPlacement(0) @@ -1913,9 +1908,9 @@ def to_2d_mgr(self, columns: Index) -> BlockManager: def _has_no_reference(self, i: int = 0) -> bool: """ - Check for column `i` if has references. + Check for column `i` if it has references. (whether it references another array or is itself being referenced) - Returns True if the columns has no references. + Returns True if the column has no references. """ return (self.refs is None or self.refs[0] is None) and weakref.getweakrefcount( self.blocks[0] diff --git a/pandas/core/series.py b/pandas/core/series.py index 075d0925d9a47..bbbc20488d04a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1297,7 +1297,6 @@ def _maybe_update_cacher( del self._cacher # for CoW, we never want to update the parent DataFrame cache # if the Series changed, and always pop the cached item - # TODO replace False with check for option elif ( not ( get_option("mode.copy_on_write") diff --git a/pandas/io/stata.py b/pandas/io/stata.py index c727bb6641e7b..de92b2865b53c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2510,6 +2510,7 @@ def _set_formats_and_types(self, dtypes: Series) -> None: def _prepare_pandas(self, data: DataFrame) -> None: # NOTE: we might need a different API / class for pandas objects so # we can set different semantics - handle this with a PR to pandas.io + data = data.copy() if self._write_index: diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 2728b0d428ee6..a654319bd0b7c 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -71,9 +71,7 @@ class TestiLocBaseIndependent: ], ) @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) - def test_iloc_setitem_fullcol_categorical( - self, indexer, key, using_array_manager, using_copy_on_write - ): + def test_iloc_setitem_fullcol_categorical(self, indexer, key, using_array_manager): frame = DataFrame({0: range(3)}, dtype=object) cat = Categorical(["alpha", "beta", "gamma"]) @@ -107,6 +105,7 @@ def test_iloc_setitem_fullcol_categorical( df.iloc[0, 0] = "gamma" assert cat[0] != "gamma" + # TODO with mixed dataframe ("split" path), we always overwrite the column frame = DataFrame({0: np.array([0, 1, 2], dtype=object), 1: range(3)}) df = frame.copy() orig_vals = df.values From bfc2117044ebab78493eddcbed39da0fd07d80e8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 21 Jul 2022 09:47:19 +0200 Subject: [PATCH 19/27] Update pandas/core/internals/managers.py Co-authored-by: Matthew Roeschke --- pandas/core/internals/managers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 24bbff6fc1885..f276ebbf8826f 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -2281,7 +2281,7 @@ def _consolidate(blocks: tuple[Block, ...]) -> list[Block]: return tuple(new_blocks) -def _consolidate_with_refs(blocks: tuple[Block, ...], refs) -> list[Block]: +def _consolidate_with_refs(blocks: tuple[Block, ...], refs) -> tuple[list[Block], list[weakref.ref | None]]: """ Merge blocks having same dtype, exclude non-consolidating blocks, handling refs From 89c4fffd609fcd9a912591356a1c1cc4b608245f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 21 Jul 2022 09:48:09 +0200 Subject: [PATCH 20/27] fixup linting --- pandas/core/internals/managers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index f276ebbf8826f..296a8b9b0d9e7 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -2281,7 +2281,9 @@ def _consolidate(blocks: tuple[Block, ...]) -> list[Block]: return tuple(new_blocks) -def _consolidate_with_refs(blocks: tuple[Block, ...], refs) -> tuple[list[Block], list[weakref.ref | None]]: +def _consolidate_with_refs( + blocks: tuple[Block, ...], refs +) -> tuple[list[Block], list[weakref.ref | None]]: """ Merge blocks having same dtype, exclude non-consolidating blocks, handling refs From b2e56ea209b68be11f7220be4349206f5c2ea79b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 21 Jul 2022 10:34:15 +0200 Subject: [PATCH 21/27] update new copy_view tests to use get_array helper --- pandas/tests/copy_view/test_internals.py | 3 ++- pandas/tests/copy_view/test_methods.py | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index 066b6eab5ef57..86de38557ff50 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -3,6 +3,7 @@ import pandas.util._test_decorators as td from pandas import DataFrame +from pandas.tests.copy_view.util import get_array @td.skip_array_manager_invalid_test @@ -27,7 +28,7 @@ def test_consolidate(): assert subset._mgr.refs[0] is not None # equivalent of assert np.shares_memory(df["b"].values, subset["b"].values) # but avoids caching df["b"] - assert np.shares_memory(df._get_column_array(1), subset["b"].values) + assert np.shares_memory(get_array(df, "b"), get_array(subset, "b")) # the new consolidated int64 block does not reference another assert subset._mgr.refs[1] is None diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index cf58c80e567af..b199dfcaee2c2 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -140,12 +140,12 @@ def test_select_dtypes(using_copy_on_write): df2._mgr._verify_integrity() # currently this always returns a "view" - assert np.shares_memory(df2["a"].values, df["a"].values) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) # mutating df2 triggers a copy-on-write for that column/block df2.iloc[0, 0] = 0 if using_copy_on_write: - assert not np.shares_memory(df2["a"].values, df["a"].values) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) tm.assert_frame_equal(df, df_orig) else: # but currently select_dtypes() actually returns a view -> mutates parent @@ -161,13 +161,13 @@ def test_to_frame(using_copy_on_write): df = ser.to_frame() # currently this always returns a "view" - assert np.shares_memory(ser.values, df._get_column_array(0)) + assert np.shares_memory(ser.values, get_array(df, 0)) df.iloc[0, 0] = 0 if using_copy_on_write: # mutating df triggers a copy-on-write for that column - assert not np.shares_memory(ser.values, df[0].values) + assert not np.shares_memory(ser.values, get_array(df, 0)) tm.assert_series_equal(ser, ser_orig) else: # but currently select_dtypes() actually returns a view -> mutates parent From fdeb1544bdd7ec9abfbb01c7a9545c2f69b6b514 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 21 Jul 2022 10:42:01 +0200 Subject: [PATCH 22/27] add comment to setitem --- pandas/core/internals/managers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 296a8b9b0d9e7..c97f4f0e75197 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -382,9 +382,8 @@ def setitem(self: T, indexer, value) -> T: if not self._has_no_reference(0): # if being referenced -> perform Copy-on-Write and clear the reference - # self.blocks = tuple([self.blocks[0].copy()]) + # this method is only called if there is a single block -> hardcoded 0 self = self.copy() - # self._clear_reference_block(blkno) return self.apply("setitem", indexer=indexer, value=value) From c521161c55565c1b062040116b38b6b074f64e5e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 21 Jul 2022 11:23:50 +0200 Subject: [PATCH 23/27] switch default to False, ensure CoW copies only happen when enabled + add additional test build with CoW --- .github/workflows/ubuntu.yml | 5 +++++ pandas/core/config_init.py | 9 +++++--- pandas/core/internals/managers.py | 28 +++++++++++++++--------- pandas/tests/copy_view/test_internals.py | 9 ++++---- 4 files changed, 34 insertions(+), 17 deletions(-) diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index a759280c74521..4c1550accb688 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -52,6 +52,10 @@ jobs: extra_apt: "language-pack-zh-hans" lang: "zh_CN.utf8" lc_all: "zh_CN.utf8" + - name: "Copy-on-Write" + env_file: actions-310.yaml + pattern: "not slow and not network and not single_cpu" + pandas_copy_on_write: "1" - name: "Data Manager" env_file: actions-38.yaml pattern: "not slow and not network and not single_cpu" @@ -84,6 +88,7 @@ jobs: LC_ALL: ${{ matrix.lc_all || '' }} PANDAS_TESTING_MODE: ${{ matrix.pandas_testing_mode || '' }} PANDAS_DATA_MANAGER: ${{ matrix.pandas_data_manager || 'block' }} + PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }} TEST_ARGS: ${{ matrix.test_args || '' }} PYTEST_WORKERS: ${{ contains(matrix.pattern, 'not single_cpu') && 'auto' || '1' }} PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index f960df13d1479..f269327b23d74 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -542,15 +542,18 @@ def use_inf_as_na_cb(key) -> None: # TODO better name? copy_on_write_doc = """ : bool - Use new copy-view behaviour using Copy-on-Write + Use new copy-view behaviour using Copy-on-Write. Defaults to False, + unless overridden by the 'PANDAS_COPY_ON_WRITE' environment variable (needs + to be set before pandas is imported). """ with cf.config_prefix("mode"): cf.register_option( "copy_on_write", - # TODO turn to False before merging - True, + # Get the default from an environment variable, if set, otherwise defaults + # to False. This environment variable can be set for testing. + os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "1", copy_on_write_doc, validator=is_bool, ) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c97f4f0e75197..b1fd37af00777 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -380,7 +380,7 @@ def setitem(self: T, indexer, value) -> T: if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim: raise ValueError(f"Cannot set values with ndim > {self.ndim}") - if not self._has_no_reference(0): + if _using_copy_on_write() and not self._has_no_reference(0): # if being referenced -> perform Copy-on-Write and clear the reference # this method is only called if there is a single block -> hardcoded 0 self = self.copy() @@ -388,7 +388,11 @@ def setitem(self: T, indexer, value) -> T: return self.apply("setitem", indexer=indexer, value=value) def putmask(self, mask, new, align: bool = True): - if self.refs is not None and not all(ref is None for ref in self.refs): + if ( + _using_copy_on_write() + and self.refs is not None + and not all(ref is None for ref in self.refs) + ): # some reference -> copy full dataframe # TODO(CoW) this could be optimized to only copy the blocks that would # get modified @@ -428,7 +432,7 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T: limit = libalgos.validate_limit(None, limit=limit) if inplace: # TODO(CoW) can be optimized to only copy those blocks that have refs - if any( + if _using_copy_on_write() and any( not self._has_no_reference_block(i) for i in range(len(self.blocks)) ): self = self.copy() @@ -618,7 +622,7 @@ def copy(self: T, deep=True) -> T: BlockManager """ if deep is None: - if get_option("mode.copy_on_write"): + if _using_copy_on_write(): # use shallow copy deep = False else: @@ -702,7 +706,7 @@ def reindex_indexer( pandas-indexer with -1's only. """ if copy is None: - if get_option("mode.copy_on_write"): + if _using_copy_on_write(): # use shallow copy copy = False else: @@ -875,7 +879,7 @@ def _slice_take_blocks_ax0( # we may try to only slice taker = blklocs[mgr_locs.indexer] max_len = max(len(mgr_locs), taker.max() + 1) - if only_slice or get_option("mode.copy_on_write"): + if only_slice or _using_copy_on_write(): taker = lib.maybe_indices_to_slice(taker, max_len) if isinstance(taker, slice): @@ -1218,7 +1222,7 @@ def value_getitem(placement): blk_locs = blklocs[val_locs.indexer] if inplace and blk.should_store(value): # Updating inplace -> check if we need to do Copy-on-Write - if not self._has_no_reference_block(blkno_l): + if _using_copy_on_write() and not self._has_no_reference_block(blkno_l): blk.set_inplace(blk_locs, value_getitem(val_locs), copy=True) self._clear_reference_block(blkno_l) else: @@ -1316,7 +1320,7 @@ def _iset_single( if inplace and blk.should_store(value): copy = False - if not self._has_no_reference_block(blkno): + if _using_copy_on_write() and not self._has_no_reference_block(blkno): # perform Copy-on-Write and clear the reference copy = True self._clear_reference_block(blkno) @@ -1338,7 +1342,7 @@ def column_setitem(self, loc: int, idx: int | slice | np.ndarray, value) -> None This is a method on the BlockManager level, to avoid creating an intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`) """ - if not self._has_no_reference(loc): + if _using_copy_on_write() and not self._has_no_reference(loc): # otherwise perform Copy-on-Write and clear the reference blkno = self.blknos[loc] blocks = list(self.blocks) @@ -2042,7 +2046,7 @@ def setitem_inplace(self, indexer, value) -> None: in place, not returning a new Manager (and Block), and thus never changing the dtype. """ - if not self._has_no_reference(0): + if _using_copy_on_write() and not self._has_no_reference(0): self.blocks = (self._block.copy(),) self.refs = None self._cache.clear() @@ -2372,3 +2376,7 @@ def _preprocess_slice_or_indexer( if not allow_fill: indexer = maybe_convert_indices(indexer, length) return "fancy", indexer, len(indexer) + + +def _using_copy_on_write(): + return get_option("mode.copy_on_write") diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index 86de38557ff50..2191fc1b33218 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -7,7 +7,7 @@ @td.skip_array_manager_invalid_test -def test_consolidate(): +def test_consolidate(using_copy_on_write): # create unconsolidated DataFrame df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) @@ -39,6 +39,7 @@ def test_consolidate(): assert df._mgr._has_no_reference(2) # and modifying subset still doesn't modify parent - subset.iloc[0, 1] = 0.0 - assert df._mgr._has_no_reference(1) - assert df.loc[0, "b"] == 0.1 + if using_copy_on_write: + subset.iloc[0, 1] = 0.0 + assert df._mgr._has_no_reference(1) + assert df.loc[0, "b"] == 0.1 From fb297dfea0a7f04ed65f0a6a0c9cb02fc82f3901 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 21 Jul 2022 12:20:54 +0200 Subject: [PATCH 24/27] update type annotations --- pandas/core/internals/managers.py | 16 +++++++++------- pandas/tests/copy_view/test_methods.py | 4 ++-- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b1fd37af00777..d15491c1ca99f 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -162,7 +162,7 @@ def from_blocks( cls: type_t[T], blocks: list[Block], axes: list[Index], - refs: list[weakref.ref | None] | None, + refs: list[weakref.ref | None] | None = None, ) -> T: raise NotImplementedError @@ -594,7 +594,8 @@ def _combine( nb.mgr_locs = BlockPlacement(inv_indexer[nb.mgr_locs.indexer]) new_blocks.append(nb) if not copy: - new_refs.append(weakref.ref(b)) + # None has no attribute "append" + new_refs.append(weakref.ref(b)) # type: ignore[union-attr] axes = list(self.axes) if index is not None: @@ -641,6 +642,7 @@ def copy_func(ax): new_axes = list(self.axes) res = self.apply("copy", deep=deep) + new_refs: list[weakref.ref | None] | None if deep: new_refs = None else: @@ -766,7 +768,7 @@ def _slice_take_blocks_ax0( only_slice: bool = False, *, use_na_proxy: bool = False, - ) -> list[Block]: + ) -> tuple[list[Block], list[weakref.ref | None]]: """ Slice/take blocks along axis=0. @@ -845,7 +847,7 @@ def _slice_take_blocks_ax0( # When filling blknos, make sure blknos is updated before appending to # blocks list, that way new blkno is exactly len(blocks). blocks = [] - refs = [] + refs: list[weakref.ref | None] = [] group = not only_slice for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group): if blkno == -1: @@ -1906,7 +1908,7 @@ def to_2d_mgr(self, columns: Index) -> BlockManager: bp = BlockPlacement(0) new_blk = type(blk)(arr, placement=bp, ndim=2) axes = [columns, self.axes[0]] - refs = [weakref.ref(blk)] + refs: list[weakref.ref | None] = [weakref.ref(blk)] return BlockManager([new_blk], axes=axes, refs=refs, verify_integrity=False) def _has_no_reference(self, i: int = 0) -> bool: @@ -2267,7 +2269,7 @@ def _stack_arrays(tuples, dtype: np.dtype): return stacked, placement -def _consolidate(blocks: tuple[Block, ...]) -> list[Block]: +def _consolidate(blocks: tuple[Block, ...]) -> tuple[Block, ...]: """ Merge blocks having same dtype, exclude non-consolidating blocks """ @@ -2286,7 +2288,7 @@ def _consolidate(blocks: tuple[Block, ...]) -> list[Block]: def _consolidate_with_refs( blocks: tuple[Block, ...], refs -) -> tuple[list[Block], list[weakref.ref | None]]: +) -> tuple[tuple[Block, ...], list[weakref.ref | None]]: """ Merge blocks having same dtype, exclude non-consolidating blocks, handling refs diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index b199dfcaee2c2..cc4c219e6c5d9 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -15,7 +15,7 @@ def test_copy(using_copy_on_write): # the deep copy doesn't share memory assert not np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) if using_copy_on_write: - assert df_copy._mgr.refs is None # type: ignore[union-attr] + assert df_copy._mgr.refs is None # mutating copy doesn't mutate original df_copy.iloc[0, 0] = 0 @@ -29,7 +29,7 @@ def test_copy_shallow(using_copy_on_write): # the shallow copy still shares memory assert np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) if using_copy_on_write: - assert df_copy._mgr.refs is not None # type: ignore[union-attr] + assert df_copy._mgr.refs is not None if using_copy_on_write: # mutating shallow copy doesn't mutate original From 14f753e3cd8206d6c3633b1d09aa4437ddee0f7e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 10 Aug 2022 17:56:09 +0200 Subject: [PATCH 25/27] Fix stata issue to avoid SettingWithCopyWarning in read_stata --- pandas/io/stata.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index a81b70e579dcd..00e7b18fa14f4 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -338,6 +338,9 @@ def convert_delta_safe(base, deltas, unit) -> Series: has_bad_values = False if bad_locs.any(): has_bad_values = True + # reset cache to avoid SettingWithCopy checks (we own the DataFrame and the + # `dates` Series is used to overwrite itself in the DataFramae) + dates._reset_cacher() dates[bad_locs] = 1.0 # Replace with NaT dates = dates.astype(np.int64) From 764838fdca3deb599b20430ef6e413f5e3ad1535 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 17 Aug 2022 21:03:56 +0200 Subject: [PATCH 26/27] update type + option comment --- pandas/conftest.py | 7 ++----- pandas/core/config_init.py | 4 ++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index f9844d7223b0e..e3909953d8a58 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -29,10 +29,7 @@ from decimal import Decimal import operator import os -from typing import ( - Callable, - Literal, -) +from typing import Callable from dateutil.tz import ( tzlocal, @@ -1844,7 +1841,7 @@ def using_array_manager(): @pytest.fixture -def using_copy_on_write() -> Literal[False]: +def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 19e1e8ebe6b0e..2579f736a9703 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -544,8 +544,8 @@ def use_inf_as_na_cb(key) -> None: copy_on_write_doc = """ : bool Use new copy-view behaviour using Copy-on-Write. Defaults to False, - unless overridden by the 'PANDAS_COPY_ON_WRITE' environment variable (needs - to be set before pandas is imported). + unless overridden by the 'PANDAS_COPY_ON_WRITE' environment variable + (if set to "1" for True, needs to be set before pandas is imported). """ From 3d1377bdfe83e7b1ae083d556daecb40be46a857 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 19 Aug 2022 09:45:51 +0200 Subject: [PATCH 27/27] fixup new rename test --- pandas/tests/series/methods/test_rename.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/tests/series/methods/test_rename.py b/pandas/tests/series/methods/test_rename.py index 729c07b8bdde7..d0392929cb082 100644 --- a/pandas/tests/series/methods/test_rename.py +++ b/pandas/tests/series/methods/test_rename.py @@ -143,10 +143,15 @@ def test_rename_error_arg(self): with pytest.raises(KeyError, match=match): ser.rename({2: 9}, errors="raise") - def test_rename_copy_false(self): + def test_rename_copy_false(self, using_copy_on_write): # GH 46889 ser = Series(["foo", "bar"]) + ser_orig = ser.copy() shallow_copy = ser.rename({1: 9}, copy=False) ser[0] = "foobar" - assert ser[0] == shallow_copy[0] - assert ser[1] == shallow_copy[9] + if using_copy_on_write: + assert ser_orig[0] == shallow_copy[0] + assert ser_orig[1] == shallow_copy[9] + else: + assert ser[0] == shallow_copy[0] + assert ser[1] == shallow_copy[9]