diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index e66011acb978b..1f55a132d665b 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -1605,6 +1605,9 @@ def delete(self, loc) -> list[Block]:
             values = self.values.delete(loc)
             mgr_locs = self._mgr_locs.delete(loc)
             return [type(self)(values, placement=mgr_locs, ndim=self.ndim)]
+        elif self.values.ndim == 1:
+            # We get here through to_stata
+            return []
         return super().delete(loc)
 
     @cache_readonly
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index 517e6d7e48275..f003e5eb6a052 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -1250,7 +1250,10 @@ def value_getitem(placement):
             self._known_consolidated = False
 
     def _iset_split_block(
-        self, blkno_l: int, blk_locs: np.ndarray, value: ArrayLike | None = None
+        self,
+        blkno_l: int,
+        blk_locs: np.ndarray | list[int],
+        value: ArrayLike | None = None,
     ) -> None:
         """Removes columns from a block by splitting the block.
 
@@ -1271,12 +1274,8 @@ def _iset_split_block(
 
         nbs_tup = tuple(blk.delete(blk_locs))
         if value is not None:
-            # Argument 1 to "BlockPlacement" has incompatible type "BlockPlacement";
-            # expected "Union[int, slice, ndarray[Any, Any]]"
-            first_nb = new_block_2d(
-                value,
-                BlockPlacement(blk.mgr_locs[blk_locs]),  # type: ignore[arg-type]
-            )
+            locs = blk.mgr_locs.as_array[blk_locs]
+            first_nb = new_block_2d(value, BlockPlacement(locs))
         else:
             first_nb = nbs_tup[0]
             nbs_tup = tuple(nbs_tup[1:])
@@ -1287,6 +1286,10 @@ def _iset_split_block(
         )
         self.blocks = blocks_tup
 
+        if not nbs_tup and value is not None:
+            # No need to update anything if split did not happen
+            return
+
         self._blklocs[first_nb.mgr_locs.indexer] = np.arange(len(first_nb))
 
         for i, nb in enumerate(nbs_tup):
@@ -1330,11 +1333,18 @@ def column_setitem(
         intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)
         """
         if using_copy_on_write() and not self._has_no_reference(loc):
-            # otherwise perform Copy-on-Write and clear the reference
             blkno = self.blknos[loc]
-            blocks = list(self.blocks)
-            blocks[blkno] = blocks[blkno].copy()
-            self.blocks = tuple(blocks)
+            # Split blocks to only copy the column we want to modify
+            blk_loc = self.blklocs[loc]
+            # Copy our values
+            values = self.blocks[blkno].values
+            if values.ndim == 1:
+                values = values.copy()
+            else:
+                # Use [blk_loc] as indexer to keep ndim=2, this already results in a
+                # copy
+                values = values[[blk_loc]]
+            self._iset_split_block(blkno, [blk_loc], values)
 
         # this manager is only created temporarily to mutate the values in place
         # so don't track references, otherwise the `setitem` would perform CoW again
diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py
index df6b83518eaff..a673d8b37a008 100644
--- a/pandas/tests/copy_view/test_indexing.py
+++ b/pandas/tests/copy_view/test_indexing.py
@@ -883,3 +883,39 @@ def test_dataframe_add_column_from_series():
     df.loc[2, "new"] = 100
     expected_s = Series([0, 11, 12])
     tm.assert_series_equal(s, expected_s)
+
+
+@pytest.mark.parametrize("val", [100, "a"])
+@pytest.mark.parametrize(
+    "indexer_func, indexer",
+    [
+        (tm.loc, (0, "a")),
+        (tm.iloc, (0, 0)),
+        (tm.loc, ([0], "a")),
+        (tm.iloc, ([0], 0)),
+        (tm.loc, (slice(None), "a")),
+        (tm.iloc, (slice(None), 0)),
+    ],
+)
+def test_set_value_copy_only_necessary_column(
+    using_copy_on_write, indexer_func, indexer, val
+):
+    # When setting inplace, only copy column that is modified instead of the whole
+    # block (by splitting the block)
+    # TODO multi-block only for now
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+    view = df[:]
+
+    indexer_func(df)[indexer] = val
+
+    if using_copy_on_write:
+        assert np.shares_memory(get_array(df, "b"), get_array(view, "b"))
+        assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
+        tm.assert_frame_equal(view, df_orig)
+    else:
+        assert np.shares_memory(get_array(df, "c"), get_array(view, "c"))
+        if val == "a":
+            assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
+        else:
+            assert np.shares_memory(get_array(df, "a"), get_array(view, "a"))