Skip to content

DEPR: enforce inplaceness for df.loc[:, foo]=bar #49775

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Dec 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -636,6 +636,7 @@ Removal of prior version deprecations/changes
- Changed default of ``numeric_only`` to ``False`` in all DataFrame methods with that argument (:issue:`46096`, :issue:`46906`)
- Changed default of ``numeric_only`` to ``False`` in :meth:`Series.rank` (:issue:`47561`)
- Enforced deprecation of silently dropping nuisance columns in groupby and resample operations when ``numeric_only=False`` (:issue:`41475`)
- Changed behavior in setting values with ``df.loc[:, foo] = bar`` or ``df.iloc[:, foo] = bar``, these now always attempt to set values inplace before falling back to casting (:issue:`45333`)
- Changed default of ``numeric_only`` in various :class:`.DataFrameGroupBy` methods; all methods now default to ``numeric_only=False`` (:issue:`46072`)
- Changed default of ``numeric_only`` to ``False`` in :class:`.Resampler` methods (:issue:`47177`)
- Using the method :meth:`DataFrameGroupBy.transform` with a callable that returns DataFrames will align to the input's index (:issue:`47244`)
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1789,6 +1789,10 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any:
return element
raise LossySetitemError

if dtype.kind == "V":
# i.e. np.void, which cannot hold _anything_
raise LossySetitemError

raise NotImplementedError(dtype)


Expand Down
14 changes: 6 additions & 8 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4217,7 +4217,7 @@ def _set_value(
else:
icol = self.columns.get_loc(col)
iindex = self.index.get_loc(index)
self._mgr.column_setitem(icol, iindex, value, inplace=True)
self._mgr.column_setitem(icol, iindex, value, inplace_only=True)
self._clear_item_cache()

except (KeyError, TypeError, ValueError, LossySetitemError):
Expand Down Expand Up @@ -8111,10 +8111,10 @@ def update(
>>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
>>> df.update(new_df)
>>> df
A B
0 1 4.0
1 2 500.0
2 3 6.0
A B
0 1 4
1 2 500
2 3 6
"""
from pandas.core.computation import expressions

Expand Down Expand Up @@ -8152,9 +8152,7 @@ def update(
if mask.all():
continue

with warnings.catch_warnings():
warnings.filterwarnings("ignore", "In a future version, `df.iloc")
self.loc[:, col] = expressions.where(mask, this, that)
self.loc[:, col] = expressions.where(mask, this, that)

# ----------------------------------------------------------------------
# Data reshaping
Expand Down
96 changes: 33 additions & 63 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
cast,
final,
)
import warnings

import numpy as np

Expand All @@ -23,9 +22,9 @@
AbstractMethodError,
IndexingError,
InvalidIndexError,
LossySetitemError,
)
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import (
can_hold_element,
Expand Down Expand Up @@ -793,6 +792,7 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None:
if self.ndim != 2:
return

orig_key = key
if isinstance(key, tuple) and len(key) > 1:
# key may be a tuple if we are .loc
# if length of key is > 1 set key to column part
Expand All @@ -808,6 +808,23 @@ def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None:
):
# GH#38148
keys = self.obj.columns.union(key, sort=False)
diff = Index(key).difference(self.obj.columns, sort=False)

if len(diff) and com.is_null_slice(orig_key[0]):
# e.g. if we are doing df.loc[:, ["A", "B"]] = 7 and "B"
# is a new column, add the new columns with dtype=np.void
# so that later when we go through setitem_single_column
# we will use isetitem. Without this, the reindex_axis
# below would create float64 columns in this example, which
# would successfully hold 7, so we would end up with the wrong
# dtype.
indexer = np.arange(len(keys), dtype=np.intp)
indexer[len(self.obj.columns) :] = -1
new_mgr = self.obj._mgr.reindex_indexer(
keys, indexer=indexer, axis=0, only_slice=True, use_na_proxy=True
)
self.obj._mgr = new_mgr
return

self.obj._mgr = self.obj._mgr.reindex_axis(keys, axis=0, only_slice=True)

Expand Down Expand Up @@ -1979,72 +1996,25 @@ def _setitem_single_column(self, loc: int, value, plane_indexer) -> None:
"""
pi = plane_indexer

orig_values = self.obj._get_column_array(loc)

# perform the equivalent of a setitem on the info axis
# as we have a null slice or a slice with full bounds
# which means essentially reassign to the columns of a
# multi-dim object
# GH#6149 (null slice), GH#10408 (full bounds)
if com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj)):
pass
elif (
is_array_like(value)
and len(value.shape) > 0
and self.obj.shape[0] == value.shape[0]
and not is_empty_indexer(pi)
):
if is_list_like(pi) and not is_bool_dtype(pi):
value = value[np.argsort(pi)]
else:
# in case of slice
value = value[pi]
is_full_setter = com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj))

if is_full_setter:

try:
self.obj._mgr.column_setitem(
loc, plane_indexer, value, inplace_only=True
)
except (ValueError, TypeError, LossySetitemError):
# If we're setting an entire column and we can't do it inplace,
# then we can use value's dtype (or inferred dtype)
# instead of object
self.obj.isetitem(loc, value)
else:
# set value into the column (first attempting to operate inplace, then
# falling back to casting if necessary)
self.obj._mgr.column_setitem(loc, plane_indexer, value)
self.obj._clear_item_cache()
return

self.obj._iset_item(loc, value)

# We will not operate in-place, but will attempt to in the future.
# To determine whether we need to issue a FutureWarning, see if the
# setting in-place would work, i.e. behavior will change.

new_values = self.obj._get_column_array(loc)

if can_hold_element(orig_values, new_values) and not len(new_values) == 0:
# Don't issue the warning yet, as we can still trim a few cases where
# behavior will not change.

if (
isinstance(new_values, np.ndarray)
and isinstance(orig_values, np.ndarray)
and (
np.shares_memory(new_values, orig_values)
or new_values.shape != orig_values.shape
)
):
# TODO: get something like tm.shares_memory working?
# The values were set inplace after all, no need to warn,
# e.g. test_rename_nocopy
# In case of enlarging we can not set inplace, so need to
# warn either
pass
else:
warnings.warn(
"In a future version, `df.iloc[:, i] = newvals` will attempt "
"to set the values inplace instead of always setting a new "
"array. To retain the old behavior, use either "
"`df[df.columns[i]] = newvals` or, if columns are non-unique, "
"`df.isetitem(i, newvals)`",
FutureWarning,
stacklevel=find_stack_level(),
)
# TODO: how to get future behavior?
# TODO: what if we got here indirectly via loc?
return
self.obj._clear_item_cache()

def _setitem_single_block(self, indexer, value, name: str) -> None:
"""
Expand Down
6 changes: 4 additions & 2 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -856,19 +856,21 @@ def iset(
return

def column_setitem(
self, loc: int, idx: int | slice | np.ndarray, value, inplace: bool = False
self, loc: int, idx: int | slice | np.ndarray, value, inplace_only: bool = False
) -> None:
"""
Set values ("setitem") into a single column (not setting the full column).

This is a method on the ArrayManager level, to avoid creating an
intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`)


"""
if not is_integer(loc):
raise TypeError("The column index should be an integer")
arr = self.arrays[loc]
mgr = SingleArrayManager([arr], [self._axes[0]])
if inplace:
if inplace_only:
mgr.setitem_inplace(idx, value)
else:
new_mgr = mgr.setitem((idx,), value)
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1336,7 +1336,7 @@ def _iset_single(
return

def column_setitem(
self, loc: int, idx: int | slice | np.ndarray, value, inplace: bool = False
self, loc: int, idx: int | slice | np.ndarray, value, inplace_only: bool = False
) -> None:
"""
Set values ("setitem") into a single column (not setting the full column).
Expand All @@ -1355,7 +1355,7 @@ def column_setitem(
# this manager is only created temporarily to mutate the values in place
# so don't track references, otherwise the `setitem` would perform CoW again
col_mgr = self.iget(loc, track_ref=False)
if inplace:
if inplace_only:
col_mgr.setitem_inplace(idx, value)
else:
new_mgr = col_mgr.setitem((idx,), value)
Expand Down
1 change: 0 additions & 1 deletion pandas/tests/computation/test_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -1874,7 +1874,6 @@ def test_eval_no_support_column_name(request, column):
tm.assert_frame_equal(result, expected)


@td.skip_array_manager_not_yet_implemented
def test_set_inplace(using_copy_on_write):
# https://github.com/pandas-dev/pandas/issues/47449
# Ensure we don't only update the DataFrame inplace, but also the actual
Expand Down
32 changes: 12 additions & 20 deletions pandas/tests/copy_view/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,10 +337,8 @@ def test_subset_set_column_with_loc(using_copy_on_write, using_array_manager, dt
subset.loc[:, "a"] = np.array([10, 11], dtype="int64")
else:
with pd.option_context("chained_assignment", "warn"):
# The (i)loc[:, col] inplace deprecation gets triggered here, ignore those
# warnings and only assert the SettingWithCopyWarning
with tm.assert_produces_warning(
SettingWithCopyWarning,
None,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be considered as a false negative? (I would expect a warning here, but in general we are not super consistent about when we warn and when not)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i dont fully grok when we should be issuing the warning, but I think that no copies are being made here, so it would make sense to not get the warning

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think in the past, we have often raised warnings both when the actual object is a copy (and people might think it is a view) or a view (and people might think it is a copy), despite the exact name of the warning. Now, I don't know if we ever clearly defined in which cases the warning should be raised, though ;)

It also seems to depend on the dtypes and the exact value that is being set, as I was running the above example on released pandas and eg when changing the setitem from df.loc[:, 'a'] to df.loc[1, 'a'] (scalar), the it does raise a warning with a single block, but not for a df with multiple dtypes ...

raise_on_extra_warnings=not using_array_manager,
):
subset.loc[:, "a"] = np.array([10, 11], dtype="int64")
Expand All @@ -351,7 +349,7 @@ def test_subset_set_column_with_loc(using_copy_on_write, using_array_manager, dt
index=range(1, 3),
)
tm.assert_frame_equal(subset, expected)
if using_copy_on_write or using_array_manager:
if using_copy_on_write:
# original parent dataframe is not modified (CoW)
tm.assert_frame_equal(df, df_orig)
else:
Expand All @@ -373,18 +371,16 @@ def test_subset_set_column_with_loc2(using_copy_on_write, using_array_manager):
subset.loc[:, "a"] = 0
else:
with pd.option_context("chained_assignment", "warn"):
# The (i)loc[:, col] inplace deprecation gets triggered here, ignore those
# warnings and only assert the SettingWithCopyWarning
with tm.assert_produces_warning(
SettingWithCopyWarning,
None,
raise_on_extra_warnings=not using_array_manager,
):
subset.loc[:, "a"] = 0

subset._mgr._verify_integrity()
expected = DataFrame({"a": [0, 0]}, index=range(1, 3))
tm.assert_frame_equal(subset, expected)
if using_copy_on_write or using_array_manager:
if using_copy_on_write:
# original parent dataframe is not modified (CoW)
tm.assert_frame_equal(df, df_orig)
else:
Expand Down Expand Up @@ -439,24 +435,20 @@ def test_subset_set_with_column_indexer(
subset.loc[:, indexer] = 0
else:
with pd.option_context("chained_assignment", "warn"):
# The (i)loc[:, col] inplace deprecation gets triggered here, ignore those
# warnings and only assert the SettingWithCopyWarning
with tm.assert_produces_warning(
SettingWithCopyWarning, raise_on_extra_warnings=False
):
subset.loc[:, indexer] = 0
# As of 2.0, this setitem attempts (successfully) to set values
# inplace, so the assignment is not chained.
subset.loc[:, indexer] = 0

subset._mgr._verify_integrity()
expected = DataFrame({"a": [0, 0], "b": [0.0, 0.0], "c": [5, 6]}, index=range(1, 3))
# TODO full row slice .loc[:, idx] update inplace instead of overwrite?
expected["b"] = expected["b"].astype("int64")
tm.assert_frame_equal(subset, expected)
if using_copy_on_write or using_array_manager:
if using_copy_on_write:
tm.assert_frame_equal(df, df_orig)
else:
# In the mixed case with BlockManager, only one of the two columns is
# mutated in the parent frame ..
df_orig.loc[1:2, ["a"]] = 0
# pre-2.0, in the mixed case with BlockManager, only column "a"
# would be mutated in the parent frame. this changed with the
# enforcement of GH#45333
df_orig.loc[1:2, ["a", "b"]] = 0
tm.assert_frame_equal(df, df_orig)


Expand Down
27 changes: 2 additions & 25 deletions pandas/tests/extension/base/setitem.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,6 @@
import numpy as np
import pytest

from pandas.core.dtypes.dtypes import (
DatetimeTZDtype,
IntervalDtype,
PandasDtype,
PeriodDtype,
)

import pandas as pd
import pandas._testing as tm
from pandas.tests.extension.base.base import BaseExtensionTests
Expand Down Expand Up @@ -382,11 +375,6 @@ def test_setitem_frame_2d_values(self, data):
# GH#44514
df = pd.DataFrame({"A": data})

# These dtypes have non-broken implementations of _can_hold_element
has_can_hold_element = isinstance(
data.dtype, (PandasDtype, PeriodDtype, IntervalDtype, DatetimeTZDtype)
)

# Avoiding using_array_manager fixture
# https://github.com/pandas-dev/pandas/pull/44514#discussion_r754002410
using_array_manager = isinstance(df._mgr, pd.core.internals.ArrayManager)
Expand All @@ -396,24 +384,13 @@ def test_setitem_frame_2d_values(self, data):

orig = df.copy()

msg = "will attempt to set the values inplace instead"
warn = None
if has_can_hold_element and not isinstance(data.dtype, PandasDtype):
# PandasDtype excluded because it isn't *really* supported.
warn = FutureWarning

with tm.assert_produces_warning(warn, match=msg):
df.iloc[:] = df
df.iloc[:] = df
self.assert_frame_equal(df, orig)

df.iloc[:-1] = df.iloc[:-1]
self.assert_frame_equal(df, orig)

if isinstance(data.dtype, DatetimeTZDtype):
# no warning bc df.values casts to object dtype
warn = None
with tm.assert_produces_warning(warn, match=msg):
df.iloc[:] = df.values
df.iloc[:] = df.values
self.assert_frame_equal(df, orig)
if not using_array_manager and not using_copy_on_write:
# GH#33457 Check that this setting occurred in-place
Expand Down
Loading