From 0c8e753a632704c5538670836f3f725b57159ccf Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 27 Dec 2023 16:47:25 +0100 Subject: [PATCH 1/8] CoW: Enable CoW by default and remove warning build --- .github/workflows/unit-tests.yml | 12 ---------- pandas/_config/__init__.py | 12 ++-------- pandas/conftest.py | 12 ++-------- pandas/tests/copy_view/test_internals.py | 30 ------------------------ pandas/tests/extension/conftest.py | 12 ++-------- pandas/tests/test_downstream.py | 1 + 6 files changed, 7 insertions(+), 72 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 6ca4d19196874..ec02d75926ad3 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -73,18 +73,6 @@ jobs: env_file: actions-312.yaml pattern: "not slow and not network and not single_cpu" pandas_copy_on_write: "1" - - name: "Copy-on-Write 3.11 (warnings)" - env_file: actions-311.yaml - pattern: "not slow and not network and not single_cpu" - pandas_copy_on_write: "warn" - - name: "Copy-on-Write 3.10 (warnings)" - env_file: actions-310.yaml - pattern: "not slow and not network and not single_cpu" - pandas_copy_on_write: "warn" - - name: "Copy-on-Write 3.9 (warnings)" - env_file: actions-39.yaml - pattern: "not slow and not network and not single_cpu" - pandas_copy_on_write: "warn" - name: "Pypy" env_file: actions-pypy-39.yaml pattern: "not slow and not network and not single_cpu" diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py index 97784c924dab4..0594d1c190a72 100644 --- a/pandas/_config/__init__.py +++ b/pandas/_config/__init__.py @@ -32,19 +32,11 @@ def using_copy_on_write() -> bool: - _mode_options = _global_config["mode"] - return ( - _mode_options["copy_on_write"] is True - and _mode_options["data_manager"] == "block" - ) + return True def warn_copy_on_write() -> bool: - _mode_options = _global_config["mode"] - return ( - _mode_options["copy_on_write"] == "warn" - and _mode_options["data_manager"] == "block" - ) + return False def using_nullable_dtypes() -> bool: diff --git a/pandas/conftest.py b/pandas/conftest.py index 046cda259eefd..8d2520afb1b0a 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -48,8 +48,6 @@ utc, ) -from pandas._config.config import _get_option - import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import ( @@ -1882,10 +1880,7 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ - return ( - pd.options.mode.copy_on_write is True - and _get_option("mode.data_manager", silent=True) == "block" - ) + return True @pytest.fixture @@ -1893,10 +1888,7 @@ def warn_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is in warning mode. """ - return ( - pd.options.mode.copy_on_write == "warn" - and _get_option("mode.data_manager", silent=True) == "block" - ) + return False @pytest.fixture diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index 615b024bd06bf..324d5ac7f5d3c 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -1,7 +1,6 @@ import numpy as np import pytest -import pandas as pd from pandas import DataFrame import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -42,35 +41,6 @@ def test_consolidate(using_copy_on_write): assert df.loc[0, "b"] == 0.1 -@pytest.mark.single_cpu -def test_switch_options(): - # ensure we can switch the value of the option within one session - # (assuming data is constructed after switching) - - # using the option_context to ensure we set back to global option value - # after running the test - with pd.option_context("mode.copy_on_write", False): - df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) - subset = df[:] - subset.iloc[0, 0] = 0 - # df updated with CoW disabled - assert df.iloc[0, 0] == 0 - - pd.options.mode.copy_on_write = True - df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) - subset = df[:] - subset.iloc[0, 0] = 0 - # df not updated with CoW enabled - assert df.iloc[0, 0] == 1 - - pd.options.mode.copy_on_write = False - df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) - subset = df[:] - subset.iloc[0, 0] = 0 - # df updated with CoW disabled - assert df.iloc[0, 0] == 0 - - @pytest.mark.parametrize("dtype", [np.intp, np.int8]) @pytest.mark.parametrize( "locs, arr", diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py index c5b1295ee4a7d..3a3844d5a8b7a 100644 --- a/pandas/tests/extension/conftest.py +++ b/pandas/tests/extension/conftest.py @@ -2,12 +2,7 @@ import pytest -from pandas._config.config import _get_option - -from pandas import ( - Series, - options, -) +from pandas import Series @pytest.fixture @@ -224,7 +219,4 @@ def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ - return ( - options.mode.copy_on_write is True - and _get_option("mode.data_manager", silent=True) == "block" - ) + return True diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 51ce73ef54300..1c7ff3f924dc3 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -46,6 +46,7 @@ def test_dask(df): pd.set_option("compute.use_numexpr", olduse) +@pytest.mark.xfail(reason="array is read-only") def test_dask_ufunc(): # dask sets "compute.use_numexpr" to False, so catch the current value # and ensure to reset it afterwards to avoid impacting other tests From 7929800bc03f7a5c3a456fbc953121799951b6ed Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 27 Dec 2023 17:13:58 +0100 Subject: [PATCH 2/8] CoW: Boolean indexer in MultiIndex raising read-only error --- pandas/core/indexes/multi.py | 2 ++ pandas/tests/copy_view/test_indexing.py | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 2a4e027e2b806..681a3287e46e6 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3487,6 +3487,8 @@ def _to_bool_indexer(indexer) -> npt.NDArray[np.bool_]: "cannot index with a boolean indexer that " "is not the same length as the index" ) + if isinstance(k, (ABCSeries, Index)): + k = k._values lvl_indexer = np.asarray(k) elif is_list_like(k): diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 9afc98e558c11..7f7be355b76c7 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -1180,6 +1180,24 @@ def test_series_midx_tuples_slice(using_copy_on_write, warn_copy_on_write): tm.assert_series_equal(ser, expected) +def test_midx_read_only_bool_indexer(): + def mklbl(prefix, n): + return [f"{prefix}{i}" for i in range(n)] + + idx = pd.MultiIndex.from_product( + [mklbl("A", 4), mklbl("B", 2), mklbl("C", 4), mklbl("D", 2)] + ) + cols = pd.MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], names=["lvl0", "lvl1"] + ) + df = DataFrame(1, index=idx, columns=cols).sort_index().sort_index(axis=1) + + mask = df[("a", "foo")] == 1 + result = df.loc[pd.IndexSlice[mask, :, ["C1", "C3"]], :] + expected = df.loc[pd.IndexSlice[:, :, ["C1", "C3"]], :] + tm.assert_frame_equal(result, expected) + + def test_loc_enlarging_with_dataframe(using_copy_on_write): df = DataFrame({"a": [1, 2, 3]}) rhs = DataFrame({"b": [1, 2, 3], "c": [4, 5, 6]}) From d9e0570812358dba91a8b771f7c0a70dc06a8f52 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 27 Dec 2023 17:27:14 +0100 Subject: [PATCH 3/8] Fixup ci --- pandas/tests/frame/methods/test_copy.py | 22 ---------------------- pandas/tests/series/test_ufunc.py | 5 +---- pandas/util/_test_decorators.py | 14 -------------- 3 files changed, 1 insertion(+), 40 deletions(-) diff --git a/pandas/tests/frame/methods/test_copy.py b/pandas/tests/frame/methods/test_copy.py index 6208d0256a655..5b72a84320c52 100644 --- a/pandas/tests/frame/methods/test_copy.py +++ b/pandas/tests/frame/methods/test_copy.py @@ -1,10 +1,7 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import DataFrame -import pandas._testing as tm class TestCopy: @@ -18,25 +15,6 @@ def test_copy_index_name_checking(self, float_frame, attr): getattr(cp, attr).name = "foo" assert getattr(float_frame, attr).name is None - @td.skip_copy_on_write_invalid_test - def test_copy_cache(self): - # GH#31784 _item_cache not cleared on copy causes incorrect reads after updates - df = DataFrame({"a": [1]}) - - df["x"] = [0] - df["a"] - - df.copy() - - df["a"].values[0] = -1 - - tm.assert_frame_equal(df, DataFrame({"a": [-1], "x": [0]})) - - df["y"] = [0] - - assert df["a"].values[0] == -1 - tm.assert_frame_equal(df, DataFrame({"a": [-1], "x": [0], "y": [0]})) - def test_copy(self, float_frame, float_string_frame): cop = float_frame.copy() cop["E"] = cop["A"] diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index 9d13ebf740eab..ede6b98b7bf8e 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -5,8 +5,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd import pandas._testing as tm from pandas.arrays import SparseArray @@ -449,8 +447,7 @@ def add3(x, y, z): ufunc(ser, ser, df) -# TODO(CoW) see https://github.com/pandas-dev/pandas/pull/51082 -@td.skip_copy_on_write_not_yet_implemented +@pytest.mark.xfail(reason="see https://github.com/pandas-dev/pandas/pull/51082") def test_np_fix(): # np.fix is not a ufunc but is composed of several ufunc calls under the hood # with `out` and `where` keywords diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 37908c9ac255b..78626781289c4 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -33,12 +33,9 @@ def test_foo(): import pytest -from pandas._config import get_option - if TYPE_CHECKING: from pandas._typing import F - from pandas.compat import ( IS64, is_platform_windows, @@ -144,14 +141,3 @@ def documented_fixture(fixture): return fixture return documented_fixture - - -skip_copy_on_write_not_yet_implemented = pytest.mark.xfail( - get_option("mode.copy_on_write") is True, - reason="Not yet implemented/adapted for Copy-on-Write mode", -) - -skip_copy_on_write_invalid_test = pytest.mark.skipif( - get_option("mode.copy_on_write") is True, - reason="Test not valid for Copy-on-Write mode", -) From 1384b66c97724790f68cf67e40743cc0b5abf7f6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 27 Dec 2023 20:30:29 +0100 Subject: [PATCH 4/8] Update docs --- pandas/core/generic.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index de25a02c6b37c..2fe0b5cd5443a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6710,8 +6710,7 @@ def copy(self, deep: bool_t | None = True) -> Self: :ref:`gotchas ` when copying in a threading environment. - When ``copy_on_write`` in pandas config is set to ``True``, the - ``copy_on_write`` config takes effect even when ``deep=False``. + Copy-on-Write protects shallow copies against accidental modifications. This means that any changes to the copied data would make a new copy of the data upon write (and vice versa). Changes made to either the original or copied variable would not be reflected in the counterpart. @@ -6737,11 +6736,14 @@ def copy(self, deep: bool_t | None = True) -> Self: >>> deep = s.copy() >>> shallow = s.copy(deep=False) - Shallow copy shares data and index with original. + Shallow copy shares index with original, the data is a + view of the original. >>> s is shallow False - >>> s.values is shallow.values and s.index is shallow.index + >>> s.values is shallow.values + False + >>> s.index is shallow.index True Deep copy has own copy of data and index. @@ -6751,18 +6753,17 @@ def copy(self, deep: bool_t | None = True) -> Self: >>> s.values is deep.values or s.index is deep.index False - Updates to the data shared by shallow copy and original is reflected - in both (NOTE: this will no longer be true for pandas >= 3.0); - deep copy remains unchanged. + The shallow copy is protected against updating the original object + as well. Thus, updates will only reflect in one of both objects. >>> s.iloc[0] = 3 >>> shallow.iloc[1] = 4 >>> s a 3 - b 4 + b 2 dtype: int64 >>> shallow - a 3 + a 1 b 4 dtype: int64 >>> deep From 6936c873ed062d30fa7fe943d9261427acb930b8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 27 Dec 2023 20:40:40 +0100 Subject: [PATCH 5/8] Fixup asvs --- asv_bench/benchmarks/algos/isin.py | 3 ++- asv_bench/benchmarks/strings.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/algos/isin.py b/asv_bench/benchmarks/algos/isin.py index 2b3d32fb579dc..d650732e9c875 100644 --- a/asv_bench/benchmarks/algos/isin.py +++ b/asv_bench/benchmarks/algos/isin.py @@ -59,7 +59,8 @@ def setup(self, dtype): elif dtype in ["str", "string[python]", "string[pyarrow]"]: try: self.series = Series( - Index([f"i-{i}" for i in range(N)], dtype=object), dtype=dtype + Index([f"i-{i}" for i in range(N)], dtype=object)._values, + dtype=dtype, ) except ImportError: raise NotImplementedError diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 1f4a104255057..accc970f71d60 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -19,7 +19,8 @@ class Dtypes: def setup(self, dtype): try: self.s = Series( - Index([f"i-{i}" for i in range(10000)], dtype=object), dtype=dtype + Index([f"i-{i}" for i in range(10000)], dtype=object)._values, + dtype=dtype, ) except ImportError: raise NotImplementedError From f68c2fece71e2175e91e0b9eeb0e10fdede4bb37 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 30 Jan 2024 00:13:05 +0000 Subject: [PATCH 6/8] Update docstring --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2fe0b5cd5443a..c517b56a2b91a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6743,7 +6743,7 @@ def copy(self, deep: bool_t | None = True) -> Self: False >>> s.values is shallow.values False - >>> s.index is shallow.index + >>> s.index is shallow.index True Deep copy has own copy of data and index. From 8c6fd9bc33ee09958459c4d1cca0ce735c8f5867 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 30 Jan 2024 20:57:36 +0000 Subject: [PATCH 7/8] Update docs --- pandas/core/generic.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d9ef33d9bc21e..31ed17268dc63 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6769,7 +6769,7 @@ def copy(self, deep: bool_t | None = True) -> Self: >>> s.values is shallow.values False >>> s.index is shallow.index - True + False Deep copy has own copy of data and index. @@ -6811,22 +6811,6 @@ def copy(self, deep: bool_t | None = True) -> Self: 0 [10, 2] 1 [3, 4] dtype: object - - **Copy-on-Write is set to true**, the shallow copy is not modified - when the original data is changed: - - >>> with pd.option_context("mode.copy_on_write", True): - ... s = pd.Series([1, 2], index=["a", "b"]) - ... copy = s.copy(deep=False) - ... s.iloc[0] = 100 - ... s - a 100 - b 2 - dtype: int64 - >>> copy - a 1 - b 2 - dtype: int64 """ data = self._mgr.copy(deep=deep) self._clear_item_cache() From ba224b88a59af6cd7207b2e6eae2bc5c296af714 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 31 Jan 2024 22:13:55 +0000 Subject: [PATCH 8/8] Use other function --- pandas/tests/test_downstream.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index c904784d44dea..feba0e86c6b32 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -46,7 +46,6 @@ def test_dask(df): pd.set_option("compute.use_numexpr", olduse) -@pytest.mark.xfail(reason="array is read-only") def test_dask_ufunc(): # dask sets "compute.use_numexpr" to False, so catch the current value # and ensure to reset it afterwards to avoid impacting other tests @@ -59,8 +58,8 @@ def test_dask_ufunc(): s = Series([1.5, 2.3, 3.7, 4.0]) ds = dd.from_pandas(s, npartitions=2) - result = da.fix(ds).compute() - expected = np.fix(s) + result = da.log(ds).compute() + expected = np.log(s) tm.assert_series_equal(result, expected) finally: pd.set_option("compute.use_numexpr", olduse)