Skip to content

ENH: Use lazy copy in infer objects #50428

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 33 commits into from
Feb 8, 2023
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
3c4cee3
ENH: Use lazy copy in infer objects
phofl Dec 24, 2022
db5dba1
Update test_methods.py
phofl Dec 24, 2022
9eb4d85
Merge remote-tracking branch 'upstream/main' into cow_infer_objects
phofl Jan 8, 2023
081dd29
Convert copy
phofl Jan 8, 2023
76c443b
Convert copy
phofl Jan 8, 2023
f7724ff
Remove setting
phofl Jan 8, 2023
a7b4e27
Fix
phofl Jan 8, 2023
1d4f726
Fix typing
phofl Jan 8, 2023
018cfe6
Revert unrelated change
phofl Jan 8, 2023
1696d8a
Fix
phofl Jan 8, 2023
1782fbd
Fix array manager
phofl Jan 8, 2023
8cb6355
Fix type
phofl Jan 8, 2023
cead228
Merge branch 'main' into cow_infer_objects
phofl Jan 12, 2023
47d85b3
Fix copy array manager
phofl Jan 13, 2023
a3d0a2b
Fix manager
phofl Jan 13, 2023
2e2ed0f
Merge remote-tracking branch 'upstream/main' into cow_inf_obj
phofl Jan 13, 2023
3a84382
Merge branch 'main' into cow_infer_objects
phofl Jan 17, 2023
64d550b
Merge remote-tracking branch 'upstream/main' into cow_infer_objects
phofl Jan 27, 2023
716cef8
Refactor
phofl Jan 27, 2023
f693829
Fix mypy
phofl Jan 31, 2023
97fa214
Move to cython
phofl Jan 31, 2023
c3e4e66
Merge remote-tracking branch 'upstream/main' into cow_infer_objects
phofl Feb 7, 2023
5d687dd
Merge CoW ref tracking
phofl Feb 7, 2023
f5bf65f
Refactor for new ref tracking logic
phofl Feb 7, 2023
af8af95
Merge remote-tracking branch 'upstream/main' into cow_infer_objects
phofl Feb 8, 2023
f47f6dd
Fix array manager diff
phofl Feb 8, 2023
e357b64
Fix merge conflicts
phofl Feb 8, 2023
bf1bb3b
Add todo
phofl Feb 8, 2023
5624983
Add test
phofl Feb 8, 2023
9a6d516
Adjust test
phofl Feb 8, 2023
8b0e2b0
Adjust test
phofl Feb 8, 2023
3f832ba
Adjust test
phofl Feb 8, 2023
9f11f0a
Fixup
phofl Feb 8, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6327,7 +6327,7 @@ def __deepcopy__(self: NDFrameT, memo=None) -> NDFrameT:
return self.copy(deep=True)

@final
def infer_objects(self: NDFrameT, copy: bool_t = True) -> NDFrameT:
def infer_objects(self: NDFrameT, copy: bool_t | None = None) -> NDFrameT:
"""
Attempt to infer better dtypes for object columns.

Expand Down
8 changes: 4 additions & 4 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,10 +371,10 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
"fillna", value=value, limit=limit, inplace=inplace, downcast=downcast
)

def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
def astype(self: T, dtype, copy: bool | None = False, errors: str = "raise") -> T:
return self.apply(astype_array_safe, dtype=dtype, copy=copy, errors=errors)

def convert(self: T, copy: bool) -> T:
def convert(self: T, copy: bool | None) -> T:
def _convert(arr):
if is_object_dtype(arr.dtype):
# extract PandasArray for tests that patch PandasArray._typ
Expand All @@ -386,11 +386,11 @@ def _convert(arr):
convert_period=True,
convert_interval=True,
)
if result is arr and copy:
if result is arr and (copy or copy is None):
return arr.copy()
return result
else:
return arr.copy() if copy else arr
return arr.copy() if (copy or copy is None) else arr

return self.apply(_convert)

Expand Down
25 changes: 24 additions & 1 deletion pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
cast,
final,
)
import weakref

import numpy as np

Expand Down Expand Up @@ -451,12 +452,20 @@ def convert(
self,
*,
copy: bool = True,
using_copy_on_write: bool = False,
original_blocks: list[Block] = [],
) -> list[Block]:
"""
attempt to coerce any object types to better types return a copy
of the block (if copy = True) by definition we are not an ObjectBlock
here!
"""
if not copy and using_copy_on_write:
result = self.copy(deep=False)
result._ref = weakref.ref( # type: ignore[attr-defined]
original_blocks[self.mgr_locs.as_array[0]]
)
return [result]
return [self.copy()] if copy else [self]

# ---------------------------------------------------------------------
Expand Down Expand Up @@ -1963,6 +1972,8 @@ def convert(
self,
*,
copy: bool = True,
using_copy_on_write: bool = False,
original_blocks: list[Block] = [],
) -> list[Block]:
"""
attempt to cast any object types to better types return a copy of
Expand All @@ -1971,6 +1982,12 @@ def convert(
if self.dtype != _dtype_obj:
# GH#50067 this should be impossible in ObjectBlock, but until
# that is fixed, we short-circuit here.
if using_copy_on_write:
result = self.copy(deep=False)
result._ref = weakref.ref( # type: ignore[attr-defined]
original_blocks[self.mgr_locs.as_array[0]]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was going to say: "isn't original_blocks[..] just self?". But that's not the case because of maybe_split?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah the block splitting is the problem here. self is a temporary block that is dead the moment we get back to the manager level.

)
return [result]
return [self]

values = self.values
Expand All @@ -1986,10 +2003,16 @@ def convert(
convert_period=True,
convert_interval=True,
)
ref = None
if copy and res_values is values:
res_values = values.copy()
elif res_values is values and using_copy_on_write:
ref = weakref.ref(original_blocks[self.mgr_locs.as_array[0]])

res_values = ensure_block_shape(res_values, self.ndim)
return [self.make_block(res_values)]
result = self.make_block(res_values)
result._ref = ref # type: ignore[attr-defined]
return [result]


# -----------------------------------------------------------------
Expand Down
20 changes: 18 additions & 2 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,11 +441,27 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
return self.apply("astype", dtype=dtype, copy=copy, errors=errors)

def convert(self: T, copy: bool) -> T:
return self.apply(
def convert(self: T, copy: bool | None) -> T:
if not copy and using_copy_on_write():
copy = False
elif copy is None:
copy = True
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if not copy and using_copy_on_write():
copy = False
elif copy is None:
copy = True
if copy is None:
if using_copy_on_write():
copy = False
else:
copy = True

I know this does exactly the same in the end and is not shorter, but personally I find it more readable (it's clearer that only copy=None case is handled to translate it to True/False depending on the setting, while in the current code you are basically also overwriting copy=False with copy=False).

(and this is also the same pattern as used elsewhere, so ideally keep things consistent either way or the other)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed


if self.is_single_block:
original_blocks = [self.blocks[0]] * self.shape[0]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is the * self.shape[0] needed?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are splitting blocks so we need the original blocks references * the number of columns

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only in case of object dtype, right? (although I suppose this list multiplication is not necessarily expensive, so maybe not worth to avoid doing when not needed)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No it's not that easy unfortunately,

Non object blocks are never copied, hence we need the references especially in this case. While object blocks might get copied (depends on the inference), so we might need the references there.

else:
original_blocks = [self.blocks[i] for i in self.blknos]
mgr = self.apply(
"convert",
copy=copy,
using_copy_on_write=using_copy_on_write(),
original_blocks=original_blocks,
)
refs = [getattr(blk, "_ref", None) for blk in mgr.blocks]
if any(ref is not None for ref in refs):
mgr.refs = refs
mgr.parent = self
return mgr

def replace(self: T, to_replace, value, inplace: bool) -> T:
inplace = validate_bool_kwarg(inplace, "inplace")
Expand Down
21 changes: 21 additions & 0 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,27 @@ def test_head_tail(method, using_copy_on_write):
tm.assert_frame_equal(df, df_orig)


def test_infer_objects(using_copy_on_write):
df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"})
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be worth adding a test where one of the object dtype columns actually gets converted? (eg change d to "d": np.array([0, 1], dtype=object)) To ensure this column owns its memory (and cover that part of the code additions)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, adjusted d in the two tests below to cover both cases.

Good to go apart from that? convert needs cow for a couple of other things

df_orig = df.copy()
df2 = df.infer_objects()

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))

else:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))

df2.iloc[0, 0] = 0
df2.iloc[0, 1] = "d"
if using_copy_on_write:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
tm.assert_frame_equal(df, df_orig)


@pytest.mark.parametrize(
"kwargs",
[
Expand Down