Skip to content

BUG: concat with ArrayManager not making copy #42797

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Aug 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions pandas/core/internals/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,15 @@ def _concatenate_array_managers(
concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))])
for j in range(len(mgrs[0].arrays))
]
return ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False)
else:
# concatting along the columns -> combine reindexed arrays in a single manager
assert concat_axis == 0
arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs]))
return ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False)
if copy:
arrays = [x.copy() for x in arrays]

new_mgr = ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False)
return new_mgr


def concat_arrays(to_concat: list) -> ArrayLike:
Expand Down
51 changes: 30 additions & 21 deletions pandas/tests/reshape/concat/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
DataFrame,
Expand Down Expand Up @@ -43,40 +41,51 @@ def test_append_concat(self):
assert isinstance(result.index, PeriodIndex)
assert result.index[0] == s1.index[0]

# TODO(ArrayManager) using block internals to verify, needs rewrite
@td.skip_array_manager_invalid_test
def test_concat_copy(self):
def test_concat_copy(self, using_array_manager):
df = DataFrame(np.random.randn(4, 3))
df2 = DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1))
df3 = DataFrame({5: "foo"}, index=range(4))

# These are actual copies.
result = concat([df, df2, df3], axis=1, copy=True)

for b in result._mgr.blocks:
assert b.values.base is None
for arr in result._mgr.arrays:
assert arr.base is None

# These are the same.
result = concat([df, df2, df3], axis=1, copy=False)

for b in result._mgr.blocks:
if b.dtype.kind == "f":
assert b.values.base is df._mgr.blocks[0].values.base
elif b.dtype.kind in ["i", "u"]:
assert b.values.base is df2._mgr.blocks[0].values.base
elif b.is_object:
assert b.values.base is not None
for arr in result._mgr.arrays:
if arr.dtype.kind == "f":
assert arr.base is df._mgr.arrays[0].base
elif arr.dtype.kind in ["i", "u"]:
assert arr.base is df2._mgr.arrays[0].base
elif arr.dtype == object:
if using_array_manager:
# we get the same array object, which has no base
assert arr is df3._mgr.arrays[0]
else:
assert arr.base is not None

# Float block was consolidated.
df4 = DataFrame(np.random.randn(4, 1))
result = concat([df, df2, df3, df4], axis=1, copy=False)
for b in result._mgr.blocks:
if b.dtype.kind == "f":
assert b.values.base is None
elif b.dtype.kind in ["i", "u"]:
assert b.values.base is df2._mgr.blocks[0].values.base
elif b.is_object:
assert b.values.base is not None
for arr in result._mgr.arrays:
if arr.dtype.kind == "f":
if using_array_manager:
# this is a view on some array in either df or df4
assert any(
np.shares_memory(arr, other)
for other in df._mgr.arrays + df4._mgr.arrays
)
else:
# the block was consolidated, so we got a copy anyway
assert arr.base is None
elif arr.dtype.kind in ["i", "u"]:
assert arr.base is df2._mgr.arrays[0].base
elif arr.dtype == object:
# this is a view on df3
assert any(np.shares_memory(arr, other) for other in df3._mgr.arrays)

def test_concat_with_group_keys(self):
# axis=0
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/reshape/concat/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,3 +192,16 @@ def test_concat_duplicates_in_index_with_keys(self):
expected = DataFrame(data=data, index=mi)
tm.assert_frame_equal(result, expected)
tm.assert_index_equal(result.index.levels[1], Index([1, 3], name="date"))

@pytest.mark.parametrize("ignore_index", [True, False])
@pytest.mark.parametrize("order", ["C", "F"])
@pytest.mark.parametrize("axis", [0, 1])
def test_concat_copies(self, axis, order, ignore_index):
    """Check that ``concat(..., copy=True)`` does not hand back views.

    Each column array of the concatenated result is compared against each
    column array of the input frame; no pair may share memory.
    """
    # based on asv ConcatDataFrames
    zeros = np.zeros((10000, 200), dtype=np.float32, order=order)
    df = DataFrame(zeros)

    # the same frame is passed five times
    frames = [df] * 5
    res = concat(frames, axis=axis, ignore_index=ignore_index, copy=True)

    for result_col in res._iter_column_arrays():
        overlaps_input = any(
            np.shares_memory(result_col, input_col)
            for input_col in df._iter_column_arrays()
        )
        assert not overlaps_input