diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 9bc2404cefcfa..4b43ed92441a1 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -94,12 +94,15 @@ def _concatenate_array_managers( concat_arrays([mgrs[i].arrays[j] for i in range(len(mgrs))]) for j in range(len(mgrs[0].arrays)) ] - return ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False) else: # concatting along the columns -> combine reindexed arrays in a single manager assert concat_axis == 0 arrays = list(itertools.chain.from_iterable([mgr.arrays for mgr in mgrs])) - return ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False) + if copy: + arrays = [x.copy() for x in arrays] + + new_mgr = ArrayManager(arrays, [axes[1], axes[0]], verify_integrity=False) + return new_mgr def concat_arrays(to_concat: list) -> ArrayLike: diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index a4c9e333f4d9c..347fd6e9f5bba 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -8,8 +8,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( DataFrame, @@ -43,9 +41,7 @@ def test_append_concat(self): assert isinstance(result.index, PeriodIndex) assert result.index[0] == s1.index[0] - # TODO(ArrayManager) using block internals to verify, needs rewrite - @td.skip_array_manager_invalid_test - def test_concat_copy(self): + def test_concat_copy(self, using_array_manager): df = DataFrame(np.random.randn(4, 3)) df2 = DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1)) df3 = DataFrame({5: "foo"}, index=range(4)) @@ -53,30 +49,43 @@ def test_concat_copy(self): # These are actual copies. result = concat([df, df2, df3], axis=1, copy=True) - for b in result._mgr.blocks: - assert b.values.base is None + for arr in result._mgr.arrays: + assert arr.base is None # These are the same. result = concat([df, df2, df3], axis=1, copy=False) - for b in result._mgr.blocks: - if b.dtype.kind == "f": - assert b.values.base is df._mgr.blocks[0].values.base - elif b.dtype.kind in ["i", "u"]: - assert b.values.base is df2._mgr.blocks[0].values.base - elif b.is_object: - assert b.values.base is not None + for arr in result._mgr.arrays: + if arr.dtype.kind == "f": + assert arr.base is df._mgr.arrays[0].base + elif arr.dtype.kind in ["i", "u"]: + assert arr.base is df2._mgr.arrays[0].base + elif arr.dtype == object: + if using_array_manager: + # we get the same array object, which has no base + assert arr is df3._mgr.arrays[0] + else: + assert arr.base is not None # Float block was consolidated. df4 = DataFrame(np.random.randn(4, 1)) result = concat([df, df2, df3, df4], axis=1, copy=False) - for b in result._mgr.blocks: - if b.dtype.kind == "f": - assert b.values.base is None - elif b.dtype.kind in ["i", "u"]: - assert b.values.base is df2._mgr.blocks[0].values.base - elif b.is_object: - assert b.values.base is not None + for arr in result._mgr.arrays: + if arr.dtype.kind == "f": + if using_array_manager: + # this is a view on some array in either df or df4 + assert any( + np.shares_memory(arr, other) + for other in df._mgr.arrays + df4._mgr.arrays + ) + else: + # the block was consolidated, so we got a copy anyway + assert arr.base is None + elif arr.dtype.kind in ["i", "u"]: + assert arr.base is df2._mgr.arrays[0].base + elif arr.dtype == object: + # this is a view on df3 + assert any(np.shares_memory(arr, other) for other in df3._mgr.arrays) def test_concat_with_group_keys(self): # axis=0 diff --git a/pandas/tests/reshape/concat/test_dataframe.py b/pandas/tests/reshape/concat/test_dataframe.py index dde8c0c19165f..01763926c6d89 100644 --- a/pandas/tests/reshape/concat/test_dataframe.py +++ b/pandas/tests/reshape/concat/test_dataframe.py @@ -192,3 +192,16 @@ def test_concat_duplicates_in_index_with_keys(self): expected = DataFrame(data=data, index=mi) tm.assert_frame_equal(result, expected) tm.assert_index_equal(result.index.levels[1], Index([1, 3], name="date")) + + @pytest.mark.parametrize("ignore_index", [True, False]) + @pytest.mark.parametrize("order", ["C", "F"]) + @pytest.mark.parametrize("axis", [0, 1]) + def test_concat_copies(self, axis, order, ignore_index): + # based on asv ConcatDataFrames + df = DataFrame(np.zeros((10000, 200), dtype=np.float32, order=order)) + + res = concat([df] * 5, axis=axis, ignore_index=ignore_index, copy=True) + + for arr in res._iter_column_arrays(): + for arr2 in df._iter_column_arrays(): + assert not np.shares_memory(arr, arr2)