diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0a7ac2325740a..c03722e32fea9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -153,6 +153,8 @@ jobs: run: | source activate pandas-dev pytest pandas/tests/frame/methods --array-manager + pytest pandas/tests/frame/test_constructors.py --array-manager + pytest pandas/tests/frame/constructors/ --array-manager pytest pandas/tests/frame/test_reductions.py --array-manager pytest pandas/tests/reductions/ --array-manager pytest pandas/tests/generic/test_generic.py --array-manager diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 830a7f4347132..f0f8d813bba96 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -563,39 +563,55 @@ def __init__( if isinstance(data, DataFrame): data = data._mgr - if isinstance(data, (BlockManager, ArrayManager)): - if index is None and columns is None and dtype is None and copy is False: - # GH#33357 fastpath - NDFrame.__init__(self, data) - return + # first check if a Manager is passed without any other arguments + # -> use fastpath (without checking Manager type) + if ( + index is None + and columns is None + and dtype is None + and copy is False + and isinstance(data, (BlockManager, ArrayManager)) + ): + # GH#33357 fastpath + NDFrame.__init__(self, data) + return + manager = get_option("mode.data_manager") + + if isinstance(data, (BlockManager, ArrayManager)): mgr = self._init_mgr( data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy ) elif isinstance(data, dict): - mgr = dict_to_mgr(data, index, columns, dtype=dtype) + mgr = dict_to_mgr(data, index, columns, dtype=dtype, typ=manager) elif isinstance(data, ma.MaskedArray): import numpy.ma.mrecords as mrecords # masked recarray if isinstance(data, mrecords.MaskedRecords): - mgr = rec_array_to_mgr(data, index, columns, dtype, copy) + mgr = rec_array_to_mgr(data, index, columns, dtype, copy, typ=manager) # a masked array else: data = sanitize_masked_array(data) - mgr = ndarray_to_mgr(data, index, columns, dtype=dtype, copy=copy) + mgr = ndarray_to_mgr( + data, index, columns, dtype=dtype, copy=copy, typ=manager + ) elif isinstance(data, (np.ndarray, Series, Index)): if data.dtype.names: # i.e. numpy structured array - mgr = rec_array_to_mgr(data, index, columns, dtype, copy) + mgr = rec_array_to_mgr(data, index, columns, dtype, copy, typ=manager) elif getattr(data, "name", None) is not None: # i.e. Series/Index with non-None name - mgr = dict_to_mgr({data.name: data}, index, columns, dtype=dtype) + mgr = dict_to_mgr( + {data.name: data}, index, columns, dtype=dtype, typ=manager + ) else: - mgr = ndarray_to_mgr(data, index, columns, dtype=dtype, copy=copy) + mgr = ndarray_to_mgr( + data, index, columns, dtype=dtype, copy=copy, typ=manager + ) # For data is list-like, or Iterable (will consume into list) elif is_list_like(data): @@ -610,11 +626,15 @@ def __init__( arrays, columns, index = nested_data_to_arrays( data, columns, index, dtype ) - mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) + mgr = arrays_to_mgr( + arrays, columns, index, columns, dtype=dtype, typ=manager + ) else: - mgr = ndarray_to_mgr(data, index, columns, dtype=dtype, copy=copy) + mgr = ndarray_to_mgr( + data, index, columns, dtype=dtype, copy=copy, typ=manager + ) else: - mgr = dict_to_mgr({}, index, columns, dtype=dtype) + mgr = dict_to_mgr({}, index, columns, dtype=dtype, typ=manager) # For data is scalar else: if index is None or columns is None: @@ -631,18 +651,19 @@ def __init__( construct_1d_arraylike_from_scalar(data, len(index), dtype) for _ in range(len(columns)) ] - mgr = arrays_to_mgr(values, columns, index, columns, dtype=None) + mgr = arrays_to_mgr( + values, columns, index, columns, dtype=None, typ=manager + ) else: values = construct_2d_arraylike_from_scalar( data, len(index), len(columns), dtype, copy ) mgr = ndarray_to_mgr( - values, index, columns, dtype=values.dtype, copy=False + values, index, columns, dtype=values.dtype, copy=False, typ=manager ) # ensure correct Manager type according to settings - manager = get_option("mode.data_manager") mgr = mgr_to_mgr(mgr, typ=manager) NDFrame.__init__(self, mgr) @@ -1970,7 +1991,8 @@ def from_records( arr_columns = arr_columns.drop(arr_exclude) columns = columns.drop(exclude) - mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns) + manager = get_option("mode.data_manager") + mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns, typ=manager) return cls(mgr) @@ -2177,6 +2199,7 @@ def _from_arrays( if dtype is not None: dtype = pandas_dtype(dtype) + manager = get_option("mode.data_manager") mgr = arrays_to_mgr( arrays, columns, @@ -2184,6 +2207,7 @@ def _from_arrays( columns, dtype=dtype, verify_integrity=verify_integrity, + typ=manager, ) return cls(mgr) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4774045849eb6..5bba7ab67b2bf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -139,6 +139,7 @@ ArrayManager, BlockManager, ) +from pandas.core.internals.construction import mgr_to_mgr from pandas.core.missing import find_valid_index from pandas.core.ops import align_method_FRAME from pandas.core.reshape.concat import concat @@ -5755,6 +5756,8 @@ def _to_dict_of_blocks(self, copy: bool_t = True): Internal ONLY - only works for BlockManager """ mgr = self._mgr + # convert to BlockManager if needed -> this way support ArrayManager as well + mgr = mgr_to_mgr(mgr, "block") mgr = cast(BlockManager, mgr) return { k: self._constructor(v).__finalize__(self) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 1b234cd2414a9..998f1ffcf02ee 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -840,7 +840,13 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False value = extract_array(value, extract_numpy=True) if value.ndim == 2: - value = value[0, :] + if value.shape[0] == 1: + value = value[0, :] + else: + raise ValueError( + f"Expected a 1D array, got an array with shape {value.shape}" + ) + # TODO self.arrays can be empty # assert len(value) == len(self.arrays[0]) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 7eade970253bf..d49114c0da719 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -69,7 +69,9 @@ get_objs_combined_axis, union_indexes, ) +from pandas.core.internals.array_manager import ArrayManager from pandas.core.internals.managers import ( + BlockManager, create_block_manager_from_arrays, create_block_manager_from_blocks, ) @@ -88,6 +90,7 @@ def arrays_to_mgr( columns, dtype: Optional[DtypeObj] = None, verify_integrity: bool = True, + typ: Optional[str] = None, ): """ Segregate Series based on type and coerce into matrices. @@ -114,7 +117,12 @@ def arrays_to_mgr( # from BlockManager perspective axes = [columns, index] - return create_block_manager_from_arrays(arrays, arr_names, axes) + if typ == "block": + return create_block_manager_from_arrays(arrays, arr_names, axes) + elif typ == "array": + return ArrayManager(arrays, [index, columns]) + else: + raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'") def rec_array_to_mgr( @@ -123,6 +131,7 @@ def rec_array_to_mgr( columns, dtype: Optional[DtypeObj], copy: bool, + typ: str, ): """ Extract from a masked rec array and create the manager. @@ -150,7 +159,7 @@ def rec_array_to_mgr( if columns is None: columns = arr_columns - mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype) + mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype, typ=typ) if copy: mgr = mgr.copy() @@ -180,11 +189,6 @@ def mgr_to_mgr(mgr, typ: str): Convert to specific type of Manager. Does not copy if the type is already correct. Does not guarantee a copy otherwise. """ - from pandas.core.internals import ( - ArrayManager, - BlockManager, - ) - new_mgr: Manager if typ == "block": @@ -192,7 +196,7 @@ def mgr_to_mgr(mgr, typ: str): new_mgr = mgr else: new_mgr = arrays_to_mgr( - mgr.arrays, mgr.axes[0], mgr.axes[1], mgr.axes[0], dtype=None + mgr.arrays, mgr.axes[0], mgr.axes[1], mgr.axes[0], typ="block" ) elif typ == "array": if isinstance(mgr, ArrayManager): @@ -201,7 +205,7 @@ def mgr_to_mgr(mgr, typ: str): arrays = [mgr.iget_values(i).copy() for i in range(len(mgr.axes[0]))] new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]]) else: - raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{type}'") + raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'") return new_mgr @@ -209,7 +213,9 @@ def mgr_to_mgr(mgr, typ: str): # DataFrame Constructor Interface -def ndarray_to_mgr(values, index, columns, dtype: Optional[DtypeObj], copy: bool): +def ndarray_to_mgr( + values, index, columns, dtype: Optional[DtypeObj], copy: bool, typ: str +): # used in DataFrame.__init__ # input must be a ndarray, list, Series, index @@ -239,7 +245,7 @@ def ndarray_to_mgr(values, index, columns, dtype: Optional[DtypeObj], copy: bool if columns is None: columns = Index(range(len(values))) - return arrays_to_mgr(values, columns, index, columns, dtype=dtype) + return arrays_to_mgr(values, columns, index, columns, dtype=dtype, typ=typ) # by definition an array here # the dtypes will be coerced to a single dtype @@ -303,7 +309,7 @@ def ndarray_to_mgr(values, index, columns, dtype: Optional[DtypeObj], copy: bool return create_block_manager_from_blocks(block_values, [columns, index]) -def dict_to_mgr(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): +def dict_to_mgr(data: Dict, index, columns, dtype: Optional[DtypeObj], typ: str): """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. @@ -349,7 +355,7 @@ def dict_to_mgr(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): arrays = [ arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays ] - return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) + return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype, typ=typ) def nested_data_to_arrays( @@ -443,6 +449,11 @@ def _homogenize(data, index: Index, dtype: Optional[DtypeObj]): # Forces alignment. No need to copy data since we # are putting it into an ndarray later val = val.reindex(index, copy=False) + # TODO extract_array should be preferred, but that gives failures for + # `extension/test_numpy.py` (extract_array will convert numpy arrays + # to PandasArray), see https://github.com/pandas-dev/pandas/issues/40021 + # val = extract_array(val, extract_numpy=True) + val = val._values else: if isinstance(val, dict): if oindex is None: diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 1cda4b1948c6a..bc1007162884a 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -6,6 +6,7 @@ import pytz from pandas.compat import is_platform_little_endian +import pandas.util._test_decorators as td from pandas import ( CategoricalIndex, @@ -119,6 +120,8 @@ def test_from_records_sequencelike(self): tm.assert_series_equal(result["C"], df["C"]) tm.assert_series_equal(result["E1"], df["E1"].astype("float64")) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records + def test_from_records_sequencelike_empty(self): # empty case result = DataFrame.from_records([], columns=["foo", "bar", "baz"]) assert len(result) == 0 @@ -185,7 +188,12 @@ def test_from_records_bad_index_column(self): tm.assert_index_equal(df1.index, Index(df.C)) # should fail - msg = r"Shape of passed values is \(10, 3\), indices imply \(1, 3\)" + msg = "|".join( + [ + r"Shape of passed values is \(10, 3\), indices imply \(1, 3\)", + "Passed arrays should have the same length as the rows Index: 10 vs 1", + ] + ) with pytest.raises(ValueError, match=msg): DataFrame.from_records(df, index=[2]) with pytest.raises(KeyError, match=r"^2$"): @@ -209,6 +217,7 @@ def __iter__(self): expected = DataFrame.from_records(tups) tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records def test_from_records_len0_with_columns(self): # GH#2633 result = DataFrame.from_records([], index="foo", columns=["foo", "bar"]) @@ -260,7 +269,12 @@ def test_from_records_to_records(self): tm.assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2)) # wrong length - msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)" + msg = "|".join( + [ + r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)", + "Passed arrays should have the same length as the rows Index: 2 vs 1", + ] + ) with pytest.raises(ValueError, match=msg): DataFrame.from_records(arr, index=index[:-1]) @@ -387,6 +401,7 @@ def create_dict(order_id): result = DataFrame.from_records(documents, index=["order_id", "quantity"]) assert result.index.names == ("order_id", "quantity") + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records def test_from_records_misc_brokenness(self): # GH#2179 @@ -425,6 +440,7 @@ def test_from_records_misc_brokenness(self): ) tm.assert_series_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) empty from_records def test_from_records_empty(self): # GH#3562 result = DataFrame.from_records([], columns=["a", "b", "c"]) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 3bbe5f9e46efa..6035b8c2d6601 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -18,6 +18,7 @@ import pytz from pandas.compat import np_version_under1p19 +import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_integer_dtype from pandas.core.dtypes.dtypes import ( @@ -163,7 +164,12 @@ def test_constructor_cast_failure(self): df["foo"] = np.ones((4, 2)).tolist() # this is not ok - msg = "Wrong number of items passed 2, placement implies 1" + msg = "|".join( + [ + "Wrong number of items passed 2, placement implies 1", + "Expected a 1D array, got an array with shape \\(4, 2\\)", + ] + ) with pytest.raises(ValueError, match=msg): df["test"] = np.ones((4, 2)) @@ -178,12 +184,15 @@ def test_constructor_dtype_copy(self): new_df["col1"] = 200.0 assert orig_df["col1"][0] == 1.0 - def test_constructor_dtype_nocast_view(self): + def test_constructor_dtype_nocast_view_dataframe(self): df = DataFrame([[1, 2]]) should_be_view = DataFrame(df, dtype=df[0].dtype) should_be_view[0][0] = 99 assert df.values[0, 0] == 99 + @td.skip_array_manager_invalid_test # TODO(ArrayManager) keep view on 2D array? + def test_constructor_dtype_nocast_view_2d_array(self): + df = DataFrame([[1, 2]]) should_be_view = DataFrame(df.values, dtype=df[0].dtype) should_be_view[0][0] = 97 assert df.values[0, 0] == 97 @@ -1946,6 +1955,8 @@ def test_constructor_frame_copy(self, float_frame): assert (cop["A"] == 5).all() assert not (float_frame["A"] == 5).all() + # TODO(ArrayManager) keep view on 2D array? + @td.skip_array_manager_not_yet_implemented def test_constructor_ndarray_copy(self, float_frame): df = DataFrame(float_frame.values) @@ -1956,6 +1967,8 @@ def test_constructor_ndarray_copy(self, float_frame): float_frame.values[6] = 6 assert not (df.values[6] == 6).all() + # TODO(ArrayManager) keep view on Series? + @td.skip_array_manager_not_yet_implemented def test_constructor_series_copy(self, float_frame): series = float_frame._series @@ -2069,7 +2082,12 @@ def test_from_nested_listlike_mixed_types(self): def test_construct_from_listlikes_mismatched_lengths(self): # invalid (shape) - msg = r"Shape of passed values is \(6, 2\), indices imply \(3, 2\)" + msg = "|".join( + [ + r"Shape of passed values is \(6, 2\), indices imply \(3, 2\)", + "Passed arrays should have the same length as the rows Index", + ] + ) with pytest.raises(ValueError, match=msg): DataFrame([Categorical(list("abc")), Categorical(list("abdefg"))]) @@ -2114,6 +2132,8 @@ def test_check_dtype_empty_numeric_column(self, dtype): assert data.b.dtype == dtype + # TODO(ArrayManager) astype to bytes dtypes does not yet give object dtype + @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize( "dtype", tm.STRING_DTYPES + tm.BYTES_DTYPES + tm.OBJECT_DTYPES ) @@ -2217,7 +2237,8 @@ class DatetimeSubclass(datetime): def test_with_mismatched_index_length_raises(self): # GH#33437 dti = date_range("2016-01-01", periods=3, tz="US/Pacific") - with pytest.raises(ValueError, match="Shape of passed values"): + msg = "Shape of passed values|Passed arrays should have the same length" + with pytest.raises(ValueError, match=msg): DataFrame(dti, index=range(4)) def test_frame_ctor_datetime64_column(self): diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b270539921c9c..8cbb9d2443cb2 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1196,7 +1196,6 @@ def convert_force_pure(x): assert isinstance(result[0], Decimal) -@td.skip_array_manager_not_yet_implemented def test_groupby_dtype_inference_empty(): # GH 6733 df = DataFrame({"x": [], "range": np.arange(0, dtype="int64")})