diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 3c5216b65a70b..d25482872c594 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -524,59 +524,49 @@ def to_arrays( if columns is not None: return [[]] * len(columns), columns return [], [] # columns if columns is not None else [] - if isinstance(data[0], (list, tuple)): - return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) - elif isinstance(data[0], abc.Mapping): - return _list_of_dict_to_arrays( - data, columns, coerce_float=coerce_float, dtype=dtype - ) - elif isinstance(data[0], ABCSeries): - return _list_of_series_to_arrays( - data, columns, coerce_float=coerce_float, dtype=dtype - ) + elif isinstance(data[0], Categorical): if columns is None: columns = ibase.default_index(len(data)) return data, columns - elif ( - isinstance(data, (np.ndarray, ABCSeries, Index)) - and data.dtype.names is not None - ): + elif isinstance(data, np.ndarray) and data.dtype.names is not None: + # e.g. 
recarray columns = list(data.dtype.names) arrays = [data[k] for k in columns] return arrays, columns + + if isinstance(data[0], (list, tuple)): + content, columns = _list_to_arrays(data, columns) + elif isinstance(data[0], abc.Mapping): + content, columns = _list_of_dict_to_arrays(data, columns) + elif isinstance(data[0], ABCSeries): + content, columns = _list_of_series_to_arrays(data, columns) else: # last ditch effort data = [tuple(x) for x in data] - return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) + content, columns = _list_to_arrays(data, columns) + + content, columns = _finalize_columns_and_data(content, columns, dtype, coerce_float) + return content, columns def _list_to_arrays( data: List[Scalar], columns: Union[Index, List], - coerce_float: bool = False, - dtype: Optional[DtypeObj] = None, ) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: - if len(data) > 0 and isinstance(data[0], tuple): - content = list(lib.to_object_array_tuples(data).T) + # Note: we already check len(data) > 0 before getting here + if isinstance(data[0], tuple): + content = lib.to_object_array_tuples(data) else: # list of lists - content = list(lib.to_object_array(data).T) - # gh-26429 do not raise user-facing AssertionError - try: - columns = _validate_or_indexify_columns(content, columns) - result = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float) - except AssertionError as e: - raise ValueError(e) from e - return result, columns + content = lib.to_object_array(data) + return content, columns def _list_of_series_to_arrays( data: List, columns: Union[Index, List], - coerce_float: bool = False, - dtype: Optional[DtypeObj] = None, ) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: if columns is None: # We know pass_data is non-empty because data[0] is a Series @@ -599,22 +589,14 @@ def _list_of_series_to_arrays( values = extract_array(s, extract_numpy=True) aligned_values.append(algorithms.take_1d(values, indexer)) - values = 
np.vstack(aligned_values) + content = np.vstack(aligned_values) - if values.dtype == np.object_: - content = list(values.T) - columns = _validate_or_indexify_columns(content, columns) - content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float) - return content, columns - else: - return values.T, columns + return content, columns def _list_of_dict_to_arrays( data: List[Dict], columns: Union[Index, List], - coerce_float: bool = False, - dtype: Optional[DtypeObj] = None, ) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: """ Convert list of dicts to numpy arrays @@ -629,8 +611,6 @@ def _list_of_dict_to_arrays( data : iterable collection of records (OrderedDict, dict) columns: iterables or None - coerce_float : bool - dtype : np.dtype Returns ------- @@ -646,9 +626,29 @@ def _list_of_dict_to_arrays( # classes data = [(type(d) is dict) and d or dict(d) for d in data] - content = list(lib.dicts_to_array(data, list(columns)).T) - columns = _validate_or_indexify_columns(content, columns) - content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float) + content = lib.dicts_to_array(data, list(columns)) + return content, columns + + +def _finalize_columns_and_data( + content: np.ndarray, + columns: Optional[Union[Index, List]], + dtype: Optional[DtypeObj], + coerce_float: bool, +) -> Tuple[List[np.ndarray], Union[Index, List[Axis]]]: + """ + Ensure we have valid columns, cast object dtypes if possible. + """ + content = list(content.T) + + try: + columns = _validate_or_indexify_columns(content, columns) + except AssertionError as err: + # GH#26429 do not raise user-facing AssertionError + raise ValueError(err) from err + + if len(content) and content[0].dtype == np.object_: + content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float) return content, columns