From 6feadcfe3470dac2f58d50464cbb49c5015aded0 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 9 Dec 2020 12:59:55 -0800 Subject: [PATCH 1/4] CLN: de-duplicate internals.construction --- pandas/core/internals/construction.py | 54 ++++++++++++++++----------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 3c5216b65a70b..ebe5d6e4aad26 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -505,6 +505,7 @@ def to_arrays( """ Return list of arrays, columns. """ + if isinstance(data, ABCDataFrame): if columns is not None: arrays = [ @@ -558,18 +559,13 @@ def _list_to_arrays( coerce_float: bool = False, dtype: Optional[DtypeObj] = None, ) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: - if len(data) > 0 and isinstance(data[0], tuple): - content = list(lib.to_object_array_tuples(data).T) + # Note: we already check len(data) > 0 before getting hre + if isinstance(data[0], tuple): + content = lib.to_object_array_tuples(data) else: # list of lists - content = list(lib.to_object_array(data).T) - # gh-26429 do not raise user-facing AssertionError - try: - columns = _validate_or_indexify_columns(content, columns) - result = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float) - except AssertionError as e: - raise ValueError(e) from e - return result, columns + content = lib.to_object_array(data) + return _finalize_columns_and_data(content, columns, dtype, coerce_float) def _list_of_series_to_arrays( @@ -599,15 +595,10 @@ def _list_of_series_to_arrays( values = extract_array(s, extract_numpy=True) aligned_values.append(algorithms.take_1d(values, indexer)) - values = np.vstack(aligned_values) + content = np.vstack(aligned_values) - if values.dtype == np.object_: - content = list(values.T) - columns = _validate_or_indexify_columns(content, columns) - content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float) - return content, columns - else: - return values.T, columns + content, columns = _finalize_columns_and_data(content, columns, dtype, coerce_float) + return content, columns def _list_of_dict_to_arrays( @@ -646,9 +637,30 @@ def _list_of_dict_to_arrays( # classes data = [(type(d) is dict) and d or dict(d) for d in data] - content = list(lib.dicts_to_array(data, list(columns)).T) - columns = _validate_or_indexify_columns(content, columns) - content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float) + content = lib.dicts_to_array(data, list(columns)) + content, columns = _finalize_columns_and_data(content, columns, dtype, coerce_float) + return content, columns + + +def _finalize_columns_and_data( + content: np.ndarray, + columns, + dtype: Optional[DtypeObj], + coerce_float: bool, +) -> Tuple[List[np.ndarray], Index]: + """ + Ensure we have valid columns, cast object dtypes if possible. + """ + content = list(content.T) + + try: + columns = _validate_or_indexify_columns(content, columns) + except AssertionError as err: + # GH#26429 do not raise user-facing AssertionError + raise ValueError(err) from err + + if len(content) and content[0].dtype == np.object_: + content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float) return content, columns From a57aa9b5933b73fba4755c2390aaca7f38043332 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 9 Dec 2020 13:21:08 -0800 Subject: [PATCH 2/4] annotate --- pandas/core/internals/construction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index ebe5d6e4aad26..f0719375f5656 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -644,10 +644,10 @@ def _list_of_dict_to_arrays( def _finalize_columns_and_data( content: np.ndarray, - columns, + columns: Optional[Union[Index, List]], dtype: Optional[DtypeObj], coerce_float: bool, -) -> Tuple[List[np.ndarray], Index]: +) -> Tuple[List[np.ndarray], Union[Index, List[Axis]]]: """ Ensure we have valid columns, cast object dtypes if possible. """ From 0ff9516254090af506856417ca00a038af2d5c91 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 9 Dec 2020 13:23:22 -0800 Subject: [PATCH 3/4] fixup whitespace --- pandas/core/internals/construction.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index f0719375f5656..7129f71a5cf45 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -505,7 +505,6 @@ def to_arrays( """ Return list of arrays, columns. """ - if isinstance(data, ABCDataFrame): if columns is not None: arrays = [ From ba425a524345c61ee12bef747a2f8b691d2ffe3f Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 9 Dec 2020 19:33:42 -0800 Subject: [PATCH 4/4] REF: collect calls to _finalize_columns_and_data --- pandas/core/internals/construction.py | 41 ++++++++++----------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 7129f71a5cf45..d25482872c594 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -524,39 +524,36 @@ def to_arrays( if columns is not None: return [[]] * len(columns), columns return [], [] # columns if columns is not None else [] - if isinstance(data[0], (list, tuple)): - return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) - elif isinstance(data[0], abc.Mapping): - return _list_of_dict_to_arrays( - data, columns, coerce_float=coerce_float, dtype=dtype - ) - elif isinstance(data[0], ABCSeries): - return _list_of_series_to_arrays( - data, columns, coerce_float=coerce_float, dtype=dtype - ) + elif isinstance(data[0], Categorical): if columns is None: columns = ibase.default_index(len(data)) return data, columns - elif ( - isinstance(data, (np.ndarray, ABCSeries, Index)) - and data.dtype.names is not None - ): + elif isinstance(data, np.ndarray) and data.dtype.names is not None: + # e.g. recarray columns = list(data.dtype.names) arrays = [data[k] for k in columns] return arrays, columns + + if isinstance(data[0], (list, tuple)): + content, columns = _list_to_arrays(data, columns) + elif isinstance(data[0], abc.Mapping): + content, columns = _list_of_dict_to_arrays(data, columns) + elif isinstance(data[0], ABCSeries): + content, columns = _list_of_series_to_arrays(data, columns) else: # last ditch effort data = [tuple(x) for x in data] - return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) + content, columns = _list_to_arrays(data, columns) + + content, columns = _finalize_columns_and_data(content, columns, dtype, coerce_float) + return content, columns def _list_to_arrays( data: List[Scalar], columns: Union[Index, List], - coerce_float: bool = False, - dtype: Optional[DtypeObj] = None, ) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: # Note: we already check len(data) > 0 before getting hre if isinstance(data[0], tuple): @@ -564,14 +561,12 @@ def _list_to_arrays( else: # list of lists content = lib.to_object_array(data) - return _finalize_columns_and_data(content, columns, dtype, coerce_float) + return content, columns def _list_of_series_to_arrays( data: List, columns: Union[Index, List], - coerce_float: bool = False, - dtype: Optional[DtypeObj] = None, ) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: if columns is None: # We know pass_data is non-empty because data[0] is a Series @@ -596,15 +591,12 @@ def _list_of_series_to_arrays( content = np.vstack(aligned_values) - content, columns = _finalize_columns_and_data(content, columns, dtype, coerce_float) return content, columns def _list_of_dict_to_arrays( data: List[Dict], columns: Union[Index, List], - coerce_float: bool = False, - dtype: Optional[DtypeObj] = None, ) -> Tuple[List[Scalar], Union[Index, List[Axis]]]: """ Convert list of dicts to numpy arrays @@ -619,8 +611,6 @@ def _list_of_dict_to_arrays( data : iterable collection of records (OrderedDict, dict) columns: iterables or None - coerce_float : bool - dtype : np.dtype Returns ------- @@ -637,7 +627,6 @@ def _list_of_dict_to_arrays( data = [(type(d) is dict) and d or dict(d) for d in data] content = lib.dicts_to_array(data, list(columns)) - content, columns = _finalize_columns_and_data(content, columns, dtype, coerce_float) return content, columns