Skip to content

Automatic PR for c046378e-213d-4024-ab46-d652c331a152 #81

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: c046378e-213d-4024-ab46-d652c331a152-base
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 82 additions & 44 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,21 @@
common_dtype_categorical_compat,
find_common_type,
)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.dtypes import (
CategoricalDtype,
DatetimeTZDtype,
ExtensionDtype,
)
from pandas.core.dtypes.generic import (
ABCCategoricalIndex,
ABCExtensionArray,
ABCSeries,
)

if TYPE_CHECKING:
from pandas._typing import (
ArrayLike,
AxisInt,
DtypeObj,
)

from pandas.core.arrays import (
Expand Down Expand Up @@ -96,54 +100,45 @@ def concat_compat(
# Creating an empty array directly is tempting, but the winnings would be
# marginal given that it would still require shape & dtype calculation and
# np.concatenate which has them both implemented is compiled.
orig = to_concat
non_empties = [x for x in to_concat if _is_nonempty(x, axis)]
if non_empties and axis == 0 and not ea_compat_axis:
# ea_compat_axis see GH#39574
to_concat = non_empties

any_ea, kinds, target_dtype = _get_result_dtype(to_concat, non_empties)

if len(to_concat) < len(orig):
_, _, alt_dtype = _get_result_dtype(orig, non_empties)

if target_dtype is not None:
to_concat = [astype_array(arr, target_dtype, copy=False) for arr in to_concat]

if not isinstance(to_concat[0], np.ndarray):
# i.e. isinstance(to_concat[0], ExtensionArray)
to_concat_eas = cast("Sequence[ExtensionArray]", to_concat)
cls = type(to_concat[0])
return cls._concat_same_type(to_concat_eas)
else:
to_concat_arrs = cast("Sequence[np.ndarray]", to_concat)
result = np.concatenate(to_concat_arrs, axis=axis)

if not any_ea and "b" in kinds and result.dtype.kind in "iuf":
# GH#39817 cast to object instead of casting bools to numeric
result = result.astype(object, copy=False)
return result


def _get_result_dtype(
to_concat: Sequence[ArrayLike], non_empties: Sequence[ArrayLike]
) -> tuple[bool, set[str], DtypeObj | None]:
target_dtype = None

dtypes = {obj.dtype for obj in to_concat}
kinds = {obj.dtype.kind for obj in to_concat}
contains_datetime = any(
isinstance(dtype, (np.dtype, DatetimeTZDtype)) and dtype.kind in "mM"
for dtype in dtypes
) or any(isinstance(obj, ABCExtensionArray) and obj.ndim > 1 for obj in to_concat)

any_ea = any(not isinstance(x, np.ndarray) for x in to_concat)
if any_ea:
# i.e. any ExtensionArrays
all_empty = not len(non_empties)
single_dtype = len(dtypes) == 1
any_ea = any(isinstance(x, ExtensionDtype) for x in dtypes)

if contains_datetime:
return _concat_datetime(to_concat, axis=axis)

if any_ea:
# we ignore axis here, as internally concatting with EAs is always
# for axis=0
if len(dtypes) != 1:
if not single_dtype:
target_dtype = find_common_type([x.dtype for x in to_concat])
target_dtype = common_dtype_categorical_compat(to_concat, target_dtype)
to_concat = [
astype_array(arr, target_dtype, copy=False) for arr in to_concat
]

if isinstance(to_concat[0], ABCExtensionArray):
# TODO: what about EA-backed Index?
to_concat_eas = cast("Sequence[ExtensionArray]", to_concat)
cls = type(to_concat[0])
return cls._concat_same_type(to_concat_eas)
else:
to_concat_arrs = cast("Sequence[np.ndarray]", to_concat)
return np.concatenate(to_concat_arrs)

elif not len(non_empties):
elif all_empty:
# we have all empties, but may need to coerce the result dtype to
# object if we have non-numeric type operands (numpy would otherwise
# cast this to float)
Expand All @@ -153,16 +148,17 @@ def _get_result_dtype(
pass
else:
# coerce to object
target_dtype = np.dtype(object)
to_concat = [x.astype("object") for x in to_concat]
kinds = {"o"}
else:
# Argument 1 to "list" has incompatible type "Set[Union[ExtensionDtype,
# Any]]"; expected "Iterable[Union[dtype[Any], None, Type[Any],
# _SupportsDType[dtype[Any]], str, Tuple[Any, Union[SupportsIndex,
# Sequence[SupportsIndex]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
target_dtype = np.find_common_type(list(dtypes), []) # type: ignore[arg-type]

return any_ea, kinds, target_dtype
# error: Argument 1 to "concatenate" has incompatible type
# "Sequence[Union[ExtensionArray, ndarray[Any, Any]]]"; expected
# "Union[_SupportsArray[dtype[Any]], _NestedSequence[_SupportsArray[dtype[Any]]]]"
result: np.ndarray = np.concatenate(to_concat, axis=axis) # type: ignore[arg-type]
if "b" in kinds and result.dtype.kind in "iuf":
# GH#39817 cast to object instead of casting bools to numeric
result = result.astype(object, copy=False)
return result


def union_categoricals(
Expand Down Expand Up @@ -324,3 +320,45 @@ def _maybe_unwrap(x):

dtype = CategoricalDtype(categories=categories, ordered=ordered)
return Categorical._simple_new(new_codes, dtype=dtype)


def _concatenate_2d(to_concat: Sequence[np.ndarray], axis: AxisInt) -> np.ndarray:
    """
    Concatenate ndarrays along ``axis``, promoting inputs to 2D for ``axis=1``.

    Parameters
    ----------
    to_concat : sequence of np.ndarray
    axis : axis along which to concatenate

    Returns
    -------
    np.ndarray
    """
    if axis != 1:
        # inputs are passed through to numpy untouched
        return np.concatenate(to_concat, axis=axis)

    # np.atleast_2d gives every input a second dimension, so that
    # concatenating along axis=1 is possible
    promoted = [np.atleast_2d(arr) for arr in to_concat]
    return np.concatenate(promoted, axis=axis)


def _concat_datetime(to_concat: Sequence[ArrayLike], axis: AxisInt = 0) -> ArrayLike:
    """
    Concatenate a sequence of datetimelike arrays, each of which has a
    single M8[ns], datetime64[ns, tz] or m8[ns] dtype.

    Parameters
    ----------
    to_concat : sequence of arrays
    axis : axis along which to concatenate

    Returns
    -------
    a single array, preserving the combined dtypes
    """
    from pandas.core.construction import ensure_wrapped_if_datetimelike

    wrapped = [ensure_wrapped_if_datetimelike(arr) for arr in to_concat]

    if lib.dtypes_all_equal([arr.dtype for arr in wrapped]):
        # one shared dtype: delegate to the array class's own concatenation
        same_type = cast("list[ExtensionArray]", wrapped)
        arr_cls = type(same_type[0])
        # error: Unexpected keyword argument "axis" for "_concat_same_type" of
        # "ExtensionArray"
        return arr_cls._concat_same_type(  # type: ignore[call-arg]
            same_type, axis=axis
        )

    # multiple dtypes, need to coerce to object.
    # ensure_wrapped_if_datetimelike ensures that astype(object) wraps
    # in Timestamp/Timedelta
    as_objects = [arr.astype(object) for arr in wrapped]
    return _concatenate_2d(as_objects, axis=axis)