From 78ec7562dba8e4a607f7804167eb10e506605b7c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 13 Apr 2020 12:30:55 -0700 Subject: [PATCH 1/3] REF: implement _concat_arrays, _concat_same_dtype in some EAs --- pandas/core/arrays/categorical.py | 79 ++++++++++++++++++++++++++- pandas/core/arrays/sparse/array.py | 13 +++++ pandas/core/dtypes/concat.py | 88 ++++-------------------------- 3 files changed, 100 insertions(+), 80 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b3fb3459891e0..ab53fadb06681 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -30,6 +30,7 @@ ensure_platform_int, is_categorical_dtype, is_datetime64_dtype, + is_datetime64tz_dtype, is_dict_like, is_dtype_equal, is_extension_array_dtype, @@ -2348,10 +2349,82 @@ def _can_hold_na(self): return True @classmethod - def _concat_same_type(self, to_concat): - from pandas.core.dtypes.concat import concat_categorical + def _concat_same_type(cls, to_concat): + return cls._concat_arrays(to_concat) + # TODO: lock down stricter behavior? - return concat_categorical(to_concat) + @classmethod + def _concat_same_dtype( + cls, + to_concat, + axis: int = 0, + sort_categories: bool = False, + ignore_order: bool = False, + ): + """ + Like _concat_same_type, but with the added restriction of matching dtypes. + """ + ordered = False + + first = to_concat[0] + + # identical categories - fastpath + categories = first.categories + ordered = first.ordered + + if all(first.categories.equals(other.categories) for other in to_concat[1:]): + new_codes = np.concatenate([c.codes for c in to_concat]) + else: + codes = [first.codes] + [ + recode_for_categories(other.codes, other.categories, first.categories) + for other in to_concat[1:] + ] + new_codes = np.concatenate(codes) + + if sort_categories and not ignore_order and ordered: + raise TypeError("Cannot use sort_categories=True with ordered Categoricals") + + if sort_categories and not categories.is_monotonic_increasing: + categories = categories.sort_values() + indexer = categories.get_indexer(first.categories) + + new_codes = take_1d(indexer, new_codes, fill_value=-1) + + if ignore_order: + ordered = False + + return cls(new_codes, categories=categories, ordered=ordered, fastpath=True) + + @classmethod + def _concat_arrays(cls, to_concat, axis: int = 0): + from pandas.core.dtypes.concat import concat_compat, union_categoricals + + categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)] + + # validate the categories + if len(categoricals) != len(to_concat): + pass + else: + # when all categories are identical + first = to_concat[0] + if all(first.is_dtype_equal(other) for other in to_concat[1:]): + return union_categoricals(categoricals) + + # extract the categoricals & coerce to object if needed + to_concat = [ + x._internal_get_values() + if is_categorical_dtype(x.dtype) + else np.asarray(x).ravel() + if not is_datetime64tz_dtype(x) + else np.asarray(x.astype(object)) + for x in to_concat + ] + + result = concat_compat(to_concat) + if axis == 1: + # TODO(EA2D): this is a kludge for 1D EAs + result = result.reshape(1, len(result)) + return result def isin(self, values): """ diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index a98875ace09aa..e58a62b4c7081 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1023,6 +1023,19 @@ def _concat_same_type(cls, to_concat): return cls(data, sparse_index=sp_index, fill_value=fill_value) + @classmethod + def _concat_arrays(cls, to_concat, axis: int = 0): + fill_values = [x.fill_value for x in to_concat if isinstance(x, cls)] + fill_value = fill_values[0] + + # TODO: Fix join unit generation so we aren't passed this. + to_concat = [ + x if isinstance(x, cls) else cls(x.squeeze(), fill_value=fill_value) + for x in to_concat + ] + + return cls._concat_same_type(to_concat) + def astype(self, dtype=None, copy=True): """ Change the dtype of a SparseArray. diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 301c9bb7b3f5c..b0b4b4770f3c6 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -108,14 +108,18 @@ def is_nonempty(x) -> bool: if "category" in typs: # this must be prior to concat_datetime, # to support Categorical + datetime-like - return concat_categorical(to_concat, axis=axis) + from pandas import Categorical + + return Categorical._concat_arrays(to_concat, axis=axis) elif _contains_datetime or "timedelta" in typs or _contains_period: return concat_datetime(to_concat, axis=axis, typs=typs) # these are mandated to handle empties as well elif "sparse" in typs: - return _concat_sparse(to_concat, axis=axis, typs=typs) + from pandas.core.arrays import SparseArray + + return SparseArray._concat_arrays(to_concat, axis=axis) all_empty = not len(non_empties) single_dtype = len({x.dtype for x in to_concat}) == 1 @@ -165,30 +169,9 @@ def concat_categorical(to_concat, axis: int = 0): # we could have object blocks and categoricals here # if we only have a single categoricals then combine everything # else its a non-compat categorical - categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)] + from pandas import Categorical - # validate the categories - if len(categoricals) != len(to_concat): - pass - else: - # when all categories are identical - first = to_concat[0] - if all(first.is_dtype_equal(other) for other in to_concat[1:]): - return union_categoricals(categoricals) - - # extract the categoricals & coerce to object if needed - to_concat = [ - x._internal_get_values() - if is_categorical_dtype(x.dtype) - else np.asarray(x).ravel() - if not is_datetime64tz_dtype(x) - else np.asarray(x.astype(object)) - for x in to_concat - ] - result = concat_compat(to_concat) - if axis == 1: - result = result.reshape(1, len(result)) - return result + return Categorical._concat_arrays(to_concat, axis=axis) def union_categoricals( @@ -318,28 +301,10 @@ def _maybe_unwrap(x): ordered = False if all(first.is_dtype_equal(other) for other in to_union[1:]): # identical categories - fastpath - categories = first.categories - ordered = first.ordered - - if all(first.categories.equals(other.categories) for other in to_union[1:]): - new_codes = np.concatenate([c.codes for c in to_union]) - else: - codes = [first.codes] + [ - recode_for_categories(other.codes, other.categories, first.categories) - for other in to_union[1:] - ] - new_codes = np.concatenate(codes) - - if sort_categories and not ignore_order and ordered: - raise TypeError("Cannot use sort_categories=True with ordered Categoricals") - - if sort_categories and not categories.is_monotonic_increasing: - categories = categories.sort_values() - indexer = categories.get_indexer(first.categories) - - from pandas.core.algorithms import take_1d + return Categorical._concat_same_dtype( + to_union, sort_categories=sort_categories, ignore_order=ignore_order, + ) - new_codes = take_1d(indexer, new_codes, fill_value=-1) elif ignore_order or all(not c.ordered for c in to_union): # different categories - union and recode cats = first.categories.append([c.categories for c in to_union[1:]]) @@ -454,34 +419,3 @@ def _concat_datetimetz(to_concat, name=None): return sample._concat_same_dtype(to_concat, name=name) elif isinstance(sample, ABCDatetimeArray): return sample._concat_same_type(to_concat) - - -def _concat_sparse(to_concat, axis=0, typs=None): - """ - provide concatenation of an sparse/dense array of arrays each of which is a - single dtype - - Parameters - ---------- - to_concat : array of arrays - axis : axis to provide concatenation - typs : set of to_concat dtypes - - Returns - ------- - a single array, preserving the combined dtypes - """ - from pandas.core.arrays import SparseArray - - fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)] - fill_value = fill_values[0] - - # TODO: Fix join unit generation so we aren't passed this. - to_concat = [ - x - if isinstance(x, SparseArray) - else SparseArray(x.squeeze(), fill_value=fill_value) - for x in to_concat - ] - - return SparseArray._concat_same_type(to_concat) From 490c3335e9da5647a7286814957cf6d2b09e3d49 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 13 Apr 2020 14:56:54 -0700 Subject: [PATCH 2/3] REF: implement _concat_arrays --- pandas/core/arrays/datetimelike.py | 29 ++++++- pandas/core/dtypes/concat.py | 114 ++-------------------------- pandas/core/indexes/datetimelike.py | 8 +- pandas/core/internals/blocks.py | 4 +- 4 files changed, 39 insertions(+), 116 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 30a34282889f8..3eae853b8460f 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -24,6 +24,7 @@ is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal, + is_extension_array_dtype, is_float_dtype, is_integer_dtype, is_list_like, @@ -723,7 +724,7 @@ def take(self, indices, allow_fill=False, fill_value=None): return type(self)(new_values, dtype=self.dtype) @classmethod - def _concat_same_type(cls, to_concat): + def _concat_same_type(cls, to_concat, axis: int = 0): # do not pass tz to set because tzlocal cannot be hashed dtypes = {str(x.dtype) for x in to_concat} @@ -733,7 +734,7 @@ def _concat_same_type(cls, to_concat): obj = to_concat[0] dtype = obj.dtype - values = np.concatenate([x.asi8 for x in to_concat]) + values = np.concatenate([x.asi8 for x in to_concat], axis=axis) if is_period_dtype(to_concat[0].dtype): new_freq = obj.freq @@ -750,6 +751,30 @@ def _concat_same_type(cls, to_concat): return cls._simple_new(values, dtype=dtype, freq=new_freq) + @classmethod + def _concat_arrays(cls, to_concat, axis: int = 0): + from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array + + to_concat = [maybe_upcast_datetimelike_array(x) for x in to_concat] + + if len({x.dtype for x in to_concat}) == 1: + if axis == 1 and is_extension_array_dtype(to_concat[0].dtype): + # TODO(EA2D): not necessary with 2D EAs + axis = 0 + + result = cls._concat_same_type(to_concat, axis=axis) + + if axis == 1 and result.ndim == 1: + # TODO(EA2D): not necessary with 2D EAs + result = result.reshape(1, -1) + return result + + to_concat = [x.astype(object) for x in to_concat] + if axis == 1: + # TODO(EA2D): not necessary with 2D EAs + to_concat = [np.atleast_2d(x) for x in to_concat] + return np.concatenate(to_concat, axis=axis) + def copy(self): values = self.asi8.copy() return type(self)._simple_new(values, dtype=self.dtype, freq=self.freq) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index b0b4b4770f3c6..81d748bafcb83 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -4,11 +4,7 @@ import numpy as np -from pandas._libs import tslib, tslibs - from pandas.core.dtypes.common import ( - DT64NS_DTYPE, - TD64NS_DTYPE, is_bool_dtype, is_categorical_dtype, is_datetime64_dtype, @@ -19,13 +15,7 @@ is_sparse, is_timedelta64_dtype, ) -from pandas.core.dtypes.generic import ( - ABCCategoricalIndex, - ABCDatetimeArray, - ABCIndexClass, - ABCRangeIndex, - ABCSeries, -) +from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCRangeIndex, ABCSeries def get_dtype_kinds(l): @@ -149,31 +139,6 @@ def is_nonempty(x) -> bool: return np.concatenate(to_concat, axis=axis) -def concat_categorical(to_concat, axis: int = 0): - """ - Concatenate an object/categorical array of arrays, each of which is a - single dtype - - Parameters - ---------- - to_concat : array of arrays - axis : int - Axis to provide concatenation in the current implementation this is - always 0, e.g. we only have 1D categoricals - - Returns - ------- - Categorical - A single array, preserving the combined dtypes - """ - # we could have object blocks and categoricals here - # if we only have a single categoricals then combine everything - # else its a non-compat categorical - from pandas import Categorical - - return Categorical._concat_arrays(to_concat, axis=axis) - - def union_categoricals( to_union, sort_categories: bool = False, ignore_order: bool = False ): @@ -330,13 +295,6 @@ def _maybe_unwrap(x): return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True) -def _concatenate_2d(to_concat, axis: int): - # coerce to 2d if needed & concatenate - if axis == 1: - to_concat = [np.atleast_2d(x) for x in to_concat] - return np.concatenate(to_concat, axis=axis) - - def concat_datetime(to_concat, axis=0, typs=None): """ provide concatenation of an datetimelike array of arrays each of which is a @@ -352,70 +310,10 @@ def concat_datetime(to_concat, axis=0, typs=None): ------- a single array, preserving the combined dtypes """ - if typs is None: - typs = get_dtype_kinds(to_concat) - - # multiple types, need to coerce to object - if len(typs) != 1: - return _concatenate_2d( - [_convert_datetimelike_to_object(x) for x in to_concat], axis=axis - ) - - # must be single dtype - if any(typ.startswith("datetime") for typ in typs): - - if "datetime" in typs: - to_concat = [x.astype(np.int64, copy=False) for x in to_concat] - return _concatenate_2d(to_concat, axis=axis).view(DT64NS_DTYPE) - else: - # when to_concat has different tz, len(typs) > 1. - # thus no need to care - return _concat_datetimetz(to_concat) - - elif "timedelta" in typs: - return _concatenate_2d([x.view(np.int64) for x in to_concat], axis=axis).view( - TD64NS_DTYPE - ) - - elif any(typ.startswith("period") for typ in typs): - assert len(typs) == 1 - cls = to_concat[0] - new_values = cls._concat_same_type(to_concat) - return new_values + from pandas.core.arrays import datetimelike as dtl + from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array + to_concat = [maybe_upcast_datetimelike_array(x) for x in to_concat] -def _convert_datetimelike_to_object(x): - # coerce datetimelike array to object dtype - - # if dtype is of datetimetz or timezone - if x.dtype.kind == DT64NS_DTYPE.kind: - if getattr(x, "tz", None) is not None: - x = np.asarray(x.astype(object)) - else: - shape = x.shape - x = tslib.ints_to_pydatetime(x.view(np.int64).ravel(), box="timestamp") - x = x.reshape(shape) - - elif x.dtype == TD64NS_DTYPE: - shape = x.shape - x = tslibs.ints_to_pytimedelta(x.view(np.int64).ravel(), box=True) - x = x.reshape(shape) - - return x - - -def _concat_datetimetz(to_concat, name=None): - """ - concat DatetimeIndex with the same tz - all inputs must be DatetimeIndex - it is used in DatetimeIndex.append also - """ - # Right now, internals will pass a List[DatetimeArray] here - # for reductions like quantile. I would like to disentangle - # all this before we get here. - sample = to_concat[0] - - if isinstance(sample, ABCIndexClass): - return sample._concat_same_dtype(to_concat, name=name) - elif isinstance(sample, ABCDatetimeArray): - return sample._concat_same_type(to_concat) + obj = [x for x in to_concat if isinstance(x, dtl.DatetimeLikeArrayMixin)][0] + return type(obj)._concat_arrays(to_concat, axis=axis) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 25333b3a08dce..c15680a47d216 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -778,8 +778,8 @@ def _fast_union(self, other, sort=None): left, right = self, other left_start = left[0] loc = right.searchsorted(left_start, side="left") - right_chunk = right.values[:loc] - dates = concat_compat((left.values, right_chunk)) + right_chunk = right._values[:loc] + dates = concat_compat([left._values, right_chunk]) result = self._shallow_copy(dates) result._set_freq("infer") # TODO: can we infer that it has self.freq? @@ -793,8 +793,8 @@ def _fast_union(self, other, sort=None): # concatenate if left_end < right_end: loc = right.searchsorted(left_end, side="right") - right_chunk = right.values[loc:] - dates = concat_compat((left.values, right_chunk)) + right_chunk = right._values[loc:] + dates = concat_compat([left._values, right_chunk]) result = self._shallow_copy(dates) result._set_freq("infer") # TODO: can we infer that it has self.freq? diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 185b0f4da2627..daf7654b063bb 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -48,7 +48,7 @@ is_timedelta64_dtype, pandas_dtype, ) -from pandas.core.dtypes.concat import concat_categorical, concat_datetime +from pandas.core.dtypes.concat import concat_datetime from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -2642,7 +2642,7 @@ class CategoricalBlock(ExtensionBlock): is_categorical = True _verify_integrity = True _can_hold_na = True - _concatenator = staticmethod(concat_categorical) + _concatenator = staticmethod(Categorical._concat_arrays) should_store = Block.should_store From 05d5eb7230f8b45fa92c299c7b12dbb1b8a699be Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 13 Apr 2020 16:36:56 -0700 Subject: [PATCH 3/3] standardize on _concat_arrays usage --- pandas/core/dtypes/concat.py | 36 +++++++-------------------------- pandas/core/internals/blocks.py | 3 +-- 2 files changed, 8 insertions(+), 31 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 81d748bafcb83..c40c183a8ce68 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -95,20 +95,22 @@ def is_nonempty(x) -> bool: _contains_datetime = any(typ.startswith("datetime") for typ in typs) _contains_period = any(typ.startswith("period") for typ in typs) + from pandas.core.arrays import Categorical, SparseArray, datetimelike as dtl + from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array + + to_concat = [maybe_upcast_datetimelike_array(x) for x in to_concat] + if "category" in typs: # this must be prior to concat_datetime, # to support Categorical + datetime-like - from pandas import Categorical - return Categorical._concat_arrays(to_concat, axis=axis) elif _contains_datetime or "timedelta" in typs or _contains_period: - return concat_datetime(to_concat, axis=axis, typs=typs) + obj = [x for x in to_concat if isinstance(x, dtl.DatetimeLikeArrayMixin)][0] + return type(obj)._concat_arrays(to_concat, axis=axis) # these are mandated to handle empties as well elif "sparse" in typs: - from pandas.core.arrays import SparseArray - return SparseArray._concat_arrays(to_concat, axis=axis) all_empty = not len(non_empties) @@ -293,27 +295,3 @@ def _maybe_unwrap(x): ordered = False return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True) - - -def concat_datetime(to_concat, axis=0, typs=None): - """ - provide concatenation of an datetimelike array of arrays each of which is a - single M8[ns], datetimet64[ns, tz] or m8[ns] dtype - - Parameters - ---------- - to_concat : array of arrays - axis : axis to provide concatenation - typs : set of to_concat dtypes - - Returns - ------- - a single array, preserving the combined dtypes - """ - from pandas.core.arrays import datetimelike as dtl - from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array - - to_concat = [maybe_upcast_datetimelike_array(x) for x in to_concat] - - obj = [x for x in to_concat if isinstance(x, dtl.DatetimeLikeArrayMixin)][0] - return type(obj)._concat_arrays(to_concat, axis=axis) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index daf7654b063bb..1fc341c6d64db 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -48,7 +48,6 @@ is_timedelta64_dtype, pandas_dtype, ) -from pandas.core.dtypes.concat import concat_datetime from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -2264,7 +2263,7 @@ def concat_same_type(self, to_concat): # Instead of placing the condition here, it could also go into the # is_uniform_join_units check, but I'm not sure what is better. if len({x.dtype for x in to_concat}) > 1: - values = concat_datetime([x.values for x in to_concat]) + values = self._holder._concat_arrays([x.values for x in to_concat]) values = values.astype(object, copy=False) placement = self.mgr_locs if self.ndim == 2 else slice(len(values))