From 78ec7562dba8e4a607f7804167eb10e506605b7c Mon Sep 17 00:00:00 2001
From: jbrockmendel <jbrockmendel@gmail.com>
Date: Mon, 13 Apr 2020 12:30:55 -0700
Subject: [PATCH 1/3] REF: implement _concat_arrays, _concat_same_dtype in some
 EAs

---
 pandas/core/arrays/categorical.py  | 79 ++++++++++++++++++++++++++-
 pandas/core/arrays/sparse/array.py | 13 +++++
 pandas/core/dtypes/concat.py       | 88 ++++--------------------------
 3 files changed, 100 insertions(+), 80 deletions(-)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index b3fb3459891e0..ab53fadb06681 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -30,6 +30,7 @@
     ensure_platform_int,
     is_categorical_dtype,
     is_datetime64_dtype,
+    is_datetime64tz_dtype,
     is_dict_like,
     is_dtype_equal,
     is_extension_array_dtype,
@@ -2348,10 +2349,82 @@ def _can_hold_na(self):
         return True
 
     @classmethod
-    def _concat_same_type(self, to_concat):
-        from pandas.core.dtypes.concat import concat_categorical
+    def _concat_same_type(cls, to_concat):
+        # Defers to _concat_arrays for now. TODO: lock down stricter behavior?
+        return cls._concat_arrays(to_concat)
 
-        return concat_categorical(to_concat)
+    @classmethod
+    def _concat_same_dtype(
+        cls,
+        to_concat,
+        axis: int = 0,
+        sort_categories: bool = False,
+        ignore_order: bool = False,
+    ):
+        """
+        Like _concat_same_type, but with the added restriction of matching dtypes.
+
+        Codes are expressed in terms of the first array's categories; ``axis``
+        is accepted for signature compatibility and is not used here.
+        """
+        first = to_concat[0]
+        categories = first.categories
+        ordered = first.ordered
+
+        if all(first.categories.equals(other.categories) for other in to_concat[1:]):
+            # category Indexes all equal - fastpath, codes can be used as-is
+            new_codes = np.concatenate([c.codes for c in to_concat])
+        else:
+            # category Indexes not equal - recode the others onto first.categories
+            codes = [first.codes] + [
+                recode_for_categories(other.codes, other.categories, first.categories)
+                for other in to_concat[1:]
+            ]
+            new_codes = np.concatenate(codes)
+
+        if sort_categories and not ignore_order and ordered:
+            raise TypeError("Cannot use sort_categories=True with ordered Categoricals")
+
+        if sort_categories and not categories.is_monotonic_increasing:
+            categories = categories.sort_values()
+            indexer = categories.get_indexer(first.categories)
+            new_codes = take_1d(indexer, new_codes, fill_value=-1)
+
+        if ignore_order:
+            ordered = False
+
+        return cls(new_codes, categories=categories, ordered=ordered, fastpath=True)
+
+    @classmethod
+    def _concat_arrays(cls, to_concat, axis: int = 0):
+        """
+        Concatenate arrays that may mix Categorical with other array types.
+        """
+        from pandas.core.dtypes.concat import concat_compat, union_categoricals
+
+        categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]
+
+        # stay categorical only if every input is categorical with equal dtype
+        if len(categoricals) == len(to_concat):
+            first = to_concat[0]
+            if all(first.is_dtype_equal(other) for other in to_concat[1:]):
+                return union_categoricals(categoricals)
+
+        # extract the categoricals & coerce to object if needed
+        to_concat = [
+            x._internal_get_values()
+            if is_categorical_dtype(x.dtype)
+            else np.asarray(x).ravel()
+            if not is_datetime64tz_dtype(x)
+            else np.asarray(x.astype(object))
+            for x in to_concat
+        ]
+
+        result = concat_compat(to_concat)
+        if axis == 1:
+            # TODO(EA2D): this is a kludge for 1D EAs
+            result = result.reshape(1, len(result))
+        return result
 
     def isin(self, values):
         """
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
index a98875ace09aa..e58a62b4c7081 100644
--- a/pandas/core/arrays/sparse/array.py
+++ b/pandas/core/arrays/sparse/array.py
@@ -1023,6 +1023,19 @@ def _concat_same_type(cls, to_concat):
 
         return cls(data, sparse_index=sp_index, fill_value=fill_value)
 
+    @classmethod
+    def _concat_arrays(cls, to_concat, axis: int = 0):
+        """Concatenate after wrapping non-sparse inputs with the first fill_value."""
+        fill_values = [x.fill_value for x in to_concat if isinstance(x, cls)]
+        fill_value = fill_values[0]
+        # TODO: Fix join unit generation so we aren't passed this.
+        to_concat = [
+            x if isinstance(x, cls) else cls(x.squeeze(), fill_value=fill_value)
+            for x in to_concat
+        ]
+
+        return cls._concat_same_type(to_concat)
+
     def astype(self, dtype=None, copy=True):
         """
         Change the dtype of a SparseArray.
diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
index 301c9bb7b3f5c..b0b4b4770f3c6 100644
--- a/pandas/core/dtypes/concat.py
+++ b/pandas/core/dtypes/concat.py
@@ -108,14 +108,18 @@ def is_nonempty(x) -> bool:
     if "category" in typs:
         # this must be prior to concat_datetime,
         # to support Categorical + datetime-like
-        return concat_categorical(to_concat, axis=axis)
+        from pandas import Categorical
+
+        return Categorical._concat_arrays(to_concat, axis=axis)
 
     elif _contains_datetime or "timedelta" in typs or _contains_period:
         return concat_datetime(to_concat, axis=axis, typs=typs)
 
     # these are mandated to handle empties as well
     elif "sparse" in typs:
-        return _concat_sparse(to_concat, axis=axis, typs=typs)
+        from pandas.core.arrays import SparseArray
+
+        return SparseArray._concat_arrays(to_concat, axis=axis)
 
     all_empty = not len(non_empties)
     single_dtype = len({x.dtype for x in to_concat}) == 1
@@ -165,30 +169,9 @@ def concat_categorical(to_concat, axis: int = 0):
     # we could have object blocks and categoricals here
     # if we only have a single categoricals then combine everything
     # else its a non-compat categorical
-    categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]
+    from pandas import Categorical
 
-    # validate the categories
-    if len(categoricals) != len(to_concat):
-        pass
-    else:
-        # when all categories are identical
-        first = to_concat[0]
-        if all(first.is_dtype_equal(other) for other in to_concat[1:]):
-            return union_categoricals(categoricals)
-
-    # extract the categoricals & coerce to object if needed
-    to_concat = [
-        x._internal_get_values()
-        if is_categorical_dtype(x.dtype)
-        else np.asarray(x).ravel()
-        if not is_datetime64tz_dtype(x)
-        else np.asarray(x.astype(object))
-        for x in to_concat
-    ]
-    result = concat_compat(to_concat)
-    if axis == 1:
-        result = result.reshape(1, len(result))
-    return result
+    return Categorical._concat_arrays(to_concat, axis=axis)
 
 
 def union_categoricals(
@@ -318,28 +301,10 @@ def _maybe_unwrap(x):
     ordered = False
     if all(first.is_dtype_equal(other) for other in to_union[1:]):
         # identical categories - fastpath
-        categories = first.categories
-        ordered = first.ordered
-
-        if all(first.categories.equals(other.categories) for other in to_union[1:]):
-            new_codes = np.concatenate([c.codes for c in to_union])
-        else:
-            codes = [first.codes] + [
-                recode_for_categories(other.codes, other.categories, first.categories)
-                for other in to_union[1:]
-            ]
-            new_codes = np.concatenate(codes)
-
-        if sort_categories and not ignore_order and ordered:
-            raise TypeError("Cannot use sort_categories=True with ordered Categoricals")
-
-        if sort_categories and not categories.is_monotonic_increasing:
-            categories = categories.sort_values()
-            indexer = categories.get_indexer(first.categories)
-
-            from pandas.core.algorithms import take_1d
+        return Categorical._concat_same_dtype(
+            to_union, sort_categories=sort_categories, ignore_order=ignore_order,
+        )
 
-            new_codes = take_1d(indexer, new_codes, fill_value=-1)
     elif ignore_order or all(not c.ordered for c in to_union):
         # different categories - union and recode
         cats = first.categories.append([c.categories for c in to_union[1:]])
@@ -454,34 +419,3 @@ def _concat_datetimetz(to_concat, name=None):
         return sample._concat_same_dtype(to_concat, name=name)
     elif isinstance(sample, ABCDatetimeArray):
         return sample._concat_same_type(to_concat)
-
-
-def _concat_sparse(to_concat, axis=0, typs=None):
-    """
-    provide concatenation of an sparse/dense array of arrays each of which is a
-    single dtype
-
-    Parameters
-    ----------
-    to_concat : array of arrays
-    axis : axis to provide concatenation
-    typs : set of to_concat dtypes
-
-    Returns
-    -------
-    a single array, preserving the combined dtypes
-    """
-    from pandas.core.arrays import SparseArray
-
-    fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)]
-    fill_value = fill_values[0]
-
-    # TODO: Fix join unit generation so we aren't passed this.
-    to_concat = [
-        x
-        if isinstance(x, SparseArray)
-        else SparseArray(x.squeeze(), fill_value=fill_value)
-        for x in to_concat
-    ]
-
-    return SparseArray._concat_same_type(to_concat)

From 490c3335e9da5647a7286814957cf6d2b09e3d49 Mon Sep 17 00:00:00 2001
From: jbrockmendel <jbrockmendel@gmail.com>
Date: Mon, 13 Apr 2020 14:56:54 -0700
Subject: [PATCH 2/3] REF: implement _concat_arrays

---
 pandas/core/arrays/datetimelike.py  |  29 ++++++-
 pandas/core/dtypes/concat.py        | 114 ++--------------------------
 pandas/core/indexes/datetimelike.py |   8 +-
 pandas/core/internals/blocks.py     |   4 +-
 4 files changed, 39 insertions(+), 116 deletions(-)

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index 30a34282889f8..3eae853b8460f 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -24,6 +24,7 @@
     is_datetime64tz_dtype,
     is_datetime_or_timedelta_dtype,
     is_dtype_equal,
+    is_extension_array_dtype,
     is_float_dtype,
     is_integer_dtype,
     is_list_like,
@@ -723,7 +724,7 @@ def take(self, indices, allow_fill=False, fill_value=None):
         return type(self)(new_values, dtype=self.dtype)
 
     @classmethod
-    def _concat_same_type(cls, to_concat):
+    def _concat_same_type(cls, to_concat, axis: int = 0):
 
         # do not pass tz to set because tzlocal cannot be hashed
         dtypes = {str(x.dtype) for x in to_concat}
@@ -733,7 +734,7 @@ def _concat_same_type(cls, to_concat):
         obj = to_concat[0]
         dtype = obj.dtype
 
-        values = np.concatenate([x.asi8 for x in to_concat])
+        values = np.concatenate([x.asi8 for x in to_concat], axis=axis)
 
         if is_period_dtype(to_concat[0].dtype):
             new_freq = obj.freq
@@ -750,6 +751,30 @@ def _concat_same_type(cls, to_concat):
 
         return cls._simple_new(values, dtype=dtype, freq=new_freq)
 
+    @classmethod
+    def _concat_arrays(cls, to_concat, axis: int = 0):
+        """Concatenate arrays; inputs with differing dtypes are cast to object."""
+        from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array
+
+        to_concat = [maybe_upcast_datetimelike_array(x) for x in to_concat]
+
+        if len({x.dtype for x in to_concat}) == 1:
+            if axis == 1 and is_extension_array_dtype(to_concat[0].dtype):
+                # TODO(EA2D): not necessary with 2D EAs
+                axis = 0
+            result = cls._concat_same_type(to_concat, axis=axis)
+            if axis == 1 and result.ndim == 1:
+                # TODO(EA2D): not necessary with 2D EAs
+                # NOTE(review): never taken once axis was reset to 0 above
+                result = result.reshape(1, -1)
+            return result
+
+        to_concat = [x.astype(object) for x in to_concat]
+        if axis == 1:
+            # TODO(EA2D): not necessary with 2D EAs
+            to_concat = [np.atleast_2d(x) for x in to_concat]
+        return np.concatenate(to_concat, axis=axis)
+
     def copy(self):
         values = self.asi8.copy()
         return type(self)._simple_new(values, dtype=self.dtype, freq=self.freq)
diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
index b0b4b4770f3c6..81d748bafcb83 100644
--- a/pandas/core/dtypes/concat.py
+++ b/pandas/core/dtypes/concat.py
@@ -4,11 +4,7 @@
 
 import numpy as np
 
-from pandas._libs import tslib, tslibs
-
 from pandas.core.dtypes.common import (
-    DT64NS_DTYPE,
-    TD64NS_DTYPE,
     is_bool_dtype,
     is_categorical_dtype,
     is_datetime64_dtype,
@@ -19,13 +15,7 @@
     is_sparse,
     is_timedelta64_dtype,
 )
-from pandas.core.dtypes.generic import (
-    ABCCategoricalIndex,
-    ABCDatetimeArray,
-    ABCIndexClass,
-    ABCRangeIndex,
-    ABCSeries,
-)
+from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCRangeIndex, ABCSeries
 
 
 def get_dtype_kinds(l):
@@ -149,31 +139,6 @@ def is_nonempty(x) -> bool:
     return np.concatenate(to_concat, axis=axis)
 
 
-def concat_categorical(to_concat, axis: int = 0):
-    """
-    Concatenate an object/categorical array of arrays, each of which is a
-    single dtype
-
-    Parameters
-    ----------
-    to_concat : array of arrays
-    axis : int
-        Axis to provide concatenation in the current implementation this is
-        always 0, e.g. we only have 1D categoricals
-
-    Returns
-    -------
-    Categorical
-        A single array, preserving the combined dtypes
-    """
-    # we could have object blocks and categoricals here
-    # if we only have a single categoricals then combine everything
-    # else its a non-compat categorical
-    from pandas import Categorical
-
-    return Categorical._concat_arrays(to_concat, axis=axis)
-
-
 def union_categoricals(
     to_union, sort_categories: bool = False, ignore_order: bool = False
 ):
@@ -330,13 +295,6 @@ def _maybe_unwrap(x):
     return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)
 
 
-def _concatenate_2d(to_concat, axis: int):
-    # coerce to 2d if needed & concatenate
-    if axis == 1:
-        to_concat = [np.atleast_2d(x) for x in to_concat]
-    return np.concatenate(to_concat, axis=axis)
-
-
 def concat_datetime(to_concat, axis=0, typs=None):
     """
     provide concatenation of an datetimelike array of arrays each of which is a
@@ -352,70 +310,10 @@ def concat_datetime(to_concat, axis=0, typs=None):
     -------
     a single array, preserving the combined dtypes
     """
-    if typs is None:
-        typs = get_dtype_kinds(to_concat)
-
-    # multiple types, need to coerce to object
-    if len(typs) != 1:
-        return _concatenate_2d(
-            [_convert_datetimelike_to_object(x) for x in to_concat], axis=axis
-        )
-
-    # must be single dtype
-    if any(typ.startswith("datetime") for typ in typs):
-
-        if "datetime" in typs:
-            to_concat = [x.astype(np.int64, copy=False) for x in to_concat]
-            return _concatenate_2d(to_concat, axis=axis).view(DT64NS_DTYPE)
-        else:
-            # when to_concat has different tz, len(typs) > 1.
-            # thus no need to care
-            return _concat_datetimetz(to_concat)
-
-    elif "timedelta" in typs:
-        return _concatenate_2d([x.view(np.int64) for x in to_concat], axis=axis).view(
-            TD64NS_DTYPE
-        )
-
-    elif any(typ.startswith("period") for typ in typs):
-        assert len(typs) == 1
-        cls = to_concat[0]
-        new_values = cls._concat_same_type(to_concat)
-        return new_values
+    from pandas.core.arrays import datetimelike as dtl
+    from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array
 
+    to_concat = [maybe_upcast_datetimelike_array(x) for x in to_concat]
 
-def _convert_datetimelike_to_object(x):
-    # coerce datetimelike array to object dtype
-
-    # if dtype is of datetimetz or timezone
-    if x.dtype.kind == DT64NS_DTYPE.kind:
-        if getattr(x, "tz", None) is not None:
-            x = np.asarray(x.astype(object))
-        else:
-            shape = x.shape
-            x = tslib.ints_to_pydatetime(x.view(np.int64).ravel(), box="timestamp")
-            x = x.reshape(shape)
-
-    elif x.dtype == TD64NS_DTYPE:
-        shape = x.shape
-        x = tslibs.ints_to_pytimedelta(x.view(np.int64).ravel(), box=True)
-        x = x.reshape(shape)
-
-    return x
-
-
-def _concat_datetimetz(to_concat, name=None):
-    """
-    concat DatetimeIndex with the same tz
-    all inputs must be DatetimeIndex
-    it is used in DatetimeIndex.append also
-    """
-    # Right now, internals will pass a List[DatetimeArray] here
-    # for reductions like quantile. I would like to disentangle
-    # all this before we get here.
-    sample = to_concat[0]
-
-    if isinstance(sample, ABCIndexClass):
-        return sample._concat_same_dtype(to_concat, name=name)
-    elif isinstance(sample, ABCDatetimeArray):
-        return sample._concat_same_type(to_concat)
+    obj = [x for x in to_concat if isinstance(x, dtl.DatetimeLikeArrayMixin)][0]
+    return type(obj)._concat_arrays(to_concat, axis=axis)
diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py
index 25333b3a08dce..c15680a47d216 100644
--- a/pandas/core/indexes/datetimelike.py
+++ b/pandas/core/indexes/datetimelike.py
@@ -778,8 +778,8 @@ def _fast_union(self, other, sort=None):
             left, right = self, other
             left_start = left[0]
             loc = right.searchsorted(left_start, side="left")
-            right_chunk = right.values[:loc]
-            dates = concat_compat((left.values, right_chunk))
+            right_chunk = right._values[:loc]
+            dates = concat_compat([left._values, right_chunk])
             result = self._shallow_copy(dates)
             result._set_freq("infer")
             # TODO: can we infer that it has self.freq?
@@ -793,8 +793,8 @@ def _fast_union(self, other, sort=None):
         # concatenate
         if left_end < right_end:
             loc = right.searchsorted(left_end, side="right")
-            right_chunk = right.values[loc:]
-            dates = concat_compat((left.values, right_chunk))
+            right_chunk = right._values[loc:]
+            dates = concat_compat([left._values, right_chunk])
             result = self._shallow_copy(dates)
             result._set_freq("infer")
             # TODO: can we infer that it has self.freq?
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index 185b0f4da2627..daf7654b063bb 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -48,7 +48,7 @@
     is_timedelta64_dtype,
     pandas_dtype,
 )
-from pandas.core.dtypes.concat import concat_categorical, concat_datetime
+from pandas.core.dtypes.concat import concat_datetime
 from pandas.core.dtypes.dtypes import ExtensionDtype
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
@@ -2642,7 +2642,7 @@ class CategoricalBlock(ExtensionBlock):
     is_categorical = True
     _verify_integrity = True
     _can_hold_na = True
-    _concatenator = staticmethod(concat_categorical)
+    _concatenator = staticmethod(Categorical._concat_arrays)
 
     should_store = Block.should_store
 

From 05d5eb7230f8b45fa92c299c7b12dbb1b8a699be Mon Sep 17 00:00:00 2001
From: jbrockmendel <jbrockmendel@gmail.com>
Date: Mon, 13 Apr 2020 16:36:56 -0700
Subject: [PATCH 3/3] standardize on _concat_arrays usage

---
 pandas/core/dtypes/concat.py    | 36 +++++++--------------------------
 pandas/core/internals/blocks.py |  3 +--
 2 files changed, 8 insertions(+), 31 deletions(-)

diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
index 81d748bafcb83..c40c183a8ce68 100644
--- a/pandas/core/dtypes/concat.py
+++ b/pandas/core/dtypes/concat.py
@@ -95,20 +95,22 @@ def is_nonempty(x) -> bool:
     _contains_datetime = any(typ.startswith("datetime") for typ in typs)
     _contains_period = any(typ.startswith("period") for typ in typs)
 
+    from pandas.core.arrays import Categorical, SparseArray, datetimelike as dtl
+    from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array
+
+    to_concat = [maybe_upcast_datetimelike_array(x) for x in to_concat]
+
     if "category" in typs:
         # this must be prior to concat_datetime,
         # to support Categorical + datetime-like
-        from pandas import Categorical
-
         return Categorical._concat_arrays(to_concat, axis=axis)
 
     elif _contains_datetime or "timedelta" in typs or _contains_period:
-        return concat_datetime(to_concat, axis=axis, typs=typs)
+        obj = [x for x in to_concat if isinstance(x, dtl.DatetimeLikeArrayMixin)][0]
+        return type(obj)._concat_arrays(to_concat, axis=axis)
 
     # these are mandated to handle empties as well
     elif "sparse" in typs:
-        from pandas.core.arrays import SparseArray
-
         return SparseArray._concat_arrays(to_concat, axis=axis)
 
     all_empty = not len(non_empties)
@@ -293,27 +295,3 @@ def _maybe_unwrap(x):
         ordered = False
 
     return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)
-
-
-def concat_datetime(to_concat, axis=0, typs=None):
-    """
-    provide concatenation of an datetimelike array of arrays each of which is a
-    single M8[ns], datetimet64[ns, tz] or m8[ns] dtype
-
-    Parameters
-    ----------
-    to_concat : array of arrays
-    axis : axis to provide concatenation
-    typs : set of to_concat dtypes
-
-    Returns
-    -------
-    a single array, preserving the combined dtypes
-    """
-    from pandas.core.arrays import datetimelike as dtl
-    from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array
-
-    to_concat = [maybe_upcast_datetimelike_array(x) for x in to_concat]
-
-    obj = [x for x in to_concat if isinstance(x, dtl.DatetimeLikeArrayMixin)][0]
-    return type(obj)._concat_arrays(to_concat, axis=axis)
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
index daf7654b063bb..1fc341c6d64db 100644
--- a/pandas/core/internals/blocks.py
+++ b/pandas/core/internals/blocks.py
@@ -48,7 +48,6 @@
     is_timedelta64_dtype,
     pandas_dtype,
 )
-from pandas.core.dtypes.concat import concat_datetime
 from pandas.core.dtypes.dtypes import ExtensionDtype
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
@@ -2264,7 +2263,7 @@ def concat_same_type(self, to_concat):
         # Instead of placing the condition here, it could also go into the
         # is_uniform_join_units check, but I'm not sure what is better.
         if len({x.dtype for x in to_concat}) > 1:
-            values = concat_datetime([x.values for x in to_concat])
+            values = self._holder._concat_arrays([x.values for x in to_concat])
 
             values = values.astype(object, copy=False)
             placement = self.mgr_locs if self.ndim == 2 else slice(len(values))