From a4d985bf5040a0b7248f57a00107dfe6a75efa1f Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 22 Dec 2020 21:32:03 -0800 Subject: [PATCH 1/5] BUG: MultiIndex, IntervalIndex intersection with Categorical --- pandas/core/indexes/period.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 4d48bc0d51912..d492c41ea5c6d 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -11,6 +11,7 @@ from pandas.errors import InvalidIndexError from pandas.util._decorators import cache_readonly, doc +from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( is_bool_dtype, is_datetime64_any_dtype, From 9156eae73e3adf14e152907759515661cdeaf013 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 23 Dec 2020 07:33:31 -0800 Subject: [PATCH 2/5] standardize --- pandas/core/indexes/datetimelike.py | 2 ++ pandas/core/indexes/multi.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 94c055e264e71..26d9b80acdf9a 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -12,6 +12,7 @@ from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, cache_readonly, doc +from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( is_bool_dtype, is_categorical_dtype, @@ -654,6 +655,7 @@ def difference(self, other, sort=None): new_idx = super().difference(other, sort=sort)._with_freq(None) return new_idx + def _intersection(self, other: Index, sort=False) -> Index: """ intersection specialized to the case with matching dtypes. diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index c7be66b596246..273fa77c6d6b0 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -25,11 +25,12 @@ from pandas.errors import InvalidIndexError, PerformanceWarning, UnsortedIndexError from pandas.util._decorators import Appender, cache_readonly, doc -from pandas.core.dtypes.cast import coerce_indexer_dtype +from pandas.core.dtypes.cast import coerce_indexer_dtype, find_common_type from pandas.core.dtypes.common import ( ensure_int64, ensure_platform_int, is_categorical_dtype, + is_dtype_equal, is_hashable, is_integer, is_iterator, From 6558fcfcf4b604f78636e5b85e2506c89226a9fd Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 23 Dec 2020 08:53:46 -0800 Subject: [PATCH 3/5] Share intersection --- pandas/core/indexes/datetimelike.py | 2 -- pandas/core/indexes/multi.py | 3 +-- pandas/core/indexes/period.py | 1 - 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 26d9b80acdf9a..94c055e264e71 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -12,7 +12,6 @@ from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, cache_readonly, doc -from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( is_bool_dtype, is_categorical_dtype, @@ -655,7 +654,6 @@ def difference(self, other, sort=None): new_idx = super().difference(other, sort=sort)._with_freq(None) return new_idx - def _intersection(self, other: Index, sort=False) -> Index: """ intersection specialized to the case with matching dtypes. diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 273fa77c6d6b0..c7be66b596246 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -25,12 +25,11 @@ from pandas.errors import InvalidIndexError, PerformanceWarning, UnsortedIndexError from pandas.util._decorators import Appender, cache_readonly, doc -from pandas.core.dtypes.cast import coerce_indexer_dtype, find_common_type +from pandas.core.dtypes.cast import coerce_indexer_dtype from pandas.core.dtypes.common import ( ensure_int64, ensure_platform_int, is_categorical_dtype, - is_dtype_equal, is_hashable, is_integer, is_iterator, diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index d492c41ea5c6d..4d48bc0d51912 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -11,7 +11,6 @@ from pandas.errors import InvalidIndexError from pandas.util._decorators import cache_readonly, doc -from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( is_bool_dtype, is_datetime64_any_dtype, From 9d3c864a2a37ed95aa469677b2c785863d6fbed9 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 23 Dec 2020 15:57:20 -0800 Subject: [PATCH 4/5] REF: share Index.union --- pandas/core/indexes/base.py | 22 +++++-- pandas/core/indexes/datetimelike.py | 6 +- pandas/core/indexes/multi.py | 89 ----------------------------- pandas/core/indexes/period.py | 11 ---- pandas/core/indexes/range.py | 3 - 5 files changed, 19 insertions(+), 112 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8d48a6277d412..3bb2aa3f7023e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2662,6 +2662,14 @@ def union(self, other, sort=None): other, result_name = self._convert_can_do_setop(other) if not is_dtype_equal(self.dtype, other.dtype): + if isinstance(self, ABCMultiIndex) and not is_object_dtype( + unpack_nested_dtype(other) + ): + raise NotImplementedError( + "Can only union MultiIndex with MultiIndex or Index of tuples, " + "try mi.to_flat_index().union(other) instead." + ) + dtype = find_common_type([self.dtype, other.dtype]) if self._is_numeric_dtype and other._is_numeric_dtype: # Right now, we treat union(int, float) a bit special. @@ -2680,6 +2688,14 @@ def union(self, other, sort=None): right = other.astype(dtype, copy=False) return left.union(right, sort=sort) + elif not len(other) or self.equals(other): + # NB: whether this (and the `if not len(self)` check below) come before + # or after the is_dtype_equal check above affects the returned dtype + return self._get_reconciled_name_object(other) + + elif not len(self): + return other._get_reconciled_name_object(self) + result = self._union(other, sort=sort) return self._wrap_setop_result(other, result) @@ -2703,12 +2719,6 @@ def _union(self, other, sort): ------- Index """ - if not len(other) or self.equals(other): - return self - - if not len(self): - return other - # TODO(EA): setops-refactor, clean all this up lvals = self._values rvals = other._values diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 94c055e264e71..d673d1b43f729 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -711,6 +711,9 @@ def _can_fast_intersect(self: _T, other: _T) -> bool: # so intersection will preserve freq return True + elif not len(self) or not len(other): + return False + elif isinstance(self.freq, Tick): # We "line up" if and only if the difference between two of our points # is a multiple of our freq @@ -794,9 +797,6 @@ def _fast_union(self, other, sort=None): return left def _union(self, other, sort): - if not len(other) or self.equals(other) or not len(self): - return super()._union(other, sort=sort) - # We are called by `union`, which is responsible for this validation assert isinstance(other, type(self)) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index c7be66b596246..2b2b51d63a710 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3502,98 +3502,9 @@ def equal_levels(self, other) -> bool: # -------------------------------------------------------------------- # Set Methods - def union(self, other, sort=None): - """ - Form the union of two MultiIndex objects - - Parameters - ---------- - other : MultiIndex or array / Index of tuples - sort : False or None, default None - Whether to sort the resulting Index. - - * None : Sort the result, except when - - 1. `self` and `other` are equal. - 2. `self` has length 0. - 3. Some values in `self` or `other` cannot be compared. - A RuntimeWarning is issued in this case. - - * False : do not sort the result. - - .. versionadded:: 0.24.0 - - .. versionchanged:: 0.24.1 - - Changed the default value from ``True`` to ``None`` - (without change in behaviour). - - Returns - ------- - Index - - Examples - -------- - >>> idx1 = pd.MultiIndex.from_arrays( - ... [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] - ... ) - >>> idx1 - MultiIndex([(1, 'Red'), - (1, 'Blue'), - (2, 'Red'), - (2, 'Blue')], - ) - >>> idx2 = pd.MultiIndex.from_arrays( - ... [[3, 3, 2, 2], ["Red", "Green", "Red", "Green"]] - ... ) - >>> idx2 - MultiIndex([(3, 'Red'), - (3, 'Green'), - (2, 'Red'), - (2, 'Green')], - ) - - >>> idx1.union(idx2) - MultiIndex([(1, 'Blue'), - (1, 'Red'), - (2, 'Blue'), - (2, 'Green'), - (2, 'Red'), - (3, 'Green'), - (3, 'Red')], - ) - - >>> idx1.union(idx2, sort=False) - MultiIndex([(1, 'Red'), - (1, 'Blue'), - (2, 'Red'), - (2, 'Blue'), - (3, 'Red'), - (3, 'Green'), - (2, 'Green')], - ) - """ - self._validate_sort_keyword(sort) - self._assert_can_do_setop(other) - other, _ = self._convert_can_do_setop(other) - - if not len(other) or self.equals(other): - return self._get_reconciled_name_object(other) - - if not len(self): - return other._get_reconciled_name_object(self) - - return self._union(other, sort=sort) - def _union(self, other, sort): other, result_names = self._convert_can_do_setop(other) - if not self._should_compare(other): - raise NotImplementedError( - "Can only union MultiIndex with MultiIndex or Index of tuples, " - "try mi.to_flat_index().union(other) instead." - ) - # We could get here with CategoricalIndex other rvals = other._values.astype(object, copy=False) uniq_tuples = lib.fast_unique_multiple([self._values, rvals], sort=sort) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 4d48bc0d51912..d8a0cdb667ad8 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -656,17 +656,6 @@ def _difference(self, other, sort): return self._setop(other, sort, opname="difference") def _union(self, other, sort): - if not len(other) or self.equals(other) or not len(self): - return super()._union(other, sort=sort) - - # We are called by `union`, which is responsible for this validation - assert isinstance(other, type(self)) - - if not is_dtype_equal(self.dtype, other.dtype): - this = self.astype("O") - other = other.astype("O") - return this._union(other, sort=sort) - return self._setop(other, sort, opname="_union") # ------------------------------------------------------------------------ diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index e716605245da5..5e5280934dff4 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -576,9 +576,6 @@ def _union(self, other, sort): ------- union : Index """ - if not len(other) or self.equals(other) or not len(self): - return super()._union(other, sort=sort) - if isinstance(other, RangeIndex) and sort is None: start_s, step_s = self.start, self.step end_s = self.start + self.step * (len(self) - 1) From 40b1c8be5d3cc8499e3393a974d1e394524ac694 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 24 Dec 2020 10:07:49 -0800 Subject: [PATCH 5/5] move doctest examples --- pandas/core/indexes/base.py | 39 +++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3390dc3c23170..e425ee1a78de5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2656,6 +2656,45 @@ def union(self, other, sort=None): >>> idx2 = pd.Index([1, 2, 3, 4]) >>> idx1.union(idx2) Index(['a', 'b', 'c', 'd', 1, 2, 3, 4], dtype='object') + + MultiIndex case + + >>> idx1 = pd.MultiIndex.from_arrays( + ... [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] + ... ) + >>> idx1 + MultiIndex([(1, 'Red'), + (1, 'Blue'), + (2, 'Red'), + (2, 'Blue')], + ) + >>> idx2 = pd.MultiIndex.from_arrays( + ... [[3, 3, 2, 2], ["Red", "Green", "Red", "Green"]] + ... ) + >>> idx2 + MultiIndex([(3, 'Red'), + (3, 'Green'), + (2, 'Red'), + (2, 'Green')], + ) + >>> idx1.union(idx2) + MultiIndex([(1, 'Blue'), + (1, 'Red'), + (2, 'Blue'), + (2, 'Green'), + (2, 'Red'), + (3, 'Green'), + (3, 'Red')], + ) + >>> idx1.union(idx2, sort=False) + MultiIndex([(1, 'Red'), + (1, 'Blue'), + (2, 'Red'), + (2, 'Blue'), + (3, 'Red'), + (3, 'Green'), + (2, 'Green')], + ) """ self._validate_sort_keyword(sort) self._assert_can_do_setop(other)