diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index 047404e93914b..948350df140eb 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -15,10 +15,84 @@ Whats New in 0.24.1 (February XX, 2019) These are the changes in pandas 0.24.1. See :ref:`release` for a full changelog including other versions of pandas. +.. _whatsnew_0241.api: + +API Changes +~~~~~~~~~~~ + +Changing the ``sort`` parameter for :meth:`Index.union` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The default ``sort`` value for :meth:`Index.union` has changed from ``True`` to ``None`` (:issue:`24959`). +The default *behavior* remains the same: The result is sorted, unless + +1. ``self`` and ``other`` are identical +2. ``self`` or ``other`` is empty +3. ``self`` or ``other`` contains values that cannot be compared (a ``RuntimeWarning`` is issued). + +This allows ``sort=True`` to now mean "always sort". A ``TypeError`` is raised if the values cannot be compared. + +**Behavior in 0.24.0** + +.. ipython:: python + + In [1]: idx = pd.Index(['b', 'a']) + + In [2]: idx.union(idx) # sort=True was the default. + Out[2]: Index(['b', 'a'], dtype='object') + + In [3]: idx.union(idx, sort=True) # result is still not sorted. + Out[3]: Index(['b', 'a'], dtype='object') + +**New Behavior** + +.. ipython:: python + + idx = pd.Index(['b', 'a']) + idx.union(idx) # sort=None is the default. Don't sort identical operands. + + idx.union(idx, sort=True) + +The same change applies to :meth:`Index.difference` and :meth:`Index.symmetric_difference`, which +would previously not sort the result when ``sort=True`` but the values could not be compared. + +Changed the behavior of :meth:`Index.intersection` with ``sort=True`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When ``sort=True`` is provided to :meth:`Index.intersection`, the values are always sorted. 
In 0.24.0, +the values would not be sorted when ``self`` and ``other`` were identical. Pass ``sort=False`` to not +sort the values. This matches the behavior of pandas 0.23.4 and earlier. + +**Behavior in 0.23.4** + +.. ipython:: python + + In [2]: idx = pd.Index(['b', 'a']) + + In [3]: idx.intersection(idx) # sort was not a keyword. + Out[3]: Index(['b', 'a'], dtype='object') + +**Behavior in 0.24.0** + +.. ipython:: python + + In [5]: idx.intersection(idx) # sort=True by default. Don't sort identical. + Out[5]: Index(['b', 'a'], dtype='object') + + In [6]: idx.intersection(idx, sort=True) + Out[6]: Index(['b', 'a'], dtype='object') + +**New Behavior** + +.. ipython:: python + + idx.intersection(idx) # sort=False by default + idx.intersection(idx, sort=True) + .. _whatsnew_0241.regressions: Fixed Regressions -^^^^^^^^^^^^^^^^^ +~~~~~~~~~~~~~~~~~ - Bug in :meth:`DataFrame.itertuples` with ``records`` orient raising an ``AttributeError`` when the ``DataFrame`` contained more than 255 columns (:issue:`24939`) - Bug in :meth:`DataFrame.itertuples` orient converting integer column names to strings prepended with an underscore (:issue:`24940`) @@ -28,7 +102,7 @@ Fixed Regressions .. _whatsnew_0241.enhancements: Enhancements -^^^^^^^^^^^^ +~~~~~~~~~~~~ .. _whatsnew_0241.bug_fixes: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4a3440e14ba14..c9473149d8a84 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -233,11 +233,14 @@ def fast_unique_multiple(list arrays, sort: bool=True): if val not in table: table[val] = stub uniques.append(val) - if sort: + if sort is None: try: uniques.sort() except Exception: + # TODO: RuntimeWarning? 
pass + elif sort: + uniques.sort() return uniques diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3d176012df22b..12880ed93cc2a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2245,18 +2245,34 @@ def _get_reconciled_name_object(self, other): return self._shallow_copy(name=name) return self - def union(self, other, sort=True): + def union(self, other, sort=None): """ Form the union of two Index objects. Parameters ---------- other : Index or array-like - sort : bool, default True - Sort the resulting index if possible + sort : bool or None, default None + Whether to sort the resulting Index. + + * None : Sort the result, except when + + 1. `self` and `other` are equal. + 2. `self` or `other` has length 0. + 3. Some values in `self` or `other` cannot be compared. + A RuntimeWarning is issued in this case. + + * True : sort the result. A TypeError is raised when the + values cannot be compared. + * False : do not sort the result. .. versionadded:: 0.24.0 + .. versionchanged:: 0.24.1 + + Changed the default `sort` to None, matching the + behavior of pandas 0.23.4 and earlier. + Returns ------- union : Index @@ -2273,10 +2289,16 @@ def union(self, other, sort=True): other = ensure_index(other) if len(other) == 0 or self.equals(other): - return self._get_reconciled_name_object(other) + result = self._get_reconciled_name_object(other) + if sort: + result = result.sort_values() + return result if len(self) == 0: - return other._get_reconciled_name_object(self) + result = other._get_reconciled_name_object(self) + if sort: + result = result.sort_values() + return result # TODO: is_dtype_union_equal is a hack around # 1. 
buggy set ops with duplicates (GH #13432) @@ -2319,13 +2341,16 @@ def union(self, other, sort=True): else: result = lvals - if sort: + if sort is None: try: result = sorting.safe_sort(result) except TypeError as e: warnings.warn("{}, sort order is undefined for " "incomparable objects".format(e), RuntimeWarning, stacklevel=3) + elif sort: + # raise if not sortable. + result = sorting.safe_sort(result) # for subclasses return self._wrap_setop_result(other, result) @@ -2342,8 +2367,12 @@ def intersection(self, other, sort=False): Parameters ---------- other : Index or array-like - sort : bool, default False - Sort the resulting index if possible + sort : bool or None, default False + Whether to sort the resulting index. + + * False : do not sort the result. + * True : sort the result. A TypeError is raised when the + values cannot be compared. .. versionadded:: 0.24.0 @@ -2367,7 +2396,10 @@ def intersection(self, other, sort=False): other = ensure_index(other) if self.equals(other): - return self._get_reconciled_name_object(other) + result = self._get_reconciled_name_object(other) + if sort: + result = result.sort_values() + return result if not is_dtype_equal(self.dtype, other.dtype): this = self.astype('O') @@ -2415,7 +2447,7 @@ def intersection(self, other, sort=False): return taken - def difference(self, other, sort=True): + def difference(self, other, sort=None): """ Return a new Index with elements from the index that are not in `other`. @@ -2425,11 +2457,24 @@ def difference(self, other, sort=True): Parameters ---------- other : Index or array-like - sort : bool, default True - Sort the resulting index if possible + sort : bool or None, default None + Whether to sort the resulting index. By default, the + values are attempted to be sorted, but any TypeError from + incomparable elements is caught by pandas. + + * None : Attempt to sort the result, but catch any TypeErrors + from comparing incomparable elements. + * False : Do not sort the result. 
+ * True : Sort the result, raising a TypeError if any elements + cannot be compared. .. versionadded:: 0.24.0 + .. versionchanged:: 0.24.1 + + Added the `None` option, which matches the behavior of + pandas 0.23.4 and earlier. + Returns ------- difference : Index @@ -2460,15 +2505,17 @@ def difference(self, other, sort=True): label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) the_diff = this.values.take(label_diff) - if sort: + if sort is None: try: the_diff = sorting.safe_sort(the_diff) except TypeError: pass + elif sort: + the_diff = sorting.safe_sort(the_diff) return this._shallow_copy(the_diff, name=result_name, freq=None) - def symmetric_difference(self, other, result_name=None, sort=True): + def symmetric_difference(self, other, result_name=None, sort=None): """ Compute the symmetric difference of two Index objects. @@ -2476,11 +2523,24 @@ def symmetric_difference(self, other, result_name=None, sort=True): ---------- other : Index or array-like result_name : str - sort : bool, default True - Sort the resulting index if possible + sort : bool or None, default None + Whether to sort the resulting index. By default, the + values are attempted to be sorted, but any TypeError from + incomparable elements is caught by pandas. + + * None : Attempt to sort the result, but catch any TypeErrors + from comparing incomparable elements. + * False : Do not sort the result. + * True : Sort the result, raising a TypeError if any elements + cannot be compared. .. versionadded:: 0.24.0 + .. versionchanged:: 0.24.1 + + Added the `None` option, which matches the behavior of + pandas 0.23.4 and earlier. 
+ Returns ------- symmetric_difference : Index @@ -2524,11 +2584,13 @@ def symmetric_difference(self, other, result_name=None, sort=True): right_diff = other.values.take(right_indexer) the_diff = _concat._concat_compat([left_diff, right_diff]) - if sort: + if sort is None: try: the_diff = sorting.safe_sort(the_diff) except TypeError: pass + elif sort: + the_diff = sorting.safe_sort(the_diff) attribs = self._get_attributes_dict() attribs['name'] = result_name diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 16af3fe8eef26..32a5a09359019 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2879,18 +2879,34 @@ def equal_levels(self, other): return False return True - def union(self, other, sort=True): + def union(self, other, sort=None): """ Form the union of two MultiIndex objects Parameters ---------- other : MultiIndex or array / Index of tuples - sort : bool, default True - Sort the resulting MultiIndex if possible + sort : bool or None, default None + Whether to sort the resulting Index. + + * None : Sort the result, except when + + 1. `self` and `other` are equal. + 2. `self` has length 0. + 3. Some values in `self` or `other` cannot be compared. + A RuntimeWarning is issued in this case. + + * True : sort the result. A TypeError is raised when the + values cannot be compared. + * False : do not sort the result. .. versionadded:: 0.24.0 + .. versionchanged:: 0.24.1 + + Changed the default `sort` to None, matching the + behavior of pandas 0.23.4 and earlier. + Returns ------- Index @@ -2901,8 +2917,12 @@ def union(self, other, sort=True): other, result_names = self._convert_can_do_setop(other) if len(other) == 0 or self.equals(other): + if sort: + return self.sort_values() return self + # TODO: Index.union returns other when `len(self)` is 0. 
+ uniq_tuples = lib.fast_unique_multiple([self._ndarray_values, other._ndarray_values], sort=sort) @@ -2917,7 +2937,7 @@ def intersection(self, other, sort=False): Parameters ---------- other : MultiIndex or array / Index of tuples - sort : bool, default True + sort : bool, default False Sort the resulting MultiIndex if possible .. versionadded:: 0.24.0 @@ -2934,6 +2954,8 @@ def intersection(self, other, sort=False): other, result_names = self._convert_can_do_setop(other) if self.equals(other): + if sort: + return self.sort_values() return self self_tuples = self._ndarray_values @@ -2951,7 +2973,7 @@ def intersection(self, other, sort=False): return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0, names=result_names) - def difference(self, other, sort=True): + def difference(self, other, sort=None): """ Compute set difference of two MultiIndex objects @@ -2971,6 +2993,8 @@ def difference(self, other, sort=True): other, result_names = self._convert_can_do_setop(other) if len(other) == 0: + if sort: + return self.sort_values() return self if self.equals(other): diff --git a/pandas/tests/indexes/multi/test_set_ops.py b/pandas/tests/indexes/multi/test_set_ops.py index 208d6cf1c639f..6a42e29aa8f5c 100644 --- a/pandas/tests/indexes/multi/test_set_ops.py +++ b/pandas/tests/indexes/multi/test_set_ops.py @@ -174,7 +174,10 @@ def test_difference(idx, sort): # name from empty array result = first.difference([], sort=sort) - assert first.equals(result) + if sort: + assert first.sort_values().equals(result) + else: + assert first.equals(result) assert first.names == result.names # name from non-empty array @@ -189,6 +192,36 @@ def test_difference(idx, sort): first.difference([1, 2, 3, 4, 5], sort=sort) +def test_difference_sort_special(): + idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) + # sort=None, the default + result = idx.difference([]) + tm.assert_index_equal(result, idx) + + result = idx.difference([], sort=True) + expected = 
pd.MultiIndex.from_product([[0, 1], ['a', 'b']]) + tm.assert_index_equal(result, expected) + + +def test_difference_sort_incomparable(): + idx = pd.MultiIndex.from_product([[1, pd.Timestamp('2000'), 2], + ['a', 'b']]) + + other = pd.MultiIndex.from_product([[3, pd.Timestamp('2000'), 4], + ['c', 'd']]) + # sort=None, the default + # result = idx.difference(other) + # tm.assert_index_equal(result, idx) + + # sort=False + result = idx.difference(other) + tm.assert_index_equal(result, idx) + + # sort=True, raises + with pytest.raises(TypeError): + idx.difference(other, sort=True) + + @pytest.mark.parametrize("sort", [True, False]) def test_union(idx, sort): piece1 = idx[:5][::-1] @@ -203,10 +236,16 @@ def test_union(idx, sort): # corner case, pass self or empty thing: the_union = idx.union(idx, sort=sort) - assert the_union is idx + if sort: + tm.assert_index_equal(the_union, idx.sort_values()) + else: + assert the_union is idx the_union = idx.union(idx[:0], sort=sort) - assert the_union is idx + if sort: + tm.assert_index_equal(the_union, idx.sort_values()) + else: + assert the_union is idx # won't work in python 3 # tuples = _index.values @@ -238,7 +277,10 @@ def test_intersection(idx, sort): # corner case, pass self the_int = idx.intersection(idx, sort=sort) - assert the_int is idx + if sort: + tm.assert_index_equal(the_int, idx.sort_values()) + else: + assert the_int is idx # empty intersection: disjoint empty = idx[:2].intersection(idx[2:], sort=sort) @@ -249,3 +291,47 @@ def test_intersection(idx, sort): # tuples = _index.values # result = _index & tuples # assert result.equals(tuples) + + +def test_intersect_equal_sort(): + idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) + sorted_ = pd.MultiIndex.from_product([[0, 1], ['a', 'b']]) + tm.assert_index_equal(idx.intersection(idx, sort=False), idx) + tm.assert_index_equal(idx.intersection(idx, sort=True), sorted_) + + +@pytest.mark.parametrize('slice_', [slice(None), slice(0)]) +def 
test_union_sort_other_empty(slice_): + # https://github.com/pandas-dev/pandas/issues/24959 + idx = pd.MultiIndex.from_product([[1, 0], ['a', 'b']]) + + # default, sort=None + other = idx[slice_] + tm.assert_index_equal(idx.union(other), idx) + # MultiIndex does not special case empty.union(idx) + # tm.assert_index_equal(other.union(idx), idx) + + # sort=False + tm.assert_index_equal(idx.union(other, sort=False), idx) + + # sort=True + result = idx.union(other, sort=True) + expected = pd.MultiIndex.from_product([[0, 1], ['a', 'b']]) + tm.assert_index_equal(result, expected) + + +def test_union_sort_other_incomparable(): + # https://github.com/pandas-dev/pandas/issues/24959 + idx = pd.MultiIndex.from_product([[1, pd.Timestamp('2000')], ['a', 'b']]) + + # default, sort=None + result = idx.union(idx[:1]) + tm.assert_index_equal(result, idx) + + # sort=False + result = idx.union(idx[:1], sort=False) + tm.assert_index_equal(result, idx) + + # sort=True + with pytest.raises(TypeError, match='Cannot compare'): + idx.union(idx[:1], sort=True) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 20e439de46bde..4e8555cbe1aab 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -3,6 +3,7 @@ from collections import defaultdict from datetime import datetime, timedelta import math +import operator import sys import numpy as np @@ -695,7 +696,10 @@ def test_intersection(self, sort): # Corner cases inter = first.intersection(first, sort=sort) - assert inter is first + if sort: + tm.assert_index_equal(inter, first.sort_values()) + else: + assert inter is first @pytest.mark.parametrize("index2,keeps_name", [ (Index([3, 4, 5, 6, 7], name="index"), True), # preserve same name @@ -770,6 +774,12 @@ def test_intersect_nosort(self): expected = pd.Index(['b', 'a']) tm.assert_index_equal(result, expected) + def test_intersect_equal_sort(self): + idx = pd.Index(['c', 'a', 'b']) + sorted_ = pd.Index(['a', 'b', 'c']) + 
tm.assert_index_equal(idx.intersection(idx, sort=False), idx) + tm.assert_index_equal(idx.intersection(idx, sort=True), sorted_) + @pytest.mark.parametrize("sort", [True, False]) def test_chained_union(self, sort): # Chained unions handles names correctly @@ -799,6 +809,41 @@ def test_union(self, sort): tm.assert_index_equal(union, everything.sort_values()) assert tm.equalContents(union, everything) + @pytest.mark.parametrize('slice_', [slice(None), slice(0)]) + def test_union_sort_other_special(self, slice_): + # https://github.com/pandas-dev/pandas/issues/24959 + + idx = pd.Index([1, 0, 2]) + # default, sort=None + other = idx[slice_] + tm.assert_index_equal(idx.union(other), idx) + tm.assert_index_equal(other.union(idx), idx) + + # sort=False + tm.assert_index_equal(idx.union(other, sort=False), idx) + + # sort=True + result = idx.union(other, sort=True) + expected = pd.Index([0, 1, 2]) + tm.assert_index_equal(result, expected) + + def test_union_sort_other_incomparable(self): + # https://github.com/pandas-dev/pandas/issues/24959 + idx = pd.Index([1, pd.Timestamp('2000')]) + # default, sort=None + with tm.assert_produces_warning(RuntimeWarning): + result = idx.union(idx[:1]) + + tm.assert_index_equal(result, idx) + + # sort=True + with pytest.raises(TypeError, match='.*'): + idx.union(idx[:1], sort=True) + + # sort=False + result = idx.union(idx[:1], sort=False) + tm.assert_index_equal(result, idx) + @pytest.mark.parametrize("klass", [ np.array, Series, list]) @pytest.mark.parametrize("sort", [True, False]) @@ -815,19 +860,20 @@ def test_union_from_iterables(self, klass, sort): tm.assert_index_equal(result, everything.sort_values()) assert tm.equalContents(result, everything) - @pytest.mark.parametrize("sort", [True, False]) + @pytest.mark.parametrize("sort", [None, True, False]) def test_union_identity(self, sort): # TODO: replace with fixturesult first = self.strIndex[5:20] union = first.union(first, sort=sort) - assert union is first + # i.e. 
identity is not preserved when sort is True + assert (union is first) is (not sort) union = first.union([], sort=sort) - assert union is first + assert (union is first) is (not sort) union = Index([]).union(first, sort=sort) - assert union is first + assert (union is first) is (not sort) @pytest.mark.parametrize("first_list", [list('ba'), list()]) @pytest.mark.parametrize("second_list", [list('ab'), list()]) @@ -1054,6 +1100,29 @@ def test_symmetric_difference(self, sort): assert tm.equalContents(result, expected) assert result.name is None + @pytest.mark.parametrize('opname', ['difference', 'symmetric_difference']) + def test_difference_incomparable(self, opname): + a = pd.Index([3, pd.Timestamp('2000'), 1]) + b = pd.Index([2, pd.Timestamp('1999'), 1]) + op = operator.methodcaller(opname, b) + + # sort=None, the default + result = op(a) + expected = pd.Index([3, pd.Timestamp('2000'), 2, pd.Timestamp('1999')]) + if opname == 'difference': + expected = expected[:2] + tm.assert_index_equal(result, expected) + + # sort=False + op = operator.methodcaller(opname, b, sort=False) + result = op(a) + tm.assert_index_equal(result, expected) + + # sort=True, raises + op = operator.methodcaller(opname, b, sort=True) + with pytest.raises(TypeError, match='Cannot compare'): + op(a) + @pytest.mark.parametrize("sort", [True, False]) def test_symmetric_difference_mi(self, sort): index1 = MultiIndex.from_tuples(self.tuples)