diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
index afeb3fcc7764c..e3ee7d7c64c44 100644
--- a/doc/source/indexing.rst
+++ b/doc/source/indexing.rst
@@ -1504,6 +1504,18 @@ operators:
    a & b
    a - b
 
+Also available is the ``sym_diff`` (``^``) operation, which returns elements
+that appear in either ``idx1`` or ``idx2`` but not both. This is
+equivalent to the Index created by ``(idx1 - idx2) + (idx2 - idx1)``,
+with duplicates dropped.
+
+.. ipython:: python
+
+   idx1 = Index([1, 2, 3, 4])
+   idx2 = Index([2, 3, 4, 5])
+   idx1.sym_diff(idx2)
+   idx1 ^ idx2
+
 The ``isin`` method of Index objects
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/doc/source/release.rst b/doc/source/release.rst
index 6e1632f036f38..93c9812524278 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -54,6 +54,7 @@ New features
 ~~~~~~~~~~~~
 
 - Hexagonal bin plots from ``DataFrame.plot`` with ``kind='hexbin'`` (:issue:`5478`)
+- Added the ``sym_diff`` method to ``Index`` (:issue:`5543`)
 
 API Changes
 ~~~~~~~~~~~
diff --git a/pandas/core/index.py b/pandas/core/index.py
index a4eca1216ea84..8798a4dca472b 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -866,6 +866,9 @@ def __and__(self, other):
     def __or__(self, other):
         return self.union(other)
 
+    def __xor__(self, other):
+        return self.sym_diff(other)
+
     def union(self, other):
         """
         Form the union of two Index objects and sorts if possible
@@ -973,16 +976,20 @@ def diff(self, other):
         """
         Compute sorted set difference of two Index objects
 
+        Parameters
+        ----------
+        other : Index or array-like
+
+        Returns
+        -------
+        diff : Index
+
         Notes
         -----
         One can do either of these and achieve the same result
 
         >>> index - index2
         >>> index.diff(index2)
-
-        Returns
-        -------
-        diff : Index
         """
 
         if not hasattr(other, '__iter__'):
@@ -1000,6 +1007,49 @@ def diff(self, other):
         theDiff = sorted(set(self) - set(other))
         return Index(theDiff, name=result_name)
 
+    def sym_diff(self, other, result_name=None):
+        """
+        Compute the sorted symmetric difference of two Index objects.
+
+        Parameters
+        ----------
+        other : Index or array-like
+        result_name : str, optional
+            Result name; falls back to ``self.name`` for non-Index ``other``.
+
+        Returns
+        -------
+        sym_diff : Index
+
+        Notes
+        -----
+        ``sym_diff`` contains elements that appear in either ``idx1`` or
+        ``idx2`` but not both. Equivalent to the Index created by
+        ``(idx1 - idx2) + (idx2 - idx1)`` with duplicates dropped.
+
+        Examples
+        --------
+        >>> idx1 = Index([1, 2, 3, 4])
+        >>> idx2 = Index([2, 3, 4, 5])
+        >>> idx1.sym_diff(idx2)
+        Int64Index([1, 5], dtype='int64')
+
+        You can also use the ``^`` operator:
+
+        >>> idx1 ^ idx2
+        Int64Index([1, 5], dtype='int64')
+        """
+        if not hasattr(other, '__iter__'):
+            raise TypeError('Input must be iterable!')
+
+        if not isinstance(other, Index):
+            other = Index(other)
+            result_name = result_name or self.name
+
+        the_diff = sorted(set((self - other) + (other - self)))
+        return Index(the_diff, name=result_name)
+
+
     def unique(self):
         """
         Return array of unique values in the Index. Significantly faster than
diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
index f938066011e06..59cec4f733b82 100644
--- a/pandas/tests/test_index.py
+++ b/pandas/tests/test_index.py
@@ -471,6 +471,52 @@ def test_diff(self):
         # non-iterable input
         assertRaisesRegexp(TypeError, "iterable", first.diff, 0.5)
 
+    def test_symmetric_diff(self):
+        # smoke
+        idx1 = Index([1, 2, 3, 4], name='idx1')
+        idx2 = Index([2, 3, 4, 5])
+        result = idx1.sym_diff(idx2)
+        expected = Index([1, 5])
+        self.assert_(tm.equalContents(result, expected))
+        self.assert_(result.name is None)
+
+        # __xor__ syntax: rebind ``result`` so both checks exercise ``^``
+        result = idx1 ^ idx2
+        self.assert_(tm.equalContents(result, expected))
+        self.assert_(result.name is None)
+
+        # multiIndex
+        idx1 = MultiIndex.from_tuples(self.tuples)
+        idx2 = MultiIndex.from_tuples([('foo', 1), ('bar', 3)])
+        result = idx1.sym_diff(idx2)
+        expected = MultiIndex.from_tuples([('bar', 2), ('baz', 3), ('bar', 3)])
+        self.assert_(tm.equalContents(result, expected))
+
+        # nans:
+        idx1 = Index([1, 2, np.nan])
+        idx2 = Index([0, 1, np.nan])
+        result = idx1.sym_diff(idx2)
+        expected = Index([0.0, np.nan, 2.0, np.nan])  # oddness with nans
+        nans = pd.isnull(expected)
+        self.assert_(pd.isnull(result[nans]).all())
+        self.assert_(tm.equalContents(result[~nans], expected[~nans]))
+
+        # other not an Index:
+        idx1 = Index([1, 2, 3, 4], name='idx1')
+        idx2 = np.array([2, 3, 4, 5])
+        expected = Index([1, 5])
+        result = idx1.sym_diff(idx2)
+        self.assert_(tm.equalContents(result, expected))
+        self.assertEquals(result.name, 'idx1')
+
+        result = idx1.sym_diff(idx2, result_name='new_name')
+        self.assert_(tm.equalContents(result, expected))
+        self.assertEquals(result.name, 'new_name')
+
+        # other isn't iterable
+        with tm.assertRaises(TypeError):
+            idx1.sym_diff(1)
+
     def test_pickle(self):
         def testit(index):
             pickled = pickle.dumps(index)