Skip to content

ENH: Add sym_diff for index #6016

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 17, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions doc/source/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1504,6 +1504,18 @@ operators:
a & b
a - b

Also available is the ``sym_diff (^)`` operation, which returns elements
that appear in either ``idx1`` or ``idx2`` but not both. This is
equivalent to the Index created by ``(idx1 - idx2) + (idx2 - idx1)``,
with duplicates dropped.

.. ipython:: python

idx1 = Index([1, 2, 3, 4])
idx2 = Index([2, 3, 4, 5])
idx1.sym_diff(idx2)
idx1 ^ idx2

The ``isin`` method of Index objects
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down
1 change: 1 addition & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ New features
~~~~~~~~~~~~

- Hexagonal bin plots from ``DataFrame.plot`` with ``kind='hexbin'`` (:issue:`5478`)
- Added the ``sym_diff`` method to ``Index`` (:issue:`5543`)

API Changes
~~~~~~~~~~~
Expand Down
58 changes: 54 additions & 4 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -866,6 +866,9 @@ def __and__(self, other):
def __or__(self, other):
return self.union(other)

def __xor__(self, other):
    """Implement ``self ^ other`` by delegating to ``self.sym_diff(other)``."""
    symmetric = self.sym_diff(other)
    return symmetric

def union(self, other):
"""
Form the union of two Index objects and sorts if possible
Expand Down Expand Up @@ -973,16 +976,20 @@ def diff(self, other):
"""
Compute sorted set difference of two Index objects

Parameters
----------
other : Index or array-like

Returns
-------
diff : Index

Notes
-----
One can do either of these and achieve the same result

>>> index - index2
>>> index.diff(index2)

Returns
-------
diff : Index
"""

if not hasattr(other, '__iter__'):
Expand All @@ -1000,6 +1007,49 @@ def diff(self, other):
theDiff = sorted(set(self) - set(other))
return Index(theDiff, name=result_name)

def sym_diff(self, other, result_name=None):
    """
    Compute the sorted symmetric difference of two Index objects.

    Parameters
    ----------
    other : Index or array-like
        Anything that is not already an ``Index`` is converted with
        ``Index(other)`` before the comparison.
    result_name : str, optional
        Name given to the returned Index. If it is None and ``other`` had
        to be converted to an Index, the result takes the name of ``self``.
        If ``other`` is already an Index, the value is passed through
        unchanged (so the result is unnamed by default).

    Returns
    -------
    sym_diff : Index

    Raises
    ------
    TypeError
        If ``other`` has no ``__iter__`` attribute.

    Notes
    -----
    ``sym_diff`` contains elements that appear in either ``idx1`` or
    ``idx2`` but not both. Equivalent to the Index created by
    ``(idx1 - idx2) + (idx2 - idx1)`` with duplicates dropped.

    Examples
    --------
    >>> idx1 = Index([1, 2, 3, 4])
    >>> idx2 = Index([2, 3, 4, 5])
    >>> idx1.sym_diff(idx2)
    Int64Index([1, 5], dtype='int64')

    You can also use the ``^`` operator:

    >>> idx1 ^ idx2
    Int64Index([1, 5], dtype='int64')
    """
    if not hasattr(other, '__iter__'):
        raise TypeError('Input must be iterable!')

    if not isinstance(other, Index):
        other = Index(other)
        # Test against None instead of truthiness so that an explicitly
        # requested falsy name (e.g. ``0`` or ``''``) is not replaced by
        # ``self.name``.
        if result_name is None:
            result_name = self.name

    # Combine both one-sided differences; set() drops any duplicates before
    # the values are sorted.
    the_diff = sorted(set((self - other) + (other - self)))
    return Index(the_diff, name=result_name)


def unique(self):
"""
Return array of unique values in the Index. Significantly faster than
Expand Down
46 changes: 46 additions & 0 deletions pandas/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,52 @@ def test_diff(self):
# non-iterable input
assertRaisesRegexp(TypeError, "iterable", first.diff, 0.5)

def test_symmetric_diff(self):
    """Exercise ``Index.sym_diff`` and its ``^`` operator form."""
    # smoke
    idx1 = Index([1, 2, 3, 4], name='idx1')
    idx2 = Index([2, 3, 4, 5])
    result = idx1.sym_diff(idx2)
    expected = Index([1, 5])
    self.assert_(tm.equalContents(result, expected))
    self.assert_(result.name is None)

    # __xor__ syntax: rebind ``result`` so the contents and the name that
    # are checked really come from the operator, not from the call above
    result = idx1 ^ idx2
    self.assert_(tm.equalContents(result, expected))
    self.assert_(result.name is None)

    # multiIndex
    idx1 = MultiIndex.from_tuples(self.tuples)
    idx2 = MultiIndex.from_tuples([('foo', 1), ('bar', 3)])
    result = idx1.sym_diff(idx2)
    expected = MultiIndex.from_tuples([('bar', 2), ('baz', 3), ('bar', 3)])
    self.assert_(tm.equalContents(result, expected))

    # nans:
    idx1 = Index([1, 2, np.nan])
    idx2 = Index([0, 1, np.nan])
    result = idx1.sym_diff(idx2)
    expected = Index([0.0, np.nan, 2.0, np.nan])  # oddness with nans
    # the mask is built from ``expected`` and applied to ``result``, so the
    # NaNs must sit at the same positions in both
    nans = pd.isnull(expected)
    self.assert_(pd.isnull(result[nans]).all())
    self.assert_(tm.equalContents(result[~nans], expected[~nans]))

    # other not an Index:
    idx1 = Index([1, 2, 3, 4], name='idx1')
    idx2 = np.array([2, 3, 4, 5])
    expected = Index([1, 5])
    result = idx1.sym_diff(idx2)
    self.assert_(tm.equalContents(result, expected))
    self.assertEquals(result.name, 'idx1')

    result = idx1.sym_diff(idx2, result_name='new_name')
    self.assert_(tm.equalContents(result, expected))
    self.assertEquals(result.name, 'new_name')

    # other isn't iterable: go through sym_diff and ``^`` themselves
    # (``idx1 - 1`` would only exercise ``diff``)
    with tm.assertRaises(TypeError):
        idx1.sym_diff(1)
    with tm.assertRaises(TypeError):
        idx1 ^ 1

def test_pickle(self):
def testit(index):
pickled = pickle.dumps(index)
Expand Down