diff --git a/doc/source/api.rst b/doc/source/api.rst
index ec6e2aff870c6..feb4da700354d 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -374,6 +374,8 @@ Reindexing / Selection / Label manipulation
    Series.align
    Series.drop
+   Series.drop_duplicates
+   Series.duplicated
    Series.equals
    Series.first
    Series.head
@@ -1165,6 +1167,8 @@ Modifying and Computations
    Index.diff
    Index.sym_diff
    Index.drop
+   Index.drop_duplicates
+   Index.duplicated
    Index.equals
    Index.factorize
    Index.identical
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
index c6e784ac93e92..db3fea7d1e24f 100644
--- a/doc/source/v0.15.0.txt
+++ b/doc/source/v0.15.0.txt
@@ -179,6 +179,15 @@ API changes
 - Histogram from ``DataFrame.plot`` with ``kind='hist'`` (:issue:`7809`), See :ref:`the docs`.
 
+- ``Index`` now supports ``duplicated`` and ``drop_duplicates``. (:issue:`4060`)
+
+  .. ipython:: python
+
+     idx = Index([1, 2, 3, 4, 1, 2])
+     idx
+     idx.duplicated()
+     idx.drop_duplicates()
+
 .. _whatsnew_0150.dt:
 
 .dt accessor
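For a fuller picture of the new ``Index`` API than the whatsnew snippet shows, the ``take_last`` round trip looks roughly like this (an illustrative sketch, not output captured from a patched build):

```python
from pandas import Index

idx = Index([1, 2, 3, 4, 1, 2])

# default: the first occurrence is kept, later repeats are flagged
idx.duplicated()                     # [False, False, False, False, True, True]
idx.drop_duplicates()                # Index([1, 2, 3, 4])

# take_last=True: the last occurrence is kept instead
idx.duplicated(take_last=True)       # [True, True, False, False, False, False]
idx.drop_duplicates(take_last=True)  # Index([3, 4, 1, 2])
```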
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 1655d2a4e4e23..348fb4f23cefc 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -8,8 +8,14 @@
 from pandas.core import common as com
 import pandas.core.nanops as nanops
 import pandas.tslib as tslib
+import pandas.lib as lib
 from pandas.util.decorators import Appender, cache_readonly
 
+
+_shared_docs = dict()
+_indexops_doc_kwargs = dict(klass='IndexOpsMixin', inplace='')
+
+
 class StringMixin(object):
 
     """implements string methods so long as object defines a `__unicode__`
@@ -474,12 +480,66 @@ def searchsorted(self, key, side='left'):
         #### needs tests/doc-string
         return self.values.searchsorted(key, side=side)
 
+    _shared_docs['drop_duplicates'] = (
+        """Return %(klass)s with duplicate values removed
+
+        Parameters
+        ----------
+        take_last : boolean, default False
+            Take the last observed index in a group. Default first
+        %(inplace)s
+
+        Returns
+        -------
+        deduplicated : %(klass)s
+        """)
+
+    @Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs)
+    def drop_duplicates(self, take_last=False, inplace=False):
+        duplicated = self.duplicated(take_last=take_last)
+        result = self[~duplicated.values]
+        if inplace:
+            return self._update_inplace(result)
+        else:
+            return result
+
+    _shared_docs['duplicated'] = (
+        """Return boolean %(klass)s denoting duplicate values
+
+        Parameters
+        ----------
+        take_last : boolean, default False
+            Take the last observed index in a group. Default first
+
+        Returns
+        -------
+        duplicated : %(klass)s
+        """)
+
+    @Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs)
+    def duplicated(self, take_last=False):
+        keys = com._ensure_object(self.values)
+        duplicated = lib.duplicated(keys, take_last=take_last)
+        try:
+            return self._constructor(duplicated,
+                                     index=self.index).__finalize__(self)
+        except AttributeError:
+            from pandas.core.index import Index
+            return Index(duplicated)
+
     #----------------------------------------------------------------------
     # unbox reductions
 
     all = _unbox(np.ndarray.all)
     any = _unbox(np.ndarray.any)
 
+    #----------------------------------------------------------------------
+    # abstracts
+
+    def _update_inplace(self, result):
+        raise NotImplementedError
+
+
 class DatetimeIndexOpsMixin(object):
     """ common ops mixin to support a unified inteface datetimelike Index """
 
@@ -497,7 +557,6 @@ def _box_values(self, values):
         """
         apply box func to passed values
         """
-        import pandas.lib as lib
         return lib.map_infer(values, self._box_func)
 
     @cache_readonly
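``lib.duplicated`` is a Cython helper that performs the actual scan over the object keys; as a mental model, a minimal pure-Python equivalent (``naive_duplicated`` is a hypothetical name, assuming hashable values) behaves like this:

```python
def naive_duplicated(values, take_last=False):
    """Flag every occurrence of a value except the one that is kept:
    the first occurrence by default, the last when take_last=True."""
    seen = set()
    out = [False] * len(values)
    order = reversed(range(len(values))) if take_last else range(len(values))
    for i in order:
        if values[i] in seen:
            out[i] = True
        else:
            seen.add(values[i])
    return out

naive_duplicated([1, 2, 3, 4, 1, 2])                  # [False, False, False, False, True, True]
naive_duplicated([1, 2, 3, 4, 1, 2], take_last=True)  # [True, True, False, False, False, False]
```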
diff --git a/pandas/core/index.py b/pandas/core/index.py
index a58a3331f9759..0c0969cce8018 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -12,7 +12,7 @@
 import pandas.algos as _algos
 import pandas.index as _index
 from pandas.lib import Timestamp, is_datetime_array
-from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin
+from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs
 from pandas.util.decorators import Appender, cache_readonly, deprecate
 from pandas.core.common import isnull, array_equivalent
 import pandas.core.common as com
@@ -30,6 +30,8 @@
 _unsortable_types = frozenset(('mixed', 'mixed-integer'))
 
+_index_doc_kwargs = dict(klass='Index', inplace='')
+
 
 def _try_get_item(x):
     try:
@@ -209,6 +211,10 @@ def _simple_new(cls, values, name=None, **kwargs):
         result._reset_identity()
         return result
 
+    def _update_inplace(self, result):
+        # guard when called from IndexOpsMixin
+        raise TypeError("Index can't be updated inplace")
+
     def is_(self, other):
         """
        More flexible, faster check like ``is`` but that works through views
@@ -2019,6 +2025,15 @@ def drop(self, labels):
             raise ValueError('labels %s not contained in axis' % labels[mask])
         return self.delete(indexer)
 
+    @Appender(_shared_docs['drop_duplicates'] % _index_doc_kwargs)
+    def drop_duplicates(self, take_last=False):
+        result = super(Index, self).drop_duplicates(take_last=take_last)
+        return self._constructor(result)
+
+    @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
+    def duplicated(self, take_last=False):
+        return super(Index, self).duplicated(take_last=take_last)
+
     @classmethod
     def _add_numeric_methods_disabled(cls):
         """ add in numeric methods to disable """
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 5a490992c478c..2f0e651bfc5b1 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -52,10 +52,13 @@
 
 __all__ = ['Series']
 
+
 _shared_doc_kwargs = dict(
     axes='index',
     klass='Series',
-    axes_single_arg="{0,'index'}"
+    axes_single_arg="{0,'index'}",
+    inplace="""inplace : boolean, default False
+        If True, performs operation inplace and returns None."""
 )
 
@@ -265,6 +268,9 @@ def _set_subtyp(self, is_all_dates):
         else:
             object.__setattr__(self, '_subtyp', 'series')
 
+    def _update_inplace(self, result):
+        return generic.NDFrame._update_inplace(self, result)
+
     # ndarray compatibility
     @property
     def dtype(self):
@@ -1114,45 +1120,14 @@ def mode(self):
         from pandas.core.algorithms import mode
         return mode(self)
 
+    @Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs)
     def drop_duplicates(self, take_last=False, inplace=False):
-        """
-        Return Series with duplicate values removed
-
-        Parameters
-        ----------
-        take_last : boolean, default False
-            Take the last observed index in a group. Default first
-        inplace : boolean, default False
-            If True, performs operation inplace and returns None.
-
-        Returns
-        -------
-        deduplicated : Series
-        """
-        duplicated = self.duplicated(take_last=take_last)
-        result = self[-duplicated]
-        if inplace:
-            return self._update_inplace(result)
-        else:
-            return result
+        return super(Series, self).drop_duplicates(take_last=take_last,
+                                                   inplace=inplace)
 
+    @Appender(base._shared_docs['duplicated'] % _shared_doc_kwargs)
     def duplicated(self, take_last=False):
-        """
-        Return boolean Series denoting duplicate values
-
-        Parameters
-        ----------
-        take_last : boolean, default False
-            Take the last observed index in a group. Default first
-
-        Returns
-        -------
-        duplicated : Series
-        """
-        keys = _ensure_object(self.values)
-        duplicated = lib.duplicated(keys, take_last=take_last)
-        return self._constructor(duplicated,
-                                 index=self.index).__finalize__(self)
+        return super(Series, self).duplicated(take_last=take_last)
 
     def idxmin(self, axis=None, out=None, skipna=True):
         """
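With ``Series`` now delegating to ``IndexOpsMixin``, the remaining behavioural split is ``inplace``: ``Series`` keeps it (routed through ``NDFrame._update_inplace``), while ``Index.drop_duplicates`` does not accept it and ``Index._update_inplace`` raises if anything reaches it. A small usage sketch under those assumptions:

```python
from pandas import Index, Series

s = Series([1, 1, 2, 3, 3], index=list('abcde'))

s.drop_duplicates()                              # keeps labels 'a', 'c', 'd'
s.drop_duplicates(take_last=True, inplace=True)  # mutates s in place
s.index.tolist()                                 # ['b', 'c', 'e']

idx = Index([1, 1, 2])
idx.drop_duplicates()                            # Index([1, 2])
# idx.drop_duplicates(inplace=True) raises TypeError: unexpected keyword argument
```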
diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py
index b171b31528a55..8b0605dd391be 100644
--- a/pandas/tests/test_base.py
+++ b/pandas/tests/test_base.py
@@ -339,9 +339,13 @@ def test_value_counts_unique_nunique(self):
                 # freq must be specified because repeat makes freq ambiguous
                 expected_index = o[::-1]
                 o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq)
-            else:
+            elif isinstance(o, Index):
                 expected_index = values[::-1]
                 o = klass(np.repeat(values, range(1, len(o) + 1)))
+            else:
+                expected_index = values[::-1]
+                idx = np.repeat(o.index.values, range(1, len(o) + 1))
+                o = klass(np.repeat(values, range(1, len(o) + 1)), index=idx)
 
             expected_s = Series(range(10, 0, -1), index=expected_index, dtype='int64')
             tm.assert_series_equal(o.value_counts(), expected_s)
@@ -374,11 +378,16 @@ def test_value_counts_unique_nunique(self):
 
             # create repeated values, 'n'th element is repeated by n+1 times
             if isinstance(o, PeriodIndex):
+                # freq must be specified because repeat makes freq ambiguous
                 expected_index = o
                 o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq)
-            else:
+            elif isinstance(o, Index):
                 expected_index = values
                 o = klass(np.repeat(values, range(1, len(o) + 1)))
+            else:
+                expected_index = values
+                idx = np.repeat(o.index.values, range(1, len(o) + 1))
+                o = klass(np.repeat(values, range(1, len(o) + 1)), index=idx)
 
             expected_s_na = Series(list(range(10, 2, -1)) +[3], index=expected_index[9:0:-1], dtype='int64')
             expected_s = Series(list(range(10, 2, -1)), index=expected_index[9:1:-1], dtype='int64')
@@ -571,6 +580,63 @@ def test_factorize(self):
             expected = o[5:].append(o[:5])
             self.assertTrue(uniques.equals(expected))
 
+    def test_duplicated_drop_duplicates(self):
+        # GH 4060
+        for original in self.objs:
+
+            if isinstance(original, Index):
+                # original doesn't have duplicates
+                expected = Index([False] * len(original))
+                tm.assert_index_equal(original.duplicated(), expected)
+                result = original.drop_duplicates()
+                tm.assert_index_equal(result, original)
+                self.assertFalse(result is original)
+
+                # create repeated values, 3rd and 5th values are duplicated
+                idx = original[list(range(len(original))) + [5, 3]]
+                expected = Index([False] * len(original) + [True, True])
+                tm.assert_index_equal(idx.duplicated(), expected)
+                tm.assert_index_equal(idx.drop_duplicates(), original)
+
+                last_base = [False] * len(idx)
+                last_base[3] = True
+                last_base[5] = True
+                expected = Index(last_base)
+                tm.assert_index_equal(idx.duplicated(take_last=True), expected)
+                tm.assert_index_equal(idx.drop_duplicates(take_last=True),
+                                      idx[~np.array(last_base)])
+
+                with tm.assertRaisesRegexp(TypeError,
+                                           "drop_duplicates\(\) got an unexpected keyword argument"):
+                    idx.drop_duplicates(inplace=True)
+
+            else:
+                expected = Series([False] * len(original), index=original.index)
+                tm.assert_series_equal(original.duplicated(), expected)
+                result = original.drop_duplicates()
+                tm.assert_series_equal(result, original)
+                self.assertFalse(result is original)
+
+                idx = original.index[list(range(len(original))) + [5, 3]]
+                values = original.values[list(range(len(original))) + [5, 3]]
+                s = Series(values, index=idx)
+
+                expected = Series([False] * len(original) + [True, True], index=idx)
+                tm.assert_series_equal(s.duplicated(), expected)
+                tm.assert_series_equal(s.drop_duplicates(), original)
+
+                last_base = [False] * len(idx)
+                last_base[3] = True
+                last_base[5] = True
+                expected = Series(last_base, index=idx)
+                expected
+                tm.assert_series_equal(s.duplicated(take_last=True), expected)
+                tm.assert_series_equal(s.drop_duplicates(take_last=True),
+                                       s[~np.array(last_base)])
+
+                s.drop_duplicates(inplace=True)
+                tm.assert_series_equal(s, original)
+
 
 class TestDatetimeIndexOps(Ops):
     tz = [None, 'UTC', 'Asia/Tokyo', 'US/Eastern',
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
index ed078ae5749de..a0c5d3ce5959a 100644
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -2031,6 +2031,20 @@ def test_duplicate_mi(self):
         result = df.loc[('foo','bar')]
         assert_frame_equal(result,expected)
 
+    def test_duplicated_drop_duplicates(self):
+        # GH 4060
+        idx = MultiIndex.from_arrays(([1, 2, 3, 1, 2 ,3], [1, 1, 1, 1, 2, 2]))
+
+        expected = Index([False, False, False, True, False, False])
+        tm.assert_index_equal(idx.duplicated(), expected)
+        expected = MultiIndex.from_arrays(([1, 2, 3, 2 ,3], [1, 1, 1, 2, 2]))
+        tm.assert_index_equal(idx.drop_duplicates(), expected)
+
+        expected = Index([True, False, False, False, False, False])
+        tm.assert_index_equal(idx.duplicated(take_last=True), expected)
+        expected = MultiIndex.from_arrays(([2, 3, 1, 2 ,3], [1, 1, 1, 2, 2]))
+        tm.assert_index_equal(idx.drop_duplicates(take_last=True), expected)
+
     def test_multiindex_set_index(self):
         # segfault in #3308
         d = {'t1': [2, 2.5, 3], 't2': [4, 5, 6]}
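The ``MultiIndex`` test above exercises the same ``IndexOpsMixin`` path; in terms of observable behaviour it amounts to roughly the following (a sketch mirroring the arrays used in the test):

```python
from pandas import MultiIndex

mi = MultiIndex.from_arrays(([1, 2, 3, 1, 2, 3], [1, 1, 1, 1, 2, 2]))

# only the pair (1, 1) repeats, at positions 0 and 3
mi.duplicated()                     # Index([False, False, False, True, False, False])
mi.drop_duplicates()                # keeps position 0: (1,1), (2,1), (3,1), (2,2), (3,2)

mi.duplicated(take_last=True)       # Index([True, False, False, False, False, False])
mi.drop_duplicates(take_last=True)  # keeps position 3: (2,1), (3,1), (1,1), (2,2), (3,2)
```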