From ebfffc20861db8dd2fd784fc8a1878db00f74edb Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 16 Jul 2018 18:20:08 +0200 Subject: [PATCH 1/7] ENH: add return_inverse kwarg to duplicated-method (rebased) --- asv_bench/benchmarks/frame_methods.py | 26 ++- asv_bench/benchmarks/index_object.py | 18 ++ asv_bench/benchmarks/multiindex_object.py | 13 +- asv_bench/benchmarks/series_methods.py | 18 ++ doc/source/whatsnew/v0.24.0.txt | 50 ++++++ pandas/core/algorithms.py | 59 ++++++- pandas/core/base.py | 31 +++- pandas/core/frame.py | 154 +++++++++++++++++- pandas/core/indexes/base.py | 49 +++++- pandas/core/indexes/category.py | 6 +- pandas/core/indexes/multi.py | 13 +- pandas/core/series.py | 120 ++++++++++---- pandas/tests/frame/test_duplicates.py | 69 ++++++++ pandas/tests/indexes/common.py | 76 +++++++++ pandas/tests/indexes/multi/test_duplicates.py | 104 +++++++++--- pandas/tests/series/test_duplicates.py | 63 +++++++ 16 files changed, 777 insertions(+), 92 deletions(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 1819cfa2725db..50789d64edda1 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -412,21 +412,35 @@ def time_frame_nunique(self): class Duplicated(object): goal_time = 0.2 + params = (['first', 'last', False], [True, False]) + param_names = ['keep', 'return_inverse'] + + def setup(self, keep, return_inverse): + if keep is False and return_inverse: + raise NotImplementedError - def setup(self): n = (1 << 20) t = date_range('2015-01-01', freq='S', periods=(n // 64)) xs = np.random.randn(n // 64).round(2) self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n), 'b': np.random.choice(t, n), 'c': np.random.choice(xs, n)}) - self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T + # df2 will not have any duplicates + self.df2 = DataFrame(np.random.randn(100, 1000).astype(str)) + + df3 = DataFrame(np.random.randint(0, 10, (2 ** 18, 5)), + columns=list('ABCDE')) + df3.loc[:, 'F'] = Series('', index=df3.index).str.cat(df3.astype(str)) + self.df3 = df3 + + def time_frame_duplicated(self, keep, return_inverse): + self.df.duplicated(keep=keep, return_inverse=return_inverse) - def time_frame_duplicated(self): - self.df.duplicated() + def time_frame_duplicated_wide(self, keep, return_inverse): + self.df2.duplicated(keep=keep, return_inverse=return_inverse) - def time_frame_duplicated_wide(self): - self.df2.duplicated() + def time_frame_duplicated_mixed(self, keep, return_inverse): + self.df3.duplicated(keep=keep, return_inverse=return_inverse) class XS(object): diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index f1703e163917a..a3a7f7f17d332 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -84,6 +84,24 @@ def time_modulo(self, dtype): self.index % 2 +class Duplicated(object): + + goal_time = 0.2 + params = (['first', 'last', False], [True, False]) + param_names = ['keep', 'return_inverse'] + + def setup(self, keep, return_inverse): + if keep is False and return_inverse: + raise NotImplementedError + + n, k = 200, 1000 + base = tm.makeStringIndex(n) + self.idx = Index(base[np.random.choice(n, k * n)]) + + def time_duplicated(self, keep, return_inverse): + self.idx.duplicated(keep=keep, return_inverse=return_inverse) + + class Range(object): goal_time = 0.2 diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 0c92214795557..ac73c2d9c72dc 
100644
--- a/asv_bench/benchmarks/multiindex_object.py
+++ b/asv_bench/benchmarks/multiindex_object.py
@@ -83,17 +83,22 @@ def time_is_monotonic(self):
 class Duplicated(object):
 
     goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']
 
-    def setup(self):
-        n, k = 200, 5000
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
+        n, k = 200, 1000
         levels = [np.arange(n), tm.makeStringIndex(n).values,
                   1000 + np.arange(n)]
         labels = [np.random.choice(n, (k * n)) for lev in levels]
         self.mi = MultiIndex(levels=levels, labels=labels)
 
-    def time_duplicated(self):
-        self.mi.duplicated()
+    def time_duplicated(self, keep, return_inverse):
+        self.mi.duplicated(keep=keep, return_inverse=return_inverse)
 
 
 class Sortlevel(object):
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index a26c5d89bc483..cc08355b61e88 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -192,3 +192,21 @@ def setup(self):
 
     def time_series_datetimeindex_repr(self):
         getattr(self.s, 'a', None)
+
+
+class Duplicated(object):
+
+    goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']
+
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
+        n, k = 200, 1000
+        base = tm.makeStringIndex(n)
+        self.s = Series(base[np.random.choice(n, k * n)])
+
+    def time_series_duplicated(self, keep, return_inverse):
+        self.s.duplicated(keep=keep, return_inverse=return_inverse)
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 317b8b8878308..c829c2bf1cc1f 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -159,6 +159,56 @@ This is the same behavior as ``Series.values`` for categorical data. See
 :ref:`whatsnew_0240.api_breaking.interval_values` for more.
 
+.. _whatsnew_0240.enhancements.duplicated_inverse:
+
+The ``duplicated``-method has gained the ``return_inverse`` kwarg
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``duplicated``-method for ``Series``, ``DataFrame`` and all flavors of ``Index`` has gained a ``return_inverse`` keyword,
+which is False by default. Specifying ``return_inverse=True`` will add an object to the output (which therefore becomes a tuple)
+that allows reconstructing the original object from the deduplicated, unique subset (:issue:`21357`).
+
+For ``Index`` objects, the inverse is an ``np.ndarray``:
+
+.. ipython:: python
+
+    idx = pd.Index(['a', 'b', 'b', 'c', 'a'])
+    idx.has_duplicates
+    isduplicate, inverse = idx.duplicated(return_inverse=True)  # default: keep='first'
+    isduplicate
+    inverse
+
+This allows reconstructing the original ``Index`` as follows:
+
+.. ipython:: python
+
+    unique = idx[~isduplicate]  # same as idx.drop_duplicates()
+    unique
+
+    reconstruct = unique[inverse]
+    reconstruct.equals(idx)
+
+For ``DataFrame`` and ``Series`` the inverse needs to take into account the original index as well, and is therefore a ``Series``,
+which contains the mapping from the original index to the index of the deduplicated, unique subset.
+
+..
ipython:: python + + df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']}, + index=[1, 4, 9, 16, 25]) + df + isduplicate, inverse = df.duplicated(keep='last', return_inverse=True) + isduplicate + inverse + + unique = df.loc[~isduplicate] # same as df.drop_duplicates(keep='last') + unique + reconstruct = unique.reindex(inverse.values).set_index(inverse.index) + reconstruct.equals(df) + +The keyword works as expected for ``keep='first'|'last'``, but cannot be used together with ``keep=False`` (since discarding all duplicates makes it impossible +to construct an inverse). + + .. _whatsnew_0240.enhancements.other: Other Enhancements diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index e5b6c84d37541..181d2fbc5f601 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -770,7 +770,7 @@ def _value_counts_arraylike(values, dropna): return keys, counts -def duplicated(values, keep='first'): +def duplicated(values, keep='first', return_inverse=False): """ Return boolean ndarray denoting duplicate values. @@ -785,16 +785,67 @@ def duplicated(values, keep='first'): occurrence. - ``last`` : Mark duplicates as ``True`` except for the last occurrence. - - False : Mark all duplicates as ``True``. + - False : Mark all duplicates as ``True``. This option is not + compatible with ``return_inverse``. + return_inverse : boolean, default False + If True, also return the selection of (integer) indices from the array + of unique values (created e.g. by selecting the boolean complement of + the first output, or by using `.drop_duplicates` with the same + `keep`-parameter) that can be used to reconstruct "values". + + .. versionadded:: 0.24.0 Returns ------- - duplicated : ndarray + duplicated : ndarray or tuple of ndarray if ``return_inverse`` is True """ + if return_inverse and keep is False: + raise ValueError("The parameters return_inverse=True and " + "keep=False cannot be used together (impossible " + "to calculate an inverse when discarding all " + "instances of a duplicate).") + values, dtype, ndtype = _ensure_data(values) f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype)) - return f(values, keep=keep) + isdup = f(values, keep=keep) + if not return_inverse: + return isdup + elif not isdup.any(): + # no need to calculate inverse if no duplicates + inv = np.arange(len(values)) + return isdup, inv + + if keep == 'first': + # o2u: original indices to indices of ARRAY of unique values + # u2o: reduplication from array of unique values to original array + # this fits together in the way that values[o2u] are the unique values + # and values[o2u][u2o] == values + _, o2u, u2o = np.unique(values, return_index=True, + return_inverse=True) + elif keep == 'last': + # np.unique takes first occurrence as unique value, + # so we flip values that first becomes last + values = values[::-1] + _, o2u, u2o = np.unique(values, return_index=True, + return_inverse=True) + # the values in "values" correspond(ed) to the index of "values", + # which is simply np.arange(len(values)). + # By flipping "values" around, we need to do the same for the index, + # ___because o2u and u2o are relative to that order___. + # Finally, to fit with the original order again, we need to flip the + # result around one last time. + o2u, u2o = np.arange(len(values))[::-1][o2u], u2o[::-1] + + # np.unique yields a ___sorted___ list of uniques, and o2u/u2o are relative + # to this order. 
To restore the original order, we argsort o2u, because o2u + # would be ordered if np.unique had not sorted implicitly. The first + # argsort gives the permutation from o2u to its sorted form, but we need + # the inverse permutation (the map from the unsorted uniques to o2u, from + # which we can continue with u2o). This inversion (as a permutation) is + # achieved by the second argsort. + inv = np.argsort(np.argsort(o2u))[u2o] + return isdup, inv def mode(values, dropna=True): diff --git a/pandas/core/base.py b/pandas/core/base.py index 084a976320d77..2ab7f92e2dc03 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1246,16 +1246,39 @@ def drop_duplicates(self, keep='first', inplace=False): else: return result - def duplicated(self, keep='first'): + def duplicated(self, keep='first', return_inverse=False): from pandas.core.algorithms import duplicated + + if return_inverse and keep is False: + raise ValueError("The parameters return_inverse=True and " + "keep=False cannot be used together (impossible " + "to calculate an inverse when discarding all " + "instances of a duplicate).") + if isinstance(self, ABCIndexClass): if self.is_unique: - return np.zeros(len(self), dtype=np.bool) - return duplicated(self, keep=keep) - else: + isdup = np.zeros(len(self), dtype=np.bool) + if not return_inverse: + return isdup + return isdup, np.arange(len(self)) + # core.algorithms.duplicated has the same output signature as + # Index.duplicated -> no need to distinguish cases here + return duplicated(self, keep=keep, return_inverse=return_inverse) + + # Series case + if not return_inverse: return self._constructor(duplicated(self, keep=keep), index=self.index).__finalize__(self) + # return_inverse = True + isdup_array, inv_array = duplicated(self, keep=keep, + return_inverse=True) + isdup = self._constructor(isdup_array, + index=self.index).__finalize__(self) + inv = self._constructor(self.loc[~isdup_array].index[inv_array], + index=self.index) + return isdup, inv + # ---------------------------------------------------------------------- # abstracts diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 052952103e28c..49f681771d4db 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4335,7 +4335,7 @@ def drop_duplicates(self, subset=None, keep='first', inplace=False): else: return self[-duplicated] - def duplicated(self, subset=None, keep='first'): + def duplicated(self, subset=None, keep='first', return_inverse=False): """ Return boolean Series denoting duplicate rows, optionally only considering certain columns @@ -4350,14 +4350,150 @@ def duplicated(self, subset=None, keep='first'): first occurrence. - ``last`` : Mark duplicates as ``True`` except for the last occurrence. - - False : Mark all duplicates as ``True``. + - False : Mark all duplicates as ``True``. This option is not + compatible with ``return_inverse``. + return_inverse : boolean, default False + If True, also return the selection from the index from the + DataFrame of unique values (created e.g. by selecting the boolean + complement of the first output, or by using `.drop_duplicates` with + the same `keep`-parameter) and how they relate to the index of the + current DataFrame. This allows to reconstruct the original + DataFrame from the subset of unique values, see example below. + + .. 
versionadded:: 0.24.0 Returns ------- - duplicated : Series + duplicated : Series or tuple of Series if return_inverse is True + + Examples + -------- + By default, for each set of duplicated values, the first occurrence is + set on False and all others on True: + + >>> data = {'species': ['lama', 'cow', 'lama', 'ant', 'lama', 'bee'], + 'type': ['mammal'] * 3 + ['insect', 'mammal', 'insect']} + >>> animals = pd.DataFrame(data, index=[1, 4, 9, 16, 25]) + >>> animals + species type + 1 lama mammal + 4 cow mammal + 9 lama mammal + 16 ant insect + 25 lama mammal + 36 bee insect + >>> + >>> animals.duplicated() # default: keep='first' + 1 False + 4 False + 9 True + 16 False + 25 True + 36 False + dtype: bool + + By using `'last'`, the last occurrence of each set of duplicated values + is set to False and all others to True: + + >>> animals.duplicated(keep='last') + 1 True + 4 False + 9 True + 16 False + 25 False + 36 False + dtype: bool + + By specifying `keep=False`, all duplicates are set to True: + + >>> animals.duplicated(keep=False) + 1 True + 4 False + 9 True + 16 False + 25 True + 36 False + dtype: bool + + By specifying the `subset`-keyword, the duplicates will be calculated + based on just the subset of columns given + + >>> animals.duplicated(subset=['type']) # default: keep='first' + 1 False + 4 True + 9 True + 16 False + 25 True + 36 True + dtype: bool + + Using the keyword `return_inverse=True`, the output becomes a tuple of + `Series`: + + >>> isduplicate, inverse = animals.duplicated(return_inverse=True) + >>> inverse + 1 1 + 4 4 + 9 1 + 16 16 + 25 1 + 36 36 + dtype: int64 + + This can be used to reconstruct the original object from its unique + elements as follows: + + >>> # same as animals.drop_duplicates() + >>> animals_unique = animals.loc[~isduplicate] + >>> animals_unique + species type + 1 lama mammal + 4 cow mammal + 16 ant insect + 36 bee insect + >>> + >>> reconstruct = animals_unique.reindex(inverse) + >>> reconstruct + species type + 1 lama mammal + 4 cow mammal + 1 lama mammal + 16 ant insect + 1 lama mammal + 36 bee insect + + We see that the values of `animals` get reconstructed correctly, but + the index does not match yet -- consequently, the last step is to + correctly set the index. 
+ + >>> reconstruct = reconstruct.set_index(inverse.index) + >>> reconstruct + species type + 1 lama mammal + 4 cow mammal + 9 lama mammal + 16 ant insect + 25 lama mammal + 36 bee insect + >>> + >>> reconstruct.equals(animals) + True + + See Also + -------- + pandas.Index.duplicated : Equivalent method on pandas.Index + pandas.Series.duplicated : Equivalent method on pandas.Series + pandas.DataFrame.drop_duplicates : Remove duplicate values """ from pandas.core.sorting import get_group_index - from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT + from pandas._libs.hashtable import _SIZE_HINT_LIMIT + from pandas.core.algorithms import duplicated + + if return_inverse and keep is False: + raise ValueError("The parameters return_inverse=True and " + "keep=False cannot be used together (impossible " + "to calculate an inverse when discarding all " + "instances of a duplicate).") def f(vals): labels, shape = algorithms.factorize( @@ -4383,7 +4519,15 @@ def f(vals): labels, shape = map(list, zip(*map(f, vals))) ids = get_group_index(labels, shape, sort=False, xnull=False) - return Series(duplicated_int64(ids, keep), index=self.index) + if not return_inverse: + return Series(duplicated(ids, keep=keep), index=self.index) + + # return_inverse = True + isdup_array, inv_array = duplicated(ids, keep=keep, + return_inverse=True) + isdup = Series(isdup_array, index=self.index) + inv = Series(self.loc[~isdup_array].index[inv_array], index=self.index) + return isdup, inv # ---------------------------------------------------------------------- # Sorting diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 34cfa0b23f082..2a5e4958596f5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4568,7 +4568,7 @@ def drop_duplicates(self, keep='first'): """ return super(Index, self).drop_duplicates(keep=keep) - def duplicated(self, keep='first'): + def duplicated(self, keep='first', return_inverse=False): """ Indicate duplicate index values. @@ -4585,7 +4585,20 @@ def duplicated(self, keep='first'): occurrence. - 'last' : Mark duplicates as ``True`` except for the last occurrence. - - ``False`` : Mark all duplicates as ``True``. + - ``False`` : Mark all duplicates as ``True``. This option is not + compatible with ``return_inverse``. + return_inverse : boolean, default False + If True, also return the selection of (integer) indices from the + Index with unique values (created e.g. by selecting the boolean + complement of the first output, or by using `.drop_duplicates` with + the same `keep`-parameter). This allows to reconstruct the original + Index from the subset of unique values, see example below. + + .. 
versionadded:: 0.24.0
+
+        Returns
+        -------
+        duplicated : ndarray or tuple of ndarray if return_inverse is True
 
         Examples
         --------
@@ -4601,20 +4614,37 @@ def duplicated(self, keep='first', return_inverse=False):
         >>> idx.duplicated(keep='first')
         array([False, False,  True, False,  True])
 
-        By using 'last', the last occurrence of each set of duplicated values
-        is set on False and all others on True:
+        By using `'last'`, the last occurrence of each set of duplicated values
+        is set to False and all others to True:
 
         >>> idx.duplicated(keep='last')
         array([ True, False,  True, False, False])
 
-        By setting keep on ``False``, all duplicates are True:
+        By specifying `keep=False`, all duplicates are set to True:
 
         >>> idx.duplicated(keep=False)
         array([ True, False,  True, False,  True])
 
-        Returns
-        -------
-        numpy.ndarray
+        Using the keyword `return_inverse=True`, the output becomes a tuple of
+        `np.ndarray`:
+
+        >>> isduplicate, inverse = idx.duplicated(return_inverse=True)
+        >>> inverse
+        array([0, 1, 0, 2, 0], dtype=int64)
+
+        This can be used to reconstruct the original object from its unique
+        elements as follows:
+
+        >>> idx_unique = idx[~isduplicate]  # same as idx.drop_duplicates()
+        >>> idx_unique
+        Index(['lama', 'cow', 'beetle'], dtype='object')
+        >>>
+        >>> reconstruct = idx_unique[inverse]
+        >>> reconstruct
+        Index(['lama', 'cow', 'lama', 'beetle', 'lama'], dtype='object')
+        >>>
+        >>> reconstruct.equals(idx)
+        True
 
         See Also
         --------
@@ -4622,7 +4652,8 @@ def duplicated(self, keep='first', return_inverse=False):
         pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame
         pandas.Index.drop_duplicates : Remove duplicate values from Index
         """
-        return super(Index, self).duplicated(keep=keep)
+        return super(Index, self).duplicated(keep=keep,
+                                             return_inverse=return_inverse)
 
     _index_shared_docs['fillna'] = """
         Fill NA/NaN values with the specified value
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index ab180a13ab4f3..0a63fe5d08ccd 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -392,10 +392,10 @@ def unique(self, level=None):
                                 ordered=result.ordered)
 
     @Appender(Index.duplicated.__doc__)
-    def duplicated(self, keep='first'):
-        from pandas._libs.hashtable import duplicated_int64
+    def duplicated(self, keep='first', return_inverse=False):
+        from pandas.core.algorithms import duplicated
         codes = self.codes.astype('i8')
-        return duplicated_int64(codes, keep)
+        return duplicated(codes, keep=keep, return_inverse=return_inverse)
 
     def _to_safe_for_reshape(self):
         """ convert to object if we are a categorical """
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 5b2e3a76adf05..2f85beb107af4 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -903,14 +903,19 @@ def f(k, stringify):
             return hash_tuple(key)
 
     @Appender(Index.duplicated.__doc__)
-    def duplicated(self, keep='first'):
+    def duplicated(self, keep='first', return_inverse=False):
         from pandas.core.sorting import get_group_index
-        from pandas._libs.hashtable import duplicated_int64
+        from pandas.core.algorithms import duplicated
+
+        if return_inverse and keep is False:
+            raise ValueError("The parameters return_inverse=True and "
+                             "keep=False cannot be used together (impossible "
+                             "to calculate an inverse when discarding all "
+                             "instances of a duplicate).")
 
         shape = map(len, self.levels)
         ids = get_group_index(self.labels, shape, sort=False, xnull=False)
-
-        return duplicated_int64(ids, keep)
+        return duplicated(ids, keep=keep, return_inverse=return_inverse)
 
     def fillna(self, value=None, downcast=None):
         """
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 4558314d612d0..476221ac54993 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1590,7 +1590,7 @@ def drop_duplicates(self, keep='first', inplace=False):
         """
         return super(Series, self).drop_duplicates(keep=keep, inplace=inplace)
 
-    def duplicated(self, keep='first'):
+    def duplicated(self, keep='first', return_inverse=False):
         """
         Indicate duplicate Series values.
 
@@ -1605,56 +1605,115 @@ def duplicated(self, keep='first', return_inverse=False):
             occurrence.
         - 'last' : Mark duplicates as ``True`` except for the last
             occurrence.
-        - ``False`` : Mark all duplicates as ``True``.
+        - ``False`` : Mark all duplicates as ``True``. This option is not
+            compatible with ``return_inverse``.
+        return_inverse : boolean, default False
+            If True, also return the selection from the index of the Series
+            of unique values (created e.g. by selecting the boolean complement
+            of the first output, or by using `.drop_duplicates` with the same
+            `keep`-parameter) and how they relate to the index of the current
+            Series. This allows to reconstruct the original Series from the
+            subset of unique values, see example below.
+
+            .. versionadded:: 0.24.0
+
+        Returns
+        -------
+        duplicated : Series or tuple of Series if return_inverse is True
 
         Examples
         --------
         By default, for each set of duplicated values, the first occurrence is
         set on False and all others on True:
 
-        >>> animals = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama'])
+        >>> animals = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama'],
+                                index=[1, 4, 9, 16, 25])
         >>> animals.duplicated()
-        0    False
-        1    False
-        2     True
-        3    False
-        4     True
+        1     False
+        4     False
+        9      True
+        16    False
+        25     True
         dtype: bool
 
         which is equivalent to
 
         >>> animals.duplicated(keep='first')
-        0    False
-        1    False
-        2     True
-        3    False
-        4     True
+        1     False
+        4     False
+        9      True
+        16    False
+        25     True
         dtype: bool
 
-        By using 'last', the last occurrence of each set of duplicated values
-        is set on False and all others on True:
+        By using `'last'`, the last occurrence of each set of duplicated values
+        is set to False and all others to True:
 
         >>> animals.duplicated(keep='last')
-        0     True
-        1    False
-        2     True
-        3    False
-        4    False
+        1      True
+        4     False
+        9      True
+        16    False
+        25    False
         dtype: bool
 
-        By setting keep on ``False``, all duplicates are True:
+        By specifying `keep=False`, all duplicates are set to True:
 
         >>> animals.duplicated(keep=False)
-        0     True
-        1    False
-        2     True
-        3    False
-        4     True
+        1      True
+        4     False
+        9      True
+        16    False
+        25     True
         dtype: bool
 
-        Returns
-        -------
-        pandas.core.series.Series
+        Using the keyword `return_inverse=True`, the output becomes a tuple of
+        `Series`:
+
+        >>> isduplicate, inverse = animals.duplicated(return_inverse=True)
+        >>> inverse
+        1      1
+        4      4
+        9      1
+        16    16
+        25     1
+        dtype: int64
+
+        This can be used to reconstruct the original object from its unique
+        elements as follows:
+
+        >>> # same as animals.drop_duplicates()
+        >>> animals_unique = animals.loc[~isduplicate]
+        >>> animals_unique
+        1       lama
+        4        cow
+        16    beetle
+        dtype: object
+        >>>
+        >>> reconstruct = animals_unique.reindex(inverse)
+        >>> reconstruct
+        1       lama
+        4        cow
+        1       lama
+        16    beetle
+        1       lama
+        dtype: object
+
+        We see that the values of `animals` get reconstructed correctly, but
+        the index does not match yet -- consequently, the last step is to
+        correctly set the index.
+ + >>> reconstruct.index = inverse.index + >>> reconstruct + 1 lama + 4 cow + 9 lama + 16 beetle + 25 lama + dtype: object + >>> + >>> reconstruct.equals(animals) + True See Also -------- @@ -1662,7 +1721,8 @@ def duplicated(self, keep='first'): pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame pandas.Series.drop_duplicates : Remove duplicate values from Series """ - return super(Series, self).duplicated(keep=keep) + return super(Series, self).duplicated(keep=keep, + return_inverse=return_inverse) def idxmin(self, axis=0, skipna=True, *args, **kwargs): """ diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py index 940692ec5b46a..43f312fa3ebb8 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/test_duplicates.py @@ -88,6 +88,75 @@ def test_duplicated_subset(subset, keep): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize('keep, expected_inv_values', [ + ('first', [1, 4, 4, 16, 1]), + ('last', [25, 9, 9, 16, 25]) +]) +def test_duplicated_inverse(keep, expected_inv_values): + # check that return_inverse kwarg does not affect outcome; + # index of inverse must be correctly transformed as well + idx = [1, 4, 9, 16, 25] + df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']}, + index=idx) + + expected_isdup = df.duplicated(keep=keep) + expected_inv = Series(expected_inv_values, index=idx) + result_isdup, result_inv = df.duplicated(keep=keep, + return_inverse=True) + tm.assert_series_equal(result_isdup, expected_isdup) + tm.assert_series_equal(result_inv, expected_inv) + + # test that result_inv works (and fits together with expected_isdup) + unique = df.loc[~expected_isdup] + reconstr = unique.reindex(result_inv).set_index(result_inv.index) + tm.assert_frame_equal(reconstr, df) + + +def test_duplicated_inverse_raises(): + df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']}) + + rgx = 'The parameters return_inverse=True and keep=False cannot be.*' + with tm.assert_raises_regex(ValueError, rgx): + df.duplicated(keep=False, return_inverse=True) + + +@pytest.mark.parametrize('keep', ['first', 'last']) +@pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A']) +def test_duplicated_inverse_large(subset, keep): + # unsorted index important to check 'first'/'last' functionality + df = DataFrame(np.random.randint(0, 10, (10000, 3)), + columns=list('ABC')).sample(5000) + + expected_isdup = df.duplicated(keep=keep, subset=subset) + result_isdup, result_inv = df.duplicated(keep=keep, subset=subset, + return_inverse=True) + tm.assert_series_equal(result_isdup, expected_isdup) + + if subset is None: + subset = list(df.columns) + elif isinstance(subset, string_types): + # need to have a DataFrame, not a Series + # -> select columns with singleton list, not string + subset = [subset] + + unique = df.loc[~expected_isdup, subset] + reconstr = unique.reindex(result_inv).set_index(result_inv.index) + tm.assert_frame_equal(reconstr, df[subset]) + + +@pytest.mark.parametrize('keep', ['first', 'last']) +def test_duplicated_inverse_fastpath(keep): + df = DataFrame({'A': range(10)}) # no duplicates + + expected_isdup = df.duplicated(keep=keep) + result_isdup, result_inv = df.duplicated(keep=keep, + return_inverse=True) + tm.assert_series_equal(result_isdup, expected_isdup) + + expected_inv = Series(range(10)) + tm.assert_series_equal(result_inv, expected_inv) + + def test_drop_duplicates(): df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'bar', 'foo'], diff --git 
a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 56f59851d6d04..ba6f10e521824 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -378,6 +378,82 @@ def test_duplicated(self, indices, keep): result = idx.duplicated(keep=keep) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize('keep', ['first', 'last']) + def test_duplicated_inverse(self, indices, keep): + # check that return_inverse kwarg does not affect outcome + if type(indices) is not self._holder: + pytest.skip('Can only check if we have the correct type') + if not len(indices) or isinstance(indices, MultiIndex): + # MultiIndex tested separately in: + # tests/indexes/multi/test_unique_and_duplicates + pytest.skip('Skip check for empty Index and MultiIndex') + + idx = self._holder(indices) + if idx.has_duplicates: + # We need to be able to control creation of duplicates here + # This is slightly circular, as drop_duplicates depends on + # duplicated, but in the end, it all works out because we + # cross-check with Series.duplicated + idx = idx.drop_duplicates() + + n, k = len(idx), 10 + duplicated_selection = np.random.choice(n, k * n) + idx = self._holder(idx.values[duplicated_selection]) + + expected_isdup = idx.duplicated(keep=keep) + if keep == 'first': + _, tmp_ind, tmp_inv = np.unique(idx, return_index=True, + return_inverse=True) + else: # 'last' + # switch order before calling unique then restore correct ordering + # for tmp_ind, tmp_inv + _, tmp_ind, tmp_inv = np.unique(idx[::-1], return_index=True, + return_inverse=True) + tmp_ind = np.arange(len(idx))[::-1][tmp_ind] + tmp_inv = tmp_inv[::-1] + # explanation in pandas.core.algorithms.duplicated + expected_inv = np.argsort(np.argsort(tmp_ind))[tmp_inv] + + result_isdup, result_inv = idx.duplicated(keep=keep, + return_inverse=True) + tm.assert_numpy_array_equal(result_isdup, expected_isdup) + tm.assert_numpy_array_equal(result_inv, expected_inv) + + # test that result_inv works (and fits together with expected_isdup) + unique = idx[~expected_isdup] + reconstr = unique[result_inv] + tm.assert_index_equal(reconstr, idx) + + def test_duplicated_inverse_raises(self, indices): + if type(indices) is not self._holder: + pytest.skip('Can only check if we have the correct type') + + rgx = 'The parameters return_inverse=True and keep=False cannot be.*' + with tm.assert_raises_regex(ValueError, rgx): + self._holder(indices).duplicated(keep=False, return_inverse=True) + + @pytest.mark.parametrize('keep', ['first', 'last']) + def test_duplicated_inverse_fastpath(self, indices, keep): + if type(indices) is not self._holder: + pytest.skip('Can only check if we have the correct type') + if not len(indices) or isinstance(indices, MultiIndex): + # MultiIndex tested separately in: + # tests/indexes/multi/test_unique_and_duplicates + pytest.skip('Skip check for empty Index and MultiIndex') + + idx = self._holder(indices) + if idx.has_duplicates: + # fastpath only possible if no duplicates + idx = idx.drop_duplicates() + + expected_isdup = idx.duplicated(keep=keep) + result_isdup, result_inv = idx.duplicated(keep=keep, + return_inverse=True) + tm.assert_numpy_array_equal(result_isdup, expected_isdup) + + expected_inv = np.arange(len(idx)) + tm.assert_numpy_array_equal(result_inv, expected_inv) + def test_unique(self, indices): # don't test a MultiIndex here (as its tested separated) # don't test a CategoricalIndex because categories change (GH 18291) diff --git a/pandas/tests/indexes/multi/test_duplicates.py 
b/pandas/tests/indexes/multi/test_duplicates.py index 1cdf0ca6e013e..171e9a3772fd9 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -212,29 +212,6 @@ def f(a): check(8, True) -@pytest.mark.parametrize('keep, expected', [ - ('first', np.array([False, False, False, True, True, False])), - ('last', np.array([False, True, True, False, False, False])), - (False, np.array([False, True, True, True, True, False])) -]) -def test_duplicated(idx_dup, keep, expected): - result = idx_dup.duplicated(keep=keep) - tm.assert_numpy_array_equal(result, expected) - - -@pytest.mark.parametrize('keep', ['first', 'last', False]) -def test_duplicated_large(keep): - # GH 9125 - n, k = 200, 5000 - levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)] - labels = [np.random.choice(n, k * n) for lev in levels] - mi = MultiIndex(levels=levels, labels=labels) - - result = mi.duplicated(keep=keep) - expected = hashtable.duplicated_object(mi.values, keep=keep) - tm.assert_numpy_array_equal(result, expected) - - def test_get_duplicates(): # GH5873 for a in [101, 102]: @@ -264,3 +241,84 @@ def test_get_duplicates(): tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(len(mi), dtype='bool')) + + +@pytest.mark.parametrize('keep, expected', [ + ('first', np.array([False, False, False, True, True, False])), + ('last', np.array([False, True, True, False, False, False])), + (False, np.array([False, True, True, True, True, False])) +]) +def test_duplicated(idx_dup, keep, expected): + result = idx_dup.duplicated(keep=keep) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize('keep', ['first', 'last', False]) +def test_duplicated_large(keep): + # GH 9125 + n, k = 200, 5000 + levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)] + labels = [np.random.choice(n, k * n) for lev in levels] + mi = MultiIndex(levels=levels, labels=labels) + + result = mi.duplicated(keep=keep) + expected = hashtable.duplicated_object(mi.values, keep=keep) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize('keep', ['first', 'last']) +def test_duplicated_inverse(idx_dup, keep): + # check that return_inverse kwarg does not affect outcome; + # index of inverse must be correctly transformed as well + + expected_isdup = idx_dup.duplicated(keep=keep) + expected_inv = np.array([0, 1, 2, 1, 2, 3], dtype='int64') + result_isdup, result_inv = idx_dup.duplicated(keep=keep, + return_inverse=True) + tm.assert_numpy_array_equal(result_isdup, expected_isdup) + tm.assert_numpy_array_equal(result_inv, expected_inv) + + # test that result_inv works (and fits together with expected_isdup) + unique = MultiIndex.from_tuples(idx_dup.values[~expected_isdup]) + reconstr = MultiIndex.from_tuples(unique.values[result_inv], + names=idx_dup.names) + tm.assert_index_equal(reconstr, idx_dup) + + +def test_duplicated_inverse_raises(idx_dup): + rgx = 'The parameters return_inverse=True and keep=False cannot be.*' + with tm.assert_raises_regex(ValueError, rgx): + idx_dup.duplicated(keep=False, return_inverse=True) + + +@pytest.mark.parametrize('keep', ['first', 'last']) +def test_duplicated_inverse_large(keep): + n, k = 200, 5000 + levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)] + labels = [np.random.choice(n, k * n) for lev in levels] + mi = MultiIndex(levels=levels, labels=labels) + + expected_isdup = mi.duplicated(keep=keep) + result_isdup, result_inv = mi.duplicated(keep=keep, + return_inverse=True) + 
tm.assert_numpy_array_equal(result_isdup, expected_isdup) + + # test that result_inv works (and fits together with expected_isdup) + unique = MultiIndex.from_tuples(mi.values[~expected_isdup]) + reconstr = MultiIndex.from_tuples(unique.values[result_inv], + names=mi.names) + tm.assert_index_equal(reconstr, mi) + + +@pytest.mark.parametrize('keep', ['first', 'last']) +def test_duplicated_inverse_fastpath(idx_dup, keep): + # fastpath is only taken if there are no duplicates + mi = idx_dup.drop_duplicates() + + expected_isdup = mi.duplicated(keep=keep) + result_isdup, result_inv = mi.duplicated(keep=keep, + return_inverse=True) + tm.assert_numpy_array_equal(result_isdup, expected_isdup) + + expected_inv = np.arange(4) + tm.assert_numpy_array_equal(result_inv, expected_inv) diff --git a/pandas/tests/series/test_duplicates.py b/pandas/tests/series/test_duplicates.py index 2e4d64188307c..8f264f85c8cda 100644 --- a/pandas/tests/series/test_duplicates.py +++ b/pandas/tests/series/test_duplicates.py @@ -138,3 +138,66 @@ def test_duplicated_nan_none(keep, expected): result = s.duplicated(keep=keep) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize('keep, expected_inv_values', [ + ('first', [1, 4, 4, 16, 1]), + ('last', [25, 9, 9, 16, 25]) +]) +def test_duplicated_inverse(keep, expected_inv_values): + # check that return_inverse kwarg does not affect outcome; + # index of inverse must be correctly transformed as well + idx = [1, 4, 9, 16, 25] + s = Series(['a', 'b', 'b', 'c', 'a'], index=idx) + + expected_isdup = s.duplicated(keep=keep) + expected_inv = Series(expected_inv_values, index=idx) + result_isdup, result_inv = s.duplicated(keep=keep, + return_inverse=True) + tm.assert_series_equal(result_isdup, expected_isdup) + tm.assert_series_equal(result_inv, expected_inv) + + # test that result_inv works (and fits together with expected_isdup) + unique = s.loc[~expected_isdup] + reconstr = unique.reindex(result_inv) + # Series has no set_index (GH21684) + reconstr.index = result_inv.index + tm.assert_series_equal(reconstr, s) + + +def test_duplicated_inverse_raises(): + s = Series(['a', 'b', 'b', 'c', 'a']) + + rgx = 'The parameters return_inverse=True and keep=False cannot be.*' + with tm.assert_raises_regex(ValueError, rgx): + s.duplicated(keep=False, return_inverse=True) + + +@pytest.mark.parametrize('keep', ['first', 'last']) +def test_duplicated_inverse_large(keep): + # unsorted index important to check 'first'/'last' functionality + s = Series(np.random.randint(0, 1000, 10000)).sample(5000) + + expected_isdup = s.duplicated(keep=keep) + result_isdup, result_inv = s.duplicated(keep=keep, return_inverse=True) + tm.assert_series_equal(result_isdup, expected_isdup) + + # test that result_inv works (and fits together with expected_isdup) + unique = s.loc[~expected_isdup] + reconstr = unique.reindex(result_inv) + # Series has no set_index (GH21684) + reconstr.index = result_inv.index + tm.assert_series_equal(reconstr, s) + + +@pytest.mark.parametrize('keep', ['first', 'last']) +def test_duplicated_inverse_fastpath(keep): + s = Series(range(10)) # no duplicates + + expected_isdup = s.duplicated(keep=keep) + result_isdup, result_inv = s.duplicated(keep=keep, + return_inverse=True) + tm.assert_series_equal(result_isdup, expected_isdup) + + expected_inv = Series(range(10)) + tm.assert_series_equal(result_inv, expected_inv) From 6e005c2438ea0149570c112c42a4fe847fdc85f3 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Thu, 30 Aug 2018 23:38:10 +0200 Subject: [PATCH 2/7] Review (gfyoung) --- doc/source/whatsnew/v0.24.0.txt | 10 +++------- pandas/core/frame.py | 7 +++++++ pandas/core/indexes/base.py | 7 +++++++ pandas/core/series.py | 7 +++++++ pandas/tests/frame/test_duplicates.py | 13 ++++--------- pandas/tests/indexes/common.py | 1 + pandas/tests/indexes/multi/test_duplicates.py | 1 + pandas/tests/series/test_duplicates.py | 1 + 8 files changed, 31 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index c829c2bf1cc1f..16420b8ce562a 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -161,11 +161,11 @@ This is the same behavior as ``Series.values`` for categorical data. See .. _whatsnew_0240.enhancements.duplicated_inverse: -The ``duplicated``-method has gained the ``return_inverse`` kwarg +The `duplicated`-method has gained the `return_inverse` kwarg ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The ``duplicated``-method for ``Series``, ``DataFrame`` and all flavors of ``Index`` has gained a ``return_inverse`` keyword, -which is False by default. Specifying ``return_inverse=True`` will add an object to the output (which therefore becomes a tuple) +The `duplicated`-method for `Series`, `DataFrame` and all flavors of `Index` has gained a `return_inverse` keyword, +which is ``False`` by default. Specifying ``return_inverse=True`` will add an object to the output (which therefore becomes a tuple) that allows reconstructing the original object from the deduplicated, unique subset (:issue:`21357`). For ``Index`` objects, the inverse is an ``np.ndarray``: @@ -173,7 +173,6 @@ For ``Index`` objects, the inverse is an ``np.ndarray``: .. ipython:: python idx = pd.Index(['a', 'b', 'b', 'c', 'a']) - idx.has_duplicates isduplicate, inverse = idx.duplicated(return_inverse=True) # default: keep='first' isduplicate inverse @@ -205,9 +204,6 @@ which contains the mapping from the index of the deduplicated, unique subset bac reconstruct = unique.reindex(inverse.values).set_index(inverse.index) reconstruct.equals(df) -The keyword works as expected for ``keep='first'|'last'``, but cannot be used together with ``keep=False`` (since discarding all duplicates makes it impossible -to construct an inverse). - .. _whatsnew_0240.enhancements.other: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 49f681771d4db..6e01f72443eb2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4366,6 +4366,13 @@ def duplicated(self, subset=None, keep='first', return_inverse=False): ------- duplicated : Series or tuple of Series if return_inverse is True + Notes + ----- + The `return_inverse`-keyword works as expected for + ``keep='first'|'last'``, but cannot be used together with + ``keep=False`` (since discarding all duplicates makes it impossible to + construct an inverse). 
+
         Examples
         --------
         By default, for each set of duplicated values, the first occurrence is
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 2a5e4958596f5..f86f8854fce41 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -4600,6 +4600,13 @@ def duplicated(self, keep='first', return_inverse=False):
         -------
         duplicated : ndarray or tuple of ndarray if return_inverse is True
 
+        Notes
+        -----
+        The `return_inverse`-keyword works as expected for
+        ``keep='first'|'last'``, but cannot be used together with
+        ``keep=False`` (since discarding all duplicates makes it impossible to
+        construct an inverse).
+
         Examples
         --------
         By default, for each set of duplicated values, the first occurrence is
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 476221ac54993..2ed94987ec8a7 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1621,6 +1621,13 @@ def duplicated(self, keep='first', return_inverse=False):
         -------
         duplicated : Series or tuple of Series if return_inverse is True
 
+        Notes
+        -----
+        The `return_inverse`-keyword works as expected for
+        ``keep='first'|'last'``, but cannot be used together with
+        ``keep=False`` (since discarding all duplicates makes it impossible to
+        construct an inverse).
+
         Examples
         --------
         By default, for each set of duplicated values, the first occurrence is
diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py
index 43f312fa3ebb8..6e276d03dab38 100644
--- a/pandas/tests/frame/test_duplicates.py
+++ b/pandas/tests/frame/test_duplicates.py
@@ -93,6 +93,7 @@ def test_duplicated_subset(subset, keep):
     ('last', [25, 9, 9, 16, 25])
 ])
 def test_duplicated_inverse(keep, expected_inv_values):
+    # GH 21357
     # check that return_inverse kwarg does not affect outcome;
     # index of inverse must be correctly transformed as well
     idx = [1, 4, 9, 16, 25]
@@ -121,9 +122,10 @@ def test_duplicated_inverse_raises():
 
 
 @pytest.mark.parametrize('keep', ['first', 'last'])
-@pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A'])
+@pytest.mark.parametrize('subset', [['A', 'B', 'C'], ['A', 'B'], ['A']])
 def test_duplicated_inverse_large(subset, keep):
-    # unsorted index important to check 'first'/'last' functionality
+    # unsorted index (through .sample); important to check correct
+    # 'first'/'last' functionality of return_inverse
     df = DataFrame(np.random.randint(0, 10, (10000, 3)),
                    columns=list('ABC')).sample(5000)
 
@@ -132,13 +134,6 @@ def test_duplicated_inverse_large(subset, keep):
                                               return_inverse=True)
     tm.assert_series_equal(result_isdup, expected_isdup)
 
-    if subset is None:
-        subset = list(df.columns)
-    elif isinstance(subset, string_types):
-        # need to have a DataFrame, not a Series
-        # -> select columns with singleton list, not string
-        subset = [subset]
-
     unique = df.loc[~expected_isdup, subset]
     reconstr = unique.reindex(result_inv).set_index(result_inv.index)
     tm.assert_frame_equal(reconstr, df[subset])
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index ba6f10e521824..3f2430738e7f9 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -380,6 +380,7 @@ def test_duplicated(self, indices, keep):
 
     @pytest.mark.parametrize('keep', ['first', 'last'])
     def test_duplicated_inverse(self, indices, keep):
+        # GH 21357
         # check that return_inverse kwarg does not affect outcome
         if type(indices) is not self._holder:
             pytest.skip('Can only check if we have the correct type')
diff --git a/pandas/tests/indexes/multi/test_duplicates.py
b/pandas/tests/indexes/multi/test_duplicates.py index 171e9a3772fd9..44fcbbc692bb9 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -268,6 +268,7 @@ def test_duplicated_large(keep): @pytest.mark.parametrize('keep', ['first', 'last']) def test_duplicated_inverse(idx_dup, keep): + # GH 21357 # check that return_inverse kwarg does not affect outcome; # index of inverse must be correctly transformed as well diff --git a/pandas/tests/series/test_duplicates.py b/pandas/tests/series/test_duplicates.py index 8f264f85c8cda..61a628081702d 100644 --- a/pandas/tests/series/test_duplicates.py +++ b/pandas/tests/series/test_duplicates.py @@ -145,6 +145,7 @@ def test_duplicated_nan_none(keep, expected): ('last', [25, 9, 9, 16, 25]) ]) def test_duplicated_inverse(keep, expected_inv_values): + # GH 21357 # check that return_inverse kwarg does not affect outcome; # index of inverse must be correctly transformed as well idx = [1, 4, 9, 16, 25] From 16c3103a36d174fd49a7dcc9892c497a24238b97 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 30 Aug 2018 23:46:13 +0200 Subject: [PATCH 3/7] Removed explicit test for array of inverse; tested implicitly --- pandas/tests/indexes/common.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 3f2430738e7f9..cc35744a69002 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -402,23 +402,9 @@ def test_duplicated_inverse(self, indices, keep): idx = self._holder(idx.values[duplicated_selection]) expected_isdup = idx.duplicated(keep=keep) - if keep == 'first': - _, tmp_ind, tmp_inv = np.unique(idx, return_index=True, - return_inverse=True) - else: # 'last' - # switch order before calling unique then restore correct ordering - # for tmp_ind, tmp_inv - _, tmp_ind, tmp_inv = np.unique(idx[::-1], return_index=True, - return_inverse=True) - tmp_ind = np.arange(len(idx))[::-1][tmp_ind] - tmp_inv = tmp_inv[::-1] - # explanation in pandas.core.algorithms.duplicated - expected_inv = np.argsort(np.argsort(tmp_ind))[tmp_inv] - result_isdup, result_inv = idx.duplicated(keep=keep, return_inverse=True) tm.assert_numpy_array_equal(result_isdup, expected_isdup) - tm.assert_numpy_array_equal(result_inv, expected_inv) # test that result_inv works (and fits together with expected_isdup) unique = idx[~expected_isdup] From 18a0de7367f50c1a0558a99a723ba3a7c1c9702c Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 31 Aug 2018 07:54:54 +0200 Subject: [PATCH 4/7] Improve comments, fix doc string errors --- pandas/core/frame.py | 4 ++-- pandas/core/series.py | 2 +- pandas/tests/indexes/common.py | 4 +++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 181dd8ab7f9bf..4a40716fbee76 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4382,8 +4382,8 @@ def duplicated(self, subset=None, keep='first', return_inverse=False): set on False and all others on True: >>> data = {'species': ['lama', 'cow', 'lama', 'ant', 'lama', 'bee'], - 'type': ['mammal'] * 3 + ['insect', 'mammal', 'insect']} - >>> animals = pd.DataFrame(data, index=[1, 4, 9, 16, 25]) + ... 
'type': ['mammal'] * 3 + ['insect', 'mammal', 'insect']} + >>> animals = pd.DataFrame(data, index=[1, 4, 9, 16, 25, 36]) >>> animals species type 1 lama mammal diff --git a/pandas/core/series.py b/pandas/core/series.py index b473a70fb8258..19260efc51bd9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1633,7 +1633,7 @@ def duplicated(self, keep='first', return_inverse=False): set on False and all others on True: >>> animals = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama'], - index=[1, 4, 9, 16, 25]) + ... index=[1, 4, 9, 16, 25]) >>> animals.duplicated() 1 False 4 False diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index cc35744a69002..544a4855fdb32 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -406,7 +406,9 @@ def test_duplicated_inverse(self, indices, keep): return_inverse=True) tm.assert_numpy_array_equal(result_isdup, expected_isdup) - # test that result_inv works (and fits together with expected_isdup) + # the following tests the correctness of result_inv in two ways: + # - it needs to fit together with expected_isdup + # - it needs to correctly reconstruct the object unique = idx[~expected_isdup] reconstr = unique[result_inv] tm.assert_index_equal(reconstr, idx) From db8693f97b84432beb8799c6312143bf2d078265 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Tue, 11 Sep 2018 08:27:03 +0200 Subject: [PATCH 5/7] Review (jorisvandenbossche) --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/core/frame.py | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index d8820c87221e5..01bebbdeb8cc5 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -164,7 +164,7 @@ This is the same behavior as ``Series.values`` for categorical data. See The `duplicated`-method has gained the `return_inverse` kwarg ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The `duplicated`-method for `Series`, `DataFrame` and all flavors of `Index` has gained a `return_inverse` keyword, +The ``duplicated``-method for ``Series``, ``DataFrame`` and all flavors of ``Index`` has gained a ``return_inverse`` keyword, which is ``False`` by default. Specifying ``return_inverse=True`` will add an object to the output (which therefore becomes a tuple) that allows reconstructing the original object from the deduplicated, unique subset (:issue:`21357`). diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 96996893278e0..449ab241886ad 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4382,12 +4382,11 @@ def duplicated(self, subset=None, keep='first', return_inverse=False): - False : Mark all duplicates as ``True``. This option is not compatible with ``return_inverse``. return_inverse : boolean, default False - If True, also return the selection from the index from the - DataFrame of unique values (created e.g. by selecting the boolean - complement of the first output, or by using `.drop_duplicates` with - the same `keep`-parameter) and how they relate to the index of the - current DataFrame. This allows to reconstruct the original - DataFrame from the subset of unique values, see example below. + If True, also return a Series mapping the index of the current + DataFrame to the index after deduplication (created e.g. by using + `.drop_duplicates` or by selecting everything that is not + duplicate). 
This allows to reconstruct the original DataFrame from + the subset of deduplicated (=unique) values, see example below. .. versionadded:: 0.24.0 From d4e803e09d58d94d346215ec62722c7ee2d5df4b Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 13 Sep 2018 08:35:34 +0200 Subject: [PATCH 6/7] Add reference to method in whatsnew --- doc/source/whatsnew/v0.24.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 01bebbdeb8cc5..b10e7d0c3d058 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -164,7 +164,7 @@ This is the same behavior as ``Series.values`` for categorical data. See The `duplicated`-method has gained the `return_inverse` kwarg ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The ``duplicated``-method for ``Series``, ``DataFrame`` and all flavors of ``Index`` has gained a ``return_inverse`` keyword, +The :meth:`~DataFrame.duplicated`-method for ``Series``, ``DataFrame`` and all flavors of ``Index`` has gained a ``return_inverse`` keyword, which is ``False`` by default. Specifying ``return_inverse=True`` will add an object to the output (which therefore becomes a tuple) that allows reconstructing the original object from the deduplicated, unique subset (:issue:`21357`). From b8cbb13938e4103ffde8b8c8698fe1c971a08199 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 23 Sep 2018 23:40:43 +0200 Subject: [PATCH 7/7] Review (jreback) --- pandas/core/algorithms.py | 54 ++++++++++++++++++++------------------- pandas/core/base.py | 21 +++++++-------- pandas/core/frame.py | 11 ++++---- 3 files changed, 45 insertions(+), 41 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 181d2fbc5f601..647af764caeab 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -808,44 +808,46 @@ def duplicated(values, keep='first', return_inverse=False): values, dtype, ndtype = _ensure_data(values) f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype)) - isdup = f(values, keep=keep) + isduplicate = f(values, keep=keep) if not return_inverse: - return isdup - elif not isdup.any(): + return isduplicate + elif not isduplicate.any(): # no need to calculate inverse if no duplicates - inv = np.arange(len(values)) - return isdup, inv + inverse = np.arange(len(values)) + return isduplicate, inverse if keep == 'first': - # o2u: original indices to indices of ARRAY of unique values - # u2o: reduplication from array of unique values to original array - # this fits together in the way that values[o2u] are the unique values - # and values[o2u][u2o] == values - _, o2u, u2o = np.unique(values, return_index=True, - return_inverse=True) + # values2unique: original indices to indices of ARRAY of unique values + # unique2values: reduplication from array of uniques to original array + # this fits together in the way that values[values2unique] are the + # unique values and values[values2unique][unique2values] == values + _, values2unique, unique2values = np.unique(values, return_index=True, + return_inverse=True) elif keep == 'last': - # np.unique takes first occurrence as unique value, + # np.unique takes first occurrence per unique value, # so we flip values that first becomes last values = values[::-1] - _, o2u, u2o = np.unique(values, return_index=True, - return_inverse=True) + _, values2unique, unique2values = np.unique(values, return_index=True, + return_inverse=True) # the values in "values" correspond(ed) to the 
index of "values", # which is simply np.arange(len(values)). # By flipping "values" around, we need to do the same for the index, - # ___because o2u and u2o are relative to that order___. + # _because values2unique and unique2values are relative to that order_. # Finally, to fit with the original order again, we need to flip the # result around one last time. - o2u, u2o = np.arange(len(values))[::-1][o2u], u2o[::-1] - - # np.unique yields a ___sorted___ list of uniques, and o2u/u2o are relative - # to this order. To restore the original order, we argsort o2u, because o2u - # would be ordered if np.unique had not sorted implicitly. The first - # argsort gives the permutation from o2u to its sorted form, but we need - # the inverse permutation (the map from the unsorted uniques to o2u, from - # which we can continue with u2o). This inversion (as a permutation) is - # achieved by the second argsort. - inv = np.argsort(np.argsort(o2u))[u2o] - return isdup, inv + values2unique = np.arange(len(values))[::-1][values2unique] + unique2values = unique2values[::-1] + + # np.unique yields a ___sorted___ list of uniques, and values2unique resp. + # unique2values are relative to this order. To restore the original order, + # we argsort values2unique, because values2unique would be ordered if + # np.unique had not sorted implicitly. + # The first argsort gives the permutation from values2unique to its sorted + # form, but we need the inverse permutation (the map from the unsorted + # uniques to values2unique, from which we can continue with unique2values). + # This inversion (as a permutation) is achieved by the second argsort. + inverse = np.argsort(np.argsort(values2unique))[unique2values] + return isduplicate, inverse def mode(values, dropna=True): diff --git a/pandas/core/base.py b/pandas/core/base.py index 2ab7f92e2dc03..f6f8661322550 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1257,10 +1257,10 @@ def duplicated(self, keep='first', return_inverse=False): if isinstance(self, ABCIndexClass): if self.is_unique: - isdup = np.zeros(len(self), dtype=np.bool) + isduplicate = np.zeros(len(self), dtype=np.bool) if not return_inverse: - return isdup - return isdup, np.arange(len(self)) + return isduplicate + return isduplicate, np.arange(len(self)) # core.algorithms.duplicated has the same output signature as # Index.duplicated -> no need to distinguish cases here return duplicated(self, keep=keep, return_inverse=return_inverse) @@ -1271,13 +1271,14 @@ def duplicated(self, keep='first', return_inverse=False): index=self.index).__finalize__(self) # return_inverse = True - isdup_array, inv_array = duplicated(self, keep=keep, - return_inverse=True) - isdup = self._constructor(isdup_array, - index=self.index).__finalize__(self) - inv = self._constructor(self.loc[~isdup_array].index[inv_array], - index=self.index) - return isdup, inv + isduplicate_array, inverse_array = duplicated(self, keep=keep, + return_inverse=True) + isduplicate = self._constructor(isduplicate_array, + index=self.index).__finalize__(self) + inverse = self._constructor( + self.loc[~isduplicate_array].index[inverse_array], + index=self.index) + return isduplicate, inverse # ---------------------------------------------------------------------- # abstracts diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 449ab241886ad..b9c15304a3746 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4561,11 +4561,12 @@ def f(vals): return Series(duplicated(ids, keep=keep), index=self.index) # return_inverse = True - 
isdup_array, inv_array = duplicated(ids, keep=keep, - return_inverse=True) - isdup = Series(isdup_array, index=self.index) - inv = Series(self.loc[~isdup_array].index[inv_array], index=self.index) - return isdup, inv + isduplicated_array, inverse_array = duplicated(ids, keep=keep, + return_inverse=True) + isduplicated = Series(isduplicated_array, index=self.index) + inverse = Series(self.loc[~isduplicated_array].index[inverse_array], + index=self.index) + return isduplicated, inverse # ---------------------------------------------------------------------- # Sorting
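
For reference, the round trip that ``return_inverse`` enables, condensed from the whatsnew and docstring examples above into a single script. This is a usage sketch only, not part of the patches: it assumes a pandas build with this series applied, since ``return_inverse`` does not exist in released pandas versions.

    import pandas as pd

    # Assumes a pandas build with this patch series applied; the
    # return_inverse keyword is not available in released pandas.
    df = pd.DataFrame({'A': [0, 1, 1, 2, 0],
                       'B': ['a', 'b', 'b', 'c', 'a']},
                      index=[1, 4, 9, 16, 25])

    # With return_inverse=True, duplicated returns a tuple: the usual
    # boolean Series, plus a Series mapping the original index to the
    # index of the deduplicated subset.
    isduplicate, inverse = df.duplicated(keep='first', return_inverse=True)

    unique = df.loc[~isduplicate]  # same as df.drop_duplicates()

    # Re-expand the unique rows, then restore the original index labels.
    reconstruct = unique.reindex(inverse.values).set_index(inverse.index)
    assert reconstruct.equals(df)

And a minimal, self-contained numpy sketch of the double-argsort trick that ``pandas/core/algorithms.py`` uses to express the inverse in first-occurrence order (the long variable names follow the renaming in the last commit; the sample data is illustrative, not from the patch):

    import numpy as np

    values = np.array(list('baac'))

    # np.unique returns the uniques in SORTED order; values2unique holds,
    # for each sorted unique, the position of its first occurrence in
    # `values`, and unique2values re-expands the sorted uniques to `values`.
    _, values2unique, unique2values = np.unique(values, return_index=True,
                                                return_inverse=True)

    # argsort(values2unique) orders the sorted uniques by first occurrence;
    # a second argsort inverts that permutation, assigning each sorted
    # unique its rank in first-occurrence order.
    rank = np.argsort(np.argsort(values2unique))

    # Composing with unique2values yields the inverse relative to the
    # keep='first' order of the uniques: array([0, 1, 1, 2])
    inverse = rank[unique2values]

    uniques = values[np.sort(values2unique)]  # uniques in occurrence order
    assert (uniques[inverse] == values).all()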