From ebfffc20861db8dd2fd784fc8a1878db00f74edb Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Mon, 16 Jul 2018 18:20:08 +0200 Subject: [PATCH 1/7] ENH: add return_inverse kwarg to duplicated-method (rebased) --- asv_bench/benchmarks/frame_methods.py | 26 ++- asv_bench/benchmarks/index_object.py | 18 ++ asv_bench/benchmarks/multiindex_object.py | 13 +- asv_bench/benchmarks/series_methods.py | 18 ++ doc/source/whatsnew/v0.24.0.txt | 50 ++++++ pandas/core/algorithms.py | 59 ++++++- pandas/core/base.py | 31 +++- pandas/core/frame.py | 154 +++++++++++++++++- pandas/core/indexes/base.py | 49 +++++- pandas/core/indexes/category.py | 6 +- pandas/core/indexes/multi.py | 13 +- pandas/core/series.py | 120 ++++++++++---- pandas/tests/frame/test_duplicates.py | 69 ++++++++ pandas/tests/indexes/common.py | 76 +++++++++ pandas/tests/indexes/multi/test_duplicates.py | 104 +++++++++--- pandas/tests/series/test_duplicates.py | 63 +++++++ 16 files changed, 777 insertions(+), 92 deletions(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 1819cfa2725db..50789d64edda1 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -412,21 +412,35 @@ def time_frame_nunique(self): class Duplicated(object): goal_time = 0.2 + params = (['first', 'last', False], [True, False]) + param_names = ['keep', 'return_inverse'] + + def setup(self, keep, return_inverse): + if keep is False and return_inverse: + raise NotImplementedError - def setup(self): n = (1 << 20) t = date_range('2015-01-01', freq='S', periods=(n // 64)) xs = np.random.randn(n // 64).round(2) self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n), 'b': np.random.choice(t, n), 'c': np.random.choice(xs, n)}) - self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T + # df2 will not have any duplicates + self.df2 = DataFrame(np.random.randn(100, 1000).astype(str)) + + df3 = DataFrame(np.random.randint(0, 10, (2 ** 18, 5)), + columns=list('ABCDE')) + df3.loc[:, 'F'] = Series('', index=df3.index).str.cat(df3.astype(str)) + self.df3 = df3 + + def time_frame_duplicated(self, keep, return_inverse): + self.df.duplicated(keep=keep, return_inverse=return_inverse) - def time_frame_duplicated(self): - self.df.duplicated() + def time_frame_duplicated_wide(self, keep, return_inverse): + self.df2.duplicated(keep=keep, return_inverse=return_inverse) - def time_frame_duplicated_wide(self): - self.df2.duplicated() + def time_frame_duplicated_mixed(self, keep, return_inverse): + self.df3.duplicated(keep=keep, return_inverse=return_inverse) class XS(object): diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index f1703e163917a..a3a7f7f17d332 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -84,6 +84,24 @@ def time_modulo(self, dtype): self.index % 2 +class Duplicated(object): + + goal_time = 0.2 + params = (['first', 'last', False], [True, False]) + param_names = ['keep', 'return_inverse'] + + def setup(self, keep, return_inverse): + if keep is False and return_inverse: + raise NotImplementedError + + n, k = 200, 1000 + base = tm.makeStringIndex(n) + self.idx = Index(base[np.random.choice(n, k * n)]) + + def time_duplicated(self, keep, return_inverse): + self.idx.duplicated(keep=keep, return_inverse=return_inverse) + + class Range(object): goal_time = 0.2 diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 0c92214795557..ac73c2d9c72dc 
100644
--- a/asv_bench/benchmarks/multiindex_object.py
+++ b/asv_bench/benchmarks/multiindex_object.py
@@ -83,17 +83,22 @@ def time_is_monotonic(self):
 class Duplicated(object):
 
     goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']
 
-    def setup(self):
-        n, k = 200, 5000
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
+        n, k = 200, 1000
         levels = [np.arange(n), tm.makeStringIndex(n).values,
                   1000 + np.arange(n)]
         labels = [np.random.choice(n, (k * n)) for lev in levels]
         self.mi = MultiIndex(levels=levels, labels=labels)
 
-    def time_duplicated(self):
-        self.mi.duplicated()
+    def time_duplicated(self, keep, return_inverse):
+        self.mi.duplicated(keep=keep, return_inverse=return_inverse)
 
 
 class Sortlevel(object):
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index a26c5d89bc483..cc08355b61e88 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -192,3 +192,21 @@ def setup(self):
 
     def time_series_datetimeindex_repr(self):
         getattr(self.s, 'a', None)
+
+
+class Duplicated(object):
+
+    goal_time = 0.2
+    params = (['first', 'last', False], [True, False])
+    param_names = ['keep', 'return_inverse']
+
+    def setup(self, keep, return_inverse):
+        if keep is False and return_inverse:
+            raise NotImplementedError
+
+        n, k = 200, 1000
+        base = tm.makeStringIndex(n)
+        self.s = Series(base[np.random.choice(n, k * n)])
+
+    def time_series_duplicated(self, keep, return_inverse):
+        self.s.duplicated(keep=keep, return_inverse=return_inverse)
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 317b8b8878308..c829c2bf1cc1f 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -159,6 +159,56 @@ This is the same behavior as ``Series.values`` for categorical data. See
 :ref:`whatsnew_0240.api_breaking.interval_values` for more.
 
+.. _whatsnew_0240.enhancements.duplicated_inverse:
+
+The ``duplicated``-method has gained the ``return_inverse`` kwarg
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``duplicated``-method for ``Series``, ``DataFrame`` and all flavors of ``Index`` has gained a ``return_inverse`` keyword,
+which is False by default. Specifying ``return_inverse=True`` will add an object to the output (which therefore becomes a tuple)
+that allows reconstructing the original object from the deduplicated, unique subset (:issue:`21357`).
+
+For ``Index`` objects, the inverse is an ``np.ndarray``:
+
+.. ipython:: python
+
+    idx = pd.Index(['a', 'b', 'b', 'c', 'a'])
+    idx.has_duplicates
+    isduplicate, inverse = idx.duplicated(return_inverse=True)  # default: keep='first'
+    isduplicate
+    inverse
+
+This allows reconstructing the original ``Index`` as follows:
+
+.. ipython:: python
+
+    unique = idx[~isduplicate]  # same as idx.drop_duplicates()
+    unique
+
+    reconstruct = unique[inverse]
+    reconstruct.equals(idx)
+
+For ``DataFrame`` and ``Series`` the inverse needs to take into account the original index as well, and is therefore a ``Series``,
+which contains the mapping from the original index to the index of the deduplicated, unique subset.
+
+..
ipython:: python + + df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']}, + index=[1, 4, 9, 16, 25]) + df + isduplicate, inverse = df.duplicated(keep='last', return_inverse=True) + isduplicate + inverse + + unique = df.loc[~isduplicate] # same as df.drop_duplicates(keep='last') + unique + reconstruct = unique.reindex(inverse.values).set_index(inverse.index) + reconstruct.equals(df) + +The keyword works as expected for ``keep='first'|'last'``, but cannot be used together with ``keep=False`` (since discarding all duplicates makes it impossible +to construct an inverse). + + .. _whatsnew_0240.enhancements.other: Other Enhancements diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index e5b6c84d37541..181d2fbc5f601 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -770,7 +770,7 @@ def _value_counts_arraylike(values, dropna): return keys, counts -def duplicated(values, keep='first'): +def duplicated(values, keep='first', return_inverse=False): """ Return boolean ndarray denoting duplicate values. @@ -785,16 +785,67 @@ def duplicated(values, keep='first'): occurrence. - ``last`` : Mark duplicates as ``True`` except for the last occurrence. - - False : Mark all duplicates as ``True``. + - False : Mark all duplicates as ``True``. This option is not + compatible with ``return_inverse``. + return_inverse : boolean, default False + If True, also return the selection of (integer) indices from the array + of unique values (created e.g. by selecting the boolean complement of + the first output, or by using `.drop_duplicates` with the same + `keep`-parameter) that can be used to reconstruct "values". + + .. versionadded:: 0.24.0 Returns ------- - duplicated : ndarray + duplicated : ndarray or tuple of ndarray if ``return_inverse`` is True """ + if return_inverse and keep is False: + raise ValueError("The parameters return_inverse=True and " + "keep=False cannot be used together (impossible " + "to calculate an inverse when discarding all " + "instances of a duplicate).") + values, dtype, ndtype = _ensure_data(values) f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype)) - return f(values, keep=keep) + isdup = f(values, keep=keep) + if not return_inverse: + return isdup + elif not isdup.any(): + # no need to calculate inverse if no duplicates + inv = np.arange(len(values)) + return isdup, inv + + if keep == 'first': + # o2u: original indices to indices of ARRAY of unique values + # u2o: reduplication from array of unique values to original array + # this fits together in the way that values[o2u] are the unique values + # and values[o2u][u2o] == values + _, o2u, u2o = np.unique(values, return_index=True, + return_inverse=True) + elif keep == 'last': + # np.unique takes first occurrence as unique value, + # so we flip values that first becomes last + values = values[::-1] + _, o2u, u2o = np.unique(values, return_index=True, + return_inverse=True) + # the values in "values" correspond(ed) to the index of "values", + # which is simply np.arange(len(values)). + # By flipping "values" around, we need to do the same for the index, + # ___because o2u and u2o are relative to that order___. + # Finally, to fit with the original order again, we need to flip the + # result around one last time. + o2u, u2o = np.arange(len(values))[::-1][o2u], u2o[::-1] + + # np.unique yields a ___sorted___ list of uniques, and o2u/u2o are relative + # to this order. 
To restore the original order, we argsort o2u, because o2u + # would be ordered if np.unique had not sorted implicitly. The first + # argsort gives the permutation from o2u to its sorted form, but we need + # the inverse permutation (the map from the unsorted uniques to o2u, from + # which we can continue with u2o). This inversion (as a permutation) is + # achieved by the second argsort. + inv = np.argsort(np.argsort(o2u))[u2o] + return isdup, inv def mode(values, dropna=True): diff --git a/pandas/core/base.py b/pandas/core/base.py index 084a976320d77..2ab7f92e2dc03 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1246,16 +1246,39 @@ def drop_duplicates(self, keep='first', inplace=False): else: return result - def duplicated(self, keep='first'): + def duplicated(self, keep='first', return_inverse=False): from pandas.core.algorithms import duplicated + + if return_inverse and keep is False: + raise ValueError("The parameters return_inverse=True and " + "keep=False cannot be used together (impossible " + "to calculate an inverse when discarding all " + "instances of a duplicate).") + if isinstance(self, ABCIndexClass): if self.is_unique: - return np.zeros(len(self), dtype=np.bool) - return duplicated(self, keep=keep) - else: + isdup = np.zeros(len(self), dtype=np.bool) + if not return_inverse: + return isdup + return isdup, np.arange(len(self)) + # core.algorithms.duplicated has the same output signature as + # Index.duplicated -> no need to distinguish cases here + return duplicated(self, keep=keep, return_inverse=return_inverse) + + # Series case + if not return_inverse: return self._constructor(duplicated(self, keep=keep), index=self.index).__finalize__(self) + # return_inverse = True + isdup_array, inv_array = duplicated(self, keep=keep, + return_inverse=True) + isdup = self._constructor(isdup_array, + index=self.index).__finalize__(self) + inv = self._constructor(self.loc[~isdup_array].index[inv_array], + index=self.index) + return isdup, inv + # ---------------------------------------------------------------------- # abstracts diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 052952103e28c..49f681771d4db 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4335,7 +4335,7 @@ def drop_duplicates(self, subset=None, keep='first', inplace=False): else: return self[-duplicated] - def duplicated(self, subset=None, keep='first'): + def duplicated(self, subset=None, keep='first', return_inverse=False): """ Return boolean Series denoting duplicate rows, optionally only considering certain columns @@ -4350,14 +4350,150 @@ def duplicated(self, subset=None, keep='first'): first occurrence. - ``last`` : Mark duplicates as ``True`` except for the last occurrence. - - False : Mark all duplicates as ``True``. + - False : Mark all duplicates as ``True``. This option is not + compatible with ``return_inverse``. + return_inverse : boolean, default False + If True, also return the selection from the index from the + DataFrame of unique values (created e.g. by selecting the boolean + complement of the first output, or by using `.drop_duplicates` with + the same `keep`-parameter) and how they relate to the index of the + current DataFrame. This allows to reconstruct the original + DataFrame from the subset of unique values, see example below. + + .. 
versionadded:: 0.24.0 Returns ------- - duplicated : Series + duplicated : Series or tuple of Series if return_inverse is True + + Examples + -------- + By default, for each set of duplicated values, the first occurrence is + set on False and all others on True: + + >>> data = {'species': ['lama', 'cow', 'lama', 'ant', 'lama', 'bee'], + 'type': ['mammal'] * 3 + ['insect', 'mammal', 'insect']} + >>> animals = pd.DataFrame(data, index=[1, 4, 9, 16, 25]) + >>> animals + species type + 1 lama mammal + 4 cow mammal + 9 lama mammal + 16 ant insect + 25 lama mammal + 36 bee insect + >>> + >>> animals.duplicated() # default: keep='first' + 1 False + 4 False + 9 True + 16 False + 25 True + 36 False + dtype: bool + + By using `'last'`, the last occurrence of each set of duplicated values + is set to False and all others to True: + + >>> animals.duplicated(keep='last') + 1 True + 4 False + 9 True + 16 False + 25 False + 36 False + dtype: bool + + By specifying `keep=False`, all duplicates are set to True: + + >>> animals.duplicated(keep=False) + 1 True + 4 False + 9 True + 16 False + 25 True + 36 False + dtype: bool + + By specifying the `subset`-keyword, the duplicates will be calculated + based on just the subset of columns given + + >>> animals.duplicated(subset=['type']) # default: keep='first' + 1 False + 4 True + 9 True + 16 False + 25 True + 36 True + dtype: bool + + Using the keyword `return_inverse=True`, the output becomes a tuple of + `Series`: + + >>> isduplicate, inverse = animals.duplicated(return_inverse=True) + >>> inverse + 1 1 + 4 4 + 9 1 + 16 16 + 25 1 + 36 36 + dtype: int64 + + This can be used to reconstruct the original object from its unique + elements as follows: + + >>> # same as animals.drop_duplicates() + >>> animals_unique = animals.loc[~isduplicate] + >>> animals_unique + species type + 1 lama mammal + 4 cow mammal + 16 ant insect + 36 bee insect + >>> + >>> reconstruct = animals_unique.reindex(inverse) + >>> reconstruct + species type + 1 lama mammal + 4 cow mammal + 1 lama mammal + 16 ant insect + 1 lama mammal + 36 bee insect + + We see that the values of `animals` get reconstructed correctly, but + the index does not match yet -- consequently, the last step is to + correctly set the index. 
+ + >>> reconstruct = reconstruct.set_index(inverse.index) + >>> reconstruct + species type + 1 lama mammal + 4 cow mammal + 9 lama mammal + 16 ant insect + 25 lama mammal + 36 bee insect + >>> + >>> reconstruct.equals(animals) + True + + See Also + -------- + pandas.Index.duplicated : Equivalent method on pandas.Index + pandas.Series.duplicated : Equivalent method on pandas.Series + pandas.DataFrame.drop_duplicates : Remove duplicate values """ from pandas.core.sorting import get_group_index - from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT + from pandas._libs.hashtable import _SIZE_HINT_LIMIT + from pandas.core.algorithms import duplicated + + if return_inverse and keep is False: + raise ValueError("The parameters return_inverse=True and " + "keep=False cannot be used together (impossible " + "to calculate an inverse when discarding all " + "instances of a duplicate).") def f(vals): labels, shape = algorithms.factorize( @@ -4383,7 +4519,15 @@ def f(vals): labels, shape = map(list, zip(*map(f, vals))) ids = get_group_index(labels, shape, sort=False, xnull=False) - return Series(duplicated_int64(ids, keep), index=self.index) + if not return_inverse: + return Series(duplicated(ids, keep=keep), index=self.index) + + # return_inverse = True + isdup_array, inv_array = duplicated(ids, keep=keep, + return_inverse=True) + isdup = Series(isdup_array, index=self.index) + inv = Series(self.loc[~isdup_array].index[inv_array], index=self.index) + return isdup, inv # ---------------------------------------------------------------------- # Sorting diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 34cfa0b23f082..2a5e4958596f5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4568,7 +4568,7 @@ def drop_duplicates(self, keep='first'): """ return super(Index, self).drop_duplicates(keep=keep) - def duplicated(self, keep='first'): + def duplicated(self, keep='first', return_inverse=False): """ Indicate duplicate index values. @@ -4585,7 +4585,20 @@ def duplicated(self, keep='first'): occurrence. - 'last' : Mark duplicates as ``True`` except for the last occurrence. - - ``False`` : Mark all duplicates as ``True``. + - ``False`` : Mark all duplicates as ``True``. This option is not + compatible with ``return_inverse``. + return_inverse : boolean, default False + If True, also return the selection of (integer) indices from the + Index with unique values (created e.g. by selecting the boolean + complement of the first output, or by using `.drop_duplicates` with + the same `keep`-parameter). This allows to reconstruct the original + Index from the subset of unique values, see example below. + + .. 
versionadded:: 0.24.0
+
+        Returns
+        -------
+        duplicated : ndarray or tuple of ndarray if return_inverse is True
 
         Examples
         --------
@@ -4601,20 +4614,37 @@ def duplicated(self, keep='first', return_inverse=False):
         >>> idx.duplicated(keep='first')
         array([False, False,  True, False,  True])
 
-        By using 'last', the last occurrence of each set of duplicated values
-        is set on False and all others on True:
+        By using `'last'`, the last occurrence of each set of duplicated values
+        is set to False and all others to True:
 
         >>> idx.duplicated(keep='last')
         array([ True, False,  True, False, False])
 
-        By setting keep on ``False``, all duplicates are True:
+        By specifying `keep=False`, all duplicates are set to True:
 
         >>> idx.duplicated(keep=False)
         array([ True, False,  True, False,  True])
 
-        Returns
-        -------
-        numpy.ndarray
+        Using the keyword `return_inverse=True`, the output becomes a tuple of
+        `np.ndarray`:
+
+        >>> isduplicate, inverse = idx.duplicated(return_inverse=True)
+        >>> inverse
+        array([0, 1, 0, 2, 0], dtype=int64)
+
+        This can be used to reconstruct the original object from its unique
+        elements as follows:
+
+        >>> idx_unique = idx[~isduplicate]  # same as idx.drop_duplicates()
+        >>> idx_unique
+        Index(['lama', 'cow', 'beetle'], dtype='object')
+        >>>
+        >>> reconstruct = idx_unique[inverse]
+        >>> reconstruct
+        Index(['lama', 'cow', 'lama', 'beetle', 'lama'], dtype='object')
+        >>>
+        >>> reconstruct.equals(idx)
+        True
 
         See Also
         --------
@@ -4622,7 +4652,8 @@ def duplicated(self, keep='first', return_inverse=False):
         pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame
         pandas.Index.drop_duplicates : Remove duplicate values from Index
         """
-        return super(Index, self).duplicated(keep=keep)
+        return super(Index, self).duplicated(keep=keep,
+                                             return_inverse=return_inverse)
 
     _index_shared_docs['fillna'] = """
         Fill NA/NaN values with the specified value
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index ab180a13ab4f3..0a63fe5d08ccd 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -392,10 +392,10 @@ def unique(self, level=None):
                                 ordered=result.ordered)
 
     @Appender(Index.duplicated.__doc__)
-    def duplicated(self, keep='first'):
-        from pandas._libs.hashtable import duplicated_int64
+    def duplicated(self, keep='first', return_inverse=False):
+        from pandas.core.algorithms import duplicated
         codes = self.codes.astype('i8')
-        return duplicated_int64(codes, keep)
+        return duplicated(codes, keep=keep, return_inverse=return_inverse)
 
     def _to_safe_for_reshape(self):
         """ convert to object if we are a categorical """
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 5b2e3a76adf05..2f85beb107af4 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -903,14 +903,19 @@ def f(k, stringify):
             return hash_tuple(key)
 
     @Appender(Index.duplicated.__doc__)
-    def duplicated(self, keep='first'):
+    def duplicated(self, keep='first', return_inverse=False):
         from pandas.core.sorting import get_group_index
-        from pandas._libs.hashtable import duplicated_int64
+        from pandas.core.algorithms import duplicated
+
+        if return_inverse and keep is False:
+            raise ValueError("The parameters return_inverse=True and "
+                             "keep=False cannot be used together (impossible "
+                             "to calculate an inverse when discarding all "
+                             "instances of a duplicate).")
 
         shape = map(len, self.levels)
         ids = get_group_index(self.labels, shape, sort=False, xnull=False)
-
-        return duplicated_int64(ids, keep)
+        return duplicated(ids, keep=keep, return_inverse=return_inverse)
 
     def fillna(self, value=None, downcast=None):
         """
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 4558314d612d0..476221ac54993 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1590,7 +1590,7 @@ def drop_duplicates(self, keep='first', inplace=False):
         """
         return super(Series, self).drop_duplicates(keep=keep, inplace=inplace)
 
-    def duplicated(self, keep='first'):
+    def duplicated(self, keep='first', return_inverse=False):
         """
         Indicate duplicate Series values.
 
@@ -1605,56 +1605,115 @@ def duplicated(self, keep='first', return_inverse=False):
             occurrence.
         - 'last' : Mark duplicates as ``True`` except for the last
             occurrence.
-        - ``False`` : Mark all duplicates as ``True``.
+        - ``False`` : Mark all duplicates as ``True``. This option is not
+            compatible with ``return_inverse``.
+        return_inverse : boolean, default False
+            If True, also return the selection from the index of the Series
+            of unique values (created e.g. by selecting the boolean complement
+            of the first output, or by using `.drop_duplicates` with the same
+            `keep`-parameter) and how they relate to the index of the current
+            Series. This allows to reconstruct the original Series from the
+            subset of unique values, see example below.
+
+            .. versionadded:: 0.24.0
+
+        Returns
+        -------
+        duplicated : Series or tuple of Series if return_inverse is True
 
         Examples
         --------
         By default, for each set of duplicated values, the first occurrence is
         set on False and all others on True:
 
-        >>> animals = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama'])
+        >>> animals = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama'],
+                                index=[1, 4, 9, 16, 25])
         >>> animals.duplicated()
-        0    False
-        1    False
-        2     True
-        3    False
-        4     True
+        1     False
+        4     False
+        9      True
+        16    False
+        25     True
         dtype: bool
 
         which is equivalent to
 
         >>> animals.duplicated(keep='first')
-        0    False
-        1    False
-        2     True
-        3    False
-        4     True
+        1     False
+        4     False
+        9      True
+        16    False
+        25     True
         dtype: bool
 
-        By using 'last', the last occurrence of each set of duplicated values
-        is set on False and all others on True:
+        By using `'last'`, the last occurrence of each set of duplicated values
+        is set to False and all others to True:
 
         >>> animals.duplicated(keep='last')
-        0     True
-        1    False
-        2     True
-        3    False
-        4    False
+        1      True
+        4     False
+        9      True
+        16    False
+        25    False
         dtype: bool
 
-        By setting keep on ``False``, all duplicates are True:
+        By specifying `keep=False`, all duplicates are set to True:
 
         >>> animals.duplicated(keep=False)
-        0     True
-        1    False
-        2     True
-        3    False
-        4     True
+        1      True
+        4     False
+        9      True
+        16    False
+        25     True
         dtype: bool
 
-        Returns
-        -------
-        pandas.core.series.Series
+        Using the keyword `return_inverse=True`, the output becomes a tuple of
+        `Series`:
+
+        >>> isduplicate, inverse = animals.duplicated(return_inverse=True)
+        >>> inverse
+        1      1
+        4      4
+        9      1
+        16    16
+        25     1
+        dtype: int64
+
+        This can be used to reconstruct the original object from its unique
+        elements as follows:
+
+        >>> # same as animals.drop_duplicates()
+        >>> animals_unique = animals.loc[~isduplicate]
+        >>> animals_unique
+        1       lama
+        4        cow
+        16    beetle
+        dtype: object
+        >>>
+        >>> reconstruct = animals_unique.reindex(inverse)
+        >>> reconstruct
+        1       lama
+        4        cow
+        1       lama
+        16    beetle
+        1       lama
+        dtype: object
+
+        We see that the values of `animals` get reconstructed correctly, but
+        the index does not match yet -- consequently, the last step is to
+        correctly set the index.
+ + >>> reconstruct.index = inverse.index + >>> reconstruct + 1 lama + 4 cow + 9 lama + 16 beetle + 25 lama + dtype: object + >>> + >>> reconstruct.equals(animals) + True See Also -------- @@ -1662,7 +1721,8 @@ def duplicated(self, keep='first'): pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame pandas.Series.drop_duplicates : Remove duplicate values from Series """ - return super(Series, self).duplicated(keep=keep) + return super(Series, self).duplicated(keep=keep, + return_inverse=return_inverse) def idxmin(self, axis=0, skipna=True, *args, **kwargs): """ diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py index 940692ec5b46a..43f312fa3ebb8 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/test_duplicates.py @@ -88,6 +88,75 @@ def test_duplicated_subset(subset, keep): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize('keep, expected_inv_values', [ + ('first', [1, 4, 4, 16, 1]), + ('last', [25, 9, 9, 16, 25]) +]) +def test_duplicated_inverse(keep, expected_inv_values): + # check that return_inverse kwarg does not affect outcome; + # index of inverse must be correctly transformed as well + idx = [1, 4, 9, 16, 25] + df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']}, + index=idx) + + expected_isdup = df.duplicated(keep=keep) + expected_inv = Series(expected_inv_values, index=idx) + result_isdup, result_inv = df.duplicated(keep=keep, + return_inverse=True) + tm.assert_series_equal(result_isdup, expected_isdup) + tm.assert_series_equal(result_inv, expected_inv) + + # test that result_inv works (and fits together with expected_isdup) + unique = df.loc[~expected_isdup] + reconstr = unique.reindex(result_inv).set_index(result_inv.index) + tm.assert_frame_equal(reconstr, df) + + +def test_duplicated_inverse_raises(): + df = DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']}) + + rgx = 'The parameters return_inverse=True and keep=False cannot be.*' + with tm.assert_raises_regex(ValueError, rgx): + df.duplicated(keep=False, return_inverse=True) + + +@pytest.mark.parametrize('keep', ['first', 'last']) +@pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A']) +def test_duplicated_inverse_large(subset, keep): + # unsorted index important to check 'first'/'last' functionality + df = DataFrame(np.random.randint(0, 10, (10000, 3)), + columns=list('ABC')).sample(5000) + + expected_isdup = df.duplicated(keep=keep, subset=subset) + result_isdup, result_inv = df.duplicated(keep=keep, subset=subset, + return_inverse=True) + tm.assert_series_equal(result_isdup, expected_isdup) + + if subset is None: + subset = list(df.columns) + elif isinstance(subset, string_types): + # need to have a DataFrame, not a Series + # -> select columns with singleton list, not string + subset = [subset] + + unique = df.loc[~expected_isdup, subset] + reconstr = unique.reindex(result_inv).set_index(result_inv.index) + tm.assert_frame_equal(reconstr, df[subset]) + + +@pytest.mark.parametrize('keep', ['first', 'last']) +def test_duplicated_inverse_fastpath(keep): + df = DataFrame({'A': range(10)}) # no duplicates + + expected_isdup = df.duplicated(keep=keep) + result_isdup, result_inv = df.duplicated(keep=keep, + return_inverse=True) + tm.assert_series_equal(result_isdup, expected_isdup) + + expected_inv = Series(range(10)) + tm.assert_series_equal(result_inv, expected_inv) + + def test_drop_duplicates(): df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'bar', 'foo'], diff --git 
a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 56f59851d6d04..ba6f10e521824 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -378,6 +378,82 @@ def test_duplicated(self, indices, keep): result = idx.duplicated(keep=keep) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize('keep', ['first', 'last']) + def test_duplicated_inverse(self, indices, keep): + # check that return_inverse kwarg does not affect outcome + if type(indices) is not self._holder: + pytest.skip('Can only check if we have the correct type') + if not len(indices) or isinstance(indices, MultiIndex): + # MultiIndex tested separately in: + # tests/indexes/multi/test_unique_and_duplicates + pytest.skip('Skip check for empty Index and MultiIndex') + + idx = self._holder(indices) + if idx.has_duplicates: + # We need to be able to control creation of duplicates here + # This is slightly circular, as drop_duplicates depends on + # duplicated, but in the end, it all works out because we + # cross-check with Series.duplicated + idx = idx.drop_duplicates() + + n, k = len(idx), 10 + duplicated_selection = np.random.choice(n, k * n) + idx = self._holder(idx.values[duplicated_selection]) + + expected_isdup = idx.duplicated(keep=keep) + if keep == 'first': + _, tmp_ind, tmp_inv = np.unique(idx, return_index=True, + return_inverse=True) + else: # 'last' + # switch order before calling unique then restore correct ordering + # for tmp_ind, tmp_inv + _, tmp_ind, tmp_inv = np.unique(idx[::-1], return_index=True, + return_inverse=True) + tmp_ind = np.arange(len(idx))[::-1][tmp_ind] + tmp_inv = tmp_inv[::-1] + # explanation in pandas.core.algorithms.duplicated + expected_inv = np.argsort(np.argsort(tmp_ind))[tmp_inv] + + result_isdup, result_inv = idx.duplicated(keep=keep, + return_inverse=True) + tm.assert_numpy_array_equal(result_isdup, expected_isdup) + tm.assert_numpy_array_equal(result_inv, expected_inv) + + # test that result_inv works (and fits together with expected_isdup) + unique = idx[~expected_isdup] + reconstr = unique[result_inv] + tm.assert_index_equal(reconstr, idx) + + def test_duplicated_inverse_raises(self, indices): + if type(indices) is not self._holder: + pytest.skip('Can only check if we have the correct type') + + rgx = 'The parameters return_inverse=True and keep=False cannot be.*' + with tm.assert_raises_regex(ValueError, rgx): + self._holder(indices).duplicated(keep=False, return_inverse=True) + + @pytest.mark.parametrize('keep', ['first', 'last']) + def test_duplicated_inverse_fastpath(self, indices, keep): + if type(indices) is not self._holder: + pytest.skip('Can only check if we have the correct type') + if not len(indices) or isinstance(indices, MultiIndex): + # MultiIndex tested separately in: + # tests/indexes/multi/test_unique_and_duplicates + pytest.skip('Skip check for empty Index and MultiIndex') + + idx = self._holder(indices) + if idx.has_duplicates: + # fastpath only possible if no duplicates + idx = idx.drop_duplicates() + + expected_isdup = idx.duplicated(keep=keep) + result_isdup, result_inv = idx.duplicated(keep=keep, + return_inverse=True) + tm.assert_numpy_array_equal(result_isdup, expected_isdup) + + expected_inv = np.arange(len(idx)) + tm.assert_numpy_array_equal(result_inv, expected_inv) + def test_unique(self, indices): # don't test a MultiIndex here (as its tested separated) # don't test a CategoricalIndex because categories change (GH 18291) diff --git a/pandas/tests/indexes/multi/test_duplicates.py 
b/pandas/tests/indexes/multi/test_duplicates.py index 1cdf0ca6e013e..171e9a3772fd9 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -212,29 +212,6 @@ def f(a): check(8, True) -@pytest.mark.parametrize('keep, expected', [ - ('first', np.array([False, False, False, True, True, False])), - ('last', np.array([False, True, True, False, False, False])), - (False, np.array([False, True, True, True, True, False])) -]) -def test_duplicated(idx_dup, keep, expected): - result = idx_dup.duplicated(keep=keep) - tm.assert_numpy_array_equal(result, expected) - - -@pytest.mark.parametrize('keep', ['first', 'last', False]) -def test_duplicated_large(keep): - # GH 9125 - n, k = 200, 5000 - levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)] - labels = [np.random.choice(n, k * n) for lev in levels] - mi = MultiIndex(levels=levels, labels=labels) - - result = mi.duplicated(keep=keep) - expected = hashtable.duplicated_object(mi.values, keep=keep) - tm.assert_numpy_array_equal(result, expected) - - def test_get_duplicates(): # GH5873 for a in [101, 102]: @@ -264,3 +241,84 @@ def test_get_duplicates(): tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(len(mi), dtype='bool')) + + +@pytest.mark.parametrize('keep, expected', [ + ('first', np.array([False, False, False, True, True, False])), + ('last', np.array([False, True, True, False, False, False])), + (False, np.array([False, True, True, True, True, False])) +]) +def test_duplicated(idx_dup, keep, expected): + result = idx_dup.duplicated(keep=keep) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize('keep', ['first', 'last', False]) +def test_duplicated_large(keep): + # GH 9125 + n, k = 200, 5000 + levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)] + labels = [np.random.choice(n, k * n) for lev in levels] + mi = MultiIndex(levels=levels, labels=labels) + + result = mi.duplicated(keep=keep) + expected = hashtable.duplicated_object(mi.values, keep=keep) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize('keep', ['first', 'last']) +def test_duplicated_inverse(idx_dup, keep): + # check that return_inverse kwarg does not affect outcome; + # index of inverse must be correctly transformed as well + + expected_isdup = idx_dup.duplicated(keep=keep) + expected_inv = np.array([0, 1, 2, 1, 2, 3], dtype='int64') + result_isdup, result_inv = idx_dup.duplicated(keep=keep, + return_inverse=True) + tm.assert_numpy_array_equal(result_isdup, expected_isdup) + tm.assert_numpy_array_equal(result_inv, expected_inv) + + # test that result_inv works (and fits together with expected_isdup) + unique = MultiIndex.from_tuples(idx_dup.values[~expected_isdup]) + reconstr = MultiIndex.from_tuples(unique.values[result_inv], + names=idx_dup.names) + tm.assert_index_equal(reconstr, idx_dup) + + +def test_duplicated_inverse_raises(idx_dup): + rgx = 'The parameters return_inverse=True and keep=False cannot be.*' + with tm.assert_raises_regex(ValueError, rgx): + idx_dup.duplicated(keep=False, return_inverse=True) + + +@pytest.mark.parametrize('keep', ['first', 'last']) +def test_duplicated_inverse_large(keep): + n, k = 200, 5000 + levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)] + labels = [np.random.choice(n, k * n) for lev in levels] + mi = MultiIndex(levels=levels, labels=labels) + + expected_isdup = mi.duplicated(keep=keep) + result_isdup, result_inv = mi.duplicated(keep=keep, + return_inverse=True) + 
tm.assert_numpy_array_equal(result_isdup, expected_isdup) + + # test that result_inv works (and fits together with expected_isdup) + unique = MultiIndex.from_tuples(mi.values[~expected_isdup]) + reconstr = MultiIndex.from_tuples(unique.values[result_inv], + names=mi.names) + tm.assert_index_equal(reconstr, mi) + + +@pytest.mark.parametrize('keep', ['first', 'last']) +def test_duplicated_inverse_fastpath(idx_dup, keep): + # fastpath is only taken if there are no duplicates + mi = idx_dup.drop_duplicates() + + expected_isdup = mi.duplicated(keep=keep) + result_isdup, result_inv = mi.duplicated(keep=keep, + return_inverse=True) + tm.assert_numpy_array_equal(result_isdup, expected_isdup) + + expected_inv = np.arange(4) + tm.assert_numpy_array_equal(result_inv, expected_inv) diff --git a/pandas/tests/series/test_duplicates.py b/pandas/tests/series/test_duplicates.py index 2e4d64188307c..8f264f85c8cda 100644 --- a/pandas/tests/series/test_duplicates.py +++ b/pandas/tests/series/test_duplicates.py @@ -138,3 +138,66 @@ def test_duplicated_nan_none(keep, expected): result = s.duplicated(keep=keep) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize('keep, expected_inv_values', [ + ('first', [1, 4, 4, 16, 1]), + ('last', [25, 9, 9, 16, 25]) +]) +def test_duplicated_inverse(keep, expected_inv_values): + # check that return_inverse kwarg does not affect outcome; + # index of inverse must be correctly transformed as well + idx = [1, 4, 9, 16, 25] + s = Series(['a', 'b', 'b', 'c', 'a'], index=idx) + + expected_isdup = s.duplicated(keep=keep) + expected_inv = Series(expected_inv_values, index=idx) + result_isdup, result_inv = s.duplicated(keep=keep, + return_inverse=True) + tm.assert_series_equal(result_isdup, expected_isdup) + tm.assert_series_equal(result_inv, expected_inv) + + # test that result_inv works (and fits together with expected_isdup) + unique = s.loc[~expected_isdup] + reconstr = unique.reindex(result_inv) + # Series has no set_index (GH21684) + reconstr.index = result_inv.index + tm.assert_series_equal(reconstr, s) + + +def test_duplicated_inverse_raises(): + s = Series(['a', 'b', 'b', 'c', 'a']) + + rgx = 'The parameters return_inverse=True and keep=False cannot be.*' + with tm.assert_raises_regex(ValueError, rgx): + s.duplicated(keep=False, return_inverse=True) + + +@pytest.mark.parametrize('keep', ['first', 'last']) +def test_duplicated_inverse_large(keep): + # unsorted index important to check 'first'/'last' functionality + s = Series(np.random.randint(0, 1000, 10000)).sample(5000) + + expected_isdup = s.duplicated(keep=keep) + result_isdup, result_inv = s.duplicated(keep=keep, return_inverse=True) + tm.assert_series_equal(result_isdup, expected_isdup) + + # test that result_inv works (and fits together with expected_isdup) + unique = s.loc[~expected_isdup] + reconstr = unique.reindex(result_inv) + # Series has no set_index (GH21684) + reconstr.index = result_inv.index + tm.assert_series_equal(reconstr, s) + + +@pytest.mark.parametrize('keep', ['first', 'last']) +def test_duplicated_inverse_fastpath(keep): + s = Series(range(10)) # no duplicates + + expected_isdup = s.duplicated(keep=keep) + result_isdup, result_inv = s.duplicated(keep=keep, + return_inverse=True) + tm.assert_series_equal(result_isdup, expected_isdup) + + expected_inv = Series(range(10)) + tm.assert_series_equal(result_inv, expected_inv) From 6e005c2438ea0149570c112c42a4fe847fdc85f3 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Thu, 30 Aug 2018 23:38:10 +0200 Subject: [PATCH 2/7] Review (gfyoung) --- doc/source/whatsnew/v0.24.0.txt | 10 +++------- pandas/core/frame.py | 7 +++++++ pandas/core/indexes/base.py | 7 +++++++ pandas/core/series.py | 7 +++++++ pandas/tests/frame/test_duplicates.py | 13 ++++--------- pandas/tests/indexes/common.py | 1 + pandas/tests/indexes/multi/test_duplicates.py | 1 + pandas/tests/series/test_duplicates.py | 1 + 8 files changed, 31 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index c829c2bf1cc1f..16420b8ce562a 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -161,11 +161,11 @@ This is the same behavior as ``Series.values`` for categorical data. See .. _whatsnew_0240.enhancements.duplicated_inverse: -The ``duplicated``-method has gained the ``return_inverse`` kwarg +The `duplicated`-method has gained the `return_inverse` kwarg ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The ``duplicated``-method for ``Series``, ``DataFrame`` and all flavors of ``Index`` has gained a ``return_inverse`` keyword, -which is False by default. Specifying ``return_inverse=True`` will add an object to the output (which therefore becomes a tuple) +The `duplicated`-method for `Series`, `DataFrame` and all flavors of `Index` has gained a `return_inverse` keyword, +which is ``False`` by default. Specifying ``return_inverse=True`` will add an object to the output (which therefore becomes a tuple) that allows reconstructing the original object from the deduplicated, unique subset (:issue:`21357`). For ``Index`` objects, the inverse is an ``np.ndarray``: @@ -173,7 +173,6 @@ For ``Index`` objects, the inverse is an ``np.ndarray``: .. ipython:: python idx = pd.Index(['a', 'b', 'b', 'c', 'a']) - idx.has_duplicates isduplicate, inverse = idx.duplicated(return_inverse=True) # default: keep='first' isduplicate inverse @@ -205,9 +204,6 @@ which contains the mapping from the index of the deduplicated, unique subset bac reconstruct = unique.reindex(inverse.values).set_index(inverse.index) reconstruct.equals(df) -The keyword works as expected for ``keep='first'|'last'``, but cannot be used together with ``keep=False`` (since discarding all duplicates makes it impossible -to construct an inverse). - .. _whatsnew_0240.enhancements.other: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 49f681771d4db..6e01f72443eb2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4366,6 +4366,13 @@ def duplicated(self, subset=None, keep='first', return_inverse=False): ------- duplicated : Series or tuple of Series if return_inverse is True + Notes + ----- + The `return_inverse`-keyword works as expected for + ``keep='first'|'last'``, but cannot be used together with + ``keep=False`` (since discarding all duplicates makes it impossible to + construct an inverse). 
+
         Examples
         --------
         By default, for each set of duplicated values, the first occurrence is
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 2a5e4958596f5..f86f8854fce41 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -4600,6 +4600,13 @@ def duplicated(self, keep='first', return_inverse=False):
         -------
         duplicated : ndarray or tuple of ndarray if return_inverse is True
 
+        Notes
+        -----
+        The `return_inverse`-keyword works as expected for
+        ``keep='first'|'last'``, but cannot be used together with
+        ``keep=False`` (since discarding all duplicates makes it impossible to
+        construct an inverse).
+
         Examples
         --------
         By default, for each set of duplicated values, the first occurrence is
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 476221ac54993..2ed94987ec8a7 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1621,6 +1621,13 @@ def duplicated(self, keep='first', return_inverse=False):
         -------
         duplicated : Series or tuple of Series if return_inverse is True
 
+        Notes
+        -----
+        The `return_inverse`-keyword works as expected for
+        ``keep='first'|'last'``, but cannot be used together with
+        ``keep=False`` (since discarding all duplicates makes it impossible to
+        construct an inverse).
+
         Examples
         --------
         By default, for each set of duplicated values, the first occurrence is
diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py
index 43f312fa3ebb8..6e276d03dab38 100644
--- a/pandas/tests/frame/test_duplicates.py
+++ b/pandas/tests/frame/test_duplicates.py
@@ -93,6 +93,7 @@ def test_duplicated_subset(subset, keep):
     ('last', [25, 9, 9, 16, 25])
 ])
 def test_duplicated_inverse(keep, expected_inv_values):
+    # GH 21357
     # check that return_inverse kwarg does not affect outcome;
     # index of inverse must be correctly transformed as well
     idx = [1, 4, 9, 16, 25]
@@ -121,9 +122,10 @@ def test_duplicated_inverse_raises():
 
 
 @pytest.mark.parametrize('keep', ['first', 'last'])
-@pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A'])
+@pytest.mark.parametrize('subset', [['A', 'B', 'C'], ['A', 'B'], ['A']])
 def test_duplicated_inverse_large(subset, keep):
-    # unsorted index important to check 'first'/'last' functionality
+    # unsorted index (through .sample); important to check correct
+    # 'first'/'last' functionality of return_inverse
     df = DataFrame(np.random.randint(0, 10, (10000, 3)),
                    columns=list('ABC')).sample(5000)
 
@@ -132,13 +134,6 @@ def test_duplicated_inverse_large(subset, keep):
                                               return_inverse=True)
     tm.assert_series_equal(result_isdup, expected_isdup)
 
-    if subset is None:
-        subset = list(df.columns)
-    elif isinstance(subset, string_types):
-        # need to have a DataFrame, not a Series
-        # -> select columns with singleton list, not string
-        subset = [subset]
-
     unique = df.loc[~expected_isdup, subset]
     reconstr = unique.reindex(result_inv).set_index(result_inv.index)
     tm.assert_frame_equal(reconstr, df[subset])
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index ba6f10e521824..3f2430738e7f9 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -380,6 +380,7 @@ def test_duplicated(self, indices, keep):
 
     @pytest.mark.parametrize('keep', ['first', 'last'])
     def test_duplicated_inverse(self, indices, keep):
+        # GH 21357
         # check that return_inverse kwarg does not affect outcome
         if type(indices) is not self._holder:
             pytest.skip('Can only check if we have the correct type')
diff --git a/pandas/tests/indexes/multi/test_duplicates.py
b/pandas/tests/indexes/multi/test_duplicates.py index 171e9a3772fd9..44fcbbc692bb9 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -268,6 +268,7 @@ def test_duplicated_large(keep): @pytest.mark.parametrize('keep', ['first', 'last']) def test_duplicated_inverse(idx_dup, keep): + # GH 21357 # check that return_inverse kwarg does not affect outcome; # index of inverse must be correctly transformed as well diff --git a/pandas/tests/series/test_duplicates.py b/pandas/tests/series/test_duplicates.py index 8f264f85c8cda..61a628081702d 100644 --- a/pandas/tests/series/test_duplicates.py +++ b/pandas/tests/series/test_duplicates.py @@ -145,6 +145,7 @@ def test_duplicated_nan_none(keep, expected): ('last', [25, 9, 9, 16, 25]) ]) def test_duplicated_inverse(keep, expected_inv_values): + # GH 21357 # check that return_inverse kwarg does not affect outcome; # index of inverse must be correctly transformed as well idx = [1, 4, 9, 16, 25] From 16c3103a36d174fd49a7dcc9892c497a24238b97 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 30 Aug 2018 23:46:13 +0200 Subject: [PATCH 3/7] Removed explicit test for array of inverse; tested implicitly --- pandas/tests/indexes/common.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 3f2430738e7f9..cc35744a69002 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -402,23 +402,9 @@ def test_duplicated_inverse(self, indices, keep): idx = self._holder(idx.values[duplicated_selection]) expected_isdup = idx.duplicated(keep=keep) - if keep == 'first': - _, tmp_ind, tmp_inv = np.unique(idx, return_index=True, - return_inverse=True) - else: # 'last' - # switch order before calling unique then restore correct ordering - # for tmp_ind, tmp_inv - _, tmp_ind, tmp_inv = np.unique(idx[::-1], return_index=True, - return_inverse=True) - tmp_ind = np.arange(len(idx))[::-1][tmp_ind] - tmp_inv = tmp_inv[::-1] - # explanation in pandas.core.algorithms.duplicated - expected_inv = np.argsort(np.argsort(tmp_ind))[tmp_inv] - result_isdup, result_inv = idx.duplicated(keep=keep, return_inverse=True) tm.assert_numpy_array_equal(result_isdup, expected_isdup) - tm.assert_numpy_array_equal(result_inv, expected_inv) # test that result_inv works (and fits together with expected_isdup) unique = idx[~expected_isdup] From 18a0de7367f50c1a0558a99a723ba3a7c1c9702c Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 31 Aug 2018 07:54:54 +0200 Subject: [PATCH 4/7] Improve comments, fix doc string errors --- pandas/core/frame.py | 4 ++-- pandas/core/series.py | 2 +- pandas/tests/indexes/common.py | 4 +++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 181dd8ab7f9bf..4a40716fbee76 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4382,8 +4382,8 @@ def duplicated(self, subset=None, keep='first', return_inverse=False): set on False and all others on True: >>> data = {'species': ['lama', 'cow', 'lama', 'ant', 'lama', 'bee'], - 'type': ['mammal'] * 3 + ['insect', 'mammal', 'insect']} - >>> animals = pd.DataFrame(data, index=[1, 4, 9, 16, 25]) + ... 
'type': ['mammal'] * 3 + ['insect', 'mammal', 'insect']} + >>> animals = pd.DataFrame(data, index=[1, 4, 9, 16, 25, 36]) >>> animals species type 1 lama mammal diff --git a/pandas/core/series.py b/pandas/core/series.py index b473a70fb8258..19260efc51bd9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1633,7 +1633,7 @@ def duplicated(self, keep='first', return_inverse=False): set on False and all others on True: >>> animals = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama'], - index=[1, 4, 9, 16, 25]) + ... index=[1, 4, 9, 16, 25]) >>> animals.duplicated() 1 False 4 False diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index cc35744a69002..544a4855fdb32 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -406,7 +406,9 @@ def test_duplicated_inverse(self, indices, keep): return_inverse=True) tm.assert_numpy_array_equal(result_isdup, expected_isdup) - # test that result_inv works (and fits together with expected_isdup) + # the following tests the correctness of result_inv in two ways: + # - it needs to fit together with expected_isdup + # - it needs to correctly reconstruct the object unique = idx[~expected_isdup] reconstr = unique[result_inv] tm.assert_index_equal(reconstr, idx) From db8693f97b84432beb8799c6312143bf2d078265 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Tue, 11 Sep 2018 08:27:03 +0200 Subject: [PATCH 5/7] Review (jorisvandenbossche) --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/core/frame.py | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index d8820c87221e5..01bebbdeb8cc5 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -164,7 +164,7 @@ This is the same behavior as ``Series.values`` for categorical data. See The `duplicated`-method has gained the `return_inverse` kwarg ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The `duplicated`-method for `Series`, `DataFrame` and all flavors of `Index` has gained a `return_inverse` keyword, +The ``duplicated``-method for ``Series``, ``DataFrame`` and all flavors of ``Index`` has gained a ``return_inverse`` keyword, which is ``False`` by default. Specifying ``return_inverse=True`` will add an object to the output (which therefore becomes a tuple) that allows reconstructing the original object from the deduplicated, unique subset (:issue:`21357`). diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 96996893278e0..449ab241886ad 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4382,12 +4382,11 @@ def duplicated(self, subset=None, keep='first', return_inverse=False): - False : Mark all duplicates as ``True``. This option is not compatible with ``return_inverse``. return_inverse : boolean, default False - If True, also return the selection from the index from the - DataFrame of unique values (created e.g. by selecting the boolean - complement of the first output, or by using `.drop_duplicates` with - the same `keep`-parameter) and how they relate to the index of the - current DataFrame. This allows to reconstruct the original - DataFrame from the subset of unique values, see example below. + If True, also return a Series mapping the index of the current + DataFrame to the index after deduplication (created e.g. by using + `.drop_duplicates` or by selecting everything that is not + duplicate). 
This allows to reconstruct the original DataFrame from + the subset of deduplicated (=unique) values, see example below. .. versionadded:: 0.24.0 From d4e803e09d58d94d346215ec62722c7ee2d5df4b Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Thu, 13 Sep 2018 08:35:34 +0200 Subject: [PATCH 6/7] Add reference to method in whatsnew --- doc/source/whatsnew/v0.24.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 01bebbdeb8cc5..b10e7d0c3d058 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -164,7 +164,7 @@ This is the same behavior as ``Series.values`` for categorical data. See The `duplicated`-method has gained the `return_inverse` kwarg ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The ``duplicated``-method for ``Series``, ``DataFrame`` and all flavors of ``Index`` has gained a ``return_inverse`` keyword, +The :meth:`~DataFrame.duplicated`-method for ``Series``, ``DataFrame`` and all flavors of ``Index`` has gained a ``return_inverse`` keyword, which is ``False`` by default. Specifying ``return_inverse=True`` will add an object to the output (which therefore becomes a tuple) that allows reconstructing the original object from the deduplicated, unique subset (:issue:`21357`). From b8cbb13938e4103ffde8b8c8698fe1c971a08199 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sun, 23 Sep 2018 23:40:43 +0200 Subject: [PATCH 7/7] Review (jreback) --- pandas/core/algorithms.py | 54 ++++++++++++++++++++------------------- pandas/core/base.py | 21 +++++++-------- pandas/core/frame.py | 11 ++++---- 3 files changed, 45 insertions(+), 41 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 181d2fbc5f601..647af764caeab 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -808,44 +808,46 @@ def duplicated(values, keep='first', return_inverse=False): values, dtype, ndtype = _ensure_data(values) f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype)) - isdup = f(values, keep=keep) + isduplicate = f(values, keep=keep) if not return_inverse: - return isdup - elif not isdup.any(): + return isduplicate + elif not isduplicate.any(): # no need to calculate inverse if no duplicates - inv = np.arange(len(values)) - return isdup, inv + inverse = np.arange(len(values)) + return isduplicate, inverse if keep == 'first': - # o2u: original indices to indices of ARRAY of unique values - # u2o: reduplication from array of unique values to original array - # this fits together in the way that values[o2u] are the unique values - # and values[o2u][u2o] == values - _, o2u, u2o = np.unique(values, return_index=True, - return_inverse=True) + # values2unique: original indices to indices of ARRAY of unique values + # unique2values: reduplication from array of uniques to original array + # this fits together in the way that values[values2unique] are the + # unique values and values[values2unique][unique2values] == values + _, values2unique, unique2values = np.unique(values, return_index=True, + return_inverse=True) elif keep == 'last': - # np.unique takes first occurrence as unique value, + # np.unique takes first occurrence per unique value, # so we flip values that first becomes last values = values[::-1] - _, o2u, u2o = np.unique(values, return_index=True, - return_inverse=True) + _, values2unique, unique2values = np.unique(values, return_index=True, + return_inverse=True) # the values in "values" correspond(ed) to the 
index of "values", # which is simply np.arange(len(values)). # By flipping "values" around, we need to do the same for the index, - # ___because o2u and u2o are relative to that order___. + # _because values2unique and unique2values are relative to that order_. # Finally, to fit with the original order again, we need to flip the # result around one last time. - o2u, u2o = np.arange(len(values))[::-1][o2u], u2o[::-1] - - # np.unique yields a ___sorted___ list of uniques, and o2u/u2o are relative - # to this order. To restore the original order, we argsort o2u, because o2u - # would be ordered if np.unique had not sorted implicitly. The first - # argsort gives the permutation from o2u to its sorted form, but we need - # the inverse permutation (the map from the unsorted uniques to o2u, from - # which we can continue with u2o). This inversion (as a permutation) is - # achieved by the second argsort. - inv = np.argsort(np.argsort(o2u))[u2o] - return isdup, inv + values2unique = np.arange(len(values))[::-1][values2unique] + unique2values = unique2values[::-1] + + # np.unique yields a ___sorted___ list of uniques, and values2unique resp. + # unique2values are relative to this order. To restore the original order, + # we argsort values2unique, because values2unique would be ordered if + # np.unique had not sorted implicitly. + # The first argsort gives the permutation from values2unique to its sorted + # form, but we need the inverse permutation (the map from the unsorted + # uniques to values2unique, from which we can continue with unique2values). + # This inversion (as a permutation) is achieved by the second argsort. + inverse = np.argsort(np.argsort(values2unique))[unique2values] + return isduplicate, inverse def mode(values, dropna=True): diff --git a/pandas/core/base.py b/pandas/core/base.py index 2ab7f92e2dc03..f6f8661322550 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1257,10 +1257,10 @@ def duplicated(self, keep='first', return_inverse=False): if isinstance(self, ABCIndexClass): if self.is_unique: - isdup = np.zeros(len(self), dtype=np.bool) + isduplicate = np.zeros(len(self), dtype=np.bool) if not return_inverse: - return isdup - return isdup, np.arange(len(self)) + return isduplicate + return isduplicate, np.arange(len(self)) # core.algorithms.duplicated has the same output signature as # Index.duplicated -> no need to distinguish cases here return duplicated(self, keep=keep, return_inverse=return_inverse) @@ -1271,13 +1271,14 @@ def duplicated(self, keep='first', return_inverse=False): index=self.index).__finalize__(self) # return_inverse = True - isdup_array, inv_array = duplicated(self, keep=keep, - return_inverse=True) - isdup = self._constructor(isdup_array, - index=self.index).__finalize__(self) - inv = self._constructor(self.loc[~isdup_array].index[inv_array], - index=self.index) - return isdup, inv + isduplicate_array, inverse_array = duplicated(self, keep=keep, + return_inverse=True) + isduplicate = self._constructor(isduplicate_array, + index=self.index).__finalize__(self) + inverse = self._constructor( + self.loc[~isduplicate_array].index[inverse_array], + index=self.index) + return isduplicate, inverse # ---------------------------------------------------------------------- # abstracts diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 449ab241886ad..b9c15304a3746 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4561,11 +4561,12 @@ def f(vals): return Series(duplicated(ids, keep=keep), index=self.index) # return_inverse = True - 
isdup_array, inv_array = duplicated(ids, keep=keep, - return_inverse=True) - isdup = Series(isdup_array, index=self.index) - inv = Series(self.loc[~isdup_array].index[inv_array], index=self.index) - return isdup, inv + isduplicated_array, inverse_array = duplicated(ids, keep=keep, + return_inverse=True) + isduplicated = Series(isduplicated_array, index=self.index) + inverse = Series(self.loc[~isduplicated_array].index[inverse_array], + index=self.index) + return isduplicated, inverse # ---------------------------------------------------------------------- # Sorting
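
For reference, the round trip that ``return_inverse`` enables, condensed from the whatsnew and docstring examples above into a single script. This is a usage sketch only, not part of the patches: it assumes a pandas build with this series applied, since ``return_inverse`` does not exist in released pandas versions.

    import pandas as pd

    # Assumes a pandas build with this patch series applied; the
    # return_inverse keyword is not available in released pandas.
    df = pd.DataFrame({'A': [0, 1, 1, 2, 0],
                       'B': ['a', 'b', 'b', 'c', 'a']},
                      index=[1, 4, 9, 16, 25])

    # With return_inverse=True, duplicated returns a tuple: the usual
    # boolean Series, plus a Series mapping the original index to the
    # index of the deduplicated subset.
    isduplicate, inverse = df.duplicated(keep='first', return_inverse=True)

    unique = df.loc[~isduplicate]  # same as df.drop_duplicates()

    # Re-expand the unique rows, then restore the original index labels.
    reconstruct = unique.reindex(inverse.values).set_index(inverse.index)
    assert reconstruct.equals(df)

And a minimal, self-contained numpy sketch of the double-argsort trick that ``pandas/core/algorithms.py`` uses to express the inverse in first-occurrence order (the long variable names follow the renaming in the last commit; the sample data is illustrative, not from the patch):

    import numpy as np

    values = np.array(list('baac'))

    # np.unique returns the uniques in SORTED order; values2unique holds,
    # for each sorted unique, the position of its first occurrence in
    # `values`, and unique2values re-expands the sorted uniques to `values`.
    _, values2unique, unique2values = np.unique(values, return_index=True,
                                                return_inverse=True)

    # argsort(values2unique) orders the sorted uniques by first occurrence;
    # a second argsort inverts that permutation, assigning each sorted
    # unique its rank in first-occurrence order.
    rank = np.argsort(np.argsort(values2unique))

    # Composing with unique2values yields the inverse relative to the
    # keep='first' order of the uniques: array([0, 1, 1, 2])
    inverse = rank[unique2values]

    uniques = values[np.sort(values2unique)]  # uniques in occurrence order
    assert (uniques[inverse] == values).all()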