Review (gfyoung)

h-vetinari · h-vetinari · commit 6e005c2438ea · 2018-08-30T23:38:10.000+02:00
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -161,19 +161,18 @@ This is the same behavior as ``Series.values`` for categorical data. See
 
 .. _whatsnew_0240.enhancements.duplicated_inverse:
 
-The ``duplicated``-method has gained the ``return_inverse`` kwarg
+The `duplicated`-method has gained the `return_inverse` kwarg
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-The ``duplicated``-method for ``Series``, ``DataFrame`` and all flavors of ``Index`` has gained a ``return_inverse`` keyword,
-which is False by default. Specifying ``return_inverse=True`` will add an object to the output (which therefore becomes a tuple)
+The `duplicated`-method for `Series`, `DataFrame` and all flavors of `Index` has gained a `return_inverse` keyword,
+which is ``False`` by default. Specifying ``return_inverse=True`` will add an object to the output (which therefore becomes a tuple)
 that allows reconstructing the original object from the deduplicated, unique subset (:issue:`21357`).
 
 For ``Index`` objects, the inverse is an ``np.ndarray``:
 
 .. ipython:: python
 
     idx = pd.Index(['a', 'b', 'b', 'c', 'a'])
-    idx.has_duplicates
     isduplicate, inverse = idx.duplicated(return_inverse=True)  # default: keep='first'
     isduplicate
     inverse
@@ -205,9 +204,6 @@ which contains the mapping from the index of the deduplicated, unique subset bac
     reconstruct = unique.reindex(inverse.values).set_index(inverse.index)
     reconstruct.equals(df)
 
-The keyword works as expected for ``keep='first'|'last'``, but cannot be used together with ``keep=False`` (since discarding all duplicates makes it impossible
-to construct an inverse).
-
 
 .. _whatsnew_0240.enhancements.other:
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4366,6 +4366,13 @@ def duplicated(self, subset=None, keep='first', return_inverse=False):
         -------
         duplicated : Series or tuple of Series if return_inverse is True
 
+        Notes
+        -----
+        The `return_inverse` keyword can be used with ``keep='first'`` or
+        ``keep='last'``, but cannot be used together with ``keep=False``
+        (since discarding all duplicates makes it impossible to construct
+        an inverse).
+
         Examples
         --------
         By default, for each set of duplicated values, the first occurrence is
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -4600,6 +4600,13 @@ def duplicated(self, keep='first', return_inverse=False):
         -------
-        duplicated : ndarray or or tuple of ndarray if return_inverse is True
+        duplicated : ndarray or tuple of ndarray if return_inverse is True
 
+        Notes
+        -----
+        The `return_inverse` keyword can be used with ``keep='first'`` or
+        ``keep='last'``, but cannot be used together with ``keep=False``
+        (since discarding all duplicates makes it impossible to construct
+        an inverse).
+
         Examples
         --------
         By default, for each set of duplicated values, the first occurrence is
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1621,6 +1621,13 @@ def duplicated(self, keep='first', return_inverse=False):
         -------
-        duplicated : Series or or tuple of Series if return_inverse is True
+        duplicated : Series or tuple of Series if return_inverse is True
 
+        Notes
+        -----
+        The `return_inverse` keyword can be used with ``keep='first'`` or
+        ``keep='last'``, but cannot be used together with ``keep=False``
+        (since discarding all duplicates makes it impossible to construct
+        an inverse).
+
         Examples
         --------
         By default, for each set of duplicated values, the first occurrence is
diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py
@@ -93,6 +93,7 @@ def test_duplicated_subset(subset, keep):
     ('last', [25, 9, 9, 16, 25])
 ])
 def test_duplicated_inverse(keep, expected_inv_values):
+    # GH 21357
     # check that return_inverse kwarg does not affect outcome;
     # index of inverse must be correctly transformed as well
     idx = [1, 4, 9, 16, 25]
@@ -121,9 +122,10 @@ def test_duplicated_inverse_raises():
 
 
 @pytest.mark.parametrize('keep', ['first', 'last'])
-@pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A'])
+@pytest.mark.parametrize('subset', [['A', 'B', 'C'], ['A', 'B'], ['A']])
 def test_duplicated_inverse_large(subset, keep):
-    # unsorted index important to check 'first'/'last' functionality
+    # unsorted index (through .sample); important to check correct
+    # 'first'/'last' functionality of return_inverse
     df = DataFrame(np.random.randint(0, 10, (10000, 3)),
                    columns=list('ABC')).sample(5000)
 
@@ -132,13 +134,6 @@ def test_duplicated_inverse_large(subset, keep):
                                              return_inverse=True)
     tm.assert_series_equal(result_isdup, expected_isdup)
 
-    if subset is None:
-        subset = list(df.columns)
-    elif isinstance(subset, string_types):
-        # need to have a DataFrame, not a Series
-        # -> select columns with singleton list, not string
-        subset = [subset]
-
     unique = df.loc[~expected_isdup, subset]
     reconstr = unique.reindex(result_inv).set_index(result_inv.index)
     tm.assert_frame_equal(reconstr, df[subset])
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
@@ -380,6 +380,7 @@ def test_duplicated(self, indices, keep):
 
     @pytest.mark.parametrize('keep', ['first', 'last'])
     def test_duplicated_inverse(self, indices, keep):
+        # GH 21357
         # check that return_inverse kwarg does not affect outcome
         if type(indices) is not self._holder:
             pytest.skip('Can only check if we have the correct type')
diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py
@@ -268,6 +268,7 @@ def test_duplicated_large(keep):
 
 @pytest.mark.parametrize('keep', ['first', 'last'])
 def test_duplicated_inverse(idx_dup, keep):
+    # GH 21357
     # check that return_inverse kwarg does not affect outcome;
     # index of inverse must be correctly transformed as well
 
diff --git a/pandas/tests/series/test_duplicates.py b/pandas/tests/series/test_duplicates.py
@@ -145,6 +145,7 @@ def test_duplicated_nan_none(keep, expected):
     ('last', [25, 9, 9, 16, 25])
 ])
 def test_duplicated_inverse(keep, expected_inv_values):
+    # GH 21357
     # check that return_inverse kwarg does not affect outcome;
     # index of inverse must be correctly transformed as well
     idx = [1, 4, 9, 16, 25]