Skip to content

Commit 6e005c2

Browse files
committed
Review (gfyoung)
1 parent ebfffc2 commit 6e005c2

File tree

8 files changed

+31
-16
lines changed

8 files changed

+31
-16
lines changed

doc/source/whatsnew/v0.24.0.txt

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -161,19 +161,18 @@ This is the same behavior as ``Series.values`` for categorical data. See
161161

162162
.. _whatsnew_0240.enhancements.duplicated_inverse:
163163

164-
The ``duplicated``-method has gained the ``return_inverse`` kwarg
164+
The `duplicated`-method has gained the `return_inverse` kwarg
165165
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
166166

167-
The ``duplicated``-method for ``Series``, ``DataFrame`` and all flavors of ``Index`` has gained a ``return_inverse`` keyword,
168-
which is False by default. Specifying ``return_inverse=True`` will add an object to the output (which therefore becomes a tuple)
167+
The `duplicated`-method for `Series`, `DataFrame` and all flavors of `Index` has gained a `return_inverse` keyword,
168+
which is ``False`` by default. Specifying ``return_inverse=True`` will add an object to the output (which therefore becomes a tuple)
169169
that allows reconstructing the original object from the deduplicated, unique subset (:issue:`21357`).
170170

171171
For ``Index`` objects, the inverse is an ``np.ndarray``:
172172

173173
.. ipython:: python
174174

175175
idx = pd.Index(['a', 'b', 'b', 'c', 'a'])
176-
idx.has_duplicates
177176
isduplicate, inverse = idx.duplicated(return_inverse=True) # default: keep='first'
178177
isduplicate
179178
inverse
@@ -205,9 +204,6 @@ which contains the mapping from the index of the deduplicated, unique subset bac
205204
reconstruct = unique.reindex(inverse.values).set_index(inverse.index)
206205
reconstruct.equals(df)
207206

208-
The keyword works as expected for ``keep='first'|'last'``, but cannot be used together with ``keep=False`` (since discarding all duplicates makes it impossible
209-
to construct an inverse).
210-
211207

212208
.. _whatsnew_0240.enhancements.other:
213209

pandas/core/frame.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4366,6 +4366,13 @@ def duplicated(self, subset=None, keep='first', return_inverse=False):
43664366
-------
43674367
duplicated : Series or tuple of Series if return_inverse is True
43684368
4369+
Notes
4370+
-----
4371+
The `return_inverse`-keyword works as expected for
4372+
``keep='first'|'last'``, but cannot be used together with
4373+
``keep=False`` (since discarding all duplicates makes it impossible to
4374+
construct an inverse).
4375+
43694376
Examples
43704377
--------
43714378
By default, for each set of duplicated values, the first occurrence is

pandas/core/indexes/base.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4600,6 +4600,13 @@ def duplicated(self, keep='first', return_inverse=False):
46004600
-------
46014601
duplicated : ndarray or tuple of ndarray if return_inverse is True
46024602
4603+
Notes
4604+
-----
4605+
The `return_inverse`-keyword works as expected for
4606+
``keep='first'|'last'``, but cannot be used together with
4607+
``keep=False`` (since discarding all duplicates makes it impossible to
4608+
construct an inverse).
4609+
46034610
Examples
46044611
--------
46054612
By default, for each set of duplicated values, the first occurrence is

pandas/core/series.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1621,6 +1621,13 @@ def duplicated(self, keep='first', return_inverse=False):
16211621
-------
16221622
duplicated : Series or tuple of Series if return_inverse is True
16231623
1624+
Notes
1625+
-----
1626+
The `return_inverse`-keyword works as expected for
1627+
``keep='first'|'last'``, but cannot be used together with
1628+
``keep=False`` (since discarding all duplicates makes it impossible to
1629+
construct an inverse).
1630+
16241631
Examples
16251632
--------
16261633
By default, for each set of duplicated values, the first occurrence is

pandas/tests/frame/test_duplicates.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ def test_duplicated_subset(subset, keep):
9393
('last', [25, 9, 9, 16, 25])
9494
])
9595
def test_duplicated_inverse(keep, expected_inv_values):
96+
# GH 21357
9697
# check that return_inverse kwarg does not affect outcome;
9798
# index of inverse must be correctly transformed as well
9899
idx = [1, 4, 9, 16, 25]
@@ -121,9 +122,10 @@ def test_duplicated_inverse_raises():
121122

122123

123124
@pytest.mark.parametrize('keep', ['first', 'last'])
124-
@pytest.mark.parametrize('subset', [None, ['A', 'B'], 'A'])
125+
@pytest.mark.parametrize('subset', [['A', 'B', 'C'], ['A', 'B'], ['A']])
125126
def test_duplicated_inverse_large(subset, keep):
126-
# unsorted index important to check 'first'/'last' functionality
127+
# unsorted index (through .sample); important to check correct
128+
# 'first'/'last' functionality of return_inverse
127129
df = DataFrame(np.random.randint(0, 10, (10000, 3)),
128130
columns=list('ABC')).sample(5000)
129131

@@ -132,13 +134,6 @@ def test_duplicated_inverse_large(subset, keep):
132134
return_inverse=True)
133135
tm.assert_series_equal(result_isdup, expected_isdup)
134136

135-
if subset is None:
136-
subset = list(df.columns)
137-
elif isinstance(subset, string_types):
138-
# need to have a DataFrame, not a Series
139-
# -> select columns with singleton list, not string
140-
subset = [subset]
141-
142137
unique = df.loc[~expected_isdup, subset]
143138
reconstr = unique.reindex(result_inv).set_index(result_inv.index)
144139
tm.assert_frame_equal(reconstr, df[subset])

pandas/tests/indexes/common.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,7 @@ def test_duplicated(self, indices, keep):
380380

381381
@pytest.mark.parametrize('keep', ['first', 'last'])
382382
def test_duplicated_inverse(self, indices, keep):
383+
# GH 21357
383384
# check that return_inverse kwarg does not affect outcome
384385
if type(indices) is not self._holder:
385386
pytest.skip('Can only check if we have the correct type')

pandas/tests/indexes/multi/test_duplicates.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,7 @@ def test_duplicated_large(keep):
268268

269269
@pytest.mark.parametrize('keep', ['first', 'last'])
270270
def test_duplicated_inverse(idx_dup, keep):
271+
# GH 21357
271272
# check that return_inverse kwarg does not affect outcome;
272273
# index of inverse must be correctly transformed as well
273274

pandas/tests/series/test_duplicates.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ def test_duplicated_nan_none(keep, expected):
145145
('last', [25, 9, 9, 16, 25])
146146
])
147147
def test_duplicated_inverse(keep, expected_inv_values):
148+
# GH 21357
148149
# check that return_inverse kwarg does not affect outcome;
149150
# index of inverse must be correctly transformed as well
150151
idx = [1, 4, 9, 16, 25]

0 commit comments

Comments
 (0)