Skip to content

Commit ebfffc2

Browse files
committed
ENH: add return_inverse kwarg to duplicated-method (rebased)
1 parent 25e6a21 commit ebfffc2

File tree

16 files changed

+777
-92
lines changed

16 files changed

+777
-92
lines changed

asv_bench/benchmarks/frame_methods.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -412,21 +412,35 @@ def time_frame_nunique(self):
412412
class Duplicated(object):
413413

414414
goal_time = 0.2
415+
params = (['first', 'last', False], [True, False])
416+
param_names = ['keep', 'return_inverse']
417+
418+
def setup(self, keep, return_inverse):
419+
if keep is False and return_inverse:
420+
raise NotImplementedError
415421

416-
def setup(self):
417422
n = (1 << 20)
418423
t = date_range('2015-01-01', freq='S', periods=(n // 64))
419424
xs = np.random.randn(n // 64).round(2)
420425
self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n),
421426
'b': np.random.choice(t, n),
422427
'c': np.random.choice(xs, n)})
423-
self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T
428+
# df2 will not have any duplicates
429+
self.df2 = DataFrame(np.random.randn(100, 1000).astype(str))
430+
431+
df3 = DataFrame(np.random.randint(0, 10, (2 ** 18, 5)),
432+
columns=list('ABCDE'))
433+
df3.loc[:, 'F'] = Series('', index=df3.index).str.cat(df3.astype(str))
434+
self.df3 = df3
435+
436+
def time_frame_duplicated(self, keep, return_inverse):
437+
self.df.duplicated(keep=keep, return_inverse=return_inverse)
424438

425-
def time_frame_duplicated(self):
426-
self.df.duplicated()
439+
def time_frame_duplicated_wide(self, keep, return_inverse):
440+
self.df2.duplicated(keep=keep, return_inverse=return_inverse)
427441

428-
def time_frame_duplicated_wide(self):
429-
self.df2.duplicated()
442+
def time_frame_duplicated_mixed(self, keep, return_inverse):
443+
self.df3.duplicated(keep=keep, return_inverse=return_inverse)
430444

431445

432446
class XS(object):

asv_bench/benchmarks/index_object.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,24 @@ def time_modulo(self, dtype):
8484
self.index % 2
8585

8686

87+
class Duplicated(object):
88+
89+
goal_time = 0.2
90+
params = (['first', 'last', False], [True, False])
91+
param_names = ['keep', 'return_inverse']
92+
93+
def setup(self, keep, return_inverse):
94+
if keep is False and return_inverse:
95+
raise NotImplementedError
96+
97+
n, k = 200, 1000
98+
base = tm.makeStringIndex(n)
99+
self.idx = Index(base[np.random.choice(n, k * n)])
100+
101+
def time_duplicated(self, keep, return_inverse):
102+
self.idx.duplicated(keep=keep, return_inverse=return_inverse)
103+
104+
87105
class Range(object):
88106

89107
goal_time = 0.2

asv_bench/benchmarks/multiindex_object.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,17 +83,22 @@ def time_is_monotonic(self):
8383
class Duplicated(object):
8484

8585
goal_time = 0.2
86+
params = (['first', 'last', False], [True, False])
87+
param_names = ['keep', 'return_inverse']
8688

87-
def setup(self):
88-
n, k = 200, 5000
89+
def setup(self, keep, return_inverse):
90+
if keep is False and return_inverse:
91+
raise NotImplementedError
92+
93+
n, k = 200, 1000
8994
levels = [np.arange(n),
9095
tm.makeStringIndex(n).values,
9196
1000 + np.arange(n)]
9297
labels = [np.random.choice(n, (k * n)) for lev in levels]
9398
self.mi = MultiIndex(levels=levels, labels=labels)
9499

95-
def time_duplicated(self):
96-
self.mi.duplicated()
100+
def time_duplicated(self, keep, return_inverse):
101+
self.mi.duplicated(keep=keep, return_inverse=return_inverse)
97102

98103

99104
class Sortlevel(object):

asv_bench/benchmarks/series_methods.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,3 +192,21 @@ def setup(self):
192192

193193
def time_series_datetimeindex_repr(self):
194194
getattr(self.s, 'a', None)
195+
196+
197+
class Duplicated(object):
198+
199+
goal_time = 0.2
200+
params = (['first', 'last', False], [True, False])
201+
param_names = ['keep', 'return_inverse']
202+
203+
def setup(self, keep, return_inverse):
204+
if keep is False and return_inverse:
205+
raise NotImplementedError
206+
207+
n, k = 200, 1000
208+
base = tm.makeStringIndex(n)
209+
self.s = Series(base[np.random.choice(n, k * n)])
210+
211+
def time_series_duplicated(self, keep, return_inverse):
212+
self.s.duplicated(keep=keep, return_inverse=return_inverse)

doc/source/whatsnew/v0.24.0.txt

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,56 @@ This is the same behavior as ``Series.values`` for categorical data. See
159159
:ref:`whatsnew_0240.api_breaking.interval_values` for more.
160160

161161

162+
.. _whatsnew_0240.enhancements.duplicated_inverse:
163+
164+
The ``duplicated``-method has gained the ``return_inverse`` kwarg
165+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
166+
167+
The ``duplicated``-method for ``Series``, ``DataFrame`` and all flavors of ``Index`` has gained a ``return_inverse`` keyword,
168+
which is False by default. Specifying ``return_inverse=True`` will add an object to the output (which therefore becomes a tuple)
169+
that allows reconstructing the original object from the deduplicated, unique subset (:issue:`21357`).
170+
171+
For ``Index`` objects, the inverse is an ``np.ndarray``:
172+
173+
.. ipython:: python
174+
175+
idx = pd.Index(['a', 'b', 'b', 'c', 'a'])
176+
idx.has_duplicates
177+
isduplicate, inverse = idx.duplicated(return_inverse=True) # default: keep='first'
178+
isduplicate
179+
inverse
180+
181+
This allows reconstructing the original ``Index`` as follows:
182+
183+
.. ipython:: python
184+
185+
unique = idx[~isduplicate] # same as idx.drop_duplicates()
186+
unique
187+
188+
reconstruct = unique[inverse]
189+
reconstruct.equals(idx)
190+
191+
For ``DataFrame`` and ``Series`` the inverse needs to take into account the original index as well, and is therefore a ``Series``,
192+
which contains the mapping from the index of the deduplicated, unique subset back to the original index.
193+
194+
.. ipython:: python
195+
196+
df = pd.DataFrame({'A': [0, 1, 1, 2, 0], 'B': ['a', 'b', 'b', 'c', 'a']},
197+
index=[1, 4, 9, 16, 25])
198+
df
199+
isduplicate, inverse = df.duplicated(keep='last', return_inverse=True)
200+
isduplicate
201+
inverse
202+
203+
unique = df.loc[~isduplicate] # same as df.drop_duplicates(keep='last')
204+
unique
205+
reconstruct = unique.reindex(inverse.values).set_index(inverse.index)
206+
reconstruct.equals(df)
207+
208+
The keyword works as expected for ``keep='first'|'last'``, but cannot be used together with ``keep=False`` (since discarding all duplicates makes it impossible
209+
to construct an inverse).
210+
211+
162212
.. _whatsnew_0240.enhancements.other:
163213

164214
Other Enhancements

pandas/core/algorithms.py

Lines changed: 55 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -770,7 +770,7 @@ def _value_counts_arraylike(values, dropna):
770770
return keys, counts
771771

772772

773-
def duplicated(values, keep='first'):
773+
def duplicated(values, keep='first', return_inverse=False):
774774
"""
775775
Return boolean ndarray denoting duplicate values.
776776
@@ -785,16 +785,67 @@ def duplicated(values, keep='first'):
785785
occurrence.
786786
- ``last`` : Mark duplicates as ``True`` except for the last
787787
occurrence.
788-
- False : Mark all duplicates as ``True``.
788+
- False : Mark all duplicates as ``True``. This option is not
789+
compatible with ``return_inverse``.
790+
return_inverse : boolean, default False
791+
If True, also return the selection of (integer) indices from the array
792+
of unique values (created e.g. by selecting the boolean complement of
793+
the first output, or by using `.drop_duplicates` with the same
794+
`keep`-parameter) that can be used to reconstruct "values".
795+
796+
.. versionadded:: 0.24.0
789797
790798
Returns
791799
-------
792-
duplicated : ndarray
800+
duplicated : ndarray or tuple of ndarray if ``return_inverse`` is True
793801
"""
794802

803+
if return_inverse and keep is False:
804+
raise ValueError("The parameters return_inverse=True and "
805+
"keep=False cannot be used together (impossible "
806+
"to calculate an inverse when discarding all "
807+
"instances of a duplicate).")
808+
795809
values, dtype, ndtype = _ensure_data(values)
796810
f = getattr(htable, "duplicated_{dtype}".format(dtype=ndtype))
797-
return f(values, keep=keep)
811+
isdup = f(values, keep=keep)
812+
if not return_inverse:
813+
return isdup
814+
elif not isdup.any():
815+
# no need to calculate inverse if no duplicates
816+
inv = np.arange(len(values))
817+
return isdup, inv
818+
819+
if keep == 'first':
820+
# o2u: original indices to indices of ARRAY of unique values
821+
# u2o: reduplication from array of unique values to original array
822+
# this fits together in the way that values[o2u] are the unique values
823+
# and values[o2u][u2o] == values
824+
_, o2u, u2o = np.unique(values, return_index=True,
825+
return_inverse=True)
826+
elif keep == 'last':
827+
# np.unique takes first occurrence as unique value,
828+
# so we flip values so that the first occurrence becomes the last
829+
values = values[::-1]
830+
_, o2u, u2o = np.unique(values, return_index=True,
831+
return_inverse=True)
832+
# the values in "values" correspond(ed) to the index of "values",
833+
# which is simply np.arange(len(values)).
834+
# By flipping "values" around, we need to do the same for the index,
835+
# ___because o2u and u2o are relative to that order___.
836+
# Finally, to fit with the original order again, we need to flip the
837+
# result around one last time.
838+
o2u, u2o = np.arange(len(values))[::-1][o2u], u2o[::-1]
839+
840+
# np.unique yields a ___sorted___ list of uniques, and o2u/u2o are relative
841+
# to this order. To restore the original order, we argsort o2u, because o2u
842+
# would be ordered if np.unique had not sorted implicitly. The first
843+
# argsort gives the permutation from o2u to its sorted form, but we need
844+
# the inverse permutation (the map from the unsorted uniques to o2u, from
845+
# which we can continue with u2o). This inversion (as a permutation) is
846+
# achieved by the second argsort.
847+
inv = np.argsort(np.argsort(o2u))[u2o]
848+
return isdup, inv
798849

799850

800851
def mode(values, dropna=True):

pandas/core/base.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1246,16 +1246,39 @@ def drop_duplicates(self, keep='first', inplace=False):
12461246
else:
12471247
return result
12481248

1249-
def duplicated(self, keep='first'):
1249+
def duplicated(self, keep='first', return_inverse=False):
12501250
from pandas.core.algorithms import duplicated
1251+
1252+
if return_inverse and keep is False:
1253+
raise ValueError("The parameters return_inverse=True and "
1254+
"keep=False cannot be used together (impossible "
1255+
"to calculate an inverse when discarding all "
1256+
"instances of a duplicate).")
1257+
12511258
if isinstance(self, ABCIndexClass):
12521259
if self.is_unique:
1253-
return np.zeros(len(self), dtype=np.bool)
1254-
return duplicated(self, keep=keep)
1255-
else:
1260+
isdup = np.zeros(len(self), dtype=np.bool)
1261+
if not return_inverse:
1262+
return isdup
1263+
return isdup, np.arange(len(self))
1264+
# core.algorithms.duplicated has the same output signature as
1265+
# Index.duplicated -> no need to distinguish cases here
1266+
return duplicated(self, keep=keep, return_inverse=return_inverse)
1267+
1268+
# Series case
1269+
if not return_inverse:
12561270
return self._constructor(duplicated(self, keep=keep),
12571271
index=self.index).__finalize__(self)
12581272

1273+
# return_inverse = True
1274+
isdup_array, inv_array = duplicated(self, keep=keep,
1275+
return_inverse=True)
1276+
isdup = self._constructor(isdup_array,
1277+
index=self.index).__finalize__(self)
1278+
inv = self._constructor(self.loc[~isdup_array].index[inv_array],
1279+
index=self.index)
1280+
return isdup, inv
1281+
12591282
# ----------------------------------------------------------------------
12601283
# abstracts
12611284

0 commit comments

Comments
 (0)