Skip to content

Commit 93f9073

Browse files
committed
BUG: MultiIndex.has_duplicates causes an indexer overflow when the index has many levels (GH9075)
1 parent e92df22 commit 93f9073

File tree

5 files changed

+62
-2
lines changed

5 files changed

+62
-2
lines changed

doc/source/api.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1176,6 +1176,7 @@ Attributes
11761176
Index.is_monotonic_increasing
11771177
Index.is_monotonic_decreasing
11781178
Index.is_unique
1179+
Index.has_duplicates
11791180
Index.dtype
11801181
Index.inferred_type
11811182
Index.is_all_dates

doc/source/whatsnew/v0.16.0.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,5 @@ Bug Fixes
4848
~~~~~~~~~
4949

5050
.. _whatsnew_0160.bug_fixes:
51+
52+
- Bug in ``MultiIndex.has_duplicates`` where an index with many levels could cause an indexer overflow (:issue:`9075`)

pandas/core/index.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -600,6 +600,10 @@ def is_unique(self):
600600
""" return if the index has unique values """
601601
return self._engine.is_unique
602602

603+
@property
604+
def has_duplicates(self):
605+
return not self.is_unique
606+
603607
def is_boolean(self):
604608
return self.inferred_type in ['boolean']
605609

@@ -3223,12 +3227,19 @@ def has_duplicates(self):
32233227
"""
32243228
Return True if any group (combination of labels across levels) is duplicated
32253229
"""
3226-
# has duplicates
3230+
3231+
from pandas.core.groupby import _int64_overflow_possible
3232+
3233+
# if an int64 overflow is possible, fall back to the safe (slower) method
32273234
shape = [len(lev) for lev in self.levels]
3235+
if _int64_overflow_possible(shape):
3236+
return self.duplicated().any()
3237+
3238+
# int64 capable
32283239
group_index = np.zeros(len(self), dtype='i8')
32293240
for i in range(len(shape)):
32303241
stride = np.prod([x for x in shape[i + 1:]], dtype='i8')
3231-
group_index += self.labels[i] * stride
3242+
group_index += _ensure_int64(self.labels[i]) * stride
32323243

32333244
if len(np.unique(group_index)) < len(group_index):
32343245
return True

pandas/tests/test_base.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -620,6 +620,9 @@ def test_duplicated_drop_duplicates(self):
620620
tm.assert_index_equal(result, original)
621621
self.assertFalse(result is original)
622622

623+
# has_duplicates
624+
self.assertFalse(original.has_duplicates)
625+
623626
# create repeated values, 3rd and 5th values are duplicated
624627
idx = original[list(range(len(original))) + [5, 3]]
625628
expected = Index([False] * len(original) + [True, True])

pandas/tests/test_index.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3435,6 +3435,49 @@ def test_has_duplicates(self):
34353435
[0, 1, 2, 0, 0, 1, 2]])
34363436
self.assertTrue(index.has_duplicates)
34373437

3438+
# GH 9075
3439+
t = [(u'x', u'out', u'z', 5, u'y', u'in', u'z', 169),
3440+
(u'x', u'out', u'z', 7, u'y', u'in', u'z', 119),
3441+
(u'x', u'out', u'z', 9, u'y', u'in', u'z', 135),
3442+
(u'x', u'out', u'z', 13, u'y', u'in', u'z', 145),
3443+
(u'x', u'out', u'z', 14, u'y', u'in', u'z', 158),
3444+
(u'x', u'out', u'z', 16, u'y', u'in', u'z', 122),
3445+
(u'x', u'out', u'z', 17, u'y', u'in', u'z', 160),
3446+
(u'x', u'out', u'z', 18, u'y', u'in', u'z', 180),
3447+
(u'x', u'out', u'z', 20, u'y', u'in', u'z', 143),
3448+
(u'x', u'out', u'z', 21, u'y', u'in', u'z', 128),
3449+
(u'x', u'out', u'z', 22, u'y', u'in', u'z', 129),
3450+
(u'x', u'out', u'z', 25, u'y', u'in', u'z', 111),
3451+
(u'x', u'out', u'z', 28, u'y', u'in', u'z', 114),
3452+
(u'x', u'out', u'z', 29, u'y', u'in', u'z', 121),
3453+
(u'x', u'out', u'z', 31, u'y', u'in', u'z', 126),
3454+
(u'x', u'out', u'z', 32, u'y', u'in', u'z', 155),
3455+
(u'x', u'out', u'z', 33, u'y', u'in', u'z', 123),
3456+
(u'x', u'out', u'z', 12, u'y', u'in', u'z', 144)]
3457+
index = pd.MultiIndex.from_tuples(t)
3458+
self.assertFalse(index.has_duplicates)
3459+
3460+
# handle int64 overflow if possible
3461+
def check(nlevels):
3462+
labels = np.tile(np.arange(500), 2)
3463+
level = np.arange(500)
3464+
3465+
# no dups
3466+
index = MultiIndex(levels=[level] * nlevels + [[0, 1]],
3467+
labels=[labels] * nlevels + [np.arange(2).repeat(500)])
3468+
self.assertFalse(index.has_duplicates)
3469+
3470+
# with a dup
3471+
values = index.values.tolist()
3472+
index = MultiIndex.from_tuples(values + [values[0]])
3473+
self.assertTrue(index.has_duplicates)
3474+
3475+
# no overflow
3476+
check(4)
3477+
3478+
# overflow possible
3479+
check(8)
3480+
34383481
def test_tolist(self):
34393482
result = self.index.tolist()
34403483
exp = list(self.index.values)

0 commit comments

Comments
 (0)