Skip to content

Commit c934e02

Browse files
committed
BUG: unstacking int64 overflow with many levels. re #2616
1 parent 4f3472d commit c934e02

File tree

3 files changed

+72
-5
lines changed

3 files changed

+72
-5
lines changed

RELEASE.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ pandas 0.10.1
6868
- Fix bug and possible segfault when grouping by hierarchical level that
6969
contains NA values (GH2616_)
7070
- Ensure that MultiIndex tuples can be constructed with NAs (seen in #2616)
71+
- Fix int64 overflow issue when unstacking MultiIndex with many levels (GH2616_)
7172

7273
**API Changes**
7374

pandas/core/reshape.py

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -83,8 +83,11 @@ def _make_sorted_values_labels(self):
8383
to_sort = labs[:v] + labs[v + 1:] + [labs[v]]
8484
sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]
8585

86-
group_index = get_group_index(to_sort, sizes)
87-
comp_index, obs_ids = _compress_group_index(group_index)
86+
comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
87+
88+
# group_index = get_group_index(to_sort, sizes)
89+
# comp_index, obs_ids = _compress_group_index(group_index)
90+
8891
ngroups = len(obs_ids)
8992

9093
indexer = algos.groupsort_indexer(comp_index, ngroups)[0]
@@ -97,10 +100,10 @@ def _make_selectors(self):
97100
new_levels = self.new_index_levels
98101

99102
# make the mask
100-
group_index = get_group_index(self.sorted_labels[:-1],
101-
[len(x) for x in new_levels])
103+
remaining_labels = self.sorted_labels[:-1]
104+
level_sizes = [len(x) for x in new_levels]
102105

103-
comp_index, obs_ids = _compress_group_index(group_index)
106+
comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
104107
ngroups = len(obs_ids)
105108

106109
comp_index = _ensure_platform_int(comp_index)
@@ -391,6 +394,36 @@ def _unstack_frame(obj, level):
391394
value_columns=obj.columns)
392395
return unstacker.get_result()
393396

397+
def get_compressed_ids(labels, sizes):
    """
    Compute compressed group ids for ``labels`` without overflowing int64.

    When the product of ``sizes`` is below ``2 ** 63`` the labels are
    combined with ``get_group_index`` and compressed with
    ``_compress_group_index`` in a single pass.  Otherwise the longest
    leading run of levels whose product still fits is compressed first and
    replaced by one synthetic level (its compressed ids, sized by the number
    of observed ids); this is repeated until the product fits.

    Parameters
    ----------
    labels : list
        One label sequence per level.  Must be a list: it is sliced and
        concatenated with other lists.
    sizes : list of int
        Size of each level, aligned with ``labels``.

    Returns
    -------
    comp_index, obs_ids
        The pair returned by ``_compress_group_index`` for the (possibly
        stepwise-collapsed) levels.
    """
    int64_limit = 2 ** 63

    # no overflow: the flat group index can be built directly
    if _long_prod(sizes) < int64_limit:
        group_index = get_group_index(labels, sizes)
        comp_index, obs_ids = _compress_group_index(group_index)
        return comp_index, obs_ids

    while _long_prod(sizes) >= int64_limit:
        # find the longest prefix of levels whose product still fits
        i = len(sizes)
        while _long_prod(sizes[:i]) >= int64_limit:
            i -= 1

        # collapse that prefix into a single level of compressed ids
        rem_index, rem_ids = get_compressed_ids(labels[:i], sizes[:i])
        sizes = [len(rem_ids)] + sizes[i:]
        labels = [rem_index] + labels[i:]

    return get_compressed_ids(labels, sizes)
421+
422+
def _long_prod(vals):
423+
result = 1L
424+
for x in vals:
425+
result *= x
426+
return result
394427

395428
def stack(frame, level=-1, dropna=True):
396429
"""

pandas/tests/test_multilevel.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1267,6 +1267,39 @@ def test_unstack_preserve_types(self):
12671267
self.assert_(unstacked['E', 1].dtype == np.object_)
12681268
self.assert_(unstacked['F', 1].dtype == np.float64)
12691269

1270+
def test_unstack_group_index_overflow(self):
    # Eight 500-value levels plus one 2-value level: 2 * 500 ** 8 exceeds
    # 2 ** 63, so a flat int64 group index over all levels cannot hold it.
    wide_level = np.arange(500)
    wide_codes = np.tile(np.arange(500), 2)

    def make_series(binary_pos):
        # insert the two-valued level at ``binary_pos`` among the wide ones
        levels = [wide_level] * 8
        codes = [wide_codes] * 8
        levels.insert(binary_pos, [0, 1])
        codes.insert(binary_pos, np.arange(2).repeat(500))
        return Series(np.arange(1000),
                      index=MultiIndex(levels=levels, labels=codes))

    # two-valued level at the end, unstack() called without a level
    ser = make_series(8)
    unstacked = ser.unstack()
    self.assertEqual(unstacked.shape, (500, 2))

    # test roundtrip
    restacked = unstacked.stack()
    assert_series_equal(ser.astype(np.float64),
                        restacked.reindex(ser.index))

    # put it at beginning
    unstacked = make_series(0).unstack(0)
    self.assertEqual(unstacked.shape, (500, 2))

    # put it in middle
    unstacked = make_series(4).unstack(4)
    self.assertEqual(unstacked.shape, (500, 2))
1302+
12701303
def test_getitem_lowerdim_corner(self):
12711304
self.assertRaises(KeyError, self.frame.ix.__getitem__,
12721305
(('bar', 'three'), 'B'))

0 commit comments

Comments
 (0)