Skip to content

Commit f70a6e2

Browse files
committed
Merge pull request #5214 from jtratner/mi-validate-level-label-compat
BUG/CLN: MI now checks level & label compatibility
2 parents 0aa1800 + cdfb299 commit f70a6e2

File tree

10 files changed

+120
-42
lines changed

10 files changed

+120
-42
lines changed

doc/source/release.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -400,6 +400,8 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
400400
instead they are generated and cached on the fly. The internal
401401
representation and handling of DateOffsets has also been clarified.
402402
(:issue:`5189`, related :issue:`5004`)
403+
- ``MultiIndex`` constructor now validates that passed levels and labels are
404+
compatible. (:issue:`5213`, :issue:`5214`)
403405

404406
.. _release.bug_fixes-0.13.0:
405407

pandas/core/index.py

Lines changed: 64 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1394,7 +1394,7 @@ def _join_level(self, other, level, how='left', return_indexers=False):
13941394
new_levels[level] = new_level
13951395

13961396
join_index = MultiIndex(levels=new_levels, labels=new_labels,
1397-
names=left.names)
1397+
names=left.names, verify_integrity=False)
13981398
left_indexer = np.arange(len(left))[new_lev_labels != -1]
13991399
else:
14001400
join_index = left
@@ -1856,7 +1856,7 @@ class MultiIndex(Index):
18561856
rename = Index.set_names
18571857

18581858
def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
1859-
copy=False):
1859+
copy=False, verify_integrity=True):
18601860
if levels is None or labels is None:
18611861
raise TypeError("Must pass both levels and labels")
18621862
if len(levels) != len(labels):
@@ -1886,12 +1886,36 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
18861886
else:
18871887
subarr.sortorder = sortorder
18881888

1889+
if verify_integrity:
1890+
subarr._verify_integrity()
1891+
18891892
return subarr
18901893

1894+
def _verify_integrity(self):
1895+
"""Raises ValueError if length of levels and labels don't match or any
1896+
label would exceed level bounds"""
1897+
# NOTE: Currently does not check, among other things, that cached
1898+
# nlevels matches nor that sortorder matches the actual sort order.
1899+
labels, levels = self.labels, self.levels
1900+
if len(levels) != len(labels):
1901+
raise ValueError("Length of levels and labels must match. NOTE:"
1902+
" this index is in an inconsistent state.")
1903+
label_length = len(self.labels[0])
1904+
for i, (level, label) in enumerate(zip(levels, labels)):
1905+
if len(label) != label_length:
1906+
raise ValueError("Unequal label lengths: %s" % (
1907+
[len(lab) for lab in labels]))
1908+
if len(label) and label.max() >= len(level):
1909+
raise ValueError("On level %d, label max (%d) >= length of"
1910+
" level (%d). NOTE: this index is in an"
1911+
" inconsistent state" % (i, label.max(),
1912+
len(level)))
1913+
18911914
def _get_levels(self):
18921915
return self._levels
18931916

1894-
def _set_levels(self, levels, copy=False, validate=True):
1917+
def _set_levels(self, levels, copy=False, validate=True,
1918+
verify_integrity=False):
18951919
# This is NOT part of the levels property because it should be
18961920
# externally not allowed to set levels. User beware if you change
18971921
# _levels directly
@@ -1907,7 +1931,10 @@ def _set_levels(self, levels, copy=False, validate=True):
19071931
self._set_names(names)
19081932
self._tuples = None
19091933

1910-
def set_levels(self, levels, inplace=False):
1934+
if verify_integrity:
1935+
self._verify_integrity()
1936+
1937+
def set_levels(self, levels, inplace=False, verify_integrity=True):
19111938
"""
19121939
Set new levels on MultiIndex. Defaults to returning
19131940
new index.
@@ -1918,6 +1945,8 @@ def set_levels(self, levels, inplace=False):
19181945
new levels to apply
19191946
inplace : bool
19201947
if True, mutates in place
1948+
verify_integrity : bool (default True)
1949+
if True, checks that levels and labels are compatible
19211950
19221951
Returns
19231952
-------
@@ -1930,27 +1959,33 @@ def set_levels(self, levels, inplace=False):
19301959
else:
19311960
idx = self._shallow_copy()
19321961
idx._reset_identity()
1933-
idx._set_levels(levels)
1962+
idx._set_levels(levels, validate=True,
1963+
verify_integrity=verify_integrity)
19341964
if not inplace:
19351965
return idx
19361966

19371967
# remove me in 0.14 and change to read-only property
19381968
__set_levels = deprecate("setting `levels` directly",
1939-
partial(set_levels, inplace=True),
1969+
partial(set_levels, inplace=True,
1970+
verify_integrity=True),
19401971
alt_name="set_levels")
19411972
levels = property(fget=_get_levels, fset=__set_levels)
19421973

19431974
def _get_labels(self):
19441975
return self._labels
19451976

1946-
def _set_labels(self, labels, copy=False, validate=True):
1977+
def _set_labels(self, labels, copy=False, validate=True,
1978+
verify_integrity=False):
19471979
if validate and len(labels) != self.nlevels:
19481980
raise ValueError("Length of labels must match length of levels")
19491981
self._labels = FrozenList(_ensure_frozen(labs, copy=copy)._shallow_copy()
19501982
for labs in labels)
19511983
self._tuples = None
19521984

1953-
def set_labels(self, labels, inplace=False):
1985+
if verify_integrity:
1986+
self._verify_integrity()
1987+
1988+
def set_labels(self, labels, inplace=False, verify_integrity=True):
19541989
"""
19551990
Set new labels on MultiIndex. Defaults to returning
19561991
new index.
@@ -1961,6 +1996,8 @@ def set_labels(self, labels, inplace=False):
19611996
new labels to apply
19621997
inplace : bool
19631998
if True, mutates in place
1999+
verify_integrity : bool (default True)
2000+
if True, checks that levels and labels are compatible
19642001
19652002
Returns
19662003
-------
@@ -1973,13 +2010,14 @@ def set_labels(self, labels, inplace=False):
19732010
else:
19742011
idx = self._shallow_copy()
19752012
idx._reset_identity()
1976-
idx._set_labels(labels)
2013+
idx._set_labels(labels, verify_integrity=verify_integrity)
19772014
if not inplace:
19782015
return idx
19792016

19802017
# remove me in 0.14 and change to read-only property
19812018
__set_labels = deprecate("setting labels directly",
1982-
partial(set_labels, inplace=True),
2019+
partial(set_labels, inplace=True,
2020+
verify_integrity=True),
19832021
alt_name="set_labels")
19842022
labels = property(fget=_get_labels, fset=__set_labels)
19852023

@@ -2392,7 +2430,8 @@ def from_arrays(cls, arrays, sortorder=None, names=None):
23922430
names = [c.name for c in cats]
23932431

23942432
return MultiIndex(levels=levels, labels=labels,
2395-
sortorder=sortorder, names=names)
2433+
sortorder=sortorder, names=names,
2434+
verify_integrity=False)
23962435

23972436
@classmethod
23982437
def from_tuples(cls, tuples, sortorder=None, names=None):
@@ -2463,6 +2502,7 @@ def __setstate__(self, state):
24632502
self._set_labels(labels)
24642503
self._set_names(names)
24652504
self.sortorder = sortorder
2505+
self._verify_integrity()
24662506

24672507
def __getitem__(self, key):
24682508
if np.isscalar(key):
@@ -2502,7 +2542,7 @@ def take(self, indexer, axis=None):
25022542
indexer = com._ensure_platform_int(indexer)
25032543
new_labels = [lab.take(indexer) for lab in self.labels]
25042544
return MultiIndex(levels=self.levels, labels=new_labels,
2505-
names=self.names)
2545+
names=self.names, verify_integrity=False)
25062546

25072547
def append(self, other):
25082548
"""
@@ -2618,7 +2658,7 @@ def droplevel(self, level=0):
26182658
return result
26192659
else:
26202660
return MultiIndex(levels=new_levels, labels=new_labels,
2621-
names=new_names)
2661+
names=new_names, verify_integrity=False)
26222662

26232663
def swaplevel(self, i, j):
26242664
"""
@@ -2645,7 +2685,7 @@ def swaplevel(self, i, j):
26452685
new_names[i], new_names[j] = new_names[j], new_names[i]
26462686

26472687
return MultiIndex(levels=new_levels, labels=new_labels,
2648-
names=new_names)
2688+
names=new_names, verify_integrity=False)
26492689

26502690
def reorder_levels(self, order):
26512691
"""
@@ -2664,7 +2704,7 @@ def reorder_levels(self, order):
26642704
new_names = [self.names[i] for i in order]
26652705

26662706
return MultiIndex(levels=new_levels, labels=new_labels,
2667-
names=new_names)
2707+
names=new_names, verify_integrity=False)
26682708

26692709
def __getslice__(self, i, j):
26702710
return self.__getitem__(slice(i, j))
@@ -2705,7 +2745,8 @@ def sortlevel(self, level=0, ascending=True):
27052745
new_labels = [lab.take(indexer) for lab in self.labels]
27062746

27072747
new_index = MultiIndex(labels=new_labels, levels=self.levels,
2708-
names=self.names, sortorder=level)
2748+
names=self.names, sortorder=level,
2749+
verify_integrity=False)
27092750

27102751
return new_index, indexer
27112752

@@ -3086,7 +3127,8 @@ def truncate(self, before=None, after=None):
30863127
new_labels = [lab[left:right] for lab in self.labels]
30873128
new_labels[0] = new_labels[0] - i
30883129

3089-
return MultiIndex(levels=new_levels, labels=new_labels)
3130+
return MultiIndex(levels=new_levels, labels=new_labels,
3131+
verify_integrity=False)
30903132

30913133
def equals(self, other):
30923134
"""
@@ -3180,7 +3222,7 @@ def intersection(self, other):
31803222
if len(uniq_tuples) == 0:
31813223
return MultiIndex(levels=[[]] * self.nlevels,
31823224
labels=[[]] * self.nlevels,
3183-
names=result_names)
3225+
names=result_names, verify_integrity=False)
31843226
else:
31853227
return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0,
31863228
names=result_names)
@@ -3210,14 +3252,14 @@ def diff(self, other):
32103252
if self.equals(other):
32113253
return MultiIndex(levels=[[]] * self.nlevels,
32123254
labels=[[]] * self.nlevels,
3213-
names=result_names)
3255+
names=result_names, verify_integrity=False)
32143256

32153257
difference = sorted(set(self.values) - set(other.values))
32163258

32173259
if len(difference) == 0:
32183260
return MultiIndex(levels=[[]] * self.nlevels,
32193261
labels=[[]] * self.nlevels,
3220-
names=result_names)
3262+
names=result_names, verify_integrity=False)
32213263
else:
32223264
return MultiIndex.from_tuples(difference, sortorder=0,
32233265
names=result_names)
@@ -3269,7 +3311,7 @@ def insert(self, loc, item):
32693311
new_labels.append(np.insert(labels, loc, lev_loc))
32703312

32713313
return MultiIndex(levels=new_levels, labels=new_labels,
3272-
names=self.names)
3314+
names=self.names, verify_integrity=False)
32733315

32743316
def delete(self, loc):
32753317
"""
@@ -3281,7 +3323,7 @@ def delete(self, loc):
32813323
"""
32823324
new_labels = [np.delete(lab, loc) for lab in self.labels]
32833325
return MultiIndex(levels=self.levels, labels=new_labels,
3284-
names=self.names)
3326+
names=self.names, verify_integrity=False)
32853327

32863328
get_major_bounds = slice_locs
32873329

pandas/core/panel.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,8 @@ def panel_index(time, panels, names=['time', 'panel']):
9696

9797
labels = [time_factor.labels, panel_factor.labels]
9898
levels = [time_factor.levels, panel_factor.levels]
99-
return MultiIndex(levels, labels, sortorder=None, names=names)
99+
return MultiIndex(levels, labels, sortorder=None, names=names,
100+
verify_integrity=False)
100101

101102

102103

@@ -838,7 +839,7 @@ def to_frame(self, filter_observations=True):
838839

839840
index = MultiIndex(levels=[self.major_axis, self.minor_axis],
840841
labels=[major_labels, minor_labels],
841-
names=[maj_name, min_name])
842+
names=[maj_name, min_name], verify_integrity=False)
842843

843844
return DataFrame(data, index=index, columns=self.items)
844845

pandas/core/reshape.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ def get_new_columns(self):
218218
new_labels.append(np.tile(np.arange(stride), width))
219219

220220
return MultiIndex(levels=new_levels, labels=new_labels,
221-
names=new_names)
221+
names=new_names, verify_integrity=False)
222222

223223
def get_new_index(self):
224224
result_labels = []
@@ -234,7 +234,8 @@ def get_new_index(self):
234234
else:
235235
new_index = MultiIndex(levels=self.new_index_levels,
236236
labels=result_labels,
237-
names=self.new_index_names)
237+
names=self.new_index_names,
238+
verify_integrity=False)
238239

239240
return new_index
240241

@@ -286,7 +287,8 @@ def _unstack_multiple(data, clocs):
286287

287288
dummy_index = MultiIndex(levels=rlevels + [obs_ids],
288289
labels=rlabels + [comp_ids],
289-
names=rnames + ['__placeholder__'])
290+
names=rnames + ['__placeholder__'],
291+
verify_integrity=False)
290292

291293
if isinstance(data, Series):
292294
dummy = Series(data.values, index=dummy_index)
@@ -320,7 +322,7 @@ def _unstack_multiple(data, clocs):
320322
new_labels.append(rec.take(unstcols.labels[-1]))
321323

322324
new_columns = MultiIndex(levels=new_levels, labels=new_labels,
323-
names=new_names)
325+
names=new_names, verify_integrity=False)
324326

325327
if isinstance(unstacked, Series):
326328
unstacked.index = new_columns
@@ -505,13 +507,14 @@ def stack(frame, level=-1, dropna=True):
505507
new_names = list(frame.index.names)
506508
new_names.append(frame.columns.name)
507509
new_index = MultiIndex(levels=new_levels, labels=new_labels,
508-
names=new_names)
510+
names=new_names, verify_integrity=False)
509511
else:
510512
ilabels = np.arange(N).repeat(K)
511513
clabels = np.tile(np.arange(K), N).ravel()
512514
new_index = MultiIndex(levels=[frame.index, frame.columns],
513515
labels=[ilabels, clabels],
514-
names=[frame.index.name, frame.columns.name])
516+
names=[frame.index.name, frame.columns.name],
517+
verify_integrity=False)
515518

516519
new_values = frame.values.ravel()
517520
if dropna:
@@ -590,7 +593,7 @@ def _stack_multi_columns(frame, level=-1, dropna=True):
590593
new_names.append(frame.columns.names[level])
591594

592595
new_index = MultiIndex(levels=new_levels, labels=new_labels,
593-
names=new_names)
596+
names=new_names, verify_integrity=False)
594597

595598
result = DataFrame(new_data, index=new_index, columns=new_columns)
596599

pandas/io/pytables.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2209,7 +2209,8 @@ def read_multi_index(self, key):
22092209
lab = self.read_array(label_key)
22102210
labels.append(lab)
22112211

2212-
return MultiIndex(levels=levels, labels=labels, names=names)
2212+
return MultiIndex(levels=levels, labels=labels, names=names,
2213+
verify_integrity=True)
22132214

22142215
def read_index_node(self, node):
22152216
data = node[:]

pandas/sparse/frame.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -767,7 +767,8 @@ def stack_sparse_frame(frame):
767767
major_labels = np.concatenate(inds_to_concat)
768768
stacked_values = np.concatenate(vals_to_concat)
769769
index = MultiIndex(levels=[frame.index, frame.columns],
770-
labels=[major_labels, minor_labels])
770+
labels=[major_labels, minor_labels],
771+
verify_integrity=False)
771772

772773
lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index,
773774
columns=['foo'])

pandas/sparse/panel.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,8 @@ def to_frame(self, filter_observations=True):
317317
minor_labels = inds // N
318318

319319
index = MultiIndex(levels=[self.major_axis, self.minor_axis],
320-
labels=[major_labels, minor_labels])
320+
labels=[major_labels, minor_labels],
321+
verify_integrity=False)
321322

322323
df = DataFrame(values, index=index, columns=self.items)
323324
return df.sortlevel(level=0)

0 commit comments

Comments
 (0)