Skip to content

Commit c9898c1

Browse files
committed
BUG: Rewrite _make_concat_multiindex
closes #20565
1 parent f75a220 commit c9898c1

File tree

3 files changed

+69
-108
lines changed

3 files changed

+69
-108
lines changed

doc/source/whatsnew/v0.25.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,7 @@ Reshaping
183183

184184
- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`)
185185
- :func:`to_records` now accepts dtypes to its `column_dtypes` parameter (:issue:`24895`)
186+
- Bug in :func:`concat` creating a malformed :class:`MultiIndex` when passed multiple frames indexed by identical :class:`MultiIndex` objects (:issue:`20565`)
186187
-
187188

188189

pandas/core/reshape/concat.py

Lines changed: 55 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,8 @@
88

99
from pandas import DataFrame, Index, MultiIndex, Series, compat
1010
from pandas.core import common as com
11-
from pandas.core.arrays.categorical import (
12-
_factorize_from_iterable, _factorize_from_iterables)
1311
from pandas.core.generic import NDFrame
14-
from pandas.core.index import (
15-
_all_indexes_same, _get_consensus_names, _get_objs_combined_axis,
16-
ensure_index)
12+
from pandas.core.index import _get_objs_combined_axis, ensure_index
1713
import pandas.core.indexes.base as ibase
1814
from pandas.core.internals import concatenate_block_managers
1915

@@ -533,103 +529,62 @@ def _concat_indexes(indexes):
533529

534530

535531
def _make_concat_multiindex(indexes, keys, levels=None, names=None):
    """
    Produce a MultiIndex which includes concatenated pieces in "indexes",
    prepended by one or more levels defined by "keys".

    Parameters
    ----------
    indexes : sequence of Index (or subclass) instances.
        Pieces of new Index.
    keys : sequence of labels, same length as "indexes".
        Labels used to index the pieces in "indexes".
    levels : list of sequences, default None
        Used to override the ".levels" in the resulting hierarchical index.
        A ``None`` entry leaves the level at that position untouched.
    names : list, default None
        Names for the levels in the resulting hierarchical index. Either
        one name per key level, or one name per level of the result.

    Returns
    -------
    concatenated : MultiIndex

    Raises
    ------
    ValueError
        If a passed level does not contain every value that is present in
        the corresponding level of the result.
    """
    orig = _concat_indexes(indexes)

    # Simplest way to create and prepend the keys level(s): repeat every key
    # once per row of its piece and let MultiIndex.from_frame factorize.
    repeated_keys = [key
                     for key, idx in zip(keys, indexes)
                     for _ in range(len(idx))]
    keys_levs = Index(repeated_keys, tupleize_cols=True)
    tot_df = concat([keys_levs.to_frame().reset_index(drop=True),
                     orig.to_frame().reset_index(drop=True)], axis=1)
    temp_names = [None] * keys_levs.nlevels + list(orig.names)
    result = MultiIndex.from_frame(tot_df, names=temp_names)

    if names is not None:
        if len(names) == keys_levs.nlevels:
            # Received only names for keys level(s)
            result.names = list(names) + list(result.names)[len(names):]
        else:
            # Received names for all levels
            result.names = names

    if levels is not None:
        for i, level in enumerate(levels):
            if level is None:
                continue
            cur_lev = result.levels[i]
            new_lev = Index(level)
            not_found = np.where(new_lev.get_indexer(cur_lev) == -1)[0]

            if len(not_found):
                # "not_found" holds positions in cur_lev (the values that
                # are present in the result), not in the passed level.
                missing = list(cur_lev[not_found])
                raise ValueError("Values not found in passed level: "
                                 "{missing!s}"
                                 .format(missing=missing))
            cur_val = result.get_level_values(i)
            # Swap in the new level first, then re-point the codes at it.
            result = (result.set_levels(new_lev, level=i)
                      .set_codes(new_lev.get_indexer_for(cur_val),
                                 level=i))

    return result

pandas/tests/reshape/test_concat.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1303,14 +1303,10 @@ def test_concat_keys_levels_no_overlap(self):
13031303
df2 = DataFrame(np.random.randn(1, 4), index=['b'])
13041304

13051305
msg = "Values not found in passed level"
1306-
with pytest.raises(ValueError, match=msg):
1307-
concat([df, df],
1308-
keys=['one', 'two'], levels=[['foo', 'bar', 'baz']])
1309-
1310-
msg = "Key one not in level"
1311-
with pytest.raises(ValueError, match=msg):
1312-
concat([df, df2],
1313-
keys=['one', 'two'], levels=[['foo', 'bar', 'baz']])
1306+
for other in df, df2:
1307+
with pytest.raises(ValueError, match=msg):
1308+
concat([df, other],
1309+
keys=['one', 'two'], levels=[['foo', 'bar', 'baz']])
13141310

13151311
def test_concat_rename_index(self):
13161312
a = DataFrame(np.random.rand(3, 3),
@@ -2436,6 +2432,15 @@ def test_concat_different_extension_dtypes_upcasts(self):
24362432
], dtype=object)
24372433
tm.assert_series_equal(result, expected)
24382434

2435+
def test_concat_repeated_index(self):
    # GH 20565: concatenating frames whose index repeats a single label
    # must not produce duplicated entries in the resulting levels.
    df = pd.DataFrame(np.random.randn(3, 2),
                      columns=['A', 'B'], index=['Z1'] * 3)

    result = pd.concat([df, df], keys=['Key1', 'Key2'],
                       names=['KEY', 'ID'])

    expected_level = pd.Index(['Z1'], name='ID')
    tm.assert_index_equal(result.index.levels[1], expected_level)

    # Also pin the complete index so that wrong codes or a wrong key level
    # cannot slip through.
    expected_index = pd.MultiIndex.from_arrays(
        [['Key1'] * 3 + ['Key2'] * 3, ['Z1'] * 6], names=['KEY', 'ID'])
    tm.assert_index_equal(result.index, expected_index)
24392444

24402445
@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel])
24412446
@pytest.mark.parametrize('dt', np.sctypes['float'])

0 commit comments

Comments
 (0)