BUG: Rewrite _make_concat_multiindex

toobaz · toobaz · commit c9898c177771 · 2019-02-03T16:52:55.000+01:00
closes #20565
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -183,6 +183,7 @@ Reshaping
 
 - Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`)
 - :func:`to_records` now accepts dtypes to its `column_dtypes` parameter (:issue:`24895`)
+- Bug in :func:`concat` creating a malformed :class:`MultiIndex` when passed multiple frames indexed by identical :class:`MultiIndex` objects (:issue:`20565`)
 -
 
 
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
@@ -8,12 +8,8 @@
 
 from pandas import DataFrame, Index, MultiIndex, Series, compat
 from pandas.core import common as com
-from pandas.core.arrays.categorical import (
-    _factorize_from_iterable, _factorize_from_iterables)
 from pandas.core.generic import NDFrame
-from pandas.core.index import (
-    _all_indexes_same, _get_consensus_names, _get_objs_combined_axis,
-    ensure_index)
+from pandas.core.index import _get_objs_combined_axis, ensure_index
 import pandas.core.indexes.base as ibase
 from pandas.core.internals import concatenate_block_managers
 
@@ -533,103 +529,62 @@ def _concat_indexes(indexes):
 
 
 def _make_concat_multiindex(indexes, keys, levels=None, names=None):
+    """
+    Produce a MultiIndex which includes concatenated pieces in "indexes",
+    prepended by one or more levels defined by "keys".
 
-    if ((levels is None and isinstance(keys[0], tuple)) or
-            (levels is not None and len(levels) > 1)):
-        zipped = compat.lzip(*keys)
-        if names is None:
-            names = [None] * len(zipped)
-
-        if levels is None:
-            _, levels = _factorize_from_iterables(zipped)
-        else:
-            levels = [ensure_index(x) for x in levels]
-    else:
-        zipped = [keys]
-        if names is None:
-            names = [None]
-
-        if levels is None:
-            levels = [ensure_index(keys)]
-        else:
-            levels = [ensure_index(x) for x in levels]
-
-    if not _all_indexes_same(indexes):
-        codes_list = []
-
-        # things are potentially different sizes, so compute the exact codes
-        # for each level and pass those to MultiIndex.from_arrays
-
-        for hlevel, level in zip(zipped, levels):
-            to_concat = []
-            for key, index in zip(hlevel, indexes):
-                try:
-                    i = level.get_loc(key)
-                except KeyError:
-                    raise ValueError('Key {key!s} not in level {level!s}'
-                                     .format(key=key, level=level))
-
-                to_concat.append(np.repeat(i, len(index)))
-            codes_list.append(np.concatenate(to_concat))
+    Parameters
+    ----------
+    indexes : sequence of Index (or subclass) instances.
+        Pieces of new Index.
+    keys : sequence of labels, same length as "indexes".
+        Labels used to index the pieces in "indexes".
+    levels : list of sequences, default None
+        Used to override the ".levels" in the resulting hierarchical index.
+    names : list, default None
+        Names for the levels in the resulting hierarchical index.
 
-        concat_index = _concat_indexes(indexes)
+    Returns
+    -------
+    concatenated : MultiIndex
 
-        # these go at the end
-        if isinstance(concat_index, MultiIndex):
-            levels.extend(concat_index.levels)
-            codes_list.extend(concat_index.codes)
-        else:
-            codes, categories = _factorize_from_iterable(concat_index)
-            levels.append(categories)
-            codes_list.append(codes)
+    """
 
-        if len(names) == len(levels):
-            names = list(names)
+    orig = _concat_indexes(indexes)
+
+    # Simplest way to create and prepend the keys level(s):
+    keys_chunks = [([key] * len(idx)) for (key, idx) in zip(keys, indexes)]
+    keys_levs = Index([i for l in keys_chunks for i in l],
+                      tupleize_cols=True)
+    tot_df = concat([keys_levs.to_frame().reset_index(drop=True),
+                     orig.to_frame().reset_index(drop=True)], axis=1)
+    temp_names = [None] * keys_levs.nlevels + list(orig.names)
+    result = MultiIndex.from_frame(tot_df, names=temp_names)
+
+    if names is not None:
+        if len(names) == keys_levs.nlevels:
+            # Received only names for keys level(s)
+            result.names = list(names) + list(result.names)[len(names):]
         else:
-            # make sure that all of the passed indices have the same nlevels
-            if not len({idx.nlevels for idx in indexes}) == 1:
-                raise AssertionError("Cannot concat indices that do"
-                                     " not have the same number of levels")
-
-            # also copies
-            names = names + _get_consensus_names(indexes)
-
-        return MultiIndex(levels=levels, codes=codes_list, names=names,
-                          verify_integrity=False)
-
-    new_index = indexes[0]
-    n = len(new_index)
-    kpieces = len(indexes)
-
-    # also copies
-    new_names = list(names)
-    new_levels = list(levels)
-
-    # construct codes
-    new_codes = []
-
-    # do something a bit more speedy
-
-    for hlevel, level in zip(zipped, levels):
-        hlevel = ensure_index(hlevel)
-        mapped = level.get_indexer(hlevel)
-
-        mask = mapped == -1
-        if mask.any():
-            raise ValueError('Values not found in passed level: {hlevel!s}'
-                             .format(hlevel=hlevel[mask]))
-
-        new_codes.append(np.repeat(mapped, n))
-
-    if isinstance(new_index, MultiIndex):
-        new_levels.extend(new_index.levels)
-        new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes])
-    else:
-        new_levels.append(new_index)
-        new_codes.append(np.tile(np.arange(n), kpieces))
-
-    if len(new_names) < len(new_levels):
-        new_names.extend(new_index.names)
-
-    return MultiIndex(levels=new_levels, codes=new_codes, names=new_names,
-                      verify_integrity=False)
+            # Received names for all levels
+            result.names = names
+
+    if levels is not None:
+        for i, level in enumerate(levels):
+            if level is None:
+                continue
+            cur_lev = result.levels[i]
+            new_lev = Index(level)
+            not_found = np.where(new_lev.get_indexer(cur_lev) == -1)[0]
+
+            if len(not_found):
+                missing = [level[i] for i in not_found]
+                raise ValueError("Values not found in passed level: "
+                                 "{missing!s}"
+                                 .format(missing=missing))
+            cur_val = result.get_level_values(i)
+            result = (result.set_levels(new_lev, level=i)
+                            .set_labels(new_lev.get_indexer_for(cur_val),
+                                        level=i))
+
+    return result
diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
@@ -1303,14 +1303,10 @@ def test_concat_keys_levels_no_overlap(self):
         df2 = DataFrame(np.random.randn(1, 4), index=['b'])
 
         msg = "Values not found in passed level"
-        with pytest.raises(ValueError, match=msg):
-            concat([df, df],
-                   keys=['one', 'two'], levels=[['foo', 'bar', 'baz']])
-
-        msg = "Key one not in level"
-        with pytest.raises(ValueError, match=msg):
-            concat([df, df2],
-                   keys=['one', 'two'], levels=[['foo', 'bar', 'baz']])
+        for other in df, df2:
+            with pytest.raises(ValueError, match=msg):
+                concat([df, other],
+                       keys=['one', 'two'], levels=[['foo', 'bar', 'baz']])
 
     def test_concat_rename_index(self):
         a = DataFrame(np.random.rand(3, 3),
@@ -2436,6 +2432,15 @@ def test_concat_different_extension_dtypes_upcasts(self):
         ], dtype=object)
         tm.assert_series_equal(result, expected)
 
+    def test_concat_repeated_index(self):
+        # GH 20565
+        df = pd.DataFrame(np.random.randn(3, 2),
+                          columns=['A', 'B'], index=['Z1'] * 3)
+
+        result = pd.concat([df, df], keys=['Key1', 'Key2'],
+                           names=['KEY', 'ID'])
+        expected = pd.Index(['Z1'], name='ID')
+        tm.assert_index_equal(result.index.levels[1], expected)
 
 @pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel])
 @pytest.mark.parametrize('dt', np.sctypes['float'])

Original file line number	Diff line number	Diff line change
`@@ -183,6 +183,7 @@ Reshaping`
`183`	`183`
`184`	`184`	- Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`)
`185`	`185`	- :func:`to_records` now accepts dtypes to its `column_dtypes` parameter (:issue:`24895`)
	`186`	+- Bug in :func:`concat` creating a malformed :class:`MultiIndex` when passed multiple frames indexed by identical :class:`MultiIndex` objects (:issue:`20565`)
`186`	`187`	`-`
`187`	`188`
`188`	`189`