BUG: Rewrite _make_concat_multiindex #25117

Closed · wants to merge 1 commit

1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
@@ -183,6 +183,7 @@ Reshaping

 - Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`)
 - :func:`to_records` now accepts dtypes to its `column_dtypes` parameter (:issue:`24895`)
+- Bug in :func:`concat` creating a malformed :class:`MultiIndex` when passed multiple frames indexed by identical :class:`MultiIndex`es (:issue:`20565`)
Contributor: you need a space before the es I think

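For context, here is a minimal reproduction of the reported behaviour, adapted from the test added later in this PR; the final comment shows the result the patch expects:

```python
import numpy as np
import pandas as pd

# Two copies of a frame whose index repeats the same label three times.
df = pd.DataFrame(np.random.randn(3, 2), columns=['A', 'B'], index=['Z1'] * 3)

# Concatenating with keys should give a two-level index whose second level
# holds only the unique label 'Z1'; previously the level could retain the
# duplicated labels, producing a malformed MultiIndex.
result = pd.concat([df, df], keys=['Key1', 'Key2'], names=['KEY', 'ID'])
print(result.index.levels[1])
# With this patch: Index(['Z1'], dtype='object', name='ID')
```
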
155 changes: 55 additions & 100 deletions pandas/core/reshape/concat.py
@@ -8,12 +8,8 @@

 from pandas import DataFrame, Index, MultiIndex, Series, compat
 from pandas.core import common as com
-from pandas.core.arrays.categorical import (
-    _factorize_from_iterable, _factorize_from_iterables)
 from pandas.core.generic import NDFrame
-from pandas.core.index import (
-    _all_indexes_same, _get_consensus_names, _get_objs_combined_axis,
-    ensure_index)
+from pandas.core.index import _get_objs_combined_axis, ensure_index
 import pandas.core.indexes.base as ibase
 from pandas.core.internals import concatenate_block_managers
@@ -533,103 +529,62 @@ def _concat_indexes(indexes):


 def _make_concat_multiindex(indexes, keys, levels=None, names=None):
-    if ((levels is None and isinstance(keys[0], tuple)) or
-            (levels is not None and len(levels) > 1)):
-        zipped = compat.lzip(*keys)
-        if names is None:
-            names = [None] * len(zipped)
-
-        if levels is None:
-            _, levels = _factorize_from_iterables(zipped)
-        else:
-            levels = [ensure_index(x) for x in levels]
-    else:
-        zipped = [keys]
-        if names is None:
-            names = [None]
-
-        if levels is None:
-            levels = [ensure_index(keys)]
-        else:
-            levels = [ensure_index(x) for x in levels]
-
-    if not _all_indexes_same(indexes):
-        codes_list = []
-
-        # things are potentially different sizes, so compute the exact codes
-        # for each level and pass those to MultiIndex.from_arrays
-
-        for hlevel, level in zip(zipped, levels):
-            to_concat = []
-            for key, index in zip(hlevel, indexes):
-                try:
-                    i = level.get_loc(key)
-                except KeyError:
-                    raise ValueError('Key {key!s} not in level {level!s}'
-                                     .format(key=key, level=level))
-
-                to_concat.append(np.repeat(i, len(index)))
-            codes_list.append(np.concatenate(to_concat))
-
-        concat_index = _concat_indexes(indexes)
-
-        # these go at the end
-        if isinstance(concat_index, MultiIndex):
-            levels.extend(concat_index.levels)
-            codes_list.extend(concat_index.codes)
-        else:
-            codes, categories = _factorize_from_iterable(concat_index)
-            levels.append(categories)
-            codes_list.append(codes)
-
-        if len(names) == len(levels):
-            names = list(names)
-        else:
-            # make sure that all of the passed indices have the same nlevels
-            if not len({idx.nlevels for idx in indexes}) == 1:
-                raise AssertionError("Cannot concat indices that do"
-                                     " not have the same number of levels")
-
-            # also copies
-            names = names + _get_consensus_names(indexes)
-
-        return MultiIndex(levels=levels, codes=codes_list, names=names,
-                          verify_integrity=False)
-
-    new_index = indexes[0]
-    n = len(new_index)
-    kpieces = len(indexes)
-
-    # also copies
-    new_names = list(names)
-    new_levels = list(levels)
-
-    # construct codes
-    new_codes = []
-
-    # do something a bit more speedy
-
-    for hlevel, level in zip(zipped, levels):
-        hlevel = ensure_index(hlevel)
-        mapped = level.get_indexer(hlevel)
-
-        mask = mapped == -1
-        if mask.any():
-            raise ValueError('Values not found in passed level: {hlevel!s}'
-                             .format(hlevel=hlevel[mask]))
-
-        new_codes.append(np.repeat(mapped, n))
-
-    if isinstance(new_index, MultiIndex):
-        new_levels.extend(new_index.levels)
-        new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes])
-    else:
-        new_levels.append(new_index)
-        new_codes.append(np.tile(np.arange(n), kpieces))
-
-    if len(new_names) < len(new_levels):
-        new_names.extend(new_index.names)
-
-    return MultiIndex(levels=new_levels, codes=new_codes, names=new_names,
-                      verify_integrity=False)
+    """
+    Produce a MultiIndex which includes concatenated pieces in "indexes",
+    prepended by one or more levels defined by "keys".
+
+    Parameters
+    ----------
+    indexes : sequence of Index (or subclass) instances.
+        Pieces of new Index.
+    keys : sequence of labels, same length as "indexes".
+        Labels used to index the pieces in "indexes".
+    levels : list of sequences, default None
+        Used to override the ".levels" in the resulting hierarchical index.
+    names : list, default None
+        Names for the levels in the resulting hierarchical index.
+
+    Returns
+    -------
+    concatenated : MultiIndex
+    """
+
+    orig = _concat_indexes(indexes)
+
+    # Simplest way to create and prepend the keys level(s):
+    keys_chunks = [([key] * len(idx)) for (key, idx) in zip(keys, indexes)]

Member: Maybe make this a generator comprehension instead

+    keys_levs = Index([i for l in keys_chunks for i in l],
+                      tupleize_cols=True)
+    tot_df = concat([keys_levs.to_frame().reset_index(drop=True),
+                     orig.to_frame().reset_index(drop=True)], axis=1)
+    temp_names = [None] * keys_levs.nlevels + list(orig.names)

Contributor: name this empty_names

+    result = MultiIndex.from_frame(tot_df, names=temp_names)
+
+    if names is not None:
+        if len(names) == keys_levs.nlevels:
+            # Received only names for keys level(s)
+            result.names = list(names) + list(result.names)[len(names):]
+        else:
+            # Received names for all levels
+            result.names = names
+
+    if levels is not None:

Contributor: can you add some commentary what you are doing here

+        for i, level in enumerate(levels):
+            if level is None:
+                continue
+            cur_lev = result.levels[i]
+            new_lev = Index(level)
+            not_found = np.where(new_lev.get_indexer(cur_lev) == -1)[0]
+
+            if len(not_found):
+                missing = [level[i] for i in not_found]
+                raise ValueError("Values not found in passed level: "
+                                 "{missing!s}"
+                                 .format(missing=missing))
+            cur_val = result.get_level_values(i)
+            result = (result.set_levels(new_lev, level=i)
+                      .set_labels(new_lev.get_indexer_for(cur_val),
+                                  level=i))
+
+    return result
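For readers following the rewrite above, here is a rough standalone sketch of the approach it takes: repeat each key once per row of its piece, then rebuild the index with MultiIndex.from_frame. The inputs (pieces, keys) are invented for illustration, and the names/levels handling is omitted:

```python
import numpy as np
import pandas as pd
from pandas import Index, MultiIndex

# Illustrative inputs: two index pieces of different lengths and their keys.
pieces = [Index(['Z1', 'Z1', 'Z1'], name='ID'), Index(['Z1', 'Z2'], name='ID')]
keys = ['Key1', 'Key2']

# Equivalent of _concat_indexes: append the pieces into one flat index.
orig = pieces[0].append(pieces[1:])

# Repeat each key once per row of its piece (what keys_chunks/keys_levs build).
keys_levs = Index(np.repeat(keys, [len(idx) for idx in pieces]))

# Glue the key column onto the concatenated index and rebuild the result.
tot_df = pd.concat([keys_levs.to_frame().reset_index(drop=True),
                    orig.to_frame().reset_index(drop=True)], axis=1)
result = MultiIndex.from_frame(tot_df, names=['KEY'] + list(orig.names))
print(result.tolist())
# [('Key1', 'Z1'), ('Key1', 'Z1'), ('Key1', 'Z1'), ('Key2', 'Z1'), ('Key2', 'Z2')]
```

Because MultiIndex.from_frame factorizes each column, repeated index labels end up deduplicated in the resulting levels rather than being copied into them verbatim.
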
21 changes: 13 additions & 8 deletions pandas/tests/reshape/test_concat.py
@@ -1303,14 +1303,10 @@ def test_concat_keys_levels_no_overlap(self):
         df2 = DataFrame(np.random.randn(1, 4), index=['b'])

         msg = "Values not found in passed level"
-        with pytest.raises(ValueError, match=msg):
-            concat([df, df],
-                   keys=['one', 'two'], levels=[['foo', 'bar', 'baz']])
-
-        msg = "Key one not in level"
-        with pytest.raises(ValueError, match=msg):
-            concat([df, df2],
-                   keys=['one', 'two'], levels=[['foo', 'bar', 'baz']])
+        for other in df, df2:
+            with pytest.raises(ValueError, match=msg):
+                concat([df, other],
+                       keys=['one', 'two'], levels=[['foo', 'bar', 'baz']])

     def test_concat_rename_index(self):
         a = DataFrame(np.random.rand(3, 3),
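The two previously separate failure modes now go through the same validation, which is why the test can assert a single message. A small sketch of the behaviour this PR proposes (without this patch, the second call raises "Key one not in level ..." instead):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(1, 4), index=['a'])
df2 = pd.DataFrame(np.random.randn(1, 4), index=['b'])

# Neither 'one' nor 'two' appears in the supplied level, so with this patch
# both calls raise ValueError("Values not found in passed level: ...").
for other in (df, df2):
    try:
        pd.concat([df, other], keys=['one', 'two'],
                  levels=[['foo', 'bar', 'baz']])
    except ValueError as err:
        print(err)
```
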
@@ -2436,6 +2432,15 @@ def test_concat_different_extension_dtypes_upcasts(self):
         ], dtype=object)
         tm.assert_series_equal(result, expected)

+    def test_concat_repeated_index(self):
+        # GH 20565
+        df = pd.DataFrame(np.random.randn(3, 2),
+                          columns=['A', 'B'], index=['Z1'] * 3)
+
+        result = pd.concat([df, df], keys=['Key1', 'Key2'],
+                           names=['KEY', 'ID'])
+        expected = pd.Index(['Z1'], name='ID')
+        tm.assert_index_equal(result.index.levels[1], expected)

@pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel])
@pytest.mark.parametrize('dt', np.sctypes['float'])