Skip to content
This repository was archived by the owner on Dec 22, 2019. It is now read-only.

Handle columns index duplicates #7

Merged
merged 4 commits into from
Sep 29, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 12 additions & 60 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6420,31 +6420,15 @@ def _append_list_of_frames(self, other, *args, **kwargs):
_obj_type = kwargs['_obj_type']
_item_type = kwargs.get('_item_type')

from pandas.core.indexes.api import (
CannotSortError,
_normalize_dataframes,
)
from pandas.core.indexes.api import _normalize_dataframes
from pandas.core.reshape.concat import concat

# The default value of sort in version 0.23.0 is None.
# The behavior when this was the value is very
# varied and changes according to input type, columns index
# type, whether a reindex is necessary or not, etc.
#
# The code below is a try to reproduce the old behavior,
# but note that this is deprecated.
#
# TODO: handle sort=None here

# The behavior of concat is a bit problematic as it is. To get around
# this, we prepare the DataFrames before feeding them into concat.
# TODO: sorting behavior when sort=None

# The behavior of concat is a bit problematic as it is. To work around
# this, we prepare the DataFrames before feeding them into concat.
to_concat = [self] + other
try:
to_concat_norm = _normalize_dataframes(to_concat, sort=sort)
except CannotSortError:
raise TypeError("The resulting columns could not be sorted."
" You can try setting sort=False or use"
" compatible index types.")
to_concat_norm = _normalize_dataframes(to_concat, sort=sort)
result = concat(to_concat_norm, ignore_index=ignore_index,
verify_integrity=verify_integrity, sort=sort)

Expand All @@ -6454,45 +6438,13 @@ def _append_list_of_frames(self, other, *args, **kwargs):
if not ignore_index:
result.index.name = self.index.name

# the conditionals below will be refactored or removed

if sort is None:
# The sorting behaviour for None was weird.
# It is getting deprecated.
#
# By now, fix tests by only sorting when the
# original 'other' was a series or a dict.
if _obj_type in (dict, Series):
sort = False
elif _item_type in (dict, Series):
# A list of dicts/Series had a different behaviour
# when sorting is None.
#
# We do not sort if the 'other' columns are all
# contained in self.columns. Otherwise we do
# sort.
#
# TODO: as per documentation, this seems like the original
# behaviour intended for append. Should I implement this
# for any inputs that come?
self_idx = self.columns
other_idx = other[0].columns
idx_diff = other_idx.difference(self_idx)
sort = len(idx_diff) > 0
else:
sort = True

# Reindexing the columns created an artificial float64 where it
# was not needed. We can convert the columns back to the expected
# type.
if result.shape[0] == 1:
from pandas.core.dtypes.cast import find_common_type

# Reindexing the columns created an artificial float64 where it
# was not needed. We can convert the columns back to the expected
# type.

for col in result:
types = [df[col].dtype for df in to_concat if col in df]
common_type = find_common_type(types)
result[col] = result[col].astype(common_type)
base_frame = next(df for df in to_concat_norm if df.shape[0] == 1)
dtypes = base_frame.dtypes.to_dict()
result = result.astype(dtypes) # won't work well with duplicate columns

return result

Expand Down
106 changes: 84 additions & 22 deletions pandas/core/indexes/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@
_new_Index,
ensure_index,
ensure_index_from_sequences,
CannotSortError,
InvalidIndexError
InvalidIndexError,
)
from pandas.core.indexes.category import CategoricalIndex # noqa
from pandas.core.indexes.multi import MultiIndex # noqa
Expand All @@ -38,6 +37,18 @@
""")


# Private signal raised by _sort_index when index.sort_values() raises TypeError.
# _normalize_dataframes catches it and re-raises a TypeError with a user-facing
# message.
class _CannotSortError(Exception):
pass


# Private signal raised by _merge_index_list when sort is requested and at least
# one of the distinct input indexes has duplicate values.
# _normalize_dataframes catches it and re-raises InvalidIndexError.
class _CannotSortDuplicatesError(Exception):
pass


# Private signal raised by _merge_index_list when sort is not requested,
# verify_dups is set and an input index has duplicate values -- unless
# allow_matching_dups is set and all inputs are the same object (a is b).
# _normalize_dataframes catches it and re-raises InvalidIndexError.
class _DuplicatesError(Exception):
pass


# TODO: there are many places that rely on these private methods existing in
# pandas.core.index
__all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
Expand Down Expand Up @@ -181,20 +192,40 @@ def _normalize_dataframes(frame_list, verify_inputs=True, sort=False):
----------
frame_list: list of DataFrame objects
verify_inputs: boolean, default True
Verify if the input indexes contain overlapping values.
Verify if the input indexes contain duplicate values. Ignored when all
input indexes share the same identity (a is b).
sort: boolean, default False
Order result index. If False, values will come in the order they
Order resulting index. If False, values will come in the order they
appear.

Raises
------
CannotSortError
When sort=True and the result index is not sortable.
InvalidIndexError
When verify_inputs=True and 1+ of the indexes contain duplicates.
InvalidIndexError
When at least one of the column indexes contains duplicates
and they are not allowed.
TypeError
When sort=True and the resulting column index could not be sorted.
"""
orig_columns = [df.columns for df in frame_list]
merged_columns = _merge_index_list(orig_columns, verify_inputs, sort)

kwargs = {
'verify_dups': verify_inputs,
'allow_matching_dups': verify_inputs,
'sort': sort,
}

try:
merged_columns = _merge_index_list(orig_columns, **kwargs)
except _DuplicatesError:
raise InvalidIndexError("Indexes with duplicates are only allowed"
" when they are the same (a is b).")
except _CannotSortDuplicatesError:
raise InvalidIndexError("When sort=True, indexes with duplicate"
" values are not allowed.")
except _CannotSortError:
raise TypeError("The resulting columns could not be sorted."
" You can try setting sort=False or use"
" compatible index types.")

# Because _merge_index_list may infer the index dtype based on values,
# we have to provide a workaround to conserve the original dtype.
Expand All @@ -217,33 +248,64 @@ def _normalize_dataframes(frame_list, verify_inputs=True, sort=False):
return [_reindex(df, merged_columns, axis=1) for df in frame_list]


def _merge_index_list(index_list, verify_inputs=True, sort=False):
def _merge_index_list(index_list,
verify_dups=True,
allow_matching_dups=False,
sort=False):
"""Merge a list of indexes into one big index

Parameters
----------
index_list: list of Index objects
verify_inputs: boolean, default True
Verify if the input indexes contain overlapping values.
verify_dups: boolean, default True
Verify if the input indexes contain duplicate values.
allow_matching_dups: boolean, default False
Only relevant when verify_dups=True. Allow duplicate values when all
indexes have the same identity.
sort: boolean, default False
Order result index. If False, values will come in the order they
appear.

Raises
------
CannotSortError
_CannotSortError
When sort=True and the result index is not sortable.
InvalidIndexError
When verify_inputs=True and 1+ of the indexes contain duplicates.
_CannotSortDuplicatesError
When sort=True and at least one of the inputs contains duplicate
values.
_DuplicatesError
When verify_dups=True and at least one of the input indexes contains
duplicate values. This error is not raised if
allow_matching_dups=True and all the indexes have a common identity.
"""
if verify_inputs:
if any([ix.has_duplicates for ix in index_list]):
raise InvalidIndexError("Input index has duplicate values")

result = index_list[0]
for idx in index_list[1:]:
# unique index list (a is b)
uindex_list = com.get_distinct_objs(index_list)

# verify duplicates
if sort or verify_dups:
has_dups = any(ix.has_duplicates for ix in uindex_list)
if has_dups:
if sort:
raise _CannotSortDuplicatesError("Cannot sort an index that"
" contains duplicate values.")
elif verify_dups and not allow_matching_dups:
raise _DuplicatesError("Index has duplicate values.")
elif verify_dups and allow_matching_dups and len(uindex_list) >= 2:
raise _DuplicatesError("Index has duplicate values and does"
" not match other indexes.")

# edge results
if len(uindex_list) == 0:
return pd.Index()
elif len(uindex_list) == 1:
return uindex_list[0]

# reduce to one result
result = uindex_list[0]
for idx in uindex_list[1:]:
result = _merge_indexes(result, idx)

# sort
return result if not sort else _sort_index(result)


Expand Down Expand Up @@ -278,7 +340,7 @@ def _sort_index(index):
try:
return index.sort_values()
except TypeError:
raise CannotSortError
raise _CannotSortError


def _reindex(df, new_index, axis=0):
Expand Down
4 changes: 0 additions & 4 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,10 +158,6 @@ def index_arithmetic_method(self, other):
return set_function_name(index_arithmetic_method, name, cls)


class CannotSortError(Exception):
pass


# Exception type for invalid index input. NOTE(review): the only use visible in
# this change is _normalize_dataframes (pandas/core/indexes/api.py), which raises
# it when column indexes contain duplicates that are not allowed; other callers
# are not shown here.
class InvalidIndexError(Exception):
pass

Expand Down
Loading