Skip to content

CLN: union_indexes #58183

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ def fast_multiget(
default=...,
) -> ArrayLike: ...
def fast_unique_multiple_list_gen(gen: Generator, sort: bool = ...) -> list: ...
def fast_unique_multiple_list(lists: list, sort: bool | None = ...) -> list: ...
@overload
def map_infer(
arr: np.ndarray,
Expand Down
34 changes: 3 additions & 31 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -312,34 +312,6 @@ def item_from_zerodim(val: object) -> object:
return val


@cython.wraparound(False)
@cython.boundscheck(False)
def fast_unique_multiple_list(lists: list, sort: bool | None = True) -> list:
cdef:
list buf
Py_ssize_t k = len(lists)
Py_ssize_t i, j, n
list uniques = []
dict table = {}
object val, stub = 0

for i in range(k):
buf = lists[i]
n = len(buf)
for j in range(n):
val = buf[j]
if val not in table:
table[val] = stub
uniques.append(val)
if sort:
try:
uniques.sort()
except TypeError:
pass

return uniques


@cython.wraparound(False)
@cython.boundscheck(False)
def fast_unique_multiple_list_gen(object gen, bint sort=True) -> list:
Expand All @@ -361,15 +333,15 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True) -> list:
list buf
Py_ssize_t j, n
list uniques = []
dict table = {}
object val, stub = 0
set table = set()
object val

for buf in gen:
n = len(buf)
for j in range(n):
val = buf[j]
if val not in table:
table[val] = stub
table.add(val)
uniques.append(val)
if sort:
try:
Expand Down
90 changes: 27 additions & 63 deletions pandas/core/indexes/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,60 +209,6 @@ def union_indexes(indexes, sort: bool | None = True) -> Index:

indexes, kind = _sanitize_and_check(indexes)

def _unique_indices(inds, dtype) -> Index:
"""
Concatenate indices and remove duplicates.

Parameters
----------
inds : list of Index or list objects
dtype : dtype to set for the resulting Index

Returns
-------
Index
"""
if all(isinstance(ind, Index) for ind in inds):
inds = [ind.astype(dtype, copy=False) for ind in inds]
result = inds[0].unique()
other = inds[1].append(inds[2:])
diff = other[result.get_indexer_for(other) == -1]
if len(diff):
result = result.append(diff.unique())
if sort:
result = result.sort_values()
return result

def conv(i):
if isinstance(i, Index):
i = i.tolist()
return i

return Index(
lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort),
dtype=dtype,
)

def _find_common_index_dtype(inds):
"""
Finds a common type for the indexes to pass through to resulting index.

Parameters
----------
inds: list of Index or list objects

Returns
-------
The common type or None if no indexes were given
"""
dtypes = [idx.dtype for idx in indexes if isinstance(idx, Index)]
if dtypes:
dtype = find_common_type(dtypes)
else:
dtype = None

return dtype

if kind == "special":
result = indexes[0]

Expand Down Expand Up @@ -294,18 +240,36 @@ def _find_common_index_dtype(inds):
return result

elif kind == "array":
dtype = _find_common_index_dtype(indexes)
index = indexes[0]
if not all(index.equals(other) for other in indexes[1:]):
index = _unique_indices(indexes, dtype)
if not all_indexes_same(indexes):
dtype = find_common_type([idx.dtype for idx in indexes])
inds = [ind.astype(dtype, copy=False) for ind in indexes]
index = inds[0].unique()
other = inds[1].append(inds[2:])
diff = other[index.get_indexer_for(other) == -1]
if len(diff):
index = index.append(diff.unique())
if sort:
index = index.sort_values()
else:
index = indexes[0]

name = get_unanimous_names(*indexes)[0]
if name != index.name:
index = index.rename(name)
return index
else: # kind='list'
dtype = _find_common_index_dtype(indexes)
return _unique_indices(indexes, dtype)
elif kind == "list":
dtypes = [idx.dtype for idx in indexes if isinstance(idx, Index)]
if dtypes:
dtype = find_common_type(dtypes)
else:
dtype = None
all_lists = (idx.tolist() if isinstance(idx, Index) else idx for idx in indexes)
return Index(
lib.fast_unique_multiple_list_gen(all_lists, sort=bool(sort)),
dtype=dtype,
)
else:
raise ValueError(f"{kind=} must be 'special', 'array' or 'list'.")


def _sanitize_and_check(indexes):
Expand All @@ -329,14 +293,14 @@ def _sanitize_and_check(indexes):
sanitized_indexes : list of Index or list objects
type : {'list', 'array', 'special'}
"""
kinds = list({type(index) for index in indexes})
kinds = {type(index) for index in indexes}

if list in kinds:
if len(kinds) > 1:
indexes = [
Index(list(x)) if not isinstance(x, Index) else x for x in indexes
]
kinds.remove(list)
kinds -= {list}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd expect this to be slightly slower?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In [2]: kinds = {pd.DatetimeIndex, pd.Index, list}

In [3]: %time kinds -= {list}
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 4.29 µs

In [4]: kinds = list({pd.DatetimeIndex, pd.Index, list})

In [5]: %time kinds.remove(list)
CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 8.82 µs

else:
return indexes, "list"

Expand Down