From ee8911815ba164655d7b47324906882ae6c13fab Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 2 Jun 2021 06:45:06 -0700 Subject: [PATCH] BUG: clean_index_list handle uint64 case --- pandas/_libs/lib.pyi | 2 +- pandas/_libs/lib.pyx | 28 ++++++++++++++++++++-------- pandas/core/indexes/base.py | 28 ++++++---------------------- pandas/core/indexing.py | 4 +++- pandas/tests/libs/test_lib.py | 12 ++++++++++++ 5 files changed, 42 insertions(+), 32 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 06620c2ad0dca..92daad2d6a5d7 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -185,7 +185,7 @@ def maybe_indices_to_slice( ) -> slice | np.ndarray: ... # np.ndarray[np.uint8] def clean_index_list(obj: list) -> tuple[ - list | np.ndarray, # np.ndarray[object] | np.ndarray[np.int64] + list | np.ndarray, # np.ndarray[object | np.int64 | np.uint64] bool, ]: ... diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4d184ee13e3db..cbe5a556d55b0 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -747,10 +747,14 @@ def clean_index_list(obj: list): object val bint all_arrays = True + # First check if we have a list of arraylikes, in which case we will + # pass them to MultiIndex.from_arrays for i in range(n): val = obj[i] if not (isinstance(val, list) or util.is_array(val) or hasattr(val, '_data')): + # TODO: EA? 
+ # exclude tuples, frozensets as they may be contained in an Index all_arrays = False break @@ -762,11 +766,21 @@ def clean_index_list(obj: list): if inferred in ['string', 'bytes', 'mixed', 'mixed-integer']: return np.asarray(obj, dtype=object), 0 elif inferred in ['integer']: - # TODO: we infer an integer but it *could* be a uint64 - try: - return np.asarray(obj, dtype='int64'), 0 - except OverflowError: - return np.asarray(obj, dtype='object'), 0 + # we infer an integer but it *could* be a uint64 + + arr = np.asarray(obj) + if arr.dtype.kind not in ["i", "u"]: + # eg [0, uint64max] gets cast to float64, + # but then we know we have either uint64 or object + if (arr < 0).any(): + # TODO: similar to maybe_cast_to_integer_array + return np.asarray(obj, dtype="object"), 0 + + # GH#35481 + guess = np.asarray(obj, dtype="uint64") + return guess, 0 + + return arr, 0 return np.asarray(obj), 0 @@ -1552,9 +1566,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: for i in range(n): val = values[i] - if (util.is_integer_object(val) and - not util.is_timedelta64_object(val) and - not util.is_datetime64_object(val)): + if util.is_integer_object(val): return "mixed-integer" return "mixed" diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 02fd680775141..14ec3d6009b61 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6299,27 +6299,18 @@ def ensure_index(index_like: AnyArrayLike | Sequence, copy: bool = False) -> Ind if copy: index_like = index_like.copy() return index_like - if hasattr(index_like, "name"): - # https://github.com/python/mypy/issues/1424 - # error: Item "ExtensionArray" of "Union[ExtensionArray, - # Sequence[Any]]" has no attribute "name" - # error: Item "Sequence[Any]" of "Union[ExtensionArray, Sequence[Any]]" - # has no attribute "name" - # error: "Sequence[Any]" has no attribute "name" - # error: Item "Sequence[Any]" of "Union[Series, Sequence[Any]]" has no - # attribute "name" - # error: 
Item "Sequence[Any]" of "Union[Any, Sequence[Any]]" has no - # attribute "name" - name = index_like.name # type: ignore[union-attr, attr-defined] + + if isinstance(index_like, ABCSeries): + name = index_like.name return Index(index_like, name=name, copy=copy) if is_iterator(index_like): index_like = list(index_like) - # must check for exactly list here because of strict type - # check in clean_index_list if isinstance(index_like, list): - if type(index_like) != list: + if type(index_like) is not list: + # must check for exactly list here because of strict type + # check in clean_index_list index_like = list(index_like) converted, all_arrays = lib.clean_index_list(index_like) @@ -6329,13 +6320,6 @@ def ensure_index(index_like: AnyArrayLike | Sequence, copy: bool = False) -> Ind return MultiIndex.from_arrays(converted) else: - if isinstance(converted, np.ndarray) and converted.dtype == np.int64: - # Check for overflows if we should actually be uint64 - # xref GH#35481 - alt = np.asarray(index_like) - if alt.dtype == np.uint64: - converted = alt - index_like = converted else: # clean_index_list does the equivalent of copying diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index be5b89f08b5ca..d5555561088eb 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1934,7 +1934,9 @@ def _setitem_with_indexer_missing(self, indexer, value): # e.g. 
0.0 -> 0 # GH#12246 if index.is_unique: - new_indexer = index.get_indexer([new_index[-1]]) + # pass new_index[-1:] instead of [new_index[-1]] + # so that we retain dtype + new_indexer = index.get_indexer(new_index[-1:]) if (new_indexer != -1).any(): # We get only here with loc, so can hard code return self._setitem_with_indexer(new_indexer, value, "loc") diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py index 5b7e90fe16d8f..0b1f807f2da63 100644 --- a/pandas/tests/libs/test_lib.py +++ b/pandas/tests/libs/test_lib.py @@ -206,3 +206,15 @@ def test_no_default_pickle(): # GH#40397 obj = tm.round_trip_pickle(lib.no_default) assert obj is lib.no_default + + +def test_clean_index_list(): + # with both 0 and a large-uint64, np.array will infer to float64 + # https://github.com/numpy/numpy/issues/19146 + # but a more accurate choice would be uint64 + values = [0, np.iinfo(np.uint64).max] + + result, _ = lib.clean_index_list(values) + + expected = np.array(values, dtype="uint64") + tm.assert_numpy_array_equal(result, expected, check_dtype=True)