From 26649f00fa6a52fa25fbd74e0e24b4094b50238d Mon Sep 17 00:00:00 2001 From: jschendel Date: Mon, 20 Nov 2017 16:40:16 -0700 Subject: [PATCH 1/4] Change UInt64Index._na_value from 0 to np.nan --- doc/source/whatsnew/v0.22.0.txt | 2 +- pandas/core/indexes/base.py | 19 ++++++----- pandas/core/indexes/numeric.py | 1 - pandas/tests/indexes/test_base.py | 21 +++++++++++- pandas/tests/indexes/test_numeric.py | 49 ++++++++++++++-------------- 5 files changed, 55 insertions(+), 37 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 90032a692fd15..75b7abe9188ff 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -127,7 +127,7 @@ Bug Fixes Conversion ^^^^^^^^^^ -- +- Bug in :class:`Index` constructor with `dtype='uint64'` where int-like floats were not coerced to :class:`UInt64Index` (:issue:`18400`) - - diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7a34e64724245..06935c1687033 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -251,7 +251,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # then coerce to integer. try: return cls._try_convert_to_int_index( - data, copy, name) + data, copy, name, dtype) except ValueError: pass @@ -307,7 +307,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, if inferred == 'integer': try: return cls._try_convert_to_int_index( - subarr, copy, name) + subarr, copy, name, dtype) except ValueError: pass @@ -664,7 +664,7 @@ def ravel(self, order='C'): # construction helpers @classmethod - def _try_convert_to_int_index(cls, data, copy, name): + def _try_convert_to_int_index(cls, data, copy, name, dtype): """ Attempt to convert an array of data into an integer index. 
@@ -685,12 +685,13 @@ def _try_convert_to_int_index(cls, data, copy, name): """ from .numeric import Int64Index, UInt64Index - try: - res = data.astype('i8', copy=False) - if (res == data).all(): - return Int64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass + if not is_unsigned_integer_dtype(dtype): + try: + res = data.astype('i8', copy=False) + if (res == data).all(): + return Int64Index(res, copy=copy, name=name) + except (OverflowError, TypeError, ValueError): + pass # Conversion to int64 failed (possibly due to # overflow), so let's try now with uint64. diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index ae6a810ece510..fddbb2de83dca 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -216,7 +216,6 @@ class UInt64Index(NumericIndex): _inner_indexer = libjoin.inner_join_indexer_uint64 _outer_indexer = libjoin.outer_join_indexer_uint64 _can_hold_na = False - _na_value = 0 _engine_type = libindex.UInt64Engine _default_dtype = np.uint64 diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index c55f53601848c..cb2c36561b7b7 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -14,7 +14,7 @@ import numpy as np from pandas import (period_range, date_range, Series, - DataFrame, Float64Index, Int64Index, + DataFrame, Float64Index, Int64Index, UInt64Index, CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex, isna) from pandas.core.index import _get_combined_index, _ensure_index_from_sequences @@ -201,6 +201,25 @@ def __array__(self, dtype=None): result = pd.Index(ArrayLike(array)) tm.assert_index_equal(result, expected) + def test_constructor_int_dtype_float(self): + # GH 18400 + data = [0., 1., 2., 3.] 
+ + expected = Int64Index([0, 1, 2, 3]) + result = Index(data, dtype='int64') + tm.assert_index_equal(result, expected) + + expected = UInt64Index([0, 1, 2, 3]) + result = Index(data, dtype='uint64') + tm.assert_index_equal(result, expected) + + # fall back to Float64Index + data = [0.0, 1.1, 2.2, 3.3] + expected = Float64Index(data) + for dtype in ('int64', 'uint64'): + result = Index(data, dtype=dtype) + tm.assert_index_equal(result, expected) + def test_constructor_int_dtype_nan(self): # see gh-15187 data = [np.nan] diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index a96c677852339..8ee58afbaefc5 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -658,6 +658,30 @@ def test_ufunc_coercions(self): exp = Float64Index([0.5, 1., 1.5, 2., 2.5], name='x') tm.assert_index_equal(result, exp) + def test_where(self): + i = self.create_index() + result = i.where(notna(i)) + expected = i + tm.assert_index_equal(result, expected) + + _nan = i._na_value + cond = [False] + [True] * len(i[1:]) + expected = Float64Index([_nan] + i[1:].tolist()) + result = i.where(cond) + tm.assert_index_equal(result, expected) + + def test_where_array_like(self): + i = self.create_index() + + _nan = i._na_value + cond = [False] + [True] * (len(i) - 1) + klasses = [list, tuple, np.array, pd.Series] + expected = Float64Index([_nan] + i[1:].tolist()) + + for klass in klasses: + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + class TestInt64Index(NumericInt): _dtype = 'int64' @@ -726,31 +750,6 @@ def test_coerce_list(self): arr = Index([1, 2, 3, 4], dtype=object) assert isinstance(arr, Index) - def test_where(self): - i = self.create_index() - result = i.where(notna(i)) - expected = i - tm.assert_index_equal(result, expected) - - _nan = i._na_value - cond = [False] + [True] * len(i[1:]) - expected = pd.Index([_nan] + i[1:].tolist()) - - result = i.where(cond) - 
tm.assert_index_equal(result, expected) - - def test_where_array_like(self): - i = self.create_index() - - _nan = i._na_value - cond = [False] + [True] * (len(i) - 1) - klasses = [list, tuple, np.array, pd.Series] - expected = pd.Index([_nan] + i[1:].tolist()) - - for klass in klasses: - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) - def test_get_indexer(self): target = Int64Index(np.arange(10)) indexer = self.index.get_indexer(target) From 901f209c6912eae27061ddf7602fb8d856e5fdf6 Mon Sep 17 00:00:00 2001 From: jschendel Date: Tue, 21 Nov 2017 00:46:05 -0700 Subject: [PATCH 2/4] update test --- pandas/tests/indexes/test_base.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index cb2c36561b7b7..99a99cc5cc3eb 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -5,6 +5,7 @@ from datetime import datetime, timedelta import pandas.util.testing as tm +from pandas.core.dtypes.common import is_unsigned_integer_dtype from pandas.core.indexes.api import Index, MultiIndex from pandas.tests.indexes.common import Base @@ -201,25 +202,20 @@ def __array__(self, dtype=None): result = pd.Index(ArrayLike(array)) tm.assert_index_equal(result, expected) - def test_constructor_int_dtype_float(self): + @pytest.mark.parametrize('dtype', [ + int, 'int64', 'int32', 'int16', 'int8', 'uint64', 'uint32', + 'uint16', 'uint8']) + def test_constructor_int_dtype_float(self, dtype): # GH 18400 - data = [0., 1., 2., 3.] 
- - expected = Int64Index([0, 1, 2, 3]) - result = Index(data, dtype='int64') - tm.assert_index_equal(result, expected) + if is_unsigned_integer_dtype(dtype): + index_type = UInt64Index + else: + index_type = Int64Index - expected = UInt64Index([0, 1, 2, 3]) - result = Index(data, dtype='uint64') + expected = index_type([0, 1, 2, 3]) + result = Index([0., 1., 2., 3.], dtype=dtype) tm.assert_index_equal(result, expected) - # fall back to Float64Index - data = [0.0, 1.1, 2.2, 3.3] - expected = Float64Index(data) - for dtype in ('int64', 'uint64'): - result = Index(data, dtype=dtype) - tm.assert_index_equal(result, expected) - def test_constructor_int_dtype_nan(self): # see gh-15187 data = [np.nan] From 22a61a93a5e6f3db9e533b66b49f403c69327d53 Mon Sep 17 00:00:00 2001 From: jschendel Date: Wed, 22 Nov 2017 01:55:39 -0700 Subject: [PATCH 3/4] review edits --- doc/source/whatsnew/v0.22.0.txt | 1 + pandas/core/indexes/base.py | 6 ++-- pandas/tests/indexes/common.py | 28 +++++----------- pandas/tests/indexes/period/test_period.py | 23 ++++--------- pandas/tests/indexes/test_category.py | 27 +++++---------- pandas/tests/indexes/test_interval.py | 19 +++++------ pandas/tests/indexes/test_numeric.py | 38 ++++++++-------------- pandas/tests/indexes/test_range.py | 27 +-------------- 8 files changed, 52 insertions(+), 117 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 75b7abe9188ff..b544a626e5f33 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -52,6 +52,7 @@ Backwards incompatible API changes - :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`) - :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`) +- The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN`` (:issue:`18398`) - diff --git 
a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 06935c1687033..c1423fab91a0d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -686,6 +686,8 @@ def _try_convert_to_int_index(cls, data, copy, name, dtype): from .numeric import Int64Index, UInt64Index if not is_unsigned_integer_dtype(dtype): + # skip int64 conversion attempt if uint-like dtype is passed, as + # this could return Int64Index when UInt64Index is what's desired try: res = data.astype('i8', copy=False) if (res == data).all(): @@ -693,8 +695,8 @@ def _try_convert_to_int_index(cls, data, copy, name, dtype): except (OverflowError, TypeError, ValueError): pass - # Conversion to int64 failed (possibly due to - # overflow), so let's try now with uint64. + # Conversion to int64 failed (possibly due to overflow) or was skipped, + # so let's try now with uint64. try: res = data.astype('u8', copy=False) if (res == data).all(): diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 81360bc0c13f9..43b20f420eb48 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -9,8 +9,7 @@ from pandas import (Series, Index, Float64Index, Int64Index, UInt64Index, RangeIndex, MultiIndex, CategoricalIndex, DatetimeIndex, - TimedeltaIndex, PeriodIndex, IntervalIndex, - notna, isna) + TimedeltaIndex, PeriodIndex, IntervalIndex, isna) from pandas.core.indexes.base import InvalidIndexError from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin from pandas.core.dtypes.common import needs_i8_conversion @@ -529,31 +528,20 @@ def test_numpy_repeat(self): tm.assert_raises_regex(ValueError, msg, np.repeat, i, rep, axis=0) - def test_where(self): + @pytest.mark.parametrize('klass', [list, tuple, np.array, Series]) + def test_where(self, klass): i = self.create_index() - result = i.where(notna(i)) + + cond = [True] * len(i) + result = i.where(klass(cond)) expected = i tm.assert_index_equal(result, expected) - _nan = i._na_value 
cond = [False] + [True] * len(i[1:]) - expected = pd.Index([_nan] + i[1:].tolist(), dtype=i.dtype) - - result = i.where(cond) + expected = pd.Index([i._na_value] + i[1:].tolist(), dtype=i.dtype) + result = i.where(klass(cond)) tm.assert_index_equal(result, expected) - def test_where_array_like(self): - i = self.create_index() - - _nan = i._na_value - cond = [False] + [True] * (len(i) - 1) - klasses = [list, tuple, np.array, pd.Series] - expected = pd.Index([_nan] + i[1:].tolist(), dtype=i.dtype) - - for klass in klasses: - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) - def test_setops_errorcases(self): for name, idx in compat.iteritems(self.indices): # # non-iterable input diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 7fefcc859d447..52558c27ce707 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -61,27 +61,18 @@ def test_pickle_round_trip(self): result = tm.round_trip_pickle(idx) tm.assert_index_equal(result, idx) - def test_where(self): + @pytest.mark.parametrize('klass', [list, tuple, np.array, Series]) + def test_where(self, klass): i = self.create_index() - result = i.where(notna(i)) + cond = [True] * len(i) expected = i + result = i.where(klass(cond)) tm.assert_index_equal(result, expected) - i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), - freq='D') - result = i.where(notna(i2)) - expected = i2 - tm.assert_index_equal(result, expected) - - def test_where_array_like(self): - i = self.create_index() cond = [False] + [True] * (len(i) - 1) - klasses = [list, tuple, np.array, Series] - expected = pd.PeriodIndex([pd.NaT] + i[1:].tolist(), freq='D') - - for klass in klasses: - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) + expected = PeriodIndex([NaT] + i[1:].tolist(), freq='D') + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) def test_where_other(self): diff 
--git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 5e40e06d57413..5e6898f9c8711 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -11,7 +11,7 @@ import numpy as np -from pandas import Categorical, IntervalIndex, compat, notna +from pandas import Categorical, IntervalIndex, compat from pandas.util.testing import assert_almost_equal import pandas.core.config as cf import pandas as pd @@ -269,28 +269,19 @@ def f(x): ordered=False) tm.assert_index_equal(result, exp) - def test_where(self): + @pytest.mark.parametrize('klass', [list, tuple, np.array, pd.Series]) + def test_where(self, klass): i = self.create_index() - result = i.where(notna(i)) + cond = [True] * len(i) expected = i + result = i.where(klass(cond)) tm.assert_index_equal(result, expected) - i2 = pd.CategoricalIndex([np.nan, np.nan] + i[2:].tolist(), - categories=i.categories) - result = i.where(notna(i2)) - expected = i2 - tm.assert_index_equal(result, expected) - - def test_where_array_like(self): - i = self.create_index() cond = [False] + [True] * (len(i) - 1) - klasses = [list, tuple, np.array, pd.Series] - expected = pd.CategoricalIndex([np.nan] + i[1:].tolist(), - categories=i.categories) - - for klass in klasses: - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) + expected = CategoricalIndex([np.nan] + i[1:].tolist(), + categories=i.categories) + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) def test_append(self): diff --git a/pandas/tests/indexes/test_interval.py b/pandas/tests/indexes/test_interval.py index 399d88309072e..3876fa4f9939d 100644 --- a/pandas/tests/indexes/test_interval.py +++ b/pandas/tests/indexes/test_interval.py @@ -290,20 +290,19 @@ def test_astype(self, closed): expected = pd.Categorical(idx, ordered=True) tm.assert_categorical_equal(result, expected) - def test_where(self, closed): - expected = self.create_index(closed=closed) - result = 
expected.where(expected.notna()) + @pytest.mark.parametrize('klass', [list, tuple, np.array, pd.Series]) + def test_where(self, closed, klass): + idx = self.create_index(closed=closed) + cond = [True] * len(idx) + expected = idx + result = expected.where(klass(cond)) tm.assert_index_equal(result, expected) - idx = IntervalIndex.from_breaks([1, 2], closed=closed) - result = idx.where([True, False]) - expected = IntervalIndex.from_intervals( - [Interval(1.0, 2.0, closed=closed), np.nan]) + cond = [False] + [True] * len(idx[1:]) + expected = IntervalIndex([np.nan] + idx[1:].tolist()) + result = idx.where(klass(cond)) tm.assert_index_equal(result, expected) - def test_where_array_like(self): - pass - def test_delete(self, closed): expected = IntervalIndex.from_breaks([1, 2], closed=closed) result = self.create_index(closed=closed).delete(0) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 8ee58afbaefc5..030d688f510b0 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -7,7 +7,7 @@ import numpy as np -from pandas import (date_range, notna, Series, Index, Float64Index, +from pandas import (date_range, Series, Index, Float64Index, Int64Index, UInt64Index, RangeIndex) import pandas.util.testing as tm @@ -175,6 +175,18 @@ def test_modulo(self): expected = Index(index.values % 2) tm.assert_index_equal(index % 2, expected) + @pytest.mark.parametrize('klass', [list, tuple, np.array, Series]) + def test_where(self, klass): + i = self.create_index() + cond = [True] * len(i) + expected = i + result = i.where(klass(cond)) + + cond = [False] + [True] * (len(i) - 1) + expected = Float64Index([i._na_value] + i[1:].tolist()) + result = i.where(klass(cond)) + tm.assert_index_equal(result, expected) + class TestFloat64Index(Numeric): _holder = Float64Index @@ -658,30 +670,6 @@ def test_ufunc_coercions(self): exp = Float64Index([0.5, 1., 1.5, 2., 2.5], name='x') tm.assert_index_equal(result, exp) - 
def test_where(self): - i = self.create_index() - result = i.where(notna(i)) - expected = i - tm.assert_index_equal(result, expected) - - _nan = i._na_value - cond = [False] + [True] * len(i[1:]) - expected = Float64Index([_nan] + i[1:].tolist()) - result = i.where(cond) - tm.assert_index_equal(result, expected) - - def test_where_array_like(self): - i = self.create_index() - - _nan = i._na_value - cond = [False] + [True] * (len(i) - 1) - klasses = [list, tuple, np.array, pd.Series] - expected = Float64Index([_nan] + i[1:].tolist()) - - for klass in klasses: - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) - class TestInt64Index(NumericInt): _dtype = 'int64' diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 7d88b547746f6..b4d1c3760f25a 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -10,7 +10,7 @@ import numpy as np -from pandas import (isna, notna, Series, Index, Float64Index, +from pandas import (isna, Series, Index, Float64Index, Int64Index, RangeIndex) import pandas.util.testing as tm @@ -934,31 +934,6 @@ def test_len_specialised(self): i = RangeIndex(0, 5, step) assert len(i) == 0 - def test_where(self): - i = self.create_index() - result = i.where(notna(i)) - expected = i - tm.assert_index_equal(result, expected) - - _nan = i._na_value - cond = [False] + [True] * len(i[1:]) - expected = pd.Index([_nan] + i[1:].tolist()) - - result = i.where(cond) - tm.assert_index_equal(result, expected) - - def test_where_array_like(self): - i = self.create_index() - - _nan = i._na_value - cond = [False] + [True] * (len(i) - 1) - klasses = [list, tuple, np.array, pd.Series] - expected = pd.Index([_nan] + i[1:].tolist()) - - for klass in klasses: - result = i.where(klass(cond)) - tm.assert_index_equal(result, expected) - def test_append(self): # GH16212 RI = RangeIndex From 0852ecbbc5bfb47bb4cf23fbe32fe0de46eb5e91 Mon Sep 17 00:00:00 2001 From: jschendel Date: Thu, 23 
Nov 2017 16:53:21 -0700 Subject: [PATCH 4/4] clarified whatsnew --- doc/source/whatsnew/v0.22.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index b544a626e5f33..ff710cc899ac7 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -52,7 +52,7 @@ Backwards incompatible API changes - :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`) - :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`) -- The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN`` (:issue:`18398`) +- The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN``, which impacts methods that mask with NA, such as ``UInt64Index.where()`` (:issue:`18398`) -