From f33a4c5342224653c3f9b20cae68eb7193d373b2 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 30 Apr 2018 20:57:36 -0700 Subject: [PATCH 1/3] Added test for preservation of NA values in Series unique --- pandas/tests/series/test_analytics.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 6ea40329f4bc3..4ef6ef93ea4b9 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1015,6 +1015,11 @@ def test_unique(self): tm.assert_categorical_equal(s.unique(), pd.Categorical([np.nan]), check_dtype=False) + def test_unique_obj_na_preservation(self, nulls_fixture): + # GH 20866 + s = pd.Series(['foo', nulls_fixture]) + assert s.iloc[1] is nulls_fixture + @pytest.mark.parametrize( "tc1, tc2", [ From 1e83e6066d03d42cc8020595a99bfdcdd3a11c1d Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 4 May 2018 08:40:25 -0700 Subject: [PATCH 2/3] Reverted none to nan conversion in unique --- pandas/_libs/hashtable_class_helper.pxi.in | 2 +- pandas/tests/series/test_analytics.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index dbeb8bda3e454..c669fec165ac3 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -870,7 +870,7 @@ cdef class PyObjectHashTable(HashTable): for i in range(n): val = values[i] hash(val) - if not checknull(val): + if not checknull(val) or val is None: k = kh_get_pymap(self.table, val) if k == self.table.n_buckets: kh_put_pymap(self.table, val, &ret) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 4ef6ef93ea4b9..dfc953b8375d9 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1015,10 +1015,11 @@ def test_unique(self): tm.assert_categorical_equal(s.unique(), 
pd.Categorical([np.nan]), check_dtype=False) - def test_unique_obj_na_preservation(self, nulls_fixture): + def test_unique_obj_none_preservation(self): # GH 20866 - s = pd.Series(['foo', nulls_fixture]) - assert s.iloc[1] is nulls_fixture + s = pd.Series(['foo', None]) + result = s.unique() + assert result[1] is None @pytest.mark.parametrize( "tc1, tc2", From df28111c29a67213eefaf96d83cbd366f9c3555c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 5 May 2018 11:08:10 -0700 Subject: [PATCH 3/3] Moved test; added comments --- pandas/_libs/hashtable_class_helper.pxi.in | 5 +++++ pandas/tests/series/test_analytics.py | 6 ------ pandas/tests/test_algos.py | 8 ++++++++ 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c669fec165ac3..b92eb0e651276 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -870,6 +870,11 @@ cdef class PyObjectHashTable(HashTable): for i in range(n): val = values[i] hash(val) + + # `val is None` below is an exception to prevent mangling of None and + # other NA values; note however that other NA values (ex: pd.NaT + # and np.nan) will still get mangled, so this may not be a permanent + # solution; see GH 20866 if not checknull(val) or val is None: k = kh_get_pymap(self.table, val) if k == self.table.n_buckets: diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index dfc953b8375d9..6ea40329f4bc3 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1015,12 +1015,6 @@ def test_unique(self): tm.assert_categorical_equal(s.unique(), pd.Categorical([np.nan]), check_dtype=False) - def test_unique_obj_none_preservation(self): - # GH 20866 - s = pd.Series(['foo', None]) - result = s.unique() - assert result[1] is None - @pytest.mark.parametrize( "tc1, tc2", [ diff --git a/pandas/tests/test_algos.py 
b/pandas/tests/test_algos.py index 8a8a6f7de70d7..46bd879c2db87 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -491,6 +491,14 @@ def test_tuple_with_strings(self, arg, expected): result = pd.unique(arg) tm.assert_numpy_array_equal(result, expected) + def test_obj_none_preservation(self): + # GH 20866 + arr = np.array(['foo', None], dtype=object) + result = pd.unique(arr) + expected = np.array(['foo', None], dtype=object) + + tm.assert_numpy_array_equal(result, expected, strict_nan=True) + class TestIsin(object):