Use NA scalar in string dtype (#1)

TomAugspurger · jorisvandenbossche · commit c72e3eeae2a5 · 2019-11-14T14:53:03.000+01:00
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -1500,7 +1500,7 @@ cdef class Validator:
                                   f'must define is_value_typed')
 
     cdef bint is_valid_null(self, object value) except -1:
-        return value is None or util.is_nan(value)
+        return value is None or value is C_NA or util.is_nan(value)
 
     cdef bint is_array_typed(self) except -1:
         return False
diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx
@@ -180,13 +180,15 @@ cpdef assert_almost_equal(a, b,
         # classes can't be the same, to raise error
         assert_class_equal(a, b, obj=obj)
 
-    if a == b:
-        # object comparison
-        return True
     if isna(a) and isna(b):
         # TODO: Should require same-dtype NA?
         # nan / None comparison
         return True
+
+    if a == b:
+        # object comparison
+        return True
+
     if is_comparable_as_number(a) and is_comparable_as_number(b):
         if array_equivalent(a, b, strict_nan=True):
             # inf comparison
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
@@ -278,6 +278,9 @@ def fillna(self, value=None, method=None, limit=None):
         return new_values
 
     def take(self, indices, allow_fill=False, fill_value=None):
+        if fill_value is None:
+            # Primarily for subclasses
+            fill_value = self.dtype.na_value
         result = take(
             self._ndarray, indices, allow_fill=allow_fill, fill_value=fill_value
         )
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -1,9 +1,9 @@
 import operator
-from typing import TYPE_CHECKING, Type
+from typing import Type
 
 import numpy as np
 
-from pandas._libs import lib
+from pandas._libs import lib, missing as libmissing
 
 from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.common import pandas_dtype
@@ -17,9 +17,6 @@
 from pandas.core.construction import extract_array
 from pandas.core.missing import isna
 
-if TYPE_CHECKING:
-    from pandas._typing import Scalar
-
 
 @register_extension_dtype
 class StringDtype(ExtensionDtype):
@@ -50,16 +47,8 @@ class StringDtype(ExtensionDtype):
     StringDtype
     """
 
-    @property
-    def na_value(self) -> "Scalar":
-        """
-        StringDtype uses :attr:`numpy.nan` as the missing NA value.
-
-        .. warning::
-
-           `na_value` may change in a future release.
-        """
-        return np.nan
+    #: StringDtype.na_value uses pandas.NA
+    na_value = libmissing.NA
 
     @property
     def type(self) -> Type:
@@ -172,10 +161,10 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
         if dtype:
             assert dtype == "string"
         result = super()._from_sequence(scalars, dtype=object, copy=copy)
-        # convert None to np.nan
+        # Standardize all missing-like values to NA
         # TODO: it would be nice to do this in _validate / lib.is_string_array
         # We are already doing a scan over the values there.
-        result[result.isna()] = np.nan
+        result[result.isna()] = StringDtype.na_value
         return result
 
     @classmethod
@@ -192,6 +181,12 @@ def __arrow_array__(self, type=None):
             type = pa.string()
         return pa.array(self._ndarray, type=type, from_pandas=True)
 
+    def _values_for_factorize(self):
+        arr = self._ndarray.copy()
+        mask = self.isna()
+        arr[mask] = -1
+        return arr, -1
+
     def __setitem__(self, key, value):
         value = extract_array(value, extract_numpy=True)
         if isinstance(value, type(self)):
@@ -205,9 +200,9 @@ def __setitem__(self, key, value):
 
         # validate new items
         if scalar_value:
-            if scalar_value is None:
-                value = np.nan
-            elif not (isinstance(value, str) or np.isnan(value)):
+            if isna(value):
+                value = StringDtype.na_value
+            elif not isinstance(value, str):
                 raise ValueError(
                     "Cannot set non-string value '{}' into a StringArray.".format(value)
                 )
@@ -265,7 +260,7 @@ def method(self, other):
                 other = other[valid]
 
             result = np.empty_like(self._ndarray, dtype="object")
-            result[mask] = np.nan
+            result[mask] = StringDtype.na_value
             result[valid] = op(self._ndarray[valid], other)
 
             if op.__name__ in {"add", "radd", "mul", "rmul"}:
diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
@@ -446,6 +446,9 @@ def array_equivalent(left, right, strict_nan=False):
             if left_value is NaT and right_value is not NaT:
                 return False
 
+            elif left_value is libmissing.NA and right_value is not libmissing.NA:
+                return False
+
             elif isinstance(left_value, float) and np.isnan(left_value):
                 if not isinstance(right_value, float) or not np.isnan(right_value):
                     return False
@@ -457,6 +460,8 @@ def array_equivalent(left, right, strict_nan=False):
                     if "Cannot compare tz-naive" in str(err):
                         # tzawareness compat failure, see GH#28507
                         return False
+                    elif "boolean value of NA is ambiguous" in str(err):
+                        return False
                     raise
         return True
 
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -123,6 +123,13 @@ def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object):
     if na_mask:
         mask = isna(arr)
         convert = not np.all(mask)
+        if convert:
+            # XXX: This converts pd.NA to np.nan to match the output of
+            # object-dtype ops that return numeric, like str.count
+            # We probably want to return Int64Dtype instead.
+            # NA -> nan
+            arr[mask] = np.nan
+
         try:
             result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert)
         except (TypeError, AttributeError) as e:
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -12,7 +12,7 @@
 def test_none_to_nan():
     a = pd.arrays.StringArray._from_sequence(["a", None, "b"])
     assert a[1] is not None
-    assert np.isnan(a[1])
+    assert a[1] is pd.NA
 
 
 def test_setitem_validates():
@@ -24,6 +24,15 @@ def test_setitem_validates():
         a[:] = np.array([1, 2])
 
 
+def test_setitem_with_scalar_string():
+    # is_float_dtype considers some strings, like 'd', to be floats
+    # which can cause issues.
+    arr = pd.array(["a", "c"], dtype="string")
+    arr[0] = "d"
+    expected = pd.array(["d", "c"], dtype="string")
+    tm.assert_extension_array_equal(arr, expected)
+
+
 @pytest.mark.parametrize(
     "input, method",
     [
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
@@ -25,7 +25,7 @@ def data():
 @pytest.fixture
 def data_missing():
     """Length 2 array with [NA, Valid]"""
-    return StringArray._from_sequence([np.nan, "A"])
+    return StringArray._from_sequence([pd.NA, "A"])
 
 
 @pytest.fixture
@@ -35,17 +35,17 @@ def data_for_sorting():
 
 @pytest.fixture
 def data_missing_for_sorting():
-    return StringArray._from_sequence(["B", np.nan, "A"])
+    return StringArray._from_sequence(["B", pd.NA, "A"])
 
 
 @pytest.fixture
 def na_value():
-    return np.nan
+    return pd.NA
 
 
 @pytest.fixture
 def data_for_grouping():
-    return StringArray._from_sequence(["B", "B", np.nan, np.nan, "A", "A", "B", "C"])
+    return StringArray._from_sequence(["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"])
 
 
 class TestDtype(base.BaseDtypeTests):
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -3507,3 +3507,11 @@ def test_string_array(any_string_method):
         assert all(result[columns].dtypes == "string")
         result[columns] = result[columns].astype(object)
     tm.assert_equal(result, expected)
+
+
+@pytest.mark.xfail(reason="not implmented yet")
+def test_string_dtype_numeric():
+    s = Series(["a", "aa", None], dtype="string")
+    result = s.str.count("a")
+    expected = Series([1, 2, None], dtype="Int64")
+    tm.assert_series_equal(result, expected)

Original file line number	Diff line number	Diff line change
`@@ -278,6 +278,9 @@ def fillna(self, value=None, method=None, limit=None):`
`278`	`278`	`return new_values`
`279`	`279`
`280`	`280`	`def take(self, indices, allow_fill=False, fill_value=None):`
	`281`	`+ if fill_value is None:`
	`282`	`+ # Primarily for subclasses`
	`283`	`+ fill_value = self.dtype.na_value`
`281`	`284`	`result = take(`
`282`	`285`	`self._ndarray, indices, allow_fill=allow_fill, fill_value=fill_value`
`283`	`286`	`)`