pandas-dev · TomAugspurger · Nov 19, 2019 · Nov 15, 2019 · Nov 15, 2019 · Nov 16, 2019
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -2208,9 +2208,12 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
     return objects
 
 
+_no_default = object()
+
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1):
+def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1,
+                   object na_value=_no_default, object dtype=object):
     """
     Substitute for np.vectorize with pandas-friendly dtype inference
 
@@ -2225,14 +2228,17 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1)
     """
     cdef:
         Py_ssize_t i, n
-        ndarray[object] result
+        ndarray result
         object val
 
     n = len(arr)
-    result = np.empty(n, dtype=object)
+    result = np.empty(n, dtype=dtype)
     for i in range(n):
         if mask[i]:
-            val = arr[i]
+            if na_value is _no_default:
+                val = arr[i]
+            else:
+                val = na_value
         else:
             val = f(arr[i])
 

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -15,10 +15,14 @@
     ensure_object,
     is_bool_dtype,
     is_categorical_dtype,
+    is_extension_array_dtype,
     is_integer,
+    is_integer_dtype,
     is_list_like,
+    is_object_dtype,
     is_re,
     is_scalar,
+    is_string_dtype,
 )
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
@@ -31,6 +35,7 @@
 from pandas.core.algorithms import take_1d
 from pandas.core.base import NoNewAttributesMixin
 import pandas.core.common as com
+from pandas.core.construction import extract_array
 
 _cpython_optimized_encoders = (
     "utf-8",
@@ -109,9 +114,46 @@ def cat_safe(list_of_columns: List, sep: str):
 
 def _na_map(f, arr, na_result=np.nan, dtype=object):
     # should really _check_ for NA
+    if is_extension_array_dtype(arr.dtype):
+        return _map_ea(f, arr, na_value=na_result, dtype=dtype)
     return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype)
 
 
+def _map_ea(f, arr, na_value, dtype):
+    from pandas.arrays import IntegerArray, StringArray
+
+    arr = extract_array(arr, extract_numpy=True)
+    mask = isna(arr)
+
+    assert isinstance(arr, StringArray)
+    arr = arr._ndarray
+
+    if is_integer_dtype(dtype):
+        na_value_is_na = isna(na_value)
+        if na_value_is_na:
+            na_value = 1
+        result = lib.map_infer_mask(
+            arr,
+            f,
+            mask.view("uint8"),
+            convert=False,
+            na_value=na_value,
+            dtype=np.dtype("int64"),
+        )
+
+        if not na_value_is_na:
+            mask[:] = False
+
+        return IntegerArray(result, mask)
+
+    elif is_string_dtype(dtype) and not is_object_dtype(dtype):
+        result = lib.map_infer_mask(arr, f, mask.view("uint8"), na_value=na_value)
+        return StringArray(result)
+    # TODO: BooleanArray
+    else:
+        return lib.map_infer_mask(arr, f, mask.view("uint8"))
+
+
 def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object):
     if not len(arr):
         return np.ndarray(0, dtype=dtype)

diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -731,7 +731,10 @@ def test_count(self):
         tm.assert_series_equal(result, exp)
 
         # mixed
-        mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]
+        mixed = np.array(
+            ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
+            dtype=object,
+        )
         rs = strings.str_count(mixed, "a")
         xp = np.array([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan])
         tm.assert_numpy_array_equal(rs, xp)
@@ -755,14 +758,14 @@ def test_contains(self):
         expected = np.array([False, np.nan, False, False, True], dtype=np.object_)
         tm.assert_numpy_array_equal(result, expected)
 
-        values = ["foo", "xyz", "fooommm__foo", "mmm_"]
+        values = np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object)
         result = strings.str_contains(values, pat)
         expected = np.array([False, False, True, True])
         assert result.dtype == np.bool_
         tm.assert_numpy_array_equal(result, expected)
 
         # case insensitive using regex
-        values = ["Foo", "xYz", "fOOomMm__fOo", "MMM_"]
+        values = np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object)
         result = strings.str_contains(values, "FOO|mmm", case=False)
         expected = np.array([True, False, True, True])
         tm.assert_numpy_array_equal(result, expected)
@@ -773,7 +776,10 @@ def test_contains(self):
         tm.assert_numpy_array_equal(result, expected)
 
         # mixed
-        mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]
+        mixed = np.array(
+            ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
+            dtype=object,
+        )
         rs = strings.str_contains(mixed, "o")
         xp = np.array(
             [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan],
@@ -869,7 +875,10 @@ def test_endswith(self):
         tm.assert_series_equal(result, exp.fillna(False).astype(bool))
 
         # mixed
-        mixed = ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0]
+        mixed = np.array(
+            ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
+            dtype=object,
+        )
         rs = strings.str_endswith(mixed, "f")
         xp = np.array(
             [False, np.nan, False, np.nan, np.nan, False, np.nan, np.nan, np.nan],
@@ -3488,10 +3497,13 @@ def test_casefold(self):
 
 
 def test_string_array(any_string_method):
+    method_name, args, kwargs = any_string_method
+    if method_name == "decode":
+        pytest.skip("decode requires bytes.")
+
     data = ["a", "bb", np.nan, "ccc"]
     a = Series(data, dtype=object)
     b = Series(data, dtype="string")
-    method_name, args, kwargs = any_string_method
 
     expected = getattr(a.str, method_name)(*args, **kwargs)
     result = getattr(b.str, method_name)(*args, **kwargs)
@@ -3502,8 +3514,29 @@ def test_string_array(any_string_method):
         ):
             assert result.dtype == "string"
             result = result.astype(object)
+
+        elif expected.dtype == "float" and expected.isna().any():
+            assert result.dtype == "Int64"
+            result = result.astype("float")
+
     elif isinstance(expected, DataFrame):
         columns = expected.select_dtypes(include="object").columns
         assert all(result[columns].dtypes == "string")
         result[columns] = result[columns].astype(object)
     tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method,args,expected",
+    [
+        ("count", ("a",), [2, None]),
+        ("find", ("a",), [0, None]),
+        ("index", ("a",), [0, None]),
+        ("rindex", ("a",), [2, None]),
+    ],
+)
+def test_string_array_numeric_integer_array(method, args, expected):
+    s = Series(["aba", None], dtype="string")
+    result = getattr(s.str, method)(*args)
+    expected = Series(expected, dtype="Int64")
+    tm.assert_series_equal(result, expected)