11import operator
2- from typing import TYPE_CHECKING , Type
2+ from typing import Type
33
44import numpy as np
55
6- from pandas ._libs import lib
6+ from pandas ._libs import lib , missing as libmissing
77
88from pandas .core .dtypes .base import ExtensionDtype
99from pandas .core .dtypes .common import pandas_dtype
1717from pandas .core .construction import extract_array
1818from pandas .core .missing import isna
1919
20- if TYPE_CHECKING :
21- from pandas ._typing import Scalar
22-
2320
2421@register_extension_dtype
2522class StringDtype (ExtensionDtype ):
@@ -50,16 +47,8 @@ class StringDtype(ExtensionDtype):
5047 StringDtype
5148 """
5249
53- @property
54- def na_value (self ) -> "Scalar" :
55- """
56- StringDtype uses :attr:`numpy.nan` as the missing NA value.
57-
58- .. warning::
59-
60- `na_value` may change in a future release.
61- """
62- return np .nan
50+ #: StringDtype.na_value uses pandas.NA
51+ na_value = libmissing .NA
6352
6453 @property
6554 def type (self ) -> Type :
@@ -172,10 +161,10 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
172161 if dtype :
173162 assert dtype == "string"
174163 result = super ()._from_sequence (scalars , dtype = object , copy = copy )
175- # convert None to np.nan
164+ # Standardize all missing-like values to NA
176165 # TODO: it would be nice to do this in _validate / lib.is_string_array
177166 # We are already doing a scan over the values there.
178- result [result .isna ()] = np . nan
167+ result [result .isna ()] = StringDtype . na_value
179168 return result
180169
181170 @classmethod
@@ -192,6 +181,12 @@ def __arrow_array__(self, type=None):
192181 type = pa .string ()
193182 return pa .array (self ._ndarray , type = type , from_pandas = True )
194183
184+ def _values_for_factorize (self ):
# Build the (values, placeholder) pair returned for factorization.
# Works on a copy so the array's own backing ``_ndarray`` is not modified.
185+ arr = self ._ndarray .copy ()
# Boolean mask of the positions this array reports as missing.
186+ mask = self .isna ()
# Overwrite every missing position with the integer -1 ...
187+ arr [mask ] = - 1
# ... and return that same -1 as the second element, presumably the NA
# sentinel the factorize machinery expects -- confirm against the
# ExtensionArray._values_for_factorize contract.
188+ return arr , - 1
189+
195190 def __setitem__ (self , key , value ):
196191 value = extract_array (value , extract_numpy = True )
197192 if isinstance (value , type (self )):
@@ -205,9 +200,9 @@ def __setitem__(self, key, value):
205200
206201 # validate new items
207202 if scalar_value :
208- if scalar_value is None :
209- value = np . nan
210- elif not ( isinstance (value , str ) or np . isnan ( value ) ):
203+ if isna ( value ) :
204+ value = StringDtype . na_value
205+ elif not isinstance (value , str ):
211206 raise ValueError (
212207 "Cannot set non-string value '{}' into a StringArray." .format (value )
213208 )
@@ -265,7 +260,7 @@ def method(self, other):
265260 other = other [valid ]
266261
267262 result = np .empty_like (self ._ndarray , dtype = "object" )
268- result [mask ] = np . nan
263+ result [mask ] = StringDtype . na_value
269264 result [valid ] = op (self ._ndarray [valid ], other )
270265
271266 if op .__name__ in {"add" , "radd" , "mul" , "rmul" }:
0 commit comments