13
13
import numpy as np
14
14
15
15
import pandas ._libs .lib as lib
16
- from pandas ._typing import (
17
- ArrayLike ,
18
- FrameOrSeriesUnion ,
19
- )
16
+ from pandas ._typing import FrameOrSeriesUnion
20
17
from pandas .util ._decorators import Appender
21
18
22
19
from pandas .core .dtypes .common import (
@@ -160,7 +157,6 @@ class StringMethods(NoNewAttributesMixin):
160
157
# TODO: Dispatch all the methods
161
158
# Currently the following are not dispatched to the array
162
159
# * cat
163
- # * extract
164
160
# * extractall
165
161
166
162
def __init__ (self , data ):
@@ -243,7 +239,7 @@ def _wrap_result(
243
239
self ,
244
240
result ,
245
241
name = None ,
246
- expand = None ,
242
+ expand : bool | None = None ,
247
243
fill_value = np .nan ,
248
244
returns_string = True ,
249
245
):
@@ -2385,10 +2381,7 @@ def extract(
2385
2381
2 NaN
2386
2382
dtype: object
2387
2383
"""
2388
- from pandas import (
2389
- DataFrame ,
2390
- array as pd_array ,
2391
- )
2384
+ from pandas import DataFrame
2392
2385
2393
2386
if not isinstance (expand , bool ):
2394
2387
raise ValueError ("expand must be True or False" )
@@ -2400,8 +2393,6 @@ def extract(
2400
2393
if not expand and regex .groups > 1 and isinstance (self ._data , ABCIndex ):
2401
2394
raise ValueError ("only one regex group is supported with Index" )
2402
2395
2403
- # TODO: dispatch
2404
-
2405
2396
obj = self ._data
2406
2397
result_dtype = _result_dtype (obj )
2407
2398
@@ -2415,8 +2406,8 @@ def extract(
2415
2406
result = DataFrame (columns = columns , dtype = result_dtype )
2416
2407
2417
2408
else :
2418
- result_list = _str_extract (
2419
- obj . array , pat , flags = flags , expand = returns_df
2409
+ result_list = self . _data . array . _str_extract (
2410
+ pat , flags = flags , expand = returns_df
2420
2411
)
2421
2412
2422
2413
result_index : Index | None
@@ -2431,9 +2422,7 @@ def extract(
2431
2422
2432
2423
else :
2433
2424
name = _get_single_group_name (regex )
2434
- result_arr = _str_extract (obj .array , pat , flags = flags , expand = returns_df )
2435
- # not dispatching, so we have to reconstruct here.
2436
- result = pd_array (result_arr , dtype = result_dtype )
2425
+ result = self ._data .array ._str_extract (pat , flags = flags , expand = returns_df )
2437
2426
return self ._wrap_result (result , name = name )
2438
2427
2439
2428
@forbid_nonstring_types (["bytes" ])
@@ -3121,33 +3110,6 @@ def _get_group_names(regex: re.Pattern) -> list[Hashable]:
3121
3110
return [names .get (1 + i , i ) for i in range (regex .groups )]
3122
3111
3123
3112
3124
- def _str_extract (arr : ArrayLike , pat : str , flags = 0 , expand : bool = True ):
3125
- """
3126
- Find groups in each string in the array using passed regular expression.
3127
-
3128
- Returns
3129
- -------
3130
- np.ndarray or list of lists if expand is True
3131
- """
3132
- regex = re .compile (pat , flags = flags )
3133
-
3134
- empty_row = [np .nan ] * regex .groups
3135
-
3136
- def f (x ):
3137
- if not isinstance (x , str ):
3138
- return empty_row
3139
- m = regex .search (x )
3140
- if m :
3141
- return [np .nan if item is None else item for item in m .groups ()]
3142
- else :
3143
- return empty_row
3144
-
3145
- if expand :
3146
- return [f (val ) for val in np .asarray (arr )]
3147
-
3148
- return np .array ([f (val )[0 ] for val in np .asarray (arr )], dtype = object )
3149
-
3150
-
3151
3113
def str_extractall (arr , pat , flags = 0 ):
3152
3114
regex = re .compile (pat , flags = flags )
3153
3115
# the regex must contain capture groups.
0 commit comments