@@ -2239,17 +2239,19 @@ def _str_findall(self, pat: str, flags: int = 0):
2239
2239
return type (self )(pa .chunked_array (result ))
2240
2240
2241
2241
def _str_get_dummies(self, sep: str = "|"):
    """
    Build a boolean indicator matrix from ``sep``-delimited strings.

    Every element of the backing array is split on ``sep``. The distinct
    pieces, in sorted order, become the columns; row ``i`` has ``True``
    in the column of each piece produced by element ``i``.

    Parameters
    ----------
    sep : str, default "|"
        Pattern passed to ``pc.split_pattern`` to split each element.

    Returns
    -------
    tuple
        ``(result, labels)`` where ``result`` is a new array of the same
        class as ``self`` built from one boolean row per element, and
        ``labels`` is the sorted list of distinct pieces.
    """
    pieces_per_row = pc.split_pattern(self._pa_array, sep)
    all_pieces = pc.list_flatten(pieces_per_row)
    distinct = all_pieces.unique()
    labels = distinct.take(pa.compute.array_sort_indices(distinct))

    # Number of pieces each row contributed; a null length counts as zero.
    pieces_in_row = pc.list_value_length(pieces_per_row).fill_null(0).to_numpy()
    row_count = len(self)
    label_count = len(distinct)

    # Column of every flattened piece = its position in the sorted labels.
    column_of_piece = pc.index_in(all_pieces, labels).to_numpy()
    # Row of every flattened piece: row i repeated once per piece it holds.
    row_of_piece = np.arange(row_count).repeat(pieces_in_row)

    # Mark hits through flat row-major offsets, then fold back into 2-D.
    flags = np.zeros(row_count * label_count, dtype=np.bool_)
    flags[column_of_piece + row_of_piece * label_count] = True
    matrix = flags.reshape((row_count, label_count))

    return type(self)(pa.array(list(matrix))), labels.to_pylist()
2254
2256
2255
2257
def _str_index (self , sub : str , start : int = 0 , end : int | None = None ):
0 commit comments