Skip to content

Commit d95a42b

Browse files
committed
Defer Series.str.get_dummies to pandas.get_dummies
1 parent 891a419 commit d95a42b

File tree

1 file changed

+7
-9
lines changed

1 file changed

+7
-9
lines changed

pandas/core/strings.py

Lines changed: 7 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
from pandas.core.algorithms import take_1d
2121
from pandas.core.base import NoNewAttributesMixin
2222
import pandas.core.common as com
23+
from pandas.core.reshape.reshape import get_dummies
2324

2425
_cpython_optimized_encoders = (
2526
"utf-8", "utf8", "latin-1", "latin1", "iso-8859-1", "mbcs", "ascii"
@@ -1005,17 +1006,14 @@ def str_get_dummies(arr, sep='|'):
10051006
except TypeError:
10061007
arr = sep + arr.astype(str) + sep
10071008

1008-
tags = set()
1009-
for ts in arr.str.split(sep):
1010-
tags.update(ts)
1011-
tags = sorted(tags - {""})
1009+
arr_split = arr.str.split(sep)
1010+
stacked = np.concatenate(arr_split)
1011+
stacked_idx = np.repeat(np.arange(len(arr)), arr_split.str.len())
10121012

1013-
dummies = np.empty((len(arr), len(tags)), dtype=np.int64)
1013+
dummies_stacked = get_dummies(stacked)
1014+
dummies = dummies_stacked.groupby(by=stacked_idx).sum()
10141015

1015-
for i, t in enumerate(tags):
1016-
pat = sep + t + sep
1017-
dummies[:, i] = lib.map_infer(arr.values, lambda x: pat in x)
1018-
return dummies, tags
1016+
return dummies.values, dummies.columns.values
10191017

10201018

10211019
def str_join(arr, sep):

0 commit comments

Comments
 (0)