Skip to content

Commit d8f94e9

Browse files
committed
PERF speed up str.get_dummies
1 parent 2c5e3d3 commit d8f94e9

File tree

1 file changed

+15
-15
lines changed

1 file changed

+15
-15
lines changed

pandas/core/strings.py

Lines changed: 15 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -472,31 +472,31 @@ def str_get_dummies(arr, sep='|'):
472472
2 1 0 1
473473
474474
>>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
475-
a b c
476-
0 1 1 0
477-
1 NaN NaN NaN
478-
2 1 0 1
475+
a b c
476+
0 1 1 0
477+
1 0 0 0
478+
2 1 0 1
479479
480480
See also ``pd.get_dummies``.
481481
482482
"""
483-
def na_setunion(x, y):
484-
try:
485-
return x.union(y)
486-
except TypeError:
487-
return x
488-
489483
# TODO remove this hack?
490-
arr = sep + arr.fillna('').astype(str) + sep
484+
arr = arr.fillna('')
485+
try:
486+
arr = sep + arr + sep
487+
except TypeError:
488+
arr = sep + arr.astype(str) + sep
489+
490+
tags = set()
491+
for ts in arr.str.split(sep):
492+
tags.update(ts)
493+
tags = sorted(tags - set([""]))
491494

492-
from functools import reduce
493-
tags = sorted(reduce(na_setunion, arr.str.split(sep), set())
494-
- set(['']))
495495
dummies = np.empty((len(arr), len(tags)), dtype=int)
496496

497497
for i, t in enumerate(tags):
498498
pat = sep + t + sep
499-
dummies[:, i] = _na_map(lambda x: pat in x, arr)
499+
dummies[:, i] = lib.map_infer(arr.values, lambda x: pat in x)
500500
return DataFrame(dummies, arr.index, tags)
501501

502502

0 commit comments

Comments
 (0)