Skip to content

Commit ab769d8

Browse files
committed
Merge pull request #10840 from chris-b1/valuecounts-float64
PERF: value_counts_float64 #10821
2 parents 5052900 + 22af130 commit ab769d8

File tree

5 files changed

+84
-33
lines changed

5 files changed

+84
-33
lines changed

doc/source/whatsnew/v0.17.0.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -603,7 +603,7 @@ Performance Improvements
603603
- Improved performance of ``Series.isin`` for datetimelike/integer Series (:issue:`10287`)
604604
- 20x improvement in ``concat`` of Categoricals when categories are identical (:issue:`10587`)
605605
- Improved performance of ``to_datetime`` when specified format string is ISO8601 (:issue:`10178`)
606-
606+
- 2x improvement in ``Series.value_counts`` for float dtype (:issue:`10821`)
607607

608608
.. _whatsnew_0170.bug_fixes:
609609

pandas/core/algorithms.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
232232
values = PeriodIndex(values, name=name)
233233

234234
values = values.view(np.int64)
235-
keys, counts = htable.value_count_int64(values)
235+
keys, counts = htable.value_count_scalar64(values, dropna)
236236

237237
if dropna:
238238
from pandas.tslib import iNaT
@@ -244,7 +244,10 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
244244

245245
elif com.is_integer_dtype(dtype):
246246
values = com._ensure_int64(values)
247-
keys, counts = htable.value_count_int64(values)
247+
keys, counts = htable.value_count_scalar64(values, dropna)
248+
elif com.is_float_dtype(dtype):
249+
values = com._ensure_float64(values)
250+
keys, counts = htable.value_count_scalar64(values, dropna)
248251

249252
else:
250253
values = com._ensure_object(values)

pandas/core/categorical.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1030,7 +1030,7 @@ def value_counts(self, dropna=True):
10301030
from pandas.core.index import CategoricalIndex
10311031

10321032
cat = self.dropna() if dropna else self
1033-
keys, counts = htable.value_count_int64(com._ensure_int64(cat._codes))
1033+
keys, counts = htable.value_count_scalar64(com._ensure_int64(cat._codes), dropna)
10341034
result = Series(counts, index=keys)
10351035

10361036
ix = np.arange(len(cat.categories), dtype='int64')

pandas/hashtable.pyx

Lines changed: 68 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -866,51 +866,90 @@ cdef class Int64Factorizer:
866866
self.count = len(self.uniques)
867867
return labels
868868

869-
869+
ctypedef fused kh_scalar64:
870+
kh_int64_t
871+
kh_float64_t
870872

871873
@cython.boundscheck(False)
872-
cdef build_count_table_int64(int64_t[:] values, kh_int64_t *table):
874+
cdef build_count_table_scalar64(sixty_four_bit_scalar[:] values,
875+
kh_scalar64 *table, bint dropna):
873876
cdef:
874877
khiter_t k
875878
Py_ssize_t i, n = len(values)
876-
int64_t val
879+
sixty_four_bit_scalar val
877880
int ret = 0
878881

879-
with nogil:
880-
kh_resize_int64(table, n)
882+
if sixty_four_bit_scalar is float64_t and kh_scalar64 is kh_float64_t:
883+
with nogil:
884+
kh_resize_float64(table, n)
885+
886+
for i in range(n):
887+
val = values[i]
888+
if val == val or not dropna:
889+
k = kh_get_float64(table, val)
890+
if k != table.n_buckets:
891+
table.vals[k] += 1
892+
else:
893+
k = kh_put_float64(table, val, &ret)
894+
table.vals[k] = 1
895+
elif sixty_four_bit_scalar is int64_t and kh_scalar64 is kh_int64_t:
896+
with nogil:
897+
kh_resize_int64(table, n)
898+
899+
for i in range(n):
900+
val = values[i]
901+
k = kh_get_int64(table, val)
902+
if k != table.n_buckets:
903+
table.vals[k] += 1
904+
else:
905+
k = kh_put_int64(table, val, &ret)
906+
table.vals[k] = 1
907+
else:
908+
raise ValueError("Table type must match scalar type.")
881909

882-
for i in range(n):
883-
val = values[i]
884-
k = kh_get_int64(table, val)
885-
if k != table.n_buckets:
886-
table.vals[k] += 1
887-
else:
888-
k = kh_put_int64(table, val, &ret)
889-
table.vals[k] = 1
890910

891911

892912
@cython.boundscheck(False)
893-
cpdef value_count_int64(int64_t[:] values):
913+
cpdef value_count_scalar64(sixty_four_bit_scalar[:] values, bint dropna):
894914
cdef:
895915
Py_ssize_t i
896-
kh_int64_t *table
897-
int64_t[:] result_keys, result_counts
916+
kh_float64_t *ftable
917+
kh_int64_t *itable
918+
sixty_four_bit_scalar[:] result_keys
919+
int64_t[:] result_counts
898920
int k
899921

900-
table = kh_init_int64()
901-
build_count_table_int64(values, table)
902-
903922
i = 0
904-
result_keys = np.empty(table.n_occupied, dtype=np.int64)
905-
result_counts = np.zeros(table.n_occupied, dtype=np.int64)
906923

907-
with nogil:
908-
for k in range(table.n_buckets):
909-
if kh_exist_int64(table, k):
910-
result_keys[i] = table.keys[k]
911-
result_counts[i] = table.vals[k]
912-
i += 1
913-
kh_destroy_int64(table)
924+
if sixty_four_bit_scalar is float64_t:
925+
ftable = kh_init_float64()
926+
build_count_table_scalar64(values, ftable, dropna)
927+
928+
result_keys = np.empty(ftable.n_occupied, dtype=np.float64)
929+
result_counts = np.zeros(ftable.n_occupied, dtype=np.int64)
930+
931+
with nogil:
932+
for k in range(ftable.n_buckets):
933+
if kh_exist_float64(ftable, k):
934+
result_keys[i] = ftable.keys[k]
935+
result_counts[i] = ftable.vals[k]
936+
i += 1
937+
kh_destroy_float64(ftable)
938+
939+
elif sixty_four_bit_scalar is int64_t:
940+
itable = kh_init_int64()
941+
build_count_table_scalar64(values, itable, dropna)
942+
943+
result_keys = np.empty(itable.n_occupied, dtype=np.int64)
944+
result_counts = np.zeros(itable.n_occupied, dtype=np.int64)
945+
946+
with nogil:
947+
for k in range(itable.n_buckets):
948+
if kh_exist_int64(itable, k):
949+
result_keys[i] = itable.keys[k]
950+
result_counts[i] = itable.vals[k]
951+
i += 1
952+
kh_destroy_int64(itable)
914953

915954
return np.asarray(result_keys), np.asarray(result_counts)
916955

@@ -1002,7 +1041,7 @@ def mode_int64(int64_t[:] values):
10021041

10031042
table = kh_init_int64()
10041043

1005-
build_count_table_int64(values, table)
1044+
build_count_table_scalar64(values, table, 0)
10061045

10071046
modes = np.empty(table.n_buckets, dtype=np.int64)
10081047

vb_suite/groupby.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,15 @@ def f():
194194
series_value_counts_strings = Benchmark('s.value_counts()', setup,
195195
start_date=datetime(2011, 10, 21))
196196

197+
# value_counts on float dtype
198+
199+
setup = common_setup + """
200+
s = Series(np.random.randint(0, 1000, size=100000)).astype(float)
201+
"""
202+
203+
series_value_counts_float64 = Benchmark('s.value_counts()', setup,
204+
start_date=datetime(2015, 8, 17))
205+
197206
#----------------------------------------------------------------------
198207
# pivot_table
199208

0 commit comments

Comments
 (0)