Merge pull request #10840 from chris-b1/valuecounts-float64

jreback · jreback · commit ab769d87ad54 · 2015-08-18T19:37:09.000-04:00
PERF: value_counts_float64 #10821
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -603,7 +603,7 @@ Performance Improvements
 - Improved performance of ``Series.isin`` for datetimelike/integer Series (:issue:`10287`)
 - 20x improvement in ``concat`` of Categoricals when categories are identical (:issue:`10587`)
 - Improved performance of ``to_datetime`` when specified format string is ISO8601 (:issue:`10178`)
-
+- 2x improvement in ``Series.value_counts`` for float dtype (:issue:`10821`)
 
 .. _whatsnew_0170.bug_fixes:
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -232,7 +232,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
                 values = PeriodIndex(values, name=name)
 
             values = values.view(np.int64)
-            keys, counts = htable.value_count_int64(values)
+            keys, counts = htable.value_count_scalar64(values, dropna)
 
             if dropna:
                 from pandas.tslib import iNaT
@@ -244,7 +244,10 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
 
         elif com.is_integer_dtype(dtype):
             values = com._ensure_int64(values)
-            keys, counts = htable.value_count_int64(values)
+            keys, counts = htable.value_count_scalar64(values, dropna)
+        elif com.is_float_dtype(dtype):
+            values = com._ensure_float64(values)
+            keys, counts = htable.value_count_scalar64(values, dropna)
 
         else:
             values = com._ensure_object(values)
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -1030,7 +1030,7 @@ def value_counts(self, dropna=True):
         from pandas.core.index import CategoricalIndex
 
         cat = self.dropna() if dropna else self
-        keys, counts = htable.value_count_int64(com._ensure_int64(cat._codes))
+        keys, counts = htable.value_count_scalar64(com._ensure_int64(cat._codes), dropna)
         result = Series(counts, index=keys)
 
         ix = np.arange(len(cat.categories), dtype='int64')
diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
@@ -866,51 +866,90 @@ cdef class Int64Factorizer:
         self.count = len(self.uniques)
         return labels
 
-
+ctypedef fused kh_scalar64:
+    kh_int64_t
+    kh_float64_t
 
 @cython.boundscheck(False)
-cdef build_count_table_int64(int64_t[:] values, kh_int64_t *table):
+cdef build_count_table_scalar64(sixty_four_bit_scalar[:] values,
+                                kh_scalar64 *table, bint dropna):
     cdef:
         khiter_t k
         Py_ssize_t i, n = len(values)
-        int64_t val
+        sixty_four_bit_scalar val
         int ret = 0
 
-    with nogil:
-        kh_resize_int64(table, n)
+    if sixty_four_bit_scalar is float64_t and kh_scalar64 is kh_float64_t:
+        with nogil:
+            kh_resize_float64(table, n)
+
+            for i in range(n):
+                val = values[i]
+                if val == val or not dropna:
+                    k = kh_get_float64(table, val)
+                    if k != table.n_buckets:
+                        table.vals[k] += 1
+                    else:
+                        k = kh_put_float64(table, val, &ret)
+                        table.vals[k] = 1
+    elif sixty_four_bit_scalar is int64_t and kh_scalar64 is kh_int64_t:
+        with nogil:
+            kh_resize_int64(table, n)
+
+            for i in range(n):
+                val = values[i]
+                k = kh_get_int64(table, val)
+                if k != table.n_buckets:
+                    table.vals[k] += 1
+                else:
+                    k = kh_put_int64(table, val, &ret)
+                    table.vals[k] = 1
+    else:
+        raise ValueError("Table type must match scalar type.")
 
-        for i in range(n):
-            val = values[i]
-            k = kh_get_int64(table, val)
-            if k != table.n_buckets:
-                table.vals[k] += 1
-            else:
-                k = kh_put_int64(table, val, &ret)
-                table.vals[k] = 1
 
 
 @cython.boundscheck(False)
-cpdef value_count_int64(int64_t[:] values):
+cpdef value_count_scalar64(sixty_four_bit_scalar[:] values, bint dropna):
     cdef:
         Py_ssize_t i
-        kh_int64_t *table
-        int64_t[:] result_keys, result_counts
+        kh_float64_t *ftable
+        kh_int64_t *itable
+        sixty_four_bit_scalar[:] result_keys
+        int64_t[:] result_counts
         int k
 
-    table = kh_init_int64()
-    build_count_table_int64(values, table)
-
     i = 0
-    result_keys = np.empty(table.n_occupied, dtype=np.int64)
-    result_counts = np.zeros(table.n_occupied, dtype=np.int64)
 
-    with nogil:
-        for k in range(table.n_buckets):
-            if kh_exist_int64(table, k):
-                result_keys[i] = table.keys[k]
-                result_counts[i] = table.vals[k]
-                i += 1
-    kh_destroy_int64(table)
+    if sixty_four_bit_scalar is float64_t:
+        ftable = kh_init_float64()
+        build_count_table_scalar64(values, ftable, dropna)
+
+        result_keys = np.empty(ftable.n_occupied, dtype=np.float64)
+        result_counts = np.zeros(ftable.n_occupied, dtype=np.int64)
+
+        with nogil:
+            for k in range(ftable.n_buckets):
+                if kh_exist_float64(ftable, k):
+                    result_keys[i] = ftable.keys[k]
+                    result_counts[i] = ftable.vals[k]
+                    i += 1
+        kh_destroy_float64(ftable)
+
+    elif sixty_four_bit_scalar is int64_t:
+        itable = kh_init_int64()
+        build_count_table_scalar64(values, itable, dropna)
+
+        result_keys = np.empty(itable.n_occupied, dtype=np.int64)
+        result_counts = np.zeros(itable.n_occupied, dtype=np.int64)
+
+        with nogil:
+            for k in range(itable.n_buckets):
+                if kh_exist_int64(itable, k):
+                    result_keys[i] = itable.keys[k]
+                    result_counts[i] = itable.vals[k]
+                    i += 1
+        kh_destroy_int64(itable)
 
     return np.asarray(result_keys), np.asarray(result_counts)
 
@@ -1002,7 +1041,7 @@ def mode_int64(int64_t[:] values):
 
     table = kh_init_int64()
 
-    build_count_table_int64(values, table)
+    build_count_table_scalar64(values, table, 0)
 
     modes = np.empty(table.n_buckets, dtype=np.int64)
 
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
@@ -194,6 +194,15 @@ def f():
 series_value_counts_strings = Benchmark('s.value_counts()', setup,
                                         start_date=datetime(2011, 10, 21))
 
+# value_counts on float dtype
+
+setup = common_setup + """
+s = Series(np.random.randint(0, 1000, size=100000)).astype(float)
+"""
+
+series_value_counts_float64 = Benchmark('s.value_counts()', setup,
+                                      start_date=datetime(2015, 8, 17))
+
 #----------------------------------------------------------------------
 # pivot_table