diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index cf9428d5862ec..37abcf6d6ec73 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -1,4 +1,5 @@ -from cpython cimport PyObject, Py_INCREF, PyList_Check, PyTuple_Check +from cpython cimport (PyObject, Py_INCREF, PyList_Check, PyTuple_Check, + PyString_AsStringAndSize) from khash cimport * from numpy cimport * @@ -843,6 +844,127 @@ cdef class PyObjectHashTable(HashTable): return labels +cdef inline cbuf_t to_cbuf(object s): + cdef cbuf_t output + PyString_AsStringAndSize(s, &output.buf, &output.len) + return output + + +cdef class CBufHashTable(HashTable): + cdef kh_cbuf_map_t *table + + def __cinit__(self, int size_hint=1): + self.table = kh_init_cbuf_map() + if size_hint is not None: + kh_resize_cbuf_map(self.table, size_hint) + + def __dealloc__(self): + kh_destroy_cbuf_map(self.table) + + cdef inline int check_type(self, object val): + return util.is_string_object(val) + + cpdef get_item(self, object val): + cdef khiter_t it + it = kh_get_cbuf_map(self.table, to_cbuf(val)) + if it != self.table.n_buckets: + return self.table.vals[it] + else: + raise KeyError(val) + + def get_iter_test(self, object key, Py_ssize_t iterations): + cdef khiter_t it + cdef Py_ssize_t i, val + for i in range(iterations): + it = kh_get_cbuf_map(self.table, to_cbuf(key)) + if it != self.table.n_buckets: + val = self.table.vals[it] + + cpdef set_item(self, object key, Py_ssize_t val): + cdef: + khiter_t it + int ret = 0 + cbuf_t buf + + buf = to_cbuf(key) + + it = kh_put_cbuf_map(self.table, buf, &ret) + self.table.keys[it] = buf + if kh_exist_cbuf_map(self.table, it): + self.table.vals[it] = val + else: + raise KeyError(key) + + def get_indexer(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels = np.empty(n, dtype=np.int64) + cbuf_t buf + int64_t[::1] out = labels + khiter_t it + kh_cbuf_map_t *table = self.table + + for i in range(n): + buf = to_cbuf(values[i]) + it = kh_get_cbuf_map(table, buf) + if it != table.n_buckets: + out[i] = table.vals[it] + else: + out[i] = -1 + return labels + + def unique(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + Py_ssize_t idx, count = 0 + int ret = 0 + object val + cbuf_t buf + khiter_t it + ObjectVector uniques = ObjectVector() + + for i in range(n): + val = values[i] + buf = to_cbuf(val) + it = kh_get_cbuf_map(self.table, buf) + if it == self.table.n_buckets: + it = kh_put_cbuf_map(self.table, buf, &ret) + count += 1 + uniques.append(val) + + return uniques.to_array() + + def factorize(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels = np.empty(n, dtype=np.int64) + list reverse = [] + Py_ssize_t idx, count = 0 + int ret = 0 + object val + cbuf_t buf + khiter_t it + + for i in range(n): + val = values[i] + buf = to_cbuf(val) + it = kh_get_cbuf_map(self.table, buf) + if it != self.table.n_buckets: + idx = self.table.vals[it] + labels[i] = idx + else: + it = kh_put_cbuf_map(self.table, buf, &ret) + + self.table.vals[it] = count + reverse.append(val) + labels[i] = count + count += 1 + + return dict(enumerate(reverse)), labels + + + + cdef class Factorizer: cdef public PyObjectHashTable table cdef public ObjectVector uniques diff --git a/pandas/src/khash.pxd b/pandas/src/khash.pxd index a8fd51a62cfbe..622c0d70daaf0 100644 --- a/pandas/src/khash.pxd +++ b/pandas/src/khash.pxd @@ -17,7 +17,6 @@ cdef extern from "khash_python.h": inline khint_t kh_get_pymap(kh_pymap_t*,
PyObject*) inline void kh_resize_pymap(kh_pymap_t*, khint_t) inline khint_t kh_put_pymap(kh_pymap_t*, PyObject*, int*) - inline void kh_del_pymap(kh_pymap_t*, khint_t) bint kh_exist_pymap(kh_pymap_t*, khiter_t) @@ -33,7 +32,6 @@ cdef extern from "khash_python.h": inline khint_t kh_get_pyset(kh_pyset_t*, PyObject*) inline void kh_resize_pyset(kh_pyset_t*, khint_t) inline khint_t kh_put_pyset(kh_pyset_t*, PyObject*, int*) - inline void kh_del_pyset(kh_pyset_t*, khint_t) bint kh_exist_pyset(kh_pyset_t*, khiter_t) @@ -51,7 +49,6 @@ cdef extern from "khash_python.h": inline khint_t kh_get_str(kh_str_t*, kh_cstr_t) inline void kh_resize_str(kh_str_t*, khint_t) inline khint_t kh_put_str(kh_str_t*, kh_cstr_t, int*) - inline void kh_del_str(kh_str_t*, khint_t) bint kh_exist_str(kh_str_t*, khiter_t) @@ -68,7 +65,6 @@ cdef extern from "khash_python.h": inline khint_t kh_get_int64(kh_int64_t*, int64_t) inline void kh_resize_int64(kh_int64_t*, khint_t) inline khint_t kh_put_int64(kh_int64_t*, int64_t, int*) - inline void kh_del_int64(kh_int64_t*, khint_t) bint kh_exist_int64(kh_int64_t*, khiter_t) @@ -84,7 +80,6 @@ cdef extern from "khash_python.h": inline khint_t kh_get_float64(kh_float64_t*, float64_t) inline void kh_resize_float64(kh_float64_t*, khint_t) inline khint_t kh_put_float64(kh_float64_t*, float64_t, int*) - inline void kh_del_float64(kh_float64_t*, khint_t) bint kh_exist_float64(kh_float64_t*, khiter_t) @@ -100,7 +95,6 @@ cdef extern from "khash_python.h": inline khint_t kh_get_int32(kh_int32_t*, int32_t) inline void kh_resize_int32(kh_int32_t*, khint_t) inline khint_t kh_put_int32(kh_int32_t*, int32_t, int*) - inline void kh_del_int32(kh_int32_t*, khint_t) bint kh_exist_int32(kh_int32_t*, khiter_t) @@ -118,7 +112,24 @@ cdef extern from "khash_python.h": inline khint_t kh_get_strbox(kh_strbox_t*, kh_cstr_t) inline void kh_resize_strbox(kh_strbox_t*, khint_t) inline khint_t kh_put_strbox(kh_strbox_t*, kh_cstr_t, int*) - inline void kh_del_strbox(kh_strbox_t*, khint_t) bint kh_exist_strbox(kh_strbox_t*, khiter_t) + ctypedef struct cbuf_t: + kh_cstr_t buf + Py_ssize_t len + + ctypedef struct kh_cbuf_map_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + cbuf_t *keys + size_t *vals + + inline kh_cbuf_map_t* kh_init_cbuf_map() + inline void kh_destroy_cbuf_map(kh_cbuf_map_t*) + inline void kh_clear_cbuf_map(kh_cbuf_map_t*) + inline khint_t kh_get_cbuf_map(kh_cbuf_map_t*, cbuf_t) + inline void kh_resize_cbuf_map(kh_cbuf_map_t*, khint_t) + inline khint_t kh_put_cbuf_map(kh_cbuf_map_t*, cbuf_t, int*) + + bint kh_exist_cbuf_map(kh_cbuf_map_t*, khiter_t) diff --git a/pandas/src/klib/khash.h b/pandas/src/klib/khash.h index 4350ff06f37f0..22eb46063cf60 100644 --- a/pandas/src/klib/khash.h +++ b/pandas/src/klib/khash.h @@ -47,6 +47,23 @@ int main() { */ /* + 2013-05-02 (0.2.8): + + * Use quadratic probing. When the capacity is power of 2, stepping function + i*(i+1)/2 guarantees to traverse each bucket. It is better than double + hashing on cache performance and is more robust than linear probing. + + In theory, double hashing should be more robust than quadratic probing. + However, my implementation is probably not for large hash tables, because + the second hash function is closely tied to the first hash function, + which reduce the effectiveness of double hashing. + + Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php + + 2011-12-29 (0.2.7): + + * Minor code clean up; no actual effect. + 2011-09-16 (0.2.6): * The capacity is a power of 2. 
This seems to dramatically improve the @@ -107,12 +124,13 @@ int main() { Generic hash table library. */ -#define AC_VERSION_KHASH_H "0.2.6" +#define AC_VERSION_KHASH_H "0.2.8" #include <stdlib.h> #include <string.h> #include <limits.h> +/* compiler specific configuration */ #if UINT_MAX == 0xffffffffu typedef unsigned int khint32_t; @@ -121,26 +139,20 @@ typedef unsigned long khint32_t; #endif #if ULONG_MAX == ULLONG_MAX -typedef unsigned long khuint64_t; -typedef signed long khint64_t; +typedef unsigned long khint64_t; #else -typedef unsigned long long khuint64_t; -typedef signed long long khint64_t; +typedef unsigned long long khint64_t; #endif typedef double khfloat64_t; -#ifndef PANDAS_INLINE - #if defined(__GNUC__) - #define PANDAS_INLINE __inline__ - #elif defined(_MSC_VER) - #define PANDAS_INLINE __inline - #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L - #define PANDAS_INLINE inline - #else - #define PANDAS_INLINE - #endif +#ifndef kh_inline +#ifdef _MSC_VER +#define kh_inline __inline +#else +#define kh_inline inline #endif +#endif /* kh_inline */ typedef khint32_t khint_t; typedef khint_t khiter_t; @@ -154,11 +166,6 @@ typedef khint_t khiter_t; #define __ac_set_isboth_false(flag, i) __ac_set_isempty_false(flag, i) #define __ac_set_isdel_true(flag, i) (0) -#ifdef KHASH_LINEAR -#define __ac_inc(k, m) 1 -#else -#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m) -#endif #define __ac_fsize(m) ((m) < 32? 1 : (m)>>5) @@ -166,39 +173,47 @@ typedef khint_t khiter_t; #define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif +#ifndef kcalloc +#define kcalloc(N,Z) calloc(N,Z) +#endif +#ifndef kmalloc +#define kmalloc(Z) malloc(Z) +#endif +#ifndef krealloc +#define krealloc(P,Z) realloc(P,Z) +#endif +#ifndef kfree +#define kfree(P) free(P) +#endif + static const double __ac_HASH_UPPER = 0.77; -#define KHASH_DECLARE(name, khkey_t, khval_t) \ - typedef struct { \ - khint_t n_buckets, size, n_occupied, upper_bound; \ - khint32_t *flags; \ - khkey_t *keys; \ - khval_t *vals; \ - } kh_##name##_t; \ - extern kh_##name##_t *kh_init_##name(); \ +#define __KHASH_TYPE(name, khkey_t, khval_t) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; + +#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ + extern kh_##name##_t *kh_init_##name(void); \ extern void kh_destroy_##name(kh_##name##_t *h); \ extern void kh_clear_##name(kh_##name##_t *h); \ extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ - extern void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ - extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ - extern void kh_del_##name(kh_##name##_t *h, khint_t x); + extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); -#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - typedef struct { \ - khint_t n_buckets, size, n_occupied, upper_bound; \ - khint32_t *flags; \ - khkey_t *keys; \ - khval_t *vals; \ - } kh_##name##_t; \ +#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ SCOPE kh_##name##_t *kh_init_##name(void) { \ - return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ + return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ } \ SCOPE void kh_destroy_##name(kh_##name##_t *h) \ { \ if (h) { \ - free(h->keys); free(h->flags); \ -
free(h->vals); \ - free(h); \ + kfree((void *)h->keys); kfree(h->flags); \ + kfree((void *)h->vals); \ + kfree(h); \ } \ } \ SCOPE void kh_clear_##name(kh_##name##_t *h) \ @@ -211,19 +226,19 @@ static const double __ac_HASH_UPPER = 0.77; SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ { \ if (h->n_buckets) { \ - khint_t inc, k, i, last, mask; \ + khint_t k, i, last, mask, step = 0; \ mask = h->n_buckets - 1; \ k = __hash_func(key); i = k & mask; \ - inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \ + last = i; \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - i = (i + inc) & mask; \ + i = (i + (++step)) & mask; \ if (i == last) return h->n_buckets; \ } \ return __ac_iseither(h->flags, i)? h->n_buckets : i; \ } else return 0; \ } \ - SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ - { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ + SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ khint32_t *new_flags = 0; \ khint_t j = 1; \ { \ @@ -231,11 +246,18 @@ static const double __ac_HASH_UPPER = 0.77; if (new_n_buckets < 4) new_n_buckets = 4; \ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ else { /* hash table size to be changed (shrink or expand); rehash */ \ - new_flags = (khint32_t*)malloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (!new_flags) return -1; \ memset(new_flags, 0xff, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ if (h->n_buckets < new_n_buckets) { /* expand */ \ - h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (!new_keys) { kfree(new_flags); return -1; } \ + h->keys = new_keys; \ + if (kh_is_map) { \ + khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + if (!new_vals) { kfree(new_flags); return -1; } \ + h->vals = new_vals; \ + } \ } /* otherwise shrink */ \ } \ } \ @@ -249,11 +271,10 @@ static const double __ac_HASH_UPPER = 0.77; if (kh_is_map) val = h->vals[j]; \ __ac_set_isempty_true(h->flags, j); \ while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ - khint_t inc, k, i; \ + khint_t k, i, step = 0; \ k = __hash_func(key); \ i = k & new_mask; \ - inc = __ac_inc(k, new_mask); \ - while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \ + while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ __ac_set_isempty_false(new_flags, i); \ if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ @@ -268,32 +289,38 @@ static const double __ac_HASH_UPPER = 0.77; } \ } \ if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ - h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)krealloc((void 
*)h->vals, new_n_buckets * sizeof(khval_t)); \ } \ - free(h->flags); /* free the working space */ \ + kfree(h->flags); /* free the working space */ \ h->flags = new_flags; \ h->n_buckets = new_n_buckets; \ h->n_occupied = h->size; \ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ } \ + return 0; \ } \ SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ { \ khint_t x; \ if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ - if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \ - else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \ + if (h->n_buckets > (h->size<<1)) { \ + if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ + *ret = -1; return h->n_buckets; \ + } \ + } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ + *ret = -1; return h->n_buckets; \ + } \ } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ { \ - khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ + khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \ x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ else { \ - inc = __ac_inc(k, mask); last = i; \ + last = i; \ while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ if (__ac_isdel(h->flags, i)) site = i; \ - i = (i + inc) & mask; \ + i = (i + (++step)) & mask; \ if (i == last) { x = site; break; } \ } \ if (x == h->n_buckets) { \ @@ -314,17 +341,18 @@ static const double __ac_HASH_UPPER = 0.77; *ret = 2; \ } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ return x; \ - } \ - SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ - { \ - if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ - __ac_set_isdel_true(h->flags, x); \ - --h->size; \ - } \ } +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_PROTOTYPES(name, khkey_t, khval_t) + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + #define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - KHASH_INIT2(name, static PANDAS_INLINE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) /* --- BEGIN OF HASH FUNCTIONS --- */ @@ -354,10 +382,10 @@ static const double __ac_HASH_UPPER = 0.77; @param s Pointer to a null terminated string @return The hash value */ -static PANDAS_INLINE khint_t __ac_X31_hash_string(const char *s) +static kh_inline khint_t __ac_X31_hash_string(const char *s) { - khint_t h = *s; - if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; + khint_t h = (khint_t)*s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; return h; } /*! 
@function @@ -371,7 +399,7 @@ static PANDAS_INLINE khint_t __ac_X31_hash_string(const char *s) */ #define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) -static PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) +static kh_inline khint_t __ac_Wang_hash(khint_t key) { key += ~(key << 15); key ^= (key >> 10); @@ -427,7 +455,8 @@ static PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param k Key [type of keys] - @param r Extra return code: 0 if the key is present in the hash table; + @param r Extra return code: -1 if the operation failed; + 0 if the key is present in the hash table; 1 if the bucket is empty (never used); 2 if the element in the bucket has been deleted [int*] @return Iterator to the inserted element [khint_t] @@ -439,18 +468,10 @@ static PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @param h Pointer to the hash table [khash_t(name)*] @param k Key [type of keys] - @return Iterator to the found element, or kh_end(h) is the element is absent [khint_t] + @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] */ #define kh_get(name, h, k) kh_get_##name(h, k) -/*! @function - @abstract Remove a key from the hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param k Iterator to the element to be deleted [khint_t] - */ -#define kh_del(name, h, k) kh_del_##name(h, k) - /*! @function @abstract Test whether a bucket contains data. @param h Pointer to the hash table [khash_t(name)*] @@ -509,6 +530,34 @@ static PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) */ #define kh_n_buckets(h) ((h)->n_buckets) +/*! @function + @abstract Iterate over the entries in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param kvar Variable to which key will be assigned + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (kvar) = kh_key(h,__i); \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/*! @function + @abstract Iterate over the values in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach_value(h, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + /* More conenient interfaces */ /*! 
@function @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] */ -#define KHASH_SET_INIT_UINT64(name) \ - KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) - #define KHASH_SET_INIT_INT64(name) \ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) @@ -541,13 +587,9 @@ static PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) @param name Name of the hash table [symbol] @param khval_t Type of values [type] */ -#define KHASH_MAP_INIT_UINT64(name, khval_t) \ - KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) - #define KHASH_MAP_INIT_INT64(name, khval_t) \ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) - typedef const char *kh_cstr_t; /*! @function @abstract Instantiate a hash map containing const char* keys @@ -565,14 +607,4 @@ typedef const char *kh_cstr_t; KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) -#define kh_exist_str(h, k) (kh_exist(h, k)) -#define kh_exist_float64(h, k) (kh_exist(h, k)) -#define kh_exist_int64(h, k) (kh_exist(h, k)) -#define kh_exist_int32(h, k) (kh_exist(h, k)) - -KHASH_MAP_INIT_STR(str, size_t) -KHASH_MAP_INIT_INT(int32, size_t) -KHASH_MAP_INIT_INT64(int64, size_t) - - #endif /* __AC_KHASH_H */ diff --git a/pandas/src/klib/khash_python.h b/pandas/src/klib/khash_python.h index d3ef48de0f831..e196e9b23434f 100644 --- a/pandas/src/klib/khash_python.h +++ b/pandas/src/klib/khash_python.h @@ -1,7 +1,126 @@ +#ifndef _KLIB_KHASH_PYTHON_H_ +#define _KLIB_KHASH_PYTHON_H_ + #include <Python.h> +#ifndef PANDAS_INLINE + #if defined(__GNUC__) + #define PANDAS_INLINE __inline__ + #elif defined(_MSC_VER) + #define PANDAS_INLINE __inline + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define PANDAS_INLINE inline + #else + #define PANDAS_INLINE + #endif +#endif + +#define kh_inline PANDAS_INLINE #include "khash.h" +#define kh_exist_str(h, k) (kh_exist(h, k)) +#define kh_exist_float64(h, k) (kh_exist(h, k)) +#define kh_exist_int64(h, k) (kh_exist(h, k)) +#define kh_exist_int32(h, k) (kh_exist(h, k)) + +#include "xxhash/xxhash.h" + +/* + * By default khash uses crappy x31 hash function which puts strings that + * differ only in the last character into neighbouring buckets which is not + * good given that quadratic probing tries small steps first. + * + * xxhash gives better bucket distribution and performance-wise is great for + * long-ish strings, but it is a bit slower than x31 on the shortest ones + * (turns out at length == 2 the difference is already negligible). + * + * Inlining will hinder merging in upstream releases, but 1-character strings + * are a valid use case for pandas, so let's pre-calculate a vector of 256 + * values to avoid calling two functions (strlen and XXH32) if there's only one + * character to hash. + * + * This table was generated with the following code.
Feel free to re-run it if + * an update comes in: +#include <stdio.h> +#include "xxhash.h" + +int main(int argc, char *argv[]) +{ + printf("static khint_t XXH32_EMPTY_HASH = 0x%08x;\n", + XXH32("", 0, 0xdeadbeef)); + printf("static khint_t XXH32_ONECHAR_HASH[256] = {"); + unsigned char s[2] = {0}; + for (int i = 0; i < 256; ++i) { + if (i % 8 == 0) { + printf("\n "); + } + s[0] = i; + printf("0x%08x", XXH32(s, 1, 0xdeadbeef)); + if (i < 255) { + printf(", "); + } + } + printf("\n};\n"); + return 0; +} +*/ + +static khint_t XXH32_EMPTY_HASH = 0xc372c6cb; +static khint_t XXH32_ONECHAR_HASH[256] = { + 0x39110451, 0xd3efa134, 0xea8d6dc4, 0xe59a066b, 0x89f3a4f5, 0xdcce5bc9, 0x44be0c3e, 0x96469248, + 0x7885ddeb, 0x24417b24, 0xb77b30b2, 0xa83d21eb, 0x6f6ba52b, 0x7315bbe5, 0xce858701, 0x52299f26, + 0x440ec810, 0xd02a934f, 0xf873d394, 0xd168a8e1, 0x31c30198, 0x37c3967b, 0xc1bdbdf8, 0x3ddaf3cc, + 0xb7222f4a, 0x96625cdf, 0xabf92a2f, 0x69e97975, 0x55f24523, 0x6b1abaa0, 0xe5b033ab, 0x9e21842c, + 0x3ac2a339, 0x827b0af2, 0xd7ea0f97, 0x72317ee6, 0xe6bd4439, 0xb0b183f1, 0xca90e5e0, 0x57960753, + 0x6eefe374, 0xb9c9c5b5, 0x57396d1f, 0x6db79351, 0xab55c12d, 0x32229df4, 0xbfa3a164, 0x58f9f4ba, + 0x5987c643, 0xffbfa961, 0x1080d4eb, 0xc5c3d846, 0x16a7fd8e, 0xed29fd3a, 0x8d78613d, 0xd088b720, + 0x8d597f4c, 0x2df1ce8f, 0x79bc5215, 0x749d67c1, 0xa9ad300c, 0x60c6237d, 0xeeb080e7, 0xb74eef62, + 0x6ddba2f2, 0x3d9f18cf, 0x0b6ad1bd, 0xc7a33d19, 0x3cb6352f, 0x872839f9, 0x259ced1e, 0x0f9d713b, + 0x6816620f, 0x8d2c96a7, 0x377fb2f9, 0x2616b5b5, 0x9bae3a05, 0x8368a004, 0x3a67fd94, 0x312529c4, + 0xc9238f87, 0x3e85e142, 0x973dedc6, 0xcbc3d4ba, 0xd2629b58, 0x2aae9a6d, 0x82ffc598, 0x4a8512b3, + 0x51146ceb, 0x85ddc3f4, 0xa83b942f, 0x55769a32, 0xf7fa3fdf, 0xfbe35842, 0x342ff574, 0x848400a6, + 0x92707153, 0x48cd58fd, 0xbdae4a11, 0x701bbadb, 0x4a5b37c4, 0x98770eeb, 0xfc1b98fc, 0x05dd6894, + 0xd3ba005c, 0x453bc774, 0xfe186d14, 0xa25acde2, 0xcc738313, 0x1dbdefa7, 0x83ed6f1e, 0xf9d8e195, + 0x5f10c546, 0xf22c5a0f, 0x31da5f5e, 0x5341c163, 0xabd3f750, 0x882e33d8, 0x4d8105cd, 0xc1f6f3d9, + 0x347e1d5c, 0xdb06193c, 0x64841a53, 0x3991a6e6, 0x0abdd625, 0xedcf00f7, 0xa8e64229, 0x2fc9029b, + 0x4fc5ca41, 0x1f5aaae5, 0x29bdda91, 0x55446dae, 0x1566ec40, 0x9ac8391e, 0xcd4d6ab1, 0x0f3807f6, + 0xf3be6887, 0x9f4b88bd, 0x33c401df, 0xaa9df64f, 0xce5c70ac, 0x9ee55a87, 0x4cb91c84, 0x8c322b3d, + 0x8e40fb24, 0x3af430fb, 0xeea567c2, 0xe80c7dc2, 0x6f619449, 0xe0ca8048, 0x984c626e, 0x50bf1281, + 0x4895cbee, 0x5d016a96, 0xe58b8980, 0x3457ef7c, 0x2a24f819, 0x0641cc30, 0xbddc5f84, 0x03ce4656, + 0xbcb73c9c, 0xcd29be82, 0x0930d945, 0xf3fc8e3c, 0xbed775cd, 0xd6668fae, 0x6876f949, 0xcf34fbd7, + 0x0537d916, 0x7efd5f26, 0xb2d32520, 0x10d58995, 0x19d64e1c, 0xacae767c, 0xf23a4e7d, 0xdcb654fe, + 0xe1ec9a9f, 0x3061302b, 0x453a0b7c, 0xe845436e, 0xb2b690df, 0x245c17b5, 0x756a9374, 0x470998f5, + 0xe31a5f5b, 0x60dbad02, 0xf738299d, 0x0db8b11a, 0xd34cb801, 0xb2f3597d, 0xa627e466, 0xda4f9935, + 0x5c58e1df, 0x4b5319d6, 0x48acc08f, 0xce18d68e, 0xeb995e7f, 0x11a07cba, 0x025127b2, 0xd1325331, + 0x55d76240, 0x281bba14, 0xb9ac069d, 0x25e60bcc, 0xf077fbd3, 0xe460ece9, 0x725a9971, 0xa6b5c6b4, + 0xe5f216a3, 0xbee80d71, 0x1a049114, 0x851012d4, 0xa6e175cc, 0x6ec98c95, 0x56a77202, 0x7e2ab05f, + 0x4850279c, 0x1b009afe, 0xf71e36b6, 0x9cadc37a, 0x43a167da, 0x5d75b5f3, 0xc432215c, 0x93ff1905, + 0x8764d057, 0xf44cd35d, 0x03d3a324, 0xd65a5047, 0xe872b4d8, 0x8dcb9a23, 0xfebf9113, 0x59701be9, + 0xdf9f6090, 0xce9b2907, 0x664c6a5a, 0x81bfefc4, 0x13829979, 0xda98b6ab, 0x7b7e9ff0, 0x13c24005, + 0xcee61b6b,
0x15737a85, 0xe2f95e48, 0xf2136570, 0xd1ccfdab, 0xa9adfb16, 0x1f7339a9, 0x83247f43, + 0x68c6c8bf, 0x5046f6fc, 0x2d3dea84, 0x79a0be74, 0x39dd7eb3, 0x4d5cc636, 0xe4e1352d, 0xd1317a99 +}; + +/* Seed value is chosen arbitrarily. */ +static khint_t XXH32_SEED = 0xdeadbeef; + +static khint_t PANDAS_INLINE str_xxhash_hash_func(kh_cstr_t key) { + if (!key[0]) { + return XXH32_EMPTY_HASH; + } + if (!key[1]) { + return XXH32_ONECHAR_HASH[(uint8_t)key[0]]; + } + return XXH32(key, strlen(key), XXH32_SEED); +} + +KHASH_INIT(str, kh_cstr_t, size_t, 1, + str_xxhash_hash_func, kh_str_hash_equal) + +KHASH_MAP_INIT_INT(int32, size_t) +KHASH_MAP_INIT_INT64(int64, size_t) + // kludge #define kh_float64_hash_func _Py_HashDouble @@ -13,7 +132,7 @@ KHASH_MAP_INIT_FLOAT64(float64, size_t) -int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { +static int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { int result = PyObject_RichCompareBool(a, b, Py_EQ); if (result < 0) { PyErr_Clear(); @@ -46,4 +165,44 @@ KHASH_SET_INIT_PYOBJECT(pyset) #define kh_exist_pymap(h, k) (kh_exist(h, k)) #define kh_exist_pyset(h, k) (kh_exist(h, k)) -KHASH_MAP_INIT_STR(strbox, kh_pyobject_t) +KHASH_INIT(strbox, kh_cstr_t, kh_pyobject_t, 1, + str_xxhash_hash_func, kh_str_hash_equal) + +/* Plain old C buffer structure */ +typedef struct { + kh_cstr_t buf; + Py_ssize_t len; +} cbuf_t; + +static khint_t PANDAS_INLINE cbuf_xxhash(cbuf_t val) { + switch (val.len) { + case 0: + return XXH32_EMPTY_HASH; + case 1: + return XXH32_ONECHAR_HASH[(uint8_t)val.buf[0]]; + default: + return XXH32(val.buf, val.len, XXH32_SEED); + } +} + +static int PANDAS_INLINE cbuf_equal(cbuf_t a, cbuf_t b) { + int i; + if (a.len != b.len) { + return 0; + } + if (a.buf == b.buf) { + return 1; + } + for (i = 0; i < a.len; ++i) { + if (a.buf[i] != b.buf[i]) { + return 0; + } + } + return 1; +} + +/* [cbuf_t -> size_t] hash map */ +KHASH_INIT(cbuf_map, cbuf_t, size_t, 1, cbuf_xxhash, cbuf_equal) +#define kh_exist_cbuf_map(h, k) (kh_exist(h, k)) + +#endif /* _KLIB_KHASH_PYTHON_H_ */ diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index 0947315fbe6b7..4020dd24c87e8 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -33,7 +33,7 @@ See LICENSE for the license #include <stdint.h> #endif -#include "khash.h" +#include "khash_python.h" #define CHUNKSIZE 1024*256 #define KB 1024 diff --git a/pandas/src/xxhash/LICENSE b/pandas/src/xxhash/LICENSE new file mode 100644 index 0000000000000..7de801ed1bc78 --- /dev/null +++ b/pandas/src/xxhash/LICENSE @@ -0,0 +1,24 @@ +xxHash Library +Copyright (c) 2012-2014, Yann Collet +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/pandas/src/xxhash/xxhash.c b/pandas/src/xxhash/xxhash.c new file mode 100644 index 0000000000000..529d69ccc4caa --- /dev/null +++ b/pandas/src/xxhash/xxhash.c @@ -0,0 +1,934 @@ +/* +xxHash - Fast Hash algorithm +Copyright (C) 2012-2014, Yann Collet. +BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +You can contact the author at : +- xxHash source repository : http://code.google.com/p/xxhash/ +- public discussion board : https://groups.google.com/forum/#!forum/lz4c +*/ + + +//************************************** +// Tuning parameters +//************************************** +// Unaligned memory access is automatically enabled for "common" CPU, such as x86. +// For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected. +// If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance. +// You can also enable this parameter if you know your input data will always be aligned (boundaries of 4, for U32). +#if defined(__ARM_FEATURE_UNALIGNED) || defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) +# define XXH_USE_UNALIGNED_ACCESS 1 +#endif + +// XXH_ACCEPT_NULL_INPUT_POINTER : +// If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer. +// When this option is enabled, xxHash output for null input pointers will be the same as a null-length input. +// This option has a very small performance cost (only measurable on small inputs). +// By default, this option is disabled. 
To enable it, uncomment below define : +// #define XXH_ACCEPT_NULL_INPUT_POINTER 1 + +// XXH_FORCE_NATIVE_FORMAT : +// By default, xxHash library provides endian-independant Hash values, based on little-endian convention. +// Results are therefore identical for little-endian and big-endian CPU. +// This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format. +// Should endian-independance be of no importance for your application, you may set the #define below to 1. +// It will improve speed for Big-endian CPU. +// This option has no impact on Little_Endian CPU. +#define XXH_FORCE_NATIVE_FORMAT 0 + +//************************************** +// Compiler Specific Options +//************************************** +// Disable some Visual warning messages +#ifdef _MSC_VER // Visual Studio +# pragma warning(disable : 4127) // disable: C4127: conditional expression is constant +#endif + +#ifdef _MSC_VER // Visual Studio +# define FORCE_INLINE static __forceinline +#else +# ifdef __GNUC__ +# define FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define FORCE_INLINE static inline +# endif +#endif + +//************************************** +// Includes & Memory related functions +//************************************** +#include "xxhash.h" +// Modify the local functions below should you wish to use some other memory routines +// for malloc(), free() +#include <stdlib.h> +FORCE_INLINE void* XXH_malloc(size_t s) +{ + return malloc(s); +} +FORCE_INLINE void XXH_free (void* p) +{ + free(p); +} +// for memcpy() +#include <string.h> +FORCE_INLINE void* XXH_memcpy(void* dest, const void* src, size_t size) +{ + return memcpy(dest,src,size); +} + + +//************************************** +// Basic Types +//************************************** +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99 +# include <stdint.h> +typedef uint8_t BYTE; +typedef uint16_t U16; +typedef uint32_t U32; +typedef int32_t S32; +typedef uint64_t U64; +#else +typedef unsigned char BYTE; +typedef unsigned short U16; +typedef unsigned int U32; +typedef signed int S32; +typedef unsigned long long U64; +#endif + +#if defined(__GNUC__) && !defined(XXH_USE_UNALIGNED_ACCESS) +# define _PACKED __attribute__ ((packed)) +#else +# define _PACKED +#endif + +#if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__) +# ifdef __IBMC__ +# pragma pack(1) +# else +# pragma pack(push, 1) +# endif +#endif + +typedef struct _U32_S +{ + U32 v; +} _PACKED U32_S; +typedef struct _U64_S +{ + U64 v; +} _PACKED U64_S; + +#if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__) +# pragma pack(pop) +#endif + +#define A32(x) (((U32_S *)(x))->v) +#define A64(x) (((U64_S *)(x))->v) + + +//*************************************** +// Compiler-specific Functions and Macros +//*************************************** +#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +// Note : although _rotl exists for minGW (GCC under windows), performance seems poor +#if defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r))) +# define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r))) +#endif + +#if defined(_MSC_VER) // Visual Studio +# define XXH_swap32 _byteswap_ulong +# define XXH_swap64 _byteswap_uint64 +#elif GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +# define XXH_swap64 __builtin_bswap64 +#else +static inline U32 XXH_swap32 (U32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) &
0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +static inline U64 XXH_swap64 (U64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + + +//************************************** +// Constants +//************************************** +#define PRIME32_1 2654435761U +#define PRIME32_2 2246822519U +#define PRIME32_3 3266489917U +#define PRIME32_4 668265263U +#define PRIME32_5 374761393U + +#define PRIME64_1 11400714785074694791ULL +#define PRIME64_2 14029467366897019727ULL +#define PRIME64_3 1609587929392839161ULL +#define PRIME64_4 9650029242287828579ULL +#define PRIME64_5 2870177450012600261ULL + +//************************************** +// Architecture Macros +//************************************** +typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; +#ifndef XXH_CPU_LITTLE_ENDIAN // It is possible to define XXH_CPU_LITTLE_ENDIAN externally, for example using a compiler switch +static const int one = 1; +# define XXH_CPU_LITTLE_ENDIAN (*(char*)(&one)) +#endif + + +//************************************** +// Macros +//************************************** +#define XXH_STATIC_ASSERT(c) { enum { XXH_static_assert = 1/(!!(c)) }; } // use only *after* variable declarations + + +//**************************** +// Memory reads +//**************************** +typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +FORCE_INLINE U32 XXH_readLE32_align(const U32* ptr, XXH_endianess endian, XXH_alignment align) +{ + if (align==XXH_unaligned) + return endian==XXH_littleEndian ? A32(ptr) : XXH_swap32(A32(ptr)); + else + return endian==XXH_littleEndian ? *ptr : XXH_swap32(*ptr); +} + +FORCE_INLINE U32 XXH_readLE32(const U32* ptr, XXH_endianess endian) +{ + return XXH_readLE32_align(ptr, endian, XXH_unaligned); +} + +FORCE_INLINE U64 XXH_readLE64_align(const U64* ptr, XXH_endianess endian, XXH_alignment align) +{ + if (align==XXH_unaligned) + return endian==XXH_littleEndian ? A64(ptr) : XXH_swap64(A64(ptr)); + else + return endian==XXH_littleEndian ? 
*ptr : XXH_swap64(*ptr); +} + +FORCE_INLINE U64 XXH_readLE64(const U64* ptr, XXH_endianess endian) +{ + return XXH_readLE64_align(ptr, endian, XXH_unaligned); +} + + +//**************************** +// Simple Hash Functions +//**************************** +FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* bEnd = p + len; + U32 h32; +#define XXH_get32bits(p) XXH_readLE32_align((const U32*)p, endian, align) + +#ifdef XXH_ACCEPT_NULL_INPUT_POINTER + if (p==NULL) + { + len=0; + bEnd=p=(const BYTE*)(size_t)16; + } +#endif + + if (len>=16) + { + const BYTE* const limit = bEnd - 16; + U32 v1 = seed + PRIME32_1 + PRIME32_2; + U32 v2 = seed + PRIME32_2; + U32 v3 = seed + 0; + U32 v4 = seed - PRIME32_1; + + do + { + v1 += XXH_get32bits(p) * PRIME32_2; + v1 = XXH_rotl32(v1, 13); + v1 *= PRIME32_1; + p+=4; + v2 += XXH_get32bits(p) * PRIME32_2; + v2 = XXH_rotl32(v2, 13); + v2 *= PRIME32_1; + p+=4; + v3 += XXH_get32bits(p) * PRIME32_2; + v3 = XXH_rotl32(v3, 13); + v3 *= PRIME32_1; + p+=4; + v4 += XXH_get32bits(p) * PRIME32_2; + v4 = XXH_rotl32(v4, 13); + v4 *= PRIME32_1; + p+=4; + } + while (p<=limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } + else + { + h32 = seed + PRIME32_5; + } + + h32 += (U32) len; + + while (p+4<=bEnd) + { + h32 += XXH_get32bits(p) * PRIME32_3; + h32 = XXH_rotl32(h32, 17) * PRIME32_4 ; + p+=4; + } + + while (p<bEnd) + { + h32 += (*p) * PRIME32_5; + h32 = XXH_rotl32(h32, 11) * PRIME32_1 ; + p++; + } + + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + + return h32; +} + + +unsigned int XXH32 (const void* input, size_t len, unsigned seed) +{ +#if 0 + // Simple version, good for code maintenance, but unfortunately slow for small inputs + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, input, len); + return XXH32_digest(&state); +#else + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + +# if !defined(XXH_USE_UNALIGNED_ACCESS) + if ((((size_t)input) & 3) == 0) // Input is aligned, let's leverage the speed advantage + { + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); + else + return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + } +# endif + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); + else + return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); +#endif +} + +FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* bEnd = p + len; + U64 h64; +#define XXH_get64bits(p) XXH_readLE64_align((const U64*)p, endian, align) + +#ifdef XXH_ACCEPT_NULL_INPUT_POINTER + if (p==NULL) + { + len=0; + bEnd=p=(const BYTE*)(size_t)32; + } +#endif + + if (len>=32) + { + const BYTE* const limit = bEnd - 32; + U64 v1 = seed + PRIME64_1 + PRIME64_2; + U64 v2 = seed + PRIME64_2; + U64 v3 = seed + 0; + U64 v4 = seed - PRIME64_1; + + do + { + v1 += XXH_get64bits(p) * PRIME64_2; + p+=8; + v1 = XXH_rotl64(v1, 31); + v1 *= PRIME64_1; + v2 += XXH_get64bits(p) * PRIME64_2; + p+=8; + v2 = XXH_rotl64(v2, 31); + v2 *= PRIME64_1; + v3 += XXH_get64bits(p) * PRIME64_2; + p+=8; + v3 = XXH_rotl64(v3, 31); + v3 *= PRIME64_1; + v4 += XXH_get64bits(p) * PRIME64_2; + p+=8; + v4 = XXH_rotl64(v4, 31); +
v4 *= PRIME64_1; + } + while (p<=limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + + v1 *= PRIME64_2; + v1 = XXH_rotl64(v1, 31); + v1 *= PRIME64_1; + h64 ^= v1; + h64 = h64 * PRIME64_1 + PRIME64_4; + + v2 *= PRIME64_2; + v2 = XXH_rotl64(v2, 31); + v2 *= PRIME64_1; + h64 ^= v2; + h64 = h64 * PRIME64_1 + PRIME64_4; + + v3 *= PRIME64_2; + v3 = XXH_rotl64(v3, 31); + v3 *= PRIME64_1; + h64 ^= v3; + h64 = h64 * PRIME64_1 + PRIME64_4; + + v4 *= PRIME64_2; + v4 = XXH_rotl64(v4, 31); + v4 *= PRIME64_1; + h64 ^= v4; + h64 = h64 * PRIME64_1 + PRIME64_4; + } + else + { + h64 = seed + PRIME64_5; + } + + h64 += (U64) len; + + while (p+8<=bEnd) + { + U64 k1 = XXH_get64bits(p); + k1 *= PRIME64_2; + k1 = XXH_rotl64(k1,31); + k1 *= PRIME64_1; + h64 ^= k1; + h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; + p+=8; + } + + if (p+4<=bEnd) + { + h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1; + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; + p+=4; + } + + while (p<bEnd) + { + h64 ^= (*p) * PRIME64_5; + h64 = XXH_rotl64(h64, 11) * PRIME64_1; + p++; + } + + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + + return h64; +} + + +unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed) +{ +#if 0 + // Simple version, good for code maintenance, but unfortunately slow for small inputs + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, input, len); + return XXH64_digest(&state); +#else + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + +# if !defined(XXH_USE_UNALIGNED_ACCESS) + if ((((size_t)input) & 7)==0) // Input is aligned, let's leverage the speed advantage + { + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + } +# endif + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); +#endif +} + +/**************************************************** + * Advanced Hash Functions +****************************************************/ + +/*** Allocation ***/ +typedef struct +{ + U64 total_len; + U32 seed; + U32 v1; + U32 v2; + U32 v3; + U32 v4; + U32 memsize; + char memory[16]; +} XXH_istate32_t; + +typedef struct +{ + U64 total_len; + U64 seed; + U64 v1; + U64 v2; + U64 v3; + U64 v4; + U32 memsize; + char memory[32]; +} XXH_istate64_t; + + +XXH32_state_t* XXH32_createState(void) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_state_t) >= sizeof(XXH_istate32_t)); // A compilation error here means XXH32_state_t is not large enough + return (XXH32_state_t*)malloc(sizeof(XXH32_state_t)); +} +XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + free(statePtr); + return XXH_OK; +}; + +XXH64_state_t* XXH64_createState(void) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_state_t) >= sizeof(XXH_istate64_t)); // A compilation error here means XXH64_state_t is not large enough + return (XXH64_state_t*)malloc(sizeof(XXH64_state_t)); +} +XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + free(statePtr); + return XXH_OK; +}; + + +/*** Hash feed ***/ + +XXH_errorcode XXH32_reset(XXH32_state_t* state_in, U32 seed) +{ + XXH_istate32_t* state = (XXH_istate32_t*) state_in; + state->seed = seed; + state->v1 = seed + PRIME32_1 + PRIME32_2; + state->v2 = seed + PRIME32_2; + state->v3 = seed + 0; + state->v4 = seed - PRIME32_1; +
state->total_len = 0; + state->memsize = 0; + return XXH_OK; +} + +XXH_errorcode XXH64_reset(XXH64_state_t* state_in, unsigned long long seed) +{ + XXH_istate64_t* state = (XXH_istate64_t*) state_in; + state->seed = seed; + state->v1 = seed + PRIME64_1 + PRIME64_2; + state->v2 = seed + PRIME64_2; + state->v3 = seed + 0; + state->v4 = seed - PRIME64_1; + state->total_len = 0; + state->memsize = 0; + return XXH_OK; +} + + +FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state_in, const void* input, size_t len, XXH_endianess endian) +{ + XXH_istate32_t* state = (XXH_istate32_t *) state_in; + const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + +#ifdef XXH_ACCEPT_NULL_INPUT_POINTER + if (input==NULL) return XXH_ERROR; +#endif + + state->total_len += len; + + if (state->memsize + len < 16) // fill in tmp buffer + { + XXH_memcpy(state->memory + state->memsize, input, len); + state->memsize += (U32)len; + return XXH_OK; + } + + if (state->memsize) // some data left from previous update + { + XXH_memcpy(state->memory + state->memsize, input, 16-state->memsize); + { + const U32* p32 = (const U32*)state->memory; + state->v1 += XXH_readLE32(p32, endian) * PRIME32_2; + state->v1 = XXH_rotl32(state->v1, 13); + state->v1 *= PRIME32_1; + p32++; + state->v2 += XXH_readLE32(p32, endian) * PRIME32_2; + state->v2 = XXH_rotl32(state->v2, 13); + state->v2 *= PRIME32_1; + p32++; + state->v3 += XXH_readLE32(p32, endian) * PRIME32_2; + state->v3 = XXH_rotl32(state->v3, 13); + state->v3 *= PRIME32_1; + p32++; + state->v4 += XXH_readLE32(p32, endian) * PRIME32_2; + state->v4 = XXH_rotl32(state->v4, 13); + state->v4 *= PRIME32_1; + p32++; + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) + { + const BYTE* const limit = bEnd - 16; + U32 v1 = state->v1; + U32 v2 = state->v2; + U32 v3 = state->v3; + U32 v4 = state->v4; + + do + { + v1 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; + v1 = XXH_rotl32(v1, 13); + v1 *= PRIME32_1; + p+=4; + v2 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; + v2 = XXH_rotl32(v2, 13); + v2 *= PRIME32_1; + p+=4; + v3 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; + v3 = XXH_rotl32(v3, 13); + v3 *= PRIME32_1; + p+=4; + v4 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; + v4 = XXH_rotl32(v4, 13); + v4 *= PRIME32_1; + p+=4; + } + while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) + { + XXH_memcpy(state->memory, p, bEnd-p); + state->memsize = (int)(bEnd-p); + } + + return XXH_OK; +} + +XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH32_update_endian(state_in, input, len, XXH_bigEndian); +} + + + +FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state_in, XXH_endianess endian) +{ + XXH_istate32_t* state = (XXH_istate32_t*) state_in; + const BYTE * p = (const BYTE*)state->memory; + BYTE* bEnd = (BYTE*)state->memory + state->memsize; + U32 h32; + + if (state->total_len >= 16) + { + h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18); + } + else + { + h32 = state->seed + PRIME32_5; + } + + h32 += (U32) state->total_len; + + while (p+4<=bEnd) + { + h32 += XXH_readLE32((const U32*)p, endian) * PRIME32_3; + h32 = 
XXH_rotl32(h32, 17) * PRIME32_4; + p+=4; + } + + while (p<bEnd) + { + h32 += (*p) * PRIME32_5; + h32 = XXH_rotl32(h32, 11) * PRIME32_1; + p++; + } + + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + + return h32; +} + + +U32 XXH32_digest (const XXH32_state_t* state_in) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_digest_endian(state_in, XXH_littleEndian); + else + return XXH32_digest_endian(state_in, XXH_bigEndian); +} + + +FORCE_INLINE XXH_errorcode XXH64_update_endian (XXH64_state_t* state_in, const void* input, size_t len, XXH_endianess endian) +{ + XXH_istate64_t * state = (XXH_istate64_t *) state_in; + const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + +#ifdef XXH_ACCEPT_NULL_INPUT_POINTER + if (input==NULL) return XXH_ERROR; +#endif + + state->total_len += len; + + if (state->memsize + len < 32) // fill in tmp buffer + { + XXH_memcpy(state->memory + state->memsize, input, len); + state->memsize += (U32)len; + return XXH_OK; + } + + if (state->memsize) // some data left from previous update + { + XXH_memcpy(state->memory + state->memsize, input, 32-state->memsize); + { + const U64* p64 = (const U64*)state->memory; + state->v1 += XXH_readLE64(p64, endian) * PRIME64_2; + state->v1 = XXH_rotl64(state->v1, 31); + state->v1 *= PRIME64_1; + p64++; + state->v2 += XXH_readLE64(p64, endian) * PRIME64_2; + state->v2 = XXH_rotl64(state->v2, 31); + state->v2 *= PRIME64_1; + p64++; + state->v3 += XXH_readLE64(p64, endian) * PRIME64_2; + state->v3 = XXH_rotl64(state->v3, 31); + state->v3 *= PRIME64_1; + p64++; + state->v4 += XXH_readLE64(p64, endian) * PRIME64_2; + state->v4 = XXH_rotl64(state->v4, 31); + state->v4 *= PRIME64_1; + p64++; + } + p += 32-state->memsize; + state->memsize = 0; + } + + if (p+32 <= bEnd) + { + const BYTE* const limit = bEnd - 32; + U64 v1 = state->v1; + U64 v2 = state->v2; + U64 v3 = state->v3; + U64 v4 = state->v4; + + do + { + v1 += XXH_readLE64((const U64*)p, endian) * PRIME64_2; + v1 = XXH_rotl64(v1, 31); + v1 *= PRIME64_1; + p+=8; + v2 += XXH_readLE64((const U64*)p, endian) * PRIME64_2; + v2 = XXH_rotl64(v2, 31); + v2 *= PRIME64_1; + p+=8; + v3 += XXH_readLE64((const U64*)p, endian) * PRIME64_2; + v3 = XXH_rotl64(v3, 31); + v3 *= PRIME64_1; + p+=8; + v4 += XXH_readLE64((const U64*)p, endian) * PRIME64_2; + v4 = XXH_rotl64(v4, 31); + v4 *= PRIME64_1; + p+=8; + } + while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) + { + XXH_memcpy(state->memory, p, bEnd-p); + state->memsize = (int)(bEnd-p); + } + + return XXH_OK; +} + +XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH64_update_endian(state_in, input, len, XXH_bigEndian); +} + + + +FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state_in, XXH_endianess endian) +{ + XXH_istate64_t * state = (XXH_istate64_t *) state_in; + const BYTE * p = (const BYTE*)state->memory; + BYTE* bEnd = (BYTE*)state->memory + state->memsize; + U64 h64; + + if (state->total_len >= 32) + { + U64 v1 = state->v1; + U64 v2 = state->v2; + U64 v3 = state->v3; + U64 v4 = state->v4; + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + + v1 *= PRIME64_2; + v1 = XXH_rotl64(v1, 31); + v1 *=
PRIME64_1; + h64 ^= v1; + h64 = h64*PRIME64_1 + PRIME64_4; + + v2 *= PRIME64_2; + v2 = XXH_rotl64(v2, 31); + v2 *= PRIME64_1; + h64 ^= v2; + h64 = h64*PRIME64_1 + PRIME64_4; + + v3 *= PRIME64_2; + v3 = XXH_rotl64(v3, 31); + v3 *= PRIME64_1; + h64 ^= v3; + h64 = h64*PRIME64_1 + PRIME64_4; + + v4 *= PRIME64_2; + v4 = XXH_rotl64(v4, 31); + v4 *= PRIME64_1; + h64 ^= v4; + h64 = h64*PRIME64_1 + PRIME64_4; + } + else + { + h64 = state->seed + PRIME64_5; + } + + h64 += (U64) state->total_len; + + while (p+8<=bEnd) + { + U64 k1 = XXH_readLE64((const U64*)p, endian); + k1 *= PRIME64_2; + k1 = XXH_rotl64(k1,31); + k1 *= PRIME64_1; + h64 ^= k1; + h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; + p+=8; + } + + if (p+4<=bEnd) + { + h64 ^= (U64)(XXH_readLE32((const U32*)p, endian)) * PRIME64_1; + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; + p+=4; + } + + while (p<bEnd) + { + h64 ^= (*p) * PRIME64_5; + h64 = XXH_rotl64(h64, 11) * PRIME64_1; + p++; + } + + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + + return h64; +} + + +unsigned long long XXH64_digest (const XXH64_state_t* state_in) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_digest_endian(state_in, XXH_littleEndian); + else + return XXH64_digest_endian(state_in, XXH_bigEndian); +} + + diff --git a/pandas/src/xxhash/xxhash.h b/pandas/src/xxhash/xxhash.h new file mode 100644 index 0000000000000..55b45015a447e --- /dev/null +++ b/pandas/src/xxhash/xxhash.h @@ -0,0 +1,156 @@ +/* + xxHash - Extremely Fast Hash algorithm + Header File + Copyright (C) 2012-2014, Yann Collet. + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - xxHash source repository : http://code.google.com/p/xxhash/ +*/ + +/* Notice extracted from xxHash homepage : + +xxHash is an extremely fast Hash algorithm, running at RAM speed limits. +It also successfully passes all tests from the SMHasher suite.
+ +Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) + +Name Speed Q.Score Author +xxHash 5.4 GB/s 10 +CrapWow 3.2 GB/s 2 Andrew +MumurHash 3a 2.7 GB/s 10 Austin Appleby +SpookyHash 2.0 GB/s 10 Bob Jenkins +SBox 1.4 GB/s 9 Bret Mulvey +Lookup3 1.2 GB/s 9 Bob Jenkins +SuperFastHash 1.2 GB/s 1 Paul Hsieh +CityHash64 1.05 GB/s 10 Pike & Alakuijala +FNV 0.55 GB/s 5 Fowler, Noll, Vo +CRC32 0.43 GB/s 9 +MD5-32 0.33 GB/s 10 Ronald L. Rivest +SHA1-32 0.28 GB/s 10 + +Q.Score is a measure of quality of the hash function. +It depends on successfully passing SMHasher test set. +10 is a perfect score. +*/ + +#pragma once + +#if defined (__cplusplus) +extern "C" { +#endif + + +/***************************** + Includes +*****************************/ +#include <stddef.h> /* size_t */ + + +/***************************** + Type +*****************************/ +typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; + + + +/***************************** + Simple Hash Functions +*****************************/ + +unsigned int XXH32 (const void* input, size_t length, unsigned seed); +unsigned long long XXH64 (const void* input, size_t length, unsigned long long seed); + +/* +XXH32() : + Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input". + The memory between input & input+length must be valid (allocated and read-accessible). + "seed" can be used to alter the result predictably. + This function successfully passes all SMHasher tests. + Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s +XXH64() : + Calculate the 64-bits hash of sequence of length "len" stored at memory address "input". +*/ + + + +/***************************** + Advanced Hash Functions +*****************************/ +typedef struct { long long ll[ 6]; } XXH32_state_t; +typedef struct { long long ll[11]; } XXH64_state_t; + +/* +These structures allow static allocation of XXH states. +States must then be initialized using XXHnn_reset() before first use. + +If you prefer dynamic allocation, please refer to functions below. +*/ + +XXH32_state_t* XXH32_createState(void); +XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); + +XXH64_state_t* XXH64_createState(void); +XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); + +/* +These functions create and release memory for XXH state. +States must then be initialized using XXHnn_reset() before first use. +*/ + + +XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, unsigned seed); +XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); +unsigned int XXH32_digest (const XXH32_state_t* statePtr); + +XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, unsigned long long seed); +XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); +unsigned long long XXH64_digest (const XXH64_state_t* statePtr); + +/* +These functions calculate the xxHash of an input provided in multiple smaller packets, +as opposed to an input provided as a single block. + +XXH state space must first be allocated, using either static or dynamic method provided above. + +Start a new hash by initializing state with a seed, using XXHnn_reset(). + +Then, feed the hash state by calling XXHnn_update() as many times as necessary. +Obviously, input must be valid, meaning allocated and read accessible. +The function returns an error code, with 0 meaning OK, and any other value meaning there is an error. + +Finally, you can produce a hash anytime, by using XXHnn_digest().
diff --git a/setup.py b/setup.py
index a14f831613696..f9fbdae83d233 100755
--- a/setup.py
+++ b/setup.py
@@ -275,7 +275,7 @@ def initialize_options(self):
                  'JSONtoObj.c',
                  'ultrajsonenc.c',
                  'ultrajsondec.c',
-                 ]
+                 'xxhash.c']

         for root, dirs, files in os.walk('pandas'):
             for f in files:
@@ -452,7 +452,11 @@ def pxd(name):
          'pxdfiles': [],
          'depends': lib_depends},
     hashtable={'pyxfile': 'hashtable',
-               'pxdfiles': ['hashtable']},
+               'pxdfiles': ['hashtable'],
+               'depends': ['pandas/src/xxhash/xxhash.h',
+                           'pandas/src/klib/khash_python.h',
+                           'pandas/src/klib/khash.h'],
+               'sources': ['pandas/src/xxhash/xxhash.c']},
     tslib={'pyxfile': 'tslib',
            'depends': tseries_depends,
            'sources': ['pandas/src/datetime/np_datetime.c',
@@ -467,9 +471,14 @@ def pxd(name):
     parser=dict(pyxfile='parser',
                 depends=['pandas/src/parser/tokenizer.h',
                          'pandas/src/parser/io.h',
-                         'pandas/src/numpy_helper.h'],
+                         'pandas/src/numpy_helper.h',
+                         'pandas/src/xxhash/xxhash.h',
+                         'pandas/src/klib/khash_python.h',
+                         'pandas/src/klib/khash.h'],
                 sources=['pandas/src/parser/tokenizer.c',
-                         'pandas/src/parser/io.c'])
+                         'pandas/src/parser/io.c',
+                         'pandas/src/xxhash/xxhash.c'],
+                libraries=['hashtable'])
)

extensions = []
diff --git a/vb_suite/factorize.py b/vb_suite/factorize.py
new file mode 100644
index 0000000000000..21ce350d91262
--- /dev/null
+++ b/vb_suite/factorize.py
@@ -0,0 +1,62 @@
+from vbench.api import Benchmark
+from datetime import datetime
+
+START_DATE = datetime(2014, 10, 13)
+
+# GH 8524
+
+common_setup = """from pandas_vb_common import *
+from pandas import factorize
+SIZE = 1000000
+indices = np.random.randint(100, size=SIZE)
+"""
+
+
+# --- Integer array factorization
+setup = common_setup + """
+int_values_uniq = np.arange(SIZE) * 100
+"""
+factorize_int_uniq = Benchmark("factorize(int_values_uniq)", setup,
+                               start_date=START_DATE)
+setup = common_setup + """
+int_values_dup = (np.arange(SIZE) * 100).take(indices)
+"""
+factorize_int_dup = Benchmark("factorize(int_values_dup)", setup,
+                              start_date=START_DATE)
+
+
+# --- String array factorization
+setup = common_setup + """
+str_values_uniq = tm.makeStringIndex(SIZE)
+"""
+factorize_str_uniq = Benchmark("factorize(str_values_uniq)", setup=setup,
+                               start_date=START_DATE)
+setup = common_setup + """
+str_values_dup = tm.makeStringIndex(SIZE).take(indices)
+"""
+factorize_str_dup = Benchmark("factorize(str_values_dup)", setup=setup,
+                              start_date=START_DATE)
+setup = common_setup + """
+shortstr_4_dup = Index(np.take(['AA', 'BB', 'CC', 'DD'],
+                               np.random.randint(4, size=SIZE)))
+"""
+factorize_shortstr_4_dup = Benchmark("factorize(shortstr_4_dup)",
+                                     setup=setup, start_date=START_DATE)
+setup = common_setup + """
+shortstr_many_dup = tm.rands_array(2, SIZE)
+"""
+factorize_shortstr_many_dup = Benchmark("factorize(shortstr_many_dup)",
+                                        setup=setup, start_date=START_DATE)
+
+
+# --- Float array factorization
+setup = common_setup + """
+float_values_uniq = np.linspace(0., 1., num=SIZE) * 100
+"""
+factorize_float_uniq = Benchmark("factorize(float_values_uniq)", setup=setup,
+                                 start_date=START_DATE)
+setup = common_setup + """
+float_values_dup = (np.linspace(0., 1., num=SIZE) * 100).take(indices)
+"""
+factorize_float_dup = Benchmark("factorize(float_values_dup)", setup,
+                                start_date=START_DATE)
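All of the benchmarks above exercise the same call. For context, `factorize` maps each element of an array to an integer label and collects the distinct values, which is exactly the operation this PR is optimizing. A minimal check, illustrative only and not part of the diff:

import numpy as np
from pandas import factorize

values = np.array(['AA', 'BB', 'AA', 'CC', 'BB'], dtype=object)
labels, uniques = factorize(values)

# Each element is replaced by the index of its first occurrence,
# so labels is [0 1 0 2 1] and uniques holds the distinct values
# in order of first appearance: 'AA', 'BB', 'CC'.
print(labels)
print(uniques)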
Benchmark("factorize(float_values_dup)", setup, + start_date=START_DATE) diff --git a/vb_suite/suite.py b/vb_suite/suite.py index a16d183ae62e2..e9d325d2ef543 100644 --- a/vb_suite/suite.py +++ b/vb_suite/suite.py @@ -6,6 +6,7 @@ modules = ['attrs_caching', 'binary_ops', 'ctors', + 'factorize', 'frame_ctor', 'frame_methods', 'groupby',