diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx
index 2c7780e0d95fd..3527fe2d8cd8d 100644
--- a/pandas/_libs/hashtable.pyx
+++ b/pandas/_libs/hashtable.pyx
@@ -19,6 +19,7 @@ from pandas._libs.khash cimport (
     are_equivalent_float64_t,
     are_equivalent_khcomplex64_t,
     are_equivalent_khcomplex128_t,
+    kh_needed_n_buckets,
    kh_str_t,
     khcomplex64_t,
     khcomplex128_t,
@@ -152,7 +153,7 @@ def unique_label_indices(const int64_t[:] labels):
         ndarray[int64_t, ndim=1] arr
         Int64VectorData *ud = idx.data
 
-    kh_resize_int64(table, min(n, SIZE_HINT_LIMIT))
+    kh_resize_int64(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
 
     with nogil:
         for i in range(n):
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index a3e72ed858392..0b6bb170cc531 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -392,9 +392,8 @@ cdef class {{name}}HashTable(HashTable):
 
     def __cinit__(self, int64_t size_hint=1):
         self.table = kh_init_{{dtype}}()
-        if size_hint is not None:
-            size_hint = min(size_hint, SIZE_HINT_LIMIT)
-            kh_resize_{{dtype}}(self.table, size_hint)
+        size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
+        kh_resize_{{dtype}}(self.table, size_hint)
 
     def __len__(self) -> int:
         return self.table.size
@@ -420,6 +419,15 @@ cdef class {{name}}HashTable(HashTable):
                     sizeof(Py_ssize_t))   # vals
         return overhead + for_flags + for_pairs
 
+    def get_state(self):
+        """Returns info about the state of the hashtable."""
+        return {
+            'n_buckets': self.table.n_buckets,
+            'size': self.table.size,
+            'n_occupied': self.table.n_occupied,
+            'upper_bound': self.table.upper_bound,
+        }
+
     cpdef get_item(self, {{dtype}}_t val):
         cdef:
             khiter_t k
@@ -731,9 +739,8 @@ cdef class StringHashTable(HashTable):
 
     def __init__(self, int64_t size_hint=1):
         self.table = kh_init_str()
-        if size_hint is not None:
-            size_hint = min(size_hint, SIZE_HINT_LIMIT)
-            kh_resize_str(self.table, size_hint)
+        size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
+        kh_resize_str(self.table, size_hint)
 
     def __dealloc__(self):
         if self.table is not NULL:
@@ -747,6 +754,15 @@ cdef class StringHashTable(HashTable):
                     sizeof(Py_ssize_t))   # vals
         return overhead + for_flags + for_pairs
 
+    def get_state(self):
+        """Returns info about the state of the hashtable."""
+        return {
+            'n_buckets': self.table.n_buckets,
+            'size': self.table.size,
+            'n_occupied': self.table.n_occupied,
+            'upper_bound': self.table.upper_bound,
+        }
+
     cpdef get_item(self, str val):
         cdef:
             khiter_t k
@@ -1044,9 +1060,8 @@ cdef class PyObjectHashTable(HashTable):
 
     def __init__(self, int64_t size_hint=1):
         self.table = kh_init_pymap()
-        if size_hint is not None:
-            size_hint = min(size_hint, SIZE_HINT_LIMIT)
-            kh_resize_pymap(self.table, size_hint)
+        size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
+        kh_resize_pymap(self.table, size_hint)
 
     def __dealloc__(self):
         if self.table is not NULL:
@@ -1072,6 +1087,18 @@ cdef class PyObjectHashTable(HashTable):
                     sizeof(Py_ssize_t))   # vals
         return overhead + for_flags + for_pairs
 
+    def get_state(self):
+        """
+        Returns info about the current state of the hashtable, such as
+        its size and the number of buckets.
+        """
+        return {
+            'n_buckets': self.table.n_buckets,
+            'size': self.table.size,
+            'n_occupied': self.table.n_occupied,
+            'upper_bound': self.table.upper_bound,
+        }
+
     cpdef get_item(self, object val):
         cdef:
             khiter_t k
diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in
index 4684ecb8716c0..772d83e67394c 100644
--- a/pandas/_libs/hashtable_func_helper.pxi.in
+++ b/pandas/_libs/hashtable_func_helper.pxi.in
@@ -121,7 +121,7 @@ def duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first'):
         kh_{{ttype}}_t *table = kh_init_{{ttype}}()
         ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
 
-    kh_resize_{{ttype}}(table, min(n, SIZE_HINT_LIMIT))
+    kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
 
     if keep not in ('last', 'first', False):
         raise ValueError('keep must be either "first", "last" or False')
diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd
index 82b8ca4d443db..ba805e9ff1251 100644
--- a/pandas/_libs/khash.pxd
+++ b/pandas/_libs/khash.pxd
@@ -120,4 +120,7 @@ cdef extern from "khash_python.h":
 
     bint kh_exist_strbox(kh_strbox_t*, khiter_t) nogil
 
+    khuint_t kh_needed_n_buckets(khuint_t element_n) nogil
+
+
 include "khash_for_primitive_helper.pxi"
diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h
index fc7a650eebba4..0073aaf0195c7 100644
--- a/pandas/_libs/src/klib/khash_python.h
+++ b/pandas/_libs/src/klib/khash_python.h
@@ -244,3 +244,13 @@ void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) {
 void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khuint_t val) {
     kh_resize_str(table->table, val);
 }
+
+// utility function: given the number of elements,
+// returns the number of buckets needed
+khuint_t PANDAS_INLINE kh_needed_n_buckets(khuint_t n_elements){
+    khuint_t candidate = n_elements;
+    kroundup32(candidate);
+    khuint_t upper_bound = (khuint_t)(candidate * __ac_HASH_UPPER + 0.5);
+    return (upper_bound < n_elements) ? 2*candidate : candidate;
+
+}
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index c7585b21abe99..3d4d0d20f70bb 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -5451,15 +5451,13 @@ def duplicated(
         4     True
         dtype: bool
         """
-        from pandas._libs.hashtable import SIZE_HINT_LIMIT, duplicated_int64
+        from pandas._libs.hashtable import duplicated_int64
 
         if self.empty:
             return self._constructor_sliced(dtype=bool)
 
         def f(vals):
-            labels, shape = algorithms.factorize(
-                vals, size_hint=min(len(self), SIZE_HINT_LIMIT)
-            )
+            labels, shape = algorithms.factorize(vals, size_hint=len(self))
             return labels.astype("i8", copy=False), len(shape)
 
         if subset is None:
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 2c2e0c16a4482..cfbabab491ae4 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -593,7 +593,7 @@ def compress_group_index(group_index, sort: bool = True):
     space can be huge, so this function compresses it, by computing offsets
     (comp_ids) into the list of unique labels (obs_group_ids).
     """
-    size_hint = min(len(group_index), hashtable.SIZE_HINT_LIMIT)
+    size_hint = len(group_index)
     table = hashtable.Int64HashTable(size_hint)
 
     group_index = ensure_int64(group_index)
diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py
index 8393312004241..a28e2f22560eb 100644
--- a/pandas/tests/libs/test_hashtable.py
+++ b/pandas/tests/libs/test_hashtable.py
@@ -155,6 +155,28 @@ def test_tracemalloc_for_empty(self, table_type, dtype):
         del table
         assert get_allocated_khash_memory() == 0
 
+    def test_get_state(self, table_type, dtype):
+        table = table_type(1000)
+        state = table.get_state()
+        assert state["size"] == 0
+        assert state["n_occupied"] == 0
+        assert "n_buckets" in state
+        assert "upper_bound" in state
+
+    def test_no_reallocation(self, table_type, dtype):
+        for N in range(1, 110):
+            keys = np.arange(N).astype(dtype)
+            preallocated_table = table_type(N)
+            n_buckets_start = preallocated_table.get_state()["n_buckets"]
+            preallocated_table.map_locations(keys)
+            n_buckets_end = preallocated_table.get_state()["n_buckets"]
+            # original number of buckets was enough:
+            assert n_buckets_start == n_buckets_end
+            # check with a clean table (nothing excessive preallocated)
+            clean_table = table_type()
+            clean_table.map_locations(keys)
+            assert n_buckets_start == clean_table.get_state()["n_buckets"]
+
 
 def test_get_labels_groupby_for_Int64(writable):
     table = ht.Int64HashTable()
@@ -190,6 +212,21 @@ def test_tracemalloc_for_empty_StringHashTable():
     assert get_allocated_khash_memory() == 0
 
 
+def test_no_reallocation_StringHashTable():
+    for N in range(1, 110):
+        keys = np.arange(N).astype(np.compat.unicode).astype(np.object_)
+        preallocated_table = ht.StringHashTable(N)
+        n_buckets_start = preallocated_table.get_state()["n_buckets"]
+        preallocated_table.map_locations(keys)
+        n_buckets_end = preallocated_table.get_state()["n_buckets"]
+        # original number of buckets was enough:
+        assert n_buckets_start == n_buckets_end
+        # check with a clean table (nothing excessive preallocated)
+        clean_table = ht.StringHashTable()
+        clean_table.map_locations(keys)
+        assert n_buckets_start == clean_table.get_state()["n_buckets"]
+
+
 @pytest.mark.parametrize(
     "table_type, dtype",
     [