From c4bf1edc59660554a68e8dad3788a0f55ed1a244 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Wed, 3 Feb 2021 22:18:40 +0100 Subject: [PATCH 1/3] taking upper 32bit of PyHash into account as well --- asv_bench/benchmarks/hash_functions.py | 9 +++++++++ pandas/_libs/src/klib/khash_python.h | 28 +++++++++++++++++++++----- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/hash_functions.py b/asv_bench/benchmarks/hash_functions.py index 5227ad0f53a04..3743882b936e2 100644 --- a/asv_bench/benchmarks/hash_functions.py +++ b/asv_bench/benchmarks/hash_functions.py @@ -25,6 +25,15 @@ def time_isin_outside(self, dtype, exponent): self.s.isin(self.values_outside) +class UniqueForLargePyObjectInts: + def setup(self): + lst = [x << 32 for x in range(5000)] + self.arr = np.array(lst, dtype=np.object_) + + def time_unique(self): + pd.unique(self.arr) + + class IsinWithRandomFloat: params = [ [np.float64, np.object], diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index 0073aaf0195c7..9d30c87723c8a 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -178,11 +178,29 @@ int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { return result; } -// For PyObject_Hash holds: -// hash(0.0) == 0 == hash(-0.0) -// hash(X) == 0 if X is a NaN-value -// so it is OK to use it directly -#define kh_python_hash_func(key) (PyObject_Hash(key)) + +khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){ + // For PyObject_Hash holds: + // hash(0.0) == 0 == hash(-0.0) + // hash(X) == 0 if X is a NaN-value + // so it is OK to use it directly for doubles + Py_hash_t hash = PyObject_Hash(key); + if (hash == -1) { + PyErr_Clear(); + return 0; + } + #if SIZEOF_PY_HASH_T == 4 + // it is already 32bit value + return hash; + #else + // for 64bit builds, + // we need information of the upper 32bits as well + // see GH 37615 + return kh_int64_hash_func(hash); + #endif +} + + 
#define kh_python_hash_equal(a, b) (pyobject_cmp(a, b)) From 57dceb17cb432564d2f2a4d3dab0a64df65df2f0 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Thu, 4 Feb 2021 21:39:45 +0100 Subject: [PATCH 2/3] using a simpler hash, minimizing changes to the original state --- pandas/_libs/src/klib/khash_python.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index 9d30c87723c8a..aee018262e3a6 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -196,7 +196,9 @@ khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){ // for 64bit builds, // we need information of the upper 32bits as well // see GH 37615 - return kh_int64_hash_func(hash); + khuint64_t as_uint = (khuint64_t) hash; + // shift as unsigned: right-shifting a negative signed int is implementation-defined + return (as_uint>>32)^as_uint; #endif } From 1a8c2dd190e61e3c991a208aef9cb1926eaa8014 Mon Sep 17 00:00:00 2001 From: Egor Dranischnikow Date: Fri, 5 Feb 2021 23:37:53 +0100 Subject: [PATCH 3/3] adding whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 17d8c79994dbe..d5c0551abbdd7 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -253,6 +253,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.corr` for method=kendall (:issue:`28329`) - Performance improvement in :meth:`core.window.rolling.Rolling.corr` and :meth:`core.window.rolling.Rolling.cov` (:issue:`39388`) - Performance improvement in :meth:`core.window.rolling.RollingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` (:issue:`39591`) +- Performance improvement in :func:`unique` for object data type (:issue:`37615`) .. 
---------------------------------------------------------------------------