From 28451006c489ac72cfb1deb43b1c05024d8cf746 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 10 Sep 2020 18:44:18 -0700 Subject: [PATCH 1/2] PERF: CategoricalDtype.__eq__ --- pandas/core/dtypes/dtypes.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index e321fdd9b3a9b..9bbed0ec2f236 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -375,12 +375,28 @@ def __eq__(self, other: Any) -> bool: # but same order is not necessary. There is no distinction between # ordered=False and ordered=None: CDT(., False) and CDT(., None) # will be equal if they have the same categories. - if ( - self.categories.dtype == other.categories.dtype - and self.categories.equals(other.categories) - ): + left = self.categories + right = other.categories + if not left.dtype == right.dtype: + return False + + if len(left) != len(right): + return False + + if self.categories.equals(other.categories): # Check and see if they happen to be identical categories return True + + if left.dtype != object: + # Faster than calculating hash + indexer = left.get_indexer(right) + # Because left and right have the same length and are unique, + # `indexer` not having any -1s implies that there is a + # bijection between `left` and `right`. + return (indexer != -1).all() + + # With object-dtype we need a comparison that identifies + # e.g. int(2) as distinct from float(2) return hash(self) == hash(other) def __repr__(self) -> str_type: From 0091f5fcd7cfeee43219f3fa282883f627991fcd Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 12 Sep 2020 14:41:58 -0700 Subject: [PATCH 2/2] comment --- pandas/core/dtypes/dtypes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 9bbed0ec2f236..2e5dc15131e70 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -377,6 +377,8 @@ def __eq__(self, other: Any) -> bool: # will be equal if they have the same categories. left = self.categories right = other.categories + + # GH#36280 the ordering of checks here is for performance if not left.dtype == right.dtype: return False