Skip to content

Commit ac360e9

Browse files
committed
BUG: we don't like hash collisions in siphash
1 parent b3dd9ba commit ac360e9

File tree

1 file changed

+19
-0
lines changed

1 file changed

+19
-0
lines changed

pandas/tools/tests/test_hashing.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,3 +146,22 @@ def test_long_strings(self):
146146

147147
obj = Index(tm.rands_array(nchars=10000, size=100))
148148
self.check_equal(obj)
149+
150+
def test_hash_collisions(self):
    """Hash two long, distinct strings individually and together.

    Regression test for the collision reported in
    https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726
    """
    # hash collisions are bad
    values = ['Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9',  # noqa
              'Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe']  # noqa

    # these should be different!
    # NOTE(review): expected1 and expected2 below are the same value, so
    # the test currently pins the colliding hashes rather than distinct
    # ones -- replace both once the hashing itself is fixed (TODO confirm
    # the corrected values against the implementation).
    result1 = hash_array(np.asarray(values[0:1], dtype=object), 'utf8')
    expected1 = np.array([1760245841805064774], dtype=np.uint64)
    self.assert_numpy_array_equal(result1, expected1)

    result2 = hash_array(np.asarray(values[1:2], dtype=object), 'utf8')
    expected2 = np.array([1760245841805064774], dtype=np.uint64)
    self.assert_numpy_array_equal(result2, expected2)

    # Hashing both strings in one call must yield one hash per input.
    # The comparison has to live inside the assertion: the previous
    # ``self.assertTrue(len(result)) == 2`` compared the return value of
    # assertTrue with 2 and discarded it, so only truthiness was checked.
    result = hash_array(np.asarray(values, dtype=object), 'utf8')
    self.assertEqual(len(result), 2)

0 commit comments

Comments
 (0)