diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index a6ba7770dadcc..d5231e63b7e6e 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -912,6 +912,7 @@ Other - Bug in :meth:`Series.diff` where a boolean series would incorrectly raise a ``TypeError`` (:issue:`17294`) - :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`) - Fix corrupted error message when calling ``pandas.libs._json.encode()`` on a 0d array (:issue:`18878`) +- Bug in ``pd.core.util.hashing.hash_pandas_object`` where arrays containing tuples were incorrectly treated as non-hashable (:issue:`28969`) - Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`) - Fix :class:`AbstractHolidayCalendar` to return correct results for years after 2030 (now goes up to 2200) (:issue:`27790`) diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index d735890f7d07e..5298d8c5ed34e 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -70,6 +70,12 @@ def hash_object_array(object[:] arr, object key, object encoding='utf8'): # null, stringify and encode data = str(val).encode(encoding) + elif isinstance(val, tuple): + # GH#28969 we could have a tuple, but need to ensure that + # the tuple entries are themselves hashable before converting + # to str + hash(val) + data = str(val).encode(encoding) else: raise TypeError(f"{val} of type {type(val)} is not a valid type " "for hashing, must be string or null") diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index fa3582755a202..43655fa3ea913 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -85,11 +85,12 @@ def hash_pandas_object( if isinstance(obj, ABCMultiIndex): return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False) - if isinstance(obj, ABCIndexClass): + elif isinstance(obj, ABCIndexClass): h = hash_array(obj.values, encoding, hash_key, categorize).astype( "uint64", copy=False ) h = Series(h, index=obj, dtype="uint64", copy=False) + elif isinstance(obj, ABCSeries): h = hash_array(obj.values, encoding, hash_key, categorize).astype( "uint64", copy=False diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index ebbdbd6c29842..ee9c4ed12bd92 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -353,3 +353,24 @@ def test_hash_collisions(): result = hash_array(np.asarray(hashes, dtype=object), "utf8") tm.assert_numpy_array_equal(result, np.concatenate([expected1, expected2], axis=0)) + + +def test_hash_with_tuple(): + # GH#28969 array containing a tuple raises on call to arr.astype(str) + # apparently a numpy bug github.com/numpy/numpy/issues/9441 + + df = pd.DataFrame({"data": [tuple("1"), tuple("2")]}) + result = hash_pandas_object(df) + expected = pd.Series([10345501319357378243, 8331063931016360761], dtype=np.uint64) + tm.assert_series_equal(result, expected) + + df2 = pd.DataFrame({"data": [tuple([1]), tuple([2])]}) + result = hash_pandas_object(df2) + expected = pd.Series([9408946347443669104, 3278256261030523334], dtype=np.uint64) + tm.assert_series_equal(result, expected) + + # require that the elements of such tuples are themselves hashable + + df3 = pd.DataFrame({"data": [tuple([1, []]), tuple([2, {}])]}) + with pytest.raises(TypeError, match="unhashable type: 'list'"): + hash_pandas_object(df3)