Skip to content

Commit 908161b

Browse files
committed
ENH: add reduction option to hash_pandas_object
xref pandas-dev#14729
1 parent 1b0333b commit 908161b

File tree

2 files changed

+46
-2
lines changed

2 files changed

+46
-2
lines changed

pandas/tools/hashing.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,19 @@
1212
_default_hash_key = '0123456789123456'
1313

1414

15-
def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None):
15+
def reducer(arr):
    """
    Collapse an array of hash values into a single value.

    Parameters
    ----------
    arr : array-like
        values to combine; must support ``len``

    Returns
    -------
    scalar
        the bitwise XOR of every element of ``arr``, or ``np.nan`` when
        ``arr`` is empty
    """
    if len(arr):
        # fold all elements together with XOR
        return np.bitwise_xor.reduce(arr)

    # nothing to combine: signal "no hash" with a missing value
    return np.nan
24+
25+
26+
def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
27+
reduce=False):
1628
"""
1729
Return a data hash of the Index/Series/DataFrame
1830
@@ -25,6 +37,8 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None):
2537
encoding : string, default 'utf8'
2638
encoding for data & key when strings
2739
hash_key : string key to encode, default to _default_hash_key
40+
reduce : boolean, default False
41+
produce a single hash result
2842
2943
Returns
3044
-------
@@ -65,10 +79,14 @@ def adder(h, hashed_to_add):
6579
h = Series(h, index=obj.index, dtype='uint64')
6680
else:
6781
raise TypeError("Unexpected type for hashing %s" % type(obj))
82+
83+
if reduce:
84+
h = reducer(h.values)
85+
6886
return h
6987

7088

71-
def hash_array(vals, encoding='utf8', hash_key=None):
89+
def hash_array(vals, encoding='utf8', hash_key=None, reduce=False):
7290
"""
7391
Given a 1d array, return an array of deterministic integers.
7492
@@ -80,6 +98,8 @@ def hash_array(vals, encoding='utf8', hash_key=None):
8098
encoding : string, default 'utf8'
8199
encoding for data & key when strings
82100
hash_key : string key to encode, default to _default_hash_key
101+
reduce : boolean, default False
102+
produce a single hash result
83103
84104
Returns
85105
-------
@@ -134,4 +154,8 @@ def hash_array(vals, encoding='utf8', hash_key=None):
134154
vals ^= vals >> 27
135155
vals *= np.uint64(0x94d049bb133111eb)
136156
vals ^= vals >> 31
157+
158+
if reduce:
159+
vals = reducer(vals)
160+
137161
return vals

pandas/tools/tests/test_hashing.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
from pandas import DataFrame, Series, Index
55
from pandas.tools.hashing import hash_array, hash_pandas_object
6+
from pandas.types.common import is_scalar
67
import pandas.util.testing as tm
78

89

@@ -31,6 +32,20 @@ def test_consistency(self):
3132
index=['foo', 'bar', 'baz'])
3233
tm.assert_series_equal(result, expected)
3334

35+
def test_reduce_consistency(self):
    """Pin the reduced hash of small ordered categoricals to known values."""
    # baseline: integer values in ascending order
    c = Series(pd.Categorical([1, 2], ordered=True))

    result = hash_pandas_object(c, reduce=True)
    self.assertEqual(result, 6505546918052763540)

    # the same values in the opposite order are expected to reduce
    # to a different hash
    c = Series(pd.Categorical([2, 1], ordered=True))
    result = hash_pandas_object(c, reduce=True)
    self.assertEqual(result, 6135121269729399882)

    # float values are expected to reduce to the same hash as the
    # equal integer values in the baseline case
    c = Series(pd.Categorical([1., 2.], ordered=True))
    result = hash_pandas_object(c, reduce=True)
    self.assertEqual(result, 6505546918052763540)
48+
3449
def test_hash_array(self):
3550
for name, s in self.df.iteritems():
3651
a = s.values
@@ -46,6 +61,11 @@ def check_equal(self, obj, **kwargs):
4661
b = hash_pandas_object(obj, **kwargs)
4762
tm.assert_series_equal(a, b)
4863

64+
# test reducing
65+
kwargs['reduce'] = True
66+
result = hash_pandas_object(obj, **kwargs)
67+
self.assertTrue(is_scalar(result))
68+
4969
def check_not_equal_with_index(self, obj):
5070

5171
# check that we are not hashing the same if

0 commit comments

Comments
 (0)