 
 class TestHashing(object):
 
-    def setup_method(self, method):
-        self.df = DataFrame(
-            {'i32': np.array([1, 2, 3] * 3, dtype='int32'),
-             'f32': np.array([None, 2.5, 3.5] * 3, dtype='float32'),
-             'cat': Series(['a', 'b', 'c'] * 3).astype('category'),
-             'obj': Series(['d', 'e', 'f'] * 3),
-             'bool': np.array([True, False, True] * 3),
-             'dt': Series(pd.date_range('20130101', periods=9)),
-             'dt_tz': Series(pd.date_range('20130101', periods=9,
-                                           tz='US/Eastern')),
-             'td': Series(pd.timedelta_range('2000', periods=9))})
+    @pytest.fixture(params=[
+        Series([1, 2, 3] * 3, dtype='int32'),
+        Series([None, 2.5, 3.5] * 3, dtype='float32'),
+        Series(['a', 'b', 'c'] * 3, dtype='category'),
+        Series(['d', 'e', 'f'] * 3),
+        Series([True, False, True] * 3),
+        Series(pd.date_range('20130101', periods=9)),
+        Series(pd.date_range('20130101', periods=9, tz='US/Eastern')),
+        Series(pd.timedelta_range('2000', periods=9))])
+    def series(self, request):
+        return request.param
 
     def test_consistency(self):
         # check that our hash doesn't change because of a mistake
@@ -34,10 +34,9 @@ def test_consistency(self):
                           index=['foo', 'bar', 'baz'])
         tm.assert_series_equal(result, expected)
 
-    def test_hash_array(self):
-        for name, s in self.df.iteritems():
-            a = s.values
-            tm.assert_numpy_array_equal(hash_array(a), hash_array(a))
+    def test_hash_array(self, series):
+        a = series.values
+        tm.assert_numpy_array_equal(hash_array(a), hash_array(a))
 
     def test_hash_array_mixed(self):
         result1 = hash_array(np.array([3, 4, 'All']))
@@ -46,10 +45,11 @@ def test_hash_array_mixed(self):
         tm.assert_numpy_array_equal(result1, result2)
         tm.assert_numpy_array_equal(result1, result3)
 
-    def test_hash_array_errors(self):
-
-        for val in [5, 'foo', pd.Timestamp('20130101')]:
-            pytest.raises(TypeError, hash_array, val)
+    @pytest.mark.parametrize('val', [5, 'foo', pd.Timestamp('20130101')])
+    def test_hash_array_errors(self, val):
+        msg = 'must pass a ndarray-like'
+        with tm.assert_raises_regex(TypeError, msg):
+            hash_array(val)
 
     def check_equal(self, obj, **kwargs):
         a = hash_pandas_object(obj, **kwargs)
@@ -80,31 +80,33 @@ def test_hash_tuples(self):
         result = hash_tuples(tups[0])
         assert result == expected[0]
 
-    def test_hash_tuple(self):
+    @pytest.mark.parametrize('tup', [
+        (1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'),
+        ('A', pd.Timestamp("2012-01-01"))])
+    def test_hash_tuple(self, tup):
         # test equivalence between hash_tuples and hash_tuple
-        for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'),
-                    ('A', pd.Timestamp("2012-01-01"))]:
-            result = hash_tuple(tup)
-            expected = hash_tuples([tup])[0]
-            assert result == expected
-
-    def test_hash_scalar(self):
-        for val in [1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"),
-                    pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
-                    datetime.datetime(2012, 1, 1),
-                    pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(),
-                    pd.Timedelta('1 days'), datetime.timedelta(1),
-                    pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1),
-                    np.nan, pd.NaT, None]:
-            result = _hash_scalar(val)
-            expected = hash_array(np.array([val], dtype=object),
-                                  categorize=True)
-            assert result[0] == expected[0]
-
-    def test_hash_tuples_err(self):
-
-        for val in [5, 'foo', pd.Timestamp('20130101')]:
-            pytest.raises(TypeError, hash_tuples, val)
+        result = hash_tuple(tup)
+        expected = hash_tuples([tup])[0]
+        assert result == expected
+
+    @pytest.mark.parametrize('val', [
+        1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"),
+        pd.Timestamp("2012-01-01", tz='Europe/Brussels'),
+        datetime.datetime(2012, 1, 1),
+        pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(),
+        pd.Timedelta('1 days'), datetime.timedelta(1),
+        pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1),
+        np.nan, pd.NaT, None])
+    def test_hash_scalar(self, val):
+        result = _hash_scalar(val)
+        expected = hash_array(np.array([val], dtype=object), categorize=True)
+        assert result[0] == expected[0]
+
+    @pytest.mark.parametrize('val', [5, 'foo', pd.Timestamp('20130101')])
+    def test_hash_tuples_err(self, val):
+        msg = 'must be convertible to a list-of-tuples'
+        with tm.assert_raises_regex(TypeError, msg):
+            hash_tuples(val)
 
     def test_multiindex_unique(self):
         mi = MultiIndex.from_tuples([(118, 472), (236, 118),
@@ -172,36 +174,35 @@ def test_hash_pandas_object(self, obj):
         self.check_equal(obj)
         self.check_not_equal_with_index(obj)
 
-    def test_hash_pandas_object2(self):
-        for name, s in self.df.iteritems():
-            self.check_equal(s)
-            self.check_not_equal_with_index(s)
-
-    def test_hash_pandas_empty_object(self):
-        for obj in [Series([], dtype='float64'),
-                    Series([], dtype='object'),
-                    Index([])]:
-            self.check_equal(obj)
+    def test_hash_pandas_object2(self, series):
+        self.check_equal(series)
+        self.check_not_equal_with_index(series)
 
-            # these are by-definition the same with
-            # or w/o the index as the data is empty
+    @pytest.mark.parametrize('obj', [
+        Series([], dtype='float64'), Series([], dtype='object'), Index([])])
+    def test_hash_pandas_empty_object(self, obj):
+        # these are by-definition the same with
+        # or w/o the index as the data is empty
+        self.check_equal(obj)
 
-    def test_categorical_consistency(self):
+    @pytest.mark.parametrize('s1', [
+        Series(['a', 'b', 'c', 'd']),
+        Series([1000, 2000, 3000, 4000]),
+        Series(pd.date_range(0, periods=4))])
+    @pytest.mark.parametrize('categorize', [True, False])
+    def test_categorical_consistency(self, s1, categorize):
         # GH15143
         # Check that categoricals hash consistent with their values, not codes
         # This should work for categoricals of any dtype
-        for s1 in [Series(['a', 'b', 'c', 'd']),
-                   Series([1000, 2000, 3000, 4000]),
-                   Series(pd.date_range(0, periods=4))]:
-            s2 = s1.astype('category').cat.set_categories(s1)
-            s3 = s2.cat.set_categories(list(reversed(s1)))
-            for categorize in [True, False]:
-                # These should all hash identically
-                h1 = hash_pandas_object(s1, categorize=categorize)
-                h2 = hash_pandas_object(s2, categorize=categorize)
-                h3 = hash_pandas_object(s3, categorize=categorize)
-                tm.assert_series_equal(h1, h2)
-                tm.assert_series_equal(h1, h3)
+        s2 = s1.astype('category').cat.set_categories(s1)
+        s3 = s2.cat.set_categories(list(reversed(s1)))
+
+        # These should all hash identically
+        h1 = hash_pandas_object(s1, categorize=categorize)
+        h2 = hash_pandas_object(s2, categorize=categorize)
+        h3 = hash_pandas_object(s3, categorize=categorize)
+        tm.assert_series_equal(h1, h2)
+        tm.assert_series_equal(h1, h3)
 
     def test_categorical_with_nan_consistency(self):
         c = pd.Categorical.from_codes(
@@ -216,13 +217,12 @@ def test_categorical_with_nan_consistency(self):
         assert result[1] in expected
 
     def test_pandas_errors(self):
-
-        for obj in [pd.Timestamp('20130101')]:
-            with pytest.raises(TypeError):
-                hash_pandas_object(obj)
+        with pytest.raises(TypeError):
+            hash_pandas_object(pd.Timestamp('20130101'))
 
         with catch_warnings(record=True):
             obj = tm.makePanel()
+
             with pytest.raises(TypeError):
                 hash_pandas_object(obj)
 
@@ -238,9 +238,9 @@ def test_hash_keys(self):
 
     def test_invalid_key(self):
         # this only matters for object dtypes
-        def f():
+        msg = r"key should be a 16-byte string encoded, got b'foo' \(len 3\)"
+        with tm.assert_raises_regex(ValueError, msg):
             hash_pandas_object(Series(list('abc')), hash_key='foo')
-        pytest.raises(ValueError, f)
 
     def test_alread_encoded(self):
         # if already encoded then ok
@@ -253,19 +253,13 @@ def test_alternate_encoding(self):
         obj = Series(list('abc'))
         self.check_equal(obj, encoding='ascii')
 
-    def test_same_len_hash_collisions(self):
-
-        for l in range(8):
-            length = 2 ** (l + 8) + 1
-            s = tm.rands_array(length, 2)
-            result = hash_array(s, 'utf8')
-            assert not result[0] == result[1]
-
-        for l in range(8):
-            length = 2 ** (l + 8)
-            s = tm.rands_array(length, 2)
-            result = hash_array(s, 'utf8')
-            assert not result[0] == result[1]
+    @pytest.mark.parametrize('l_exp', range(8))
+    @pytest.mark.parametrize('l_add', [0, 1])
+    def test_same_len_hash_collisions(self, l_exp, l_add):
+        length = 2 ** (l_exp + 8) + l_add
+        s = tm.rands_array(length, 2)
+        result = hash_array(s, 'utf8')
+        assert not result[0] == result[1]
 
     def test_hash_collisions(self):
 