diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py
index 937eccf7a0afe..0a3a10315b5fd 100644
--- a/pandas/tests/libs/test_hashtable.py
+++ b/pandas/tests/libs/test_hashtable.py
@@ -1,4 +1,5 @@
 from contextlib import contextmanager
+import struct
 import tracemalloc
 
 import numpy as np
@@ -77,16 +78,16 @@ def test_get_set_contains_len(self, table_type, dtype):
         with pytest.raises(KeyError, match=str(index + 2)):
             table.get_item(index + 2)
 
-    def test_map(self, table_type, dtype, writable):
-        # PyObjectHashTable has no map-method
-        if table_type != ht.PyObjectHashTable:
+    def test_map_keys_to_values(self, table_type, dtype, writable):
+        # only Int64HashTable has this method
+        if table_type == ht.Int64HashTable:
             N = 77
             table = table_type()
             keys = np.arange(N).astype(dtype)
             vals = np.arange(N).astype(np.int64) + N
             keys.flags.writeable = writable
             vals.flags.writeable = writable
-            table.map(keys, vals)
+            table.map_keys_to_values(keys, vals)
             for i in range(N):
                 assert table.get_item(keys[i]) == i + N
 
@@ -165,19 +166,139 @@ def test_get_state(self, table_type, dtype):
         assert "n_buckets" in state
         assert "upper_bound" in state
 
-    def test_no_reallocation(self, table_type, dtype):
-        for N in range(1, 110):
-            keys = np.arange(N).astype(dtype)
-            preallocated_table = table_type(N)
-            n_buckets_start = preallocated_table.get_state()["n_buckets"]
-            preallocated_table.map_locations(keys)
-            n_buckets_end = preallocated_table.get_state()["n_buckets"]
-            # original number of buckets was enough:
-            assert n_buckets_start == n_buckets_end
-            # check with clean table (not too much preallocated)
-            clean_table = table_type()
-            clean_table.map_locations(keys)
-            assert n_buckets_start == clean_table.get_state()["n_buckets"]
+    @pytest.mark.parametrize("N", range(1, 110))
+    def test_no_reallocation(self, table_type, dtype, N):
+        keys = np.arange(N).astype(dtype)
+        preallocated_table = table_type(N)
+        n_buckets_start = preallocated_table.get_state()["n_buckets"]
+        preallocated_table.map_locations(keys)
+        n_buckets_end = preallocated_table.get_state()["n_buckets"]
+        # original number of buckets was enough:
+        assert n_buckets_start == n_buckets_end
+        # check with clean table (not too much preallocated)
+        clean_table = table_type()
+        clean_table.map_locations(keys)
+        assert n_buckets_start == clean_table.get_state()["n_buckets"]
+
+
+class TestHashTableUnsorted:
+    # TODO: moved from test_algos; may be redundancies with other tests
+    def test_string_hashtable_set_item_signature(self):
+        # GH#30419 fix typing in StringHashTable.set_item to prevent segfault
+        tbl = ht.StringHashTable()
+
+        tbl.set_item("key", 1)
+        assert tbl.get_item("key") == 1
+
+        with pytest.raises(TypeError, match="'key' has incorrect type"):
+            # key arg typed as string, not object
+            tbl.set_item(4, 6)
+        with pytest.raises(TypeError, match="'val' has incorrect type"):
+            tbl.get_item(4)
+
+    def test_lookup_nan(self, writable):
+        # GH#21688 ensure we can deal with readonly memory views
+        xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
+        xs.setflags(write=writable)
+        m = ht.Float64HashTable()
+        m.map_locations(xs)
+        tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp))
+
+    def test_add_signed_zeros(self):
+        # GH#21866 inconsistent hash-function for float64
+        # default hash-function would lead to different hash-buckets
+        # for 0.0 and -0.0 if there are more than 2^30 hash-buckets
+        # but this would mean 16GB
+        N = 4  # 12 * 10**8 would trigger the error, if you have enough memory
+        m = ht.Float64HashTable(N)
+        m.set_item(0.0, 0)
+        m.set_item(-0.0, 0)
+        assert len(m) == 1  # 0.0 and -0.0 are equivalent
+
+    def test_add_different_nans(self):
+        # GH#21866 inconsistent hash-function for float64
+        # create different nans from bit-patterns:
+        NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
+        NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
+        assert NAN1 != NAN1
+        assert NAN2 != NAN2
+        # default hash function would lead to different hash-buckets
+        # for NAN1 and NAN2 even if there are only 4 buckets:
+        m = ht.Float64HashTable()
+        m.set_item(NAN1, 0)
+        m.set_item(NAN2, 0)
+        assert len(m) == 1  # NAN1 and NAN2 are equivalent
+
+    def test_lookup_overflow(self, writable):
+        xs = np.array([1, 2, 2**63], dtype=np.uint64)
+        # GH 21688 ensure we can deal with readonly memory views
+        xs.setflags(write=writable)
+        m = ht.UInt64HashTable()
+        m.map_locations(xs)
+        tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp))
+
+    @pytest.mark.parametrize("nvals", [0, 10])  # resizing to 0 is special case
+    @pytest.mark.parametrize(
+        "htable, uniques, dtype, safely_resizes",
+        [
+            (ht.PyObjectHashTable, ht.ObjectVector, "object", False),
+            (ht.StringHashTable, ht.ObjectVector, "object", True),
+            (ht.Float64HashTable, ht.Float64Vector, "float64", False),
+            (ht.Int64HashTable, ht.Int64Vector, "int64", False),
+            (ht.Int32HashTable, ht.Int32Vector, "int32", False),
+            (ht.UInt64HashTable, ht.UInt64Vector, "uint64", False),
+        ],
+    )
+    def test_vector_resize(
+        self, writable, htable, uniques, dtype, safely_resizes, nvals
+    ):
+        # Test for memory errors after internal vector
+        # reallocations (GH 7157)
+        # Changed from using np.random.rand to range
+        # which could cause flaky CI failures when safely_resizes=False
+        vals = np.array(range(1000), dtype=dtype)
+
+        # GH 21688 ensures we can deal with read-only memory views
+        vals.setflags(write=writable)
+
+        # initialise instances; cannot initialise in parametrization,
+        # as otherwise external views would be held on the array (which is
+        # one of the things this test is checking)
+        htable = htable()
+        uniques = uniques()
+
+        # get_labels may append to uniques
+        htable.get_labels(vals[:nvals], uniques, 0, -1)
+        # to_array() sets an external_view_exists flag on uniques.
+        tmp = uniques.to_array()
+        oldshape = tmp.shape
+
+        # subsequent get_labels() calls can no longer append to it
+        # (except for StringHashTables + ObjectVector)
+        if safely_resizes:
+            htable.get_labels(vals, uniques, 0, -1)
+        else:
+            with pytest.raises(ValueError, match="external reference.*"):
+                htable.get_labels(vals, uniques, 0, -1)
+
+        uniques.to_array()  # should not raise here
+        assert tmp.shape == oldshape
+
+    @pytest.mark.parametrize(
+        "hashtable",
+        [
+            ht.PyObjectHashTable,
+            ht.StringHashTable,
+            ht.Float64HashTable,
+            ht.Int64HashTable,
+            ht.Int32HashTable,
+            ht.UInt64HashTable,
+        ],
+    )
+    def test_hashtable_large_sizehint(self, hashtable):
+        # GH#22729 smoketest for not raising when passing a large size_hint
+        size_hint = np.iinfo(np.uint32).max + 1
+        hashtable(size_hint=size_hint)
 
 
 class TestPyObjectHashTableWithNans:
@@ -282,19 +403,19 @@ def test_tracemalloc_for_empty_StringHashTable():
     assert get_allocated_khash_memory() == 0
 
 
-def test_no_reallocation_StringHashTable():
-    for N in range(1, 110):
-        keys = np.arange(N).astype(np.compat.unicode).astype(np.object_)
-        preallocated_table = ht.StringHashTable(N)
-        n_buckets_start = preallocated_table.get_state()["n_buckets"]
-        preallocated_table.map_locations(keys)
-        n_buckets_end = preallocated_table.get_state()["n_buckets"]
-        # original number of buckets was enough:
-        assert n_buckets_start == n_buckets_end
-        # check with clean table (not too much preallocated)
-        clean_table = ht.StringHashTable()
-        clean_table.map_locations(keys)
-        assert n_buckets_start == clean_table.get_state()["n_buckets"]
+@pytest.mark.parametrize("N", range(1, 110))
+def test_no_reallocation_StringHashTable(N):
+    keys = np.arange(N).astype(np.compat.unicode).astype(np.object_)
+    preallocated_table = ht.StringHashTable(N)
+    n_buckets_start = preallocated_table.get_state()["n_buckets"]
+    preallocated_table.map_locations(keys)
+    n_buckets_end = preallocated_table.get_state()["n_buckets"]
+    # original number of buckets was enough:
+    assert n_buckets_start == n_buckets_end
+    # check with clean table (not too much preallocated)
+    clean_table = ht.StringHashTable()
+    clean_table.map_locations(keys)
+    assert n_buckets_start == clean_table.get_state()["n_buckets"]
 
 
 @pytest.mark.parametrize(
@@ -322,15 +443,6 @@ def test_get_set_contains_len(self, table_type, dtype):
         assert index in table
         assert table.get_item(index) == 41
 
-    def test_map(self, table_type, dtype):
-        N = 332
-        table = table_type()
-        keys = np.full(N, np.nan, dtype=dtype)
-        vals = (np.arange(N) + N).astype(np.int64)
-        table.map(keys, vals)
-        assert len(table) == 1
-        assert table.get_item(np.nan) == 2 * N - 1
-
     def test_map_locations(self, table_type, dtype):
         N = 10
         table = table_type()
@@ -468,6 +580,21 @@ def test_unique_label_indices_intp(writable):
     tm.assert_numpy_array_equal(result, expected)
 
 
+def test_unique_label_indices():
+
+    a = np.random.randint(1, 1 << 10, 1 << 15).astype(np.intp)
+
+    left = ht.unique_label_indices(a)
+    right = np.unique(a, return_index=True)[1]
+
+    tm.assert_numpy_array_equal(left, right, check_dtype=False)
+
+    a[np.random.choice(len(a), 10)] = -1
+    left = ht.unique_label_indices(a)
+    right = np.unique(a, return_index=True)[1][1:]
+    tm.assert_numpy_array_equal(left, right, check_dtype=False)
+
+
 @pytest.mark.parametrize(
     "dtype",
     [
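
For reviewers: a minimal standalone sketch of the quiet-NaN bit-pattern trick that the relocated test_add_different_nans exercises. It mirrors the test body above and needs no pandas; the point is that two doubles can be bitwise-distinct yet both NaN, so hashing the raw float64 bits would scatter them across buckets, which is what Float64HashTable must avoid:

    import struct

    # Two quiet NaNs whose payloads differ only in the lowest mantissa bit.
    NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
    NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]

    assert NAN1 != NAN1 and NAN2 != NAN2  # NaN never compares equal to itself
    assert struct.pack("=d", NAN1) != struct.pack("=d", NAN2)  # yet the bits differ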