1
+ #include "git-compat-util.h"
2
+ #include "bloom.h"
3
+ #include "diff.h"
4
+ #include "diffcore.h"
5
+ #include "revision.h"
6
+ #include "hashmap.h"
7
+ #include "commit-graph.h"
8
+ #include "commit.h"
9
+
10
+ define_commit_slab (bloom_filter_slab , struct bloom_filter );
11
+
12
+ struct bloom_filter_slab bloom_filters ;
13
+
14
+ struct pathmap_hash_entry {
15
+ struct hashmap_entry entry ;
16
+ const char path [FLEX_ARRAY ];
17
+ };
18
+
19
/*
 * Rotate the 32-bit 'value' left by 'count' bit positions.
 *
 * Only the low five bits of 'count' are used, so both shift amounts stay
 * in the range 0..31; a count of 0 (or any multiple of 32) returns
 * 'value' unchanged.
 */
static uint32_t rotate_left(uint32_t value, int32_t count)
{
	const uint32_t width_mask = 8 * sizeof(uint32_t) - 1;
	uint32_t high_part, low_part;

	count &= width_mask;
	high_part = value << count;
	low_part = value >> ((-count) & width_mask);

	return high_part | low_part;
}
25
+
26
+ static inline unsigned char get_bitmask (uint32_t pos )
27
+ {
28
+ return ((unsigned char )1 ) << (pos & (BITS_PER_WORD - 1 ));
29
+ }
30
+
31
+ static int load_bloom_filter_from_graph (struct commit_graph * g ,
32
+ struct bloom_filter * filter ,
33
+ struct commit * c )
34
+ {
35
+ uint32_t lex_pos , start_index , end_index ;
36
+
37
+ while (c -> graph_pos < g -> num_commits_in_base )
38
+ g = g -> base_graph ;
39
+
40
+ /* The commit graph commit 'c' lives in doesn't carry bloom filters. */
41
+ if (!g -> chunk_bloom_indexes )
42
+ return 0 ;
43
+
44
+ lex_pos = c -> graph_pos - g -> num_commits_in_base ;
45
+
46
+ end_index = get_be32 (g -> chunk_bloom_indexes + 4 * lex_pos );
47
+
48
+ if (lex_pos > 0 )
49
+ start_index = get_be32 (g -> chunk_bloom_indexes + 4 * (lex_pos - 1 ));
50
+ else
51
+ start_index = 0 ;
52
+
53
+ filter -> len = end_index - start_index ;
54
+ filter -> data = (unsigned char * )(g -> chunk_bloom_data +
55
+ sizeof (unsigned char ) * start_index +
56
+ BLOOMDATA_CHUNK_HEADER_SIZE );
57
+
58
+ return 1 ;
59
+ }
60
+
61
/*
 * Calculate the murmur3 32-bit hash value for the given data
 * using the given seed.
 * Produces a uniformly distributed hash value.
 * Not considered to be cryptographically secure.
 * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
 *
 * 'data' points at 'len' bytes to hash; the return value is the 32-bit
 * hash.
 *
 * The input is read as unsigned bytes. Reading it through plain 'char'
 * would sign-extend every byte >= 0x80 on platforms where char is signed,
 * smearing 1-bits over the rest of the 32-bit word; the result would then
 * differ from murmur3 and from the value computed on platforms where char
 * is unsigned.
 */
uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len)
{
	const uint32_t c1 = 0xcc9e2d51;
	const uint32_t c2 = 0x1b873593;
	const uint32_t r1 = 15;
	const uint32_t r2 = 13;
	const uint32_t m = 5;
	const uint32_t n = 0xe6546b64;
	const unsigned char *bytes = (const unsigned char *)data;
	const unsigned char *tail;
	/* Number of complete 4-byte blocks; size_t so large inputs do not truncate. */
	const size_t len4 = len / sizeof(uint32_t);
	uint32_t k1 = 0;
	size_t i;

	for (i = 0; i < len4; i++) {
		const unsigned char *block = bytes + i * sizeof(uint32_t);
		/* Assemble the block as a little-endian 32-bit word. */
		uint32_t k = ((uint32_t)block[0]) |
			     (((uint32_t)block[1]) << 8) |
			     (((uint32_t)block[2]) << 16) |
			     (((uint32_t)block[3]) << 24);

		k *= c1;
		k = rotate_left(k, r1);
		k *= c2;

		seed ^= k;
		seed = rotate_left(seed, r2) * m + n;
	}

	/* Mix in the 1..3 bytes left over after the last full block, if any. */
	tail = bytes + len4 * sizeof(uint32_t);

	switch (len & (sizeof(uint32_t) - 1)) {
	case 3:
		k1 ^= ((uint32_t)tail[2]) << 16;
		/* fallthrough */
	case 2:
		k1 ^= ((uint32_t)tail[1]) << 8;
		/* fallthrough */
	case 1:
		k1 ^= ((uint32_t)tail[0]) << 0;
		k1 *= c1;
		k1 = rotate_left(k1, r1);
		k1 *= c2;
		seed ^= k1;
		break;
	}

	/* Final mixing steps. */
	seed ^= (uint32_t)len;
	seed ^= (seed >> 16);
	seed *= 0x85ebca6b;
	seed ^= (seed >> 13);
	seed *= 0xc2b2ae35;
	seed ^= (seed >> 16);

	return seed;
}
124
+
125
+ void fill_bloom_key (const char * data ,
126
+ size_t len ,
127
+ struct bloom_key * key ,
128
+ const struct bloom_filter_settings * settings )
129
+ {
130
+ int i ;
131
+ const uint32_t seed0 = 0x293ae76f ;
132
+ const uint32_t seed1 = 0x7e646e2c ;
133
+ const uint32_t hash0 = murmur3_seeded (seed0 , data , len );
134
+ const uint32_t hash1 = murmur3_seeded (seed1 , data , len );
135
+
136
+ key -> hashes = (uint32_t * )xcalloc (settings -> num_hashes , sizeof (uint32_t ));
137
+ for (i = 0 ; i < settings -> num_hashes ; i ++ )
138
+ key -> hashes [i ] = hash0 + i * hash1 ;
139
+ }
140
+
141
+ void add_key_to_filter (const struct bloom_key * key ,
142
+ struct bloom_filter * filter ,
143
+ const struct bloom_filter_settings * settings )
144
+ {
145
+ int i ;
146
+ uint64_t mod = filter -> len * BITS_PER_WORD ;
147
+
148
+ for (i = 0 ; i < settings -> num_hashes ; i ++ ) {
149
+ uint64_t hash_mod = key -> hashes [i ] % mod ;
150
+ uint64_t block_pos = hash_mod / BITS_PER_WORD ;
151
+
152
+ filter -> data [block_pos ] |= get_bitmask (hash_mod );
153
+ }
154
+ }
155
+
156
+ void init_bloom_filters (void )
157
+ {
158
+ init_bloom_filter_slab (& bloom_filters );
159
+ }
160
+
161
+ struct bloom_filter * get_bloom_filter (struct repository * r ,
162
+ struct commit * c ,
163
+ int compute_if_not_present )
164
+ {
165
+ struct bloom_filter * filter ;
166
+ struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS ;
167
+ int i ;
168
+ struct diff_options diffopt ;
169
+ int max_changes = 512 ;
170
+
171
+ if (bloom_filters .slab_size == 0 )
172
+ return NULL ;
173
+
174
+ filter = bloom_filter_slab_at (& bloom_filters , c );
175
+
176
+ if (!filter -> data ) {
177
+ load_commit_graph_info (r , c );
178
+ if (c -> graph_pos != COMMIT_NOT_FROM_GRAPH &&
179
+ r -> objects -> commit_graph -> chunk_bloom_indexes ) {
180
+ if (load_bloom_filter_from_graph (r -> objects -> commit_graph , filter , c ))
181
+ return filter ;
182
+ else
183
+ return NULL ;
184
+ }
185
+ }
186
+
187
+ if (filter -> data || !compute_if_not_present )
188
+ return filter ;
189
+
190
+ repo_diff_setup (r , & diffopt );
191
+ diffopt .flags .recursive = 1 ;
192
+ diffopt .detect_rename = 0 ;
193
+ diffopt .max_changes = max_changes ;
194
+ diff_setup_done (& diffopt );
195
+
196
+ if (c -> parents )
197
+ diff_tree_oid (& c -> parents -> item -> object .oid , & c -> object .oid , "" , & diffopt );
198
+ else
199
+ diff_tree_oid (NULL , & c -> object .oid , "" , & diffopt );
200
+ diffcore_std (& diffopt );
201
+
202
+ if (diff_queued_diff .nr <= max_changes ) {
203
+ struct hashmap pathmap ;
204
+ struct pathmap_hash_entry * e ;
205
+ struct hashmap_iter iter ;
206
+ hashmap_init (& pathmap , NULL , NULL , 0 );
207
+
208
+ for (i = 0 ; i < diff_queued_diff .nr ; i ++ ) {
209
+ const char * path = diff_queued_diff .queue [i ]-> two -> path ;
210
+
211
+ /*
212
+ * Add each leading directory of the changed file, i.e. for
213
+ * 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so
214
+ * the Bloom filter could be used to speed up commands like
215
+ * 'git log dir/subdir', too.
216
+ *
217
+ * Note that directories are added without the trailing '/'.
218
+ */
219
+ do {
220
+ char * last_slash = strrchr (path , '/' );
221
+
222
+ FLEX_ALLOC_STR (e , path , path );
223
+ hashmap_entry_init (& e -> entry , strhash (path ));
224
+ hashmap_add (& pathmap , & e -> entry );
225
+
226
+ if (!last_slash )
227
+ last_slash = (char * )path ;
228
+ * last_slash = '\0' ;
229
+
230
+ } while (* path );
231
+
232
+ diff_free_filepair (diff_queued_diff .queue [i ]);
233
+ }
234
+
235
+ filter -> len = (hashmap_get_size (& pathmap ) * settings .bits_per_entry + BITS_PER_WORD - 1 ) / BITS_PER_WORD ;
236
+ filter -> data = xcalloc (filter -> len , sizeof (unsigned char ));
237
+
238
+ hashmap_for_each_entry (& pathmap , & iter , e , entry ) {
239
+ struct bloom_key key ;
240
+ fill_bloom_key (e -> path , strlen (e -> path ), & key , & settings );
241
+ add_key_to_filter (& key , filter , & settings );
242
+ }
243
+
244
+ hashmap_free_entries (& pathmap , struct pathmap_hash_entry , entry );
245
+ } else {
246
+ for (i = 0 ; i < diff_queued_diff .nr ; i ++ )
247
+ diff_free_filepair (diff_queued_diff .queue [i ]);
248
+ filter -> data = NULL ;
249
+ filter -> len = 0 ;
250
+ }
251
+
252
+ free (diff_queued_diff .queue );
253
+ DIFF_QUEUE_CLEAR (& diff_queued_diff );
254
+
255
+ return filter ;
256
+ }
257
+
258
+ int bloom_filter_contains (const struct bloom_filter * filter ,
259
+ const struct bloom_key * key ,
260
+ const struct bloom_filter_settings * settings )
261
+ {
262
+ int i ;
263
+ uint64_t mod = filter -> len * BITS_PER_WORD ;
264
+
265
+ if (!mod )
266
+ return -1 ;
267
+
268
+ for (i = 0 ; i < settings -> num_hashes ; i ++ ) {
269
+ uint64_t hash_mod = key -> hashes [i ] % mod ;
270
+ uint64_t block_pos = hash_mod / BITS_PER_WORD ;
271
+ if (!(filter -> data [block_pos ] & get_bitmask (hash_mod )))
272
+ return 0 ;
273
+ }
274
+
275
+ return 1 ;
276
+ }
0 commit comments