|
| 1 | +#include "git-compat-util.h" |
| 2 | +#include "bloom.h" |
| 3 | +#include "commit.h" |
| 4 | +#include "commit-slab.h" |
| 5 | +#include "commit-graph.h" |
| 6 | +#include "object-store.h" |
| 7 | +#include "diff.h" |
| 8 | +#include "diffcore.h" |
| 9 | +#include "revision.h" |
| 10 | +#include "hashmap.h" |
| 11 | + |
/*
 * Instantiate the commit-slab helpers for 'struct bloom_filter'; this file
 * uses the resulting init_bloom_filter_slab() and bloom_filter_slab_at().
 */
define_commit_slab(bloom_filter_slab, struct bloom_filter);

/*
 * Per-commit storage for Bloom filters. A zero 'slab_size' is treated by
 * get_bloom_filter() as "filters have not been loaded".
 */
struct bloom_filter_slab bloom_filters;
| 15 | + |
/*
 * Entry of the temporary hashmap that get_bloom_filter() uses to collect
 * the paths (changed files and their leading directories) of one commit.
 */
struct pathmap_hash_entry {
	struct hashmap_entry entry;	/* hashmap linkage; initialized with strhash() of the path */
	const char path[FLEX_ARRAY];	/* NUL-terminated path, stored inline after the entry */
};
| 20 | + |
/*
 * Rotate the 32-bit 'value' left by 'count' bits.
 *
 * 'count' is reduced modulo 32 and the complementary shift is masked the
 * same way, so no shift amount ever reaches the width of the type.
 */
static uint32_t rotate_left(uint32_t value, int32_t count)
{
	uint32_t mask = 8 * sizeof(uint32_t) - 1;
	count &= mask;
	return ((value << count) | (value >> ((-count) & mask)));
}

/*
 * Calculate a hash value for the given data using the given seed.
 * Produces a uniformly distributed hash value.
 * Not considered to be cryptographically secure.
 * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
 *
 * 'data' points to 'len' bytes; it does not need to be NUL-terminated.
 * Returns the 32-bit MurmurHash3 of those bytes.
 *
 * Two details matter for matching the reference algorithm:
 *  - the mixing steps rotate LEFT, not right;
 *  - input bytes are read as 'unsigned char', so that bytes >= 0x80 are
 *    not sign-extended on platforms where plain 'char' is signed.
 *
 * NOTE: the value returned here determines which bits fill_bloom_key()
 * selects, so filters built with a different hash are not comparable.
 **/
static uint32_t seed_murmur3(uint32_t seed, const char *data, int len)
{
	const uint32_t c1 = 0xcc9e2d51;
	const uint32_t c2 = 0x1b873593;
	const uint32_t r1 = 15;
	const uint32_t r2 = 13;
	const uint32_t m = 5;
	const uint32_t n = 0xe6546b64;
	const unsigned char *bytes = (const unsigned char *)data;
	const unsigned char *tail;
	int i;
	uint32_t k1 = 0;

	int len4 = len / sizeof(uint32_t);

	uint32_t k;
	/* Body: consume the input four bytes at a time, little-endian. */
	for (i = 0; i < len4; i++) {
		uint32_t byte1 = (uint32_t)bytes[4*i];
		uint32_t byte2 = ((uint32_t)bytes[4*i + 1]) << 8;
		uint32_t byte3 = ((uint32_t)bytes[4*i + 2]) << 16;
		uint32_t byte4 = ((uint32_t)bytes[4*i + 3]) << 24;
		k = byte1 | byte2 | byte3 | byte4;
		k *= c1;
		k = rotate_left(k, r1);
		k *= c2;

		seed ^= k;
		seed = rotate_left(seed, r2) * m + n;
	}

	/* Tail: the 0-3 bytes left over after the last full 4-byte block. */
	tail = bytes + len4 * sizeof(uint32_t);

	switch (len & (sizeof(uint32_t) - 1)) {
	case 3:
		k1 ^= ((uint32_t)tail[2]) << 16;
		/*-fallthrough*/
	case 2:
		k1 ^= ((uint32_t)tail[1]) << 8;
		/*-fallthrough*/
	case 1:
		k1 ^= ((uint32_t)tail[0]) << 0;
		k1 *= c1;
		k1 = rotate_left(k1, r1);
		k1 *= c2;
		seed ^= k1;
		break;
	}

	/* Finalization: fold in the length and avalanche the bits. */
	seed ^= (uint32_t)len;
	seed ^= (seed >> 16);
	seed *= 0x85ebca6b;
	seed ^= (seed >> 13);
	seed *= 0xc2b2ae35;
	seed ^= (seed >> 16);

	return seed;
}
| 90 | + |
| 91 | +static inline unsigned char get_bitmask(uint32_t pos) |
| 92 | +{ |
| 93 | + return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1)); |
| 94 | +} |
| 95 | + |
| 96 | +void load_bloom_filters(void) |
| 97 | +{ |
| 98 | + init_bloom_filter_slab(&bloom_filters); |
| 99 | +} |
| 100 | + |
| 101 | +void fill_bloom_key(const char *data, |
| 102 | + int len, |
| 103 | + struct bloom_key *key, |
| 104 | + struct bloom_filter_settings *settings) |
| 105 | +{ |
| 106 | + int i; |
| 107 | + const uint32_t seed0 = 0x293ae76f; |
| 108 | + const uint32_t seed1 = 0x7e646e2c; |
| 109 | + const uint32_t hash0 = seed_murmur3(seed0, data, len); |
| 110 | + const uint32_t hash1 = seed_murmur3(seed1, data, len); |
| 111 | + |
| 112 | + key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t)); |
| 113 | + for (i = 0; i < settings->num_hashes; i++) |
| 114 | + key->hashes[i] = hash0 + i * hash1; |
| 115 | +} |
| 116 | + |
| 117 | +void add_key_to_filter(struct bloom_key *key, |
| 118 | + struct bloom_filter *filter, |
| 119 | + struct bloom_filter_settings *settings) |
| 120 | +{ |
| 121 | + int i; |
| 122 | + uint64_t mod = filter->len * BITS_PER_WORD; |
| 123 | + |
| 124 | + for (i = 0; i < settings->num_hashes; i++) { |
| 125 | + uint64_t hash_mod = key->hashes[i] % mod; |
| 126 | + uint64_t block_pos = hash_mod / BITS_PER_WORD; |
| 127 | + |
| 128 | + filter->data[block_pos] |= get_bitmask(hash_mod); |
| 129 | + } |
| 130 | +} |
| 131 | + |
| 132 | +static int load_bloom_filter_from_graph(struct commit_graph *g, |
| 133 | + struct bloom_filter *filter, |
| 134 | + struct commit *c) |
| 135 | +{ |
| 136 | + uint32_t lex_pos, start_index, end_index; |
| 137 | + |
| 138 | + while (c->graph_pos < g->num_commits_in_base) |
| 139 | + g = g->base_graph; |
| 140 | + |
| 141 | + /* The commit graph commit 'c' lives in doesn't carry bloom filters. */ |
| 142 | + if (!g->chunk_bloom_indexes) |
| 143 | + return 0; |
| 144 | + |
| 145 | + lex_pos = c->graph_pos - g->num_commits_in_base; |
| 146 | + |
| 147 | + end_index = get_be32(g->chunk_bloom_indexes + 4 * lex_pos); |
| 148 | + |
| 149 | + if (lex_pos) |
| 150 | + start_index = get_be32(g->chunk_bloom_indexes + 4 * (lex_pos - 1)); |
| 151 | + else |
| 152 | + start_index = 0; |
| 153 | + |
| 154 | + filter->len = end_index - start_index; |
| 155 | + filter->data = (unsigned char *)(g->chunk_bloom_data + |
| 156 | + sizeof(unsigned char) * start_index + |
| 157 | + BLOOMDATA_CHUNK_HEADER_SIZE); |
| 158 | + |
| 159 | + return 1; |
| 160 | +} |
| 161 | + |
| 162 | +struct bloom_filter *get_bloom_filter(struct repository *r, |
| 163 | + struct commit *c, |
| 164 | + int compute_if_not_present) |
| 165 | +{ |
| 166 | + struct bloom_filter *filter; |
| 167 | + struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS; |
| 168 | + int i; |
| 169 | + struct diff_options diffopt; |
| 170 | + int max_changes = 512; |
| 171 | + |
| 172 | + if (!bloom_filters.slab_size) |
| 173 | + return NULL; |
| 174 | + |
| 175 | + filter = bloom_filter_slab_at(&bloom_filters, c); |
| 176 | + |
| 177 | + if (!filter->data) { |
| 178 | + load_commit_graph_info(r, c); |
| 179 | + if (c->graph_pos != COMMIT_NOT_FROM_GRAPH && |
| 180 | + r->objects->commit_graph->chunk_bloom_indexes) { |
| 181 | + if (load_bloom_filter_from_graph(r->objects->commit_graph, filter, c)) |
| 182 | + return filter; |
| 183 | + else |
| 184 | + return NULL; |
| 185 | + } |
| 186 | + } |
| 187 | + |
| 188 | + if (filter->data || !compute_if_not_present) |
| 189 | + return filter; |
| 190 | + |
| 191 | + repo_diff_setup(r, &diffopt); |
| 192 | + diffopt.flags.recursive = 1; |
| 193 | + diffopt.max_changes = max_changes; |
| 194 | + diff_setup_done(&diffopt); |
| 195 | + |
| 196 | + if (c->parents) |
| 197 | + diff_tree_oid(&c->parents->item->object.oid, &c->object.oid, "", &diffopt); |
| 198 | + else |
| 199 | + diff_tree_oid(NULL, &c->object.oid, "", &diffopt); |
| 200 | + diffcore_std(&diffopt); |
| 201 | + |
| 202 | + if (diff_queued_diff.nr <= max_changes) { |
| 203 | + struct hashmap pathmap; |
| 204 | + struct pathmap_hash_entry* e; |
| 205 | + struct hashmap_iter iter; |
| 206 | + hashmap_init(&pathmap, NULL, NULL, 0); |
| 207 | + |
| 208 | + for (i = 0; i < diff_queued_diff.nr; i++) { |
| 209 | + const char* path = diff_queued_diff.queue[i]->two->path; |
| 210 | + const char* p = path; |
| 211 | + |
| 212 | + /* |
| 213 | + * Add each leading directory of the changed file, i.e. for |
| 214 | + * 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so |
| 215 | + * the Bloom filter could be used to speed up commands like |
| 216 | + * 'git log dir/subdir', too. |
| 217 | + * |
| 218 | + * Note that directories are added without the trailing '/'. |
| 219 | + */ |
| 220 | + do { |
| 221 | + char* last_slash = strrchr(p, '/'); |
| 222 | + |
| 223 | + FLEX_ALLOC_STR(e, path, path); |
| 224 | + hashmap_entry_init(&e->entry, strhash(p)); |
| 225 | + hashmap_add(&pathmap, &e->entry); |
| 226 | + |
| 227 | + if (!last_slash) |
| 228 | + last_slash = (char*)p; |
| 229 | + *last_slash = '\0'; |
| 230 | + |
| 231 | + } while (*p); |
| 232 | + |
| 233 | + diff_free_filepair(diff_queued_diff.queue[i]); |
| 234 | + } |
| 235 | + |
| 236 | + filter->len = (hashmap_get_size(&pathmap) * settings.bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD; |
| 237 | + filter->data = xcalloc(filter->len, sizeof(unsigned char)); |
| 238 | + |
| 239 | + hashmap_for_each_entry(&pathmap, &iter, e, entry) { |
| 240 | + struct bloom_key key; |
| 241 | + fill_bloom_key(e->path, strlen(e->path), &key, &settings); |
| 242 | + add_key_to_filter(&key, filter, &settings); |
| 243 | + } |
| 244 | + |
| 245 | + hashmap_free_entries(&pathmap, struct pathmap_hash_entry, entry); |
| 246 | + } else { |
| 247 | + for (i = 0; i < diff_queued_diff.nr; i++) |
| 248 | + diff_free_filepair(diff_queued_diff.queue[i]); |
| 249 | + filter->data = NULL; |
| 250 | + filter->len = 0; |
| 251 | + } |
| 252 | + |
| 253 | + free(diff_queued_diff.queue); |
| 254 | + DIFF_QUEUE_CLEAR(&diff_queued_diff); |
| 255 | + |
| 256 | + return filter; |
| 257 | +} |
| 258 | + |
| 259 | +int bloom_filter_contains(struct bloom_filter *filter, |
| 260 | + struct bloom_key *key, |
| 261 | + struct bloom_filter_settings *settings) |
| 262 | +{ |
| 263 | + int i; |
| 264 | + uint64_t mod = filter->len * BITS_PER_WORD; |
| 265 | + |
| 266 | + if (!mod) |
| 267 | + return -1; |
| 268 | + |
| 269 | + for (i = 0; i < settings->num_hashes; i++) { |
| 270 | + uint64_t hash_mod = key->hashes[i] % mod; |
| 271 | + uint64_t block_pos = hash_mod / BITS_PER_WORD; |
| 272 | + if (!(filter->data[block_pos] & get_bitmask(hash_mod))) |
| 273 | + return 0; |
| 274 | + } |
| 275 | + |
| 276 | + return 1; |
| 277 | +} |
0 commit comments