-
Notifications
You must be signed in to change notification settings - Fork 146
Changed Paths Bloom Filters #497
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c3ffd98
a5aa341
a7702c1
8304c29
2d4c0b2
c38b9b3
d24c85c
5ed16f3
ff6b96a
cc8022b
c8b86c3
617f549
6beaede
b899df5
5656e85
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -57,6 +57,11 @@ or `--stdin-packs`.) | |
With the `--append` option, include all commits that are present in the | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, Jakub Narebski wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, Garima Singh wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, Jakub Narebski wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, Bryan Turner wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, Garima Singh wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, SZEDER Gábor wrote (reply to this):
|
||
existing commit-graph file. | ||
+ | ||
With the `--changed-paths` option, compute and write information about the | ||
paths changed between a commit and its first parent. This operation can | ||
take a while on large repositories. It provides significant performance gains | ||
for getting history of a directory or a file with `git log -- <path>`. | ||
+ | ||
garimasi514 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
With the `--split` option, write the commit-graph as a chain of multiple | ||
commit-graph files stored in `<dir>/info/commit-graphs`. The new commits | ||
not already in the commit-graph are added in a new "tip" file. This file | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,9 @@ metadata, including: | |
- The parents of the commit, stored using positional references within | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, Jakub Narebski wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, Garima Singh wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, Jakub Narebski wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, Garima Singh wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, SZEDER Gábor wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, Derrick Stolee wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, SZEDER Gábor wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, SZEDER Gábor wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, Derrick Stolee wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, Derrick Stolee wrote (reply to this):
|
||
the graph file. | ||
|
||
- The Bloom filter of the commit carrying the paths that were changed between | ||
the commit and its first parent, if requested. | ||
|
||
These positional references are stored as unsigned 32-bit integers | ||
corresponding to the array position within the list of commit OIDs. Due | ||
to some special constants we use to track parents, we can store at most | ||
|
@@ -93,6 +96,33 @@ CHUNK DATA: | |
positions for the parents until reaching a value with the most-significant | ||
bit on. The other bits correspond to the position of the last parent. | ||
|
||
Bloom Filter Index (ID: {'B', 'I', 'D', 'X'}) (N * 4 bytes) [Optional] | ||
* The ith entry, BIDX[i], stores the number of 8-byte word blocks in all | ||
Bloom filters from commit 0 to commit i (inclusive) in lexicographic | ||
order. The Bloom filter for the i-th commit spans from BIDX[i-1] to | ||
BIDX[i] (plus header length), where BIDX[-1] is 0. | ||
* The BIDX chunk is ignored if the BDAT chunk is not present. | ||
|
||
Bloom Filter Data (ID: {'B', 'D', 'A', 'T'}) [Optional] | ||
* It starts with a header consisting of three unsigned 32-bit integers: | ||
- Version of the hash algorithm being used. We currently only support | ||
value 1 which corresponds to the 32-bit version of the murmur3 hash | ||
implemented exactly as described in | ||
https://en.wikipedia.org/wiki/MurmurHash#Algorithm and the double | ||
hashing technique using seed values 0x293ae76f and 0x7e646e2c as | ||
described in https://doi.org/10.1007/978-3-540-30494-4_26 "Bloom Filters | ||
in Probabilistic Verification" | ||
- The number of times a path is hashed and hence the number of bit positions | ||
that cumulatively determine whether a file is present in the commit. | ||
- The minimum number of bits 'b' per entry in the Bloom filter. If the filter | ||
contains 'n' entries, then the filter size is the minimum number of 64-bit | ||
words that contain n*b bits. | ||
* The rest of the chunk is the concatenation of all the computed Bloom | ||
filters for the commits in lexicographic order. | ||
* Note: Commits with no changes or more than 512 changes have Bloom filters | ||
of length zero. | ||
* The BDAT chunk is present if and only if BIDX is present. | ||
|
||
Base Graphs List (ID: {'B', 'A', 'S', 'E'}) [Optional] | ||
This list of H-byte hashes describe a set of B commit-graph files that | ||
form a commit-graph chain. The graph position for the ith commit in this | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,275 @@ | ||
#include "git-compat-util.h" | ||
garimasi514 marked this conversation as resolved.
Show resolved
Hide resolved
garimasi514 marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, Jakub Narebski wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, Garima Singh wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, Jakub Narebski wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, Garima Singh wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, SZEDER Gábor wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, Junio C Hamano wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, SZEDER Gábor wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, SZEDER Gábor wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, SZEDER Gábor wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, SZEDER Gábor wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, Derrick Stolee wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, SZEDER Gábor wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, Derrick Stolee wrote (reply to this):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the Git mailing list, Derrick Stolee wrote (reply to this):
|
||
#include "bloom.h" | ||
#include "diff.h" | ||
#include "diffcore.h" | ||
#include "revision.h" | ||
#include "hashmap.h" | ||
#include "commit-graph.h" | ||
#include "commit.h" | ||
|
||
/* Declare the commit-slab type used to keep a struct bloom_filter per commit. */
define_commit_slab(bloom_filter_slab, struct bloom_filter); | ||
|
||
/*
 * The slab instance this file uses to look up a commit's filter; it is
 * set up by init_bloom_filters() and read by get_bloom_filter().
 */
struct bloom_filter_slab bloom_filters; | ||
|
||
/*
 * Hashmap entry used while computing a commit's filter: each entry
 * carries one path string, stored inline in the flexible array member.
 */
struct pathmap_hash_entry { | ||
struct hashmap_entry entry; | ||
const char path[FLEX_ARRAY]; | ||
}; | ||
|
||
/*
 * Rotate the 32-bit 'value' left by 'count' bit positions.
 *
 * Only the low five bits of 'count' are used, so both shift amounts
 * stay in the range 0..31 and a count of 0 or 32 returns 'value'
 * unchanged.
 */
static uint32_t rotate_left(uint32_t value, int32_t count)
{
	const uint32_t width_mask = 8 * sizeof(uint32_t) - 1;
	const uint32_t up = (uint32_t)count & width_mask;
	const uint32_t down = (0u - up) & width_mask;

	return (value << up) | (value >> down);
}
|
||
static inline unsigned char get_bitmask(uint32_t pos) | ||
{ | ||
return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1)); | ||
} | ||
|
||
static int load_bloom_filter_from_graph(struct commit_graph *g, | ||
struct bloom_filter *filter, | ||
struct commit *c) | ||
{ | ||
uint32_t lex_pos, start_index, end_index; | ||
|
||
while (c->graph_pos < g->num_commits_in_base) | ||
g = g->base_graph; | ||
|
||
/* The commit graph commit 'c' lives in doesn't carry bloom filters. */ | ||
if (!g->chunk_bloom_indexes) | ||
return 0; | ||
|
||
lex_pos = c->graph_pos - g->num_commits_in_base; | ||
|
||
end_index = get_be32(g->chunk_bloom_indexes + 4 * lex_pos); | ||
|
||
if (lex_pos > 0) | ||
start_index = get_be32(g->chunk_bloom_indexes + 4 * (lex_pos - 1)); | ||
else | ||
start_index = 0; | ||
|
||
filter->len = end_index - start_index; | ||
filter->data = (unsigned char *)(g->chunk_bloom_data + | ||
sizeof(unsigned char) * start_index + | ||
BLOOMDATA_CHUNK_HEADER_SIZE); | ||
|
||
return 1; | ||
} | ||
|
||
/*
 * Calculate the murmur3 32-bit hash value for the given data
 * using the given seed.
 * Produces a uniformly distributed hash value.
 * Not considered to be cryptographically secure.
 * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
 *
 * 'data' points to 'len' bytes and does not need to be NUL-terminated.
 * Each 4-byte block is assembled byte by byte (least significant byte
 * first), so the result does not depend on the host's byte order.
 */
uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len)
{
	const uint32_t c1 = 0xcc9e2d51;
	const uint32_t c2 = 0x1b873593;
	const uint32_t r1 = 15;
	const uint32_t r2 = 13;
	const uint32_t m = 5;
	const uint32_t n = 0xe6546b64;
	/*
	 * Read the input as unsigned bytes. With a plain (possibly signed)
	 * 'char', a byte >= 0x80 is sign-extended when converted to
	 * uint32_t and sets all the higher bits of the block, which makes
	 * the hash of non-ASCII paths differ from real murmur3.
	 */
	const unsigned char *bytes = (const unsigned char *)data;
	const unsigned char *tail;
	const size_t len4 = len / sizeof(uint32_t);
	uint32_t k1 = 0;
	size_t i;

	for (i = 0; i < len4; i++) {
		uint32_t byte1 = (uint32_t)bytes[4*i];
		uint32_t byte2 = ((uint32_t)bytes[4*i + 1]) << 8;
		uint32_t byte3 = ((uint32_t)bytes[4*i + 2]) << 16;
		uint32_t byte4 = ((uint32_t)bytes[4*i + 3]) << 24;
		uint32_t k = byte1 | byte2 | byte3 | byte4;

		k *= c1;
		k = rotate_left(k, r1);
		k *= c2;

		seed ^= k;
		seed = rotate_left(seed, r2) * m + n;
	}

	/* Mix in the 0-3 bytes that do not fill a whole block. */
	tail = bytes + len4 * sizeof(uint32_t);

	switch (len & (sizeof(uint32_t) - 1)) {
	case 3:
		k1 ^= ((uint32_t)tail[2]) << 16;
		/* fallthrough */
	case 2:
		k1 ^= ((uint32_t)tail[1]) << 8;
		/* fallthrough */
	case 1:
		k1 ^= ((uint32_t)tail[0]) << 0;
		k1 *= c1;
		k1 = rotate_left(k1, r1);
		k1 *= c2;
		seed ^= k1;
		break;
	}

	/* Finalization: fold in the length and avalanche the bits. */
	seed ^= (uint32_t)len;
	seed ^= (seed >> 16);
	seed *= 0x85ebca6b;
	seed ^= (seed >> 13);
	seed *= 0xc2b2ae35;
	seed ^= (seed >> 16);

	return seed;
}
|
||
void fill_bloom_key(const char *data, | ||
size_t len, | ||
struct bloom_key *key, | ||
const struct bloom_filter_settings *settings) | ||
{ | ||
int i; | ||
const uint32_t seed0 = 0x293ae76f; | ||
const uint32_t seed1 = 0x7e646e2c; | ||
const uint32_t hash0 = murmur3_seeded(seed0, data, len); | ||
const uint32_t hash1 = murmur3_seeded(seed1, data, len); | ||
|
||
key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t)); | ||
for (i = 0; i < settings->num_hashes; i++) | ||
key->hashes[i] = hash0 + i * hash1; | ||
} | ||
|
||
void add_key_to_filter(const struct bloom_key *key, | ||
struct bloom_filter *filter, | ||
const struct bloom_filter_settings *settings) | ||
{ | ||
int i; | ||
uint64_t mod = filter->len * BITS_PER_WORD; | ||
|
||
for (i = 0; i < settings->num_hashes; i++) { | ||
uint64_t hash_mod = key->hashes[i] % mod; | ||
uint64_t block_pos = hash_mod / BITS_PER_WORD; | ||
|
||
filter->data[block_pos] |= get_bitmask(hash_mod); | ||
} | ||
} | ||
|
||
void init_bloom_filters(void) | ||
{ | ||
init_bloom_filter_slab(&bloom_filters); | ||
} | ||
|
||
struct bloom_filter *get_bloom_filter(struct repository *r, | ||
struct commit *c, | ||
int compute_if_not_present) | ||
{ | ||
struct bloom_filter *filter; | ||
struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS; | ||
int i; | ||
struct diff_options diffopt; | ||
int max_changes = 512; | ||
|
||
if (bloom_filters.slab_size == 0) | ||
return NULL; | ||
|
||
filter = bloom_filter_slab_at(&bloom_filters, c); | ||
|
||
if (!filter->data) { | ||
load_commit_graph_info(r, c); | ||
if (c->graph_pos != COMMIT_NOT_FROM_GRAPH && | ||
r->objects->commit_graph->chunk_bloom_indexes) { | ||
if (load_bloom_filter_from_graph(r->objects->commit_graph, filter, c)) | ||
return filter; | ||
else | ||
return NULL; | ||
} | ||
} | ||
|
||
if (filter->data || !compute_if_not_present) | ||
return filter; | ||
|
||
repo_diff_setup(r, &diffopt); | ||
diffopt.flags.recursive = 1; | ||
diffopt.max_changes = max_changes; | ||
diff_setup_done(&diffopt); | ||
|
||
if (c->parents) | ||
diff_tree_oid(&c->parents->item->object.oid, &c->object.oid, "", &diffopt); | ||
else | ||
diff_tree_oid(NULL, &c->object.oid, "", &diffopt); | ||
diffcore_std(&diffopt); | ||
|
||
if (diff_queued_diff.nr <= max_changes) { | ||
struct hashmap pathmap; | ||
struct pathmap_hash_entry *e; | ||
struct hashmap_iter iter; | ||
hashmap_init(&pathmap, NULL, NULL, 0); | ||
|
||
for (i = 0; i < diff_queued_diff.nr; i++) { | ||
const char *path = diff_queued_diff.queue[i]->two->path; | ||
|
||
/* | ||
* Add each leading directory of the changed file, i.e. for | ||
* 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so | ||
* the Bloom filter could be used to speed up commands like | ||
* 'git log dir/subdir', too. | ||
* | ||
* Note that directories are added without the trailing '/'. | ||
*/ | ||
do { | ||
char *last_slash = strrchr(path, '/'); | ||
|
||
FLEX_ALLOC_STR(e, path, path); | ||
hashmap_entry_init(&e->entry, strhash(path)); | ||
hashmap_add(&pathmap, &e->entry); | ||
|
||
if (!last_slash) | ||
last_slash = (char*)path; | ||
*last_slash = '\0'; | ||
garimasi514 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
} while (*path); | ||
|
||
diff_free_filepair(diff_queued_diff.queue[i]); | ||
} | ||
|
||
filter->len = (hashmap_get_size(&pathmap) * settings.bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD; | ||
filter->data = xcalloc(filter->len, sizeof(unsigned char)); | ||
|
||
hashmap_for_each_entry(&pathmap, &iter, e, entry) { | ||
struct bloom_key key; | ||
fill_bloom_key(e->path, strlen(e->path), &key, &settings); | ||
add_key_to_filter(&key, filter, &settings); | ||
} | ||
|
||
hashmap_free_entries(&pathmap, struct pathmap_hash_entry, entry); | ||
} else { | ||
for (i = 0; i < diff_queued_diff.nr; i++) | ||
diff_free_filepair(diff_queued_diff.queue[i]); | ||
filter->data = NULL; | ||
filter->len = 0; | ||
} | ||
|
||
free(diff_queued_diff.queue); | ||
DIFF_QUEUE_CLEAR(&diff_queued_diff); | ||
|
||
return filter; | ||
} | ||
|
||
int bloom_filter_contains(const struct bloom_filter *filter, | ||
const struct bloom_key *key, | ||
const struct bloom_filter_settings *settings) | ||
{ | ||
int i; | ||
uint64_t mod = filter->len * BITS_PER_WORD; | ||
|
||
if (!mod) | ||
return -1; | ||
|
||
for (i = 0; i < settings->num_hashes; i++) { | ||
uint64_t hash_mod = key->hashes[i] % mod; | ||
uint64_t block_pos = hash_mod / BITS_PER_WORD; | ||
if (!(filter->data[block_pos] & get_bitmask(hash_mod))) | ||
return 0; | ||
} | ||
|
||
return 1; | ||
} | ||
garimasi514 marked this conversation as resolved.
Show resolved
Hide resolved
|
Uh oh!
There was an error while loading. Please reload this page.