Skip to content

Commit 9b6606f

Browse files
committed
Merge branch 'gs/commit-graph-path-filter'
Introduce an extension to the commit-graph to make it efficient to check for the paths that were modified at each commit using Bloom filters. * gs/commit-graph-path-filter: bloom: ignore renames when computing changed paths commit-graph: add GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS test flag t4216: add end to end tests for git log with Bloom filters revision.c: add trace2 stats around Bloom filter usage revision.c: use Bloom filters to speed up path based revision walks commit-graph: add --changed-paths option to write subcommand commit-graph: reuse existing Bloom filters during write commit-graph: write Bloom filters to commit graph file commit-graph: examine commits by generation number commit-graph: examine changed-path objects in pack order commit-graph: compute Bloom filters for changed paths diff: halt tree-diff early after max_changes bloom.c: core Bloom filter implementation for changed paths. bloom.c: introduce core Bloom filter constructs bloom.c: add the murmur3 hash implementation commit-graph: define and use MAX_NUM_CHUNKS
2 parents cf054f8 + caf388c commit 9b6606f

22 files changed

+1140
-11
lines changed

Documentation/git-commit-graph.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,11 @@ or `--stdin-packs`.)
5757
With the `--append` option, include all commits that are present in the
5858
existing commit-graph file.
5959
+
60+
With the `--changed-paths` option, compute and write information about the
61+
paths changed between a commit and its first parent. This operation can
62+
take a while on large repositories. It provides significant performance gains
63+
for getting history of a directory or a file with `git log -- <path>`.
64+
+
6065
With the `--split[=<strategy>]` option, write the commit-graph as a
6166
chain of multiple commit-graph files stored in
6267
`<dir>/info/commit-graphs`. Commit-graph layers are merged based on the

Documentation/technical/commit-graph-format.txt

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ metadata, including:
1717
- The parents of the commit, stored using positional references within
1818
the graph file.
1919

20+
- The Bloom filter of the commit carrying the paths that were changed between
21+
the commit and its first parent, if requested.
22+
2023
These positional references are stored as unsigned 32-bit integers
2124
corresponding to the array position within the list of commit OIDs. Due
2225
to some special constants we use to track parents, we can store at most
@@ -93,6 +96,33 @@ CHUNK DATA:
9396
positions for the parents until reaching a value with the most-significant
9497
bit on. The other bits correspond to the position of the last parent.
9598

99+
Bloom Filter Index (ID: {'B', 'I', 'D', 'X'}) (N * 4 bytes) [Optional]
100+
* The ith entry, BIDX[i], stores the number of bytes in all
101+
Bloom filters from commit 0 to commit i (inclusive) in lexicographic
102+
order. The Bloom filter for the i-th commit spans from BIDX[i-1] to
103+
BIDX[i] (plus header length), where BIDX[-1] is 0.
104+
* The BIDX chunk is ignored if the BDAT chunk is not present.
105+
106+
Bloom Filter Data (ID: {'B', 'D', 'A', 'T'}) [Optional]
107+
* It starts with a header consisting of three unsigned 32-bit integers:
108+
- Version of the hash algorithm being used. We currently only support
109+
value 1 which corresponds to the 32-bit version of the murmur3 hash
110+
implemented exactly as described in
111+
https://en.wikipedia.org/wiki/MurmurHash#Algorithm and the double
112+
hashing technique using seed values 0x293ae76f and 0x7e646e2c as
113+
described in https://doi.org/10.1007/978-3-540-30494-4_26 "Bloom Filters
114+
in Probabilistic Verification"
115+
- The number of times a path is hashed and hence the number of bit positions
116+
that cumulatively determine whether a file is present in the commit.
117+
- The minimum number of bits 'b' per entry in the Bloom filter. If the filter
118+
contains 'n' entries, then the filter size is the minimum number of 8-bit
119+
words that contain n*b bits.
120+
* The rest of the chunk is the concatenation of all the computed Bloom
121+
filters for the commits in lexicographic order.
122+
* Note: Commits with no changes or more than 512 changes have Bloom filters
123+
of length zero.
124+
* The BDAT chunk is present if and only if BIDX is present.
125+
96126
Base Graphs List (ID: {'B', 'A', 'S', 'E'}) [Optional]
97127
This list of H-byte hashes describe a set of B commit-graph files that
98128
form a commit-graph chain. The graph position for the ith commit in this

Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -689,6 +689,7 @@ X =
689689
PROGRAMS += $(patsubst %.o,git-%$X,$(PROGRAM_OBJS))
690690

691691
TEST_BUILTINS_OBJS += test-advise.o
692+
TEST_BUILTINS_OBJS += test-bloom.o
692693
TEST_BUILTINS_OBJS += test-chmtime.o
693694
TEST_BUILTINS_OBJS += test-config.o
694695
TEST_BUILTINS_OBJS += test-ctype.o
@@ -834,6 +835,7 @@ LIB_OBJS += base85.o
834835
LIB_OBJS += bisect.o
835836
LIB_OBJS += blame.o
836837
LIB_OBJS += blob.o
838+
LIB_OBJS += bloom.o
837839
LIB_OBJS += branch.o
838840
LIB_OBJS += bulk-checkin.o
839841
LIB_OBJS += bundle.o

bloom.c

Lines changed: 276 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,276 @@
1+
#include "git-compat-util.h"
2+
#include "bloom.h"
3+
#include "diff.h"
4+
#include "diffcore.h"
5+
#include "revision.h"
6+
#include "hashmap.h"
7+
#include "commit-graph.h"
8+
#include "commit.h"
9+
10+
/*
 * Declare the commit slab type 'bloom_filter_slab', which associates a
 * 'struct bloom_filter' with a commit (looked up with
 * bloom_filter_slab_at() in get_bloom_filter()).
 */
define_commit_slab(bloom_filter_slab, struct bloom_filter);
11+
12+
/*
 * The slab instance used by this file: set up by init_bloom_filters()
 * and consulted by get_bloom_filter().
 */
struct bloom_filter_slab bloom_filters;
13+
14+
/*
 * Entry of the temporary path set built in get_bloom_filter(): one
 * entry is allocated for each changed path and for each of its leading
 * directories.
 */
struct pathmap_hash_entry {
15+
/* Hashmap linkage; initialized with strhash() of the path. */
struct hashmap_entry entry;
16+
/* NUL-terminated path stored inline (allocated with FLEX_ALLOC_STR). */
const char path[FLEX_ARRAY];
17+
};
18+
19+
/*
 * Rotate the 32-bit 'value' to the left by 'count' bit positions.
 *
 * Only the low five bits of 'count' are used, so any count (including
 * 0, 32 and negative values) is reduced to the range 0..31 and neither
 * shift below is ever performed with an out-of-range amount.
 */
static uint32_t rotate_left(uint32_t value, int32_t count)
{
	const uint32_t width_mask = 8 * sizeof(uint32_t) - 1;
	const uint32_t left = (uint32_t)count & width_mask;
	/* Complementary shift; evaluates to 0 (not 32) when 'left' is 0. */
	const uint32_t right = (0u - left) & width_mask;

	return (value << left) | (value >> right);
}
25+
26+
static inline unsigned char get_bitmask(uint32_t pos)
27+
{
28+
return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1));
29+
}
30+
31+
static int load_bloom_filter_from_graph(struct commit_graph *g,
32+
struct bloom_filter *filter,
33+
struct commit *c)
34+
{
35+
uint32_t lex_pos, start_index, end_index;
36+
37+
while (c->graph_pos < g->num_commits_in_base)
38+
g = g->base_graph;
39+
40+
/* The commit graph commit 'c' lives in doesn't carry bloom filters. */
41+
if (!g->chunk_bloom_indexes)
42+
return 0;
43+
44+
lex_pos = c->graph_pos - g->num_commits_in_base;
45+
46+
end_index = get_be32(g->chunk_bloom_indexes + 4 * lex_pos);
47+
48+
if (lex_pos > 0)
49+
start_index = get_be32(g->chunk_bloom_indexes + 4 * (lex_pos - 1));
50+
else
51+
start_index = 0;
52+
53+
filter->len = end_index - start_index;
54+
filter->data = (unsigned char *)(g->chunk_bloom_data +
55+
sizeof(unsigned char) * start_index +
56+
BLOOMDATA_CHUNK_HEADER_SIZE);
57+
58+
return 1;
59+
}
60+
61+
/*
 * Calculate the murmur3 32-bit hash value for the given data
 * using the given seed.
 * Produces a uniformly distributed hash value.
 * Not considered to be cryptographically secure.
 * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
 *
 * 'data' points to 'len' bytes; it does not need to be NUL-terminated.
 *
 * The input is read as unsigned bytes. Reading it through plain 'char'
 * would sign-extend every byte >= 0x80 on platforms where 'char' is
 * signed, setting spurious high bits in the assembled word and making
 * the result differ from the reference algorithm (and between
 * platforms) for non-ASCII input.
 */
uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len)
{
	const uint32_t c1 = 0xcc9e2d51;
	const uint32_t c2 = 0x1b873593;
	const uint32_t r1 = 15;
	const uint32_t r2 = 13;
	const uint32_t m = 5;
	const uint32_t n = 0xe6546b64;
	const unsigned char *bytes = (const unsigned char *)data;
	const unsigned char *tail;
	/* Number of complete 4-byte blocks; size_t so long inputs are not truncated. */
	const size_t len4 = len / sizeof(uint32_t);
	size_t i;
	uint32_t k;
	uint32_t k1 = 0;

	/* Body: mix in each 4-byte block, assembled in little-endian order. */
	for (i = 0; i < len4; i++) {
		uint32_t byte1 = (uint32_t)bytes[4*i];
		uint32_t byte2 = ((uint32_t)bytes[4*i + 1]) << 8;
		uint32_t byte3 = ((uint32_t)bytes[4*i + 2]) << 16;
		uint32_t byte4 = ((uint32_t)bytes[4*i + 3]) << 24;
		k = byte1 | byte2 | byte3 | byte4;
		k *= c1;
		k = rotate_left(k, r1);
		k *= c2;

		seed ^= k;
		seed = rotate_left(seed, r2) * m + n;
	}

	/* Tail: the 0-3 bytes that do not fill a whole block. */
	tail = bytes + len4 * sizeof(uint32_t);

	switch (len & (sizeof(uint32_t) - 1)) {
	case 3:
		k1 ^= ((uint32_t)tail[2]) << 16;
		/* fallthrough */
	case 2:
		k1 ^= ((uint32_t)tail[1]) << 8;
		/* fallthrough */
	case 1:
		k1 ^= ((uint32_t)tail[0]) << 0;
		k1 *= c1;
		k1 = rotate_left(k1, r1);
		k1 *= c2;
		seed ^= k1;
		break;
	}

	/* Finalization: fold in the length and avalanche the bits. */
	seed ^= (uint32_t)len;
	seed ^= (seed >> 16);
	seed *= 0x85ebca6b;
	seed ^= (seed >> 13);
	seed *= 0xc2b2ae35;
	seed ^= (seed >> 16);

	return seed;
}
124+
125+
void fill_bloom_key(const char *data,
126+
size_t len,
127+
struct bloom_key *key,
128+
const struct bloom_filter_settings *settings)
129+
{
130+
int i;
131+
const uint32_t seed0 = 0x293ae76f;
132+
const uint32_t seed1 = 0x7e646e2c;
133+
const uint32_t hash0 = murmur3_seeded(seed0, data, len);
134+
const uint32_t hash1 = murmur3_seeded(seed1, data, len);
135+
136+
key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t));
137+
for (i = 0; i < settings->num_hashes; i++)
138+
key->hashes[i] = hash0 + i * hash1;
139+
}
140+
141+
void add_key_to_filter(const struct bloom_key *key,
142+
struct bloom_filter *filter,
143+
const struct bloom_filter_settings *settings)
144+
{
145+
int i;
146+
uint64_t mod = filter->len * BITS_PER_WORD;
147+
148+
for (i = 0; i < settings->num_hashes; i++) {
149+
uint64_t hash_mod = key->hashes[i] % mod;
150+
uint64_t block_pos = hash_mod / BITS_PER_WORD;
151+
152+
filter->data[block_pos] |= get_bitmask(hash_mod);
153+
}
154+
}
155+
156+
/*
 * Initialize the global 'bloom_filters' slab.
 *
 * get_bloom_filter() returns NULL for as long as bloom_filters.slab_size
 * is 0, so this presumably has to run before any filter can be obtained
 * (assumes init_bloom_filter_slab() sets a non-zero slab_size; confirm
 * against the commit-slab implementation).
 */
void init_bloom_filters(void)
157+
{
158+
init_bloom_filter_slab(&bloom_filters);
159+
}
160+
161+
struct bloom_filter *get_bloom_filter(struct repository *r,
162+
struct commit *c,
163+
int compute_if_not_present)
164+
{
165+
struct bloom_filter *filter;
166+
struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
167+
int i;
168+
struct diff_options diffopt;
169+
int max_changes = 512;
170+
171+
if (bloom_filters.slab_size == 0)
172+
return NULL;
173+
174+
filter = bloom_filter_slab_at(&bloom_filters, c);
175+
176+
if (!filter->data) {
177+
load_commit_graph_info(r, c);
178+
if (c->graph_pos != COMMIT_NOT_FROM_GRAPH &&
179+
r->objects->commit_graph->chunk_bloom_indexes) {
180+
if (load_bloom_filter_from_graph(r->objects->commit_graph, filter, c))
181+
return filter;
182+
else
183+
return NULL;
184+
}
185+
}
186+
187+
if (filter->data || !compute_if_not_present)
188+
return filter;
189+
190+
repo_diff_setup(r, &diffopt);
191+
diffopt.flags.recursive = 1;
192+
diffopt.detect_rename = 0;
193+
diffopt.max_changes = max_changes;
194+
diff_setup_done(&diffopt);
195+
196+
if (c->parents)
197+
diff_tree_oid(&c->parents->item->object.oid, &c->object.oid, "", &diffopt);
198+
else
199+
diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
200+
diffcore_std(&diffopt);
201+
202+
if (diff_queued_diff.nr <= max_changes) {
203+
struct hashmap pathmap;
204+
struct pathmap_hash_entry *e;
205+
struct hashmap_iter iter;
206+
hashmap_init(&pathmap, NULL, NULL, 0);
207+
208+
for (i = 0; i < diff_queued_diff.nr; i++) {
209+
const char *path = diff_queued_diff.queue[i]->two->path;
210+
211+
/*
212+
* Add each leading directory of the changed file, i.e. for
213+
* 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so
214+
* the Bloom filter could be used to speed up commands like
215+
* 'git log dir/subdir', too.
216+
*
217+
* Note that directories are added without the trailing '/'.
218+
*/
219+
do {
220+
char *last_slash = strrchr(path, '/');
221+
222+
FLEX_ALLOC_STR(e, path, path);
223+
hashmap_entry_init(&e->entry, strhash(path));
224+
hashmap_add(&pathmap, &e->entry);
225+
226+
if (!last_slash)
227+
last_slash = (char*)path;
228+
*last_slash = '\0';
229+
230+
} while (*path);
231+
232+
diff_free_filepair(diff_queued_diff.queue[i]);
233+
}
234+
235+
filter->len = (hashmap_get_size(&pathmap) * settings.bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
236+
filter->data = xcalloc(filter->len, sizeof(unsigned char));
237+
238+
hashmap_for_each_entry(&pathmap, &iter, e, entry) {
239+
struct bloom_key key;
240+
fill_bloom_key(e->path, strlen(e->path), &key, &settings);
241+
add_key_to_filter(&key, filter, &settings);
242+
}
243+
244+
hashmap_free_entries(&pathmap, struct pathmap_hash_entry, entry);
245+
} else {
246+
for (i = 0; i < diff_queued_diff.nr; i++)
247+
diff_free_filepair(diff_queued_diff.queue[i]);
248+
filter->data = NULL;
249+
filter->len = 0;
250+
}
251+
252+
free(diff_queued_diff.queue);
253+
DIFF_QUEUE_CLEAR(&diff_queued_diff);
254+
255+
return filter;
256+
}
257+
258+
int bloom_filter_contains(const struct bloom_filter *filter,
259+
const struct bloom_key *key,
260+
const struct bloom_filter_settings *settings)
261+
{
262+
int i;
263+
uint64_t mod = filter->len * BITS_PER_WORD;
264+
265+
if (!mod)
266+
return -1;
267+
268+
for (i = 0; i < settings->num_hashes; i++) {
269+
uint64_t hash_mod = key->hashes[i] % mod;
270+
uint64_t block_pos = hash_mod / BITS_PER_WORD;
271+
if (!(filter->data[block_pos] & get_bitmask(hash_mod)))
272+
return 0;
273+
}
274+
275+
return 1;
276+
}

0 commit comments

Comments
 (0)