Skip to content

Commit c0f4c5e

Browse files
committed
Merge branch 'gs/commit-graph-path-filter' into pu
Introduce an extension to the commit-graph to make it efficient to check for the paths that were modified at each commit using Bloom filters. * gs/commit-graph-path-filter: (bytesex breakage band-aid) commit-graph: add GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS test flag revision.c: use Bloom filters to speed up path based revision walks commit-graph: add --changed-paths option to write subcommand commit-graph: reuse existing Bloom filters during write. commit-graph: write Bloom filters to commit graph file commit-graph: examine commits by generation number commit-graph: examine changed-path objects in pack order commit-graph: compute Bloom filters for changed paths diff: halt tree-diff early after max_changes bloom: core Bloom filter implementation for changed paths commit-graph: use MAX_NUM_CHUNKS
2 parents 628f64e + 80214a4 commit c0f4c5e

22 files changed

+1088
-7
lines changed

Documentation/git-commit-graph.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,11 @@ conducted, and the remaining options are ignored. Conversely, if
7676
remaining options are ignored. A bare `--split` defers to the remaining
7777
options.
7878
+
79+
With the `--changed-paths` option, compute and write information about the
80+
paths changed between a commit and its first parent. This operation can
81+
take a while on large repositories. It provides significant performance gains
82+
for getting history of a directory or a file with `git log -- <path>`.
83+
+
7984
* If `--size-multiple=<X>` is not specified, let `X` equal 2. If the new
8085
tip file would have `N` commits and the previous tip has `M` commits and
8186
`X` times `N` is greater than `M`, instead merge the two files into a

Documentation/technical/commit-graph-format.txt

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ metadata, including:
1717
- The parents of the commit, stored using positional references within
1818
the graph file.
1919

20+
- The Bloom filter of the commit carrying the paths that were changed between
21+
the commit and its first parent.
22+
2023
These positional references are stored as unsigned 32-bit integers
2124
corresponding to the array position within the list of commit OIDs. Due
2225
to some special constants we use to track parents, we can store at most
@@ -93,6 +96,27 @@ CHUNK DATA:
9396
positions for the parents until reaching a value with the most-significant
9497
bit on. The other bits correspond to the position of the last parent.
9598

99+
Bloom Filter Index (ID: {'B', 'I', 'D', 'X'}) (N * 4 bytes) [Optional]
100+
* The ith entry, BIDX[i], stores the number of 8-byte word blocks in all
101+
Bloom filters from commit 0 to commit i (inclusive) in lexicographic
102+
order. The Bloom filter for the i-th commit spans from BIDX[i-1] to
103+
BIDX[i] (plus header length), where BIDX[-1] is 0.
104+
* The BIDX chunk is ignored if the BDAT chunk is not present.
105+
106+
Bloom Filter Data (ID: {'B', 'D', 'A', 'T'}) [Optional]
107+
* It starts with a header consisting of three unsigned 32-bit integers:
108+
- Version of the hash algorithm being used. We currently only support
109+
value 1 which implies the murmur3 hash implemented exactly as described
110+
in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
111+
- The number of times a path is hashed and hence the number of bit positions
112+
that cumulatively determine whether a file is present in the commit.
113+
- The minimum number of bits 'b' per entry in the Bloom filter. If the filter
114+
contains 'n' entries, then the filter size is the minimum number of 64-bit
115+
words that contain n*b bits.
116+
* The rest of the chunk is the concatenation of all the computed Bloom
117+
filters for the commits in lexicographic order.
118+
* The BDAT chunk is present iff BIDX is present.
119+
96120
Base Graphs List (ID: {'B', 'A', 'S', 'E'}) [Optional]
97121
This list of H-byte hashes describes a set of B commit-graph files that
98122
form a commit-graph chain. The graph position for the ith commit in this

Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -696,6 +696,7 @@ X =
696696

697697
PROGRAMS += $(patsubst %.o,git-%$X,$(PROGRAM_OBJS))
698698

699+
TEST_BUILTINS_OBJS += test-bloom.o
699700
TEST_BUILTINS_OBJS += test-chmtime.o
700701
TEST_BUILTINS_OBJS += test-config.o
701702
TEST_BUILTINS_OBJS += test-ctype.o
@@ -844,6 +845,7 @@ LIB_OBJS += base85.o
844845
LIB_OBJS += bisect.o
845846
LIB_OBJS += blame.o
846847
LIB_OBJS += blob.o
848+
LIB_OBJS += bloom.o
847849
LIB_OBJS += branch.o
848850
LIB_OBJS += bulk-checkin.o
849851
LIB_OBJS += bundle.o

bloom.c

Lines changed: 277 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,277 @@
1+
#include "git-compat-util.h"
2+
#include "bloom.h"
3+
#include "commit.h"
4+
#include "commit-slab.h"
5+
#include "commit-graph.h"
6+
#include "object-store.h"
7+
#include "diff.h"
8+
#include "diffcore.h"
9+
#include "revision.h"
10+
#include "hashmap.h"
11+
12+
/*
 * Declare the commit-slab type `bloom_filter_slab`, whose per-commit payload
 * is a `struct bloom_filter` (accessed below via bloom_filter_slab_at()).
 */
define_commit_slab(bloom_filter_slab, struct bloom_filter);
13+
14+
/*
 * The slab instance used by this file: initialized in load_bloom_filters()
 * and read/populated in get_bloom_filter().
 */
struct bloom_filter_slab bloom_filters;
15+
16+
/*
 * Entry of the temporary hashmap that get_bloom_filter() builds while
 * collecting the changed paths (and their leading directories) of a commit.
 */
struct pathmap_hash_entry {
17+
/* Hashmap linkage; initialized with strhash() of the path. */
struct hashmap_entry entry;
18+
/* NUL-terminated path, allocated inline with FLEX_ALLOC_STR(). */
const char path[FLEX_ARRAY];
19+
};
20+
21+
/*
 * Rotate the 32-bit `value` left by `count` bits.
 *
 * The count is reduced modulo 32 and the complementary shift is masked the
 * same way, so neither shift is ever by the full width of the type.
 */
static uint32_t rotate_left(uint32_t value, int32_t count)
{
	uint32_t mask = 8 * sizeof(uint32_t) - 1;
	count &= mask;
	return ((value << count) | (value >> ((-count) & mask)));
}

/*
 * Calculate a hash value for the given data using the given seed.
 * Produces a uniformly distributed hash value.
 * Not considered to be cryptographically secure.
 * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
 *
 * seed: initial hash state
 * data: bytes to hash (need not be NUL-terminated)
 * len:  number of bytes in `data`
 *
 * Returns the 32-bit MurmurHash3 (x86_32 variant) of `data`.
 *
 * Two details matter for matching the reference algorithm:
 *  - the mixing steps rotate LEFT, not right;
 *  - input bytes are read as unsigned, so bytes >= 0x80 are not
 *    sign-extended into the neighbouring lanes of the 32-bit block.
 */
static uint32_t seed_murmur3(uint32_t seed, const char *data, int len)
{
	const uint32_t c1 = 0xcc9e2d51;
	const uint32_t c2 = 0x1b873593;
	const uint32_t r1 = 15;
	const uint32_t r2 = 13;
	const uint32_t m = 5;
	const uint32_t n = 0xe6546b64;
	/* Plain char may be signed; always look at the raw byte values. */
	const unsigned char *bytes = (const unsigned char *)data;
	const unsigned char *tail;
	uint32_t k1 = 0;
	int i;

	int len4 = len / sizeof(uint32_t);

	/* Body: consume the input four bytes at a time, little-endian. */
	for (i = 0; i < len4; i++) {
		uint32_t byte1 = (uint32_t)bytes[4*i];
		uint32_t byte2 = ((uint32_t)bytes[4*i + 1]) << 8;
		uint32_t byte3 = ((uint32_t)bytes[4*i + 2]) << 16;
		uint32_t byte4 = ((uint32_t)bytes[4*i + 3]) << 24;
		uint32_t k = byte1 | byte2 | byte3 | byte4;

		k *= c1;
		k = rotate_left(k, r1);
		k *= c2;

		seed ^= k;
		seed = rotate_left(seed, r2) * m + n;
	}

	/* Tail: mix in the remaining 1-3 bytes, if any. */
	tail = bytes + len4 * sizeof(uint32_t);

	switch (len & (sizeof(uint32_t) - 1)) {
	case 3:
		k1 ^= ((uint32_t)tail[2]) << 16;
		/*-fallthrough*/
	case 2:
		k1 ^= ((uint32_t)tail[1]) << 8;
		/*-fallthrough*/
	case 1:
		k1 ^= ((uint32_t)tail[0]) << 0;
		k1 *= c1;
		k1 = rotate_left(k1, r1);
		k1 *= c2;
		seed ^= k1;
		break;
	}

	/* Finalization: fold in the length and avalanche the bits. */
	seed ^= (uint32_t)len;
	seed ^= (seed >> 16);
	seed *= 0x85ebca6b;
	seed ^= (seed >> 13);
	seed *= 0xc2b2ae35;
	seed ^= (seed >> 16);

	return seed;
}
90+
91+
static inline unsigned char get_bitmask(uint32_t pos)
92+
{
93+
return ((unsigned char)1) << (pos & (BITS_PER_WORD - 1));
94+
}
95+
96+
void load_bloom_filters(void)
97+
{
98+
init_bloom_filter_slab(&bloom_filters);
99+
}
100+
101+
void fill_bloom_key(const char *data,
102+
int len,
103+
struct bloom_key *key,
104+
struct bloom_filter_settings *settings)
105+
{
106+
int i;
107+
const uint32_t seed0 = 0x293ae76f;
108+
const uint32_t seed1 = 0x7e646e2c;
109+
const uint32_t hash0 = seed_murmur3(seed0, data, len);
110+
const uint32_t hash1 = seed_murmur3(seed1, data, len);
111+
112+
key->hashes = (uint32_t *)xcalloc(settings->num_hashes, sizeof(uint32_t));
113+
for (i = 0; i < settings->num_hashes; i++)
114+
key->hashes[i] = hash0 + i * hash1;
115+
}
116+
117+
void add_key_to_filter(struct bloom_key *key,
118+
struct bloom_filter *filter,
119+
struct bloom_filter_settings *settings)
120+
{
121+
int i;
122+
uint64_t mod = filter->len * BITS_PER_WORD;
123+
124+
for (i = 0; i < settings->num_hashes; i++) {
125+
uint64_t hash_mod = key->hashes[i] % mod;
126+
uint64_t block_pos = hash_mod / BITS_PER_WORD;
127+
128+
filter->data[block_pos] |= get_bitmask(hash_mod);
129+
}
130+
}
131+
132+
static int load_bloom_filter_from_graph(struct commit_graph *g,
133+
struct bloom_filter *filter,
134+
struct commit *c)
135+
{
136+
uint32_t lex_pos, start_index, end_index;
137+
138+
while (c->graph_pos < g->num_commits_in_base)
139+
g = g->base_graph;
140+
141+
/* The commit graph commit 'c' lives in doesn't carry bloom filters. */
142+
if (!g->chunk_bloom_indexes)
143+
return 0;
144+
145+
lex_pos = c->graph_pos - g->num_commits_in_base;
146+
147+
end_index = get_be32(g->chunk_bloom_indexes + 4 * lex_pos);
148+
149+
if (lex_pos)
150+
start_index = get_be32(g->chunk_bloom_indexes + 4 * (lex_pos - 1));
151+
else
152+
start_index = 0;
153+
154+
filter->len = end_index - start_index;
155+
filter->data = (unsigned char *)(g->chunk_bloom_data +
156+
sizeof(unsigned char) * start_index +
157+
BLOOMDATA_CHUNK_HEADER_SIZE);
158+
159+
return 1;
160+
}
161+
162+
struct bloom_filter *get_bloom_filter(struct repository *r,
163+
struct commit *c,
164+
int compute_if_not_present)
165+
{
166+
struct bloom_filter *filter;
167+
struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
168+
int i;
169+
struct diff_options diffopt;
170+
int max_changes = 512;
171+
172+
if (!bloom_filters.slab_size)
173+
return NULL;
174+
175+
filter = bloom_filter_slab_at(&bloom_filters, c);
176+
177+
if (!filter->data) {
178+
load_commit_graph_info(r, c);
179+
if (c->graph_pos != COMMIT_NOT_FROM_GRAPH &&
180+
r->objects->commit_graph->chunk_bloom_indexes) {
181+
if (load_bloom_filter_from_graph(r->objects->commit_graph, filter, c))
182+
return filter;
183+
else
184+
return NULL;
185+
}
186+
}
187+
188+
if (filter->data || !compute_if_not_present)
189+
return filter;
190+
191+
repo_diff_setup(r, &diffopt);
192+
diffopt.flags.recursive = 1;
193+
diffopt.max_changes = max_changes;
194+
diff_setup_done(&diffopt);
195+
196+
if (c->parents)
197+
diff_tree_oid(&c->parents->item->object.oid, &c->object.oid, "", &diffopt);
198+
else
199+
diff_tree_oid(NULL, &c->object.oid, "", &diffopt);
200+
diffcore_std(&diffopt);
201+
202+
if (diff_queued_diff.nr <= max_changes) {
203+
struct hashmap pathmap;
204+
struct pathmap_hash_entry* e;
205+
struct hashmap_iter iter;
206+
hashmap_init(&pathmap, NULL, NULL, 0);
207+
208+
for (i = 0; i < diff_queued_diff.nr; i++) {
209+
const char* path = diff_queued_diff.queue[i]->two->path;
210+
const char* p = path;
211+
212+
/*
213+
* Add each leading directory of the changed file, i.e. for
214+
* 'dir/subdir/file' add 'dir' and 'dir/subdir' as well, so
215+
* the Bloom filter could be used to speed up commands like
216+
* 'git log dir/subdir', too.
217+
*
218+
* Note that directories are added without the trailing '/'.
219+
*/
220+
do {
221+
char* last_slash = strrchr(p, '/');
222+
223+
FLEX_ALLOC_STR(e, path, path);
224+
hashmap_entry_init(&e->entry, strhash(p));
225+
hashmap_add(&pathmap, &e->entry);
226+
227+
if (!last_slash)
228+
last_slash = (char*)p;
229+
*last_slash = '\0';
230+
231+
} while (*p);
232+
233+
diff_free_filepair(diff_queued_diff.queue[i]);
234+
}
235+
236+
filter->len = (hashmap_get_size(&pathmap) * settings.bits_per_entry + BITS_PER_WORD - 1) / BITS_PER_WORD;
237+
filter->data = xcalloc(filter->len, sizeof(unsigned char));
238+
239+
hashmap_for_each_entry(&pathmap, &iter, e, entry) {
240+
struct bloom_key key;
241+
fill_bloom_key(e->path, strlen(e->path), &key, &settings);
242+
add_key_to_filter(&key, filter, &settings);
243+
}
244+
245+
hashmap_free_entries(&pathmap, struct pathmap_hash_entry, entry);
246+
} else {
247+
for (i = 0; i < diff_queued_diff.nr; i++)
248+
diff_free_filepair(diff_queued_diff.queue[i]);
249+
filter->data = NULL;
250+
filter->len = 0;
251+
}
252+
253+
free(diff_queued_diff.queue);
254+
DIFF_QUEUE_CLEAR(&diff_queued_diff);
255+
256+
return filter;
257+
}
258+
259+
int bloom_filter_contains(struct bloom_filter *filter,
260+
struct bloom_key *key,
261+
struct bloom_filter_settings *settings)
262+
{
263+
int i;
264+
uint64_t mod = filter->len * BITS_PER_WORD;
265+
266+
if (!mod)
267+
return -1;
268+
269+
for (i = 0; i < settings->num_hashes; i++) {
270+
uint64_t hash_mod = key->hashes[i] % mod;
271+
uint64_t block_pos = hash_mod / BITS_PER_WORD;
272+
if (!(filter->data[block_pos] & get_bitmask(hash_mod)))
273+
return 0;
274+
}
275+
276+
return 1;
277+
}

0 commit comments

Comments
 (0)