Skip to content

Commit ba2336a

Browse files
dschoGit for Windows Build Agent
authored and
Git for Windows Build Agent
committed
pack-objects: create new name-hash algorithm (#5157)
This is an updated version of gitgitgadget#1785, intended for early consumption into Git for Windows. The idea here is to add a new `--full-name-hash` option to `git pack-objects` and `git repack`. This adjusts the name-hash value used for finding delta bases in such a way that uses the full path name with a lower likelihood of collisions than the default name-hash algorithm. In many repositories with name-hash collisions and many versions of those paths, this can significantly reduce the size of a full repack. It can also help in certain cases of `git push`, but only if the pack is already artificially inflated by name-hash collisions; cases that find "sibling" deltas as better choices become worse with `--full-name-hash`. Thus, this option is currently recommended for full repacks of large repos, and on client machines without reachability bitmaps. Some care is taken to ignore this option when using bitmaps, either writing bitmaps or using a bitmap walk during reads. The bitmap file format contains name-hash values, but no way to indicate which function is used, so compatibility is a concern for bitmaps. Future work could explore this idea. After this PR is merged, then the more-involved `--path-walk` option may be considered.
2 parents 7a4e45c + 6b79aac commit ba2336a

21 files changed

+310
-13
lines changed

Documentation/git-pack-objects.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ SYNOPSIS
1515
[--revs [--unpacked | --all]] [--keep-pack=<pack-name>]
1616
[--cruft] [--cruft-expiration=<time>]
1717
[--stdout [--filter=<filter-spec>] | <base-name>]
18-
[--shallow] [--keep-true-parents] [--[no-]sparse] < <object-list>
18+
[--shallow] [--keep-true-parents] [--[no-]sparse]
19+
[--full-name-hash] < <object-list>
1920

2021

2122
DESCRIPTION

Documentation/git-repack.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ git-repack - Pack unpacked objects in a repository
99
SYNOPSIS
1010
--------
1111
[verse]
12-
'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m] [--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>] [--write-midx]
12+
'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m]
13+
[--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>]
14+
[--write-midx] [--full-name-hash]
1315

1416
DESCRIPTION
1517
-----------

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -814,6 +814,7 @@ TEST_BUILTINS_OBJS += test-match-trees.o
814814
TEST_BUILTINS_OBJS += test-mergesort.o
815815
TEST_BUILTINS_OBJS += test-mktemp.o
816816
TEST_BUILTINS_OBJS += test-oid-array.o
817+
TEST_BUILTINS_OBJS += test-name-hash.o
817818
TEST_BUILTINS_OBJS += test-online-cpus.o
818819
TEST_BUILTINS_OBJS += test-pack-mtimes.o
819820
TEST_BUILTINS_OBJS += test-parse-options.o

builtin/pack-objects.c

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,14 @@ struct configured_exclusion {
266266
static struct oidmap configured_exclusions;
267267

268268
static struct oidset excluded_by_config;
269+
static int use_full_name_hash = -1;
270+
271+
static inline uint32_t pack_name_hash_fn(const char *name)
272+
{
273+
if (use_full_name_hash)
274+
return pack_full_name_hash(name);
275+
return pack_name_hash(name);
276+
}
269277

270278
/*
271279
* stats
@@ -1670,7 +1678,7 @@ static int add_object_entry(const struct object_id *oid, enum object_type type,
16701678
return 0;
16711679
}
16721680

1673-
create_object_entry(oid, type, pack_name_hash(name),
1681+
create_object_entry(oid, type, pack_name_hash_fn(name),
16741682
exclude, name && no_try_delta(name),
16751683
found_pack, found_offset);
16761684
return 1;
@@ -1884,7 +1892,7 @@ static void add_preferred_base_object(const char *name)
18841892
{
18851893
struct pbase_tree *it;
18861894
size_t cmplen;
1887-
unsigned hash = pack_name_hash(name);
1895+
unsigned hash = pack_name_hash_fn(name);
18881896

18891897
if (!num_preferred_base || check_pbase_path(hash))
18901898
return;
@@ -3394,7 +3402,7 @@ static void show_object_pack_hint(struct object *object, const char *name,
33943402
* here using a now in order to perhaps improve the delta selection
33953403
* process.
33963404
*/
3397-
oe->hash = pack_name_hash(name);
3405+
oe->hash = pack_name_hash_fn(name);
33983406
oe->no_try_delta = name && no_try_delta(name);
33993407

34003408
stdin_packs_hints_nr++;
@@ -3544,7 +3552,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type
35443552
entry = packlist_find(&to_pack, oid);
35453553
if (entry) {
35463554
if (name) {
3547-
entry->hash = pack_name_hash(name);
3555+
entry->hash = pack_name_hash_fn(name);
35483556
entry->no_try_delta = no_try_delta(name);
35493557
}
35503558
} else {
@@ -3567,7 +3575,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type
35673575
return;
35683576
}
35693577

3570-
entry = create_object_entry(oid, type, pack_name_hash(name),
3578+
entry = create_object_entry(oid, type, pack_name_hash_fn(name),
35713579
0, name && no_try_delta(name),
35723580
pack, offset);
35733581
}
@@ -4397,6 +4405,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
43974405
OPT_STRING_LIST(0, "uri-protocol", &uri_protocols,
43984406
N_("protocol"),
43994407
N_("exclude any configured uploadpack.blobpackfileuri with this protocol")),
4408+
OPT_BOOL(0, "full-name-hash", &use_full_name_hash,
4409+
N_("(EXPERIMENTAL!) optimize delta compression across identical path names over time")),
44004410
OPT_END(),
44014411
};
44024412

@@ -4544,6 +4554,11 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
45444554
if (pack_to_stdout || !rev_list_all)
45454555
write_bitmap_index = 0;
45464556

4557+
if (write_bitmap_index && use_full_name_hash > 0)
4558+
die(_("currently, the --full-name-hash option is incompatible with --write-bitmap-index"));
4559+
if (use_full_name_hash < 0)
4560+
use_full_name_hash = git_env_bool("GIT_TEST_FULL_NAME_HASH", 0);
4561+
45474562
if (use_delta_islands)
45484563
strvec_push(&rp, "--topo-order");
45494564

builtin/repack.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,9 @@ static int run_update_server_info = 1;
3838
static char *packdir, *packtmp_name, *packtmp;
3939

4040
static const char *const git_repack_usage[] = {
41-
N_("git repack [<options>]"),
41+
N_("git repack [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m]\n"
42+
"[--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>]\n"
43+
"[--write-midx] [--full-name-hash]"),
4244
NULL
4345
};
4446

@@ -57,6 +59,7 @@ struct pack_objects_args {
5759
int no_reuse_object;
5860
int quiet;
5961
int local;
62+
int full_name_hash;
6063
struct list_objects_filter_options filter_options;
6164
};
6265

@@ -288,6 +291,8 @@ static void prepare_pack_objects(struct child_process *cmd,
288291
strvec_pushf(&cmd->args, "--no-reuse-delta");
289292
if (args->no_reuse_object)
290293
strvec_pushf(&cmd->args, "--no-reuse-object");
294+
if (args->full_name_hash)
295+
strvec_pushf(&cmd->args, "--full-name-hash");
291296
if (args->local)
292297
strvec_push(&cmd->args, "--local");
293298
if (args->quiet)
@@ -1157,6 +1162,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
11571162
N_("pass --no-reuse-delta to git-pack-objects")),
11581163
OPT_BOOL('F', NULL, &po_args.no_reuse_object,
11591164
N_("pass --no-reuse-object to git-pack-objects")),
1165+
OPT_BOOL(0, "full-name-hash", &po_args.full_name_hash,
1166+
N_("(EXPERIMENTAL!) pass --full-name-hash to git-pack-objects")),
11601167
OPT_NEGBIT('n', NULL, &run_update_server_info,
11611168
N_("do not run git-update-server-info"), 1),
11621169
OPT__QUIET(&po_args.quiet, N_("be quiet")),

ci/run-build-and-tests.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ linux-TEST-vars)
2525
export GIT_TEST_NO_WRITE_REV_INDEX=1
2626
export GIT_TEST_CHECKOUT_WORKERS=2
2727
export GIT_TEST_PACK_USE_BITMAP_BOUNDARY_TRAVERSAL=1
28+
export GIT_TEST_FULL_NAME_HASH=1
2829
;;
2930
linux-clang)
3031
export GIT_TEST_DEFAULT_HASH=sha1

pack-objects.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,27 @@ static inline uint32_t pack_name_hash(const char *name)
207207
return hash;
208208
}
209209

210+
static inline uint32_t pack_full_name_hash(const char *name)
211+
{
212+
const uint32_t bigp = 1234572167U;
213+
uint32_t c, hash = bigp;
214+
215+
if (!name)
216+
return 0;
217+
218+
/*
219+
* Do the simplest thing that will resemble pseudo-randomness: add
220+
* random multiples of a large prime number with a binary shift.
221+
* The goal is not to be cryptographic, but to be generally
222+
* uniformly distributed.
223+
*/
224+
while ((c = *name++) != 0) {
225+
hash += c * bigp;
226+
hash = (hash >> 5) | (hash << 27);
227+
}
228+
return hash;
229+
}
230+
210231
static inline enum object_type oe_type(const struct object_entry *e)
211232
{
212233
return e->type_valid ? e->type_ : OBJ_BAD;

t/README

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -488,6 +488,10 @@ a test and then fails then the whole test run will abort. This can help to make
488488
sure the expected tests are executed and not silently skipped when their
489489
dependency breaks or is simply not present in a new environment.
490490

491+
GIT_TEST_FULL_NAME_HASH=<boolean>, when true, sets the default name-hash
492+
function in 'git pack-objects' to be the one used by the --full-name-hash
493+
option.
494+
491495
Naming Tests
492496
------------
493497

t/helper/test-name-hash.c

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
/*
2+
* test-name-hash.c: Read a list of paths over stdin and report on their
3+
* name-hash and full name-hash.
4+
*/
5+
6+
#include "test-tool.h"
7+
#include "git-compat-util.h"
8+
#include "pack-objects.h"
9+
#include "strbuf.h"
10+
11+
int cmd__name_hash(int argc UNUSED, const char **argv UNUSED)
12+
{
13+
struct strbuf line = STRBUF_INIT;
14+
15+
while (!strbuf_getline(&line, stdin)) {
16+
uint32_t name_hash = pack_name_hash(line.buf);
17+
uint32_t full_hash = pack_full_name_hash(line.buf);
18+
19+
printf("%10"PRIu32"\t%10"PRIu32"\t%s\n", name_hash, full_hash, line.buf);
20+
}
21+
22+
strbuf_release(&line);
23+
return 0;
24+
}

t/helper/test-tool.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ static struct test_cmd cmds[] = {
4343
{ "match-trees", cmd__match_trees },
4444
{ "mergesort", cmd__mergesort },
4545
{ "mktemp", cmd__mktemp },
46+
{ "name-hash", cmd__name_hash },
4647
{ "oid-array", cmd__oid_array },
4748
{ "online-cpus", cmd__online_cpus },
4849
{ "pack-mtimes", cmd__pack_mtimes },

t/helper/test-tool.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ int cmd__lazy_init_name_hash(int argc, const char **argv);
3737
int cmd__match_trees(int argc, const char **argv);
3838
int cmd__mergesort(int argc, const char **argv);
3939
int cmd__mktemp(int argc, const char **argv);
40+
int cmd__name_hash(int argc, const char **argv);
4041
int cmd__online_cpus(int argc, const char **argv);
4142
int cmd__pack_mtimes(int argc, const char **argv);
4243
int cmd__parse_options(int argc, const char **argv);

t/perf/p5313-pack-objects.sh

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
#!/bin/sh
2+
3+
test_description='Tests pack performance using bitmaps'
4+
. ./perf-lib.sh
5+
6+
GIT_TEST_PASSING_SANITIZE_LEAK=0
7+
export GIT_TEST_PASSING_SANITIZE_LEAK
8+
9+
test_perf_large_repo
10+
11+
test_expect_success 'create rev input' '
12+
cat >in-thin <<-EOF &&
13+
$(git rev-parse HEAD)
14+
^$(git rev-parse HEAD~1)
15+
EOF
16+
17+
cat >in-big <<-EOF
18+
$(git rev-parse HEAD)
19+
^$(git rev-parse HEAD~1000)
20+
EOF
21+
'
22+
23+
test_perf 'thin pack' '
24+
git pack-objects --thin --stdout --revs --sparse <in-thin >out
25+
'
26+
27+
test_size 'thin pack size' '
28+
test_file_size out
29+
'
30+
31+
test_perf 'thin pack with --full-name-hash' '
32+
git pack-objects --thin --stdout --revs --sparse --full-name-hash <in-thin >out
33+
'
34+
35+
test_size 'thin pack size with --full-name-hash' '
36+
test_file_size out
37+
'
38+
39+
test_perf 'big pack' '
40+
git pack-objects --stdout --revs --sparse <in-big >out
41+
'
42+
43+
test_size 'big pack size' '
44+
test_file_size out
45+
'
46+
47+
test_perf 'big pack with --full-name-hash' '
48+
git pack-objects --stdout --revs --sparse --full-name-hash <in-big >out
49+
'
50+
51+
test_size 'big pack size with --full-name-hash' '
52+
test_file_size out
53+
'
54+
55+
test_perf 'repack' '
56+
git repack -adf
57+
'
58+
59+
test_size 'repack size' '
60+
pack=$(ls .git/objects/pack/pack-*.pack) &&
61+
test_file_size "$pack"
62+
'
63+
64+
test_perf 'repack with --full-name-hash' '
65+
git repack -adf --full-name-hash
66+
'
67+
68+
test_size 'repack size with --full-name-hash' '
69+
pack=$(ls .git/objects/pack/pack-*.pack) &&
70+
test_file_size "$pack"
71+
'
72+
73+
test_done

t/perf/p5314-name-hash.sh

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#!/bin/sh
2+
3+
test_description='Tests pack performance using bitmaps'
4+
. ./perf-lib.sh
5+
6+
GIT_TEST_PASSING_SANITIZE_LEAK=0
7+
export GIT_TEST_PASSING_SANITIZE_LEAK
8+
9+
test_perf_large_repo
10+
11+
test_size 'paths at head' '
12+
git ls-tree -r --name-only HEAD >path-list &&
13+
wc -l <path-list
14+
'
15+
16+
test_size 'number of distinct name-hashes' '
17+
cat path-list | test-tool name-hash >name-hashes &&
18+
cat name-hashes | awk "{ print \$1; }" | sort -n | uniq -c >name-hash-count &&
19+
wc -l <name-hash-count
20+
'
21+
22+
test_size 'number of distinct full-name-hashes' '
23+
cat name-hashes | awk "{ print \$2; }" | sort -n | uniq -c >full-name-hash-count &&
24+
wc -l <full-name-hash-count
25+
'
26+
27+
test_size 'maximum multiplicity of name-hashes' '
28+
cat name-hash-count | \
29+
sort -nr | \
30+
head -n 1 | \
31+
awk "{ print \$1; }"
32+
'
33+
34+
test_size 'maximum multiplicity of fullname-hashes' '
35+
cat full-name-hash-count | \
36+
sort -nr | \
37+
head -n 1 | \
38+
awk "{ print \$1; }"
39+
'
40+
41+
test_done

t/t0450/txt-help-mismatches

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@ rebase
4545
remote
4646
remote-ext
4747
remote-fd
48-
repack
4948
reset
5049
restore
5150
rev-parse

t/t5300-pack-object.sh

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -674,4 +674,19 @@ do
674674
'
675675
done
676676

677+
# The following test is not necessarily a permanent choice, but since we do not
678+
# have a "name hash version" bit in the .bitmap file format, we cannot write the
679+
# full-name hash values into the .bitmap file without risking breakage later.
680+
#
681+
# TODO: Make these compatible in the future and replace this test with the
682+
# expected behavior when both are specified.
683+
test_expect_success '--full-name-hash and --write-bitmap-index are incompatible' '
684+
test_must_fail git pack-objects base --all \
685+
--full-name-hash --write-bitmap-index 2>err &&
686+
grep incompatible err &&
687+
688+
# --stdout option silently removes --write-bitmap-index
689+
git pack-objects --stdout --all --full-name-hash --write-bitmap-index >out
690+
'
691+
677692
test_done

0 commit comments

Comments
 (0)