Skip to content

Commit ce0bd64

Browse files
Linus Torvalds, Junio C Hamano
Linus Torvalds
authored and
Junio C Hamano
committed
pack-objects: improve path grouping heuristics.
This trivial patch not only simplifies the name hashing, it actually improves packing for both git and the kernel.

The git archive pack shrinks from 6824090 to 6622627 bytes (a 3% improvement), and the kernel pack shrinks from 108756213 to 108219021 bytes (a mere 0.5% improvement, but still, it's an improvement from making the hashing much simpler!)

We just create a 32-bit hash, where we "age" previous characters by two bits, so the last characters in a filename count most. So when we then compare the hashes in the sort routine, filenames that end the same way sort the same way.

It takes the subdirectory into account (unless the filename is > 16 characters), but files with the same name within the same subdirectory will obviously sort closer than files in different subdirectories.

And, incidentally (which is why I tried the hash change in the first place, of course) builtin-rev-list.c will sort fairly close to rev-list.c.

And no, it's not a "good hash" in the sense of being secure or unique, but that's not what we're looking for. The whole "hash" thing is misnamed here. It's not so much a hash as a "sorting number".

[jc: rolled in simplification for computing the sorting number for thin pack base objects]

Signed-off-by: Linus Torvalds <[email protected]>
Signed-off-by: Junio C Hamano <[email protected]>
1 parent 87cefaa commit ce0bd64

File tree

1 file changed

+19
-50
lines changed

1 file changed

+19
-50
lines changed

pack-objects.c

Lines changed: 19 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -463,48 +463,21 @@ static void rehash_objects(void)
463463
}
464464
}
465465

466-
struct name_path {
467-
struct name_path *up;
468-
const char *elem;
469-
int len;
470-
};
471-
472-
#define DIRBITS 12
473-
474-
static unsigned name_hash(struct name_path *path, const char *name)
466+
static unsigned name_hash(const char *name)
475467
{
476-
struct name_path *p = path;
477-
const char *n = name + strlen(name);
478-
unsigned hash = 0, name_hash = 0, name_done = 0;
479-
480-
if (n != name && n[-1] == '\n')
481-
n--;
482-
while (name <= --n) {
483-
unsigned char c = *n;
484-
if (c == '/' && !name_done) {
485-
name_hash = hash;
486-
name_done = 1;
487-
hash = 0;
488-
}
489-
hash = hash * 11 + c;
490-
}
491-
if (!name_done) {
492-
name_hash = hash;
493-
hash = 0;
494-
}
495-
for (p = path; p; p = p->up) {
496-
hash = hash * 11 + '/';
497-
n = p->elem + p->len;
498-
while (p->elem <= --n) {
499-
unsigned char c = *n;
500-
hash = hash * 11 + c;
501-
}
502-
}
468+
unsigned char c;
469+
unsigned hash = 0;
470+
503471
/*
504-
* Make sure "Makefile" and "t/Makefile" are hashed separately
505-
* but close enough.
472+
* This effectively just creates a sortable number from the
473+
* last sixteen non-whitespace characters. Last characters
474+
* count "most", so things that end in ".c" sort together.
506475
*/
507-
hash = (name_hash<<DIRBITS) | (hash & ((1U<<DIRBITS )-1));
476+
while ((c = *name++) != 0) {
477+
if (isspace(c))
478+
continue;
479+
hash = (hash >> 2) + (c << 24);
480+
}
508481
return hash;
509482
}
510483

@@ -686,9 +659,9 @@ static int name_cmp_len(const char *name)
686659
}
687660

688661
static void add_pbase_object(struct tree_desc *tree,
689-
struct name_path *up,
690662
const char *name,
691-
int cmplen)
663+
int cmplen,
664+
const char *fullname)
692665
{
693666
struct name_entry entry;
694667

@@ -702,13 +675,12 @@ static void add_pbase_object(struct tree_desc *tree,
702675
sha1_object_info(entry.sha1, type, &size))
703676
continue;
704677
if (name[cmplen] != '/') {
705-
unsigned hash = name_hash(up, name);
678+
unsigned hash = name_hash(fullname);
706679
add_object_entry(entry.sha1, hash, 1);
707680
return;
708681
}
709682
if (!strcmp(type, tree_type)) {
710683
struct tree_desc sub;
711-
struct name_path me;
712684
struct pbase_tree_cache *tree;
713685
const char *down = name+cmplen+1;
714686
int downlen = name_cmp_len(down);
@@ -719,10 +691,7 @@ static void add_pbase_object(struct tree_desc *tree,
719691
sub.buf = tree->tree_data;
720692
sub.size = tree->tree_size;
721693

722-
me.up = up;
723-
me.elem = entry.path;
724-
me.len = entry.pathlen;
725-
add_pbase_object(&sub, &me, down, downlen);
694+
add_pbase_object(&sub, down, downlen, fullname);
726695
pbase_tree_put(tree);
727696
}
728697
}
@@ -778,14 +747,14 @@ static void add_preferred_base_object(char *name, unsigned hash)
778747

779748
for (it = pbase_tree; it; it = it->next) {
780749
if (cmplen == 0) {
781-
hash = name_hash(NULL, "");
750+
hash = name_hash("");
782751
add_object_entry(it->pcache.sha1, hash, 1);
783752
}
784753
else {
785754
struct tree_desc tree;
786755
tree.buf = it->pcache.tree_data;
787756
tree.size = it->pcache.tree_size;
788-
add_pbase_object(&tree, NULL, name, cmplen);
757+
add_pbase_object(&tree, name, cmplen, name);
789758
}
790759
}
791760
}
@@ -1328,7 +1297,7 @@ int main(int argc, char **argv)
13281297
}
13291298
if (get_sha1_hex(line, sha1))
13301299
die("expected sha1, got garbage:\n %s", line);
1331-
hash = name_hash(NULL, line+41);
1300+
hash = name_hash(line+41);
13321301
add_preferred_base_object(line+41, hash);
13331302
add_object_entry(sha1, hash, 0);
13341303
}

0 commit comments

Comments
 (0)