From dcfe80c0ec6b2070beae50ba526673d2b902bc92 Mon Sep 17 00:00:00 2001
From: Jeff Hostetler <jeffhost@microsoft.com>
Date: Thu, 17 Nov 2016 09:14:40 -0500
Subject: [PATCH 1/5] name-hash: eliminate duplicate memihash call

Remove duplicate memihash() call in hash_dir_entry().
The existing code called memihash() to do the find_dir_entry()
and, if not found, called memihash() again to do the hashmap_add().

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 name-hash.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/name-hash.c b/name-hash.c
index 6d9f23e932559c..860b8dd0c746e8 100644
--- a/name-hash.c
+++ b/name-hash.c
@@ -23,15 +23,21 @@ static int dir_entry_cmp(const struct dir_entry *e1,
 			name ? name : e2->name, e1->namelen);
 }
 
-static struct dir_entry *find_dir_entry(struct index_state *istate,
-		const char *name, unsigned int namelen)
+static struct dir_entry *find_dir_entry__hash(struct index_state *istate,
+		const char *name, unsigned int namelen, unsigned int hash)
 {
 	struct dir_entry key;
-	hashmap_entry_init(&key, memihash(name, namelen));
+	hashmap_entry_init(&key, hash);
 	key.namelen = namelen;
 	return hashmap_get(&istate->dir_hash, &key, name);
 }
 
+static struct dir_entry *find_dir_entry(struct index_state *istate,
+		const char *name, unsigned int namelen)
+{
+	return find_dir_entry__hash(istate, name, namelen, memihash(name,namelen));
+}
+
 static struct dir_entry *hash_dir_entry(struct index_state *istate,
 		struct cache_entry *ce, int namelen)
 {
@@ -43,6 +49,7 @@ static struct dir_entry *hash_dir_entry(struct index_state *istate,
 	 * in index_state.name_hash (as ordinary cache_entries).
 	 */
 	struct dir_entry *dir;
+	unsigned int hash;
 
 	/* get length of parent directory */
 	while (namelen > 0 && !is_dir_sep(ce->name[namelen - 1]))
@@ -52,11 +59,12 @@ static struct dir_entry *hash_dir_entry(struct index_state *istate,
 	namelen--;
 
 	/* lookup existing entry for that directory */
-	dir = find_dir_entry(istate, ce->name, namelen);
+	hash = memihash(ce->name, namelen);
+	dir = find_dir_entry__hash(istate, ce->name, namelen, hash);
 	if (!dir) {
 		/* not found, create it and add to hash table */
 		FLEX_ALLOC_MEM(dir, name, ce->name, namelen);
-		hashmap_entry_init(dir, memihash(ce->name, namelen));
+		hashmap_entry_init(dir, hash);
 		dir->namelen = namelen;
 		hashmap_add(&istate->dir_hash, dir);
 

From 191198fc22e9c7eb4a0736a8a917c8756a6f830e Mon Sep 17 00:00:00 2001
From: Jeff Hostetler <jeffhost@microsoft.com>
Date: Thu, 17 Nov 2016 09:23:55 -0500
Subject: [PATCH 2/5] hashmap: allow memihash computation to be continued

Add variant of memihash() to allow the hash computation to
be continued.  There are times when we compute the hash on
a full path and then the hash on just the path to the parent
directory.  This can be expensive on large repositories.

With this, we can hash the parent directory first. And then
continue the computation to include the "/filename".

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 hashmap.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/hashmap.c b/hashmap.c
index b10b642229ca0c..8cf1d6bd208807 100644
--- a/hashmap.c
+++ b/hashmap.c
@@ -50,6 +50,23 @@ unsigned int memihash(const void *buf, size_t len)
 	return hash;
 }
 
+/*
+ * Continue a memihash() computation: fold `len` more bytes from
+ * `buf` into `hash_seed`, upcasing ASCII 'a'..'z' first.
+ */
+unsigned int memihash2(unsigned int hash_seed, const void *buf, size_t len)
+{
+	unsigned int hash = hash_seed;
+	const unsigned char *ucbuf = buf;
+	while (len--) {
+		unsigned int c = *ucbuf++;
+		if (c >= 'a' && c <= 'z')
+			c -= 'a' - 'A';
+		hash = (hash * FNV32_PRIME) ^ c;
+	}
+	return hash;
+}
+
 #define HASHMAP_INITIAL_SIZE 64
 /* grow / shrink by 2^2 */
 #define HASHMAP_RESIZE_BITS 2

From 0f0af882ceb674d190361714ebd8616cb17e0e2c Mon Sep 17 00:00:00 2001
From: Jeff Hostetler <jeffhost@microsoft.com>
Date: Thu, 17 Nov 2016 09:59:59 -0500
Subject: [PATCH 3/5] name-hash: precompute hash values during preload-index

Precompute the istate.name_hash and istate.dir_hash values
for each cache-entry during the preload-index phase.

Move the expensive memihash() calculations from lazy_init_name_hash()
to the multi-threaded preload-index phase.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 cache.h         | 16 ++++++++++++
 name-hash.c     | 66 +++++++++++++++++++++++++++++++++++++++++++++++--
 preload-index.c |  2 ++
 3 files changed, 82 insertions(+), 2 deletions(-)

diff --git a/cache.h b/cache.h
index 9d2a94493e1e99..deeb30844f5596 100644
--- a/cache.h
+++ b/cache.h
@@ -173,6 +173,9 @@ struct cache_entry {
 	unsigned int ce_flags;
 	unsigned int ce_namelen;
 	unsigned int index;	/* for link extension */
+	unsigned int precompute_hash_state;
+	unsigned int precompute_hash_name;
+	unsigned int precompute_hash_dir;
 	unsigned char sha1[20];
 	char name[FLEX_ARRAY]; /* more */
 };
@@ -229,6 +232,19 @@ struct cache_entry {
 #error "CE_EXTENDED_FLAGS out of range"
 #endif
 
+/*
+ * Bit set if preload-index precomputed the hash value(s)
+ * for this cache-entry.
+ */
+#define CE_PRECOMPUTE_HASH_STATE__SET   (1 << 0)
+/*
+ * Bit set if preload-index also precomputed the hash value
+ * for the parent directory.
+ */
+#define CE_PRECOMPUTE_HASH_STATE__DIR   (1 << 1)
+
+void precompute_istate_hashes(struct cache_entry *ce);
+
 /* Forward structure decls */
 struct pathspec;
 struct child_process;
diff --git a/name-hash.c b/name-hash.c
index 860b8dd0c746e8..5cdb779ec953c6 100644
--- a/name-hash.c
+++ b/name-hash.c
@@ -50,6 +50,17 @@ static struct dir_entry *hash_dir_entry(struct index_state *istate,
 	 */
 	struct dir_entry *dir;
 	unsigned int hash;
+	int use_precomputed_dir_hash = 0;
+
+	if (ce->precompute_hash_state & CE_PRECOMPUTE_HASH_STATE__SET) {
+		if (!(ce->precompute_hash_state & CE_PRECOMPUTE_HASH_STATE__DIR))
+			return NULL; /* item does not have a parent directory */
+		if (namelen == ce_namelen(ce)) {
+			/* dir hash only valid for outer-most call (not recursive ones) */
+			use_precomputed_dir_hash = 1;
+			hash = ce->precompute_hash_dir;
+		}
+	}
 
 	/* get length of parent directory */
 	while (namelen > 0 && !is_dir_sep(ce->name[namelen - 1]))
@@ -59,7 +70,8 @@ static struct dir_entry *hash_dir_entry(struct index_state *istate,
 	namelen--;
 
 	/* lookup existing entry for that directory */
-	hash = memihash(ce->name, namelen);
+	if (!use_precomputed_dir_hash)
+		hash = memihash(ce->name, namelen);
 	dir = find_dir_entry__hash(istate, ce->name, namelen, hash);
 	if (!dir) {
 		/* not found, create it and add to hash table */
@@ -99,10 +111,18 @@ static void remove_dir_entry(struct index_state *istate, struct cache_entry *ce)
 
 static void hash_index_entry(struct index_state *istate, struct cache_entry *ce)
 {
+	unsigned int h;
+
 	if (ce->ce_flags & CE_HASHED)
 		return;
 	ce->ce_flags |= CE_HASHED;
-	hashmap_entry_init(ce, memihash(ce->name, ce_namelen(ce)));
+
+	if (ce->precompute_hash_state & CE_PRECOMPUTE_HASH_STATE__SET)
+		h = ce->precompute_hash_name;
+	else
+		h = memihash(ce->name, ce_namelen(ce));
+
+	hashmap_entry_init(ce, h);
 	hashmap_add(&istate->name_hash, ce);
 
 	if (ignore_case)
@@ -244,3 +264,45 @@ void free_name_hash(struct index_state *istate)
 	hashmap_free(&istate->name_hash, 0);
 	hashmap_free(&istate->dir_hash, 1);
 }
+
+/*
+ * Precompute the hash values for this cache_entry
+ * for use in the istate.name_hash and istate.dir_hash.
+ *
+ * If the item is in the root directory, just compute the
+ * hash value (for istate.name_hash) on the full path.
+ *
+ * If the item is in a subdirectory, first compute the
+ * hash value for the immediate parent directory (for
+ * istate.dir_hash) and then the hash value for the full
+ * path by continuing the computation.
+ *
+ * Note that these hashes will be used by
+ * wt_status_collect_untracked() as it scans the worktree
+ * and maps observed paths back to the index (optionally
+ * ignoring case).  Therefore, we probably only *NEED* to
+ * precompute this for non-skip-worktree items (since
+ * status should not observe skipped items), but because
+ * lazy_init_name_hash() hashes everything, we force it
+ * here.
+ */ 
+void precompute_istate_hashes(struct cache_entry *ce)
+{
+	int namelen = ce_namelen(ce);
+
+	while (namelen > 0 && !is_dir_sep(ce->name[namelen - 1]))
+		namelen--;
+
+	if (namelen <= 0) {
+		ce->precompute_hash_name = memihash(ce->name, ce_namelen(ce));
+		ce->precompute_hash_state = CE_PRECOMPUTE_HASH_STATE__SET;
+	} else {
+		namelen--;
+		ce->precompute_hash_dir = memihash(ce->name, namelen);
+		ce->precompute_hash_name = memihash2(
+			ce->precompute_hash_dir, &ce->name[namelen],
+			ce_namelen(ce) - namelen);
+		ce->precompute_hash_state =
+			CE_PRECOMPUTE_HASH_STATE__SET | CE_PRECOMPUTE_HASH_STATE__DIR;
+	}
+}
diff --git a/preload-index.c b/preload-index.c
index 4dbed6c16d7a02..bd02ba03207f25 100644
--- a/preload-index.c
+++ b/preload-index.c
@@ -47,6 +47,8 @@ static void *preload_thread(void *_data)
 		struct cache_entry *ce = *cep++;
 		struct stat st;
 
+		precompute_istate_hashes(ce);
+
 		if (ce_stage(ce))
 			continue;
 		if (S_ISGITLINK(ce->ce_mode))

From b9e1bcb2ed83391c6b49c8ba1b60588603e7ff3a Mon Sep 17 00:00:00 2001
From: Jeff Hostetler <jeffhost@microsoft.com>
Date: Thu, 17 Nov 2016 10:05:26 -0500
Subject: [PATCH 4/5] name-hash: specify initial size for istate.dir_hash table

Specify an initial size for the istate.dir_hash HashMap matching
the size of the istate.name_hash.

Previously hashmap_init() was given 0, causing a 64 bucket
hashmap to be created.  When working with very large
repositories, this would cause numerous rehash() calls to
realloc and rebalance the hashmap. This is especially true
when the worktree is deep, with many directories containing
a few files.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 name-hash.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/name-hash.c b/name-hash.c
index 5cdb779ec953c6..1d3af4100502d7 100644
--- a/name-hash.c
+++ b/name-hash.c
@@ -148,7 +148,8 @@ static void lazy_init_name_hash(struct index_state *istate)
 		return;
 	hashmap_init(&istate->name_hash, (hashmap_cmp_fn) cache_entry_cmp,
 			istate->cache_nr);
-	hashmap_init(&istate->dir_hash, (hashmap_cmp_fn) dir_entry_cmp, 0);
+	hashmap_init(&istate->dir_hash, (hashmap_cmp_fn) dir_entry_cmp,
+			istate->cache_nr);
 	for (nr = 0; nr < istate->cache_nr; nr++)
 		hash_index_entry(istate, istate->cache[nr]);
 	istate->name_hash_initialized = 1;

From bca14bed2710f7fa0adfad6fa3820ad60c07aae0 Mon Sep 17 00:00:00 2001
From: Jeff Hostetler <jeffhost@microsoft.com>
Date: Thu, 17 Nov 2016 10:37:01 -0500
Subject: [PATCH 5/5] name-hash: remember previous dir_entry during
 lazy_init_name_hash

Teach hash_dir_entry() to remember the previously found dir_entry
during lazy_init_name_hash() iteration.  This is a performance
optimization.  Since items in the index array are sorted by full
pathname, adjacent items are likely to be in the same directory.
This can save memihash() computations and HashMap lookups.

Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com>
---
 name-hash.c | 43 +++++++++++++++++++++++++++++++------------
 1 file changed, 31 insertions(+), 12 deletions(-)

diff --git a/name-hash.c b/name-hash.c
index 1d3af4100502d7..71becd84be519b 100644
--- a/name-hash.c
+++ b/name-hash.c
@@ -39,7 +39,7 @@ static struct dir_entry *find_dir_entry(struct index_state *istate,
 }
 
 static struct dir_entry *hash_dir_entry(struct index_state *istate,
-		struct cache_entry *ce, int namelen)
+		struct cache_entry *ce, int namelen, struct dir_entry **p_previous_dir)
 {
 	/*
 	 * Throw each directory component in the hash for quick lookup
@@ -70,9 +70,21 @@ static struct dir_entry *hash_dir_entry(struct index_state *istate,
 	namelen--;
 
 	/* lookup existing entry for that directory */
-	if (!use_precomputed_dir_hash)
-		hash = memihash(ce->name, namelen);
-	dir = find_dir_entry__hash(istate, ce->name, namelen, hash);
+	if (p_previous_dir && *p_previous_dir
+		&& namelen == (*p_previous_dir)->namelen
+		&& memcmp(ce->name, (*p_previous_dir)->name, namelen) == 0) {
+		/*
+		 * When our caller is sequentially iterating thru the index,
+		 * items in the same directory will be sequential, and therefore
+		 * refer to the same dir_entry.
+		 */
+		dir = *p_previous_dir;
+	} else {
+		if (!use_precomputed_dir_hash)
+			hash = memihash(ce->name, namelen);
+		dir = find_dir_entry__hash(istate, ce->name, namelen, hash);
+	}
+
 	if (!dir) {
 		/* not found, create it and add to hash table */
 		FLEX_ALLOC_MEM(dir, name, ce->name, namelen);
@@ -81,15 +93,20 @@ static struct dir_entry *hash_dir_entry(struct index_state *istate,
 		hashmap_add(&istate->dir_hash, dir);
 
 		/* recursively add missing parent directories */
-		dir->parent = hash_dir_entry(istate, ce, namelen);
+		dir->parent = hash_dir_entry(istate, ce, namelen, NULL);
 	}
+
+	if (p_previous_dir)
+		*p_previous_dir = dir;
+
 	return dir;
 }
 
-static void add_dir_entry(struct index_state *istate, struct cache_entry *ce)
+static void add_dir_entry(struct index_state *istate, struct cache_entry *ce,
+	struct dir_entry **p_previous_dir)
 {
 	/* Add reference to the directory entry (and parents if 0). */
-	struct dir_entry *dir = hash_dir_entry(istate, ce, ce_namelen(ce));
+	struct dir_entry *dir = hash_dir_entry(istate, ce, ce_namelen(ce), p_previous_dir);
 	while (dir && !(dir->nr++))
 		dir = dir->parent;
 }
@@ -100,7 +117,7 @@ static void remove_dir_entry(struct index_state *istate, struct cache_entry *ce)
 	 * Release reference to the directory entry. If 0, remove and continue
 	 * with parent directory.
 	 */
-	struct dir_entry *dir = hash_dir_entry(istate, ce, ce_namelen(ce));
+	struct dir_entry *dir = hash_dir_entry(istate, ce, ce_namelen(ce), NULL);
 	while (dir && !(--dir->nr)) {
 		struct dir_entry *parent = dir->parent;
 		hashmap_remove(&istate->dir_hash, dir, NULL);
@@ -109,7 +126,8 @@ static void remove_dir_entry(struct index_state *istate, struct cache_entry *ce)
 	}
 }
 
-static void hash_index_entry(struct index_state *istate, struct cache_entry *ce)
+static void hash_index_entry(struct index_state *istate, struct cache_entry *ce,
+	struct dir_entry **p_previous_dir)
 {
 	unsigned int h;
 
@@ -126,7 +144,7 @@ static void hash_index_entry(struct index_state *istate, struct cache_entry *ce)
 	hashmap_add(&istate->name_hash, ce);
 
 	if (ignore_case)
-		add_dir_entry(istate, ce);
+		add_dir_entry(istate, ce, p_previous_dir);
 }
 
 static int cache_entry_cmp(const struct cache_entry *ce1,
@@ -142,6 +160,7 @@ static int cache_entry_cmp(const struct cache_entry *ce1,
 
 static void lazy_init_name_hash(struct index_state *istate)
 {
+	struct dir_entry *previous_dir = NULL;
 	int nr;
 
 	if (istate->name_hash_initialized)
@@ -151,14 +170,14 @@ static void lazy_init_name_hash(struct index_state *istate)
 	hashmap_init(&istate->dir_hash, (hashmap_cmp_fn) dir_entry_cmp,
 			istate->cache_nr);
 	for (nr = 0; nr < istate->cache_nr; nr++)
-		hash_index_entry(istate, istate->cache[nr]);
+		hash_index_entry(istate, istate->cache[nr], &previous_dir);
 	istate->name_hash_initialized = 1;
 }
 
 void add_name_hash(struct index_state *istate, struct cache_entry *ce)
 {
 	if (istate->name_hash_initialized)
-		hash_index_entry(istate, ce);
+		hash_index_entry(istate, ce, NULL);
 }
 
 void remove_name_hash(struct index_state *istate, struct cache_entry *ce)