From a53bd0d37606c890d2fd2b715ce0bc92ff787b66 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Thu, 29 Aug 2024 12:27:28 -0400
Subject: [PATCH 01/30] path-walk: introduce an object walk by path

In anticipation of a few planned applications, introduce the most basic form
of a path-walk API. It currently assumes that there are no UNINTERESTING
objects, and does not include any complicated filters. It calls a function
pointer on groups of tree and blob objects as grouped by path. This only
includes objects the first time they are discovered, so an object that
appears at multiple paths will not be included in two batches.

There are many future adaptations that could be made, but they are left for
future updates when consumers are ready to take advantage of those features.

RFC TODO: It would be helpful to create a test-tool that allows printing of
each batch for strong testing.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 Makefile    |   1 +
 path-walk.c | 235 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 path-walk.h |  43 ++++++++++
 3 files changed, 279 insertions(+)
 create mode 100644 path-walk.c
 create mode 100644 path-walk.h

diff --git a/Makefile b/Makefile
index deb175a0408782..e83f6de9a2c413 100644
--- a/Makefile
+++ b/Makefile
@@ -1090,6 +1090,7 @@ LIB_OBJS += parse-options.o
 LIB_OBJS += patch-delta.o
 LIB_OBJS += patch-ids.o
 LIB_OBJS += path.o
+LIB_OBJS += path-walk.o
 LIB_OBJS += pathspec.o
 LIB_OBJS += pkt-line.o
 LIB_OBJS += preload-index.o
diff --git a/path-walk.c b/path-walk.c
new file mode 100644
index 00000000000000..2edfa0572e4a15
--- /dev/null
+++ b/path-walk.c
@@ -0,0 +1,235 @@
+/*
+ * path-walk.c: implementation for path-based walks of the object graph.
+ */
+#include "git-compat-util.h"
+#include "path-walk.h"
+#include "blob.h"
+#include "commit.h"
+#include "dir.h"
+#include "hashmap.h"
+#include "hex.h"
+#include "object.h"
+#include "oid-array.h"
+#include "revision.h"
+#include "string-list.h"
+#include "strmap.h"
+#include "trace2.h"
+#include "tree.h"
+#include "tree-walk.h"
+
+struct type_and_oid_list
+{
+	enum object_type type;
+	struct oid_array oids;
+};
+
+#define TYPE_AND_OID_LIST_INIT { \
+	.type = OBJ_NONE, 	 \
+	.oids = OID_ARRAY_INIT	 \
+}
+
+struct path_walk_context {
+	/**
+	 * Repeats of data in 'struct path_walk_info' for
+	 * access with fewer characters.
+	 */
+	struct repository *repo;
+	struct rev_info *revs;
+	struct path_walk_info *info;
+
+	/**
+	 * Map a path to a 'struct type_and_oid_list'
+	 * containing the objects discovered at that
+	 * path.
+	 */
+	struct strmap paths_to_lists;
+
+	/**
+	 * Store the current list of paths in a stack, to
+	 * facilitate depth-first-search without recursion.
+	 */
+	struct string_list path_stack;
+};
+
+static int add_children(struct path_walk_context *ctx,
+			const char *base_path,
+			struct object_id *oid)
+{
+	struct tree_desc desc;
+	struct name_entry entry;
+	struct strbuf path = STRBUF_INIT;
+	size_t base_len;
+	struct tree *tree = lookup_tree(ctx->repo, oid);
+
+	if (!tree) {
+		error(_("failed to walk children of tree %s: not found"),
+		      oid_to_hex(oid));
+		return -1;
+	}
+
+	strbuf_addstr(&path, base_path);
+	base_len = path.len;
+
+	parse_tree(tree);
+	init_tree_desc(&desc, &tree->object.oid, tree->buffer, tree->size);
+	while (tree_entry(&desc, &entry)) {
+		struct type_and_oid_list *list;
+		struct object *o;
+		/* Not actually true, but we will ignore submodules later. */
+		enum object_type type = S_ISDIR(entry.mode) ? OBJ_TREE : OBJ_BLOB;
+
+		/* Skip submodules. */
+		if (S_ISGITLINK(entry.mode))
+			continue;
+
+		if (type == OBJ_TREE) {
+			struct tree *child = lookup_tree(ctx->repo, &entry.oid);
+			o = child ? &child->object : NULL;
+		} else if (type == OBJ_BLOB) {
+			struct blob *child = lookup_blob(ctx->repo, &entry.oid);
+			o = child ? &child->object : NULL;
+		} else {
+			/* Wrong type? */
+			continue;
+		}
+
+		if (!o) /* report error?*/
+			continue;
+
+		/* Skip this object if already seen. */
+		if (o->flags & SEEN)
+			continue;
+		o->flags |= SEEN;
+
+		strbuf_setlen(&path, base_len);
+		strbuf_add(&path, entry.path, entry.pathlen);
+
+		/*
+		 * Trees will end with "/" for concatenation and distinction
+		 * from blobs at the same path.
+		 */
+		if (type == OBJ_TREE)
+			strbuf_addch(&path, '/');
+
+		if (!(list = strmap_get(&ctx->paths_to_lists, path.buf))) {
+			CALLOC_ARRAY(list, 1);
+			list->type = type;
+			strmap_put(&ctx->paths_to_lists, path.buf, list);
+			string_list_append(&ctx->path_stack, path.buf);
+		}
+		oid_array_append(&list->oids, &entry.oid);
+	}
+
+	free_tree_buffer(tree);
+	strbuf_release(&path);
+	return 0;
+}
+
+/*
+ * For each path in paths_to_explore, walk the trees another level
+ * and add any found blobs to the batch (but only if they don't
+ * exist and haven't been added yet).
+ */
+static int walk_path(struct path_walk_context *ctx,
+		     const char *path)
+{
+	struct type_and_oid_list *list;
+	int ret = 0;
+
+	list = strmap_get(&ctx->paths_to_lists, path);
+
+	/* Evaluate function pointer on this data. */
+	ret = ctx->info->path_fn(path, &list->oids, list->type,
+				 ctx->info->path_fn_data);
+
+	/* Expand data for children. */
+	if (list->type == OBJ_TREE) {
+		for (size_t i = 0; i < list->oids.nr; i++) {
+			ret |= add_children(ctx,
+					    path,
+					    &list->oids.oid[i]);
+		}
+	}
+
+	oid_array_clear(&list->oids);
+	strmap_remove(&ctx->paths_to_lists, path, 1);
+	return ret;
+}
+
+static void clear_strmap(struct strmap *map)
+{
+	struct hashmap_iter iter;
+	struct strmap_entry *e;
+
+	hashmap_for_each_entry(&map->map, &iter, e, ent) {
+		struct type_and_oid_list *list = e->value;
+		oid_array_clear(&list->oids);
+	}
+	strmap_clear(map, 1);
+	strmap_init(map);
+}
+
+/**
+ * Given the configuration of 'info', walk the commits based on 'info->revs' and
+ * call 'info->path_fn' on each discovered path.
+ *
+ * Returns nonzero on an error.
+ */
+int walk_objects_by_path(struct path_walk_info *info)
+{
+	const char *root_path = "";
+	int ret = 0;
+	size_t commits_nr = 0, paths_nr = 0;
+	struct commit *c;
+	struct type_and_oid_list *root_tree_list;
+	struct path_walk_context ctx = {
+		.repo = info->revs->repo,
+		.revs = info->revs,
+		.info = info,
+		.path_stack = STRING_LIST_INIT_DUP,
+		.paths_to_lists = STRMAP_INIT
+	};
+
+	trace2_region_enter("path-walk", "commit-walk", info->revs->repo);
+
+	/* Insert a single list for the root tree into the paths. */
+	CALLOC_ARRAY(root_tree_list, 1);
+	root_tree_list->type = OBJ_TREE;
+	strmap_put(&ctx.paths_to_lists, root_path, root_tree_list);
+
+	if (prepare_revision_walk(info->revs))
+		die(_("failed to setup revision walk"));
+
+	while ((c = get_revision(info->revs))) {
+		struct object_id *oid = get_commit_tree_oid(c);
+		struct tree *t = lookup_tree(info->revs->repo, oid);
+		commits_nr++;
+
+		if (t)
+			oid_array_append(&root_tree_list->oids, oid);
+		else
+			warning("could not find tree %s", oid_to_hex(oid));
+	}
+
+	trace2_data_intmax("path-walk", ctx.repo, "commits", commits_nr);
+	trace2_region_leave("path-walk", "commit-walk", info->revs->repo);
+
+	string_list_append(&ctx.path_stack, root_path);
+
+	trace2_region_enter("path-walk", "path-walk", info->revs->repo);
+	while (!ret && ctx.path_stack.nr) {
+		char *path = ctx.path_stack.items[ctx.path_stack.nr - 1].string;
+		ctx.path_stack.nr--;
+		paths_nr++;
+
+		ret = walk_path(&ctx, path);
+
+		free(path);
+	}
+	trace2_data_intmax("path-walk", ctx.repo, "paths", paths_nr);
+	trace2_region_leave("path-walk", "path-walk", info->revs->repo);
+
+	clear_strmap(&ctx.paths_to_lists);
+	string_list_clear(&ctx.path_stack, 0);
+	return ret;
+}
diff --git a/path-walk.h b/path-walk.h
new file mode 100644
index 00000000000000..c9e94a98bc8f6b
--- /dev/null
+++ b/path-walk.h
@@ -0,0 +1,43 @@
+/*
+ * path-walk.h : Methods and structures for walking the object graph in batches
+ * by the paths that can reach those objects.
+ */
+#include "object.h" /* Required for 'enum object_type'. */
+
+struct rev_info;
+struct oid_array;
+
+/**
+ * The type of a function pointer for the method that is called on a list of
+ * objects reachable at a given path.
+ */
+typedef int (*path_fn)(const char *path,
+		       struct oid_array *oids,
+		       enum object_type type,
+		       void *data);
+
+struct path_walk_info {
+	/**
+	 * revs provides the definitions for the commit walk, including
+	 * which commits are UNINTERESTING or not.
+	 */
+	struct rev_info *revs;
+
+	/**
+	 * The caller wishes to execute custom logic on objects reachable at a
+	 * given path. Every reachable object will be visited exactly once, and
+	 * the first path to see an object wins. This may not be a stable choice.
+	 */
+	path_fn path_fn;
+	void *path_fn_data;
+};
+
+#define PATH_WALK_INFO_INIT { 0 }
+
+/**
+ * Given the configuration of 'info', walk the commits based on 'info->revs' and
+ * call 'info->path_fn' on each discovered path.
+ *
+ * Returns nonzero on an error.
+ */
+int walk_objects_by_path(struct path_walk_info *info);

From 41c49bba131ed014cc4f7ab579313527dd4c9d29 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <derrickstolee@github.com>
Date: Fri, 7 Jun 2024 12:18:50 -0400
Subject: [PATCH 02/30] backfill: add builtin boilerplate

In anticipation of implementing 'git backfill', populate the necessary files
with the boilerplate of a new builtin.

RFC TODO: When preparing this for a full implementation, make sure it is
based on the newest standards introduced by [1].

[1] https://lore.kernel.org/git/xmqqjzfq2f0f.fsf@gitster.g/T/#m606036ea2e75a6d6819d6b5c90e729643b0ff7f7
    [PATCH 1/3] builtin: add a repository parameter for builtin functions

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 .gitignore                     |  1 +
 Documentation/git-backfill.txt | 23 +++++++++++++++++++++++
 Makefile                       |  1 +
 builtin.h                      |  1 +
 builtin/backfill.c             | 29 +++++++++++++++++++++++++++++
 command-list.txt               |  1 +
 git.c                          |  1 +
 7 files changed, 57 insertions(+)
 create mode 100644 Documentation/git-backfill.txt
 create mode 100644 builtin/backfill.c

diff --git a/.gitignore b/.gitignore
index 8caf3700c2305d..8f5cb938ecbf36 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,6 +19,7 @@
 /git-apply
 /git-archimport
 /git-archive
+/git-backfill
 /git-bisect
 /git-blame
 /git-branch
diff --git a/Documentation/git-backfill.txt b/Documentation/git-backfill.txt
new file mode 100644
index 00000000000000..640144187d33b7
--- /dev/null
+++ b/Documentation/git-backfill.txt
@@ -0,0 +1,23 @@
+git-backfill(1)
+===============
+
+NAME
+----
+git-backfill - Download missing objects in a partial clone
+
+
+SYNOPSIS
+--------
+[verse]
+'git backfill' [<options>]
+
+DESCRIPTION
+-----------
+
+SEE ALSO
+--------
+linkgit:git-clone[1].
+
+GIT
+---
+Part of the linkgit:git[1] suite
diff --git a/Makefile b/Makefile
index e83f6de9a2c413..4305474d96e0f7 100644
--- a/Makefile
+++ b/Makefile
@@ -1198,6 +1198,7 @@ BUILTIN_OBJS += builtin/am.o
 BUILTIN_OBJS += builtin/annotate.o
 BUILTIN_OBJS += builtin/apply.o
 BUILTIN_OBJS += builtin/archive.o
+BUILTIN_OBJS += builtin/backfill.o
 BUILTIN_OBJS += builtin/bisect.o
 BUILTIN_OBJS += builtin/blame.o
 BUILTIN_OBJS += builtin/branch.o
diff --git a/builtin.h b/builtin.h
index 14fa0171607b17..73dd0ccbe8c961 100644
--- a/builtin.h
+++ b/builtin.h
@@ -127,6 +127,7 @@ int cmd_am(int argc, const char **argv, const char *prefix);
 int cmd_annotate(int argc, const char **argv, const char *prefix);
 int cmd_apply(int argc, const char **argv, const char *prefix);
 int cmd_archive(int argc, const char **argv, const char *prefix);
+int cmd_backfill(int argc, const char **argv, const char *prefix);
 int cmd_bisect(int argc, const char **argv, const char *prefix);
 int cmd_blame(int argc, const char **argv, const char *prefix);
 int cmd_branch(int argc, const char **argv, const char *prefix);
diff --git a/builtin/backfill.c b/builtin/backfill.c
new file mode 100644
index 00000000000000..77b05a2f8382a3
--- /dev/null
+++ b/builtin/backfill.c
@@ -0,0 +1,29 @@
+#include "builtin.h"
+#include "config.h"
+#include "parse-options.h"
+#include "repository.h"
+#include "object.h"
+
+static const char * const builtin_backfill_usage[] = {
+	N_("git backfill [<options>]"),
+	NULL
+};
+
+int cmd_backfill(int argc, const char **argv, const char *prefix)
+{
+	struct option options[] = {
+		OPT_END(),
+	};
+
+	if (argc == 2 && !strcmp(argv[1], "-h"))
+		usage_with_options(builtin_backfill_usage, options);
+
+	argc = parse_options(argc, argv, prefix, options, builtin_backfill_usage,
+			     0);
+
+	git_config(git_default_config, NULL);
+
+	die(_("not implemented"));
+
+	return 0;
+}
diff --git a/command-list.txt b/command-list.txt
index e0bb87b3b5c278..c537114b4687b8 100644
--- a/command-list.txt
+++ b/command-list.txt
@@ -60,6 +60,7 @@ git-annotate                            ancillaryinterrogators
 git-apply                               plumbingmanipulators            complete
 git-archimport                          foreignscminterface
 git-archive                             mainporcelain
+git-backfill                            mainporcelain           history
 git-bisect                              mainporcelain           info
 git-blame                               ancillaryinterrogators          complete
 git-branch                              mainporcelain           history
diff --git a/git.c b/git.c
index 9a618a2740f195..4f2215e9c8b0cf 100644
--- a/git.c
+++ b/git.c
@@ -509,6 +509,7 @@ static struct cmd_struct commands[] = {
 	{ "annotate", cmd_annotate, RUN_SETUP },
 	{ "apply", cmd_apply, RUN_SETUP_GENTLY },
 	{ "archive", cmd_archive, RUN_SETUP_GENTLY },
+	{ "backfill", cmd_backfill, RUN_SETUP },
 	{ "bisect", cmd_bisect, RUN_SETUP },
 	{ "blame", cmd_blame, RUN_SETUP },
 	{ "branch", cmd_branch, RUN_SETUP | DELAY_PAGER_CONFIG },

From be21c8370eb5143c3f6e91d354d48465d221692a Mon Sep 17 00:00:00 2001
From: Derrick Stolee <derrickstolee@github.com>
Date: Sun, 1 Sep 2024 12:07:10 -0400
Subject: [PATCH 03/30] backfill: basic functionality and tests

The default behavior of 'git backfill' is to fetch all missing blobs that
are reachable from HEAD. Document and test this behavior.

The implementation is a very simple use of the path-walk API, initializing
the revision walk at HEAD to start the path-walk from all commits reachable
from HEAD. Ignore the object arrays that correspond to tree entries,
assuming that they are all present already.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 Documentation/git-backfill.txt |  24 ++++++++
 builtin/backfill.c             | 101 ++++++++++++++++++++++++++++++++-
 t/t5620-backfill.sh            |  97 +++++++++++++++++++++++++++++++
 3 files changed, 219 insertions(+), 3 deletions(-)
 create mode 100755 t/t5620-backfill.sh

diff --git a/Documentation/git-backfill.txt b/Documentation/git-backfill.txt
index 640144187d33b7..0e10f066fef56f 100644
--- a/Documentation/git-backfill.txt
+++ b/Documentation/git-backfill.txt
@@ -14,6 +14,30 @@ SYNOPSIS
 DESCRIPTION
 -----------
 
+Blobless partial clones are created using `git clone --filter=blob:none`
+and then configure the local repository such that the Git client avoids
+downloading blob objects unless they are required for a local operation.
+This initially means that the clone and later fetches download reachable
+commits and trees but no blobs. Later operations that change the `HEAD`
+pointer, such as `git checkout` or `git merge`, may need to download
+missing blobs in order to complete their operation.
+
+In the worst cases, commands that compute blob diffs, such as `git blame`,
+become very slow as they download the missing blobs in single-blob
+requests to satisfy the missing object as the Git command needs it. This
+leads to multiple download requests and no ability for the Git server to
+provide delta compression across those objects.
+
+The `git backfill` command provides a way for the user to request that
+Git downloads the missing blobs (with optional filters) such that the
+missing blobs representing historical versions of files can be downloaded
+in batches. The `backfill` command attempts to optimize the request by
+grouping blobs that appear at the same path, hopefully leading to good
+delta compression in the packfile sent by the server.
+
+By default, `git backfill` downloads all blobs reachable from the `HEAD`
+commit. This set can be restricted or expanded using various options.
+
 SEE ALSO
 --------
 linkgit:git-clone[1].
diff --git a/builtin/backfill.c b/builtin/backfill.c
index 77b05a2f8382a3..23d40fc02a280a 100644
--- a/builtin/backfill.c
+++ b/builtin/backfill.c
@@ -1,16 +1,113 @@
 #include "builtin.h"
+#include "git-compat-util.h"
 #include "config.h"
 #include "parse-options.h"
 #include "repository.h"
+#include "commit.h"
+#include "hex.h"
+#include "tree.h"
+#include "tree-walk.h"
 #include "object.h"
+#include "object-store-ll.h"
+#include "oid-array.h"
+#include "oidset.h"
+#include "promisor-remote.h"
+#include "strmap.h"
+#include "string-list.h"
+#include "revision.h"
+#include "trace2.h"
+#include "progress.h"
+#include "packfile.h"
+#include "path-walk.h"
 
 static const char * const builtin_backfill_usage[] = {
 	N_("git backfill [<options>]"),
 	NULL
 };
 
+struct backfill_context {
+	struct repository *repo;
+	struct oid_array current_batch;
+	size_t batch_size;
+};
+
+static void clear_backfill_context(struct backfill_context *ctx)
+{
+	oid_array_clear(&ctx->current_batch);
+}
+
+static void download_batch(struct backfill_context *ctx)
+{
+	promisor_remote_get_direct(ctx->repo,
+				   ctx->current_batch.oid,
+				   ctx->current_batch.nr);
+	oid_array_clear(&ctx->current_batch);
+
+	/*
+	 * We likely have a new packfile. Add it to the packed list to
+	 * avoid possible duplicate downloads of the same objects.
+	 */
+	reprepare_packed_git(ctx->repo);
+}
+
+static int fill_missing_blobs(const char *path,
+			      struct oid_array *list,
+			      enum object_type type,
+			      void *data)
+{
+	struct backfill_context *ctx = data;
+
+	if (type != OBJ_BLOB)
+		return 0;
+
+	for (size_t i = 0; i < list->nr; i++) {
+		off_t size = 0;
+		struct object_info info = OBJECT_INFO_INIT;
+		info.disk_sizep = &size;
+		if (oid_object_info_extended(the_repository,
+					     &list->oid[i],
+					     &info,
+					     OBJECT_INFO_FOR_PREFETCH) ||
+		    !size)
+			oid_array_append(&ctx->current_batch, &list->oid[i]);
+	}
+
+	if (ctx->current_batch.nr >= ctx->batch_size)
+		download_batch(ctx);
+
+	return 0;
+}
+
+static int do_backfill(struct backfill_context *ctx)
+{
+	struct rev_info revs;
+	struct path_walk_info info = PATH_WALK_INFO_INIT;
+	int ret;
+
+	repo_init_revisions(ctx->repo, &revs, "");
+	handle_revision_arg("HEAD", &revs, 0, 0);
+
+	info.revs = &revs;
+	info.path_fn = fill_missing_blobs;
+	info.path_fn_data = ctx;
+
+	ret = walk_objects_by_path(&info);
+
+	/* Download the objects that did not fill a batch. */
+	if (!ret)
+		download_batch(ctx);
+
+	clear_backfill_context(ctx);
+	return ret;
+}
+
 int cmd_backfill(int argc, const char **argv, const char *prefix)
 {
+	struct backfill_context ctx = {
+		.repo = the_repository,
+		.current_batch = OID_ARRAY_INIT,
+		.batch_size = 16000,
+	};
 	struct option options[] = {
 		OPT_END(),
 	};
@@ -23,7 +120,5 @@ int cmd_backfill(int argc, const char **argv, const char *prefix)
 
 	git_config(git_default_config, NULL);
 
-	die(_("not implemented"));
-
-	return 0;
+	return do_backfill(&ctx);
 }
diff --git a/t/t5620-backfill.sh b/t/t5620-backfill.sh
new file mode 100755
index 00000000000000..43868a4a75f950
--- /dev/null
+++ b/t/t5620-backfill.sh
@@ -0,0 +1,97 @@
+#!/bin/sh
+
+test_description='git backfill on partial clones'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+TEST_PASSES_SANITIZE_LEAK=0
+export TEST_PASSES_SANITIZE_LEAK
+
+. ./test-lib.sh
+
+# We create objects in the 'src' repo.
+test_expect_success 'setup repo for object creation' '
+	echo "{print \$1}" >print_1.awk &&
+	echo "{print \$2}" >print_2.awk &&
+
+	git init src &&
+
+	mkdir -p src/a/b/c &&
+	mkdir -p src/d/e &&
+
+	for i in 1 2
+	do
+		for n in 1 2 3 4
+		do
+			echo "Version $i of file $n" > src/file.$n.txt &&
+			echo "Version $i of file a/$n" > src/a/file.$n.txt &&
+			echo "Version $i of file a/b/$n" > src/a/b/file.$n.txt &&
+			echo "Version $i of file a/b/c/$n" > src/a/b/c/file.$n.txt &&
+			echo "Version $i of file d/$n" > src/d/file.$n.txt &&
+			echo "Version $i of file d/e/$n" > src/d/e/file.$n.txt &&
+			git -C src add . &&
+			git -C src commit -m "Iteration $n" || return 1
+		done
+	done
+'
+
+# Clone 'src' into 'srv.bare' so we have a bare repo to be our origin
+# server for the partial clone.
+test_expect_success 'setup bare clone for server' '
+	git clone --bare "file://$(pwd)/src" srv.bare &&
+	git -C srv.bare config --local uploadpack.allowfilter 1 &&
+	git -C srv.bare config --local uploadpack.allowanysha1inwant 1
+'
+
+# do basic partial clone from "srv.bare"
+test_expect_success 'do partial clone 1, backfill gets all objects' '
+	git clone --no-checkout --filter=blob:none	\
+		--single-branch --branch=main 		\
+		"file://$(pwd)/srv.bare" backfill1 &&
+
+	# Backfill with no options gets everything reachable from HEAD.
+	GIT_TRACE2_EVENT="$(pwd)/backfill-file-trace" git \
+		-C backfill1 backfill &&
+
+	# We should have engaged the partial clone machinery
+	test_trace2_data promisor fetch_count 48 <backfill-file-trace &&
+
+	# No more missing objects!
+	git -C backfill1 rev-list --quiet --objects --missing=print HEAD >revs2 &&
+	test_line_count = 0 revs2
+'
+
+. "$TEST_DIRECTORY"/lib-httpd.sh
+start_httpd
+
+test_expect_success 'create a partial clone over HTTP' '
+	SERVER="$HTTPD_DOCUMENT_ROOT_PATH/server" &&
+	rm -rf "$SERVER" repo &&
+	git clone --bare "file://$(pwd)/src" "$SERVER" &&
+	test_config -C "$SERVER" uploadpack.allowfilter 1 &&
+	test_config -C "$SERVER" uploadpack.allowanysha1inwant 1 &&
+
+	git clone --no-checkout --filter=blob:none \
+		"$HTTPD_URL/smart/server" backfill-http
+'
+
+test_expect_success 'backfilling over HTTP succeeds' '
+	GIT_TRACE2_EVENT="$(pwd)/backfill-http-trace" git \
+		-C backfill-http backfill &&
+
+	# We should have engaged the partial clone machinery
+	test_trace2_data promisor fetch_count 48 <backfill-http-trace &&
+
+	# Confirm all objects are present, none missing.
+	git -C backfill-http rev-list --objects --all >rev-list-out &&
+	awk "{print \$1;}" <rev-list-out >oids &&
+	GIT_TRACE2_EVENT="$(pwd)/walk-trace" git -C backfill-http \
+		cat-file --batch-check <oids >batch-out &&
+	! grep missing batch-out
+'
+
+# DO NOT add non-httpd-specific tests here, because the last part of this
+# test script is only executed when httpd is available and enabled.
+
+test_done

From f904b02e08dda5af1620991418755176e0b731e0 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <derrickstolee@github.com>
Date: Sun, 1 Sep 2024 12:22:10 -0400
Subject: [PATCH 04/30] backfill: add --batch-size=<n> option

Users may want to specify a minimum batch size for their needs. This is only
a minimum: the path-walk API provides a list of OIDs that correspond to the
same path, and thus it is optimal to allow delta compression across those
objects in a single server request.

We could consider limiting the request to have a maximum batch size in the
future.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 Documentation/git-backfill.txt | 10 +++++++++-
 builtin/backfill.c             |  4 +++-
 t/t5620-backfill.sh            | 18 ++++++++++++++++++
 3 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/Documentation/git-backfill.txt b/Documentation/git-backfill.txt
index 0e10f066fef56f..9b0bae04e9d8f3 100644
--- a/Documentation/git-backfill.txt
+++ b/Documentation/git-backfill.txt
@@ -9,7 +9,7 @@ git-backfill - Download missing objects in a partial clone
 SYNOPSIS
 --------
 [verse]
-'git backfill' [<options>]
+'git backfill' [--batch-size=<n>]
 
 DESCRIPTION
 -----------
@@ -38,6 +38,14 @@ delta compression in the packfile sent by the server.
 By default, `git backfill` downloads all blobs reachable from the `HEAD`
 commit. This set can be restricted or expanded using various options.
 
+OPTIONS
+-------
+
+--batch-size=<n>::
+	Specify a minimum size for a batch of missing objects to request
+	from the server. This size may be exceeded by the last set of
+	blobs seen at a given path. Default batch size is 16,000.
+
 SEE ALSO
 --------
 linkgit:git-clone[1].
diff --git a/builtin/backfill.c b/builtin/backfill.c
index 23d40fc02a280a..50006f15740526 100644
--- a/builtin/backfill.c
+++ b/builtin/backfill.c
@@ -21,7 +21,7 @@
 #include "path-walk.h"
 
 static const char * const builtin_backfill_usage[] = {
-	N_("git backfill [<options>]"),
+	N_("git backfill [--batch-size=<n>]"),
 	NULL
 };
 
@@ -109,6 +109,8 @@ int cmd_backfill(int argc, const char **argv, const char *prefix)
 		.batch_size = 16000,
 	};
 	struct option options[] = {
+		OPT_INTEGER(0, "batch-size", &ctx.batch_size,
+			    N_("Minimun number of objects to request at a time")),
 		OPT_END(),
 	};
 
diff --git a/t/t5620-backfill.sh b/t/t5620-backfill.sh
index 43868a4a75f950..2d81559d8e9c25 100755
--- a/t/t5620-backfill.sh
+++ b/t/t5620-backfill.sh
@@ -62,6 +62,24 @@ test_expect_success 'do partial clone 1, backfill gets all objects' '
 	test_line_count = 0 revs2
 '
 
+test_expect_success 'do partial clone 2, backfill batch size' '
+	git clone --no-checkout --filter=blob:none	\
+		--single-branch --branch=main 		\
+		"file://$(pwd)/srv.bare" backfill2 &&
+
+	GIT_TRACE2_EVENT="$(pwd)/batch-trace" git \
+		-C backfill2 backfill --batch-size=20 &&
+
+	# Batches were used
+	test_trace2_data promisor fetch_count 20 <batch-trace >matches &&
+	test_line_count = 2 matches &&
+	test_trace2_data promisor fetch_count 8 <batch-trace &&
+
+	# No more missing objects!
+	git -C backfill2 rev-list --quiet --objects --missing=print HEAD >revs2 &&
+	test_line_count = 0 revs2
+'
+
 . "$TEST_DIRECTORY"/lib-httpd.sh
 start_httpd
 

From cd33c62f9cc7a9b05b0441b956f2b303cd302270 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <derrickstolee@github.com>
Date: Sun, 1 Sep 2024 13:39:18 -0400
Subject: [PATCH 05/30] backfill: add --sparse option

One way to significantly reduce the cost of a Git clone and later fetches is
to use a blobless partial clone and combine that with a sparse-checkout that
reduces the paths that need to be populated in the working directory. Not
only does this reduce the cost of clones and fetches, the sparse-checkout
reduces the number of objects needed to download from a promisor remote.

However, history investigations can be expensie as computing blob diffs will
trigger promisor remote requests for one object at a time. This can be
avoided by downloading the blobs needed for the given sparse-checkout using
'git backfill' and its new '--sparse' mode, at a time that the user is
willing to pay that extra cost.

Note that this is distinctly different from the '--filter=sparse:<oid>'
option, as this assumes that the partial clone has all reachable trees and
we are using client-side logic to avoid downloading blobs outside of the
sparse-checkout cone. This avoids the server-side cost of walking trees
while also achieving a similar goal. It also downloads in batches based on
similar path names, presenting a resumable download if things are
interrupted.

This augments the path-walk API to have a possibly-NULL 'pl' member that may
point to a 'struct pattern_list'. This could be more general than the
sparse-checkout definition at HEAD, but 'git backfill --sparse' is currently
the only consumer.

Be sure to test this in both cone mode and not cone mode. Cone mode has the
benefit that the path-walk can skip certain paths once they would expand
beyond the sparse-checkout.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 Documentation/git-backfill.txt |  6 +++-
 builtin/backfill.c             | 13 +++++++-
 path-walk.c                    | 18 +++++++++++
 path-walk.h                    | 11 +++++++
 t/t5620-backfill.sh            | 55 ++++++++++++++++++++++++++++++++++
 5 files changed, 101 insertions(+), 2 deletions(-)

diff --git a/Documentation/git-backfill.txt b/Documentation/git-backfill.txt
index 9b0bae04e9d8f3..ecf2ac428cefb7 100644
--- a/Documentation/git-backfill.txt
+++ b/Documentation/git-backfill.txt
@@ -9,7 +9,7 @@ git-backfill - Download missing objects in a partial clone
 SYNOPSIS
 --------
 [verse]
-'git backfill' [--batch-size=<n>]
+'git backfill' [--batch-size=<n>] [--[no-]sparse]
 
 DESCRIPTION
 -----------
@@ -46,6 +46,10 @@ OPTIONS
 	from the server. This size may be exceeded by the last set of
 	blobs seen at a given path. Default batch size is 16,000.
 
+--[no-]sparse::
+	Only download objects if they appear at a path that matches the
+	current sparse-checkout.
+
 SEE ALSO
 --------
 linkgit:git-clone[1].
diff --git a/builtin/backfill.c b/builtin/backfill.c
index 50006f15740526..de75471cf44657 100644
--- a/builtin/backfill.c
+++ b/builtin/backfill.c
@@ -4,6 +4,7 @@
 #include "parse-options.h"
 #include "repository.h"
 #include "commit.h"
+#include "dir.h"
 #include "hex.h"
 #include "tree.h"
 #include "tree-walk.h"
@@ -21,7 +22,7 @@
 #include "path-walk.h"
 
 static const char * const builtin_backfill_usage[] = {
-	N_("git backfill [--batch-size=<n>]"),
+	N_("git backfill [--batch-size=<n>] [--[no-]sparse]"),
 	NULL
 };
 
@@ -29,6 +30,7 @@ struct backfill_context {
 	struct repository *repo;
 	struct oid_array current_batch;
 	size_t batch_size;
+	int sparse;
 };
 
 static void clear_backfill_context(struct backfill_context *ctx)
@@ -84,6 +86,12 @@ static int do_backfill(struct backfill_context *ctx)
 	struct path_walk_info info = PATH_WALK_INFO_INIT;
 	int ret;
 
+	if (ctx->sparse) {
+		CALLOC_ARRAY(info.pl, 1);
+		if (get_sparse_checkout_patterns(info.pl))
+			return error(_("problem loading sparse-checkout"));
+	}
+
 	repo_init_revisions(ctx->repo, &revs, "");
 	handle_revision_arg("HEAD", &revs, 0, 0);
 
@@ -107,10 +115,13 @@ int cmd_backfill(int argc, const char **argv, const char *prefix)
 		.repo = the_repository,
 		.current_batch = OID_ARRAY_INIT,
 		.batch_size = 16000,
+		.sparse = 0,
 	};
 	struct option options[] = {
 		OPT_INTEGER(0, "batch-size", &ctx.batch_size,
 			    N_("Minimun number of objects to request at a time")),
+		OPT_BOOL(0, "sparse", &ctx.sparse,
+			 N_("Restrict the missing objects to the current sparse-checkout")),
 		OPT_END(),
 	};
 
diff --git a/path-walk.c b/path-walk.c
index 2edfa0572e4a15..dc2390dd9eaa84 100644
--- a/path-walk.c
+++ b/path-walk.c
@@ -10,6 +10,7 @@
 #include "hex.h"
 #include "object.h"
 #include "oid-array.h"
+#include "repository.h"
 #include "revision.h"
 #include "string-list.h"
 #include "strmap.h"
@@ -111,6 +112,23 @@ static int add_children(struct path_walk_context *ctx,
 		if (type == OBJ_TREE)
 			strbuf_addch(&path, '/');
 
+		if (ctx->info->pl) {
+			int dtype;
+			enum pattern_match_result match;
+			match = path_matches_pattern_list(path.buf, path.len,
+							  path.buf + base_len, &dtype,
+							  ctx->info->pl,
+							  ctx->repo->index);
+
+			if (ctx->info->pl->use_cone_patterns &&
+			    match == NOT_MATCHED)
+				continue;
+			else if (!ctx->info->pl->use_cone_patterns &&
+				 type == OBJ_BLOB &&
+				 match != MATCHED)
+				continue;
+		}
+
 		if (!(list = strmap_get(&ctx->paths_to_lists, path.buf))) {
 			CALLOC_ARRAY(list, 1);
 			list->type = type;
diff --git a/path-walk.h b/path-walk.h
index c9e94a98bc8f6b..bc1ebba5081a8e 100644
--- a/path-walk.h
+++ b/path-walk.h
@@ -6,6 +6,7 @@
 
 struct rev_info;
 struct oid_array;
+struct pattern_list;
 
 /**
  * The type of a function pointer for the method that is called on a list of
@@ -30,6 +31,16 @@ struct path_walk_info {
 	 */
 	path_fn path_fn;
 	void *path_fn_data;
+
+	/**
+	 * Specify a sparse-checkout definition to match our paths to. Do not
+	 * walk outside of this sparse definition. If the patterns are in
+	 * cone mode, then the search may prune directories that are outside
+	 * of the cone. If not in cone mode, then all tree paths will be
+	 * explored but the path_fn will only be called when the path matches
+	 * the sparse-checkout patterns.
+	 */
+	struct pattern_list *pl;
 };
 
 #define PATH_WALK_INFO_INIT { 0 }
diff --git a/t/t5620-backfill.sh b/t/t5620-backfill.sh
index 2d81559d8e9c25..c7bb27b72c15ee 100755
--- a/t/t5620-backfill.sh
+++ b/t/t5620-backfill.sh
@@ -80,6 +80,61 @@ test_expect_success 'do partial clone 2, backfill batch size' '
 	test_line_count = 0 revs2
 '
 
+test_expect_success 'backfill --sparse' '
+	git clone --sparse --filter=blob:none		\
+		--single-branch --branch=main 		\
+		"file://$(pwd)/srv.bare" backfill3 &&
+
+	# Initial checkout includes four files at root.
+	git -C backfill3 rev-list --quiet --objects --missing=print HEAD >missing &&
+	test_line_count = 44 missing &&
+
+	# Initial sparse-checkout is just the files at root, so we get the
+	# older versions of the four files at tip.
+	GIT_TRACE2_EVENT="$(pwd)/sparse-trace1" git \
+		-C backfill3 backfill --sparse &&
+	test_trace2_data promisor fetch_count 4 <sparse-trace1 &&
+	test_trace2_data path-walk paths 5 <sparse-trace1 &&
+	git -C backfill3 rev-list --quiet --objects --missing=print HEAD >missing &&
+	test_line_count = 40 missing &&
+
+	# Expand the sparse-checkout to include 'd' recursively. This
+	# engages the algorithm to skip the trees for 'a'. Note that
+	# the "sparse-checkout set" command downloads the objects at tip
+	# to satisfy the current checkout.
+	git -C backfill3 sparse-checkout set d &&
+	GIT_TRACE2_EVENT="$(pwd)/sparse-trace2" git \
+		-C backfill3 backfill --sparse &&
+	test_trace2_data promisor fetch_count 8 <sparse-trace2 &&
+	test_trace2_data path-walk paths 15 <sparse-trace2 &&
+	git -C backfill3 rev-list --quiet --objects --missing=print HEAD >missing &&
+	test_line_count = 24 missing
+'
+
+test_expect_success 'backfill --sparse without cone mode' '
+	git clone --no-checkout --filter=blob:none		\
+		--single-branch --branch=main 		\
+		"file://$(pwd)/srv.bare" backfill4 &&
+
+	# No blobs yet
+	git -C backfill4 rev-list --quiet --objects --missing=print HEAD >missing &&
+	test_line_count = 48 missing &&
+
+	# Define sparse-checkout by filename regardless of parent directory.
+	# This downloads 6 blobs to satisfy the checkout.
+	git -C backfill4 sparse-checkout set --no-cone "**/file.1.txt" &&
+	git -C backfill4 checkout main &&
+
+	GIT_TRACE2_EVENT="$(pwd)/no-cone-trace1" git \
+		-C backfill4 backfill --sparse &&
+	test_trace2_data promisor fetch_count 6 <no-cone-trace1 &&
+
+	# This walk needed to visit all directories to search for these paths.
+	test_trace2_data path-walk paths 12 <no-cone-trace1 &&
+	git -C backfill4 rev-list --quiet --objects --missing=print HEAD >missing &&
+	test_line_count = 36 missing
+'
+
 . "$TEST_DIRECTORY"/lib-httpd.sh
 start_httpd
 

From aa34653de3b7bc501cb40d8bded3ddaff20f37ae Mon Sep 17 00:00:00 2001
From: Derrick Stolee <derrickstolee@github.com>
Date: Sun, 1 Sep 2024 14:06:11 -0400
Subject: [PATCH 06/30] backfill: assume --sparse when sparse-checkout is
 enabled

The previous change introduced the '--[no-]sparse' option for the 'git
backfill' command, but did not assume it as enabled by default. However,
this is likely the behavior that users will most often want to happen.
Without this default, users with a small sparse-checkout may be confused
when 'git backfill' downloads every version of every object in the full
history.

However, this is left as a separate change so this decision can be reviewed
independently of the value of the '--[no-]sparse' option.

Add a test of adding the '--sparse' option to a repo without sparse-checkout
to make it clear that supplying it without a sparse-checkout is an error.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 Documentation/git-backfill.txt |  3 ++-
 builtin/backfill.c             |  4 ++++
 t/t5620-backfill.sh            | 13 ++++++++++++-
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/Documentation/git-backfill.txt b/Documentation/git-backfill.txt
index ecf2ac428cefb7..066ec6b161a22c 100644
--- a/Documentation/git-backfill.txt
+++ b/Documentation/git-backfill.txt
@@ -48,7 +48,8 @@ OPTIONS
 
 --[no-]sparse::
 	Only download objects if they appear at a path that matches the
-	current sparse-checkout.
+	current sparse-checkout. If the sparse-checkout feature is enabled,
+	then `--sparse` is assumed and can be disabled with `--no-sparse`.
 
 SEE ALSO
 --------
diff --git a/builtin/backfill.c b/builtin/backfill.c
index de75471cf44657..82a18e58a4102f 100644
--- a/builtin/backfill.c
+++ b/builtin/backfill.c
@@ -5,6 +5,7 @@
 #include "repository.h"
 #include "commit.h"
 #include "dir.h"
+#include "environment.h"
 #include "hex.h"
 #include "tree.h"
 #include "tree-walk.h"
@@ -133,5 +134,8 @@ int cmd_backfill(int argc, const char **argv, const char *prefix)
 
 	git_config(git_default_config, NULL);
 
+	if (ctx.sparse < 0)
+		ctx.sparse = core_apply_sparse_checkout;
+
 	return do_backfill(&ctx);
 }
diff --git a/t/t5620-backfill.sh b/t/t5620-backfill.sh
index c7bb27b72c15ee..1fa2e90f8cf57f 100755
--- a/t/t5620-backfill.sh
+++ b/t/t5620-backfill.sh
@@ -80,6 +80,12 @@ test_expect_success 'do partial clone 2, backfill batch size' '
 	test_line_count = 0 revs2
 '
 
+test_expect_success 'backfill --sparse without sparse-checkout fails' '
+	git init not-sparse &&
+	test_must_fail git -C not-sparse backfill --sparse 2>err &&
+	grep "problem loading sparse-checkout" err
+'
+
 test_expect_success 'backfill --sparse' '
 	git clone --sparse --filter=blob:none		\
 		--single-branch --branch=main 		\
@@ -108,7 +114,12 @@ test_expect_success 'backfill --sparse' '
 	test_trace2_data promisor fetch_count 8 <sparse-trace2 &&
 	test_trace2_data path-walk paths 15 <sparse-trace2 &&
 	git -C backfill3 rev-list --quiet --objects --missing=print HEAD >missing &&
-	test_line_count = 24 missing
+	test_line_count = 24 missing &&
+
+	# Disabling the --sparse option (on by default) will download everything
+	git -C backfill3 backfill --no-sparse &&
+	git -C backfill3 rev-list --quiet --objects --missing=print HEAD >missing &&
+	test_line_count = 0 missing
 '
 
 test_expect_success 'backfill --sparse without cone mode' '

From 2829fe3875438f3a9907f36d825d6c24952abded Mon Sep 17 00:00:00 2001
From: Derrick Stolee <derrickstolee@github.com>
Date: Sun, 1 Sep 2024 14:25:33 -0400
Subject: [PATCH 07/30] path-walk: allow consumer to specify object types

This adds the ability to ask for the commits as a single list. This will
also reduce the calls in 'git backfill' to be a BUG() statement if called
with anything other than blobs.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 builtin/backfill.c |  2 +-
 path-walk.c        | 40 ++++++++++++++++++++++++++++++++++------
 path-walk.h        | 12 +++++++++++-
 3 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/builtin/backfill.c b/builtin/backfill.c
index 82a18e58a4102f..2a1b043f18821a 100644
--- a/builtin/backfill.c
+++ b/builtin/backfill.c
@@ -61,7 +61,7 @@ static int fill_missing_blobs(const char *path,
 	struct backfill_context *ctx = data;
 
 	if (type != OBJ_BLOB)
-		return 0;
+		BUG("fill_missing_blobs only takes blob objects");
 
 	for (size_t i = 0; i < list->nr; i++) {
 		off_t size = 0;
diff --git a/path-walk.c b/path-walk.c
index dc2390dd9eaa84..d70e6840fb5864 100644
--- a/path-walk.c
+++ b/path-walk.c
@@ -83,6 +83,10 @@ static int add_children(struct path_walk_context *ctx,
 		if (S_ISGITLINK(entry.mode))
 			continue;
 
+		/* If the caller doesn't want blobs, then don't bother. */
+		if (!ctx->info->blobs && type == OBJ_BLOB)
+			continue;
+
 		if (type == OBJ_TREE) {
 			struct tree *child = lookup_tree(ctx->repo, &entry.oid);
 			o = child ? &child->object : NULL;
@@ -156,9 +160,11 @@ static int walk_path(struct path_walk_context *ctx,
 
 	list = strmap_get(&ctx->paths_to_lists, path);
 
-	/* Evaluate function pointer on this data. */
-	ret = ctx->info->path_fn(path, &list->oids, list->type,
-				 ctx->info->path_fn_data);
+	/* Evaluate function pointer on this data, if requested. */
+	if ((list->type == OBJ_TREE && ctx->info->trees) ||
+	    (list->type == OBJ_BLOB && ctx->info->blobs))
+		ret = ctx->info->path_fn(path, &list->oids, list->type,
+					ctx->info->path_fn_data);
 
 	/* Expand data for children. */
 	if (list->type == OBJ_TREE) {
@@ -200,6 +206,7 @@ int walk_objects_by_path(struct path_walk_info *info)
 	size_t commits_nr = 0, paths_nr = 0;
 	struct commit *c;
 	struct type_and_oid_list *root_tree_list;
+	struct type_and_oid_list *commit_list;
 	struct path_walk_context ctx = {
 		.repo = info->revs->repo,
 		.revs = info->revs,
@@ -210,28 +217,49 @@ int walk_objects_by_path(struct path_walk_info *info)
 
 	trace2_region_enter("path-walk", "commit-walk", info->revs->repo);
 
+	CALLOC_ARRAY(commit_list, 1);
+	commit_list->type = OBJ_COMMIT;
+
 	/* Insert a single list for the root tree into the paths. */
 	CALLOC_ARRAY(root_tree_list, 1);
 	root_tree_list->type = OBJ_TREE;
 	strmap_put(&ctx.paths_to_lists, root_path, root_tree_list);
-
 	if (prepare_revision_walk(info->revs))
 		die(_("failed to setup revision walk"));
 
 	while ((c = get_revision(info->revs))) {
-		struct object_id *oid = get_commit_tree_oid(c);
-		struct tree *t = lookup_tree(info->revs->repo, oid);
+		struct object_id *oid;
+		struct tree *t;
 		commits_nr++;
 
+		if (info->commits)
+			oid_array_append(&commit_list->oids,
+					 &c->object.oid);
+
+		/* If we only care about commits, then skip trees. */
+		if (!info->trees && !info->blobs)
+			continue;
+
+		oid = get_commit_tree_oid(c);
+		t = lookup_tree(info->revs->repo, oid);
+
 		if (t)
 			oid_array_append(&root_tree_list->oids, oid);
 		else
 			warning("could not find tree %s", oid_to_hex(oid));
+
 	}
 
 	trace2_data_intmax("path-walk", ctx.repo, "commits", commits_nr);
 	trace2_region_leave("path-walk", "commit-walk", info->revs->repo);
 
+	/* Track all commits. */
+	if (info->commits)
+		ret = info->path_fn("", &commit_list->oids, OBJ_COMMIT,
+				    info->path_fn_data);
+	oid_array_clear(&commit_list->oids);
+	free(commit_list);
+
 	string_list_append(&ctx.path_stack, root_path);
 
 	trace2_region_enter("path-walk", "path-walk", info->revs->repo);
diff --git a/path-walk.h b/path-walk.h
index bc1ebba5081a8e..49b982dade6e45 100644
--- a/path-walk.h
+++ b/path-walk.h
@@ -32,6 +32,14 @@ struct path_walk_info {
 	path_fn path_fn;
 	void *path_fn_data;
 
+	/**
+	 * Initialize which object types the path_fn should be called on. This
+	 * could also limit the walk to skip blobs if not set.
+	 */
+	int commits;
+	int trees;
+	int blobs;
+
 	/**
 	 * Specify a sparse-checkout definition to match our paths to. Do not
 	 * walk outside of this sparse definition. If the patterns are in
@@ -43,7 +51,9 @@ struct path_walk_info {
 	struct pattern_list *pl;
 };
 
-#define PATH_WALK_INFO_INIT { 0 }
+#define PATH_WALK_INFO_INIT {   \
+	.blobs = 1,		\
+}
 
 /**
  * Given the configuration of 'info', walk the commits based on 'info->revs' and

From d67679dc1a3cf055cf9357f2a6bc9ff990cfef43 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Mon, 9 Sep 2024 12:31:21 -0400
Subject: [PATCH 08/30] path-walk: allow visiting tags

In anticipation of using the path-walk API to analyze tags or include
them in a pack-file, add the ability to walk the tags that were included
in the revision walk.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 path-walk.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 path-walk.h |  1 +
 2 files changed, 59 insertions(+)

diff --git a/path-walk.c b/path-walk.c
index d70e6840fb5864..65f9856afa2f66 100644
--- a/path-walk.c
+++ b/path-walk.c
@@ -14,6 +14,7 @@
 #include "revision.h"
 #include "string-list.h"
 #include "strmap.h"
+#include "tag.h"
 #include "trace2.h"
 #include "tree.h"
 #include "tree-walk.h"
@@ -215,6 +216,9 @@ int walk_objects_by_path(struct path_walk_info *info)
 		.paths_to_lists = STRMAP_INIT
 	};
 
+	struct oid_array tagged_tree_list = OID_ARRAY_INIT;
+	struct oid_array tagged_blob_list = OID_ARRAY_INIT;
+
 	trace2_region_enter("path-walk", "commit-walk", info->revs->repo);
 
 	CALLOC_ARRAY(commit_list, 1);
@@ -260,6 +264,60 @@ int walk_objects_by_path(struct path_walk_info *info)
 	oid_array_clear(&commit_list->oids);
 	free(commit_list);
 
+	if (info->tags) {
+		struct oid_array tags = OID_ARRAY_INIT;
+
+		trace2_region_enter("path-walk", "tag-walk", info->revs->repo);
+
+		/*
+		 * Walk any pending objects at this point, but they should only
+		 * be tags.
+		 */
+		for (size_t i = 0; i < info->revs->pending.nr; i++) {
+			struct object_array_entry *pending = info->revs->pending.objects + i;
+			struct object *obj = pending->item;
+
+			while (obj->type == OBJ_TAG) {
+				struct tag *tag = lookup_tag(info->revs->repo,
+							     &obj->oid);
+				oid_array_append(&tags, &obj->oid);
+				obj = tag->tagged;
+			}
+
+			switch (obj->type) {
+			case OBJ_TREE:
+				oid_array_append(&tagged_tree_list, &obj->oid);
+				break;
+
+			case OBJ_BLOB:
+				oid_array_append(&tagged_blob_list, &obj->oid);
+				break;
+
+			case OBJ_COMMIT:
+				/* skip */
+				break;
+
+			default:
+				BUG("should not see any other type here");
+			}
+		}
+
+		info->path_fn("initial", &tags, OBJ_TAG, info->path_fn_data);
+
+		if (tagged_tree_list.nr)
+			info->path_fn("tagged-trees", &tagged_tree_list, OBJ_TREE,
+				      info->path_fn_data);
+		if (tagged_blob_list.nr)
+			info->path_fn("tagged-blobs", &tagged_blob_list, OBJ_BLOB,
+				      info->path_fn_data);
+
+		trace2_data_intmax("path-walk", ctx.repo, "tags", tags.nr);
+		trace2_region_leave("path-walk", "tag-walk", info->revs->repo);
+		oid_array_clear(&tags);
+		oid_array_clear(&tagged_tree_list);
+		oid_array_clear(&tagged_blob_list);
+	}
+
 	string_list_append(&ctx.path_stack, root_path);
 
 	trace2_region_enter("path-walk", "path-walk", info->revs->repo);
diff --git a/path-walk.h b/path-walk.h
index 49b982dade6e45..637d3b0cabb69a 100644
--- a/path-walk.h
+++ b/path-walk.h
@@ -39,6 +39,7 @@ struct path_walk_info {
 	int commits;
 	int trees;
 	int blobs;
+	int tags;
 
 	/**
 	 * Specify a sparse-checkout definition to match our paths to. Do not

From 7d43a1634bbe2d2efa96a806e3de1f1fd480041b Mon Sep 17 00:00:00 2001
From: Jeff Hostetler <jeffhostetler@github.com>
Date: Mon, 29 Apr 2024 08:55:03 -0400
Subject: [PATCH 09/30] survey: stub in new experimental `git-survey` command

Start work on a new `git survey` command to scan the repository
for monorepo performance and scaling problems.  The goal is to
measure the various known "dimensions of scale" and serve as a
foundation for adding additional measurements as we learn more
about Git monorepo scaling problems.

The initial goal is to complement the scanning and analysis performed
by the GO-based `git-sizer` (https://github.com/github/git-sizer) tool.
It is hoped that by creating a builtin command, we may be able to take
advantage of internal Git data structures and code that is not
accessible from GO to gain further insight into potential scaling
problems.

RFC TODO: Adapt this boilerplat to match the upcoming changes to builtin
methods that include a 'struct repository' pointer.

Co-authored-by: Derrick Stolee <stolee@gmail.com>
Signed-off-by: Jeff Hostetler <jeffhostetler@github.com>
Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 .gitignore                   |  1 +
 Documentation/git-survey.txt | 36 ++++++++++++++++++++++
 Makefile                     |  1 +
 builtin.h                    |  1 +
 builtin/survey.c             | 60 ++++++++++++++++++++++++++++++++++++
 command-list.txt             |  1 +
 git.c                        |  1 +
 t/t8100-git-survey.sh        | 18 +++++++++++
 8 files changed, 119 insertions(+)
 create mode 100644 Documentation/git-survey.txt
 create mode 100644 builtin/survey.c
 create mode 100755 t/t8100-git-survey.sh

diff --git a/.gitignore b/.gitignore
index 8f5cb938ecbf36..3f6fdb31a5ea32 100644
--- a/.gitignore
+++ b/.gitignore
@@ -165,6 +165,7 @@
 /git-submodule
 /git-submodule--helper
 /git-subtree
+/git-survey
 /git-svn
 /git-switch
 /git-symbolic-ref
diff --git a/Documentation/git-survey.txt b/Documentation/git-survey.txt
new file mode 100644
index 00000000000000..cdd1ec4358b8bb
--- /dev/null
+++ b/Documentation/git-survey.txt
@@ -0,0 +1,36 @@
+git-survey(1)
+=============
+
+NAME
+----
+git-survey - EXPERIMENTAL: Measure various repository dimensions of scale
+
+SYNOPSIS
+--------
+[verse]
+(EXPERIMENTAL!) `git survey` <options>
+
+DESCRIPTION
+-----------
+
+Survey the repository and measure various dimensions of scale.
+
+As repositories grow to "monorepo" size, certain data shapes can cause
+performance problems.  `git-survey` attempts to measure and report on
+known problem areas.
+
+OPTIONS
+-------
+
+--progress::
+	Show progress.  This is automatically enabled when interactive.
+
+OUTPUT
+------
+
+By default, `git survey` will print information about the repository in a
+human-readable format that includes overviews and tables.
+
+GIT
+---
+Part of the linkgit:git[1] suite
diff --git a/Makefile b/Makefile
index 4305474d96e0f7..154de6e01d06b2 100644
--- a/Makefile
+++ b/Makefile
@@ -1303,6 +1303,7 @@ BUILTIN_OBJS += builtin/sparse-checkout.o
 BUILTIN_OBJS += builtin/stash.o
 BUILTIN_OBJS += builtin/stripspace.o
 BUILTIN_OBJS += builtin/submodule--helper.o
+BUILTIN_OBJS += builtin/survey.o
 BUILTIN_OBJS += builtin/symbolic-ref.o
 BUILTIN_OBJS += builtin/tag.o
 BUILTIN_OBJS += builtin/unpack-file.o
diff --git a/builtin.h b/builtin.h
index 73dd0ccbe8c961..d4e8cf3b97b590 100644
--- a/builtin.h
+++ b/builtin.h
@@ -239,6 +239,7 @@ int cmd_status(int argc, const char **argv, const char *prefix);
 int cmd_stash(int argc, const char **argv, const char *prefix);
 int cmd_stripspace(int argc, const char **argv, const char *prefix);
 int cmd_submodule__helper(int argc, const char **argv, const char *prefix);
+int cmd_survey(int argc, const char **argv, const char *prefix);
 int cmd_switch(int argc, const char **argv, const char *prefix);
 int cmd_symbolic_ref(int argc, const char **argv, const char *prefix);
 int cmd_tag(int argc, const char **argv, const char *prefix);
diff --git a/builtin/survey.c b/builtin/survey.c
new file mode 100644
index 00000000000000..4cfd0f0293c3b3
--- /dev/null
+++ b/builtin/survey.c
@@ -0,0 +1,60 @@
+#include "builtin.h"
+#include "config.h"
+#include "parse-options.h"
+
+static const char * const survey_usage[] = {
+	N_("(EXPERIMENTAL!) git survey <options>"),
+	NULL,
+};
+
+struct survey_opts {
+	int verbose;
+	int show_progress;
+};
+
+static struct survey_opts survey_opts = {
+	.verbose = 0,
+	.show_progress = -1, /* defaults to isatty(2) */
+};
+
+static struct option survey_options[] = {
+	OPT__VERBOSE(&survey_opts.verbose, N_("verbose output")),
+	OPT_BOOL(0, "progress", &survey_opts.show_progress, N_("show progress")),
+	OPT_END(),
+};
+
+static int survey_load_config_cb(const char *var, const char *value,
+				 const struct config_context *ctx, void *pvoid)
+{
+	if (!strcmp(var, "survey.verbose")) {
+		survey_opts.verbose = git_config_bool(var, value);
+		return 0;
+	}
+	if (!strcmp(var, "survey.progress")) {
+		survey_opts.show_progress = git_config_bool(var, value);
+		return 0;
+	}
+
+	return git_default_config(var, value, ctx, pvoid);
+}
+
+static void survey_load_config(void)
+{
+	git_config(survey_load_config_cb, NULL);
+}
+
+int cmd_survey(int argc, const char **argv, const char *prefix)
+{
+	if (argc == 2 && !strcmp(argv[1], "-h"))
+		usage_with_options(survey_usage, survey_options);
+
+	prepare_repo_settings(the_repository);
+	survey_load_config();
+
+	argc = parse_options(argc, argv, prefix, survey_options, survey_usage, 0);
+
+	if (survey_opts.show_progress < 0)
+		survey_opts.show_progress = isatty(2);
+
+	return 0;
+}
diff --git a/command-list.txt b/command-list.txt
index c537114b4687b8..ecc9d2281a0909 100644
--- a/command-list.txt
+++ b/command-list.txt
@@ -187,6 +187,7 @@ git-stash                               mainporcelain
 git-status                              mainporcelain           info
 git-stripspace                          purehelpers
 git-submodule                           mainporcelain
+git-survey                              mainporcelain
 git-svn                                 foreignscminterface
 git-switch                              mainporcelain           history
 git-symbolic-ref                        plumbingmanipulators
diff --git a/git.c b/git.c
index 4f2215e9c8b0cf..98e90838e42fbf 100644
--- a/git.c
+++ b/git.c
@@ -630,6 +630,7 @@ static struct cmd_struct commands[] = {
 	{ "status", cmd_status, RUN_SETUP | NEED_WORK_TREE },
 	{ "stripspace", cmd_stripspace },
 	{ "submodule--helper", cmd_submodule__helper, RUN_SETUP },
+	{ "survey", cmd_survey, RUN_SETUP },
 	{ "switch", cmd_switch, RUN_SETUP | NEED_WORK_TREE },
 	{ "symbolic-ref", cmd_symbolic_ref, RUN_SETUP },
 	{ "tag", cmd_tag, RUN_SETUP | DELAY_PAGER_CONFIG },
diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh
new file mode 100755
index 00000000000000..2df7fa83629301
--- /dev/null
+++ b/t/t8100-git-survey.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+
+test_description='git survey'
+
+GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main
+export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME
+
+TEST_PASSES_SANITIZE_LEAK=0
+export TEST_PASSES_SANITIZE_LEAK
+
+. ./test-lib.sh
+
+test_expect_success 'git survey -h shows experimental warning' '
+	test_expect_code 129 git survey -h 2>usage &&
+	grep "EXPERIMENTAL!" usage
+'
+
+test_done

From 90986876381e4ccc10c5e191a3928407181e6a04 Mon Sep 17 00:00:00 2001
From: Jeff Hostetler <jeffhostetler@github.com>
Date: Mon, 29 Apr 2024 09:51:34 -0400
Subject: [PATCH 10/30] survey: add command line opts to select references

By default we will scan all references in "refs/heads/", "refs/tags/"
and "refs/remotes/".

Add command line opts let the use ask for all refs or a subset of them
and to include a detached HEAD.

Signed-off-by: Jeff Hostetler <jeffhostetler@github.com>
Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 Documentation/git-survey.txt | 34 +++++++++++++
 builtin/survey.c             | 99 ++++++++++++++++++++++++++++++++++++
 2 files changed, 133 insertions(+)

diff --git a/Documentation/git-survey.txt b/Documentation/git-survey.txt
index cdd1ec4358b8bb..c648ef704e3806 100644
--- a/Documentation/git-survey.txt
+++ b/Documentation/git-survey.txt
@@ -19,12 +19,46 @@ As repositories grow to "monorepo" size, certain data shapes can cause
 performance problems.  `git-survey` attempts to measure and report on
 known problem areas.
 
+Ref Selection and Reachable Objects
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In this first analysis phase, `git survey` will iterate over the set of
+requested branches, tags, and other refs and treewalk over all of the
+reachable commits, trees, and blobs and generate various statistics.
+
 OPTIONS
 -------
 
 --progress::
 	Show progress.  This is automatically enabled when interactive.
 
+Ref Selection
+~~~~~~~~~~~~~
+
+The following options control the set of refs that `git survey` will examine.
+By default, `git survey` will look at tags, local branches, and remote refs.
+If any of the following options are given, the default set is cleared and
+only refs for the given options are added.
+
+--all-refs::
+	Use all refs.  This includes local branches, tags, remote refs,
+	notes, and stashes.  This option overrides all of the following.
+
+--branches::
+	Add local branches (`refs/heads/`) to the set.
+
+--tags::
+	Add tags (`refs/tags/`) to the set.
+
+--remotes::
+	Add remote branches (`refs/remote/`) to the set.
+
+--detached::
+	Add HEAD to the set.
+
+--other::
+	Add notes (`refs/notes/`) and stashes (`refs/stash/`) to the set.
+
 OUTPUT
 ------
 
diff --git a/builtin/survey.c b/builtin/survey.c
index 4cfd0f0293c3b3..e0e844201de4c3 100644
--- a/builtin/survey.c
+++ b/builtin/survey.c
@@ -7,19 +7,117 @@ static const char * const survey_usage[] = {
 	NULL,
 };
 
+struct survey_refs_wanted {
+	int want_all_refs; /* special override */
+
+	int want_branches;
+	int want_tags;
+	int want_remotes;
+	int want_detached;
+	int want_other; /* see FILTER_REFS_OTHERS -- refs/notes/, refs/stash/ */
+};
+
+/*
+ * The set of refs that we will search if the user doesn't select
+ * any on the command line.
+ */
+static struct survey_refs_wanted refs_if_unspecified = {
+	.want_all_refs = 0,
+
+	.want_branches = 1,
+	.want_tags = 1,
+	.want_remotes = 1,
+	.want_detached = 0,
+	.want_other = 0,
+};
+
 struct survey_opts {
 	int verbose;
 	int show_progress;
+	struct survey_refs_wanted refs;
 };
 
 static struct survey_opts survey_opts = {
 	.verbose = 0,
 	.show_progress = -1, /* defaults to isatty(2) */
+
+	.refs.want_all_refs = -1,
+
+	.refs.want_branches = -1, /* default these to undefined */
+	.refs.want_tags = -1,
+	.refs.want_remotes = -1,
+	.refs.want_detached = -1,
+	.refs.want_other = -1,
 };
 
+/*
+ * After parsing the command line arguments, figure out which refs we
+ * should scan.
+ *
+ * If ANY were given in positive sense, then we ONLY include them and
+ * do not use the builtin values.
+ */
+static void fixup_refs_wanted(void)
+{
+	struct survey_refs_wanted *rw = &survey_opts.refs;
+
+	/*
+	 * `--all-refs` overrides and enables everything.
+	 */
+	if (rw->want_all_refs == 1) {
+		rw->want_branches = 1;
+		rw->want_tags = 1;
+		rw->want_remotes = 1;
+		rw->want_detached = 1;
+		rw->want_other = 1;
+		return;
+	}
+
+	/*
+	 * If none of the `--<ref-type>` were given, we assume all
+	 * of the builtin unspecified values.
+	 */
+	if (rw->want_branches == -1 &&
+	    rw->want_tags == -1 &&
+	    rw->want_remotes == -1 &&
+	    rw->want_detached == -1 &&
+	    rw->want_other == -1) {
+		*rw = refs_if_unspecified;
+		return;
+	}
+
+	/*
+	 * Since we only allow positive boolean values on the command
+	 * line, we will only have true values where they specified
+	 * a `--<ref-type>`.
+	 *
+	 * So anything that still has an unspecified value should be
+	 * set to false.
+	 */
+	if (rw->want_branches == -1)
+		rw->want_branches = 0;
+	if (rw->want_tags == -1)
+		rw->want_tags = 0;
+	if (rw->want_remotes == -1)
+		rw->want_remotes = 0;
+	if (rw->want_detached == -1)
+		rw->want_detached = 0;
+	if (rw->want_other == -1)
+		rw->want_other = 0;
+}
+
 static struct option survey_options[] = {
 	OPT__VERBOSE(&survey_opts.verbose, N_("verbose output")),
 	OPT_BOOL(0, "progress", &survey_opts.show_progress, N_("show progress")),
+
+	OPT_BOOL_F(0, "all-refs", &survey_opts.refs.want_all_refs, N_("include all refs"),          PARSE_OPT_NONEG),
+
+	OPT_BOOL_F(0, "branches", &survey_opts.refs.want_branches, N_("include branches"),          PARSE_OPT_NONEG),
+	OPT_BOOL_F(0, "tags",     &survey_opts.refs.want_tags,     N_("include tags"),              PARSE_OPT_NONEG),
+	OPT_BOOL_F(0, "remotes",  &survey_opts.refs.want_remotes,  N_("include all remotes refs"),  PARSE_OPT_NONEG),
+	OPT_BOOL_F(0, "detached", &survey_opts.refs.want_detached, N_("include detached HEAD"),     PARSE_OPT_NONEG),
+	OPT_BOOL_F(0, "other",    &survey_opts.refs.want_other,    N_("include notes and stashes"), PARSE_OPT_NONEG),
+
 	OPT_END(),
 };
 
@@ -55,6 +153,7 @@ int cmd_survey(int argc, const char **argv, const char *prefix)
 
 	if (survey_opts.show_progress < 0)
 		survey_opts.show_progress = isatty(2);
+	fixup_refs_wanted();
 
 	return 0;
 }

From efa1793a5729b152b8961238dd834a26275e969a Mon Sep 17 00:00:00 2001
From: Jeff Hostetler <jeffhostetler@github.com>
Date: Mon, 29 Apr 2024 10:38:17 -0400
Subject: [PATCH 11/30] survey: collect the set of requested refs

Collect the set of requested branches, tags, and etc into a ref_array and
collect the set of requested patterns into a strvec.

RFC TODO: This patch has some changes that should be in the previous patch,
to make the diff look a lot better.

Co-authored-by: Derrick Stolee <stolee@gmail.com>
Signed-off-by: Jeff Hostetler <jeffhostetler@github.com>
Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 builtin/survey.c      | 258 ++++++++++++++++++++++++++++++++++--------
 t/t8100-git-survey.sh |   9 ++
 2 files changed, 217 insertions(+), 50 deletions(-)

diff --git a/builtin/survey.c b/builtin/survey.c
index e0e844201de4c3..1b4fe591e59d49 100644
--- a/builtin/survey.c
+++ b/builtin/survey.c
@@ -1,6 +1,12 @@
 #include "builtin.h"
 #include "config.h"
+#include "object.h"
+#include "object-store-ll.h"
 #include "parse-options.h"
+#include "progress.h"
+#include "ref-filter.h"
+#include "strvec.h"
+#include "trace2.h"
 
 static const char * const survey_usage[] = {
 	N_("(EXPERIMENTAL!) git survey <options>"),
@@ -17,18 +23,8 @@ struct survey_refs_wanted {
 	int want_other; /* see FILTER_REFS_OTHERS -- refs/notes/, refs/stash/ */
 };
 
-/*
- * The set of refs that we will search if the user doesn't select
- * any on the command line.
- */
-static struct survey_refs_wanted refs_if_unspecified = {
-	.want_all_refs = 0,
-
-	.want_branches = 1,
-	.want_tags = 1,
-	.want_remotes = 1,
-	.want_detached = 0,
-	.want_other = 0,
+static struct survey_refs_wanted default_ref_options = {
+	.want_all_refs = 1,
 };
 
 struct survey_opts {
@@ -37,19 +33,51 @@ struct survey_opts {
 	struct survey_refs_wanted refs;
 };
 
-static struct survey_opts survey_opts = {
-	.verbose = 0,
-	.show_progress = -1, /* defaults to isatty(2) */
+struct survey_report_ref_summary {
+	size_t refs_nr;
+	size_t branches_nr;
+	size_t remote_refs_nr;
+	size_t tags_nr;
+	size_t tags_annotated_nr;
+	size_t others_nr;
+	size_t unknown_nr;
+};
+
+/**
+ * This struct contains all of the information that needs to be printed
+ * at the end of the exploration of the repository and its references.
+ */
+struct survey_report {
+	struct survey_report_ref_summary refs;
+};
+
+struct survey_context {
+	/* Options that control what is done. */
+	struct survey_opts opts;
+
+	/* Info for output only. */
+	struct survey_report report;
 
-	.refs.want_all_refs = -1,
+	/*
+	 * The rest of the members are about enabling the activity
+	 * of the 'git survey' command, including ref listings, object
+	 * pointers, and progress.
+	 */
+
+	struct repository *repo;
+
+	struct progress *progress;
+	size_t progress_nr;
+	size_t progress_total;
 
-	.refs.want_branches = -1, /* default these to undefined */
-	.refs.want_tags = -1,
-	.refs.want_remotes = -1,
-	.refs.want_detached = -1,
-	.refs.want_other = -1,
+	struct strvec refs;
 };
 
+static void clear_survey_context(struct survey_context *ctx)
+{
+	strvec_clear(&ctx->refs);
+}
+
 /*
  * After parsing the command line arguments, figure out which refs we
  * should scan.
@@ -57,9 +85,9 @@ static struct survey_opts survey_opts = {
  * If ANY were given in positive sense, then we ONLY include them and
  * do not use the builtin values.
  */
-static void fixup_refs_wanted(void)
+static void fixup_refs_wanted(struct survey_context *ctx)
 {
-	struct survey_refs_wanted *rw = &survey_opts.refs;
+	struct survey_refs_wanted *rw = &ctx->opts.refs;
 
 	/*
 	 * `--all-refs` overrides and enables everything.
@@ -82,7 +110,7 @@ static void fixup_refs_wanted(void)
 	    rw->want_remotes == -1 &&
 	    rw->want_detached == -1 &&
 	    rw->want_other == -1) {
-		*rw = refs_if_unspecified;
+		*rw = default_ref_options;
 		return;
 	}
 
@@ -106,54 +134,184 @@ static void fixup_refs_wanted(void)
 		rw->want_other = 0;
 }
 
-static struct option survey_options[] = {
-	OPT__VERBOSE(&survey_opts.verbose, N_("verbose output")),
-	OPT_BOOL(0, "progress", &survey_opts.show_progress, N_("show progress")),
-
-	OPT_BOOL_F(0, "all-refs", &survey_opts.refs.want_all_refs, N_("include all refs"),          PARSE_OPT_NONEG),
-
-	OPT_BOOL_F(0, "branches", &survey_opts.refs.want_branches, N_("include branches"),          PARSE_OPT_NONEG),
-	OPT_BOOL_F(0, "tags",     &survey_opts.refs.want_tags,     N_("include tags"),              PARSE_OPT_NONEG),
-	OPT_BOOL_F(0, "remotes",  &survey_opts.refs.want_remotes,  N_("include all remotes refs"),  PARSE_OPT_NONEG),
-	OPT_BOOL_F(0, "detached", &survey_opts.refs.want_detached, N_("include detached HEAD"),     PARSE_OPT_NONEG),
-	OPT_BOOL_F(0, "other",    &survey_opts.refs.want_other,    N_("include notes and stashes"), PARSE_OPT_NONEG),
-
-	OPT_END(),
-};
-
 static int survey_load_config_cb(const char *var, const char *value,
-				 const struct config_context *ctx, void *pvoid)
+				 const struct config_context *cctx, void *pvoid)
 {
+	struct survey_context *sctx = pvoid;
 	if (!strcmp(var, "survey.verbose")) {
-		survey_opts.verbose = git_config_bool(var, value);
+		sctx->opts.verbose = git_config_bool(var, value);
 		return 0;
 	}
 	if (!strcmp(var, "survey.progress")) {
-		survey_opts.show_progress = git_config_bool(var, value);
+		sctx->opts.show_progress = git_config_bool(var, value);
 		return 0;
 	}
 
-	return git_default_config(var, value, ctx, pvoid);
+	return git_default_config(var, value, cctx, pvoid);
 }
 
-static void survey_load_config(void)
+static void survey_load_config(struct survey_context *ctx)
 {
-	git_config(survey_load_config_cb, NULL);
+	git_config(survey_load_config_cb, ctx);
+}
+
+static void do_load_refs(struct survey_context *ctx,
+			 struct ref_array *ref_array)
+{
+	struct ref_filter filter = REF_FILTER_INIT;
+	struct ref_sorting *sorting;
+	struct string_list sorting_options = STRING_LIST_INIT_DUP;
+
+	string_list_append(&sorting_options, "objectname");
+	sorting = ref_sorting_options(&sorting_options);
+
+	if (ctx->opts.refs.want_detached)
+		strvec_push(&ctx->refs, "HEAD");
+
+	if (ctx->opts.refs.want_all_refs) {
+		strvec_push(&ctx->refs, "refs/");
+	} else {
+		if (ctx->opts.refs.want_branches)
+			strvec_push(&ctx->refs, "refs/heads/");
+		if (ctx->opts.refs.want_tags)
+			strvec_push(&ctx->refs, "refs/tags/");
+		if (ctx->opts.refs.want_remotes)
+			strvec_push(&ctx->refs, "refs/remotes/");
+		if (ctx->opts.refs.want_other) {
+			strvec_push(&ctx->refs, "refs/notes/");
+			strvec_push(&ctx->refs, "refs/stash/");
+		}
+	}
+
+	filter.name_patterns = ctx->refs.v;
+	filter.ignore_case = 0;
+	filter.match_as_path = 1;
+
+	if (ctx->opts.show_progress) {
+		ctx->progress_total = 0;
+		ctx->progress = start_progress(_("Scanning refs..."), 0);
+	}
+
+	filter_refs(ref_array, &filter, FILTER_REFS_KIND_MASK);
+
+	if (ctx->opts.show_progress) {
+		ctx->progress_total = ref_array->nr;
+		display_progress(ctx->progress, ctx->progress_total);
+	}
+
+	ref_array_sort(sorting, ref_array);
+
+	stop_progress(&ctx->progress);
+	ref_filter_clear(&filter);
+	ref_sorting_release(sorting);
+}
+
+/*
+ * The REFS phase:
+ *
+ * Load the set of requested refs and assess them for scalablity problems.
+ * Use that set to start a treewalk to all reachable objects and assess
+ * them.
+ *
+ * This data will give us insights into the repository itself (the number
+ * of refs, the size and shape of the DAG, the number and size of the
+ * objects).
+ *
+ * Theoretically, this data is independent of the on-disk representation
+ * (e.g. independent of packing concerns).
+ */
+static void survey_phase_refs(struct survey_context *ctx)
+{
+	struct ref_array ref_array = { 0 };
+
+	trace2_region_enter("survey", "phase/refs", ctx->repo);
+	do_load_refs(ctx, &ref_array);
+
+	ctx->report.refs.refs_nr = ref_array.nr;
+	for (size_t i = 0; i < ref_array.nr; i++) {
+		size_t size;
+		struct ref_array_item *item = ref_array.items[i];
+
+		switch (item->kind) {
+		case FILTER_REFS_TAGS:
+			ctx->report.refs.tags_nr++;
+			if (oid_object_info(ctx->repo,
+					    &item->objectname,
+					    &size) == OBJ_TAG)
+				ctx->report.refs.tags_annotated_nr++;
+			break;
+
+		case FILTER_REFS_BRANCHES:
+			ctx->report.refs.branches_nr++;
+			break;
+
+		case FILTER_REFS_REMOTES:
+			ctx->report.refs.remote_refs_nr++;
+			break;
+
+		case FILTER_REFS_OTHERS:
+			ctx->report.refs.others_nr++;
+			break;
+
+		default:
+			ctx->report.refs.unknown_nr++;
+			break;
+		}
+	}
+
+	trace2_region_leave("survey", "phase/refs", ctx->repo);
+
+	ref_array_clear(&ref_array);
 }
 
 int cmd_survey(int argc, const char **argv, const char *prefix)
 {
+	static struct survey_context ctx = {
+		.opts = {
+			.verbose = 0,
+			.show_progress = -1, /* defaults to isatty(2) */
+
+			.refs.want_all_refs = -1,
+
+			.refs.want_branches = -1, /* default these to undefined */
+			.refs.want_tags = -1,
+			.refs.want_remotes = -1,
+			.refs.want_detached = -1,
+			.refs.want_other = -1,
+		},
+		.refs = STRVEC_INIT,
+	};
+
+	static struct option survey_options[] = {
+		OPT__VERBOSE(&ctx.opts.verbose, N_("verbose output")),
+		OPT_BOOL(0, "progress", &ctx.opts.show_progress, N_("show progress")),
+
+		OPT_BOOL_F(0, "all-refs", &ctx.opts.refs.want_all_refs, N_("include all refs"),          PARSE_OPT_NONEG),
+
+		OPT_BOOL_F(0, "branches", &ctx.opts.refs.want_branches, N_("include branches"),          PARSE_OPT_NONEG),
+		OPT_BOOL_F(0, "tags",     &ctx.opts.refs.want_tags,     N_("include tags"),              PARSE_OPT_NONEG),
+		OPT_BOOL_F(0, "remotes",  &ctx.opts.refs.want_remotes,  N_("include all remotes refs"),  PARSE_OPT_NONEG),
+		OPT_BOOL_F(0, "detached", &ctx.opts.refs.want_detached, N_("include detached HEAD"),     PARSE_OPT_NONEG),
+		OPT_BOOL_F(0, "other",    &ctx.opts.refs.want_other,    N_("include notes and stashes"), PARSE_OPT_NONEG),
+
+		OPT_END(),
+	};
+
 	if (argc == 2 && !strcmp(argv[1], "-h"))
 		usage_with_options(survey_usage, survey_options);
 
-	prepare_repo_settings(the_repository);
-	survey_load_config();
+	ctx.repo = the_repository;
+	prepare_repo_settings(ctx.repo);
+	survey_load_config(&ctx);
 
 	argc = parse_options(argc, argv, prefix, survey_options, survey_usage, 0);
 
-	if (survey_opts.show_progress < 0)
-		survey_opts.show_progress = isatty(2);
-	fixup_refs_wanted();
+	if (ctx.opts.show_progress < 0)
+		ctx.opts.show_progress = isatty(2);
+	fixup_refs_wanted(&ctx);
+
+	survey_phase_refs(&ctx);
 
+	clear_survey_context(&ctx);
 	return 0;
 }
diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh
index 2df7fa83629301..5903c90cb570ac 100755
--- a/t/t8100-git-survey.sh
+++ b/t/t8100-git-survey.sh
@@ -15,4 +15,13 @@ test_expect_success 'git survey -h shows experimental warning' '
 	grep "EXPERIMENTAL!" usage
 '
 
+test_expect_success 'creat a semi-interesting repo' '
+	test_commit_bulk 10
+'
+
+test_expect_success 'git survey (default)' '
+	git survey >out 2>err &&
+	test_line_count = 0 err
+'
+
 test_done

From 44417cceddcaeec9e90acd0b058edd8c80627479 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Sun, 1 Sep 2024 15:58:32 -0400
Subject: [PATCH 12/30] survey: start pretty printing data in table form

When 'git survey' provides information to the user, this will be presented
in one of two formats: plaintext and JSON. The JSON implementation will be
delayed until the functionality is complete for the plaintext format.

The most important parts of the plaintext format are headers specifying the
different sections of the report and tables providing concreted data.

Create a custom table data structure that allows specifying a list of
strings for the row values. When printing the table, check each column for
the maximum width so we can create a table of the correct size from the
start.

The table structure is designed to be flexible to the different kinds of
output that will be implemented in future changes.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 builtin/survey.c      | 175 ++++++++++++++++++++++++++++++++++++++++++
 t/t8100-git-survey.sh |  17 +++-
 2 files changed, 191 insertions(+), 1 deletion(-)

diff --git a/builtin/survey.c b/builtin/survey.c
index 1b4fe591e59d49..b2104e84d6114e 100644
--- a/builtin/survey.c
+++ b/builtin/survey.c
@@ -5,6 +5,7 @@
 #include "parse-options.h"
 #include "progress.h"
 #include "ref-filter.h"
+#include "strbuf.h"
 #include "strvec.h"
 #include "trace2.h"
 
@@ -27,10 +28,16 @@ static struct survey_refs_wanted default_ref_options = {
 	.want_all_refs = 1,
 };
 
+enum survey_format {
+	SURVEY_PLAINTEXT = 0,
+	SURVEY_JSON = 1,
+};
+
 struct survey_opts {
 	int verbose;
 	int show_progress;
 	struct survey_refs_wanted refs;
+	enum survey_format format;
 };
 
 struct survey_report_ref_summary {
@@ -78,6 +85,161 @@ static void clear_survey_context(struct survey_context *ctx)
 	strvec_clear(&ctx->refs);
 }
 
+struct survey_table {
+	const char *table_name;
+	struct strvec header;
+	struct strvec *rows;
+	size_t rows_nr;
+	size_t rows_alloc;
+};
+
+#define SURVEY_TABLE_INIT {	\
+	.header = STRVEC_INIT,	\
+}
+
+static void clear_table(struct survey_table *table)
+{
+	strvec_clear(&table->header);
+	for (size_t i = 0; i < table->rows_nr; i++)
+		strvec_clear(&table->rows[i]);
+	free(table->rows);
+}
+
+static void insert_table_rowv(struct survey_table *table, ...)
+{
+	va_list ap;
+	char *arg;
+	ALLOC_GROW(table->rows, table->rows_nr + 1, table->rows_alloc);
+
+	memset(&table->rows[table->rows_nr], 0, sizeof(struct strvec));
+
+	va_start(ap, table);
+	while ((arg = va_arg(ap, char *)))
+		strvec_push(&table->rows[table->rows_nr], arg);
+	va_end(ap);
+
+	table->rows_nr++;
+}
+
+static void print_table_title(const char *name, size_t *widths, size_t nr)
+{
+	static struct strbuf lines = STRBUF_INIT;
+	size_t width = 0;
+	strbuf_setlen(&lines, 0);
+
+	strbuf_addch(&lines, ' ');
+	strbuf_addstr(&lines, name);
+	strbuf_addch(&lines, '\n');
+
+	for (size_t i = 0; i < nr; i++) {
+		if (i)
+			width += 3;
+		width += widths[i];
+	}
+	strbuf_addchars(&lines, '=', width);
+	printf("%s\n", lines.buf);
+}
+
+static void print_row_plaintext(struct strvec *row, size_t *widths)
+{
+	static struct strbuf line = STRBUF_INIT;
+	strbuf_setlen(&line, 0);
+
+	for (size_t i = 0; i < row->nr; i++) {
+		const char *str = row->v[i];
+		size_t len = strlen(str);
+		if (i)
+			strbuf_add(&line, " | ", 3);
+		strbuf_addchars(&line, ' ', widths[i] - len);
+		strbuf_add(&line, str, len);
+	}
+	printf("%s\n", line.buf);
+}
+
+static void print_divider_plaintext(size_t *widths, size_t nr)
+{
+	static struct strbuf line = STRBUF_INIT;
+	strbuf_setlen(&line, 0);
+
+	for (size_t i = 0; i < nr; i++) {
+		if (i)
+			strbuf_add(&line, "-+-", 3);
+		strbuf_addchars(&line, '-', widths[i]);
+	}
+	printf("%s\n", line.buf);
+}
+
+static void print_table_plaintext(struct survey_table *table)
+{
+	size_t *column_widths;
+	size_t columns_nr = table->header.nr;
+	CALLOC_ARRAY(column_widths, columns_nr);
+
+	for (size_t i = 0; i < columns_nr; i++) {
+		column_widths[i] = strlen(table->header.v[i]);
+
+		for (size_t j = 0; j < table->rows_nr; j++) {
+			size_t rowlen = strlen(table->rows[j].v[i]);
+			if (column_widths[i] < rowlen)
+				column_widths[i] = rowlen;
+		}
+	}
+
+	print_table_title(table->table_name, column_widths, columns_nr);
+	print_row_plaintext(&table->header, column_widths);
+	print_divider_plaintext(column_widths, columns_nr);
+
+	for (size_t j = 0; j < table->rows_nr; j++)
+		print_row_plaintext(&table->rows[j], column_widths);
+}
+
+static void survey_report_plaintext_refs(struct survey_context *ctx)
+{
+	struct survey_report_ref_summary *refs = &ctx->report.refs;
+	struct survey_table table = SURVEY_TABLE_INIT;
+
+	table.table_name = _("REFERENCES SUMMARY");
+
+	strvec_push(&table.header, _("Ref Type"));
+	strvec_push(&table.header, _("Count"));
+
+	if (ctx->opts.refs.want_all_refs || ctx->opts.refs.want_branches) {
+		char *fmt = xstrfmt("%"PRIuMAX"", refs->branches_nr);
+		insert_table_rowv(&table, _("Branches"), fmt, NULL);
+		free(fmt);
+	}
+
+	if (ctx->opts.refs.want_all_refs || ctx->opts.refs.want_remotes) {
+		char *fmt = xstrfmt("%"PRIuMAX"", refs->remote_refs_nr);
+		insert_table_rowv(&table, _("Remote refs"), fmt, NULL);
+		free(fmt);
+	}
+
+	if (ctx->opts.refs.want_all_refs || ctx->opts.refs.want_tags) {
+		char *fmt = xstrfmt("%"PRIuMAX"", refs->tags_nr);
+		insert_table_rowv(&table, _("Tags (all)"), fmt, NULL);
+		free(fmt);
+		fmt = xstrfmt("%"PRIuMAX"", refs->tags_annotated_nr);
+		insert_table_rowv(&table, _("Tags (annotated)"), fmt, NULL);
+		free(fmt);
+	}
+
+	print_table_plaintext(&table);
+	clear_table(&table);
+}
+
+static void survey_report_plaintext(struct survey_context *ctx)
+{
+	printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree);
+	printf("-----------------------------------------------------\n");
+	survey_report_plaintext_refs(ctx);
+}
+
+static void survey_report_json(struct survey_context *ctx)
+{
+	/* TODO. */
+}
+
 /*
  * After parsing the command line arguments, figure out which refs we
  * should scan.
@@ -312,6 +474,19 @@ int cmd_survey(int argc, const char **argv, const char *prefix)
 
 	survey_phase_refs(&ctx);
 
+	switch (ctx.opts.format) {
+	case SURVEY_PLAINTEXT:
+		survey_report_plaintext(&ctx);
+		break;
+
+	case SURVEY_JSON:
+		survey_report_json(&ctx);
+		break;
+
+	default:
+		BUG("Undefined format");
+	}
+
 	clear_survey_context(&ctx);
 	return 0;
 }
diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh
index 5903c90cb570ac..a57f6ca7a59b06 100755
--- a/t/t8100-git-survey.sh
+++ b/t/t8100-git-survey.sh
@@ -21,7 +21,22 @@ test_expect_success 'creat a semi-interesting repo' '
 
 test_expect_success 'git survey (default)' '
 	git survey >out 2>err &&
-	test_line_count = 0 err
+	test_line_count = 0 err &&
+
+	cat >expect <<-EOF &&
+	GIT SURVEY for "$(pwd)"
+	-----------------------------------------------------
+	 REFERENCES SUMMARY
+	========================
+	        Ref Type | Count
+	-----------------+------
+	        Branches |     1
+	     Remote refs |     0
+	      Tags (all) |     0
+	Tags (annotated) |     0
+	EOF
+
+	test_cmp expect out
 '
 
 test_done

From fcc281ac2bfabb6d19e6be40c41157612c5a3f83 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Sun, 1 Sep 2024 20:33:47 -0400
Subject: [PATCH 13/30] survey: add object count summary

At the moment, nothing is obvious about the reason for the use of the
path-walk API, but this will become more prevelant in future iterations. For
now, use the path-walk API to sum up the counts of each kind of object.

For example, this is the reachable object summary output for my local repo:

REACHABLE OBJECT SUMMARY
========================
Object Type |  Count
------------+-------
       Tags |      0
    Commits | 178573
      Trees | 312745
      Blobs | 183035

(Note: the "Tags" are zero right now because the path-walk API has not been
integrated to walk tags yet. This will be fixed in a later change.)

RFC TODO: make sure tags are walked before this change.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 builtin/survey.c      | 196 ++++++++++++++++++++++++++++++++++++++++--
 t/t8100-git-survey.sh |  26 ++++--
 2 files changed, 209 insertions(+), 13 deletions(-)

diff --git a/builtin/survey.c b/builtin/survey.c
index b2104e84d6114e..504b4edafce77f 100644
--- a/builtin/survey.c
+++ b/builtin/survey.c
@@ -1,12 +1,19 @@
 #include "builtin.h"
 #include "config.h"
+#include "environment.h"
+#include "hex.h"
 #include "object.h"
+#include "object-name.h"
 #include "object-store-ll.h"
 #include "parse-options.h"
+#include "path-walk.h"
 #include "progress.h"
 #include "ref-filter.h"
+#include "refs.h"
+#include "revision.h"
 #include "strbuf.h"
 #include "strvec.h"
+#include "tag.h"
 #include "trace2.h"
 
 static const char * const survey_usage[] = {
@@ -50,12 +57,20 @@ struct survey_report_ref_summary {
 	size_t unknown_nr;
 };
 
+struct survey_report_object_summary {
+	size_t commits_nr;
+	size_t tags_nr;
+	size_t trees_nr;
+	size_t blobs_nr;
+};
+
 /**
  * This struct contains all of the information that needs to be printed
  * at the end of the exploration of the repository and its references.
  */
 struct survey_report {
 	struct survey_report_ref_summary refs;
+	struct survey_report_object_summary reachable_objects;
 };
 
 struct survey_context {
@@ -78,10 +93,12 @@ struct survey_context {
 	size_t progress_total;
 
 	struct strvec refs;
+	struct ref_array ref_array;
 };
 
 static void clear_survey_context(struct survey_context *ctx)
 {
+	ref_array_clear(&ctx->ref_array);
 	strvec_clear(&ctx->refs);
 }
 
@@ -125,10 +142,12 @@ static void print_table_title(const char *name, size_t *widths, size_t nr)
 {
 	static struct strbuf lines = STRBUF_INIT;
 	size_t width = 0;
+	size_t min_width;
 	strbuf_setlen(&lines, 0);
 
-	strbuf_addch(&lines, ' ');
+	strbuf_addch(&lines, '\n');
 	strbuf_addstr(&lines, name);
+	min_width = lines.len - 1;
 	strbuf_addch(&lines, '\n');
 
 	for (size_t i = 0; i < nr; i++) {
@@ -136,6 +155,10 @@ static void print_table_title(const char *name, size_t *widths, size_t nr)
 			width += 3;
 		width += widths[i];
 	}
+
+	if (width < min_width)
+		width = min_width;
+
 	strbuf_addchars(&lines, '=', width);
 	printf("%s\n", lines.buf);
 }
@@ -228,11 +251,43 @@ static void survey_report_plaintext_refs(struct survey_context *ctx)
 	clear_table(&table);
 }
 
+static void survey_report_plaintext_reachable_object_summary(struct survey_context *ctx)
+{
+	struct survey_report_object_summary *objs = &ctx->report.reachable_objects;
+	struct survey_table table = SURVEY_TABLE_INIT;
+	char *fmt;
+
+	table.table_name = _("REACHABLE OBJECT SUMMARY");
+
+	strvec_push(&table.header, _("Object Type"));
+	strvec_push(&table.header, _("Count"));
+
+	fmt = xstrfmt("%"PRIuMAX"", objs->tags_nr);
+	insert_table_rowv(&table, _("Tags"), fmt, NULL);
+	free(fmt);
+
+	fmt = xstrfmt("%"PRIuMAX"", objs->commits_nr);
+	insert_table_rowv(&table, _("Commits"), fmt, NULL);
+	free(fmt);
+
+	fmt = xstrfmt("%"PRIuMAX"", objs->trees_nr);
+	insert_table_rowv(&table, _("Trees"), fmt, NULL);
+	free(fmt);
+
+	fmt = xstrfmt("%"PRIuMAX"", objs->blobs_nr);
+	insert_table_rowv(&table, _("Blobs"), fmt, NULL);
+	free(fmt);
+
+	print_table_plaintext(&table);
+	clear_table(&table);
+}
+
 static void survey_report_plaintext(struct survey_context *ctx)
 {
 	printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree);
 	printf("-----------------------------------------------------\n");
 	survey_report_plaintext_refs(ctx);
+	survey_report_plaintext_reachable_object_summary(ctx);
 }
 
 static void survey_report_json(struct survey_context *ctx)
@@ -384,15 +439,13 @@ static void do_load_refs(struct survey_context *ctx,
  */
 static void survey_phase_refs(struct survey_context *ctx)
 {
-	struct ref_array ref_array = { 0 };
-
 	trace2_region_enter("survey", "phase/refs", ctx->repo);
-	do_load_refs(ctx, &ref_array);
+	do_load_refs(ctx, &ctx->ref_array);
 
-	ctx->report.refs.refs_nr = ref_array.nr;
-	for (size_t i = 0; i < ref_array.nr; i++) {
+	ctx->report.refs.refs_nr = ctx->ref_array.nr;
+	for (size_t i = 0; i < ctx->ref_array.nr; i++) {
 		size_t size;
-		struct ref_array_item *item = ref_array.items[i];
+		struct ref_array_item *item = ctx->ref_array.items[i];
 
 		switch (item->kind) {
 		case FILTER_REFS_TAGS:
@@ -422,8 +475,133 @@ static void survey_phase_refs(struct survey_context *ctx)
 	}
 
 	trace2_region_leave("survey", "phase/refs", ctx->repo);
+}
+
+static void increment_object_counts(
+		struct survey_report_object_summary *summary,
+		enum object_type type,
+		size_t nr)
+{
+	switch (type) {
+	case OBJ_COMMIT:
+		summary->commits_nr += nr;
+		break;
+
+	case OBJ_TREE:
+		summary->trees_nr += nr;
+		break;
+
+	case OBJ_BLOB:
+		summary->blobs_nr += nr;
+		break;
+
+	default:
+		break;
+	}
+}
+
+static int survey_objects_path_walk_fn(const char *path,
+				       struct oid_array *oids,
+				       enum object_type type,
+				       void *data)
+{
+	struct survey_context *ctx = data;
+
+	increment_object_counts(&ctx->report.reachable_objects,
+				type, oids->nr);
+
+	return 0;
+}
+
+static int iterate_tag_chain(struct survey_context *ctx,
+			     struct object_id *oid,
+			     struct object_id *peeled)
+{
+	struct object *o = lookup_unknown_object(ctx->repo, oid);
+	struct tag *t;
+
+	if (o->type != OBJ_TAG) {
+		oidcpy(peeled, &o->oid);
+		return o->type != OBJ_COMMIT;
+	}
+
+	t = lookup_tag(ctx->repo, oid);
+	while (t) {
+		parse_tag(t);
+		ctx->report.reachable_objects.tags_nr++;
+
+		if (!t->tagged)
+			break;
+
+		o = lookup_unknown_object(ctx->repo, &t->tagged->oid);
+		if (o && o->type == OBJ_TAG)
+			t = lookup_tag(ctx->repo, &t->tagged->oid);
+		else
+			break;
+	}
+
+	if (!t || !t->tagged)
+		return -1;
 
-	ref_array_clear(&ref_array);
+	oidcpy(peeled, &t->tagged->oid);
+	o = lookup_unknown_object(ctx->repo, peeled);
+	if (o && o->type == OBJ_COMMIT)
+		return 0;
+	return -1;
+}
+
+static void survey_phase_objects(struct survey_context *ctx)
+{
+	struct rev_info revs = REV_INFO_INIT;
+	struct path_walk_info info = PATH_WALK_INFO_INIT;
+	unsigned int add_flags = 0;
+
+	trace2_region_enter("survey", "phase/objects", ctx->repo);
+
+	info.revs = &revs;
+	info.path_fn = survey_objects_path_walk_fn;
+	info.path_fn_data = ctx;
+
+	info.commits = 1;
+	info.trees = 1;
+	info.blobs = 1;
+	info.tags = 1;
+
+	repo_init_revisions(ctx->repo, &revs, "");
+
+	for (size_t i = 0; i < ctx->ref_array.nr; i++) {
+		struct ref_array_item *item = ctx->ref_array.items[i];
+		struct object_id peeled;
+
+		switch (item->kind) {
+		case FILTER_REFS_TAGS:
+			if (!iterate_tag_chain(ctx, &item->objectname, &peeled))
+				add_pending_oid(&revs, NULL, &peeled, add_flags);
+			break;
+		case FILTER_REFS_BRANCHES:
+			add_pending_oid(&revs, NULL, &item->objectname, add_flags);
+			break;
+		case FILTER_REFS_REMOTES:
+			add_pending_oid(&revs, NULL, &item->objectname, add_flags);
+			break;
+		case FILTER_REFS_OTHERS:
+			/*
+			 * This may be a note, stash, or custom namespace branch.
+			 */
+			add_pending_oid(&revs, NULL, &item->objectname, add_flags);
+			break;
+		case FILTER_REFS_DETACHED_HEAD:
+			add_pending_oid(&revs, NULL, &item->objectname, add_flags);
+			break;
+		default:
+			break;
+		}
+	}
+
+	walk_objects_by_path(&info);
+
+	release_revisions(&revs);
+	trace2_region_leave("survey", "phase/objects", ctx->repo);
 }
 
 int cmd_survey(int argc, const char **argv, const char *prefix)
@@ -474,6 +652,8 @@ int cmd_survey(int argc, const char **argv, const char *prefix)
 
 	survey_phase_refs(&ctx);
 
+	survey_phase_objects(&ctx);
+
 	switch (ctx.opts.format) {
 	case SURVEY_PLAINTEXT:
 		survey_report_plaintext(&ctx);
diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh
index a57f6ca7a59b06..0da92eafa955ea 100755
--- a/t/t8100-git-survey.sh
+++ b/t/t8100-git-survey.sh
@@ -16,24 +16,40 @@ test_expect_success 'git survey -h shows experimental warning' '
 '
 
 test_expect_success 'creat a semi-interesting repo' '
-	test_commit_bulk 10
+	test_commit_bulk 10 &&
+	git tag -a -m one one HEAD~5 &&
+	git tag -a -m two two HEAD~3 &&
+	git tag -a -m three three two &&
+	git tag -a -m four four three &&
+	git update-ref -d refs/tags/three &&
+	git update-ref -d refs/tags/two
 '
 
 test_expect_success 'git survey (default)' '
-	git survey >out 2>err &&
+	git survey --all-refs >out 2>err &&
 	test_line_count = 0 err &&
 
 	cat >expect <<-EOF &&
 	GIT SURVEY for "$(pwd)"
 	-----------------------------------------------------
-	 REFERENCES SUMMARY
+
+	REFERENCES SUMMARY
 	========================
 	        Ref Type | Count
 	-----------------+------
 	        Branches |     1
 	     Remote refs |     0
-	      Tags (all) |     0
-	Tags (annotated) |     0
+	      Tags (all) |     2
+	Tags (annotated) |     2
+
+	REACHABLE OBJECT SUMMARY
+	========================
+	Object Type | Count
+	------------+------
+	       Tags |     0
+	    Commits |    10
+	      Trees |    10
+	      Blobs |    10
 	EOF
 
 	test_cmp expect out

From 462ca0b80d29218647ffd26c2ae22c359917f00c Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Sun, 1 Sep 2024 20:58:35 -0400
Subject: [PATCH 14/30] survey: summarize total sizes by object type

Now that we have explored objects by count, we can expand that a bit more to
summarize the data for the on-disk and inflated size of those objects. This
information is helpful for diagnosing both why disk space (and perhaps
clone or fetch times) is growing but also why certain operations are slow
because the inflated size of the abstract objects that must be processed is
so large.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 builtin/survey.c      | 113 ++++++++++++++++++++++++++++++++++++++++++
 t/t8100-git-survey.sh |   8 +++
 2 files changed, 121 insertions(+)

diff --git a/builtin/survey.c b/builtin/survey.c
index 504b4edafce77f..435c4bd452a963 100644
--- a/builtin/survey.c
+++ b/builtin/survey.c
@@ -64,6 +64,19 @@ struct survey_report_object_summary {
 	size_t blobs_nr;
 };
 
+/**
+ * For some category given by 'label', count the number of objects
+ * that match that label along with the on-disk size and the size
+ * after decompressing (both with delta bases and zlib).
+ */
+struct survey_report_object_size_summary {
+	char *label;
+	size_t nr;
+	size_t disk_size;
+	size_t inflated_size;
+	size_t num_missing;
+};
+
 /**
  * This struct contains all of the information that needs to be printed
  * at the end of the exploration of the repository and its references.
@@ -71,8 +84,15 @@ struct survey_report_object_summary {
 struct survey_report {
 	struct survey_report_ref_summary refs;
 	struct survey_report_object_summary reachable_objects;
+
+	struct survey_report_object_size_summary *by_type;
 };
 
+#define REPORT_TYPE_COMMIT 0
+#define REPORT_TYPE_TREE 1
+#define REPORT_TYPE_BLOB 2
+#define REPORT_TYPE_COUNT 3
+
 struct survey_context {
 	/* Options that control what is done. */
 	struct survey_opts opts;
@@ -282,12 +302,41 @@ static void survey_report_plaintext_reachable_object_summary(struct survey_conte
 	clear_table(&table);
 }
 
+static void survey_report_object_sizes(const char *title,
+				       const char *categories,
+				       struct survey_report_object_size_summary *summary,
+				       size_t summary_nr)
+{
+	struct survey_table table = SURVEY_TABLE_INIT;
+	table.table_name = title;
+
+	strvec_push(&table.header, xstrdup(categories));
+	strvec_push(&table.header, xstrdup(_("Count")));
+	strvec_push(&table.header, xstrdup(_("Disk Size")));
+	strvec_push(&table.header, xstrdup(_("Inflated Size")));
+
+	for (size_t i = 0; i < summary_nr; i++) {
+		insert_table_rowv(&table, xstrdup(summary[i].label),
+				  xstrfmt("%"PRIuMAX, summary[i].nr),
+				  xstrfmt("%"PRIuMAX, summary[i].disk_size),
+				  xstrfmt("%"PRIuMAX, summary[i].inflated_size),
+				  NULL);
+	}
+
+	print_table_plaintext(&table);
+	clear_table(&table);
+}
+
 static void survey_report_plaintext(struct survey_context *ctx)
 {
 	printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree);
 	printf("-----------------------------------------------------\n");
 	survey_report_plaintext_refs(ctx);
 	survey_report_plaintext_reachable_object_summary(ctx);
+	survey_report_object_sizes(_("TOTAL OBJECT SIZES BY TYPE"),
+				   _("Object Type"),
+				   ctx->report.by_type,
+				   REPORT_TYPE_COUNT);
 }
 
 static void survey_report_json(struct survey_context *ctx)
@@ -500,6 +549,64 @@ static void increment_object_counts(
 	}
 }
 
+static void increment_totals(struct survey_context *ctx,
+			     struct oid_array *oids,
+			     struct survey_report_object_size_summary *summary)
+{
+	for (size_t i = 0; i < oids->nr; i++) {
+		struct object_info oi = OBJECT_INFO_INIT;
+		unsigned oi_flags = OBJECT_INFO_FOR_PREFETCH;
+		unsigned long object_length = 0;
+		off_t disk_sizep = 0;
+		enum object_type type;
+
+		oi.typep = &type;
+		oi.sizep = &object_length;
+		oi.disk_sizep = &disk_sizep;
+
+		if (oid_object_info_extended(ctx->repo, &oids->oid[i],
+					     &oi, oi_flags) < 0) {
+			summary->num_missing++;
+		} else {
+			summary->nr++;
+			summary->disk_size += disk_sizep;
+			summary->inflated_size += object_length;
+		}
+	}
+}
+
+static void increment_object_totals(struct survey_context *ctx,
+				    struct oid_array *oids,
+				    enum object_type type)
+{
+	struct survey_report_object_size_summary *total;
+	struct survey_report_object_size_summary summary = { 0 };
+
+	increment_totals(ctx, oids, &summary);
+
+	switch (type) {
+	case OBJ_COMMIT:
+		total = &ctx->report.by_type[REPORT_TYPE_COMMIT];
+		break;
+
+	case OBJ_TREE:
+		total = &ctx->report.by_type[REPORT_TYPE_TREE];
+		break;
+
+	case OBJ_BLOB:
+		total = &ctx->report.by_type[REPORT_TYPE_BLOB];
+		break;
+
+	default:
+		BUG("No other type allowed");
+	}
+
+	total->nr += summary.nr;
+	total->disk_size += summary.disk_size;
+	total->inflated_size += summary.inflated_size;
+	total->num_missing += summary.num_missing;
+}
+
 static int survey_objects_path_walk_fn(const char *path,
 				       struct oid_array *oids,
 				       enum object_type type,
@@ -509,6 +616,7 @@ static int survey_objects_path_walk_fn(const char *path,
 
 	increment_object_counts(&ctx->report.reachable_objects,
 				type, oids->nr);
+	increment_object_totals(ctx, oids, type);
 
 	return 0;
 }
@@ -567,6 +675,11 @@ static void survey_phase_objects(struct survey_context *ctx)
 	info.blobs = 1;
 	info.tags = 1;
 
+	CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT);
+	ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits"));
+	ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees"));
+	ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs"));
+
 	repo_init_revisions(ctx->repo, &revs, "");
 
 	for (size_t i = 0; i < ctx->ref_array.nr; i++) {
diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh
index 0da92eafa955ea..f8af96012149de 100755
--- a/t/t8100-git-survey.sh
+++ b/t/t8100-git-survey.sh
@@ -50,6 +50,14 @@ test_expect_success 'git survey (default)' '
 	    Commits |    10
 	      Trees |    10
 	      Blobs |    10
+
+	TOTAL OBJECT SIZES BY TYPE
+	===============================================
+	Object Type | Count | Disk Size | Inflated Size
+	------------+-------+-----------+--------------
+	    Commits |    10 |      1523 |          2153
+	      Trees |    10 |       495 |          1706
+	      Blobs |    10 |       191 |           101
 	EOF
 
 	test_cmp expect out

From 9c54c14435742927a66487df2862204aca8e6fc7 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Sun, 1 Sep 2024 21:21:54 -0400
Subject: [PATCH 15/30] survey: show progress during object walk

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 builtin/survey.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/builtin/survey.c b/builtin/survey.c
index 435c4bd452a963..baaaf8a63748e3 100644
--- a/builtin/survey.c
+++ b/builtin/survey.c
@@ -618,6 +618,9 @@ static int survey_objects_path_walk_fn(const char *path,
 				type, oids->nr);
 	increment_object_totals(ctx, oids, type);
 
+	ctx->progress_nr += oids->nr;
+	display_progress(ctx->progress, ctx->progress_nr);
+
 	return 0;
 }
 
@@ -682,6 +685,11 @@ static void survey_phase_objects(struct survey_context *ctx)
 
 	repo_init_revisions(ctx->repo, &revs, "");
 
+	ctx->progress_nr = 0;
+	ctx->progress_total = ctx->ref_array.nr;
+	if (ctx->opts.show_progress)
+		ctx->progress = start_progress(_("Preparing object walk"),
+					       ctx->progress_total);
 	for (size_t i = 0; i < ctx->ref_array.nr; i++) {
 		struct ref_array_item *item = ctx->ref_array.items[i];
 		struct object_id peeled;
@@ -709,9 +717,17 @@ static void survey_phase_objects(struct survey_context *ctx)
 		default:
 			break;
 		}
+
+		display_progress(ctx->progress, ++(ctx->progress_nr));
 	}
+	stop_progress(&ctx->progress);
 
+	ctx->progress_nr = 0;
+	ctx->progress_total = 0;
+	if (ctx->opts.show_progress)
+		ctx->progress = start_progress(_("Walking objects"), 0);
 	walk_objects_by_path(&info);
+	stop_progress(&ctx->progress);
 
 	release_revisions(&revs);
 	trace2_region_leave("survey", "phase/objects", ctx->repo);

From 3504abb269b5229d4aaff4db9f4d694d925ac1b2 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Sun, 1 Sep 2024 22:35:06 -0400
Subject: [PATCH 16/30] survey: add ability to track prioritized lists

In future changes, we will make use of these methods. The intention is to
keep track of the top contributors according to some metric. We don't want
to store all of the entries and do a sort at the end, so track a
constant-size table and remove rows that get pushed out depending on the
chosen sorting algorithm.

Co-authored-by: Jeff Hostetler <git@jeffhostetler.com>
Signed-off-by; Jeff Hostetler <git@jeffhostetler.com>
Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 builtin/survey.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)

diff --git a/builtin/survey.c b/builtin/survey.c
index baaaf8a63748e3..ad467e9a88ccff 100644
--- a/builtin/survey.c
+++ b/builtin/survey.c
@@ -77,6 +77,102 @@ struct survey_report_object_size_summary {
 	size_t num_missing;
 };
 
+typedef int (*survey_top_size_cmp)(struct survey_report_object_size_summary *s1,
+				   struct survey_report_object_size_summary *s2);
+
+MAYBE_UNUSED
+static int cmp_by_nr(struct survey_report_object_size_summary *s1,
+		     struct survey_report_object_size_summary *s2)
+{
+	if (s1->nr < s2->nr)
+		return -1;
+	if (s1->nr > s2->nr)
+		return 1;
+	return 0;
+}
+
+MAYBE_UNUSED
+static int cmp_by_disk_size(struct survey_report_object_size_summary *s1,
+			    struct survey_report_object_size_summary *s2)
+{
+	if (s1->disk_size < s2->disk_size)
+		return -1;
+	if (s1->disk_size > s2->disk_size)
+		return 1;
+	return 0;
+}
+
+MAYBE_UNUSED
+static int cmp_by_inflated_size(struct survey_report_object_size_summary *s1,
+				struct survey_report_object_size_summary *s2)
+{
+	if (s1->inflated_size < s2->inflated_size)
+		return -1;
+	if (s1->inflated_size > s2->inflated_size)
+		return 1;
+	return 0;
+}
+
+/**
+ * Store a list of "top" categories by some sorting function. When
+ * inserting a new category, reorder the list and free the one that
+ * got ejected (if any).
+ */
+struct survey_report_top_sizes {
+	const char *name;
+	survey_top_size_cmp cmp_fn;
+	struct survey_report_object_size_summary *data;
+	size_t nr;
+	size_t alloc;
+};
+
+MAYBE_UNUSED
+static void init_top_sizes(struct survey_report_top_sizes *top,
+			   size_t limit, const char *name,
+			   survey_top_size_cmp cmp)
+{
+	top->name = name;
+	top->alloc = limit;
+	top->nr = 0;
+	CALLOC_ARRAY(top->data, limit);
+	top->cmp_fn = cmp;
+}
+
+MAYBE_UNUSED
+static void clear_top_sizes(struct survey_report_top_sizes *top)
+{
+	for (size_t i = 0; i < top->nr; i++)
+		free(top->data[i].label);
+	free(top->data);
+}
+
+MAYBE_UNUSED
+static void maybe_insert_into_top_size(struct survey_report_top_sizes *top,
+				       struct survey_report_object_size_summary *summary)
+{
+	size_t pos = top->nr;
+
+	/* Compare against list from the bottom. */
+	while (pos > 0 && top->cmp_fn(&top->data[pos - 1], summary) < 0)
+		pos--;
+
+	/* Not big enough! */
+	if (pos >= top->alloc)
+		return;
+
+	/* We need to shift the data. */
+	if (top->nr == top->alloc)
+		free(top->data[top->nr - 1].label);
+	else
+		top->nr++;
+
+	for (size_t i = top->nr - 1; i > pos; i--)
+		memcpy(&top->data[i], &top->data[i - 1], sizeof(*top->data));
+
+	memcpy(&top->data[pos], summary, sizeof(*summary));
+	top->data[pos].label = xstrdup(summary->label);
+}
+
 /**
  * This struct contains all of the information that needs to be printed
  * at the end of the exploration of the repository and its references.

From 9e95914d393ff83054ee419b58b9db4d3560a36c Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Sun, 1 Sep 2024 22:35:40 -0400
Subject: [PATCH 17/30] survey: add report of "largest" paths

Since we are already walking our reachable objects using the path-walk API,
let's now collect lists of the paths that contribute most to different
metrics. Specifically, we care about

 * Number of versions.
 * Total size on disk.
 * Total inflated size (no delta or zlib compression).

This information can be critical to discovering which parts of the
repository are causing the most growth, especially on-disk size. Different
packing strategies might help compress data more efficiently, but the toal
inflated size is a representation of the raw size of all snapshots of those
paths. Even when stored efficiently on disk, that size represents how much
information must be processed to complete a command such as 'git blame'.

Since the on-disk size is likely to be fragile, stop testing the exact
output of 'git survey' and check that the correct set of headers is
output.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 builtin/survey.c      | 90 +++++++++++++++++++++++++++++++++++++------
 t/t8100-git-survey.sh | 12 +++++-
 2 files changed, 90 insertions(+), 12 deletions(-)

diff --git a/builtin/survey.c b/builtin/survey.c
index ad467e9a88ccff..90b041967c887e 100644
--- a/builtin/survey.c
+++ b/builtin/survey.c
@@ -80,7 +80,6 @@ struct survey_report_object_size_summary {
 typedef int (*survey_top_size_cmp)(struct survey_report_object_size_summary *s1,
 				   struct survey_report_object_size_summary *s2);
 
-MAYBE_UNUSED
 static int cmp_by_nr(struct survey_report_object_size_summary *s1,
 		     struct survey_report_object_size_summary *s2)
 {
@@ -91,7 +90,6 @@ static int cmp_by_nr(struct survey_report_object_size_summary *s1,
 	return 0;
 }
 
-MAYBE_UNUSED
 static int cmp_by_disk_size(struct survey_report_object_size_summary *s1,
 			    struct survey_report_object_size_summary *s2)
 {
@@ -102,7 +100,6 @@ static int cmp_by_disk_size(struct survey_report_object_size_summary *s1,
 	return 0;
 }
 
-MAYBE_UNUSED
 static int cmp_by_inflated_size(struct survey_report_object_size_summary *s1,
 				struct survey_report_object_size_summary *s2)
 {
@@ -126,7 +123,6 @@ struct survey_report_top_sizes {
 	size_t alloc;
 };
 
-MAYBE_UNUSED
 static void init_top_sizes(struct survey_report_top_sizes *top,
 			   size_t limit, const char *name,
 			   survey_top_size_cmp cmp)
@@ -146,7 +142,6 @@ static void clear_top_sizes(struct survey_report_top_sizes *top)
 	free(top->data);
 }
 
-MAYBE_UNUSED
 static void maybe_insert_into_top_size(struct survey_report_top_sizes *top,
 				       struct survey_report_object_size_summary *summary)
 {
@@ -182,6 +177,10 @@ struct survey_report {
 	struct survey_report_object_summary reachable_objects;
 
 	struct survey_report_object_size_summary *by_type;
+
+	struct survey_report_top_sizes *top_paths_by_count;
+	struct survey_report_top_sizes *top_paths_by_disk;
+	struct survey_report_top_sizes *top_paths_by_inflate;
 };
 
 #define REPORT_TYPE_COMMIT 0
@@ -423,6 +422,13 @@ static void survey_report_object_sizes(const char *title,
 	clear_table(&table);
 }
 
+static void survey_report_plaintext_sorted_size(
+		struct survey_report_top_sizes *top)
+{
+	survey_report_object_sizes(top->name,  _("Path"),
+				   top->data, top->nr);
+}
+
 static void survey_report_plaintext(struct survey_context *ctx)
 {
 	printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree);
@@ -433,6 +439,21 @@ static void survey_report_plaintext(struct survey_context *ctx)
 				   _("Object Type"),
 				   ctx->report.by_type,
 				   REPORT_TYPE_COUNT);
+
+	survey_report_plaintext_sorted_size(
+		&ctx->report.top_paths_by_count[REPORT_TYPE_TREE]);
+	survey_report_plaintext_sorted_size(
+		&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB]);
+
+	survey_report_plaintext_sorted_size(
+		&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE]);
+	survey_report_plaintext_sorted_size(
+		&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB]);
+
+	survey_report_plaintext_sorted_size(
+		&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE]);
+	survey_report_plaintext_sorted_size(
+		&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB]);
 }
 
 static void survey_report_json(struct survey_context *ctx)
@@ -673,7 +694,8 @@ static void increment_totals(struct survey_context *ctx,
 
 static void increment_object_totals(struct survey_context *ctx,
 				    struct oid_array *oids,
-				    enum object_type type)
+				    enum object_type type,
+				    const char *path)
 {
 	struct survey_report_object_size_summary *total;
 	struct survey_report_object_size_summary summary = { 0 };
@@ -701,6 +723,27 @@ static void increment_object_totals(struct survey_context *ctx,
 	total->disk_size += summary.disk_size;
 	total->inflated_size += summary.inflated_size;
 	total->num_missing += summary.num_missing;
+
+	if (type == OBJ_TREE || type == OBJ_BLOB) {
+		int index = type == OBJ_TREE ?
+			    REPORT_TYPE_TREE : REPORT_TYPE_BLOB;
+		struct survey_report_top_sizes *top;
+
+		/*
+		 * Temporarily store (const char *) here, but it will
+		 * be duped if inserted and will not be freed.
+		 */
+		summary.label = (char *)path;
+
+		top = ctx->report.top_paths_by_count;
+		maybe_insert_into_top_size(&top[index], &summary);
+
+		top = ctx->report.top_paths_by_disk;
+		maybe_insert_into_top_size(&top[index], &summary);
+
+		top = ctx->report.top_paths_by_inflate;
+		maybe_insert_into_top_size(&top[index], &summary);
+	}
 }
 
 static int survey_objects_path_walk_fn(const char *path,
@@ -712,7 +755,7 @@ static int survey_objects_path_walk_fn(const char *path,
 
 	increment_object_counts(&ctx->report.reachable_objects,
 				type, oids->nr);
-	increment_object_totals(ctx, oids, type);
+	increment_object_totals(ctx, oids, type, path);
 
 	ctx->progress_nr += oids->nr;
 	display_progress(ctx->progress, ctx->progress_nr);
@@ -757,6 +800,34 @@ static int iterate_tag_chain(struct survey_context *ctx,
 	return -1;
 }
 
+static void initialize_report(struct survey_context *ctx)
+{
+	const int top_limit = 100;
+
+	CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT);
+	ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits"));
+	ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees"));
+	ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs"));
+
+	CALLOC_ARRAY(ctx->report.top_paths_by_count, REPORT_TYPE_COUNT);
+	init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_TREE],
+		       top_limit, _("TOP DIRECTORIES BY COUNT"), cmp_by_nr);
+	init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB],
+		       top_limit, _("TOP FILES BY COUNT"), cmp_by_nr);
+
+	CALLOC_ARRAY(ctx->report.top_paths_by_disk, REPORT_TYPE_COUNT);
+	init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE],
+		       top_limit, _("TOP DIRECTORIES BY DISK SIZE"), cmp_by_disk_size);
+	init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB],
+		       top_limit, _("TOP FILES BY DISK SIZE"), cmp_by_disk_size);
+
+	CALLOC_ARRAY(ctx->report.top_paths_by_inflate, REPORT_TYPE_COUNT);
+	init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE],
+		       top_limit, _("TOP DIRECTORIES BY INFLATED SIZE"), cmp_by_inflated_size);
+	init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB],
+		       top_limit, _("TOP FILES BY INFLATED SIZE"), cmp_by_inflated_size);
+}
+
 static void survey_phase_objects(struct survey_context *ctx)
 {
 	struct rev_info revs = REV_INFO_INIT;
@@ -774,10 +845,7 @@ static void survey_phase_objects(struct survey_context *ctx)
 	info.blobs = 1;
 	info.tags = 1;
 
-	CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT);
-	ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits"));
-	ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees"));
-	ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs"));
+	initialize_report(ctx);
 
 	repo_init_revisions(ctx->repo, &revs, "");
 
diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh
index f8af96012149de..c2dab0033f9379 100755
--- a/t/t8100-git-survey.sh
+++ b/t/t8100-git-survey.sh
@@ -60,7 +60,17 @@ test_expect_success 'git survey (default)' '
 	      Blobs |    10 |       191 |           101
 	EOF
 
-	test_cmp expect out
+	lines=$(wc -l <expect) &&
+	head -n $lines out >out-trimmed &&
+	test_cmp expect out-trimmed &&
+
+	for type in "DIRECTORIES" "FILES"
+	do
+		for metric in "COUNT" "DISK SIZE" "INFLATED SIZE"
+		do
+			grep "TOP $type BY $metric" out || return 1
+		done || return 1
+	done
 '
 
 test_done

From 98a854c4b542309269f56ba0ae8b9a7c1504e409 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Fri, 6 Sep 2024 14:16:13 -0400
Subject: [PATCH 18/30] revision: create mark_trees_uninteresting_dense()

The sparse tree walk algorithm was created in d5d2e93577e (revision:
implement sparse algorithm, 2019-01-16) and involves using the
mark_trees_uninteresting_sparse() method. This method takes a repository
and an oidset of tree IDs, some of which have the UNINTERESTING flag and
some of which do not.

Create a method that has an equivalent set of preconditions but uses a
"dense" walk (recursively visits all reachable trees, as long as they
have not previously been marked UNINTERESTING). This is an important
difference from mark_tree_uninteresting(), which short-circuits if the
given tree has the UNINTERESTING flag.

A use of this method will be added in a later change, with a condition
set whether the sparse or dense approach should be used.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 revision.c | 15 +++++++++++++++
 revision.h |  1 +
 2 files changed, 16 insertions(+)

diff --git a/revision.c b/revision.c
index ac94f8d4292ff3..21c8b6d1bc0486 100644
--- a/revision.c
+++ b/revision.c
@@ -219,6 +219,21 @@ static void add_children_by_path(struct repository *r,
 	free_tree_buffer(tree);
 }
 
+void mark_trees_uninteresting_dense(struct repository *r,
+				    struct oidset *trees)
+{
+	struct object_id *oid;
+	struct oidset_iter iter;
+
+	oidset_iter_init(trees, &iter);
+	while ((oid = oidset_iter_next(&iter))) {
+		struct tree *tree = lookup_tree(r, oid);
+
+		if (tree->object.flags & UNINTERESTING)
+			mark_tree_contents_uninteresting(r, tree);
+	}
+}
+
 void mark_trees_uninteresting_sparse(struct repository *r,
 				     struct oidset *trees)
 {
diff --git a/revision.h b/revision.h
index 0e470d1df19f69..6c3df8e42bfa6d 100644
--- a/revision.h
+++ b/revision.h
@@ -487,6 +487,7 @@ void put_revision_mark(const struct rev_info *revs,
 
 void mark_parents_uninteresting(struct rev_info *revs, struct commit *commit);
 void mark_tree_uninteresting(struct repository *r, struct tree *tree);
+void mark_trees_uninteresting_dense(struct repository *r, struct oidset *trees);
 void mark_trees_uninteresting_sparse(struct repository *r, struct oidset *trees);
 
 void show_object_with_name(FILE *, struct object *, const char *);

From 78168d98bfc0df7151eac5280e12b95c9fb694ec Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Tue, 3 Sep 2024 21:55:47 -0400
Subject: [PATCH 19/30] path-walk: add prune_all_uninteresting option

This option causes the path-walk API to act like the sparse tree-walk
algorithm implemented by mark_trees_uninteresting_sparse() in
list-objects.c.

Starting from the commits marked as UNINTERESTING, their root trees and
all objects reachable from those trees are UNINTERSTING, at least as we
walk path-by-path. When we reach a path where all objects associated
with that path are marked UNINTERESTING, then do no continue walking the
children of that path.

We need to be careful to pass the UNINTERESTING flag in a deep way on
the UNINTERESTING objects before we start the path-walk, or else the
depth-first search for the path-walk API may accidentally report some
objects as interesting.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 path-walk.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 path-walk.h |  8 +++++++
 2 files changed, 73 insertions(+), 3 deletions(-)

diff --git a/path-walk.c b/path-walk.c
index 65f9856afa2f66..08de29614f7299 100644
--- a/path-walk.c
+++ b/path-walk.c
@@ -23,6 +23,7 @@ struct type_and_oid_list
 {
 	enum object_type type;
 	struct oid_array oids;
+	int maybe_interesting;
 };
 
 #define TYPE_AND_OID_LIST_INIT { \
@@ -139,6 +140,9 @@ static int add_children(struct path_walk_context *ctx,
 			list->type = type;
 			strmap_put(&ctx->paths_to_lists, path.buf, list);
 			string_list_append(&ctx->path_stack, path.buf);
+
+			if (!(o->flags & UNINTERESTING))
+				list->maybe_interesting = 1;
 		}
 		oid_array_append(&list->oids, &entry.oid);
 	}
@@ -161,6 +165,40 @@ static int walk_path(struct path_walk_context *ctx,
 
 	list = strmap_get(&ctx->paths_to_lists, path);
 
+	if (ctx->info->prune_all_uninteresting) {
+		/*
+		 * This is true if all objects were UNINTERESTING
+		 * when added to the list.
+		 */
+		if (!list->maybe_interesting)
+			return 0;
+
+		/*
+		 * But it's still possible that the objects were set
+		 * as UNINTERESTING after being added. Do a quick check.
+		 */
+		list->maybe_interesting = 0;
+		for (size_t i = 0;
+		     !list->maybe_interesting && i < list->oids.nr;
+		     i++) {
+			if (list->type == OBJ_TREE) {
+				struct tree *t = lookup_tree(ctx->repo,
+							     &list->oids.oid[i]);
+				if (t && !(t->object.flags & UNINTERESTING))
+					list->maybe_interesting = 1;
+			} else {
+				struct blob *b = lookup_blob(ctx->repo,
+							     &list->oids.oid[i]);
+				if (b && !(b->object.flags & UNINTERESTING))
+					list->maybe_interesting = 1;
+			}
+		}
+
+		/* We have confirmed that all objects are UNINTERESTING. */
+		if (!list->maybe_interesting)
+			return 0;
+	}
+
 	/* Evaluate function pointer on this data, if requested. */
 	if ((list->type == OBJ_TREE && ctx->info->trees) ||
 	    (list->type == OBJ_BLOB && ctx->info->blobs))
@@ -203,7 +241,7 @@ static void clear_strmap(struct strmap *map)
 int walk_objects_by_path(struct path_walk_info *info)
 {
 	const char *root_path = "";
-	int ret = 0;
+	int ret = 0, has_uninteresting = 0;
 	size_t commits_nr = 0, paths_nr = 0;
 	struct commit *c;
 	struct type_and_oid_list *root_tree_list;
@@ -215,6 +253,7 @@ int walk_objects_by_path(struct path_walk_info *info)
 		.path_stack = STRING_LIST_INIT_DUP,
 		.paths_to_lists = STRMAP_INIT
 	};
+	struct oidset root_tree_set = OIDSET_INIT;
 
 	struct oid_array tagged_tree_list = OID_ARRAY_INIT;
 	struct oid_array tagged_blob_list = OID_ARRAY_INIT;
@@ -227,7 +266,9 @@ int walk_objects_by_path(struct path_walk_info *info)
 	/* Insert a single list for the root tree into the paths. */
 	CALLOC_ARRAY(root_tree_list, 1);
 	root_tree_list->type = OBJ_TREE;
+	root_tree_list->maybe_interesting = 1;
 	strmap_put(&ctx.paths_to_lists, root_path, root_tree_list);
+
 	if (prepare_revision_walk(info->revs))
 		die(_("failed to setup revision walk"));
 
@@ -247,11 +288,17 @@ int walk_objects_by_path(struct path_walk_info *info)
 		oid = get_commit_tree_oid(c);
 		t = lookup_tree(info->revs->repo, oid);
 
-		if (t)
+		if (t) {
+			oidset_insert(&root_tree_set, oid);
 			oid_array_append(&root_tree_list->oids, oid);
-		else
+		} else {
 			warning("could not find tree %s", oid_to_hex(oid));
+		}
 
+		if (t && (c->object.flags & UNINTERESTING)) {
+			t->object.flags |= UNINTERESTING;
+			has_uninteresting = 1;
+		}
 	}
 
 	trace2_data_intmax("path-walk", ctx.repo, "commits", commits_nr);
@@ -318,6 +365,21 @@ int walk_objects_by_path(struct path_walk_info *info)
 		oid_array_clear(&tagged_blob_list);
 	}
 
+	/*
+	 * Before performing a DFS of our paths and emitting them as interesting,
+	 * do a full walk of the trees to distribute the UNINTERESTING bit. Use
+	 * the sparse algorithm if prune_all_uninteresting was set.
+	 */
+	if (has_uninteresting) {
+		trace2_region_enter("path-walk", "uninteresting-walk", info->revs->repo);
+		if (info->prune_all_uninteresting)
+			mark_trees_uninteresting_sparse(ctx.repo, &root_tree_set);
+		else
+			mark_trees_uninteresting_dense(ctx.repo, &root_tree_set);
+		trace2_region_leave("path-walk", "uninteresting-walk", info->revs->repo);
+	}
+	oidset_clear(&root_tree_set);
+
 	string_list_append(&ctx.path_stack, root_path);
 
 	trace2_region_enter("path-walk", "path-walk", info->revs->repo);
diff --git a/path-walk.h b/path-walk.h
index 637d3b0cabb69a..7c02bca71561aa 100644
--- a/path-walk.h
+++ b/path-walk.h
@@ -50,6 +50,14 @@ struct path_walk_info {
 	 * the sparse-checkout patterns.
 	 */
 	struct pattern_list *pl;
+
+	/**
+	 * When 'prune_all_uninteresting' is set and a path has all objects
+	 * marked as UNINTERESTING, then the path-walk will not visit those
+	 * objects. It will not call path_fn on those objects and will not
+	 * walk the children of such trees.
+	 */
+	int prune_all_uninteresting;
 };
 
 #define PATH_WALK_INFO_INIT {   \

From 3455af21e1bea375f38d21cc3b1d718ca6e563e4 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Thu, 5 Sep 2024 10:04:51 -0400
Subject: [PATCH 20/30] pack-objects: add --path-walk option

In order to more easily compute delta bases among objects that appear at the
exact same path, add a --path-walk option to 'git pack-objects'.

This option will use the path-walk API instead of the object walk given by
the revision machinery. Since objects will be provided in batches
representing a common path, those objects can be tested for delta bases
immediately instead of waiting for a sort of the full object list by
name-hash. This has multiple benefits, including avoiding collisions by
name-hash.

The objects marked as UNINTERESTING are included in these batches, so we
are guaranteeing some locality to find good delta bases.

After the individual passes are done on a per-path basis, the default
name-hash is used to find other opportunistic delta bases that did not
match exactly by the full path name.

RFC TODO: It is important to note that this option is inherently
incompatible with using a bitmap index. This walk probably also does not
work with other advanced features, such as delta islands.

Getting ahead of myself, this option compares well with --full-name-hash
when the packfile is large enough, but also performs at least as well as
the default in all cases that I've seen.

RFC TODO: this should probably be recording the batch locations to another
list so they could be processed in a second phase using threads.

RFC TODO: list some examples of how this outperforms previous pack-objects
strategies. (This is coming in later commits that include performance
test changes.)

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 builtin/pack-objects.c | 209 ++++++++++++++++++++++++++++++++++-------
 path-walk.c            |   2 +-
 2 files changed, 177 insertions(+), 34 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 778be80f5646d1..3d0bb33427d7cf 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -39,6 +39,9 @@
 #include "promisor-remote.h"
 #include "pack-mtimes.h"
 #include "parse-options.h"
+#include "blob.h"
+#include "tree.h"
+#include "path-walk.h"
 
 /*
  * Objects we are going to pack are collected in the `to_pack` structure.
@@ -215,6 +218,7 @@ static int delta_search_threads;
 static int pack_to_stdout;
 static int sparse;
 static int thin;
+static int path_walk;
 static int num_preferred_base;
 static struct progress *progress_state;
 
@@ -3139,6 +3143,38 @@ static int add_ref_tag(const char *tag UNUSED, const char *referent UNUSED, cons
 	return 0;
 }
 
+static int should_attempt_deltas(struct object_entry *entry)
+{
+	if (DELTA(entry))
+		/* This happens if we decided to reuse existing
+		 * delta from a pack.  "reuse_delta &&" is implied.
+		 */
+		return 0;
+
+	if (!entry->type_valid ||
+		oe_size_less_than(&to_pack, entry, 50))
+		return 0;
+
+	if (entry->no_try_delta)
+		return 0;
+
+	if (!entry->preferred_base) {
+		if (oe_type(entry) < 0)
+			die(_("unable to get type of object %s"),
+				oid_to_hex(&entry->idx.oid));
+	} else {
+		if (oe_type(entry) < 0) {
+			/*
+			 * This object is not found, but we
+			 * don't have to include it anyway.
+			 */
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
 static void prepare_pack(int window, int depth)
 {
 	struct object_entry **delta_list;
@@ -3169,33 +3205,11 @@ static void prepare_pack(int window, int depth)
 	for (i = 0; i < to_pack.nr_objects; i++) {
 		struct object_entry *entry = to_pack.objects + i;
 
-		if (DELTA(entry))
-			/* This happens if we decided to reuse existing
-			 * delta from a pack.  "reuse_delta &&" is implied.
-			 */
+		if (!should_attempt_deltas(entry))
 			continue;
 
-		if (!entry->type_valid ||
-		    oe_size_less_than(&to_pack, entry, 50))
-			continue;
-
-		if (entry->no_try_delta)
-			continue;
-
-		if (!entry->preferred_base) {
+		if (!entry->preferred_base)
 			nr_deltas++;
-			if (oe_type(entry) < 0)
-				die(_("unable to get type of object %s"),
-				    oid_to_hex(&entry->idx.oid));
-		} else {
-			if (oe_type(entry) < 0) {
-				/*
-				 * This object is not found, but we
-				 * don't have to include it anyway.
-				 */
-				continue;
-			}
-		}
 
 		delta_list[n++] = entry;
 	}
@@ -4110,6 +4124,117 @@ static void mark_bitmap_preferred_tips(void)
 	}
 }
 
+static inline int is_oid_interesting(struct repository *repo,
+				     struct object_id *oid,
+				     enum object_type type)
+{
+	if (type == OBJ_TAG) {
+		struct tag *t = lookup_tag(repo, oid);
+		return t && !(t->object.flags & UNINTERESTING);
+	}
+
+	if (type == OBJ_COMMIT) {
+		struct commit *c = lookup_commit(repo, oid);
+		return c && !(c->object.flags & UNINTERESTING);
+	}
+
+	if (type == OBJ_TREE) {
+		struct tree *t = lookup_tree(repo, oid);
+		return t && !(t->object.flags & UNINTERESTING);
+	}
+
+	if (type == OBJ_BLOB) {
+		struct blob *b = lookup_blob(repo, oid);
+		return b && !(b->object.flags & UNINTERESTING);
+	}
+
+	return 0;
+}
+
+static int add_objects_by_path(const char *path,
+			       struct oid_array *oids,
+			       enum object_type type,
+			       void *data)
+{
+	struct object_entry **delta_list;
+	size_t oe_start = to_pack.nr_objects;
+	size_t oe_end;
+	unsigned int sub_list_size;
+	unsigned int *processed = data;
+
+	/*
+	 * First, add all objects to the packing data, including the ones
+	 * marked UNINTERESTING (translated to 'exclude') as they can be
+	 * used as delta bases.
+	 */
+	for (size_t i = 0; i < oids->nr; i++) {
+		struct object_id *oid = &oids->oid[i];
+		int exclude = !is_oid_interesting(the_repository, oid, type);
+		add_object_entry(oid, type, path, exclude);
+	}
+
+	oe_end = to_pack.nr_objects;
+
+	/* We can skip delta calculations if it is a no-op. */
+	if (oe_end == oe_start || !window)
+		return 0;
+
+	sub_list_size = 0;
+	ALLOC_ARRAY(delta_list, oe_end - oe_start);
+
+	for (size_t i = 0; i < oe_end - oe_start; i++) {
+		struct object_entry *entry = to_pack.objects + oe_start + i;
+
+		if (!should_attempt_deltas(entry))
+			continue;
+
+		delta_list[sub_list_size++] = entry;
+	}
+
+	/*
+	 * Find delta bases among this list of objects that all match the same
+	 * path. This causes the delta compression to be interleaved in the
+	 * object walk, which can lead to confusing progress indicators. This is
+	 * also incompatible with threaded delta calculations. In the future,
+	 * consider creating a list of regions in the full to_pack.objects array
+	 * that could be picked up by the threaded delta computation.
+	 */
+	if (sub_list_size && window) {
+		QSORT(delta_list, sub_list_size, type_size_sort);
+		find_deltas(delta_list, &sub_list_size, window, depth, processed);
+	}
+
+	free(delta_list);
+	return 0;
+}
+
+static void get_object_list_path_walk(struct rev_info *revs)
+{
+	struct path_walk_info info = PATH_WALK_INFO_INIT;
+	unsigned int processed = 0;
+
+	info.revs = revs;
+
+	info.revs->tag_objects = 1;
+	info.tags = 1;
+	info.commits = 1;
+	info.trees = 1;
+	info.blobs = 1;
+	info.path_fn = add_objects_by_path;
+	info.path_fn_data = &processed;
+
+	/*
+	 * Allow the --[no-]sparse option to be interesting here, if only
+	 * for testing purposes. Paths with no interesting objects will not
+	 * contribute to the resulting pack, but only create noisy preferred
+	 * base objects.
+	 */
+	info.prune_all_uninteresting = sparse;
+
+	if (walk_objects_by_path(&info))
+		die(_("failed to pack objects via path-walk"));
+}
+
 static void get_object_list(struct rev_info *revs, int ac, const char **av)
 {
 	struct setup_revision_opt s_r_opt = {
@@ -4156,7 +4281,7 @@ static void get_object_list(struct rev_info *revs, int ac, const char **av)
 
 	warn_on_object_refname_ambiguity = save_warning;
 
-	if (use_bitmap_index && !get_object_list_from_bitmap(revs))
+	if (use_bitmap_index && !path_walk && !get_object_list_from_bitmap(revs))
 		return;
 
 	if (use_delta_islands)
@@ -4165,15 +4290,19 @@ static void get_object_list(struct rev_info *revs, int ac, const char **av)
 	if (write_bitmap_index)
 		mark_bitmap_preferred_tips();
 
-	if (prepare_revision_walk(revs))
-		die(_("revision walk setup failed"));
-	mark_edges_uninteresting(revs, show_edge, sparse);
-
 	if (!fn_show_object)
 		fn_show_object = show_object;
-	traverse_commit_list(revs,
-			     show_commit, fn_show_object,
-			     NULL);
+
+	if (path_walk) {
+		get_object_list_path_walk(revs);
+	} else {
+		if (prepare_revision_walk(revs))
+			die(_("revision walk setup failed"));
+		mark_edges_uninteresting(revs, show_edge, sparse);
+		traverse_commit_list(revs,
+				show_commit, fn_show_object,
+				NULL);
+	}
 
 	if (unpack_unreachable_expiration) {
 		revs->ignore_missing_links = 1;
@@ -4368,6 +4497,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 			 N_("use the sparse reachability algorithm")),
 		OPT_BOOL(0, "thin", &thin,
 			 N_("create thin packs")),
+		OPT_BOOL(0, "path-walk", &path_walk,
+			 N_("use the path-walk API to walk objects when possible")),
 		OPT_BOOL(0, "shallow", &shallow,
 			 N_("create packs suitable for shallow fetches")),
 		OPT_BOOL(0, "honor-pack-keep", &ignore_packed_keep_on_disk,
@@ -4448,7 +4579,19 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		window = 0;
 
 	strvec_push(&rp, "pack-objects");
-	if (thin) {
+
+	if (path_walk && filter_options.choice) {
+		warning(_("cannot use --filter with --path-walk"));
+		path_walk = 0;
+	}
+	if (path_walk) {
+		strvec_push(&rp, "--boundary");
+		 /*
+		  * We must disable the bitmaps because we are removing
+		  * the --objects / --objects-edge[-aggressive] options.
+		  */
+		use_bitmap_index = 0;
+	} else if (thin) {
 		use_internal_rev_list = 1;
 		strvec_push(&rp, shallow
 				? "--objects-edge-aggressive"
diff --git a/path-walk.c b/path-walk.c
index 08de29614f7299..9391e0579aea4b 100644
--- a/path-walk.c
+++ b/path-walk.c
@@ -306,7 +306,7 @@ int walk_objects_by_path(struct path_walk_info *info)
 
 	/* Track all commits. */
 	if (info->commits)
-		ret = info->path_fn("", &commit_list->oids, OBJ_COMMIT,
+		ret = info->path_fn("initial", &commit_list->oids, OBJ_COMMIT,
 				    info->path_fn_data);
 	oid_array_clear(&commit_list->oids);
 	free(commit_list);

From 502008bb7c57327bad65867a70871ef0cf8898b5 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Fri, 6 Sep 2024 15:24:10 -0400
Subject: [PATCH 21/30] pack-objects: extract should_attempt_deltas()

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 builtin/pack-objects.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 3d0bb33427d7cf..b1d684c341705e 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3151,8 +3151,7 @@ static int should_attempt_deltas(struct object_entry *entry)
 		 */
 		return 0;
 
-	if (!entry->type_valid ||
-		oe_size_less_than(&to_pack, entry, 50))
+	if (!entry->type_valid || oe_size_less_than(&to_pack, entry, 50))
 		return 0;
 
 	if (entry->no_try_delta)
@@ -3162,14 +3161,12 @@ static int should_attempt_deltas(struct object_entry *entry)
 		if (oe_type(entry) < 0)
 			die(_("unable to get type of object %s"),
 				oid_to_hex(&entry->idx.oid));
-	} else {
-		if (oe_type(entry) < 0) {
-			/*
-			 * This object is not found, but we
-			 * don't have to include it anyway.
-			 */
-			return 0;
-		}
+	} else if (oe_type(entry) < 0) {
+		/*
+		 * This object is not found, but we
+		 * don't have to include it anyway.
+		 */
+		return 0;
 	}
 
 	return 1;

From b52ee338d1524a938e679fdeedc04818b75fd4d5 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Fri, 6 Sep 2024 09:18:43 -0400
Subject: [PATCH 22/30] pack-objects: introduce GIT_TEST_PACK_PATH_WALK

There are many tests that validate whether 'git pack-objects' works as
expected. Instead of duplicating these tests, add a new test environment
variable, GIT_TEST_PACK_PATH_WALK, that implies --path-walk by default
when specified.

This was useful in testing the implementation of the --path-walk
implementation, especially in conjunction with test such as:

 - t5322-pack-objects-sparse.sh : This demonstrates the effectiveness of
   the --sparse option and how it combines with --path-walk.

RFC TODO: list other helpful test cases, as well as the ones where the
behavior breaks if this is enabled...

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 builtin/pack-objects.c | 1 +
 t/README               | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index b1d684c341705e..b9fe1b1fbd5571 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -4534,6 +4534,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 
 	disable_replace_refs();
 
+	path_walk = git_env_bool("GIT_TEST_PACK_PATH_WALK", 0);
 	sparse = git_env_bool("GIT_TEST_PACK_SPARSE", -1);
 	if (the_repository->gitdir) {
 		prepare_repo_settings(the_repository);
diff --git a/t/README b/t/README
index 44c02d81298dc6..a5d7d0239e074f 100644
--- a/t/README
+++ b/t/README
@@ -433,6 +433,10 @@ GIT_TEST_PACK_SPARSE=<boolean> if disabled will default the pack-objects
 builtin to use the non-sparse object walk. This can still be overridden by
 the --sparse command-line argument.
 
+GIT_TEST_PACK_PATH_WALK=<boolean> if enabled will default the pack-objects
+builtin to use the path-walk API for the object walk. This can still be
+overridden by the --no-path-walk command-line argument.
+
 GIT_TEST_PRELOAD_INDEX=<boolean> exercises the preload-index code path
 by overriding the minimum number of cache entries required per thread.
 

From 54bd80701fb9b55910d6d8453f235872fe549fdd Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Wed, 28 Aug 2024 12:07:42 -0400
Subject: [PATCH 23/30] p5313: add size comparison test

To test the benefits of the new --path-walk option in 'git
pack-objects', create a performance test that times the process but also
compares the size of the output.

Against the microsoft/fluentui repo [1] against a particular commit [2],
this has reproducible results of a similar scale:

Test                                            this tree
---------------------------------------------------------------
5313.2: thin pack                               0.39(0.48+0.03)
5313.3: thin pack size                                     1.2M
5313.4: thin pack with --path-walk              0.09(0.07+0.01)
5313.5: thin pack size with --path-walk                   20.8K
5313.6: big recent pack                         2.13(8.29+0.26)
5313.7: big recent pack size                              17.7M
5313.8: big recent pack with --path-walk        3.18(4.21+0.22)
5313.9: big recent pack size with --path-walk             15.0M

[1] https://github.com/microsoft/reactui
[2] e70848ebac1cd720875bccaa3026f4a9ed700e08

RFC TODO: Note that the path-walk version is slower for the big case,
but the delta calculation is single-threaded with the current
implementation! It's still faster for the small case that mimics a
typical push.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 t/perf/p5313-pack-objects.sh | 55 ++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100755 t/perf/p5313-pack-objects.sh

diff --git a/t/perf/p5313-pack-objects.sh b/t/perf/p5313-pack-objects.sh
new file mode 100755
index 00000000000000..fdcdf188f952ad
--- /dev/null
+++ b/t/perf/p5313-pack-objects.sh
@@ -0,0 +1,55 @@
+#!/bin/sh
+
+test_description='Tests pack performance using bitmaps'
+. ./perf-lib.sh
+
+GIT_TEST_PASSING_SANITIZE_LEAK=0
+export GIT_TEST_PASSING_SANITIZE_LEAK
+
+test_perf_large_repo
+
+test_expect_success 'create rev input' '
+	cat >in-thin <<-EOF &&
+	$(git rev-parse HEAD)
+	^$(git rev-parse HEAD~1)
+	EOF
+	
+	cat >in-big-recent <<-EOF
+	$(git rev-parse HEAD)
+	^$(git rev-parse HEAD~1000)
+	EOF
+'
+
+test_perf 'thin pack' '
+	git pack-objects --thin --stdout --revs --sparse  <in-thin >out
+'
+
+test_size 'thin pack size' '
+	wc -c <out
+'
+
+test_perf 'thin pack with --path-walk' '
+	git pack-objects --thin --stdout --revs --sparse --path-walk <in-thin >out
+'
+
+test_size 'thin pack size with --path-walk' '
+	wc -c <out
+'
+
+test_perf 'big recent pack' '
+	git pack-objects --stdout --revs <in-big-recent >out
+'
+
+test_size 'big recent pack size' '
+	wc -c <out
+'
+
+test_perf 'big recent pack with --path-walk' '
+	git pack-objects --stdout --revs --path-walk <in-big-recent >out
+'
+
+test_size 'big recent pack size with --path-walk' '
+	wc -c <out
+'
+
+test_done

From d3284d090d36e3bff3816123e9939ef0128f323e Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Thu, 5 Sep 2024 09:49:23 -0400
Subject: [PATCH 24/30] repack: add --path-walk option

Since 'git pack-objects' supports a --path-walk option, allow passing it
through in 'git repack'. This presents interesting testing opportunities for
comparing the different repacking strategies against each other.

For the microsoft/fluentui repo [1], the results are very interesting:

Test                                            this tree
-------------------------------------------------------------------
5313.10: full repack                            97.91(663.47+2.83)
5313.11: full repack size                                449.1K
5313.12: full repack with --path-walk           105.42(120.49+0.95)
5313.13: full repack size with --path-walk               159.1K

[1] https://github.com/microsoft/fluentui

This repo suffers from having a lot of paths that collide in the name
hash, so examining them in groups by path leads to better deltas. Also,
in this case, the single-threaded implementation is competitive with the
full repack. This is saving time diffing files that have significant
differences from each other.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 builtin/repack.c             |  5 +++++
 t/perf/p5313-pack-objects.sh | 20 ++++++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/builtin/repack.c b/builtin/repack.c
index 62cfa50c50f893..9e39a1ea8f8ce4 100644
--- a/builtin/repack.c
+++ b/builtin/repack.c
@@ -57,6 +57,7 @@ struct pack_objects_args {
 	int no_reuse_object;
 	int quiet;
 	int local;
+	int path_walk;
 	struct list_objects_filter_options filter_options;
 };
 
@@ -288,6 +289,8 @@ static void prepare_pack_objects(struct child_process *cmd,
 		strvec_pushf(&cmd->args, "--no-reuse-delta");
 	if (args->no_reuse_object)
 		strvec_pushf(&cmd->args, "--no-reuse-object");
+	if (args->path_walk)
+		strvec_pushf(&cmd->args, "--path-walk");
 	if (args->local)
 		strvec_push(&cmd->args,  "--local");
 	if (args->quiet)
@@ -1158,6 +1161,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 				N_("pass --no-reuse-delta to git-pack-objects")),
 		OPT_BOOL('F', NULL, &po_args.no_reuse_object,
 				N_("pass --no-reuse-object to git-pack-objects")),
+		OPT_BOOL(0, "path-walk", &po_args.path_walk,
+				N_("pass --path-walk to git-pack-objects")),
 		OPT_NEGBIT('n', NULL, &run_update_server_info,
 				N_("do not run git-update-server-info"), 1),
 		OPT__QUIET(&po_args.quiet, N_("be quiet")),
diff --git a/t/perf/p5313-pack-objects.sh b/t/perf/p5313-pack-objects.sh
index fdcdf188f952ad..48fc05bb6c6ce3 100755
--- a/t/perf/p5313-pack-objects.sh
+++ b/t/perf/p5313-pack-objects.sh
@@ -52,4 +52,24 @@ test_size 'big recent pack size with --path-walk' '
 	wc -c <out
 '
 
+test_perf 'full repack' '
+	git repack -adf --no-write-bitmap-index
+'
+
+test_size 'full repack size' '
+	du -a .git/objects/pack | \
+	   awk "{ print \$1; }" | \
+		       sort -nr | head -n 1
+'
+
+test_perf 'full repack with --path-walk' '
+	git repack -adf --no-write-bitmap-index --path-walk
+'
+
+test_size 'full repack size with --path-walk' '
+	du -a .git/objects/pack | \
+	   awk "{ print \$1; }" | \
+		       sort -nr | head -n 1
+'
+
 test_done

From 1942f7d03622f2740d83e766fca65938cb590f6a Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Thu, 5 Sep 2024 09:50:06 -0400
Subject: [PATCH 25/30] pack-objects: enable --path-walk via config

Users may want to enable the --path-walk option for 'git pack-objects' by
default, especially underneath commands like 'git push' or 'git repack'.

This should be limited to client repositories, since the --path-walk option
disables bitmap walks, so would be bad to include in Git servers when
serving fetches and clones. There is potential that it may be helpful to
consider when repacking the repository, to take advantage of improved deltas
across historical versions of the same files.

Much like how "pack.useSparse" was introduced and included in
"feature.experimental" before being enabled by default, use the repository
settings infrastructure to make the new "pack.usePathWalk" config enabled by
"feature.experimental" and "feature.manyFiles".

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 Documentation/config/pack.txt | 8 ++++++++
 builtin/pack-objects.c        | 4 +++-
 repo-settings.c               | 3 +++
 repository.h                  | 1 +
 4 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/Documentation/config/pack.txt b/Documentation/config/pack.txt
index da527377fafcb6..08d06271177006 100644
--- a/Documentation/config/pack.txt
+++ b/Documentation/config/pack.txt
@@ -155,6 +155,14 @@ pack.useSparse::
 	commits contain certain types of direct renames. Default is
 	`true`.
 
+pack.usePathWalk::
+	When true, git will default to using the '--path-walk' option in
+	'git pack-objects' when the '--revs' option is present. This
+	algorithm groups objects by path to maximize the ability to
+	compute delta chains across historical versions of the same
+	object. This may disable other options, such as using bitmaps to
+	enumerate objects.
+
 pack.preferBitmapTips::
 	When selecting which commits will receive bitmaps, prefer a
 	commit at the tip of any reference that is a suffix of any value
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index b9fe1b1fbd5571..e7a9d0349c3ae9 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -4534,12 +4534,14 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 
 	disable_replace_refs();
 
-	path_walk = git_env_bool("GIT_TEST_PACK_PATH_WALK", 0);
+	path_walk = git_env_bool("GIT_TEST_PACK_PATH_WALK", -1);
 	sparse = git_env_bool("GIT_TEST_PACK_SPARSE", -1);
 	if (the_repository->gitdir) {
 		prepare_repo_settings(the_repository);
 		if (sparse < 0)
 			sparse = the_repository->settings.pack_use_sparse;
+		if (path_walk < 0)
+			path_walk = the_repository->settings.pack_use_path_walk;
 		if (the_repository->settings.pack_use_multi_pack_reuse)
 			allow_pack_reuse = MULTI_PACK_REUSE;
 	}
diff --git a/repo-settings.c b/repo-settings.c
index 2b4e68731bedce..d9597d84556161 100644
--- a/repo-settings.c
+++ b/repo-settings.c
@@ -45,11 +45,13 @@ void prepare_repo_settings(struct repository *r)
 		r->settings.fetch_negotiation_algorithm = FETCH_NEGOTIATION_SKIPPING;
 		r->settings.pack_use_bitmap_boundary_traversal = 1;
 		r->settings.pack_use_multi_pack_reuse = 1;
+		r->settings.pack_use_path_walk = 1;
 	}
 	if (manyfiles) {
 		r->settings.index_version = 4;
 		r->settings.index_skip_hash = 1;
 		r->settings.core_untracked_cache = UNTRACKED_CACHE_WRITE;
+		r->settings.pack_use_path_walk = 1;
 	}
 
 	/* Commit graph config or default, does not cascade (simple) */
@@ -64,6 +66,7 @@ void prepare_repo_settings(struct repository *r)
 
 	/* Boolean config or default, does not cascade (simple)  */
 	repo_cfg_bool(r, "pack.usesparse", &r->settings.pack_use_sparse, 1);
+	repo_cfg_bool(r, "pack.usepathwalk", &r->settings.pack_use_path_walk, 0);
 	repo_cfg_bool(r, "core.multipackindex", &r->settings.core_multi_pack_index, 1);
 	repo_cfg_bool(r, "index.sparse", &r->settings.sparse_index, 0);
 	repo_cfg_bool(r, "index.skiphash", &r->settings.index_skip_hash, r->settings.index_skip_hash);
diff --git a/repository.h b/repository.h
index af6ea0a62cdb70..2ae9c2b1741528 100644
--- a/repository.h
+++ b/repository.h
@@ -62,6 +62,7 @@ struct repo_settings {
 	enum untracked_cache_setting core_untracked_cache;
 
 	int pack_use_sparse;
+	int pack_use_path_walk;
 	enum fetch_negotiation_setting fetch_negotiation_algorithm;
 
 	int core_multi_pack_index;

From 4c10f859c8dcc42c4d0470a1f295fba979aca336 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Thu, 5 Sep 2024 09:51:33 -0400
Subject: [PATCH 26/30] scalar: enable path-walk during push via config

Repositories registered with Scalar are expected to be client-only
repositories that are rather large. This means that they are more likely to
be good candidates for using the --path-walk option when running 'git
pack-objects', especially under the hood of 'git push'. Enable this config
in Scalar repositories.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 scalar.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scalar.c b/scalar.c
index 6166a8dd4c8d1f..031d1ac179f782 100644
--- a/scalar.c
+++ b/scalar.c
@@ -170,6 +170,7 @@ static int set_recommended_config(int reconfigure)
 		{ "core.autoCRLF", "false" },
 		{ "core.safeCRLF", "false" },
 		{ "fetch.showForcedUpdates", "false" },
+		{ "push.usePathWalk", "true" },
 		{ NULL, NULL },
 	};
 	int i;

From db8cc46909bae552b0b23be9b07fdb2adfa68a10 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Sat, 7 Sep 2024 19:43:45 -0400
Subject: [PATCH 27/30] pack-objects: add --full-name-hash option

RFC NOTE: this is essentially the same as the patch introduced
independently of the RFC, but now is on top of the --path-walk option
instead. This is included in the RFC for comparison purposes.

RFC NOTE: As you can see from the details below, the --full-name-hash
option essentially attempts to do similar things as the --path-walk
option, but sometimes misses the mark. Collisions still happen with the
--full-name-hash option, leading to some misses. However, in cases where
the default name-hash algorithm has low collision rates and deltas are
actually desired across objects with similar names but different full
names, the --path-walk option can still take advantage of the default
name hash approach.

Here are the new performance details simulating a single push in an
internal monorepo using a lot of paths that collide in the default name
hash. We can see that --full-name-hash gets close to the --path-walk
option's size.

Test                                           this tree
--------------------------------------------------------------
5313.2: thin pack                              2.43(2.92+0.14)
5313.3: thin pack size                                    4.5M
5313.4: thin pack with --full-name-hash        0.31(0.49+0.12)
5313.5: thin pack size with --full-name-hash             15.5K
5313.6: thin pack with --path-walk             0.35(0.31+0.04)
5313.7: thin pack size with --path-walk                  14.2K

However, when simulating pushes on repositories that do not have issues
with name-hash collisions, the --full-name-hash option presents a
potential of worse delta calculations, such as this example using my
local Git repository:

Test                                           this tree
--------------------------------------------------------------
5313.2: thin pack                              0.03(0.01+0.01)
5313.3: thin pack size                                     475
5313.4: thin pack with --full-name-hash        0.02(0.01+0.01)
5313.5: thin pack size with --full-name-hash             14.8K
5313.6: thin pack with --path-walk             0.02(0.01+0.01)
5313.7: thin pack size with --path-walk                    475

Note that the path-walk option found the same delta bases as the default
options in this case.

In the full repack case, the --full-name-hash option may be preferable
because it interacts well with other advanced features, such as using
bitmap indexes and tracking delta islands.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 builtin/pack-objects.c       | 20 +++++++++++++++-----
 builtin/repack.c             |  5 +++++
 pack-objects.h               | 20 ++++++++++++++++++++
 t/perf/p5313-pack-objects.sh | 26 ++++++++++++++++++++++++++
 4 files changed, 66 insertions(+), 5 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index e7a9d0349c3ae9..5d5a57e6b1f755 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -270,6 +270,14 @@ struct configured_exclusion {
 static struct oidmap configured_exclusions;
 
 static struct oidset excluded_by_config;
+static int use_full_name_hash;
+
+static inline uint32_t pack_name_hash_fn(const char *name)
+{
+	if (use_full_name_hash)
+		return pack_full_name_hash(name);
+	return pack_name_hash(name);
+}
 
 /*
  * stats
@@ -1674,7 +1682,7 @@ static int add_object_entry(const struct object_id *oid, enum object_type type,
 		return 0;
 	}
 
-	create_object_entry(oid, type, pack_name_hash(name),
+	create_object_entry(oid, type, pack_name_hash_fn(name),
 			    exclude, name && no_try_delta(name),
 			    found_pack, found_offset);
 	return 1;
@@ -1888,7 +1896,7 @@ static void add_preferred_base_object(const char *name)
 {
 	struct pbase_tree *it;
 	size_t cmplen;
-	unsigned hash = pack_name_hash(name);
+	unsigned hash = pack_name_hash_fn(name);
 
 	if (!num_preferred_base || check_pbase_path(hash))
 		return;
@@ -3405,7 +3413,7 @@ static void show_object_pack_hint(struct object *object, const char *name,
 	 * here using a now in order to perhaps improve the delta selection
 	 * process.
 	 */
-	oe->hash = pack_name_hash(name);
+	oe->hash = pack_name_hash_fn(name);
 	oe->no_try_delta = name && no_try_delta(name);
 
 	stdin_packs_hints_nr++;
@@ -3555,7 +3563,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type
 	entry = packlist_find(&to_pack, oid);
 	if (entry) {
 		if (name) {
-			entry->hash = pack_name_hash(name);
+			entry->hash = pack_name_hash_fn(name);
 			entry->no_try_delta = no_try_delta(name);
 		}
 	} else {
@@ -3578,7 +3586,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type
 			return;
 		}
 
-		entry = create_object_entry(oid, type, pack_name_hash(name),
+		entry = create_object_entry(oid, type, pack_name_hash_fn(name),
 					    0, name && no_try_delta(name),
 					    pack, offset);
 	}
@@ -4526,6 +4534,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 		OPT_STRING_LIST(0, "uri-protocol", &uri_protocols,
 				N_("protocol"),
 				N_("exclude any configured uploadpack.blobpackfileuri with this protocol")),
+		OPT_BOOL(0, "full-name-hash", &use_full_name_hash,
+			 N_("optimize delta compression across identical path names over time")),
 		OPT_END(),
 	};
 
diff --git a/builtin/repack.c b/builtin/repack.c
index 9e39a1ea8f8ce4..a1ab103e62d0ce 100644
--- a/builtin/repack.c
+++ b/builtin/repack.c
@@ -58,6 +58,7 @@ struct pack_objects_args {
 	int quiet;
 	int local;
 	int path_walk;
+	int full_name_hash;
 	struct list_objects_filter_options filter_options;
 };
 
@@ -291,6 +292,8 @@ static void prepare_pack_objects(struct child_process *cmd,
 		strvec_pushf(&cmd->args, "--no-reuse-object");
 	if (args->path_walk)
 		strvec_pushf(&cmd->args, "--path-walk");
+	if (args->full_name_hash)
+		strvec_pushf(&cmd->args, "--full-name-hash");
 	if (args->local)
 		strvec_push(&cmd->args,  "--local");
 	if (args->quiet)
@@ -1163,6 +1166,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
 				N_("pass --no-reuse-object to git-pack-objects")),
 		OPT_BOOL(0, "path-walk", &po_args.path_walk,
 				N_("pass --path-walk to git-pack-objects")),
+		OPT_BOOL(0, "full-name-hash", &po_args.full_name_hash,
+				N_("pass --full-name-hash to git-pack-objects")),
 		OPT_NEGBIT('n', NULL, &run_update_server_info,
 				N_("do not run git-update-server-info"), 1),
 		OPT__QUIET(&po_args.quiet, N_("be quiet")),
diff --git a/pack-objects.h b/pack-objects.h
index b9898a4e64b8b4..50097552d03f20 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -207,6 +207,26 @@ static inline uint32_t pack_name_hash(const char *name)
 	return hash;
 }
 
+static inline uint32_t pack_full_name_hash(const char *name)
+{
+	const uint32_t bigp = 1234572167U;
+	uint32_t c, hash = bigp;
+
+	if (!name)
+		return 0;
+
+	/*
+	 * Just do the dumbest thing possible: add random multiples of a
+	 * large prime number with a binary shift. Goal is not cryptographic,
+	 * but generally uniformly distributed.
+	 */
+	while ((c = *name++) != 0) {
+		hash += c * bigp;
+		hash = (hash >> 5) | (hash << 27);
+	}
+	return hash;
+}
+
 static inline enum object_type oe_type(const struct object_entry *e)
 {
 	return e->type_valid ? e->type_ : OBJ_BAD;
diff --git a/t/perf/p5313-pack-objects.sh b/t/perf/p5313-pack-objects.sh
index 48fc05bb6c6ce3..b3b7fff8abf3bf 100755
--- a/t/perf/p5313-pack-objects.sh
+++ b/t/perf/p5313-pack-objects.sh
@@ -28,6 +28,14 @@ test_size 'thin pack size' '
 	wc -c <out
 '
 
+test_perf 'thin pack with --full-name-hash' '
+	git pack-objects --thin --stdout --revs --sparse --full-name-hash <in-thin >out
+'
+
+test_size 'thin pack size with --full-name-hash' '
+	wc -c <out
+'
+
 test_perf 'thin pack with --path-walk' '
 	git pack-objects --thin --stdout --revs --sparse --path-walk <in-thin >out
 '
@@ -44,6 +52,14 @@ test_size 'big recent pack size' '
 	wc -c <out
 '
 
+test_perf 'big recent pack with --full-name-hash' '
+	git pack-objects --stdout --revs --full-name-hash <in-big-recent >out
+'
+
+test_size 'big recent pack size with --full-name-hash' '
+	wc -c <out
+'
+
 test_perf 'big recent pack with --path-walk' '
 	git pack-objects --stdout --revs --path-walk <in-big-recent >out
 '
@@ -62,6 +78,16 @@ test_size 'full repack size' '
 		       sort -nr | head -n 1
 '
 
+test_perf 'full repack with --full-name-hash' '
+	git repack -adf --no-write-bitmap-index --full-name-hash
+'
+
+test_size 'full repack size with --full-name-hash' '
+	du -a .git/objects/pack | \
+	   awk "{ print \$1; }" | \
+		       sort -nr | head -n 1
+'
+
 test_perf 'full repack with --path-walk' '
 	git repack -adf --no-write-bitmap-index --path-walk
 '

From 8df39a432fa682212d53d31389d437e86b4513f6 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Sun, 8 Sep 2024 19:30:57 -0400
Subject: [PATCH 28/30] test-name-hash: add helper to compute name-hash
 functions

Using this tool, we can count how many distinct name-hash values exist
within a list of paths. Examples include

 git ls-tree -r --name-only HEAD | \
	     test-tool name-hash | \
  	      awk "{print \$1;}" | \
  		 sort -ns | uniq | wc -l

which outputs the number of distinct name-hash values that appear at
HEAD. Or, the following which presents the resulting name-hash values of
maximum multiplicity:

 git ls-tree -r --name-only HEAD | \
	     test-tool name-hash | \
	      awk "{print \$1;}" | \
	       sort -n | uniq -c | sort -nr | head -n 25

For an internal monorepo with around a quarter million paths at HEAD,
the highest multiplicity for the standard name-hash function was 14,424
while the full name-hash algorithm had only seven hash values with any
collision, with a maximum multiplicity of two.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 Makefile                  |  1 +
 t/helper/test-name-hash.c | 23 +++++++++++++++++++++++
 t/helper/test-tool.c      |  1 +
 t/helper/test-tool.h      |  1 +
 4 files changed, 26 insertions(+)
 create mode 100644 t/helper/test-name-hash.c

diff --git a/Makefile b/Makefile
index 154de6e01d06b2..462aff65a5089e 100644
--- a/Makefile
+++ b/Makefile
@@ -808,6 +808,7 @@ TEST_BUILTINS_OBJS += test-lazy-init-name-hash.o
 TEST_BUILTINS_OBJS += test-match-trees.o
 TEST_BUILTINS_OBJS += test-mergesort.o
 TEST_BUILTINS_OBJS += test-mktemp.o
+TEST_BUILTINS_OBJS += test-name-hash.o
 TEST_BUILTINS_OBJS += test-oid-array.o
 TEST_BUILTINS_OBJS += test-online-cpus.o
 TEST_BUILTINS_OBJS += test-pack-mtimes.o
diff --git a/t/helper/test-name-hash.c b/t/helper/test-name-hash.c
new file mode 100644
index 00000000000000..c82ccd7cefd860
--- /dev/null
+++ b/t/helper/test-name-hash.c
@@ -0,0 +1,23 @@
+/*
+ * test-name-hash.c: Read a list of paths over stdin and report on their
+ * name-hash and full name-hash.
+ */
+
+#include "test-tool.h"
+#include "git-compat-util.h"
+#include "pack-objects.h"
+#include "strbuf.h"
+
+int cmd__name_hash(int argc, const char **argv)
+{
+	struct strbuf line = STRBUF_INIT;
+
+	while (!strbuf_getline(&line, stdin)) {
+		uint32_t name_hash = pack_name_hash(line.buf);
+		uint32_t full_hash = pack_full_name_hash(line.buf);
+
+		printf("%10"PRIu32"\t%10"PRIu32"\t%s\n", name_hash, full_hash, line.buf);
+	}
+
+	return 0;
+}
diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c
index f8a67df7de90fc..4a6039210020e5 100644
--- a/t/helper/test-tool.c
+++ b/t/helper/test-tool.c
@@ -43,6 +43,7 @@ static struct test_cmd cmds[] = {
 	{ "match-trees", cmd__match_trees },
 	{ "mergesort", cmd__mergesort },
 	{ "mktemp", cmd__mktemp },
+	{ "name-hash", cmd__name_hash },
 	{ "oid-array", cmd__oid_array },
 	{ "online-cpus", cmd__online_cpus },
 	{ "pack-mtimes", cmd__pack_mtimes },
diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h
index e74bc0ffd4179c..56a83bf3aacd06 100644
--- a/t/helper/test-tool.h
+++ b/t/helper/test-tool.h
@@ -37,6 +37,7 @@ int cmd__lazy_init_name_hash(int argc, const char **argv);
 int cmd__match_trees(int argc, const char **argv);
 int cmd__mergesort(int argc, const char **argv);
 int cmd__mktemp(int argc, const char **argv);
+int cmd__name_hash(int argc, const char **argv);
 int cmd__online_cpus(int argc, const char **argv);
 int cmd__pack_mtimes(int argc, const char **argv);
 int cmd__parse_options(int argc, const char **argv);

From 5dcb20a1c5c1e6f5dd676c54fa6b001af9abe072 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Sun, 8 Sep 2024 21:04:52 -0400
Subject: [PATCH 29/30] p5314: add a size test for name-hash collisions

This test helps inform someone as to the behavior of the name-hash
algorithms for their repo based on the paths at HEAD.

For example, the microsoft/fluentui repo had these statistics at time of
committing:

Test                                              this tree
-----------------------------------------------------------------
5314.1: paths at head                                       19.6K
5314.2: number of distinct name-hashes                       8.2K
5314.3: number of distinct full-name-hashes                 19.6K
5314.4: maximum multiplicity of name-hashes                   279
5314.5: maximum multiplicity of fullname-hashes                 1

That demonstrates that of the nearly twenty thousand path names, they
are assigned around eight thousand distinct values. 279 paths are
assigned to a single value, leading the packing algorithm to sort
objects from those paths together, by size.

In this repository, no collisions occur for the full-name-hash
algorithm.

In a more extreme example, an internal monorepo had a much worse
collision rate:

Test                                              this tree
-----------------------------------------------------------------
5314.1: paths at head                                      221.6K
5314.2: number of distinct name-hashes                      72.0K
5314.3: number of distinct full-name-hashes                221.6K
5314.4: maximum multiplicity of name-hashes                 14.4K
5314.5: maximum multiplicity of fullname-hashes                 2

Even in this repository with many more paths at HEAD, the collision rate
was low and the maximum number of paths being grouped into a single
bucket by the full-path-name algorithm was two.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 t/perf/p5314-name-hash.sh | 41 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100755 t/perf/p5314-name-hash.sh

diff --git a/t/perf/p5314-name-hash.sh b/t/perf/p5314-name-hash.sh
new file mode 100755
index 00000000000000..9fe26612facc1d
--- /dev/null
+++ b/t/perf/p5314-name-hash.sh
@@ -0,0 +1,41 @@
+#!/bin/sh
+
+test_description='Tests pack performance using bitmaps'
+. ./perf-lib.sh
+
+GIT_TEST_PASSING_SANITIZE_LEAK=0
+export GIT_TEST_PASSING_SANITIZE_LEAK
+
+test_perf_large_repo
+
+test_size 'paths at head' '
+	git ls-tree -r --name-only HEAD >path-list &&
+	wc -l <path-list
+'
+
+test_size 'number of distinct name-hashes' '
+	cat path-list | test-tool name-hash >name-hashes &&
+	cat name-hashes | awk "{ print \$1; }" | sort -n | uniq -c >name-hash-count &&
+	wc -l <name-hash-count
+'
+
+test_size 'number of distinct full-name-hashes' '
+	cat name-hashes | awk "{ print \$2; }" | sort -n | uniq -c >full-name-hash-count &&
+	wc -l <full-name-hash-count
+'
+
+test_size 'maximum multiplicity of name-hashes' '
+	cat name-hash-count | \
+		sort -nr | \
+		head -n 1 | \
+		awk "{ print \$1; }"
+'
+
+test_size 'maximum multiplicity of fullname-hashes' '
+	cat full-name-hash-count | \
+		sort -nr | \
+		head -n 1 | \
+		awk "{ print \$1; }"
+'
+
+test_done

From 460feef90fdd869b42e3663a1a1336a8ae663bc0 Mon Sep 17 00:00:00 2001
From: Derrick Stolee <stolee@gmail.com>
Date: Wed, 28 Aug 2024 12:06:43 -0400
Subject: [PATCH 30/30] pack-objects: output debug info about deltas

In order to debug what is going on during delta calculations, add a
--debug-file=<file> option to 'git pack-objects'. This leads to sending
a JSON-formatted description of the delta information to that file.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 builtin/pack-objects.c | 69 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index 5d5a57e6b1f755..7d1dd5a65577bc 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -50,6 +50,9 @@
  */
 static struct packing_data to_pack;
 
+static FILE *delta_file;
+static int delta_file_nr;
+
 static inline struct object_entry *oe_delta(
 		const struct packing_data *pack,
 		const struct object_entry *e)
@@ -516,6 +519,14 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 	hdrlen = encode_in_pack_object_header(header, sizeof(header),
 					      type, size);
 
+	if (delta_file) {
+		if (delta_file_nr++)
+			fprintf(delta_file, ",\n");
+		fprintf(delta_file, "\t\t{\n");
+		fprintf(delta_file, "\t\t\t\"oid\" : \"%s\",\n", oid_to_hex(&entry->idx.oid));
+		fprintf(delta_file, "\t\t\t\"size\" : %"PRIuMAX",\n", datalen);
+	}
+
 	if (type == OBJ_OFS_DELTA) {
 		/*
 		 * Deltas with relative base contain an additional
@@ -536,6 +547,11 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		hashwrite(f, header, hdrlen);
 		hashwrite(f, dheader + pos, sizeof(dheader) - pos);
 		hdrlen += sizeof(dheader) - pos;
+		if (delta_file) {
+			fprintf(delta_file, "\t\t\t\"delta_type\" : \"OFS\",\n");
+			fprintf(delta_file, "\t\t\t\"offset\" : %"PRIuMAX",\n", ofs);
+			fprintf(delta_file, "\t\t\t\"delta_base\" : \"%s\",\n", oid_to_hex(&DELTA(entry)->idx.oid));
+		}
 	} else if (type == OBJ_REF_DELTA) {
 		/*
 		 * Deltas with a base reference contain
@@ -550,6 +566,10 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		hashwrite(f, header, hdrlen);
 		hashwrite(f, DELTA(entry)->idx.oid.hash, hashsz);
 		hdrlen += hashsz;
+		if (delta_file) {
+			fprintf(delta_file, "\t\t\t\"delta_type\" : \"REF\",\n");
+			fprintf(delta_file, "\t\t\t\"delta_base\" : \"%s\",\n", oid_to_hex(&DELTA(entry)->idx.oid));
+		}
 	} else {
 		if (limit && hdrlen + datalen + hashsz >= limit) {
 			if (st)
@@ -559,6 +579,10 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent
 		}
 		hashwrite(f, header, hdrlen);
 	}
+
+	if (delta_file)
+		fprintf(delta_file, "\t\t\t\"reused\" : false\n\t\t}");
+
 	if (st) {
 		datalen = write_large_blob_data(st, f, &entry->idx.oid);
 		close_istream(st);
@@ -619,6 +643,14 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 		return write_no_reuse_object(f, entry, limit, usable_delta);
 	}
 
+	if (delta_file) {
+		if (delta_file_nr++)
+			fprintf(delta_file, ",\n");
+		fprintf(delta_file, "\t\t{\n");
+		fprintf(delta_file, "\t\t\t\"oid\" : \"%s\",\n", oid_to_hex(&entry->idx.oid));
+		fprintf(delta_file, "\t\t\t\"size\" : %"PRIuMAX",\n", entry_size);
+	}
+
 	if (type == OBJ_OFS_DELTA) {
 		off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset;
 		unsigned pos = sizeof(dheader) - 1;
@@ -633,6 +665,12 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 		hashwrite(f, dheader + pos, sizeof(dheader) - pos);
 		hdrlen += sizeof(dheader) - pos;
 		reused_delta++;
+
+		if (delta_file) {
+			fprintf(delta_file, "\t\t\t\"delta_type\" : \"OFS\",\n");
+			fprintf(delta_file, "\t\t\t\"offset\" : %"PRIuMAX",\n", ofs);
+			fprintf(delta_file, "\t\t\t\"delta_base\" : \"%s\",\n", oid_to_hex(&DELTA(entry)->idx.oid));
+		}
 	} else if (type == OBJ_REF_DELTA) {
 		if (limit && hdrlen + hashsz + datalen + hashsz >= limit) {
 			unuse_pack(&w_curs);
@@ -642,6 +680,10 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 		hashwrite(f, DELTA(entry)->idx.oid.hash, hashsz);
 		hdrlen += hashsz;
 		reused_delta++;
+		if (delta_file) {
+			fprintf(delta_file, "\t\t\t\"delta_type\" : \"REF\",\n");
+			fprintf(delta_file, "\t\t\t\"delta_base\" : \"%s\",\n", oid_to_hex(&DELTA(entry)->idx.oid));
+		}
 	} else {
 		if (limit && hdrlen + datalen + hashsz >= limit) {
 			unuse_pack(&w_curs);
@@ -652,6 +694,10 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry,
 	copy_pack_data(f, p, &w_curs, offset, datalen);
 	unuse_pack(&w_curs);
 	reused++;
+
+	if (delta_file)
+		fprintf(delta_file, "\t\t\t\"reused\" : true\n\t\t}");
+
 	return hdrlen + datalen;
 }
 
@@ -1264,6 +1310,11 @@ static void write_pack_file(void)
 	ALLOC_ARRAY(written_list, to_pack.nr_objects);
 	write_order = compute_write_order();
 
+	if (delta_file) {
+		fprintf(delta_file, "{\n\t\"num_objects\" : %"PRIu32",\n", to_pack.nr_objects);
+		fprintf(delta_file, "\t\"objects\" : [\n");
+	}
+
 	do {
 		unsigned char hash[GIT_MAX_RAWSZ];
 		char *pack_tmp_name = NULL;
@@ -1412,6 +1463,9 @@ static void write_pack_file(void)
 		    written, nr_result);
 	trace2_data_intmax("pack-objects", the_repository,
 			   "write_pack_file/wrote", nr_result);
+
+	if (delta_file)
+		fprintf(delta_file, "\n\t]\n}");
 }
 
 static int no_try_delta(const char *path)
@@ -4430,6 +4484,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	struct string_list keep_pack_list = STRING_LIST_INIT_NODUP;
 	struct list_objects_filter_options filter_options =
 		LIST_OBJECTS_FILTER_INIT;
+	const char *delta_file_name = NULL;
 
 	struct option pack_objects_options[] = {
 		OPT_CALLBACK_F('q', "quiet", &progress, NULL,
@@ -4536,6 +4591,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 				N_("exclude any configured uploadpack.blobpackfileuri with this protocol")),
 		OPT_BOOL(0, "full-name-hash", &use_full_name_hash,
 			 N_("optimize delta compression across identical path names over time")),
+		OPT_STRING(0, "delta-file", &delta_file_name,
+				N_("filename"),
+				N_("output delta compression details to the given file")),
 		OPT_END(),
 	};
 
@@ -4573,6 +4631,12 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	if (pack_to_stdout != !base_name || argc)
 		usage_with_options(pack_usage, pack_objects_options);
 
+	if (delta_file_name) {
+		delta_file = fopen(delta_file_name, "w");
+		if (!delta_file)
+			die_errno("failed to open '%s'", delta_file_name);
+		trace2_printf("opened '%s' for writing deltas", delta_file_name);
+	}
 	if (depth < 0)
 		depth = 0;
 	if (depth >= (1 << OE_DEPTH_BITS)) {
@@ -4796,5 +4860,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix)
 	list_objects_filter_release(&filter_options);
 	strvec_clear(&rp);
 
+	if (delta_file) {
+		fflush(delta_file);
+		fclose(delta_file);
+	}
+
 	return 0;
 }