diff --git a/.gitignore b/.gitignore index 8caf3700c2305d..3f6fdb31a5ea32 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,7 @@ /git-apply /git-archimport /git-archive +/git-backfill /git-bisect /git-blame /git-branch @@ -164,6 +165,7 @@ /git-submodule /git-submodule--helper /git-subtree +/git-survey /git-svn /git-switch /git-symbolic-ref diff --git a/Documentation/config/pack.txt b/Documentation/config/pack.txt index da527377fafcb6..08d06271177006 100644 --- a/Documentation/config/pack.txt +++ b/Documentation/config/pack.txt @@ -155,6 +155,14 @@ pack.useSparse:: commits contain certain types of direct renames. Default is `true`. +pack.usePathWalk:: + When true, git will default to using the '--path-walk' option in + 'git pack-objects' when the '--revs' option is present. This + algorithm groups objects by path to maximize the ability to + compute delta chains across historical versions of the same + object. This may disable other options, such as using bitmaps to + enumerate objects. + pack.preferBitmapTips:: When selecting which commits will receive bitmaps, prefer a commit at the tip of any reference that is a suffix of any value diff --git a/Documentation/git-backfill.txt b/Documentation/git-backfill.txt new file mode 100644 index 00000000000000..066ec6b161a22c --- /dev/null +++ b/Documentation/git-backfill.txt @@ -0,0 +1,60 @@ +git-backfill(1) +=============== + +NAME +---- +git-backfill - Download missing objects in a partial clone + + +SYNOPSIS +-------- +[verse] +'git backfill' [--batch-size=] [--[no-]sparse] + +DESCRIPTION +----------- + +Blobless partial clones are created using `git clone --filter=blob:none` +and then configure the local repository such that the Git client avoids +downloading blob objects unless they are required for a local operation. +This initially means that the clone and later fetches download reachable +commits and trees but no blobs. Later operations that change the `HEAD` +pointer, such as `git checkout` or `git merge`, may need to download +missing blobs in order to complete their operation. + +In the worst cases, commands that compute blob diffs, such as `git blame`, +become very slow as they download the missing blobs in single-blob +requests to satisfy the missing object as the Git command needs it. This +leads to multiple download requests and no ability for the Git server to +provide delta compression across those objects. + +The `git backfill` command provides a way for the user to request that +Git downloads the missing blobs (with optional filters) such that the +missing blobs representing historical versions of files can be downloaded +in batches. The `backfill` command attempts to optimize the request by +grouping blobs that appear at the same path, hopefully leading to good +delta compression in the packfile sent by the server. + +By default, `git backfill` downloads all blobs reachable from the `HEAD` +commit. This set can be restricted or expanded using various options. + +OPTIONS +------- + +--batch-size=:: + Specify a minimum size for a batch of missing objects to request + from the server. This size may be exceeded by the last set of + blobs seen at a given path. Default batch size is 16,000. + +--[no-]sparse:: + Only download objects if they appear at a path that matches the + current sparse-checkout. If the sparse-checkout feature is enabled, + then `--sparse` is assumed and can be disabled with `--no-sparse`. + +SEE ALSO +-------- +linkgit:git-clone[1]. + +GIT +--- +Part of the linkgit:git[1] suite diff --git a/Documentation/git-survey.txt b/Documentation/git-survey.txt new file mode 100644 index 00000000000000..c648ef704e3806 --- /dev/null +++ b/Documentation/git-survey.txt @@ -0,0 +1,70 @@ +git-survey(1) +============= + +NAME +---- +git-survey - EXPERIMENTAL: Measure various repository dimensions of scale + +SYNOPSIS +-------- +[verse] +(EXPERIMENTAL!) `git survey` + +DESCRIPTION +----------- + +Survey the repository and measure various dimensions of scale. + +As repositories grow to "monorepo" size, certain data shapes can cause +performance problems. `git-survey` attempts to measure and report on +known problem areas. + +Ref Selection and Reachable Objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this first analysis phase, `git survey` will iterate over the set of +requested branches, tags, and other refs and treewalk over all of the +reachable commits, trees, and blobs and generate various statistics. + +OPTIONS +------- + +--progress:: + Show progress. This is automatically enabled when interactive. + +Ref Selection +~~~~~~~~~~~~~ + +The following options control the set of refs that `git survey` will examine. +By default, `git survey` will look at tags, local branches, and remote refs. +If any of the following options are given, the default set is cleared and +only refs for the given options are added. + +--all-refs:: + Use all refs. This includes local branches, tags, remote refs, + notes, and stashes. This option overrides all of the following. + +--branches:: + Add local branches (`refs/heads/`) to the set. + +--tags:: + Add tags (`refs/tags/`) to the set. + +--remotes:: + Add remote branches (`refs/remote/`) to the set. + +--detached:: + Add HEAD to the set. + +--other:: + Add notes (`refs/notes/`) and stashes (`refs/stash/`) to the set. + +OUTPUT +------ + +By default, `git survey` will print information about the repository in a +human-readable format that includes overviews and tables. + +GIT +--- +Part of the linkgit:git[1] suite diff --git a/Makefile b/Makefile index deb175a0408782..462aff65a5089e 100644 --- a/Makefile +++ b/Makefile @@ -808,6 +808,7 @@ TEST_BUILTINS_OBJS += test-lazy-init-name-hash.o TEST_BUILTINS_OBJS += test-match-trees.o TEST_BUILTINS_OBJS += test-mergesort.o TEST_BUILTINS_OBJS += test-mktemp.o +TEST_BUILTINS_OBJS += test-name-hash.o TEST_BUILTINS_OBJS += test-oid-array.o TEST_BUILTINS_OBJS += test-online-cpus.o TEST_BUILTINS_OBJS += test-pack-mtimes.o @@ -1090,6 +1091,7 @@ LIB_OBJS += parse-options.o LIB_OBJS += patch-delta.o LIB_OBJS += patch-ids.o LIB_OBJS += path.o +LIB_OBJS += path-walk.o LIB_OBJS += pathspec.o LIB_OBJS += pkt-line.o LIB_OBJS += preload-index.o @@ -1197,6 +1199,7 @@ BUILTIN_OBJS += builtin/am.o BUILTIN_OBJS += builtin/annotate.o BUILTIN_OBJS += builtin/apply.o BUILTIN_OBJS += builtin/archive.o +BUILTIN_OBJS += builtin/backfill.o BUILTIN_OBJS += builtin/bisect.o BUILTIN_OBJS += builtin/blame.o BUILTIN_OBJS += builtin/branch.o @@ -1301,6 +1304,7 @@ BUILTIN_OBJS += builtin/sparse-checkout.o BUILTIN_OBJS += builtin/stash.o BUILTIN_OBJS += builtin/stripspace.o BUILTIN_OBJS += builtin/submodule--helper.o +BUILTIN_OBJS += builtin/survey.o BUILTIN_OBJS += builtin/symbolic-ref.o BUILTIN_OBJS += builtin/tag.o BUILTIN_OBJS += builtin/unpack-file.o diff --git a/builtin.h b/builtin.h index 14fa0171607b17..d4e8cf3b97b590 100644 --- a/builtin.h +++ b/builtin.h @@ -127,6 +127,7 @@ int cmd_am(int argc, const char **argv, const char *prefix); int cmd_annotate(int argc, const char **argv, const char *prefix); int cmd_apply(int argc, const char **argv, const char *prefix); int cmd_archive(int argc, const char **argv, const char *prefix); +int cmd_backfill(int argc, const char **argv, const char *prefix); int cmd_bisect(int argc, const char **argv, const char *prefix); int cmd_blame(int argc, const char **argv, const char *prefix); int cmd_branch(int argc, const char **argv, const char *prefix); @@ -238,6 +239,7 @@ int cmd_status(int argc, const char **argv, const char *prefix); int cmd_stash(int argc, const char **argv, const char *prefix); int cmd_stripspace(int argc, const char **argv, const char *prefix); int cmd_submodule__helper(int argc, const char **argv, const char *prefix); +int cmd_survey(int argc, const char **argv, const char *prefix); int cmd_switch(int argc, const char **argv, const char *prefix); int cmd_symbolic_ref(int argc, const char **argv, const char *prefix); int cmd_tag(int argc, const char **argv, const char *prefix); diff --git a/builtin/backfill.c b/builtin/backfill.c new file mode 100644 index 00000000000000..2a1b043f18821a --- /dev/null +++ b/builtin/backfill.c @@ -0,0 +1,141 @@ +#include "builtin.h" +#include "git-compat-util.h" +#include "config.h" +#include "parse-options.h" +#include "repository.h" +#include "commit.h" +#include "dir.h" +#include "environment.h" +#include "hex.h" +#include "tree.h" +#include "tree-walk.h" +#include "object.h" +#include "object-store-ll.h" +#include "oid-array.h" +#include "oidset.h" +#include "promisor-remote.h" +#include "strmap.h" +#include "string-list.h" +#include "revision.h" +#include "trace2.h" +#include "progress.h" +#include "packfile.h" +#include "path-walk.h" + +static const char * const builtin_backfill_usage[] = { + N_("git backfill [--batch-size=] [--[no-]sparse]"), + NULL +}; + +struct backfill_context { + struct repository *repo; + struct oid_array current_batch; + size_t batch_size; + int sparse; +}; + +static void clear_backfill_context(struct backfill_context *ctx) +{ + oid_array_clear(&ctx->current_batch); +} + +static void download_batch(struct backfill_context *ctx) +{ + promisor_remote_get_direct(ctx->repo, + ctx->current_batch.oid, + ctx->current_batch.nr); + oid_array_clear(&ctx->current_batch); + + /* + * We likely have a new packfile. Add it to the packed list to + * avoid possible duplicate downloads of the same objects. + */ + reprepare_packed_git(ctx->repo); +} + +static int fill_missing_blobs(const char *path, + struct oid_array *list, + enum object_type type, + void *data) +{ + struct backfill_context *ctx = data; + + if (type != OBJ_BLOB) + BUG("fill_missing_blobs only takes blob objects"); + + for (size_t i = 0; i < list->nr; i++) { + off_t size = 0; + struct object_info info = OBJECT_INFO_INIT; + info.disk_sizep = &size; + if (oid_object_info_extended(the_repository, + &list->oid[i], + &info, + OBJECT_INFO_FOR_PREFETCH) || + !size) + oid_array_append(&ctx->current_batch, &list->oid[i]); + } + + if (ctx->current_batch.nr >= ctx->batch_size) + download_batch(ctx); + + return 0; +} + +static int do_backfill(struct backfill_context *ctx) +{ + struct rev_info revs; + struct path_walk_info info = PATH_WALK_INFO_INIT; + int ret; + + if (ctx->sparse) { + CALLOC_ARRAY(info.pl, 1); + if (get_sparse_checkout_patterns(info.pl)) + return error(_("problem loading sparse-checkout")); + } + + repo_init_revisions(ctx->repo, &revs, ""); + handle_revision_arg("HEAD", &revs, 0, 0); + + info.revs = &revs; + info.path_fn = fill_missing_blobs; + info.path_fn_data = ctx; + + ret = walk_objects_by_path(&info); + + /* Download the objects that did not fill a batch. */ + if (!ret) + download_batch(ctx); + + clear_backfill_context(ctx); + return ret; +} + +int cmd_backfill(int argc, const char **argv, const char *prefix) +{ + struct backfill_context ctx = { + .repo = the_repository, + .current_batch = OID_ARRAY_INIT, + .batch_size = 16000, + .sparse = 0, + }; + struct option options[] = { + OPT_INTEGER(0, "batch-size", &ctx.batch_size, + N_("Minimun number of objects to request at a time")), + OPT_BOOL(0, "sparse", &ctx.sparse, + N_("Restrict the missing objects to the current sparse-checkout")), + OPT_END(), + }; + + if (argc == 2 && !strcmp(argv[1], "-h")) + usage_with_options(builtin_backfill_usage, options); + + argc = parse_options(argc, argv, prefix, options, builtin_backfill_usage, + 0); + + git_config(git_default_config, NULL); + + if (ctx.sparse < 0) + ctx.sparse = core_apply_sparse_checkout; + + return do_backfill(&ctx); +} diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 778be80f5646d1..7d1dd5a65577bc 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -39,6 +39,9 @@ #include "promisor-remote.h" #include "pack-mtimes.h" #include "parse-options.h" +#include "blob.h" +#include "tree.h" +#include "path-walk.h" /* * Objects we are going to pack are collected in the `to_pack` structure. @@ -47,6 +50,9 @@ */ static struct packing_data to_pack; +static FILE *delta_file; +static int delta_file_nr; + static inline struct object_entry *oe_delta( const struct packing_data *pack, const struct object_entry *e) @@ -215,6 +221,7 @@ static int delta_search_threads; static int pack_to_stdout; static int sparse; static int thin; +static int path_walk; static int num_preferred_base; static struct progress *progress_state; @@ -266,6 +273,14 @@ struct configured_exclusion { static struct oidmap configured_exclusions; static struct oidset excluded_by_config; +static int use_full_name_hash; + +static inline uint32_t pack_name_hash_fn(const char *name) +{ + if (use_full_name_hash) + return pack_full_name_hash(name); + return pack_name_hash(name); +} /* * stats @@ -504,6 +519,14 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent hdrlen = encode_in_pack_object_header(header, sizeof(header), type, size); + if (delta_file) { + if (delta_file_nr++) + fprintf(delta_file, ",\n"); + fprintf(delta_file, "\t\t{\n"); + fprintf(delta_file, "\t\t\t\"oid\" : \"%s\",\n", oid_to_hex(&entry->idx.oid)); + fprintf(delta_file, "\t\t\t\"size\" : %"PRIuMAX",\n", datalen); + } + if (type == OBJ_OFS_DELTA) { /* * Deltas with relative base contain an additional @@ -524,6 +547,11 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent hashwrite(f, header, hdrlen); hashwrite(f, dheader + pos, sizeof(dheader) - pos); hdrlen += sizeof(dheader) - pos; + if (delta_file) { + fprintf(delta_file, "\t\t\t\"delta_type\" : \"OFS\",\n"); + fprintf(delta_file, "\t\t\t\"offset\" : %"PRIuMAX",\n", ofs); + fprintf(delta_file, "\t\t\t\"delta_base\" : \"%s\",\n", oid_to_hex(&DELTA(entry)->idx.oid)); + } } else if (type == OBJ_REF_DELTA) { /* * Deltas with a base reference contain @@ -538,6 +566,10 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent hashwrite(f, header, hdrlen); hashwrite(f, DELTA(entry)->idx.oid.hash, hashsz); hdrlen += hashsz; + if (delta_file) { + fprintf(delta_file, "\t\t\t\"delta_type\" : \"REF\",\n"); + fprintf(delta_file, "\t\t\t\"delta_base\" : \"%s\",\n", oid_to_hex(&DELTA(entry)->idx.oid)); + } } else { if (limit && hdrlen + datalen + hashsz >= limit) { if (st) @@ -547,6 +579,10 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent } hashwrite(f, header, hdrlen); } + + if (delta_file) + fprintf(delta_file, "\t\t\t\"reused\" : false\n\t\t}"); + if (st) { datalen = write_large_blob_data(st, f, &entry->idx.oid); close_istream(st); @@ -607,6 +643,14 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry, return write_no_reuse_object(f, entry, limit, usable_delta); } + if (delta_file) { + if (delta_file_nr++) + fprintf(delta_file, ",\n"); + fprintf(delta_file, "\t\t{\n"); + fprintf(delta_file, "\t\t\t\"oid\" : \"%s\",\n", oid_to_hex(&entry->idx.oid)); + fprintf(delta_file, "\t\t\t\"size\" : %"PRIuMAX",\n", entry_size); + } + if (type == OBJ_OFS_DELTA) { off_t ofs = entry->idx.offset - DELTA(entry)->idx.offset; unsigned pos = sizeof(dheader) - 1; @@ -621,6 +665,12 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry, hashwrite(f, dheader + pos, sizeof(dheader) - pos); hdrlen += sizeof(dheader) - pos; reused_delta++; + + if (delta_file) { + fprintf(delta_file, "\t\t\t\"delta_type\" : \"OFS\",\n"); + fprintf(delta_file, "\t\t\t\"offset\" : %"PRIuMAX",\n", ofs); + fprintf(delta_file, "\t\t\t\"delta_base\" : \"%s\",\n", oid_to_hex(&DELTA(entry)->idx.oid)); + } } else if (type == OBJ_REF_DELTA) { if (limit && hdrlen + hashsz + datalen + hashsz >= limit) { unuse_pack(&w_curs); @@ -630,6 +680,10 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry, hashwrite(f, DELTA(entry)->idx.oid.hash, hashsz); hdrlen += hashsz; reused_delta++; + if (delta_file) { + fprintf(delta_file, "\t\t\t\"delta_type\" : \"REF\",\n"); + fprintf(delta_file, "\t\t\t\"delta_base\" : \"%s\",\n", oid_to_hex(&DELTA(entry)->idx.oid)); + } } else { if (limit && hdrlen + datalen + hashsz >= limit) { unuse_pack(&w_curs); @@ -640,6 +694,10 @@ static off_t write_reuse_object(struct hashfile *f, struct object_entry *entry, copy_pack_data(f, p, &w_curs, offset, datalen); unuse_pack(&w_curs); reused++; + + if (delta_file) + fprintf(delta_file, "\t\t\t\"reused\" : true\n\t\t}"); + return hdrlen + datalen; } @@ -1252,6 +1310,11 @@ static void write_pack_file(void) ALLOC_ARRAY(written_list, to_pack.nr_objects); write_order = compute_write_order(); + if (delta_file) { + fprintf(delta_file, "{\n\t\"num_objects\" : %"PRIu32",\n", to_pack.nr_objects); + fprintf(delta_file, "\t\"objects\" : [\n"); + } + do { unsigned char hash[GIT_MAX_RAWSZ]; char *pack_tmp_name = NULL; @@ -1400,6 +1463,9 @@ static void write_pack_file(void) written, nr_result); trace2_data_intmax("pack-objects", the_repository, "write_pack_file/wrote", nr_result); + + if (delta_file) + fprintf(delta_file, "\n\t]\n}"); } static int no_try_delta(const char *path) @@ -1670,7 +1736,7 @@ static int add_object_entry(const struct object_id *oid, enum object_type type, return 0; } - create_object_entry(oid, type, pack_name_hash(name), + create_object_entry(oid, type, pack_name_hash_fn(name), exclude, name && no_try_delta(name), found_pack, found_offset); return 1; @@ -1884,7 +1950,7 @@ static void add_preferred_base_object(const char *name) { struct pbase_tree *it; size_t cmplen; - unsigned hash = pack_name_hash(name); + unsigned hash = pack_name_hash_fn(name); if (!num_preferred_base || check_pbase_path(hash)) return; @@ -3139,6 +3205,35 @@ static int add_ref_tag(const char *tag UNUSED, const char *referent UNUSED, cons return 0; } +static int should_attempt_deltas(struct object_entry *entry) +{ + if (DELTA(entry)) + /* This happens if we decided to reuse existing + * delta from a pack. "reuse_delta &&" is implied. + */ + return 0; + + if (!entry->type_valid || oe_size_less_than(&to_pack, entry, 50)) + return 0; + + if (entry->no_try_delta) + return 0; + + if (!entry->preferred_base) { + if (oe_type(entry) < 0) + die(_("unable to get type of object %s"), + oid_to_hex(&entry->idx.oid)); + } else if (oe_type(entry) < 0) { + /* + * This object is not found, but we + * don't have to include it anyway. + */ + return 0; + } + + return 1; +} + static void prepare_pack(int window, int depth) { struct object_entry **delta_list; @@ -3169,33 +3264,11 @@ static void prepare_pack(int window, int depth) for (i = 0; i < to_pack.nr_objects; i++) { struct object_entry *entry = to_pack.objects + i; - if (DELTA(entry)) - /* This happens if we decided to reuse existing - * delta from a pack. "reuse_delta &&" is implied. - */ - continue; - - if (!entry->type_valid || - oe_size_less_than(&to_pack, entry, 50)) - continue; - - if (entry->no_try_delta) + if (!should_attempt_deltas(entry)) continue; - if (!entry->preferred_base) { + if (!entry->preferred_base) nr_deltas++; - if (oe_type(entry) < 0) - die(_("unable to get type of object %s"), - oid_to_hex(&entry->idx.oid)); - } else { - if (oe_type(entry) < 0) { - /* - * This object is not found, but we - * don't have to include it anyway. - */ - continue; - } - } delta_list[n++] = entry; } @@ -3394,7 +3467,7 @@ static void show_object_pack_hint(struct object *object, const char *name, * here using a now in order to perhaps improve the delta selection * process. */ - oe->hash = pack_name_hash(name); + oe->hash = pack_name_hash_fn(name); oe->no_try_delta = name && no_try_delta(name); stdin_packs_hints_nr++; @@ -3544,7 +3617,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type entry = packlist_find(&to_pack, oid); if (entry) { if (name) { - entry->hash = pack_name_hash(name); + entry->hash = pack_name_hash_fn(name); entry->no_try_delta = no_try_delta(name); } } else { @@ -3567,7 +3640,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type return; } - entry = create_object_entry(oid, type, pack_name_hash(name), + entry = create_object_entry(oid, type, pack_name_hash_fn(name), 0, name && no_try_delta(name), pack, offset); } @@ -4110,6 +4183,117 @@ static void mark_bitmap_preferred_tips(void) } } +static inline int is_oid_interesting(struct repository *repo, + struct object_id *oid, + enum object_type type) +{ + if (type == OBJ_TAG) { + struct tag *t = lookup_tag(repo, oid); + return t && !(t->object.flags & UNINTERESTING); + } + + if (type == OBJ_COMMIT) { + struct commit *c = lookup_commit(repo, oid); + return c && !(c->object.flags & UNINTERESTING); + } + + if (type == OBJ_TREE) { + struct tree *t = lookup_tree(repo, oid); + return t && !(t->object.flags & UNINTERESTING); + } + + if (type == OBJ_BLOB) { + struct blob *b = lookup_blob(repo, oid); + return b && !(b->object.flags & UNINTERESTING); + } + + return 0; +} + +static int add_objects_by_path(const char *path, + struct oid_array *oids, + enum object_type type, + void *data) +{ + struct object_entry **delta_list; + size_t oe_start = to_pack.nr_objects; + size_t oe_end; + unsigned int sub_list_size; + unsigned int *processed = data; + + /* + * First, add all objects to the packing data, including the ones + * marked UNINTERESTING (translated to 'exclude') as they can be + * used as delta bases. + */ + for (size_t i = 0; i < oids->nr; i++) { + struct object_id *oid = &oids->oid[i]; + int exclude = !is_oid_interesting(the_repository, oid, type); + add_object_entry(oid, type, path, exclude); + } + + oe_end = to_pack.nr_objects; + + /* We can skip delta calculations if it is a no-op. */ + if (oe_end == oe_start || !window) + return 0; + + sub_list_size = 0; + ALLOC_ARRAY(delta_list, oe_end - oe_start); + + for (size_t i = 0; i < oe_end - oe_start; i++) { + struct object_entry *entry = to_pack.objects + oe_start + i; + + if (!should_attempt_deltas(entry)) + continue; + + delta_list[sub_list_size++] = entry; + } + + /* + * Find delta bases among this list of objects that all match the same + * path. This causes the delta compression to be interleaved in the + * object walk, which can lead to confusing progress indicators. This is + * also incompatible with threaded delta calculations. In the future, + * consider creating a list of regions in the full to_pack.objects array + * that could be picked up by the threaded delta computation. + */ + if (sub_list_size && window) { + QSORT(delta_list, sub_list_size, type_size_sort); + find_deltas(delta_list, &sub_list_size, window, depth, processed); + } + + free(delta_list); + return 0; +} + +static void get_object_list_path_walk(struct rev_info *revs) +{ + struct path_walk_info info = PATH_WALK_INFO_INIT; + unsigned int processed = 0; + + info.revs = revs; + + info.revs->tag_objects = 1; + info.tags = 1; + info.commits = 1; + info.trees = 1; + info.blobs = 1; + info.path_fn = add_objects_by_path; + info.path_fn_data = &processed; + + /* + * Allow the --[no-]sparse option to be interesting here, if only + * for testing purposes. Paths with no interesting objects will not + * contribute to the resulting pack, but only create noisy preferred + * base objects. + */ + info.prune_all_uninteresting = sparse; + + if (walk_objects_by_path(&info)) + die(_("failed to pack objects via path-walk")); +} + static void get_object_list(struct rev_info *revs, int ac, const char **av) { struct setup_revision_opt s_r_opt = { @@ -4156,7 +4340,7 @@ static void get_object_list(struct rev_info *revs, int ac, const char **av) warn_on_object_refname_ambiguity = save_warning; - if (use_bitmap_index && !get_object_list_from_bitmap(revs)) + if (use_bitmap_index && !path_walk && !get_object_list_from_bitmap(revs)) return; if (use_delta_islands) @@ -4165,15 +4349,19 @@ static void get_object_list(struct rev_info *revs, int ac, const char **av) if (write_bitmap_index) mark_bitmap_preferred_tips(); - if (prepare_revision_walk(revs)) - die(_("revision walk setup failed")); - mark_edges_uninteresting(revs, show_edge, sparse); - if (!fn_show_object) fn_show_object = show_object; - traverse_commit_list(revs, - show_commit, fn_show_object, - NULL); + + if (path_walk) { + get_object_list_path_walk(revs); + } else { + if (prepare_revision_walk(revs)) + die(_("revision walk setup failed")); + mark_edges_uninteresting(revs, show_edge, sparse); + traverse_commit_list(revs, + show_commit, fn_show_object, + NULL); + } if (unpack_unreachable_expiration) { revs->ignore_missing_links = 1; @@ -4296,6 +4484,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) struct string_list keep_pack_list = STRING_LIST_INIT_NODUP; struct list_objects_filter_options filter_options = LIST_OBJECTS_FILTER_INIT; + const char *delta_file_name = NULL; struct option pack_objects_options[] = { OPT_CALLBACK_F('q', "quiet", &progress, NULL, @@ -4368,6 +4557,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) N_("use the sparse reachability algorithm")), OPT_BOOL(0, "thin", &thin, N_("create thin packs")), + OPT_BOOL(0, "path-walk", &path_walk, + N_("use the path-walk API to walk objects when possible")), OPT_BOOL(0, "shallow", &shallow, N_("create packs suitable for shallow fetches")), OPT_BOOL(0, "honor-pack-keep", &ignore_packed_keep_on_disk, @@ -4398,6 +4589,11 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) OPT_STRING_LIST(0, "uri-protocol", &uri_protocols, N_("protocol"), N_("exclude any configured uploadpack.blobpackfileuri with this protocol")), + OPT_BOOL(0, "full-name-hash", &use_full_name_hash, + N_("optimize delta compression across identical path names over time")), + OPT_STRING(0, "delta-file", &delta_file_name, + N_("filename"), + N_("output delta compression details to the given file")), OPT_END(), }; @@ -4406,11 +4602,14 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) disable_replace_refs(); + path_walk = git_env_bool("GIT_TEST_PACK_PATH_WALK", -1); sparse = git_env_bool("GIT_TEST_PACK_SPARSE", -1); if (the_repository->gitdir) { prepare_repo_settings(the_repository); if (sparse < 0) sparse = the_repository->settings.pack_use_sparse; + if (path_walk < 0) + path_walk = the_repository->settings.pack_use_path_walk; if (the_repository->settings.pack_use_multi_pack_reuse) allow_pack_reuse = MULTI_PACK_REUSE; } @@ -4432,6 +4631,12 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) if (pack_to_stdout != !base_name || argc) usage_with_options(pack_usage, pack_objects_options); + if (delta_file_name) { + delta_file = fopen(delta_file_name, "w"); + if (!delta_file) + die_errno("failed to open '%s'", delta_file_name); + trace2_printf("opened '%s' for writing deltas", delta_file_name); + } if (depth < 0) depth = 0; if (depth >= (1 << OE_DEPTH_BITS)) { @@ -4448,7 +4653,19 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) window = 0; strvec_push(&rp, "pack-objects"); - if (thin) { + + if (path_walk && filter_options.choice) { + warning(_("cannot use --filter with --path-walk")); + path_walk = 0; + } + if (path_walk) { + strvec_push(&rp, "--boundary"); + /* + * We must disable the bitmaps because we are removing + * the --objects / --objects-edge[-aggressive] options. + */ + use_bitmap_index = 0; + } else if (thin) { use_internal_rev_list = 1; strvec_push(&rp, shallow ? "--objects-edge-aggressive" @@ -4643,5 +4860,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) list_objects_filter_release(&filter_options); strvec_clear(&rp); + if (delta_file) { + fflush(delta_file); + fclose(delta_file); + } + return 0; } diff --git a/builtin/repack.c b/builtin/repack.c index 62cfa50c50f893..a1ab103e62d0ce 100644 --- a/builtin/repack.c +++ b/builtin/repack.c @@ -57,6 +57,8 @@ struct pack_objects_args { int no_reuse_object; int quiet; int local; + int path_walk; + int full_name_hash; struct list_objects_filter_options filter_options; }; @@ -288,6 +290,10 @@ static void prepare_pack_objects(struct child_process *cmd, strvec_pushf(&cmd->args, "--no-reuse-delta"); if (args->no_reuse_object) strvec_pushf(&cmd->args, "--no-reuse-object"); + if (args->path_walk) + strvec_pushf(&cmd->args, "--path-walk"); + if (args->full_name_hash) + strvec_pushf(&cmd->args, "--full-name-hash"); if (args->local) strvec_push(&cmd->args, "--local"); if (args->quiet) @@ -1158,6 +1164,10 @@ int cmd_repack(int argc, const char **argv, const char *prefix) N_("pass --no-reuse-delta to git-pack-objects")), OPT_BOOL('F', NULL, &po_args.no_reuse_object, N_("pass --no-reuse-object to git-pack-objects")), + OPT_BOOL(0, "path-walk", &po_args.path_walk, + N_("pass --path-walk to git-pack-objects")), + OPT_BOOL(0, "full-name-hash", &po_args.full_name_hash, + N_("pass --full-name-hash to git-pack-objects")), OPT_NEGBIT('n', NULL, &run_update_server_info, N_("do not run git-update-server-info"), 1), OPT__QUIET(&po_args.quiet, N_("be quiet")), diff --git a/builtin/survey.c b/builtin/survey.c new file mode 100644 index 00000000000000..90b041967c887e --- /dev/null +++ b/builtin/survey.c @@ -0,0 +1,965 @@ +#include "builtin.h" +#include "config.h" +#include "environment.h" +#include "hex.h" +#include "object.h" +#include "object-name.h" +#include "object-store-ll.h" +#include "parse-options.h" +#include "path-walk.h" +#include "progress.h" +#include "ref-filter.h" +#include "refs.h" +#include "revision.h" +#include "strbuf.h" +#include "strvec.h" +#include "tag.h" +#include "trace2.h" + +static const char * const survey_usage[] = { + N_("(EXPERIMENTAL!) git survey "), + NULL, +}; + +struct survey_refs_wanted { + int want_all_refs; /* special override */ + + int want_branches; + int want_tags; + int want_remotes; + int want_detached; + int want_other; /* see FILTER_REFS_OTHERS -- refs/notes/, refs/stash/ */ +}; + +static struct survey_refs_wanted default_ref_options = { + .want_all_refs = 1, +}; + +enum survey_format { + SURVEY_PLAINTEXT = 0, + SURVEY_JSON = 1, +}; + +struct survey_opts { + int verbose; + int show_progress; + struct survey_refs_wanted refs; + enum survey_format format; +}; + +struct survey_report_ref_summary { + size_t refs_nr; + size_t branches_nr; + size_t remote_refs_nr; + size_t tags_nr; + size_t tags_annotated_nr; + size_t others_nr; + size_t unknown_nr; +}; + +struct survey_report_object_summary { + size_t commits_nr; + size_t tags_nr; + size_t trees_nr; + size_t blobs_nr; +}; + +/** + * For some category given by 'label', count the number of objects + * that match that label along with the on-disk size and the size + * after decompressing (both with delta bases and zlib). + */ +struct survey_report_object_size_summary { + char *label; + size_t nr; + size_t disk_size; + size_t inflated_size; + size_t num_missing; +}; + +typedef int (*survey_top_size_cmp)(struct survey_report_object_size_summary *s1, + struct survey_report_object_size_summary *s2); + +static int cmp_by_nr(struct survey_report_object_size_summary *s1, + struct survey_report_object_size_summary *s2) +{ + if (s1->nr < s2->nr) + return -1; + if (s1->nr > s2->nr) + return 1; + return 0; +} + +static int cmp_by_disk_size(struct survey_report_object_size_summary *s1, + struct survey_report_object_size_summary *s2) +{ + if (s1->disk_size < s2->disk_size) + return -1; + if (s1->disk_size > s2->disk_size) + return 1; + return 0; +} + +static int cmp_by_inflated_size(struct survey_report_object_size_summary *s1, + struct survey_report_object_size_summary *s2) +{ + if (s1->inflated_size < s2->inflated_size) + return -1; + if (s1->inflated_size > s2->inflated_size) + return 1; + return 0; +} + +/** + * Store a list of "top" categories by some sorting function. When + * inserting a new category, reorder the list and free the one that + * got ejected (if any). + */ +struct survey_report_top_sizes { + const char *name; + survey_top_size_cmp cmp_fn; + struct survey_report_object_size_summary *data; + size_t nr; + size_t alloc; +}; + +static void init_top_sizes(struct survey_report_top_sizes *top, + size_t limit, const char *name, + survey_top_size_cmp cmp) +{ + top->name = name; + top->alloc = limit; + top->nr = 0; + CALLOC_ARRAY(top->data, limit); + top->cmp_fn = cmp; +} + +MAYBE_UNUSED +static void clear_top_sizes(struct survey_report_top_sizes *top) +{ + for (size_t i = 0; i < top->nr; i++) + free(top->data[i].label); + free(top->data); +} + +static void maybe_insert_into_top_size(struct survey_report_top_sizes *top, + struct survey_report_object_size_summary *summary) +{ + size_t pos = top->nr; + + /* Compare against list from the bottom. */ + while (pos > 0 && top->cmp_fn(&top->data[pos - 1], summary) < 0) + pos--; + + /* Not big enough! */ + if (pos >= top->alloc) + return; + + /* We need to shift the data. */ + if (top->nr == top->alloc) + free(top->data[top->nr - 1].label); + else + top->nr++; + + for (size_t i = top->nr - 1; i > pos; i--) + memcpy(&top->data[i], &top->data[i - 1], sizeof(*top->data)); + + memcpy(&top->data[pos], summary, sizeof(*summary)); + top->data[pos].label = xstrdup(summary->label); +} + +/** + * This struct contains all of the information that needs to be printed + * at the end of the exploration of the repository and its references. + */ +struct survey_report { + struct survey_report_ref_summary refs; + struct survey_report_object_summary reachable_objects; + + struct survey_report_object_size_summary *by_type; + + struct survey_report_top_sizes *top_paths_by_count; + struct survey_report_top_sizes *top_paths_by_disk; + struct survey_report_top_sizes *top_paths_by_inflate; +}; + +#define REPORT_TYPE_COMMIT 0 +#define REPORT_TYPE_TREE 1 +#define REPORT_TYPE_BLOB 2 +#define REPORT_TYPE_COUNT 3 + +struct survey_context { + /* Options that control what is done. */ + struct survey_opts opts; + + /* Info for output only. */ + struct survey_report report; + + /* + * The rest of the members are about enabling the activity + * of the 'git survey' command, including ref listings, object + * pointers, and progress. + */ + + struct repository *repo; + + struct progress *progress; + size_t progress_nr; + size_t progress_total; + + struct strvec refs; + struct ref_array ref_array; +}; + +static void clear_survey_context(struct survey_context *ctx) +{ + ref_array_clear(&ctx->ref_array); + strvec_clear(&ctx->refs); +} + +struct survey_table { + const char *table_name; + struct strvec header; + struct strvec *rows; + size_t rows_nr; + size_t rows_alloc; +}; + +#define SURVEY_TABLE_INIT { \ + .header = STRVEC_INIT, \ +} + +static void clear_table(struct survey_table *table) +{ + strvec_clear(&table->header); + for (size_t i = 0; i < table->rows_nr; i++) + strvec_clear(&table->rows[i]); + free(table->rows); +} + +static void insert_table_rowv(struct survey_table *table, ...) +{ + va_list ap; + char *arg; + ALLOC_GROW(table->rows, table->rows_nr + 1, table->rows_alloc); + + memset(&table->rows[table->rows_nr], 0, sizeof(struct strvec)); + + va_start(ap, table); + while ((arg = va_arg(ap, char *))) + strvec_push(&table->rows[table->rows_nr], arg); + va_end(ap); + + table->rows_nr++; +} + +static void print_table_title(const char *name, size_t *widths, size_t nr) +{ + static struct strbuf lines = STRBUF_INIT; + size_t width = 0; + size_t min_width; + strbuf_setlen(&lines, 0); + + strbuf_addch(&lines, '\n'); + strbuf_addstr(&lines, name); + min_width = lines.len - 1; + strbuf_addch(&lines, '\n'); + + for (size_t i = 0; i < nr; i++) { + if (i) + width += 3; + width += widths[i]; + } + + if (width < min_width) + width = min_width; + + strbuf_addchars(&lines, '=', width); + printf("%s\n", lines.buf); +} + +static void print_row_plaintext(struct strvec *row, size_t *widths) +{ + static struct strbuf line = STRBUF_INIT; + strbuf_setlen(&line, 0); + + for (size_t i = 0; i < row->nr; i++) { + const char *str = row->v[i]; + size_t len = strlen(str); + if (i) + strbuf_add(&line, " | ", 3); + strbuf_addchars(&line, ' ', widths[i] - len); + strbuf_add(&line, str, len); + } + printf("%s\n", line.buf); +} + +static void print_divider_plaintext(size_t *widths, size_t nr) +{ + static struct strbuf line = STRBUF_INIT; + strbuf_setlen(&line, 0); + + for (size_t i = 0; i < nr; i++) { + if (i) + strbuf_add(&line, "-+-", 3); + strbuf_addchars(&line, '-', widths[i]); + } + printf("%s\n", line.buf); +} + +static void print_table_plaintext(struct survey_table *table) +{ + size_t *column_widths; + size_t columns_nr = table->header.nr; + CALLOC_ARRAY(column_widths, columns_nr); + + for (size_t i = 0; i < columns_nr; i++) { + column_widths[i] = strlen(table->header.v[i]); + + for (size_t j = 0; j < table->rows_nr; j++) { + size_t rowlen = strlen(table->rows[j].v[i]); + if (column_widths[i] < rowlen) + column_widths[i] = rowlen; + } + } + + print_table_title(table->table_name, column_widths, columns_nr); + print_row_plaintext(&table->header, column_widths); + print_divider_plaintext(column_widths, columns_nr); + + for (size_t j = 0; j < table->rows_nr; j++) + print_row_plaintext(&table->rows[j], column_widths); +} + +static void survey_report_plaintext_refs(struct survey_context *ctx) +{ + struct survey_report_ref_summary *refs = &ctx->report.refs; + struct survey_table table = SURVEY_TABLE_INIT; + + table.table_name = _("REFERENCES SUMMARY"); + + strvec_push(&table.header, _("Ref Type")); + strvec_push(&table.header, _("Count")); + + if (ctx->opts.refs.want_all_refs || ctx->opts.refs.want_branches) { + char *fmt = xstrfmt("%"PRIuMAX"", refs->branches_nr); + insert_table_rowv(&table, _("Branches"), fmt, NULL); + free(fmt); + } + + if (ctx->opts.refs.want_all_refs || ctx->opts.refs.want_remotes) { + char *fmt = xstrfmt("%"PRIuMAX"", refs->remote_refs_nr); + insert_table_rowv(&table, _("Remote refs"), fmt, NULL); + free(fmt); + } + + if (ctx->opts.refs.want_all_refs || ctx->opts.refs.want_tags) { + char *fmt = xstrfmt("%"PRIuMAX"", refs->tags_nr); + insert_table_rowv(&table, _("Tags (all)"), fmt, NULL); + free(fmt); + fmt = xstrfmt("%"PRIuMAX"", refs->tags_annotated_nr); + insert_table_rowv(&table, _("Tags (annotated)"), fmt, NULL); + free(fmt); + } + + print_table_plaintext(&table); + clear_table(&table); +} + +static void survey_report_plaintext_reachable_object_summary(struct survey_context *ctx) +{ + struct survey_report_object_summary *objs = &ctx->report.reachable_objects; + struct survey_table table = SURVEY_TABLE_INIT; + char *fmt; + + table.table_name = _("REACHABLE OBJECT SUMMARY"); + + strvec_push(&table.header, _("Object Type")); + strvec_push(&table.header, _("Count")); + + fmt = xstrfmt("%"PRIuMAX"", objs->tags_nr); + insert_table_rowv(&table, _("Tags"), fmt, NULL); + free(fmt); + + fmt = xstrfmt("%"PRIuMAX"", objs->commits_nr); + insert_table_rowv(&table, _("Commits"), fmt, NULL); + free(fmt); + + fmt = xstrfmt("%"PRIuMAX"", objs->trees_nr); + insert_table_rowv(&table, _("Trees"), fmt, NULL); + free(fmt); + + fmt = xstrfmt("%"PRIuMAX"", objs->blobs_nr); + insert_table_rowv(&table, _("Blobs"), fmt, NULL); + free(fmt); + + print_table_plaintext(&table); + clear_table(&table); +} + +static void survey_report_object_sizes(const char *title, + const char *categories, + struct survey_report_object_size_summary *summary, + size_t summary_nr) +{ + struct survey_table table = SURVEY_TABLE_INIT; + table.table_name = title; + + strvec_push(&table.header, xstrdup(categories)); + strvec_push(&table.header, xstrdup(_("Count"))); + strvec_push(&table.header, xstrdup(_("Disk Size"))); + strvec_push(&table.header, xstrdup(_("Inflated Size"))); + + for (size_t i = 0; i < summary_nr; i++) { + insert_table_rowv(&table, xstrdup(summary[i].label), + xstrfmt("%"PRIuMAX, summary[i].nr), + xstrfmt("%"PRIuMAX, summary[i].disk_size), + xstrfmt("%"PRIuMAX, summary[i].inflated_size), + NULL); + } + + print_table_plaintext(&table); + clear_table(&table); +} + +static void survey_report_plaintext_sorted_size( + struct survey_report_top_sizes *top) +{ + survey_report_object_sizes(top->name, _("Path"), + top->data, top->nr); +} + +static void survey_report_plaintext(struct survey_context *ctx) +{ + printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree); + printf("-----------------------------------------------------\n"); + survey_report_plaintext_refs(ctx); + survey_report_plaintext_reachable_object_summary(ctx); + survey_report_object_sizes(_("TOTAL OBJECT SIZES BY TYPE"), + _("Object Type"), + ctx->report.by_type, + REPORT_TYPE_COUNT); + + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_count[REPORT_TYPE_TREE]); + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_count[REPORT_TYPE_BLOB]); + + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_disk[REPORT_TYPE_TREE]); + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB]); + + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE]); + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB]); +} + +static void survey_report_json(struct survey_context *ctx) +{ + /* TODO. */ +} + +/* + * After parsing the command line arguments, figure out which refs we + * should scan. + * + * If ANY were given in positive sense, then we ONLY include them and + * do not use the builtin values. + */ +static void fixup_refs_wanted(struct survey_context *ctx) +{ + struct survey_refs_wanted *rw = &ctx->opts.refs; + + /* + * `--all-refs` overrides and enables everything. + */ + if (rw->want_all_refs == 1) { + rw->want_branches = 1; + rw->want_tags = 1; + rw->want_remotes = 1; + rw->want_detached = 1; + rw->want_other = 1; + return; + } + + /* + * If none of the `--` were given, we assume all + * of the builtin unspecified values. + */ + if (rw->want_branches == -1 && + rw->want_tags == -1 && + rw->want_remotes == -1 && + rw->want_detached == -1 && + rw->want_other == -1) { + *rw = default_ref_options; + return; + } + + /* + * Since we only allow positive boolean values on the command + * line, we will only have true values where they specified + * a `--`. + * + * So anything that still has an unspecified value should be + * set to false. + */ + if (rw->want_branches == -1) + rw->want_branches = 0; + if (rw->want_tags == -1) + rw->want_tags = 0; + if (rw->want_remotes == -1) + rw->want_remotes = 0; + if (rw->want_detached == -1) + rw->want_detached = 0; + if (rw->want_other == -1) + rw->want_other = 0; +} + +static int survey_load_config_cb(const char *var, const char *value, + const struct config_context *cctx, void *pvoid) +{ + struct survey_context *sctx = pvoid; + if (!strcmp(var, "survey.verbose")) { + sctx->opts.verbose = git_config_bool(var, value); + return 0; + } + if (!strcmp(var, "survey.progress")) { + sctx->opts.show_progress = git_config_bool(var, value); + return 0; + } + + return git_default_config(var, value, cctx, pvoid); +} + +static void survey_load_config(struct survey_context *ctx) +{ + git_config(survey_load_config_cb, ctx); +} + +static void do_load_refs(struct survey_context *ctx, + struct ref_array *ref_array) +{ + struct ref_filter filter = REF_FILTER_INIT; + struct ref_sorting *sorting; + struct string_list sorting_options = STRING_LIST_INIT_DUP; + + string_list_append(&sorting_options, "objectname"); + sorting = ref_sorting_options(&sorting_options); + + if (ctx->opts.refs.want_detached) + strvec_push(&ctx->refs, "HEAD"); + + if (ctx->opts.refs.want_all_refs) { + strvec_push(&ctx->refs, "refs/"); + } else { + if (ctx->opts.refs.want_branches) + strvec_push(&ctx->refs, "refs/heads/"); + if (ctx->opts.refs.want_tags) + strvec_push(&ctx->refs, "refs/tags/"); + if (ctx->opts.refs.want_remotes) + strvec_push(&ctx->refs, "refs/remotes/"); + if (ctx->opts.refs.want_other) { + strvec_push(&ctx->refs, "refs/notes/"); + strvec_push(&ctx->refs, "refs/stash/"); + } + } + + filter.name_patterns = ctx->refs.v; + filter.ignore_case = 0; + filter.match_as_path = 1; + + if (ctx->opts.show_progress) { + ctx->progress_total = 0; + ctx->progress = start_progress(_("Scanning refs..."), 0); + } + + filter_refs(ref_array, &filter, FILTER_REFS_KIND_MASK); + + if (ctx->opts.show_progress) { + ctx->progress_total = ref_array->nr; + display_progress(ctx->progress, ctx->progress_total); + } + + ref_array_sort(sorting, ref_array); + + stop_progress(&ctx->progress); + ref_filter_clear(&filter); + ref_sorting_release(sorting); +} + +/* + * The REFS phase: + * + * Load the set of requested refs and assess them for scalablity problems. + * Use that set to start a treewalk to all reachable objects and assess + * them. + * + * This data will give us insights into the repository itself (the number + * of refs, the size and shape of the DAG, the number and size of the + * objects). + * + * Theoretically, this data is independent of the on-disk representation + * (e.g. independent of packing concerns). + */ +static void survey_phase_refs(struct survey_context *ctx) +{ + trace2_region_enter("survey", "phase/refs", ctx->repo); + do_load_refs(ctx, &ctx->ref_array); + + ctx->report.refs.refs_nr = ctx->ref_array.nr; + for (size_t i = 0; i < ctx->ref_array.nr; i++) { + size_t size; + struct ref_array_item *item = ctx->ref_array.items[i]; + + switch (item->kind) { + case FILTER_REFS_TAGS: + ctx->report.refs.tags_nr++; + if (oid_object_info(ctx->repo, + &item->objectname, + &size) == OBJ_TAG) + ctx->report.refs.tags_annotated_nr++; + break; + + case FILTER_REFS_BRANCHES: + ctx->report.refs.branches_nr++; + break; + + case FILTER_REFS_REMOTES: + ctx->report.refs.remote_refs_nr++; + break; + + case FILTER_REFS_OTHERS: + ctx->report.refs.others_nr++; + break; + + default: + ctx->report.refs.unknown_nr++; + break; + } + } + + trace2_region_leave("survey", "phase/refs", ctx->repo); +} + +static void increment_object_counts( + struct survey_report_object_summary *summary, + enum object_type type, + size_t nr) +{ + switch (type) { + case OBJ_COMMIT: + summary->commits_nr += nr; + break; + + case OBJ_TREE: + summary->trees_nr += nr; + break; + + case OBJ_BLOB: + summary->blobs_nr += nr; + break; + + default: + break; + } +} + +static void increment_totals(struct survey_context *ctx, + struct oid_array *oids, + struct survey_report_object_size_summary *summary) +{ + for (size_t i = 0; i < oids->nr; i++) { + struct object_info oi = OBJECT_INFO_INIT; + unsigned oi_flags = OBJECT_INFO_FOR_PREFETCH; + unsigned long object_length = 0; + off_t disk_sizep = 0; + enum object_type type; + + oi.typep = &type; + oi.sizep = &object_length; + oi.disk_sizep = &disk_sizep; + + if (oid_object_info_extended(ctx->repo, &oids->oid[i], + &oi, oi_flags) < 0) { + summary->num_missing++; + } else { + summary->nr++; + summary->disk_size += disk_sizep; + summary->inflated_size += object_length; + } + } +} + +static void increment_object_totals(struct survey_context *ctx, + struct oid_array *oids, + enum object_type type, + const char *path) +{ + struct survey_report_object_size_summary *total; + struct survey_report_object_size_summary summary = { 0 }; + + increment_totals(ctx, oids, &summary); + + switch (type) { + case OBJ_COMMIT: + total = &ctx->report.by_type[REPORT_TYPE_COMMIT]; + break; + + case OBJ_TREE: + total = &ctx->report.by_type[REPORT_TYPE_TREE]; + break; + + case OBJ_BLOB: + total = &ctx->report.by_type[REPORT_TYPE_BLOB]; + break; + + default: + BUG("No other type allowed"); + } + + total->nr += summary.nr; + total->disk_size += summary.disk_size; + total->inflated_size += summary.inflated_size; + total->num_missing += summary.num_missing; + + if (type == OBJ_TREE || type == OBJ_BLOB) { + int index = type == OBJ_TREE ? + REPORT_TYPE_TREE : REPORT_TYPE_BLOB; + struct survey_report_top_sizes *top; + + /* + * Temporarily store (const char *) here, but it will + * be duped if inserted and will not be freed. + */ + summary.label = (char *)path; + + top = ctx->report.top_paths_by_count; + maybe_insert_into_top_size(&top[index], &summary); + + top = ctx->report.top_paths_by_disk; + maybe_insert_into_top_size(&top[index], &summary); + + top = ctx->report.top_paths_by_inflate; + maybe_insert_into_top_size(&top[index], &summary); + } +} + +static int survey_objects_path_walk_fn(const char *path, + struct oid_array *oids, + enum object_type type, + void *data) +{ + struct survey_context *ctx = data; + + increment_object_counts(&ctx->report.reachable_objects, + type, oids->nr); + increment_object_totals(ctx, oids, type, path); + + ctx->progress_nr += oids->nr; + display_progress(ctx->progress, ctx->progress_nr); + + return 0; +} + +static int iterate_tag_chain(struct survey_context *ctx, + struct object_id *oid, + struct object_id *peeled) +{ + struct object *o = lookup_unknown_object(ctx->repo, oid); + struct tag *t; + + if (o->type != OBJ_TAG) { + oidcpy(peeled, &o->oid); + return o->type != OBJ_COMMIT; + } + + t = lookup_tag(ctx->repo, oid); + while (t) { + parse_tag(t); + ctx->report.reachable_objects.tags_nr++; + + if (!t->tagged) + break; + + o = lookup_unknown_object(ctx->repo, &t->tagged->oid); + if (o && o->type == OBJ_TAG) + t = lookup_tag(ctx->repo, &t->tagged->oid); + else + break; + } + + if (!t || !t->tagged) + return -1; + + oidcpy(peeled, &t->tagged->oid); + o = lookup_unknown_object(ctx->repo, peeled); + if (o && o->type == OBJ_COMMIT) + return 0; + return -1; +} + +static void initialize_report(struct survey_context *ctx) +{ + const int top_limit = 100; + + CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT); + ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits")); + ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees")); + ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs")); + + CALLOC_ARRAY(ctx->report.top_paths_by_count, REPORT_TYPE_COUNT); + init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_TREE], + top_limit, _("TOP DIRECTORIES BY COUNT"), cmp_by_nr); + init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB], + top_limit, _("TOP FILES BY COUNT"), cmp_by_nr); + + CALLOC_ARRAY(ctx->report.top_paths_by_disk, REPORT_TYPE_COUNT); + init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE], + top_limit, _("TOP DIRECTORIES BY DISK SIZE"), cmp_by_disk_size); + init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB], + top_limit, _("TOP FILES BY DISK SIZE"), cmp_by_disk_size); + + CALLOC_ARRAY(ctx->report.top_paths_by_inflate, REPORT_TYPE_COUNT); + init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE], + top_limit, _("TOP DIRECTORIES BY INFLATED SIZE"), cmp_by_inflated_size); + init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB], + top_limit, _("TOP FILES BY INFLATED SIZE"), cmp_by_inflated_size); +} + +static void survey_phase_objects(struct survey_context *ctx) +{ + struct rev_info revs = REV_INFO_INIT; + struct path_walk_info info = PATH_WALK_INFO_INIT; + unsigned int add_flags = 0; + + trace2_region_enter("survey", "phase/objects", ctx->repo); + + info.revs = &revs; + info.path_fn = survey_objects_path_walk_fn; + info.path_fn_data = ctx; + + info.commits = 1; + info.trees = 1; + info.blobs = 1; + info.tags = 1; + + initialize_report(ctx); + + repo_init_revisions(ctx->repo, &revs, ""); + + ctx->progress_nr = 0; + ctx->progress_total = ctx->ref_array.nr; + if (ctx->opts.show_progress) + ctx->progress = start_progress(_("Preparing object walk"), + ctx->progress_total); + for (size_t i = 0; i < ctx->ref_array.nr; i++) { + struct ref_array_item *item = ctx->ref_array.items[i]; + struct object_id peeled; + + switch (item->kind) { + case FILTER_REFS_TAGS: + if (!iterate_tag_chain(ctx, &item->objectname, &peeled)) + add_pending_oid(&revs, NULL, &peeled, add_flags); + break; + case FILTER_REFS_BRANCHES: + add_pending_oid(&revs, NULL, &item->objectname, add_flags); + break; + case FILTER_REFS_REMOTES: + add_pending_oid(&revs, NULL, &item->objectname, add_flags); + break; + case FILTER_REFS_OTHERS: + /* + * This may be a note, stash, or custom namespace branch. + */ + add_pending_oid(&revs, NULL, &item->objectname, add_flags); + break; + case FILTER_REFS_DETACHED_HEAD: + add_pending_oid(&revs, NULL, &item->objectname, add_flags); + break; + default: + break; + } + + display_progress(ctx->progress, ++(ctx->progress_nr)); + } + stop_progress(&ctx->progress); + + ctx->progress_nr = 0; + ctx->progress_total = 0; + if (ctx->opts.show_progress) + ctx->progress = start_progress(_("Walking objects"), 0); + walk_objects_by_path(&info); + stop_progress(&ctx->progress); + + release_revisions(&revs); + trace2_region_leave("survey", "phase/objects", ctx->repo); +} + +int cmd_survey(int argc, const char **argv, const char *prefix) +{ + static struct survey_context ctx = { + .opts = { + .verbose = 0, + .show_progress = -1, /* defaults to isatty(2) */ + + .refs.want_all_refs = -1, + + .refs.want_branches = -1, /* default these to undefined */ + .refs.want_tags = -1, + .refs.want_remotes = -1, + .refs.want_detached = -1, + .refs.want_other = -1, + }, + .refs = STRVEC_INIT, + }; + + static struct option survey_options[] = { + OPT__VERBOSE(&ctx.opts.verbose, N_("verbose output")), + OPT_BOOL(0, "progress", &ctx.opts.show_progress, N_("show progress")), + + OPT_BOOL_F(0, "all-refs", &ctx.opts.refs.want_all_refs, N_("include all refs"), PARSE_OPT_NONEG), + + OPT_BOOL_F(0, "branches", &ctx.opts.refs.want_branches, N_("include branches"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "tags", &ctx.opts.refs.want_tags, N_("include tags"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "remotes", &ctx.opts.refs.want_remotes, N_("include all remotes refs"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "detached", &ctx.opts.refs.want_detached, N_("include detached HEAD"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "other", &ctx.opts.refs.want_other, N_("include notes and stashes"), PARSE_OPT_NONEG), + + OPT_END(), + }; + + if (argc == 2 && !strcmp(argv[1], "-h")) + usage_with_options(survey_usage, survey_options); + + ctx.repo = the_repository; + prepare_repo_settings(ctx.repo); + survey_load_config(&ctx); + + argc = parse_options(argc, argv, prefix, survey_options, survey_usage, 0); + + if (ctx.opts.show_progress < 0) + ctx.opts.show_progress = isatty(2); + fixup_refs_wanted(&ctx); + + survey_phase_refs(&ctx); + + survey_phase_objects(&ctx); + + switch (ctx.opts.format) { + case SURVEY_PLAINTEXT: + survey_report_plaintext(&ctx); + break; + + case SURVEY_JSON: + survey_report_json(&ctx); + break; + + default: + BUG("Undefined format"); + } + + clear_survey_context(&ctx); + return 0; +} diff --git a/command-list.txt b/command-list.txt index e0bb87b3b5c278..ecc9d2281a0909 100644 --- a/command-list.txt +++ b/command-list.txt @@ -60,6 +60,7 @@ git-annotate ancillaryinterrogators git-apply plumbingmanipulators complete git-archimport foreignscminterface git-archive mainporcelain +git-backfill mainporcelain history git-bisect mainporcelain info git-blame ancillaryinterrogators complete git-branch mainporcelain history @@ -186,6 +187,7 @@ git-stash mainporcelain git-status mainporcelain info git-stripspace purehelpers git-submodule mainporcelain +git-survey mainporcelain git-svn foreignscminterface git-switch mainporcelain history git-symbolic-ref plumbingmanipulators diff --git a/git.c b/git.c index 9a618a2740f195..98e90838e42fbf 100644 --- a/git.c +++ b/git.c @@ -509,6 +509,7 @@ static struct cmd_struct commands[] = { { "annotate", cmd_annotate, RUN_SETUP }, { "apply", cmd_apply, RUN_SETUP_GENTLY }, { "archive", cmd_archive, RUN_SETUP_GENTLY }, + { "backfill", cmd_backfill, RUN_SETUP }, { "bisect", cmd_bisect, RUN_SETUP }, { "blame", cmd_blame, RUN_SETUP }, { "branch", cmd_branch, RUN_SETUP | DELAY_PAGER_CONFIG }, @@ -629,6 +630,7 @@ static struct cmd_struct commands[] = { { "status", cmd_status, RUN_SETUP | NEED_WORK_TREE }, { "stripspace", cmd_stripspace }, { "submodule--helper", cmd_submodule__helper, RUN_SETUP }, + { "survey", cmd_survey, RUN_SETUP }, { "switch", cmd_switch, RUN_SETUP | NEED_WORK_TREE }, { "symbolic-ref", cmd_symbolic_ref, RUN_SETUP }, { "tag", cmd_tag, RUN_SETUP | DELAY_PAGER_CONFIG }, diff --git a/pack-objects.h b/pack-objects.h index b9898a4e64b8b4..50097552d03f20 100644 --- a/pack-objects.h +++ b/pack-objects.h @@ -207,6 +207,26 @@ static inline uint32_t pack_name_hash(const char *name) return hash; } +static inline uint32_t pack_full_name_hash(const char *name) +{ + const uint32_t bigp = 1234572167U; + uint32_t c, hash = bigp; + + if (!name) + return 0; + + /* + * Just do the dumbest thing possible: add random multiples of a + * large prime number with a binary shift. Goal is not cryptographic, + * but generally uniformly distributed. + */ + while ((c = *name++) != 0) { + hash += c * bigp; + hash = (hash >> 5) | (hash << 27); + } + return hash; +} + static inline enum object_type oe_type(const struct object_entry *e) { return e->type_valid ? e->type_ : OBJ_BAD; diff --git a/path-walk.c b/path-walk.c new file mode 100644 index 00000000000000..9391e0579aea4b --- /dev/null +++ b/path-walk.c @@ -0,0 +1,401 @@ +/* + * path-walk.c: implementation for path-based walks of the object graph. + */ +#include "git-compat-util.h" +#include "path-walk.h" +#include "blob.h" +#include "commit.h" +#include "dir.h" +#include "hashmap.h" +#include "hex.h" +#include "object.h" +#include "oid-array.h" +#include "repository.h" +#include "revision.h" +#include "string-list.h" +#include "strmap.h" +#include "tag.h" +#include "trace2.h" +#include "tree.h" +#include "tree-walk.h" + +struct type_and_oid_list +{ + enum object_type type; + struct oid_array oids; + int maybe_interesting; +}; + +#define TYPE_AND_OID_LIST_INIT { \ + .type = OBJ_NONE, \ + .oids = OID_ARRAY_INIT \ +} + +struct path_walk_context { + /** + * Repeats of data in 'struct path_walk_info' for + * access with fewer characters. + */ + struct repository *repo; + struct rev_info *revs; + struct path_walk_info *info; + + /** + * Map a path to a 'struct type_and_oid_list' + * containing the objects discovered at that + * path. + */ + struct strmap paths_to_lists; + + /** + * Store the current list of paths in a stack, to + * facilitate depth-first-search without recursion. + */ + struct string_list path_stack; +}; + +static int add_children(struct path_walk_context *ctx, + const char *base_path, + struct object_id *oid) +{ + struct tree_desc desc; + struct name_entry entry; + struct strbuf path = STRBUF_INIT; + size_t base_len; + struct tree *tree = lookup_tree(ctx->repo, oid); + + if (!tree) { + error(_("failed to walk children of tree %s: not found"), + oid_to_hex(oid)); + return -1; + } + + strbuf_addstr(&path, base_path); + base_len = path.len; + + parse_tree(tree); + init_tree_desc(&desc, &tree->object.oid, tree->buffer, tree->size); + while (tree_entry(&desc, &entry)) { + struct type_and_oid_list *list; + struct object *o; + /* Not actually true, but we will ignore submodules later. */ + enum object_type type = S_ISDIR(entry.mode) ? OBJ_TREE : OBJ_BLOB; + + /* Skip submodules. */ + if (S_ISGITLINK(entry.mode)) + continue; + + /* If the caller doesn't want blobs, then don't bother. */ + if (!ctx->info->blobs && type == OBJ_BLOB) + continue; + + if (type == OBJ_TREE) { + struct tree *child = lookup_tree(ctx->repo, &entry.oid); + o = child ? &child->object : NULL; + } else if (type == OBJ_BLOB) { + struct blob *child = lookup_blob(ctx->repo, &entry.oid); + o = child ? &child->object : NULL; + } else { + /* Wrong type? */ + continue; + } + + if (!o) /* report error?*/ + continue; + + /* Skip this object if already seen. */ + if (o->flags & SEEN) + continue; + o->flags |= SEEN; + + strbuf_setlen(&path, base_len); + strbuf_add(&path, entry.path, entry.pathlen); + + /* + * Trees will end with "/" for concatenation and distinction + * from blobs at the same path. + */ + if (type == OBJ_TREE) + strbuf_addch(&path, '/'); + + if (ctx->info->pl) { + int dtype; + enum pattern_match_result match; + match = path_matches_pattern_list(path.buf, path.len, + path.buf + base_len, &dtype, + ctx->info->pl, + ctx->repo->index); + + if (ctx->info->pl->use_cone_patterns && + match == NOT_MATCHED) + continue; + else if (!ctx->info->pl->use_cone_patterns && + type == OBJ_BLOB && + match != MATCHED) + continue; + } + + if (!(list = strmap_get(&ctx->paths_to_lists, path.buf))) { + CALLOC_ARRAY(list, 1); + list->type = type; + strmap_put(&ctx->paths_to_lists, path.buf, list); + string_list_append(&ctx->path_stack, path.buf); + + if (!(o->flags & UNINTERESTING)) + list->maybe_interesting = 1; + } + oid_array_append(&list->oids, &entry.oid); + } + + free_tree_buffer(tree); + strbuf_release(&path); + return 0; +} + +/* + * For each path in paths_to_explore, walk the trees another level + * and add any found blobs to the batch (but only if they don't + * exist and haven't been added yet). + */ +static int walk_path(struct path_walk_context *ctx, + const char *path) +{ + struct type_and_oid_list *list; + int ret = 0; + + list = strmap_get(&ctx->paths_to_lists, path); + + if (ctx->info->prune_all_uninteresting) { + /* + * This is true if all objects were UNINTERESTING + * when added to the list. + */ + if (!list->maybe_interesting) + return 0; + + /* + * But it's still possible that the objects were set + * as UNINTERESTING after being added. Do a quick check. + */ + list->maybe_interesting = 0; + for (size_t i = 0; + !list->maybe_interesting && i < list->oids.nr; + i++) { + if (list->type == OBJ_TREE) { + struct tree *t = lookup_tree(ctx->repo, + &list->oids.oid[i]); + if (t && !(t->object.flags & UNINTERESTING)) + list->maybe_interesting = 1; + } else { + struct blob *b = lookup_blob(ctx->repo, + &list->oids.oid[i]); + if (b && !(b->object.flags & UNINTERESTING)) + list->maybe_interesting = 1; + } + } + + /* We have confirmed that all objects are UNINTERESTING. */ + if (!list->maybe_interesting) + return 0; + } + + /* Evaluate function pointer on this data, if requested. */ + if ((list->type == OBJ_TREE && ctx->info->trees) || + (list->type == OBJ_BLOB && ctx->info->blobs)) + ret = ctx->info->path_fn(path, &list->oids, list->type, + ctx->info->path_fn_data); + + /* Expand data for children. */ + if (list->type == OBJ_TREE) { + for (size_t i = 0; i < list->oids.nr; i++) { + ret |= add_children(ctx, + path, + &list->oids.oid[i]); + } + } + + oid_array_clear(&list->oids); + strmap_remove(&ctx->paths_to_lists, path, 1); + return ret; +} + +static void clear_strmap(struct strmap *map) +{ + struct hashmap_iter iter; + struct strmap_entry *e; + + hashmap_for_each_entry(&map->map, &iter, e, ent) { + struct type_and_oid_list *list = e->value; + oid_array_clear(&list->oids); + } + strmap_clear(map, 1); + strmap_init(map); +} + +/** + * Given the configuration of 'info', walk the commits based on 'info->revs' and + * call 'info->path_fn' on each discovered path. + * + * Returns nonzero on an error. + */ +int walk_objects_by_path(struct path_walk_info *info) +{ + const char *root_path = ""; + int ret = 0, has_uninteresting = 0; + size_t commits_nr = 0, paths_nr = 0; + struct commit *c; + struct type_and_oid_list *root_tree_list; + struct type_and_oid_list *commit_list; + struct path_walk_context ctx = { + .repo = info->revs->repo, + .revs = info->revs, + .info = info, + .path_stack = STRING_LIST_INIT_DUP, + .paths_to_lists = STRMAP_INIT + }; + struct oidset root_tree_set = OIDSET_INIT; + + struct oid_array tagged_tree_list = OID_ARRAY_INIT; + struct oid_array tagged_blob_list = OID_ARRAY_INIT; + + trace2_region_enter("path-walk", "commit-walk", info->revs->repo); + + CALLOC_ARRAY(commit_list, 1); + commit_list->type = OBJ_COMMIT; + + /* Insert a single list for the root tree into the paths. */ + CALLOC_ARRAY(root_tree_list, 1); + root_tree_list->type = OBJ_TREE; + root_tree_list->maybe_interesting = 1; + strmap_put(&ctx.paths_to_lists, root_path, root_tree_list); + + if (prepare_revision_walk(info->revs)) + die(_("failed to setup revision walk")); + + while ((c = get_revision(info->revs))) { + struct object_id *oid; + struct tree *t; + commits_nr++; + + if (info->commits) + oid_array_append(&commit_list->oids, + &c->object.oid); + + /* If we only care about commits, then skip trees. */ + if (!info->trees && !info->blobs) + continue; + + oid = get_commit_tree_oid(c); + t = lookup_tree(info->revs->repo, oid); + + if (t) { + oidset_insert(&root_tree_set, oid); + oid_array_append(&root_tree_list->oids, oid); + } else { + warning("could not find tree %s", oid_to_hex(oid)); + } + + if (t && (c->object.flags & UNINTERESTING)) { + t->object.flags |= UNINTERESTING; + has_uninteresting = 1; + } + } + + trace2_data_intmax("path-walk", ctx.repo, "commits", commits_nr); + trace2_region_leave("path-walk", "commit-walk", info->revs->repo); + + /* Track all commits. */ + if (info->commits) + ret = info->path_fn("initial", &commit_list->oids, OBJ_COMMIT, + info->path_fn_data); + oid_array_clear(&commit_list->oids); + free(commit_list); + + if (info->tags) { + struct oid_array tags = OID_ARRAY_INIT; + + trace2_region_enter("path-walk", "tag-walk", info->revs->repo); + + /* + * Walk any pending objects at this point, but they should only + * be tags. + */ + for (size_t i = 0; i < info->revs->pending.nr; i++) { + struct object_array_entry *pending = info->revs->pending.objects + i; + struct object *obj = pending->item; + + while (obj->type == OBJ_TAG) { + struct tag *tag = lookup_tag(info->revs->repo, + &obj->oid); + oid_array_append(&tags, &obj->oid); + obj = tag->tagged; + } + + switch (obj->type) { + case OBJ_TREE: + oid_array_append(&tagged_tree_list, &obj->oid); + break; + + case OBJ_BLOB: + oid_array_append(&tagged_blob_list, &obj->oid); + break; + + case OBJ_COMMIT: + /* skip */ + break; + + default: + BUG("should not see any other type here"); + } + } + + info->path_fn("initial", &tags, OBJ_TAG, info->path_fn_data); + + if (tagged_tree_list.nr) + info->path_fn("tagged-trees", &tagged_tree_list, OBJ_TREE, + info->path_fn_data); + if (tagged_blob_list.nr) + info->path_fn("tagged-blobs", &tagged_blob_list, OBJ_BLOB, + info->path_fn_data); + + trace2_data_intmax("path-walk", ctx.repo, "tags", tags.nr); + trace2_region_leave("path-walk", "tag-walk", info->revs->repo); + oid_array_clear(&tags); + oid_array_clear(&tagged_tree_list); + oid_array_clear(&tagged_blob_list); + } + + /* + * Before performing a DFS of our paths and emitting them as interesting, + * do a full walk of the trees to distribute the UNINTERESTING bit. Use + * the sparse algorithm if prune_all_uninteresting was set. + */ + if (has_uninteresting) { + trace2_region_enter("path-walk", "uninteresting-walk", info->revs->repo); + if (info->prune_all_uninteresting) + mark_trees_uninteresting_sparse(ctx.repo, &root_tree_set); + else + mark_trees_uninteresting_dense(ctx.repo, &root_tree_set); + trace2_region_leave("path-walk", "uninteresting-walk", info->revs->repo); + } + oidset_clear(&root_tree_set); + + string_list_append(&ctx.path_stack, root_path); + + trace2_region_enter("path-walk", "path-walk", info->revs->repo); + while (!ret && ctx.path_stack.nr) { + char *path = ctx.path_stack.items[ctx.path_stack.nr - 1].string; + ctx.path_stack.nr--; + paths_nr++; + + ret = walk_path(&ctx, path); + + free(path); + } + trace2_data_intmax("path-walk", ctx.repo, "paths", paths_nr); + trace2_region_leave("path-walk", "path-walk", info->revs->repo); + + clear_strmap(&ctx.paths_to_lists); + string_list_clear(&ctx.path_stack, 0); + return ret; +} diff --git a/path-walk.h b/path-walk.h new file mode 100644 index 00000000000000..7c02bca71561aa --- /dev/null +++ b/path-walk.h @@ -0,0 +1,73 @@ +/* + * path-walk.h : Methods and structures for walking the object graph in batches + * by the paths that can reach those objects. + */ +#include "object.h" /* Required for 'enum object_type'. */ + +struct rev_info; +struct oid_array; +struct pattern_list; + +/** + * The type of a function pointer for the method that is called on a list of + * objects reachable at a given path. + */ +typedef int (*path_fn)(const char *path, + struct oid_array *oids, + enum object_type type, + void *data); + +struct path_walk_info { + /** + * revs provides the definitions for the commit walk, including + * which commits are UNINTERESTING or not. + */ + struct rev_info *revs; + + /** + * The caller wishes to execute custom logic on objects reachable at a + * given path. Every reachable object will be visited exactly once, and + * the first path to see an object wins. This may not be a stable choice. + */ + path_fn path_fn; + void *path_fn_data; + + /** + * Initialize which object types the path_fn should be called on. This + * could also limit the walk to skip blobs if not set. + */ + int commits; + int trees; + int blobs; + int tags; + + /** + * Specify a sparse-checkout definition to match our paths to. Do not + * walk outside of this sparse definition. If the patterns are in + * cone mode, then the search may prune directories that are outside + * of the cone. If not in cone mode, then all tree paths will be + * explored but the path_fn will only be called when the path matches + * the sparse-checkout patterns. + */ + struct pattern_list *pl; + + /** + * When 'prune_all_uninteresting' is set and a path has all objects + * marked as UNINTERESTING, then the path-walk will not visit those + * objects. It will not call path_fn on those objects and will not + * walk the children of such trees. + */ + int prune_all_uninteresting; +}; + +#define PATH_WALK_INFO_INIT { \ + .blobs = 1, \ +} + +/** + * Given the configuration of 'info', walk the commits based on 'info->revs' and + * call 'info->path_fn' on each discovered path. + * + * Returns nonzero on an error. + */ +int walk_objects_by_path(struct path_walk_info *info); diff --git a/repo-settings.c b/repo-settings.c index 2b4e68731bedce..d9597d84556161 100644 --- a/repo-settings.c +++ b/repo-settings.c @@ -45,11 +45,13 @@ void prepare_repo_settings(struct repository *r) r->settings.fetch_negotiation_algorithm = FETCH_NEGOTIATION_SKIPPING; r->settings.pack_use_bitmap_boundary_traversal = 1; r->settings.pack_use_multi_pack_reuse = 1; + r->settings.pack_use_path_walk = 1; } if (manyfiles) { r->settings.index_version = 4; r->settings.index_skip_hash = 1; r->settings.core_untracked_cache = UNTRACKED_CACHE_WRITE; + r->settings.pack_use_path_walk = 1; } /* Commit graph config or default, does not cascade (simple) */ @@ -64,6 +66,7 @@ void prepare_repo_settings(struct repository *r) /* Boolean config or default, does not cascade (simple) */ repo_cfg_bool(r, "pack.usesparse", &r->settings.pack_use_sparse, 1); + repo_cfg_bool(r, "pack.usepathwalk", &r->settings.pack_use_path_walk, 0); repo_cfg_bool(r, "core.multipackindex", &r->settings.core_multi_pack_index, 1); repo_cfg_bool(r, "index.sparse", &r->settings.sparse_index, 0); repo_cfg_bool(r, "index.skiphash", &r->settings.index_skip_hash, r->settings.index_skip_hash); diff --git a/repository.h b/repository.h index af6ea0a62cdb70..2ae9c2b1741528 100644 --- a/repository.h +++ b/repository.h @@ -62,6 +62,7 @@ struct repo_settings { enum untracked_cache_setting core_untracked_cache; int pack_use_sparse; + int pack_use_path_walk; enum fetch_negotiation_setting fetch_negotiation_algorithm; int core_multi_pack_index; diff --git a/revision.c b/revision.c index ac94f8d4292ff3..21c8b6d1bc0486 100644 --- a/revision.c +++ b/revision.c @@ -219,6 +219,21 @@ static void add_children_by_path(struct repository *r, free_tree_buffer(tree); } +void mark_trees_uninteresting_dense(struct repository *r, + struct oidset *trees) +{ + struct object_id *oid; + struct oidset_iter iter; + + oidset_iter_init(trees, &iter); + while ((oid = oidset_iter_next(&iter))) { + struct tree *tree = lookup_tree(r, oid); + + if (tree->object.flags & UNINTERESTING) + mark_tree_contents_uninteresting(r, tree); + } +} + void mark_trees_uninteresting_sparse(struct repository *r, struct oidset *trees) { diff --git a/revision.h b/revision.h index 0e470d1df19f69..6c3df8e42bfa6d 100644 --- a/revision.h +++ b/revision.h @@ -487,6 +487,7 @@ void put_revision_mark(const struct rev_info *revs, void mark_parents_uninteresting(struct rev_info *revs, struct commit *commit); void mark_tree_uninteresting(struct repository *r, struct tree *tree); +void mark_trees_uninteresting_dense(struct repository *r, struct oidset *trees); void mark_trees_uninteresting_sparse(struct repository *r, struct oidset *trees); void show_object_with_name(FILE *, struct object *, const char *); diff --git a/scalar.c b/scalar.c index 6166a8dd4c8d1f..031d1ac179f782 100644 --- a/scalar.c +++ b/scalar.c @@ -170,6 +170,7 @@ static int set_recommended_config(int reconfigure) { "core.autoCRLF", "false" }, { "core.safeCRLF", "false" }, { "fetch.showForcedUpdates", "false" }, + { "push.usePathWalk", "true" }, { NULL, NULL }, }; int i; diff --git a/t/README b/t/README index 44c02d81298dc6..a5d7d0239e074f 100644 --- a/t/README +++ b/t/README @@ -433,6 +433,10 @@ GIT_TEST_PACK_SPARSE= if disabled will default the pack-objects builtin to use the non-sparse object walk. This can still be overridden by the --sparse command-line argument. +GIT_TEST_PACK_PATH_WALK= if enabled will default the pack-objects +builtin to use the path-walk API for the object walk. This can still be +overridden by the --no-path-walk command-line argument. + GIT_TEST_PRELOAD_INDEX= exercises the preload-index code path by overriding the minimum number of cache entries required per thread. diff --git a/t/helper/test-name-hash.c b/t/helper/test-name-hash.c new file mode 100644 index 00000000000000..c82ccd7cefd860 --- /dev/null +++ b/t/helper/test-name-hash.c @@ -0,0 +1,23 @@ +/* + * test-name-hash.c: Read a list of paths over stdin and report on their + * name-hash and full name-hash. + */ + +#include "test-tool.h" +#include "git-compat-util.h" +#include "pack-objects.h" +#include "strbuf.h" + +int cmd__name_hash(int argc, const char **argv) +{ + struct strbuf line = STRBUF_INIT; + + while (!strbuf_getline(&line, stdin)) { + uint32_t name_hash = pack_name_hash(line.buf); + uint32_t full_hash = pack_full_name_hash(line.buf); + + printf("%10"PRIu32"\t%10"PRIu32"\t%s\n", name_hash, full_hash, line.buf); + } + + return 0; +} diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c index f8a67df7de90fc..4a6039210020e5 100644 --- a/t/helper/test-tool.c +++ b/t/helper/test-tool.c @@ -43,6 +43,7 @@ static struct test_cmd cmds[] = { { "match-trees", cmd__match_trees }, { "mergesort", cmd__mergesort }, { "mktemp", cmd__mktemp }, + { "name-hash", cmd__name_hash }, { "oid-array", cmd__oid_array }, { "online-cpus", cmd__online_cpus }, { "pack-mtimes", cmd__pack_mtimes }, diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h index e74bc0ffd4179c..56a83bf3aacd06 100644 --- a/t/helper/test-tool.h +++ b/t/helper/test-tool.h @@ -37,6 +37,7 @@ int cmd__lazy_init_name_hash(int argc, const char **argv); int cmd__match_trees(int argc, const char **argv); int cmd__mergesort(int argc, const char **argv); int cmd__mktemp(int argc, const char **argv); +int cmd__name_hash(int argc, const char **argv); int cmd__online_cpus(int argc, const char **argv); int cmd__pack_mtimes(int argc, const char **argv); int cmd__parse_options(int argc, const char **argv); diff --git a/t/perf/p5313-pack-objects.sh b/t/perf/p5313-pack-objects.sh new file mode 100755 index 00000000000000..b3b7fff8abf3bf --- /dev/null +++ b/t/perf/p5313-pack-objects.sh @@ -0,0 +1,101 @@ +#!/bin/sh + +test_description='Tests pack performance using bitmaps' +. ./perf-lib.sh + +GIT_TEST_PASSING_SANITIZE_LEAK=0 +export GIT_TEST_PASSING_SANITIZE_LEAK + +test_perf_large_repo + +test_expect_success 'create rev input' ' + cat >in-thin <<-EOF && + $(git rev-parse HEAD) + ^$(git rev-parse HEAD~1) + EOF + + cat >in-big-recent <<-EOF + $(git rev-parse HEAD) + ^$(git rev-parse HEAD~1000) + EOF +' + +test_perf 'thin pack' ' + git pack-objects --thin --stdout --revs --sparse out +' + +test_size 'thin pack size' ' + wc -c out +' + +test_size 'thin pack size with --full-name-hash' ' + wc -c out +' + +test_size 'thin pack size with --path-walk' ' + wc -c out +' + +test_size 'big recent pack size' ' + wc -c out +' + +test_size 'big recent pack size with --full-name-hash' ' + wc -c out +' + +test_size 'big recent pack size with --path-walk' ' + wc -c path-list && + wc -l name-hashes && + cat name-hashes | awk "{ print \$1; }" | sort -n | uniq -c >name-hash-count && + wc -l full-name-hash-count && + wc -l print_1.awk && + echo "{print \$2}" >print_2.awk && + + git init src && + + mkdir -p src/a/b/c && + mkdir -p src/d/e && + + for i in 1 2 + do + for n in 1 2 3 4 + do + echo "Version $i of file $n" > src/file.$n.txt && + echo "Version $i of file a/$n" > src/a/file.$n.txt && + echo "Version $i of file a/b/$n" > src/a/b/file.$n.txt && + echo "Version $i of file a/b/c/$n" > src/a/b/c/file.$n.txt && + echo "Version $i of file d/$n" > src/d/file.$n.txt && + echo "Version $i of file d/e/$n" > src/d/e/file.$n.txt && + git -C src add . && + git -C src commit -m "Iteration $n" || return 1 + done + done +' + +# Clone 'src' into 'srv.bare' so we have a bare repo to be our origin +# server for the partial clone. +test_expect_success 'setup bare clone for server' ' + git clone --bare "file://$(pwd)/src" srv.bare && + git -C srv.bare config --local uploadpack.allowfilter 1 && + git -C srv.bare config --local uploadpack.allowanysha1inwant 1 +' + +# do basic partial clone from "srv.bare" +test_expect_success 'do partial clone 1, backfill gets all objects' ' + git clone --no-checkout --filter=blob:none \ + --single-branch --branch=main \ + "file://$(pwd)/srv.bare" backfill1 && + + # Backfill with no options gets everything reachable from HEAD. + GIT_TRACE2_EVENT="$(pwd)/backfill-file-trace" git \ + -C backfill1 backfill && + + # We should have engaged the partial clone machinery + test_trace2_data promisor fetch_count 48 revs2 && + test_line_count = 0 revs2 +' + +test_expect_success 'do partial clone 2, backfill batch size' ' + git clone --no-checkout --filter=blob:none \ + --single-branch --branch=main \ + "file://$(pwd)/srv.bare" backfill2 && + + GIT_TRACE2_EVENT="$(pwd)/batch-trace" git \ + -C backfill2 backfill --batch-size=20 && + + # Batches were used + test_trace2_data promisor fetch_count 20 matches && + test_line_count = 2 matches && + test_trace2_data promisor fetch_count 8 revs2 && + test_line_count = 0 revs2 +' + +test_expect_success 'backfill --sparse without sparse-checkout fails' ' + git init not-sparse && + test_must_fail git -C not-sparse backfill --sparse 2>err && + grep "problem loading sparse-checkout" err +' + +test_expect_success 'backfill --sparse' ' + git clone --sparse --filter=blob:none \ + --single-branch --branch=main \ + "file://$(pwd)/srv.bare" backfill3 && + + # Initial checkout includes four files at root. + git -C backfill3 rev-list --quiet --objects --missing=print HEAD >missing && + test_line_count = 44 missing && + + # Initial sparse-checkout is just the files at root, so we get the + # older versions of the four files at tip. + GIT_TRACE2_EVENT="$(pwd)/sparse-trace1" git \ + -C backfill3 backfill --sparse && + test_trace2_data promisor fetch_count 4 missing && + test_line_count = 40 missing && + + # Expand the sparse-checkout to include 'd' recursively. This + # engages the algorithm to skip the trees for 'a'. Note that + # the "sparse-checkout set" command downloads the objects at tip + # to satisfy the current checkout. + git -C backfill3 sparse-checkout set d && + GIT_TRACE2_EVENT="$(pwd)/sparse-trace2" git \ + -C backfill3 backfill --sparse && + test_trace2_data promisor fetch_count 8 missing && + test_line_count = 24 missing && + + # Disabling the --sparse option (on by default) will download everything + git -C backfill3 backfill --no-sparse && + git -C backfill3 rev-list --quiet --objects --missing=print HEAD >missing && + test_line_count = 0 missing +' + +test_expect_success 'backfill --sparse without cone mode' ' + git clone --no-checkout --filter=blob:none \ + --single-branch --branch=main \ + "file://$(pwd)/srv.bare" backfill4 && + + # No blobs yet + git -C backfill4 rev-list --quiet --objects --missing=print HEAD >missing && + test_line_count = 48 missing && + + # Define sparse-checkout by filename regardless of parent directory. + # This downloads 6 blobs to satisfy the checkout. + git -C backfill4 sparse-checkout set --no-cone "**/file.1.txt" && + git -C backfill4 checkout main && + + GIT_TRACE2_EVENT="$(pwd)/no-cone-trace1" git \ + -C backfill4 backfill --sparse && + test_trace2_data promisor fetch_count 6 missing && + test_line_count = 36 missing +' + +. "$TEST_DIRECTORY"/lib-httpd.sh +start_httpd + +test_expect_success 'create a partial clone over HTTP' ' + SERVER="$HTTPD_DOCUMENT_ROOT_PATH/server" && + rm -rf "$SERVER" repo && + git clone --bare "file://$(pwd)/src" "$SERVER" && + test_config -C "$SERVER" uploadpack.allowfilter 1 && + test_config -C "$SERVER" uploadpack.allowanysha1inwant 1 && + + git clone --no-checkout --filter=blob:none \ + "$HTTPD_URL/smart/server" backfill-http +' + +test_expect_success 'backfilling over HTTP succeeds' ' + GIT_TRACE2_EVENT="$(pwd)/backfill-http-trace" git \ + -C backfill-http backfill && + + # We should have engaged the partial clone machinery + test_trace2_data promisor fetch_count 48 rev-list-out && + awk "{print \$1;}" oids && + GIT_TRACE2_EVENT="$(pwd)/walk-trace" git -C backfill-http \ + cat-file --batch-check batch-out && + ! grep missing batch-out +' + +# DO NOT add non-httpd-specific tests here, because the last part of this +# test script is only executed when httpd is available and enabled. + +test_done diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh new file mode 100755 index 00000000000000..c2dab0033f9379 --- /dev/null +++ b/t/t8100-git-survey.sh @@ -0,0 +1,76 @@ +#!/bin/sh + +test_description='git survey' + +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME + +TEST_PASSES_SANITIZE_LEAK=0 +export TEST_PASSES_SANITIZE_LEAK + +. ./test-lib.sh + +test_expect_success 'git survey -h shows experimental warning' ' + test_expect_code 129 git survey -h 2>usage && + grep "EXPERIMENTAL!" usage +' + +test_expect_success 'creat a semi-interesting repo' ' + test_commit_bulk 10 && + git tag -a -m one one HEAD~5 && + git tag -a -m two two HEAD~3 && + git tag -a -m three three two && + git tag -a -m four four three && + git update-ref -d refs/tags/three && + git update-ref -d refs/tags/two +' + +test_expect_success 'git survey (default)' ' + git survey --all-refs >out 2>err && + test_line_count = 0 err && + + cat >expect <<-EOF && + GIT SURVEY for "$(pwd)" + ----------------------------------------------------- + + REFERENCES SUMMARY + ======================== + Ref Type | Count + -----------------+------ + Branches | 1 + Remote refs | 0 + Tags (all) | 2 + Tags (annotated) | 2 + + REACHABLE OBJECT SUMMARY + ======================== + Object Type | Count + ------------+------ + Tags | 0 + Commits | 10 + Trees | 10 + Blobs | 10 + + TOTAL OBJECT SIZES BY TYPE + =============================================== + Object Type | Count | Disk Size | Inflated Size + ------------+-------+-----------+-------------- + Commits | 10 | 1523 | 2153 + Trees | 10 | 495 | 1706 + Blobs | 10 | 191 | 101 + EOF + + lines=$(wc -l out-trimmed && + test_cmp expect out-trimmed && + + for type in "DIRECTORIES" "FILES" + do + for metric in "COUNT" "DISK SIZE" "INFLATED SIZE" + do + grep "TOP $type BY $metric" out || return 1 + done || return 1 + done +' + +test_done