Skip to content

Commit 35b7e38

Browse files
derrickstolee authored and dscho committed
backfill: add --sparse option
One way to significantly reduce the cost of a Git clone and later fetches is to use a blobless partial clone and combine that with a sparse-checkout that reduces the paths that need to be populated in the working directory. Not only does this reduce the cost of clones and fetches, the sparse-checkout reduces the number of objects needed to download from a promisor remote. However, history investigations can be expensive as computing blob diffs will trigger promisor remote requests for one object at a time. This can be avoided by downloading the blobs needed for the given sparse-checkout using 'git backfill' and its new '--sparse' mode, at a time that the user is willing to pay that extra cost. Note that this is distinctly different from the '--filter=sparse:<oid>' option, as this assumes that the partial clone has all reachable trees and we are using client-side logic to avoid downloading blobs outside of the sparse-checkout cone. This avoids the server-side cost of walking trees while also achieving a similar goal. It also downloads in batches based on similar path names, presenting a resumable download if things are interrupted. This augments the path-walk API to have a possibly-NULL 'pl' member that may point to a 'struct pattern_list'. This could be more general than the sparse-checkout definition at HEAD, but 'git backfill --sparse' is currently the only consumer. Be sure to test this in both cone mode and non-cone mode. Cone mode has the benefit that the path-walk can skip certain paths once they would expand beyond the sparse-checkout. Signed-off-by: Derrick Stolee <[email protected]>
1 parent dc94934 commit 35b7e38

File tree

10 files changed

+177
-10
lines changed

10 files changed

+177
-10
lines changed

Documentation/git-backfill.txt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ git-backfill - Download missing objects in a partial clone
99
SYNOPSIS
1010
--------
1111
[verse]
12-
'git backfill' [--batch-size=<n>]
12+
'git backfill' [--batch-size=<n>] [--[no-]sparse]
1313

1414
DESCRIPTION
1515
-----------
@@ -46,6 +46,10 @@ OPTIONS
4646
from the server. This size may be exceeded by the last set of
4747
blobs seen at a given path. Default batch size is 50,000.
4848

49+
--[no-]sparse::
50+
Only download objects if they appear at a path that matches the
51+
current sparse-checkout.
52+
4953
SEE ALSO
5054
--------
5155
linkgit:git-clone[1].

Documentation/technical/api-path-walk.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,14 @@ better off using the revision walk API instead.
6565
the revision walk so that the walk emits commits marked with the
6666
`UNINTERESTING` flag.
6767

68+
`pl`::
69+
This pattern list pointer allows focusing the path-walk search to
70+
a set of patterns, only emitting paths that match the given
71+
patterns. See linkgit:gitignore[5] or
72+
linkgit:git-sparse-checkout[1] for details about pattern lists.
73+
When the pattern list uses cone-mode patterns, then the path-walk
74+
API can prune the set of paths it walks to improve performance.
75+
6876
Examples
6977
--------
7078

builtin/backfill.c

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "parse-options.h"
55
#include "repository.h"
66
#include "commit.h"
7+
#include "dir.h"
78
#include "hex.h"
89
#include "tree.h"
910
#include "tree-walk.h"
@@ -21,14 +22,15 @@
2122
#include "path-walk.h"
2223

2324
static const char * const builtin_backfill_usage[] = {
24-
N_("git backfill [--batch-size=<n>]"),
25+
N_("git backfill [--batch-size=<n>] [--[no-]sparse]"),
2526
NULL
2627
};
2728

2829
struct backfill_context {
2930
struct repository *repo;
3031
struct oid_array current_batch;
3132
size_t batch_size;
33+
int sparse;
3234
};
3335

3436
static void clear_backfill_context(struct backfill_context *ctx)
@@ -84,6 +86,15 @@ static int do_backfill(struct backfill_context *ctx)
8486
struct path_walk_info info = PATH_WALK_INFO_INIT;
8587
int ret;
8688

89+
if (ctx->sparse) {
90+
CALLOC_ARRAY(info.pl, 1);
91+
if (get_sparse_checkout_patterns(info.pl)) {
92+
clear_pattern_list(info.pl);
93+
free(info.pl);
94+
return error(_("problem loading sparse-checkout"));
95+
}
96+
}
97+
8798
repo_init_revisions(ctx->repo, &revs, "");
8899
handle_revision_arg("HEAD", &revs, 0, 0);
89100

@@ -102,6 +113,10 @@ static int do_backfill(struct backfill_context *ctx)
102113

103114
clear_backfill_context(ctx);
104115
release_revisions(&revs);
116+
if (info.pl) {
117+
clear_pattern_list(info.pl);
118+
free(info.pl);
119+
}
105120
return ret;
106121
}
107122

@@ -111,10 +126,13 @@ int cmd_backfill(int argc, const char **argv, const char *prefix, struct reposit
111126
.repo = repo,
112127
.current_batch = OID_ARRAY_INIT,
113128
.batch_size = 50000,
129+
.sparse = 0,
114130
};
115131
struct option options[] = {
116132
OPT_INTEGER(0, "batch-size", &ctx.batch_size,
117133
N_("Minimun number of objects to request at a time")),
134+
OPT_BOOL(0, "sparse", &ctx.sparse,
135+
N_("Restrict the missing objects to the current sparse-checkout")),
118136
OPT_END(),
119137
};
120138

dir.c

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1093,10 +1093,6 @@ static void invalidate_directory(struct untracked_cache *uc,
10931093
dir->dirs[i]->recurse = 0;
10941094
}
10951095

1096-
static int add_patterns_from_buffer(char *buf, size_t size,
1097-
const char *base, int baselen,
1098-
struct pattern_list *pl);
1099-
11001096
/* Flags for add_patterns() */
11011097
#define PATTERN_NOFOLLOW (1<<0)
11021098

@@ -1186,9 +1182,9 @@ static int add_patterns(const char *fname, const char *base, int baselen,
11861182
return 0;
11871183
}
11881184

1189-
static int add_patterns_from_buffer(char *buf, size_t size,
1190-
const char *base, int baselen,
1191-
struct pattern_list *pl)
1185+
int add_patterns_from_buffer(char *buf, size_t size,
1186+
const char *base, int baselen,
1187+
struct pattern_list *pl)
11921188
{
11931189
char *orig = buf;
11941190
int i, lineno = 1;

dir.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,9 @@ void add_patterns_from_file(struct dir_struct *, const char *fname);
467467
int add_patterns_from_blob_to_list(struct object_id *oid,
468468
const char *base, int baselen,
469469
struct pattern_list *pl);
470+
int add_patterns_from_buffer(char *buf, size_t size,
471+
const char *base, int baselen,
472+
struct pattern_list *pl);
470473
void parse_path_pattern(const char **string, int *patternlen, unsigned *flags, int *nowildcardlen);
471474
void add_pattern(const char *string, const char *base,
472475
int baselen, struct pattern_list *pl, int srcpos);

path-walk.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "hex.h"
1111
#include "object.h"
1212
#include "oid-array.h"
13+
#include "repository.h"
1314
#include "revision.h"
1415
#include "string-list.h"
1516
#include "strmap.h"
@@ -119,6 +120,23 @@ static int add_children(struct path_walk_context *ctx,
119120
if (type == OBJ_TREE)
120121
strbuf_addch(&path, '/');
121122

123+
if (ctx->info->pl) {
124+
int dtype;
125+
enum pattern_match_result match;
126+
match = path_matches_pattern_list(path.buf, path.len,
127+
path.buf + base_len, &dtype,
128+
ctx->info->pl,
129+
ctx->repo->index);
130+
131+
if (ctx->info->pl->use_cone_patterns &&
132+
match == NOT_MATCHED)
133+
continue;
134+
else if (!ctx->info->pl->use_cone_patterns &&
135+
type == OBJ_BLOB &&
136+
match != MATCHED)
137+
continue;
138+
}
139+
122140
if (!(list = strmap_get(&ctx->paths_to_lists, path.buf))) {
123141
CALLOC_ARRAY(list, 1);
124142
list->type = type;

path-walk.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
struct rev_info;
88
struct oid_array;
9+
struct pattern_list;
910

1011
/**
1112
* The type of a function pointer for the method that is called on a list of
@@ -46,6 +47,16 @@ struct path_walk_info {
4647
* walk the children of such trees.
4748
*/
4849
int prune_all_uninteresting;
50+
51+
/**
52+
* Specify a sparse-checkout definition to match our paths to. Do not
53+
* walk outside of this sparse definition. If the patterns are in
54+
* cone mode, then the search may prune directories that are outside
55+
* of the cone. If not in cone mode, then all tree paths will be
56+
* explored but the path_fn will only be called when the path matches
57+
* the sparse-checkout patterns.
58+
*/
59+
struct pattern_list *pl;
4960
};
5061

5162
#define PATH_WALK_INFO_INIT { \

t/helper/test-path-walk.c

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#define USE_THE_REPOSITORY_VARIABLE
22

33
#include "test-tool.h"
4+
#include "dir.h"
45
#include "environment.h"
56
#include "hex.h"
67
#include "object-name.h"
@@ -9,6 +10,7 @@
910
#include "revision.h"
1011
#include "setup.h"
1112
#include "parse-options.h"
13+
#include "strbuf.h"
1214
#include "path-walk.h"
1315
#include "oid-array.h"
1416

@@ -67,7 +69,7 @@ static int emit_block(const char *path, struct oid_array *oids,
6769

6870
int cmd__path_walk(int argc, const char **argv)
6971
{
70-
int res;
72+
int res, stdin_pl = 0;
7173
struct rev_info revs = REV_INFO_INIT;
7274
struct path_walk_info info = PATH_WALK_INFO_INIT;
7375
struct path_walk_test_data data = { 0 };
@@ -82,6 +84,8 @@ int cmd__path_walk(int argc, const char **argv)
8284
N_("toggle inclusion of tree objects")),
8385
OPT_BOOL(0, "prune", &info.prune_all_uninteresting,
8486
N_("toggle pruning of uninteresting paths")),
87+
OPT_BOOL(0, "stdin-pl", &stdin_pl,
88+
N_("read a pattern list over stdin")),
8589
OPT_END(),
8690
};
8791

@@ -101,6 +105,17 @@ int cmd__path_walk(int argc, const char **argv)
101105
info.path_fn = emit_block;
102106
info.path_fn_data = &data;
103107

108+
if (stdin_pl) {
109+
struct strbuf in = STRBUF_INIT;
110+
CALLOC_ARRAY(info.pl, 1);
111+
112+
info.pl->use_cone_patterns = 1;
113+
114+
strbuf_fread(&in, 2048, stdin);
115+
add_patterns_from_buffer(in.buf, in.len, "", 0, info.pl);
116+
strbuf_release(&in);
117+
}
118+
104119
res = walk_objects_by_path(&info);
105120

106121
printf("commits:%" PRIuMAX "\n"
@@ -109,6 +124,10 @@ int cmd__path_walk(int argc, const char **argv)
109124
"tags:%" PRIuMAX "\n",
110125
data.commit_nr, data.tree_nr, data.blob_nr, data.tag_nr);
111126

127+
if (info.pl) {
128+
clear_pattern_list(info.pl);
129+
free(info.pl);
130+
}
112131
release_revisions(&revs);
113132
return res;
114133
}

t/t5620-backfill.sh

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,61 @@ test_expect_success 'do partial clone 2, backfill batch size' '
7777
test_line_count = 0 revs2
7878
'
7979

80+
test_expect_success 'backfill --sparse' '
81+
git clone --sparse --filter=blob:none \
82+
--single-branch --branch=main \
83+
"file://$(pwd)/srv.bare" backfill3 &&
84+
85+
# Initial checkout includes four files at root.
86+
git -C backfill3 rev-list --quiet --objects --missing=print HEAD >missing &&
87+
test_line_count = 44 missing &&
88+
89+
# Initial sparse-checkout is just the files at root, so we get the
90+
# older versions of the four files at tip.
91+
GIT_TRACE2_EVENT="$(pwd)/sparse-trace1" git \
92+
-C backfill3 backfill --sparse &&
93+
test_trace2_data promisor fetch_count 4 <sparse-trace1 &&
94+
test_trace2_data path-walk paths 5 <sparse-trace1 &&
95+
git -C backfill3 rev-list --quiet --objects --missing=print HEAD >missing &&
96+
test_line_count = 40 missing &&
97+
98+
# Expand the sparse-checkout to include 'd' recursively. This
99+
# engages the algorithm to skip the trees for 'a'. Note that
100+
# the "sparse-checkout set" command downloads the objects at tip
101+
# to satisfy the current checkout.
102+
git -C backfill3 sparse-checkout set d &&
103+
GIT_TRACE2_EVENT="$(pwd)/sparse-trace2" git \
104+
-C backfill3 backfill --sparse &&
105+
test_trace2_data promisor fetch_count 8 <sparse-trace2 &&
106+
test_trace2_data path-walk paths 15 <sparse-trace2 &&
107+
git -C backfill3 rev-list --quiet --objects --missing=print HEAD >missing &&
108+
test_line_count = 24 missing
109+
'
110+
111+
test_expect_success 'backfill --sparse without cone mode' '
112+
git clone --no-checkout --filter=blob:none \
113+
--single-branch --branch=main \
114+
"file://$(pwd)/srv.bare" backfill4 &&
115+
116+
# No blobs yet
117+
git -C backfill4 rev-list --quiet --objects --missing=print HEAD >missing &&
118+
test_line_count = 48 missing &&
119+
120+
# Define sparse-checkout by filename regardless of parent directory.
121+
# This downloads 6 blobs to satisfy the checkout.
122+
git -C backfill4 sparse-checkout set --no-cone "**/file.1.txt" &&
123+
git -C backfill4 checkout main &&
124+
125+
GIT_TRACE2_EVENT="$(pwd)/no-cone-trace1" git \
126+
-C backfill4 backfill --sparse &&
127+
test_trace2_data promisor fetch_count 6 <no-cone-trace1 &&
128+
129+
# This walk needed to visit all directories to search for these paths.
130+
test_trace2_data path-walk paths 12 <no-cone-trace1 &&
131+
git -C backfill4 rev-list --quiet --objects --missing=print HEAD >missing &&
132+
test_line_count = 36 missing
133+
'
134+
80135
. "$TEST_DIRECTORY"/lib-httpd.sh
81136
start_httpd
82137

t/t6601-path-walk.sh

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,41 @@ test_expect_success 'all' '
108108
test_cmp expect.sorted out.sorted
109109
'
110110

111+
test_expect_success 'base & topic, sparse' '
112+
cat >patterns <<-EOF &&
113+
/*
114+
!/*/
115+
/left/
116+
EOF
117+
118+
test-tool path-walk --stdin-pl -- base topic <patterns >out &&
119+
120+
cat >expect <<-EOF &&
121+
COMMIT::$(git rev-parse topic)
122+
COMMIT::$(git rev-parse base)
123+
COMMIT::$(git rev-parse base~1)
124+
COMMIT::$(git rev-parse base~2)
125+
commits:4
126+
TREE::$(git rev-parse topic^{tree})
127+
TREE::$(git rev-parse base^{tree})
128+
TREE::$(git rev-parse base~1^{tree})
129+
TREE::$(git rev-parse base~2^{tree})
130+
TREE:left/:$(git rev-parse base:left)
131+
TREE:left/:$(git rev-parse base~2:left)
132+
trees:6
133+
BLOB:a:$(git rev-parse base~2:a)
134+
BLOB:left/b:$(git rev-parse base~2:left/b)
135+
BLOB:left/b:$(git rev-parse base:left/b)
136+
blobs:3
137+
tags:0
138+
EOF
139+
140+
sort expect >expect.sorted &&
141+
sort out >out.sorted &&
142+
143+
test_cmp expect.sorted out.sorted
144+
'
145+
111146
test_expect_success 'topic only' '
112147
test-tool path-walk -- topic >out &&
113148

0 commit comments

Comments
 (0)