Skip to content

Commit 717d2ea

Browse files
derrickstolee authored and dscho committed
path-walk: introduce an object walk by path
In anticipation of a few planned applications, introduce the most basic form of a path-walk API. It currently assumes that there are no UNINTERESTING objects, and does not include any complicated filters. It calls a function pointer on groups of tree and blob objects as grouped by path. This only includes objects the first time they are discovered, so an object that appears at multiple paths will not be included in two batches. There are many future adaptations that could be made, but they are left for future updates when consumers are ready to take advantage of those features. Signed-off-by: Derrick Stolee <[email protected]>
1 parent d3c8e10 commit 717d2ea

File tree

5 files changed

+336
-0
lines changed

5 files changed

+336
-0
lines changed
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
Path-Walk API
2+
=============
3+
4+
The path-walk API is used to walk reachable objects, visiting them in
5+
batches grouped by type or by a common path at which they appear.
6+
7+
For example, all reachable commits are visited in a group. All tags are
8+
visited in a group. Then, all root trees are visited. At some point, all
9+
blobs reachable via a path `my/dir/to/A` are visited. When there are
10+
multiple paths possible to reach the same object, then only one of those
11+
paths is used to visit the object.
12+
13+
When walking a range of commits with some `UNINTERESTING` objects, the
14+
objects with the `UNINTERESTING` flag are included in these batches. In
15+
order to walk `UNINTERESTING` objects, the `--boundary` option must be
16+
used in the commit walk in order to visit `UNINTERESTING` commits.
17+
18+
Basics
19+
------
20+
21+
To use the path-walk API, include `path-walk.h` and call
22+
`walk_objects_by_path()` with a customized `path_walk_info` struct. The
23+
struct is used to set all of the options for how the walk should proceed.
24+
Let's dig into the different options and their use.
25+
26+
`path_fn` and `path_fn_data`::
27+
The most important option is the `path_fn` option, which is a
28+
function pointer to the callback that can execute logic on the
29+
object IDs for objects grouped by type and path. This function
30+
also receives a `data` value that corresponds to the
31+
`path_fn_data` member, for providing custom data structures to
32+
this callback function.
33+
34+
`revs`::
35+
To configure the exact details of the reachable set of objects,
36+
use the `revs` member and initialize it using the revision
37+
machinery in `revision.h`. Initialize `revs` using calls such as
38+
`setup_revisions()` or `parse_revision_opt()`. Do not call
39+
`prepare_revision_walk()`, as that will be called within
40+
`walk_objects_by_path()`.
41+
+
42+
It is also important that you do not specify the `--objects` flag for the
43+
`revs` struct. The revision walk should only be used to walk commits, and
44+
the objects will be walked in a separate way based on those starting
45+
commits.
46+
+
47+
If you want the path-walk API to emit `UNINTERESTING` objects based on the
48+
commit walk's boundary, be sure to set `revs.boundary` so the boundary
49+
commits are emitted.
50+
51+
Examples
52+
--------
53+
54+
See example usages in future changes.

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1095,6 +1095,7 @@ LIB_OBJS += parse-options.o
10951095
LIB_OBJS += patch-delta.o
10961096
LIB_OBJS += patch-ids.o
10971097
LIB_OBJS += path.o
1098+
LIB_OBJS += path-walk.o
10981099
LIB_OBJS += pathspec.o
10991100
LIB_OBJS += pkt-line.o
11001101
LIB_OBJS += preload-index.o

meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,7 @@ libgit_sources = [
358358
'patch-delta.c',
359359
'patch-ids.c',
360360
'path.c',
361+
'path-walk.c',
361362
'pathspec.c',
362363
'pkt-line.c',
363364
'preload-index.c',

path-walk.c

Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
/*
2+
* path-walk.c: implementation for path-based walks of the object graph.
3+
*/
4+
#include "git-compat-util.h"
5+
#include "path-walk.h"
6+
#include "blob.h"
7+
#include "commit.h"
8+
#include "dir.h"
9+
#include "hashmap.h"
10+
#include "hex.h"
11+
#include "object.h"
12+
#include "oid-array.h"
13+
#include "revision.h"
14+
#include "string-list.h"
15+
#include "strmap.h"
16+
#include "trace2.h"
17+
#include "tree.h"
18+
#include "tree-walk.h"
19+
20+
struct type_and_oid_list
21+
{
22+
enum object_type type;
23+
struct oid_array oids;
24+
};
25+
26+
/* Initializer for a 'struct type_and_oid_list' with no type and no oids. */
#define TYPE_AND_OID_LIST_INIT { .type = OBJ_NONE, .oids = OID_ARRAY_INIT }
30+
31+
struct path_walk_context {
32+
/**
33+
* Repeats of data in 'struct path_walk_info' for
34+
* access with fewer characters.
35+
*/
36+
struct repository *repo;
37+
struct rev_info *revs;
38+
struct path_walk_info *info;
39+
40+
/**
41+
* Map a path to a 'struct type_and_oid_list'
42+
* containing the objects discovered at that
43+
* path.
44+
*/
45+
struct strmap paths_to_lists;
46+
47+
/**
48+
* Store the current list of paths in a stack, to
49+
* facilitate depth-first-search without recursion.
50+
*/
51+
struct string_list path_stack;
52+
};
53+
54+
static int add_children(struct path_walk_context *ctx,
55+
const char *base_path,
56+
struct object_id *oid)
57+
{
58+
struct tree_desc desc;
59+
struct name_entry entry;
60+
struct strbuf path = STRBUF_INIT;
61+
size_t base_len;
62+
struct tree *tree = lookup_tree(ctx->repo, oid);
63+
64+
if (!tree) {
65+
error(_("failed to walk children of tree %s: not found"),
66+
oid_to_hex(oid));
67+
return -1;
68+
} else if (parse_tree_gently(tree, 1)) {
69+
die("bad tree object %s", oid_to_hex(oid));
70+
}
71+
72+
strbuf_addstr(&path, base_path);
73+
base_len = path.len;
74+
75+
parse_tree(tree);
76+
init_tree_desc(&desc, &tree->object.oid, tree->buffer, tree->size);
77+
while (tree_entry(&desc, &entry)) {
78+
struct type_and_oid_list *list;
79+
struct object *o;
80+
/* Not actually true, but we will ignore submodules later. */
81+
enum object_type type = S_ISDIR(entry.mode) ? OBJ_TREE : OBJ_BLOB;
82+
83+
/* Skip submodules. */
84+
if (S_ISGITLINK(entry.mode))
85+
continue;
86+
87+
if (type == OBJ_TREE) {
88+
struct tree *child = lookup_tree(ctx->repo, &entry.oid);
89+
o = child ? &child->object : NULL;
90+
} else if (type == OBJ_BLOB) {
91+
struct blob *child = lookup_blob(ctx->repo, &entry.oid);
92+
o = child ? &child->object : NULL;
93+
} else {
94+
/* Wrong type? */
95+
continue;
96+
}
97+
98+
if (!o) /* report error?*/
99+
continue;
100+
101+
/* Skip this object if already seen. */
102+
if (o->flags & SEEN)
103+
continue;
104+
o->flags |= SEEN;
105+
106+
strbuf_setlen(&path, base_len);
107+
strbuf_add(&path, entry.path, entry.pathlen);
108+
109+
/*
110+
* Trees will end with "/" for concatenation and distinction
111+
* from blobs at the same path.
112+
*/
113+
if (type == OBJ_TREE)
114+
strbuf_addch(&path, '/');
115+
116+
if (!(list = strmap_get(&ctx->paths_to_lists, path.buf))) {
117+
CALLOC_ARRAY(list, 1);
118+
list->type = type;
119+
strmap_put(&ctx->paths_to_lists, path.buf, list);
120+
string_list_append(&ctx->path_stack, path.buf);
121+
}
122+
oid_array_append(&list->oids, &entry.oid);
123+
}
124+
125+
free_tree_buffer(tree);
126+
strbuf_release(&path);
127+
return 0;
128+
}
129+
130+
/*
131+
* For each path in paths_to_explore, walk the trees another level
132+
* and add any found blobs to the batch (but only if they exist and
133+
* haven't been added yet).
134+
*/
135+
static int walk_path(struct path_walk_context *ctx,
136+
const char *path)
137+
{
138+
struct type_and_oid_list *list;
139+
int ret = 0;
140+
141+
list = strmap_get(&ctx->paths_to_lists, path);
142+
143+
/* Evaluate function pointer on this data. */
144+
ret = ctx->info->path_fn(path, &list->oids, list->type,
145+
ctx->info->path_fn_data);
146+
147+
/* Expand data for children. */
148+
if (list->type == OBJ_TREE) {
149+
for (size_t i = 0; i < list->oids.nr; i++) {
150+
ret |= add_children(ctx,
151+
path,
152+
&list->oids.oid[i]);
153+
}
154+
}
155+
156+
oid_array_clear(&list->oids);
157+
strmap_remove(&ctx->paths_to_lists, path, 1);
158+
return ret;
159+
}
160+
161+
static void clear_strmap(struct strmap *map)
162+
{
163+
struct hashmap_iter iter;
164+
struct strmap_entry *e;
165+
166+
hashmap_for_each_entry(&map->map, &iter, e, ent) {
167+
struct type_and_oid_list *list = e->value;
168+
oid_array_clear(&list->oids);
169+
}
170+
strmap_clear(map, 1);
171+
strmap_init(map);
172+
}
173+
174+
/**
175+
* Given the configuration of 'info', walk the commits based on 'info->revs' and
176+
* call 'info->path_fn' on each discovered path.
177+
*
178+
* Returns nonzero on an error.
179+
*/
180+
int walk_objects_by_path(struct path_walk_info *info)
181+
{
182+
const char *root_path = "";
183+
int ret = 0;
184+
size_t commits_nr = 0, paths_nr = 0;
185+
struct commit *c;
186+
struct type_and_oid_list *root_tree_list;
187+
struct path_walk_context ctx = {
188+
.repo = info->revs->repo,
189+
.revs = info->revs,
190+
.info = info,
191+
.path_stack = STRING_LIST_INIT_DUP,
192+
.paths_to_lists = STRMAP_INIT
193+
};
194+
195+
trace2_region_enter("path-walk", "commit-walk", info->revs->repo);
196+
197+
/* Insert a single list for the root tree into the paths. */
198+
CALLOC_ARRAY(root_tree_list, 1);
199+
root_tree_list->type = OBJ_TREE;
200+
strmap_put(&ctx.paths_to_lists, root_path, root_tree_list);
201+
202+
if (prepare_revision_walk(info->revs))
203+
die(_("failed to setup revision walk"));
204+
205+
while ((c = get_revision(info->revs))) {
206+
struct object_id *oid = get_commit_tree_oid(c);
207+
struct tree *t = lookup_tree(info->revs->repo, oid);
208+
commits_nr++;
209+
210+
if (t)
211+
oid_array_append(&root_tree_list->oids, oid);
212+
else
213+
warning("could not find tree %s", oid_to_hex(oid));
214+
}
215+
216+
trace2_data_intmax("path-walk", ctx.repo, "commits", commits_nr);
217+
trace2_region_leave("path-walk", "commit-walk", info->revs->repo);
218+
219+
string_list_append(&ctx.path_stack, root_path);
220+
221+
trace2_region_enter("path-walk", "path-walk", info->revs->repo);
222+
while (!ret && ctx.path_stack.nr) {
223+
char *path = ctx.path_stack.items[ctx.path_stack.nr - 1].string;
224+
ctx.path_stack.nr--;
225+
paths_nr++;
226+
227+
ret = walk_path(&ctx, path);
228+
229+
free(path);
230+
}
231+
trace2_data_intmax("path-walk", ctx.repo, "paths", paths_nr);
232+
trace2_region_leave("path-walk", "path-walk", info->revs->repo);
233+
234+
clear_strmap(&ctx.paths_to_lists);
235+
string_list_clear(&ctx.path_stack, 0);
236+
return ret;
237+
}

path-walk.h

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
/*
 * path-walk.h : Methods and structures for walking the object graph in batches
 * by the paths that can reach those objects.
 */
#ifndef PATH_WALK_H
#define PATH_WALK_H

#include "object.h" /* Required for 'enum object_type'. */

struct rev_info;
struct oid_array;

/**
 * The type of a function pointer for the method that is called on a list of
 * objects reachable at a given path.
 *
 * 'path' is the path shared by the batch, 'oids' holds the object IDs in the
 * batch, 'type' is the object type common to all of them, and 'data' is the
 * 'path_fn_data' value from the 'struct path_walk_info' driving the walk.
 *
 * A nonzero return value stops the walk and makes walk_objects_by_path()
 * return nonzero.
 */
typedef int (*path_fn)(const char *path,
		       struct oid_array *oids,
		       enum object_type type,
		       void *data);

struct path_walk_info {
	/**
	 * revs provides the definitions for the commit walk, including
	 * which commits are UNINTERESTING or not.
	 */
	struct rev_info *revs;

	/**
	 * The caller wishes to execute custom logic on objects reachable at a
	 * given path. Every reachable object will be visited exactly once, and
	 * the first path to see an object wins. This may not be a stable choice.
	 */
	path_fn path_fn;
	void *path_fn_data;
};

#define PATH_WALK_INFO_INIT { 0 }

/**
 * Given the configuration of 'info', walk the commits based on 'info->revs' and
 * call 'info->path_fn' on each discovered path.
 *
 * Returns nonzero on an error.
 */
int walk_objects_by_path(struct path_walk_info *info);

#endif /* PATH_WALK_H */

0 commit comments

Comments
 (0)