Skip to content

Commit b12b07d

Browse files
committed
feat: add basic connectivity check
Implement a simple connectivity check in a new `gix-fsck` crate, and add this to `gix` via a new `fsck` subcommand. Currently this is functionally equivalent to: `git rev-list --objects --quiet --missing=print`
1 parent d78f445 commit b12b07d

File tree

18 files changed

+432
-2
lines changed

18 files changed

+432
-2
lines changed

Cargo.lock

Lines changed: 23 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,7 @@ members = [
281281
"gix-archive",
282282
"gix-worktree-stream",
283283
"gix-revwalk",
284+
"gix-fsck",
284285

285286
"tests/tools",
286287

@@ -294,6 +295,7 @@ members = [
294295
"gix-ref/tests",
295296
"gix-config/tests",
296297
"gix-traverse/tests",
298+
"gix-fsck/tests",
297299
]
298300

299301
[workspace.dependencies]

gitoxide-core/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ gix-pack-for-configuration-only = { package = "gix-pack", version = "^0.44.0", p
4949
gix-transport-configuration-only = { package = "gix-transport", version = "^0.38.0", path = "../gix-transport", default-features = false }
5050
gix-archive-for-configuration-only = { package = "gix-archive", version = "^0.6.0", path = "../gix-archive", optional = true, features = ["tar", "tar_gz"] }
5151
gix-status = { version = "^0.2.0", path = "../gix-status" }
52+
gix-fsck = { version = "^0.0.0", path = "../gix-fsck" }
5253
serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] }
5354
anyhow = "1.0.42"
5455
thiserror = "1.0.34"

gitoxide-core/src/repository/fsck.rs

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
use std::io::{BufWriter, Write};
2+
3+
use anyhow::Context;
4+
use gix::{objs::Kind, ObjectId};
5+
6+
pub fn connectivity(mut repo: gix::Repository, spec: Option<String>, out: impl std::io::Write) -> anyhow::Result<()> {
7+
let mut out = BufWriter::with_capacity(64 * 1024, out);
8+
let spec = spec.unwrap_or("HEAD".into());
9+
10+
repo.object_cache_size_if_unset(4 * 1024 * 1024);
11+
// We expect to be finding a bunch of non-existent objects here - never refresh the ODB
12+
repo.objects.refresh_never();
13+
14+
let id = repo
15+
.rev_parse_single(spec.as_str())
16+
.context("Only single revisions are supported")?;
17+
let commits: gix::revision::Walk<'_> = id
18+
.object()?
19+
.peel_to_kind(gix::object::Kind::Commit)
20+
.context("Need commitish as starting point")?
21+
.id()
22+
.ancestors()
23+
.all()?;
24+
25+
let missing_cb = |oid: &ObjectId, kind: Kind| {
26+
writeln!(out, "{oid}: {kind}").expect("failed to write output");
27+
};
28+
let mut conn = gix_fsck::ConnectivityCheck::new(&repo.objects, missing_cb);
29+
30+
// Walk all commits, checking each one for connectivity
31+
for commit in commits {
32+
let commit = commit?;
33+
conn.check_commit(&commit.id);
34+
for parent in commit.parent_ids {
35+
conn.check_commit(&parent);
36+
}
37+
}
38+
39+
Ok(())
40+
}

gitoxide-core/src/repository/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ pub use clone::function::clone;
3535
pub use fetch::function::fetch;
3636

3737
pub mod commitgraph;
38+
pub mod fsck;
3839
pub mod index;
3940
pub mod mailmap;
4041
pub mod odb;

gix-fsck/Cargo.toml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
[package]
2+
name = "gix-fsck"
3+
version = "0.0.0"
4+
repository = "https://github.com/Byron/gitoxide"
5+
authors = ["Cameron Esfahani <[email protected]>", "Sebastian Thiel <[email protected]>"]
6+
license = "MIT OR Apache-2.0"
7+
description = "Verifies the connectivity and validity of objects in the database"
8+
edition = "2021"
9+
include = ["src/**/*", "LICENSE-*", "CHANGELOG.md"]
10+
rust-version = "1.65"
11+
autotests = false
12+
13+
[lib]
14+
doctest = false
15+
16+
[dependencies]
17+
gix-hash = { version = "^0.13.1", path = "../gix-hash" }
18+
gix-hashtable = { version = "^0.4.0", path = "../gix-hashtable" }
19+
gix-object = { version = "^0.38.0", path = "../gix-object" }
20+
gix-odb = { version = "^0.54.0", path = "../gix-odb" }

gix-fsck/Changelog.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Changelog
2+
3+
All notable changes to this project will be documented in this file.
4+
5+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

gix-fsck/LICENSE-APACHE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../LICENSE-APACHE

gix-fsck/LICENSE-MIT

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../LICENSE-MIT

gix-fsck/src/lib.rs

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
//! A library for performing object database integrity and connectivity checks
2+
#![deny(rust_2018_idioms)]
3+
4+
use gix_hash::ObjectId;
5+
use gix_hashtable::HashSet;
6+
use gix_object::{tree::EntryMode, Kind};
7+
use gix_odb::{pack::Find, FindExt, Handle};
8+
9+
pub struct ConnectivityCheck<'a, F>
10+
where
11+
F: FnMut(&ObjectId, Kind),
12+
{
13+
/// ODB handle to use for the check
14+
db: &'a Handle,
15+
/// Closure to invoke when a missing object is encountered
16+
missing_cb: F,
17+
/// Set of Object IDs already (or about to be) scanned during the check
18+
oid_set: HashSet,
19+
/// Single buffer for decoding objects from the ODB
20+
/// This is slightly faster than allocating throughout the connectivity check (and reduces the memory requirements)
21+
buf: Vec<u8>,
22+
}
23+
24+
impl<'a, F> ConnectivityCheck<'a, F>
25+
where
26+
F: FnMut(&ObjectId, Kind),
27+
{
28+
/// Instantiate a connectivity check
29+
pub fn new(db: &'a Handle, missing_cb: F) -> ConnectivityCheck<'a, F> {
30+
ConnectivityCheck {
31+
db,
32+
missing_cb,
33+
oid_set: HashSet::default(),
34+
buf: Vec::new(),
35+
}
36+
}
37+
38+
/// Run the connectivity check on the provided commit object ID
39+
/// - This will walk the trees and blobs referenced by the commit and verify they exist in the ODB
40+
/// - Any objects previously encountered by this [`ConnectivityCheck`] instance will be skipped
41+
/// - Any referenced blobs that are not present in the ODB will result in a call to the `missing_cb`
42+
/// - Missing commits or trees will currently result in panic
43+
/// - TODO: add support for missing trees
44+
/// - TODO: consider how to handle a missing commit (invoke `missing_cb`, or possibly return a Result?)
45+
pub fn check_commit(&mut self, oid: &ObjectId) {
46+
// Attempt to insert the commit ID in the set, and if already present, return immediately
47+
if !self.oid_set.insert(*oid) {
48+
return;
49+
}
50+
// Obtain the commit's tree ID
51+
let tree_id = {
52+
let commit = self.db.find_commit(oid, &mut self.buf).expect("failed to find commit");
53+
commit.tree()
54+
};
55+
56+
// Attempt to insert the tree ID in the set, and if already present, return immediately
57+
if self.oid_set.insert(tree_id) {
58+
self.check_tree(&tree_id);
59+
}
60+
}
61+
62+
fn check_tree(&mut self, oid: &ObjectId) {
63+
let tree = match self.db.find_tree(oid, &mut self.buf) {
64+
Ok(tree) => tree,
65+
Err(_) => {
66+
// Tree is missing, so invoke `missing_cb`
67+
(self.missing_cb)(oid, Kind::Tree);
68+
return;
69+
}
70+
};
71+
72+
// Keeping separate sets for trees and blobs for now...
73+
// This is about a wash when compared to using a HashMap<ObjectID, Kind>
74+
struct TreeEntries {
75+
trees: HashSet<ObjectId>,
76+
blobs: HashSet<ObjectId>,
77+
}
78+
79+
// Build up a set of trees and a set of blobs
80+
let entries: TreeEntries = {
81+
let mut entries = TreeEntries {
82+
trees: HashSet::default(),
83+
blobs: HashSet::default(),
84+
};
85+
86+
// For each entry in the tree
87+
for entry_ref in tree.entries.iter() {
88+
match entry_ref.mode {
89+
EntryMode::Tree => {
90+
let tree_id = entry_ref.oid.to_owned();
91+
// Check if the tree has already been encountered
92+
if self.oid_set.insert(tree_id) {
93+
entries.trees.insert(tree_id);
94+
}
95+
}
96+
EntryMode::Blob | EntryMode::BlobExecutable | EntryMode::Link => {
97+
let blob_id = entry_ref.oid.to_owned();
98+
// Check if the blob has already been encountered
99+
if self.oid_set.insert(blob_id) {
100+
entries.blobs.insert(blob_id);
101+
}
102+
}
103+
EntryMode::Commit => {
104+
// This implies a submodule (OID is the commit hash of the submodule)
105+
// Skip it as it's not in this repository!
106+
}
107+
}
108+
}
109+
entries
110+
};
111+
112+
for tree_id in entries.trees.iter() {
113+
self.check_tree(tree_id);
114+
}
115+
for blob_id in entries.blobs.iter() {
116+
self.check_blob(blob_id);
117+
}
118+
}
119+
120+
fn check_blob(&mut self, oid: &ObjectId) {
121+
// Check if the blob is missing from the ODB
122+
if !self.db.contains(oid) {
123+
// Blob is missing, so invoke `missing_cb`
124+
(self.missing_cb)(oid, Kind::Blob);
125+
}
126+
}
127+
}

gix-fsck/tests/Cargo.toml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
[package]
2+
name = "gix-fsck-tests"
3+
version = "0.0.0"
4+
repository = "https://github.com/Byron/gitoxide"
5+
authors = ["Cameron Esfahani <[email protected]>", "Sebastian Thiel <[email protected]>"]
6+
license = "MIT OR Apache-2.0"
7+
description = "Tests for gix-fsck with feature-toggle support"
8+
edition = "2021"
9+
rust-version = "1.65"
10+
publish = false
11+
12+
[[test]]
13+
name = "integrate"
14+
path = "integrate.rs"
15+
16+
[dev-dependencies]
17+
gix-fsck = { path = ".." }
18+
gix-hash = { path = "../../gix-hash" }
19+
gix-hashtable = { path = "../../gix-hashtable" }
20+
gix-object = { path = "../../gix-object" }
21+
gix-odb = { path = "../../gix-odb" }
22+
gix-testtools = { path = "../../tests/tools"}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:b2dbf9142109ea28a9b170efafa7e7a5845a7ef5f024063ded42513b9f29b3d3
3+
size 13696
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
#!/bin/bash
2+
# Trace every command as it is executed
set -x
3+
# Exit on the first failing command, on use of an unset variable, or when any stage of a pipeline fails
set -euo pipefail
4+
5+
# Stolen from `set-static-git-environment` in tests/helpers.sh
# Fixed author/committer identity and dates, with the committer values derived from the author values
6+
export GIT_AUTHOR_DATE="2020-09-09 09:06:03 +0800"
7+
export GIT_COMMITTER_DATE="${GIT_AUTHOR_DATE}"
8+
export GIT_AUTHOR_NAME="Sebastian Thiel"
9+
export GIT_COMMITTER_NAME="${GIT_AUTHOR_NAME}"
10+
export GIT_AUTHOR_EMAIL="[email protected]"
11+
export GIT_COMMITTER_EMAIL="${GIT_AUTHOR_EMAIL}"
12+
13+
# Ignore the system config
14+
export GIT_CONFIG_SYSTEM=""
15+
# We override the global config with our own local one (see below)
# It lives in the directory the script is run from, and the `git config --global` calls below write to it
16+
export GIT_CONFIG_GLOBAL="$PWD/.gitconfig"
17+
18+
# The default value for `init.defaultBranch` may soon change, so to be safe, set it explicitly
19+
git config --global init.defaultBranch main
20+
# We need to be able to do partial clones, so enable it
21+
# Note: This used to only work at global scope, so to be safe, we'll keep it there (no harm in doing so)
22+
git config --global uploadpack.allowFilter true
23+
# Disable the reflog, as that will cause the absolute path of the remote to be inserted
24+
git config --global core.logAllRefUpdates false
25+
26+
# Helper for _nearly_ reproducible clones
# - this fixes up the remote URL to be relative (`git` will resolve it to an absolute path upon a clone operation)
# - clones made with this helper will still have an index (`.git/index`)
#   - the index contains lots of non-reproducible fields (e.g. mtime, ctime, inode number, uid, gid)
#   - we _could_ remove the index file and later rebuild it from the working tree, but no current need to do so
#
# Usage: git_static_clone [<git clone options>...] <remote path> <clone destination>
# All arguments are forwarded to `git clone`; the last two must be the remote and the destination, in that order.
function git_static_clone() {
  # Keep the helper's variables out of the global scope.
  # They are declared separately from their assignment, as `local var="$(cmd)"` would mask a failure of `cmd`.
  local clone_path remote_path remote_rel_path

  git clone --no-local --no-hardlinks "$@"
  # Get absolute paths of the clone destination (last argument) and the remote (second to last argument)
  clone_path="$(realpath "${@: -1}")"
  remote_path="$(realpath "${@: (-2):1}")"
  # Get the remote path relative to the clone path
  remote_rel_path="$(realpath --relative-to="$clone_path" "$remote_path")"
  (
    # Run in a subshell so that the caller's working directory is left untouched
    cd "$clone_path"
    # Fixup the remote URL to be relative
    git remote set-url origin "$remote_rel_path"
  )
}
44+
45+
# First build out a base repository
# Its history has three commits: the first adds `blob-1`, the second adds `blob-2`, the third removes `blob-1`
46+
git init base
47+
(
48+
cd base
49+
50+
echo "blob 1" > blob-1
51+
git add -A
52+
git commit -m "commit 1"
53+
echo "blob-2" > blob-2
54+
git add -A
55+
git commit -m "commit 2"
# `git rm` removes the file from the working tree and stages the removal
56+
git rm blob-1
57+
git add -A
58+
git commit -m "commit 3"
59+
)
60+
61+
# Blobless clone
# Cloned from `./base` into `blobless` with the `blob:none` filter
62+
git_static_clone --filter=blob:none ./base blobless
63+
64+
# Treeless (and blobless) clone
# Cloned from `./base` into `treeless` with the `tree:0` filter
65+
git_static_clone --filter=tree:0 ./base treeless

0 commit comments

Comments
 (0)