Skip to content

Commit 1f9aca5

Browse files
committed
Merge branch 'feat_basic_connectivity_check'
2 parents 7227410 + 7ab5c76 commit 1f9aca5

File tree

19 files changed

+347
-2
lines changed

19 files changed

+347
-2
lines changed

Cargo.lock

Lines changed: 12 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,7 @@ members = [
281281
"gix-archive",
282282
"gix-worktree-stream",
283283
"gix-revwalk",
284+
"gix-fsck",
284285

285286
"tests/tools",
286287

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ is usable to some extent.
140140
* [gix-tui](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-tui)
141141
* [gix-tix](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-tix)
142142
* [gix-bundle](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-bundle)
143+
* [gix-fsck](https://github.com/Byron/gitoxide/blob/main/crate-status.md#gix-fsck)
143144

144145
### Stress Testing
145146
* [x] Verify huge packs

crate-status.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -775,6 +775,23 @@ See its [README.md](https://github.com/Byron/gitoxide/blob/main/gix-lock/README.
775775
* [x] validate submodule names
776776
* [x] [validate][tagname-validation] tag names
777777

778+
### gix-fsck
779+
* [x] validate connectivity and find missing objects starting from…
780+
- [x] commits
781+
- [ ] tags
782+
- [ ] tree-cache in the `index` or any entry within
783+
* [ ] validate object hashes during connectivity traversal
784+
* [ ] progress reporting and interruptability
785+
* [ ] skipList to exclude objects which are known to be broken
786+
* [ ] validate blob hashes (connectivity check)
787+
* [ ] identify objects that exist but are not reachable (i.e. what remains after a full graph traversal from all valid starting points)
788+
* [ ] write dangling objects to the `.git/lost-found` directory structure
789+
* [ ] `strict` mode, to check for tree objects with `g+w` permissions
790+
* [ ] consider reflog entries from `ref` starting points
791+
* [ ] when reporting reachable objects, provide the path through which they are reachable, i.e. ref-log@{3} -> commit -> tree -> path-in-tree
792+
* [ ] limit search to ODB without alternates (default is equivalent to `git fsck --full` due to ODB implementation)
793+
* [ ] all individual [checks available in `git fsck`](https://git-scm.com/docs/git-fsck#_fsck_messages) (*too many to print here*)
794+
778795
### gix-ref
779796
* [ ] Prepare code for arrival of longer hashes like Sha256. It's part of the [V2 proposal][reftable-v2] but should work for loose refs as well.
780797
* **Stores**

etc/check-package-size.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ function indent () {
1515
}
1616

1717
echo "in root: gitoxide CLI"
18+
(enter gix-fsck && indent cargo diet -n --package-size-limit 10KB)
1819
(enter gix-actor && indent cargo diet -n --package-size-limit 10KB)
1920
(enter gix-archive && indent cargo diet -n --package-size-limit 10KB)
2021
(enter gix-worktree-stream && indent cargo diet -n --package-size-limit 40KB)

gitoxide-core/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ gix-pack-for-configuration-only = { package = "gix-pack", version = "^0.44.0", p
4949
gix-transport-configuration-only = { package = "gix-transport", version = "^0.38.0", path = "../gix-transport", default-features = false }
5050
gix-archive-for-configuration-only = { package = "gix-archive", version = "^0.6.0", path = "../gix-archive", optional = true, features = ["tar", "tar_gz"] }
5151
gix-status = { version = "^0.2.0", path = "../gix-status" }
52+
gix-fsck = { version = "^0.1.0", path = "../gix-fsck" }
5253
serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] }
5354
anyhow = "1.0.42"
5455
thiserror = "1.0.34"

gitoxide-core/src/repository/fsck.rs

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
use anyhow::Context;
2+
use gix::{objs::Kind, ObjectId};
3+
4+
pub fn function(mut repo: gix::Repository, spec: Option<String>, mut out: impl std::io::Write) -> anyhow::Result<()> {
5+
let spec = spec.unwrap_or("HEAD".into());
6+
7+
repo.object_cache_size_if_unset(4 * 1024 * 1024);
8+
// We expect to be finding a bunch of non-existent objects here - never refresh the ODB
9+
repo.objects.refresh_never();
10+
11+
let id = repo
12+
.rev_parse_single(spec.as_str())
13+
.context("Only single revisions are supported")?;
14+
let commits: gix::revision::Walk<'_> = id
15+
.object()?
16+
.peel_to_kind(gix::object::Kind::Commit)
17+
.context("Need commitish as starting point")?
18+
.id()
19+
.ancestors()
20+
.all()?;
21+
22+
let on_missing = |oid: &ObjectId, kind: Kind| {
23+
writeln!(out, "{oid}: {kind}").expect("failed to write output");
24+
};
25+
26+
let mut check = gix_fsck::Connectivity::new(&repo.objects, on_missing);
27+
// Walk all commits, checking each one for connectivity
28+
for commit in commits {
29+
let commit = commit?;
30+
check.check_commit(&commit.id)?;
31+
// Note that we leave parent-iteration to the commits iterator, as it will
32+
// correctly handle shallow repositories which are expected to have the commits
33+
// along the shallow boundary missing.
34+
}
35+
Ok(())
36+
}

gitoxide-core/src/repository/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ pub use clone::function::clone;
3535
pub use fetch::function::fetch;
3636

3737
pub mod commitgraph;
38+
mod fsck;
39+
pub use fsck::function as fsck;
3840
pub mod index;
3941
pub mod mailmap;
4042
pub mod odb;

gix-fsck/CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Changelog
2+
3+
All notable changes to this project will be documented in this file.
4+
5+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

gix-fsck/Cargo.toml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
[package]
2+
name = "gix-fsck"
3+
version = "0.1.0"
4+
repository = "https://github.com/Byron/gitoxide"
5+
authors = ["Cameron Esfahani <[email protected]>", "Sebastian Thiel <[email protected]>"]
6+
license = "MIT OR Apache-2.0"
7+
description = "Verifies the connectivity and validity of objects in the database"
8+
edition = "2021"
9+
include = ["src/**/*", "LICENSE-*"]
10+
rust-version = "1.65"
11+
12+
[lib]
13+
doctest = false
14+
15+
[dependencies]
16+
gix-hash = { version = "^0.13.1", path = "../gix-hash" }
17+
gix-hashtable = { version = "^0.4.0", path = "../gix-hashtable" }
18+
gix-object = { version = "^0.38.0", path = "../gix-object" }
19+
20+
[dev-dependencies]
21+
gix-odb = { path = "../gix-odb" }
22+
gix-testtools = { path = "../tests/tools"}

gix-fsck/LICENSE-APACHE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../LICENSE-APACHE

gix-fsck/LICENSE-MIT

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../LICENSE-MIT

gix-fsck/src/lib.rs

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
//! A library for performing object database integrity and connectivity checks
2+
#![deny(rust_2018_idioms, unsafe_code, missing_docs)]
3+
4+
use gix_hash::ObjectId;
5+
use gix_hashtable::HashSet;
6+
use gix_object::{tree::EntryMode, Exists, FindExt, Kind};
7+
use std::collections::VecDeque;
8+
9+
/// Perform a connectivity check.
10+
pub struct Connectivity<T, F>
11+
where
12+
T: FindExt + Exists,
13+
F: FnMut(&ObjectId, Kind),
14+
{
15+
/// ODB handle to use for the check
16+
db: T,
17+
/// Closure to invoke when a missing object is encountered
18+
missing_cb: F,
19+
/// Set of Object IDs already (or about to be) scanned during the check
20+
seen: HashSet,
21+
/// A buffer to keep a single object at a time.
22+
buf: Vec<u8>,
23+
}
24+
25+
impl<T, F> Connectivity<T, F>
26+
where
27+
T: FindExt + Exists,
28+
F: FnMut(&ObjectId, Kind),
29+
{
30+
/// Instantiate a connectivity check.
31+
pub fn new(db: T, missing_cb: F) -> Connectivity<T, F> {
32+
Connectivity {
33+
db,
34+
missing_cb,
35+
seen: HashSet::default(),
36+
buf: Default::default(),
37+
}
38+
}
39+
40+
/// Run the connectivity check on the provided commit `oid`.
41+
///
42+
/// ### Algorithm
43+
///
44+
/// Walk the trees and blobs referenced by the commit and verify they exist in the ODB.
45+
/// Any objects previously encountered by this instance will be skipped silently.
46+
/// Any referenced trees or blobs that cannot be found in the ODB will result in a call to the `missing_cb`.
47+
/// A missing commit will cause an error to be returned.
48+
/// - TODO: consider how to handle a missing commit (invoke `missing_cb`, or possibly return a Result?)
49+
pub fn check_commit(&mut self, oid: &ObjectId) -> Result<(), gix_object::find::existing_object::Error> {
50+
// Attempt to insert the commit ID in the set, and if already present, return immediately
51+
if !self.seen.insert(*oid) {
52+
return Ok(());
53+
}
54+
// Obtain the commit's tree ID
55+
let tree_id = {
56+
let commit = self.db.find_commit(oid, &mut self.buf)?;
57+
commit.tree()
58+
};
59+
60+
let mut tree_ids = VecDeque::from_iter(Some(tree_id));
61+
while let Some(tree_id) = tree_ids.pop_front() {
62+
if self.seen.insert(tree_id) {
63+
self.check_tree(&tree_id, &mut tree_ids);
64+
}
65+
}
66+
67+
Ok(())
68+
}
69+
70+
/// Blobs are checked right away, trees are stored in `tree_ids` for the parent to iterate them, and only
71+
/// if they have not been `seen` yet.
72+
fn check_tree(&mut self, oid: &ObjectId, tree_ids: &mut VecDeque<ObjectId>) {
73+
let Ok(tree) = self.db.find_tree(oid, &mut self.buf) else {
74+
(self.missing_cb)(oid, Kind::Tree);
75+
return;
76+
};
77+
78+
for entry_ref in tree.entries.iter() {
79+
match entry_ref.mode {
80+
EntryMode::Tree => {
81+
let tree_id = entry_ref.oid.to_owned();
82+
if self.seen.insert(tree_id) {
83+
tree_ids.push_back(tree_id);
84+
}
85+
}
86+
EntryMode::Blob | EntryMode::BlobExecutable | EntryMode::Link => {
87+
let blob_id = entry_ref.oid.to_owned();
88+
if self.seen.insert(blob_id) {
89+
check_blob(&self.db, &blob_id, &mut self.missing_cb);
90+
}
91+
}
92+
EntryMode::Commit => {
93+
// Skip submodules as they are not in this repository!
94+
}
95+
}
96+
}
97+
}
98+
}
99+
100+
fn check_blob<F>(db: impl Exists, oid: &ObjectId, mut missing_cb: F)
101+
where
102+
F: FnMut(&ObjectId, Kind),
103+
{
104+
if !db.exists(oid) {
105+
missing_cb(oid, Kind::Blob);
106+
}
107+
}

gix-fsck/tests/connectivity/mod.rs

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
use gix_fsck::Connectivity;
2+
use gix_hash::ObjectId;
3+
use gix_hashtable::HashMap;
4+
use gix_object::Kind;
5+
use gix_testtools::once_cell::sync::Lazy;
6+
7+
use crate::hex_to_id;
8+
9+
fn check_missing<'a>(repo_name: &str, commits: impl IntoIterator<Item = &'a ObjectId>) -> HashMap<ObjectId, Kind> {
10+
let db = {
11+
let fixture_path = gix_testtools::scripted_fixture_read_only("make_test_repos.sh")
12+
.expect("fixture path")
13+
.join(repo_name)
14+
.join(".git")
15+
.join("objects");
16+
let mut db = gix_odb::at(fixture_path).expect("valid odb");
17+
db.refresh_never();
18+
db
19+
};
20+
21+
let mut missing: HashMap<ObjectId, Kind> = HashMap::default();
22+
let record_missing_and_assert_no_duplicate = |oid: &ObjectId, kind: Kind| {
23+
missing.try_insert(*oid, kind).expect("no duplicate oid");
24+
};
25+
26+
let mut check = Connectivity::new(db, record_missing_and_assert_no_duplicate);
27+
for commit in commits.into_iter() {
28+
check.check_commit(commit).expect("commit is present")
29+
}
30+
missing
31+
}
32+
33+
fn hex_to_ids<'a>(hex_ids: impl IntoIterator<Item = &'a str>) -> Vec<ObjectId> {
34+
hex_ids.into_iter().map(hex_to_id).collect()
35+
}
36+
37+
fn hex_to_objects<'a>(hex_ids: impl IntoIterator<Item = &'a str>, kind: Kind) -> HashMap<ObjectId, Kind> {
38+
hex_to_ids(hex_ids).into_iter().map(|id| (id, kind)).collect()
39+
}
40+
41+
// Get a `&[ObjectId]` holding the ID of each commit in the test fixture repository
42+
fn all_commits() -> &'static [ObjectId] {
43+
static ALL_COMMITS: Lazy<Vec<ObjectId>> = Lazy::new(|| {
44+
hex_to_ids([
45+
"5d18db2e2aabadf7b914435ef34f2faf8b4546dd",
46+
"3a3dfaa55a515f3fb3a25751107bbb523af6a1b0",
47+
"734c926856a328d1168ffd7088532e0d1ad19bbe",
48+
])
49+
});
50+
&ALL_COMMITS
51+
}
52+
53+
#[test]
54+
fn no_missing() {
55+
// The "base" repo is the original, and has every object present
56+
assert_eq!(check_missing("base", all_commits()), HashMap::default());
57+
}
58+
59+
#[test]
60+
fn missing_blobs() {
61+
// The "blobless" repo is cloned with `--filter=blob:none`, and is missing one blob
62+
let expected = hex_to_objects(["c18147dc648481eeb65dc5e66628429a64843327"], Kind::Blob);
63+
assert_eq!(check_missing("blobless", all_commits()), expected);
64+
}
65+
66+
#[test]
67+
fn missing_trees() {
68+
// The "treeless" repo is cloned with `--filter=tree:0`, and is missing two trees
69+
// NOTE: This repo is also missing a blob, but we have no way of knowing that, as the tree referencing it is missing
70+
let expected = hex_to_objects(
71+
[
72+
"9561cfbae43c5e2accdfcd423378588dd10d827f",
73+
"fc264b3b6875a46e9031483aeb9994a1b897ffd3",
74+
],
75+
Kind::Tree,
76+
);
77+
assert_eq!(check_missing("treeless", all_commits()), expected);
78+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
make_test_repos.tar.xz
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#!/bin/bash
2+
set -x
3+
set -euo pipefail
4+
5+
# We override the global config with our own local one (see below)
6+
export GIT_CONFIG_GLOBAL="$PWD/.gitconfig"
7+
8+
# We need to be able to do partial clones, so enable it
9+
# - needs to be present in the persistent gitconfig, as a clone with `--no-local` uses the regular Git transport, whose `git upload-pack` reads this setting
10+
git config --global uploadpack.allowFilter true
11+
12+
# First build out a base repository
13+
git init base
14+
(
15+
cd base
16+
17+
echo "blob 1" > blob-1
18+
git add -A
19+
git commit -m "commit 1"
20+
echo "blob-2" > blob-2
21+
git add -A
22+
git commit -m "commit 2"
23+
git rm blob-1
24+
git add -A
25+
git commit -m "commit 3"
26+
)
27+
28+
# Blobless clone
29+
git clone --no-local --no-hardlinks --filter=blob:none ./base blobless
30+
31+
# Treeless (and blobless) clone
32+
git clone --no-local --no-hardlinks --filter=tree:0 ./base treeless

gix-fsck/tests/fsck.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
use gix_hash::ObjectId;
2+
3+
pub fn hex_to_id(hex: &str) -> ObjectId {
4+
ObjectId::from_hex(hex.as_bytes()).expect("40 bytes hex")
5+
}
6+
7+
mod connectivity;

0 commit comments

Comments
 (0)