diff --git a/Cargo.toml b/Cargo.toml index 8e89bad..ed6a052 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,20 @@ include = ["src/**/*", "LICENSE.md", "README.md", "CHANGELOG.md"] [lib] test = false +[[test]] +name = "baseline" +path = "tests/baseline.rs" +required-features = ["max-performance"] + +[[test]] +name = "baseline-atomic" +path = "tests/baseline_atomic.rs.rs" +required-features = ["max-performance"] + +[features] +default = [] +max-performance = ["git-repository/max-performance"] + [dependencies] git-repository = { version = "0.28.0", default-features = false, features = ["max-performance-safe", "blocking-network-client", "blocking-http-transport"] } serde = { version = "1", features = ["std", "derive"] } @@ -22,6 +36,8 @@ smartstring = "1.0.1" serde_json = "1" bstr = "1.0.1" thiserror = "1.0.32" +ahash = "0.8.0" +hashbrown = { version = "0.13.1", features = ["raw"] } [dev-dependencies] git-testtools = "0.9.0" diff --git a/Makefile b/Makefile index 3a74066..2bcf093 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,10 @@ CARGO = $(shell command -v cargo) ##@ Development +baseline-atomic: ## run very slow tests that single-step through all commits + GITOXIDE_PACK_CACHE_MEMORY=1g GITOXIDE_OBJECT_CACHE_MEMORY=3g RUST_BACKTRACE=1 cargo test --test baseline_atomic --release --features max-performance -- --nocapture + test: ## run all tests with cargo RUST_BACKTRACE=1 cargo test --test crates-index-diff - RUST_BACKTRACE=1 cargo test --test baseline --release + GITOXIDE_PACK_CACHE_MEMORY=1g RUST_BACKTRACE=1 cargo test --test baseline --release --features max-performance diff --git a/src/index/diff/delegate.rs b/src/index/diff/delegate.rs index 8f43fe8..86090e5 100644 --- a/src/index/diff/delegate.rs +++ b/src/index/diff/delegate.rs @@ -1,14 +1,13 @@ use crate::index::diff::Error; use crate::{Change, CrateVersion}; +use ahash::{AHashSet, RandomState}; use bstr::BStr; use git_repository as git; -use std::collections::BTreeSet; -use std::ops::Range; +use hashbrown::raw::RawTable; #[derive(Default)] pub(crate) struct Delegate { changes: Vec, - deleted_version_ids: BTreeSet, err: Option, } @@ -32,16 +31,18 @@ impl Delegate { if change.location.contains(&b'.') { return Ok(Default::default()); } + match change.event { Addition { entry_mode, id } => { if let Some(obj) = entry_data(entry_mode, id)? { for line in obj.data.lines() { let version = version_from_json_line(line, change.location)?; - self.changes.push(if version.yanked { - Change::Yanked(version) + let change = if version.yanked { + Change::AddedAndYanked(version) } else { Change::Added(version) - }); + }; + self.changes.push(change) } } } @@ -60,93 +61,69 @@ impl Delegate { } Modification { .. } => { if let Some(diff) = change.event.diff().transpose()? { + let mut old_lines = AHashSet::with_capacity(1024); let location = change.location; + for line in diff.old.data.lines() { + // Safety: We transform an &'_ [u8] to and &'static [u8] here + // this is safe because we always drain the hashmap at the end of the function + // the reason the HashMap has a static is that we want to reuse + // the allocation for modifications + old_lines.insert(line); + } - let input = diff.line_tokens(); - let mut err = None; - git::diff::blob::diff( - diff.algo, - &input, - |before: Range, after: Range| { - if err.is_some() { - return; - } - let mut lines_before = input.before - [before.start as usize..before.end as usize] - .iter() - .map(|&line| input.interner[line].as_bstr()) - .peekable(); - let mut lines_after = input.after - [after.start as usize..after.end as usize] - .iter() - .map(|&line| input.interner[line].as_bstr()) - .peekable(); - let mut remember = |version: CrateVersion| { - self.changes.push(if version.yanked { - Change::Yanked(version) - } else { - Change::Added(version) - }); - }; - 'outer: loop { - match (lines_before.peek().is_some(), lines_after.peek().is_some()) - { - (true, false) => { - for removed in lines_before { - match version_from_json_line(removed, location) { - Ok(version) => { - self.deleted_version_ids.insert(version.id()); - } - Err(e) => { - err = Some(e); - break; - } - } - } - break 'outer; - } - (false, true) => { - for inserted in lines_after { - match version_from_json_line(inserted, location) { - Ok(version) => remember(version), - Err(e) => { - err = Some(e); - break; - } - } - } - break 'outer; - } - (true, true) => { - for (removed, inserted) in - lines_before.by_ref().zip(lines_after.by_ref()) - { - match version_from_json_line(inserted, location) - .and_then(|inserted| { - version_from_json_line(removed, location) - .map(|removed| (removed, inserted)) - }) { - Ok((removed_version, inserted_version)) => { - if removed_version.yanked - != inserted_version.yanked - { - remember(inserted_version); - } - } - Err(e) => { - err = Some(e); - break; - } - } - } - } - (false, false) => break 'outer, - } + // A RawTable is used to represent a Checksum -> CrateVersion map + // because the checksum is already stored in the CrateVersion + // and we want to avoid storing the checksum twice for performance reasons + let mut new_versions = RawTable::with_capacity(old_lines.len().min(1024)); + let hasher = RandomState::new(); + + for line in diff.new.data.lines() { + // first quickly check if the exact same line is already present in this file in that case we don't need to do anything else + if old_lines.remove(line) { + continue; + } + // no need to check if the checksum already exists in the hashmap + // as each checksum appear only once + let new_version = version_from_json_line(line, location)?; + new_versions.insert( + hasher.hash_one(new_version.checksum), + new_version, + |rehashed| hasher.hash_one(rehashed.checksum), + ); + } + + let mut deleted = Vec::new(); + for line in old_lines.drain() { + let old_version = version_from_json_line(line, location)?; + let new_version = new_versions + .remove_entry(hasher.hash_one(old_version.checksum), |version| { + version.checksum == old_version.checksum + }); + match new_version { + Some(new_version) => { + let change = match (old_version.yanked, new_version.yanked) { + (true, false) => Change::Unyanked(new_version), + (false, true) => Change::Yanked(new_version), + _ => continue, + }; + self.changes.push(change) } - }, - ); - if let Some(err) = err { - return Err(err); + None => deleted.push(old_version), + } + } + if !deleted.is_empty() { + self.changes.push(Change::Deleted { + name: deleted[0].name.to_string(), + versions: deleted, + }) + } + for version in new_versions.drain() { + let change = if version.yanked { + Change::AddedAndYanked(version) + } else { + Change::Added(version) + }; + self.changes.push(change); } } } @@ -154,21 +131,10 @@ impl Delegate { Ok(Default::default()) } - pub fn into_result(mut self) -> Result, Error> { + pub fn into_result(self) -> Result, Error> { match self.err { Some(err) => Err(err), - None => { - if !self.deleted_version_ids.is_empty() { - let deleted_version_ids = &self.deleted_version_ids; - self.changes.retain(|change| match change { - Change::Added(v) | Change::Yanked(v) => { - !deleted_version_ids.contains(&v.id()) - } - Change::Deleted { .. } => true, - }) - } - Ok(self.changes) - } + None => Ok(self.changes), } } } diff --git a/src/index/diff/mod.rs b/src/index/diff/mod.rs index 5a09bff..f0a9674 100644 --- a/src/index/diff/mod.rs +++ b/src/index/diff/mod.rs @@ -164,6 +164,16 @@ impl Index { /// Similar to `changes()`, but requires `from` and `to` objects to be provided. They may point /// to either `Commit`s or `Tree`s. + /// + /// # Returns + /// + /// A list of atomic chanes that were performed on the index + /// between the two revisions. + /// The changes are grouped by the crate they belong to. + /// The order of the changes for each crate are **non-deterministic**. + /// The order of crates is also **non-deterministic**. + /// + /// If a specific order is required, the changes must be sorted by the caller. pub fn changes_between_commits( &self, from: impl Into, @@ -237,6 +247,16 @@ impl Index { /// Learn more about specifying revisions /// in the /// [official documentation](https://www.kernel.org/pub/software/scm/git/docs/gitrevisions.html) + /// + /// # Returns + /// + /// A list of atomic chanes that were performed on the index + /// between the two revisions. + /// The changes are grouped by the crate they belong to. + /// The order of the changes for each crate are **non-deterministic**. + /// The order of crates is also **non-deterministic**. + /// + /// If a specific order is required, the changes must be sorted by the caller. pub fn changes( &self, from: impl AsRef, diff --git a/src/types.rs b/src/types.rs index 83526c0..8e8a3fa 100644 --- a/src/types.rs +++ b/src/types.rs @@ -2,8 +2,8 @@ use std::collections::HashMap; use git_repository as git; use smartstring::alias::String as SmolString; -use std::fmt; -use std::hash::{Hash, Hasher}; +use std::hash::Hash; +use std::{fmt, slice}; /// A wrapper for a repository of the crates.io index. pub struct Index { @@ -22,8 +22,15 @@ pub struct Index { /// Identify a kind of change that occurred to a crate #[derive(Clone, Eq, PartialEq, Debug)] pub enum Change { - /// A crate version was added or it was unyanked. + /// A crate version was added. Added(CrateVersion), + /// A crate version was unyanked. + Unyanked(CrateVersion), + /// A crate version was added in a yanked state. + /// + /// This can happen if we don't see the commit that added them, so it appears to pop into existence yanked. + /// Knowing this should help to trigger the correct action, as simply `Yanked` crates would be treated quite differently. + AddedAndYanked(CrateVersion), /// A crate version was yanked. Yanked(CrateVersion), /// The name of the crate whose file was deleted, which implies all versions were deleted as well. @@ -39,7 +46,7 @@ impl Change { /// Return the added crate, if this is this kind of change. pub fn added(&self) -> Option<&CrateVersion> { match self { - Change::Added(v) => Some(v), + Change::Added(v) | Change::AddedAndYanked(v) => Some(v), _ => None, } } @@ -47,7 +54,15 @@ impl Change { /// Return the yanked crate, if this is this kind of change. pub fn yanked(&self) -> Option<&CrateVersion> { match self { - Change::Yanked(v) => Some(v), + Change::Yanked(v) | Change::AddedAndYanked(v) => Some(v), + _ => None, + } + } + + /// Return the unyanked crate, if this is this kind of change. + pub fn unyanked(&self) -> Option<&CrateVersion> { + match self { + Change::Unyanked(v) => Some(v), _ => None, } } @@ -59,6 +74,21 @@ impl Change { _ => None, } } + + /// Returns all versions affected by this change. + /// + /// The returned slice usually has length 1. + /// However, if a crate was purged from the index by an admin, + /// all versions of the purged crate are returned. + pub fn versions(&self) -> &[CrateVersion] { + match self { + Change::Added(v) + | Change::Unyanked(v) + | Change::AddedAndYanked(v) + | Change::Yanked(v) => slice::from_ref(v), + Change::Deleted { versions, .. } => versions, + } + } } impl fmt::Display for Change { @@ -70,6 +100,8 @@ impl fmt::Display for Change { Change::Added(_) => "added", Change::Yanked(_) => "yanked", Change::Deleted { .. } => "deleted", + Change::Unyanked(_) => "unyanked", + Change::AddedAndYanked(_) => "added and yanked", } ) } @@ -109,17 +141,6 @@ pub struct CrateVersion { pub dependencies: Vec, } -impl CrateVersion { - pub(crate) fn id(&self) -> u64 { - let mut s = std::collections::hash_map::DefaultHasher::new(); - self.name.hash(&mut s); - self.yanked.hash(&mut s); - self.version.hash(&mut s); - self.checksum.hash(&mut s); - s.finish() - } -} - /// A single dependency of a specific crate version #[derive( Clone, serde::Serialize, serde::Deserialize, Ord, PartialOrd, Eq, PartialEq, Debug, Hash, diff --git a/tests/baseline.rs b/tests/baseline.rs index 14a3ddc..e866f52 100644 --- a/tests/baseline.rs +++ b/tests/baseline.rs @@ -1,72 +1,8 @@ -use std::collections::HashSet; +use crate::shared::Step; +mod shared; +#[cfg_attr(debug_assertions, ignore)] #[test] -fn all_aggregrated_diffs_equal_latest_version( -) -> Result<(), Box> { - let ((expected, baseline_duration), (actual, diff_duration)) = std::thread::scope( - |scope| -> Result<_, Box> { - let baseline = scope.spawn(|| -> Result<_, crates_index::Error> { - let index = crates_index::Index::new_cargo_default()?; - let start = std::time::Instant::now(); - let mut versions = HashSet::new(); - for krate in index.crates() { - for version in krate.versions() { - versions.insert(version.checksum().to_owned()); - } - } - Ok((versions, start.elapsed())) - }); - - let actual = scope.spawn(|| -> Result<_, Box> { - use crates_index_diff::git; - - let start = std::time::Instant::now(); - let repo_path = crates_index::Index::new_cargo_default()?.path().to_owned(); - let index = crates_index_diff::Index::from_path_or_cloned(repo_path)?; - let changes = index.changes_between_commits( - git::hash::ObjectId::empty_tree(git::hash::Kind::Sha1), - index - .repository() - .find_reference("refs/remotes/origin/HEAD")? - .target() - .id(), - )?; - - use crates_index_diff::Change::*; - let mut versions = HashSet::new(); - - for change in changes { - match change { - Added(v) | Yanked(v) => { - versions.insert(v.checksum.to_owned()); - } - Deleted { - versions: deleted, .. - } => { - for deleted_version in deleted { - versions.remove(&deleted_version.checksum); - } - } - } - } - Ok((versions, start.elapsed())) - }); - - Ok(( - baseline.join().expect("no panic")?, - actual.join().expect("no panic")?, - )) - }, - )?; - - dbg!(baseline_duration, expected.len()); - dbg!(diff_duration, actual.len()); - assert_eq!( - actual.len(), - expected.len(), - "aggregated of all changes produces the final result" - ); - assert!(actual.eq(&expected), "actual should be exactly the same"); - - Ok(()) +fn big_steps() -> Result<(), Box> { + shared::baseline(Step::Partitioned { size: 4 }) } diff --git a/tests/baseline_atomic.rs b/tests/baseline_atomic.rs new file mode 100644 index 0000000..a02bfa8 --- /dev/null +++ b/tests/baseline_atomic.rs @@ -0,0 +1,9 @@ +use crate::shared::Step; + +mod shared; + +#[cfg_attr(debug_assertions, ignore)] +#[test] +fn one_per_commit() -> Result<(), Box> { + shared::baseline(Step::OnePerCommit) +} diff --git a/tests/fixtures/generated-archives/make-index-from-parts.tar.xz b/tests/fixtures/generated-archives/make-index-from-parts.tar.xz index cd599bd..d14b8dd 100644 Binary files a/tests/fixtures/generated-archives/make-index-from-parts.tar.xz and b/tests/fixtures/generated-archives/make-index-from-parts.tar.xz differ diff --git a/tests/index/changes_between_commits.rs b/tests/index/changes_between_commits.rs index 76236ac..78029c7 100644 --- a/tests/index/changes_between_commits.rs +++ b/tests/index/changes_between_commits.rs @@ -17,15 +17,16 @@ fn directory_deletions_are_not_picked_up() -> crate::Result { fn updates_before_yanks_are_picked_up() -> crate::Result { let index = index_ro()?; let repo = index.repository(); - let changes = index.changes_between_commits( + let mut changes = index.changes_between_commits( repo.rev_parse_single("@^{/updating ansi-color-codec 0.3.11}~1")?, repo.rev_parse_single("@^{/yanking ansi-color-codec 0.3.5}")?, )?; assert_eq!(changes.len(), 3, "1 update and 2 yanks"); - assert_eq!(changes[0].yanked().expect("second yanked").version, "0.3.4"); - assert_eq!(changes[1].yanked().expect("third yanked").version, "0.3.5"); - assert_eq!(changes[2].added().expect("first updated").version, "0.3.11"); + changes.sort_by_key(|change| change.versions()[0].version.clone()); + assert_eq!(changes[1].yanked().expect("second yanked").version, "0.3.4"); + assert_eq!(changes[2].yanked().expect("third yanked").version, "0.3.5"); + assert_eq!(changes[0].added().expect("first updated").version, "0.3.11"); Ok(()) } @@ -97,13 +98,13 @@ fn yanked_in_new_file() -> crate::Result { } #[test] -fn unyanked_crates_recognized_as_added() -> crate::Result { +fn unyanked_crates_recognized() -> crate::Result { let changes = changes(index_ro()?, ":/Unyanking crate `git2mail#0.3.2`")?; assert_eq!(changes.len(), 1); assert_eq!( changes .first() - .and_then(|c| c.added().map(|v| v.name.as_str())), + .and_then(|c| c.unyanked().map(|v| v.name.as_str())), Some("git2mail") ); Ok(()) diff --git a/tests/shared/mod.rs b/tests/shared/mod.rs new file mode 100644 index 0000000..3f9af49 --- /dev/null +++ b/tests/shared/mod.rs @@ -0,0 +1,115 @@ +use crates_index_diff::Change::*; +use std::collections::HashSet; + +#[allow(dead_code)] +pub enum Step { + Partitioned { size: usize }, + OnePerCommit, +} + +pub fn baseline(mode: Step) -> Result<(), Box> { + let ((expected, baseline_duration), (actual, diff_duration)) = std::thread::scope( + |scope| -> Result<_, Box> { + let baseline = scope.spawn(|| -> Result<_, crates_index::Error> { + let index = crates_index::Index::new_cargo_default()?; + let start = std::time::Instant::now(); + let mut versions = HashSet::new(); + for krate in index.crates() { + for version in krate.versions() { + versions.insert(version.checksum().to_owned()); + } + } + Ok((versions, start.elapsed())) + }); + let actual = scope.spawn(|| -> Result<_, Box> { + use crates_index_diff::git; + + let start = std::time::Instant::now(); + let repo_path = crates_index::Index::new_cargo_default()?.path().to_owned(); + let index = crates_index_diff::Index::from_path_or_cloned(repo_path)?; + let repo = index.repository(); + let head = repo.find_reference("refs/remotes/origin/HEAD")?.id(); + let commits = head + .ancestors() + .first_parent_only() + .all()? + .map(|id| id.map(|id| id.detach())) + .collect::, _>>()?; + + // This could be more complex, like jumping to landmarks like 'Delete crate(s)' and so forth. + let partitions = match mode { + Step::Partitioned { size } => size, + Step::OnePerCommit => commits.len(), + }; + let chunk_size = (commits.len() / partitions).max(1); + let mut steps = (0..commits.len()).step_by(chunk_size).collect::>(); + if *steps.last().expect("at least 1") != commits.len() - 1 { + steps.push(commits.len() - 1); + } + + let mut versions = HashSet::default(); + let mut previous = None; + let num_steps = steps.len(); + for (step, current) in steps + .into_iter() + .rev() + .map(|idx| commits[idx].to_owned()) + .enumerate() + { + let old = previous + .unwrap_or_else(|| git::hash::ObjectId::empty_tree(git::hash::Kind::Sha1)); + previous = Some(current); + + let start = std::time::Instant::now(); + let changes = index.changes_between_commits(old, current)?; + let num_changes = changes.len(); + for change in changes { + match change { + Added(v) | AddedAndYanked(v) => { + // found a new crate, add it to the index + versions.insert(v.checksum.to_owned()); + } + Unyanked(v) | Yanked(v) => { + // yanked/unyanked crates must be part of the index + assert!(versions.contains(&v.checksum)) + } + Deleted { + versions: deleted, .. + } => { + // delete a yanked crate + for deleted_version in deleted { + versions.remove(&deleted_version.checksum); + } + } + } + } + let elapsed = start.elapsed().as_secs_f32(); + eprintln!( + "Step {} / {} and {} change(s) took {:.02}s ({:.0} changes/s)", + step, + num_steps, + num_changes, + elapsed, + num_changes as f32 / elapsed + ); + } + Ok((versions, start.elapsed())) + }); + + Ok(( + baseline.join().expect("no panic")?, + actual.join().expect("no panic")?, + )) + }, + )?; + + dbg!(baseline_duration, expected.len()); + dbg!(diff_duration, actual.len()); + assert_eq!( + actual.len(), + expected.len(), + "aggregated of all changes produces the final result" + ); + assert!(actual.eq(&expected), "actual should be exactly the same"); + Ok(()) +}