From 181479cf580520a9d5935c4258f34f9226899226 Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Tue, 28 May 2024 16:55:22 +0200 Subject: [PATCH 1/2] Create `db-dump.zip` file too Zip files use compression per file, which allows users to only extract the data that they need, instead of needlessly extracting the full tarball to read the small table that they are interested in. --- Cargo.lock | 75 ++++++++++++++++++++++ Cargo.toml | 1 + deny.toml | 1 + src/storage.rs | 6 +- src/tests/dump_db.rs | 37 ++++++++++- src/worker/jobs/dump_db.rs | 127 +++++++++++++++++++++++++++---------- 6 files changed, 209 insertions(+), 38 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 71ecf2a2e6a..39266ccbe19 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -183,6 +183,15 @@ version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" +[[package]] +name = "arbitrary" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110" +dependencies = [ + "derive_arbitrary", +] + [[package]] name = "async-compression" version = "0.4.11" @@ -1029,6 +1038,7 @@ dependencies = [ "typomania", "unicode-xid", "url", + "zip", ] [[package]] @@ -1410,6 +1420,17 @@ dependencies = [ "powerfmt", ] +[[package]] +name = "derive_arbitrary" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67e77553c4162a157adbf834ebae5b415acbecbeafc7a74b0e886657506a7611" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.66", +] + [[package]] name = "derive_builder" version = "0.20.0" @@ -1544,6 +1565,17 @@ dependencies = [ "subtle", ] +[[package]] +name = "displaydoc" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.66", +] + [[package]] name = "doc-comment" version = "0.3.3" @@ -2596,6 +2628,12 @@ dependencies = [ "scopeguard", ] +[[package]] +name = "lockfree-object-pool" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9374ef4228402d4b7e403e5838cb880d9ee663314b0a900d5a6aabf0c213552e" + [[package]] name = "log" version = "0.4.21" @@ -4093,6 +4131,12 @@ dependencies = [ "rand_core", ] +[[package]] +name = "simd-adler32" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" + [[package]] name = "similar" version = "2.5.0" @@ -5269,6 +5313,37 @@ version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" +[[package]] +name = "zip" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dd56a4d5921bc2f99947ac5b3abe5f510b1be7376fdc5e9fce4a23c6a93e87c" +dependencies = [ + "arbitrary", + "crc32fast", + "crossbeam-utils", + "displaydoc", + "flate2", + "indexmap", + "memchr", + "thiserror", + "zopfli", +] + +[[package]] +name = "zopfli" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5019f391bac5cf252e93bbcc53d039ffd62c7bfb7c150414d61369afe57e946" +dependencies = [ + "bumpalo", + "crc32fast", + "lockfree-object-pool", + "log", + "once_cell", + "simd-adler32", +] + [[package]] name = "zstd" version = "0.13.1" diff --git a/Cargo.toml b/Cargo.toml index 257109f0753..fd2edcb9a2d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -120,6 +120,7 @@ tracing-subscriber = { version = "=0.3.18", features = ["env-filter"] } typomania = { version = "=0.1.2", default-features = false } url = "=2.5.0" unicode-xid = "=0.2.4" +zip = { version = "=2.1.1", default-features = false, features = ["deflate"] } [dev-dependencies] bytes = "=1.6.0" diff --git a/deny.toml b/deny.toml index 42c125915f4..75f2e61d858 100644 --- a/deny.toml +++ b/deny.toml @@ -100,6 +100,7 @@ allow = [ #"Apache-2.0 WITH LLVM-exception", "BSD-2-Clause", "BSD-3-Clause", + "BSL-1.0", "ISC", "MIT", "MPL-2.0", diff --git a/src/storage.rs b/src/storage.rs index cd7bb81055a..ef254da5017 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -19,7 +19,8 @@ const PREFIX_CRATES: &str = "crates"; const PREFIX_READMES: &str = "readmes"; const DEFAULT_REGION: &str = "us-west-1"; const CONTENT_TYPE_CRATE: &str = "application/gzip"; -const CONTENT_TYPE_DB_DUMP: &str = "application/gzip"; +const CONTENT_TYPE_GZIP: &str = "application/gzip"; +const CONTENT_TYPE_ZIP: &str = "application/zip"; const CONTENT_TYPE_INDEX: &str = "text/plain"; const CONTENT_TYPE_README: &str = "text/html"; const CACHE_CONTROL_IMMUTABLE: &str = "public,max-age=31536000,immutable"; @@ -126,7 +127,8 @@ impl Storage { // The `BufWriter::new()` API currently does not allow // specifying any file attributes, so we need to set the // content type here instead for the database dump upload. - .with_content_type_for_suffix("gz", CONTENT_TYPE_DB_DUMP); + .with_content_type_for_suffix("gz", CONTENT_TYPE_GZIP) + .with_content_type_for_suffix("zip", CONTENT_TYPE_ZIP); let store = build_s3(default, options); diff --git a/src/tests/dump_db.rs b/src/tests/dump_db.rs index 71ccbf959a8..f42ce881e72 100644 --- a/src/tests/dump_db.rs +++ b/src/tests/dump_db.rs @@ -9,7 +9,7 @@ use insta::{assert_debug_snapshot, assert_snapshot}; use once_cell::sync::Lazy; use regex::Regex; use secrecy::ExposeSecret; -use std::io::Read; +use std::io::{Cursor, Read}; use tar::Archive; static PATH_DATE_RE: Lazy = Lazy::new(|| Regex::new(r"^\d{4}-\d{2}-\d{2}-\d{6}").unwrap()); @@ -28,8 +28,9 @@ async fn test_dump_db_job() { app.run_pending_background_jobs().await; let stored_files = app.stored_files().await; - assert_eq!(stored_files.len(), 1); + assert_eq!(stored_files.len(), 2); assert_eq!(stored_files[0], "db-dump.tar.gz"); + assert_eq!(stored_files[1], "db-dump.zip"); let path = object_store::path::Path::parse("db-dump.tar.gz").unwrap(); let result = app.as_inner().storage.as_inner().get(&path).await.unwrap(); @@ -65,6 +66,38 @@ async fn test_dump_db_job() { "YYYY-MM-DD-HHMMSS/data/version_downloads.csv", ] "###); + + let path = object_store::path::Path::parse("db-dump.zip").unwrap(); + let result = app.as_inner().storage.as_inner().get(&path).await.unwrap(); + let bytes = result.bytes().await.unwrap(); + + let archive = zip::ZipArchive::new(Cursor::new(bytes)).unwrap(); + let zip_paths = archive.file_names().collect::>(); + assert_debug_snapshot!(zip_paths, @r###" + [ + "README.md", + "export.sql", + "import.sql", + "metadata.json", + "schema.sql", + "data/", + "data/categories.csv", + "data/crate_downloads.csv", + "data/crates.csv", + "data/keywords.csv", + "data/metadata.csv", + "data/reserved_crate_names.csv", + "data/teams.csv", + "data/users.csv", + "data/crates_categories.csv", + "data/crates_keywords.csv", + "data/crate_owners.csv", + "data/versions.csv", + "data/default_versions.csv", + "data/dependencies.csv", + "data/version_downloads.csv", + ] + "###); } fn tar_paths(archive: &mut Archive) -> Vec { diff --git a/src/worker/jobs/dump_db.rs b/src/worker/jobs/dump_db.rs index b1b13a93ce3..d6cd247e56d 100644 --- a/src/worker/jobs/dump_db.rs +++ b/src/worker/jobs/dump_db.rs @@ -6,6 +6,7 @@ use crates_io_worker::BackgroundJob; use std::fs::{self, File}; use std::path::{Path, PathBuf}; use std::sync::Arc; +use zip::write::SimpleFileOptions; #[derive(Clone, Serialize, Deserialize)] pub struct DumpDb { @@ -28,38 +29,56 @@ impl BackgroundJob for DumpDb { /// Create CSV dumps of the public information in the database, wrap them in a /// tarball and upload to S3. async fn run(&self, env: Self::Context) -> anyhow::Result<()> { - let target_name = "db-dump.tar.gz"; + const TAR_PATH: &str = "db-dump.tar.gz"; + const ZIP_PATH: &str = "db-dump.zip"; + let database_url = self.database_url.clone(); - let tarball = spawn_blocking(move || { + let (tarball, zip) = spawn_blocking(move || { let directory = DumpDirectory::create()?; - info!("Begin exporting database"); + info!("Exporting database…"); directory.populate(&database_url)?; let export_dir = directory.path(); - info!(path = ?export_dir, "Creating tarball"); - let prefix = PathBuf::from(directory.timestamp.format("%Y-%m-%d-%H%M%S").to_string()); - create_tarball(export_dir, &prefix) + info!(path = ?export_dir, "Creating tarball…"); + let tarball_prefix = + PathBuf::from(directory.timestamp.format("%Y-%m-%d-%H%M%S").to_string()); + create_archives(export_dir, &tarball_prefix) }) .await?; - info!("Uploading tarball"); - env.storage - .upload_db_dump(target_name, tarball.path()) - .await?; + info!("Uploading tarball…"); + env.storage.upload_db_dump(TAR_PATH, tarball.path()).await?; info!("Database dump tarball uploaded"); - info!("Invalidating CDN caches"); + info!("Invalidating CDN caches…"); + if let Some(cloudfront) = env.cloudfront() { + if let Err(error) = cloudfront.invalidate(TAR_PATH).await { + warn!("Failed to invalidate CloudFront cache: {}", error); + } + } + + if let Some(fastly) = env.fastly() { + if let Err(error) = fastly.invalidate(TAR_PATH).await { + warn!("Failed to invalidate Fastly cache: {}", error); + } + } + + info!("Uploading zip file…"); + env.storage.upload_db_dump(ZIP_PATH, zip.path()).await?; + info!("Database dump zip file uploaded"); + + info!("Invalidating CDN caches…"); if let Some(cloudfront) = env.cloudfront() { - if let Err(error) = cloudfront.invalidate(target_name).await { - warn!("failed to invalidate CloudFront cache: {}", error); + if let Err(error) = cloudfront.invalidate(ZIP_PATH).await { + warn!("Failed to invalidate CloudFront cache: {}", error); } } if let Some(fastly) = env.fastly() { - if let Err(error) = fastly.invalidate(target_name).await { - warn!("failed to invalidate Fastly cache: {}", error); + if let Err(error) = fastly.invalidate(ZIP_PATH).await { + warn!("Failed to invalidate Fastly cache: {}", error); } } @@ -202,15 +221,22 @@ pub fn run_psql(script: &Path, database_url: &str) -> anyhow::Result<()> { Ok(()) } -fn create_tarball(export_dir: &Path, prefix: &Path) -> anyhow::Result { - debug!("Creating tarball file"); - let tempfile = tempfile::NamedTempFile::new()?; - let encoder = flate2::write::GzEncoder::new(tempfile.as_file(), flate2::Compression::default()); +fn create_archives( + export_dir: &Path, + tarball_prefix: &Path, +) -> anyhow::Result<(tempfile::NamedTempFile, tempfile::NamedTempFile)> { + debug!("Creating tarball file…"); + let tar_tempfile = tempfile::NamedTempFile::new()?; + let encoder = + flate2::write::GzEncoder::new(tar_tempfile.as_file(), flate2::Compression::default()); + let mut tar = tar::Builder::new(encoder); - let mut archive = tar::Builder::new(encoder); + debug!("Creating zip file…"); + let zip_tempfile = tempfile::NamedTempFile::new()?; + let mut zip = zip::ZipWriter::new(zip_tempfile.as_file()); - debug!(path = ?prefix, "Appending directory to tarball"); - archive.append_dir(prefix, export_dir)?; + debug!("Appending `{tarball_prefix:?}` directory to tarball…"); + tar.append_dir(tarball_prefix, export_dir)?; // Append readme, metadata, schemas. let mut paths = Vec::new(); @@ -224,9 +250,13 @@ fn create_tarball(export_dir: &Path, prefix: &Path) -> anyhow::Result anyhow::Result>(); + assert_debug_snapshot!(zip_paths, @r###" + [ + "README.md", + "data/", + "data/crates.csv", + "data/users.csv", + "data/crate_owners.csv", + ] + "###); } } From c137ccce21640f7e0f1a1f7cf235d368de8adc02 Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Wed, 29 May 2024 11:41:56 +0200 Subject: [PATCH 2/2] dump_db: Extract `Archives` struct --- src/worker/jobs/dump_db.rs | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/worker/jobs/dump_db.rs b/src/worker/jobs/dump_db.rs index d6cd247e56d..8a273c6a1d9 100644 --- a/src/worker/jobs/dump_db.rs +++ b/src/worker/jobs/dump_db.rs @@ -34,7 +34,7 @@ impl BackgroundJob for DumpDb { let database_url = self.database_url.clone(); - let (tarball, zip) = spawn_blocking(move || { + let archives = spawn_blocking(move || { let directory = DumpDirectory::create()?; info!("Exporting database…"); @@ -49,7 +49,9 @@ impl BackgroundJob for DumpDb { .await?; info!("Uploading tarball…"); - env.storage.upload_db_dump(TAR_PATH, tarball.path()).await?; + env.storage + .upload_db_dump(TAR_PATH, archives.tar.path()) + .await?; info!("Database dump tarball uploaded"); info!("Invalidating CDN caches…"); @@ -66,7 +68,9 @@ impl BackgroundJob for DumpDb { } info!("Uploading zip file…"); - env.storage.upload_db_dump(ZIP_PATH, zip.path()).await?; + env.storage + .upload_db_dump(ZIP_PATH, archives.zip.path()) + .await?; info!("Database dump zip file uploaded"); info!("Invalidating CDN caches…"); @@ -221,10 +225,12 @@ pub fn run_psql(script: &Path, database_url: &str) -> anyhow::Result<()> { Ok(()) } -fn create_archives( - export_dir: &Path, - tarball_prefix: &Path, -) -> anyhow::Result<(tempfile::NamedTempFile, tempfile::NamedTempFile)> { +struct Archives { + tar: tempfile::NamedTempFile, + zip: tempfile::NamedTempFile, +} + +fn create_archives(export_dir: &Path, tarball_prefix: &Path) -> anyhow::Result { debug!("Creating tarball file…"); let tar_tempfile = tempfile::NamedTempFile::new()?; let encoder = @@ -293,7 +299,10 @@ fn create_archives( drop(tar); zip.finish()?; - Ok((tar_tempfile, zip_tempfile)) + Ok(Archives { + tar: tar_tempfile, + zip: zip_tempfile, + }) } mod configuration; @@ -321,8 +330,8 @@ mod tests { fs::write(p.join("data").join("crate_owners.csv"), "").unwrap(); fs::write(p.join("data").join("users.csv"), "").unwrap(); - let (tarball, zip) = create_archives(p, &PathBuf::from("0000-00-00")).unwrap(); - let gz = GzDecoder::new(File::open(tarball.path()).unwrap()); + let archives = create_archives(p, &PathBuf::from("0000-00-00")).unwrap(); + let gz = GzDecoder::new(File::open(archives.tar.path()).unwrap()); let mut tar = Archive::new(gz); let entries = tar.entries().unwrap(); @@ -341,7 +350,7 @@ mod tests { ] "###); - let file = File::open(zip.path()).unwrap(); + let file = File::open(archives.zip.path()).unwrap(); let reader = BufReader::new(file); let archive = zip::ZipArchive::new(reader).unwrap();