diff --git a/Cargo.lock b/Cargo.lock index 35241fddb49..aa134a78ea8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4772,6 +4772,7 @@ dependencies = [ "rustls", "samael", "schemars", + "semver 1.0.17", "serde", "serde_json", "serde_urlencoded", diff --git a/common/src/sql/dbinit.sql b/common/src/sql/dbinit.sql index 7fd37313106..423f55c12f2 100644 --- a/common/src/sql/dbinit.sql +++ b/common/src/sql/dbinit.sql @@ -2510,7 +2510,7 @@ CREATE TABLE omicron.public.switch_port_settings_address_config ( */ CREATE TABLE omicron.public.db_metadata ( - name STRING(63) NOT NULL, + name STRING(63) NOT NULL PRIMARY KEY, value STRING(1023) NOT NULL ); diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index 7296bb4cd0e..412bda01671 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -60,6 +60,7 @@ reqwest = { workspace = true, features = ["json"] } ring.workspace = true samael.workspace = true schemars = { workspace = true, features = ["chrono", "uuid1"] } +semver.workspace = true serde.workspace = true serde_json.workspace = true serde_urlencoded.workspace = true diff --git a/nexus/db-model/src/db_metadata.rs b/nexus/db-model/src/db_metadata.rs new file mode 100644 index 00000000000..9c11d7437c3 --- /dev/null +++ b/nexus/db-model/src/db_metadata.rs @@ -0,0 +1,27 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +use crate::schema::db_metadata; + +/// Internal database metadata +#[derive(Queryable, Insertable, Debug, Clone, Selectable)] +#[diesel(table_name = db_metadata)] +pub struct DbMetadata { + name: String, + value: String, +} + +impl DbMetadata { + pub fn new(name: String, value: String) -> Self { + Self { name, value } + } + + pub fn name(&self) -> &str { + &self.name + } + + pub fn value(&self) -> &str { + &self.value + } +} diff --git a/nexus/db-model/src/lib.rs b/nexus/db-model/src/lib.rs index 88541333596..334dedad9ff 100644 --- a/nexus/db-model/src/lib.rs +++ b/nexus/db-model/src/lib.rs @@ -18,6 +18,7 @@ mod collection; mod console_session; mod dataset; mod dataset_kind; +mod db_metadata; mod device_auth; mod digest; mod disk; @@ -104,6 +105,7 @@ pub use collection::*; pub use console_session::*; pub use dataset::*; pub use dataset_kind::*; +pub use db_metadata::*; pub use device_auth::*; pub use digest::*; pub use disk::*; diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index dcbd9f4c8ae..25be6b67a08 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -6,6 +6,8 @@ //! //! NOTE: Should be kept up-to-date with dbinit.sql. +use omicron_common::api::external::SemverVersion; + table! { disk (id) { id -> Uuid, @@ -1109,6 +1111,17 @@ table! { } } +table! { + db_metadata (name) { + name -> Text, + value -> Text, + } +} + +/// The version of the database schema this particular version of Nexus was +/// built against. +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(1, 0, 0); + allow_tables_to_appear_in_same_query!( system_update, component_update, diff --git a/nexus/db-queries/src/db/datastore/db_metadata.rs b/nexus/db-queries/src/db/datastore/db_metadata.rs new file mode 100644 index 00000000000..10faf808995 --- /dev/null +++ b/nexus/db-queries/src/db/datastore/db_metadata.rs @@ -0,0 +1,36 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! [`DataStore`] methods on Database Metadata. + +use super::DataStore; +use crate::db; +use crate::db::error::public_error_from_diesel_pool; +use crate::db::error::ErrorHandler; +use async_bb8_diesel::AsyncRunQueryDsl; +use diesel::prelude::*; +use omicron_common::api::external::Error; +use omicron_common::api::external::SemverVersion; +use std::str::FromStr; + +impl DataStore { + pub async fn database_schema_version( + &self, + ) -> Result { + use db::schema::db_metadata::dsl; + + let version: String = dsl::db_metadata + .filter(dsl::name.eq("schema_version")) + .select(dsl::value) + .get_result_async(self.pool()) + .await + .map_err(|e| { + public_error_from_diesel_pool(e, ErrorHandler::Server) + })?; + + SemverVersion::from_str(&version).map_err(|e| { + Error::internal_error(&format!("Invalid schema version: {e}")) + }) + } +} diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 882e79f7cb0..7d000ad8bd3 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -37,6 +37,11 @@ use omicron_common::api::external::Error; use omicron_common::api::external::IdentityMetadataCreateParams; use omicron_common::api::external::LookupType; use omicron_common::api::external::ResourceType; +use omicron_common::api::external::SemverVersion; +use omicron_common::backoff::{ + retry_notify, retry_policy_internal_service, BackoffError, +}; +use slog::Logger; use std::net::Ipv6Addr; use std::sync::Arc; use uuid::Uuid; @@ -45,6 +50,7 @@ mod address_lot; mod certificate; mod console_session; mod dataset; +mod db_metadata; mod device_auth; mod disk; mod dns; @@ -131,12 +137,40 @@ pub struct DataStore { // to compilation times; changing a query only requires incremental // recompilation of that query's module instead of all queries on `DataStore`. impl DataStore { - pub fn new(pool: Arc) -> Self { - DataStore { + /// Constructs a new Datastore object. + /// + /// Only returns if the database schema is compatible with Nexus's known + /// schema version. + pub async fn new(log: &Logger, pool: Arc) -> Result { + let datastore = DataStore { pool, virtual_provisioning_collection_producer: crate::provisioning::Producer::new(), - } + }; + + // Keep looping until we find that the schema matches our expectation. + const EXPECTED_VERSION: SemverVersion = SemverVersion::new(1, 0, 0); + retry_notify( + retry_policy_internal_service(), + || async { + match datastore.database_schema_version().await { + Ok(version) => { + if version == nexus_db_model::schema::SCHEMA_VERSION { + return Ok(()); + } + let observed = version.0; + warn!(log, "Incompatible database schema: Saw {observed}, expected {EXPECTED_VERSION}"); + } + Err(e) => { + warn!(log, "Cannot read database schema version: {e}"); + } + }; + return Err(BackoffError::transient(())); + }, + |_, _| {}, + ).await.map_err(|_| "Failed to read valid DB schema".to_string())?; + + Ok(datastore) } pub fn register_producers(&self, registry: &ProducerRegistry) { @@ -248,7 +282,7 @@ pub async fn datastore_test( let cfg = db::Config { url: db.pg_config().clone() }; let pool = Arc::new(db::Pool::new(&logctx.log, &cfg)); - let datastore = Arc::new(DataStore::new(pool)); + let datastore = Arc::new(DataStore::new(&logctx.log, pool).await.unwrap()); // Create an OpContext with the credentials of "db-init" just for the // purpose of loading the built-in users, roles, and assignments. @@ -907,7 +941,8 @@ mod test { let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; let pool = db::Pool::new(&logctx.log, &cfg); - let datastore = DataStore::new(Arc::new(pool)); + let datastore = + DataStore::new(&logctx.log, Arc::new(pool)).await.unwrap(); let explanation = DataStore::get_allocated_regions_query(Uuid::nil()) .explain_async(datastore.pool()) @@ -956,7 +991,8 @@ mod test { let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; let pool = Arc::new(db::Pool::new(&logctx.log, &cfg)); - let datastore = Arc::new(DataStore::new(Arc::clone(&pool))); + let datastore = + Arc::new(DataStore::new(&logctx.log, pool).await.unwrap()); let opctx = OpContext::for_tests(logctx.log.new(o!()), datastore.clone()); diff --git a/nexus/db-queries/src/db/queries/external_ip.rs b/nexus/db-queries/src/db/queries/external_ip.rs index be13775d46c..6b173ff428e 100644 --- a/nexus/db-queries/src/db/queries/external_ip.rs +++ b/nexus/db-queries/src/db/queries/external_ip.rs @@ -851,8 +851,11 @@ mod tests { crate::db::datastore::datastore_test(&logctx, &db).await; let cfg = crate::db::Config { url: db.pg_config().clone() }; let pool = Arc::new(crate::db::Pool::new(&logctx.log, &cfg)); - let db_datastore = - Arc::new(crate::db::DataStore::new(Arc::clone(&pool))); + let db_datastore = Arc::new( + crate::db::DataStore::new(&logctx.log, Arc::clone(&pool)) + .await + .unwrap(), + ); let opctx = OpContext::for_tests(log.new(o!()), db_datastore.clone()); Self { logctx, opctx, db, db_datastore } diff --git a/nexus/db-queries/src/db/queries/vpc_subnet.rs b/nexus/db-queries/src/db/queries/vpc_subnet.rs index c76d8cac917..0e30db7745a 100644 --- a/nexus/db-queries/src/db/queries/vpc_subnet.rs +++ b/nexus/db-queries/src/db/queries/vpc_subnet.rs @@ -444,8 +444,9 @@ mod test { let mut db = test_setup_database(&log).await; let cfg = crate::db::Config { url: db.pg_config().clone() }; let pool = Arc::new(crate::db::Pool::new(&logctx.log, &cfg)); - let db_datastore = - Arc::new(crate::db::DataStore::new(Arc::clone(&pool))); + let db_datastore = Arc::new( + crate::db::DataStore::new(&log, Arc::clone(&pool)).await.unwrap(), + ); // We should be able to insert anything into an empty table. assert!( diff --git a/nexus/db-queries/src/db/saga_recovery.rs b/nexus/db-queries/src/db/saga_recovery.rs index 6c4a539be7b..871ec64e761 100644 --- a/nexus/db-queries/src/db/saga_recovery.rs +++ b/nexus/db-queries/src/db/saga_recovery.rs @@ -329,7 +329,9 @@ mod test { let db = test_setup_database(&log).await; let cfg = crate::db::Config { url: db.pg_config().clone() }; let pool = Arc::new(db::Pool::new(log, &cfg)); - let db_datastore = Arc::new(db::DataStore::new(Arc::clone(&pool))); + let db_datastore = Arc::new( + db::DataStore::new(&log, Arc::clone(&pool)).await.unwrap(), + ); (db, db_datastore) } diff --git a/nexus/src/app/mod.rs b/nexus/src/app/mod.rs index b38cf6954af..dbfbb7d2471 100644 --- a/nexus/src/app/mod.rs +++ b/nexus/src/app/mod.rs @@ -155,7 +155,8 @@ impl Nexus { authz: Arc, ) -> Result, String> { let pool = Arc::new(pool); - let db_datastore = Arc::new(db::DataStore::new(Arc::clone(&pool))); + let db_datastore = + Arc::new(db::DataStore::new(&log, Arc::clone(&pool)).await?); db_datastore.register_producers(&producer_registry); let my_sec_id = db::SecId::from(config.deployment.id); diff --git a/nexus/src/populate.rs b/nexus/src/populate.rs index 5bbc723e3d5..8fa549adcc0 100644 --- a/nexus/src/populate.rs +++ b/nexus/src/populate.rs @@ -384,7 +384,8 @@ mod test { let mut db = test_setup_database(&logctx.log).await; let cfg = db::Config { url: db.pg_config().clone() }; let pool = Arc::new(db::Pool::new(&logctx.log, &cfg)); - let datastore = Arc::new(db::DataStore::new(pool)); + let datastore = + Arc::new(db::DataStore::new(&logctx.log, pool).await.unwrap()); let opctx = OpContext::for_background( logctx.log.clone(), Arc::new(authz::Authz::new(&logctx.log)), @@ -423,9 +424,6 @@ mod test { }) .unwrap(); - info!(&log, "cleaning up database"); - db.cleanup().await.unwrap(); - // Test again with the database offline. In principle we could do this // immediately without creating a new pool and datastore. However, the // pool's default behavior is to wait 30 seconds for a connection, which @@ -439,7 +437,10 @@ mod test { // ServiceUnavailable error, which indicates a transient failure. let pool = Arc::new(db::Pool::new_failfast_for_tests(&logctx.log, &cfg)); - let datastore = Arc::new(db::DataStore::new(pool)); + // We need to create the datastore before tearing down the database, as + // it verifies the schema version of the DB while booting. + let datastore = + Arc::new(db::DataStore::new(&logctx.log, pool).await.unwrap()); let opctx = OpContext::for_background( logctx.log.clone(), Arc::new(authz::Authz::new(&logctx.log)), @@ -447,6 +448,9 @@ mod test { Arc::clone(&datastore), ); + info!(&log, "cleaning up database"); + db.cleanup().await.unwrap(); + info!(&log, "populator {:?}, with database offline", p); match p.populate(&opctx, &datastore, &args).await { Err(Error::ServiceUnavailable { .. }) => (), diff --git a/nexus/tests/integration_tests/initialization.rs b/nexus/tests/integration_tests/initialization.rs index 1b2a7f64a25..6cdd555f4bf 100644 --- a/nexus/tests/integration_tests/initialization.rs +++ b/nexus/tests/integration_tests/initialization.rs @@ -4,6 +4,9 @@ use nexus_test_interface::NexusServer; use nexus_test_utils::{load_test_config, ControlPlaneTestContextBuilder}; +use tokio::time::sleep; +use tokio::time::timeout; +use tokio::time::Duration; #[tokio::test] async fn test_nexus_boots_before_cockroach() { @@ -127,3 +130,148 @@ async fn test_nexus_boots_before_dendrite() { builder.teardown().await; } + +// Helper to ensure we perform the same setup for the positive and negative test +// cases. +async fn nexus_schema_test_setup( + builder: &mut ControlPlaneTestContextBuilder<'_, omicron_nexus::Server>, +) { + builder.start_crdb().await; + builder.start_internal_dns().await; + builder.start_external_dns().await; + builder.start_dendrite().await; + builder.populate_internal_dns().await; +} + +#[tokio::test] +async fn test_nexus_boots_with_valid_schema() { + let mut config = load_test_config(); + + let mut builder = + ControlPlaneTestContextBuilder::::new( + "test_nexus_boots_with_valid_schema", + &mut config, + ); + + nexus_schema_test_setup(&mut builder).await; + + assert!( + timeout(Duration::from_secs(60), builder.start_nexus_internal(),) + .await + .is_ok(), + "Nexus should have started" + ); + + builder.teardown().await; +} + +#[tokio::test] +async fn test_nexus_does_not_boot_without_valid_schema() { + let s = nexus_db_model::schema::SCHEMA_VERSION; + + let schemas_to_test = vec![ + semver::Version::new(s.0.major + 1, s.0.minor, s.0.patch), + semver::Version::new(s.0.major, s.0.minor + 1, s.0.patch), + semver::Version::new(s.0.major, s.0.minor, s.0.patch + 1), + ]; + + for schema in schemas_to_test { + let mut config = load_test_config(); + + let mut builder = + ControlPlaneTestContextBuilder::::new( + "test_nexus_does_not_boot_without_valid_schema", + &mut config, + ); + + nexus_schema_test_setup(&mut builder).await; + + builder.database + .as_ref() + .expect("Should have started CRDB") + .connect() + .await + .expect("Failed to connect to CRDB") + .batch_execute( + &format!( + "UPDATE omicron.public.db_metadata SET value = '{schema}' WHERE name = 'schema_version'" + ) + ) + .await + .expect("Failled to update schema"); + + assert!( + timeout( + std::time::Duration::from_secs(5), + builder.start_nexus_internal(), + ) + .await + .is_err(), + "Nexus should have failed to start" + ); + + builder.teardown().await; + } +} + +#[tokio::test] +async fn test_nexus_does_not_boot_until_schema_updated() { + let good_schema = nexus_db_model::schema::SCHEMA_VERSION; + let bad_schema = semver::Version::new( + good_schema.0.major + 1, + good_schema.0.minor, + good_schema.0.patch, + ); + + let mut config = load_test_config(); + + let mut builder = + ControlPlaneTestContextBuilder::::new( + "test_nexus_does_not_boot_until_schema_updated", + &mut config, + ); + + nexus_schema_test_setup(&mut builder).await; + + let crdb = builder + .database + .as_ref() + .expect("Should have started CRDB") + .connect() + .await + .expect("Failed to connect to CRDB"); + + // Inject a bad schema into the DB. This should mimic the + // "test_nexus_does_not_boot_without_valid_schema" test. + crdb.batch_execute( + &format!( + "UPDATE omicron.public.db_metadata SET value = '{bad_schema}' WHERE name = 'schema_version'" + ) + ) + .await + .expect("Failled to update schema"); + + // Let Nexus attempt to initialize with an invalid version. + // + // However, after a bit, mimic "operator intervention", where the + // DB has been upgraded manually. + tokio::spawn(async move { + sleep(Duration::from_secs(1)).await; + crdb.batch_execute( + &format!( + "UPDATE omicron.public.db_metadata SET value = '{good_schema}' WHERE name = 'schema_version'" + ) + ) + .await + .expect("Failled to update schema"); + }); + + assert!( + timeout(Duration::from_secs(60), builder.start_nexus_internal(),) + .await + .is_ok(), + "Nexus should have started" + ); + + builder.teardown().await; +}