Skip to content

Commit 3611b42

Browse files
committed
Refactor zone update readiness check
1 parent 959e1e4 commit 3611b42

File tree

3 files changed

+120
-83
lines changed

3 files changed

+120
-83
lines changed

nexus/reconfigurator/planning/src/blueprint_builder/builder.rs

Lines changed: 39 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1947,34 +1947,48 @@ impl<'a> BlueprintBuilder<'a> {
19471947
) -> BlueprintZoneImageSource {
19481948
let new_repo = self.input.tuf_repo();
19491949
let old_repo = self.input.old_repo();
1950-
let new_artifact = Self::zone_image_artifact(new_repo, zone_kind);
1951-
let old_artifact = Self::zone_image_artifact(old_repo, zone_kind);
1950+
Self::zone_image_artifact(
1951+
if self.zone_is_ready_for_update(zone_kind, new_repo) {
1952+
new_repo
1953+
} else {
1954+
old_repo
1955+
},
1956+
zone_kind,
1957+
)
1958+
}
1959+
1960+
/// Return `true` iff a zone of the given kind is ready to be updated;
1961+
/// i.e., its dependencies have been updated, or its data sufficiently
1962+
/// replicated, etc.
1963+
fn zone_is_ready_for_update(
1964+
&self,
1965+
zone_kind: ZoneKind,
1966+
new_repo: Option<&TufRepoDescription>,
1967+
) -> bool {
19521968
match OrderedComponent::from(zone_kind) {
1953-
// Nexus can only be updated if all non-Nexus zones have been updated.
1954-
OrderedComponent::NexusZone => {
1955-
if self.sled_ids_with_zones().any(|sled_id| {
1956-
self.current_sled_zones(
1957-
sled_id,
1958-
BlueprintZoneDisposition::is_in_service,
1959-
)
1960-
.filter(|z| {
1961-
OrderedComponent::from(z.zone_type.kind())
1962-
== OrderedComponent::NonNexusOmicronZone
1963-
})
1964-
.any(|z| z.image_source != new_artifact)
1965-
}) {
1966-
// Some dependent zone is not up-to-date.
1967-
old_artifact
1968-
} else {
1969-
// All dependent zones are up-to-date.
1970-
new_artifact
1971-
}
1972-
}
1973-
// It's always safe to use the new artifact for non-Nexus zones.
1974-
OrderedComponent::NonNexusOmicronZone => new_artifact,
19751969
OrderedComponent::HostOs | OrderedComponent::SpRot => {
1976-
unreachable!("can't get an OS or SP/RoT image from a zone")
1970+
todo!("can't yet update Host OS or SP/RoT")
19771971
}
1972+
OrderedComponent::OmicronZone(kind) => match kind {
1973+
ZoneKind::Nexus => {
1974+
// Nexus can only be updated if all non-Nexus zones have been updated,
1975+
// i.e., their image source is an artifact from the new repo.
1976+
self.sled_ids_with_zones().all(|sled_id| {
1977+
self.current_sled_zones(
1978+
sled_id,
1979+
BlueprintZoneDisposition::is_in_service,
1980+
)
1981+
.filter(|z| z.zone_type.kind() != ZoneKind::Nexus)
1982+
.all(|z| {
1983+
z.image_source
1984+
== Self::zone_image_artifact(new_repo, z.kind())
1985+
})
1986+
})
1987+
}
1988+
// <https://github.com/oxidecomputer/omicron/issues/6404>
1989+
// ZoneKind::CockroachDb => todo!("check cluster status in inventory"),
1990+
_ => true, // other zone kinds have no special dependencies
1991+
},
19781992
}
19791993
}
19801994

nexus/reconfigurator/planning/src/planner.rs

Lines changed: 79 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -4548,6 +4548,8 @@ pub(crate) mod test {
45484548
};
45494549
}
45504550

4551+
/// Ensure that dependent zones (here just Crucible Pantry) are updated
4552+
/// before Nexus.
45514553
#[test]
45524554
fn test_update_crucible_pantry() {
45534555
static TEST_NAME: &str = "update_crucible_pantry";
@@ -4654,6 +4656,9 @@ pub(crate) mod test {
46544656
BlueprintZoneImageSource::InstallDataset
46554657
)
46564658
};
4659+
let is_up_to_date_nexus = |zone: &BlueprintZoneConfig| -> bool {
4660+
zone.zone_type.is_nexus() && zone.image_source == image_source
4661+
};
46574662
let is_old_pantry = |zone: &BlueprintZoneConfig| -> bool {
46584663
zone.zone_type.is_crucible_pantry()
46594664
&& matches!(
@@ -4765,7 +4770,7 @@ pub(crate) mod test {
47654770

47664771
// One more iteration for the last old zone to be expunged.
47674772
update_collection_from_blueprint(&mut example, &parent);
4768-
let blueprint = Planner::new_based_on(
4773+
let blueprint10 = Planner::new_based_on(
47694774
log.clone(),
47704775
&parent,
47714776
&input,
@@ -4777,29 +4782,76 @@ pub(crate) mod test {
47774782
.plan()
47784783
.expect("replan for last blueprint");
47794784

4785+
// All old Pantry zones should now be expunged.
47804786
assert_eq!(
4781-
blueprint
4787+
blueprint10
47824788
.all_omicron_zones(BlueprintZoneDisposition::is_expunged)
47834789
.filter(|(_, z)| is_old_pantry(z))
47844790
.count(),
47854791
CRUCIBLE_PANTRY_REDUNDANCY
47864792
);
47874793

4788-
// We won't be able to update Nexus until *all* of its dependent zones
4789-
// are updated. Stay tuned for the next test!
4794+
// Now we can update Nexus, because all of its dependent zones
4795+
// are up-to-date w/r/t the new repo.
47904796
assert_eq!(
47914797
parent
47924798
.all_omicron_zones(BlueprintZoneDisposition::is_in_service)
47934799
.filter(|(_, z)| is_old_nexus(z))
47944800
.count(),
47954801
NEXUS_REDUNDANCY + 1,
47964802
);
4803+
let mut parent = blueprint10;
4804+
for i in 11..=17 {
4805+
let blueprint_name = format!("blueprint_{i}");
4806+
update_collection_from_blueprint(&mut example, &parent);
4807+
4808+
let blueprint = Planner::new_based_on(
4809+
log.clone(),
4810+
&parent,
4811+
&input,
4812+
&blueprint_name,
4813+
&example.collection,
4814+
)
4815+
.expect("can't create planner")
4816+
.with_rng(PlannerRng::from_seed((TEST_NAME, &blueprint_name)))
4817+
.plan()
4818+
.unwrap_or_else(|_| panic!("can't re-plan after {i} iterations"));
4819+
4820+
let summary = blueprint.diff_since_blueprint(&parent);
4821+
eprintln!("{}", summary.display());
4822+
for sled in summary.diff.sleds.modified_values_diff() {
4823+
if i % 2 == 0 {
4824+
assert!(sled.zones.added.is_empty());
4825+
assert!(sled.zones.removed.is_empty());
4826+
} else {
4827+
assert!(sled.zones.removed.is_empty());
4828+
assert_eq!(sled.zones.added.len(), 1);
4829+
let added = sled.zones.added.values().next().unwrap();
4830+
assert!(matches!(
4831+
&added.zone_type,
4832+
BlueprintZoneType::Nexus(_)
4833+
));
4834+
assert_eq!(added.image_source, image_source);
4835+
}
4836+
}
4837+
4838+
parent = blueprint;
4839+
}
47974840

47984841
// Everything's up-to-date in Kansas City!
4799-
update_collection_from_blueprint(&mut example, &blueprint);
4842+
let blueprint17 = parent;
4843+
assert_eq!(
4844+
blueprint17
4845+
.all_omicron_zones(BlueprintZoneDisposition::is_in_service)
4846+
.filter(|(_, z)| is_up_to_date_nexus(z))
4847+
.count(),
4848+
NEXUS_REDUNDANCY + 1,
4849+
);
4850+
4851+
update_collection_from_blueprint(&mut example, &blueprint17);
48004852
assert_planning_makes_no_changes(
48014853
&logctx.log,
4802-
&blueprint,
4854+
&blueprint17,
48034855
&input,
48044856
&example.collection,
48054857
TEST_NAME,
@@ -4808,6 +4860,7 @@ pub(crate) mod test {
48084860
logctx.cleanup_successful();
48094861
}
48104862

4863+
/// Ensure that planning to update all zones terminates.
48114864
#[test]
48124865
fn test_update_all_zones() {
48134866
static TEST_NAME: &str = "update_all_zones";
@@ -4871,9 +4924,17 @@ pub(crate) mod test {
48714924
input_builder.policy_mut().tuf_repo = Some(tuf_repo);
48724925
let input = input_builder.build();
48734926

4874-
let mut comp = OrderedComponent::NonNexusOmicronZone;
4875-
let mut parent = blueprint1;
4927+
/// Expected number of planner iterations required to converge.
4928+
/// If incidental planner work changes this value occasionally,
4929+
/// that's fine; but if we find we're changing it all the time,
4930+
/// we should probably drop it and keep just the maximum below.
4931+
const EXP_PLANNING_ITERATIONS: usize = 57;
4932+
4933+
/// Planning must not take more than this number of iterations.
48764934
const MAX_PLANNING_ITERATIONS: usize = 100;
4935+
assert!(EXP_PLANNING_ITERATIONS < MAX_PLANNING_ITERATIONS);
4936+
4937+
let mut parent = blueprint1;
48774938
for i in 2..=MAX_PLANNING_ITERATIONS {
48784939
let blueprint_name = format!("blueprint_{i}");
48794940
update_collection_from_blueprint(&mut example, &parent);
@@ -4894,62 +4955,28 @@ pub(crate) mod test {
48944955
&& summary.total_zones_removed() == 0
48954956
&& summary.total_zones_modified() == 0
48964957
{
4897-
println!("planning converged after {i} iterations");
48984958
assert!(
48994959
blueprint
49004960
.all_omicron_zones(
49014961
BlueprintZoneDisposition::is_in_service
49024962
)
4903-
.all(|(_, zone)| zone.image_source == image_source)
4963+
.all(|(_, zone)| zone.image_source == image_source),
4964+
"failed to update all zones"
4965+
);
4966+
4967+
assert_eq!(
4968+
i, EXP_PLANNING_ITERATIONS,
4969+
"expected {EXP_PLANNING_ITERATIONS} iterations but converged in {i}"
49044970
);
4971+
println!("planning converged after {i} iterations");
4972+
49054973
logctx.cleanup_successful();
49064974
return;
49074975
}
49084976

4909-
match comp {
4910-
OrderedComponent::HostOs | OrderedComponent::SpRot => {
4911-
unreachable!("can only update zones")
4912-
}
4913-
OrderedComponent::NonNexusOmicronZone => {
4914-
assert!(
4915-
blueprint
4916-
.all_omicron_zones(
4917-
BlueprintZoneDisposition::is_in_service
4918-
)
4919-
.all(|(_, zone)| match zone.zone_type.kind() {
4920-
ZoneKind::Nexus => {
4921-
if zone.image_source == image_source {
4922-
comp = OrderedComponent::NexusZone;
4923-
true
4924-
} else {
4925-
zone.image_source
4926-
== BlueprintZoneImageSource::InstallDataset
4927-
}
4928-
}
4929-
_ => zone.image_source
4930-
== BlueprintZoneImageSource::InstallDataset
4931-
|| zone.image_source == image_source,
4932-
})
4933-
);
4934-
}
4935-
OrderedComponent::NexusZone => {
4936-
assert!(
4937-
blueprint
4938-
.all_omicron_zones(
4939-
BlueprintZoneDisposition::is_in_service
4940-
)
4941-
.all(|(_, zone)| match zone.zone_type.kind() {
4942-
ZoneKind::Nexus => zone.image_source
4943-
== BlueprintZoneImageSource::InstallDataset
4944-
|| zone.image_source == image_source,
4945-
_ => zone.image_source == image_source,
4946-
})
4947-
);
4948-
}
4949-
}
4950-
49514977
parent = blueprint;
49524978
}
4979+
49534980
panic!("did not converge after {MAX_PLANNING_ITERATIONS} iterations");
49544981
}
49554982
}

nexus/reconfigurator/planning/src/planner/update_sequence.rs

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,11 @@ use nexus_sled_agent_shared::inventory::ZoneKind;
1212
pub enum OrderedComponent {
1313
HostOs,
1414
SpRot,
15-
NonNexusOmicronZone,
16-
NexusZone,
15+
OmicronZone(ZoneKind),
1716
}
1817

1918
impl From<ZoneKind> for OrderedComponent {
2019
fn from(zone_kind: ZoneKind) -> Self {
21-
match zone_kind {
22-
ZoneKind::Nexus => Self::NexusZone,
23-
_ => Self::NonNexusOmicronZone,
24-
}
20+
Self::OmicronZone(zone_kind)
2521
}
2622
}

0 commit comments

Comments
 (0)