Skip to content

Commit b751cad

Browse files
authored
[reconfigurator] Verify SP component version before resetting (#9201)
Closes: #9136
1 parent 0fe3e4d commit b751cad

File tree

1 file changed

+19
-7
lines changed

1 file changed

+19
-7
lines changed

nexus/mgs-updates/src/driver_update.rs

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -178,8 +178,8 @@ pub enum ApplyUpdateError {
178178
StuckUpdating { update_id: Uuid, timeout: Duration },
179179
#[error("failed to abort in-progress SP update")]
180180
SpUpdateAbortFailed(#[from] AbortError),
181-
#[error("SP reports that reset failed: {0:?}")]
182-
SpResetFailed(String),
181+
#[error("SP component reports that reset failed: {0:?}")]
182+
SpComponentResetFailed(String),
183183

184184
#[error("failed waiting for artifact delivery")]
185185
DeliveryWaitError(#[from] DeliveryWaitError),
@@ -413,17 +413,29 @@ pub(crate) async fn apply_update(
413413
status.update(UpdateAttemptStatus::PostUpdate);
414414

415415
if try_reset {
416-
// We retry this until we get some error *other* than a communication
417-
// error or some other transient error. There is intentionally no
418-
// timeout here. If we've staged an update but not managed to reset
419-
// the device, there's no point where we'd want to stop trying to do so.
416+
// We retry this until the component has been successfully
417+
// updated, or we get some error *other* than a communication error or
418+
// some other transient error. There is intentionally no timeout here.
419+
// If we've staged an update but not managed to reset the device,
420+
// there's no point where we'd want to stop trying to do so.
420421
while let Err(error) =
421422
update_helper.post_update(log, &mut mgs_clients, update).await
422423
{
423424
if error.is_fatal() {
424425
let error = InlineErrorChain::new(&error);
425426
error!(log, "post_update failed"; &error);
426-
return Err(ApplyUpdateError::SpResetFailed(error.to_string()));
427+
return Err(ApplyUpdateError::SpComponentResetFailed(
428+
error.to_string(),
429+
));
430+
}
431+
432+
// We only care whether the update has completed. We ignore all
433+
// pre-check errors because they could all be transient if a reset
434+
// is in the process of happening.
435+
if let Ok(PrecheckStatus::UpdateComplete) =
436+
update_helper.precheck(log, &mut mgs_clients, update).await
437+
{
438+
break;
427439
}
428440

429441
tokio::time::sleep(RESET_DELAY_INTERVAL).await;

0 commit comments

Comments
 (0)