Skip to content
8 changes: 6 additions & 2 deletions src/core/src/bootstrap/Constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,8 +261,12 @@ class PackageClassification(EnumBackport):
'IfRequired': REBOOT_IF_REQUIRED,
'Always': REBOOT_ALWAYS
}
REBOOT_BUFFER_IN_MINUTES = 15
REBOOT_WAIT_TIMEOUT_IN_MINUTES = 5
REBOOT_BUFFER_IN_MINUTES = 15 # minimum MW time required to consider rebooting if required (notify - 3, wait - 7, machine - 5)
REBOOT_NOTIFY_WINDOW_IN_MINUTES = 3 # time to broadcast reboot notification to other processes as part of the reboot command
REBOOT_WAIT_TIMEOUT_IN_MINUTES_MIN = 7 # minimum time to wait for a reboot to have started in the current execution context
REBOOT_WAIT_TIMEOUT_IN_MINUTES_MAX = 40 # maximum possible** time to wait for a reboot to have started in the current execution context (**IF MW time remaining allows it)
REBOOT_TO_MACHINE_READY_TIME_IN_MINUTES = 5 # time to wait for the machine to be ready after a reboot actually happens
REBOOT_WAIT_PULSE_INTERVAL_IN_SECONDS = 60 # time to wait between checks for reboot completion

# Installation Reboot Statuses
class RebootStatus(EnumBackport):
Expand Down
4 changes: 2 additions & 2 deletions src/core/src/core_logic/PatchInstaller.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def start_installation(self, simulate=False):

# Reboot as per setting and environment state
reboot_manager.start_reboot_if_required_and_time_available(maintenance_window.get_remaining_time_in_minutes(None, False))
maintenance_window_exceeded = maintenance_window_exceeded or reboot_manager.maintenance_window_exceeded_flag
maintenance_window_exceeded = maintenance_window_exceeded or reboot_manager.has_maintenance_window_exceeded_at_reboot_manager()

# Combining maintenance
overall_patch_installation_successful = bool(update_run_successful and not maintenance_window_exceeded)
Expand Down Expand Up @@ -671,7 +671,7 @@ def mark_installation_completed(self):
self.status_handler.set_current_operation(Constants.INSTALLATION) # Required for status handler to log errors, that occur during marking installation completed, in installation substatus

# RebootNever is selected and pending, set status warning else success
if self.reboot_manager.reboot_setting == Constants.REBOOT_NEVER and self.reboot_manager.is_reboot_pending():
if self.reboot_manager.get_reboot_setting_sanitized() == Constants.REBOOT_NEVER and self.reboot_manager.is_reboot_pending():
# Set error details inline with windows extension when setting warning status. This message will be shown in portal.
self.status_handler.add_error_to_status("Machine is Required to reboot. However, the customer-specified reboot setting doesn't allow reboots.", Constants.PatchOperationErrorCodes.DEFAULT_ERROR)
self.status_handler.set_installation_substatus_json(status=Constants.STATUS_WARNING)
Expand Down
172 changes: 112 additions & 60 deletions src/core/src/core_logic/RebootManager.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,116 +14,168 @@
#
# Requires Python 2.7+

"""Reboot management"""
""" Reboot management """
import time

from core.src.bootstrap.Constants import Constants


class RebootManager(object):
"""Implements the reboot management logic"""
def __init__(self, env_layer, execution_config, composite_logger, status_handler, package_manager, default_reboot_setting=Constants.REBOOT_IF_REQUIRED):
    """ Stores the collaborators and resolves the configured reboot setting exactly once.

    env_layer, composite_logger, status_handler and package_manager are kept as-is for use by the other methods.
    execution_config supplies the raw 'reboot_setting' requested for this run.
    default_reboot_setting is the fallback handed to sanitize_reboot_setting when the requested setting is not recognized.
    """
    self.execution_config = execution_config

    self.composite_logger = composite_logger
    self.package_manager = package_manager
    self.status_handler = status_handler
    self.env_layer = env_layer

    self.__reboot_cmd = 'sudo shutdown -r '  # the delay (in minutes) and the broadcast message get appended at reboot time
    self.__maintenance_window_exceeded_flag = False  # flag to indicate if the maintenance window was exceeded **separately** at reboot manager

    # Sanitize once only - an invalid setting is reported to the status handler, and that should not happen twice.
    self.__reboot_setting_sanitized = self.sanitize_reboot_setting(self.execution_config.reboot_setting, default_reboot_setting)

# region - Reboot condition reporters
@staticmethod
def is_reboot_time_available(current_time_available):
    # type: (int) -> bool
    """ Reports whether the time still available covers the configured reboot buffer (Constants.REBOOT_BUFFER_IN_MINUTES). """
    minimum_time_needed = Constants.REBOOT_BUFFER_IN_MINUTES
    return current_time_available >= minimum_time_needed

# REBOOT SETTING
# ==============
def is_reboot_pending(self):
    # type: () -> bool
    """ Check if a reboot is pending either from the package manager or the status handler.

    Always returns a real bool, even when no status handler is attached, so that callers and log lines never see None or the handler object itself.
    """
    if self.package_manager.force_reboot:
        return True

    # The status handler is optional here; without one, only the package manager can signal a pending reboot.
    if not self.status_handler:
        return False

    return bool(self.status_handler.is_reboot_pending)

def has_maintenance_window_exceeded_at_reboot_manager(self):
    # type: () -> bool
    """ Reports the reboot manager's own maintenance-window-exceeded flag (read-only accessor). """
    exceeded_at_reboot_manager = self.__maintenance_window_exceeded_flag
    return exceeded_at_reboot_manager

def get_reboot_setting_sanitized(self):
    # type: () -> str
    """ Returns the reboot setting stored by the constructor after sanitization (read-only accessor). """
    sanitized_setting = self.__reboot_setting_sanitized
    return sanitized_setting
# endregion

# region - Reboot setting helpers
def sanitize_reboot_setting(self, reboot_setting_key, default_reboot_setting):
    # type: (str, str) -> str
    """ Ensures that the value obtained is one we know what to do with.

    reboot_setting_key is looked up in Constants.REBOOT_SETTINGS and the mapped setting is returned.
    If the key is not recognized, the problem is logged, added to the status, and the default is returned instead.

    default_reboot_setting may be given either as a REBOOT_SETTINGS key or as an already-resolved REBOOT_SETTINGS value.
    Raises KeyError if the default itself is neither (a coding error, not a customer input error).
    """
    known_settings = Constants.REBOOT_SETTINGS

    # Resolve the fallback first. Keys take precedence, so a default given as a key behaves exactly as before.
    if default_reboot_setting in known_settings:
        fallback_setting = known_settings[default_reboot_setting]
    elif default_reboot_setting in known_settings.values():
        fallback_setting = default_reboot_setting
    else:
        raise KeyError(default_reboot_setting)

    try:
        return known_settings[reboot_setting_key]
    except (KeyError, TypeError):  # TypeError: unhashable key - treated the same as an unknown key
        error_msg = '[RM] Invalid reboot setting detected. [InvalidSetting={0}][DefaultFallback={1}]'.format(str(reboot_setting_key), str(default_reboot_setting))
        self.composite_logger.log_error(error_msg)
        self.status_handler.add_error_to_status(error_msg, Constants.PatchOperationErrorCodes.DEFAULT_ERROR)
        self.composite_logger.log_warning('Defaulting reboot setting to: ' + str(default_reboot_setting))

    # No 'return' inside 'finally' here: that form silently discards any in-flight exception, including reporting failures.
    return fallback_setting

def is_setting(self, setting_to_check):
    # type: (str) -> bool
    """ Reports whether the sanitized reboot setting held by this manager equals setting_to_check. """
    return self.__reboot_setting_sanitized == setting_to_check
# endregion

# region - Reboot action methods
def start_reboot_if_required_and_time_available(self, current_time_available):
    # type: (int) -> bool
    """ Starts a reboot if required. Happens only at the end of the run if required.

    current_time_available is the maintenance window time remaining, compared against the reboot buffer and passed on to the reboot wait logic.

    Returns False whenever no reboot is started (blocked by configuration, not needed, already done, or not enough time).
    Returns True after a reboot was started - though in practice the reboot itself is expected to end this process first.
    When a reboot is needed but there is not enough time, the error is reported to the status and the maintenance-window-exceeded flag is set.
    """
    self.composite_logger.log("\nReboot Management")
    reboot_pending = self.is_reboot_pending()
    reboot_setting = self.__reboot_setting_sanitized

    # Log a special-case message if the package manager is forcing a reboot that's not normally visible on the machine (encoded into is_reboot_pending())
    if self.package_manager.force_reboot:
        self.composite_logger.log("[RM] A reboot is pending as the package manager required it.")

    # No-op - return false if config says never reboot
    if reboot_setting == Constants.REBOOT_NEVER:
        if reboot_pending:
            self.composite_logger.log_warning('[RM][!] Reboot is pending but BLOCKED by the customer configuration ({0}).'.format(str(Constants.REBOOT_NEVER)))
        else:
            self.composite_logger.log_debug('[RM] No reboot pending, and reboot is blocked regardless by the customer configuration ({0}).'.format(str(Constants.REBOOT_NEVER)))
        return False

    # No-op - return if system doesn't require it (and only reboot if it does)
    if reboot_setting == Constants.REBOOT_IF_REQUIRED and not reboot_pending:
        self.composite_logger.log_debug("[RM] No reboot pending detected. Reboot skipped as per customer configuration ({0}).".format(str(Constants.REBOOT_IF_REQUIRED)))
        return False

    # No-op - prevent repeated reboots
    if reboot_setting == Constants.REBOOT_ALWAYS and not reboot_pending and self.status_handler.get_installation_reboot_status() == Constants.RebootStatus.COMPLETED:
        self.composite_logger.log_debug("[RM] At least one reboot has occurred, and there's no reboot pending, so the conditions for the 'Reboot Always' setting is fulfilled and reboot won't be repeated.")
        return False

    # Try to reboot - if enough time is available
    if reboot_setting == Constants.REBOOT_ALWAYS or (reboot_setting == Constants.REBOOT_IF_REQUIRED and reboot_pending):
        if self.is_reboot_time_available(current_time_available):
            self.composite_logger.log_debug('[RM] Reboot is being scheduled, as per customer configuration ({0}). [RebootPending={1}][CurrentTimeAvailable={2}]'.format(str(reboot_setting), str(reboot_pending), str(current_time_available)))
            self.__start_reboot(maintenance_window_available_time_in_minutes=current_time_available)
            return True
        else:
            # Maintenance window will be marked exceeded as reboot is required and not enough time is available
            error_msg = '[RM][!] Insufficient time to schedule a required reboot ({0}). [RebootPending={1}][CurrentTimeAvailable={2}]'.format(str(reboot_setting), str(reboot_pending), str(current_time_available))
            self.composite_logger.log_error(error_msg)
            self.status_handler.add_error_to_status(str(error_msg), Constants.PatchOperationErrorCodes.DEFAULT_ERROR)
            self.__maintenance_window_exceeded_flag = True
            return False

    # No-op - This code should never be reached. If seen, it indicates a bug in the code.
    self.composite_logger.log_error('[RM] Bug-check: Unexpected code branch reached. [RebootSetting={0}][RebootPending={1}]'.format(str(reboot_setting), str(reboot_pending)))
    return False

Check warning on line 125 in src/core/src/core_logic/RebootManager.py

View check run for this annotation

Codecov / codecov/patch

src/core/src/core_logic/RebootManager.py#L124-L125

Added lines #L124 - L125 were not covered by tests

def __start_reboot(self, message="Azure VM Guest Patching initiated a reboot as part of an 'InstallPatches' operation.", maintenance_window_available_time_in_minutes=0):
    # type: (str, int) -> None
    """ Performs a controlled system reboot with a system-wide notification broadcast.

    message is the text appended to the shutdown command.
    maintenance_window_available_time_in_minutes is used to size how long to wait for the reboot to take effect.

    This method only returns by raising: it keeps pulsing until the machine goes down (ending this process),
    and raises Exception - after marking the reboot FAILED in the status - if that does not happen within the allowed wait.
    """
    self.composite_logger.log("[RM] The machine is set to reboot in " + str(Constants.REBOOT_NOTIFY_WINDOW_IN_MINUTES) + " minutes.")
    self.status_handler.set_installation_reboot_status(Constants.RebootStatus.STARTED)
    reboot_init_time = self.env_layer.datetime.datetime_utcnow()

    # Reboot after system-wide notification broadcast - no new logins will be allowed after this point.
    self.env_layer.reboot_machine(self.__reboot_cmd + str(Constants.REBOOT_NOTIFY_WINDOW_IN_MINUTES) + ' ' + message)

    # Safety net - if the machine doesn't reboot, we need to fail the operation.
    max_allowable_time_to_reboot_in_minutes = self.__calc_max_allowable_time_to_reboot_in_minutes(maintenance_window_available_time_in_minutes)
    while True:
        current_time = self.env_layer.datetime.datetime_utcnow()
        elapsed_time_in_minutes = self.env_layer.datetime.total_minutes_from_time_delta(current_time - reboot_init_time)

        if elapsed_time_in_minutes >= max_allowable_time_to_reboot_in_minutes:
            break

        # Keep logging to indicate machine hasn't rebooted yet. If successful, this will be the last log we see from this process.
        # All three values are passed positionally so that this call cannot drift from the pulse method's parameter names.
        self.__reboot_wait_pulse(int(elapsed_time_in_minutes),
                                 int(max_allowable_time_to_reboot_in_minutes),
                                 int(maintenance_window_available_time_in_minutes - elapsed_time_in_minutes))

    # If we get here, the machine has not rebooted in the time we expected. We need to fail the operation.
    # This may be because of the following reasons:
    #   1. The machine is not responding to the reboot command because of a customer environment issue. (customer should retry after forcing a control-plane reboot)
    #   2. The reboot command was externally interrupted during the broadcast period. (customer should retry after forcing a control-plane reboot)
    #   3. The time required to handle changes prior to reboot is greater than the time we've allocated. (action on AzGPS if seen at scale in Azure)
    self.status_handler.set_installation_reboot_status(Constants.RebootStatus.FAILED)
    error_msg = "Customer environment issue: Reboot failed to proceed on the machine in a timely manner. Please retry the operation."
    self.status_handler.add_error_to_status(error_msg, Constants.PatchOperationErrorCodes.DEFAULT_ERROR)
    raise Exception(error_msg, "[{0}]".format(Constants.ERROR_ADDED_TO_STATUS))

def __reboot_wait_pulse(self, elapsed_time_in_minutes, max_allowable_time_to_reboot_in_minutes, maintenance_window_allowable_limit_remaining):
    # type: (int, int, int) -> None
    """ Emits one 'still waiting for reboot' heartbeat: logs progress, flushes the file log, refreshes the status, then sleeps for one pulse interval. """
    pulse_message = "[RM] Waiting for machine reboot. [ElapsedTimeInMinutes={0}][MaxTimeInMinutes={1}][MWAllowableLimitRemainingInMins={2}]".format(
        str(elapsed_time_in_minutes),
        str(max_allowable_time_to_reboot_in_minutes),
        str(maintenance_window_allowable_limit_remaining))

    logger = self.composite_logger
    logger.log(pulse_message)
    logger.file_logger.flush()

    # keep refreshing to minimize the chance of service-side timeout
    self.status_handler.set_installation_substatus_json()

    time.sleep(Constants.REBOOT_WAIT_PULSE_INTERVAL_IN_SECONDS)

@staticmethod
def __calc_max_allowable_time_to_reboot_in_minutes(maintenance_window_available_time_in_minutes):
    # type: (int) -> int
    """ Works out how long to wait for the reboot to start before the attempt is treated as a failure.

    The result is the available maintenance window time, less the reboot-to-machine-ready allowance,
    capped at REBOOT_WAIT_TIMEOUT_IN_MINUTES_MAX and never below REBOOT_WAIT_TIMEOUT_IN_MINUTES_MIN.
    """
    # the machine still needs time to become ready after the reboot, so that time is not available for waiting
    usable_wait_time = maintenance_window_available_time_in_minutes - Constants.REBOOT_TO_MACHINE_READY_TIME_IN_MINUTES

    # plenty of window left - the ceiling applies
    if usable_wait_time >= Constants.REBOOT_WAIT_TIMEOUT_IN_MINUTES_MAX:
        return Constants.REBOOT_WAIT_TIMEOUT_IN_MINUTES_MAX

    # otherwise wait for whatever is usable, but never less than the floor
    wait_floor = Constants.REBOOT_WAIT_TIMEOUT_IN_MINUTES_MIN
    if wait_floor > usable_wait_time:
        return wait_floor
    return usable_wait_time
# endregion
Loading