diff --git a/src/core/src/bootstrap/Constants.py b/src/core/src/bootstrap/Constants.py index a5d29ab0..c8e1421d 100644 --- a/src/core/src/bootstrap/Constants.py +++ b/src/core/src/bootstrap/Constants.py @@ -261,8 +261,12 @@ class PackageClassification(EnumBackport): 'IfRequired': REBOOT_IF_REQUIRED, 'Always': REBOOT_ALWAYS } - REBOOT_BUFFER_IN_MINUTES = 15 - REBOOT_WAIT_TIMEOUT_IN_MINUTES = 5 + REBOOT_BUFFER_IN_MINUTES = 15 # minimum MW time required to consider rebooting if required (notify - 3, wait - 7, machine - 5) + REBOOT_NOTIFY_WINDOW_IN_MINUTES = 3 # time to broadcast reboot notification to other processes as part of the reboot command + REBOOT_WAIT_TIMEOUT_IN_MINUTES_MIN = 7 # minimum time to wait for a reboot to have started in the current execution context + REBOOT_WAIT_TIMEOUT_IN_MINUTES_MAX = 40 # maximum possible** time to wait for a reboot to have started in the current execution context (**IF MW time remaining allows it) + REBOOT_TO_MACHINE_READY_TIME_IN_MINUTES = 5 # time to wait for the machine to be ready after a reboot actually happens + REBOOT_WAIT_PULSE_INTERVAL_IN_SECONDS = 60 # time to wait between checks for reboot completion # Installation Reboot Statuses class RebootStatus(EnumBackport): diff --git a/src/core/src/core_logic/PatchInstaller.py b/src/core/src/core_logic/PatchInstaller.py index 63f8771c..0baa9a24 100644 --- a/src/core/src/core_logic/PatchInstaller.py +++ b/src/core/src/core_logic/PatchInstaller.py @@ -117,7 +117,7 @@ def start_installation(self, simulate=False): # Reboot as per setting and environment state reboot_manager.start_reboot_if_required_and_time_available(maintenance_window.get_remaining_time_in_minutes(None, False)) - maintenance_window_exceeded = maintenance_window_exceeded or reboot_manager.maintenance_window_exceeded_flag + maintenance_window_exceeded = maintenance_window_exceeded or reboot_manager.has_maintenance_window_exceeded_at_reboot_manager() # Combining maintenance overall_patch_installation_successful = bool(update_run_successful and not maintenance_window_exceeded) @@ -671,7 +671,7 @@ def mark_installation_completed(self): self.status_handler.set_current_operation(Constants.INSTALLATION) # Required for status handler to log errors, that occur during marking installation completed, in installation substatus # RebootNever is selected and pending, set status warning else success - if self.reboot_manager.reboot_setting == Constants.REBOOT_NEVER and self.reboot_manager.is_reboot_pending(): + if self.reboot_manager.get_reboot_setting_sanitized() == Constants.REBOOT_NEVER and self.reboot_manager.is_reboot_pending(): # Set error details inline with windows extension when setting warning status. This message will be shown in portal. self.status_handler.add_error_to_status("Machine is Required to reboot. However, the customer-specified reboot setting doesn't allow reboots.", Constants.PatchOperationErrorCodes.DEFAULT_ERROR) self.status_handler.set_installation_substatus_json(status=Constants.STATUS_WARNING) diff --git a/src/core/src/core_logic/RebootManager.py b/src/core/src/core_logic/RebootManager.py index 292e1a14..d25cb05d 100644 --- a/src/core/src/core_logic/RebootManager.py +++ b/src/core/src/core_logic/RebootManager.py @@ -14,14 +14,15 @@ # # Requires Python 2.7+ -"""Reboot management""" +""" Reboot management """ import time + from core.src.bootstrap.Constants import Constants class RebootManager(object): - """Implements the reboot management logic""" - def __init__(self, env_layer, execution_config, composite_logger, status_handler, package_manager, default_reboot_setting='IfRequired'): + """ Implements the reboot management logic """ + def __init__(self, env_layer, execution_config, composite_logger, status_handler, package_manager, default_reboot_setting=Constants.REBOOT_IF_REQUIRED): self.execution_config = execution_config self.composite_logger = composite_logger @@ -29,101 +30,152 @@ def __init__(self, env_layer, execution_config, composite_logger, status_handler self.status_handler = status_handler self.env_layer = env_layer - self.minutes_to_shutdown = str((Constants.REBOOT_BUFFER_IN_MINUTES - 5) if (Constants.REBOOT_BUFFER_IN_MINUTES > 5) else Constants.REBOOT_BUFFER_IN_MINUTES) # give at least 5 minutes for a reboot unless the buffer is configured to be lower than that - self.reboot_cmd = 'sudo shutdown -r ' - self.maintenance_window_exceeded_flag = False + self.__reboot_cmd = 'sudo shutdown -r ' + self.__maintenance_window_exceeded_flag = False # flag to indicate if the maintenance window was exceeded **separately** at reboot manager - self.reboot_setting = self.sanitize_reboot_setting(self.execution_config.reboot_setting, default_reboot_setting) + self.__reboot_setting_sanitized = self.sanitize_reboot_setting(self.execution_config.reboot_setting, default_reboot_setting) + # region - Reboot condition reporters @staticmethod def is_reboot_time_available(current_time_available): + # type: (int) -> bool """ Check if time still available for system reboot """ return current_time_available >= Constants.REBOOT_BUFFER_IN_MINUTES - # REBOOT SETTING - # ============== + def is_reboot_pending(self): + # type: () -> bool + """ Check if a reboot is pending either from the package manager or the status handler """ + return self.package_manager.force_reboot or (self.status_handler and self.status_handler.is_reboot_pending) + + def has_maintenance_window_exceeded_at_reboot_manager(self): + # type: () -> bool + """ Check if the maintenance window was exceeded at reboot manager """ + return self.__maintenance_window_exceeded_flag + + def get_reboot_setting_sanitized(self): + # type: () -> str + """ Get the sanitized reboot setting """ + return self.__reboot_setting_sanitized + # endregion + + # region - Reboot setting helpers def sanitize_reboot_setting(self, reboot_setting_key, default_reboot_setting): + # type: (str, str) -> str """ Ensures that the value obtained is one we know what to do with. """ reboot_setting = Constants.REBOOT_SETTINGS[default_reboot_setting] try: reboot_setting = Constants.REBOOT_SETTINGS[reboot_setting_key] except KeyError: - error_msg = 'Invalid reboot setting detected in update configuration: ' + str(reboot_setting_key) + error_msg = '[RM] Invalid reboot setting detected. [InvalidSetting={0}][DefaultFallback={1}]'.format(str(reboot_setting_key), str(default_reboot_setting)) self.composite_logger.log_error(error_msg) self.status_handler.add_error_to_status(error_msg, Constants.PatchOperationErrorCodes.DEFAULT_ERROR) - self.composite_logger.log_warning('Defaulting reboot setting to: ' + str(default_reboot_setting)) finally: return reboot_setting def is_setting(self, setting_to_check): - return self.reboot_setting == setting_to_check - - # REBOOT ACTION - # ============= - def start_reboot(self, message="Azure Patch Management initiated a reboot after a patch installation run."): - """ Perform a system reboot """ - self.composite_logger.log("\nThe machine is set to reboot in " + self.minutes_to_shutdown + " minutes.") - - self.status_handler.set_installation_reboot_status(Constants.RebootStatus.STARTED) - reboot_init_time = self.env_layer.datetime.datetime_utcnow() - self.env_layer.reboot_machine(self.reboot_cmd + self.minutes_to_shutdown + ' ' + message) - - # Wait for timeout - max_allowable_time_to_reboot_in_minutes = int(self.minutes_to_shutdown) + Constants.REBOOT_WAIT_TIMEOUT_IN_MINUTES - while 1: - current_time = self.env_layer.datetime.datetime_utcnow() - elapsed_time_in_minutes = self.env_layer.datetime.total_minutes_from_time_delta(current_time - reboot_init_time) - if elapsed_time_in_minutes >= max_allowable_time_to_reboot_in_minutes: - self.status_handler.set_installation_reboot_status(Constants.RebootStatus.FAILED) - error_msg = "Reboot failed to proceed on the machine in a timely manner." - self.status_handler.add_error_to_status(error_msg, Constants.PatchOperationErrorCodes.DEFAULT_ERROR) - raise Exception(error_msg, "[{0}]".format(Constants.ERROR_ADDED_TO_STATUS)) - else: - self.composite_logger.file_logger.flush() - self.composite_logger.log("Waiting for machine reboot. [ElapsedTimeInMinutes={0}] [MaxTimeInMinutes={1}]".format(str(elapsed_time_in_minutes), str(max_allowable_time_to_reboot_in_minutes))) - self.composite_logger.file_logger.flush() - time.sleep(60) + # type: (str) -> bool + return self.__reboot_setting_sanitized == setting_to_check + # endregion + # region - Reboot action methods def start_reboot_if_required_and_time_available(self, current_time_available): + # type: (int) -> any """ Starts a reboot if required. Happens only at the end of the run if required. """ - self.composite_logger.log("\nReboot Management") reboot_pending = self.is_reboot_pending() + # Log a special-case message if the package manager is forcing a reboot that's not normally visible on the machine (encoded into is_reboot_pending()) if self.package_manager.force_reboot: - self.composite_logger.log("A reboot is pending as the package manager required it.") + self.composite_logger.log("[RM] A reboot is pending as the package manager required it.") - # return false if never - if self.reboot_setting == Constants.REBOOT_NEVER: + # No-op - return false if config says never reboot + if self.__reboot_setting_sanitized == Constants.REBOOT_NEVER: if reboot_pending: - self.composite_logger.log_warning(' - There is a reboot pending, but reboot is blocked, as per patch installation configuration. (' + str(Constants.REBOOT_NEVER) + ')') + self.composite_logger.log_warning('[RM][!] Reboot is pending but BLOCKED by the customer configuration ({0}).'.format(str(Constants.REBOOT_NEVER))) else: - self.composite_logger.log_warning(' - There is no reboot pending, and reboot is blocked regardless, as per patch installation configuration (' + str(Constants.REBOOT_NEVER) + ').') + self.composite_logger.log_debug('[RM] No reboot pending, and reboot is blocked regardless by the customer configuration ({0}).'.format(str(Constants.REBOOT_NEVER))) return False - # return if system doesn't require it (and only reboot if it does) - if self.reboot_setting == Constants.REBOOT_IF_REQUIRED and not reboot_pending: - self.composite_logger.log(" - There was no reboot pending detected. Reboot is being skipped as it's not required, as per patch installation configuration (" + str(Constants.REBOOT_IF_REQUIRED) + ").") + # No-op - return if system doesn't require it (and only reboot if it does) + if self.__reboot_setting_sanitized == Constants.REBOOT_IF_REQUIRED and not reboot_pending: + self.composite_logger.log_debug("[RM] No reboot pending detected. Reboot skipped as per customer configuration ({0}).".format(str(Constants.REBOOT_IF_REQUIRED))) return False - # prevent repeated reboots - if self.reboot_setting == Constants.REBOOT_ALWAYS and not reboot_pending and self.status_handler.get_installation_reboot_status() == Constants.RebootStatus.COMPLETED: - self.composite_logger.log(" - At least one reboot has occurred, and there's no reboot pending, so the conditions for the 'Reboot Always' setting is fulfilled and reboot won't be repeated.") + # No-op - prevent repeated reboots + if self.__reboot_setting_sanitized == Constants.REBOOT_ALWAYS and not reboot_pending and self.status_handler.get_installation_reboot_status() == Constants.RebootStatus.COMPLETED: + self.composite_logger.log_debug("[RM] At least one reboot has occurred, and there's no reboot pending, so the conditions for the 'Reboot Always' setting is fulfilled and reboot won't be repeated.") return False - # attempt to reboot is enough time is available - if self.reboot_setting == Constants.REBOOT_ALWAYS or (self.reboot_setting == Constants.REBOOT_IF_REQUIRED and reboot_pending): + # Try to reboot - if enough time is available + if self.__reboot_setting_sanitized == Constants.REBOOT_ALWAYS or (self.__reboot_setting_sanitized == Constants.REBOOT_IF_REQUIRED and reboot_pending): if self.is_reboot_time_available(current_time_available): - self.composite_logger.log(' - Reboot is being scheduled, as per patch installation configuration (' + str(self.reboot_setting) + ').') - self.composite_logger.log(" - Reboot-pending status: " + str(reboot_pending)) - self.start_reboot() + self.composite_logger.log_debug('[RM] Reboot is being scheduled, as per customer configuration ({0}). [RebootPending={1}][CurrentTimeAvailable={2}]'.format(str(self.__reboot_setting_sanitized), str(reboot_pending), str(current_time_available))) + self.__start_reboot(maintenance_window_available_time_in_minutes=current_time_available) return True else: - error_msg = ' - There is not enough time to schedule a reboot as per patch installation configuration (' + str(self.reboot_setting) + '). Reboot-pending status: ' + str(reboot_pending) + # Maintenance window will be marked exceeded as reboot is required and not enough time is available + error_msg = '[RM][!] Insufficient time to schedule a required reboot ({0}). [RebootPending={1}][CurrentTimeAvailable={2}]'.format(str(self.__reboot_setting_sanitized), str(reboot_pending), str(current_time_available)) self.composite_logger.log_error(error_msg) - self.status_handler.add_error_to_status("Reboot Management" + str(error_msg), Constants.PatchOperationErrorCodes.DEFAULT_ERROR) - self.maintenance_window_exceeded_flag = True + self.status_handler.add_error_to_status(str(error_msg), Constants.PatchOperationErrorCodes.DEFAULT_ERROR) + self.__maintenance_window_exceeded_flag = True return False - def is_reboot_pending(self): - return self.package_manager.force_reboot or (self.status_handler and self.status_handler.is_reboot_pending) + # No-op - This code should never be reached. If seen, it indicates a bug in the code. + self.composite_logger.log_error('[RM] Bug-check: Unexpected code branch reached. [RebootSetting={0}][RebootPending={1}]'.format(str(self.__reboot_setting_sanitized), str(reboot_pending))) + return False + + def __start_reboot(self, message="Azure VM Guest Patching initiated a reboot as part of an 'InstallPatches' operation.", maintenance_window_available_time_in_minutes=0): + # type: (str, int) -> None + """ Performs a controlled system reboot with a system-wide notification broadcast. """ + self.composite_logger.log("[RM] The machine is set to reboot in " + str(Constants.REBOOT_NOTIFY_WINDOW_IN_MINUTES) + " minutes.") + self.status_handler.set_installation_reboot_status(Constants.RebootStatus.STARTED) + reboot_init_time = self.env_layer.datetime.datetime_utcnow() + + # Reboot after system-wide notification broadcast - no new logins will be allowed after this point. + self.env_layer.reboot_machine(self.__reboot_cmd + str(Constants.REBOOT_NOTIFY_WINDOW_IN_MINUTES) + ' ' + message) + + # Safety net - if the machine doesn't reboot, we need to fail the operation. + max_allowable_time_to_reboot_in_minutes = self.__calc_max_allowable_time_to_reboot_in_minutes(maintenance_window_available_time_in_minutes) + while 1: + current_time = self.env_layer.datetime.datetime_utcnow() + elapsed_time_in_minutes = self.env_layer.datetime.total_minutes_from_time_delta(current_time - reboot_init_time) + + # Keep logging to indicate machine hasn't rebooted yet. If successful, this will be the last log we see from this process. + if elapsed_time_in_minutes < max_allowable_time_to_reboot_in_minutes: + self.__reboot_wait_pulse(int(elapsed_time_in_minutes), int(max_allowable_time_to_reboot_in_minutes), + maintenance_window_allowable_limit_remaining_in_minutes = int(maintenance_window_available_time_in_minutes - elapsed_time_in_minutes)) + continue + + # If we get here, the machine has not rebooted in the time we expected. We need to fail the operation. + # This may be because of the following reasons: + # 1. The machine is not responding to the reboot command because of a customer environment issue. (customer should retry after a forcing a control-plane reboot) + # 2. The reboot command was externally interrupted during the broadcast period. (customer should retry after a forcing a control-plane reboot) + # 3. The time required to handle changes prior to reboot is greater than the time we've allocated. (action on AzGPS if seen at scale in Azure) + self.status_handler.set_installation_reboot_status(Constants.RebootStatus.FAILED) + error_msg = "Customer environment issue: Reboot failed to proceed on the machine in a timely manner. Please retry the operation." + self.status_handler.add_error_to_status(error_msg, Constants.PatchOperationErrorCodes.DEFAULT_ERROR) + raise Exception(error_msg, "[{0}]".format(Constants.ERROR_ADDED_TO_STATUS)) + + def __reboot_wait_pulse(self, elapsed_time_in_minutes, max_allowable_time_to_reboot_in_minutes, maintenance_window_allowable_limit_remaining): + # type: (int, int, int) -> None + self.composite_logger.log("[RM] Waiting for machine reboot. [ElapsedTimeInMinutes={0}][MaxTimeInMinutes={1}][MWAllowableLimitRemainingInMins={2}]" + .format(str(elapsed_time_in_minutes), str(max_allowable_time_to_reboot_in_minutes), str(maintenance_window_allowable_limit_remaining))) + self.composite_logger.file_logger.flush() + self.status_handler.set_installation_substatus_json() # keep refreshing to minimize the chance of service-side timeout + time.sleep(Constants.REBOOT_WAIT_PULSE_INTERVAL_IN_SECONDS) + + @staticmethod + def __calc_max_allowable_time_to_reboot_in_minutes(maintenance_window_available_time_in_minutes): + # type: (int) -> int + """ Calculates the maximum amount of time to wait before considering the reboot attempt a failure. """ + + # remove the reboot to machine ready time from the available time + available_time = maintenance_window_available_time_in_minutes - Constants.REBOOT_TO_MACHINE_READY_TIME_IN_MINUTES + + if available_time >= Constants.REBOOT_WAIT_TIMEOUT_IN_MINUTES_MAX: + # If the maintenance window is greater than the max, we can use the max. + return Constants.REBOOT_WAIT_TIMEOUT_IN_MINUTES_MAX + else: + # Otherwise, we use the greater of the time available or the minimum wait timeout allowable. + return max(available_time, Constants.REBOOT_WAIT_TIMEOUT_IN_MINUTES_MIN) + # endregion diff --git a/src/core/tests/Test_RebootManager.py b/src/core/tests/Test_RebootManager.py index a692fb45..837a09c4 100644 --- a/src/core/tests/Test_RebootManager.py +++ b/src/core/tests/Test_RebootManager.py @@ -130,17 +130,74 @@ def test_start_reboot_raise_exception(self): argument_composer = ArgumentComposer() argument_composer.reboot_setting = reboot_setting_in_api runtime = RuntimeCompositor(argument_composer.get_composed_arguments(), True, Constants.YUM) - Constants.REBOOT_WAIT_TIMEOUT_IN_MINUTES = -20 + Constants.REBOOT_WAIT_TIMEOUT_IN_MINUTES_MIN = -20 with self.assertRaises(Exception) as context: runtime.use_original_rm_start_reboot() - runtime.reboot_manager.start_reboot() + runtime.reboot_manager._RebootManager__start_reboot() # assert - self.assertIn("Reboot failed to proceed on the machine in a timely manner.", repr(context.exception)) + self.assertIn("Customer environment issue: Reboot failed to proceed on the machine in a timely manner. Please retry the operation.", repr(context.exception)) self.assertEqual(context.exception.args[1], "[{0}]".format(Constants.ERROR_ADDED_TO_STATUS)) runtime.stop() + def test_reboot_manager_start_reboot_bug_check(self): + """ Branches code into a code path that should never execute """ + argument_composer = ArgumentComposer() + runtime = RuntimeCompositor(argument_composer.get_composed_arguments(), True, Constants.YUM) + runtime.reboot_manager._RebootManager__reboot_setting = 'NotARealSetting' # needs to be set like this to bypass code protections. + self.assertEqual(runtime.reboot_manager.start_reboot_if_required_and_time_available(20), False) + runtime.stop() + + def test_max_allowable_time_calculations(self): + """ Test the max allowable time to reboot calculations """ + argument_composer = ArgumentComposer() + runtime = RuntimeCompositor(argument_composer.get_composed_arguments(), True, Constants.YUM) + + # shorten vars to aid test matrix readability + min_timeout = Constants.REBOOT_WAIT_TIMEOUT_IN_MINUTES_MIN + max_timeout = Constants.REBOOT_WAIT_TIMEOUT_IN_MINUTES_MAX + ready_time = Constants.REBOOT_TO_MACHINE_READY_TIME_IN_MINUTES + + # declarative test case matrix + input_output_matrix = [ + # [dataset number, total mw time available, expected_output] + [0, 0, min_timeout], # never less than min timeout + + # Boundary of min + [1, min_timeout - 1, min_timeout], + [2, min_timeout, min_timeout], + [3, min_timeout + 1, min_timeout], + [4, min_timeout + ready_time, min_timeout], + [5, min_timeout + ready_time + 1, min_timeout + 1], # only allowed to exceed min timeout if available time > min timeout + time allocated for machine to be ready + + # Boundary of max + [6, max_timeout - 1, max_timeout - 1 - ready_time], # ready time is always subtracted to get the max allowable time + [7, max_timeout, max_timeout - ready_time], + [8, max_timeout + 1, max_timeout - ready_time + 1], + [9, max_timeout + ready_time - 1, max_timeout - 1], + [10, max_timeout + ready_time + 1, max_timeout], + [11, max_timeout + 100, max_timeout], # never exceeds max timeout + ] + + for dataset in input_output_matrix: + input_value = dataset[1] + expected_output = dataset[2] + self.assertEqual(runtime.reboot_manager._RebootManager__calc_max_allowable_time_to_reboot_in_minutes(input_value), + expected_output, + msg="Failed on dataset {0} with input {1} and expected output {2}.".format(dataset[0], input_value, expected_output)) + + runtime.stop() + + def test_reboot_manager_wait_pulse(self): + argument_composer = ArgumentComposer() + runtime = RuntimeCompositor(argument_composer.get_composed_arguments(), True, Constants.APT) + backup_pulse_interval_value = Constants.REBOOT_WAIT_PULSE_INTERVAL_IN_SECONDS + Constants.REBOOT_WAIT_PULSE_INTERVAL_IN_SECONDS = 0 + runtime.reboot_manager._RebootManager__reboot_wait_pulse(1, 10, 15) + Constants.REBOOT_WAIT_PULSE_INTERVAL_IN_SECONDS = backup_pulse_interval_value + runtime.stop() + if __name__ == '__main__': unittest.main() diff --git a/src/core/tests/library/RuntimeCompositor.py b/src/core/tests/library/RuntimeCompositor.py index 2d755a96..716d5082 100644 --- a/src/core/tests/library/RuntimeCompositor.py +++ b/src/core/tests/library/RuntimeCompositor.py @@ -158,16 +158,16 @@ def reconfigure_env_layer_to_legacy_mode(self): def reconfigure_reboot_manager(self): # Preserve the original reboot manager start_reboot method - self.original_rm_start_reboot = self.reboot_manager.start_reboot + self.original_rm_start_reboot = self.reboot_manager._RebootManager__start_reboot # Reassign start_reboot to a new mock method - self.reboot_manager.start_reboot = self.start_reboot + self.reboot_manager._RebootManager__start_reboot = self.start_reboot - def start_reboot(self, message="Test initiated reboot mock"): + def start_reboot(self, message="Test initiated reboot mock", maintenance_window_available_time_in_minutes=0): self.status_handler.set_installation_reboot_status(Constants.RebootStatus.STARTED) def use_original_rm_start_reboot(self): - self.reboot_manager.start_reboot = self.original_rm_start_reboot + self.reboot_manager._RebootManager__start_reboot = self.original_rm_start_reboot def reconfigure_package_manager(self): self.backup_get_current_auto_os_patch_state = self.package_manager.get_current_auto_os_patch_state