diff --git a/.env.example b/.env.example index 5afcf92..c0729f0 100644 --- a/.env.example +++ b/.env.example @@ -1,22 +1,15 @@ -# All environmental variables are optional. The associated settings can be -# updated at runtime. These merely provide defaults at import time. +# All environment variables are optional. The associated settings can be +# updated at runtime. These provide a friendly interface to changing settings +# when running commands from a CLI. +# Diff-Related variables -------------------------- -# These are used in the web_monitoring.db module. -# They can point to a local deployment of web-monitoring-db (as in this -# example) or to the production deployment. -export WEB_MONITORING_DB_URL="http://localhost:3000" -export WEB_MONITORING_DB_EMAIL="seed-admin@example.com" -export WEB_MONITORING_DB_PASSWORD="PASSWORD" -export WEB_MONITORING_APP_ENV="development" # or production or test - -# These are used in test_html_diff -export WEB_MONITORING_DB_STAGING_URL="https://api-staging.monitoring.envirodatagov.org" -export WEB_MONITORING_DB_STAGING_EMAIL="" -export WEB_MONITORING_DB_STAGING_PASSWORD="" +# These CSS color values are used to set the colors in html_diff_render, differs and links_diff +# export DIFFER_COLOR_INSERTION="#4dac26" +# export DIFFER_COLOR_DELETION="#d01c8b" -# Diff-Related variables -------------------------- +# Server-Related variables ------------------------ # Set the diffing server to debug mode. Returns tracebacks in error responses # and auto-reloads the server when source files change. @@ -38,13 +31,9 @@ export DIFFER_MAX_BODY_SIZE='10485760' # 10 MB # https:// requests have invalid certificates. # export VALIDATE_TARGET_CERTIFICATES="false" -# These CSS color values are used to set the colors in html_diff_render, differs and links_diff -# export DIFFER_COLOR_INSERTION="#4dac26" -# export DIFFER_COLOR_DELETION="#d01c8b" - # Set how many diffs can be run in parallel. # export DIFFER_PARALLELISM=10 -# Uncomment to enable logging. Set the level as any normal level. -# https://docs.python.org/3.6/library/logging.html#logging-levels -# export LOG_LEVEL=INFO +# Instead of crashing when the process pool used for running diffs breaks, +# keep accepting requests and try to restart the pool. +# RESTART_BROKEN_DIFFER='true' diff --git a/docs/source/release-history.rst b/docs/source/release-history.rst index 980cb7d..bf0983e 100644 --- a/docs/source/release-history.rst +++ b/docs/source/release-history.rst @@ -2,6 +2,12 @@ Release History =============== +In Development +-------------- + +- The server uses a pool of child processes to run diffs. If the pool breaks while running a diff, it will be re-created once, and, if it fails again, the server will now crash with an exit code of ``10``. (An external process manager like Supervisor, Kubernetes, etc. can then decide how to handle the situation.) Previously, the diff would fail at this point, but server would try to re-create the process pool again the next time a diff was requested. You can opt-in to the old behavior by setting the ``RESTART_BROKEN_DIFFER`` environment variable to ``true``. (`#49 `_) + + Version 0.1.1 (2020-11-24) -------------------------- diff --git a/web_monitoring_diff/server/server.py b/web_monitoring_diff/server/server.py index 9bb1a4a..40a7c1b 100644 --- a/web_monitoring_diff/server/server.py +++ b/web_monitoring_diff/server/server.py @@ -36,6 +36,7 @@ sentry_sdk.integrations.logging.ignore_logger('tornado.access') DIFFER_PARALLELISM = int(os.environ.get('DIFFER_PARALLELISM', 10)) +RESTART_BROKEN_DIFFER = os.environ.get('RESTART_BROKEN_DIFFER', 'False').strip().lower() == 'true' # Map tokens in the REST API to functions in modules. # The modules do not have to be part of the web_monitoring_diff package. @@ -208,6 +209,7 @@ def initialize_diff_worker(): class DiffServer(tornado.web.Application): terminating = False + server = None def listen(self, port, address='', **kwargs): self.server = super().listen(port, address, **kwargs) @@ -220,9 +222,11 @@ async def shutdown(self, immediate=False): Otherwise, diffs are allowed to try and finish. """ self.terminating = True - self.server.stop() + if self.server: + self.server.stop() await self.shutdown_differs(immediate) - await self.server.close_all_connections() + if self.server: + await self.server.close_all_connections() async def shutdown_differs(self, immediate=False): """Stop all child processes used for running diffs.""" @@ -238,6 +242,12 @@ async def shutdown_differs(self, immediate=False): else: await shutdown_executor_in_loop(differs) + async def quit(self, immediate=False, code=0): + await self.shutdown(immediate=immediate) + tornado.ioloop.IOLoop.current().stop() + if code: + sys.exit(code) + def handle_signal(self, signal_type, frame): """Handle a signal by shutting down the application and IO loop.""" loop = tornado.ioloop.IOLoop.current() @@ -515,6 +525,16 @@ async def diff(self, func, a, b, params, tries=2): if executor == old_executor: executor = self.get_diff_executor(reset=True) else: + # If we shouldn't allow the server to keep rebuilding the + # differ pool for new requests, schedule a shutdown. + # (*Schuduled* so that current requests have a chance to + # complete with an error.) + if not RESTART_BROKEN_DIFFER: + logger.error('Process pool for diffing has failed too ' + 'many times; quitting server...') + tornado.ioloop.IOLoop.current().add_callback( + self.application.quit, + code=10) raise # NOTE: this doesn't do anything async, but if we change it to do so, we diff --git a/web_monitoring_diff/tests/test_server_exc_handling.py b/web_monitoring_diff/tests/test_server_exc_handling.py index 5a4638f..cef7fa5 100644 --- a/web_monitoring_diff/tests/test_server_exc_handling.py +++ b/web_monitoring_diff/tests/test_server_exc_handling.py @@ -393,6 +393,9 @@ class BrokenProcessPoolExecutor(concurrent.futures.Executor): "Fake process pool that only raises BrokenProcessPool exceptions." submit_count = 0 + def __init__(max_workers=None, mp_context=None, initializer=None, initargs=()): + return super().__init__() + def submit(self, fn, *args, **kwargs): self.submit_count += 1 result = concurrent.futures.Future() @@ -403,10 +406,10 @@ def submit(self, fn, *args, **kwargs): class ExecutionPoolTestCase(DiffingServerTestCase): - def fetch_async(self, path, **kwargs): + def fetch_async(self, path, raise_error=False, **kwargs): "Like AyncHTTPTestCase.fetch, but async." url = self.get_url(path) - return self.http_client.fetch(url, **kwargs) + return self.http_client.fetch(url, raise_error=raise_error, **kwargs) def test_rebuilds_process_pool_when_broken(self): # Get a custom executor that will always fail the first time, but get @@ -427,7 +430,8 @@ def get_executor(self, reset=False): assert response.code == 200 assert did_get_executor == True - def test_diff_returns_error_if_process_pool_repeatedly_breaks(self): + @patch.object(df.DiffServer, "quit") + def test_diff_returns_error_if_process_pool_repeatedly_breaks(self, _): # Set a custom executor that will always fail. def get_executor(self, reset=False): return BrokenProcessPoolExecutor() @@ -474,6 +478,46 @@ def get_executor(self, reset=False): # have one reset because only one request hit the bad executor. assert bad_executor.submit_count == 2 + # NOTE: the real `quit` tears up the server, which causes porblems with + # the test, so we just mock it and test that it was called, rather than + # checking whether `sys.exit` was ultimately called. Not totally ideal, + # but better than no testing. + @patch.object(df.DiffServer, "quit") + def test_server_exits_if_process_pool_repeatedly_breaks(self, mock_quit): + # Set a custom executor that will always fail. + def get_executor(self, reset=False): + return BrokenProcessPoolExecutor() + + with patch.object(df.DiffHandler, 'get_diff_executor', get_executor): + response = self.fetch('/html_source_dmp?format=json&' + f'a=file://{fixture_path("empty.txt")}&' + f'b=file://{fixture_path("empty.txt")}') + self.json_check(response) + assert response.code == 500 + + assert mock_quit.called + assert mock_quit.call_args == ({'code': 10},) + + # NOTE: the real `quit` tears up the server, which causes porblems with + # the test, so we just mock it and test that it was called, rather than + # checking whether `sys.exit` was ultimately called. Not totally ideal, + # but better than no testing. + @patch.object(df.DiffServer, "quit") + @patch('web_monitoring_diff.server.server.RESTART_BROKEN_DIFFER', True) + def test_server_does_not_exit_if_env_var_set_when_process_pool_repeatedly_breaks(self, mock_quit): + # Set a custom executor that will always fail. + def get_executor(self, reset=False): + return BrokenProcessPoolExecutor() + + with patch.object(df.DiffHandler, 'get_diff_executor', get_executor): + response = self.fetch('/html_source_dmp?format=json&' + f'a=file://{fixture_path("empty.txt")}&' + f'b=file://{fixture_path("empty.txt")}') + self.json_check(response) + assert response.code == 500 + + assert not mock_quit.called + def mock_diffing_method(c_body): return