Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 31 additions & 5 deletions samtranslator/plugins/application/serverless_app_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,9 @@ def _handle_get_application_request(self, app_id, semver, key, logical_id):
warning_message = "{}. Unable to verify access to {}/{}.".format(e, app_id, semver)
LOG.warning(warning_message)
self._applications[key] = {"Unable to verify"}
except ClientError as e:
LOG.exception(e)
raise e
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't the exception get logged if this is raised and not caught? Any specific reason we added this here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wondered the same thing -- I added this here to preserve the existing behavior. I removed LOG.exception from _sar_service_call as throttles are not really exceptions for GetCloudFormation. I can do a bit more digging to see if we need that extra log.


def _handle_create_cfn_template_request(self, app_id, semver, key, logical_id):
"""
Expand All @@ -194,7 +197,12 @@ def _handle_create_cfn_template_request(self, app_id, semver, key, logical_id):
create_cfn_template = lambda app_id, semver: self._sar_client.create_cloud_formation_template(
ApplicationId=self._sanitize_sar_str_param(app_id), SemanticVersion=self._sanitize_sar_str_param(semver)
)
response = self._sar_service_call(create_cfn_template, logical_id, app_id, semver)
try:
response = self._sar_service_call(create_cfn_template, logical_id, app_id, semver)
except ClientError as e:
LOG.exception(e)
raise e

LOG.info("Requested to create CFN template {}/{} in serverless application repo.".format(app_id, semver))
self._applications[key] = response[self.TEMPLATE_URL_KEY]
if response["Status"] != "ACTIVE":
Expand Down Expand Up @@ -308,6 +316,9 @@ def on_after_transform_template(self, template):
while (time() - start_time) < self.TEMPLATE_WAIT_TIMEOUT_SECONDS:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This resets the throttle flag. Is the idea that the 2s sleep will give us enough break from calling that we can safely call SAR again? Might be worth commenting about line 319 to make it clear. This method is pretty hard to parse (lots of nesting).

temp = self._in_progress_templates
self._in_progress_templates = []
# it's either our first time in this loop or we just slept SLEEP_TIME_SECONDS so we can
# try calling SAR
throttled = False
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we introduce some jitter?

Should we update our Boto config to be standard (default is legacy)? Seems like that provides a better default retry behavior and I think standard is the newer recommendation.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we should -- I'd suggest doing that in a future PR so we're introducing one change at a time. I might have misread the botocore code but I thought even the legacy retry had some jitter built into it.


# Check each resource to make sure it's active
LOG.info("Checking resources in serverless application repo...")
Expand All @@ -318,8 +329,26 @@ def on_after_transform_template(self, template):
TemplateId=self._sanitize_sar_str_param(template_id),
)
)
response = self._sar_service_call(get_cfn_template, application_id, application_id, template_id)
response = {"Status": "PREPARING"} # default response if we can't reach SAR

try:
if not throttled:
response = self._sar_service_call(
get_cfn_template, application_id, application_id, template_id
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

since now we manually skip boto3 calls when throttled, it looks like it will go to next iteration immediately, which is likely to result in another throttle. Do you think we should add sleep at the end of the iteration if throttled is True?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I should (will?) refactor this code to make it easier to read. Once throttled is True, we will continue looping but will not make any more SAR calls. We continue looping to in order to call _handle_get_cfn_template_response. Once the for loop is done, we will sleep(self.SLEEP_TIME_SECONDS) before going through the while again, which resets Throttled

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh I see, just found the line.

except ClientError as e:
error_code = e.response["Error"]["Code"]
if error_code == "TooManyRequestsException":
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why only catch this here? Can't this happen in the create_cloud_formation_template?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It probably can -- we just haven't seen it in any availability dips. There's no loop around the create at the moment so we'd either have to add a loop (to _sar_service_call if we want it to be common) or consider throttling an error.

# We were throttled by SAR, don't hammer SAR with more calls in this for loop
throttled = True
LOG.debug("SAR call timed out for application id {}".format(application_id))
# don't re-raise, fall through to regular processing as if the template wasn't ready yet
else:
LOG.exception(e)
raise e

self._handle_get_cfn_template_response(response, application_id, template_id)

LOG.info("Finished checking resources in serverless application repo.")

# Don't sleep if there are no more templates with PREPARING status
Expand Down Expand Up @@ -372,9 +401,6 @@ def _sar_service_call(self, service_call_lambda, logical_id, *args):
error_code = e.response["Error"]["Code"]
if error_code in ("AccessDeniedException", "NotFoundException"):
raise InvalidResourceException(logical_id, e.response["Error"]["Message"])

# 'ForbiddenException'- SAR rejects connection
LOG.exception(e)
raise e

def _resource_is_supported(self, resource_type):
Expand Down
53 changes: 48 additions & 5 deletions tests/plugins/application/test_serverless_app_plugin.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import boto3
import itertools
from botocore.exceptions import ClientError

from mock import Mock, patch
from unittest import TestCase
from parameterized import parameterized, param

from samtranslator.plugins.application.serverless_app_plugin import ServerlessAppPlugin
from samtranslator.plugins.exceptions import InvalidPluginException
from samtranslator.model.exceptions import InvalidResourceException

# TODO: run tests when AWS CLI is not configured (so they can run in brazil)

Expand Down Expand Up @@ -253,9 +255,50 @@ def __init__(self, app_id="app_id", semver="1.3.5"):

# self.plugin.on_before_transform_resource(app_resources[0][0], 'AWS::Serverless::Application', app_resources[0][1].properties)

# class TestServerlessAppPlugin_on_after_transform_template(TestCase):

# def setUp(self):
# self.plugin = SeverlessAppPlugin()

# # TODO: test this lifecycle event
class TestServerlessAppPlugin_on_after_transform_template(TestCase):
def test_sar_throttling_doesnt_stop_processing(self):
client = Mock()
client.get_cloud_formation_template = Mock()
client.get_cloud_formation_template.side_effect = ClientError(
{"Error": {"Code": "TooManyRequestsException"}}, "GetCloudFormationTemplate"
)
plugin = ServerlessAppPlugin(sar_client=client, wait_for_template_active_status=True, validate_only=False)
plugin._in_progress_templates = [("appid", "template")]
plugin.SLEEP_TIME_SECONDS = 0.05
plugin.TEMPLATE_WAIT_TIMEOUT_SECONDS = 0.3
with self.assertRaises(InvalidResourceException):
plugin.on_after_transform_template("template")
# confirm we had at least two attempts to call SAR
self.assertGreater(client.get_cloud_formation_template.call_count, 1)

def test_unexpected_sar_error_stops_processing(self):
client = Mock()
client.get_cloud_formation_template = Mock()
client.get_cloud_formation_template.side_effect = ClientError(
{"Error": {"Code": "BadBadError"}}, "GetCloudFormationTemplate"
)
plugin = ServerlessAppPlugin(sar_client=client, wait_for_template_active_status=True, validate_only=False)
plugin._in_progress_templates = [("appid", "template")]
with self.assertRaises(ClientError):
plugin.on_after_transform_template("template")

def test_sar_success_one_app(self):
client = Mock()
client.get_cloud_formation_template = Mock()
client.get_cloud_formation_template.return_value = {"Status": "ACTIVE"}
plugin = ServerlessAppPlugin(sar_client=client, wait_for_template_active_status=True, validate_only=False)
plugin._in_progress_templates = [("appid", "template")]
plugin.on_after_transform_template("template")
# should have exactly one call to SAR
self.assertEqual(client.get_cloud_formation_template.call_count, 1)

def test_sar_success_two_apps(self):
client = Mock()
client.get_cloud_formation_template = Mock()
client.get_cloud_formation_template.return_value = {"Status": "ACTIVE"}
plugin = ServerlessAppPlugin(sar_client=client, wait_for_template_active_status=True, validate_only=False)
plugin._in_progress_templates = [("appid1", "template1"), ("appid2", "template2")]
plugin.on_after_transform_template("template")
# should have exactly one call to SAR per app
self.assertEqual(client.get_cloud_formation_template.call_count, 2)