Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix: Service Lambda timeouts cause user-facing 5xx responses (#6284) #6624

Open
wants to merge 14 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 24 additions & 138 deletions lambdas/indexer/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,21 @@
Optional,
)

# noinspection PyPackageRequirements
import chalice
from chalice import (
Response,
)

from azul import (
CatalogName,
cached_property,
config,
)
from azul.chalice import (
AzulChaliceApp,
LambdaMetric,
)
from azul.deployment import (
aws,
)
from azul.health import (
HealthController,
HealthApp,
)
from azul.hmac import (
HMACAuthentication,
Expand All @@ -46,9 +41,6 @@
from azul.openapi.responses import (
json_content,
)
from azul.openapi.spec import (
CommonEndpointSpecs,
)

log = logging.getLogger(__name__)

Expand All @@ -64,16 +56,12 @@
# changes and reset the minor version to zero. Otherwise, increment only
# the minor version for backwards compatible changes. A backwards
# compatible change is one that does not require updates to clients.
'version': '1.0'
'version': '1.1'
}
}


class IndexerApp(AzulChaliceApp, SignatureHelper):

@cached_property
def health_controller(self):
return self._controller(HealthController, lambda_name='indexer')
class IndexerApp(HealthApp, SignatureHelper):

@cached_property
def index_controller(self) -> IndexController:
Expand All @@ -98,7 +86,9 @@ def log_forwarder(self, prefix: str):
error_decorator = self.metric_alarm(metric=LambdaMetric.errors,
threshold=1, # One alarm …
period=24 * 60 * 60) # … per day.
throttle_decorator = self.metric_alarm(metric=LambdaMetric.throttles)
throttle_decorator = self.metric_alarm(metric=LambdaMetric.throttles,
threshold=0,
period=5 * 60)
retry_decorator = self.retry(num_retries=2)

def decorator(f):
Expand All @@ -115,120 +105,7 @@ def _authenticate(self) -> Optional[HMACAuthentication]:
app = IndexerApp()
configure_app_logging(app, log)


@app.route(
'/',
cache_control='public, max-age=0, must-revalidate',
cors=False
)
def swagger_ui():
return app.swagger_ui()


@app.route(
'/static/{file}',
cache_control='public, max-age=86400',
cors=True
)
def static_resource(file):
return app.swagger_resource(file)


common_specs = CommonEndpointSpecs(app_name='indexer')


@app.route(
'/openapi',
methods=['GET'],
cache_control='public, max-age=500',
cors=True,
**common_specs.openapi
)
def openapi():
return Response(status_code=200,
headers={'content-type': 'application/json'},
body=app.spec())


@app.route(
'/version',
methods=['GET'],
cors=True,
**common_specs.version
)
def version():
from azul.changelog import (
compact_changes,
)
return {
'git': config.lambda_git_status,
'changes': compact_changes(limit=10)
}


@app.route(
'/health',
methods=['GET'],
cors=True,
**common_specs.full_health
)
def health():
return app.health_controller.health()


@app.route(
'/health/basic',
methods=['GET'],
cors=True,
**common_specs.basic_health
)
def basic_health():
return app.health_controller.basic_health()


@app.route(
'/health/cached',
methods=['GET'],
cors=True,
**common_specs.cached_health
)
def cached_health():
return app.health_controller.cached_health()


@app.route(
'/health/fast',
methods=['GET'],
cors=True,
**common_specs.fast_health
)
def fast_health():
return app.health_controller.fast_health()


@app.route(
'/health/{keys}',
methods=['GET'],
cors=True,
**common_specs.custom_health
)
def health_by_key(keys: Optional[str] = None):
return app.health_controller.custom_health(keys)


@app.metric_alarm(metric=LambdaMetric.errors,
threshold=1,
period=24 * 60 * 60)
@app.metric_alarm(metric=LambdaMetric.throttles)
@app.retry(num_retries=0)
# FIXME: Remove redundant prefix from name
# https://github.com/DataBiosphere/azul/issues/5337
@app.schedule(
'rate(1 minute)',
name='indexercachehealth'
)
def update_health_cache(_event: chalice.app.CloudWatchEvent):
app.health_controller.update_cache()
globals().update(app.default_routes())


@app.route(
Expand Down Expand Up @@ -303,9 +180,11 @@ def post_notification(catalog: CatalogName, action: str):


@app.metric_alarm(metric=LambdaMetric.errors,
threshold=int(config.contribution_concurrency(retry=False) * 2 / 3))
threshold=int(config.contribution_concurrency(retry=False) * 2 / 3),
period=5 * 60)
@app.metric_alarm(metric=LambdaMetric.throttles,
threshold=int(96000 / config.contribution_concurrency(retry=False)))
threshold=int(96000 / config.contribution_concurrency(retry=False)),
period=5 * 60)
@app.on_sqs_message(
queue=config.notifications_queue_name(),
batch_size=1
Expand All @@ -315,9 +194,11 @@ def contribute(event: chalice.app.SQSEvent):


@app.metric_alarm(metric=LambdaMetric.errors,
threshold=int(config.aggregation_concurrency(retry=False) * 3))
threshold=int(config.aggregation_concurrency(retry=False) * 3),
period=5 * 60)
@app.metric_alarm(metric=LambdaMetric.throttles,
threshold=int(37760 / config.aggregation_concurrency(retry=False)))
threshold=int(37760 / config.aggregation_concurrency(retry=False)),
period=5 * 60)
@app.on_sqs_message(
queue=config.tallies_queue_name(),
batch_size=IndexController.document_batch_size
Expand All @@ -330,8 +211,11 @@ def aggregate(event: chalice.app.SQSEvent):
# with more RAM in the tallies_retry queue.

@app.metric_alarm(metric=LambdaMetric.errors,
threshold=int(config.aggregation_concurrency(retry=True) * 1 / 16))
@app.metric_alarm(metric=LambdaMetric.throttles)
threshold=int(config.aggregation_concurrency(retry=True) * 1 / 16),
period=5 * 60)
@app.metric_alarm(metric=LambdaMetric.throttles,
threshold=0,
period=5 * 60)
@app.on_sqs_message(
queue=config.tallies_queue_name(retry=True),
batch_size=IndexController.document_batch_size
Expand All @@ -344,9 +228,11 @@ def aggregate_retry(event: chalice.app.SQSEvent):
# retried with more RAM and a longer timeout in the notifications_retry queue.

@app.metric_alarm(metric=LambdaMetric.errors,
threshold=int(config.contribution_concurrency(retry=True) * 1 / 4))
threshold=int(config.contribution_concurrency(retry=True) * 1 / 4),
period=5 * 60)
@app.metric_alarm(metric=LambdaMetric.throttles,
threshold=int(31760 / config.contribution_concurrency(retry=True)))
threshold=int(31760 / config.contribution_concurrency(retry=True)),
period=5 * 60)
@app.on_sqs_message(
queue=config.notifications_queue_name(retry=True),
batch_size=1
Expand Down
Loading
Loading