Skip to content

Commit

Permalink
Add auto scaling when no instance is available
Browse files Browse the repository at this point in the history
Fix the issue where a deploy fails because no instance is available.
As no task gets started, the CPU and MEM reservations don't change.
This means the existing scaling won't add an instance.
As this code runs on every deploy, it is now used to check for the error
and trigger the new alarm to scale up the cluster.
  • Loading branch information
keirbadger committed Oct 18, 2022
1 parent d4bdef1 commit 2b5f98b
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 7 deletions.
63 changes: 61 additions & 2 deletions ecs_update_monitor/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from time import sleep, time
from ecs_update_monitor.logger import logger
import datetime


MAX_FAILURES = 3

Expand All @@ -10,7 +12,7 @@ class UserFacingError(Exception):

# Entry point: builds an ECSEventIterator for the given cluster/service/taskdef,
# wraps it in an ECSMonitor and calls the monitor's wait().
# NOTE(review): this is a flattened diff hunk - indentation is lost and the two
# `monitor = ...` lines are the removed/added pair of the change, not two
# statements that run one after the other.
def run(cluster, service, taskdef, boto_session):
event_iterator = ECSEventIterator(cluster, service, taskdef, boto_session)
monitor = ECSMonitor(event_iterator)  # pre-commit line (removed)
monitor = ECSMonitor(event_iterator, cluster, boto_session)  # post-commit line (added)
monitor.wait()


Expand All @@ -19,10 +21,12 @@ class ECSMonitor:
_TIMEOUT = 600
_INTERVAL = 15

# ECSMonitor constructor: stores the event iterator, zeroes the running/failed
# counters and keeps the cluster name and boto session for later use.
# NOTE(review): this is a flattened diff hunk - indentation is lost and the two
# `def __init__` lines are the removed/added signature pair, not two
# separate definitions.
def __init__(self, ecs_event_iterator):  # pre-commit signature (removed)
def __init__(self, ecs_event_iterator, cluster, boto_session):  # post-commit signature (added)
self._ecs_event_iterator = ecs_event_iterator
self._previous_running_count = 0
self._failed_count = 0
# Passed to _build_metric_data as the EcsCluster dimension value.
self._cluster = cluster
# Used to create the CloudWatch client in _trigger_new_instance_alarm.
self._boto_session = boto_session

def wait(self):
self._check_ecs_deploy_progress()
Expand All @@ -34,6 +38,8 @@ def _check_ecs_deploy_progress(self):
self._check_for_failed_tasks(event)
if event.done:
return True
if event.new_instance:
self._trigger_new_instance_alarm()
if time() - start > self._TIMEOUT:
raise TimeoutError(
'Deployment timed out - didn\'t complete '
Expand All @@ -52,6 +58,28 @@ def _check_for_failed_tasks(self, event):
raise FailedTasksError
self._previous_running_count = event.running

def _trigger_new_instance_alarm(self):
    """Publish the "no available instance" datapoint for this cluster.

    Creates a CloudWatch client from the stored boto session, sends the
    single datum produced by ``_build_metric_data`` to the ``Platform/ECS``
    namespace, and logs the API response.
    """
    logger.info("IN NEW INSTANCE TRIGGER CODE")
    cloudwatch = self._boto_session.client('cloudwatch')
    datapoint = self._build_metric_data(self._cluster)
    api_response = cloudwatch.put_metric_data(
        Namespace='Platform/ECS',
        MetricData=[datapoint],
    )
    logger.info(api_response)

def _build_metric_data(self, cluster_name):
return {
'MetricName': "resource-reservation-no-avail-instance-breached",
'Dimensions': [
{
'Name': 'EcsCluster',
'Value': cluster_name,
}
],
'Timestamp': datetime.datetime.utcnow(),
'Value': 1,
'Unit': 'Count'
}


class ECSEventIterator:

Expand Down Expand Up @@ -100,6 +128,11 @@ def __next__(self):
if self._new_service_deployment is None:
self._new_service_deployment = previous_running == 0

if self._need_new_instance(messages):
return NewInstanceEvent(
running, pending, desired, previous_running, messages
)

if self._deploy_in_progress(running, desired, previous_running):
return InProgressEvent(
running, pending, desired, previous_running, messages
Expand All @@ -116,6 +149,13 @@ def _check_taskdef(self, primary_deployment):
self._taskdef, primary_deployment['taskDefinition']
)

def _need_new_instance(self, messages):
for msg in messages:
if "unable to place a task because no " \
"container instance met all of its requirements" in msg:
return True
return False

def _deploy_in_progress(self, running, desired, previous_running):
if running != desired or previous_running:
return True
Expand Down Expand Up @@ -186,19 +226,38 @@ def __init__(self, running, pending, desired, previous_running, messages):
self.messages = messages


class NewInstanceEvent(Event):
    """Event emitted when the service messages show a task could not be placed.

    The deployment is not finished, and the monitor should raise the
    scale-up metric.
    """

    @property
    def new_instance(self):
        """Always ``True``: the monitor should trigger the new-instance alarm."""
        return True

    @property
    def done(self):
        """Always ``False``: the deployment has not completed."""
        return False


class DoneEvent(Event):
    """Event marking a finished deployment."""

    @property
    def new_instance(self):
        """Always ``False``: no new-instance alarm is needed."""
        return False

    @property
    def done(self):
        """Always ``True``: the deployment has completed."""
        return True


class InProgressEvent(Event):
    """Event for a deployment that is still rolling out."""

    @property
    def new_instance(self):
        """Always ``False``: no new-instance alarm is needed."""
        return False

    @property
    def done(self):
        """Always ``False``: the deployment has not completed."""
        return False


class TaskdefDoesNotMatchError(Exception):
def __init__(self, expected, actual):
Expand Down
2 changes: 1 addition & 1 deletion test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@ docker run \
--tb=short \
"$@"

docker run --rm $image flake8 --max-complexity=4
docker run --rm $image flake8 --max-complexity=5
4 changes: 2 additions & 2 deletions test/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ def SessionContructor(
InProgressEvent(2, 0, 2, 2, []),
InProgressEvent(1, 0, 2, 2, []),
]
ecs_monitor = ECSMonitor(ecs_event_iterator)
ecs_monitor = ECSMonitor(ecs_event_iterator, 'dummy', mock_session)
ecs_monitor._INTERVAL = 0
mock_monitor.return_value = ecs_monitor
# When
Expand Down Expand Up @@ -289,7 +289,7 @@ def SessionContructor(
InProgressEvent(0, 0, 2, 0, []),
InProgressEvent(0, 0, 2, 0, []),
])
ecs_monitor = ECSMonitor(ecs_event_iterator)
ecs_monitor = ECSMonitor(ecs_event_iterator, 'dummy', mock_session)
ecs_monitor._INTERVAL = 0.1
ecs_monitor._TIMEOUT = 0.1
mock_monitor.return_value = ecs_monitor
Expand Down
10 changes: 8 additions & 2 deletions test/test_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ def test_ecs_monitor_deployment_times_out(self):
InProgressEvent(0, 0, 2, 0, []),
InProgressEvent(0, 0, 2, 0, []),
])
ecs_monitor = ECSMonitor(ecs_event_iterator)

boto_session = Mock()
ecs_monitor = ECSMonitor(ecs_event_iterator, 'dummy', boto_session)
ecs_monitor._INTERVAL = 0.1
ecs_monitor._TIMEOUT = 0.1

Expand Down Expand Up @@ -574,5 +576,9 @@ def test_run(self, fixtures):
ECSEventIterator.assert_called_once_with(
cluster, service, taskdef, boto_session
)
ECSMonitor.assert_called_once_with(event_iterator)
ECSMonitor.assert_called_once_with(
event_iterator,
cluster,
boto_session
)
ecs_monitor.wait.assert_called_once()

0 comments on commit 2b5f98b

Please sign in to comment.