
Commit c5028b2

🎨Clusters keeper/use ssm (🚨change in private clusters) (ITISFoundation#6361)
1 parent c28b0da commit c5028b2

27 files changed, +429 -74 lines

.env-devel

Lines changed: 1 addition & 0 deletions
@@ -50,6 +50,7 @@ CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DOCKER_IMAGE_TAG=master-github-latest
 CLUSTERS_KEEPER_DASK_NTHREADS=0
 CLUSTERS_KEEPER_DASK_WORKER_SATURATION=inf
 CLUSTERS_KEEPER_EC2_ACCESS=null
+CLUSTERS_KEEPER_SSM_ACCESS=null
 CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX=""
 CLUSTERS_KEEPER_LOGLEVEL=WARNING
 CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION=5

packages/models-library/src/models_library/clusters.py

Lines changed: 3 additions & 0 deletions
@@ -96,6 +96,9 @@ class Config(BaseAuthentication.Config):
 class NoAuthentication(BaseAuthentication):
     type: Literal["none"] = "none"
 
+    class Config(BaseAuthentication.Config):
+        schema_extra: ClassVar[dict[str, Any]] = {"examples": [{"type": "none"}]}
+
 
 class TLSAuthentication(BaseAuthentication):
     type: Literal["tls"] = "tls"
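
Note: the new Config only adds a schema example for the bare payload; a minimal sketch of what it documents (assuming pydantic v1, as used elsewhere in this repo):

from pydantic import parse_obj_as
from models_library.clusters import NoAuthentication

# {"type": "none"} is exactly the example now advertised in the JSON schema
auth = parse_obj_as(NoAuthentication, {"type": "none"})
assert auth.type == "none"
# the example also lands in the generated JSON schema via Config.schema_extra,
# e.g. NoAuthentication.schema().get("examples") -> [{"type": "none"}]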

services/clusters-keeper/src/simcore_service_clusters_keeper/api/health.py

Lines changed: 16 additions & 7 deletions
@@ -21,7 +21,7 @@
 @router.get("/", include_in_schema=True, response_class=PlainTextResponse)
 async def health_check():
     # NOTE: sync url in docker/healthcheck.py with this entrypoint!
-    return f"{__name__}.health_check@{datetime.datetime.now(datetime.timezone.utc).isoformat()}"
+    return f"{__name__}.health_check@{datetime.datetime.now(datetime.UTC).isoformat()}"
 
 
 class _ComponentStatus(BaseModel):
@@ -33,25 +33,34 @@ class _StatusGet(BaseModel):
     rabbitmq: _ComponentStatus
     ec2: _ComponentStatus
     redis_client_sdk: _ComponentStatus
+    ssm: _ComponentStatus
 
 
 @router.get("/status", include_in_schema=True, response_model=_StatusGet)
 async def get_status(app: Annotated[FastAPI, Depends(get_app)]) -> _StatusGet:
     return _StatusGet(
         rabbitmq=_ComponentStatus(
             is_enabled=is_rabbitmq_enabled(app),
-            is_responsive=await get_rabbitmq_client(app).ping()
-            if is_rabbitmq_enabled(app)
-            else False,
+            is_responsive=(
+                await get_rabbitmq_client(app).ping()
+                if is_rabbitmq_enabled(app)
+                else False
+            ),
         ),
         ec2=_ComponentStatus(
             is_enabled=bool(app.state.ec2_client),
-            is_responsive=await app.state.ec2_client.ping()
-            if app.state.ec2_client
-            else False,
+            is_responsive=(
+                await app.state.ec2_client.ping() if app.state.ec2_client else False
+            ),
         ),
         redis_client_sdk=_ComponentStatus(
             is_enabled=bool(app.state.redis_client_sdk),
             is_responsive=await get_redis_client(app).ping(),
         ),
+        ssm=_ComponentStatus(
+            is_enabled=(app.state.ssm_client is not None),
+            is_responsive=(
+                await app.state.ssm_client.ping() if app.state.ssm_client else False
+            ),
        ),
     )
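
Note: with the new ssm component, GET /status now reports four components. A rough usage sketch (base URL, port and router prefix are placeholders, not taken from this diff):

import asyncio

import httpx  # assumed available only for this sketch


async def show_clusters_keeper_status() -> None:
    # hypothetical address of the clusters-keeper service; the path is relative
    # to wherever the health router is mounted in the real deployment
    async with httpx.AsyncClient(base_url="http://clusters-keeper:8000") as client:
        resp = await client.get("/status")
        data = resp.json()
        # expected keys per _StatusGet: rabbitmq, ec2, redis_client_sdk and now ssm,
        # each carrying is_enabled / is_responsive booleans
        print(data["ssm"])


asyncio.run(show_clusters_keeper_status())
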
services/clusters-keeper/src/simcore_service_clusters_keeper/constants.py

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+from typing import Final
+
+from aws_library.ec2._models import AWSTagKey, AWSTagValue
+from pydantic import parse_obj_as
+
+DOCKER_STACK_DEPLOY_COMMAND_NAME: Final[str] = "private cluster docker deploy"
+DOCKER_STACK_DEPLOY_COMMAND_EC2_TAG_KEY: Final[AWSTagKey] = parse_obj_as(
+    AWSTagKey, "io.simcore.clusters-keeper.private_cluster_docker_deploy"
+)
+
+USER_ID_TAG_KEY: Final[AWSTagKey] = parse_obj_as(AWSTagKey, "user_id")
+WALLET_ID_TAG_KEY: Final[AWSTagKey] = parse_obj_as(AWSTagKey, "wallet_id")
+ROLE_TAG_KEY: Final[AWSTagKey] = parse_obj_as(AWSTagKey, "role")
+WORKER_ROLE_TAG_VALUE: Final[AWSTagValue] = parse_obj_as(AWSTagValue, "worker")
+MANAGER_ROLE_TAG_VALUE: Final[AWSTagValue] = parse_obj_as(AWSTagValue, "manager")
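
Note: these constants are consumed further down in clusters_management_core.py; purely for illustration, a worker's command tags built from them would look like this (the ids are made up):

from aws_library.ec2._models import AWSTagValue
from pydantic import parse_obj_as

from simcore_service_clusters_keeper.constants import (
    ROLE_TAG_KEY,
    USER_ID_TAG_KEY,
    WALLET_ID_TAG_KEY,
    WORKER_ROLE_TAG_VALUE,
)

# example user/wallet ids, not taken from the diff
worker_command_tags = {
    USER_ID_TAG_KEY: parse_obj_as(AWSTagValue, "42"),
    WALLET_ID_TAG_KEY: parse_obj_as(AWSTagValue, "7"),
    ROLE_TAG_KEY: WORKER_ROLE_TAG_VALUE,
}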

services/clusters-keeper/src/simcore_service_clusters_keeper/core/application.py

Lines changed: 2 additions & 0 deletions
@@ -19,6 +19,7 @@
 from ..modules.ec2 import setup as setup_ec2
 from ..modules.rabbitmq import setup as setup_rabbitmq
 from ..modules.redis import setup as setup_redis
+from ..modules.ssm import setup as setup_ssm
 from ..rpc.rpc_routes import setup_rpc_routes
 from .settings import ApplicationSettings
 
@@ -55,6 +56,7 @@ def create_app(settings: ApplicationSettings) -> FastAPI:
     setup_rabbitmq(app)
     setup_rpc_routes(app)
     setup_ec2(app)
+    setup_ssm(app)
     setup_redis(app)
     setup_clusters_management(app)
 
services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py

Lines changed: 31 additions & 3 deletions
@@ -25,6 +25,7 @@
 from settings_library.ec2 import EC2Settings
 from settings_library.rabbit import RabbitSettings
 from settings_library.redis import RedisSettings
+from settings_library.ssm import SSMSettings
 from settings_library.tracing import TracingSettings
 from settings_library.utils_logging import MixinLoggingSettings
 from types_aiobotocore_ec2.literals import InstanceTypeType
@@ -50,6 +51,21 @@ class Config(EC2Settings.Config):
         }
 
 
+class ClustersKeeperSSMSettings(SSMSettings):
+    class Config(SSMSettings.Config):
+        env_prefix = CLUSTERS_KEEPER_ENV_PREFIX
+
+        schema_extra: ClassVar[dict[str, Any]] = {  # type: ignore[misc]
+            "examples": [
+                {
+                    f"{CLUSTERS_KEEPER_ENV_PREFIX}{key}": var
+                    for key, var in example.items()
+                }
+                for example in SSMSettings.Config.schema_extra["examples"]
+            ],
+        }
+
+
 class WorkersEC2InstancesSettings(BaseCustomSettings):
     WORKERS_EC2_INSTANCES_ALLOWED_TYPES: dict[str, EC2InstanceBootSpecific] = Field(
         ...,
@@ -183,6 +199,12 @@ class PrimaryEC2InstancesSettings(BaseCustomSettings):
         "that take longer than this time will be terminated as sometimes it happens that EC2 machine fail on start.",
     )
 
+    PRIMARY_EC2_INSTANCES_DOCKER_DEFAULT_ADDRESS_POOL: str = Field(
+        default="172.20.0.0/14",
+        description="defines the docker swarm default address pool in CIDR format "
+        "(see https://docs.docker.com/reference/cli/docker/swarm/init/)",
+    )
+
     @validator("PRIMARY_EC2_INSTANCES_ALLOWED_TYPES")
     @classmethod
     def check_valid_instance_names(
@@ -250,6 +272,10 @@ class ApplicationSettings(BaseCustomSettings, MixinLoggingSettings):
         auto_default_from_env=True
     )
 
+    CLUSTERS_KEEPER_SSM_ACCESS: ClustersKeeperSSMSettings | None = Field(
+        auto_default_from_env=True
+    )
+
     CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES: PrimaryEC2InstancesSettings | None = Field(
         auto_default_from_env=True
     )
@@ -285,9 +311,11 @@ class ApplicationSettings(BaseCustomSettings, MixinLoggingSettings):
         "(default to seconds, or see https://pydantic-docs.helpmanual.io/usage/types/#datetime-types for string formating)",
     )
 
-    CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION: NonNegativeInt = Field(
-        default=5,
-        description="Max number of missed heartbeats before a cluster is terminated",
+    CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION: NonNegativeInt = (
+        Field(
+            default=5,
+            description="Max number of missed heartbeats before a cluster is terminated",
+        )
     )
 
     CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DOCKER_IMAGE_TAG: str = Field(
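
Note: ClustersKeeperSSMSettings only re-prefixes the parent SSMSettings examples so that the nested settings read CLUSTERS_KEEPER_-prefixed environment variables. A small sketch of that prefixing logic on its own (the real field names live in settings_library.ssm, which is not part of this excerpt):

CLUSTERS_KEEPER_ENV_PREFIX = "CLUSTERS_KEEPER_"  # assumed value of the constant used above

# stand-in for one entry of SSMSettings.Config.schema_extra["examples"]
parent_example = {"SSM_SOME_FIELD": "some-value"}  # placeholder key, not a real field name

prefixed_example = {
    f"{CLUSTERS_KEEPER_ENV_PREFIX}{key}": var for key, var in parent_example.items()
}
assert prefixed_example == {"CLUSTERS_KEEPER_SSM_SOME_FIELD": "some-value"}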

services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters.py

Lines changed: 1 addition & 9 deletions
@@ -49,7 +49,7 @@ async def _get_primary_ec2_params(
     ec2_instance_types: list[
         EC2InstanceType
     ] = await ec2_client.get_ec2_instance_capabilities(
-        instance_type_names=[ec2_type_name]
+        instance_type_names={ec2_type_name}
     )
     assert ec2_instance_types  # nosec
     assert len(ec2_instance_types) == 1  # nosec
@@ -72,15 +72,7 @@ async def create_cluster(
         tags=creation_ec2_tags(app_settings, user_id=user_id, wallet_id=wallet_id),
         startup_script=create_startup_script(
             app_settings,
-            cluster_machines_name_prefix=get_cluster_name(
-                app_settings, user_id=user_id, wallet_id=wallet_id, is_manager=False
-            ),
             ec2_boot_specific=ec2_instance_boot_specs,
-            additional_custom_tags={
-                AWSTagKey("user_id"): AWSTagValue(f"{user_id}"),
-                AWSTagKey("wallet_id"): AWSTagValue(f"{wallet_id}"),
-                AWSTagKey("role"): AWSTagValue("worker"),
-            },
         ),
         ami_id=ec2_instance_boot_specs.ami_id,
         key_name=app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES.PRIMARY_EC2_INSTANCES_KEY_NAME,

services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py

Lines changed: 97 additions & 12 deletions
@@ -5,22 +5,40 @@
 
 import arrow
 from aws_library.ec2 import AWSTagKey, EC2InstanceData
+from aws_library.ec2._models import AWSTagValue
 from fastapi import FastAPI
 from models_library.users import UserID
 from models_library.wallets import WalletID
 from pydantic import parse_obj_as
 from servicelib.logging_utils import log_catch
-
+from servicelib.utils import limited_gather
+
+from ..constants import (
+    DOCKER_STACK_DEPLOY_COMMAND_EC2_TAG_KEY,
+    DOCKER_STACK_DEPLOY_COMMAND_NAME,
+    ROLE_TAG_KEY,
+    USER_ID_TAG_KEY,
+    WALLET_ID_TAG_KEY,
+    WORKER_ROLE_TAG_VALUE,
+)
 from ..core.settings import get_application_settings
 from ..modules.clusters import (
     delete_clusters,
     get_all_clusters,
     get_cluster_workers,
     set_instance_heartbeat,
 )
+from ..utils.clusters import create_deploy_cluster_stack_script
 from ..utils.dask import get_scheduler_auth, get_scheduler_url
-from ..utils.ec2 import HEARTBEAT_TAG_KEY
+from ..utils.ec2 import (
+    HEARTBEAT_TAG_KEY,
+    get_cluster_name,
+    user_id_from_instance_tags,
+    wallet_id_from_instance_tags,
+)
 from .dask import is_scheduler_busy, ping_scheduler
+from .ec2 import get_ec2_client
+from .ssm import get_ssm_client
 
 _logger = logging.getLogger(__name__)
 
@@ -42,8 +60,8 @@ def _get_instance_last_heartbeat(instance: EC2InstanceData) -> datetime.datetime
 async def _get_all_associated_worker_instances(
     app: FastAPI,
     primary_instances: Iterable[EC2InstanceData],
-) -> list[EC2InstanceData]:
-    worker_instances = []
+) -> set[EC2InstanceData]:
+    worker_instances: set[EC2InstanceData] = set()
     for instance in primary_instances:
         assert "user_id" in instance.tags  # nosec
         user_id = UserID(instance.tags[_USER_ID_TAG_KEY])
@@ -55,20 +73,20 @@ async def _get_all_associated_worker_instances(
             else None
         )
 
-        worker_instances.extend(
+        worker_instances.update(
             await get_cluster_workers(app, user_id=user_id, wallet_id=wallet_id)
         )
     return worker_instances
 
 
 async def _find_terminateable_instances(
     app: FastAPI, instances: Iterable[EC2InstanceData]
-) -> list[EC2InstanceData]:
+) -> set[EC2InstanceData]:
     app_settings = get_application_settings(app)
     assert app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES  # nosec
 
     # get the corresponding ec2 instance data
-    terminateable_instances: list[EC2InstanceData] = []
+    terminateable_instances: set[EC2InstanceData] = set()
 
     time_to_wait_before_termination = (
         app_settings.CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION
@@ -82,7 +100,7 @@ async def _find_terminateable_instances(
             elapsed_time_since_heartbeat = arrow.utcnow().datetime - last_heartbeat
             allowed_time_to_wait = time_to_wait_before_termination
             if elapsed_time_since_heartbeat >= allowed_time_to_wait:
-                terminateable_instances.append(instance)
+                terminateable_instances.add(instance)
             else:
                 _logger.info(
                     "%s has still %ss before being terminateable",
@@ -93,14 +111,14 @@ async def _find_terminateable_instances(
             elapsed_time_since_startup = arrow.utcnow().datetime - instance.launch_time
             allowed_time_to_wait = startup_delay
             if elapsed_time_since_startup >= allowed_time_to_wait:
-                terminateable_instances.append(instance)
+                terminateable_instances.add(instance)
 
     # get all terminateable instances associated worker instances
     worker_instances = await _get_all_associated_worker_instances(
         app, terminateable_instances
     )
 
-    return terminateable_instances + worker_instances
+    return terminateable_instances.union(worker_instances)
 
 
 async def check_clusters(app: FastAPI) -> None:
@@ -112,6 +130,7 @@ async def check_clusters(app: FastAPI) -> None:
         if await ping_scheduler(get_scheduler_url(instance), get_scheduler_auth(app))
     }
 
+    # set instance heartbeat if scheduler is busy
     for instance in connected_intances:
         with log_catch(_logger, reraise=False):
             # NOTE: some connected instance could in theory break between these 2 calls, therefore this is silenced and will
@@ -124,6 +143,7 @@ async def check_clusters(app: FastAPI) -> None:
                 f"{instance.id=} for {instance.tags=}",
             )
             await set_instance_heartbeat(app, instance=instance)
+    # clean any cluster that is not doing anything
     if terminateable_instances := await _find_terminateable_instances(
         app, connected_intances
     ):
@@ -138,7 +158,7 @@ async def check_clusters(app: FastAPI) -> None:
         for instance in disconnected_instances
         if _get_instance_last_heartbeat(instance) is None
     }
-
+    # remove instances that were starting for too long
     if terminateable_instances := await _find_terminateable_instances(
         app, starting_instances
     ):
@@ -149,7 +169,72 @@ async def check_clusters(app: FastAPI) -> None:
         )
         await delete_clusters(app, instances=terminateable_instances)
 
-    # the other instances are broken (they were at some point connected but now not anymore)
+    # NOTE: transmit command to start docker swarm/stack if needed
+    # once the instance is connected to the SSM server,
+    # use ssm client to send the command to these instances,
+    # we send a command that contains:
+    # the docker-compose file in binary,
+    # the call to init the docker swarm and the call to deploy the stack
+    instances_in_need_of_deployment = {
+        i
+        for i in starting_instances - terminateable_instances
+        if DOCKER_STACK_DEPLOY_COMMAND_EC2_TAG_KEY not in i.tags
+    }
+
+    if instances_in_need_of_deployment:
+        app_settings = get_application_settings(app)
+        ssm_client = get_ssm_client(app)
+        ec2_client = get_ec2_client(app)
+        instances_in_need_of_deployment_ssm_connection_state = await limited_gather(
+            *[
+                ssm_client.is_instance_connected_to_ssm_server(i.id)
+                for i in instances_in_need_of_deployment
+            ],
+            reraise=False,
+            log=_logger,
+            limit=20,
+        )
+        ec2_connected_to_ssm_server = [
+            i
+            for i, c in zip(
+                instances_in_need_of_deployment,
+                instances_in_need_of_deployment_ssm_connection_state,
+                strict=True,
+            )
+            if c is True
+        ]
+        started_instances_ready_for_command = ec2_connected_to_ssm_server
+        if started_instances_ready_for_command:
+            # we need to send 1 command per machine here, as the user_id/wallet_id changes
+            for i in started_instances_ready_for_command:
+                ssm_command = await ssm_client.send_command(
+                    [i.id],
+                    command=create_deploy_cluster_stack_script(
+                        app_settings,
+                        cluster_machines_name_prefix=get_cluster_name(
+                            app_settings,
+                            user_id=user_id_from_instance_tags(i.tags),
+                            wallet_id=wallet_id_from_instance_tags(i.tags),
+                            is_manager=False,
+                        ),
+                        additional_custom_tags={
+                            USER_ID_TAG_KEY: i.tags[USER_ID_TAG_KEY],
+                            WALLET_ID_TAG_KEY: i.tags[WALLET_ID_TAG_KEY],
+                            ROLE_TAG_KEY: WORKER_ROLE_TAG_VALUE,
+                        },
+                    ),
+                    command_name=DOCKER_STACK_DEPLOY_COMMAND_NAME,
+                )
+            await ec2_client.set_instances_tags(
+                started_instances_ready_for_command,
+                tags={
+                    DOCKER_STACK_DEPLOY_COMMAND_EC2_TAG_KEY: AWSTagValue(
+                        ssm_command.command_id
+                    ),
+                },
+            )
+
+    # the remaining instances are broken (they were at some point connected but now not anymore)
     broken_instances = disconnected_instances - starting_instances
     if terminateable_instances := await _find_terminateable_instances(
         app, broken_instances
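
Note: per the NOTE in check_clusters and the new PRIMARY_EC2_INSTANCES_DOCKER_DEFAULT_ADDRESS_POOL setting, the SSM command carries the docker-compose file plus the swarm-init and stack-deploy calls. create_deploy_cluster_stack_script itself is not part of this excerpt; the sketch below is only a rough, hypothetical approximation of what such a generated script could contain:

def sketch_deploy_cluster_stack_script(
    docker_default_address_pool: str, compose_file_b64: str, stack_name: str
) -> str:
    # hypothetical sketch; the real implementation lives in utils/clusters.py
    return "\n".join(
        [
            # initialize the swarm with the configured default address pool
            f"docker swarm init --default-addr-pool {docker_default_address_pool}",
            # materialize the (base64-encoded) docker-compose file on the machine
            f"echo {compose_file_b64} | base64 -d > docker-compose.yml",
            # deploy the cluster stack
            f"docker stack deploy --compose-file docker-compose.yml {stack_name}",
        ]
    )


# e.g. using the new setting's default value; the payload and stack name are made up
print(sketch_deploy_cluster_stack_script("172.20.0.0/14", "PGNvbXBvc2UtZmlsZT4=", "dask_stack"))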
