import threading
from datetime import datetime, timedelta
import pytz
from typing import Iterable, Callable, NamedTuple, Set, Dict, List, Tuple
import logging_aux
from restclient import HpcRestClient
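# HpcState enumerates the lifecycle states that HpcClusterManager drives each tracked
# compute node through, typically Provisioning -> Running -> Draining -> Closing -> Closed.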
class HpcState:
Unknown, Provisioning, Running, Draining, Closing, Closed = range(6)
Names = ["Unknown", "Provisioning", "Running", "Draining", "Closing", "Closed"]
def _upper_strings(strs):
# type: (Iterable[str]) -> Iterable[str]
return (x.upper() for x in strs)
def _check_node_health_unapproved(node_status):
# type: (Dict[str, any]) -> bool
return node_status[
HpcRestClient.NODE_STATUS_NODE_HEALTH_KEY] == HpcRestClient.NODE_STATUS_NODE_HEALTH_UNAPPROVED_VALUE
def _find_missing_nodes(rq_nodes, res_nodes):
# type: (List[str], List[str]) -> List[str]
return [name for name in _upper_strings(rq_nodes) if name not in _upper_strings(res_nodes)]
def _check_node_state(node_status, target_state):
# type: (dict[str, any], str) -> bool
return node_status[HpcRestClient.NODE_STATUS_NODE_STATE_KEY] == target_state
def _check_node_state_offline(node_status):
# type: (dict[str, any]) -> bool
return _check_node_state(node_status, HpcRestClient.NODE_STATUS_NODE_STATE_OFFLINE_VALUE)
def _check_node_state_online(node_status):
# type: (dict[str, any]) -> bool
return _check_node_state(node_status, HpcRestClient.NODE_STATUS_NODE_STATE_ONLINE_VALUE)
def _get_hostname_from_fqdn(fqdn):
# type: (str) -> str
return fqdn.split('.')[0]
def _get_node_name_from_status(node_status):
# type: (dict[str, any]) -> str
return node_status[HpcRestClient.NODE_STATUS_NODE_NAME_KEY]
def _get_node_names_from_status(node_status_list):
# type: (List[Dict[str, any]]) -> List[str]
return map(_get_node_name_from_status, node_status_list)
def _get_node_state_from_status(node_status):
# type: (dict[str, any]) -> str
return node_status[HpcRestClient.NODE_STATUS_NODE_STATE_KEY]
class HpcClusterManager(object):
CHECK_CONFIGURING_NODES_INTERVAL = 5 # in seconds
MESOS_NODE_GROUP_NAME = "Mesos"
MESOS_NODE_GROUP_DESCRIPTION = "The Mesos compute nodes in the cluster"
NODE_IDLE_TIMEOUT = 180.0
# TODO: add configuration_timeout
def __init__(self, hpc_rest_client, provisioning_timeout=timedelta(minutes=15), idle_timeout=timedelta(minutes=3),
node_group=""):
# type: (HpcRestClient, timedelta, timedelta, str) -> ()
self._slave_info_table = {} # type: Dict[str, SlaveInfo]
self._removed_nodes = set() # type: Set[str]
self._node_idle_check_table = {}
self.logger = logging_aux.init_logger_aux("hpcframework.clustermanager", "hpcframework.clustermanager.log")
self._table_lock = threading.Lock()
self._provisioning_timeout = provisioning_timeout
self._hpc_client = hpc_rest_client
self._node_group = node_group # TODO: change to a centralized config
self._node_idle_timedelta = idle_timeout
# callbacks
self._node_closed_callbacks = [] # type: [Callable[[[str]], ()]]
def _node_group_specified(self):
# type: () -> bool
return self._node_group != ""
def subscribe_node_closed_callback(self, callback):
# type: (Callable[[list[str]], ()]) -> ()
self._node_closed_callbacks.append(callback)
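# Record a heartbeat entry for a Mesos agent, keyed by its upper-cased hostname.
# New entries start in the Provisioning state; a duplicate hostname reporting a
# different FQDN is logged and ignored.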
def add_slaveinfo(self, fqdn, agent_id, task_id, cpus, last_heartbeat=None):
# type: (str, str, str, float, datetime) -> ()
if last_heartbeat is None:
last_heartbeat = datetime.now(pytz.utc)
u_fqdn = fqdn.upper()
hostname = _get_hostname_from_fqdn(u_fqdn)
if hostname in self._slave_info_table:
if self._slave_info_table[hostname].fqdn != u_fqdn:
self.logger.error(
"Duplicated hostname {} detected. Existing fqdn: {}, new fqdn {}. Ignore new heartbeat entry.".format(
hostname, self._slave_info_table[hostname].fqdn, u_fqdn))
return
elif self._slave_info_table[hostname].state != HpcState.Closed:
self.logger.warn("Heart beat entry of {} existed. old value: {}.".format(
hostname, str(self._slave_info_table[hostname])))
slaveinfo = SlaveInfo(hostname, u_fqdn, agent_id, task_id, cpus, last_heartbeat, HpcState.Provisioning)
self._slave_info_table[hostname] = slaveinfo
self.logger.info("Heart beat entry added: {}".format(str(slaveinfo)))
def update_slaves_last_seen(self, hostname_arr, now=None):
# type:(Iterable[str], datetime) -> ()
if now is None:
now = datetime.now(pytz.utc)
for hostname in hostname_arr:
self.update_slave_last_seen(hostname, now)
def update_slave_last_seen(self, hostname, now=None):
# type: (str, datetime) -> ()
if now is None:
now = datetime.now(pytz.utc)
u_hostname = hostname.upper()
if u_hostname in self._slave_info_table:
self._slave_info_table[u_hostname] = self._slave_info_table[u_hostname]._replace(last_seen=now)
self.logger.info("Slave seen: {}".format(u_hostname))
else:
self.logger.error("Host {} is not recognized. No entry will be updated.".format(u_hostname))
self.logger.debug("_table {} ".format(self._slave_info_table))
def get_task_info(self, hostname):
# type: (str) -> (str, str)
u_hostname = hostname.upper()
if u_hostname in self._slave_info_table:
entry = self._slave_info_table[u_hostname]
return entry.task_id, entry.agent_id
else:
self.logger.error("Host {} is not recognized. Failed to get task info.".format(u_hostname))
return "", ""
def get_host_state(self, hostname):
# type: (str) -> int
u_hostname = hostname.upper()
if u_hostname in self._slave_info_table:
entry = self._slave_info_table[u_hostname]
return entry.state
else:
self.logger.error("Host {} is not recognized. Failed to get host state.".format(u_hostname))
return HpcState.Unknown
def _exec_callback(self, callbacks):
for callback in callbacks:
try:
self.logger.debug('Calling callback %s' % callback.__name__)
callback()
except Exception as e:
self.logger.exception('Error in %s callback: %s' % (callback.__name__, str(e)))
def check_fqdn_collision(self, fqdn):
# type: (str) -> bool
u_fqdn = fqdn.upper()
hostname = _get_hostname_from_fqdn(u_fqdn)
if hostname in self._slave_info_table:
if self._slave_info_table[hostname].fqdn != u_fqdn:
return True
return False
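# Partition the tracked hosts into those whose provisioning has exceeded the
# provisioning timeout and those currently in the Running state.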
def _check_timeout(self, now=None):
# type: (datetime) -> ([SlaveInfo], [SlaveInfo])
# TODO: Check configuring timeout
if now is None:
now = datetime.now(pytz.utc)
provision_timeout_list = []
running_list = []
for host in dict(self._slave_info_table).itervalues():
if host.state == HpcState.Provisioning and now - host.last_seen >= self._provisioning_timeout:
self.logger.warn("Provisioning timeout: {}".format(str(host)))
provision_timeout_list.append(host)
elif host.state == HpcState.Running:
running_list.append(host)
return provision_timeout_list, running_list
def get_cores_in_provisioning(self):
cores = 0.0
for host in dict(self._slave_info_table).itervalues():
if host.state == HpcState.Provisioning:
cores += host.cpus
self.logger.info("Cores in provisioning: {}".format(cores))
return cores
def _get_nodes_name_in_state(self, state):
# type: (HpcState) -> [str]
return [host.hostname for host in dict(self._slave_info_table).itervalues() if host.state == state]
# TODO: make state_machine methods more testable
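# One pass of the provisioning state machine: ensure the Mesos node group exists,
# assign the default node template to unapproved nodes, take online nodes offline so
# their group membership can be changed, add offline nodes to the Mesos (and target)
# group, bring correctly grouped offline nodes online, and mark fully configured
# nodes as Running.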
def _provision_compute_nodes_state_machine(self):
# type: () -> ()
provisioning_node_names = self._get_nodes_name_in_state(HpcState.Provisioning)
if not provisioning_node_names:
return
self.logger.info("Nodes in provisioning: {}".format(provisioning_node_names))
groups = self._hpc_client.list_node_groups(self.MESOS_NODE_GROUP_NAME)
if self.MESOS_NODE_GROUP_NAME not in groups:
self._hpc_client.add_node_group(self.MESOS_NODE_GROUP_NAME, self.MESOS_NODE_GROUP_DESCRIPTION)
# We won't create the target node group, but we do check that it exists.
# Check after the Mesos group has been created, so the specified group may itself be the Mesos group.
if self._node_group_specified():
target_group = self._hpc_client.list_node_groups(self._node_group)
if self._node_group.upper() not in (x.upper() for x in target_group):
self.logger.error(
"Target node group is not created:{}. Stop configure compute nodes.".format(self._node_group))
return
# state check
node_status_list = self._hpc_client.get_node_status_exact(provisioning_node_names)
self.logger.info("Get node_status_list:{}".format(str(node_status_list)))
unapproved_node_list = []
take_offline_node_list = []
bring_online_node_list = []
change_node_group_node_list = []
provisioned_node_names = []
invalid_state_node_dict = {}
for node_status in node_status_list:
node_name = _get_node_name_from_status(node_status)
node_state = _get_node_state_from_status(node_status)
if _check_node_health_unapproved(node_status):
unapproved_node_list.append(node_name)
# node approved
elif (not self._check_node_in_mesos_group(node_status) or (
self._node_group_specified() and self._check_node_in_specified_group(node_status))):
if _check_node_state_online(node_status):
take_offline_node_list.append(node_name)
elif _check_node_state_offline(node_status):
change_node_group_node_list.append(node_name)
else:
invalid_state_node_dict[node_name] = node_state
# node group properly set
elif _check_node_state_offline(node_status):
bring_online_node_list.append(node_name)
elif _check_node_state_online(node_status):
# this node is all set
provisioned_node_names.append(node_name)
else:
invalid_state_node_dict[node_name] = node_state
missing_nodes = _find_missing_nodes(provisioning_node_names, (_get_node_names_from_status(node_status_list)))
try:
if invalid_state_node_dict:
self.logger.info("Node(s) in invalid state when provisioning: {}".format(str(invalid_state_node_dict)))
if unapproved_node_list:
self.logger.info("Assigning node template for node(s): {}".format(str(unapproved_node_list)))
self._hpc_client.assign_default_compute_node_template(unapproved_node_list)
if take_offline_node_list:
self.logger.info("Taking node(s) offline: {}".format(str(take_offline_node_list)))
self._hpc_client.take_nodes_offline(take_offline_node_list)
if bring_online_node_list:
self.logger.info("Bringing node(s) online: {}".format(str(bring_online_node_list)))
self._hpc_client.bring_nodes_online(bring_online_node_list)
if change_node_group_node_list:
self.logger.info("Changing node group node(s): {}".format(str(change_node_group_node_list)))
self._hpc_client.add_node_to_node_group(self.MESOS_NODE_GROUP_NAME, change_node_group_node_list)
if self._node_group_specified():
self._hpc_client.add_node_to_node_group(self._node_group, change_node_group_node_list)
except:
# Swallow all exceptions here, as we don't want any exception to prevent provisioned nodes from working
self.logger.exception('Exception happened when configuring compute node.')
# state change
if provisioned_node_names:
self.logger.info("Nodes provisioned: {}".format(provisioned_node_names))
self._set_nodes_running(provisioned_node_names)
if missing_nodes:
# Missing is a valid state for nodes in provisioning.
self.logger.info("Nodes missing when provisioning: {}".format(missing_nodes))
def _check_runaway_and_idle_compute_nodes(self):
# type: () -> ()
(provision_timeout_list, running_list) = self._check_timeout()
if provision_timeout_list:
self.logger.info("Get provision_timeout_list:{}".format(str(provision_timeout_list)))
self._set_nodes_draining(host.hostname for host in provision_timeout_list)
if running_list:
running_node_names = [host.hostname for host in running_list]
node_status_list = self._hpc_client.get_node_status_exact(running_node_names)
# Unapproved nodes and missing nodes in running state are runaway nodes
unapproved_nodes = [_get_node_name_from_status(status) for status in node_status_list if
_check_node_health_unapproved(status)]
missing_nodes = _find_missing_nodes(running_node_names, _get_node_names_from_status(node_status_list))
if unapproved_nodes:
self.logger.warn("Unapproved nodes in running state:{}".format(unapproved_nodes))
self._set_nodes_closed(unapproved_nodes)
if missing_nodes:
self.logger.warn("Missing nodes in running state:{}".format(missing_nodes))
self._set_nodes_closed(missing_nodes)
if self.get_cores_in_provisioning() <= 0.0:
# Update running node names to remove closed nodes
running_node_names = [name for name in running_node_names if
(name not in unapproved_nodes and name not in missing_nodes)]
idle_nodes = self._hpc_client.check_nodes_idle(running_node_names)
self.logger.info("Get idle_nodes:{}".format(str(idle_nodes)))
idle_timeout_nodes = self._check_node_idle_timeout([node.node_name for node in idle_nodes])
self.logger.info("Get idle_timeout_nodes:{}".format(str(idle_timeout_nodes)))
self._set_nodes_draining(idle_timeout_nodes)
else:
# If nodes are still being provisioned, we won't shrink at the same time
self._reset_node_idle_check()
self.logger.info(
"Reset node idle timeout as there are nodes being provisioning. Cores in provisioning: {}".format(
self.get_cores_in_provisioning()))
def _reset_node_idle_check(self):
# type: () -> ()
self._node_idle_check_table.clear()
self._removed_nodes.clear()
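# Track the first time each node was observed idle and return the nodes that have
# stayed idle for at least the configured idle timeout. Nodes recently marked as
# removed have their idle clock restarted.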
def _check_node_idle_timeout(self, node_names, now=None):
# type: (Iterable[str], datetime) -> [str]
if now is None:
now = datetime.now(pytz.utc)
new_node_idle_check_table = {}
for u_node_name in _upper_strings(node_names):
if u_node_name in self._node_idle_check_table:
if u_node_name in self._removed_nodes:
new_node_idle_check_table[u_node_name] = now
self._removed_nodes.discard(u_node_name)
else:
new_node_idle_check_table[u_node_name] = self._node_idle_check_table[u_node_name]
else:
new_node_idle_check_table[u_node_name] = now
self._node_idle_check_table = new_node_idle_check_table
self.logger.info("_check_node_idle_timeout: now - " + str(now))
self.logger.info("_check_node_idle_timeout: " + str(self._node_idle_check_table))
return [name for name, value in self._node_idle_check_table.iteritems() if
(now - value) >= self._node_idle_timedelta]
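# Periodic driver: every CHECK_CONFIGURING_NODES_INTERVAL seconds advance the
# provisioning state machine, check for runaway and idle nodes, then drain and close
# nodes marked for removal. Exceptions are logged and the timer is always rescheduled.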
def _start_configure_cluster_timer(self):
# type: () -> ()
try:
self._provision_compute_nodes_state_machine()
self._check_runaway_and_idle_compute_nodes()
self._drain_and_stop_nodes()
except Exception as ex:
self.logger.exception(ex)
timer = threading.Timer(self.CHECK_CONFIGURING_NODES_INTERVAL, self._start_configure_cluster_timer)
timer.daemon = True
timer.start()
def start(self):
self._start_configure_cluster_timer()
def _check_deploy_failure(self, set_nodes):
# type: (List[Tuple[str, int]]) -> ()
for node, old_state in set_nodes:
if old_state == HpcState.Provisioning:
self.logger.error(
"Node {} failed to deploy. Previous state: {}".format(node, HpcState.Names[old_state]))
def _set_nodes_draining(self, node_names):
# type: (Iterable[str]) -> ()
self._removed_nodes.update(_upper_strings(node_names))
self._check_deploy_failure(self._set_node_state(node_names, HpcState.Draining, "Draining"))
def _set_nodes_closing(self, node_names):
# type: (Iterable[str]) -> ()
self._removed_nodes.update(_upper_strings(node_names))
self._check_deploy_failure(self._set_node_state(node_names, HpcState.Closing, "Closing"))
def _set_nodes_closed(self, node_names):
# type: (Iterable[str]) -> ()
self._removed_nodes.update(_upper_strings(node_names))
closed_nodes = self._set_node_state(node_names, HpcState.Closed, "Closed")
if self._node_closed_callbacks:
for callback in self._node_closed_callbacks:
callback([node for node, _ in closed_nodes])
self._check_deploy_failure(closed_nodes)
def _set_nodes_running(self, node_names):
# type: (Iterable[str]) -> ()
self._set_node_state(node_names, HpcState.Running, "Running")
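# Transition the given hosts to the target state and return (hostname, previous state)
# pairs for the hosts whose state actually changed.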
def _set_node_state(self, node_names, node_state, state_name):
# type: (Iterable[str], int, str) -> [(str, int)]
set_nodes = []
for node_name in node_names:
u_hostname = node_name.upper()
if u_hostname in self._slave_info_table:
if self._slave_info_table[u_hostname].state != node_state:
old_state = self._slave_info_table[u_hostname].state
self._slave_info_table[u_hostname] = self._slave_info_table[u_hostname]._replace(state=node_state)
set_nodes.append((u_hostname, old_state))
self.logger.info("Host {} set to {} from {}.".format(
u_hostname, state_name, HpcState.Names[old_state]))
else:
self.logger.error("Host {} is not recognized. State {} Ignored.".format(u_hostname, state_name))
return set_nodes
def _drain_and_stop_nodes(self):
# type: () -> ()
node_names_to_drain = self._get_nodes_name_in_state(HpcState.Draining)
node_names_to_close = self._get_nodes_name_in_state(HpcState.Closing)
if node_names_to_drain:
drained_node_names = self._drain_nodes_state_machine(node_names_to_drain)
if drained_node_names:
node_names_to_close += drained_node_names # short-cut the drained nodes to close
if node_names_to_close:
self._close_node_state_machine(node_names_to_close)
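# One pass of the draining state machine: take online nodes offline; nodes that are
# already offline are considered drained and moved to Closing, and nodes missing from
# the HPC cluster are set to Closed.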
def _drain_nodes_state_machine(self, node_names):
# type: (list[str]) -> list[str]
self.logger.info("Draining nodes: {}".format(node_names))
take_offline_node_list = []
invalid_state_node_dict = {}
drained_node_names = []
node_status_list = self._hpc_client.get_node_status_exact(node_names)
for node_status in node_status_list:
node_name = _get_node_name_from_status(node_status)
node_state = _get_node_state_from_status(node_status)
if _check_node_state_online(node_status):
take_offline_node_list.append(node_name)
elif _check_node_state_offline(node_status):
drained_node_names.append(node_name)
else:
invalid_state_node_dict[node_name] = node_state
missing_nodes = _find_missing_nodes(node_names, _get_node_names_from_status(node_status_list))
try:
if invalid_state_node_dict:
self.logger.info("Node(s) in invalid state when draining: {}".format(str(invalid_state_node_dict)))
if take_offline_node_list:
self.logger.info("Taking node(s) offline: {}".format(str(take_offline_node_list)))
self._hpc_client.take_nodes_offline(take_offline_node_list)
except:
# Swallow all exceptions here, as we don't want any exception to prevent drained nodes from being closed
self.logger.exception('Exception happened when draining compute node.')
if drained_node_names:
self.logger.info("Drained nodes:{}".format(drained_node_names))
self._set_nodes_closing(drained_node_names)
if missing_nodes:
self.logger.info("Missing nodes when draining:{}".format(missing_nodes))
self._set_nodes_closed(missing_nodes)
return drained_node_names
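# One pass of the closing state machine: remove drained (offline) nodes from the HPC
# cluster; unapproved or missing nodes are treated as already removed and set to Closed,
# while nodes found back online are sent back to Draining.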
def _close_node_state_machine(self, node_names):
# type: (list[str]) -> ()
self.logger.info("Closing nodes: {}".format(node_names))
to_remove_node_names = []
re_drain_node_names = []
closed_node = []
node_status_list = self._hpc_client.get_node_status_exact(node_names)
for node_status in node_status_list:
node_name = node_status[HpcRestClient.NODE_STATUS_NODE_NAME_KEY] # type: str
if _check_node_health_unapproved(node_status):
# already removed node
closed_node.append(node_name)
else:
if not _check_node_state_offline(node_status):
# node is not properly drained
re_drain_node_names.append(node_name)
else:
to_remove_node_names.append(node_name)
# more already removed nodes
closed_node += _find_missing_nodes(node_names, re_drain_node_names + to_remove_node_names)
try:
if to_remove_node_names:
self.logger.info("Remove node(s): {}".format(to_remove_node_names))
self._hpc_client.remove_nodes(to_remove_node_names)
except:
# Swallow all exceptions here, as we don't want any exception to prevent removed nodes from going to Closed
self.logger.exception('Exception happened when removing compute node.')
if closed_node:
self.logger.info("Closed nodes:{}".format(closed_node))
self._set_nodes_closed(closed_node)
if re_drain_node_names:
self.logger.info("Closed nodes failed:{}".format(re_drain_node_names))
self._set_nodes_draining(re_drain_node_names)
def _check_node_in_mesos_group(self, node_status):
# type: (dict[str, any]) -> bool
return self.MESOS_NODE_GROUP_NAME.upper() in _upper_strings(
node_status[HpcRestClient.NODE_STATUS_NODE_GROUP_KEY])
def _check_node_in_specified_group(self, node_status):
# type: (dict[str, any]) -> bool
# NOTE: despite its name, this returns True when the node is NOT yet a member of the
# specified node group, which is how the provisioning state machine above consumes it.
return self._node_group.upper() not in _upper_strings(
node_status[HpcRestClient.NODE_STATUS_NODE_GROUP_KEY])
SlaveInfo = NamedTuple("SlaveInfo",
[("hostname", str), ("fqdn", str), ("agent_id", str), ("task_id", str), ("cpus", float),
("last_seen", datetime), ("state", int)])