3
3
4
4
from prometheus_client import start_http_server , Gauge , Counter , Histogram
5
5
6
+ # temp code [au]
7
+ import logging
8
+ logger = logging .getLogger ()
9
+ logger .setLevel (logging .INFO )
10
+ # --------------------------------
11
+
6
12
def start (metrics_agent_port ):
7
13
start_http_server (metrics_agent_port )
8
14
@@ -12,10 +18,16 @@ def start(metrics_agent_port):
12
18
13
19
14
20
def add_worker (node_id , service_name ):
21
+ # temp code [au]
22
+ logger .info ("In monitoring add_worker" )
23
+ # --------------------------------
15
24
node_counter .labels (node_id ,service_name ).inc ()
16
25
17
26
18
27
def terminate_worker (node_id , service_name ):
28
+ # temp code [au]
29
+ logger .info ("In monitoring terminate_worker" )
30
+ # --------------------------------
19
31
node_counter .labels (node_id ,service_name ).dec ()
20
32
21
33
@@ -31,6 +43,9 @@ def terminate_worker(node_id, service_name):
31
43
job_started = Gauge (JQUEUER_JOB_STARTED ,JQUEUER_JOB_STARTED ,["node_id" ,"experiment_id" ,"service_name" ,"qworker_id" ,"job_id" ])
32
44
33
45
def run_job (node_id , experiment_id , service_name , qworker_id , job_id ):
46
+ # temp code [au]
47
+ logger .info ("In monitoring run_job" )
48
+ # --------------------------------
34
49
job_started_timestamp .labels (node_id ,experiment_id ,service_name ,job_id ).set (time .time ())
35
50
job_running_timestamp .labels (node_id ,experiment_id ,service_name ,job_id ).set (time .time ())
36
51
job_running .labels (node_id ,experiment_id ,service_name ,qworker_id ,job_id ).set (1 )
@@ -47,6 +62,9 @@ def run_job(node_id, experiment_id, service_name, qworker_id, job_id):
47
62
job_accomplished = Gauge (JQUEUER_JOB_ACCOMPLISHED ,JQUEUER_JOB_ACCOMPLISHED ,["node_id" ,"experiment_id" ,"service_name" ,"qworker_id" ,"job_id" ])
48
63
49
64
def terminate_job (node_id , experiment_id , service_name , qworker_id , job_id , start_time ):
65
+ # temp code [au]
66
+ logger .info ("In monitoring terminate_job" )
67
+ # --------------------------------
50
68
elapsed_time = time .time () - start_time
51
69
job_accomplished_timestamp .labels (node_id ,experiment_id ,service_name ,job_id ).set (time .time ())
52
70
job_running_timestamp .labels (node_id ,experiment_id ,service_name ,job_id ).set (time .time ())
@@ -64,6 +82,9 @@ def terminate_job(node_id, experiment_id, service_name, qworker_id, job_id, star
64
82
job_failed = Gauge (JQUEUER_JOB_FAILED ,JQUEUER_JOB_FAILED ,["node_id" ,"experiment_id" ,"service_name" ,"qworker_id" ,"job_id" ])
65
83
66
84
def job_failed (node_id , experiment_id , service_name , qworker_id , job_id , fail_time ):
85
+ # temp code [au]
86
+ logger .info ("In monitoring job_failed" )
87
+ # --------------------------------
67
88
elapsed_time = time .time () - fail_time
68
89
job_failed_timestamp .labels (node_id ,experiment_id ,service_name ,job_id ).set (time .time ())
69
90
job_running_timestamp .labels (node_id ,experiment_id ,service_name ,job_id ).set (time .time ())
@@ -84,6 +105,9 @@ def job_failed(node_id, experiment_id, service_name, qworker_id, job_id, fail_ti
84
105
task_started = Gauge (JQUEUER_TASK_STARTED ,JQUEUER_TASK_STARTED ,["node_id" ,"experiment_id" ,"service_name" ,"qworker_id" ,"job_id" ,"task_id" ])
85
106
86
107
def run_task (node_id , experiment_id , service_name , qworker_id , job_id , task_id ):
108
+ # temp code [au]
109
+ logger .info ("In monitoring run_task" )
110
+ # --------------------------------
87
111
task_started_timestamp .labels (node_id ,experiment_id ,service_name ,job_id ,task_id ).set (time .time ())
88
112
task_running_timestamp .labels (node_id ,experiment_id ,service_name ,job_id ,task_id ).set (time .time ())
89
113
task_running .labels (node_id ,experiment_id ,service_name ,qworker_id ,job_id ,task_id ).set (1 )
@@ -101,6 +125,9 @@ def run_task(node_id, experiment_id, service_name, qworker_id, job_id, task_id):
101
125
def terminate_task (
102
126
node_id , experiment_id , service_name , qworker_id , job_id , task_id , start_time
103
127
):
128
+ # temp code [au]
129
+ logger .info ("In monitoring terminate_task" )
130
+ # --------------------------------
104
131
elapsed_time = time .time () - start_time
105
132
task_accomplished_timestamp .labels (node_id ,experiment_id ,service_name ,job_id ,task_id ).set (time .time ())
106
133
# In the previous case, this didn't include task_id.
@@ -121,6 +148,9 @@ def terminate_task(
121
148
def task_failed (
122
149
node_id , experiment_id , service_name , qworker_id , job_id , task_id , fail_time
123
150
):
151
+ # temp code [au]
152
+ logger .info ("In monitoring task_failed" )
153
+ # --------------------------------
124
154
elapsed_time = time .time () - fail_time
125
155
task_failed_timestamp .labels (node_id ,experiment_id ,service_name ,job_id ,task_id ).set (time .time ())
126
156
# In the previous case, this didn't include task_id.
0 commit comments