Skip to content

Commit b49680b

Browse files
committed
added some log messages
1 parent 4e6bf0f commit b49680b

File tree

2 files changed

+40
-0
lines changed

2 files changed

+40
-0
lines changed

job_operations.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,12 @@
1313
import container_worker as jqw
1414
from container_worker import job_app
1515

16+
# temp code [au]
# NOTE(review): logging.getLogger() called without a name returns the root
# logger, so the setLevel(logging.INFO) call below sets the root logger's
# level rather than a level scoped to this module.
17+
import logging
18+
logger = logging.getLogger()
19+
logger.setLevel(logging.INFO)
20+
# --------------------------------
21+
1622
# What to do when a job fails
1723
class JQueuer_Task(celery.Task):
1824
def on_failure(self, exc, task_id, args, kwargs, einfo):
@@ -37,6 +43,10 @@ def add(self, exp_id, job_queue_id, job):
3743

3844
worker_id = self.request.hostname.split("@")[1]
3945

46+
# temp code [au]
47+
logger.info("In job_operations run_job")
48+
# --------------------------------
49+
4050
monitoring.run_job(
4151
getNodeID(worker_id), exp_id, getServiceName(worker_id), worker_id, job["id"]
4252
)

monitoring.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@
33

44
from prometheus_client import start_http_server, Gauge, Counter, Histogram
55

6+
# temp code [au]
# NOTE(review): logging.getLogger() called without a name returns the root
# logger, so the setLevel(logging.INFO) call below sets the root logger's
# level rather than a level scoped to this module.
7+
import logging
8+
logger = logging.getLogger()
9+
logger.setLevel(logging.INFO)
10+
# --------------------------------
11+
612
def start(metrics_agent_port):
713
start_http_server(metrics_agent_port)
814

@@ -12,10 +18,16 @@ def start(metrics_agent_port):
1218

1319

1420
def add_worker(node_id, service_name):
    """Log the call, then increment ``node_counter`` for one worker.

    The metric child to increment is selected by the label pair
    ``(node_id, service_name)``.
    """
    # temp code [au]: trace entry into this function.
    logger.info("In monitoring add_worker")

    worker_metric = node_counter.labels(node_id, service_name)
    worker_metric.inc()
1625

1726

1827
def terminate_worker(node_id, service_name):
    """Log the call, then decrement ``node_counter`` for one worker.

    The metric child to decrement is selected by the label pair
    ``(node_id, service_name)``.
    """
    # temp code [au]: trace entry into this function.
    logger.info("In monitoring terminate_worker")

    worker_metric = node_counter.labels(node_id, service_name)
    worker_metric.dec()
2032

2133

@@ -31,6 +43,9 @@ def terminate_worker(node_id, service_name):
3143
job_started = Gauge(JQUEUER_JOB_STARTED,JQUEUER_JOB_STARTED,["node_id","experiment_id","service_name","qworker_id","job_id"])
3244

3345
def run_job(node_id, experiment_id, service_name, qworker_id, job_id):
46+
# temp code [au]
47+
logger.info("In monitoring run_job")
48+
# --------------------------------
3449
job_started_timestamp.labels(node_id,experiment_id,service_name,job_id).set(time.time())
3550
job_running_timestamp.labels(node_id,experiment_id,service_name,job_id).set(time.time())
3651
job_running.labels(node_id,experiment_id,service_name,qworker_id,job_id).set(1)
@@ -47,6 +62,9 @@ def run_job(node_id, experiment_id, service_name, qworker_id, job_id):
4762
job_accomplished = Gauge(JQUEUER_JOB_ACCOMPLISHED,JQUEUER_JOB_ACCOMPLISHED,["node_id","experiment_id","service_name","qworker_id","job_id"])
4863

4964
def terminate_job(node_id, experiment_id, service_name, qworker_id, job_id, start_time):
65+
# temp code [au]
66+
logger.info("In monitoring terminate_job")
67+
# --------------------------------
5068
elapsed_time = time.time() - start_time
5169
job_accomplished_timestamp.labels(node_id,experiment_id,service_name,job_id).set(time.time())
5270
job_running_timestamp.labels(node_id,experiment_id,service_name,job_id).set(time.time())
@@ -64,6 +82,9 @@ def terminate_job(node_id, experiment_id, service_name, qworker_id, job_id, star
6482
job_failed = Gauge(JQUEUER_JOB_FAILED,JQUEUER_JOB_FAILED,["node_id","experiment_id","service_name","qworker_id","job_id"])
6583

6684
def job_failed(node_id, experiment_id, service_name, qworker_id, job_id, fail_time):
85+
# temp code [au]
86+
logger.info("In monitoring job_failed")
87+
# --------------------------------
6788
elapsed_time = time.time() - fail_time
6889
job_failed_timestamp.labels(node_id,experiment_id,service_name,job_id).set(time.time())
6990
job_running_timestamp.labels(node_id,experiment_id,service_name,job_id).set(time.time())
@@ -84,6 +105,9 @@ def job_failed(node_id, experiment_id, service_name, qworker_id, job_id, fail_ti
84105
task_started = Gauge(JQUEUER_TASK_STARTED,JQUEUER_TASK_STARTED,["node_id","experiment_id","service_name","qworker_id","job_id","task_id"])
85106

86107
def run_task(node_id, experiment_id, service_name, qworker_id, job_id, task_id):
108+
# temp code [au]
109+
logger.info("In monitoring run_task")
110+
# --------------------------------
87111
task_started_timestamp.labels(node_id,experiment_id,service_name,job_id,task_id).set(time.time())
88112
task_running_timestamp.labels(node_id,experiment_id,service_name,job_id,task_id).set(time.time())
89113
task_running.labels(node_id,experiment_id,service_name,qworker_id,job_id,task_id).set(1)
@@ -101,6 +125,9 @@ def run_task(node_id, experiment_id, service_name, qworker_id, job_id, task_id):
101125
def terminate_task(
102126
node_id, experiment_id, service_name, qworker_id, job_id, task_id, start_time
103127
):
128+
# temp code [au]
129+
logger.info("In monitoring terminate_task")
130+
# --------------------------------
104131
elapsed_time = time.time() - start_time
105132
task_accomplished_timestamp.labels(node_id,experiment_id,service_name,job_id,task_id).set(time.time())
106133
# In the previous case, this didn't include task_id.
@@ -121,6 +148,9 @@ def terminate_task(
121148
def task_failed(
122149
node_id, experiment_id, service_name, qworker_id, job_id, task_id, fail_time
123150
):
151+
# temp code [au]
152+
logger.info("In monitoring task_failed")
153+
# --------------------------------
124154
elapsed_time = time.time() - fail_time
125155
task_failed_timestamp.labels(node_id,experiment_id,service_name,job_id,task_id).set(time.time())
126156
# In the previous case, this didn't include task_id.

0 commit comments

Comments
 (0)