Skip to content

Commit c6f15c5

Browse files
authored
Cell and can logger health checks (#103)
* Addition of cell_logger health check * Removal of old .env file * Transition to unless-stopped instead of restart always in docker-compose, health check for can-logger * Bus activity check
2 parents 77d08b1 + 1b90629 commit c6f15c5

File tree

13 files changed

+198
-22
lines changed

13 files changed

+198
-22
lines changed

services/.env

Lines changed: 0 additions & 6 deletions
This file was deleted.

services/can_logger/Dockerfile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ COPY --from=builder /opt/venv /opt/venv
3434
# Make sure we use the virtualenv
3535
ENV PATH="/opt/venv/bin:$PATH"
3636

37+
# Copy health check script
38+
COPY healthcheck.py .
39+
HEALTHCHECK CMD ["python", "./healthcheck.py"]
40+
3741
# Copy script over and run
3842
COPY can_logger.py .
3943
CMD [ "./can_logger.py" ]

services/can_logger/can_logger.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ def log_can(can_interface):
149149
print('Successfully connected to socketcand at',
150150
f'{host_ip}: {host_port}')
151151
socket_connected = True
152+
sys.stdout.flush()
152153

153154
# Receive socketcand's response. After each command, socketcand replies
154155
# < ok > if the command was successful. Each reply must be received before
@@ -186,6 +187,8 @@ def log_can(can_interface):
186187
# and CAN frame.
187188

188189
while(True):
190+
sys.stdout.flush()
191+
189192

190193
# Buffer to store raw bytes received from the socket.
191194
socket_buff = s.recv(54)
@@ -309,5 +312,5 @@ def log_can(can_interface):
309312
db_started = True
310313

311314
for can_bus in can_interfaces:
312-
315+
print('Creating process for', can_bus)
313316
mp.Process(target=log_can, args=(can_bus,)).start()

services/can_logger/docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ services:
33
can-logger:
44
build:
55
context: ../can_logger
6-
restart: always
6+
restart: unless-stopped
77
environment:
88
db_user: "avena"
99
db_host: "postgres"

services/can_logger/healthcheck.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
#!/usr/bin/python3
2+
import postgres
3+
import sys
4+
import os
5+
from datetime import datetime
6+
from time import sleep
7+
8+
# Take name of service, timestamp of last update, and threshold
9+
# Compares the current time to the passed timestamp, and exits or outputs the appropriate message
10+
# Not the most reusable function, but avoids copy and pasting the same code for both the csv and db checks
11+
# If everything is okay, the function returns void, and if something is wrong it exits the program. Inconsistent
12+
# but leads to concise code... Should consider changing in future
13+
def checktimestamp(item, timestamp, threshold):
    """Report how stale ``timestamp`` is and exit if it is older than ``threshold``.

    Args:
        item: Human-readable name of the thing being checked; used only in
            the printed status message.
        timestamp: ``datetime`` of the last update. May be naive or
            timezone-aware; the current time is taken with the same tzinfo
            so the subtraction is always valid.
        threshold: Maximum allowed age in seconds.

    Returns:
        None when the age is within ``threshold``.

    Raises:
        SystemExit: With exit status 1 when the age exceeds ``threshold``.
    """
    # Use the tzinfo of the timestamp that was passed in rather than relying
    # on a module-level ``lastupdate`` global being set by the caller.
    delta = (datetime.now(timestamp.tzinfo) - timestamp).total_seconds()
    if delta > threshold:
        print(item, 'was last updated', delta, 'seconds ago, something is wrong (threshold: ' + str(threshold) + 's)')
        sys.exit(1)

    print(item, 'was last updated', delta, 'seconds ago, everything is normal (threshold: ' + str(threshold) + 's)')
21+
22+
23+
# If the time between the last update is larger than this threshold in seconds, change the status to unhealthy
24+
threshold = 5
25+
26+
# Get environmental variable determining logging to csv or db or both
27+
default_value = 'CSV'
28+
log_env = os.getenv('CELL_LOG', default_value)
29+
host_interfaces = os.environ['can_interface'].split(',')
30+
31+
for interface in host_interfaces:
32+
33+
# Ensure that data is actually being received on the bus
34+
# Currently there are two hacky ways to do this. Either we can
35+
# check the amount of rx_bytes, write them to a file, and then
36+
# check for changes next time we run this script, or we can read
37+
# the file, sleep for a certain period of time, and then check again
38+
# Currently we are trying the latter as it is a bit simpler although
39+
# slower to respond due to the sleep
40+
rx_start = -1
41+
rx_end = -1
42+
with open('/mnt/host/sys/class/net/' + interface + '/statistics/rx_bytes', 'r') as rx:
43+
rx_start = int(rx.read().strip())
44+
sleep(1)
45+
rx.seek(0,0)
46+
rx_end = int(rx.read().strip())
47+
48+
print(interface, 'rx_end:', rx_end, 'rx_start:', rx_start)
49+
if (rx_end - rx_start) <= 0:
50+
print('No can messages sent on the bus, no expected action by can logger')
51+
continue
52+
'''
53+
rxbfile = interface + '_rx_bytes.tmp'
54+
if os.path.exists(rxbfile):
55+
with open(rxbfile, 'rw') as rxb_write, open('/mnt/host/sys/class/net/' + interface + '/statistics/rx_bytes', 'r') as rxb_read:
56+
old_read = int(rxb_write.read().strip())
57+
new_read = int(rxb_read.read().strip())
58+
old_read.write(str(new_read))
59+
if (new_read - old_read) <= 0:
60+
print('No can messages sent on', interface, 'no expected action by logger')
61+
continue
62+
63+
64+
else:
65+
with open(rxbfile, 'w') as rxb_write, open('/mnt/host/sys/class/net/' + interface + '/statistics/rx_bytes', 'r') as rxb_read:
66+
rxb_write(rxb_read().strip())
67+
print('Created temp file for next run with interface', interface)
68+
continue
69+
'''
70+
# If the container is logging to csv
71+
if 'CSV' in log_env.upper():
72+
logpath = '/data/log/' + interface + '.csv'
73+
# Ensure the file exists in the first place
74+
if not os.path.exists(logpath):
75+
print('Log file did not exist or was not able to be opened. Path: ',logpath)
76+
sys.exit(1)
77+
78+
# Get the last time the file was modified
79+
lastupdate = datetime.fromtimestamp(os.path.getmtime(logpath))
80+
checktimestamp(logpath, lastupdate, threshold)
81+
82+
# If the container is logging to database
83+
if 'DB' in log_env.upper():
84+
# Connect to the database
85+
connectionurl='postgresql://' + os.environ['db_user'] + ':' + os.environ['db_password'] + '@postgres:' + os.environ['db_port'] + '/' + os.environ['db_database']
86+
db = postgres.Postgres(url=connectionurl)
87+
88+
# Query the most recent timestamp
89+
# WHERE must come before ORDER BY, and the driver expects a %s placeholder
# with a parameter sequence (not '?' with a bare string).
rst = db.one("SELECT * FROM can WHERE can_interface = %s ORDER BY time DESC LIMIT 1;", (interface,))
90+
91+
# Handle if the database is empty
92+
if rst == None:
93+
print("Database has the wrong schema or no data");
94+
sys.exit(1)
95+
96+
# First column is the timestamp, in datetime format
97+
lastupdate = rst[0]
98+
checktimestamp(interface + ' CAN signal database table', lastupdate, threshold)
99+

services/can_watchdog/docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ services:
33
can_watchdog:
44
build:
55
context: ../can_watchdog
6-
restart: always
6+
restart: unless-stopped
77
environment:
88
inactivity_threshold: 5
99
volumes:

services/cell_logger/Dockerfile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@ COPY --from=builder /opt/venv /opt/venv
3535
# Make sure we use the virtualenv
3636
ENV PATH="/opt/venv/bin:$PATH"
3737

38+
# Copy health check script
39+
COPY healthcheck.py .
40+
HEALTHCHECK CMD ["python", "./healthcheck.py"]
41+
3842
# Copy script over and run
3943
COPY cell_logger.py .
4044
CMD [ "./cell_logger.py" ]

services/cell_logger/docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ services:
33
cell-logger:
44
build:
55
context: ../cell_logger
6-
restart: always
6+
restart: unless-stopped
77
environment:
88
db_user: "avena"
99
db_host: "postgres"

services/cell_logger/healthcheck.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
#!/usr/bin/python3
2+
import postgres
3+
import sys
4+
import os
5+
from datetime import datetime
6+
7+
# Take name of service, timestamp of last update, and threshold
8+
# Compares the current time to the passed timestamp, and exits or outputs the appropriate message
9+
# Not the most reusable function, but avoids copy and pasting the same code for both the csv and db checks
10+
# If everything is okay, the function returns void, and if something is wrong it exits the program. Inconsistent
11+
# but leads to concise code... Should consider changing in future
12+
def checktimestamp(item, timestamp, threshold):
    """Report how stale ``timestamp`` is and exit if it is older than ``threshold``.

    Args:
        item: Human-readable name of the thing being checked; used only in
            the printed status message.
        timestamp: ``datetime`` of the last update. May be naive or
            timezone-aware; the current time is taken with the same tzinfo
            so the subtraction is always valid.
        threshold: Maximum allowed age in seconds.

    Returns:
        None when the age is within ``threshold``.

    Raises:
        SystemExit: With exit status 1 when the age exceeds ``threshold``.
    """
    # Use the tzinfo of the timestamp that was passed in rather than relying
    # on a module-level ``lastupdate`` global being set by the caller.
    delta = (datetime.now(timestamp.tzinfo) - timestamp).total_seconds()
    if delta > threshold:
        print(item, 'was last updated', delta, 'seconds ago, something is wrong (threshold: ' + str(threshold) + 's)')
        sys.exit(1)

    print(item, 'was last updated', delta, 'seconds ago, everything is normal (threshold: ' + str(threshold) + 's)')
20+
21+
22+
# If the time between the last update is larger than this threshold in seconds, change the status to unhealthy
23+
threshold = 5
24+
25+
# Get environmental variable determining logging to csv or db or both
26+
default_value = 'CSV'
27+
log_env = os.getenv('CELL_LOG', default_value)
28+
29+
# If the container is logging to csv
30+
if 'CSV' in log_env.upper():
31+
logpath = '/data/log/cell.csv'
32+
# Ensure the file exists in the first place
33+
if not os.path.exists(logpath):
34+
print('Log file did not exist or was not able to be opened')
35+
sys.exit(1)
36+
37+
# Get the last time the file was modified
38+
lastupdate = datetime.fromtimestamp(os.path.getmtime(logpath))
39+
checktimestamp(logpath, lastupdate, threshold)
40+
41+
# If the container is logging to database
42+
if 'DB' in log_env.upper():
43+
# Connect to the database
44+
connectionurl='postgresql://' + os.environ['db_user'] + ':' + os.environ['db_password'] + '@postgres:' + os.environ['db_port'] + '/' + os.environ['db_database']
45+
db = postgres.Postgres(url=connectionurl)
46+
47+
# Query the most recent timestamp
48+
rst = db.one("SELECT * FROM cell ORDER BY time DESC LIMIT 1;")
49+
50+
# Handle if the database is empty
51+
if rst == None:
52+
print("Database has the wrong schema or no data");
53+
sys.exit(1)
54+
55+
# First column is the timestamp, in datetime format
56+
lastupdate = rst[0]
57+
checktimestamp('Cell signal database table', lastupdate, threshold)
58+

services/gps2tsdb/docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ services:
55
build:
66
# Cannot use `./` due to this issue https://github.com/docker/compose/issues/3530#issuecomment-222490501
77
context: ../gps2tsdb
8-
restart: always
8+
restart: unless-stopped
99
ports:
1010
- 10001:10001
1111
environment:

services/gps2tsdb/healthcheck.py

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,38 @@
22
import postgres
33
import sys
44
import os
5-
import datetime
5+
from datetime import datetime
66

7+
# Take name of service, timestamp of last update, and threshold
8+
# Compares the current time to the passed timestamp, and exits or outputs the appropriate message
9+
# Not the most reusable function, but avoids copy and pasting the same code for both the csv and db checks
10+
# If everything is okay, the function returns void, and if something is wrong it exits the program. Inconsistent
11+
# but leads to concise code... Should consider changing in future
12+
def checktimestamp(item, timestamp, threshold):
    """Report how stale ``timestamp`` is and exit if it is older than ``threshold``.

    Args:
        item: Human-readable name of the thing being checked; used only in
            the printed status message.
        timestamp: ``datetime`` of the last update. May be naive or
            timezone-aware; the current time is taken with the same tzinfo
            so the subtraction is always valid.
        threshold: Maximum allowed age in seconds.

    Returns:
        None when the age is within ``threshold``.

    Raises:
        SystemExit: With exit status 1 when the age exceeds ``threshold``.
    """
    # Use the tzinfo of the timestamp that was passed in rather than relying
    # on a module-level ``lastupdate`` global being set by the caller.
    delta = (datetime.now(timestamp.tzinfo) - timestamp).total_seconds()
    if delta > threshold:
        print(item, 'was last updated', delta, 'seconds ago, something is wrong (threshold: ' + str(threshold) + 's)')
        sys.exit(1)

    print(item, 'was last updated', delta, 'seconds ago, everything is normal (threshold: ' + str(threshold) + 's)')
20+
21+
22+
# If the time between the last update is larger than this threshold in seconds, change the status to unhealthy
723
threshold = 5
824

25+
# Connect to the database
926
connectionurl='postgresql://' + os.environ['db_user'] + ':' + os.environ['db_password'] + '@postgres:' + os.environ['db_port'] + '/' + os.environ['db_database']
1027
db = postgres.Postgres(url=connectionurl)
1128

29+
# Query the most recent timestamp
1230
rst = db.one("SELECT * FROM gps ORDER BY time DESC LIMIT 1;")
1331

32+
# Handle if the database is empty
1433
if rst == None:
15-
print("Database has the wrong schema or no data");
34+
print('GPS database table has the wrong schema or no data');
1635
sys.exit(1)
1736

37+
# First column is the timestamp, in datetime format
1838
lastupdate = rst[0]
19-
tdelta = datetime.datetime.now(lastupdate.tzinfo) - lastupdate
20-
sdelta = tdelta.total_seconds()
21-
if sdelta > threshold:
22-
print('Database was last updated', sdelta, 'seconds ago, something is wrong ( threshold:', threshold, 's )');
23-
sys.exit(1)
24-
25-
print('Database was last updated', sdelta, 'seconds ago, everything is normal ( threshold:', threshold, 's )');
39+
checktimestamp('GPS database table', lastupdate, threshold)

services/oada_upload/docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ services:
33
oada-upload:
44
build:
55
context: ../oada_upload
6-
restart: always
6+
restart: unless-stopped
77
environment:
88
isoblue_id: "ib999"
99
oada_server_batchsize: "1000"

services/postgres/docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ version: '3.1'
22
services:
33
postgres:
44
image: timescale/timescaledb:latest-pg12
5-
restart: always
5+
restart: unless-stopped
66
environment:
77
POSTGRES_PASSWORD: "password"
88
POSTGRES_USER: "avena"

0 commit comments

Comments
 (0)