diff --git a/bin/mt_station_conf b/bin/mt_station_conf index 0c8bc8b..71a3322 100755 --- a/bin/mt_station_conf +++ b/bin/mt_station_conf @@ -109,6 +109,10 @@ def parse_args(): group.add_argument("--log-file", dest="log_file", help="Log file name") + group.add_argument("--tc-timeout", + dest="tc_timeout", + type=int, + help="Set tc_timeout value in seconds") options = parser.parse_args() if options.debug: @@ -164,6 +168,13 @@ def main(): if options.gps: data['station_conf']['gps'] = options.gps data['station_conf']['pps'] = "fuzzy" if options.fuzzy_pps else "gps" + if options.tc_timeout: + for key in data['station_conf'].keys(): + if key.lower() == "tc_timeout": + data['station_conf'][key] = "%ds" % options.tc_timeout + break + else: + data['station_conf']["tc_timeout"] = "%ds" % options.tc_timeout print(json.dumps(data, indent=4)) diff --git a/roles/conduit/README.md b/roles/conduit/README.md index 9b4cbdb..bf2caf7 100644 --- a/roles/conduit/README.md +++ b/roles/conduit/README.md @@ -114,6 +114,26 @@ The following tags can be used to run a subset of the playbook.
Sets up an ssh tunnel back to a control host
+LED Status +---------- + +On Conduits (not Conduit APs), the LED indications are: + +| LED | Meaning | +|--------|------------------------------------------------------------------------------------------------------| +| Status | Blinks twice a second when the OS is running | +| LS | Blinks every couple seconds when Cellular modem is connected | +| A | We are trying to use PPP | +| B | An SSH tunnel connection to the jump host is in Established state | +| C | A packet forwarder is running and has the LoRa device open (does not work on I2C cards running mp) N | +| D | DNS resolution of google.com works | + +The A-D LEDs are the ones on the right and have different labels on +older Conduits (CD and one to 3 bars). + +The A-D LEDs reverse periodically to indicate that the check program +is running. + License ------- @@ -123,4 +143,3 @@ Author Information ------------------ Jeffrey Honig - diff --git a/roles/conduit/defaults/main.yml b/roles/conduit/defaults/main.yml index 141383c..a27b589 100644 --- a/roles/conduit/defaults/main.yml +++ b/roles/conduit/defaults/main.yml @@ -90,6 +90,8 @@ ssh_tunnel_ssh_key: /etc/ssh/ssh_host_rsa_key ssh_tunnel_daemon: /usr/bin/autossh ssh_tunnel_ssh_port: 22 ssh_tunnel_base_port: 0 +ssh_tunnel_first_poll: 120 +ssh_tunnel_poll: 60 # Static Nameserver defaults resolv_conf_static: /var/config/network/resolv.conf-static @@ -176,11 +178,16 @@ monit_pktfwd_stop: '"/etc/init.d/{{ monit_pktfwd_initscript }} stop"' monit_pktfwd_restart: '"/etc/init.d/{{ monit_pktfwd_initscript }} restart"' monit_pktfwd_reboot: 'exec "/sbin/reboot"' +# Monit loadavg +monit_loadavg_1m: 2 +monit_loadavg_5m: 4 +monit_memory_usage: 30% + # Monitor space on / monit_root_test: "usage > 50%" # Monitor space on /var/config -monit_config_test: "usage > 15%" +monit_config_test: "usage > 20%" # Monitor space on /var/volatile monit_volatile_test: "usage > 75%" diff --git a/roles/conduit/files/conduit_leds.initd b/roles/conduit/files/conduit_leds.initd index 
e790271..6c1379a 100644 --- a/roles/conduit/files/conduit_leds.initd +++ b/roles/conduit/files/conduit_leds.initd @@ -27,7 +27,7 @@ fi [ -x ${DAEMON} ] || exit 0 is_running() { - pgrep -F ${PIDFILE} > /dev/null + pgrep -F ${PIDFILE} > /dev/null 2>&1 } start() { @@ -40,22 +40,56 @@ stop() { case "$1" in start) - is_running || start + if is_running; then + echo "${NAME} is already running: $(pgrep -F ${PIDFILE} -a)" >&2 + exit 1 + fi + echo "Starting ${NAME}" >&2 + start ;; stop) - stop + if is_running; then + echo "Stopping ${NAME}" >&2 + stop + else + echo "${NAME} is not running" >&2 + fi ;; restart|reload) - nohup ${0} do_restart - ;; - do_restart) - stop + if is_running; then + echo "Stopping ${NAME}" >&2 + stop + else + echo "${NAME} is not running" >&2 + fi + + # Wait for it to stop + tries=10 + while is_running; do + if [ "${tries}" -eq 0 ]; then + echo "${NAME} failed to stop" >&2 + exit 1 + fi + tries=$((tries - 1)) + sleep 1 + done + + echo "Starting ${NAME}" >&2 start ;; status) - is_running + if is_running; then + echo "${NAME} is running with PID $(pgrep -F ${PIDFILE} -a)" >&2 + else + echo "${NAME} is not running" >&2 + exit 1 + fi ;; *) - echo "Usage: $0 {start|stop|status|restart}" + echo "Usage: $0 {start|stop|status|restart}" >&2 + exit 2 + ;; esac + +exit 0 #========================================= diff --git a/roles/conduit/files/conduit_leds.py b/roles/conduit/files/conduit_leds.py index 3c91598..09e84e6 100755 --- a/roles/conduit/files/conduit_leds.py +++ b/roles/conduit/files/conduit_leds.py @@ -31,20 +31,24 @@ from contextlib import contextmanager import errno import fcntl -import ipaddress import logging from logging.handlers import SysLogHandler import os -import pprint import psutil import re import socket import stat -import struct import subprocess import sys import time +cached_ip = None + +try: + FileNotFoundError +except NameError: + FileNotFoundError = IOError + class LockFileTimeout(Exception): def __init__(self, error): self.value 
= error @@ -55,26 +59,30 @@ def __str__(self): def pidfilelock(name): """ Context to lock a pid file """ - time_left = 30 + time_end = time.time() + 30 pidfile_path = os.path.join("/var/run", name + ".pid") - lock_file = open(pidfile_path, 'w+') + fd = os.open(pidfile_path, os.O_RDWR | os.O_CREAT, 0o644) + lock_file = os.fdopen(fd, "r+") while True: try: logging.debug("Attempting to lock %s", pidfile_path) fcntl.flock(lock_file, fcntl.LOCK_EX | fcntl.LOCK_NB) - lock_file.write(str(os.getpid()) + '\n') - lock_file.flush() - logging.debug("Wrote %d to %s", os.getpid(), pidfile_path) - break except IOError as err: if err.errno != errno.EAGAIN: raise err - else: - logging.debug("Timeout trying to lock", pidfile_path) - time.sleep(1) - time_left -= 1 - if time_left == 0: - raise LockFileTimeout("Unable to lock %s" % pidfile_path) + logging.debug("Timeout trying to lock: %s", pidfile_path) + time.sleep(1) + if time.time() >= time_end: + raise LockFileTimeout("Unable to lock %s" % pidfile_path) + continue + else: + lock_file.seek(0) + lock_file.truncate() + lock_file.write("%d\n" % os.getpid()) + lock_file.flush() + os.fsync(fd) + logging.debug("Wrote %d to %s", os.getpid(), pidfile_path) + break try: yield lock_file @@ -83,7 +91,7 @@ def pidfilelock(name): fcntl.flock(lock_file, fcntl.LOCK_UN) os.unlink(pidfile_path) lock_file.close() - + class Defaults(object): """ Read a /etc/defaults file """ @@ -145,7 +153,7 @@ def write(self, name, value): with open(os.path.join(self.ROOT, name), "w") as fp: fp.write("%s\n" % value) - + class LEDs(object): """ Control LEDs """ @@ -211,7 +219,7 @@ def daemonize(): # exit first parent sys.exit(0) except OSError as err: - logging.exception("First fork failed") + logging.exception("First fork failed: %s", err) return False # decouple from parent environment @@ -225,7 +233,7 @@ def daemonize(): # exit from second parent sys.exit(0) except OSError as err: - logging.exception("Second fork failed") + logging.exception("Second fork failed: 
%s", err) return False # redirect standard file descriptors @@ -265,9 +273,6 @@ def parse_args(): help="Don't send notifications, just list what we are going to do") group = parser.add_argument_group("Options") - group.add_argument("--pidfile", - dest="pidfile", default="/var/run/conduit_leds.pid", - help="Location of the PID file") group.add_argument("--interval", default=60.0, type=float, help="Seconds to wait between checks") @@ -278,9 +283,9 @@ def parse_args(): dest="foreground", default=False, action='store_true', help="Do not fork; run in foreground") - group.add_argument("--modem", - dest="modem", default="/dev/modem_at0", - help="Modem device for Cell service") + group.add_argument("--want-ppp-file", + default="/var/run/using_ppp", + help="File to exist if we want to be using PPP") # Parse args options = parser.parse_args() @@ -288,9 +293,6 @@ def parse_args(): if options.noop: options.debug = True - # Init Logging - init_logging(options) - return options def check_tunnel(options): @@ -319,17 +321,18 @@ def check_tunnel(options): if not cached_ip: logging.info("check_tunnel: Unable to resolve %s", remote_host) return False - remote_host = cached_ip - logging.info("check_tunnel: Using cached IP %s", remote_host) - - for conn in psutil.net_connections(): - if conn.type == socket.SOCK_STREAM and conn.status == psutil.CONN_ESTABLISHED and conn.raddr == (remote_ip, local_port): - logging.info("check_tunnel: Found connection to %s(%s):%s with PID %d", - remote_host, - remote_ip, - local_port, - conn.pid) - return True + remote_ip = cached_ip + logging.info("check_tunnel: Using cached IP %s", remote_ip) + + if remote_ip: + for conn in psutil.net_connections(): + if conn.type == socket.SOCK_STREAM and conn.status == psutil.CONN_ESTABLISHED and conn.raddr == (remote_ip, local_port): + logging.info("check_tunnel: Found connection to %s(%s):%s with PID %d", + remote_host, + remote_ip, + local_port, + conn.pid) + return True logging.info("check_tunnel: No connection 
found to %s(%s):%s", remote_host, remote_ip, local_port) return False @@ -362,48 +365,16 @@ def check_lora(options, device_path): return True -# PPPd assigns one of the following addresses until we receive one (add ppp interface index) -HISADDR_STATIC = ipaddress.ip_address(u"10.64.64.64") -HISADDR_DYNAMIC = ipaddress.ip_address(u"10.112.112.112") -PPP_RE = re.compile(r'ppp(?P\d+)$') - -def check_ppp(options): - """ Check status of ppp connection """ +def check_ppp(options, mtsio): + """ Check if monitor_modem wants PPP to be running """ try: - modem_stat = os.stat(options.modem) - if not stat.S_ISCHR(modem_stat.st_mode): - logging.debug("check_ppp: %s not a character device", options.modem) - return False - except OSError as error: - logging.debug("check_ppp: %s: %s", options.modem, error) + return stat.S_ISREG(os.stat(options.want_ppp_file).st_mode) + except OSError: + # Not using PPP return False - peer_addr = None - for ifname, ifaddrs in psutil.net_if_addrs().items(): - match = PPP_RE.match(ifname) - if not match: - continue - ppp_ifnum = int(match.group('index')) - for ifaddr in ifaddrs: - if ifaddr.family != socket.AF_INET: - continue - if ifaddr.ptp is None: - continue - if ifaddr.ptp in [str(HISADDR_STATIC + ppp_ifnum), str(HISADDR_DYNAMIC + ppp_ifnum)]: - # Remote has not given us an address yet - logging.debug("check_ppp: Remote has not provided an address for %s: %s", ifname, ifaddr.ptp) - continue - peer_addr = ifaddr.ptp - break - - if not peer_addr: - logging.debug("check_ppp: No valid peer address found") - return False - - return True - -def process(options, leds, device_path): +def process(options, mtsio, leds, device_path): """ Check all the services """ if check_dns(options): @@ -421,7 +392,7 @@ def process(options, leds, device_path): else: leds.clear(LEDs.LED_B) - if check_ppp(options): + if check_ppp(options, mtsio): leds.set(LEDs.LED_A) else: leds.clear(LEDs.LED_A) @@ -430,14 +401,23 @@ def init_logging(options): """ Set up logging """ 
logger = logging.getLogger() - logger.handlers = [] syslog_format = '%s[%%(process)s]: %%(message)s' % (os.path.basename(sys.argv[0])) - syslog_handler = SysLogHandler(address="/dev/log", - facility=SysLogHandler.LOG_DAEMON) - syslog_handler.setFormatter(logging.Formatter(syslog_format)) if not sys.stdout.isatty(): + # Repeat until syslog is available + while True: + try: + syslog_handler = SysLogHandler(address="/dev/log", + facility=SysLogHandler.LOG_DAEMON) + except FileNotFoundError as err: + logging.warning("Unable to open /dev/log: %s, waiting", err) + time.sleep(1) + else: + break + syslog_handler.setFormatter(logging.Formatter(syslog_format)) + logger.handlers = [] logger.addHandler(syslog_handler) else: + logger.handlers = [] logger.addHandler(logging.StreamHandler(stream=sys.stdout)) if options.debug: @@ -450,7 +430,7 @@ def init_logging(options): def main(): """It all happens here""" - progname = os.path.basename(sys.argv[0]) + progname = os.path.splitext(os.path.basename(sys.argv[0]))[0] options = parse_args() @@ -458,6 +438,9 @@ def main(): if not daemonize(): return 1 + # Do this after daemonize or we'll hang the system startup. + init_logging(options) + mtsio = MTSIO() hwversion = mtsio.read('hw-version') @@ -482,26 +465,33 @@ def main(): logging.warning("No device found for %s", lora_hwversion) try: - with pidfilelock(progname) as pid_file: + with pidfilelock(progname): leds = LEDs(mtsio) # XXX - Spread the tests out over 1/4 of the interval? # XXX - Ping the remote side of the PPP connection? 
Requires exec - next_time = time.time() - while True: - if time.time() > next_time: - while time.time() > next_time: - next_time += options.interval - logging.debug("Checking status") - process(options, leds, device_path) - else: - logging.debug("Flashing LEDs") - leds.flashall() - duration = min(5.0, next_time - time.time()) - if duration > 0: - logging.debug("Sleeping for %f seconds", duration) - time.sleep(duration) + try: + next_time = time.time() + while True: + if time.time() > next_time: + while time.time() > next_time: + next_time += options.interval + logging.debug("Checking status") + process(options, mtsio, leds, device_path) + else: + logging.debug("Flashing LEDs") + leds.flashall() + duration = min(5.0, next_time - time.time()) + if duration > 0: + logging.debug("Sleeping for %f seconds", duration) + time.sleep(duration) + except KeyboardInterrupt: + print("") + LEDs(MTSIO()) + except Exception as exc: + logging.exception(exc) + LEDs(MTSIO()) except LockFileTimeout: logging.critical("Another instance of %s is running", progname) return 1 @@ -513,9 +503,5 @@ def main(): rc = main() except KeyboardInterrupt: print("") - LEDs(MTSIO()) - except Exception as exc: - logging.exception(exc) - LEDs(MTSIO()) sys.exit(rc) diff --git a/roles/conduit/files/facts.d/firmware.fact b/roles/conduit/files/facts.d/firmware.fact index 15779b5..d94618b 100755 --- a/roles/conduit/files/facts.d/firmware.fact +++ b/roles/conduit/files/facts.d/firmware.fact @@ -2,12 +2,11 @@ import json import os -import sys if os.path.isdir("/lib/firmware"): all_files = [] for realroot, dirs, files in os.walk("/lib/firmware", topdown=True): - if realroot is "/lib/firmware": + if realroot == "/lib/firmware": root = "" else: root = realroot.replace("/lib/firmware/", "") @@ -16,7 +15,3 @@ if os.path.isdir("/lib/firmware"): if len(all_files): print(json.dumps(all_files, indent=4)) - - - - diff --git a/roles/conduit/files/facts.d/mts_io.fact b/roles/conduit/files/facts.d/mts_io.fact new file mode 
100755 index 0000000..ecfee2e --- /dev/null +++ b/roles/conduit/files/facts.d/mts_io.fact @@ -0,0 +1,15 @@ +#!/bin/bash + +cd /sys/devices/platform/mts-io || exit 1 + +devs=$(find * -type f | ( + comma= + while read dev; do + echo -n "${comma}\"${dev//-/_}\": \"$(cat ${dev} 2>/dev/null)\"" + comma=", " + done +)) + +echo '{' +echo " ${devs}" +echo '}' diff --git a/roles/conduit/files/ifup_restart b/roles/conduit/files/ifup_restart index c6e2031..bc2c9b1 100644 --- a/roles/conduit/files/ifup_restart +++ b/roles/conduit/files/ifup_restart @@ -2,9 +2,9 @@ # Restart services when we get an address -[ $METHOD = "dhcp" ] && exit 0 +[ "${METHOD}" = "dhcp" ] && exit 0 -logger -s -t $(basename ${0}) -p daemon.info "$METHOD interface $IFACE is up, restarting services" +logger -s -t $(basename ${0}) -p daemon.info "${METHOD} interface ${IFACE} is up, restarting services" test -x /etc/init.d/ttn-pkt-forwarder && /etc/init.d/ttn-pkt-forwarder restart test -x /etc/init.d/lora-basic-station && /etc/init.d/lora-basic-station restart test -x /etc/init.d/ssh_tunnel && /etc/init.d/ssh_tunnel restart diff --git a/roles/conduit/files/monitor_modem.initd b/roles/conduit/files/monitor_modem.initd new file mode 100644 index 0000000..2d60ff9 --- /dev/null +++ b/roles/conduit/files/monitor_modem.initd @@ -0,0 +1,95 @@ +#!/bin/sh +# +#Start monitor_modem as a service + +### BEGIN INIT INFO +# Provides: monitor_modem +# Required-Start: $local_fs $network $syslog $dbus +# Required-Stop: $local_fs $network $syslog $dbus +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Description: Display connection status in Conduit LEDs +### END INIT INFO + +PATH="/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin" +NAME=monitor_modem +DAEMON=/usr/local/sbin/monitor_modem +PIDFILE=/var/run/${NAME}.pid +DAEMON_ARGS= + +# source function library +. /etc/init.d/functions + +if [ -r /etc/default/${NAME} ]; then + . 
/etc/default/${NAME} +fi + +[ -x ${DAEMON} ] || exit 0 + +is_running() { + pgrep -F ${PIDFILE} > /dev/null 2>&1 +} + +start() { + start-stop-daemon --start --quiet -p ${PIDFILE} --exec ${DAEMON} -- ${DAEMON_ARGS} +} + +stop() { + start-stop-daemon --stop --quiet -p ${PIDFILE} +} + +case "$1" in + start) + if is_running; then + echo "${NAME} is already running: $(pgrep -F ${PIDFILE} -a)" >&2 + exit 1 + fi + echo "Starting ${NAME}" >&2 + start + ;; + stop) + if is_running; then + echo "Stopping ${NAME}" >&2 + stop + else + echo "${NAME} is not running" >&2 + fi + ;; + restart|reload) + if is_running; then + echo "Stopping ${NAME}" >&2 + stop + else + echo "${NAME} is not running" >&2 + fi + + # Wait for it to stop + tries=10 + while is_running; do + if [ "${tries}" -eq 0 ]; then + echo "${NAME} failed to stop" >&2 + exit 1 + fi + tries=$((tries - 1)) + sleep 1 + done + + echo "Starting ${NAME}" >&2 + start + ;; + status) + if is_running; then + echo "${NAME} is running with PID $(pgrep -F ${PIDFILE} -a)" >&2 + else + echo "${NAME} is not running" >&2 + exit 1 + fi + ;; + *) + echo "Usage: $0 {start|stop|status|restart}" >&2 + exit 2 + ;; +esac + +exit 0 +#========================================= diff --git a/roles/conduit/files/monitor_modem.py b/roles/conduit/files/monitor_modem.py new file mode 100755 index 0000000..a58b625 --- /dev/null +++ b/roles/conduit/files/monitor_modem.py @@ -0,0 +1,863 @@ +#!/usr/bin/env python + +""" +MIT License + +Copyright (c) 2025 Jeffrey C Honig + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission
notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" + +from __future__ import print_function + +import array +import argparse +from contextlib import contextmanager +import errno +import fcntl +import logging +from logging.handlers import SysLogHandler +import os +import psutil +import select +import signal +import socket +import struct +import subprocess +import sys +import time + +if not hasattr(socket, 'SO_BINDTODEVICE'): + socket.SO_BINDTODEVICE = 25 + +try: + FileNotFoundError +except NameError: + FileNotFoundError = IOError + +class LockFileTimeout(Exception): + def __init__(self, error): + self.value = error + def __str__(self): + return repr(self.value) + +class DNSTimeout(Exception): + def __init__(self, error): + self.value = error + def __str__(self): + return repr(self.value) + +# Global flag to indicate shutdown +shutdown_requested = False + +def catch_interrupt(signum, frame): + global shutdown_requested + logging.warning("Received signal %s, initiating shutdown.", signum) + shutdown_requested = True + +@contextmanager +def pidfilelock(name): + """ Context to lock a pid file """ + + time_end = time.time() + 30 + pidfile_path = os.path.join("/var/run", name + ".pid") + fd = os.open(pidfile_path, os.O_RDWR | os.O_CREAT, 0o644) + lock_file = os.fdopen(fd, "r+") + while True: + try: + logging.debug("Attempting to lock %s", pidfile_path) + fcntl.flock(lock_file, fcntl.LOCK_EX | fcntl.LOCK_NB) + except IOError as err: + if err.errno != errno.EAGAIN: + raise err + 
logging.debug("Timeout trying to lock: %s", pidfile_path) + time.sleep(1) + if shutdown_requested or time.time() >= time_end: + raise LockFileTimeout("Unable to lock %s" % pidfile_path) + continue + else: + lock_file.seek(0) + lock_file.truncate() + lock_file.write("%d\n" % os.getpid()) + lock_file.flush() + os.fsync(fd) + logging.debug("Wrote %d to %s", os.getpid(), pidfile_path) + break + + try: + yield lock_file + finally: + logging.debug("Unlocking %s", pidfile_path) + fcntl.flock(lock_file, fcntl.LOCK_UN) + os.unlink(pidfile_path) + lock_file.close() + +def daemonize(): + """ Run as a daemon """ + + try: + pid = os.fork() + if pid > 0: + # exit first parent + sys.exit(0) + except OSError: + logging.exception("First fork failed") + return False + + # decouple from parent environment + os.chdir('/') + os.setsid() + os.umask(0) + # do second fork + try: + pid = os.fork() + if pid > 0: + # exit from second parent + sys.exit(0) + except OSError: + logging.exception("Second fork failed") + return False + + # redirect standard file descriptors + sys.stdout.flush() + sys.stderr.flush() + si = open(os.devnull, 'r') + so = open(os.devnull, 'w') + se = open(os.devnull, 'w') + os.dup2(si.fileno(), sys.stdin.fileno()) + os.dup2(so.fileno(), sys.stdout.fileno()) + os.dup2(se.fileno(), sys.stderr.fileno()) + + return True + +def parse_args(): + """ What do we need to do """ + + parser = argparse.ArgumentParser(description="Monitor the cellular modem and manage the PPP connection") + + # Debugging + group = parser.add_argument_group("Debugging options") + group.add_argument("-d", "--debug", + dest="debug", default=False, + action='store_true', + help="print debugging messages") + group.add_argument("--nodebug", + dest="debug", + action='store_false', + help="do not print debugging messages") + group.add_argument("-v", "--verbose", + dest="verbose", default=False, + action='store_true', + help="print verbose messages") + group.add_argument("-n", "--noop", + dest="noop", default=False, +
action='store_true', + help="Don't send notifications, just list what we are going to do") + + group = parser.add_argument_group("Options") + group.add_argument("--interval", + default=60.0, type=float, + help="Seconds to wait between checks") + group.add_argument("--hostname", + default="ec2-54-221-216-139.compute-1.amazonaws.com", + help="Hostname to check") + group.add_argument("--pings", + type=int, default=10, + help="Number of pings to receive") + group.add_argument("--foreground", "-f", + dest="foreground", default=False, + action='store_true', + help="Do not fork; run in foreground") + group.add_argument("--has-radio", + dest="has_radio", default="/sys/devices/platform/mts-io/has-radio", + help="Device file that indicates presence of modem") + group.add_argument("--real-ppp-on-boot", + default="/var/config/ppp/ppp_on_boot", + help="Where to link /etc/ppp_on_boot to when enabling ppp") + group.add_argument("--ppp-on-boot", + default="/etc/ppp/ppp_on_boot", + help="Where system looks for ppp startup script") + group.add_argument("--change-script", + default="/var/config/ifup_restart", + help="Script to run when status changes") + group.add_argument("--want-ppp-file", + default="/var/run/using_ppp", + help="File to exist if we want to be using PPP") + group.add_argument("--ignore-link-time", + default=60*60*3, + type=int, + help="How often (in seconds) to retry broadcast interfaces if they did not work when we tried them") + + # Parse args + options = parser.parse_args() + + # --test implies --verbose + if options.noop: + options.debug = True + + if options.debug: + options.verbose = True + + return options + +def init_logging(options): + """ Set up logging """ + + logger = logging.getLogger() + syslog_format = '%s[%%(process)s]: %%(message)s' % (os.path.basename(sys.argv[0])) + if not sys.stdout.isatty(): + # Repeat until syslog is available + while True: + try: + syslog_handler = SysLogHandler(address="/dev/log", + facility=SysLogHandler.LOG_DAEMON) + except 
FileNotFoundError as err: + logging.warning("Unable to open /dev/log: %s, waiting", err) + time.sleep(1) + else: + break + syslog_handler.setFormatter(logging.Formatter(syslog_format)) + logger.handlers = [] + logger.addHandler(syslog_handler) + else: + logger.handlers = [] + logger.addHandler(logging.StreamHandler(stream=sys.stdout)) + + if options.debug: + logger.setLevel('DEBUG') + elif options.verbose: + logger.setLevel('INFO') + else: + logger.setLevel('WARNING') + +# Example usage: +# bytes_sent = sendmsg_with_pktinfo(sock, packet, "8.8.8.8", interface="eth0") +# If you prefer to supply interface index directly, you can call _ifname_to_index("eth0") yourself. +if struct.pack("H",1) == b"\x00\x01": # big endian + def checksum(pkt): + if len(pkt) % 2 == 1: + pkt += b"\0" + s = sum(array.array("H", pkt)) + s = (s >> 16) + (s & 0xffff) + s += s >> 16 + s = ~s + return s & 0xffff +else: + def checksum(pkt): + if len(pkt) % 2 == 1: + pkt += b"\0" + s = sum(array.array("H", pkt)) + s = (s >> 16) + (s & 0xffff) + s += s >> 16 + s = ~s + return (((s>>8)&0xff)|s<<8) & 0xffff + +def resolve_with_timeout(hostname, timeout=5): + def handler(signum, frame): + raise DNSTimeout("Timeout during name resolution") + + old_handler = signal.signal(signal.SIGALRM, handler) + signal.alarm(timeout) # seconds + + ip_addr = None + try: + ip_addr = socket.getaddrinfo(hostname, None)[0][4][0] + except DNSTimeout as error: + logging.error("resolve_with_timeout: Timeout resolving: %s: %s", hostname, error) + except socket.gaierror as error: + logging.error("resolve_with_timeout: error resolving %s: %s", hostname, error) + else: + logging.debug("resolve_with_timeout: %s -> %s", hostname, ip_addr) + finally: + signal.alarm(0) + signal.signal(signal.SIGALRM, old_handler) + + return ip_addr + +def icmp_echo(dst_name, interface=None, payload=b'hello', id_=None, seq=1): + + logging.debug("icmp_echo(%s, interface=%s, id=%s, seq=%d)", dst_name, interface, id_, seq) + + if id_ is None: + id_ =
os.getpid() & 0xFFFF + + # raw ICMP socket (IPv4) + sock = socket.socket(socket.AF_INET, socket.SOCK_RAW, socket.IPPROTO_ICMP) + + if interface is not None: + # Bind to specific interface (Linux only). Requires root. + sock.setsockopt(socket.SOL_SOCKET, socket.SO_BINDTODEVICE, interface.encode() + b'\0') + + # Build ICMP echo request header: type(8)=echo request, code=0, checksum, id, seq + ICMP_TYPE = 8 + ICMP_CODE = 0 + header = struct.pack('!BBHHH', ICMP_TYPE, ICMP_CODE, 0, id_, seq) + packet = header + payload + chksum = checksum(packet) + header = struct.pack('!BBHHH', ICMP_TYPE, ICMP_CODE, chksum, id_, seq) + packet = header + payload + + try: + dst_ip = resolve_with_timeout(dst_name, timeout=1) + except DNSTimeout: + return False + else: + if dst_ip is None: + return False + + t0 = time.time() + try: + sock.sendto(packet, (dst_ip, 0)) + except socket.error as error: + logging.error("sendto error: %s", error) + return False + + deadline = t0 + 10.0 + while True: + remaining = deadline - time.time() + if remaining <= 0: + return False + + # Wait for the socket to be ready + try: + ready, _, _ = select.select([sock], [], [], remaining) + if not ready: + logging.debug("icmp_echo: timeout") + return False + except select.error: + if shutdown_requested: + return + continue + + # Read pending packets + while True: + if shutdown_requested: + return + + try: + recv_packet, addr = sock.recvfrom(65535, socket.MSG_DONTWAIT) + except socket.error as err: + if err.errno == errno.EINTR: + continue + if err.errno in (errno.EAGAIN, errno.EWOULDBLOCK): + break + logging.warning("recvfrom returns: %s", err) + return False + + iph_len = (struct.unpack("!B", recv_packet[:1])[0] & 0xf) * 4 + icmp_packet = recv_packet[iph_len:] + if len(icmp_packet) < 8: + continue + r_type, r_code, r_chksum, r_id, r_seq = struct.unpack("!BBHHH", icmp_packet[:8]) + logging.debug("RECV type %d code %d id %d seq %d", r_type, r_code, r_id, r_seq) + + if r_type == 0 and r_id == id_ and r_seq ==
seq: + return True + + return False + +def get_ppp_addresses(): + """ + Returns IP addresses of ppp interfaces + """ + + result = set() + + # Get interface addresses and stats + addrs = psutil.net_if_addrs() + stats = psutil.net_if_stats() + + for iface, iface_addrs in addrs.items(): + if iface != 'ppp0': + continue + iface_stat = stats.get(iface) + if not iface_stat: + continue + + # Skip interfaces that are down + if not iface_stat.isup: + continue + + # Check for IPv4 with broadcast + for addr in iface_addrs: + if addr.family == 2: # AF_INET (IPv4) + if addr.address and addr.ptp: + result.add(addr.address) + + return result + +def get_broadcast_interfaces(): + """ + Returns a list of interface names that: + - Are up (`isup` flag) + - Have an IPv4 address assigned + - Have a broadcast address assigned + - Have carrier detected (physical link up for Ethernet) + """ + result = [] + + # Get interface addresses and stats + addrs = psutil.net_if_addrs() + stats = psutil.net_if_stats() + + for iface, iface_addrs in addrs.items(): + iface_stat = stats.get(iface) + if not iface_stat: + continue + + # Skip interfaces that are down + if not iface_stat.isup: + continue + + # Check for IPv4 with broadcast + iface_address = None + for addr in iface_addrs: + if addr.family == 2: # AF_INET (IPv4) + if addr.address and addr.broadcast: + iface_address = addr.address + break + if not iface_address: + continue + + # Check carrier + carrier_file = "/sys/class/net/{}/carrier".format(iface) + try: + with open(carrier_file, 'r') as fp: + carrier = fp.read().strip() + if carrier != '1': + continue + except IOError: + # If the file doesn't exist, assume link is up (virtual interface) + pass + + # Passed all checks + result.append((iface, iface_address)) + + return result + +def ppp_on_boot(options, enable): + """ Link or unlink system ppp startup script """ + + try: + ppp_on_boot_stat = os.stat(options.ppp_on_boot) + except (OSError, IOError): + logging.error("Unable to get stat info 
about %s", options.ppp_on_boot) + return + + logging.debug("ppp_on_boot(%s): %s -> %o", enable, options.ppp_on_boot, ppp_on_boot_stat.st_mode) + + if enable: + if ppp_on_boot_stat.st_mode & 0o111 != 0o111: + try: + os.chmod(options.ppp_on_boot, 0o755) + logging.warning("ppp_on_boot: %s set to executable", options.ppp_on_boot) + except OSError as error: + logging.error("Error making %s executable: %s", + options.ppp_on_boot, + error) + return + + if ppp_on_boot_stat.st_mode & 0o111 != 0: + try: + os.chmod(options.ppp_on_boot, 0o644) + logging.warning("ppp_on_boot: %s set to non-executable", options.ppp_on_boot) + except OSError as error: + logging.error("Error making %s non-executable: %s", + options.ppp_on_boot, + error) + +def tunnel_addresses(options): + """ Return local addresses of all established tunnel connections """ + + addresses = set() + + try: + dst_ip = resolve_with_timeout(options.hostname, timeout=5) + except DNSTimeout: + return addresses + else: + if dst_ip is None: + return addresses + + for conn in psutil.net_connections('inet4'): + if conn.type != socket.SOCK_STREAM: + continue + if not conn.raddr: + continue + if conn.raddr.port != 22 or conn.raddr.ip != dst_ip: + continue + if conn.status != 'ESTABLISHED': + continue + + addresses.add(conn.laddr.ip) + + return addresses + +def check_modem(options): + """ Run a set of checks """ + + have_modem = False + try: + with open(options.has_radio, "r") as fp: + have_modem = fp.read().strip() == '1' + except IOError as error: + logging.error("Reading %s: %s", options.has_radio, error) + + have_sim = False + if have_modem: + cmd = ["radio-cmd", "AT+CPIN?"] + try: + output = subprocess.check_output(cmd) + logging.debug("check_modem: %s returned: %s", " ".join(cmd), output) + if "+CPIN: READY" in output: + have_sim = True + except subprocess.CalledProcessError as error: + logging.warning("check_modem: %s returned: %s", " ".join(cmd), error) + + logging.debug("have_modem: %s, have_sim: %s", have_modem, 
have_sim) + + return have_modem, have_sim + +def pppd(options, enable): + """ Start or stop pppd and enable or disable its monit monitoring + + Returns True if state was changed. + """ + + logging.debug("pppd(%s)", enable) + + try: + subprocess.check_output(["pidof", "pppd"], stderr=subprocess.STDOUT) + logging.debug("pppd is running") + ppp_is_running = True + except subprocess.CalledProcessError: + logging.debug("pppd is not running") + ppp_is_running = False + + if enable: + # Tell conduit_leds that we want PPP + try: + fd = os.open(options.want_ppp_file, os.O_CREAT|os.O_EXCL|os.O_WRONLY, 0o644) + with os.fdopen(fd, 'w') as fp: + fp.write("1\n") + except OSError as error: + if error.errno != errno.EEXIST: + logging.error("Writing %s: %s", options.want_ppp_file, error) + + if not ppp_is_running: + cmd = ["/etc/init.d/ppp", "start"] + try: + logging.info("Running: %s", " ".join(cmd)) + result = subprocess.check_output(cmd, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as error: + logging.error("%s: %s", " ".join(cmd), error) + return False + else: + logging.debug("%s: %s", " ".join(cmd), result.strip()) + + for service in [ 'ppp0', 'pppd']: + cmd = ["monit", "monitor", service] # run exactly the command that is logged + try: + logging.info("Running: %s", " ".join(cmd)) + result = subprocess.check_output(cmd, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as error: + logging.error("%s: %s", " ".join(cmd), error) + else: + logging.debug("%s: %s", " ".join(cmd), result.strip()) + + return True + + return False + + # We do not want PPP + + # Inform conduit_leds + try: + os.unlink(options.want_ppp_file) + except OSError as error: + if error.errno != errno.ENOENT: + logging.error("Deleting %s: %s", options.want_ppp_file, error) + + if ppp_is_running: + cmd = ["/etc/init.d/ppp", "stop"] + try: + logging.info("Running: %s", " ".join(cmd)) + result = subprocess.check_output(cmd, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as error: + logging.error("%s: %s", " 
".join(cmd), error) + else: + logging.debug("%s: %s", " ".join(cmd), result.strip()) + + for service in [ 'ppp0', 'pppd']: + cmd = ["monit", "unmonitor", service] + try: + logging.info("Running: %s", " ".join(cmd)) + result = subprocess.check_output(cmd, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as error: + logging.error("%s: %s", " ".join(cmd), error) + else: + logging.debug("%s: %s", " ".join(cmd), result.strip()) + + return True + + return False + +class IfState(object): + """ Per-interface monitoring state: address, ignore deadline, ping sequence, response status and missed ping cycles """ + def __init__(self, name, address): + self.name = name + self.address = address + self.ignore_time = time.time() + self.link_state = True + self.seen = False + self.seq = -1 + self.responding = None + self.missed_cycles = 0 # must be an int: process() does "+= 1" before any successful cycle resets it + + def __str__(self): + return "%s: %s ignore: %f state: %s, seen: %s, seq: %d, responding: %s" % ( + self.name, + self.address, + self.ignore_time - time.time(), + self.link_state, + self.seen, + self.seq, + self.responding) + +def get_interface_for_dest(dest_ip): + """ + Return the outgoing interface name that the kernel would use to + reach the given destination IP. Uses 'ip route get'. 
+ """ + + try: + output = subprocess.check_output( + ["ip", "-4", "route", "get", dest_ip], + stderr=subprocess.STDOUT + ).strip() + except subprocess.CalledProcessError: + return None + + # Example outputs: + # "8.8.8.8 via 192.168.1.1 dev eth0 src 192.168.1.10" + # "192.168.1.50 dev eth0 src 192.168.1.10" + # "local 192.168.1.100 dev lo src 192.168.1.100" + + parts = output.split() + + # The interface always follows "dev" + if "dev" in parts: + idx = parts.index("dev") + if idx + 1 < len(parts): + return parts[idx + 1] + + return None + +def process_interface(options, if_state): + + logging.debug("process_interface(%s)", if_state) + + if_state.responding = False + + # Test ping response of default_interface + # Seq is a unsigned 16 bit integer + responses = 0 + for ping in range(options.pings): + if_state.seq = if_state.seq + 1 if if_state.seq < 65535 else 0 + logging.debug("send_icmp (seq %d) via %s", if_state.seq, if_state.name) + if icmp_echo(options.hostname, interface=if_state.name, seq=if_state.seq): + responses += 1 + time.sleep(.1) + if shutdown_requested: + return + + # Call it good if we get 80% of our pings back + if responses >= float(options.pings) * 0.80: + logging.warning("Received %d/%d responses on %s, pppd not needed", + responses, + options.pings, + if_state.name) + if_state.responding = True + if_state.missed_cycles = 0 + + return + +def process(options, progname): + """ runs tests in a loop """ + + if_states = {} + while time.sleep(options.interval) is None: + global shutdown_requested + if shutdown_requested: + return + + logging.debug("check_modem") + + have_modem, have_sim = check_modem(options) + if not have_modem or not have_sim: + logging.warning("No Modem or SIM, stopping pppd") + ppp_on_boot(options, False) + pppd(options, False) + continue + + # If we have a modem and sim, ensure ppp_on_boot is enabled + ppp_on_boot(options, True) + + # Mark interfaces as not seen + for if_name, if_state in if_states.items(): + if_state.seen = False 
+ + ppp_new_state = True + do_restart = False + for if_name, if_address in get_broadcast_interfaces(): + if_state = if_states.setdefault(if_name, IfState(if_name, if_address)) + + logging.debug("looking at %s", if_state) + + # Mark as seen + if_state.seen = True + + # Check if we are supposed to be ignoring this link + if if_state.ignore_time > time.time(): + logging.warning("%s: ignoring until %s", + if_name, + time.strftime("%Y-%m-%d %H:%M:%S", + time.localtime(if_state.ignore_time))) + continue + + # Does the default route point here? + default_if_name = get_interface_for_dest("1.1.1.1") + if default_if_name == "ppp0": + # No, it points at ppp0 while this link is up; tell ppp to stop + ppp_new_state = False + logging.warning("%s: up, ppp is not needed", if_name) + continue + if default_if_name != if_name: + # Not at us, continue + logging.info("%s: up, not default", if_name) + continue + + # It's us, try pinging + was_responding = if_state.responding + process_interface(options, if_state) + if shutdown_requested: + return + + if if_state.responding: + ppp_new_state = False + + # Restart services if it was not responding before (False; None on the first check does not trigger) + if was_responding is False: + logging.info("%s: is now responding", if_name) + do_restart = True + continue + + # Fail more than 4 cycles before we start ignoring the link + if_state.missed_cycles += 1 + if if_state.missed_cycles > 4: + # Not responding, ignore it for a while + if_state.ignore_time = time.time() + options.ignore_link_time + logging.warning("%s: not responding, ignoring until: %s", + if_name, + time.strftime("%Y-%m-%d %H:%M:%S", + time.localtime(if_state.ignore_time))) + + # Mark current link state + active_addresses = set() + active_ifs = set() + for if_name, if_state in if_states.items(): + if_state.link_state = if_state.seen + if if_state.responding: + active_addresses.add(if_state.address) + active_ifs.add(if_name) + + # Ensure pppd is in the correct state + ppp_state_changed = pppd(options, ppp_new_state) + + if do_restart: + logging.warning("Restarting services") + cmd = 
[options.change_script] + env = os.environ.copy() + env["METHOD"] = "monitor_modem" + if active_ifs: + env["IFACE"] = ", ".join(list(active_ifs)) + try: + logging.info("Running %s", " ".join(cmd)) + subprocess.check_call(cmd, env=env) + except subprocess.CalledProcessError as error: + logging.error("%s: %s", " ".join(cmd), error) + else: + if ppp_new_state and not ppp_state_changed: + active_addresses.update(get_ppp_addresses()) + + if active_addresses: + tunnel_addrs = tunnel_addresses(options) + logging.debug("Checking that tunnel sources (%s) is in (%s)", + " ".join(list(tunnel_addrs)), + " ".join(list(active_addresses))) + if not active_addresses.intersection(tunnel_addrs): + logging.warning("No ssh_tunnel sources (%s) from active addresses (%s), restarting", + " ".join(list(tunnel_addrs)), + " ".join(list(active_addresses))) + # No tunnel connections from an active interface + cmd = ["/etc/init.d/ssh_tunnel", "restart"] + try: + logging.info("Running %s", " ".join(cmd)) + subprocess.check_call(cmd) + except subprocess.CalledProcessError as error: + logging.error("%s: %s", " ".join(cmd), error) + +def main(): + """It all happens here""" + + progname = os.path.splitext(os.path.basename(sys.argv[0]))[0] + + options = parse_args() + + if not options.foreground: + if not daemonize(): + return 1 + + # Do this after daemonize or we'll hang the system startup. 
+ init_logging(options) + + logging.warning("%s: Started", progname) + + signal.signal(signal.SIGTERM, catch_interrupt) + signal.signal(signal.SIGINT, catch_interrupt) + + try: + with pidfilelock(progname): + process(options, progname) + except LockFileTimeout: + logging.critical("Another instance of %s is running", progname) + return 1 + + return 0 + +if __name__ == "__main__": + rc = 1 + try: + rc = main() + except KeyboardInterrupt: + print("") + except Exception as exc: + logging.exception(exc) + + sys.exit(rc) diff --git a/roles/conduit/files/ssh_tunnel.initd b/roles/conduit/files/ssh_tunnel.initd index d8b6f4f..b28d4ad 100644 --- a/roles/conduit/files/ssh_tunnel.initd +++ b/roles/conduit/files/ssh_tunnel.initd @@ -47,6 +47,9 @@ is_running() { } start() { + for env in $(set | grep -E '^AUTOSSH_'); do + eval export ${env} + done start-stop-daemon --start --quiet --exec ${DAEMON} -- ${DAEMON_ARGS} } diff --git a/roles/conduit/handlers/main.yml b/roles/conduit/handlers/main.yml index c65fac6..8567f85 100644 --- a/roles/conduit/handlers/main.yml +++ b/roles/conduit/handlers/main.yml @@ -6,7 +6,7 @@ # - name: update rc shell: "for init in ttn-pkt-forwarder lora-basic-station; do update-rc.d -f ${init} remove; test -f /etc/init.d/${init} && update-rc.d ${init} defaults 95 30 || true; done" - + # # Restart the packet forwarder # @@ -37,7 +37,7 @@ debug: msg: "Please go to https://console.thethingsnetwork.org/gateways and update the antenna Altitude and Placement your gateways. 
The - API does not allow these parameters to be set" + API does not allow these parameters to be set" run_once: true # @@ -58,6 +58,12 @@ - name: restart conduit_leds include_tasks: restart_conduit_leds.yml +# +# Restart monitor_modem +# +- name: restart monitor_modem + include_tasks: restart_monitor_modem.yml + # # Restart sshd after config changes # @@ -67,7 +73,7 @@ ignore_errors: true # -# Remind +# Remind # - name: interface reboot debug: @@ -118,3 +124,10 @@ - name: Update system checksum file include_tasks: update_checksums.yml + +# +# Summary +# +- name: Rescued sections + ansible.builtin.debug: + msg: "Failed sections: {{ rescued | join(', ') }}" diff --git a/roles/conduit/tasks/conduit_leds.yml b/roles/conduit/tasks/conduit_leds.yml index 1cb652c..c1d7269 100644 --- a/roles/conduit/tasks/conduit_leds.yml +++ b/roles/conduit/tasks/conduit_leds.yml @@ -7,7 +7,9 @@ mode: "0755" owner: root group: root - notify: restart conduit_leds + notify: + - restart conduit_leds + - Update system checksum file - name: conduit_leds /var/config/init.d/conduit_leds copy: @@ -16,7 +18,9 @@ mode: "0755" owner: root group: root - notify: restart conduit_leds + notify: + - restart conduit_leds + - Update system checksum file - name: conduit_leds Link /etc/init.d/conduit_leds to /var/config/init.d/conduit_leds file: @@ -24,5 +28,7 @@ state: link src: /var/config/init.d/conduit_leds force: yes - notify: restart conduit_leds + notify: + - restart conduit_leds + - Update system checksum file ... 
diff --git a/roles/conduit/tasks/main.yml b/roles/conduit/tasks/main.yml index 2abdfd2..ea29384 100644 --- a/roles/conduit/tasks/main.yml +++ b/roles/conduit/tasks/main.yml @@ -1,5 +1,14 @@ --- +# +# Initialize the list of rescued (failed) sections +# +- name: main Create list of failed sections + ansible.builtin.set_fact: + rescued: [] + tags: + - always + # # Build the list of authorized keys # @@ -177,6 +186,10 @@ rescue: - debug: msg: "main: TTN setup failed, continuing" + - ansible.builtin.set_fact: + rescued: "{{ rescued + ['ttn'] }}" + changed_when: true + notify: Rescued sections tags: - ttn @@ -189,26 +202,66 @@ rescue: - debug: msg: "main: SSH tunnel setup failed, continuing..." + - ansible.builtin.set_fact: + rescued: "{{ rescued + ['ssh_tunnel'] }}" + changed_when: true + notify: Rescued sections tags: - ssh_tunnel - monit - setup # -# Secure ssh +# Enable status leds # - name: main Set up conduit_leds - import_tasks: conduit_leds.yml - when: ansible_local.conduit.hw_version.startswith("MTCDT-") + block: + - name: import conduit_leds.yml + import_tasks: conduit_leds.yml + when: ansible_local.conduit.hw_version.startswith("MTCDT-") + rescue: + - debug: + msg: "main: conduit_leds setup failed, continuing..." + - ansible.builtin.set_fact: + rescued: "{{ rescued + ['conduit_leds'] }}" + changed_when: true + notify: Rescued sections tags: - conduit_leds - setup +# +# Enable modem monitoring +# +- name: main Set up monitor_modem + block: + - name: import monitor_modem.yml + import_tasks: monitor_modem.yml + rescue: + - debug: + msg: "main: monitor_modem setup failed, continuing..." + - ansible.builtin.set_fact: + rescued: "{{ rescued + ['monitor_modem'] }}" + changed_when: true + notify: Rescued sections + tags: + - monitor_modem + - setup + # # Set up monit # - name: main Set up moniit - import_tasks: monit.yml + block: + - name: import monit.yml + import_tasks: monit.yml + rescue: + - debug: + msg: "main: monit setup failed, continuing..." 
+ - ansible.builtin.set_fact: + rescued: "{{ rescued + ['monit'] }}" + changed_when: true + notify: Rescued sections tags: - monit - setup diff --git a/roles/conduit/tasks/monitor_modem.yml b/roles/conduit/tasks/monitor_modem.yml new file mode 100644 index 0000000..d6c7d71 --- /dev/null +++ b/roles/conduit/tasks/monitor_modem.yml @@ -0,0 +1,56 @@ +--- + +- name: monitor_modem /var/config/local/sbin/monitor_modem + ansible.builtin.copy: + dest: /var/config/local/sbin/monitor_modem + src: monitor_modem.py + mode: "0755" + owner: root + group: root + notify: + - restart monitor_modem + - Update system checksum file + when: ansible_local.mts_io.has_radio == "1" + +- name: monitor_modem /var/config/init.d/monitor_modem + ansible.builtin.copy: + dest: /var/config/init.d/monitor_modem + src: monitor_modem.initd + mode: "0755" + owner: root + group: root + notify: + - restart monitor_modem + - Update system checksum file + when: ansible_local.mts_io.has_radio == "1" + +- name: monitor_modem Link /etc/init.d/monitor_modem to /var/config/init.d/monitor_modem + ansible.builtin.file: + path: /etc/init.d/monitor_modem + state: link + src: /var/config/init.d/monitor_modem + force: yes + notify: + - restart monitor_modem + - Update system checksum file + when: ansible_local.mts_io.has_radio == "1" + +- name: monitor_modem Remove if a modem is not present + ansible.builtin.file: + path: "{{ item }}" + state: absent + force: yes + loop: + - /var/config/local/sbin/monitor_modem + - /var/config/init.d/monitor_modem + notify: + - Update system checksum file + when: ansible_local.mts_io.has_radio == "0" + +- name: monitor_modem Delete service if modem not present + ansible.builtin.shell: "update-rc.d -f monitor_modem remove" + notify: + - Update system checksum file + when: ansible_local.mts_io.has_radio == "0" + +... 
diff --git a/roles/conduit/tasks/ppp.yml b/roles/conduit/tasks/ppp.yml index f650136..e53117e 100644 --- a/roles/conduit/tasks/ppp.yml +++ b/roles/conduit/tasks/ppp.yml @@ -24,23 +24,14 @@ state: link src: /var/config/ppp/ppp_on_boot force: true - when: ppp_on_boot.stat.islnk is defined and not ppp_on_boot.stat.islnk + when: + - ppp_on_boot.stat.lnk_target is not defined or ppp_on_boot.stat.lnk_target != '/var/config/ppp/ppp_on_boot' - name: ppp Make /var/config/ppp/ppp_on_boot executable when we are using cellular file: dest: /var/config/ppp/ppp_on_boot - mode: "755" + mode: "{{ '0755' if use_cellular and cellular_provider is defined else '0644' }}" notify: Start ppp - when: - - use_cellular - - cellular_provider is defined - -- name: ppp Make /var/config/ppp/ppp_on_boot not executable when we are not using cellular - file: - dest: /var/config/ppp/ppp_on_boot - mode: "644" - notify: Stop ppp - when: use_cellular == False or cellular_provider is not defined # # Set or reset APN @@ -57,15 +48,28 @@ - use_cellular - cellular_apn is defined -- name: ppp Set provider +# +# Setup PPP scripts +# + +- name: ppp Set /var/config/ppp/peers/provider + ansible.builtin.file: + path: /var/config/ppp/peers/provider + state: "{{ 'link' if use_cellular and cellular_provider is defined else 'absent' }}" + src: "{{ '/var/config/ppp/peers/' + cellular_provider if use_cellular and cellular_provider is defined else '' }}" + notify: Start ppp + +- name: ppp Restore default provider in /var/config/ppp/ppp_on_boot lineinfile: - path: /etc/ppp/ppp_on_boot + path: /var/config/ppp/ppp_on_boot regexp: '^\$PPPD call ' - line: '$PPPD call {{ cellular_provider }}' - notify: Start ppp - when: - - use_cellular - - cellular_provider is defined + line: '$PPPD call provider' + +- name: ppp make /var/config/ppp/ppp_on_boot a script + lineinfile: + path: /var/config/ppp/ppp_on_boot + regexp: '/bin/sh$' + line: '#!/bin/sh' # # Set PPP configuration options diff --git 
a/roles/conduit/tasks/restart_monitor_modem.yml b/roles/conduit/tasks/restart_monitor_modem.yml new file mode 100644 index 0000000..0ca880f --- /dev/null +++ b/roles/conduit/tasks/restart_monitor_modem.yml @@ -0,0 +1,13 @@ +--- + +# +# Configure monitor_modem service and restart it +# + +- name: restart_monitor_modem Update rc.d to start monitor_modem + ansible.builtin.shell: "update-rc.d -f monitor_modem remove; update-rc.d monitor_modem defaults 10 50" + +- name: restart_monitor_modem Restart monitor_modem + ansible.builtin.command: /etc/init.d/monitor_modem restart + +... diff --git a/roles/conduit/tasks/time.yml b/roles/conduit/tasks/time.yml index 9963bac..ed51c50 100644 --- a/roles/conduit/tasks/time.yml +++ b/roles/conduit/tasks/time.yml @@ -53,8 +53,8 @@ - name: time Ensure /var/config/default/ntpd.default exists copy: - src: /etc/default/ntpd.default - dest: /var/config/default/ntpd.default + src: /etc/default/ntpd + dest: /var/config/default/ntpd remote_src: yes force: no notify: @@ -64,7 +64,7 @@ file: dest: /etc/default/ntpd state: link - src: /var/config/default/ntpd.default + src: /var/config/default/ntpd force: yes notify: - restart ntpd @@ -76,7 +76,7 @@ - name: time Prevent ntpd from hanging on boot when net is down lineinfile: - dest: /var/config/default/ntpd.default + dest: /var/config/default/ntpd regexp: "^SET_SYSTEM_CLOCK=" line: "SET_SYSTEM_CLOCK={{ set_system_clock }}" state: present @@ -84,6 +84,13 @@ - restart ntpd - Update system checksum file +- name: time Clean up turds + ansible.builtin.file: + path: /var/config/default/ntpd.default + state: absent + notify: + - Update system checksum file + # # Monit # diff --git a/roles/conduit/tasks/ttn_basic_station.yml b/roles/conduit/tasks/ttn_basic_station.yml index c9f03c0..83f9acb 100644 --- a/roles/conduit/tasks/ttn_basic_station.yml +++ b/roles/conduit/tasks/ttn_basic_station.yml @@ -24,7 +24,7 @@ - forwader_version is defined - ansible_local.opkg.lora_basic_station is not defined or 
ansible_local.opkg.lora_basic_station != forwarder_version -- name: ttn_basic_station Install the desired version of lora_basic_station +- name: ttn_basic_station Install the desired version of lora_basic_station opkg: name: "lora_basic_station=={{ forwarder_version }}" state: present @@ -52,6 +52,7 @@ --log-level {{ basic_log_level }} --log-size {{ forwarder_logrotate_size }} --log-rotate {{ forwarder_logrotate_count }} + {{ '--tc-timeout 5' if use_cellular else '' }} {{ gps_arg }}" register: station_conf_raw - set_fact: diff --git a/roles/conduit/templates/lora-basic-station.j2 b/roles/conduit/templates/lora-basic-station.j2 index 2c5108b..3023b36 100644 --- a/roles/conduit/templates/lora-basic-station.j2 +++ b/roles/conduit/templates/lora-basic-station.j2 @@ -124,6 +124,11 @@ do_start() { test -f "${conf_dir}/${file}" && cp "${conf_dir}/${file}" "${run_dir}/1/" done + # + # copy mlinux-version info + # + echo "$(head -1 /etc/mlinux-version) - $(opkg status lora-basic-station | sed -n '/Version:/s/Version: //p')" > ${run_dir}/1/version.txt + # # reset concentrator # diff --git a/roles/conduit/templates/monitrc.j2 b/roles/conduit/templates/monitrc.j2 index c826c03..eb419a2 100644 --- a/roles/conduit/templates/monitrc.j2 +++ b/roles/conduit/templates/monitrc.j2 @@ -15,9 +15,9 @@ set httpd # Monitor the system check system $HOST - if loadavg (1min) > 2 then alert - if loadavg (5min) > 4 then alert - if memory usage > 25% then alert + if loadavg (1min) > {{ monit_loadavg_1m }} then alert + if loadavg (5min) > {{ monit_loadavg_5m }} then alert + if memory usage > {{ monit_memory_usage }} then alert # Event Queue set eventqueue basedir {{ monit_eventqueue }} @@ -97,6 +97,14 @@ check process conduit_leds PIDFILE /var/run/conduit_leds.pid stop program = "/etc/init.d/conduit_leds stop" {% endif -%} +{% if 'modem_at0' in ansible_facts['ansible_local'].dev -%} +# Monitor monitor_modem +check process monitor_modem PIDFILE /var/run/monitor_modem.pid + if does not exist for {{ 
monit_process_period }} then restart + start program = "/etc/init.d/monitor_modem start" with timeout 15 seconds + stop program = "/etc/init.d/monitor_modem stop" +{% endif -%} + # Monitor system directory checksums check program check_system_md5 with path /usr/local/lib/check_system_md5 every "{{ 60 | random(seed=inventory_hostname) }} * * * *" diff --git a/roles/conduit/templates/ppp.monit.j2 b/roles/conduit/templates/ppp.monit.j2 index 9e29936..bb8eb28 100644 --- a/roles/conduit/templates/ppp.monit.j2 +++ b/roles/conduit/templates/ppp.monit.j2 @@ -1,8 +1,8 @@ check network "{{ monit_ppp_if }}" with interface "{{ monit_ppp_if }}" -# if failed link then restart + if link up then exec "/usr/bin/env METHOD=monit IFACE=ppp0 /var/config/ifup_restart" if changed link capacity then alert -check process pppd MATCHING "^/usr/sbin/pppd call {{ cellular_provider }}$" +check process pppd MATCHING "^(/usr/sbin/)?pppd call provider$" if does not exist for 1 cycles then restart start program = {{ monit_ppp_start }} stop program = {{ monit_ppp_stop }} diff --git a/roles/conduit/templates/ssh_tunnel.j2 b/roles/conduit/templates/ssh_tunnel.j2 index 0b8bb33..0d53b54 100644 --- a/roles/conduit/templates/ssh_tunnel.j2 +++ b/roles/conduit/templates/ssh_tunnel.j2 @@ -22,3 +22,5 @@ SSH_KEY={{ ssh_tunnel_ssh_key }} SSH_PORT={{ ssh_tunnel_ssh_port }} {% endif %} DAEMON_ARGS="{{ ssh_tunnel_daemon_args }}" +AUTOSSH_POLL={{ ssh_tunnel_poll }} +AUTOSSH_FIRST_POLL={{ ssh_tunnel_first_poll }}