Merge pull request #8 from ap-wtioit/master-enable_healthcheck_github

add healthcheck support for http and smtp
Tecnativa · Feb 7, 2024 · 118a9ac · 118a9ac
2 parents d2cf614 + 4627f26
commit 118a9ac
Show file tree

Hide file tree

Showing 8 changed files with 972 additions and 308 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,18 +1,26 @@
 FROM python:3-alpine
 ENTRYPOINT ["dumb-init", "--"]
 CMD ["proxy"]
-RUN apk add --no-cache -t .build build-base &&\
+HEALTHCHECK CMD ["healthcheck"]
+RUN apk add --no-cache -t .build build-base curl-dev &&\
     apk add --no-cache socat &&\
-    pip install --no-cache-dir dnspython dumb-init &&\
+    apk add --no-cache libcurl &&\
+    pip install --no-cache-dir dnspython dumb-init pycurl &&\
     apk del .build
 ENV NAMESERVERS="208.67.222.222 8.8.8.8 208.67.220.220 8.8.4.4" \
     PORT="80 443" \
     PRE_RESOLVE=0 \
     MODE=tcp \
     VERBOSE=0 \
     MAX_CONNECTIONS=100 \
-    UDP_ANSWERS=1
+    UDP_ANSWERS=1 \
+    HTTP_HEALTHCHECK=0\
+    HTTP_HEALTHCHECK_URL="http://\$TARGET/"\
+    SMTP_HEALTHCHECK=0\
+    SMTP_HEALTHCHECK_URL="smtp://\$TARGET/"\
+    SMTP_HEALTHCHECK_COMMAND="HELP"
 COPY proxy.py /usr/local/bin/proxy
+COPY healthcheck.py /usr/local/bin/healthcheck
 
 # Labels
 ARG BUILD_DATE

diff --git a/README.md b/README.md
@@ -35,6 +35,36 @@ Use these environment variables:
 
 Required. It's the host name where the incoming connections will be redirected to.
 
+### `HTTP_HEALTHCHECK`
+
+Default: `0`
+
+Set to `1` to enable a healthcheck that sends HTTP requests with pycurl. This is useful if
+the target uses a deployment where the IP of the service changes frequently (e.g.
+`accounts.google.com`) and you are using [`PRE_RESOLVE`](#pre_resolve).
+
+#### Automatically restarting unhealthy proxies
+
+When the HTTP healthcheck is enabled and fails, the container only marks itself as
+unhealthy; nothing else happens automatically (see https://github.com/moby/moby/pull/22719).
+
+If you want to restart your proxies automatically, you can use
+https://github.com/willfarrell/docker-autoheal.
+
+### `HTTP_HEALTHCHECK_URL`
+
+Default: `http://$TARGET/`
+
+Url to use in [`HTTP_HEALTHCHECK`](#http_healthcheck) if enabled. `$TARGET` gets
+replaced inside the url by the configured [`TARGET`](#target).
+
+### `HTTP_HEALTHCHECK_TIMEOUT_MS`
+
+Default: `2000`
+
+Timeout in milliseconds for the HTTP healthcheck. It is used both as the timeout for
+connecting and for receiving an answer, so a check may take up to twice this long.
+
 ### `MODE`
 
 Default: `tcp`
@@ -94,6 +124,39 @@ Set to `1` to force using the specified [nameservers](#nameservers) to resolve t
 
 This is especially useful when using a network alias to whitelist an external API.
 
+### `SMTP_HEALTHCHECK`
+
+Default: `0`
+
+Set to `1` to enable a healthcheck that sends SMTP requests with pycurl. This is useful if
+the target uses a deployment where the IP of the service changes frequently (e.g.
+`smtp.eu.sparkpostmail.com`) and you are using [`PRE_RESOLVE`](#pre_resolve).
+
+#### Automatically restarting unhealthy proxies
+
+See [`HTTP_HEALTHCHECK`](#http_healthcheck).
+
+### `SMTP_HEALTHCHECK_URL`
+
+Default: `smtp://$TARGET/`
+
+Url to use in [`SMTP_HEALTHCHECK`](#smtp_healthcheck) if enabled. `$TARGET` gets
+replaced inside the url by the configured [`TARGET`](#target).
+
+### `SMTP_HEALTHCHECK_COMMAND`
+
+Default: `HELP`
+
+Enables changing the healthcheck command for servers that do not support `HELP` (e.g.
+for [MailHog](https://github.com/mailhog/MailHog) you can use `QUIT`)
+
+### `SMTP_HEALTHCHECK_TIMEOUT_MS`
+
+Default: `2000`
+
+Timeout in milliseconds for the SMTP healthcheck. It is used both as the timeout for
+connecting and for receiving an answer, so a check may take up to twice this long.
+
 ### `UDP_ANSWERS`
 
 Default: `1`

diff --git a/healthcheck.py b/healthcheck.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python3
+
+import logging
+import os
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("healthcheck")
+
+
+def error(message, exception=None):
+    """
+    Log a failed check and abort the healthcheck.
+
+    :param message: text written to the "healthcheck" logger at ERROR level
+    :param exception: optional exception to re-raise after logging; when omitted the process exits with status 1
+    :return: never returns normally
+    :raises SystemExit: with status 1 when no exception is given
+    """
+    import sys
+
+    logger.error(message)
+    if exception is None:
+        # sys.exit instead of the site-provided exit(), which is intended for interactive sessions only and is
+        # missing when the interpreter runs without the site module
+        sys.exit(1)
+    raise exception
+
+
+def http_healthcheck():
+    """
+    Use pycurl to check if the target server is still responding via proxy.py
+    :return: None
+    """
+    import re
+
+    import pycurl
+
+    check_url = os.environ.get("HTTP_HEALTHCHECK_URL", "http://localhost/")
+    check_timeout_ms = int(os.environ.get("HTTP_HEALTHCHECK_TIMEOUT_MS", 2000))
+    target = os.environ.get("TARGET", "localhost")
+    check_url_with_target = check_url.replace("$TARGET", target)
+    port = re.search("https?://[^:]*(?::([^/]+))?", check_url_with_target)[1] or "80"
+    print("checking %s via 127.0.0.1" % check_url_with_target)
+    logger.info("checking %s via 127.0.0.1" % check_url_with_target)
+    try:
+        request = pycurl.Curl()
+        request.setopt(pycurl.URL, check_url_with_target)
+        # do not send the request to the target directly but use our own socat proxy process to check if it's still
+        # working
+        request.setopt(pycurl.RESOLVE, ["{}:{}:127.0.0.1".format(target, port)])
+        request.setopt(pycurl.CONNECTTIMEOUT_MS, check_timeout_ms)
+        request.setopt(pycurl.TIMEOUT_MS, check_timeout_ms)
+        request.perform()
+        request.close()
+    except pycurl.error as e:
+        error("error while checking http connection", e)
+
+
+def smtp_healthcheck():
+    """
+    Use pycurl to check if the target server is still responding via proxy.py
+    :return: None
+    """
+    import re
+
+    import pycurl
+
+    check_url = os.environ.get("SMTP_HEALTHCHECK_URL", "smtp://localhost/")
+    check_command = os.environ.get("SMTP_HEALTHCHECK_COMMAND", "HELP")
+    check_timeout_ms = int(os.environ.get("SMTP_HEALTHCHECK_TIMEOUT_MS", 2000))
+    target = os.environ.get("TARGET", "localhost")
+    check_url_with_target = check_url.replace("$TARGET", target)
+    port = re.search("smtp://[^:]*(?::([^/]+))?", check_url_with_target)[1] or "25"
+    logger.info("checking %s via 127.0.0.1" % check_url_with_target)
+    try:
+        request = pycurl.Curl()
+        request.setopt(pycurl.URL, check_url_with_target)
+        request.setopt(pycurl.CUSTOMREQUEST, check_command)
+        # do not send the request to the target directly but use our own socat proxy process to check if it's still
+        # working
+        request.setopt(pycurl.RESOLVE, ["{}:{}:127.0.0.1".format(target, port)])
+        request.setopt(pycurl.CONNECTTIMEOUT_MS, check_timeout_ms)
+        request.setopt(pycurl.TIMEOUT_MS, check_timeout_ms)
+        request.perform()
+        request.close()
+    except pycurl.error as e:
+        error("error while checking smtp connection", e)
+
+
+def process_healthcheck():
+    """
+    Check that at least one socat process exists per port and no more than the number of configured max connections
+    processes exist for each port.
+    :return:
+    """
+    import subprocess
+
+    ports = os.environ["PORT"].split()
+    max_connections = int(os.environ["MAX_CONNECTIONS"])
+    logger.info(
+        "checking socat processes for port(s) %s having at least one and less than %d socat processes"
+        % (ports, max_connections)
+    )
+    socat_processes = (
+        subprocess.check_output(["sh", "-c", "grep -R socat /proc/[0-9]*/cmdline"])
+        .decode("utf-8")
+        .split("\n")
+    )
+    pids = [process.split("/")[2] for process in socat_processes if process]
+    if len(pids) < len(ports):
+        # if we have less than the number of ports socat processes we do not need to count processes per port and can
+        # fail fast
+        error("Expected at least %d socat processes" % len(ports))
+    port_process_count = {port: 0 for port in ports}
+    for pid in pids:
+        # foreach socat pid we detect the port it's for by checking the last argument (connect to) that ends with
+        # :{ip}:{port} for our processes
+        try:
+            with open("/proc/%d/cmdline" % int(pid)) as fp:
+                # arguments in /proc/.../cmdline are split by null bytes
+                cmd = [part for part in "".join(fp.readlines()).split("\x00") if part]
+                port = cmd[2].split(":")[-1]
+                port_process_count[port] = port_process_count[port] + 1
+        except FileNotFoundError:
+            # ignore processes no longer existing (possibly retrieved an answer)
+            pass
+    for port in ports:
+        if port_process_count[port] == 0:
+            error("Missing socat process(es) for port: %s" % port)
+        if port_process_count[port] >= max_connections + 1:
+            error(
+                "More than %d + 1  socat process(es) for port: %s"
+                % (max_connections, port)
+            )
+
+
+def preresolve_healthcheck():
+    """
+    Check that the pre-resolved ip is still valid now for target
+
+    Collects the addresses found in the udp-connect/tcp-connect arguments of running processes and compares them
+    with what the nameservers from NAMESERVERS currently return for TARGET. If an address is missing and two
+    consecutive lookups return the same answer, the check fails through error(). If three consecutive lookups keep
+    differing, the target is treated as load-balanced via dns and a flag file is written to the temp directory so
+    that later runs skip this check. If the third lookup equals the second one, this run reports nothing.
+    :return: None
+    """
+    from tempfile import gettempdir
+
+    load_balancing_dns_fs_flag = os.path.join(
+        gettempdir(), "load_balancing_dns_detected"
+    )
+    if not os.path.exists(load_balancing_dns_fs_flag):
+        # only run the resolver check if a previous run didn't flag the target as being dns load-balanced
+        import subprocess
+
+        from dns.resolver import Resolver
+
+        pre_resolved_ips = {
+            line.split(":")[2]  # NOTE(review): assumes the ip is the third ":"-separated field of a grep line - confirm against proxy.py
+            for line in subprocess.check_output(
+                ["sh", "-c", "grep -R '\\(udp\\|tcp\\)-connect:' /proc/[0-9]*/cmdline"]
+            )
+            .decode("utf-8")
+            .split("\n")
+            if line
+        }
+        resolver = Resolver()
+        resolver.nameservers = os.environ["NAMESERVERS"].split()
+        target = os.environ["TARGET"]
+        resolved_ips = [answer.address for answer in resolver.resolve(target)]
+        for ip in pre_resolved_ips:
+            logger.info(f"checking {target} resolves to {ip}")
+            if ip not in resolved_ips:
+                resolved_ips_2 = [answer.address for answer in resolver.resolve(target)]
+                if resolved_ips_2 == resolved_ips:  # same answer twice in a row (list comparison, so order matters too)
+                    error(
+                        f"{target} no longer resolves to {ip}, {resolved_ips}, {resolved_ips_2}"
+                    )
+                else:
+                    resolved_ips_3 = [
+                        answer.address for answer in resolver.resolve(target)
+                    ]
+                    # to make sure we didn't just hit the server switch in dns, we check again before deactivating
+                    # the healthcheck permanently (until the container restarts)
+                    if resolved_ips_3 != resolved_ips_2:
+                        logger.info(
+                            f"{target} seems to be load-balancing with dns ({resolved_ips} != {resolved_ips_2}), "
+                            f"deactivating the resolver healthcheck"
+                        )
+                        with open(f"{load_balancing_dns_fs_flag}", "w") as fp:
+                            fp.write(target)  # the existence of this file is what makes later runs skip the check
+
+
+process_healthcheck()
+if os.environ["PRE_RESOLVE"] == "1":
+    preresolve_healthcheck()
+if os.environ.get("HTTP_HEALTHCHECK", "0") == "1":
+    http_healthcheck()
+if os.environ.get("SMTP_HEALTHCHECK", "0") == "1":
+    smtp_healthcheck()