Refactor worker restart to prevent issues with periodic tests

aequitas · aequitas · commit e74e13880f86 · 2024-10-25T13:01:56.000+02:00
diff --git a/docker/cron-docker/periodic/15min/restart_nassl_worker b/docker/cron-docker/periodic/15min/restart_nassl_worker
@@ -1,4 +1,8 @@
 #!/bin/sh
 set -e
-# find nassl worker and restart the container(s)
-docker ps --filter label=com.docker.compose.service=worker-nassl --quiet |  xargs --no-run-if-empty docker restart
+# stop and start worker one at a time to ensure (batch) tasks are still being picked up
+# workers are sent a TERM signal, followed by a 10 minute grace period before KILL is sent
+for worker in $(docker ps --filter label=com.docker.compose.service=worker-nassl --quiet); do
+  docker stop "$worker"
+  docker start "$worker"
+done
diff --git a/docker/cron-docker/periodic/daily/restart_slow_worker b/docker/cron-docker/periodic/daily/restart_slow_worker
@@ -0,0 +1,36 @@
+#!/bin/sh
+
+# restart slow worker every day to prevent slow memory leaks
+# as the slow worker can run very long tasks (eg: report generation)
+# we first start a new container before stopping the previous one
+#
+# requires WORKER_SLOW_REPLICAS (number of slow worker replicas) in the environment
+
+set -e
+
+cd /opt/Internet.nl
+
+SERVICE=worker-slow
+# fail early with a clear message instead of an arithmetic error further down
+REPLICAS=${WORKER_SLOW_REPLICAS:?WORKER_SLOW_REPLICAS must be set}
+# expanded unquoted on purpose so it splits into the command and its arguments
+COMPOSE_CMD="docker compose --env-file=docker/defaults.env --env-file=docker/host.env --env-file=docker/local.env"
+
+# names of the service's containers before scaling up, one per line
+# note: with `set -e` the script stops here when grep finds no match
+OLD_CONTAINERS=$($COMPOSE_CMD ps --format "{{ .Name }}"|grep "$SERVICE")
+
+# bring up new containers, wait until healthy
+$COMPOSE_CMD up --no-deps --no-recreate --wait --scale="$SERVICE=$((REPLICAS * 2))" "$SERVICE"
+
+# graceful shutdown and remove old containers
+# `docker stop` sends the stop signal and waits for the grace period, where
+# `docker rm --force` would kill the containers immediately
+# the list is expanded unquoted so every name becomes a separate argument
+# shellcheck disable=SC2086
+docker stop $OLD_CONTAINERS
+# shellcheck disable=SC2086
+docker rm $OLD_CONTAINERS
+
+# restore replica number to original
+$COMPOSE_CMD scale "$SERVICE=$REPLICAS"
diff --git a/docker/cron-docker/periodic/hourly/restart_worker b/docker/cron-docker/periodic/hourly/restart_worker
@@ -1,4 +1,8 @@
 #!/bin/sh
 set -e
-# find worker and restart the container(s)
-docker ps --filter label=com.docker.compose.service=worker --quiet | xargs --no-run-if-empty docker restart
+# stop and start worker one at a time to ensure (batch) tasks are still being picked up
+# workers are sent a TERM signal, followed by a 10 minute grace period before KILL is sent
+for worker in $(docker ps --filter label=com.docker.compose.service=worker --quiet); do
+  docker stop "$worker"
+  docker start "$worker"
+done
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
@@ -249,6 +249,8 @@ services:
     # time after which a SIGKILL is sent to celery after a SIGTERM (warm shutdown), default 10s
     # insufficient short grace period causes issues on batch when tasks are killed during the hourly worker restart
     stop_grace_period: 10m
+    # SIGTERM is default, but make it explicit
+    stop_signal: SIGTERM
 
     depends_on:
       db-migrate:
@@ -735,6 +737,9 @@ services:
     command: crond -f -d7 -c /etc/crontabs-docker
     environment:
       - AUTO_UPDATE_TO
+      - WORKER_SLOW_REPLICAS
+      - WORKER_REPLICAS
+      - RELEASE
 
     restart: unless-stopped
     logging: