Init commit

Ahmad45123 · May 17, 2024 · f81f7cc · f81f7cc
1 parent 54986b1
commit f81f7cc
Show file tree

Hide file tree

Showing 7 changed files with 303 additions and 0 deletions.
diff --git a/autoscaler/README.md b/autoscaler/README.md
@@ -0,0 +1,43 @@
+# docker-swarm-autoscaler
+
+## Current Release: 0.1.0
+
+This project is intended to bring automatic service scaling to Docker Swarm. The script uses Prometheus paired with cAdvisor metrics to determine CPU usage. It then queries a manager node to determine whether a service wants to be autoscaled, and uses that manager node to scale the service.
+
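+Currently the project only uses CPU usage to autoscale. If CPU usage rises above 60% the service will scale up; if it drops below 25% it will scale down (see `CPU_PERCENTAGE_UPPER_LIMIT` and `CPU_PERCENTAGE_LOWER_LIMIT` in `auto-scale.sh`).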
+
+## Usage
+1. You can deploy prometheus, cadvisor, and docker-swarm-autoscaler by running `docker stack deploy -c swarm-autoscaler-stack.yml autoscaler` from the root of this repo.  
+  * You can also use an already deployed Prometheus and cAdvisor by specifying `PROMETHEUS_URL` in the docker-swarm-autoscaler environment. `swarm-autoscaler-stack.yml` shows an example of this.  
+  * docker-swarm-autoscaler needs a placement constraint so that it is deployed to a manager node. `swarm-autoscaler-stack.yml` shows an example of this.  
+2. For services you want to autoscale you will need a deploy label `swarm.autoscaler=true`. 
+
+```
+deploy:
+  labels:
+    - "swarm.autoscaler=true"
+```
+
+This is best paired with resource reservations and limits, which are also set under the `deploy` key.
+
+```
+deploy:
+  resources:
+    reservations:
+      cpus: '0.25'
+      memory: 512M
+    limits:
+      cpus: '0.50'
+```
+
+## Configuration
+| Setting | Value | Description |
+| --- | --- | --- |
+| `swarm.autoscaler` | `true` | Required. This enables autoscaling for a service. Anything other than `true` will not enable it |
+| `swarm.autoscaler.minimum` | Integer | Optional. This is the minimum number of replicas wanted for a service. The autoscaler will not downscale below this number |
+| `swarm.autoscaler.maximum` | Integer | Optional. This is the maximum number of replicas wanted for a service. The autoscaler will not scale up past this number | 
+
+## Test
+You can deploy a test app with the following commands below. Helloworld is initially only 1 replica. The autoscaler will scale to the minimum 3 replicas.
+1. `docker stack deploy -c swarm-autoscaler-stack.yml autoscaler`
+2. `docker stack deploy -c helloworld.yml hello`
diff --git a/autoscaler/docker-swarm-autoscaler/Dockerfile b/autoscaler/docker-swarm-autoscaler/Dockerfile
@@ -0,0 +1,26 @@
+# Image for the autoscaler: a Bash script (auto-scale.sh) that needs curl and
+# jq to query Prometheus, plus the docker CLI to inspect and scale services.
+FROM ubuntu:xenial
+
+# Single layer: install the helper tools, register Docker's apt repository,
+# install a pinned docker-ce 19.03.5 build, then remove the apt caches and
+# temp files so they do not end up in the layer.
+# NOTE(review): auto-scale.sh only invokes the `docker` client; the full
+# docker-ce engine package may be more than is required - confirm.
+RUN apt-get update -qq \
+  && apt-get install -y -qq \
+    jq \
+    apt-transport-https \
+    ca-certificates \
+    curl \
+    software-properties-common \
+    dnsutils \
+  && curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - \
+  && add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu xenial stable" \
+  && apt-get update -qq \
+  && apt-get install -y -qq \
+    docker-ce=5:19.03.5* \
+  && apt-get -qq clean \
+  && apt-get autoremove -y \
+  && rm -rf \
+    /var/lib/apt/lists/* \
+    /tmp/* \
+    /var/tmp/*
+
+# Install the autoscaler script at the image root and make it executable.
+COPY auto-scale.sh /auto-scale.sh
+RUN chmod a+x /auto-scale.sh
+
+# The container's only process is the autoscaler loop.
+CMD ["/auto-scale.sh"]
diff --git a/autoscaler/docker-swarm-autoscaler/auto-scale.sh b/autoscaler/docker-swarm-autoscaler/auto-scale.sh
@@ -0,0 +1,117 @@
+#!/bin/bash
+
+LOOP=${LOOP:='yes'}
+CPU_PERCENTAGE_UPPER_LIMIT=60
+CPU_PERCENTAGE_LOWER_LIMIT=25
+PROMETHEUS_API="api/v1/query?query="
+PROMETHEUS_QUERY="sum(rate(container_cpu_usage_seconds_total%7Bcontainer_label_com_docker_swarm_task_name%3D~%27.%2B%27%7D%5B5m%5D))BY(container_label_com_docker_swarm_service_name%2Cinstance)*100"
+
+get_high_cpu_services () {
+  local prometheus_results="${1}"
+  local services=""
+  for service in $(printf "%s$prometheus_results" | jq ".data.result[] | select( all(.value[1]|tonumber; . > $CPU_PERCENTAGE_UPPER_LIMIT) ) | .metric.container_label_com_docker_swarm_service_name" | sed 's/"//g' | sort | uniq); do
+    services="$services $service"
+  done
+  echo $services
+}
+
+get_all_services () {
+  # local prometheus_results="${1}"
+  # local services=""
+  # for service in $(printf "%s$prometheus_results" | jq ".data.result[].metric.container_label_com_docker_swarm_service_name" | sed 's/"//g' | sort | uniq); do
+  #   services="$services $service"
+  # done
+  echo "workup_service_users workup_service_webserver"
+}
+
+get_low_cpu_services () {
+  local prometheus_results="${1}"
+  local services=""
+  for service in $(printf "%s$prometheus_results" | jq ".data.result[] | select( all(.value[1]|tonumber; . < $CPU_PERCENTAGE_LOWER_LIMIT) ) | .metric.container_label_com_docker_swarm_service_name" | sed 's/"//g' | sort | uniq); do
+    if [[ $service == "workup_service_users" || $service == "workup_service_webserver" ]]; then
+      services="$services $service"
+    fi
+  done
+
+  echo $services
+}
+
+#######################################
+# Bring a labelled service back inside its configured replica bounds.
+# Does nothing for services without the label swarm.autoscaler=true. A bound
+# whose label is missing or not a non-negative integer is not enforced.
+# Arguments: $1 - service name
+# Outputs:   progress messages on stdout, inspect failures on stderr
+# Returns:   1 when the service cannot be inspected, otherwise the status of
+#            the last command run
+#######################################
+default_scale () {
+  local service_name=$1
+  local spec auto_scale_label replica_minimum replica_maximum current_replicas
+
+  # Inspect once and read every field from the same snapshot.
+  if ! spec=$(docker service inspect "$service_name"); then
+    echo "Could not inspect service $service_name" >&2
+    return 1
+  fi
+
+  auto_scale_label=$(jq -r '.[].Spec.Labels["swarm.autoscaler"]' <<<"$spec")
+  if [[ "$auto_scale_label" != "true" ]]; then
+    echo "Service $service_name does not have an autoscale label."
+    return 0
+  fi
+  echo "Service $service_name has an autoscale label."
+
+  # '// empty' turns a missing label into an empty string instead of the
+  # literal "null", which bash arithmetic would silently evaluate as 0.
+  replica_minimum=$(jq -r '.[].Spec.Labels["swarm.autoscaler.minimum"] // empty' <<<"$spec")
+  replica_maximum=$(jq -r '.[].Spec.Labels["swarm.autoscaler.maximum"] // empty' <<<"$spec")
+  current_replicas=$(jq -r '.[].Spec.Mode.Replicated.Replicas // empty' <<<"$spec")
+
+  # No replica count in the spec (e.g. not a replicated service): nothing to do.
+  [[ $current_replicas =~ ^[0-9]+$ ]] || return 0
+
+  if [[ $replica_minimum =~ ^[0-9]+$ ]] && (( replica_minimum > current_replicas )); then
+    echo "Service $service_name is below the minimum. Scaling to the minimum of $replica_minimum"
+    docker service scale "$service_name=$replica_minimum"
+  elif [[ $replica_maximum =~ ^[0-9]+$ ]] && (( current_replicas > replica_maximum )); then
+    echo "Service $service_name is above the maximum. Scaling to the maximum of $replica_maximum"
+    docker service scale "$service_name=$replica_maximum"
+  fi
+}
+
+#######################################
+# Remove one replica from a labelled service, respecting its minimum.
+# Does nothing for services without the label swarm.autoscaler=true. When the
+# swarm.autoscaler.minimum label is missing or not a non-negative integer the
+# minimum defaults to 1, so a service is never scaled down to zero replicas
+# unless a minimum of 0 is configured explicitly.
+# Arguments: $1 - service name
+# Outputs:   progress messages on stdout, inspect failures on stderr
+# Returns:   1 when the service cannot be inspected, otherwise the status of
+#            the last command run
+#######################################
+scale_down () {
+  local service_name=$1
+  local spec auto_scale_label replica_minimum current_replicas new_replicas
+
+  # Inspect once and read every field from the same snapshot.
+  if ! spec=$(docker service inspect "$service_name"); then
+    echo "Could not inspect service $service_name" >&2
+    return 1
+  fi
+
+  auto_scale_label=$(jq -r '.[].Spec.Labels["swarm.autoscaler"]' <<<"$spec")
+  [[ "$auto_scale_label" == "true" ]] || return 0
+
+  # '// empty' turns a missing label into an empty string instead of the
+  # literal "null", which bash arithmetic would silently evaluate as 0.
+  replica_minimum=$(jq -r '.[].Spec.Labels["swarm.autoscaler.minimum"] // empty' <<<"$spec")
+  [[ $replica_minimum =~ ^[0-9]+$ ]] || replica_minimum=1
+
+  current_replicas=$(jq -r '.[].Spec.Mode.Replicated.Replicas // empty' <<<"$spec")
+  # No replica count in the spec (e.g. not a replicated service): nothing to do.
+  [[ $current_replicas =~ ^[0-9]+$ ]] || return 0
+
+  new_replicas=$(( current_replicas - 1 ))
+  if (( replica_minimum <= new_replicas )); then
+    echo "Scaling down the service $service_name to $new_replicas"
+    docker service scale "$service_name=$new_replicas"
+  elif (( current_replicas == replica_minimum )); then
+    echo "Service $service_name has the minumum number of replicas."
+  fi
+}
+
+scale_up () {
+  service_name=$1
+  auto_scale_label=$(docker service inspect $service_name | jq '.[].Spec.Labels["swarm.autoscaler"]')
+  replica_maximum=$(docker service inspect $service_name | jq '.[].Spec.Labels["swarm.autoscaler.maximum"]' | sed 's/\"//g')
+  if [[ "${auto_scale_label}" == "\"true\"" ]]; then
+    current_replicas=$(docker service inspect $service_name | jq ".[].Spec.Mode.Replicated | .Replicas")
+    new_replicas=$(expr $current_replicas + 1)
+    if [[ $current_replicas -eq $replica_maximum ]]; then
+      echo Service $service already has the maximum of $replica_maximum replicas
+    elif [[ $replica_maximum -ge $new_replicas ]]; then
+      echo Scaling up the service $service_name to $new_replicas
+      docker service scale $service_name=$new_replicas
+    fi
+  fi
+}
+
+main () {
+    prometheus_initial_results=$(curl --silent "${PROMETHEUS_URL}/${PROMETHEUS_API}${PROMETHEUS_QUERY}" | jq .)
+    echo Prometheus results
+    echo $prometheus_initial_results
+    for service in $(get_all_services "${prometheus_initial_results}"); do
+      default_scale $service
+    done
+    echo Checking for high cpu services
+    for service in $(get_high_cpu_services "${prometheus_initial_results}"); do
+      echo Service $service is above $CPU_PERCENTAGE_UPPER_LIMIT percent cpu usage.
+      scale_up $service
+    done
+    echo Checking for low cpu services
+    for service in $(get_low_cpu_services "${prometheus_initial_results}"); do
+      echo Service $service is below $CPU_PERCENTAGE_LOWER_LIMIT percent cpu usage.
+      scale_down $service  
+    done
+}
+
+# Entry point: always run one pass, then repeat every 5 seconds for as long
+# as LOOP is 'yes'.
+while true; do
+  main
+  if [[ $LOOP != 'yes' ]]; then
+    break
+  fi
+  echo Waiting 5 seconds for the next test
+  sleep 5s
+done
diff --git a/autoscaler/prometheus.yml b/autoscaler/prometheus.yml
@@ -0,0 +1,18 @@
+# Prometheus configuration used by the autoscaler stack.
+# Targets are scraped and rules evaluated every 5 seconds.
+global:
+  scrape_interval:     5s
+  evaluation_interval: 5s
+
+# Both jobs discover their targets through DNS A-record lookups of the
+# tasks.<service> names, so no target addresses are hard-coded here.
+scrape_configs:
+  # Prometheus scraping its own metrics on port 9090.
+  - job_name: 'prometheus'
+    dns_sd_configs:
+    - names:
+      - 'tasks.prometheus'
+      type: 'A'
+      port: 9090
+
+  # cAdvisor on port 8080; this is where the container_cpu_usage_seconds_total
+  # series queried by auto-scale.sh is expected to come from.
+  - job_name: 'cadvisor'
+    dns_sd_configs:
+    - names:
+      - 'tasks.cadvisor'
+      type: 'A'
+      port: 8080
diff --git a/autoscaler/swarm-autoscaler-stack.yml b/autoscaler/swarm-autoscaler-stack.yml
@@ -0,0 +1,73 @@
+# Stack file for the autoscaler, cAdvisor and Prometheus services.
+version: "3.7"
+
+# Network shared by the three services below.
+networks:
+  autoscale:
+
+# prometheus.yml (next to this file) is published as a config and mounted
+# into the prometheus service.
+configs:
+  prometheus_config:
+    file: ./prometheus.yml
+
+services:
+  # The autoscaler itself: a single replica pinned to a manager node, with
+  # the Docker socket mounted so the docker CLI in the image can inspect and
+  # scale services.
+  docker-swarm-autoscaler:
+    image: workup:autoscaler
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+    environment:
+      # Base URL that auto-scale.sh prefixes to its Prometheus query.
+      - PROMETHEUS_URL=http://prometheus:9090
+    networks:
+      - autoscale
+    deploy:
+      mode: replicated
+      replicas: 1
+      placement:
+        constraints:
+          - node.role == manager
+      resources:
+        limits:
+          cpus: '0.10'
+          memory: 128M
+        reservations:
+          cpus: '0.10'
+          memory: 64M
+  # cAdvisor, deployed in global mode with read-only mounts of the host paths
+  # listed below.
+  # NOTE(review): the image has no tag, so the version is not pinned - confirm
+  # this is intended.
+  cadvisor:
+    image: gcr.io/cadvisor/cadvisor
+    networks:
+      - autoscale
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+      - /:/rootfs:ro
+      - /var/run:/var/run:ro
+      - /sys:/sys:ro
+      - /var/lib/docker/:/var/lib/docker:ro
+      - /dev/disk/:/dev/disk:ro
+    deploy:
+      mode: global
+      resources:
+        limits:
+          cpus: '0.10'
+          memory: 128M
+        reservations:
+          cpus: '0.10'
+          memory: 64M
+
+
+  # Prometheus: a single replica on a manager node, configured from the
+  # prometheus_config config and started with a 1GB TSDB size-retention
+  # setting and the --web.enable-lifecycle flag.
+  prometheus:
+    image: prom/prometheus:v2.12.0
+    networks:
+      - autoscale
+    command: ["--storage.tsdb.retention.size=1GB", "--config.file=/etc/prometheus/prometheus.yml", "--web.console.libraries=/etc/prometheus/console_libraries", "--web.console.templates=/etc/prometheus/consoles", "--web.enable-lifecycle"]
+    configs:
+       - source: prometheus_config
+         target: /etc/prometheus/prometheus.yml
+    deploy:
+      mode: replicated
+      replicas: 1
+      placement:
+        constraints:
+          - node.role == manager
+      resources:
+        limits:
+          cpus: '0.50'
+          memory: 1024M
+        reservations:
+          cpus: '0.50'
+          memory: 128M
diff --git a/compose.yaml b/compose.yaml
@@ -15,6 +15,11 @@ services:
       - frontend
     env_file:
       - ./webserver/.env
+    deploy:
+      labels:
+        swarm.autoscaler: 'true'
+        swarm.autoscaler.minimum: '1'
+        swarm.autoscaler.maximum: '4'
 
   service_mediaserver:
     image: ahmed45123/workup:mediaserver
@@ -55,6 +60,11 @@ services:
       - jobs
     env_file:
       - ./services/jobs/.env
+    deploy:
+      labels:
+        swarm.autoscaler: 'true'
+        swarm.autoscaler.minimum: '1'
+        swarm.autoscaler.maximum: '3'
 
   jobs_db:
     image: cassandra:4.0.7
@@ -80,6 +90,11 @@ services:
       - payments
     env_file:
       - ./services/payments/.env
+    deploy:
+      labels:
+        swarm.autoscaler: 'true'
+        swarm.autoscaler.minimum: '1'
+        swarm.autoscaler.maximum: '3'
 
   payments_db:
     image: postgres:12.18
@@ -106,6 +121,11 @@ services:
       - contracts
     env_file:
       - ./services/contracts/.env
+    deploy:
+      labels:
+        swarm.autoscaler: 'true'
+        swarm.autoscaler.minimum: '1'
+        swarm.autoscaler.maximum: '3'
 
   contracts_db:
     image: cassandra:4.0.7
@@ -128,6 +148,11 @@ services:
       - users
     env_file:
       - ./services/users/.env
+    deploy:
+      labels:
+        swarm.autoscaler: 'true'
+        swarm.autoscaler.minimum: '1'
+        swarm.autoscaler.maximum: '3'
 
   users_db:
     image: mongo:7.0

diff --git a/webserver/src/main/resources/application.properties b/webserver/src/main/resources/application.properties
@@ -4,5 +4,6 @@ spring.rabbitmq.host=${WEBSERVER_MQ_HOST}
 spring.rabbitmq.port=5672
 spring.rabbitmq.username=guest
 spring.rabbitmq.password=guest
+spring.rabbitmq.template.reply-timeout=60000
 
 auth.secret = ${WEBSERVER_SECRET_KEY}