Skip to content

Commit

Permalink
A/P failure testing keycloak#495
Browse files Browse the repository at this point in the history
  • Loading branch information
ryanemerson committed Aug 29, 2023
1 parent 90f6dca commit 1b94694
Show file tree
Hide file tree
Showing 5 changed files with 228 additions and 0 deletions.
10 changes: 10 additions & 0 deletions ansible/roles/benchmark/tasks/install.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,16 @@
- name: Install jq
package: name=jq state=present

- name: Install dig
package: name=bind-utils state=present

- name: Download & install OpenShift client
unarchive:
src: "https://mirror.openshift.com/pub/openshift-v4/clients/oc/latest/linux/oc.tar.gz"
dest: /usr/bin/
remote_src: yes
creates: /usr/bin/oc

- name: Copy keycloak-benchmark {{ kcb_version }} on the remote hosts
copy:
src: "{{ kcb_zip }}"
Expand Down
124 changes: 124 additions & 0 deletions benchmark/src/main/content/bin/kc-failover.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#!/bin/bash
# Script simulating different xsite failover scenarios
set -e

# Enable command tracing for debug runs (RUNNER_DEBUG=1 is set e.g. by
# GitHub Actions when "debug logging" is enabled — confirm against CI config).
if [[ "$RUNNER_DEBUG" == "1" ]]; then
  set -x
fi

# Succeeds (exit 0) once Route53 failover is complete for the given domain:
# client.<domain> resolves to the backup cluster's IPs and no longer matches
# the primary cluster's IPs.
#   $1 - the Route53 domain hosting the client./primary./backup. subdomains.
# Fix: use `local` so the function no longer clobbers the script's global
# DOMAIN/CLIENT_IPS/PRIMARY_IPS/BACKUP_IPS variables; quote expansions; drop
# the redundant trailing `return` (the [[ ]] status is already the result).
function activeClusterDown() {
  local domain=$1
  local client_ips primary_ips backup_ips
  client_ips=$(dig +short "client.${domain}" | sort)
  primary_ips=$(dig +short "primary.${domain}" | sort)
  backup_ips=$(dig +short "backup.${domain}" | sort)

  [[ "${client_ips}" == "${backup_ips}" && "${client_ips}" != "${primary_ips}" ]]
}

# Delete the Keycloak aws-health-route Route; once its health checks fail,
# Route53 will eventually fail over. `|| true` keeps the script alive (set -e)
# when the Route is already gone, so repeated invocations are harmless.
function killHealthRoute() {
  kubectl delete route aws-health-route -n "${PROJECT}" || true
}

# Remove all routes into Keycloak so Route53 fails over from the active to the
# passive cluster and requests to the old DNS IPs start failing.
function killKeycloakRoutes() {
  # Stop the operator first so it cannot recreate the ingress deleted below.
  kubectl scale deployment/keycloak-operator --replicas=0 -n "${PROJECT}"
  kubectl rollout status statefulset/keycloak --watch --timeout=600s -n "${PROJECT}"
  kubectl delete ingress keycloak-ingress -n "${PROJECT}" || true
  killHealthRoute
}

# Simulate a full cluster crash: force-delete every Keycloak and Infinispan
# pod and their StatefulSets. Both operators are scaled to zero first so they
# do not immediately recreate the resources being removed.
function killKeycloakCluster() {
  kubectl scale deployment/infinispan-operator-controller-manager --replicas=0 -n openshift-operators
  kubectl scale deployment/keycloak-operator --replicas=0 -n "${PROJECT}"
  kubectl delete pods --all --force --grace-period=0 -n "${PROJECT}"
  kubectl delete statefulset --all -n "${PROJECT}"
}

# Scale Infinispan and Keycloak Operators so that the original cluster is recreated.
#   $1 - subdomain prefix ("primary" or "backup") used for the recreated
#        aws-health-route host; combined with the global DOMAIN.
# Recreates the aws-health-route Route (so Route53 health checks pass again),
# scales both operators back to 1 replica, waits for the Infinispan and
# Keycloak StatefulSets to become ready, then exits the script — recovery
# never falls through to the failover logic below.
function reviveKeycloakCluster() {
  echo -e "\033[0;31mINFO:$(date '+%F-%T-%Z') Running Recovery scenario - ${RECOVERY_MODE}\033[0m"
  # Unquoted EOF: $1 and ${DOMAIN} are expanded inside the manifest below.
  cat << EOF | kubectl -n ${PROJECT} apply -f -
apiVersion: route.openshift.io/v1
kind: Route
metadata:
  name: aws-health-route
spec:
  host: "$1.${DOMAIN}"
  port:
    targetPort: https
  tls:
    insecureEdgeTerminationPolicy: Redirect
    termination: passthrough
  to:
    kind: Service
    name: keycloak-service
EOF
  kubectl -n openshift-operators scale --replicas=1 deployment/infinispan-operator-controller-manager
  kubectl -n ${PROJECT} scale --replicas=1 deployment/keycloak-operator
  kubectl -n ${PROJECT} rollout status --watch --timeout=600s statefulset/infinispan
  kubectl -n ${PROJECT} rollout status --watch --timeout=600s statefulset/keycloak
  # Terminal: recovery mode intentionally ends the script here.
  exit
}

# Poll DNS until Route53 has switched client.${DOMAIN} over to the backup
# cluster, then report how many seconds the failover took.
function waitForFailover() {
  local start_ts end_ts
  start_ts=$(date +%s)
  while ! activeClusterDown "${DOMAIN}"; do
    sleep 0.1
  done
  end_ts=$(date +%s)

  echo -e "\033[0;31mINFO:$(date '+%F-%T-%Z') Route53 took $(( end_ts - start_ts )) seconds to failover\033[0m"
}

# Thin alias for killKeycloakCluster.
# NOTE(review): not referenced by the dispatch logic at the bottom of this
# script — possibly dead code or retained for external callers; confirm
# before removing.
function clusterFailover() {
  killKeycloakCluster
}

# ---------------------------------------------------------------------------
# Script entry point.
# Required env: DOMAIN, plus one of RECOVERY_MODE / FAILOVER_MODE.
# Optional env: PROJECT (target namespace), FAILOVER_DELAY (seconds).
# ---------------------------------------------------------------------------
# Fix: the original assigned the PROJECT default twice (`: ${PROJECT:=...}`
# followed by `PROJECT=${PROJECT:-...}`); keep a single assignment.
: "${PROJECT:=runner-keycloak}"
: "${FAILOVER_DELAY:=60}"

if [ -z "${RECOVERY_MODE}" ] && [ -z "${FAILOVER_MODE}" ]; then
  echo "RECOVERY_MODE or FAILOVER_MODE env must be defined"
  exit 1
fi

if [ -z "${DOMAIN}" ]; then
  echo "DOMAIN env must be defined"
  exit 1
fi

# Recovery takes precedence over failover; reviveKeycloakCluster exits.
if [ -n "${RECOVERY_MODE}" ]; then
  case "${RECOVERY_MODE^^}" in
    ACTIVE)  reviveKeycloakCluster primary ;;
    PASSIVE) reviveKeycloakCluster backup ;;
    *)
      echo "Unknown RECOVERY_MODE=${RECOVERY_MODE}"
      exit 1
      ;;
  esac
fi

echo -e "\033[0;31mINFO:$(date '+%F-%T-%Z') Entering Failover mode, with an initial delay of ${FAILOVER_DELAY} seconds\033[0m"
sleep "${FAILOVER_DELAY}"
echo -e "\033[0;31mINFO:$(date '+%F-%T-%Z') Running Failover scenario - ${FAILOVER_MODE}\033[0m"

# NOTE(review): these lookups are unused below (activeClusterDown re-resolves
# on every poll) — presumably diagnostic/warm-up; confirm before removing.
CLIENT_IPS=$(dig +short "client.${DOMAIN}" | sort)
PRIMARY_IPS=$(dig +short "primary.${DOMAIN}" | sort)
BACKUP_IPS=$(dig +short "backup.${DOMAIN}" | sort)

case "${FAILOVER_MODE^^}" in
  HEALTH_PROBE) killHealthRoute ;;
  ALL_ROUTES)   killKeycloakRoutes ;;
  CLUSTER_FAIL) killKeycloakCluster ;;
  *)
    # Fix: an unknown mode previously fell through silently and the script
    # hung forever in waitForFailover; fail fast instead.
    echo "Unknown FAILOVER_MODE=${FAILOVER_MODE}"
    exit 1
    ;;
esac

waitForFailover
84 changes: 84 additions & 0 deletions doc/kubernetes/modules/ROOT/pages/util/kc-failover.adoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
= Simulate Keycloak Site Failover
:description: How to automate the simulation of Keycloak site failure.

== Prerequisites:

* A Keycloak instance replicated across two Openshift clusters with Infinispan xsite and an Aurora DB
* Realm, user and client exist with the values required by the benchmark CLI command

== Running the failure test from the CLI

=== Preparations

* Extract the `+keycloak-benchmark-${version}.[zip|tar.gz]+` file
* xref:benchmark-guide::preparing-keycloak.adoc[]
* Make sure your local KUBECONFIG is set to the Openshift cluster which you want to fail.

=== Parameters

The failover script requires the following env variables to be set: `FAILOVER_MODE` and `DOMAIN`.

The `FAILOVER_MODE` determines the type of failover that is initiated by the script and can be one of the following values:
[cols='1,3']
|===
| FAILOVER_MODE | Description

| [.nowrap]`HEALTH_PROBE`
| Deletes the Keycloak aws-health-route so that Route53 will eventually failover.

| [.nowrap]`ALL_ROUTES`
| Deletes all Keycloak routes so that Route53 will eventually failover, but requests to the old DNS IP addresses will fail.
The Keycloak Operator is scaled down to 0 pods to prevent the Keycloak Ingress from being recreated.

| [.nowrap]`CLUSTER_FAIL`
| Deletes all Keycloak and Infinispan pods with no grace period and remove the associated StatefulSet. Both operators are
scaled down to prevent the removed resources from being recreated.
|===

See below for a description of the other environment variables that can be configured.

`DOMAIN` :: *Required*. The Route53 domain hosting the `client.`, `primary.` and `backup.` subdomains.

`FAILOVER_DELAY` :: *Optional*. The delay in seconds to wait before initiating cluster failover. Defaults to 60 seconds.

=== Execution

Use the xref:benchmark-guide::run/running-benchmark-cli.adoc[] guide to simulate load against a specific Kubernetes environment.

In parallel execute below command to initiate failover:

[source,bash]
----
FAILOVER_MODE="ALL_ROUTES" DOMAIN=... ./kc-failover.sh
----

NOTE: In order for the `kc-failover.sh` script to accurately record the time taken for Route53 failover to occur, it's
recommended that the script is executed in the same environment as the Keycloak benchmark scenario.

== Restoring clusters after failover tests
Once a failover benchmark has been executed, it's possible to restore the original cluster state by executing the script
with the `RECOVERY_MODE` env variable set. The value of `RECOVERY_MODE` determines the subdomain that is used to recreate
the `aws-health-route` Route.

=== Parameters

[cols='1,3']
|===
| RECOVERY_MODE | Description

| [.nowrap]`ACTIVE`
| Recreates the `aws-health-route` Route with `primary.${DOMAIN}` URL and scales up the Infinispan and Keycloak operators.

| [.nowrap]`PASSIVE`
| Recreates the `aws-health-route` Route with `backup.${DOMAIN}` URL and scales up the Infinispan and Keycloak operators.
|===


`DOMAIN` :: *Required*. The Route53 domain hosting the `client.`, `primary.` and `backup.` subdomains.

=== Execution

[source,bash]
----
RECOVERY_MODE=ACTIVE DOMAIN=... ./kc-failover.sh
----
1 change: 1 addition & 0 deletions doc/kubernetes/modules/ROOT/partials/util-nav.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@
** xref:util/manual-jfr.adoc[]
** xref:util/task.adoc[]
** xref:util/kc-chaos.adoc[]
** xref:util/kc-failover.adoc[]
9 changes: 9 additions & 0 deletions provision/rosa-cross-dc/Taskfile.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ tasks:
helm upgrade --install keycloak --namespace {{.KC_NAMESPACE_PREFIX}}keycloak
--set hostname={{.KC_HOSTNAME_SUFFIX}}
--set keycloakHostname={{.KC_HOSTNAME_OVERRIDE}}
--set keycloakHealthHostname={{.KC_HEALTH_HOSTNAME}}
--set otel={{.KC_OTEL}}
--set otelSamplingPercentage={{.KC_OTEL_SAMPLING_PERCENTAGE}}
--set dbPoolInitialSize={{.KC_DB_POOL_INITIAL_SIZE}}
Expand Down Expand Up @@ -193,13 +194,17 @@ tasks:
requires:
vars:
- ROSA_CLUSTER_NAME
- KC_HOSTNAME_OVERRIDE
- KC_HEALTH_HOSTNAME
cmds:
- task: create-peering-connection
vars:
ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME}}"
- task: install-keycloak
vars:
ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME}}"
KC_HOSTNAME_OVERRIDE: "{{.KC_HOSTNAME_OVERRIDE}}"
KC_HEALTH_HOSTNAME: "{{.KC_HEALTH_HOSTNAME}}"
- task: wait-cryostat
vars:
ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME}}"
Expand Down Expand Up @@ -266,9 +271,13 @@ tasks:
- task: deploy-keycloak
vars:
ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_1}}"
KC_HOSTNAME_OVERRIDE: "{{.KC_CLIENT_URL}}"
KC_HEALTH_HOSTNAME: "{{.KC_HEALTH_URL_CLUSTER_1}}"
- task: deploy-keycloak
vars:
ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_2}}"
KC_HOSTNAME_OVERRIDE: "{{.KC_CLIENT_URL}}"
KC_HEALTH_HOSTNAME: "{{.KC_HEALTH_URL_CLUSTER_2}}"

undeploy:
desc: "Undeploy Infinispan and Keycloak in a Cross-Site deployment using ROSA clusters"
Expand Down

0 comments on commit 1b94694

Please sign in to comment.