Skip to content

Commit

Permalink
A/P failure testing keycloak#495
Browse files Browse the repository at this point in the history
  • Loading branch information
ryanemerson committed Aug 29, 2023
1 parent 90f6dca commit 1b94694
Show file tree
Hide file tree
Showing 5 changed files with 228 additions and 0 deletions.
10 changes: 10 additions & 0 deletions ansible/roles/benchmark/tasks/install.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,16 @@
- name: Install jq
package: name=jq state=present

- name: Install dig
package: name=bind-utils state=present

- name: Download & install OpenShift client
unarchive:
src: "https://mirror.openshift.com/pub/openshift-v4/clients/oc/latest/linux/oc.tar.gz"
dest: /usr/bin/
remote_src: yes
creates: /usr/bin/oc

- name: Copy keycloak-benchmark {{ kcb_version }} on the remote hosts
copy:
src: "{{ kcb_zip }}"
Expand Down
124 changes: 124 additions & 0 deletions benchmark/src/main/content/bin/kc-failover.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#!/bin/bash
# Script simulating different xsite failover scenarios
set -e

# Enable command tracing for debug runs (RUNNER_DEBUG=1 is set e.g. by
# GitHub Actions when "debug logging" is enabled — confirm against CI config).
if [[ "$RUNNER_DEBUG" == "1" ]]; then
  set -x
fi

# Succeeds (exit 0) once Route53 failover is complete for the given domain:
# client.<domain> resolves to the backup cluster's IPs and no longer matches
# the primary cluster's IPs.
#   $1 - the Route53 domain hosting the client./primary./backup. subdomains.
# Fix: use `local` so the function no longer clobbers the script's global
# DOMAIN/CLIENT_IPS/PRIMARY_IPS/BACKUP_IPS variables; quote expansions; drop
# the redundant trailing `return` (the [[ ]] status is already the result).
function activeClusterDown() {
  local domain=$1
  local client_ips primary_ips backup_ips
  client_ips=$(dig +short "client.${domain}" | sort)
  primary_ips=$(dig +short "primary.${domain}" | sort)
  backup_ips=$(dig +short "backup.${domain}" | sort)

  [[ "${client_ips}" == "${backup_ips}" && "${client_ips}" != "${primary_ips}" ]]
}

# Delete the Keycloak aws-health-route Route; once its health checks fail,
# Route53 will eventually fail over. `|| true` keeps the script alive (set -e)
# when the Route is already gone, so repeated invocations are harmless.
function killHealthRoute() {
  kubectl delete route aws-health-route -n "${PROJECT}" || true
}

# Remove all routes into Keycloak so Route53 fails over from the active to the
# passive cluster and requests to the old DNS IPs start failing.
function killKeycloakRoutes() {
  # Stop the operator first so it cannot recreate the ingress deleted below.
  kubectl scale deployment/keycloak-operator --replicas=0 -n "${PROJECT}"
  kubectl rollout status statefulset/keycloak --watch --timeout=600s -n "${PROJECT}"
  kubectl delete ingress keycloak-ingress -n "${PROJECT}" || true
  killHealthRoute
}

# Simulate a full cluster crash: force-delete every Keycloak and Infinispan
# pod and their StatefulSets. Both operators are scaled to zero first so they
# do not immediately recreate the resources being removed.
function killKeycloakCluster() {
  kubectl scale deployment/infinispan-operator-controller-manager --replicas=0 -n openshift-operators
  kubectl scale deployment/keycloak-operator --replicas=0 -n "${PROJECT}"
  kubectl delete pods --all --force --grace-period=0 -n "${PROJECT}"
  kubectl delete statefulset --all -n "${PROJECT}"
}

# Scale Infinispan and Keycloak Operators so that the original cluster is recreated.
#   $1 - subdomain prefix ("primary" or "backup") used for the recreated
#        aws-health-route host; combined with the global DOMAIN.
# Recreates the aws-health-route Route (so Route53 health checks pass again),
# scales both operators back to 1 replica, waits for the Infinispan and
# Keycloak StatefulSets to become ready, then exits the script — recovery
# never falls through to the failover logic below.
function reviveKeycloakCluster() {
  echo -e "\033[0;31mINFO:$(date '+%F-%T-%Z') Running Recovery scenario - ${RECOVERY_MODE}\033[0m"
  # Unquoted EOF: $1 and ${DOMAIN} are expanded inside the manifest below.
  cat << EOF | kubectl -n ${PROJECT} apply -f -
apiVersion: route.openshift.io/v1
kind: Route
metadata:
  name: aws-health-route
spec:
  host: "$1.${DOMAIN}"
  port:
    targetPort: https
  tls:
    insecureEdgeTerminationPolicy: Redirect
    termination: passthrough
  to:
    kind: Service
    name: keycloak-service
EOF
  kubectl -n openshift-operators scale --replicas=1 deployment/infinispan-operator-controller-manager
  kubectl -n ${PROJECT} scale --replicas=1 deployment/keycloak-operator
  kubectl -n ${PROJECT} rollout status --watch --timeout=600s statefulset/infinispan
  kubectl -n ${PROJECT} rollout status --watch --timeout=600s statefulset/keycloak
  # Terminal: recovery mode intentionally ends the script here.
  exit
}

# Poll DNS until Route53 has switched client.${DOMAIN} over to the backup
# cluster, then report how many seconds the failover took.
function waitForFailover() {
  local start_ts end_ts
  start_ts=$(date +%s)
  while ! activeClusterDown "${DOMAIN}"; do
    sleep 0.1
  done
  end_ts=$(date +%s)

  echo -e "\033[0;31mINFO:$(date '+%F-%T-%Z') Route53 took $(( end_ts - start_ts )) seconds to failover\033[0m"
}

# Thin alias for killKeycloakCluster.
# NOTE(review): not referenced by the dispatch logic at the bottom of this
# script — possibly dead code or retained for external callers; confirm
# before removing.
function clusterFailover() {
  killKeycloakCluster
}

# ---------------------------------------------------------------------------
# Script entry point.
# Required env: DOMAIN, plus one of RECOVERY_MODE / FAILOVER_MODE.
# Optional env: PROJECT (target namespace), FAILOVER_DELAY (seconds).
# ---------------------------------------------------------------------------
# Fix: the original assigned the PROJECT default twice (`: ${PROJECT:=...}`
# followed by `PROJECT=${PROJECT:-...}`); keep a single assignment.
: "${PROJECT:=runner-keycloak}"
: "${FAILOVER_DELAY:=60}"

if [ -z "${RECOVERY_MODE}" ] && [ -z "${FAILOVER_MODE}" ]; then
  echo "RECOVERY_MODE or FAILOVER_MODE env must be defined"
  exit 1
fi

if [ -z "${DOMAIN}" ]; then
  echo "DOMAIN env must be defined"
  exit 1
fi

# Recovery takes precedence over failover; reviveKeycloakCluster exits.
if [ -n "${RECOVERY_MODE}" ]; then
  case "${RECOVERY_MODE^^}" in
    ACTIVE)  reviveKeycloakCluster primary ;;
    PASSIVE) reviveKeycloakCluster backup ;;
    *)
      echo "Unknown RECOVERY_MODE=${RECOVERY_MODE}"
      exit 1
      ;;
  esac
fi

echo -e "\033[0;31mINFO:$(date '+%F-%T-%Z') Entering Failover mode, with an initial delay of ${FAILOVER_DELAY} seconds\033[0m"
sleep "${FAILOVER_DELAY}"
echo -e "\033[0;31mINFO:$(date '+%F-%T-%Z') Running Failover scenario - ${FAILOVER_MODE}\033[0m"

# NOTE(review): these lookups are unused below (activeClusterDown re-resolves
# on every poll) — presumably diagnostic/warm-up; confirm before removing.
CLIENT_IPS=$(dig +short "client.${DOMAIN}" | sort)
PRIMARY_IPS=$(dig +short "primary.${DOMAIN}" | sort)
BACKUP_IPS=$(dig +short "backup.${DOMAIN}" | sort)

case "${FAILOVER_MODE^^}" in
  HEALTH_PROBE) killHealthRoute ;;
  ALL_ROUTES)   killKeycloakRoutes ;;
  CLUSTER_FAIL) killKeycloakCluster ;;
  *)
    # Fix: an unknown mode previously fell through silently and the script
    # hung forever in waitForFailover; fail fast instead.
    echo "Unknown FAILOVER_MODE=${FAILOVER_MODE}"
    exit 1
    ;;
esac

waitForFailover
84 changes: 84 additions & 0 deletions doc/kubernetes/modules/ROOT/pages/util/kc-failover.adoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
= Simulate Keycloak Site Failover
:description: How to automate the simulation of Keycloak site failure.

== Prerequisites:

* A Keycloak instance replicated across two Openshift clusters with Infinispan xsite and an Aurora DB
* Realm, user and client exist with the values required by the benchmark CLI command

== Running the failure test from the CLI

=== Preparations

* Extract the `+keycloak-benchmark-${version}.[zip|tar.gz]+` file
* xref:benchmark-guide::preparing-keycloak.adoc[]
* Make sure your local KUBECONFIG is set to the Openshift cluster which you want to fail.

=== Parameters

The failover script requires the following env variables to be set: `FAILOVER_MODE` and `DOMAIN`.

The `FAILOVER_MODE` determines the type of failover that is initiated by the script and can be one of the following values:
[cols='1,3']
|===
| FAILOVER_MODE | Description

| [.nowrap]`HEALTH_PROBE`
| Deletes the Keycloak aws-health-route so that Route53 will eventually failover.

| [.nowrap]`ALL_ROUTES`
| Deletes all Keycloak routes so that Route53 will eventually failover, but requests to the old DNS IP addresses will fail.
The Keycloak Operator is scaled down to 0 pods to prevent the Keycloak Ingress from being recreated.

| [.nowrap]`CLUSTER_FAIL`
| Deletes all Keycloak and Infinispan pods with no grace period and remove the associated StatefulSet. Both operators are
scaled down to prevent the removed resources from being recreated.
|===

See below for a description of the other environment variables that can be configured.

`DOMAIN` :: *Required*. The Route53 domain hosting the `client.`, `primary.` and `backup.` subdomains.

`FAILOVER_DELAY` :: *Optional*. The delay in seconds to wait before initiating cluster failover. Defaults to 60 seconds.

=== Execution

Use the xref:benchmark-guide::run/running-benchmark-cli.adoc[] guide to simulate load against a specific Kubernetes environment.

In parallel execute below command to initiate failover:

[source,bash]
----
FAILOVER_MODE="ALL_ROUTES" DOMAIN=... ./kc-failover.sh
----

NOTE: In order for the `kc-failover.sh` script to accurately record the time taken for Route53 failover to occur, it's
recommended that the script is executed in the same environment as the Keycloak benchmark scenario.

== Restoring clusters after failover tests
Once a failover benchmark has been executed, it's possible to restore the original cluster state by executing the script
with the `RECOVERY_MODE` env variable set. The value of `RECOVERY_MODE` determines the subdomain that is used to recreate
the `aws-health-route` Route.

=== Parameters

[cols='1,3']
|===
| RECOVERY_MODE | Description

| [.nowrap]`ACTIVE`
| Recreates the `aws-health-route` Route with `primary.${DOMAIN}` URL and scales up the Infinispan and Keycloak operators.

| [.nowrap]`PASSIVE`
| Recreates the `aws-health-route` Route with `backup.${DOMAIN}` URL and scales up the Infinispan and Keycloak operators.
|===


`DOMAIN` :: *Required*. The Route53 domain hosting the `client.`, `primary.` and `backup.` subdomains.

=== Execution

[source,bash]
----
RECOVERY_MODE=ACTIVE DOMAIN=... ./kc-failover.sh
----
1 change: 1 addition & 0 deletions doc/kubernetes/modules/ROOT/partials/util-nav.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@
** xref:util/manual-jfr.adoc[]
** xref:util/task.adoc[]
** xref:util/kc-chaos.adoc[]
** xref:util/kc-failover.adoc[]
9 changes: 9 additions & 0 deletions provision/rosa-cross-dc/Taskfile.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ tasks:
helm upgrade --install keycloak --namespace {{.KC_NAMESPACE_PREFIX}}keycloak
--set hostname={{.KC_HOSTNAME_SUFFIX}}
--set keycloakHostname={{.KC_HOSTNAME_OVERRIDE}}
--set keycloakHealthHostname={{.KC_HEALTH_HOSTNAME}}
--set otel={{.KC_OTEL}}
--set otelSamplingPercentage={{.KC_OTEL_SAMPLING_PERCENTAGE}}
--set dbPoolInitialSize={{.KC_DB_POOL_INITIAL_SIZE}}
Expand Down Expand Up @@ -193,13 +194,17 @@ tasks:
requires:
vars:
- ROSA_CLUSTER_NAME
- KC_HOSTNAME_OVERRIDE
- KC_HEALTH_HOSTNAME
cmds:
- task: create-peering-connection
vars:
ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME}}"
- task: install-keycloak
vars:
ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME}}"
KC_HOSTNAME_OVERRIDE: "{{.KC_HOSTNAME_OVERRIDE}}"
KC_HEALTH_HOSTNAME: "{{.KC_HEALTH_HOSTNAME}}"
- task: wait-cryostat
vars:
ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME}}"
Expand Down Expand Up @@ -266,9 +271,13 @@ tasks:
- task: deploy-keycloak
vars:
ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_1}}"
KC_HOSTNAME_OVERRIDE: "{{.KC_CLIENT_URL}}"
KC_HEALTH_HOSTNAME: "{{.KC_HEALTH_URL_CLUSTER_1}}"
- task: deploy-keycloak
vars:
ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_2}}"
KC_HOSTNAME_OVERRIDE: "{{.KC_CLIENT_URL}}"
KC_HEALTH_HOSTNAME: "{{.KC_HEALTH_URL_CLUSTER_2}}"

undeploy:
desc: "Undeploy Infinispan and Keycloak in a Cross-Site deployment using ROSA clusters"
Expand Down

0 comments on commit 1b94694

Please sign in to comment.