diff --git a/antora/antora-playbook-author.yml b/antora/antora-playbook-author.yml index 2df07945c..cdd8fff8a 100644 --- a/antora/antora-playbook-author.yml +++ b/antora/antora-playbook-author.yml @@ -56,5 +56,5 @@ asciidoc: page-pagination: '' experimental: '' github-files: https://github.com/keycloak/keycloak-benchmark/blob/main - keycloak: Keycloak + project_name: Keycloak ispn: Infinispan diff --git a/antora/antora-playbook.yml b/antora/antora-playbook.yml index 755153474..f038ff44b 100644 --- a/antora/antora-playbook.yml +++ b/antora/antora-playbook.yml @@ -56,5 +56,5 @@ asciidoc: page-pagination: '' experimental: '' github-files: https://github.com/keycloak/keycloak-benchmark/blob/main - keycloak: Keycloak + project_name: Keycloak ispn: Infinispan diff --git a/doc/kubernetes/modules/ROOT/images/aurora/aurora-multi-az.dio.svg b/doc/kubernetes/modules/ROOT/images/aurora/aurora-multi-az.dio.svg deleted file mode 100644 index b1973784a..000000000 --- a/doc/kubernetes/modules/ROOT/images/aurora/aurora-multi-az.dio.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - -
AWS Region
AWS Region
Availability Zone
Availability Zone
Aurora
Writer
Aurora...
Availability Zone
Availability Zone
Aurora
Reader
Aurora...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/doc/kubernetes/modules/ROOT/images/crossdc/active-passive-sync.dio.svg b/doc/kubernetes/modules/ROOT/images/crossdc/active-passive-sync.dio.svg deleted file mode 100644 index 64e01704b..000000000 --- a/doc/kubernetes/modules/ROOT/images/crossdc/active-passive-sync.dio.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - -
Secondary Datacenter (passive)
Secondary Datacenter (passive)
Primary Datacenter (active)
Primary Datacenter (active)
Keycloak
Keycloak
Infinispan
Infinispan
Browser
Browser
Infinispan
Infinispan
Keycloak
Keycloak
Load Balancer
Load Balancer
Communication path
after failover / switchover 
Communication path...
<<sync>>
<<sync>>
Synchronously
replicated
Database
Synchronously...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/doc/kubernetes/modules/ROOT/images/crossdc/infinispan-crossdc-az.dio.svg b/doc/kubernetes/modules/ROOT/images/crossdc/infinispan-crossdc-az.dio.svg deleted file mode 100644 index 66b1b7036..000000000 --- a/doc/kubernetes/modules/ROOT/images/crossdc/infinispan-crossdc-az.dio.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - -
Primary datacenter (active)
Primary datacenter (active)
Kubernetes Cluster
Kubernetes Cluster
«Pod»
Infinispan
«Pod»...
«Pod»
GossipRouter
«Pod»...
Secondary datacenter (passive)
Secondary datacenter (passive)
Kubernetes Cluster
Kubernetes Cluster
«Pod»
GossipRouter
«Pod»...
«Pod»
Infinispan
«Pod»...
Communication path
after failover / switchover 
of both Keycloak and Aurora
Communication path...
Text is not SVG - cannot display
\ No newline at end of file diff --git a/doc/kubernetes/modules/ROOT/images/route53/route53-multi-az-failover.svg b/doc/kubernetes/modules/ROOT/images/route53/route53-multi-az-failover.svg deleted file mode 100644 index 822fc80de..000000000 --- a/doc/kubernetes/modules/ROOT/images/route53/route53-multi-az-failover.svg +++ /dev/null @@ -1,4 +0,0 @@ - - - -
AWS Region
AWS Region
«Local»
Browser
«Local»...
Availability Zone
Availability Zone
ROSA Primary Cluster
ROSA Primary Cluster
Client
Route
Client...
«Pod»
Keycloak
«Pod»...
Health
Route
Health...
Availability Zone
Availability Zone
AWS Route53
AWS Route53
ROSA Backup Cluster
ROSA Backup Cluster
Health
Route
Health...
«Pod»
Keycloak
«Pod»...
Client
Route
Client...
Client Requests
Client Requests
Text is not SVG - cannot display
\ No newline at end of file diff --git a/doc/kubernetes/modules/ROOT/nav.adoc b/doc/kubernetes/modules/ROOT/nav.adoc index bdd70281c..cdf3d40b7 100644 --- a/doc/kubernetes/modules/ROOT/nav.adoc +++ b/doc/kubernetes/modules/ROOT/nav.adoc @@ -16,23 +16,6 @@ ** xref:openshift/installation-infinispan.adoc[] ** xref:openshift/cross-site-rosa.adoc[] * xref:running/index.adoc[] -** xref:running/index.adoc#overview[Overviews] -*** xref:running/deployments/active-passive-sync.adoc[] -** xref:running/index.adoc#building-blocks[Building Blocks] -*** xref:running/keycloak-deployment.adoc[] -*** xref:running/keycloak-with-external-infinispan.adoc[] -*** xref:running/infinispan-deployment.adoc[] -*** xref:running/infinispan-crossdc-deployment.adoc[] -*** xref:running/aurora-multi-az.adoc[] -*** xref:running/loadbalancing.adoc[] -** xref:running/index.adoc#operational[Operational Procedures] -*** xref:running/fail-over.adoc[] -*** xref:running/switch-over.adoc[] -*** xref:running/network-partition.adoc[] -*** xref:running/switch-back.adoc[] -** xref:running/concepts/index.adoc[] -*** xref:running/concepts/database-connections.adoc[] -*** xref:running/concepts/threads.adoc[] * xref:customizing-deployment.adoc[] * xref:storage-configurations.adoc[] ** xref:storage/postgres.adoc[] diff --git a/doc/kubernetes/modules/ROOT/pages/running/aurora-multi-az.adoc b/doc/kubernetes/modules/ROOT/pages/running/aurora-multi-az.adoc deleted file mode 100644 index 3957599dd..000000000 --- a/doc/kubernetes/modules/ROOT/pages/running/aurora-multi-az.adoc +++ /dev/null @@ -1,58 +0,0 @@ -= Aurora PostgreSQL: Multiple Availability Zone Deployment -:description: This guide describes the procedures required to deploy an AWS Aurora PostgreSQL database across multiple AWS \ -availability zones. 
- -{description} - -== Audience - -This guide describes how to deploy an Aurora PostgreSQL instance across multiple availability-zones in order to -tolerate one or more availability-zone failures in a given AWS region. - -== Architecture -Aurora DB clusters consist of multiple Aurora DB instances, with one instance designated as the primary writer and all -others as backup readers. To ensure high-availability in the event of availability zone failures, Aurora allows DB instances -to be deployed across multiple zones in a single AWS region. In the event of a failure on the availability-zone hosting -the Primary DB instance, Aurora automatically heals itself and promotes a reader instance from a non-failed availability-zone -to be the new writer instance. - -.Aurora Multiple Availability Zone Deployment -image::aurora/aurora-multi-az.dio.svg[] - -See the https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/CHAP_AuroraOverview.html[AWS Aurora documentation] for more details on the semantics provided by Aurora DBs. - -NOTE: This guide follows AWS best-practices and creates a private Aurora DB that is not exposed over the internet. In order -to access the DB from a ROSA cluster it's necessary to <>. - -== Procedure -The following procedure is split into two parts. First, we create an Aurora Multi-AZ DB cluster with the name "keycloak-aurora" -in eu-west-1. Then we create a peering-connection between our ROSA cluster(s) and the Aurora VPC to allow applications -deployed on the ROSA clusters to be able to establish connections with the DB. - -=== Create Aurora DB Cluster - -include::partial$aurora/aurora-multiaz-create-procedure.adoc[] - -=== Establish Peering Connections with ROSA clusters - -The following steps must be repeated for each ROSA cluster which contains a {keycloak} deployment. 
- -include::partial$aurora/aurora-create-peering-connections.adoc[] - -== Verifying the connection - -include::partial$aurora/aurora-verify-peering-connections.adoc[] - -== Deploying {keycloak} - -Now that an Aurora DB has been established and linked with all of your ROSA clusters, the next step is to -xref:running/keycloak-deployment.adoc[Deploy {keycloak}] with the JDBC url configured to use the Aurora DB writer -endpoint. To do this we create a `{keycloak}` CR outlined in the xref:running/keycloak-deployment.adoc[] guide, however -we modify the following elements: - - -. Update `spec.db.url` to be `jdbc:postgresql://$HOST:5432/keycloak` where `$HOST` is the -<>. - -. Ensure that the Secrets referenced by `spec.db.usernameSecret` and `spec.db.passwordSecret` contain usernames and -passwords defined when creating Aurora. diff --git a/doc/kubernetes/modules/ROOT/pages/running/aurora-peering-connections.adoc b/doc/kubernetes/modules/ROOT/pages/running/aurora-peering-connections.adoc deleted file mode 100644 index 9d6566ad9..000000000 --- a/doc/kubernetes/modules/ROOT/pages/running/aurora-peering-connections.adoc +++ /dev/null @@ -1,32 +0,0 @@ -= Aurora DB and ROSA Peering Connection -:description: This describes how to allow connections between a ROSA cluster and a private Aurora DB cluster. - -{description} - -== Audience -This guide describes how to allow network traffic between ROSA clusters and a private Aurora DB cluster, so that -{keycloak} can establish a connection with the DB. - - -== Architecture -The VPC created for the Aurora DB is not exposed over the internet, therefore it's necessary for us to create a https://docs.aws.amazon.com/vpc/latest/peering/what-is-vpc-peering.html[VPC -peering connection] between each ROSA cluster's VPC and the Aurora DB VPC. If this connection is not made, it will not -be possible for your {keycloak} deployment to establish a connection with the DB. 
- -IMPORTANT: Multiple ROSA clusters connecting to the same Aurora DB VPC cannot have overlapping machine-cidr ranges as -will prevent Aurora responses from being routed to the ROSA cluster initiating a request. - - -== Prerequisites -* At least one ROSA cluster -* Aurora DB cluster - - -== Procedure -The following procedure must be executed for all ROSA clusters which need to connect to the Aurora DB cluster. - -include::partial$aurora/aurora-create-peering-connections.adoc[] - - -== Verifying the connection -include::partial$aurora/aurora-verify-peering-connections.adoc[] diff --git a/doc/kubernetes/modules/ROOT/pages/running/concepts/database-connections.adoc b/doc/kubernetes/modules/ROOT/pages/running/concepts/database-connections.adoc deleted file mode 100644 index 37c55b384..000000000 --- a/doc/kubernetes/modules/ROOT/pages/running/concepts/database-connections.adoc +++ /dev/null @@ -1,20 +0,0 @@ -= Configuration of database connection pools -:navtitle: Database connection pools -:description: This describes the reasoning behind configuring database connection pools for {keycloak}. - -{description} - -== Audience - -Read this page to understand considerations and best practices on how to configure database connection pools for {keycloak}. -For a configuration where this is applied, visit xref:running/keycloak-deployment.adoc[]. - -== Concepts - -Creating new database connections is expensive as it takes time. -Creating them when a request arrives will delay the response, so it is good to have them created before the request arrives. -It can also contribute to a https://en.wikipedia.org/wiki/Cache_stampede[stampede effect] where creating a lot of connections in a short time makes things worse as it slows down the system and blocks threads. -Closing a connection also invalidates all server side statements caching for that connection. 
- -include::partial$configure-db-connection-pool-best-practices.adoc[] - diff --git a/doc/kubernetes/modules/ROOT/pages/running/concepts/index.adoc b/doc/kubernetes/modules/ROOT/pages/running/concepts/index.adoc deleted file mode 100644 index 15384f8ec..000000000 --- a/doc/kubernetes/modules/ROOT/pages/running/concepts/index.adoc +++ /dev/null @@ -1,12 +0,0 @@ -= Concepts on running {keycloak} in production -:navtitle: Concepts -:description: This lists different concepts applied to configuring {keycloak} in production. -:page-aliases: load-behavior.adoc - -{description} - -If you're looking for example configurations, have a look at xref:./../index.adoc[] instead. - -* xref::./database-connections.adoc[] -* xref::./threads.adoc[] - diff --git a/doc/kubernetes/modules/ROOT/pages/running/concepts/threads.adoc b/doc/kubernetes/modules/ROOT/pages/running/concepts/threads.adoc deleted file mode 100644 index 3578d7a26..000000000 --- a/doc/kubernetes/modules/ROOT/pages/running/concepts/threads.adoc +++ /dev/null @@ -1,67 +0,0 @@ -= Configuration of thread pools -:navtitle: Thread Pools -:description: This describes the reasoning behind configuring thread pools for {keycloak}. - -{description} - -== Audience - -Read this page to understand considerations and best practices on how to configure thread pools connection pools for {keycloak}. -For a configuration where this is applied, visit xref:running/keycloak-deployment.adoc[]. - -== Concepts - -=== Quarkus executor pool - -{keycloak} requests are handled on the Quarkus executor pool, as well as all liveness and readiness probes. - -The Quarkus executor thread pool is configured in https://quarkus.io/guides/all-config#quarkus-core_quarkus.thread-pool.max-threads[`quarkus.thread-pool.max-threads`] and has a maximum size of at least 200 threads. -Depending on the available CPU cores, it can grow even larger. -Threads are created as needed, and will end when no longer needed, so the system will scale up and down as needed. 
- -When the load and the number of threads increases, the bottleneck will usually be the database connections. -Once a request can't acquire a database connection, it will fail with a message in the log like `Unable to acquire JDBC Connection` or similar as described in xref:error-messages.adoc#keycloak-message-error-failed-to-obtain-jdbc-connection[the known error messages]. -The caller will receive a response with a 5xx HTTP status code indicating a server side error. - -With the number of threads in the executor pool being an order of magnitude larger than the number of database connections and with requests failing when no database connection is available within the https://quarkus.io/guides/all-config#quarkus-agroal_quarkus.datasource.jdbc.acquisition-timeout[`quarkus.datasource.jdbc.acquisition-timeout`] (5 seconds default), this is somewhat of a https://en.wikipedia.org/wiki/Demand_response#Load_shedding[load-shedding behavior] where it returns an error response instead of queueing requests for an indefinite amount of time. - -=== JGroup connection pool - -The combined number of executor threads in all {keycloak} nodes in the cluster shouldn't exceed the number of threads available in JGroups thread pool to avoid the error described in -xref:kubernetes-guide::error-messages.adoc#jgroups-thread-pool-is-full[`org.jgroups.util.ThreadPool: thread pool is full`]. -To see the error the first time it happens, the system property `jgroups.thread_dumps_threshold` needs to be set to `1`, as otherwise the message will appear only after 10000 threads have been rejected. - --- -include::partial$executor-jgroups-thread-calculation.adoc[] --- - -Use the metrics `vendor_jgroups_tcp_get_thread_pool_size` to monitor the total JGroup threads in the pool and `vendor_jgroups_tcp_get_thread_pool_size_active` for the threads active in the pool. 
-This is useful to monitor that limiting the Quarkus thread pool size keeps the number of active JGroup threads below the maximum JGroup thread pool size. - -[#load-shedding] -=== Load Shedding - -By default, {keycloak} will queue all incoming requests infinitely, even if the request processing stalls. -This will use additional memory in the Pod, can exhaust resources in the load balancers, and the requests will eventually time out on the client side without the client knowing if the request has been processed. -To limit the number of queued requests in {keycloak}, set an additional Quarkus configuration option. - -Configure `quarkus.thread-pool.queue-size` to specify a maximum queue length to allow for effective load shedding once this queue size is exceeded: {keycloak} will return HTTP Status code 500 (server error). -Assuming a {keycloak} Pod processes around 200 requests per second, a queue of 1000 would lead to maximum waiting times of around 5 seconds. - -When this setting is active, requests that exceed the number of queued requests will return with an HTTP 500 error. -{keycloak} logs the error message in its log. - -Future version of {keycloak} will have better means to handle that: https://github.com/keycloak/keycloak/issues/23340[keycloak#23340]. - -[#probes] -=== Probes - -All health probes, including liveness and readiness probes, are handled in the Quarkus executor worker pool by default. -Starting with Keyloak 22.0.5, there is a workaround in place to have the liveness probe non-blocking (see: https://github.com/keycloak/keycloak/issues/22109[keycloak#22109]). -Future version of {keycloak} and Quarkus plan to have other probes also being non-blocking. - -=== OS Resources - -In order for Java to create threads, when running on Linux it needs to have file handles available. -Therefore, the number of open files (as retrieved as `ulimit -n` on Linux) need to provide head-space for {keycloak} to increase the number of threads needed. 
-Each thread will also consume memory, and the container memory limits need to be set to a value that allows for this or the Pod will be killed by Kubernetes. diff --git a/doc/kubernetes/modules/ROOT/pages/running/deployments/active-passive-sync.adoc b/doc/kubernetes/modules/ROOT/pages/running/deployments/active-passive-sync.adoc deleted file mode 100644 index 549c34c10..000000000 --- a/doc/kubernetes/modules/ROOT/pages/running/deployments/active-passive-sync.adoc +++ /dev/null @@ -1,231 +0,0 @@ -= HA-{keycloak} active/passive with synchronous replication -:navtitle: Active/passive with sync replication -:description: This concept describes a highly available active/passive setup and the behavior to expect. - -{description} - -In this guide, we outline the requirements of the HA active/passive architecture, before exploring its benefits and tradeoffs. - -After summarizing the architecture see <> with the links to blueprints for each building block. - -== Architecture - -=== When to use this setup - -Use this setup to be able to fail over automatically in the event of a datacenter failure, and reduce the likelihood to lose data or sessions. - -Manual interactions are usually required to restore the redundancy after the failover. - -=== Deployment, data storage and caching - -Two independent {keycloak} deployments running in different datacenters connected with a low latency network connection. -Entities like users, realms and clients and also offline sessions are stored in a database which is running synchronously replicated database across the two datacenters. The data is also cached in {keycloak}'s embedded {ispn} as local caches. -When the data is changed in one {keycloak} instance, it is updated in the database, and an invalidation message is sent to the other datacenter using the replicated `work` cache. 
- -Session-related data is stored in the replicated caches of the embedded {ispn} of {keycloak}, and forwarded to the external {ispn}, which forwards information to the external {ispn} running synchronously in the other datacenter. -As session data of the external {ispn} is also cached in the embedded {ispn}, invalidation messages of the replicated `work` cache are needed for invalidation. - -In the following paragraphs and diagrams, when talking about deploying {ispn}, this always refers to the external {ispn}. - -image::crossdc/active-passive-sync.dio.svg[] - -=== Causes of data and service loss - -While this setup aims for high availability, the following situations can still lead to service or data loss: - -* Network failures between the datacenters or failures of components can lead to short service downtimes while those failures are detected. -The service will be restored automatically. -The system is degraded until the failures are detected and the backup cluster is promoted to service requests. - -* Once failures occur in the communication between the datacenters, manual steps are necessary to re-synchronize a degraded setup. -Future versions of {keycloak} and {ispn} plan to reduce those manual operations. - -* Degraded setups can lead to service or data loss if additional components fail. -Monitoring is necessary to detect degraded setups. - -=== Failures this setup can survive - -[%autowidth] -|=== -| Failure | Recovery | RPO^1^ | RTO^2^ - -| Database node -| If the writer instance fails, the database can promote a reader instance in the same or other datacenter to be the new writer. -| No data loss -| Seconds to minutes (depending on the database) - -| {keycloak} node -| Multiple {keycloak} instances run in each datacenter. If one instance fails, it takes a few seconds for the other nodes to notice the change, and some incoming requests might receive an error message or are delayed for some seconds. 
-| No data loss -| Less than one minute - -| {ispn} node -| Multiple {ispn} instances run in each datacenter. If one instance fails, it takes a few seconds for the other nodes to notice the change. Sessions are stored in at least two {ispn} nodes, so a single node failure doesn't lead to data loss. -| No data loss -| Less than one minute - -| {ispn} cluster failure -| If the {ispn} cluster fails in the active datacenter, {keycloak} won't be able to communicate with the external {ispn}, and the {keycloak} service will be unavailable. -Manual switchover to the secondary datacenter is recommended. -Future versions will detect this situation and do an automatic failover. - -When the {ispn} cluster is restored, its data will be out-of-sync with {keycloak}. -Manual operations are required to get {ispn} in the primary datacenter in sync with the secondary datacenter. -| Loss of service -| Human intervention required - -| Connectivity {ispn} -| If the connectivity between the two datacenters is lost, session information can't be sent to the other datacenter. -Incoming requests might receive an error message or are delayed for some seconds. -The primary site marks the secondary site offline, and will stop sending data to the secondary. -The setup is degraded until the connection is restored and the session data is re-synchronized to the secondary datacenter. -| No data loss ^3^ -| Less than one minute - -| Connectivity Database -| If the connectivity between the two datacenters is lost, the synchronous replication will fail, and it might take some time for the primary site to mark the secondary offline. -Some requests might receive an error message or are delayed for some seconds. -Manual operations might be necessary depending on the database. -| No data loss ^3^ -| Seconds to minutes (depending on the database) - -| Primary Datacenter -| If none of the {keycloak} nodes are available, the loadbalancer will detect the outage and redirect the traffic to the secondary site. 
-Some requests might receive an error message while the loadbalancer hasn't detected the primary datacenter failure. -The setup will be degraded until the primary site is back up and the session state has been manually synced from the secondary to the primary site. -| No data loss^3^ -| Less than one minute - -| Secondary Datacenter -| If the secondary datacenter is not available, it will take a moment for the primary {ispn} and database to mark the secondary datacenter offline. -Some requests might receive an error message while the detection takes place. -Once the secondary datacenter is up again, the session state needs to be manually synced from the primary site to the secondary site. -| No data loss^3^ -| Less than one minute - -|=== - -^1^: Recovery point objective, assuming all parts of the setup were healthy at the time this occurred. + -^2^: Recovery time objective. + -^3^: Manual operations needed to restore the degraded setup. - -The statement "`No data loss`" depends on the setup not being degraded from previous failures, which includes completing any pending manual operations to resynchronize the state between the datacenters. - -=== Known limitations - -==== Upgrades - -* On {keycloak} or {ispn} version upgrades ((major, minor and patch), all session data (except offline session) will be lost as neither supports zero downtime upgrades. - -==== Failovers - -* A successful failover requires a setup not degraded from previous failures. -All manual operations like a re-synchronization after a previous failure must be complete to preventdata loss. -Use monitoring to ensure degradations are detected and handled in a timely manner. - -==== Switchovers - -* A successful switchover requires a setup not degraded from previous failures. -All manual operations like a re-synchronization after a previous failure must be complete to preventdata loss. -Use monitoring to ensure degradations are detected and handled in a timely manner. 
- -==== Out-of-sync datacenters - -* The datacenters can become out of sync when a synchronous {ispn} request fails. -This is currently difficult to monitor, and it would need a full manual re-sync of {ispn} to recover. -Monitoring the number of cache entries in both datacenters and {keycloak}'s log file can show when this would become necessary. -Future versions of {keycloak} and {ispn} plan to automate this. - -==== Manual operations - -* Manual operations that re-synchronize the {ispn} state between the datacenters will issue a full state transfer which will put a stress on the system (network, CPU, Java heap in {ispn} and {keycloak}). - -=== Questions and answers - -Why a synchronous database?:: -A synchronously replicated database ensures that data written in the primary datacenter is always available in the secondary datacenter on failover and no data is lost. - -Why a synchronous {ispn} replication?:: -A synchronously replicated {ispn} ensures that sessions created, updated and deleted in the primary datacenter are always available in the secondary datacenter on failover and no data is lost. - -Why is a low-latency network between datacenters needed?:: -Synchronous replication defers the response to the caller until the data is received at the secondary datacenter. -For a synchronous database replication and a synchronous {ispn} replication, a low latency is necessary as each request can have potentially multiple interactions between the datacenters when data is updated which would amplify the latency. - -Why active-passive?:: -Some databases support a single writer instance with a reader instance which is then promoted to be the new writer once the original writer fails. -In such a setup, it is beneficial for the latency to have the writer instance in the same datacenter as the currently active {keycloak}. -Synchronous {ispn} replication can lead to deadlocks when entries in both datacenters are modified concurrently. 
- -Is this setup limited to two datacenters?:: -This setup could be extended to multiple datacenters, and there are no fundamental changes necessary to have, for example, three datacenters. Once more datacenters are added, the overall latency between the datacenters increases, and the likeliness of network failures, and therefore short downtimes, increases as well. -Therefore, such a deployment is expected to have worse performance and an inferior. -For now, it has been tested and documented with blueprints only for two datacenters. - -Is a synchronous cluster less stable than an asynchronous cluster?:: -An asynchronous setup would handle network failures between the datacenter gracefully, while the synchronous setup would delay requests and will throw errors to the caller where the asynchronous setup would have deferred the writes to the secondary datacenter. -But as the secondary site would never be fully up to date with the primary site, this could lead to data loss during failovers. -This would include: -+ --- -* Lost logouts (sessions are still logged in the secondary datacenter that logged out in the primary datacenter at the point of failover when using an asynchronous {ispn} replication of sessions). -* Lost changes leading to users being able to log in with their old password (database changes not replicated to secondary datacenter at the point of failover when using an asynchronous database). -* Invalid caches leading to users being able to log in with their old password (invalidations of caches are not propagated at the point of failover to the secondary datacenter when using an asynchronous {ispn} replication). --- -+ -So there is effectively a tradeoff between availability and consistency. -For now, we've considered to rank consistency higher than availability with {keycloak}. - -[#building-blocks] -== Building blocks - -The following building blocks are needed to set up the architecture described above. 
-Each building block links to a blueprint with an example configuration. -They are listed in the order in which they need to be installed. - -=== Two datacenters with low-latency connection - -Ensures that synchronous replication is available for both the database and the external {ispn}. - -*Blueprint:* Two AWS Availablity Zones within the same AWS Region. - -*Not considered:* Two regions on the same or different continents, as it would increase the latency and the likelihood of network failures. -Synchronous replication of databases as a services with Aurora Regional Deployments on AWS is only available within the same region. - -=== Environment for {keycloak} and {ispn} - -Ensures that the instances are deployed and restarted as needed. - -*Blueprint:* Red Hat OpenShift Service on AWS (ROSA) deployed in each availability zone. - -*Not considered:* A stretched ROSA cluster which spans multiple availability zones, as this could be a single point of failure if misconfigured. - -=== Database - -A synchronously replicated database across two datacenters. - -*Blueprint:* xref::running/aurora-multi-az.adoc[Amazon Aurora PostgreSQL Regional Deployment spanning two availability zones, connected to ROSA] - -=== {ispn} - -An {ispn} deployment which leverages the {ispn}'s Cross-DC functionality. - -*Blueprint:* xref::running/infinispan-crossdc-deployment.adoc[Deploy {ispn} using the {ispn} Operator on ROSA, and connect the two datacenters using {ispn}'s Gossip Router]. - -*Not considered:* Direct interconnections between the OpenShift clusters on the network layer. -It might be considered in the future. - -=== Loadbalancer - -A loadbalancer which checks the `/health/live` URl of the {keycloak} deployment in each datacenter. - -*Blueprint:* xref:running/loadbalancing.adoc[]. - -*Not considered:* AWS Global Accelerator connecting to Red Hat OpenShift Service on AWS (ROSA) as it supports only weighted traffic routing and not active-passive failover. 
-To support active-passive failover, additional logic using, for example, AWS CloudWatch and AWS Lambda would be necessary to simulate the active-passive handling by adjusting the weights when the probes fail. - -=== {keycloak} - -A clustered deployment of {keycloak} in each datacenter, connected to an external {ispn}. - -*Blueprint:* xref::running/keycloak-deployment.adoc[Deploy {keycloak} using the {keycloak} Operator on ROSA], and xref::running/keycloak-with-external-infinispan.adoc[connect it to the external {ispn}] and the Aurora database. diff --git a/doc/kubernetes/modules/ROOT/pages/running/fail-over.adoc b/doc/kubernetes/modules/ROOT/pages/running/fail-over.adoc deleted file mode 100644 index c06e0fa35..000000000 --- a/doc/kubernetes/modules/ROOT/pages/running/fail-over.adoc +++ /dev/null @@ -1,22 +0,0 @@ -= Failover to Secondary Site -:description: This guide describes the steps to fail over from primary site to secondary site. - -include::partial$running/infinispan-attributes.adoc[] - -{description} - -== Audience - -A failover from the primary site to the secondary site will happen automatically based on the checks configured in the loadbalancer. - -When the primary site loses its state in {ispn} or a network partition occurs that prevents the synchronization, manual procedures are necessary to recover the primary site before it can handle traffic again, see xref:./switch-back.adoc[]. - -To prevent an automatic fallback to the primary site before those manual steps have performed, configure the loadbalancer as described below to prevent this from happening automatically. - -See xref:running/index.adoc[] for additional guides. - -== Procedure - -=== Route53 - -To force Route53 to mark the primary site as permanently not available and prevent an automatic fallback, edit the health check in AWS to point to a non-existent route (`health/down`). 
diff --git a/doc/kubernetes/modules/ROOT/pages/running/index.adoc b/doc/kubernetes/modules/ROOT/pages/running/index.adoc index e0ea9016e..630bc9797 100644 --- a/doc/kubernetes/modules/ROOT/pages/running/index.adoc +++ b/doc/kubernetes/modules/ROOT/pages/running/index.adoc @@ -1,30 +1,15 @@ = Running in production -:description: This summarizes different topics on how to run {keycloak} in production. +:description: This summarizes different topics on how to run {project_name} in production. +:page-aliases: running/deployments/active-passive-sync.adoc, running/aurora-multi-az.adoc, running/loadbalancing.adoc, running/infinispan-crossdc-deployment.adoc, running/keycloak-with-external-infinispan.adoc, running/keycloak-deployment.adoc, running/fail-over.adoc, running/switch-over.adoc, running/network-partition.adoc, running/switch-back.adoc, running/concepts/database-connections.adoc, running/concepts/threads.adoc, running/concepts/index.adoc {description} It summarizes the logic which is condensed in the Helm charts and scripts in this project to make it accessible as independent knowledge to adapt it to other environments. -[#overview] -== Overview of different configurations - -* xref:running/deployments/active-passive-sync.adoc[] +IMPORTANT: Most of the guides are now available as the High availability guides on https://www.keycloak.org/high-availability/introduction[Keycloak's main website]. +They were removed from this site once they had been published as part of the Keycloak 23 release. 
[#building-blocks] -== Building Blocks +== Building blocks not yet published on keycloak.org -* xref:running/keycloak-deployment.adoc[] -* xref:running/keycloak-with-external-infinispan.adoc[] * xref:running/infinispan-deployment.adoc[] -* xref:running/infinispan-crossdc-deployment.adoc[] -* xref:running/aurora-multi-az.adoc[] -* xref:running/loadbalancing.adoc[] - -[#operational] -== Operational Procedures - -* xref:running/fail-over.adoc[] -* xref:running/switch-over.adoc[] -* xref:running/network-partition.adoc[] -* xref:running/switch-back.adoc[] -See xref:./concepts/index.adoc[] to learn more about the concepts behind this. diff --git a/doc/kubernetes/modules/ROOT/pages/running/infinispan-crossdc-deployment.adoc b/doc/kubernetes/modules/ROOT/pages/running/infinispan-crossdc-deployment.adoc deleted file mode 100644 index 764a4da43..000000000 --- a/doc/kubernetes/modules/ROOT/pages/running/infinispan-crossdc-deployment.adoc +++ /dev/null @@ -1,203 +0,0 @@ -= {ispn} Deployment: Multiple Clusters -:description: This describes how to deploy {ispn} with cross-site enabled. - -include::partial$running/infinispan-attributes.adoc[] - -{description} - -== Audience - -This guide describes the procedures required to deploy {ispn} in a multiple-cluster environment (cross-site). -For simplicity, this guide uses the minimum configuration possible that allows {keycloak} to be used with an external {ispn}. - -This guide assumes two {ocp} clusters named `{site-a}` and `{site-b}`. - -See xref:running/index.adoc[] for additional guides. - -== Architecture - -This deploys two synchronously replicating {ispn} clusters in two datacenters with a low-latency network connection. -This could be, for example, two availability zones in one AWS region. - -{keycloak}, loadbalancer and database have been removed from the following diagram for simplicity. 
- -image::crossdc/infinispan-crossdc-az.dio.svg[] - -== Prerequisites - -include::partial$running/infinispan-prerequisites.adoc[] - -== Procedure - -include::partial$running/infinispan-install-operator.adoc[] -include::partial$running/infinispan-credentials.adoc[] -+ -These commands must be executed on both {ocp} clusters. - -. Create a service account. -+ -A service account is required to establish a connection between clusters. -The {ispn-operator} uses it to inspect the network configuration from the remote site and to configure the local {ispn} cluster accordingly. -+ -For more details check {operator-docs}#managed-cross-site-connections_cross-site[Managing Cross-Site Connections] documentation. -+ - -.. First, create the service account and generate an access token in both {ocp} clusters. -+ -.Create the service account in `{site-a}` -[source,bash,subs="+attributes"] ----- -kubectl create sa -n {ns} {sa} -kubectl policy add-role-to-user view -n {ns} -z {sa} -kubectl create token -n {ns} {sa} > {site-a}-token.txt ----- -+ -.Create the service account in `{site-b}` -[source,bash,subs="+attributes"] ----- -kubectl create sa -n {ns} {sa} -kubectl policy add-role-to-user view -n {ns} -z {sa} -kubectl create token -n {ns} {sa} > {site-b}-token.txt ----- -+ -.. The next step is to deploy the token from `{site-a}` into `{site-b}` and vice-versa -+ -.Deploy `{site-b}` token into `{site-a}` -[source,bash,subs="+attributes"] ----- -kubectl create secret generic -n {ns} {sa-secret} \ - --from-literal=token="$(cat {site-b}-token.txt)" ----- -+ -.Deploy `{site-a}` token into `{site-b}` -[source,bash,subs="+attributes"] ----- -kubectl create secret generic -n {ns} {sa-secret} \ - --from-literal=token="$(cat {site-a}-token.txt)" ----- - -. Create TLS secrets -+ -In this guide {ispn} uses an {ocp} Route for the cross-site communication. -It uses the SNI extension of TLS to direct the traffic to the correct Pods. 
-To achieve that, JGroups use TLS sockets, which require a Keystore and Truststore with the correct certificates. -+ -For more information check {operator-docs}#securing-cross-site-connections_cross-site[Securing Cross Site Connections] documentation or this https://developers.redhat.com/learn/openshift/cross-site-and-cross-applications-red-hat-openshift-and-red-hat-data-grid[Red Hat Developer Guide]. -+ -Upload the Keystore and the Truststore in an {ocp} Secret. -The secret contains the file content, the password to access it, and the type of the store. -How to create the certificates and the stores are out of the scope of this guide. -+ -To upload the Keystore as a Secret, use the following command: -+ -.Deploy a Keystore -[source,bash,subs="+attributes"] ----- -kubectl -n {ns} create secret generic {ks-secret} \ - --from-file=keystore.p12="./certs/keystore.p12" \ # <1> - --from-literal=password=secret \ #<2> - --from-literal=type=pkcs12 #<3> ----- -<1> The filename and the path to the Keystore. -<2> The password to access the Keystore. -<3> The Keystore type. -+ -To upload the Truststore as a Secret, use the following command: -+ -.Deploy a Truststore -[source,bash,subs="+attributes"] ----- -kubectl -n {ns} create secret generic {ts-secret} \ - --from-file=truststore.p12="./certs/truststore.p12" \ # <1> - --from-literal=password=caSecret \ # <2> - --from-literal=type=pkcs12 # <3> ----- -<1> The filename and the path to the Truststore. -<2> The password to access the Truststore. -<3> The Truststore type. -+ -NOTE: Keystore and Truststore must be uploaded in both {ocp} clusters. - -. Create an {ispn} Cluster with Cross-Site enabled -+ -The {operator-docs}#setting-up-xsite[Setting Up Cross-Site] documentation provides all the information on how to create and configure your {ispn} cluster with cross-site enabled, including the previous steps. 
-+ -A basic example is provided in this guide using the credentials, tokens and TLS Keystore/Truststore created by the commands from the previous steps. -+ -.The {ispn} CR for `{site-a}` -[source,yaml] ----- -include::example$helm/ispn-site-a.yaml[tag=infinispan-crossdc] ----- -<1> The cluster name -<2> Allows the cluster to be monitored by Prometheus. -<3> If using a custom credential, configure here the secret name. -<4> The name of the local site, in this case `{site-a}`. -<5> Exposing the cross-site connection using {ocp} Route. -<6> The secret name where the Keystore exists as defined in the previous step. -<7> The alias of the certificate inside the Keystore. -<8> The secret key (filename) of the Keystore as defined in the previous step. -<9> The secret name where the Truststore exists as defined in the previous step. -<10> The Truststore key (filename) of the Keystore as defined in the previous step. -<11> The remote site's name, in this case `{site-b}`. -<12> The namespace of the {ispn} cluster from the remote site. -<13> The {ocp} API URL for the remote site. -<14> The secret with the access toke to authenticate into the remote site. -+ -For `{site-b}`, the {ispn} CR looks similar to the above. -Note the differences in point 4, 11 and 13. -+ -.The {ispn} CR for `{site-b}` -[source,yaml] ----- -include::example$helm/ispn-site-b.yaml[tag=infinispan-crossdc] ----- - -. Creating the caches for {keycloak}. -+ -{keycloak} requires the following caches to be present: `sessions`, `actionTokens`, `authenticationSessions`, `offlineSessions`, `clientSessions`, `offlineClientSessions`, `loginFailures`, and `work`. -+ -The {ispn} {operator-docs}#creating-caches[Cache CR] allows to deploy the caches in the {ispn} cluster. -Cross-site needs to be enabled per cache as documented by {xsite-docs}[Cross Site Documentation]. -The documentation contains more details about the options used by this guide. -The following example shows the Cache CR for `{site-a}`. 
-+ -.sessions in `{site-a}` -[source,yaml] ----- -include::example$helm/ispn-site-a.yaml[tag=infinispan-cache-sessions] ----- -<1> The cross-site merge policy, invoked when there is a write-write conflict. -Set this for the caches `sessions`, `authenticationSessions`, `offlineSessions`, `clientSessions` and `offlineClientSessions`, and do not set it for all other caches. -<2> The remote site name. -<3> The cross-site communication, in this case SYNC. -+ -For `{site-b}`, the Cache CR is similar except in point 2. -+ -.session in `{site-b}` -[source,yaml] ----- -include::example$helm/ispn-site-b.yaml[tag=infinispan-cache-sessions] ----- - -[#verifying-the-deployment] -== Verifying the deployment - -Confirm that the {ispn} cluster is formed, and the cross-site connection is established between the {ocp} clusters. - - -.Wait until the {ispn} cluster is formed -[source,bash,subs="+attributes"] ----- -kubectl wait --for condition=WellFormed --timeout=300s infinispans.infinispan.org -n {ns} {cluster-name} ----- - -.Wait until the {ispn} cross-site connection is established -[source,bash,subs="+attributes"] ----- -kubectl wait --for condition=CrossSiteViewFormed --timeout=300s infinispans.infinispan.org -n {ns} {cluster-name} ----- - -== What's next? - -After infinispan is deployed and running, checkout xref:running/keycloak-with-external-infinispan.adoc[] documentation on how to connect your {keycloak} cluster with the {ispn} cluster. diff --git a/doc/kubernetes/modules/ROOT/pages/running/infinispan-deployment.adoc b/doc/kubernetes/modules/ROOT/pages/running/infinispan-deployment.adoc index 89613500c..0b4becf43 100644 --- a/doc/kubernetes/modules/ROOT/pages/running/infinispan-deployment.adoc +++ b/doc/kubernetes/modules/ROOT/pages/running/infinispan-deployment.adoc @@ -8,7 +8,7 @@ include::partial$running/infinispan-attributes.adoc[] == Audience This guide describes the procedure required to deploy {ispn} in a single cluster environment. 
-For simplicity, this guide uses the minimum configuration possible that allows {keycloak} to be used with an external {ispn}. +For simplicity, this guide uses the minimum configuration possible that allows {project_name} to be used with an external {ispn}. See xref:running/index.adoc[] for additional guides. @@ -34,9 +34,9 @@ include::example$helm/ispn-single.yaml[tag=infinispan-single] <2> Allows the cluster to be monitored by Prometheus <3> If using a custom credential, configure here the secret name created in the previous step. -. Create the caches for {keycloak}. +. Create the caches for {project_name}. + -{keycloak} requires the following caches to be present: `sessions`, `actionTokens`, `authenticationSessions`, `offlineSessions`, `clientSessions`, `offlineClientSessions`, `loginFailures`, and `work`. +{project_name} requires the following caches to be present: `sessions`, `actionTokens`, `authenticationSessions`, `offlineSessions`, `clientSessions`, `offlineClientSessions`, `loginFailures`, and `work`. + Use the {operator-docs}#creating-caches[{ispn} Cache CR] to deploy the caches in the {ispn} cluster. + diff --git a/doc/kubernetes/modules/ROOT/pages/running/keycloak-deployment.adoc b/doc/kubernetes/modules/ROOT/pages/running/keycloak-deployment.adoc deleted file mode 100644 index c7babc3d2..000000000 --- a/doc/kubernetes/modules/ROOT/pages/running/keycloak-deployment.adoc +++ /dev/null @@ -1,100 +0,0 @@ -= {keycloak} Deployment -:description: This describes configuration tweaks and changes on how to run a {keycloak} Deployment under load. - -{description} - -== Audience - -This guide describes advanced {keycloak} configurations for Kubernetes which are load tested and will recover from single Pod failures. 
- -While the Helm charts in the {keycloak} Benchmark project mix different aspects of production style deployments with instrumentation and monitoring, this documentation focuses on a minimal deployment with optional add-ons which admins can opt in for their own deployments. - -See xref:running/index.adoc[] for additional guides. - -== Prerequisites - -* OpenShift or Kubernetes cluster running -* Understanding of a https://www.keycloak.org/operator/basic-deployment[Basic {keycloak} deployment] - -== Procedure - -// TODO: Which settings to include in the standard recipe, and which in separate optional steps below - -. Determine the sizing of the deployment using xref:benchmark-guide::report/rosa-benchmark-key-results.adoc[]. - -. https://www.keycloak.org/operator/installation[Deploy {keycloak} Operator]. - -. Deploy the {keycloak} CR with the following values with the resource requests and limits calculated in the first step: -+ -[source,yaml] ----- -include::example$helm/keycloak.yaml[tag=keycloak] ----- -<1> The database connection pool initial, max and min size should be identical to allow statement caching for the database. -Adjust this number to meet your system's needs. -As most requests won't touch the database due to {keycloak}'s embedded cache, this can server several hundreds of requests per second. -See xref:./concepts/database-connections.adoc[] for details. -<2> To be able to analyze the system under load, enable the metrics endpoint. -The downside of the setting is that the metrics will be available at the external {keycloak} endpoint, so you must add a filter so that the endpoint is not available from the outside. -Use a reverse proxy in front of {keycloak} to filter out those URLs. -<3> The internal JGroup thread pools is by default set up for 200 threads maximum. 
-The number of all {keycloak} threads in the StatefulSet should not exceed the number of JGroup threads to avoid a JGroup thread pool exhaustion which could stall {keycloak} request processing. -It might be beneficial to limit the number of {keycloak} threads even further, as too many concurrent threads will lead to throttling by Kubernetes once the requested CPU limit is reached. -See xref:./concepts/threads.adoc[] for details. -<4> The JVM options set additional parameters: -* `jgroups.thread_dumps_threshold` ensures that a log message "`thread pool is full`" appears once the JGroup thread pool is full for the first time. See xref:./concepts/threads.adoc[] for details. -* Adjust the memory settings for the heap. - -[NOTE] -==== -Previous versions of this documentation recommended to disable the liveness and readiness probes. -With https://github.com/keycloak/keycloak/issues/22109[keycloak#22109] being available in {keycloak} 22.0.5, this is no longer recommended. -==== - -== Verifying the deployment - -Confirm that the {keycloak} deployment is ready. - -[source,bash] ----- -kubectl wait --for=condition=Ready keycloaks.k8s.keycloak.org/keycloak -kubectl wait --for=condition=RollingUpdate=False keycloaks.k8s.keycloak.org/keycloak ----- - -== Optional: Load shedding - -{keycloak} currently lacks a mechanism for load shedding. -There are currently different methods in evaluation. - -The only way to do this in {keycloak} 22 is to specify a Quarkus thread pool queue size. Unfortunately, this can cause side effects to block, for example, the health probes. See https://github.com/keycloak/keycloak/pull/23920[keycloak#23920] for the discussion. -See below if you want to give it a try anyway. - -A more sophisticated solution is discussed in https://github.com/keycloak/keycloak/issues/23340[keycloak#23340] which migh arrive for {keycloak} 23. 
- -.Load shedding with Quarkus thread pool size -[source,yaml,indent=0] ----- - env: -include::example$helm/keycloak.yaml[tag=keycloak-queue-size] ----- -<1> This limits the number of queued {keycloak} requests. All exceeding requests are served with an HTTP 500 error and logged. -See xref:./concepts/threads.adoc#load-shedding[load shedding] for details. - -== Optional: Disable sticky sessions - -When running on OpenShift and the default passthrough Ingress setup as provided by the {keycloak} Operator, the load balancing done by HAProxy is done using sticky sessions based on the source's IP address. -When running load tests, or when having a reverse proxy in front of HAProxy, you might want to disable this to avoid receiving all requests on a single {keycloak} Pod. - -Add the following supplementary configuration under the `spec` in the {keycloak} Custom Resource to disable sticky sessions. - -[source,yaml] ----- -spec: - ingress: - enabled: true - annotations: - # When running load tests, disable sticky sessions on the OpenShift HAProxy router - # to avoid receiving all requests on a single {keycloak} Pod. - haproxy.router.openshift.io/balance: roundrobin - haproxy.router.openshift.io/disable_cookies: 'true' ----- diff --git a/doc/kubernetes/modules/ROOT/pages/running/keycloak-with-external-infinispan.adoc b/doc/kubernetes/modules/ROOT/pages/running/keycloak-with-external-infinispan.adoc deleted file mode 100644 index a03c02996..000000000 --- a/doc/kubernetes/modules/ROOT/pages/running/keycloak-with-external-infinispan.adoc +++ /dev/null @@ -1,79 +0,0 @@ -= {keycloak} with external {ispn} -:description: This describes configuration tweaks and changes on how to run {keycloak} with an external {ispn} Deployment under load. - -{description} - -== Audience - -This guide describes advanced {ispn} configurations for {keycloak} on Kubernetes. 
- -While the Helm charts in the {keycloak} Benchmark project mix different aspects of production style deployments with instrumentation and monitoring, this documentation focuses on the minimal changes which admins can implement in their own deployments. - -See xref:running/index.adoc[] for additional guides. - -== Prerequisites - -* OpenShift or Kubernetes cluster running. -* Existing xref:running/keycloak-deployment.adoc[Basic {keycloak} deployment] as it will be extended. -* Existing {ispn} deployment, for example, one of xref:running/infinispan-deployment.adoc[] or xref:running/keycloak-with-external-infinispan.adoc[]. - -== Procedure - -. Prepare an {ispn} Cache configuration XML from the file `cache-ispn.xml` which is part of the {keycloak} distribution: -.. For each `distributed-cache` entry, add the tags `` as shown below. -+ -[source,xml,indent=0] ----- -include::example$helm-keycloak-config/kcb-infinispan-cache-remote-store-config.xml[tag=keycloak-ispn-remotestore] ----- -<1> New tag `` to connect it to the remote store. -<2> For the address to the remote store, reference two environment variables for host name and port number. -<3> For authentication, reference two environment variables for username and password. -<4> To secure the remote store connection, use the Kubernetes mechanisms of the pre-configured truststore. - -.. Prepare an {ispn} Cache configuration XML from the file `cache-ispn.xml` which is part of the {keycloak} distribution: -For each `replicated-cache` entry, add the tag `` as shown below. -+ -[source,xml,indent=0] ----- -include::example$helm-keycloak-config/kcb-infinispan-cache-remote-store-config.xml[tag=keycloak-ispn-remotestore-work] ----- - -. Place the {ispn} Cache configuration XML in a ConfigMap. -+ -[source,yaml] ----- -include::example$helm/keycloak-ispn.yaml[tag=keycloak-ispn-configmap] -... ----- - -. 
Create a Secret with the username and password to connect to the external {ispn} deployment: -+ -[source,yaml] ----- -include::example$helm/keycloak-ispn.yaml[tag=keycloak-ispn-secret] ----- - -. Extend the {keycloak} Custom Resource with `additionalOptions` and extend the `podTemplate` as shown below. -+ -[NOTE] -==== -* The new `additionalOptions` entries starting with `remote-store` used here are not official {keycloak} configurations. -Instead, they provide their values to environment variables that are then referenced in the {ispn} XML configuration. -* All the memory, resource and database configurations are skipped from the CR below as they have been described in xref:./keycloak-deployment.adoc[] already. -Admins should leave those configurations untouched. -==== -+ -[source,yaml] ----- -include::example$helm/keycloak-ispn.yaml[tag=keycloak-ispn] ----- -<1> Custom cache configuration XML file definition, which includes configuration for remote or embedded {ispn} store. -<2> The hostname and port of the remote cache {ispn} cluster. -<3> The credentials required, username and password, to access the remote cache {ispn} cluster. -<4> `jboss.site.name` is an arbitrary {ispn} site name which {keycloak} needs for its embedded {ispn} deployment when a remote store is used. This site name is related only to the embedded {ispn} and does not need to match any value from the external {ispn} deployment. -<5> Mounting the cache configuration Volume in Kubernetes. -However, matching the `jboss.site.name` with the external {ispn} deployment site name helps debugging possible future issues. -If you are using multiple sites for {keycloak} in a cross-DC setup like xref::running/infinispan-crossdc-deployment.adoc[], the site name must be different in each site. -<6> Defining the cache configuration Volume using the already created ConfigMap in Kubernetes. 
- diff --git a/doc/kubernetes/modules/ROOT/pages/running/loadbalancing.adoc b/doc/kubernetes/modules/ROOT/pages/running/loadbalancing.adoc deleted file mode 100644 index 58f6c5181..000000000 --- a/doc/kubernetes/modules/ROOT/pages/running/loadbalancing.adoc +++ /dev/null @@ -1,279 +0,0 @@ -= AWS Route 53 active/passive loadbalancer -:description: This guide describes the procedure required to configure DNS based failover for Multi-AZ {keycloak} clusters \ -using AWS Route53. - -{description} - -== Architecture - -All {keycloak} client requests are routed via a DNS name managed by Route53 records. It's the responsibility of Route53 -to ensure that all client requests are routed to the Primary cluster when it's available and healthy, or to the backup -cluster in the event of the primary availability-zone and/or {keycloak} deployment failing. - -If the primary site fails, the DNS changes will need to propagate to the clients. Depending on the client's settings, this may take some minutes and depends on the client's configuration. -When using mobile connections, some internet providers might not respect the TTL of the DNS entries, which can lead to an extended time until the clients finally connect to the new site. - -.AWS Global Accelerator Failover -image::route53/route53-multi-az-failover.svg[] - -Two Openshift Routes are exposed on both the Primary and Backup ROSA cluster. The first Route utilises the Route53 DNS -name in order to service client requests, whereas the second Route is used by Route53 to monitor the health of the -{keycloak} cluster. - -== Prerequisites - -* ROSA based Multi-AZ {keycloak} deployment -* An owned domain for client requests to be routed through - -== Procedure - -. [[create-hosted-zone]]Create a https://docs.aws.amazon.com/Route53/latest/DeveloperGuide/CreatingHostedZone.html[Route53 Hosted Zone] using -the root domain name that you want all {keycloak} clients to connect through. 
-+ -Take note of the "Hosted zone ID", as this will be required in later steps. - -. Retrieve the "Hosted zone ID" and DNS name associated with each ROSA cluster. -+ -For both the Primary and Backup cluster, perform the following: -+ -.. Login to the ROSA cluster -+ -.. Obtain the cluster VPC ID -+ -.Command: -[source,bash] ----- -NODE=$(kubectl get nodes --selector=node-role.kubernetes.io/worker \ - -o jsonpath='{.items[0].metadata.name}' -) -aws ec2 describe-instances \ ---filters "Name=private-dns-name,Values=${NODE}" \ ---query 'Reservations[*].Instances[*].VpcId' \ ---region eu-west-1 \#<1> ---output text ----- -<1> The AWS region hosting your ROSA cluster -+ -.Output: -[source,bash] ----- -vpc-08572eedcb77c9f87 ----- -+ -.. [[hosted_zone_id]]Retrieve the cluster LoadBalancer Hosted Zone ID and DNS hostname -+ -.Command: -[source,bash] ----- -aws elb describe-load-balancers \ - --query "LoadBalancerDescriptions[?VPCId=='vpc-08572eedcb77c9f87'].{CanonicalHostedZoneNameID:CanonicalHostedZoneNameID,DNSName:DNSName}" \#<1> - --region eu-west-1 \ - --output json ----- -<1> Utilise the VPC ID retrieved in the previous step -+ -.Output: -[source,json] ----- -[ - { - "CanonicalHostedZoneNameID": "Z32O12XQLNTSW2", #<1> - "DNSName": "ab50395cd04304a539af5b8854325e22-773464857.eu-west-1.elb.amazonaws.com" - } -] ----- -+ -. 
Create Route53 health checks -+ -.Command: -[source,bash] ----- -function createHealthCheck() { - # Creating a hash of the caller reference to allow for names longer than 64 characters - REF=($(echo $1 | sha1sum )) - aws route53 create-health-check \ - --caller-reference "$REF" \ - --query "HealthCheck.Id" \ - --no-cli-pager \ - --output text \ - --health-check-config ' - { - "Type": "HTTPS", - "ResourcePath": "/health/live", - "FullyQualifiedDomainName": "'$1'", - "Port": 443, - "RequestInterval": 30, - "FailureThreshold": 1, - "EnableSNI": true - } - ' -} -CLIENT_DOMAIN="client.keycloak-benchmark.com" #<1> -PRIMARY_DOMAIN="primary.${CLIENT_DOMAIN}" #<2> -BACKUP_DOMAIN="backup.${CLIENT_DOMAIN}" #<3> -createHealthCheck ${PRIMARY_DOMAIN} -createHealthCheck ${BACKUP_DOMAIN} ----- -<1> The domain which {keycloak} clients should connect to. This should be the same, or a subdomain, of the root domain -used to create the xref:create-hosted-zone[Hosted Zone]. -<2> The subdomain that will be used for health probes on the Primary cluster -<3> The subdomain that will be used for health probes on the Backup cluster -+ -.Output: -[source,bash] ----- -233e180f-f023-45a3-954e-415303f21eab #<1> -799e2cbb-43ae-4848-9b72-0d9173f04912 #<2> ----- -<1> The ID of the Primary Health check -<2> The ID of the Backup Health check -+ -. 
Create the Route53 record set -+ -.Command: -[source,bash] ----- -HOSTED_ZONE_ID="Z09084361B6LKQQRCVBEY" #<1> -PRIMARY_LB_HOSTED_ZONE_ID="Z32O12XQLNTSW2" -PRIMARY_LB_DNS=ab50395cd04304a539af5b8854325e22-773464857.eu-west-1.elb.amazonaws.com -PRIMARY_HEALTH_ID=233e180f-f023-45a3-954e-415303f21eab -BACKUP_LB_HOSTED_ZONE_ID="Z32O12XQLNTSW2" -BACKUP_LB_DNS=a184a0e02a5d44a9194e517c12c2b0ec-1203036292.eu-west-1.elb.amazonaws.com -BACKUP_HEALTH_ID=799e2cbb-43ae-4848-9b72-0d9173f04912 -aws route53 change-resource-record-sets \ - --hosted-zone-id Z09084361B6LKQQRCVBEY \ - --query "ChangeInfo.Id" \ - --output text \ - --change-batch ' - { - "Comment": "Creating Record Set for '${CLIENT_DOMAIN}'", - "Changes": [{ - "Action": "CREATE", - "ResourceRecordSet": { - "Name": "'${PRIMARY_DOMAIN}'", - "Type": "A", - "AliasTarget": { - "HostedZoneId": "'${PRIMARY_LB_HOSTED_ZONE_ID}'", - "DNSName": "'${PRIMARY_LB_DNS}'", - "EvaluateTargetHealth": true - } - } - }, { - "Action": "CREATE", - "ResourceRecordSet": { - "Name": "'${BACKUP_DOMAIN}'", - "Type": "A", - "AliasTarget": { - "HostedZoneId": "'${BACKUP_LB_HOSTED_ZONE_ID}'", - "DNSName": "'${BACKUP_LB_DNS}'", - "EvaluateTargetHealth": true - } - } - }, { - "Action": "CREATE", - "ResourceRecordSet": { - "Name": "'${CLIENT_DOMAIN}'", - "Type": "A", - "SetIdentifier": "client-failover-primary-'${SUBDOMAIN}'", - "Failover": "PRIMARY", - "HealthCheckId": "'${PRIMARY_HEALTH_ID}'", - "AliasTarget": { - "HostedZoneId": "'${HOSTED_ZONE_ID}'", - "DNSName": "'${PRIMARY_DOMAIN}'", - "EvaluateTargetHealth": true - } - } - }, { - "Action": "CREATE", - "ResourceRecordSet": { - "Name": "'${CLIENT_DOMAIN}'", - "Type": "A", - "SetIdentifier": "client-failover-backup-'${SUBDOMAIN}'", - "Failover": "SECONDARY", - "HealthCheckId": "'${BACKUP_HEALTH_ID}'", - "AliasTarget": { - "HostedZoneId": "'${HOSTED_ZONE_ID}'", - "DNSName": "'${BACKUP_DOMAIN}'", - "EvaluateTargetHealth": true - } - } - }] - } - ' ----- -<1> The ID of the xref:create-hosted-zone[Hosted 
Zone] created earlier -+ -.Output: -[source,json] ----- -/change/C053410633T95FR9WN3YI ----- -+ -. Wait for the Route53 records to be updated -+ -.Command: -[source,bash] ----- -aws route53 wait resource-record-sets-changed --id /change/C053410633T95FR9WN3YI ----- -+ -. Update/Create {keycloak} Deployment -+ -For both the Primary and Backup cluster, perform the following: -+ -.. Login to the ROSA cluster -+ -.. Ensure the {keycloak} CR has the following configuration -+ -[source,yaml] ----- -apiVersion: k8s.keycloak.org/v2alpha1 -kind: {keycloak} -metadata: - name: keycloak -spec: - hostname: - hostname: ${CLIENT_DOMAIN} # <1> ----- -<1> The domain clients use to connect to {keycloak} -+ -To ensure that request forwarding works, specify in the {keycloak} CR the hostname through -which clients will access the {keycloak} instances. This must be the `$CLIENT_DOMAIN` used in the Route53 configuration. -+ -.. Create health check Route -+ -.Command: -[source,bash] ----- -cat < -apiVersion: route.openshift.io/v1 -kind: Route -metadata: - name: aws-health-route -spec: - host: $DOMAIN #<2> - port: - targetPort: https - tls: - insecureEdgeTerminationPolicy: Redirect - termination: passthrough - to: - kind: Service - name: keycloak-service - weight: 100 - wildcardPolicy: None - -EOF ----- -<1> `$NAMESPACE` should be replaced with the namespace of your {keycloak} deployment -<2> `$DOMAIN` should be replaced with either the `PRIMARY_DOMAIN` or `BACKUP_DOMAIN`, if the current -cluster is the Primary of Backup cluster, respectively. - - -== Verify - -Navigate to the chosen CLIENT_DOMAIN in your local browser and login to the {keycloak} console. - -To test failover works as expected, login to the Primary cluster and scale the {keycloak} deployment to zero Pods. This will -cause the Primary's health checks to fail and Route53 should start routing traffic to the {keycloak} Pods on the Backup -cluster. 
diff --git a/doc/kubernetes/modules/ROOT/pages/running/network-partition.adoc b/doc/kubernetes/modules/ROOT/pages/running/network-partition.adoc deleted file mode 100644 index 70d98cf2f..000000000 --- a/doc/kubernetes/modules/ROOT/pages/running/network-partition.adoc +++ /dev/null @@ -1,63 +0,0 @@ -= Out-of-sync datacenters Recovery -:description: This guide describes the steps to bring the secondary site up to date to the primary site after a network partition. - -include::partial$running/infinispan-attributes.adoc[] - -// used by the CLI commands to avoid duplicating the code. -:stale-site: secondary -:keep-site: primary -:keep-site-name: {site-a-cr} -:stale-site-name: {site-b-cr} - -{description} - -== Audience - -This guide describes the procedures required to synchronize the secondary site after a temporary disconnection between sites. - -See xref:running/index.adoc[] for additional guides. - -=== {ispn} Cluster - -For the context of this guide, `{site-a}` is the primary site and `{site-b}` is the secondary site. - -Network partitions may happen between the site and the replication between the {ispn} cluster will stop. -Manual steps described in this guide bring both site back in sync. - -WARNING: Transferring the full state may impact the {ispn} cluster perform by increasing the response time and/or resources usage. - -First procedure is to delete the stale data from the secondary site. - -. Login into your secondary site. - -. Shutdown {keycloak}. This will clear all {keycloak} caches, and it prevents the state of {keycloak} from being out-of-sync with {ispn}. -+ -When deploying {keycloak} using the {keycloak} Operator, change the number of {keycloak} instances in the {keycloak} Custom Resource to 0. - -include::partial$running/infinispan-cli-connect.adoc[] - -include::partial$running/infinispan-cli-clear-caches.adoc[] - -Now we are ready to transfer the state from the primary site to the secondary site. - -. 
Login into your primary site - -include::partial$running/infinispan-cli-connect.adoc[] - -include::partial$running/infinispan-cli-state-transfer.adoc[] - -As now the state is available in the secondary datacenter, {keycloak} can be started again: - -. Login into your secondary site. - -. Startup {keycloak}. -+ -When deploying {keycloak} using the {keycloak} Operator, change the number of {keycloak} instances in the {keycloak} Custom Resource to the original value. - -=== AWS Aurora Database - -No action required. - -=== Route53 - -No action required. diff --git a/doc/kubernetes/modules/ROOT/pages/running/switch-back.adoc b/doc/kubernetes/modules/ROOT/pages/running/switch-back.adoc deleted file mode 100644 index 39696c7ae..000000000 --- a/doc/kubernetes/modules/ROOT/pages/running/switch-back.adoc +++ /dev/null @@ -1,76 +0,0 @@ -= Switch back -:description: This guide describes the procedures to switch back to the primary site back after a failover or switchover to the secondary site. - -include::partial$running/infinispan-attributes.adoc[] - -// used by the CLI commands to avoid duplicating the code. -:stale-site: primary -:keep-site: secondary -:keep-site-name: {site-b-cr} -:stale-site-name: {site-a-cr} - -{description} - -== Audience - -This guide contains the procedures required to bring the primary site back to operation when the secondary site is handling all the traffic. -At the end of the guide, the primary site is online again and handles the traffic. - -This is necessary when the primary site has lost its state in {ispn}, a network partition occurred between the primary and the secondary site while the secondary site was active, or the replication was disabled as described in xref:./switch-over.adoc[]. - -If the data in {ispn} on both sites is still in sync, the procedure for {ispn} can be skipped. - -See xref:running/index.adoc[] for additional guides. - -== Prerequisites - -This assumes a xref:running/deployments/active-passive-sync.adoc[] setup. 
- -== Procedures - -=== {ispn} Cluster - -For the context of this guide, `{site-a}` is the primary site, recovering back to operation, and `{site-b}` is the secondary site, running in production. - -After the {ispn} in the primary site is back online and joined the cross-site channel (see xref:running/infinispan-crossdc-deployment.adoc#verifying-the-deployment[verifying the {ispn} deployment]), the state transfer must be manually started from the secondary site. - -After clearing the state in the primary site, it transfers the full state from the secondary site to the primary site, and it must be completed before the primary site can start handling incoming requests. - -WARNING: Transferring the full state may impact the {ispn} cluster perform by increasing the response time and/or resources usage. - -The first procedure is to delete any stale data from the primary site. - -. Login to the primary site. - -. Shutdown {keycloak}. This will clear all {keycloak} caches, and it prevents the state of {keycloak} from being out-of-sync with {ispn}. -+ -When deploying {keycloak} using the {keycloak} Operator, change the number of {keycloak} instances in the {keycloak} Custom Resource to 0. - -include::partial$running/infinispan-cli-connect.adoc[] - -include::partial$running/infinispan-cli-clear-caches.adoc[] - -Now we are ready to transfer the state from the secondary site to the primary site. - -. Login into your secondary site. - -include::partial$running/infinispan-cli-connect.adoc[] - -include::partial$running/infinispan-cli-state-transfer.adoc[] - -. Login to the primary site. - -. Start {keycloak}. -+ -When deploying {keycloak} using the {keycloak} Operator, change the number of {keycloak} instances in the {keycloak} Custom Resource to the original value. - -Both {ispn} clusters are in sync and the switchover from secondary back to the primary site can be performed. 
- -=== AWS Aurora Database - -include::partial$aurora/aurora-failover.adoc[] - -=== Route53 - -If switching over to the secondary site has been triggered by changing the health endpoint, edit the health check in AWS to point to a correct endpoint (`health/live`). -It will take some minutes for the clients to notice the change, and traffic will gradually move back to the primary site. diff --git a/doc/kubernetes/modules/ROOT/pages/running/switch-over.adoc b/doc/kubernetes/modules/ROOT/pages/running/switch-over.adoc deleted file mode 100644 index d61b100aa..000000000 --- a/doc/kubernetes/modules/ROOT/pages/running/switch-over.adoc +++ /dev/null @@ -1,81 +0,0 @@ -= Switchover to Secondary Site -:description: This guide describes the steps to switch from primary site to secondary site. - -include::partial$running/infinispan-attributes.adoc[] - -{description} - -== Audience - -This guide describes the procedures required to switch to the secondary site, and it assumes the primary site will be taken offline. - -See xref:running/index.adoc[] for additional guides. - -=== {ispn} Cluster - -For the context of this guide, `{site-a}` is the primary site and `{site-b}` is the secondary site. - -For taking a site offline, it is a good practice to disable the replication to it. -It prevents errors/delays when the channels are disconnected between the primary and the secondary site. - -==== Procedures to transfer state from secondary to primary site - -. Login into your secondary site - -include::partial$running/infinispan-cli-connect.adoc[] - -. 
Disable the replication to the primary site by running the following command: -+ -[source,bash,subs="+attributes"] ----- -site take-offline --all-caches --site={site-a-cr} ----- -+ -.Example -[source,bash,subs="+attributes"] ----- -[{cluster-name}-0-29897@ISPN//containers/default]> site take-offline --all-caches --site={site-a-cr} -{ - "offlineClientSessions" : "ok", - "authenticationSessions" : "ok", - "sessions" : "ok", - "clientSessions" : "ok", - "work" : "ok", - "offlineSessions" : "ok", - "loginFailures" : "ok", - "actionTokens" : "ok" -} ----- - -. Check the replication status is `offline`. -+ -[source,bash,subs="+attributes"] ----- -site status --all-caches --site={site-a-cr} ----- -+ -.Example -[source,bash,subs="+attributes"] ----- -[{cluster-name}-0-29897@ISPN//containers/default]> site status --all-caches --site={site-a-cr} -{ - "status" : "offline" -} ----- -+ -If the status is not `offline`, repeat the previous step. - -The {ispn} cluster in the secondary site is ready to handle requests without trying to replicate to the primary site. - -=== AWS Aurora Database - -include::partial$aurora/aurora-failover.adoc[] - -=== {keycloak} Cluster - -No action required. - -=== Route53 - -To force Route53 to mark the primary site as not available, edit the health check in AWS to point to a non-existent route (`health/down`) -It will take some minutes for the clients to notice the change, and traffic will gradually move over to the secondary site. diff --git a/doc/kubernetes/modules/ROOT/partials/aurora/aurora-create-peering-connections.adoc b/doc/kubernetes/modules/ROOT/partials/aurora/aurora-create-peering-connections.adoc deleted file mode 100644 index af23fc8a4..000000000 --- a/doc/kubernetes/modules/ROOT/partials/aurora/aurora-create-peering-connections.adoc +++ /dev/null @@ -1,214 +0,0 @@ -. 
Retrieve the Aurora VPC -+ -.Command: -[source,bash] ----- -aws ec2 describe-vpcs \ - --filters "Name=tag:AuroraCluster,Values=keycloak-aurora" \ - --query 'Vpcs[*].VpcId' \ - --region eu-west-1 \ - --output text ----- -+ -.Output: -[source,json] ----- -vpc-0b40bd7c59dbe4277 ----- -+ -. Retrieve the ROSA cluster VPC -.. Login to the ROSA cluster using `oc` -.. Retrieve the ROSA VPC -+ -.Command: -[source,bash] ----- -NODE=$(oc get nodes --selector=node-role.kubernetes.io/worker -o jsonpath='{.items[0].metadata.name}') -aws ec2 describe-instances \ - --filters "Name=private-dns-name,Values=${NODE}" \ - --query 'Reservations[0].Instances[0].VpcId' \ - --region eu-west-1 \ - --output text ----- -+ -.Output: -[source,json] ----- -vpc-0b721449398429559 ----- -+ -. Create Peering Connection -+ -.Command: -[source,bash] ----- -aws ec2 create-vpc-peering-connection \ - --vpc-id vpc-0b721449398429559 \# <1> - --peer-vpc-id vpc-0b40bd7c59dbe4277 \# <2> - --peer-region eu-west-1 \ - --region eu-west-1 ----- -<1> ROSA cluster VPC -<2> Aurora VPC -+ -.Output: -[source,json] ----- -{ - "VpcPeeringConnection": { - "AccepterVpcInfo": { - "OwnerId": "606671647913", - "VpcId": "vpc-0b40bd7c59dbe4277", - "Region": "eu-west-1" - }, - "ExpirationTime": "2023-11-08T13:26:30+00:00", - "RequesterVpcInfo": { - "CidrBlock": "10.0.17.0/24", - "CidrBlockSet": [ - { - "CidrBlock": "10.0.17.0/24" - } - ], - "OwnerId": "606671647913", - "PeeringOptions": { - "AllowDnsResolutionFromRemoteVpc": false, - "AllowEgressFromLocalClassicLinkToRemoteVpc": false, - "AllowEgressFromLocalVpcToRemoteClassicLink": false - }, - "VpcId": "vpc-0b721449398429559", - "Region": "eu-west-1" - }, - "Status": { - "Code": "initiating-request", - "Message": "Initiating Request to 606671647913" - }, - "Tags": [], - "VpcPeeringConnectionId": "pcx-0cb23d66dea3dca9f" - } -} ----- -+ -. 
Wait for Peering connection to exist -+ -.Command: -[source,bash] ----- -aws ec2 wait vpc-peering-connection-exists --vpc-peering-connection-ids pcx-0cb23d66dea3dca9f ----- -+ -. Accept the peering connection -+ -.Command: -[source,bash] ----- -aws ec2 accept-vpc-peering-connection \ - --vpc-peering-connection-id pcx-0cb23d66dea3dca9f \ - --region eu-west-1 ----- -+ -.Output: -[source,json] ----- -{ - "VpcPeeringConnection": { - "AccepterVpcInfo": { - "CidrBlock": "192.168.0.0/16", - "CidrBlockSet": [ - { - "CidrBlock": "192.168.0.0/16" - } - ], - "OwnerId": "606671647913", - "PeeringOptions": { - "AllowDnsResolutionFromRemoteVpc": false, - "AllowEgressFromLocalClassicLinkToRemoteVpc": false, - "AllowEgressFromLocalVpcToRemoteClassicLink": false - }, - "VpcId": "vpc-0b40bd7c59dbe4277", - "Region": "eu-west-1" - }, - "RequesterVpcInfo": { - "CidrBlock": "10.0.17.0/24", - "CidrBlockSet": [ - { - "CidrBlock": "10.0.17.0/24" - } - ], - "OwnerId": "606671647913", - "PeeringOptions": { - "AllowDnsResolutionFromRemoteVpc": false, - "AllowEgressFromLocalClassicLinkToRemoteVpc": false, - "AllowEgressFromLocalVpcToRemoteClassicLink": false - }, - "VpcId": "vpc-0b721449398429559", - "Region": "eu-west-1" - }, - "Status": { - "Code": "provisioning", - "Message": "Provisioning" - }, - "Tags": [], - "VpcPeeringConnectionId": "pcx-0cb23d66dea3dca9f" - } -} ----- -+ -. 
Update ROSA cluster VPC route-table -+ -.Command: -[source,bash] ----- -ROSA_PUBLIC_ROUTE_TABLE_ID=$(aws ec2 describe-route-tables \ - --filters "Name=vpc-id,Values=vpc-0b721449398429559" "Name=association.main,Values=true" \# <1> - --query "RouteTables[*].RouteTableId" \ - --output text \ - --region eu-west-1 -) -aws ec2 create-route \ - --route-table-id ${ROSA_PUBLIC_ROUTE_TABLE_ID} \ - --destination-cidr-block 192.168.0.0/16 \# <2> - --vpc-peering-connection-id pcx-0cb23d66dea3dca9f \ - --region eu-west-1 ----- -<1> ROSA cluster VPC -<2> This must be the same as the cidr-block used when creating the Aurora VPC -+ -. Update the Aurora Security Group -+ -.Command: -[source,bash] ----- -AURORA_SECURITY_GROUP_ID=$(aws ec2 describe-security-groups \ - --filters "Name=group-name,Values=keycloak-aurora-security-group" \ - --query "SecurityGroups[*].GroupId" \ - --region eu-west-1 \ - --output text -) -aws ec2 authorize-security-group-ingress \ - --group-id ${AURORA_SECURITY_GROUP_ID} \ - --protocol tcp \ - --port 5432 \ - --cidr 10.0.17.0/24 \# <1> - --region eu-west-1 ----- -<1> The "machine_cidr" of the ROSA cluster -+ -.Output: -[source,json] ----- -{ - "Return": true, - "SecurityGroupRules": [ - { - "SecurityGroupRuleId": "sgr-0785d2f04b9cec3f5", - "GroupId": "sg-0d746cc8ad8d2e63b", - "GroupOwnerId": "606671647913", - "IsEgress": false, - "IpProtocol": "tcp", - "FromPort": 5432, - "ToPort": 5432, - "CidrIpv4": "10.0.17.0/24" - } - ] -} ----- diff --git a/doc/kubernetes/modules/ROOT/partials/aurora/aurora-failover.adoc b/doc/kubernetes/modules/ROOT/partials/aurora/aurora-failover.adoc deleted file mode 100644 index b7cfcd6fb..000000000 --- a/doc/kubernetes/modules/ROOT/partials/aurora/aurora-failover.adoc +++ /dev/null @@ -1,14 +0,0 @@ -Assuming a Regional multi-AZ Aurora deployment, the current writer instance should be in the same region as the active Keycloak cluster to avoid latencies and communication across availability zones. 
- -Switching the writer instance of Aurora will lead to a small downtime, and having the writer instance in the other datacenter with a slightly longer latency might be acceptable for some deployments. -So this might be deferred to a maintenance window or skipped depending on the circumstances of the deployment. - -To change the writer instance, run a failover. -Note that this will make the database unavailable for a short time, and database connections need to be reestablished. - -To fail over the writer instance to the other AZ, issue the following command: - -[source,bash] ----- -aws rds failover-db-cluster --db-cluster-identifier ... ----- diff --git a/doc/kubernetes/modules/ROOT/partials/aurora/aurora-multiaz-create-procedure.adoc b/doc/kubernetes/modules/ROOT/partials/aurora/aurora-multiaz-create-procedure.adoc deleted file mode 100644 index 08db2d0aa..000000000 --- a/doc/kubernetes/modules/ROOT/partials/aurora/aurora-multiaz-create-procedure.adoc +++ /dev/null @@ -1,357 +0,0 @@ - -. Create a VPC for the Aurora cluster -+ -.Command: -[source,bash] ----- -aws ec2 create-vpc \ - --cidr-block 192.168.0.0/16 \ - --tag-specifications "ResourceType=vpc, Tags=[{Key=AuroraCluster,Value=keycloak-aurora}]" \# <1> - --region eu-west-1 ----- -<1> We add an optional tag with the name of the Aurora cluster so that we can easily retrieve the VPC. -+ -.Output: -[source,json] ----- -{ - "Vpc": { - "CidrBlock": "192.168.0.0/16", - "DhcpOptionsId": "dopt-0bae7798158bc344f", - "State": "pending", - "VpcId": "vpc-0b40bd7c59dbe4277", - "OwnerId": "606671647913", - "InstanceTenancy": "default", - "Ipv6CidrBlockAssociationSet": [], - "CidrBlockAssociationSet": [ - { - "AssociationId": "vpc-cidr-assoc-09a02a83059ba5ab6", - "CidrBlock": "192.168.0.0/16", - "CidrBlockState": { - "State": "associated" - } - } - ], - "IsDefault": false - } -} ----- -+ -. Create a subnet for each availability zone that Aurora will be deployed to, using the `VpcId` of the newly created VPC. 
-+ -NOTE: The cidr-block range specified for each of the availability-zones must not overlap. -+ -.. Zone A -+ -.Command: -[source,bash] ----- -aws ec2 create-subnet \ - --availability-zone "eu-west-1a" \ - --vpc-id vpc-0b40bd7c59dbe4277 \ - --cidr-block 192.168.0.0/19 \ - --region eu-west-1 ----- -+ -.Output: -[source,json] ----- -{ - "Subnet": { - "AvailabilityZone": "eu-west-1a", - "AvailabilityZoneId": "euw1-az3", - "AvailableIpAddressCount": 8187, - "CidrBlock": "192.168.0.0/19", - "DefaultForAz": false, - "MapPublicIpOnLaunch": false, - "State": "available", - "SubnetId": "subnet-0d491a1a798aa878d", - "VpcId": "vpc-0b40bd7c59dbe4277", - "OwnerId": "606671647913", - "AssignIpv6AddressOnCreation": false, - "Ipv6CidrBlockAssociationSet": [], - "SubnetArn": "arn:aws:ec2:eu-west-1:606671647913:subnet/subnet-0d491a1a798aa878d", - "EnableDns64": false, - "Ipv6Native": false, - "PrivateDnsNameOptionsOnLaunch": { - "HostnameType": "ip-name", - "EnableResourceNameDnsARecord": false, - "EnableResourceNameDnsAAAARecord": false - } - } -} - ----- -.. 
Zone B -+ -.Command: -[source,bash] ----- -aws ec2 create-subnet \ - --availability-zone "eu-west-1b" \ - --vpc-id vpc-0b40bd7c59dbe4277 \ - --cidr-block 192.168.32.0/19 \ - --region eu-west-1 ----- -+ -.Output: -[source,json] ----- -{ - "Subnet": { - "AvailabilityZone": "eu-west-1b", - "AvailabilityZoneId": "euw1-az1", - "AvailableIpAddressCount": 8187, - "CidrBlock": "192.168.32.0/19", - "DefaultForAz": false, - "MapPublicIpOnLaunch": false, - "State": "available", - "SubnetId": "subnet-057181b1e3728530e", - "VpcId": "vpc-0b40bd7c59dbe4277", - "OwnerId": "606671647913", - "AssignIpv6AddressOnCreation": false, - "Ipv6CidrBlockAssociationSet": [], - "SubnetArn": "arn:aws:ec2:eu-west-1:606671647913:subnet/subnet-057181b1e3728530e", - "EnableDns64": false, - "Ipv6Native": false, - "PrivateDnsNameOptionsOnLaunch": { - "HostnameType": "ip-name", - "EnableResourceNameDnsARecord": false, - "EnableResourceNameDnsAAAARecord": false - } - } -} - ----- -+ -. Obtain the ID of the Aurora VPC route-table -+ -.Command: -[source,bash] ----- -aws ec2 describe-route-tables \ - --filters Name=vpc-id,Values=vpc-0b40bd7c59dbe4277 \ - --region eu-west-1 ----- -+ -.Output: -[source,json] ----- -{ - "RouteTables": [ - { - "Associations": [ - { - "Main": true, - "RouteTableAssociationId": "rtbassoc-02dfa06f4c7b4f99a", - "RouteTableId": "rtb-04a644ad3cd7de351", - "AssociationState": { - "State": "associated" - } - } - ], - "PropagatingVgws": [], - "RouteTableId": "rtb-04a644ad3cd7de351", - "Routes": [ - { - "DestinationCidrBlock": "192.168.0.0/16", - "GatewayId": "local", - "Origin": "CreateRouteTable", - "State": "active" - } - ], - "Tags": [], - "VpcId": "vpc-0b40bd7c59dbe4277", - "OwnerId": "606671647913" - } - ] -} - ----- -+ -. Associate the Aurora VPC route-table each availability zone's subnet -.. 
Zone A -+ -.Command: -[source,bash] ----- -aws ec2 associate-route-table \ - --route-table-id rtb-04a644ad3cd7de351 \ - --subnet-id subnet-0d491a1a798aa878d \ - --region eu-west-1 ----- -+ -.. Zone B -+ -.Command: -[source,bash] ----- -aws ec2 associate-route-table \ - --route-table-id rtb-04a644ad3cd7de351 \ - --subnet-id subnet-057181b1e3728530e \ - --region eu-west-1 ----- -+ -. Create Aurora Subnet Group -+ -.Command: -[source,bash] ----- -aws rds create-db-subnet-group \ - --db-subnet-group-name keycloak-aurora-subnet-group \ - --db-subnet-group-description "Aurora DB Subnet Group" \ - --subnet-ids subnet-0d491a1a798aa878d subnet-057181b1e3728530e \ - --region eu-west-1 ----- -+ -. Create Aurora Security Group -+ -.Command: -[source,bash] ----- -aws ec2 create-security-group \ - --group-name keycloak-aurora-security-group \ - --description "Aurora DB Security Group" \ - --vpc-id vpc-0b40bd7c59dbe4277 \ - --region eu-west-1 ----- -+ -.Output: -[source,json] ----- -{ - "GroupId": "sg-0d746cc8ad8d2e63b" -} ----- -+ -. Create the Aurora DB Cluster -+ -.Command: -[source,bash] ----- -aws rds create-db-cluster \ - --db-cluster-identifier keycloak-aurora \ - --database-name keycloak \ - --engine aurora-postgresql \ - --engine-version 15.3 \ - --master-username keycloak \ - --master-user-password secret99 \ - --vpc-security-group-ids sg-0d746cc8ad8d2e63b \ - --db-subnet-group-name keycloak-aurora-subnet-group \ - --region eu-west-1 ----- -+ -NOTE: You should replace the `--master-username` and `--master-user-password` values. The values specified here must be used -when configuring the Keycloak DB credentials. 
-+ -.Output: -[source,json] ----- -{ - "DBCluster": { - "AllocatedStorage": 1, - "AvailabilityZones": [ - "eu-west-1b", - "eu-west-1c", - "eu-west-1a" - ], - "BackupRetentionPeriod": 1, - "DatabaseName": "keycloak", - "DBClusterIdentifier": "keycloak-aurora", - "DBClusterParameterGroup": "default.aurora-postgresql15", - "DBSubnetGroup": "keycloak-aurora-subnet-group", - "Status": "creating", - "Endpoint": "keycloak-aurora.cluster-clhthfqe0h8p.eu-west-1.rds.amazonaws.com", - "ReaderEndpoint": "keycloak-aurora.cluster-ro-clhthfqe0h8p.eu-west-1.rds.amazonaws.com", - "MultiAZ": false, - "Engine": "aurora-postgresql", - "EngineVersion": "15.3", - "Port": 5432, - "MasterUsername": "keycloak", - "PreferredBackupWindow": "02:21-02:51", - "PreferredMaintenanceWindow": "fri:03:34-fri:04:04", - "ReadReplicaIdentifiers": [], - "DBClusterMembers": [], - "VpcSecurityGroups": [ - { - "VpcSecurityGroupId": "sg-0d746cc8ad8d2e63b", - "Status": "active" - } - ], - "HostedZoneId": "Z29XKXDKYMONMX", - "StorageEncrypted": false, - "DbClusterResourceId": "cluster-IBWXUWQYM3MS5BH557ZJ6ZQU4I", - "DBClusterArn": "arn:aws:rds:eu-west-1:606671647913:cluster:keycloak-aurora", - "AssociatedRoles": [], - "IAMDatabaseAuthenticationEnabled": false, - "ClusterCreateTime": "2023-11-01T10:40:45.964000+00:00", - "EngineMode": "provisioned", - "DeletionProtection": false, - "HttpEndpointEnabled": false, - "CopyTagsToSnapshot": false, - "CrossAccountClone": false, - "DomainMemberships": [], - "TagList": [], - "AutoMinorVersionUpgrade": true, - "NetworkType": "IPV4" - } -} ----- -+ - -. Create Aurora DB instances -+ -.. Create Zone A Writer instance -+ -.Command: -[source,bash] ----- - aws rds create-db-instance \ - --db-cluster-identifier keycloak-aurora \ - --db-instance-identifier "keycloak-aurora-instance-1" \ - --db-instance-class db.t4g.large \ - --engine aurora-postgresql \ - --region eu-west-1 ----- -+ -.. 
Create Zone B Reader instance -+ -.Command: -[source,bash] ----- - aws rds create-db-instance \ - --db-cluster-identifier keycloak-aurora \ - --db-instance-identifier "keycloak-aurora-instance-2" \ - --db-instance-class db.t4g.large \ - --engine aurora-postgresql \ - --region eu-west-1 ----- -+ -. Wait for all Writer and Reader instances to be ready -+ -.Command: -[source,bash] ----- -aws rds wait db-instance-available --db-instance-identifier keycloak-aurora-instance-1 --region eu-west-1 -aws rds wait db-instance-available --db-instance-identifier keycloak-aurora-instance-2 --region eu-west-1 ----- -+ -. [[aurora-writer-url]]Obtain the Writer endpoint URL for use by Keycloak -+ -.Command: -[source,bash] ----- -aws rds describe-db-clusters \ - --db-cluster-identifier keycloak-aurora \ - --query 'DBClusters[*].Endpoint' \ - --region eu-west-1 \ - --output text ----- -+ -.Output: -[source,json] ----- -[ - "keycloak-aurora.cluster-clhthfqe0h8p.eu-west-1.rds.amazonaws.com" -] ----- diff --git a/doc/kubernetes/modules/ROOT/partials/aurora/aurora-verify-peering-connections.adoc b/doc/kubernetes/modules/ROOT/partials/aurora/aurora-verify-peering-connections.adoc deleted file mode 100644 index ebbfe95f3..000000000 --- a/doc/kubernetes/modules/ROOT/partials/aurora/aurora-verify-peering-connections.adoc +++ /dev/null @@ -1,23 +0,0 @@ -The simplest way to verify that a connection is possible between a ROSA cluster and an Aurora DB cluster is to deploy -`psql` on the Openshift cluster and attempt to connect to the writer endpoint. - -The below command will create a pod in the default namespace and establish a `psql` connection with the Aurora cluster if possible. -Upon exiting the pod shell, the pod will be deleted. 
- -[source,bash] ----- -USER=keycloak# <1> -PASSWORD=secret99# <2> -DATABASE=keycloak# <3> -HOST=$(aws rds describe-db-clusters \ - --db-cluster-identifier keycloak-aurora \# <4> - --query 'DBClusters[*].Endpoint' \ - --region eu-west-1 \ - --output text -) -oc run -i --tty --rm debug --image=postgres:13 --restart=Never -- psql postgresql://${USER}:${PASSWORD}@${HOST}/${DATABASE} ----- -<1> Aurora DB user, this can be the same as `--master-username` used when creating the DB. -<2> Aurora DB user-password, this can be the same as `--master-user-password` used when creating the DB. -<3> The name of the Aurora DB, i.e. `--database-name`. -<4> The name of your Aurora DB cluster. diff --git a/doc/kubernetes/modules/ROOT/partials/running/infinispan-batchcr-intro.adoc b/doc/kubernetes/modules/ROOT/partials/running/infinispan-batchcr-intro.adoc deleted file mode 100644 index bcd035da1..000000000 --- a/doc/kubernetes/modules/ROOT/partials/running/infinispan-batchcr-intro.adoc +++ /dev/null @@ -1,11 +0,0 @@ -The steps use the {ispn} Batch CR to perform the CLI commands. -However, the system administrator may use the CLI tool, and copy/run those commands, connecting via the exposed {ispn} service. - -Detailed information about the {ispn} Batch CR is available in the {operator-docs}#batch-cr[documentation page]. - -To install the `ConfigMap` with the {ispn} CLI commands, copy the YAML into a file, and install it in the {ocp} cluster using the following command: - -[source,bash,subs="+attributes"] ----- -oc -n {ns} apply -f /path/to/yamlfile ----- diff --git a/doc/kubernetes/modules/ROOT/partials/running/infinispan-cli-clear-caches.adoc b/doc/kubernetes/modules/ROOT/partials/running/infinispan-cli-clear-caches.adoc deleted file mode 100644 index 3fcb29d70..000000000 --- a/doc/kubernetes/modules/ROOT/partials/running/infinispan-cli-clear-caches.adoc +++ /dev/null @@ -1,99 +0,0 @@ -. Disable the replication from {stale-site} site to the {keep-site} site by running the following command. 
-It prevents the clear request from reaching the {keep-site} site and deleting all the correct cached data. -+ -.Command: -[source,bash,subs="+attributes"] ----- -site take-offline --all-caches --site={keep-site-name} ----- -+ -.Output: -[source,bash,subs="+attributes"] ----- -{ - "offlineClientSessions" : "ok", - "authenticationSessions" : "ok", - "sessions" : "ok", - "clientSessions" : "ok", - "work" : "ok", - "offlineSessions" : "ok", - "loginFailures" : "ok", - "actionTokens" : "ok" -} ----- - -. Check the replication status is `offline`. -+ -.Command: -[source,bash,subs="+attributes"] ----- -site status --all-caches --site={keep-site-name} ----- -+ -.Output: -[source,bash,subs="+attributes"] ----- -{ - "status" : "offline" -} ----- -+ -If the status is not `offline`, repeat the previous step. -+ -WARNING: Make sure the replication is `offline`, otherwise clearing the data will clear both sites. - -. Clear all the cached data in {stale-site} site using the following commands: -+ -.Command: -[source,bash,subs="+attributes"] ----- -clearcache actionTokens -clearcache authenticationSessions -clearcache clientSessions -clearcache loginFailures -clearcache offlineClientSessions -clearcache offlineSessions -clearcache sessions -clearcache work ----- -+ -The commands don't have an output. - -. Re-enable the cross-site replication from {stale-site} site to the {keep-site} site. -+ -.Command: -[source,bash,subs="+attributes"] ----- -site bring-online --all-caches --site={keep-site-name} ----- -+ -.Output: -[source,bash,subs="+attributes"] ----- -{ - "offlineClientSessions" : "ok", - "authenticationSessions" : "ok", - "sessions" : "ok", - "clientSessions" : "ok", - "work" : "ok", - "offlineSessions" : "ok", - "loginFailures" : "ok", - "actionTokens" : "ok" -} ----- - -. Check the replication status is `online`. 
-+ -.Command: -[source,bash,subs="+attributes"] ----- -site status --all-caches --site={keep-site-name} ----- -+ -.Output: -[source,bash,subs="+attributes"] ----- -{ - "status" : "online" -} ----- diff --git a/doc/kubernetes/modules/ROOT/partials/running/infinispan-cli-connect.adoc b/doc/kubernetes/modules/ROOT/partials/running/infinispan-cli-connect.adoc deleted file mode 100644 index 19a6bd358..000000000 --- a/doc/kubernetes/modules/ROOT/partials/running/infinispan-cli-connect.adoc +++ /dev/null @@ -1,21 +0,0 @@ -. Connect to the {ispn} cluster using the {ispn} CLI tool: -+ -.Command: -[source,bash,subs="+attributes"] ----- -oc -n {ns} exec -it pods/{cluster-name}-0 -- ./bin/cli.sh --trustall --connect https://127.0.0.1:11222 ----- -+ -It asks for the username and password for the {ispn} cluster. -Those credentials are the ones set in the xref:running/infinispan-crossdc-deployment.adoc#infinispan-credentials[Configuring credentials] section. -+ -.Output: -[source,bash,subs="+attributes"] ----- -Username: developer -Password: -[{cluster-name}-0-29897@ISPN//containers/default]> ----- -+ -NOTE: The pod name depends on the cluster name defined in the {ispn} CR. -The connection can be done with any pod in the {ispn} cluster. diff --git a/doc/kubernetes/modules/ROOT/partials/running/infinispan-cli-state-transfer.adoc b/doc/kubernetes/modules/ROOT/partials/running/infinispan-cli-state-transfer.adoc deleted file mode 100644 index 637a841e6..000000000 --- a/doc/kubernetes/modules/ROOT/partials/running/infinispan-cli-state-transfer.adoc +++ /dev/null @@ -1,120 +0,0 @@ -. Trigger the state transfer from the {keep-site} site to the {stale-site} site. 
-+ -.Command: -[source,bash,subs="+attributes"] ----- -site push-site-state --all-caches --site={stale-site-name} ----- -+ -.Output: -[source,bash,subs="+attributes"] ----- -{ - "offlineClientSessions" : "ok", - "authenticationSessions" : "ok", - "sessions" : "ok", - "clientSessions" : "ok", - "work" : "ok", - "offlineSessions" : "ok", - "loginFailures" : "ok", - "actionTokens" : "ok" -} ----- - -. Check the replication status is `online` for all caches. -+ -.Command: -[source,bash,subs="+attributes"] ----- -site status --all-caches --site={stale-site-name} ----- -+ -.Output: -[source,bash,subs="+attributes"] ----- -{ - "status" : "online" -} ----- - -. Wait for the state transfer to complete by checking the output of `push-site-status` command for all caches. -+ -.Command: -[source,bash,subs="+attributes"] ----- -site push-site-status --cache=actionTokens -site push-site-status --cache=authenticationSessions -site push-site-status --cache=clientSessions -site push-site-status --cache=loginFailures -site push-site-status --cache=offlineClientSessions -site push-site-status --cache=offlineSessions -site push-site-status --cache=sessions -site push-site-status --cache=work ----- -+ -.Output: -[source,bash,subs="+attributes"] ----- -{ - "{stale-site-name}" : "OK" -} -{ - "{stale-site-name}" : "OK" -} -{ - "{stale-site-name}" : "OK" -} -{ - "{stale-site-name}" : "OK" -} -{ - "{stale-site-name}" : "OK" -} -{ - "{stale-site-name}" : "OK" -} -{ - "{stale-site-name}" : "OK" -} -{ - "{stale-site-name}" : "OK" -} ----- -+ -Check the table in {xsite-docs}#rest_v2_xsite_state_push_cross-site-operations-rest[this section of the Cross-Site Documentation] for the possible status values. -+ -If an error is reported, repeat the state transfer for that specific cache. -+ -.Command: -[source,bash,subs="+attributes"] ----- -site push-site-state --cache=<cache-name> --site={stale-site-name} ----- - -. 
Clear/reset the state transfer status with the following command -+ -.Command: -[source,bash,subs="+attributes"] ----- -site clear-push-site-status --cache=actionTokens -site clear-push-site-status --cache=authenticationSessions -site clear-push-site-status --cache=clientSessions -site clear-push-site-status --cache=loginFailures -site clear-push-site-status --cache=offlineClientSessions -site clear-push-site-status --cache=offlineSessions -site clear-push-site-status --cache=sessions -site clear-push-site-status --cache=work ----- -+ -.Output: -[source,bash,subs="+attributes"] ----- -"ok" -"ok" -"ok" -"ok" -"ok" -"ok" -"ok" -"ok" -----