From d6e5f3f0ce072e74c7e4750803e4f0b5c96f2a67 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Wed, 8 Nov 2023 18:52:50 +0100 Subject: [PATCH 01/14] implement graceful shutdown --- Cargo.toml | 4 +- rust/crd/Cargo.toml | 4 +- rust/crd/src/lib.rs | 127 ++++++++++++++---- rust/operator-binary/src/druid_controller.rs | 8 ++ .../src/operations/graceful_shutdown.rs | 25 ++++ rust/operator-binary/src/operations/mod.rs | 1 + 6 files changed, 142 insertions(+), 27 deletions(-) create mode 100644 rust/operator-binary/src/operations/graceful_shutdown.rs diff --git a/Cargo.toml b/Cargo.toml index 1f126666..c5336421 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,8 +15,9 @@ clap = "4.3" fnv = "1.0" futures = { version = "0.3", features = ["compat"] } indoc = "2.0" -openssl = "0.10" lazy_static = "1.4" +openssl = "0.10" +product-config = { git = "https://github.com/stackabletech/product-config.git", tag = "0.6.0" } pin-project = "1.1" rstest = "0.18" semver = "1.0" @@ -25,7 +26,6 @@ serde_json = "1.0" serde_yaml = "0.9" snafu = "0.7" stackable-operator = { git = "https://github.com/stackabletech/operator-rs.git", tag = "0.56.0" } -product-config = { git = "https://github.com/stackabletech/product-config.git", tag = "0.6.0" } strum = { version = "0.25", features = ["derive"] } tokio = { version = "1.29", features = ["full"] } tracing = "0.1" diff --git a/rust/crd/Cargo.toml b/rust/crd/Cargo.toml index 24143dd2..2f0995c1 100644 --- a/rust/crd/Cargo.toml +++ b/rust/crd/Cargo.toml @@ -9,17 +9,17 @@ repository.workspace = true publish = false [dependencies] +indoc.workspace = true +product-config.workspace = true semver.workspace = true serde.workspace = true serde_json.workspace = true stackable-operator.workspace = true -product-config.workspace = true strum.workspace = true tracing.workspace = true snafu.workspace = true lazy_static.workspace = true [dev-dependencies] -indoc.workspace = true rstest.workspace = true serde_yaml.workspace = true diff --git a/rust/crd/src/lib.rs 
b/rust/crd/src/lib.rs index cd75fd5a..8c7dfbde 100644 --- a/rust/crd/src/lib.rs +++ b/rust/crd/src/lib.rs @@ -1,5 +1,21 @@ -use std::collections::{BTreeMap, HashMap}; +pub mod affinity; +pub mod authentication; +pub mod authorization; +pub mod memory; +pub mod resource; +pub mod security; +pub mod storage; +pub mod tls; + +use crate::{ + affinity::{get_affinity, migrate_legacy_selector}, + authentication::DruidAuthentication, + authorization::DruidAuthorization, + resource::RoleResource, + tls::{default_druid_tls, DruidTls}, +}; +use indoc::formatdoc; use product_config::types::PropertyNameKind; use serde::{Deserialize, Serialize}; use snafu::{OptionExt, ResultExt, Snafu}; @@ -25,30 +41,20 @@ use stackable_operator::{ labels::ObjectLabels, memory::{BinaryMultiple, MemoryQuantity}, product_config_utils::{ConfigError, Configuration}, - product_logging::{self, spec::Logging}, + product_logging::{ + self, + framework::{create_vector_shutdown_file_command, remove_vector_shutdown_file_command}, + spec::Logging, + }, role_utils::{CommonConfiguration, GenericRoleConfig, Role, RoleGroup}, schemars::{self, JsonSchema}, status::condition::{ClusterCondition, HasStatusCondition}, + time::Duration, + utils::COMMON_BASH_TRAP_FUNCTIONS, }; +use std::collections::{BTreeMap, HashMap}; use strum::{Display, EnumDiscriminants, EnumIter, EnumString, IntoStaticStr}; -use crate::{ - affinity::{get_affinity, migrate_legacy_selector}, - authentication::DruidAuthentication, - authorization::DruidAuthorization, - resource::RoleResource, - tls::{default_druid_tls, DruidTls}, -}; - -pub mod affinity; -pub mod authentication; -pub mod authorization; -pub mod memory; -pub mod resource; -pub mod security; -pub mod storage; -pub mod tls; - pub const APP_NAME: &str = "druid"; pub const OPERATOR_NAME: &str = "druid.stackable.tech"; @@ -133,6 +139,14 @@ pub const SC_VOLUME_NAME: &str = "segment-cache"; pub const ENV_INTERNAL_SECRET: &str = "INTERNAL_SECRET"; +// Graceful shutdown timeouts +const 
DEFAULT_BROKER_GRACEFUL_SHUTDOWN_TIMEOUT: Duration = Duration::from_minutes_unchecked(5); +const DEFAULT_COORDINATOR_GRACEFUL_SHUTDOWN_TIMEOUT: Duration = Duration::from_minutes_unchecked(5); +const DEFAULT_MIDDLEMANAGER_GRACEFUL_SHUTDOWN_TIMEOUT: Duration = + Duration::from_minutes_unchecked(5); +const DEFAULT_ROUTER_GRACEFUL_SHUTDOWN_TIMEOUT: Duration = Duration::from_minutes_unchecked(5); +const DEFAULT_HISTORICAL_GRACEFUL_SHUTDOWN_TIMEOUT: Duration = Duration::from_minutes_unchecked(5); + #[derive(Snafu, Debug, EnumDiscriminants)] #[strum_discriminants(derive(IntoStaticStr))] #[allow(clippy::enum_variant_names)] @@ -289,6 +303,7 @@ pub struct CommonRoleGroupConfig { pub replicas: Option, pub selector: Option, pub affinity: StackableAffinity, + pub graceful_shutdown_timeout: Option, } /// Container for the merged and validated role group configurations @@ -328,6 +343,11 @@ impl MergedConfig { replicas: rolegroup.replicas, selector: rolegroup.selector.to_owned(), affinity: rolegroup.config.config.affinity.clone(), + graceful_shutdown_timeout: rolegroup + .config + .config + .graceful_shutdown_timeout + .clone(), }) } DruidRole::Coordinator => { @@ -341,6 +361,11 @@ impl MergedConfig { replicas: rolegroup.replicas, selector: rolegroup.selector.to_owned(), affinity: rolegroup.config.config.affinity.clone(), + graceful_shutdown_timeout: rolegroup + .config + .config + .graceful_shutdown_timeout + .clone(), }) } DruidRole::Historical => { @@ -356,6 +381,11 @@ impl MergedConfig { replicas: rolegroup.replicas, selector: rolegroup.selector.to_owned(), affinity: rolegroup.config.config.affinity.clone(), + graceful_shutdown_timeout: rolegroup + .config + .config + .graceful_shutdown_timeout + .clone(), }) } DruidRole::MiddleManager => { @@ -369,6 +399,11 @@ impl MergedConfig { replicas: rolegroup.replicas, selector: rolegroup.selector.to_owned(), affinity: rolegroup.config.config.affinity.clone(), + graceful_shutdown_timeout: rolegroup + .config + .config + 
.graceful_shutdown_timeout + .clone(), }) } DruidRole::Router => { @@ -382,6 +417,11 @@ impl MergedConfig { replicas: rolegroup.replicas, selector: rolegroup.selector.to_owned(), affinity: rolegroup.config.config.affinity.clone(), + graceful_shutdown_timeout: rolegroup + .config + .config + .graceful_shutdown_timeout + .clone(), }) } } @@ -449,6 +489,17 @@ impl DruidRole { } } + /// Return the default graceful shutdown timeout + pub fn default_graceful_shutdown_timeout(&self) -> Duration { + match &self { + DruidRole::Coordinator => DEFAULT_COORDINATOR_GRACEFUL_SHUTDOWN_TIMEOUT, + DruidRole::Broker => DEFAULT_BROKER_GRACEFUL_SHUTDOWN_TIMEOUT, + DruidRole::Historical => DEFAULT_HISTORICAL_GRACEFUL_SHUTDOWN_TIMEOUT, + DruidRole::MiddleManager => DEFAULT_MIDDLEMANAGER_GRACEFUL_SHUTDOWN_TIMEOUT, + DruidRole::Router => DEFAULT_ROUTER_GRACEFUL_SHUTDOWN_TIMEOUT, + } + } + pub fn main_container_prepare_commands( &self, s3_connection: Option<&S3ConnectionSpec>, @@ -497,10 +548,20 @@ impl DruidRole { } pub fn main_container_start_command(&self) -> String { - format!( - "/stackable/druid/bin/run-druid {} {RW_CONFIG_DIRECTORY}", - self.get_process_name(), - ) + formatdoc! {" + {COMMON_BASH_TRAP_FUNCTIONS} + {remove_vector_shutdown_file_command} + prepare_signal_handlers + /stackable/druid/bin/run-druid {process_name} {RW_CONFIG_DIRECTORY} & + wait_for_termination $! + {create_vector_shutdown_file_command} + ", + process_name = self.get_process_name(), + remove_vector_shutdown_file_command = + remove_vector_shutdown_file_command(LOG_DIR), + create_vector_shutdown_file_command = + create_vector_shutdown_file_command(LOG_DIR), + } } } @@ -979,6 +1040,9 @@ pub struct BrokerConfig { pub logging: Logging, #[fragment_attrs(serde(default))] pub affinity: StackableAffinity, + /// Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. 
+ #[fragment_attrs(serde(default))] + pub graceful_shutdown_timeout: Option, } impl BrokerConfig { @@ -991,6 +1055,7 @@ impl BrokerConfig { resources: resource::BROKER_RESOURCES.to_owned(), logging: product_logging::spec::default_logging(), affinity: get_affinity(cluster_name, role, deep_storage), + graceful_shutdown_timeout: Some(role.default_graceful_shutdown_timeout()), } } } @@ -1016,6 +1081,9 @@ pub struct CoordinatorConfig { pub logging: Logging, #[fragment_attrs(serde(default))] pub affinity: StackableAffinity, + /// Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. + #[fragment_attrs(serde(default))] + pub graceful_shutdown_timeout: Option, } impl CoordinatorConfig { @@ -1028,6 +1096,7 @@ impl CoordinatorConfig { resources: resource::COORDINATOR_RESOURCES.to_owned(), logging: product_logging::spec::default_logging(), affinity: get_affinity(cluster_name, role, deep_storage), + graceful_shutdown_timeout: Some(role.default_graceful_shutdown_timeout()), } } } @@ -1053,6 +1122,9 @@ pub struct MiddleManagerConfig { pub logging: Logging, #[fragment_attrs(serde(default))] pub affinity: StackableAffinity, + /// Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. + #[fragment_attrs(serde(default))] + pub graceful_shutdown_timeout: Option, } impl MiddleManagerConfig { @@ -1065,6 +1137,7 @@ impl MiddleManagerConfig { resources: resource::MIDDLE_MANAGER_RESOURCES.to_owned(), logging: product_logging::spec::default_logging(), affinity: get_affinity(cluster_name, role, deep_storage), + graceful_shutdown_timeout: Some(role.default_graceful_shutdown_timeout()), } } } @@ -1090,6 +1163,9 @@ pub struct RouterConfig { pub logging: Logging, #[fragment_attrs(serde(default))] pub affinity: StackableAffinity, + /// Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. 
+ #[fragment_attrs(serde(default))] + pub graceful_shutdown_timeout: Option, } impl RouterConfig { @@ -1102,6 +1178,7 @@ impl RouterConfig { resources: resource::ROUTER_RESOURCES.to_owned(), logging: product_logging::spec::default_logging(), affinity: get_affinity(cluster_name, role, deep_storage), + graceful_shutdown_timeout: Some(role.default_graceful_shutdown_timeout()), } } } @@ -1127,6 +1204,9 @@ pub struct HistoricalConfig { pub logging: Logging, #[fragment_attrs(serde(default))] pub affinity: StackableAffinity, + /// Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. + #[fragment_attrs(serde(default))] + pub graceful_shutdown_timeout: Option, } impl HistoricalConfig { @@ -1139,6 +1219,7 @@ impl HistoricalConfig { resources: resource::HISTORICAL_RESOURCES.to_owned(), logging: product_logging::spec::default_logging(), affinity: get_affinity(cluster_name, role, deep_storage), + graceful_shutdown_timeout: Some(role.default_graceful_shutdown_timeout()), } } } diff --git a/rust/operator-binary/src/druid_controller.rs b/rust/operator-binary/src/druid_controller.rs index bd747a31..b6165552 100644 --- a/rust/operator-binary/src/druid_controller.rs +++ b/rust/operator-binary/src/druid_controller.rs @@ -69,6 +69,7 @@ use stackable_operator::{ }; use strum::{EnumDiscriminants, IntoStaticStr}; +use crate::operations::graceful_shutdown::add_graceful_shutdown_config; use crate::{ config::get_jvm_config, discovery::{self, build_discovery_configmaps}, @@ -298,6 +299,11 @@ pub enum Error { FailedToCreatePdb { source: crate::operations::pdb::Error, }, + + #[snafu(display("failed to configure graceful shutdown"))] + GracefulShutdown { + source: crate::operations::graceful_shutdown::Error, + }, } type Result = std::result::Result; @@ -836,6 +842,8 @@ fn build_rolegroup_statefulset( // init pod builder let mut pb = PodBuilder::new(); pb.affinity(&merged_rolegroup_config.affinity); + 
add_graceful_shutdown_config(merged_rolegroup_config.graceful_shutdown_timeout, &mut pb) + .context(GracefulShutdownSnafu)?; let mut main_container_commands = role.main_container_prepare_commands(s3_conn); let mut prepare_container_commands = vec![]; diff --git a/rust/operator-binary/src/operations/graceful_shutdown.rs b/rust/operator-binary/src/operations/graceful_shutdown.rs new file mode 100644 index 00000000..69812fb3 --- /dev/null +++ b/rust/operator-binary/src/operations/graceful_shutdown.rs @@ -0,0 +1,25 @@ +use snafu::{ResultExt, Snafu}; +use stackable_operator::{builder::PodBuilder, time::Duration}; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to set terminationGracePeriod"))] + SetTerminationGracePeriod { + source: stackable_operator::builder::pod::Error, + }, +} + +pub fn add_graceful_shutdown_config( + graceful_shutdown_timeout: Option, + pod_builder: &mut PodBuilder, +) -> Result<(), Error> { + // This must be always set by the merge mechanism, as we provide a default value, + // users can not disable graceful shutdown. 
+ if let Some(graceful_shutdown_timeout) = graceful_shutdown_timeout { + pod_builder + .termination_grace_period(&graceful_shutdown_timeout) + .context(SetTerminationGracePeriodSnafu)?; + } + + Ok(()) +} diff --git a/rust/operator-binary/src/operations/mod.rs b/rust/operator-binary/src/operations/mod.rs index d3cf6e9c..92ca2ec7 100644 --- a/rust/operator-binary/src/operations/mod.rs +++ b/rust/operator-binary/src/operations/mod.rs @@ -1 +1,2 @@ +pub mod graceful_shutdown; pub mod pdb; From d5a4a1f0a74b5d6a7e567966c70a5b7b7bc99b8f Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Wed, 8 Nov 2023 18:53:04 +0100 Subject: [PATCH 02/14] regenerate charts --- deploy/helm/druid-operator/crds/crds.yaml | 40 +++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/deploy/helm/druid-operator/crds/crds.yaml b/deploy/helm/druid-operator/crds/crds.yaml index 38103311..fb3f0b29 100644 --- a/deploy/helm/druid-operator/crds/crds.yaml +++ b/deploy/helm/druid-operator/crds/crds.yaml @@ -506,6 +506,10 @@ spec: type: array type: object type: object + gracefulShutdownTimeout: + description: Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. + nullable: true + type: string logging: default: enableVectorAgent: null @@ -3945,6 +3949,10 @@ spec: type: array type: object type: object + gracefulShutdownTimeout: + description: Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. + nullable: true + type: string logging: default: enableVectorAgent: null @@ -8844,6 +8852,10 @@ spec: type: array type: object type: object + gracefulShutdownTimeout: + description: Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. 
+ nullable: true + type: string logging: default: enableVectorAgent: null @@ -12283,6 +12295,10 @@ spec: type: array type: object type: object + gracefulShutdownTimeout: + description: Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. + nullable: true + type: string logging: default: enableVectorAgent: null @@ -15738,6 +15754,10 @@ spec: type: array type: object type: object + gracefulShutdownTimeout: + description: Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. + nullable: true + type: string logging: default: enableVectorAgent: null @@ -19203,6 +19223,10 @@ spec: type: array type: object type: object + gracefulShutdownTimeout: + description: Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. + nullable: true + type: string logging: default: enableVectorAgent: null @@ -22727,6 +22751,10 @@ spec: type: array type: object type: object + gracefulShutdownTimeout: + description: Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. + nullable: true + type: string logging: default: enableVectorAgent: null @@ -26166,6 +26194,10 @@ spec: type: array type: object type: object + gracefulShutdownTimeout: + description: Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. + nullable: true + type: string logging: default: enableVectorAgent: null @@ -29621,6 +29653,10 @@ spec: type: array type: object type: object + gracefulShutdownTimeout: + description: Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. 
+ nullable: true + type: string logging: default: enableVectorAgent: null @@ -33060,6 +33096,10 @@ spec: type: array type: object type: object + gracefulShutdownTimeout: + description: Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. + nullable: true + type: string logging: default: enableVectorAgent: null From 222896cedfb7085d3a7518667240ff36804cc1b5 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Wed, 8 Nov 2023 18:53:09 +0100 Subject: [PATCH 03/14] add test check --- tests/templates/kuttl/smoke/50-assert.yaml | 23 +++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tests/templates/kuttl/smoke/50-assert.yaml b/tests/templates/kuttl/smoke/50-assert.yaml index d2f3a128..6a64f9d6 100644 --- a/tests/templates/kuttl/smoke/50-assert.yaml +++ b/tests/templates/kuttl/smoke/50-assert.yaml @@ -9,6 +9,10 @@ apiVersion: apps/v1 kind: StatefulSet metadata: name: druid-broker-default +spec: + template: + spec: + terminationGracePeriodSeconds: 300 status: readyReplicas: 1 replicas: 1 @@ -17,6 +21,10 @@ apiVersion: apps/v1 kind: StatefulSet metadata: name: druid-coordinator-default +spec: + template: + spec: + terminationGracePeriodSeconds: 300 status: readyReplicas: 1 replicas: 1 @@ -25,12 +33,10 @@ apiVersion: apps/v1 kind: StatefulSet metadata: name: druid-historical-default -status: - readyReplicas: 1 - replicas: 1 spec: template: spec: + terminationGracePeriodSeconds: 300 volumes: - name: tls-mount ephemeral: @@ -57,11 +63,18 @@ spec: - name: segment-cache emptyDir: sizeLimit: 1G +status: + readyReplicas: 1 + replicas: 1 --- apiVersion: apps/v1 kind: StatefulSet metadata: name: druid-middlemanager-default +spec: + template: + spec: + terminationGracePeriodSeconds: 300 status: readyReplicas: 1 replicas: 1 @@ -70,6 +83,10 @@ apiVersion: apps/v1 kind: StatefulSet metadata: name: druid-router-default +spec: + template: + spec: + terminationGracePeriodSeconds: 300 status: 
readyReplicas: 1 replicas: 1 From 743f0132ef6b996e5bb7baebfb21580f3feb8452 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Wed, 8 Nov 2023 18:53:14 +0100 Subject: [PATCH 04/14] add docs --- .../operations/graceful-shutdown.adoc | 63 +++++++++++++++++-- 1 file changed, 59 insertions(+), 4 deletions(-) diff --git a/docs/modules/druid/pages/usage-guide/operations/graceful-shutdown.adoc b/docs/modules/druid/pages/usage-guide/operations/graceful-shutdown.adoc index 1e8e7283..0d7b4d79 100644 --- a/docs/modules/druid/pages/usage-guide/operations/graceful-shutdown.adoc +++ b/docs/modules/druid/pages/usage-guide/operations/graceful-shutdown.adoc @@ -1,7 +1,62 @@ = Graceful shutdown -Graceful shutdown of Druid nodes is either not supported by the product itself -or we have not implemented it yet. +You can configure the graceful shutdown as described in xref:concepts:operations/graceful_shutdown.adoc[]. -Outstanding implementation work for the graceful shutdowns of all products where this functionality is relevant is tracked in -https://github.com/stackabletech/issues/issues/357 +The Druid processes will receive a `SIGTERM` signal when Kubernetes wants to terminate the Pod. +It will log the received signal as shown in the log below and initiate a graceful shutdown. +After the graceful shutdown timeout runs out, and the process still didn't exit, Kubernetes will issue a `SIGKILL` signal. + +== Broker + +As a default, Druid brokers have `5 minutes` to shut down gracefully. + +They use the same mechanism described above. + +[source,text] +---- +aa +---- + +== Coordinator + +As a default, Druid coordinators have `5 minutes` to shut down gracefully. + +They use the same mechanism described above. + +[source,text] +---- +aa +---- + +== MiddleManager + +As a default, Druid middle managers have `5 minutes` to shut down gracefully. + +They use the same mechanism described above. 
+ +[source,text] +---- +aa +---- + +== Router + +As a default, Druid routers have `5 minutes` to shut down gracefully. + +They use the same mechanism described above. + +[source,text] +---- +aa +---- + +== Historical + +As a default, Druid historicals have `5 minutes` to shut down gracefully. + +They use the same mechanism described above. + +[source,text] +---- +aa +---- From 99cd9fd13bfe3234b24cfac755c083aeb610cae1 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Fri, 10 Nov 2023 16:05:32 +0100 Subject: [PATCH 05/14] add lifecycle pre stop action for middle manager --- rust/crd/src/lib.rs | 30 ++----- rust/operator-binary/src/druid_controller.rs | 11 ++- .../src/operations/graceful_shutdown.rs | 78 +++++++++++++++++-- 3 files changed, 87 insertions(+), 32 deletions(-) diff --git a/rust/crd/src/lib.rs b/rust/crd/src/lib.rs index 8c7dfbde..88ddc61f 100644 --- a/rust/crd/src/lib.rs +++ b/rust/crd/src/lib.rs @@ -343,11 +343,7 @@ impl MergedConfig { replicas: rolegroup.replicas, selector: rolegroup.selector.to_owned(), affinity: rolegroup.config.config.affinity.clone(), - graceful_shutdown_timeout: rolegroup - .config - .config - .graceful_shutdown_timeout - .clone(), + graceful_shutdown_timeout: rolegroup.config.config.graceful_shutdown_timeout, }) } DruidRole::Coordinator => { @@ -361,11 +357,7 @@ impl MergedConfig { replicas: rolegroup.replicas, selector: rolegroup.selector.to_owned(), affinity: rolegroup.config.config.affinity.clone(), - graceful_shutdown_timeout: rolegroup - .config - .config - .graceful_shutdown_timeout - .clone(), + graceful_shutdown_timeout: rolegroup.config.config.graceful_shutdown_timeout, }) } DruidRole::Historical => { @@ -381,11 +373,7 @@ impl MergedConfig { replicas: rolegroup.replicas, selector: rolegroup.selector.to_owned(), affinity: rolegroup.config.config.affinity.clone(), - graceful_shutdown_timeout: rolegroup - .config - .config - .graceful_shutdown_timeout - .clone(), + graceful_shutdown_timeout: 
rolegroup.config.config.graceful_shutdown_timeout, }) } DruidRole::MiddleManager => { @@ -399,11 +387,7 @@ impl MergedConfig { replicas: rolegroup.replicas, selector: rolegroup.selector.to_owned(), affinity: rolegroup.config.config.affinity.clone(), - graceful_shutdown_timeout: rolegroup - .config - .config - .graceful_shutdown_timeout - .clone(), + graceful_shutdown_timeout: rolegroup.config.config.graceful_shutdown_timeout, }) } DruidRole::Router => { @@ -417,11 +401,7 @@ impl MergedConfig { replicas: rolegroup.replicas, selector: rolegroup.selector.to_owned(), affinity: rolegroup.config.config.affinity.clone(), - graceful_shutdown_timeout: rolegroup - .config - .config - .graceful_shutdown_timeout - .clone(), + graceful_shutdown_timeout: rolegroup.config.config.graceful_shutdown_timeout, }) } } diff --git a/rust/operator-binary/src/druid_controller.rs b/rust/operator-binary/src/druid_controller.rs index b6165552..fd5807ed 100644 --- a/rust/operator-binary/src/druid_controller.rs +++ b/rust/operator-binary/src/druid_controller.rs @@ -842,8 +842,15 @@ fn build_rolegroup_statefulset( // init pod builder let mut pb = PodBuilder::new(); pb.affinity(&merged_rolegroup_config.affinity); - add_graceful_shutdown_config(merged_rolegroup_config.graceful_shutdown_timeout, &mut pb) - .context(GracefulShutdownSnafu)?; + // TODO: where to put this? 
data is added all over the place :( + add_graceful_shutdown_config( + role, + druid_tls_security, + merged_rolegroup_config.graceful_shutdown_timeout, + &mut pb, + &mut cb_druid, + ) + .context(GracefulShutdownSnafu)?; let mut main_container_commands = role.main_container_prepare_commands(s3_conn); let mut prepare_container_commands = vec![]; diff --git a/rust/operator-binary/src/operations/graceful_shutdown.rs b/rust/operator-binary/src/operations/graceful_shutdown.rs index 69812fb3..1b421be9 100644 --- a/rust/operator-binary/src/operations/graceful_shutdown.rs +++ b/rust/operator-binary/src/operations/graceful_shutdown.rs @@ -1,5 +1,12 @@ +use indoc::formatdoc; use snafu::{ResultExt, Snafu}; -use stackable_operator::{builder::PodBuilder, time::Duration}; +use stackable_druid_crd::security::DruidTlsSecurity; +use stackable_druid_crd::DruidRole; +use stackable_operator::k8s_openapi::api::core::v1::{ExecAction, LifecycleHandler}; +use stackable_operator::{ + builder::{ContainerBuilder, PodBuilder}, + time::Duration, +}; #[derive(Debug, Snafu)] pub enum Error { @@ -10,15 +17,76 @@ pub enum Error { } pub fn add_graceful_shutdown_config( + role: &DruidRole, + tls_security: &DruidTlsSecurity, graceful_shutdown_timeout: Option, pod_builder: &mut PodBuilder, + druid_builder: &mut ContainerBuilder, ) -> Result<(), Error> { // This must be always set by the merge mechanism, as we provide a default value, // users can not disable graceful shutdown. 
- if let Some(graceful_shutdown_timeout) = graceful_shutdown_timeout { - pod_builder - .termination_grace_period(&graceful_shutdown_timeout) - .context(SetTerminationGracePeriodSnafu)?; + if let Some(termination_grace_period) = graceful_shutdown_timeout { + match role { + DruidRole::Coordinator + | DruidRole::Broker + | DruidRole::Historical + | DruidRole::Router => { + pod_builder + .termination_grace_period(&termination_grace_period) + .context(SetTerminationGracePeriodSnafu)?; + } + DruidRole::MiddleManager => { + pod_builder + .termination_grace_period(&termination_grace_period) + .context(SetTerminationGracePeriodSnafu)?; + + let (protocol, port) = if tls_security.tls_enabled() { + ("https", role.get_https_port()) + } else { + ("http", role.get_http_port()) + }; + + let middle_manager_host = format!("{protocol}://127.0.0.1:{port}"); + + druid_builder.lifecycle_pre_stop(LifecycleHandler { + exec: Some(ExecAction { + command: Some(vec![ + "/bin/bash".to_string(), + "-x".to_string(), + "-euo".to_string(), + "pipefail".to_string(), + "-c".to_string(), + // See: https://druid.apache.org/docs/latest/operations/rolling-updates/#rolling-restart-graceful-termination-based + formatdoc!(r#" + echo 'Disable middle manager to stop overlord from sending tasks' >> /proc/1/fd/1 2>&1 + curl -v --fail --insecure -X POST {middle_manager_host}/druid/worker/v1/disable + + end_time_seconds=$(date --date="+{termination_grace_period_seconds} seconds" '+%s') + while : + do + current_time_seconds=$(date '+%s') + echo "Check if termination grace period is reached..." >> /proc/1/fd/1 2>&1 + if [ $current_time_seconds -gt $end_time_seconds ] + then + echo "The termination grace period is reached!" >> /proc/1/fd/1 2>&1 + break + fi + echo "Check if all tasks are finished..." >> /proc/1/fd/1 2>&1 + if [ $(curl -v --fail --insecure -X GET {middle_manager_host}/druid/worker/v1/tasks) = "[]" ] + then + echo "All tasks finished!" 
>> /proc/1/fd/1 2>&1 + break + fi + sleep 2 + done"#, + termination_grace_period_seconds = termination_grace_period.as_secs() + ), + ]), + }), + ..Default::default() + }); + } + } } Ok(()) From 91d513c21c40abdaac26273e47a8573a4c82b673 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Fri, 10 Nov 2023 16:40:42 +0100 Subject: [PATCH 06/14] improve middle manager lifecycle script --- .../src/operations/graceful_shutdown.rs | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/rust/operator-binary/src/operations/graceful_shutdown.rs b/rust/operator-binary/src/operations/graceful_shutdown.rs index 1b421be9..4a888e08 100644 --- a/rust/operator-binary/src/operations/graceful_shutdown.rs +++ b/rust/operator-binary/src/operations/graceful_shutdown.rs @@ -47,6 +47,8 @@ pub fn add_graceful_shutdown_config( }; let middle_manager_host = format!("{protocol}://127.0.0.1:{port}"); + let debug_timestamp = "$(date --utc +%FT%T,%3N) INFO"; + let sleep_interval = 2; druid_builder.lifecycle_pre_stop(LifecycleHandler { exec: Some(ExecAction { @@ -58,27 +60,34 @@ pub fn add_graceful_shutdown_config( "-c".to_string(), // See: https://druid.apache.org/docs/latest/operations/rolling-updates/#rolling-restart-graceful-termination-based formatdoc!(r#" - echo 'Disable middle manager to stop overlord from sending tasks' >> /proc/1/fd/1 2>&1 - curl -v --fail --insecure -X POST {middle_manager_host}/druid/worker/v1/disable + response=$(curl -v --fail --insecure -X POST {middle_manager_host}/druid/worker/v1/disable) + echo "{debug_timestamp} Disable middle manager to stop overlord from sending tasks: $response" >> /proc/1/fd/1 2>&1 end_time_seconds=$(date --date="+{termination_grace_period_seconds} seconds" '+%s') while : do current_time_seconds=$(date '+%s') - echo "Check if termination grace period is reached..." >> /proc/1/fd/1 2>&1 + echo "{debug_timestamp} Check if termination grace period ({termination_grace_period_seconds} seconds) is reached..." 
>> /proc/1/fd/1 2>&1 if [ $current_time_seconds -gt $end_time_seconds ] then - echo "The termination grace period is reached!" >> /proc/1/fd/1 2>&1 + echo "{debug_timestamp} The termination grace period is reached!" >> /proc/1/fd/1 2>&1 break fi - echo "Check if all tasks are finished..." >> /proc/1/fd/1 2>&1 - if [ $(curl -v --fail --insecure -X GET {middle_manager_host}/druid/worker/v1/tasks) = "[]" ] + + tasks=$(curl -v --fail --insecure -X GET {middle_manager_host}/druid/worker/v1/tasks) + echo "{debug_timestamp} Check if all tasks are finished... $tasks" >> /proc/1/fd/1 2>&1 + if [ $tasks = "[]" ] then - echo "All tasks finished!" >> /proc/1/fd/1 2>&1 + echo "{debug_timestamp} All tasks finished!" >> /proc/1/fd/1 2>&1 break fi - sleep 2 - done"#, + + echo "{debug_timestamp} Sleeping {sleep_interval} seconds..." + echo "" + sleep {sleep_interval} + done + echo "{debug_timestamp} All done!" + "#, termination_grace_period_seconds = termination_grace_period.as_secs() ), ]), From c2a4ebf107fa6b4012f04ce8322d4033413286c1 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Mon, 13 Nov 2023 11:59:06 +0100 Subject: [PATCH 07/14] added docs --- .../operations/graceful-shutdown.adoc | 139 ++++++++++++++++-- 1 file changed, 128 insertions(+), 11 deletions(-) diff --git a/docs/modules/druid/pages/usage-guide/operations/graceful-shutdown.adoc b/docs/modules/druid/pages/usage-guide/operations/graceful-shutdown.adoc index 0d7b4d79..b037adcf 100644 --- a/docs/modules/druid/pages/usage-guide/operations/graceful-shutdown.adoc +++ b/docs/modules/druid/pages/usage-guide/operations/graceful-shutdown.adoc @@ -14,7 +14,34 @@ They use the same mechanism described above. 
[source,text] ---- -aa +druid 2023-11-13T10:47:13,194 INFO [Thread-55] org.apache.druid.java.util.common.lifecycle.Lifecycle - Lifecycle [module] running shutdown hook +druid 2023-11-13T10:47:13,196 INFO [Thread-55] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [ANNOUNCEMENTS] +druid 2023-11-13T10:47:13,198 INFO [Thread-55] org.apache.druid.curator.discovery.CuratorServiceAnnouncer - Unannouncing service[DruidNode{serviceName='druid/broker', host='druid-broker-default-0.druid-broker-def +ault.kuttl-test-striking-polliwog.svc.cluster.local', bindOnHost=false, port=-1, plaintextPort=-1, enablePlaintextPort=false, tlsPort=8282, enableTlsPort=true}] +druid 2023-11-13T10:47:13,240 INFO [Thread-55] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/internal-discovery/BROKER/druid-broker-default-0.druid-broker-default.kuttl-test-striking-poll +iwog.svc.cluster.local:8282] +druid 2023-11-13T10:47:13,246 INFO [NodeRoleWatcher[BROKER]] org.apache.druid.discovery.BaseNodeRoleWatcher - Node [https://druid-broker-default-0.druid-broker-default.kuttl-test-striking-polliwog.svc.cluster.loc +al:8282] of role [broker] went offline. +druid 2023-11-13T10:47:13,246 INFO [Thread-55] org.apache.druid.curator.discovery.CuratorDruidNodeAnnouncer - Unannounced self [{"druidNode":{"service":"druid/broker","host":"druid-broker-default-0.druid-broker-d +efault.kuttl-test-striking-polliwog.svc.cluster.local","bindOnHost":false,"plaintextPort":-1,"port":-1,"tlsPort":8282,"enablePlaintextPort":false,"enableTlsPort":true},"nodeType":"broker","services":{"lookupNodeService":{"type":"lookupNodeService","lookupTier":"__default"}},"startTime":"2023-11-13T10:41:11.924Z"}]. 
+druid 2023-11-13T10:47:13,247 WARN [CuratorDruidNodeDiscoveryProvider-ListenerExecutor] org.apache.druid.discovery.DruidNodeDiscoveryProvider$ServiceDruidNodeDiscovery - Node[DiscoveryDruidNode{druidNode=DruidNode{serviceName='druid/broker', host='druid-broker-default-0.druid-broker-default.kuttl-test-striking-polliwog.svc.cluster.local', bindOnHost=false, port=-1, plaintextPort=-1, enablePlaintextPort=false, tlsPort=8282, enableTlsPort=true}, nodeRole='BROKER', services={lookupNodeService=LookupNodeService{lookupTier='__default'}}', startTime=2023-11-13T10:41:11.924Z}] disappeared but was unknown for service listener [dataNodeService]. +druid 2023-11-13T10:47:13,249 INFO [Thread-55] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [SERVER] +druid 2023-11-13T10:47:13,253 INFO [Thread-55] org.eclipse.jetty.server.AbstractConnector - Stopped ServerConnector@79753f20{SSL, (ssl, http/1.1)}{0.0.0.0:8282} +druid 2023-11-13T10:47:13,253 INFO [Thread-55] org.eclipse.jetty.server.session - node0 Stopped scavenging +druid 2023-11-13T10:47:13,254 INFO [Thread-55] org.eclipse.jetty.server.handler.ContextHandler - Stopped o.e.j.s.ServletContextHandler@3269ae62{/,null,STOPPED} +druid 2023-11-13T10:47:13,259 INFO [Thread-55] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [NORMAL] +druid 2023-11-13T10:47:13,259 INFO [Thread-55] org.apache.druid.server.coordination.ZkCoordinator - Stopping ZkCoordinator for [DruidServerMetadata{name='druid-broker-default-0.druid-broker-default.kuttl-test-striking-polliwog.svc.cluster.local:8282', hostAndPort='null', hostAndTlsPort='druid-broker-default-0.druid-broker-default.kuttl-test-striking-polliwog.svc.cluster.local:8282', maxSize=0, tier='_default_tier', type=broker, priority=0}] +druid 2023-11-13T10:47:13,260 INFO [Thread-55] org.apache.druid.server.coordination.SegmentLoadDropHandler - Stopping... 
+druid 2023-11-13T10:47:13,260 INFO [Thread-55] org.apache.druid.server.coordination.SegmentLoadDropHandler - Stopped. +druid 2023-11-13T10:47:13,260 INFO [Thread-55] org.apache.druid.sql.calcite.schema.MetadataSegmentView - MetadataSegmentView is stopping. +druid 2023-11-13T10:47:13,260 INFO [Thread-55] org.apache.druid.sql.calcite.schema.MetadataSegmentView - MetadataSegmentView Stopped. +druid 2023-11-13T10:47:13,260 INFO [DruidSchema-Cache-0] org.apache.druid.sql.calcite.schema.SegmentMetadataCache - Metadata refresh stopped. +druid 2023-11-13T10:47:13,261 INFO [LookupExtractorFactoryContainerProvider-MainThread] org.apache.druid.query.lookup.LookupReferencesManager - Lookup Management loop exited. Lookup notices are not handled anymore. +druid 2023-11-13T10:47:13,261 INFO [Thread-55] org.apache.druid.guice.LifecycleForkJoinPoolProvider - Shutting down ForkJoinPool [org.apache.druid.guice.LifecycleForkJoinPoolProvider@73741c6e] +druid 2023-11-13T10:47:13,262 INFO [Thread-55] org.apache.druid.client.HttpServerInventoryView - Stopping executor[FilteredHttpServerInventoryView]. +druid 2023-11-13T10:47:13,263 INFO [Thread-55] org.apache.druid.client.HttpServerInventoryView - Stopped executor[FilteredHttpServerInventoryView]. 
+druid 2023-11-13T10:47:13,274 ERROR [HttpClient-Netty-Worker-20] com.google.common.util.concurrent.ExecutionList - RuntimeException while executing runnable com.google.common.util.concurrent.Futures$4@6a6cee20 with executor java.util.concurrent.ScheduledThreadPoolExecutor@5bcc07b6[Terminated, pool size = 0, active threads = 0, queued tasks = 0, completed tasks = 6] +druid java.util.concurrent.RejectedExecutionException: Task java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask@7f994562[Not completed, task = java.util.concurrent.Executors$RunnableAdapter@2f6e75a9[Wrapped task = com.google.common.util.concurrent.Futures$4@6a6cee20]] rejected from java.util.concurrent.ScheduledThreadPoolExecutor@5bcc07b6[Terminated, pool size = 0, active threads = 0, queued tasks = 0, completed tasks = 6] ---- == Coordinator @@ -25,38 +52,128 @@ They use the same mechanism described above. [source,text] ---- -aa +druid 2023-11-13T10:12:30,506 INFO [Thread-54] org.apache.druid.java.util.common.lifecycle.Lifecycle - Lifecycle [module] running shutdown hook +druid 2023-11-13T10:12:30,508 INFO [Thread-54] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [ANNOUNCEMENTS] +druid 2023-11-13T10:12:30,512 INFO [Thread-54] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/internal-discovery/OVERLORD/druid-coordinator-default-0.druid-coordinator-default. kuttl-test-striking-polliwog.svc.cluster.local:8281] +druid 2023-11-13T10:12:30,522 INFO [NodeRoleWatcher[OVERLORD]] org.apache.druid.discovery.BaseNodeRoleWatcher - Node [https://druid-coordinator-default-0.druid-coordinator-default. kuttl-test-striking-polliwog.svc.cluster.local:8281] of role [overlord] went offline. +druid 2023-11-13T10:12:30,522 INFO [Thread-54] org.apache.druid.curator.discovery.CuratorDruidNodeAnnouncer - Unannounced self [{"druidNode":{"service":"druid/coordinator","host":"druid-coordinator-default-0.druid-coordinator-default. 
kuttl-test-striking-polliwog.svc.cluster.local","bindOnHost":false,"plaintextPort":-1,"port":-1,"tlsPort":8281,"enablePlaintextPort":false,"enableTlsPort":true},"nodeType":"overlord","services":{},"startTime":"2023-11-13T09:54:04.002Z"}]. +druid 2023-11-13T10:12:30,523 INFO [Thread-54] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/internal-discovery/COORDINATOR/druid-coordinator-default-0.druid-coordinator-default. kuttl-test-striking-polliwog.svc.cluster.local:8281] +druid 2023-11-13T10:12:30,524 INFO [Thread-54] org.apache.druid.curator.discovery.CuratorDruidNodeAnnouncer - Unannounced self [{"druidNode":{"service":"druid/coordinator","host":"druid-coordinator-default-0.druid-coordinator-default. kuttl-test-striking-polliwog.svc.cluster.local","bindOnHost":false,"plaintextPort":-1,"port":-1,"tlsPort":8281,"enablePlaintextPort":false,"enableTlsPort":true},"nodeType":"coordinator","services":{},"startTime":"2023-11-13T09:54:04.002Z"}]. +druid 2023-11-13T10:12:30,600 INFO [Thread-54] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [SERVER] +druid 2023-11-13T10:12:30,604 INFO [Thread-54] org.eclipse.jetty.server.AbstractConnector - Stopped ServerConnector@2ea2f965{SSL, (ssl, http/1.1)}{0.0.0.0:8281} +druid 2023-11-13T10:12:30,605 INFO [Thread-54] org.eclipse.jetty.server.session - node0 Stopped scavenging +druid 2023-11-13T10:12:30,606 INFO [Thread-54] org.eclipse.jetty.server.handler.ContextHandler - Stopped o.e.j.s.ServletContextHandler@26e588b7{/,jar:file:/stackable/apache-druid-27.0.0/lib/web-console-27.0.0.jar!/org/apache/druid/console,STOPPED} +druid 2023-11-13T10:12:30,610 INFO [Thread-54] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [NORMAL] +druid 2023-11-13T10:12:30,610 INFO [Thread-54] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [task-master] stage [ANNOUNCEMENTS] +druid 2023-11-13T10:12:30,610 INFO [Thread-54] 
org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [task-master] stage [SERVER] +druid 2023-11-13T10:12:30,610 INFO [Thread-54] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [task-master] stage [NORMAL] +druid 2023-11-13T10:12:30,610 INFO [Thread-54] org.apache.druid.curator.discovery.CuratorServiceAnnouncer - Unannouncing service[DruidNode{serviceName='druid/overlord', host='druid-coordinator-default-0.druid-coordinator-default. kuttl-test-striking-polliwog.svc.cluster.local', bindOnHost=false, port=-1, plaintextPort=-1, enablePlaintextPort=false, tlsPort=8281, enableTlsPort=true}] +druid 2023-11-13T10:12:30,702 INFO [Thread-54] org.apache.druid.indexing.common.actions.SegmentAllocationQueue - Not leader anymore. Stopping queue processing. +druid 2023-11-13T10:12:30,703 INFO [Thread-54] org.apache.druid.indexing.overlord.duty.OverlordDutyExecutor - Stopping OverlordDutyExecutor. +druid 2023-11-13T10:12:30,703 INFO [Thread-54] org.apache.druid.indexing.overlord.duty.OverlordDutyExecutor - OverlordDutyExecutor has been stopped. +druid 2023-11-13T10:12:30,703 INFO [Thread-54] org.apache.druid.indexing.overlord.supervisor.SupervisorManager - SupervisorManager stopped. +druid 2023-11-13T10:12:30,703 INFO [TaskQueue-Manager] org.apache.druid.indexing.overlord.TaskQueue - Interrupted, exiting! +druid 2023-11-13T10:12:30,703 INFO [Thread-54] org.apache.druid.indexing.overlord.RemoteTaskRunner - Stopping RemoteTaskRunner... +druid 2023-11-13T10:12:30,703 INFO [Thread-54] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [task-master] stage [INIT] +druid 2023-11-13T10:12:30,705 INFO [Thread-54] org.apache.druid.indexing.common.actions.SegmentAllocationQueue - Tearing down segment allocation queue. +druid 2023-11-13T10:12:30,802 INFO [Thread-54] org.apache.druid.client.HttpServerInventoryView - Stopping executor[HttpServerInventoryView]. 
+druid 2023-11-13T10:12:30,802 INFO [Thread-54] org.apache.druid.client.HttpServerInventoryView - Stopped executor[HttpServerInventoryView]. +druid 2023-11-13T10:12:30,802 INFO [Thread-54] org.apache.druid.security.basic.authorization.db.updater.CoordinatorBasicAuthorizerMetadataStorageUpdater - CoordinatorBasicAuthorizerMetadataStorageUpdater is stopping. +druid 2023-11-13T10:12:30,802 INFO [Thread-54] org.apache.druid.security.basic.authorization.db.updater.CoordinatorBasicAuthorizerMetadataStorageUpdater - CoordinatorBasicAuthorizerMetadataStorageUpdater is stopped. +druid 2023-11-13T10:12:30,802 INFO [Thread-54] org.apache.druid.security.basic.authentication.db.updater.CoordinatorBasicAuthenticatorMetadataStorageUpdater - CoordinatorBasicAuthenticatorMetadataStorageUpdater is stopping. +druid 2023-11-13T10:12:30,802 INFO [Thread-54] org.apache.druid.security.basic.authentication.db.updater.CoordinatorBasicAuthenticatorMetadataStorageUpdater - CoordinatorBasicAuthenticatorMetadataStorageUpdater is stopped. ---- -== MiddleManager +== Historical -As a default, Druid middle managers have `5 minutes` to shut down gracefully. +As a default, Druid historicals have `5 minutes` to shut down gracefully. They use the same mechanism described above. 
[source,text] ---- -aa +druid 2023-11-13T10:56:54,057 INFO [Thread-55] org.apache.druid.java.util.common.lifecycle.Lifecycle - Lifecycle [module] running shutdown hook +druid 2023-11-13T10:56:54,059 INFO [Thread-55] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [ANNOUNCEMENTS] +druid 2023-11-13T10:56:54,062 INFO [Thread-55] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/internal-discovery/HISTORICAL/druid-historical-default-0.druid-historical-default.kuttl-test-striking-polliwog.svc.cluster.local:8283] +druid 2023-11-13T10:56:54,072 INFO [Thread-55] org.apache.druid.curator.discovery.CuratorDruidNodeAnnouncer - Unannounced self [{"druidNode":{"service":"druid/historical","host":"druid-historical-default-0.druid-historical-default.kuttl-test-striking-polliwog.svc.cluster.local","bindOnHost":false,"plaintextPort":-1,"port":-1,"tlsPort":8283,"enablePlaintextPort":false,"enableTlsPort":true},"nodeType":"historical","services":{"dataNodeService":{"type":"dataNodeService","tier":"_default_tier","maxSize":1000000000,"type":"historical","serverType":"historical","priority":0},"lookupNodeService":{"type":"lookupNodeService","lookupTier":"__default"}},"startTime":"2023-11-13T10:41:10.024Z"}]. 
+druid 2023-11-13T10:56:54,081 INFO [Thread-55] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/segments/druid-historical-default-0.druid-historical-default.kuttl-test-striking-polliwog.svc.cluster.local:8283/druid-historical-default-0.druid-historical-default.kuttl-test-striking-polliwog.svc.cluster.local:8283_historical__default_tier_2023-11-13T10:42:12.401Z_07a7108a06df494b8f8d7c01c841384a0] +druid 2023-11-13T10:56:54,083 INFO [Thread-55] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/announcements/druid-historical-default-0.druid-historical-default.kuttl-test-striking-polliwog.svc.cluster.local:8283] +druid 2023-11-13T10:56:54,084 INFO [Thread-55] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [SERVER] +druid 2023-11-13T10:56:54,089 INFO [Thread-55] org.eclipse.jetty.server.AbstractConnector - Stopped ServerConnector@1a632663{SSL, (ssl, http/1.1)}{0.0.0.0:8283} +druid 2023-11-13T10:56:54,089 INFO [Thread-55] org.eclipse.jetty.server.session - node0 Stopped scavenging +druid 2023-11-13T10:56:54,090 INFO [Thread-55] org.eclipse.jetty.server.handler.ContextHandler - Stopped o.e.j.s.ServletContextHandler@3b9d85c2{/,null,STOPPED} +druid 2023-11-13T10:56:54,093 INFO [Thread-55] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [NORMAL] +druid 2023-11-13T10:56:54,094 INFO [Thread-55] org.apache.druid.server.coordination.ZkCoordinator - Stopping ZkCoordinator for [DruidServerMetadata{name='druid-historical-default-0.druid-historical-default.kuttl-test-striking-polliwog.svc.cluster.local:8283', hostAndPort='null', hostAndTlsPort='druid-historical-default-0.druid-historical-default.kuttl-test-striking-polliwog.svc.cluster.local:8283', maxSize=1000000000, tier='_default_tier', type=historical, priority=0}] +druid 2023-11-13T10:56:54,094 INFO [Thread-55] org.apache.druid.server.coordination.SegmentLoadDropHandler - Stopping... 
+druid 2023-11-13T10:56:54,094 INFO [Thread-55] org.apache.druid.server.coordination.SegmentLoadDropHandler - Stopped. +druid 2023-11-13T10:56:54,094 INFO [LookupExtractorFactoryContainerProvider-MainThread] org.apache.druid.query.lookup.LookupReferencesManager - Lookup Management loop exited. Lookup notices are not handled anymore. +druid 2023-11-13T10:56:54,096 INFO [Thread-55] org.apache.druid.security.basic.authorization.db.cache.CoordinatorPollingBasicAuthorizerCacheManager - CoordinatorPollingBasicAuthorizerCacheManager is stopping. +druid 2023-11-13T10:56:54,096 INFO [Thread-55] org.apache.druid.security.basic.authorization.db.cache.CoordinatorPollingBasicAuthorizerCacheManager - CoordinatorPollingBasicAuthorizerCacheManager is stopped. +druid 2023-11-13T10:56:54,096 INFO [Thread-55] org.apache.druid.security.basic.authentication.db.cache.CoordinatorPollingBasicAuthenticatorCacheManager - CoordinatorPollingBasicAuthenticatorCacheManager is stopping. +druid 2023-11-13T10:56:54,096 INFO [Thread-55] org.apache.druid.security.basic.authentication.db.cache.CoordinatorPollingBasicAuthenticatorCacheManager - CoordinatorPollingBasicAuthenticatorCacheManager is stopped. +druid 2023-11-13T10:56:54,108 INFO [Curator-Framework-0] org.apache.curator.framework.imps.CuratorFrameworkImpl - backgroundOperationsLoop exiting +druid 2023-11-13T10:56:54,212 INFO [Thread-55] org.apache.zookeeper.ZooKeeper - Session: 0x10011760e0e0007 closed +druid 2023-11-13T10:56:54,212 INFO [main-EventThread] org.apache.zookeeper.ClientCnxn - EventThread shut down for session: 0x10011760e0e0007 +druid 2023-11-13T10:56:54,212 INFO [Thread-55] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [INIT] ---- -== Router +== MiddleManager -As a default, Druid routers have `5 minutes` to shut down gracefully. +As a default, Druid middle managers have `5 minutes` to shut down gracefully. They use the same mechanism described above. 
[source,text] ---- -aa +druid 2023-11-13T10:42:50,652 INFO [Thread-52] org.apache.druid.java.util.common.lifecycle.Lifecycle - Lifecycle [module] running shutdown hook +druid 2023-11-13T10:42:50,654 INFO [Thread-52] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [ANNOUNCEMENTS] +druid 2023-11-13T10:42:50,656 INFO [Thread-52] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/internal-discovery/MIDDLE_MANAGER/druid-middlemanager-default-0.druid-middlemanager-default.kuttl-test-striking-polliwog.svc.cluster.local:8291] +druid 2023-11-13T10:42:50,671 INFO [Thread-52] org.apache.druid.curator.discovery.CuratorDruidNodeAnnouncer - Unannounced self [{"druidNode":{"service":"druid/middlemanager","host":"druid-middlemanager-default-0.druid-middlemanager-default.kuttl-test-striking-polliwog.svc.cluster.local","bindOnHost":false,"plaintextPort":-1,"port":-1,"tlsPort":8291,"enablePlaintextPort":false,"enableTlsPort":true},"nodeType":"middleManager","services":{"workerNodeService":{"type":"workerNodeService","ip":"druid-middlemanager-default-0.druid-middlemanager-default.kuttl-test-striking-polliwog.svc.cluster.local","capacity":1,"version":"0","category":"_default_worker_category"}},"startTime":"2023-11-13T10:41:10.341Z"}]. 
+druid 2023-11-13T10:42:50,675 INFO [Thread-52] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [SERVER] +druid 2023-11-13T10:42:50,677 INFO [Thread-52] org.eclipse.jetty.server.AbstractConnector - Stopped ServerConnector@2f51b100{SSL, (ssl, http/1.1)}{0.0.0.0:8291} +druid 2023-11-13T10:42:50,677 INFO [Thread-52] org.eclipse.jetty.server.session - node0 Stopped scavenging +druid 2023-11-13T10:42:50,679 INFO [Thread-52] org.eclipse.jetty.server.handler.ContextHandler - Stopped o.e.j.s.ServletContextHandler@28705150{/,null,STOPPED} +druid 2023-11-13T10:42:50,683 INFO [Thread-52] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [NORMAL] +druid 2023-11-13T10:42:50,684 INFO [Thread-52] org.apache.druid.indexing.overlord.ForkingTaskRunner - Waiting up to 300,000ms for shutdown. +druid 2023-11-13T10:42:50,685 INFO [Thread-52] org.apache.druid.indexing.overlord.ForkingTaskRunner - Finished stopping in 2ms. +druid 2023-11-13T10:42:50,685 INFO [Thread-52] org.apache.druid.indexing.worker.WorkerCuratorCoordinator - Stopping WorkerCuratorCoordinator for worker[druid-middlemanager-default-0.druid-middlemanager-default.kuttl-test-striking-polliwog.svc.cluster.local:8291] +druid 2023-11-13T10:42:50,686 INFO [Thread-52] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/indexer/announcements/druid-middlemanager-default-0.druid-middlemanager-default.kuttl-test-striking-polliwog.svc.cluster.local:8291] +druid 2023-11-13T10:42:50,688 INFO [Thread-52] org.apache.druid.indexing.overlord.ForkingTaskRunner - Waiting up to 300,000ms for shutdown. +druid 2023-11-13T10:42:50,688 INFO [Thread-52] org.apache.druid.indexing.overlord.ForkingTaskRunner - Finished stopping in 0ms. +druid 2023-11-13T10:42:50,688 INFO [Thread-52] org.apache.druid.security.basic.authorization.db.cache.CoordinatorPollingBasicAuthorizerCacheManager - CoordinatorPollingBasicAuthorizerCacheManager is stopping. 
+druid 2023-11-13T10:42:50,689 INFO [Thread-52] org.apache.druid.security.basic.authorization.db.cache.CoordinatorPollingBasicAuthorizerCacheManager - CoordinatorPollingBasicAuthorizerCacheManager is stopped. +druid 2023-11-13T10:42:50,689 INFO [Thread-52] org.apache.druid.security.basic.authentication.db.cache.CoordinatorPollingBasicAuthenticatorCacheManager - CoordinatorPollingBasicAuthenticatorCacheManager is stopping. +druid 2023-11-13T10:42:50,689 INFO [Thread-52] org.apache.druid.security.basic.authentication.db.cache.CoordinatorPollingBasicAuthenticatorCacheManager - CoordinatorPollingBasicAuthenticatorCacheManager is stopped. +druid 2023-11-13T10:42:50,704 INFO [Curator-Framework-0] org.apache.curator.framework.imps.CuratorFrameworkImpl - backgroundOperationsLoop exiting +druid 2023-11-13T10:42:50,808 INFO [Thread-52] org.apache.zookeeper.ZooKeeper - Session: 0x10011760e0e0008 closed +druid 2023-11-13T10:42:50,808 INFO [main-EventThread] org.apache.zookeeper.ClientCnxn - EventThread shut down for session: 0x10011760e0e0008 +druid 2023-11-13T10:42:50,808 INFO [Thread-52] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [INIT] ---- -== Historical +== Router -As a default, Druid historicals have `5 minutes` to shut down gracefully. +As a default, Druid routers have `5 minutes` to shut down gracefully. They use the same mechanism described above. 
[source,text] ---- -aa +druid 2023-11-13T10:53:13,401 INFO [Thread-70] org.apache.druid.java.util.common.lifecycle.Lifecycle - Lifecycle [module] running shutdown hook +druid 2023-11-13T10:53:13,403 INFO [Thread-70] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [ANNOUNCEMENTS] +druid 2023-11-13T10:53:13,406 INFO [Thread-70] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/internal-discovery/ROUTER/druid-router-default-0.druid-router-default.kuttl-test-striking-polliwog.svc.cluster.local:9088] +druid 2023-11-13T10:53:13,501 INFO [Thread-70] org.apache.druid.curator.discovery.CuratorDruidNodeAnnouncer - Unannounced self [{"druidNode":{"service":"druid/router","host":"druid-router-default-0.druid-router-default.kuttl-test-striking-polliwog.svc.cluster.local","bindOnHost":false,"plaintextPort":-1,"port":-1,"tlsPort":9088,"enablePlaintextPort":false,"enableTlsPort":true},"nodeType":"router","services":{},"startTime":"2023-11-13T10:41:23.188Z"}]. 
+druid 2023-11-13T10:53:13,501 INFO [Thread-70] org.apache.druid.curator.discovery.CuratorServiceAnnouncer - Unannouncing service[DruidNode{serviceName='druid/router', host='druid-router-default-0.druid-router-default.kuttl-test-striking-polliwog.svc.cluster.local', bindOnHost=false, port=-1, plaintextPort=-1, enablePlaintextPort=false, tlsPort=9088, enableTlsPort=true}] +druid 2023-11-13T10:53:13,587 INFO [Thread-70] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [SERVER] +druid 2023-11-13T10:53:13,591 INFO [Thread-70] org.eclipse.jetty.server.AbstractConnector - Stopped ServerConnector@77732366{SSL, (ssl, http/1.1)}{0.0.0.0:9088} +druid 2023-11-13T10:53:13,591 INFO [Thread-70] org.eclipse.jetty.server.session - node0 Stopped scavenging +druid 2023-11-13T10:53:13,596 INFO [Thread-70] org.eclipse.jetty.server.handler.ContextHandler - Stopped o.e.j.s.ServletContextHandler@487f025{/,jar:file:/stackable/apache-druid-27.0.0/lib/web-console-27.0.0.jar!/org/apache/druid/console,STOPPED} +druid 2023-11-13T10:53:13,687 INFO [Thread-70] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [NORMAL] +druid 2023-11-13T10:53:13,688 INFO [Thread-70] org.apache.druid.security.basic.authorization.db.cache.CoordinatorPollingBasicAuthorizerCacheManager - CoordinatorPollingBasicAuthorizerCacheManager is stopping. +druid 2023-11-13T10:53:13,688 INFO [Thread-70] org.apache.druid.security.basic.authorization.db.cache.CoordinatorPollingBasicAuthorizerCacheManager - CoordinatorPollingBasicAuthorizerCacheManager is stopped. +druid 2023-11-13T10:53:13,688 INFO [Thread-70] org.apache.druid.security.basic.authentication.db.cache.CoordinatorPollingBasicAuthenticatorCacheManager - CoordinatorPollingBasicAuthenticatorCacheManager is stopping. 
+druid 2023-11-13T10:53:13,688 INFO [Thread-70] org.apache.druid.security.basic.authentication.db.cache.CoordinatorPollingBasicAuthenticatorCacheManager - CoordinatorPollingBasicAuthenticatorCacheManager is stopped. +druid 2023-11-13T10:53:13,790 INFO [Curator-Framework-0] org.apache.curator.framework.imps.CuratorFrameworkImpl - backgroundOperationsLoop exiting +druid 2023-11-13T10:53:13,895 INFO [Thread-70] org.apache.zookeeper.ZooKeeper - Session: 0x10011760e0e000a closed +druid 2023-11-13T10:53:13,895 INFO [main-EventThread] org.apache.zookeeper.ClientCnxn - EventThread shut down for session: 0x10011760e0e000a +druid 2023-11-13T10:53:13,895 INFO [Thread-70] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [INIT] ---- From 6f669c236ac3e647df400ba1b186d5b490804211 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Mon, 13 Nov 2023 11:59:22 +0100 Subject: [PATCH 08/14] improve lifecycle pre stop script --- .../src/operations/graceful_shutdown.rs | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/rust/operator-binary/src/operations/graceful_shutdown.rs b/rust/operator-binary/src/operations/graceful_shutdown.rs index 4a888e08..b3fa430c 100644 --- a/rust/operator-binary/src/operations/graceful_shutdown.rs +++ b/rust/operator-binary/src/operations/graceful_shutdown.rs @@ -47,9 +47,14 @@ pub fn add_graceful_shutdown_config( }; let middle_manager_host = format!("{protocol}://127.0.0.1:{port}"); - let debug_timestamp = "$(date --utc +%FT%T,%3N) INFO"; + let debug_message = + "$(date --utc +%FT%T,%3N) INFO [stackable_lifecycle_pre_stop] -"; let sleep_interval = 2; + // The middle manager can be terminated gracefully by disabling it, meaning + // the overlord will not send any new tasks and it will terminate after + // all tasks are finished or the termination grace period is exceeded. 
+ // See: https://druid.apache.org/docs/latest/operations/rolling-updates/#rolling-restart-graceful-termination-based druid_builder.lifecycle_pre_stop(LifecycleHandler { exec: Some(ExecAction { command: Some(vec![ @@ -58,35 +63,36 @@ pub fn add_graceful_shutdown_config( "-euo".to_string(), "pipefail".to_string(), "-c".to_string(), - // See: https://druid.apache.org/docs/latest/operations/rolling-updates/#rolling-restart-graceful-termination-based formatdoc!(r#" + log() {{ echo "{debug_message} $1" >> /proc/1/fd/1 2>&1 }} + response=$(curl -v --fail --insecure -X POST {middle_manager_host}/druid/worker/v1/disable) - echo "{debug_timestamp} Disable middle manager to stop overlord from sending tasks: $response" >> /proc/1/fd/1 2>&1 + log "Disable middle manager to stop overlord from sending tasks: $response" end_time_seconds=$(date --date="+{termination_grace_period_seconds} seconds" '+%s') while : do current_time_seconds=$(date '+%s') - echo "{debug_timestamp} Check if termination grace period ({termination_grace_period_seconds} seconds) is reached..." >> /proc/1/fd/1 2>&1 + log "Check if termination grace period ({termination_grace_period_seconds} seconds) is reached..." if [ $current_time_seconds -gt $end_time_seconds ] then - echo "{debug_timestamp} The termination grace period is reached!" >> /proc/1/fd/1 2>&1 + log "The termination grace period is reached!" break fi tasks=$(curl -v --fail --insecure -X GET {middle_manager_host}/druid/worker/v1/tasks) - echo "{debug_timestamp} Check if all tasks are finished... $tasks" >> /proc/1/fd/1 2>&1 + log "Check if all tasks are finished... Running: $tasks" if [ $tasks = "[]" ] then - echo "{debug_timestamp} All tasks finished!" >> /proc/1/fd/1 2>&1 + log "All tasks finished!" break fi - echo "{debug_timestamp} Sleeping {sleep_interval} seconds..." - echo "" + log "Sleeping {sleep_interval} seconds..." + log "" sleep {sleep_interval} done - echo "{debug_timestamp} All done!" + log "All done!" 
"#, termination_grace_period_seconds = termination_grace_period.as_secs() ), From a456dbb9c3c6757b740256be5df9c32ff205c259 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Mon, 13 Nov 2023 12:02:11 +0100 Subject: [PATCH 09/14] adapted changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e4fd7796..035d055a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ All notable changes to this project will be documented in this file. - Support PodDisruptionBudgets ([#477]). - Add support for version 27.0.0 ([#480]). - Add integration test for OpenID Connect with Keycloak ([#481]). +- Support graceful shutdown ([#486]). ### Changed @@ -29,6 +30,7 @@ All notable changes to this project will be documented in this file. [#477]: https://github.com/stackabletech/druid-operator/pull/477 [#480]: https://github.com/stackabletech/druid-operator/pull/480 [#481]: https://github.com/stackabletech/druid-operator/pull/481 +[#486]: https://github.com/stackabletech/druid-operator/pull/486 ## [23.7.0] - 2023-07-14 From b7aeef01602746b684e12cd5411da43b81a8b5b3 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Mon, 13 Nov 2023 13:43:38 +0100 Subject: [PATCH 10/14] fix logging of pre stop script --- rust/crd/src/lib.rs | 4 +++- rust/operator-binary/src/operations/graceful_shutdown.rs | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/rust/crd/src/lib.rs b/rust/crd/src/lib.rs index 88ddc61f..82790290 100644 --- a/rust/crd/src/lib.rs +++ b/rust/crd/src/lib.rs @@ -528,12 +528,14 @@ impl DruidRole { } pub fn main_container_start_command(&self) -> String { + // We need to store the druid process PID for the graceful shutdown lifecycle pre stop hook. formatdoc! {" {COMMON_BASH_TRAP_FUNCTIONS} {remove_vector_shutdown_file_command} prepare_signal_handlers /stackable/druid/bin/run-druid {process_name} {RW_CONFIG_DIRECTORY} & - wait_for_termination $! 
+ echo \"$!\" >> /tmp/DRUID_PID + wait_for_termination $(cat /tmp/DRUID_PID) {create_vector_shutdown_file_command} ", process_name = self.get_process_name(), diff --git a/rust/operator-binary/src/operations/graceful_shutdown.rs b/rust/operator-binary/src/operations/graceful_shutdown.rs index b3fa430c..03f7909d 100644 --- a/rust/operator-binary/src/operations/graceful_shutdown.rs +++ b/rust/operator-binary/src/operations/graceful_shutdown.rs @@ -64,7 +64,9 @@ pub fn add_graceful_shutdown_config( "pipefail".to_string(), "-c".to_string(), formatdoc!(r#" - log() {{ echo "{debug_message} $1" >> /proc/1/fd/1 2>&1 }} + log() {{ + echo "{debug_message} $1" >> /proc/$(cat /tmp/DRUID_PID)/fd/1 2>&1 + }} response=$(curl -v --fail --insecure -X POST {middle_manager_host}/druid/worker/v1/disable) log "Disable middle manager to stop overlord from sending tasks: $response" From 70a7fe9e1736a20b1c4a37cb8da777fc730cb847 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Wed, 15 Nov 2023 13:30:55 +0100 Subject: [PATCH 11/14] improve docs --- .../druid/pages/usage-guide/operations/graceful-shutdown.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/modules/druid/pages/usage-guide/operations/graceful-shutdown.adoc b/docs/modules/druid/pages/usage-guide/operations/graceful-shutdown.adoc index b037adcf..38ae2237 100644 --- a/docs/modules/druid/pages/usage-guide/operations/graceful-shutdown.adoc +++ b/docs/modules/druid/pages/usage-guide/operations/graceful-shutdown.adoc @@ -121,7 +121,7 @@ druid 2023-11-13T10:56:54,212 INFO [Thread-55] org.apache.druid.java.util.common As a default, Druid middle managers have `5 minutes` to shut down gracefully. -They use the same mechanism described above. +The middle manager can be terminated gracefully by disabling it, meaning the overlord will not send any new tasks and the middle manager will terminate after all tasks are finished or the termination grace period is exceeded. 
[source,text] ---- From c6ec6ba6a9f40a4c27eaeee063c55e610846961b Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Wed, 15 Nov 2023 13:31:01 +0100 Subject: [PATCH 12/14] remove todo --- rust/operator-binary/src/druid_controller.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/rust/operator-binary/src/druid_controller.rs b/rust/operator-binary/src/druid_controller.rs index fd5807ed..aa50e072 100644 --- a/rust/operator-binary/src/druid_controller.rs +++ b/rust/operator-binary/src/druid_controller.rs @@ -842,7 +842,6 @@ fn build_rolegroup_statefulset( // init pod builder let mut pb = PodBuilder::new(); pb.affinity(&merged_rolegroup_config.affinity); - // TODO: where to put this? data is added all over the place :( add_graceful_shutdown_config( role, druid_tls_security, From 939f3d5ad88cb9955604f85bbeff24017435fe01 Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Wed, 15 Nov 2023 14:25:22 +0100 Subject: [PATCH 13/14] improved docs --- .../usage-guide/operations/graceful-shutdown.adoc | 10 ++++++++++ .../src/operations/graceful_shutdown.rs | 1 + 2 files changed, 11 insertions(+) diff --git a/docs/modules/druid/pages/usage-guide/operations/graceful-shutdown.adoc b/docs/modules/druid/pages/usage-guide/operations/graceful-shutdown.adoc index 38ae2237..a5f6239c 100644 --- a/docs/modules/druid/pages/usage-guide/operations/graceful-shutdown.adoc +++ b/docs/modules/druid/pages/usage-guide/operations/graceful-shutdown.adoc @@ -125,6 +125,16 @@ The middle manager can be terminated gracefully by disabling it. Meaning the ove [source,text] ---- +druid 2023-11-13T10:42:50,569 INFO [stackable_lifecycle_pre_stop] - Disable middle manager to stop overlord from sending tasks: {"test-druid-middlemanager-automatic-log-config-0.test-druid-middlemanager-automatic-log-config.kuttl-test-on-ghost.svc.cluster.local:8291":"disabled"} +druid 2023-11-13T10:42:50,578 INFO [stackable_lifecycle_pre_stop] - Check if termination grace period (300 seconds) is reached... 
+druid 2023-11-13T10:42:50,586 INFO [stackable_lifecycle_pre_stop] - Check if all tasks are finished... Running: [] +druid 2023-11-13T10:42:50,591 INFO [stackable_lifecycle_pre_stop] - All tasks finished! +druid 2023-11-13T10:42:50,605 INFO [stackable_lifecycle_pre_stop] - All done! +druid ++ handle_term_signal +druid ++ '[' 11 ']' +druid ++ kill -TERM 11 +druid + trap - TERM +druid + wait 11 druid 2023-11-13T10:42:50,652 INFO [Thread-52] org.apache.druid.java.util.common.lifecycle.Lifecycle - Lifecycle [module] running shutdown hook druid 2023-11-13T10:42:50,654 INFO [Thread-52] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [ANNOUNCEMENTS] druid 2023-11-13T10:42:50,656 INFO [Thread-52] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/internal-discovery/MIDDLE_MANAGER/druid-middlemanager-default-0.druid-middlemanager-default.kuttl-test-striking-polliwog.svc.cluster.local:8291] diff --git a/rust/operator-binary/src/operations/graceful_shutdown.rs b/rust/operator-binary/src/operations/graceful_shutdown.rs index 03f7909d..c9b2fe8c 100644 --- a/rust/operator-binary/src/operations/graceful_shutdown.rs +++ b/rust/operator-binary/src/operations/graceful_shutdown.rs @@ -55,6 +55,7 @@ pub fn add_graceful_shutdown_config( // the overlord will not send any new tasks and it will terminate after // all tasks are finished or the termination grace period is exceeded. // See: https://druid.apache.org/docs/latest/operations/rolling-updates/#rolling-restart-graceful-termination-based + // The DRUID_PID is set in the crd/src/lib.rs `main_container_start_command` method. 
druid_builder.lifecycle_pre_stop(LifecycleHandler { exec: Some(ExecAction { command: Some(vec![ From 4f889ae68096fb771bb54408f656197f50fa515a Mon Sep 17 00:00:00 2001 From: Malte Sander Date: Wed, 15 Nov 2023 14:27:26 +0100 Subject: [PATCH 14/14] fix docs --- .../operations/graceful-shutdown.adoc | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/docs/modules/druid/pages/usage-guide/operations/graceful-shutdown.adoc b/docs/modules/druid/pages/usage-guide/operations/graceful-shutdown.adoc index a5f6239c..fb497ab7 100644 --- a/docs/modules/druid/pages/usage-guide/operations/graceful-shutdown.adoc +++ b/docs/modules/druid/pages/usage-guide/operations/graceful-shutdown.adoc @@ -17,20 +17,20 @@ They use the same mechanism described above. druid 2023-11-13T10:47:13,194 INFO [Thread-55] org.apache.druid.java.util.common.lifecycle.Lifecycle - Lifecycle [module] running shutdown hook druid 2023-11-13T10:47:13,196 INFO [Thread-55] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [ANNOUNCEMENTS] druid 2023-11-13T10:47:13,198 INFO [Thread-55] org.apache.druid.curator.discovery.CuratorServiceAnnouncer - Unannouncing service[DruidNode{serviceName='druid/broker', host='druid-broker-default-0.druid-broker-def -ault.kuttl-test-striking-polliwog.svc.cluster.local', bindOnHost=false, port=-1, plaintextPort=-1, enablePlaintextPort=false, tlsPort=8282, enableTlsPort=true}] +ault.default.svc.cluster.local', bindOnHost=false, port=-1, plaintextPort=-1, enablePlaintextPort=false, tlsPort=8282, enableTlsPort=true}] druid 2023-11-13T10:47:13,240 INFO [Thread-55] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/internal-discovery/BROKER/druid-broker-default-0.druid-broker-default.kuttl-test-striking-poll iwog.svc.cluster.local:8282] -druid 2023-11-13T10:47:13,246 INFO [NodeRoleWatcher[BROKER]] org.apache.druid.discovery.BaseNodeRoleWatcher - Node 
[https://druid-broker-default-0.druid-broker-default.kuttl-test-striking-polliwog.svc.cluster.loc +druid 2023-11-13T10:47:13,246 INFO [NodeRoleWatcher[BROKER]] org.apache.druid.discovery.BaseNodeRoleWatcher - Node [https://druid-broker-default-0.druid-broker-default.default.svc.cluster.loc al:8282] of role [broker] went offline. druid 2023-11-13T10:47:13,246 INFO [Thread-55] org.apache.druid.curator.discovery.CuratorDruidNodeAnnouncer - Unannounced self [{"druidNode":{"service":"druid/broker","host":"druid-broker-default-0.druid-broker-d -efault.kuttl-test-striking-polliwog.svc.cluster.local","bindOnHost":false,"plaintextPort":-1,"port":-1,"tlsPort":8282,"enablePlaintextPort":false,"enableTlsPort":true},"nodeType":"broker","services":{"lookupNodeService":{"type":"lookupNodeService","lookupTier":"__default"}},"startTime":"2023-11-13T10:41:11.924Z"}]. -druid 2023-11-13T10:47:13,247 WARN [CuratorDruidNodeDiscoveryProvider-ListenerExecutor] org.apache.druid.discovery.DruidNodeDiscoveryProvider$ServiceDruidNodeDiscovery - Node[DiscoveryDruidNode{druidNode=DruidNode{serviceName='druid/broker', host='druid-broker-default-0.druid-broker-default.kuttl-test-striking-polliwog.svc.cluster.local', bindOnHost=false, port=-1, plaintextPort=-1, enablePlaintextPort=false, tlsPort=8282, enableTlsPort=true}, nodeRole='BROKER', services={lookupNodeService=LookupNodeService{lookupTier='__default'}}', startTime=2023-11-13T10:41:11.924Z}] disappeared but was unknown for service listener [dataNodeService]. +efault.default.svc.cluster.local","bindOnHost":false,"plaintextPort":-1,"port":-1,"tlsPort":8282,"enablePlaintextPort":false,"enableTlsPort":true},"nodeType":"broker","services":{"lookupNodeService":{"type":"lookupNodeService","lookupTier":"__default"}},"startTime":"2023-11-13T10:41:11.924Z"}]. 
+druid 2023-11-13T10:47:13,247 WARN [CuratorDruidNodeDiscoveryProvider-ListenerExecutor] org.apache.druid.discovery.DruidNodeDiscoveryProvider$ServiceDruidNodeDiscovery - Node[DiscoveryDruidNode{druidNode=DruidNode{serviceName='druid/broker', host='druid-broker-default-0.druid-broker-default.default.svc.cluster.local', bindOnHost=false, port=-1, plaintextPort=-1, enablePlaintextPort=false, tlsPort=8282, enableTlsPort=true}, nodeRole='BROKER', services={lookupNodeService=LookupNodeService{lookupTier='__default'}}', startTime=2023-11-13T10:41:11.924Z}] disappeared but was unknown for service listener [dataNodeService]. druid 2023-11-13T10:47:13,249 INFO [Thread-55] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [SERVER] druid 2023-11-13T10:47:13,253 INFO [Thread-55] org.eclipse.jetty.server.AbstractConnector - Stopped ServerConnector@79753f20{SSL, (ssl, http/1.1)}{0.0.0.0:8282} druid 2023-11-13T10:47:13,253 INFO [Thread-55] org.eclipse.jetty.server.session - node0 Stopped scavenging druid 2023-11-13T10:47:13,254 INFO [Thread-55] org.eclipse.jetty.server.handler.ContextHandler - Stopped o.e.j.s.ServletContextHandler@3269ae62{/,null,STOPPED} druid 2023-11-13T10:47:13,259 INFO [Thread-55] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [NORMAL] -druid 2023-11-13T10:47:13,259 INFO [Thread-55] org.apache.druid.server.coordination.ZkCoordinator - Stopping ZkCoordinator for [DruidServerMetadata{name='druid-broker-default-0.druid-broker-default.kuttl-test-striking-polliwog.svc.cluster.local:8282', hostAndPort='null', hostAndTlsPort='druid-broker-default-0.druid-broker-default.kuttl-test-striking-polliwog.svc.cluster.local:8282', maxSize=0, tier='_default_tier', type=broker, priority=0}] +druid 2023-11-13T10:47:13,259 INFO [Thread-55] org.apache.druid.server.coordination.ZkCoordinator - Stopping ZkCoordinator for 
[DruidServerMetadata{name='druid-broker-default-0.druid-broker-default.default.svc.cluster.local:8282', hostAndPort='null', hostAndTlsPort='druid-broker-default-0.druid-broker-default.default.svc.cluster.local:8282', maxSize=0, tier='_default_tier', type=broker, priority=0}] druid 2023-11-13T10:47:13,260 INFO [Thread-55] org.apache.druid.server.coordination.SegmentLoadDropHandler - Stopping... druid 2023-11-13T10:47:13,260 INFO [Thread-55] org.apache.druid.server.coordination.SegmentLoadDropHandler - Stopped. druid 2023-11-13T10:47:13,260 INFO [Thread-55] org.apache.druid.sql.calcite.schema.MetadataSegmentView - MetadataSegmentView is stopping. @@ -54,11 +54,11 @@ They use the same mechanism described above. ---- druid 2023-11-13T10:12:30,506 INFO [Thread-54] org.apache.druid.java.util.common.lifecycle.Lifecycle - Lifecycle [module] running shutdown hook druid 2023-11-13T10:12:30,508 INFO [Thread-54] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [ANNOUNCEMENTS] -druid 2023-11-13T10:12:30,512 INFO [Thread-54] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/internal-discovery/OVERLORD/druid-coordinator-default-0.druid-coordinator-default. kuttl-test-striking-polliwog.svc.cluster.local:8281] -druid 2023-11-13T10:12:30,522 INFO [NodeRoleWatcher[OVERLORD]] org.apache.druid.discovery.BaseNodeRoleWatcher - Node [https://druid-coordinator-default-0.druid-coordinator-default. kuttl-test-striking-polliwog.svc.cluster.local:8281] of role [overlord] went offline. -druid 2023-11-13T10:12:30,522 INFO [Thread-54] org.apache.druid.curator.discovery.CuratorDruidNodeAnnouncer - Unannounced self [{"druidNode":{"service":"druid/coordinator","host":"druid-coordinator-default-0.druid-coordinator-default. 
kuttl-test-striking-polliwog.svc.cluster.local","bindOnHost":false,"plaintextPort":-1,"port":-1,"tlsPort":8281,"enablePlaintextPort":false,"enableTlsPort":true},"nodeType":"overlord","services":{},"startTime":"2023-11-13T09:54:04.002Z"}]. -druid 2023-11-13T10:12:30,523 INFO [Thread-54] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/internal-discovery/COORDINATOR/druid-coordinator-default-0.druid-coordinator-default. kuttl-test-striking-polliwog.svc.cluster.local:8281] -druid 2023-11-13T10:12:30,524 INFO [Thread-54] org.apache.druid.curator.discovery.CuratorDruidNodeAnnouncer - Unannounced self [{"druidNode":{"service":"druid/coordinator","host":"druid-coordinator-default-0.druid-coordinator-default. kuttl-test-striking-polliwog.svc.cluster.local","bindOnHost":false,"plaintextPort":-1,"port":-1,"tlsPort":8281,"enablePlaintextPort":false,"enableTlsPort":true},"nodeType":"coordinator","services":{},"startTime":"2023-11-13T09:54:04.002Z"}]. +druid 2023-11-13T10:12:30,512 INFO [Thread-54] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/internal-discovery/OVERLORD/druid-coordinator-default-0.druid-coordinator-default. default.svc.cluster.local:8281] +druid 2023-11-13T10:12:30,522 INFO [NodeRoleWatcher[OVERLORD]] org.apache.druid.discovery.BaseNodeRoleWatcher - Node [https://druid-coordinator-default-0.druid-coordinator-default. default.svc.cluster.local:8281] of role [overlord] went offline. +druid 2023-11-13T10:12:30,522 INFO [Thread-54] org.apache.druid.curator.discovery.CuratorDruidNodeAnnouncer - Unannounced self [{"druidNode":{"service":"druid/coordinator","host":"druid-coordinator-default-0.druid-coordinator-default. default.svc.cluster.local","bindOnHost":false,"plaintextPort":-1,"port":-1,"tlsPort":8281,"enablePlaintextPort":false,"enableTlsPort":true},"nodeType":"overlord","services":{},"startTime":"2023-11-13T09:54:04.002Z"}]. 
+druid 2023-11-13T10:12:30,523 INFO [Thread-54] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/internal-discovery/COORDINATOR/druid-coordinator-default-0.druid-coordinator-default. default.svc.cluster.local:8281] +druid 2023-11-13T10:12:30,524 INFO [Thread-54] org.apache.druid.curator.discovery.CuratorDruidNodeAnnouncer - Unannounced self [{"druidNode":{"service":"druid/coordinator","host":"druid-coordinator-default-0.druid-coordinator-default. default.svc.cluster.local","bindOnHost":false,"plaintextPort":-1,"port":-1,"tlsPort":8281,"enablePlaintextPort":false,"enableTlsPort":true},"nodeType":"coordinator","services":{},"startTime":"2023-11-13T09:54:04.002Z"}]. druid 2023-11-13T10:12:30,600 INFO [Thread-54] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [SERVER] druid 2023-11-13T10:12:30,604 INFO [Thread-54] org.eclipse.jetty.server.AbstractConnector - Stopped ServerConnector@2ea2f965{SSL, (ssl, http/1.1)}{0.0.0.0:8281} druid 2023-11-13T10:12:30,605 INFO [Thread-54] org.eclipse.jetty.server.session - node0 Stopped scavenging @@ -67,7 +67,7 @@ druid 2023-11-13T10:12:30,610 INFO [Thread-54] org.apache.druid.java.util.common druid 2023-11-13T10:12:30,610 INFO [Thread-54] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [task-master] stage [ANNOUNCEMENTS] druid 2023-11-13T10:12:30,610 INFO [Thread-54] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [task-master] stage [SERVER] druid 2023-11-13T10:12:30,610 INFO [Thread-54] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [task-master] stage [NORMAL] -druid 2023-11-13T10:12:30,610 INFO [Thread-54] org.apache.druid.curator.discovery.CuratorServiceAnnouncer - Unannouncing service[DruidNode{serviceName='druid/overlord', host='druid-coordinator-default-0.druid-coordinator-default. 
kuttl-test-striking-polliwog.svc.cluster.local', bindOnHost=false, port=-1, plaintextPort=-1, enablePlaintextPort=false, tlsPort=8281, enableTlsPort=true}] +druid 2023-11-13T10:12:30,610 INFO [Thread-54] org.apache.druid.curator.discovery.CuratorServiceAnnouncer - Unannouncing service[DruidNode{serviceName='druid/overlord', host='druid-coordinator-default-0.druid-coordinator-default. default.svc.cluster.local', bindOnHost=false, port=-1, plaintextPort=-1, enablePlaintextPort=false, tlsPort=8281, enableTlsPort=true}] druid 2023-11-13T10:12:30,702 INFO [Thread-54] org.apache.druid.indexing.common.actions.SegmentAllocationQueue - Not leader anymore. Stopping queue processing. druid 2023-11-13T10:12:30,703 INFO [Thread-54] org.apache.druid.indexing.overlord.duty.OverlordDutyExecutor - Stopping OverlordDutyExecutor. druid 2023-11-13T10:12:30,703 INFO [Thread-54] org.apache.druid.indexing.overlord.duty.OverlordDutyExecutor - OverlordDutyExecutor has been stopped. @@ -94,16 +94,16 @@ They use the same mechanism described above. 
---- druid 2023-11-13T10:56:54,057 INFO [Thread-55] org.apache.druid.java.util.common.lifecycle.Lifecycle - Lifecycle [module] running shutdown hook druid 2023-11-13T10:56:54,059 INFO [Thread-55] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [ANNOUNCEMENTS] -druid 2023-11-13T10:56:54,062 INFO [Thread-55] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/internal-discovery/HISTORICAL/druid-historical-default-0.druid-historical-default.kuttl-test-striking-polliwog.svc.cluster.local:8283] -druid 2023-11-13T10:56:54,072 INFO [Thread-55] org.apache.druid.curator.discovery.CuratorDruidNodeAnnouncer - Unannounced self [{"druidNode":{"service":"druid/historical","host":"druid-historical-default-0.druid-historical-default.kuttl-test-striking-polliwog.svc.cluster.local","bindOnHost":false,"plaintextPort":-1,"port":-1,"tlsPort":8283,"enablePlaintextPort":false,"enableTlsPort":true},"nodeType":"historical","services":{"dataNodeService":{"type":"dataNodeService","tier":"_default_tier","maxSize":1000000000,"type":"historical","serverType":"historical","priority":0},"lookupNodeService":{"type":"lookupNodeService","lookupTier":"__default"}},"startTime":"2023-11-13T10:41:10.024Z"}]. 
-druid 2023-11-13T10:56:54,081 INFO [Thread-55] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/segments/druid-historical-default-0.druid-historical-default.kuttl-test-striking-polliwog.svc.cluster.local:8283/druid-historical-default-0.druid-historical-default.kuttl-test-striking-polliwog.svc.cluster.local:8283_historical__default_tier_2023-11-13T10:42:12.401Z_07a7108a06df494b8f8d7c01c841384a0] -druid 2023-11-13T10:56:54,083 INFO [Thread-55] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/announcements/druid-historical-default-0.druid-historical-default.kuttl-test-striking-polliwog.svc.cluster.local:8283] +druid 2023-11-13T10:56:54,062 INFO [Thread-55] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/internal-discovery/HISTORICAL/druid-historical-default-0.druid-historical-default.default.svc.cluster.local:8283] +druid 2023-11-13T10:56:54,072 INFO [Thread-55] org.apache.druid.curator.discovery.CuratorDruidNodeAnnouncer - Unannounced self [{"druidNode":{"service":"druid/historical","host":"druid-historical-default-0.druid-historical-default.default.svc.cluster.local","bindOnHost":false,"plaintextPort":-1,"port":-1,"tlsPort":8283,"enablePlaintextPort":false,"enableTlsPort":true},"nodeType":"historical","services":{"dataNodeService":{"type":"dataNodeService","tier":"_default_tier","maxSize":1000000000,"type":"historical","serverType":"historical","priority":0},"lookupNodeService":{"type":"lookupNodeService","lookupTier":"__default"}},"startTime":"2023-11-13T10:41:10.024Z"}]. 
+druid 2023-11-13T10:56:54,081 INFO [Thread-55] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/segments/druid-historical-default-0.druid-historical-default.default.svc.cluster.local:8283/druid-historical-default-0.druid-historical-default.default.svc.cluster.local:8283_historical__default_tier_2023-11-13T10:42:12.401Z_07a7108a06df494b8f8d7c01c841384a0] +druid 2023-11-13T10:56:54,083 INFO [Thread-55] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/announcements/druid-historical-default-0.druid-historical-default.default.svc.cluster.local:8283] druid 2023-11-13T10:56:54,084 INFO [Thread-55] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [SERVER] druid 2023-11-13T10:56:54,089 INFO [Thread-55] org.eclipse.jetty.server.AbstractConnector - Stopped ServerConnector@1a632663{SSL, (ssl, http/1.1)}{0.0.0.0:8283} druid 2023-11-13T10:56:54,089 INFO [Thread-55] org.eclipse.jetty.server.session - node0 Stopped scavenging druid 2023-11-13T10:56:54,090 INFO [Thread-55] org.eclipse.jetty.server.handler.ContextHandler - Stopped o.e.j.s.ServletContextHandler@3b9d85c2{/,null,STOPPED} druid 2023-11-13T10:56:54,093 INFO [Thread-55] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [NORMAL] -druid 2023-11-13T10:56:54,094 INFO [Thread-55] org.apache.druid.server.coordination.ZkCoordinator - Stopping ZkCoordinator for [DruidServerMetadata{name='druid-historical-default-0.druid-historical-default.kuttl-test-striking-polliwog.svc.cluster.local:8283', hostAndPort='null', hostAndTlsPort='druid-historical-default-0.druid-historical-default.kuttl-test-striking-polliwog.svc.cluster.local:8283', maxSize=1000000000, tier='_default_tier', type=historical, priority=0}] +druid 2023-11-13T10:56:54,094 INFO [Thread-55] org.apache.druid.server.coordination.ZkCoordinator - Stopping ZkCoordinator for 
[DruidServerMetadata{name='druid-historical-default-0.druid-historical-default.default.svc.cluster.local:8283', hostAndPort='null', hostAndTlsPort='druid-historical-default-0.druid-historical-default.default.svc.cluster.local:8283', maxSize=1000000000, tier='_default_tier', type=historical, priority=0}] druid 2023-11-13T10:56:54,094 INFO [Thread-55] org.apache.druid.server.coordination.SegmentLoadDropHandler - Stopping... druid 2023-11-13T10:56:54,094 INFO [Thread-55] org.apache.druid.server.coordination.SegmentLoadDropHandler - Stopped. druid 2023-11-13T10:56:54,094 INFO [LookupExtractorFactoryContainerProvider-MainThread] org.apache.druid.query.lookup.LookupReferencesManager - Lookup Management loop exited. Lookup notices are not handled anymore. @@ -125,7 +125,7 @@ The middle manager can be terminated gracefully by disabling it. Meaning the ove [source,text] ---- -druid 2023-11-13T10:42:50,569 INFO [stackable_lifecycle_pre_stop] - Disable middle manager to stop overlord from sending tasks: {"test-druid-middlemanager-automatic-log-config-0.test-druid-middlemanager-automatic-log-config.kuttl-test-on-ghost.svc.cluster.local:8291":"disabled"} +druid 2023-11-13T10:42:50,569 INFO [stackable_lifecycle_pre_stop] - Disable middle manager to stop overlord from sending tasks: {"druid-middlemanager-default-0.druid-middlemanager-default.default.svc.cluster.local:8291":"disabled"} druid 2023-11-13T10:42:50,578 INFO [stackable_lifecycle_pre_stop] - Check if termination grace period (300 seconds) is reached... druid 2023-11-13T10:42:50,586 INFO [stackable_lifecycle_pre_stop] - Check if all tasks are finished... Running: [] druid 2023-11-13T10:42:50,591 INFO [stackable_lifecycle_pre_stop] - All tasks finished! 
@@ -137,8 +137,8 @@ druid + trap - TERM druid + wait 11 druid 2023-11-13T10:42:50,652 INFO [Thread-52] org.apache.druid.java.util.common.lifecycle.Lifecycle - Lifecycle [module] running shutdown hook druid 2023-11-13T10:42:50,654 INFO [Thread-52] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [ANNOUNCEMENTS] -druid 2023-11-13T10:42:50,656 INFO [Thread-52] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/internal-discovery/MIDDLE_MANAGER/druid-middlemanager-default-0.druid-middlemanager-default.kuttl-test-striking-polliwog.svc.cluster.local:8291] -druid 2023-11-13T10:42:50,671 INFO [Thread-52] org.apache.druid.curator.discovery.CuratorDruidNodeAnnouncer - Unannounced self [{"druidNode":{"service":"druid/middlemanager","host":"druid-middlemanager-default-0.druid-middlemanager-default.kuttl-test-striking-polliwog.svc.cluster.local","bindOnHost":false,"plaintextPort":-1,"port":-1,"tlsPort":8291,"enablePlaintextPort":false,"enableTlsPort":true},"nodeType":"middleManager","services":{"workerNodeService":{"type":"workerNodeService","ip":"druid-middlemanager-default-0.druid-middlemanager-default.kuttl-test-striking-polliwog.svc.cluster.local","capacity":1,"version":"0","category":"_default_worker_category"}},"startTime":"2023-11-13T10:41:10.341Z"}]. 
+druid 2023-11-13T10:42:50,656 INFO [Thread-52] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/internal-discovery/MIDDLE_MANAGER/druid-middlemanager-default-0.druid-middlemanager-default.default.svc.cluster.local:8291] +druid 2023-11-13T10:42:50,671 INFO [Thread-52] org.apache.druid.curator.discovery.CuratorDruidNodeAnnouncer - Unannounced self [{"druidNode":{"service":"druid/middlemanager","host":"druid-middlemanager-default-0.druid-middlemanager-default.default.svc.cluster.local","bindOnHost":false,"plaintextPort":-1,"port":-1,"tlsPort":8291,"enablePlaintextPort":false,"enableTlsPort":true},"nodeType":"middleManager","services":{"workerNodeService":{"type":"workerNodeService","ip":"druid-middlemanager-default-0.druid-middlemanager-default.default.svc.cluster.local","capacity":1,"version":"0","category":"_default_worker_category"}},"startTime":"2023-11-13T10:41:10.341Z"}]. druid 2023-11-13T10:42:50,675 INFO [Thread-52] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [SERVER] druid 2023-11-13T10:42:50,677 INFO [Thread-52] org.eclipse.jetty.server.AbstractConnector - Stopped ServerConnector@2f51b100{SSL, (ssl, http/1.1)}{0.0.0.0:8291} druid 2023-11-13T10:42:50,677 INFO [Thread-52] org.eclipse.jetty.server.session - node0 Stopped scavenging @@ -146,8 +146,8 @@ druid 2023-11-13T10:42:50,679 INFO [Thread-52] org.eclipse.jetty.server.handler. druid 2023-11-13T10:42:50,683 INFO [Thread-52] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [NORMAL] druid 2023-11-13T10:42:50,684 INFO [Thread-52] org.apache.druid.indexing.overlord.ForkingTaskRunner - Waiting up to 300,000ms for shutdown. druid 2023-11-13T10:42:50,685 INFO [Thread-52] org.apache.druid.indexing.overlord.ForkingTaskRunner - Finished stopping in 2ms. 
-druid 2023-11-13T10:42:50,685 INFO [Thread-52] org.apache.druid.indexing.worker.WorkerCuratorCoordinator - Stopping WorkerCuratorCoordinator for worker[druid-middlemanager-default-0.druid-middlemanager-default.kuttl-test-striking-polliwog.svc.cluster.local:8291] -druid 2023-11-13T10:42:50,686 INFO [Thread-52] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/indexer/announcements/druid-middlemanager-default-0.druid-middlemanager-default.kuttl-test-striking-polliwog.svc.cluster.local:8291] +druid 2023-11-13T10:42:50,685 INFO [Thread-52] org.apache.druid.indexing.worker.WorkerCuratorCoordinator - Stopping WorkerCuratorCoordinator for worker[druid-middlemanager-default-0.druid-middlemanager-default.default.svc.cluster.local:8291] +druid 2023-11-13T10:42:50,686 INFO [Thread-52] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/indexer/announcements/druid-middlemanager-default-0.druid-middlemanager-default.default.svc.cluster.local:8291] druid 2023-11-13T10:42:50,688 INFO [Thread-52] org.apache.druid.indexing.overlord.ForkingTaskRunner - Waiting up to 300,000ms for shutdown. druid 2023-11-13T10:42:50,688 INFO [Thread-52] org.apache.druid.indexing.overlord.ForkingTaskRunner - Finished stopping in 0ms. druid 2023-11-13T10:42:50,688 INFO [Thread-52] org.apache.druid.security.basic.authorization.db.cache.CoordinatorPollingBasicAuthorizerCacheManager - CoordinatorPollingBasicAuthorizerCacheManager is stopping. @@ -170,9 +170,9 @@ They use the same mechanism described above. 
---- druid 2023-11-13T10:53:13,401 INFO [Thread-70] org.apache.druid.java.util.common.lifecycle.Lifecycle - Lifecycle [module] running shutdown hook druid 2023-11-13T10:53:13,403 INFO [Thread-70] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [ANNOUNCEMENTS] -druid 2023-11-13T10:53:13,406 INFO [Thread-70] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/internal-discovery/ROUTER/druid-router-default-0.druid-router-default.kuttl-test-striking-polliwog.svc.cluster.local:9088] -druid 2023-11-13T10:53:13,501 INFO [Thread-70] org.apache.druid.curator.discovery.CuratorDruidNodeAnnouncer - Unannounced self [{"druidNode":{"service":"druid/router","host":"druid-router-default-0.druid-router-default.kuttl-test-striking-polliwog.svc.cluster.local","bindOnHost":false,"plaintextPort":-1,"port":-1,"tlsPort":9088,"enablePlaintextPort":false,"enableTlsPort":true},"nodeType":"router","services":{},"startTime":"2023-11-13T10:41:23.188Z"}]. -druid 2023-11-13T10:53:13,501 INFO [Thread-70] org.apache.druid.curator.discovery.CuratorServiceAnnouncer - Unannouncing service[DruidNode{serviceName='druid/router', host='druid-router-default-0.druid-router-default.kuttl-test-striking-polliwog.svc.cluster.local', bindOnHost=false, port=-1, plaintextPort=-1, enablePlaintextPort=false, tlsPort=9088, enableTlsPort=true}] +druid 2023-11-13T10:53:13,406 INFO [Thread-70] org.apache.druid.curator.announcement.Announcer - Unannouncing [/druid/internal-discovery/ROUTER/druid-router-default-0.druid-router-default.default.svc.cluster.local:9088] +druid 2023-11-13T10:53:13,501 INFO [Thread-70] org.apache.druid.curator.discovery.CuratorDruidNodeAnnouncer - Unannounced self 
[{"druidNode":{"service":"druid/router","host":"druid-router-default-0.druid-router-default.default.svc.cluster.local","bindOnHost":false,"plaintextPort":-1,"port":-1,"tlsPort":9088,"enablePlaintextPort":false,"enableTlsPort":true},"nodeType":"router","services":{},"startTime":"2023-11-13T10:41:23.188Z"}]. +druid 2023-11-13T10:53:13,501 INFO [Thread-70] org.apache.druid.curator.discovery.CuratorServiceAnnouncer - Unannouncing service[DruidNode{serviceName='druid/router', host='druid-router-default-0.druid-router-default.default.svc.cluster.local', bindOnHost=false, port=-1, plaintextPort=-1, enablePlaintextPort=false, tlsPort=9088, enableTlsPort=true}] druid 2023-11-13T10:53:13,587 INFO [Thread-70] org.apache.druid.java.util.common.lifecycle.Lifecycle - Stopping lifecycle [module] stage [SERVER] druid 2023-11-13T10:53:13,591 INFO [Thread-70] org.eclipse.jetty.server.AbstractConnector - Stopped ServerConnector@77732366{SSL, (ssl, http/1.1)}{0.0.0.0:9088} druid 2023-11-13T10:53:13,591 INFO [Thread-70] org.eclipse.jetty.server.session - node0 Stopped scavenging