From feb1da3fe2a8c8b651e1212434dc4ba93248c233 Mon Sep 17 00:00:00 2001 From: Sebastian Bernauer Date: Thu, 19 Oct 2023 13:37:38 +0200 Subject: [PATCH] feat: Support graceful shutdown (#407) * feat: Support graceful shutdown * update docs * docs * changelog * link code in docs * increase default of datanodes to 30 min * move into constants * use new operator-rs * docs: Format 15 minutes * Use new operator-rs * improve docs * fix link * use operator-rs 0.55.0 * fixup * improve docs * set error context * Added a high level description of graceful shutdown * Revert "Added a high level description of graceful shutdown" This reverts commit 7733ec13fb98d4a7a96946badf88fbc5ce2f9c6c. Moved to https://github.com/stackabletech/documentation/pull/473 --------- Co-authored-by: Jim Halfpenny --- CHANGELOG.md | 2 ++ deploy/helm/hdfs-operator/crds/crds.yaml | 24 +++++++++++++ .../operations/graceful-shutdown.adoc | 36 +++++++++++++++++-- rust/crd/src/constants.rs | 9 +++++ rust/crd/src/lib.rs | 26 ++++++++++++++ rust/operator/src/hdfs_controller.rs | 17 +++++---- .../src/operations/graceful_shutdown.rs | 26 ++++++++++++++ rust/operator/src/operations/mod.rs | 1 + tests/templates/kuttl/smoke/30-assert.yaml.j2 | 3 ++ 9 files changed, 134 insertions(+), 10 deletions(-) create mode 100644 rust/operator/src/operations/graceful_shutdown.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 0aa1b08d..ab22f63d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ All notable changes to this project will be documented in this file. - Default stackableVersion to operator version ([#381]). - Configuration overrides for the JVM security properties, such as DNS caching ([#384]). - Support PodDisruptionBudgets ([#394]). +- Support graceful shutdown ([#407]). - Added support for 3.2.4, 3.3.6 ([#409]). ### Changed @@ -33,6 +34,7 @@ All notable changes to this project will be documented in this file. 
[#402]: https://github.com/stackabletech/hdfs-operator/pull/402 [#404]: https://github.com/stackabletech/hdfs-operator/pull/404 [#405]: https://github.com/stackabletech/hdfs-operator/pull/405 +[#407]: https://github.com/stackabletech/hdfs-operator/pull/407 [#409]: https://github.com/stackabletech/hdfs-operator/pull/409 ## [23.7.0] - 2023-07-14 diff --git a/deploy/helm/hdfs-operator/crds/crds.yaml b/deploy/helm/hdfs-operator/crds/crds.yaml index 786ecd00..0c8745de 100644 --- a/deploy/helm/hdfs-operator/crds/crds.yaml +++ b/deploy/helm/hdfs-operator/crds/crds.yaml @@ -576,6 +576,10 @@ spec: type: array type: object type: object + gracefulShutdownTimeout: + description: Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. + nullable: true + type: string logging: default: enableVectorAgent: null @@ -4069,6 +4073,10 @@ spec: type: array type: object type: object + gracefulShutdownTimeout: + description: Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. + nullable: true + type: string logging: default: enableVectorAgent: null @@ -7621,6 +7629,10 @@ spec: type: array type: object type: object + gracefulShutdownTimeout: + description: Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. + nullable: true + type: string logging: default: enableVectorAgent: null @@ -11105,6 +11117,10 @@ spec: type: array type: object type: object + gracefulShutdownTimeout: + description: Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. + nullable: true + type: string logging: default: enableVectorAgent: null @@ -14606,6 +14622,10 @@ spec: type: array type: object type: object + gracefulShutdownTimeout: + description: Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. 
Consult the operator documentation for details. + nullable: true + type: string logging: default: enableVectorAgent: null @@ -18090,6 +18110,10 @@ spec: type: array type: object type: object + gracefulShutdownTimeout: + description: Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. + nullable: true + type: string logging: default: enableVectorAgent: null diff --git a/docs/modules/hdfs/pages/usage-guide/operations/graceful-shutdown.adoc b/docs/modules/hdfs/pages/usage-guide/operations/graceful-shutdown.adoc index 9b00f4c8..7964d0c8 100644 --- a/docs/modules/hdfs/pages/usage-guide/operations/graceful-shutdown.adoc +++ b/docs/modules/hdfs/pages/usage-guide/operations/graceful-shutdown.adoc @@ -1,6 +1,36 @@ = Graceful shutdown -Graceful shutdown of HDFS nodes is either not supported by the product itself -or we have not implemented it yet. +You can configure the graceful shutdown as described in xref:concepts:operations/graceful_shutdown.adoc[]. -Outstanding implementation work for the graceful shutdowns of all products where this functionality is relevant is tracked in https://github.com/stackabletech/issues/issues/357 +== JournalNodes + +As a default, JournalNodes have `15 minutes` to terminate gracefully. + +The JournalNode process will always run as PID `1` and will get a `SIGTERM` once Kubernetes wants to terminate the Pod. +It will log the received signal as shown in the log below and initiate a graceful shutdown. +After the graceful shutdown timeout has passed and the process still has not exited, Kubernetes will issue a `SIGKILL` to force-kill the process. + +https://github.com/apache/hadoop/blob/a585a73c3e02ac62350c136643a5e7f6095a3dbb/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/server/JournalNode.java#L272[This] is the relevant code that gets executed in the JournalNodes as of HDFS version `3.3.4`. 
+ +[source,text] +---- +2023-10-10 13:37:41,525 ERROR server.JournalNode (LogAdapter.java:error(75)) - RECEIVED SIGNAL 15: SIGTERM +2023-10-10 13:37:41,526 INFO server.JournalNode (LogAdapter.java:info(51)) - SHUTDOWN_MSG: +/************************************************************ +SHUTDOWN_MSG: Shutting down JournalNode at hdfs-journalnode-default-0/10.244.0.38 +************************************************************/ +---- + +== NameNodes + +As a default, NameNodes have `15 minutes` to terminate gracefully. +They go through the same mechanism as documented for the <<_journalnodes>> above. + +https://github.com/apache/hadoop/blob/a585a73c3e02ac62350c136643a5e7f6095a3dbb/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java#L1080[This] is the relevant code that gets executed in the NameNodes as of HDFS version `3.3.4`. + +== DataNodes + +As a default, DataNodes have `30 minutes` to terminate gracefully. +They go through the same mechanism as documented for the <<_journalnodes>> above. + +https://github.com/apache/hadoop/blob/a585a73c3e02ac62350c136643a5e7f6095a3dbb/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java#L2004[This] is the relevant code that gets executed in the DataNodes as of HDFS version `3.3.4`. 
diff --git a/rust/crd/src/constants.rs b/rust/crd/src/constants.rs index ae8cde43..5fe554b6 100644 --- a/rust/crd/src/constants.rs +++ b/rust/crd/src/constants.rs @@ -1,3 +1,5 @@ +use stackable_operator::time::Duration; + pub const DEFAULT_DFS_REPLICATION_FACTOR: u8 = 3; pub const CONTROLLER_NAME: &str = "hdfsclusters.hdfs.stackable.tech"; @@ -41,6 +43,13 @@ pub const DEFAULT_JOURNAL_NODE_HTTP_PORT: u16 = 8480; pub const DEFAULT_JOURNAL_NODE_HTTPS_PORT: u16 = 8481; pub const DEFAULT_JOURNAL_NODE_RPC_PORT: u16 = 8485; +pub const DEFAULT_JOURNAL_NODE_GRACEFUL_SHUTDOWN_TIMEOUT: Duration = + Duration::from_minutes_unchecked(15); +pub const DEFAULT_NAME_NODE_GRACEFUL_SHUTDOWN_TIMEOUT: Duration = + Duration::from_minutes_unchecked(15); +pub const DEFAULT_DATA_NODE_GRACEFUL_SHUTDOWN_TIMEOUT: Duration = + Duration::from_minutes_unchecked(30); + // hdfs-site.xml pub const DFS_NAMENODE_NAME_DIR: &str = "dfs.namenode.name.dir"; pub const DFS_NAMENODE_SHARED_EDITS_DIR: &str = "dfs.namenode.shared.edits.dir"; diff --git a/rust/crd/src/lib.rs b/rust/crd/src/lib.rs index 9dfed0fe..1d2daf91 100644 --- a/rust/crd/src/lib.rs +++ b/rust/crd/src/lib.rs @@ -36,6 +36,7 @@ use stackable_operator::{ role_utils::{GenericRoleConfig, Role, RoleGroup, RoleGroupRef}, schemars::{self, JsonSchema}, status::condition::{ClusterCondition, HasStatusCondition}, + time::Duration, }; use std::collections::{BTreeMap, HashMap}; use storage::{ @@ -156,6 +157,7 @@ pub trait MergedConfig { None } fn affinity(&self) -> &StackableAffinity; + fn graceful_shutdown_timeout(&self) -> Option<&Duration>; /// Main container shared by all roles fn hdfs_logging(&self) -> ContainerLogConfig; /// Vector container shared by all roles @@ -841,6 +843,9 @@ pub struct NameNodeConfig { pub logging: Logging, #[fragment_attrs(serde(default))] pub affinity: StackableAffinity, + #[fragment_attrs(serde(default))] + /// Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. 
Consult the operator documentation for details. + pub graceful_shutdown_timeout: Option<Duration>, } impl MergedConfig for NameNodeConfig { @@ -852,6 +857,10 @@ impl MergedConfig for NameNodeConfig { &self.affinity } + fn graceful_shutdown_timeout(&self) -> Option<&Duration> { + self.graceful_shutdown_timeout.as_ref() + } + fn hdfs_logging(&self) -> ContainerLogConfig { self.logging .containers @@ -916,6 +925,7 @@ impl NameNodeConfigFragment { }, logging: product_logging::spec::default_logging(), affinity: get_affinity(cluster_name, role), + graceful_shutdown_timeout: Some(DEFAULT_NAME_NODE_GRACEFUL_SHUTDOWN_TIMEOUT), } } } @@ -1001,6 +1011,9 @@ pub struct DataNodeConfig { pub logging: Logging, #[fragment_attrs(serde(default))] pub affinity: StackableAffinity, + #[fragment_attrs(serde(default))] + /// Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. + pub graceful_shutdown_timeout: Option<Duration>, } impl MergedConfig for DataNodeConfig { @@ -1014,6 +1027,10 @@ impl MergedConfig for DataNodeConfig { &self.affinity } + fn graceful_shutdown_timeout(&self) -> Option<&Duration> { + self.graceful_shutdown_timeout.as_ref() + } + fn hdfs_logging(&self) -> ContainerLogConfig { self.logging .containers @@ -1069,6 +1086,7 @@ impl DataNodeConfigFragment { }, logging: product_logging::spec::default_logging(), affinity: get_affinity(cluster_name, role), + graceful_shutdown_timeout: Some(DEFAULT_DATA_NODE_GRACEFUL_SHUTDOWN_TIMEOUT), } } } @@ -1152,6 +1170,9 @@ pub struct JournalNodeConfig { pub logging: Logging, #[fragment_attrs(serde(default))] pub affinity: StackableAffinity, + #[fragment_attrs(serde(default))] + /// Time period Pods have to gracefully shut down, e.g. `30m`, `1h` or `2d`. Consult the operator documentation for details. 
+ pub graceful_shutdown_timeout: Option<Duration>, } impl MergedConfig for JournalNodeConfig { @@ -1163,6 +1184,10 @@ impl MergedConfig for JournalNodeConfig { &self.affinity } + fn graceful_shutdown_timeout(&self) -> Option<&Duration> { + self.graceful_shutdown_timeout.as_ref() + } + fn hdfs_logging(&self) -> ContainerLogConfig { self.logging .containers @@ -1206,6 +1231,7 @@ impl JournalNodeConfigFragment { }, logging: product_logging::spec::default_logging(), affinity: get_affinity(cluster_name, role), + graceful_shutdown_timeout: Some(DEFAULT_JOURNAL_NODE_GRACEFUL_SHUTDOWN_TIMEOUT), } } } diff --git a/rust/operator/src/hdfs_controller.rs b/rust/operator/src/hdfs_controller.rs index a5237a1c..2aba436e 100644 --- a/rust/operator/src/hdfs_controller.rs +++ b/rust/operator/src/hdfs_controller.rs @@ -6,7 +6,7 @@ use crate::{ discovery::build_discovery_configmap, event::{build_invalid_replica_message, publish_event}, kerberos, - operations::pdb::add_pdbs, + operations::{self, graceful_shutdown::add_graceful_shutdown_config, pdb::add_pdbs}, product_logging::{extend_role_group_config_map, resolve_vector_aggregator_address}, OPERATOR_NAME, }; @@ -166,14 +166,15 @@ pub enum Error { "kerberos not supported for HDFS versions < 3.3.x. 
Please use at least version 3.3.x" ))] KerberosNotSupported {}, - #[snafu(display( - "failed to serialize [{JVM_SECURITY_PROPERTIES_FILE}] for {}", - rolegroup - ))] - JvmSecurityPoperties { + #[snafu(display("failed to serialize [{JVM_SECURITY_PROPERTIES_FILE}] for {rolegroup}",))] + JvmSecurityProperties { source: stackable_operator::product_config::writer::PropertiesWriterError, rolegroup: String, }, + #[snafu(display("failed to configure graceful shutdown"), context(false))] + GracefulShutdown { + source: operations::graceful_shutdown::Error, + }, } impl ReconcilerError for Error { @@ -599,7 +600,7 @@ fn rolegroup_config_map( .add_data( JVM_SECURITY_PROPERTIES_FILE, to_java_properties_string(jvm_sec_props.iter()).with_context(|_| { - JvmSecurityPopertiesSnafu { + JvmSecurityPropertiesSnafu { rolegroup: rolegroup_ref.role_group.clone(), } })?, @@ -667,6 +668,8 @@ fn rolegroup_statefulset( ) .context(FailedToCreateContainerAndVolumeConfigurationSnafu)?; + add_graceful_shutdown_config(merged_config, &mut pb)?; + let mut pod_template = pb.build_template(); if let Some(pod_overrides) = hdfs.pod_overrides_for_role(role) { pod_template.merge_from(pod_overrides.clone()); diff --git a/rust/operator/src/operations/graceful_shutdown.rs b/rust/operator/src/operations/graceful_shutdown.rs new file mode 100644 index 00000000..ff79f384 --- /dev/null +++ b/rust/operator/src/operations/graceful_shutdown.rs @@ -0,0 +1,26 @@ +use snafu::{ResultExt, Snafu}; +use stackable_hdfs_crd::MergedConfig; +use stackable_operator::builder::PodBuilder; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to set terminationGracePeriod"))] + SetTerminationGracePeriod { + source: stackable_operator::builder::pod::Error, + }, +} + +pub fn add_graceful_shutdown_config( + merged_config: &(dyn MergedConfig + Send + 'static), + pod_builder: &mut PodBuilder, +) -> Result<(), Error> { + // This must be always set by the merge mechanism, as we provide a default value, + // users can 
not disable graceful shutdown. + if let Some(graceful_shutdown_timeout) = merged_config.graceful_shutdown_timeout() { + pod_builder + .termination_grace_period(graceful_shutdown_timeout) + .context(SetTerminationGracePeriodSnafu)?; + } + + Ok(()) +} diff --git a/rust/operator/src/operations/mod.rs b/rust/operator/src/operations/mod.rs index d3cf6e9c..92ca2ec7 100644 --- a/rust/operator/src/operations/mod.rs +++ b/rust/operator/src/operations/mod.rs @@ -1 +1,2 @@ +pub mod graceful_shutdown; pub mod pdb; diff --git a/tests/templates/kuttl/smoke/30-assert.yaml.j2 b/tests/templates/kuttl/smoke/30-assert.yaml.j2 index 2d82e9b5..15c2ad33 100644 --- a/tests/templates/kuttl/smoke/30-assert.yaml.j2 +++ b/tests/templates/kuttl/smoke/30-assert.yaml.j2 @@ -23,6 +23,7 @@ spec: - name: vector {% endif %} - name: zkfc + terminationGracePeriodSeconds: 900 status: readyReplicas: 2 replicas: 2 @@ -46,6 +47,7 @@ spec: {% if lookup('env', 'VECTOR_AGGREGATOR') %} - name: vector {% endif %} + terminationGracePeriodSeconds: 900 status: readyReplicas: 1 replicas: 1 @@ -69,6 +71,7 @@ spec: {% if lookup('env', 'VECTOR_AGGREGATOR') %} - name: vector {% endif %} + terminationGracePeriodSeconds: 1800 status: readyReplicas: {{ test_scenario['values']['number-of-datanodes'] }} replicas: {{ test_scenario['values']['number-of-datanodes'] }}