diff --git a/docs/pages/concepts/backend-lifecycle.mdx b/docs/pages/concepts/backend-lifecycle.mdx index aa1affd84..50a1eada1 100644 --- a/docs/pages/concepts/backend-lifecycle.mdx +++ b/docs/pages/concepts/backend-lifecycle.mdx @@ -10,8 +10,10 @@ The statuses are: - `starting`: The drone has loaded the image and is starting the container. - `waiting`: The container has started. The drone is waiting for it to listen on an HTTP port. - `ready`: The container is listening on an HTTP port. The drone is ready to route traffic to it. -- `terminating`: The drone has sent a request to terminate the backend. If the request was a “soft” request, - the backend may remain in this state for a grace period (by default 10 seoconds) before being hard-terminated. +- `terminating`: The drone has sent a “soft” request to terminate the backend. + The backend may remain in this state for a grace period (by default 10 seconds) before being hard-terminated, + unless it exits on its own first. +- `hard-terminating`: The drone has sent a “hard” request to terminate the backend. - `terminated`: The drone has terminated the backend. This is considered the only terminal state. A backend may skip over some of these statuses, but will only transition to statuses lower in the list, never diff --git a/plane/src/controller/proxy.rs b/plane/src/controller/proxy.rs index f7d2f0ac8..b24b6c6f9 100644 --- a/plane/src/controller/proxy.rs +++ b/plane/src/controller/proxy.rs @@ -125,7 +125,9 @@ pub async fn handle_route_info_request( } break; } - BackendState::Terminated { .. } | BackendState::Terminating { .. } => { + BackendState::Terminated { .. } + | BackendState::Terminating { .. } + | BackendState::HardTerminating { .. 
} => { let response = RouteInfoResponse { token, route_info: None, diff --git a/plane/src/database/backend.rs b/plane/src/database/backend.rs index 45753fbf2..51c644e97 100644 --- a/plane/src/database/backend.rs +++ b/plane/src/database/backend.rs @@ -317,7 +317,7 @@ impl<'a> BackendDatabase<'a> { let ready = match result.last_status.as_str() { "ready" => true, - "terminated" | "terminating" => { + "terminated" | "terminating" | "hard-terminating" => { return Ok(RouteInfoResult::NotFound); } _ => false, @@ -392,7 +392,7 @@ impl<'a> BackendDatabase<'a> { let ready = match result.last_status.as_str() { "ready" => true, - "terminated" | "terminating" => { + "terminated" | "terminating" | "hard-terminating" => { return Ok(RouteInfoResult::NotFound); } _ => false, diff --git a/plane/src/drone/backend_manager.rs b/plane/src/drone/backend_manager.rs index 8f957413a..2c2ad6afe 100644 --- a/plane/src/drone/backend_manager.rs +++ b/plane/src/drone/backend_manager.rs @@ -87,6 +87,34 @@ impl Debug for BackendManager { } } +fn handle_terminating( + runtime: Arc>, + backend_id: &BackendName, + state: BackendState, + hard_terminate: bool, +) -> StepStatusResult { + let backend_id = backend_id.clone(); + + StepStatusResult::future_status(async move { + let mut backoff = ExponentialBackoff::default(); + + loop { + match runtime.terminate(&backend_id, hard_terminate).await { + Ok(false) => return state.to_terminated(None), + Ok(true) => { + // Return a future that never resolves, so that only the container + // terminating bumps us into the next state. 
+ return pending().await; + } + Err(err) => { + tracing::error!(?err, "failed to terminate backend"); + backoff.wait().await; + } + } + } + }) +} + impl BackendManager { #[allow(clippy::too_many_arguments)] pub fn new( @@ -173,41 +201,18 @@ impl BackendManager { runtime.wait_for_backend(&backend_id, address.0).await { tracing::error!("Backend startup timeout"); - state.to_terminating( - TerminationKind::Hard, - TerminationReason::StartupTimeout, - ) + state.to_hard_terminating(TerminationReason::StartupTimeout) } else { state.to_ready(address) } }) } BackendState::Ready { .. } => StepStatusResult::DoNothing, - BackendState::Terminating { termination, .. } => { - let docker = self.runtime.clone(); - let backend_id = self.backend_id.clone(); - - StepStatusResult::future_status(async move { - let mut backoff = ExponentialBackoff::default(); - - loop { - match docker - .terminate(&backend_id, termination == TerminationKind::Hard) - .await - { - Ok(false) => return state.to_terminated(None), - Ok(true) => { - // Return a future that never resolves, so that only the container - // terminating bumps us into the next state. - return pending().await; - } - Err(err) => { - tracing::error!(?err, "failed to terminate backend"); - backoff.wait().await; - } - } - } - }) + BackendState::Terminating { .. } => { + handle_terminating(self.runtime.clone(), &self.backend_id, state, false) + } + BackendState::HardTerminating { .. } => { + handle_terminating(self.runtime.clone(), &self.backend_id, state, true) } BackendState::Terminated { .. 
} => StepStatusResult::DoNothing, } @@ -262,7 +267,12 @@ impl BackendManager { .expect("State lock is poisoned") .state .clone(); - self.set_state(state.to_terminating(kind, reason)); + + let new_state = match kind { + TerminationKind::Soft => state.to_terminating(reason), + TerminationKind::Hard => state.to_hard_terminating(reason), + }; + self.set_state(new_state); Ok(()) } diff --git a/plane/src/drone/executor.rs b/plane/src/drone/executor.rs index 76ffba941..0826a3c5a 100644 --- a/plane/src/drone/executor.rs +++ b/plane/src/drone/executor.rs @@ -3,7 +3,7 @@ use crate::{ drone::runtime::Runtime, names::BackendName, protocol::{BackendAction, BackendEventId, BackendStateMessage}, - types::{BackendState, BackendStatus, TerminationKind, TerminationReason}, + types::{BackendState, BackendStatus, TerminationReason}, util::{ExponentialBackoff, GuardHandle}, }; use anyhow::Result; @@ -93,7 +93,7 @@ impl Executor { .expect("State store lock poisoned.") .register_event( &backend_id, - &state.to_terminating(TerminationKind::Hard, TerminationReason::KeyExpired), + &state.to_hard_terminating(TerminationReason::KeyExpired), Utc::now(), ) .unwrap_or_else(|_| { diff --git a/plane/src/drone/state_store.rs b/plane/src/drone/state_store.rs index fd8686b0b..3ea7b8583 100644 --- a/plane/src/drone/state_store.rs +++ b/plane/src/drone/state_store.rs @@ -234,7 +234,7 @@ mod test { use crate::{ log_types::BackendAddr, names::Name, - types::{BackendStatus, TerminationKind, TerminationReason}, + types::{BackendStatus, TerminationReason}, }; use std::{ net::{SocketAddr, SocketAddrV4}, @@ -300,7 +300,7 @@ mod test { state_store .register_event( &backend_id, - &ready_state.to_terminating(TerminationKind::Hard, TerminationReason::External), + &ready_state.to_hard_terminating(TerminationReason::External), Utc::now(), ) .unwrap(); @@ -308,9 +308,8 @@ mod test { let result = state_store.backend_state(&backend_id).unwrap(); assert_eq!( result, - BackendState::Terminating { + 
BackendState::HardTerminating { last_status: BackendStatus::Ready, - termination: TerminationKind::Hard, reason: TerminationReason::External, } ); @@ -357,7 +356,7 @@ mod test { state_store .register_event( &backend_id, - &ready_state.to_terminating(TerminationKind::Hard, TerminationReason::Swept), + &ready_state.to_hard_terminating(TerminationReason::Swept), Utc::now(), ) .unwrap(); @@ -365,16 +364,15 @@ mod test { let result = state_store.backend_state(&backend_id).unwrap(); assert_eq!( result, - ready_state.to_terminating(TerminationKind::Hard, TerminationReason::Swept) + ready_state.to_hard_terminating(TerminationReason::Swept) ); let event = recv.try_recv().unwrap(); assert_eq!(event.backend_id, backend_id); assert_eq!( event.state, - BackendState::Terminating { + BackendState::HardTerminating { last_status: BackendStatus::Ready, - termination: TerminationKind::Hard, reason: TerminationReason::Swept, } ); @@ -400,7 +398,7 @@ mod test { state_store .register_event( &backend_id, - &ready_state.to_terminating(TerminationKind::Hard, TerminationReason::Swept), + &ready_state.to_hard_terminating(TerminationReason::Swept), Utc::now(), ) .unwrap(); @@ -429,9 +427,8 @@ mod test { assert_eq!(event.event_id, BackendEventId::from(2)); assert_eq!( event.state, - BackendState::Terminating { + BackendState::HardTerminating { last_status: BackendStatus::Ready, - termination: TerminationKind::Hard, reason: TerminationReason::Swept, } ); @@ -463,9 +460,8 @@ mod test { assert_eq!(event.backend_id, backend_id); assert_eq!( event.state, - BackendState::Terminating { + BackendState::HardTerminating { last_status: BackendStatus::Ready, - termination: TerminationKind::Hard, reason: TerminationReason::Swept, } ); @@ -489,9 +485,8 @@ mod test { assert_eq!(event.backend_id, backend_id); assert_eq!( event.state, - BackendState::Terminating { + BackendState::HardTerminating { last_status: BackendStatus::Ready, - termination: TerminationKind::Hard, reason: TerminationReason::Swept, } ); 
diff --git a/plane/src/log_types.rs b/plane/src/log_types.rs index a467095a6..6f8251cb9 100644 --- a/plane/src/log_types.rs +++ b/plane/src/log_types.rs @@ -35,7 +35,7 @@ impl From for LoggableTime { } } -#[derive(Clone, Copy, Serialize, Deserialize, Debug, PartialEq, Eq)] +#[derive(Clone, Copy, Serialize, Deserialize, Debug, PartialEq, Eq, PartialOrd)] pub struct BackendAddr(pub SocketAddr); impl valuable::Valuable for BackendAddr { diff --git a/plane/src/types/backend_state.rs b/plane/src/types/backend_state.rs index 0a22a22c2..af4764fae 100644 --- a/plane/src/types/backend_state.rs +++ b/plane/src/types/backend_state.rs @@ -6,7 +6,6 @@ use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use serde_json::Value; use std::{fmt::Display, net::SocketAddr}; -use valuable::Valuable; #[derive(Clone, Copy, Serialize, Deserialize, Debug, PartialEq, PartialOrd)] #[serde(rename_all = "lowercase")] @@ -32,6 +31,11 @@ pub enum BackendStatus { /// Proxies should stop sending traffic to it, but we should not yet release the key. Terminating, + /// The backend has been sent a SIGKILL, either because the user sent a hard termination + /// request or the lock was past the hard-termination deadline. + #[serde(rename = "hard-terminating")] + HardTerminating, + /// The backend has exited or been swept. 
Terminated, } @@ -50,6 +54,7 @@ impl BackendStatus { BackendStatus::Waiting => 40, BackendStatus::Ready => 50, BackendStatus::Terminating => 60, + BackendStatus::HardTerminating => 65, BackendStatus::Terminated => 70, } } @@ -64,6 +69,7 @@ impl valuable::Valuable for BackendStatus { BackendStatus::Waiting => valuable::Value::String("waiting"), BackendStatus::Ready => valuable::Value::String("ready"), BackendStatus::Terminating => valuable::Value::String("terminating"), + BackendStatus::HardTerminating => valuable::Value::String("hard-terminating"), BackendStatus::Terminated => valuable::Value::String("terminated"), } } @@ -93,10 +99,18 @@ pub enum BackendState { address: BackendAddr, }, Terminating { + /// Last status before either soft or hard termination. last_status: BackendStatus, + #[deprecated(note = "Use HardTerminating instead")] termination: TerminationKind, reason: TerminationReason, }, + #[serde(rename = "hard-terminating")] + HardTerminating { + /// Last status before either soft or hard termination. 
+ last_status: BackendStatus, + reason: TerminationReason, + }, Terminated { last_status: BackendStatus, termination: Option, @@ -138,6 +152,7 @@ impl valuable::Valuable for BackendState { ); visit.visit_entry(valuable::Value::String("address"), address.as_value()); } + #[allow(deprecated)] BackendState::Terminating { last_status, termination, @@ -157,6 +172,20 @@ impl valuable::Valuable for BackendState { ); visit.visit_entry(valuable::Value::String("reason"), reason.as_value()); } + BackendState::HardTerminating { + last_status, + reason, + } => { + visit.visit_entry( + valuable::Value::String("status"), + valuable::Value::String("hard-terminating"), + ); + visit.visit_entry( + valuable::Value::String("last_status"), + last_status.as_value(), + ); + visit.visit_entry(valuable::Value::String("reason"), reason.as_value()); + } BackendState::Terminated { last_status, termination, @@ -184,6 +213,8 @@ impl valuable::Valuable for BackendState { impl valuable::Mappable for BackendState { fn size_hint(&self) -> (usize, Option) { + // These numbers should match the number of calls to visit_entry in visit. + // (This is used as a hint; differences are not a correctness issue.) match self { BackendState::Scheduled => (1, Some(1)), BackendState::Loading => (1, Some(1)), @@ -191,6 +222,7 @@ impl valuable::Mappable for BackendState { BackendState::Waiting { .. } => (2, Some(2)), BackendState::Ready { .. } => (1, Some(2)), BackendState::Terminating { .. } => (1, Some(4)), + BackendState::HardTerminating { .. } => (1, Some(3)), BackendState::Terminated { .. } => (2, Some(5)), } } @@ -247,6 +279,7 @@ impl BackendState { BackendState::Waiting { .. } => BackendStatus::Waiting, BackendState::Ready { .. } => BackendStatus::Ready, BackendState::Terminating { .. } => BackendStatus::Terminating, + BackendState::HardTerminating { .. } => BackendStatus::HardTerminating, BackendState::Terminated { .. 
} => BackendStatus::Terminated, } } @@ -273,39 +306,32 @@ impl BackendState { BackendState::Ready { address } } - pub fn to_terminating( - &self, - termination: TerminationKind, - reason: TerminationReason, - ) -> BackendState { + pub fn to_terminating(&self, reason: TerminationReason) -> BackendState { + if self.status() >= BackendStatus::Terminating { + tracing::warn!(?reason, state=?self, "to_terminating called on backend in later state."); + return self.clone(); + } + + BackendState::Terminating { + last_status: self.status(), + termination: TerminationKind::Soft, + reason, + } + } + + pub fn to_hard_terminating(&self, reason: TerminationReason) -> BackendState { + if self.status() >= BackendStatus::HardTerminating { + tracing::warn!(?reason, state=?self, "to_hard_terminating called on backend in later state."); + return self.clone(); + } + match self { - BackendState::Terminated { .. } => { - tracing::warn!(?reason, termination=termination.as_value(), state=?self, "to_terminating called on terminated backend"); - self.clone() - } - // a soft terminating backend can be transitioned to a hard terminating backend - BackendState::Terminating { - last_status, - termination: termination_kind, - .. - } => { - if termination_kind == &termination { - tracing::warn!(?reason, termination=termination.as_value(), state=?self, "to_terminating called on terminating backend with the same termination_kind"); - return self.clone(); - } - if termination_kind != &TerminationKind::Hard { - tracing::warn!(?reason, termination=termination.as_value(), state=?self, "to_terminating called on terminating backend with soft termination kind"); - return self.clone(); - } - BackendState::Terminating { - last_status: *last_status, - termination, - reason, - } - } - _ => BackendState::Terminating { + BackendState::Terminating { last_status, .. 
} => BackendState::HardTerminating { + last_status: *last_status, + reason, + }, + _ => BackendState::HardTerminating { last_status: self.status(), - termination, reason, }, } @@ -317,6 +343,16 @@ impl BackendState { tracing::warn!(?exit_code, state=?self, "to_terminated called on terminated backend"); self.clone() } + BackendState::HardTerminating { + last_status, + reason, + } => BackendState::Terminated { + last_status: *last_status, + termination: Some(TerminationKind::Hard), + reason: Some(*reason), + exit_code, + }, + #[allow(deprecated)] BackendState::Terminating { last_status, termination, @@ -388,12 +424,14 @@ impl BackendStatusStreamEntry { let termination_reason = match state { BackendState::Terminated { reason, .. } => reason, BackendState::Terminating { reason, .. } => Some(reason), + BackendState::HardTerminating { reason, .. } => Some(reason), _ => None, }; let termination_kind = match state { BackendState::Terminated { termination, .. } => termination, - BackendState::Terminating { termination, .. } => Some(termination), + BackendState::Terminating { .. } => Some(TerminationKind::Soft), + BackendState::HardTerminating { .. } => Some(TerminationKind::Hard), _ => None, };