diff --git a/source/Octopus.Tentacle/Kubernetes/KubernetesConfig.cs b/source/Octopus.Tentacle/Kubernetes/KubernetesConfig.cs index b29218fa1..21d1671e3 100644 --- a/source/Octopus.Tentacle/Kubernetes/KubernetesConfig.cs +++ b/source/Octopus.Tentacle/Kubernetes/KubernetesConfig.cs @@ -23,6 +23,9 @@ public static class KubernetesConfig public static bool DisableAutomaticPodCleanup => bool.TryParse(Environment.GetEnvironmentVariable($"{EnvVarPrefix}__DISABLEAUTOPODCLEANUP"), out var disableAutoCleanup) && disableAutoCleanup; public static bool DisablePodEventsInTaskLog => bool.TryParse(Environment.GetEnvironmentVariable($"{EnvVarPrefix}__DISABLEPODEVENTSINTASKLOG"), out var disable) && disable; + public static string PendingPodsConsideredStuckAfterTimeSpanVariableName = $"{EnvVarPrefix}__PENDINGPODSSTUCKAFTERMINUTES"; + public static TimeSpan? PendingPodsConsideredStuckAfterTimeSpan => int.TryParse(Environment.GetEnvironmentVariable(PendingPodsConsideredStuckAfterTimeSpanVariableName), out var stuckPodsMinutes) ? TimeSpan.FromMinutes(stuckPodsMinutes) : null; + public static string HelmReleaseNameVariableName => $"{EnvVarPrefix}__HELMRELEASENAME"; public static string HelmReleaseName => GetRequiredEnvVar(HelmReleaseNameVariableName, "Unable to determine Helm release name."); diff --git a/source/Octopus.Tentacle/Kubernetes/KubernetesModule.cs b/source/Octopus.Tentacle/Kubernetes/KubernetesModule.cs index 06ef3ebe0..ac40df739 100644 --- a/source/Octopus.Tentacle/Kubernetes/KubernetesModule.cs +++ b/source/Octopus.Tentacle/Kubernetes/KubernetesModule.cs @@ -52,6 +52,9 @@ protected override void Load(ContainerBuilder builder) builder.RegisterType().As().As().SingleInstance(); builder.RegisterType().As().As().SingleInstance(); + + builder.RegisterType().As().As().SingleInstance(); + builder.RegisterType().As().SingleInstance(); builder.RegisterType().As().As().SingleInstance(); builder.RegisterType().As().SingleInstance(); diff --git a/source/Octopus.Tentacle/Kubernetes/KubernetesPendingPodWatchDog.cs b/source/Octopus.Tentacle/Kubernetes/KubernetesPendingPodWatchDog.cs new file mode 100644 index 000000000..663e19cb3 --- /dev/null +++ b/source/Octopus.Tentacle/Kubernetes/KubernetesPendingPodWatchDog.cs @@ -0,0 +1,105 @@ +using System; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Octopus.Tentacle.Contracts; +using Octopus.Tentacle.Core.Diagnostics; +using Octopus.Tentacle.Time; +using Octopus.Time; +using Polly; + +namespace Octopus.Tentacle.Kubernetes +{ + public interface IKubernetesPendingPodWatchdog + { + Task StartAsync(CancellationToken cancellationToken); + } + + public class KubernetesPendingPodWatchdog : IKubernetesPendingPodWatchdog + { + readonly IKubernetesPodStatusProvider podStatusProvider; + readonly ISystemLog log; + readonly IClock clock; + readonly ITentacleScriptLogProvider scriptLogProvider; + + static readonly TimeSpan RecheckTime = TimeSpan.FromMinutes(1); + internal readonly TimeSpan? pendingPodsConsideredStuckAfterTimeSpan = KubernetesConfig.PendingPodsConsideredStuckAfterTimeSpan; + + public KubernetesPendingPodWatchdog( + IKubernetesPodStatusProvider podStatusProvider, + ISystemLog log, + IClock clock, + ITentacleScriptLogProvider scriptLogProvider) + { + this.podStatusProvider = podStatusProvider; + this.log = log; + this.clock = clock; + this.scriptLogProvider = scriptLogProvider; + } + + public async Task StartAsync(CancellationToken cancellationToken) + { + if (pendingPodsConsideredStuckAfterTimeSpan is null) + { + log.Verbose($"PendingPodWatchdog: Environment variable {KubernetesConfig.PendingPodsConsideredStuckAfterTimeSpanVariableName} is not set or is an invalid number, so watchdog will not start."); + return; + } + + const int maxDurationSeconds = 70; + + //We don't want the monitoring to ever stop + var policy = Policy.Handle().WaitAndRetryForeverAsync( + retry => TimeSpan.FromSeconds(ExponentialBackoff.GetDuration(retry, maxDurationSeconds)), + (ex, duration) => + { + log.Error(ex, "PendingPodWatchdog: An unexpected error occured while running watchdog loop, re-running in: " + duration); + }); + + await policy.ExecuteAsync(async ct => await WatchPendingPodsLoop(pendingPodsConsideredStuckAfterTimeSpan.Value ,ct), cancellationToken); + } + + async Task WatchPendingPodsLoop(TimeSpan consideredStuckTimeSpan, CancellationToken cancellationToken) + { + while (!cancellationToken.IsCancellationRequested) + { + log.Verbose("PendingPodWatchdog: Checking for stuck pending pods"); + CheckForStuckPendingPods(consideredStuckTimeSpan); + + var nextCheckTime = clock.GetUtcTime() + RecheckTime; + log.Verbose($"PendingPodWatchdog: Next check will happen at {nextCheckTime:O}"); + + await Task.Delay(RecheckTime, cancellationToken); + } + } + + void CheckForStuckPendingPods(TimeSpan consideredStuckTimeSpan) + { + var allPods = podStatusProvider.GetAllTrackedScriptPods(); + var pendingPods = allPods + .Where(p => p.State.Phase == TrackedScriptPodPhase.Pending) + .Where(p => p.CreationTimestamp is not null) + .ToList(); + + log.Verbose($"PendingPodWatchdog: {pendingPods.Count} pending pods found"); + + var now = clock.GetUtcTime(); + + //calculate the time, where if the pending pods creation date is before this, then the pods is considered stuck + var podsAreStuckBeforeThisTime = now - pendingPodsConsideredStuckAfterTimeSpan; + foreach (var pendingPod in pendingPods) + { + //if the creation timestamp is before the check time, then mark the pod as completed (which will then trigger the end state of the + if (pendingPod.CreationTimestamp!.Value < podsAreStuckBeforeThisTime) + { + log.Verbose($"PendingPodWatchdog: Pod {pendingPod.ScriptTicket} has been pending for more than {consideredStuckTimeSpan.Minutes} minutes"); + + var podScriptLog = scriptLogProvider.GetOrCreate(pendingPod.ScriptTicket); + podScriptLog.Error($"The Kubernetes Pod '{pendingPod.ScriptTicket.ToKubernetesScriptPodName()}' has been in the '{pendingPod.State.Phase}' for more than {consideredStuckTimeSpan.Minutes} minute."); + + //marks the pod as completed with a canceled exit code, which then lets the script orchestration handle it :) + pendingPod.MarkAsCompleted(ScriptExitCodes.CanceledExitCode, now); + } + } + } + } +} \ No newline at end of file diff --git a/source/Octopus.Tentacle/Kubernetes/KubernetesPendingPodWatchdogTask.cs b/source/Octopus.Tentacle/Kubernetes/KubernetesPendingPodWatchdogTask.cs new file mode 100644 index 000000000..e929959f7 --- /dev/null +++ b/source/Octopus.Tentacle/Kubernetes/KubernetesPendingPodWatchdogTask.cs @@ -0,0 +1,28 @@ +using System; +using System.Threading; +using System.Threading.Tasks; +using Octopus.Tentacle.Background; +using Octopus.Tentacle.Core.Diagnostics; + +namespace Octopus.Tentacle.Kubernetes +{ + public interface IKubernetesStuckPodWatchdogTask : IBackgroundTask + { + } + + public class KubernetesStuckPodWatchdogTask : BackgroundTask, IKubernetesStuckPodWatchdogTask + { + readonly IKubernetesPendingPodWatchdog watchdog; + + public KubernetesStuckPodWatchdogTask(IKubernetesPendingPodWatchdog watchdog, ISystemLog log) + : base(log, TimeSpan.FromSeconds(30)) + { + this.watchdog = watchdog; + } + + protected override async Task RunTask(CancellationToken cancellationToken) + { + await watchdog.StartAsync(cancellationToken); + } + } +} \ No newline at end of file diff --git a/source/Octopus.Tentacle/Kubernetes/KubernetesPodMonitor.cs b/source/Octopus.Tentacle/Kubernetes/KubernetesPodMonitor.cs index fe03845d3..367174667 100644 --- a/source/Octopus.Tentacle/Kubernetes/KubernetesPodMonitor.cs +++ b/source/Octopus.Tentacle/Kubernetes/KubernetesPodMonitor.cs @@ -222,6 +222,7 @@ public interface ITrackedScriptPod TrackedScriptPodState State { get; } ScriptTicket ScriptTicket { get; } void MarkAsCompleted(int exitCode, DateTimeOffset finishedAt); + public DateTimeOffset? CreationTimestamp { get; } } public class TrackedScriptPod : ITrackedScriptPod @@ -234,6 +235,8 @@ public class TrackedScriptPod : ITrackedScriptPod //We create a tracked Pod entry when creating the script Pod so we don't need to wait for the K8s watch event to come through public bool MightNotExistInClusterYet { get; set; } + public DateTimeOffset? CreationTimestamp { get; private set; } + public TrackedScriptPod(ScriptTicket ticket, IClock clock) { this.clock = clock; @@ -243,6 +246,11 @@ public TrackedScriptPod(ScriptTicket ticket, IClock clock) public void Update(V1Pod pod) { + //we assume the creation timestamp is in UTC + CreationTimestamp = pod.Metadata.CreationTimestamp is not null + ? new DateTimeOffset(pod.Metadata.CreationTimestamp.Value) + : null; + var scriptContainerState = GetScriptContainerState(); //If we can't find the container state, but the Pod exists, assume it's still creating/pending