Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions source/Octopus.Tentacle/Kubernetes/KubernetesConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ public static class KubernetesConfig
public static bool DisableAutomaticPodCleanup => bool.TryParse(Environment.GetEnvironmentVariable($"{EnvVarPrefix}__DISABLEAUTOPODCLEANUP"), out var disableAutoCleanup) && disableAutoCleanup;
public static bool DisablePodEventsInTaskLog => bool.TryParse(Environment.GetEnvironmentVariable($"{EnvVarPrefix}__DISABLEPODEVENTSINTASKLOG"), out var disable) && disable;

public static string PendingPodsConsideredStuckAfterTimeSpanVariableName = $"{EnvVarPrefix}__PENDINGPODSSTUCKAFTERMINUTES";
public static TimeSpan? PendingPodsConsideredStuckAfterTimeSpan => int.TryParse(Environment.GetEnvironmentVariable(PendingPodsConsideredStuckAfterTimeSpanVariableName), out var stuckPodsMinutes) ? TimeSpan.FromMinutes(stuckPodsMinutes) : null;

public static string HelmReleaseNameVariableName => $"{EnvVarPrefix}__HELMRELEASENAME";
public static string HelmReleaseName => GetRequiredEnvVar(HelmReleaseNameVariableName, "Unable to determine Helm release name.");

Expand Down
3 changes: 3 additions & 0 deletions source/Octopus.Tentacle/Kubernetes/KubernetesModule.cs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ protected override void Load(ContainerBuilder builder)

builder.RegisterType<KubernetesPodMonitorTask>().As<IKubernetesPodMonitorTask>().As<IBackgroundTask>().SingleInstance();
builder.RegisterType<KubernetesPodMonitor>().As<IKubernetesPodMonitor>().As<IKubernetesPodStatusProvider>().SingleInstance();

builder.RegisterType<KubernetesStuckPodWatchdogTask>().As<IKubernetesStuckPodWatchdogTask>().As<IBackgroundTask>().SingleInstance();
builder.RegisterType<KubernetesPendingPodWatchdog>().As<IKubernetesPendingPodWatchdog>().SingleInstance();

builder.RegisterType<KubernetesOrphanedPodCleanerTask>().As<IKubernetesOrphanedPodCleanerTask>().As<IBackgroundTask>().SingleInstance();
builder.RegisterType<KubernetesOrphanedPodCleaner>().As<IKubernetesOrphanedPodCleaner>().SingleInstance();
Expand Down
105 changes: 105 additions & 0 deletions source/Octopus.Tentacle/Kubernetes/KubernetesPendingPodWatchDog.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
using System;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using Octopus.Tentacle.Contracts;
using Octopus.Tentacle.Core.Diagnostics;
using Octopus.Tentacle.Time;
using Octopus.Time;
using Polly;

namespace Octopus.Tentacle.Kubernetes
{
public interface IKubernetesPendingPodWatchdog
{
Task StartAsync(CancellationToken cancellationToken);
}

public class KubernetesPendingPodWatchdog : IKubernetesPendingPodWatchdog
{
readonly IKubernetesPodStatusProvider podStatusProvider;
readonly ISystemLog log;
readonly IClock clock;
readonly ITentacleScriptLogProvider scriptLogProvider;

static readonly TimeSpan RecheckTime = TimeSpan.FromMinutes(1);
internal readonly TimeSpan? pendingPodsConsideredStuckAfterTimeSpan = KubernetesConfig.PendingPodsConsideredStuckAfterTimeSpan;

public KubernetesPendingPodWatchdog(
IKubernetesPodStatusProvider podStatusProvider,
ISystemLog log,
IClock clock,
ITentacleScriptLogProvider scriptLogProvider)
{
this.podStatusProvider = podStatusProvider;
this.log = log;
this.clock = clock;
this.scriptLogProvider = scriptLogProvider;
}

public async Task StartAsync(CancellationToken cancellationToken)
{
if (pendingPodsConsideredStuckAfterTimeSpan is null)
{
log.Verbose($"PendingPodWatchdog: Environment variable {KubernetesConfig.PendingPodsConsideredStuckAfterTimeSpanVariableName} is not set or is an invalid number, so watchdog will not start.");
return;
}

const int maxDurationSeconds = 70;

//We don't want the monitoring to ever stop
var policy = Policy.Handle<Exception>().WaitAndRetryForeverAsync(
retry => TimeSpan.FromSeconds(ExponentialBackoff.GetDuration(retry, maxDurationSeconds)),
(ex, duration) =>
{
log.Error(ex, "PendingPodWatchdog: An unexpected error occured while running watchdog loop, re-running in: " + duration);
});

await policy.ExecuteAsync(async ct => await WatchPendingPodsLoop(pendingPodsConsideredStuckAfterTimeSpan.Value ,ct), cancellationToken);
}

async Task WatchPendingPodsLoop(TimeSpan consideredStuckTimeSpan, CancellationToken cancellationToken)
{
while (!cancellationToken.IsCancellationRequested)
{
log.Verbose("PendingPodWatchdog: Checking for stuck pending pods");
CheckForStuckPendingPods(consideredStuckTimeSpan);

var nextCheckTime = clock.GetUtcTime() + RecheckTime;
log.Verbose($"PendingPodWatchdog: Next check will happen at {nextCheckTime:O}");

await Task.Delay(RecheckTime, cancellationToken);
}
}

void CheckForStuckPendingPods(TimeSpan consideredStuckTimeSpan)
{
var allPods = podStatusProvider.GetAllTrackedScriptPods();
var pendingPods = allPods
.Where(p => p.State.Phase == TrackedScriptPodPhase.Pending)
.Where(p => p.CreationTimestamp is not null)
.ToList();

log.Verbose($"PendingPodWatchdog: {pendingPods.Count} pending pods found");

var now = clock.GetUtcTime();

//calculate the time, where if the pending pods creation date is before this, then the pods is considered stuck
var podsAreStuckBeforeThisTime = now - pendingPodsConsideredStuckAfterTimeSpan;
foreach (var pendingPod in pendingPods)
{
//if the creation timestamp is before the check time, then mark the pod as completed (which will then trigger the end state of the
if (pendingPod.CreationTimestamp!.Value < podsAreStuckBeforeThisTime)
{
log.Verbose($"PendingPodWatchdog: Pod {pendingPod.ScriptTicket} has been pending for more than {consideredStuckTimeSpan.Minutes} minutes");

var podScriptLog = scriptLogProvider.GetOrCreate(pendingPod.ScriptTicket);
podScriptLog.Error($"The Kubernetes Pod '{pendingPod.ScriptTicket.ToKubernetesScriptPodName()}' has been in the '{pendingPod.State.Phase}' for more than {consideredStuckTimeSpan.Minutes} minute.");

//marks the pod as completed with a canceled exit code, which then lets the script orchestration handle it :)
pendingPod.MarkAsCompleted(ScriptExitCodes.CanceledExitCode, now);
}
}
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
using System;
using System.Threading;
using System.Threading.Tasks;
using Octopus.Tentacle.Background;
using Octopus.Tentacle.Core.Diagnostics;

namespace Octopus.Tentacle.Kubernetes
{
public interface IKubernetesStuckPodWatchdogTask : IBackgroundTask
{
}

public class KubernetesStuckPodWatchdogTask : BackgroundTask, IKubernetesStuckPodWatchdogTask
{
readonly IKubernetesPendingPodWatchdog watchdog;

public KubernetesStuckPodWatchdogTask(IKubernetesPendingPodWatchdog watchdog, ISystemLog log)
: base(log, TimeSpan.FromSeconds(30))
{
this.watchdog = watchdog;
}

protected override async Task RunTask(CancellationToken cancellationToken)
{
await watchdog.StartAsync(cancellationToken);
}
}
}
8 changes: 8 additions & 0 deletions source/Octopus.Tentacle/Kubernetes/KubernetesPodMonitor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ public interface ITrackedScriptPod
TrackedScriptPodState State { get; }
ScriptTicket ScriptTicket { get; }
void MarkAsCompleted(int exitCode, DateTimeOffset finishedAt);
public DateTimeOffset? CreationTimestamp { get; }
}

public class TrackedScriptPod : ITrackedScriptPod
Expand All @@ -234,6 +235,8 @@ public class TrackedScriptPod : ITrackedScriptPod
//We create a tracked Pod entry when creating the script Pod so we don't need to wait for the K8s watch event to come through
public bool MightNotExistInClusterYet { get; set; }

public DateTimeOffset? CreationTimestamp { get; private set; }

public TrackedScriptPod(ScriptTicket ticket, IClock clock)
{
this.clock = clock;
Expand All @@ -243,6 +246,11 @@ public TrackedScriptPod(ScriptTicket ticket, IClock clock)

public void Update(V1Pod pod)
{
//we assume the creation timestamp is in UTC
CreationTimestamp = pod.Metadata.CreationTimestamp is not null
? new DateTimeOffset(pod.Metadata.CreationTimestamp.Value)
: null;

var scriptContainerState = GetScriptContainerState();

//If we can't find the container state, but the Pod exists, assume it's still creating/pending
Expand Down