From cc553ee619e3cf29610f8458b35c9e5ebe525e5b Mon Sep 17 00:00:00 2001
From: matth- <matth-@users.noreply.github.com>
Date: Tue, 11 Apr 2023 14:18:00 +0200
Subject: [PATCH] feat(lb): add lb annotation for redispatch, retries and
 transient_check_delay (#127)

Co-authored-by: Matthieu Morel <mmorel@scaleway.com>
---
 docs/loadbalancer-annotations.md | 14 ++++-
 go.mod                           |  2 +-
 go.sum                           |  4 ++
 scaleway/loadbalancers.go        | 89 ++++++++++++++++++++++++++++++++
 4 files changed, 107 insertions(+), 2 deletions(-)
diff --git a/docs/loadbalancer-annotations.md b/docs/loadbalancer-annotations.md
index b991000..397cc6c 100644
--- a/docs/loadbalancer-annotations.md
+++ b/docs/loadbalancer-annotations.md
@@ -22,7 +22,7 @@ You can get a list of working annotation on in the Scaleway loadBalancer [docume
 
 Note:
 - If an invalid mode is passed in the annotation, the service will throw an error.
-- If an annotation is not specified, the cloud controller manager will apply default configuration.  
+- If an annotation is not specified, the cloud controller manager will apply default configuration.
 
 ### `service.beta.kubernetes.io/scw-loadbalancer-id`
 This annotation is the ID of the loadbalancer to use. It is populated by the CCM with the new LB ID if the annotation does not exist.
@@ -51,6 +51,10 @@ NB: depending on the type, some other annotations are required, see below.
 This is the annotation to set the time between two consecutive health checks.
 The default value is `5s`. The duration are go's time.Duration (ex: `1s`, `2m`, `4h`, ...).
 
+### `service.beta.kubernetes.io/scw-loadbalancer-health-transient-check-delay`
+This is the annotation to set the time between two consecutive health checks in a transient state (going UP or DOWN).
+The default value is `0.5s`. The duration are go's time.Duration (ex: `1s`, `2m`, `4h`, ...).
+
 ### `service.beta.kubernetes.io/scw-loadbalancer-health-check-timeout`
 This is the annotaton to set the additional check timeout, after the connection has been already established.
 The default value is `5s`. The duration are go's time.Duration (ex: `1s`, `2m`, `4h`, ...).
@@ -134,3 +138,11 @@ The possible format are:
  - `<certificate-id>`: will use this certificate for all frontends
  - `<certificate-id>,<certificate-id>` will use these certificates for all frontends
  - `<port1>:<certificate1-id>,<certificate2-id>;<port2>,<port3>:<certificate3-id>` will use certificate 1 and 2 for frontend with port port1 and certificate3 for frotend with port port2 and port3
+
+### `service.beta.kubernetes.io/scw-loadbalancer-redispatch-attempt-count`
+This is the annotation to activate redispatch on another backend server in case of failure.
+The default value is 0, which disables redispatch.
+
+### `service.beta.kubernetes.io/scw-loadbalancer-max-retries`
+This is the annotation to configure the number of retries on connection failure.
+The default value is 3.
diff --git a/go.mod b/go.mod
index 72d078b..72a91a6 100644
--- a/go.mod
+++ b/go.mod
@@ -3,7 +3,7 @@ module github.com/scaleway/scaleway-cloud-controller-manager
 go 1.17
 
 require (
-	github.com/scaleway/scaleway-sdk-go v1.0.0-beta.14.0.20230318120603-8df14b12fd02
+	github.com/scaleway/scaleway-sdk-go v1.0.0-beta.15.0.20230327160534-01e8b89ed721
 	github.com/spf13/pflag v1.0.5
 	k8s.io/api v0.21.0
 	k8s.io/apimachinery v0.21.0
diff --git a/go.sum b/go.sum
index de857fb..30dcbec 100644
--- a/go.sum
+++ b/go.sum
@@ -341,6 +341,10 @@ github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQD
 github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts=
 github.com/scaleway/scaleway-sdk-go v1.0.0-beta.14.0.20230318120603-8df14b12fd02 h1:Ipdjm5oXQtzoHxsc1Pj7LqOlNJh5HrVb4R9TRpFiwKM=
 github.com/scaleway/scaleway-sdk-go v1.0.0-beta.14.0.20230318120603-8df14b12fd02/go.mod h1:fCa7OJZ/9DRTnOKmxvT6pn+LPWUptQAmHF/SBJUGEcg=
+github.com/scaleway/scaleway-sdk-go v1.0.0-beta.15 h1:Y7xOFbD+3jaPw+VN7lkakNJ/pa+ZSQVFp1ONtJaBxns=
+github.com/scaleway/scaleway-sdk-go v1.0.0-beta.15/go.mod h1:fCa7OJZ/9DRTnOKmxvT6pn+LPWUptQAmHF/SBJUGEcg=
+github.com/scaleway/scaleway-sdk-go v1.0.0-beta.15.0.20230327160534-01e8b89ed721 h1:sH3/HWBjaFYFuWnmA2biULFcq6KTc1OP3cdz/OPMhAI=
+github.com/scaleway/scaleway-sdk-go v1.0.0-beta.15.0.20230327160534-01e8b89ed721/go.mod h1:fCa7OJZ/9DRTnOKmxvT6pn+LPWUptQAmHF/SBJUGEcg=
 github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc=
 github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
 github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
diff --git a/scaleway/loadbalancers.go b/scaleway/loadbalancers.go
index b15d886..0b92153 100644
--- a/scaleway/loadbalancers.go
+++ b/scaleway/loadbalancers.go
@@ -28,6 +28,7 @@ import (
 	scwlb "github.com/scaleway/scaleway-sdk-go/api/lb/v1"
 	"github.com/scaleway/scaleway-sdk-go/scw"
 	"github.com/scaleway/scaleway-sdk-go/validation"
+	"google.golang.org/protobuf/types/known/durationpb"
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/klog/v2"
 )
@@ -60,6 +61,10 @@ const (
 	// The default value is "5s". The duration are go's time.Duration (ex: "1s", "2m", "4h", ...)
 	serviceAnnotationLoadBalancerHealthCheckDelay = "service.beta.kubernetes.io/scw-loadbalancer-health-check-delay"
 
+	// serviceAnnotationLoadBalancerHealthTransientCheckDelay is the time between two consecutive health checks on transient state (going UP or DOWN)
+	// The default value is "0.5s". The duration are go's time.Duration (ex: "1s", "2m", "4h", ...)
+	serviceAnnotationLoadBalancerHealthTransientCheckDelay = "service.beta.kubernetes.io/scw-loadbalancer-health-transient-check-delay"
+
 	// serviceAnnotationLoadBalancerHealthCheckTimeout is the additional check timeout, after the connection has been already established
 	// The default value is "5s". The duration are go's time.Duration (ex: "1s", "2m", "4h", ...)
 	serviceAnnotationLoadBalancerHealthCheckTimeout = "service.beta.kubernetes.io/scw-loadbalancer-health-check-timeout"
@@ -160,6 +165,14 @@ const (
 	// serviceAnnotationLoadBalancerTargetNodeLabels is the annotation to target nodes with specific label(s)
 	// Expected format: "Key1=Val1,Key2=Val2"
 	serviceAnnotationLoadBalancerTargetNodeLabels = "service.beta.kubernetes.io/scw-loadbalancer-target-node-labels"
+
+	// serviceAnnotationLoadBalancerRedispatchAttemptCount is the annotation to activate redispatch on another backend server in case of failure
+	// The default value is "0", which disables redispatch
+	serviceAnnotationLoadBalancerRedispatchAttemptCount = "service.beta.kubernetes.io/scw-loadbalancer-redispatch-attempt-count"
+
+	// serviceAnnotationLoadBalancerMaxRetries is the annotation to configure the number of retries on connection failure
+	// The default value is 3.
+	serviceAnnotationLoadBalancerMaxRetries = "service.beta.kubernetes.io/scw-loadbalancer-max-retries"
 )
 
 const MaxEntriesPerACL = 60
@@ -969,6 +982,20 @@ func (l *loadbalancers) makeUpdateBackendRequest(backend *scwlb.Backend, service
 
 	request.OnMarkedDownAction = onMarkedDownAction
 
+	redispatchAttemptCount, err := getRedisatchAttemptCount(service)
+	if err != nil {
+		return nil, err
+	}
+
+	request.RedispatchAttemptCount = redispatchAttemptCount
+
+	maxRetries, err := getMaxRetries(service)
+	if err != nil {
+		return nil, err
+	}
+
+	request.MaxRetries = maxRetries
+
 	return request, nil
 }
 
@@ -999,6 +1026,13 @@ func (l *loadbalancers) makeUpdateHealthCheckRequest(backend *scwlb.Backend, nod
 
 	request.CheckMaxRetries = healthCheckMaxRetries
 
+	transientCheckDelay, err := getHealthCheckTransientCheckDelay(service)
+	if err != nil {
+		return nil, err
+	}
+
+	request.TransientCheckDelay = transientCheckDelay
+
 	healthCheckType, err := getHealthCheckType(service, nodePort)
 	if err != nil {
 		return nil, err
@@ -1157,6 +1191,12 @@ func (l *loadbalancers) makeCreateBackendRequest(loadbalancer *scwlb.LB, nodePor
 		return nil, err
 	}
 
+	healthCheckTransientCheckDelay, err := getHealthCheckTransientCheckDelay(service)
+	if err != nil {
+		return nil, err
+	}
+	healthCheck.TransientCheckDelay = healthCheckTransientCheckDelay
+
 	healthCheck.CheckMaxRetries = healthCheckMaxRetries
 
 	healthCheckType, err := getHealthCheckType(service, nodePort)
@@ -1466,6 +1506,36 @@ func getOnMarkedDownAction(service *v1.Service) (scwlb.OnMarkedDownAction, error
 	return onMarkedDownActionValue, nil
 }
 
+// getRedisatchAttemptCount returns the redispatch attempt count annotation as an int32 (nil when unset); values outside the int32 range are rejected.
+func getRedisatchAttemptCount(service *v1.Service) (*int32, error) {
+	redispatchAttemptCount, ok := service.Annotations[serviceAnnotationLoadBalancerRedispatchAttemptCount]
+	if !ok {
+		return nil, nil
+	}
+	redispatchAttemptCountInt, err := strconv.ParseInt(redispatchAttemptCount, 10, 32)
+	if err != nil {
+		klog.Errorf("invalid value for annotation %s", serviceAnnotationLoadBalancerRedispatchAttemptCount)
+		return nil, errLoadBalancerInvalidAnnotation
+	}
+	redispatchAttemptCountInt32 := int32(redispatchAttemptCountInt)
+	return &redispatchAttemptCountInt32, nil
+}
+
+// getMaxRetries returns the max retries annotation as an int32 (nil when unset); values outside the int32 range are rejected.
+func getMaxRetries(service *v1.Service) (*int32, error) {
+	maxRetriesCount, ok := service.Annotations[serviceAnnotationLoadBalancerMaxRetries]
+	if !ok {
+		return nil, nil
+	}
+	maxRetriesCountInt, err := strconv.ParseInt(maxRetriesCount, 10, 32)
+	if err != nil {
+		klog.Errorf("invalid value for annotation %s", serviceAnnotationLoadBalancerMaxRetries)
+		return nil, errLoadBalancerInvalidAnnotation
+	}
+	maxRetriesCountInt32 := int32(maxRetriesCountInt)
+	return &maxRetriesCountInt32, nil
+}
+
 func getHealthCheckDelay(service *v1.Service) (time.Duration, error) {
 	healthCheckDelay, ok := service.Annotations[serviceAnnotationLoadBalancerHealthCheckDelay]
 	if !ok {
@@ -1511,6 +1581,25 @@ func getHealthCheckMaxRetries(service *v1.Service) (int32, error) {
 	return int32(healthCheckMaxRetriesInt), nil
 }
 
+// getHealthCheckTransientCheckDelay converts the transient-check-delay annotation to a scw.Duration (nil when unset).
+func getHealthCheckTransientCheckDelay(service *v1.Service) (*scw.Duration, error) {
+	transientCheckDelay, ok := service.Annotations[serviceAnnotationLoadBalancerHealthTransientCheckDelay]
+	if !ok {
+		return nil, nil
+	}
+	transientCheckDelayDuration, err := time.ParseDuration(transientCheckDelay)
+	if err != nil {
+		klog.Errorf("invalid value for annotation %s", serviceAnnotationLoadBalancerHealthTransientCheckDelay)
+		return nil, errLoadBalancerInvalidAnnotation
+	}
+	// durationpb.New splits the duration into whole seconds and remaining nanoseconds.
+	pbDuration := durationpb.New(transientCheckDelayDuration)
+	return &scw.Duration{
+		Seconds: pbDuration.Seconds,
+		Nanos:   pbDuration.Nanos,
+	}, nil
+}
+
 func getForceInternalIP(service *v1.Service) bool {
 	forceInternalIP, ok := service.Annotations[serviceAnnotationLoadBalancerForceInternalIP]
 	if !ok {