NLB based loadbalancer service support in eks deploy (#132)

* Add support for eks deploy with NLB based LoadBalancer Services * Fix error handling in drain and cordon commands
gruntwork-io · Aug 12, 2021 · dc17187 · dc17187
1 parent 7e12102
commit dc17187
Show file tree

Hide file tree

Showing 10 changed files with 319 additions and 120 deletions.
diff --git a/commonerrors/commonerrors.go b/commonerrors/commonerrors.go
@@ -0,0 +1,15 @@
+// Package commonerrors contains error types that are common across the project.
+package commonerrors
+
+import "fmt"
+
+// ImpossibleErr marks a state the code treats as unreachable. There is nothing a user can do to remedy it: seeing one
+// means kubergrunt itself has a bug. The underlying string is the code printed at the end of the error message.
+type ImpossibleErr string
+
+// Error renders the fixed "please open an issue" message followed by the code carried in the value.
+func (err ImpossibleErr) Error() string {
+	code := string(err)
+	return fmt.Sprintf("You reached a point in kubergrunt that should not happen and is almost certainly a bug. Please open a GitHub issue on https://github.com/gruntwork-io/kubergrunt/issues with the contents of this error message. Code: %s", code)
+}
diff --git a/eks/asg.go b/eks/asg.go
@@ -7,9 +7,12 @@ import (
 	"github.com/aws/aws-sdk-go/service/autoscaling"
 	"github.com/aws/aws-sdk-go/service/ec2"
 	"github.com/aws/aws-sdk-go/service/elb"
+	"github.com/aws/aws-sdk-go/service/elbv2"
 	"github.com/gruntwork-io/go-commons/collections"
 	"github.com/gruntwork-io/go-commons/errors"
+	"github.com/hashicorp/go-multierror"
 
+	"github.com/gruntwork-io/kubergrunt/commonerrors"
 	"github.com/gruntwork-io/kubergrunt/kubectl"
 	"github.com/gruntwork-io/kubergrunt/logging"
 )
@@ -38,6 +41,7 @@ func scaleUp(
 	asgSvc *autoscaling.AutoScaling,
 	ec2Svc *ec2.EC2,
 	elbSvc *elb.ELB,
+	elbv2Svc *elbv2.ELBV2,
 	kubectlOptions *kubectl.KubectlOptions,
 	asgName string,
 	desiredCapacity int64,
@@ -86,14 +90,14 @@ func scaleUp(
 		logger.Errorf("Undo by terminating all the new instances and trying again")
 		return err
 	}
-	elbNames, err := kubectl.GetLoadBalancerNames(kubectlOptions)
+	elbs, err := kubectl.GetAWSLoadBalancers(kubectlOptions)
 	if err != nil {
 		logger.Errorf("Error retrieving associated ELB names of the Kubernetes services.")
 		// TODO: can we use stages to pick up from here?
 		logger.Errorf("Undo by terminating all the new instances and trying again")
 		return err
 	}
-	err = waitForAnyInstancesRegisteredToELB(elbSvc, elbNames, newInstanceIds)
+	err = waitForAnyInstancesRegisteredToELB(elbSvc, elbv2Svc, elbs, newInstanceIds)
 	if err != nil {
 		logger.Errorf("Timed out waiting for the instances to register to the Service ELBs.")
 		// TODO: can we use stages to pick up from here?
@@ -242,10 +246,9 @@ func detachInstances(asgSvc *autoscaling.AutoScaling, asgName string, idList []s
 	return nil
 }
 
-// waitForAnyInstancesRegisteredToELB waits until any of the instances provided are registered to all the classic ELBs
-// provided. Classic ELB is what is used by the LoadBalancer Service resource in Kubernetes.
-// Here we wait for any instance to be registered, because we only need one instance to be registered to preserve
-// service uptime, due to the way Kubernetes works.
+// waitForAnyInstancesRegisteredToELB waits until any of the instances provided are registered to all the ELBs
+// provided. Here we wait for any instance to be registered, because we only need one instance to be registered to
+// preserve service uptime, due to the way Kubernetes works.
 // Pros:
 // - Shorter wait time.
 // - Can continue on to drain nodes succinctly, which is also time consuming.
@@ -255,36 +258,36 @@ func detachInstances(asgSvc *autoscaling.AutoScaling, asgName string, idList []s
 // - Not all instances are registered, so there is no "load balancing" initially. This may bring down the new server
 //   that is launched.
 // Ultimately, it was decided that the cons are not worth the extended wait time it will introduce to the command.
-// TODO: Update this when:
-// - we support ALB ingress controllers
-// - NLB for LoadBalancer Service resource comes out of alpha
-func waitForAnyInstancesRegisteredToELB(elbSvc *elb.ELB, elbNames []string, instanceIds []string) error {
+func waitForAnyInstancesRegisteredToELB(elbSvc *elb.ELB, elbv2Svc *elbv2.ELBV2, elbs []kubectl.AWSLoadBalancer, instanceIds []string) error {
 	logger := logging.GetProjectLogger()
 	logger.Infof("Verifying new nodes are registered to external load balancers.")
 
-	instances := []*elb.Instance{}
-	for _, instanceID := range instanceIds {
-		instances = append(instances, &elb.Instance{InstanceId: aws.String(instanceID)})
-	}
+	// Errors are accumulated across all load balancers rather than returning on the first failure, so that a single
+	// run reports every ELB that did not get an instance registered.
+	var multipleErrs *multierror.Error
+	for _, elb := range elbs {
+		if elb.TargetType == kubectl.IPTarget {
+			// We ignore ELBs of the IP type as those directly link to Pods and not instances.
+			continue
+		} else if elb.TargetType == kubectl.UnknownELBTarget {
+			// This should never happen, so we return a generic error that indicates this is an impossible condition and
+			// almost 100% a bug with kubergrunt.
+			// NOTE: multipleErrs must be passed as the first argument; otherwise Append starts a brand new list and
+			// silently drops every error collected for the load balancers processed before this one.
+			multipleErrs = multierror.Append(multipleErrs, commonerrors.ImpossibleErr("UNKNOWN_ELB_TARGET_TYPE_IN_WAIT"))
+			continue
+		}
 
-	multipleErrs := NewMultipleLookupErrors()
-	for _, elbName := range elbNames {
-		logger.Infof("Waiting for at least one instance to be in service for elb %s", elbName)
-		params := &elb.DescribeInstanceHealthInput{
-			LoadBalancerName: aws.String(elbName),
-			Instances:        instances,
+		// Dispatch on the load balancer generation: classic ELBs use the v1 API, ALB/NLB use the v2 API.
+		var err error
+		switch elb.Type {
+		case kubectl.CLB:
+			err = waitForAnyInstancesRegisteredToCLB(logger, elbSvc, elb.Name, instanceIds)
+		case kubectl.NLB, kubectl.ALB:
+			err = waitForAnyInstancesRegisteredToALBOrNLB(logger, elbv2Svc, elb.Name, instanceIds)
+		default:
+			// This should never happen, so we return a generic error that indicates this is an impossible condition and
+			// almost 100% a bug with kubergrunt.
+			err = commonerrors.ImpossibleErr("UNKNOWN_ELB_TYPE_IN_WAIT")
 		}
-		err := elbSvc.WaitUntilAnyInstanceInService(params)
 		if err != nil {
-			logger.Infof("ERROR: error waiting for any instance to be in service for elb %s", elbName)
-			multipleErrs.AddError(err)
-		} else {
-			logger.Infof("At least one instance in service for elb %s", elbName)
+			multipleErrs = multierror.Append(multipleErrs, err)
 		}
 	}
-	if !multipleErrs.IsEmpty() {
-		return multipleErrs
-	}
-	logger.Infof("All ELBs have at least one instance in service")
-	return nil
+	// ErrorOrNil yields a plain nil error when nothing was collected, avoiding a non-nil interface wrapping a nil pointer.
+	return multipleErrs.ErrorOrNil()
 }
diff --git a/eks/deploy.go b/eks/deploy.go
@@ -8,6 +8,7 @@ import (
 	"github.com/aws/aws-sdk-go/service/autoscaling"
 	"github.com/aws/aws-sdk-go/service/ec2"
 	"github.com/aws/aws-sdk-go/service/elb"
+	"github.com/aws/aws-sdk-go/service/elbv2"
 	"github.com/gruntwork-io/go-commons/errors"
 
 	"github.com/gruntwork-io/kubergrunt/eksawshelper"
@@ -47,6 +48,7 @@ func RollOutDeployment(
 	asgSvc := autoscaling.New(sess)
 	ec2Svc := ec2.New(sess)
 	elbSvc := elb.New(sess)
+	elbv2Svc := elbv2.New(sess)
 	logger.Infof("Successfully authenticated with AWS")
 
 	// Retrieve the ASG object and gather required info we will need later
@@ -89,6 +91,7 @@ func RollOutDeployment(
 		asgSvc,
 		ec2Svc,
 		elbSvc,
+		elbv2Svc,
 		kubectlOptions,
 		eksAsgName,
 		originalCapacity*2,

diff --git a/eks/elb.go b/eks/elb.go
@@ -0,0 +1,114 @@
+package eks
+
+import (
+	"fmt"
+	"time"
+
+	"github.com/aws/aws-sdk-go/aws"
+	"github.com/aws/aws-sdk-go/service/elb"
+	"github.com/aws/aws-sdk-go/service/elbv2"
+	"github.com/gruntwork-io/go-commons/collections"
+	"github.com/gruntwork-io/go-commons/errors"
+	"github.com/gruntwork-io/go-commons/retry"
+	"github.com/sirupsen/logrus"
+
+	"github.com/gruntwork-io/kubergrunt/commonerrors"
+)
+
+// waitForAnyInstancesRegisteredToALBOrNLB implements the logic to wait for instance registration to Application and
+// Network Load Balancers. Refer to function docs for waitForAnyInstancesRegisteredToELB for more info.
+//
+// It looks up the single target group of the load balancer named lbName, then polls DescribeTargetHealth until at least
+// one target whose ID is in instanceIDsToWaitFor shows up in the response. It returns nil once that happens, the lookup
+// error if the target group can not be resolved, or the (stack-traced) polling error otherwise.
+//
+// NOTE: this assumes the ELB is using the instance target type.
+// NOTE(review): only the presence of the instance in the target list is checked; the reported health state of the
+// target is not inspected. Confirm that "registered" (as opposed to "healthy") is the intended condition.
+func waitForAnyInstancesRegisteredToALBOrNLB(logger *logrus.Entry, elbv2Svc *elbv2.ELBV2, lbName string, instanceIDsToWaitFor []string) error {
+	targetGroup, err := getELBTargetGroup(elbv2Svc, lbName)
+	if err != nil {
+		return err
+	}
+
+	// Retry up to 10 minutes with 15 second retry sleep
+	waitErr := retry.DoWithRetry(
+		logger.Logger,
+		fmt.Sprintf(
+			"wait for expected targets to be registered to target group %s of load balancer %s",
+			aws.StringValue(targetGroup.TargetGroupName),
+			lbName,
+		),
+		40, 15*time.Second,
+		func() error {
+			targetsResp, err := elbv2Svc.DescribeTargetHealth(&elbv2.DescribeTargetHealthInput{TargetGroupArn: targetGroup.TargetGroupArn})
+			if err != nil {
+				// An API failure is wrapped as a FatalError instead of being returned as-is; presumably this tells
+				// DoWithRetry to stop retrying immediately - confirm against the go-commons retry package.
+				return retry.FatalError{Underlying: err}
+			}
+
+			// Check each target to see if it is one of the instances we are waiting for, and return without error to
+			// stop the retry loop if that is the case since condition is met.
+			for _, targetHealth := range targetsResp.TargetHealthDescriptions {
+				// Skip entries without a target ID so the dereference below can not panic.
+				if targetHealth.Target == nil || targetHealth.Target.Id == nil {
+					continue
+				}
+				instanceID := *targetHealth.Target.Id
+				if collections.ListContainsElement(instanceIDsToWaitFor, instanceID) {
+					return nil
+				}
+			}
+			// A plain (non-fatal) error: none of the expected instances are listed yet, so the caller should retry.
+			return fmt.Errorf("No expected instances registered yet")
+		},
+	)
+	// Unwrap the FatalError marker so callers see the original API error rather than the retry wrapper.
+	if fatalWaitErr, isFatalErr := waitErr.(retry.FatalError); isFatalErr {
+		return errors.WithStackTrace(fatalWaitErr.Underlying)
+	}
+	// NOTE(review): this relies on errors.WithStackTrace passing a nil waitErr through as nil on success - confirm.
+	return errors.WithStackTrace(waitErr)
+}
+
+// waitForAnyInstancesRegisteredToCLB is the Classic Load Balancer half of waitForAnyInstancesRegisteredToELB (see that
+// function's docs for the rationale). It asks the ELB API waiter to block until any of the given instances is reported
+// in service for the load balancer named lbName, logging progress along the way, and returns the waiter's error as-is.
+func waitForAnyInstancesRegisteredToCLB(logger *logrus.Entry, elbSvc *elb.ELB, lbName string, instanceIds []string) error {
+	// Translate the raw instance IDs into the instance references the ELB API expects.
+	targets := make([]*elb.Instance, 0, len(instanceIds))
+	for i := range instanceIds {
+		targets = append(targets, &elb.Instance{InstanceId: aws.String(instanceIds[i])})
+	}
+
+	logger.Infof("Waiting for at least one instance to be in service for elb %s", lbName)
+	input := elb.DescribeInstanceHealthInput{
+		LoadBalancerName: aws.String(lbName),
+		Instances:        targets,
+	}
+	if waitErr := elbSvc.WaitUntilAnyInstanceInService(&input); waitErr != nil {
+		logger.Errorf("error waiting for any instance to be in service for elb %s", lbName)
+		return waitErr
+	}
+
+	logger.Infof("At least one instance in service for elb %s", lbName)
+	return nil
+}
+
+// getELBTargetGroup returns the target group attached to the load balancer named lbName. It relies on two assumptions:
+// - lbName refers to a v2 ELB (ALB or NLB)
+// - exactly one TargetGroup is associated with the ELB (this is enforced by the Kubernetes controllers)
+// A lookup that yields no load balancer returns CouldNotFindLoadBalancerErr; any other violation of the assumptions is
+// reported as a commonerrors.ImpossibleErr. All returned errors carry a stack trace.
+func getELBTargetGroup(elbv2Svc *elbv2.ELBV2, lbName string) (*elbv2.TargetGroup, error) {
+	lbInput := &elbv2.DescribeLoadBalancersInput{Names: aws.StringSlice([]string{lbName})}
+	lbResp, err := elbv2Svc.DescribeLoadBalancers(lbInput)
+	if err != nil {
+		return nil, errors.WithStackTrace(err)
+	}
+
+	found := lbResp.LoadBalancers
+	switch {
+	case len(found) == 0:
+		return nil, errors.WithStackTrace(CouldNotFindLoadBalancerErr{name: lbName})
+	case len(found) > 1:
+		// This condition is impossible because we are querying a single LB name and names are unique within regions.
+		return nil, errors.WithStackTrace(commonerrors.ImpossibleErr("MORE_THAN_ONE_ELB_IN_LOOKUP"))
+	case found[0] == nil:
+		return nil, errors.WithStackTrace(commonerrors.ImpossibleErr("ELB_IS_NULL_FROM_API"))
+	}
+	loadBalancer := found[0]
+
+	tgInput := &elbv2.DescribeTargetGroupsInput{LoadBalancerArn: loadBalancer.LoadBalancerArn}
+	tgResp, err := elbv2Svc.DescribeTargetGroups(tgInput)
+	if err != nil {
+		return nil, errors.WithStackTrace(err)
+	}
+
+	if groups := tgResp.TargetGroups; len(groups) == 1 {
+		return groups[0], nil
+	}
+	// Any other count is an impossible condition because the load balancer controllers always create exactly one
+	// target group for the ELBs they provision.
+	return nil, errors.WithStackTrace(commonerrors.ImpossibleErr("ELB_HAS_UNEXPECTED_NUMBER_OF_TARGET_GROUPS"))
+}
diff --git a/eks/errors.go b/eks/errors.go
@@ -168,3 +168,15 @@ func (err NetworkInterfaceDeletedTimeoutError) Error() string {
 		err.networkInterfaceId,
 	)
 }
+
+// CouldNotFindLoadBalancerErr reports that a lookup for the ELB with the recorded name came back empty.
+type CouldNotFindLoadBalancerErr struct {
+	// name is the load balancer name that was searched for.
+	name string
+}
+
+// Error names the load balancer that could not be found.
+func (err CouldNotFindLoadBalancerErr) Error() string {
+	const msgFormat = "Could not find ELB with name %s."
+	return fmt.Sprintf(msgFormat, err.name)
+}
diff --git a/go.mod b/go.mod
@@ -7,6 +7,7 @@ require (
 	github.com/blang/semver/v4 v4.0.0
 	github.com/gruntwork-io/go-commons v0.8.2
 	github.com/gruntwork-io/terratest v0.32.9
+	github.com/hashicorp/go-multierror v1.1.0
 	github.com/mitchellh/go-homedir v1.1.0
 	github.com/sirupsen/logrus v1.6.0
 	github.com/stretchr/testify v1.6.1

diff --git a/kubectl/errors.go b/kubectl/errors.go
@@ -55,66 +55,12 @@ type NodeDrainError struct {
 	NodeID string
 }
 
-// NodeDrainErrors is returned when there are errors draining nodes concurrently. Each node that has an error is added
-// to the list.
-type NodeDrainErrors struct {
-	errors []NodeDrainError
-}
-
-func (err NodeDrainErrors) Error() string {
-	base := "Multiple errors caught while draining a node:\n"
-	for _, subErr := range err.errors {
-		subErrMessage := fmt.Sprintf("Node %s: %s", subErr.NodeID, subErr.Error)
-		base = base + subErrMessage + "\n"
-	}
-	return base
-}
-
-func (err NodeDrainErrors) AddError(newErr NodeDrainError) {
-	err.errors = append(err.errors, newErr)
-}
-
-func (err NodeDrainErrors) IsEmpty() bool {
-	return len(err.errors) == 0
-}
-
-func NewNodeDrainErrors() NodeDrainErrors {
-	return NodeDrainErrors{[]NodeDrainError{}}
-}
-
 // NodeCordonError is returned when there is an error cordoning a node.
 type NodeCordonError struct {
 	Error  error
 	NodeID string
 }
 
-// NodeCordonErrors is returned when there are errors cordoning nodes concurrently. Each node that has an error is added
-// to the list.
-type NodeCordonErrors struct {
-	errors []NodeCordonError
-}
-
-func (err NodeCordonErrors) Error() string {
-	base := "Multiple errors caught while cordoning nodes:\n"
-	for _, subErr := range err.errors {
-		subErrMessage := fmt.Sprintf("Node %s: %s", subErr.NodeID, subErr.Error)
-		base = base + subErrMessage + "\n"
-	}
-	return base
-}
-
-func (err NodeCordonErrors) AddError(newErr NodeCordonError) {
-	err.errors = append(err.errors, newErr)
-}
-
-func (err NodeCordonErrors) IsEmpty() bool {
-	return len(err.errors) == 0
-}
-
-func NewNodeCordonErrors() NodeCordonErrors {
-	return NodeCordonErrors{[]NodeCordonError{}}
-}
-
 // LoadBalancerNotReadyError is returned when the LoadBalancer Service is unexpectedly not ready.
 type LoadBalancerNotReadyError struct {
 	serviceName string
@@ -154,3 +100,17 @@ func (err ProvisionIngressEndpointTimeoutError) Error() string {
 		err.namespace,
 	)
 }
+
+// UnknownAWSLoadBalancerTypeErr reports a load balancer type that is not expected or supported. It records which
+// annotation carried the type and the value that was found there.
+type UnknownAWSLoadBalancerTypeErr struct {
+	// typeKey is the annotation key the type was read from.
+	typeKey string
+	// typeStr is the unrecognized value of that annotation.
+	typeStr string
+}
+
+// Error names the offending annotation together with its value.
+func (err UnknownAWSLoadBalancerTypeErr) Error() string {
+	const msgFormat = "Unknown value for annotation %s (value: %s)"
+	return fmt.Sprintf(msgFormat, err.typeKey, err.typeStr)
+}