Skip to content

Commit

Permalink
NLB based loadbalancer service support in eks deploy (#132)
Browse files Browse the repository at this point in the history
* Add support for eks deploy with NLB based LoadBalancer Services

* Fix error handling in drain and cordon commands
  • Loading branch information
yorinasub17 authored Aug 12, 2021
1 parent 7e12102 commit dc17187
Show file tree
Hide file tree
Showing 10 changed files with 319 additions and 120 deletions.
15 changes: 15 additions & 0 deletions commonerrors/commonerrors.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Package commonerrors contains error types that are common across the project.
package commonerrors

import "fmt"

// ImpossibleErr signals that kubergrunt reached a state the code assumes can never occur. There is no user remedy for
// it: it always represents a bug in kubergrunt. The underlying string is a short code identifying where the condition
// was detected, and it is embedded in the rendered error message.
type ImpossibleErr string

// Error implements the error interface. The message asks the user to open a GitHub issue and includes the code.
func (e ImpossibleErr) Error() string {
	// Convert to a plain string first: formatting the ImpossibleErr value itself with %s would invoke Error() again.
	code := string(e)
	return fmt.Sprintf(
		"You reached a point in kubergrunt that should not happen and is almost certainly a bug. Please open a GitHub issue on https://github.com/gruntwork-io/kubergrunt/issues with the contents of this error message. Code: %s",
		code,
	)
}
63 changes: 33 additions & 30 deletions eks/asg.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,12 @@ import (
"github.com/aws/aws-sdk-go/service/autoscaling"
"github.com/aws/aws-sdk-go/service/ec2"
"github.com/aws/aws-sdk-go/service/elb"
"github.com/aws/aws-sdk-go/service/elbv2"
"github.com/gruntwork-io/go-commons/collections"
"github.com/gruntwork-io/go-commons/errors"
"github.com/hashicorp/go-multierror"

"github.com/gruntwork-io/kubergrunt/commonerrors"
"github.com/gruntwork-io/kubergrunt/kubectl"
"github.com/gruntwork-io/kubergrunt/logging"
)
Expand Down Expand Up @@ -38,6 +41,7 @@ func scaleUp(
asgSvc *autoscaling.AutoScaling,
ec2Svc *ec2.EC2,
elbSvc *elb.ELB,
elbv2Svc *elbv2.ELBV2,
kubectlOptions *kubectl.KubectlOptions,
asgName string,
desiredCapacity int64,
Expand Down Expand Up @@ -86,14 +90,14 @@ func scaleUp(
logger.Errorf("Undo by terminating all the new instances and trying again")
return err
}
elbNames, err := kubectl.GetLoadBalancerNames(kubectlOptions)
elbs, err := kubectl.GetAWSLoadBalancers(kubectlOptions)
if err != nil {
logger.Errorf("Error retrieving associated ELB names of the Kubernetes services.")
// TODO: can we use stages to pick up from here?
logger.Errorf("Undo by terminating all the new instances and trying again")
return err
}
err = waitForAnyInstancesRegisteredToELB(elbSvc, elbNames, newInstanceIds)
err = waitForAnyInstancesRegisteredToELB(elbSvc, elbv2Svc, elbs, newInstanceIds)
if err != nil {
logger.Errorf("Timed out waiting for the instances to register to the Service ELBs.")
// TODO: can we use stages to pick up from here?
Expand Down Expand Up @@ -242,10 +246,9 @@ func detachInstances(asgSvc *autoscaling.AutoScaling, asgName string, idList []s
return nil
}

// waitForAnyInstancesRegisteredToELB waits until any of the instances provided are registered to all the classic ELBs
// provided. Classic ELB is what is used by the LoadBalancer Service resource in Kubernetes.
// Here we wait for any instance to be registered, because we only need one instance to be registered to preserve
// service uptime, due to the way Kubernetes works.
// waitForAnyInstancesRegisteredToELB waits until any of the instances provided are registered to all the ELBs
// provided. Here we wait for any instance to be registered, because we only need one instance to be registered to
// preserve service uptime, due to the way Kubernetes works.
// Pros:
// - Shorter wait time.
// - Can continue on to drain nodes succinctly, which is also time consuming.
Expand All @@ -255,36 +258,36 @@ func detachInstances(asgSvc *autoscaling.AutoScaling, asgName string, idList []s
// - Not all instances are registered, so there is no "load balancing" initially. This may bring down the new server
// that is launched.
// Ultimately, it was decided that the cons are not worth the extended wait time it will introduce to the command.
// TODO: Update this when:
// - we support ALB ingress controllers
// - NLB for LoadBalancer Service resource comes out of alpha
func waitForAnyInstancesRegisteredToELB(elbSvc *elb.ELB, elbNames []string, instanceIds []string) error {
func waitForAnyInstancesRegisteredToELB(elbSvc *elb.ELB, elbv2Svc *elbv2.ELBV2, elbs []kubectl.AWSLoadBalancer, instanceIds []string) error {
logger := logging.GetProjectLogger()
logger.Infof("Verifying new nodes are registered to external load balancers.")

instances := []*elb.Instance{}
for _, instanceID := range instanceIds {
instances = append(instances, &elb.Instance{InstanceId: aws.String(instanceID)})
}
var multipleErrs *multierror.Error
for _, elb := range elbs {
if elb.TargetType == kubectl.IPTarget {
// We ignore ELBs of the IP type as those directly link to Pods and not instances.
continue
} else if elb.TargetType == kubectl.UnknownELBTarget {
// This should never happen, so we return a generic error that indicates this is an impossible condition and
// almost 100% a bug with kubergrunt.
multipleErrs = multierror.Append(commonerrors.ImpossibleErr("UNKNOWN_ELB_TARGET_TYPE_IN_WAIT"))
continue
}

multipleErrs := NewMultipleLookupErrors()
for _, elbName := range elbNames {
logger.Infof("Waiting for at least one instance to be in service for elb %s", elbName)
params := &elb.DescribeInstanceHealthInput{
LoadBalancerName: aws.String(elbName),
Instances: instances,
var err error
switch elb.Type {
case kubectl.CLB:
err = waitForAnyInstancesRegisteredToCLB(logger, elbSvc, elb.Name, instanceIds)
case kubectl.NLB, kubectl.ALB:
err = waitForAnyInstancesRegisteredToALBOrNLB(logger, elbv2Svc, elb.Name, instanceIds)
default:
// This should never happen, so we return a generic error that indicates this is an impossible condition and
// almost 100% a bug with kubergrunt.
err = commonerrors.ImpossibleErr("UNKNOWN_ELB_TYPE_IN_WAIT")
}
err := elbSvc.WaitUntilAnyInstanceInService(params)
if err != nil {
logger.Infof("ERROR: error waiting for any instance to be in service for elb %s", elbName)
multipleErrs.AddError(err)
} else {
logger.Infof("At least one instance in service for elb %s", elbName)
multipleErrs = multierror.Append(multipleErrs, err)
}
}
if !multipleErrs.IsEmpty() {
return multipleErrs
}
logger.Infof("All ELBs have at least one instance in service")
return nil
return multipleErrs.ErrorOrNil()
}
3 changes: 3 additions & 0 deletions eks/deploy.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"github.com/aws/aws-sdk-go/service/autoscaling"
"github.com/aws/aws-sdk-go/service/ec2"
"github.com/aws/aws-sdk-go/service/elb"
"github.com/aws/aws-sdk-go/service/elbv2"
"github.com/gruntwork-io/go-commons/errors"

"github.com/gruntwork-io/kubergrunt/eksawshelper"
Expand Down Expand Up @@ -47,6 +48,7 @@ func RollOutDeployment(
asgSvc := autoscaling.New(sess)
ec2Svc := ec2.New(sess)
elbSvc := elb.New(sess)
elbv2Svc := elbv2.New(sess)
logger.Infof("Successfully authenticated with AWS")

// Retrieve the ASG object and gather required info we will need later
Expand Down Expand Up @@ -89,6 +91,7 @@ func RollOutDeployment(
asgSvc,
ec2Svc,
elbSvc,
elbv2Svc,
kubectlOptions,
eksAsgName,
originalCapacity*2,
Expand Down
114 changes: 114 additions & 0 deletions eks/elb.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
package eks

import (
"fmt"
"time"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/elb"
"github.com/aws/aws-sdk-go/service/elbv2"
"github.com/gruntwork-io/go-commons/collections"
"github.com/gruntwork-io/go-commons/errors"
"github.com/gruntwork-io/go-commons/retry"
"github.com/sirupsen/logrus"

"github.com/gruntwork-io/kubergrunt/commonerrors"
)

// waitForAnyInstancesRegisteredToALBOrNLB polls the target group of the named Application or Network Load Balancer
// until at least one of instanceIDsToWaitFor appears among its targets. See the docs on
// waitForAnyInstancesRegisteredToELB for why a single instance is considered sufficient.
// NOTE: this assumes the ELB is using the instance target type.
//
// Errors from looking up the target group are returned as is. An error from DescribeTargetHealth aborts the polling
// immediately and is returned with a stack trace; otherwise the error produced by the retry helper (if any) is
// returned with a stack trace.
func waitForAnyInstancesRegisteredToALBOrNLB(logger *logrus.Entry, elbv2Svc *elbv2.ELBV2, lbName string, instanceIDsToWaitFor []string) error {
	tg, err := getELBTargetGroup(elbv2Svc, lbName)
	if err != nil {
		return err
	}

	// 40 retries spaced 15 seconds apart, i.e. roughly 10 minutes of waiting.
	const (
		maxRetries          = 40
		sleepBetweenRetries = 15 * time.Second
	)

	description := fmt.Sprintf(
		"wait for expected targets to be registered to target group %s of load balancer %s",
		aws.StringValue(tg.TargetGroupName),
		lbName,
	)

	// checkRegistered returns nil (which stops the retry loop) as soon as one of the targets in the group is an
	// instance we are waiting for.
	checkRegistered := func() error {
		healthResp, describeErr := elbv2Svc.DescribeTargetHealth(
			&elbv2.DescribeTargetHealthInput{TargetGroupArn: tg.TargetGroupArn},
		)
		if describeErr != nil {
			// API failures are marked fatal so that the retry helper stops instead of polling again.
			return retry.FatalError{Underlying: describeErr}
		}

		for _, description := range healthResp.TargetHealthDescriptions {
			target := description.Target
			if target != nil && target.Id != nil && collections.ListContainsElement(instanceIDsToWaitFor, *target.Id) {
				return nil
			}
		}
		return fmt.Errorf("No expected instances registered yet")
	}

	waitErr := retry.DoWithRetry(logger.Logger, description, maxRetries, sleepBetweenRetries, checkRegistered)

	// Unwrap the fatal marker so callers see the underlying AWS error rather than the retry wrapper.
	if fatal, ok := waitErr.(retry.FatalError); ok {
		return errors.WithStackTrace(fatal.Underlying)
	}
	return errors.WithStackTrace(waitErr)
}

// waitForAnyInstancesRegisteredToCLB blocks until the Classic Load Balancer named lbName reports at least one of the
// given instances as in service, using the SDK's WaitUntilAnyInstanceInService waiter. See the docs on
// waitForAnyInstancesRegisteredToELB for the reasoning behind waiting for only one instance.
//
// Progress and failure are logged through logger; the waiter's error is returned unchanged.
func waitForAnyInstancesRegisteredToCLB(logger *logrus.Entry, elbSvc *elb.ELB, lbName string, instanceIds []string) error {
	// The classic ELB API takes instances as structs rather than bare IDs.
	targets := make([]*elb.Instance, 0, len(instanceIds))
	for i := range instanceIds {
		targets = append(targets, &elb.Instance{InstanceId: aws.String(instanceIds[i])})
	}

	logger.Infof("Waiting for at least one instance to be in service for elb %s", lbName)
	waitErr := elbSvc.WaitUntilAnyInstanceInService(&elb.DescribeInstanceHealthInput{
		LoadBalancerName: aws.String(lbName),
		Instances:        targets,
	})
	if waitErr == nil {
		logger.Infof("At least one instance in service for elb %s", lbName)
		return nil
	}

	logger.Errorf("error waiting for any instance to be in service for elb %s", lbName)
	return waitErr
}

// getELBTargetGroup resolves the load balancer named lbName through the ELBv2 API and returns the single target group
// attached to it. It assumes:
// - lbName refers to a v2 ELB (ALB or NLB)
// - There is exactly one TargetGroup associated with the ELB (this is enforced by the Kubernetes controllers)
//
// NOTE(review): the single target group assumption comes from the original author; confirm that it also holds for
// load balancers fronting Services that expose more than one port.
//
// A CouldNotFindLoadBalancerErr is returned when no load balancer matches the name, and an ImpossibleErr when the API
// response violates one of the assumptions above. All returned errors carry a stack trace.
func getELBTargetGroup(elbv2Svc *elbv2.ELBV2, lbName string) (*elbv2.TargetGroup, error) {
	lbInput := &elbv2.DescribeLoadBalancersInput{Names: aws.StringSlice([]string{lbName})}
	lbResp, err := elbv2Svc.DescribeLoadBalancers(lbInput)
	if err != nil {
		return nil, errors.WithStackTrace(err)
	}

	matches := lbResp.LoadBalancers
	switch {
	case len(matches) == 0:
		return nil, errors.WithStackTrace(CouldNotFindLoadBalancerErr{name: lbName})
	case len(matches) > 1:
		// This condition is impossible because we are querying a single LB name and names are unique within regions.
		return nil, errors.WithStackTrace(commonerrors.ImpossibleErr("MORE_THAN_ONE_ELB_IN_LOOKUP"))
	case matches[0] == nil:
		return nil, errors.WithStackTrace(commonerrors.ImpossibleErr("ELB_IS_NULL_FROM_API"))
	}
	lb := matches[0]

	tgInput := &elbv2.DescribeTargetGroupsInput{LoadBalancerArn: lb.LoadBalancerArn}
	tgResp, err := elbv2Svc.DescribeTargetGroups(tgInput)
	if err != nil {
		return nil, errors.WithStackTrace(err)
	}

	if groups := tgResp.TargetGroups; len(groups) == 1 {
		return groups[0], nil
	}
	// This is treated as an impossible condition because the load balancer controllers are expected to create exactly
	// one target group for the ELBs they provision.
	return nil, errors.WithStackTrace(commonerrors.ImpossibleErr("ELB_HAS_UNEXPECTED_NUMBER_OF_TARGET_GROUPS"))
}
12 changes: 12 additions & 0 deletions eks/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,15 @@ func (err NetworkInterfaceDeletedTimeoutError) Error() string {
err.networkInterfaceId,
)
}

// CouldNotFindLoadBalancerErr is returned when a lookup for the ELB with the recorded name yields no results.
type CouldNotFindLoadBalancerErr struct {
	// name is the load balancer name that was searched for.
	name string
}

// Error implements the error interface, reporting the name of the ELB that could not be found.
func (e CouldNotFindLoadBalancerErr) Error() string {
	const format = "Could not find ELB with name %s."
	return fmt.Sprintf(format, e.name)
}
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ require (
github.com/blang/semver/v4 v4.0.0
github.com/gruntwork-io/go-commons v0.8.2
github.com/gruntwork-io/terratest v0.32.9
github.com/hashicorp/go-multierror v1.1.0
github.com/mitchellh/go-homedir v1.1.0
github.com/sirupsen/logrus v1.6.0
github.com/stretchr/testify v1.6.1
Expand Down
68 changes: 14 additions & 54 deletions kubectl/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,66 +55,12 @@ type NodeDrainError struct {
NodeID string
}

// NodeDrainErrors is returned when there are errors draining nodes concurrently. Each node that has an error is added
// to the list.
//
// The type carries no synchronization of its own, so callers that collect errors from multiple goroutines must
// serialize calls to AddError themselves.
type NodeDrainErrors struct {
	errors []NodeDrainError
}

// Error implements the error interface, rendering a header followed by one "Node <id>: <error>" line per failure.
// It keeps a value receiver so that both NodeDrainErrors and *NodeDrainErrors satisfy the error interface.
func (errs NodeDrainErrors) Error() string {
	base := "Multiple errors caught while draining a node:\n"
	for _, subErr := range errs.errors {
		base += fmt.Sprintf("Node %s: %s\n", subErr.NodeID, subErr.Error)
	}
	return base
}

// AddError records newErr in the collection.
//
// This method must use a pointer receiver: with a value receiver the append happens on a copy of the struct, the
// added error is silently dropped, and IsEmpty keeps reporting true even after failures were recorded.
func (errs *NodeDrainErrors) AddError(newErr NodeDrainError) {
	errs.errors = append(errs.errors, newErr)
}

// IsEmpty reports whether no errors have been recorded.
func (errs NodeDrainErrors) IsEmpty() bool {
	return len(errs.errors) == 0
}

// NewNodeDrainErrors returns an empty NodeDrainErrors ready to collect errors with AddError.
func NewNodeDrainErrors() NodeDrainErrors {
	return NodeDrainErrors{[]NodeDrainError{}}
}

// NodeCordonError is returned when there is an error cordoning a node.
type NodeCordonError struct {
	// Error is the underlying failure encountered while cordoning the node.
	Error error
	// NodeID identifies the node that failed to cordon.
	NodeID string
}

// NodeCordonErrors is returned when there are errors cordoning nodes concurrently. Each node that has an error is added
// to the list.
//
// The type carries no synchronization of its own, so callers that collect errors from multiple goroutines must
// serialize calls to AddError themselves.
type NodeCordonErrors struct {
	errors []NodeCordonError
}

// Error implements the error interface, rendering a header followed by one "Node <id>: <error>" line per failure.
// It keeps a value receiver so that both NodeCordonErrors and *NodeCordonErrors satisfy the error interface.
func (errs NodeCordonErrors) Error() string {
	base := "Multiple errors caught while cordoning nodes:\n"
	for _, subErr := range errs.errors {
		base += fmt.Sprintf("Node %s: %s\n", subErr.NodeID, subErr.Error)
	}
	return base
}

// AddError records newErr in the collection.
//
// This method must use a pointer receiver: with a value receiver the append happens on a copy of the struct, the
// added error is silently dropped, and IsEmpty keeps reporting true even after failures were recorded.
func (errs *NodeCordonErrors) AddError(newErr NodeCordonError) {
	errs.errors = append(errs.errors, newErr)
}

// IsEmpty reports whether no errors have been recorded.
func (errs NodeCordonErrors) IsEmpty() bool {
	return len(errs.errors) == 0
}

// NewNodeCordonErrors returns an empty NodeCordonErrors ready to collect errors with AddError.
func NewNodeCordonErrors() NodeCordonErrors {
	return NodeCordonErrors{[]NodeCordonError{}}
}

// LoadBalancerNotReadyError is returned when the LoadBalancer Service is unexpectedly not ready.
type LoadBalancerNotReadyError struct {
serviceName string
Expand Down Expand Up @@ -154,3 +100,17 @@ func (err ProvisionIngressEndpointTimeoutError) Error() string {
err.namespace,
)
}

// UnknownAWSLoadBalancerTypeErr is returned when we encounter a load balancer type that we don't expect/support.
type UnknownAWSLoadBalancerTypeErr struct {
	// typeKey is reported in the message as the annotation whose value was not recognized.
	typeKey string
	// typeStr is the unrecognized value itself.
	typeStr string
}

// Error implements the error interface, naming the annotation and the unrecognized value.
func (e UnknownAWSLoadBalancerTypeErr) Error() string {
	const format = "Unknown value for annotation %s (value: %s)"
	return fmt.Sprintf(format, e.typeKey, e.typeStr)
}
Loading

0 comments on commit dc17187

Please sign in to comment.