Skip to content

Commit

Permalink
Ignore ActiveInstanceRefreshNotFound on cancel (for idempotence), onl…
Browse files Browse the repository at this point in the history
…y cancel if unfinished *and* not in Cancelling status
  • Loading branch information
AndiDog committed Jul 1, 2024
1 parent 56599b4 commit 05bb37c
Show file tree
Hide file tree
Showing 9 changed files with 64 additions and 41 deletions.
6 changes: 3 additions & 3 deletions exp/controllers/awsmachinepool_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -261,14 +261,14 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP
return ctrl.Result{}, err
}

canUpdateLaunchTemplate := func() (bool, error) {
canStartInstanceRefresh := func() (bool, *string, error) {
// If there is a change: before changing the template, check if there exist an ongoing instance refresh,
// because only 1 instance refresh can be "InProgress". If template is updated when refresh cannot be started,
// that change will not trigger a refresh. Do not start an instance refresh if only userdata changed.
if asg == nil {
// If the ASG hasn't been created yet, there is no need to check if we can start the instance refresh.
// But we want to update the LaunchTemplate because an error in the LaunchTemplate may be blocking the ASG creation.
return true, nil
return true, nil, nil
}
return asgsvc.CanStartASGInstanceRefresh(machinePoolScope)
}
Expand Down Expand Up @@ -299,7 +299,7 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP
machinePoolScope.Info("starting instance refresh", "number of instances", machinePoolScope.MachinePool.Spec.Replicas)
return asgsvc.StartASGInstanceRefresh(machinePoolScope)
}
res, err := reconSvc.ReconcileLaunchTemplate(machinePoolScope, machinePoolScope, s3Scope, ec2Svc, objectStoreSvc, canUpdateLaunchTemplate, cancelInstanceRefresh, runPostLaunchTemplateUpdateOperation)
res, err := reconSvc.ReconcileLaunchTemplate(machinePoolScope, machinePoolScope, s3Scope, ec2Svc, objectStoreSvc, canStartInstanceRefresh, cancelInstanceRefresh, runPostLaunchTemplateUpdateOperation)
if err != nil {
r.Recorder.Eventf(machinePoolScope.AWSMachinePool, corev1.EventTypeWarning, "FailedLaunchTemplateReconcile", "Failed to reconcile launch template: %v", err)
machinePoolScope.Error(err, "failed to reconcile launch template")
Expand Down
6 changes: 3 additions & 3 deletions exp/controllers/awsmachinepool_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -555,7 +555,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) {
nil)
ec2Svc.EXPECT().DiscoverLaunchTemplateAMI(gomock.Any()).Return(pointer.String("ami-different"), nil)
ec2Svc.EXPECT().LaunchTemplateNeedsUpdate(gomock.Any(), gomock.Any(), gomock.Any()).Return(false, nil)
asgSvc.EXPECT().CanStartASGInstanceRefresh(gomock.Any()).Return(true, nil)
asgSvc.EXPECT().CanStartASGInstanceRefresh(gomock.Any()).Return(true, nil, nil)
ec2Svc.EXPECT().PruneLaunchTemplateVersions(gomock.Any()).Return(nil)
ec2Svc.EXPECT().CreateLaunchTemplateVersion(gomock.Any(), gomock.Any(), gomock.Eq(pointer.String("ami-different")), gomock.Eq(apimachinerytypes.NamespacedName{Namespace: "default", Name: "bootstrap-data"}), gomock.Any()).Return(nil)
ec2Svc.EXPECT().GetLaunchTemplateLatestVersion(gomock.Any()).Return("2", nil)
Expand Down Expand Up @@ -603,7 +603,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) {
nil)
ec2Svc.EXPECT().DiscoverLaunchTemplateAMI(gomock.Any()).Return(pointer.String("ami-existing"), nil)
ec2Svc.EXPECT().LaunchTemplateNeedsUpdate(gomock.Any(), gomock.Any(), gomock.Any()).Return(false, nil)
asgSvc.EXPECT().CanStartASGInstanceRefresh(gomock.Any()).Return(true, nil)
asgSvc.EXPECT().CanStartASGInstanceRefresh(gomock.Any()).Return(true, nil, nil)
ec2Svc.EXPECT().PruneLaunchTemplateVersions(gomock.Any()).Return(nil)
ec2Svc.EXPECT().CreateLaunchTemplateVersion(gomock.Any(), gomock.Any(), gomock.Eq(pointer.String("ami-existing")), gomock.Eq(apimachinerytypes.NamespacedName{Namespace: "default", Name: "bootstrap-data"}), gomock.Any()).Return(nil)
ec2Svc.EXPECT().GetLaunchTemplateLatestVersion(gomock.Any()).Return("2", nil)
Expand Down Expand Up @@ -678,7 +678,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) {
nil)
ec2Svc.EXPECT().DiscoverLaunchTemplateAMI(gomock.Any()).Return(pointer.String("ami-existing"), nil)
ec2Svc.EXPECT().LaunchTemplateNeedsUpdate(gomock.Any(), gomock.Any(), gomock.Any()).Return(false, nil)
asgSvc.EXPECT().CanStartASGInstanceRefresh(gomock.Any()).Return(true, nil)
asgSvc.EXPECT().CanStartASGInstanceRefresh(gomock.Any()).Return(true, nil, nil)
ec2Svc.EXPECT().PruneLaunchTemplateVersions(gomock.Any()).Return(nil)
ec2Svc.EXPECT().CreateLaunchTemplateVersion(gomock.Any(), gomock.Any(), gomock.Eq(pointer.String("ami-existing")), gomock.Eq(apimachinerytypes.NamespacedName{Namespace: "default", Name: "bootstrap-data-new"}), gomock.Any()).Return(nil)
ec2Svc.EXPECT().GetLaunchTemplateLatestVersion(gomock.Any()).Return("2", nil)
Expand Down
6 changes: 3 additions & 3 deletions exp/controllers/awsmanagedmachinepool_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,8 @@ func (r *AWSManagedMachinePoolReconciler) reconcileNormal(
reconSvc := r.getReconcileService(ec2Scope)

if machinePoolScope.ManagedMachinePool.Spec.AWSLaunchTemplate != nil {
canUpdateLaunchTemplate := func() (bool, error) {
return true, nil
canStartInstanceRefresh := func() (bool, *string, error) {
return true, nil, nil
}
cancelInstanceRefresh := func() error {
return nil
Expand All @@ -222,7 +222,7 @@ func (r *AWSManagedMachinePoolReconciler) reconcileNormal(
return nil
}
var objectStoreSvc services.ObjectStoreInterface = nil // no S3 bucket support for `AWSManagedControlPlane` yet
res, err := reconSvc.ReconcileLaunchTemplate(machinePoolScope, machinePoolScope, s3Scope, ec2svc, objectStoreSvc, canUpdateLaunchTemplate, cancelInstanceRefresh, runPostLaunchTemplateUpdateOperation)
res, err := reconSvc.ReconcileLaunchTemplate(machinePoolScope, machinePoolScope, s3Scope, ec2svc, objectStoreSvc, canStartInstanceRefresh, cancelInstanceRefresh, runPostLaunchTemplateUpdateOperation)
if err != nil {
r.Recorder.Eventf(machinePoolScope.ManagedMachinePool, corev1.EventTypeWarning, "FailedLaunchTemplateReconcile", "Failed to reconcile launch template: %v", err)
machinePoolScope.Error(err, "failed to reconcile launch template")
Expand Down
23 changes: 18 additions & 5 deletions pkg/cloud/services/autoscaling/autoscalinggroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"strings"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/awserr"
"github.com/aws/aws-sdk-go/service/autoscaling"
"github.com/aws/aws-sdk-go/service/ec2"
"github.com/pkg/errors"
Expand Down Expand Up @@ -321,27 +322,29 @@ func (s *Service) UpdateASG(machinePoolScope *scope.MachinePoolScope) error {
return nil
}

// CanStartASGInstanceRefresh will start an ASG instance with refresh.
func (s *Service) CanStartASGInstanceRefresh(scope *scope.MachinePoolScope) (bool, error) {
// CanStartASGInstanceRefresh checks if a new ASG instance refresh can currently be started, and returns the status if there is an existing, unfinished refresh.
func (s *Service) CanStartASGInstanceRefresh(scope *scope.MachinePoolScope) (bool, *string, error) {
describeInput := &autoscaling.DescribeInstanceRefreshesInput{AutoScalingGroupName: aws.String(scope.Name())}
refreshes, err := s.ASGClient.DescribeInstanceRefreshesWithContext(context.TODO(), describeInput)
if err != nil {
return false, err
return false, nil, err
}
hasUnfinishedRefresh := false
var unfinishedRefreshStatus *string
if err == nil && len(refreshes.InstanceRefreshes) != 0 {
for i := range refreshes.InstanceRefreshes {
if *refreshes.InstanceRefreshes[i].Status == autoscaling.InstanceRefreshStatusInProgress ||
*refreshes.InstanceRefreshes[i].Status == autoscaling.InstanceRefreshStatusPending ||
*refreshes.InstanceRefreshes[i].Status == autoscaling.InstanceRefreshStatusCancelling {
hasUnfinishedRefresh = true
unfinishedRefreshStatus = refreshes.InstanceRefreshes[i].Status
}
}
}
if hasUnfinishedRefresh {
return false, nil
return false, unfinishedRefreshStatus, nil
}
return true, nil
return true, nil, nil
}

// CancelASGInstanceRefresh cancels an ASG instance refresh.
Expand All @@ -351,6 +354,16 @@ func (s *Service) CancelASGInstanceRefresh(scope *scope.MachinePoolScope) error
}

if _, err := s.ASGClient.CancelInstanceRefreshWithContext(context.TODO(), input); err != nil {
var aerr awserr.Error
if errors.As(err, &aerr) {
if aerr.Code() == autoscaling.ErrCodeActiveInstanceRefreshNotFoundFault {
// Refresh isn't "in progress". It may have turned to cancelled status
// by now. So this is not an error for us because we may have called
// CancelInstanceRefresh multiple times and should be idempotent here.
return nil
}
}

return errors.Wrapf(err, "failed to cancel ASG instance refresh %q", scope.Name())
}

Expand Down
20 changes: 12 additions & 8 deletions pkg/cloud/services/autoscaling/autoscalinggroup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1111,10 +1111,11 @@ func TestServiceCanStartASGInstanceRefresh(t *testing.T) {
defer mockCtrl.Finish()

tests := []struct {
name string
wantErr bool
canStart bool
expect func(m *mock_autoscalingiface.MockAutoScalingAPIMockRecorder)
name string
wantErr bool
wantUnfinishedRefreshStatus *string
canStart bool
expect func(m *mock_autoscalingiface.MockAutoScalingAPIMockRecorder)
}{
{
name: "should return error if describe instance refresh failed",
Expand All @@ -1139,9 +1140,10 @@ func TestServiceCanStartASGInstanceRefresh(t *testing.T) {
},
},
{
name: "should return false if some instances have unfinished refresh",
wantErr: false,
canStart: false,
name: "should return false if some instances have unfinished refresh",
wantErr: false,
wantUnfinishedRefreshStatus: aws.String(autoscaling.InstanceRefreshStatusInProgress),
canStart: false,
expect: func(m *mock_autoscalingiface.MockAutoScalingAPIMockRecorder) {
m.DescribeInstanceRefreshesWithContext(context.TODO(), gomock.Eq(&autoscaling.DescribeInstanceRefreshesInput{
AutoScalingGroupName: aws.String("machinePoolName"),
Expand Down Expand Up @@ -1173,11 +1175,13 @@ func TestServiceCanStartASGInstanceRefresh(t *testing.T) {
g.Expect(err).ToNot(HaveOccurred())
mps.AWSMachinePool.Name = "machinePoolName"

out, err := s.CanStartASGInstanceRefresh(mps)
out, unfinishedRefreshStatus, err := s.CanStartASGInstanceRefresh(mps)
checkErr(tt.wantErr, err, g)
if tt.canStart {
g.Expect(out).To(BeTrue())
return
} else {
g.Expect(unfinishedRefreshStatus).To(Equal(tt.wantUnfinishedRefreshStatus))
}
g.Expect(out).To(BeFalse())
})
Expand Down
31 changes: 18 additions & 13 deletions pkg/cloud/services/ec2/launchtemplate.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"time"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/autoscaling"
"github.com/aws/aws-sdk-go/service/ec2"
"github.com/blang/semver"
ignTypes "github.com/coreos/ignition/config/v2_3/types"
Expand Down Expand Up @@ -69,7 +70,7 @@ func (s *Service) ReconcileLaunchTemplate(
s3Scope scope.S3Scope,
ec2svc services.EC2Interface,
objectStoreSvc services.ObjectStoreInterface,
canUpdateLaunchTemplate func() (bool, error),
canStartInstanceRefresh func() (bool, *string, error),
cancelInstanceRefresh func() error,
runPostLaunchTemplateUpdateOperation func() error,
) (*ctrl.Result, error) {
Expand Down Expand Up @@ -230,22 +231,26 @@ func (s *Service) ReconcileLaunchTemplate(
if needsUpdate || tagsChanged || amiChanged || userDataSecretKeyChanged {
// More than just the bootstrap token changed

canUpdate, err := canUpdateLaunchTemplate()
canStartRefresh, unfinishedRefreshStatus, err := canStartInstanceRefresh()
if err != nil {
return nil, err
}
if !canUpdate {
err := cancelInstanceRefresh()
if err != nil {
return nil, err
if !canStartRefresh {
if unfinishedRefreshStatus != nil && *unfinishedRefreshStatus != autoscaling.InstanceRefreshStatusCancelling {
// Until the previous instance refresh goes into `Cancelled` state
// asynchronously, allowing another refresh to be started,
// defer the reconciliation. Otherwise, we get an
// `ErrCodeInstanceRefreshInProgressFault` error if we tried to
// start an instance refresh immediately.
scope.Info("Cancelling previous instance refresh and delaying reconciliation until the next one can be started", "unfinishedRefreshStatus", unfinishedRefreshStatus)

err := cancelInstanceRefresh()
if err != nil {
return nil, err
}
} else {
scope.Info("Existing instance refresh is not finished, delaying reconciliation until the next one can be started", "unfinishedRefreshStatus", unfinishedRefreshStatus)
}

// Until the previous instance refresh goes into `Cancelled` state
// asynchronously, allowing another refresh to be started,
// defer the reconciliation. Otherwise, we get an
// `ErrCodeInstanceRefreshInProgressFault` error if we tried to
// start an instance refresh immediately.
scope.Info("Cancelled previous instance refresh, delaying reconciliation until the next one can be started")
return &ctrl.Result{RequeueAfter: 30 * time.Second}, nil
}
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/cloud/services/interfaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ type ASGInterface interface {
UpdateASG(scope *scope.MachinePoolScope) error
CancelASGInstanceRefresh(scope *scope.MachinePoolScope) error
StartASGInstanceRefresh(scope *scope.MachinePoolScope) error
CanStartASGInstanceRefresh(scope *scope.MachinePoolScope) (bool, error)
CanStartASGInstanceRefresh(scope *scope.MachinePoolScope) (bool, *string, error)
UpdateResourceTags(resourceID *string, create, remove map[string]string) error
DeleteASGAndWait(id string) error
SuspendProcesses(name string, processes []string) error
Expand Down Expand Up @@ -86,7 +86,7 @@ type EC2Interface interface {
// separate from EC2Interface so that we can mock AWS requests separately. For example, by not mocking the
// ReconcileLaunchTemplate function, but mocking EC2Interface, we can test which EC2 API operations would have been called.
type MachinePoolReconcileInterface interface {
ReconcileLaunchTemplate(ignitionScope scope.IgnitionScope, scope scope.LaunchTemplateScope, s3Scope scope.S3Scope, ec2svc EC2Interface, objectStoreSvc ObjectStoreInterface, canUpdateLaunchTemplate func() (bool, error), cancelInstanceRefresh func() error, runPostLaunchTemplateUpdateOperation func() error) (*ctrl.Result, error)
ReconcileLaunchTemplate(ignitionScope scope.IgnitionScope, scope scope.LaunchTemplateScope, s3Scope scope.S3Scope, ec2svc EC2Interface, objectStoreSvc ObjectStoreInterface, canUpdateLaunchTemplate func() (bool, *string, error), cancelInstanceRefresh func() error, runPostLaunchTemplateUpdateOperation func() error) (*ctrl.Result, error)
ReconcileTags(scope scope.LaunchTemplateScope, resourceServicesToUpdate []scope.ResourceServiceToUpdate) error
}

Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 05bb37c

Please sign in to comment.