diff --git a/exp/controllers/awsmachinepool_controller.go b/exp/controllers/awsmachinepool_controller.go index f85d6a37c3..e00226efe8 100644 --- a/exp/controllers/awsmachinepool_controller.go +++ b/exp/controllers/awsmachinepool_controller.go @@ -261,14 +261,14 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP return ctrl.Result{}, err } - canUpdateLaunchTemplate := func() (bool, error) { + canStartInstanceRefresh := func() (bool, *string, error) { // If there is a change: before changing the template, check if there exist an ongoing instance refresh, // because only 1 instance refresh can be "InProgress". If template is updated when refresh cannot be started, // that change will not trigger a refresh. Do not start an instance refresh if only userdata changed. if asg == nil { // If the ASG hasn't been created yet, there is no need to check if we can start the instance refresh. // But we want to update the LaunchTemplate because an error in the LaunchTemplate may be blocking the ASG creation. - return true, nil + return true, nil, nil } return asgsvc.CanStartASGInstanceRefresh(machinePoolScope) } @@ -299,7 +299,7 @@ func (r *AWSMachinePoolReconciler) reconcileNormal(ctx context.Context, machineP machinePoolScope.Info("starting instance refresh", "number of instances", machinePoolScope.MachinePool.Spec.Replicas) return asgsvc.StartASGInstanceRefresh(machinePoolScope) } - res, err := reconSvc.ReconcileLaunchTemplate(machinePoolScope, machinePoolScope, s3Scope, ec2Svc, objectStoreSvc, canUpdateLaunchTemplate, cancelInstanceRefresh, runPostLaunchTemplateUpdateOperation) + res, err := reconSvc.ReconcileLaunchTemplate(machinePoolScope, machinePoolScope, s3Scope, ec2Svc, objectStoreSvc, canStartInstanceRefresh, cancelInstanceRefresh, runPostLaunchTemplateUpdateOperation) if err != nil { r.Recorder.Eventf(machinePoolScope.AWSMachinePool, corev1.EventTypeWarning, "FailedLaunchTemplateReconcile", "Failed to reconcile launch template: %v", err) machinePoolScope.Error(err, "failed to reconcile launch template") diff --git a/exp/controllers/awsmachinepool_controller_test.go b/exp/controllers/awsmachinepool_controller_test.go index abdd0a5584..55b15dd9ab 100644 --- a/exp/controllers/awsmachinepool_controller_test.go +++ b/exp/controllers/awsmachinepool_controller_test.go @@ -555,7 +555,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { nil) ec2Svc.EXPECT().DiscoverLaunchTemplateAMI(gomock.Any()).Return(pointer.String("ami-different"), nil) ec2Svc.EXPECT().LaunchTemplateNeedsUpdate(gomock.Any(), gomock.Any(), gomock.Any()).Return(false, nil) - asgSvc.EXPECT().CanStartASGInstanceRefresh(gomock.Any()).Return(true, nil) + asgSvc.EXPECT().CanStartASGInstanceRefresh(gomock.Any()).Return(true, nil, nil) ec2Svc.EXPECT().PruneLaunchTemplateVersions(gomock.Any()).Return(nil) ec2Svc.EXPECT().CreateLaunchTemplateVersion(gomock.Any(), gomock.Any(), gomock.Eq(pointer.String("ami-different")), gomock.Eq(apimachinerytypes.NamespacedName{Namespace: "default", Name: "bootstrap-data"}), gomock.Any()).Return(nil) ec2Svc.EXPECT().GetLaunchTemplateLatestVersion(gomock.Any()).Return("2", nil) @@ -603,7 +603,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { nil) ec2Svc.EXPECT().DiscoverLaunchTemplateAMI(gomock.Any()).Return(pointer.String("ami-existing"), nil) ec2Svc.EXPECT().LaunchTemplateNeedsUpdate(gomock.Any(), gomock.Any(), gomock.Any()).Return(false, nil) - asgSvc.EXPECT().CanStartASGInstanceRefresh(gomock.Any()).Return(true, nil) + asgSvc.EXPECT().CanStartASGInstanceRefresh(gomock.Any()).Return(true, nil, nil) ec2Svc.EXPECT().PruneLaunchTemplateVersions(gomock.Any()).Return(nil) ec2Svc.EXPECT().CreateLaunchTemplateVersion(gomock.Any(), gomock.Any(), gomock.Eq(pointer.String("ami-existing")), gomock.Eq(apimachinerytypes.NamespacedName{Namespace: "default", Name: "bootstrap-data"}), gomock.Any()).Return(nil) ec2Svc.EXPECT().GetLaunchTemplateLatestVersion(gomock.Any()).Return("2", nil) @@ -678,7 +678,7 @@ func TestAWSMachinePoolReconciler(t *testing.T) { nil) ec2Svc.EXPECT().DiscoverLaunchTemplateAMI(gomock.Any()).Return(pointer.String("ami-existing"), nil) ec2Svc.EXPECT().LaunchTemplateNeedsUpdate(gomock.Any(), gomock.Any(), gomock.Any()).Return(false, nil) - asgSvc.EXPECT().CanStartASGInstanceRefresh(gomock.Any()).Return(true, nil) + asgSvc.EXPECT().CanStartASGInstanceRefresh(gomock.Any()).Return(true, nil, nil) ec2Svc.EXPECT().PruneLaunchTemplateVersions(gomock.Any()).Return(nil) ec2Svc.EXPECT().CreateLaunchTemplateVersion(gomock.Any(), gomock.Any(), gomock.Eq(pointer.String("ami-existing")), gomock.Eq(apimachinerytypes.NamespacedName{Namespace: "default", Name: "bootstrap-data-new"}), gomock.Any()).Return(nil) ec2Svc.EXPECT().GetLaunchTemplateLatestVersion(gomock.Any()).Return("2", nil) diff --git a/exp/controllers/awsmanagedmachinepool_controller.go b/exp/controllers/awsmanagedmachinepool_controller.go index e3264d22cf..8e6deb131a 100644 --- a/exp/controllers/awsmanagedmachinepool_controller.go +++ b/exp/controllers/awsmanagedmachinepool_controller.go @@ -212,8 +212,8 @@ func (r *AWSManagedMachinePoolReconciler) reconcileNormal( reconSvc := r.getReconcileService(ec2Scope) if machinePoolScope.ManagedMachinePool.Spec.AWSLaunchTemplate != nil { - canUpdateLaunchTemplate := func() (bool, error) { - return true, nil + canStartInstanceRefresh := func() (bool, *string, error) { + return true, nil, nil } cancelInstanceRefresh := func() error { return nil @@ -222,7 +222,7 @@ func (r *AWSManagedMachinePoolReconciler) reconcileNormal( return nil } var objectStoreSvc services.ObjectStoreInterface = nil // no S3 bucket support for `AWSManagedControlPlane` yet - res, err := reconSvc.ReconcileLaunchTemplate(machinePoolScope, machinePoolScope, s3Scope, ec2svc, objectStoreSvc, canUpdateLaunchTemplate, cancelInstanceRefresh, runPostLaunchTemplateUpdateOperation) + res, err := reconSvc.ReconcileLaunchTemplate(machinePoolScope, machinePoolScope, s3Scope, ec2svc, objectStoreSvc, canStartInstanceRefresh, cancelInstanceRefresh, runPostLaunchTemplateUpdateOperation) if err != nil { r.Recorder.Eventf(machinePoolScope.ManagedMachinePool, corev1.EventTypeWarning, "FailedLaunchTemplateReconcile", "Failed to reconcile launch template: %v", err) machinePoolScope.Error(err, "failed to reconcile launch template") diff --git a/pkg/cloud/services/autoscaling/autoscalinggroup.go b/pkg/cloud/services/autoscaling/autoscalinggroup.go index 33be8d972f..a0f30816a8 100644 --- a/pkg/cloud/services/autoscaling/autoscalinggroup.go +++ b/pkg/cloud/services/autoscaling/autoscalinggroup.go @@ -22,6 +22,7 @@ import ( "strings" "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/service/autoscaling" "github.com/aws/aws-sdk-go/service/ec2" "github.com/pkg/errors" @@ -321,27 +322,29 @@ func (s *Service) UpdateASG(machinePoolScope *scope.MachinePoolScope) error { return nil } -// CanStartASGInstanceRefresh will start an ASG instance with refresh. -func (s *Service) CanStartASGInstanceRefresh(scope *scope.MachinePoolScope) (bool, error) { +// CanStartASGInstanceRefresh checks if a new ASG instance refresh can currently be started, and returns the status if there is an existing, unfinished refresh. +func (s *Service) CanStartASGInstanceRefresh(scope *scope.MachinePoolScope) (bool, *string, error) { describeInput := &autoscaling.DescribeInstanceRefreshesInput{AutoScalingGroupName: aws.String(scope.Name())} refreshes, err := s.ASGClient.DescribeInstanceRefreshesWithContext(context.TODO(), describeInput) if err != nil { - return false, err + return false, nil, err } hasUnfinishedRefresh := false + var unfinishedRefreshStatus *string if err == nil && len(refreshes.InstanceRefreshes) != 0 { for i := range refreshes.InstanceRefreshes { if *refreshes.InstanceRefreshes[i].Status == autoscaling.InstanceRefreshStatusInProgress || *refreshes.InstanceRefreshes[i].Status == autoscaling.InstanceRefreshStatusPending || *refreshes.InstanceRefreshes[i].Status == autoscaling.InstanceRefreshStatusCancelling { hasUnfinishedRefresh = true + unfinishedRefreshStatus = refreshes.InstanceRefreshes[i].Status } } } if hasUnfinishedRefresh { - return false, nil + return false, unfinishedRefreshStatus, nil } - return true, nil + return true, nil, nil } // CancelASGInstanceRefresh cancels an ASG instance refresh. @@ -351,6 +354,16 @@ func (s *Service) CancelASGInstanceRefresh(scope *scope.MachinePoolScope) error } if _, err := s.ASGClient.CancelInstanceRefreshWithContext(context.TODO(), input); err != nil { + var aerr awserr.Error + if errors.As(err, &aerr) { + if aerr.Code() == autoscaling.ErrCodeActiveInstanceRefreshNotFoundFault { + // Refresh isn't "in progress". It may have turned to cancelled status + // by now. So this is not an error for us because we may have called + // CancelInstanceRefresh multiple times and should be idempotent here. + return nil + } + } + return errors.Wrapf(err, "failed to cancel ASG instance refresh %q", scope.Name()) } diff --git a/pkg/cloud/services/autoscaling/autoscalinggroup_test.go b/pkg/cloud/services/autoscaling/autoscalinggroup_test.go index 09bd2bc042..be3a6dce14 100644 --- a/pkg/cloud/services/autoscaling/autoscalinggroup_test.go +++ b/pkg/cloud/services/autoscaling/autoscalinggroup_test.go @@ -1111,10 +1111,11 @@ func TestServiceCanStartASGInstanceRefresh(t *testing.T) { defer mockCtrl.Finish() tests := []struct { - name string - wantErr bool - canStart bool - expect func(m *mock_autoscalingiface.MockAutoScalingAPIMockRecorder) + name string + wantErr bool + wantUnfinishedRefreshStatus *string + canStart bool + expect func(m *mock_autoscalingiface.MockAutoScalingAPIMockRecorder) }{ { name: "should return error if describe instance refresh failed", @@ -1139,9 +1140,10 @@ func TestServiceCanStartASGInstanceRefresh(t *testing.T) { }, }, { - name: "should return false if some instances have unfinished refresh", - wantErr: false, - canStart: false, + name: "should return false if some instances have unfinished refresh", + wantErr: false, + wantUnfinishedRefreshStatus: aws.String(autoscaling.InstanceRefreshStatusInProgress), + canStart: false, expect: func(m *mock_autoscalingiface.MockAutoScalingAPIMockRecorder) { m.DescribeInstanceRefreshesWithContext(context.TODO(), gomock.Eq(&autoscaling.DescribeInstanceRefreshesInput{ AutoScalingGroupName: aws.String("machinePoolName"), @@ -1173,11 +1175,13 @@ func TestServiceCanStartASGInstanceRefresh(t *testing.T) { g.Expect(err).ToNot(HaveOccurred()) mps.AWSMachinePool.Name = "machinePoolName" - out, err := s.CanStartASGInstanceRefresh(mps) + out, unfinishedRefreshStatus, err := s.CanStartASGInstanceRefresh(mps) checkErr(tt.wantErr, err, g) if tt.canStart { g.Expect(out).To(BeTrue()) return + } else { + g.Expect(unfinishedRefreshStatus).To(Equal(tt.wantUnfinishedRefreshStatus)) } g.Expect(out).To(BeFalse()) }) diff --git a/pkg/cloud/services/ec2/launchtemplate.go b/pkg/cloud/services/ec2/launchtemplate.go index 5528a48fcb..be382e055f 100644 --- a/pkg/cloud/services/ec2/launchtemplate.go +++ b/pkg/cloud/services/ec2/launchtemplate.go @@ -27,6 +27,7 @@ import ( "time" "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/service/autoscaling" "github.com/aws/aws-sdk-go/service/ec2" "github.com/blang/semver" ignTypes "github.com/coreos/ignition/config/v2_3/types" @@ -69,7 +70,7 @@ func (s *Service) ReconcileLaunchTemplate( s3Scope scope.S3Scope, ec2svc services.EC2Interface, objectStoreSvc services.ObjectStoreInterface, - canUpdateLaunchTemplate func() (bool, error), + canStartInstanceRefresh func() (bool, *string, error), cancelInstanceRefresh func() error, runPostLaunchTemplateUpdateOperation func() error, ) (*ctrl.Result, error) { @@ -230,22 +231,26 @@ func (s *Service) ReconcileLaunchTemplate( if needsUpdate || tagsChanged || amiChanged || userDataSecretKeyChanged { // More than just the bootstrap token changed - canUpdate, err := canUpdateLaunchTemplate() + canStartRefresh, unfinishedRefreshStatus, err := canStartInstanceRefresh() if err != nil { return nil, err } - if !canUpdate { - err := cancelInstanceRefresh() - if err != nil { - return nil, err + if !canStartRefresh { + if unfinishedRefreshStatus != nil && *unfinishedRefreshStatus != autoscaling.InstanceRefreshStatusCancelling { + // Until the previous instance refresh goes into `Cancelled` state + // asynchronously, allowing another refresh to be started, + // defer the reconciliation. Otherwise, we get an + // `ErrCodeInstanceRefreshInProgressFault` error if we tried to + // start an instance refresh immediately. + scope.Info("Cancelling previous instance refresh and delaying reconciliation until the next one can be started", "unfinishedRefreshStatus", unfinishedRefreshStatus) + + err := cancelInstanceRefresh() + if err != nil { + return nil, err + } + } else { + scope.Info("Existing instance refresh is not finished, delaying reconciliation until the next one can be started", "unfinishedRefreshStatus", unfinishedRefreshStatus) } - - // Until the previous instance refresh goes into `Cancelled` state - // asynchronously, allowing another refresh to be started, - // defer the reconciliation. Otherwise, we get an - // `ErrCodeInstanceRefreshInProgressFault` error if we tried to - // start an instance refresh immediately. - scope.Info("Cancelled previous instance refresh, delaying reconciliation until the next one can be started") return &ctrl.Result{RequeueAfter: 30 * time.Second}, nil } } diff --git a/pkg/cloud/services/interfaces.go b/pkg/cloud/services/interfaces.go index dbcf90beaa..54a576b9d4 100644 --- a/pkg/cloud/services/interfaces.go +++ b/pkg/cloud/services/interfaces.go @@ -43,7 +43,7 @@ type ASGInterface interface { UpdateASG(scope *scope.MachinePoolScope) error CancelASGInstanceRefresh(scope *scope.MachinePoolScope) error StartASGInstanceRefresh(scope *scope.MachinePoolScope) error - CanStartASGInstanceRefresh(scope *scope.MachinePoolScope) (bool, error) + CanStartASGInstanceRefresh(scope *scope.MachinePoolScope) (bool, *string, error) UpdateResourceTags(resourceID *string, create, remove map[string]string) error DeleteASGAndWait(id string) error SuspendProcesses(name string, processes []string) error @@ -86,7 +86,7 @@ type EC2Interface interface { // separate from EC2Interface so that we can mock AWS requests separately. For example, by not mocking the // ReconcileLaunchTemplate function, but mocking EC2Interface, we can test which EC2 API operations would have been called. type MachinePoolReconcileInterface interface { - ReconcileLaunchTemplate(ignitionScope scope.IgnitionScope, scope scope.LaunchTemplateScope, s3Scope scope.S3Scope, ec2svc EC2Interface, objectStoreSvc ObjectStoreInterface, canUpdateLaunchTemplate func() (bool, error), cancelInstanceRefresh func() error, runPostLaunchTemplateUpdateOperation func() error) (*ctrl.Result, error) + ReconcileLaunchTemplate(ignitionScope scope.IgnitionScope, scope scope.LaunchTemplateScope, s3Scope scope.S3Scope, ec2svc EC2Interface, objectStoreSvc ObjectStoreInterface, canUpdateLaunchTemplate func() (bool, *string, error), cancelInstanceRefresh func() error, runPostLaunchTemplateUpdateOperation func() error) (*ctrl.Result, error) ReconcileTags(scope scope.LaunchTemplateScope, resourceServicesToUpdate []scope.ResourceServiceToUpdate) error } diff --git a/pkg/cloud/services/mock_services/autoscaling_interface_mock.go b/pkg/cloud/services/mock_services/autoscaling_interface_mock.go index aea51ea912..2448d3dcb7 100644 --- a/pkg/cloud/services/mock_services/autoscaling_interface_mock.go +++ b/pkg/cloud/services/mock_services/autoscaling_interface_mock.go @@ -67,12 +67,13 @@ func (mr *MockASGInterfaceMockRecorder) ASGIfExists(arg0 interface{}) *gomock.Ca } // CanStartASGInstanceRefresh mocks base method. -func (m *MockASGInterface) CanStartASGInstanceRefresh(arg0 *scope.MachinePoolScope) (bool, error) { +func (m *MockASGInterface) CanStartASGInstanceRefresh(arg0 *scope.MachinePoolScope) (bool, *string, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "CanStartASGInstanceRefresh", arg0) ret0, _ := ret[0].(bool) - ret1, _ := ret[1].(error) - return ret0, ret1 + ret1, _ := ret[1].(*string) + ret2, _ := ret[2].(error) + return ret0, ret1, ret2 } // CanStartASGInstanceRefresh indicates an expected call of CanStartASGInstanceRefresh. diff --git a/pkg/cloud/services/mock_services/reconcile_interface_mock.go b/pkg/cloud/services/mock_services/reconcile_interface_mock.go index 87b96e0cb8..cc2c41e082 100644 --- a/pkg/cloud/services/mock_services/reconcile_interface_mock.go +++ b/pkg/cloud/services/mock_services/reconcile_interface_mock.go @@ -53,7 +53,7 @@ func (m *MockMachinePoolReconcileInterface) EXPECT() *MockMachinePoolReconcileIn } // ReconcileLaunchTemplate mocks base method. -func (m *MockMachinePoolReconcileInterface) ReconcileLaunchTemplate(arg0 scope.IgnitionScope, arg1 scope.LaunchTemplateScope, arg2 scope.S3Scope, arg3 services.EC2Interface, arg4 services.ObjectStoreInterface, arg5 func() (bool, error), arg6, arg7 func() error) (*reconcile.Result, error) { +func (m *MockMachinePoolReconcileInterface) ReconcileLaunchTemplate(arg0 scope.IgnitionScope, arg1 scope.LaunchTemplateScope, arg2 scope.S3Scope, arg3 services.EC2Interface, arg4 services.ObjectStoreInterface, arg5 func() (bool, *string, error), arg6, arg7 func() error) (*reconcile.Result, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "ReconcileLaunchTemplate", arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7) ret0, _ := ret[0].(*reconcile.Result)