Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions api/v1beta1/hcloudmachine_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,12 +120,18 @@ type HCloudMachineStatus struct {
// FailureReason will be set in the event that there is a terminal problem
// reconciling the Machine and will contain a succinct value suitable
// for machine interpretation.
//
// Deprecated: This field is deprecated and is going to be removed when support for v1beta1 will be dropped. Please see https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20240916-improve-status-in-CAPI-resources.md for more details.
//
// +optional
FailureReason *capierrors.MachineStatusError `json:"failureReason,omitempty"`

// FailureMessage will be set in the event that there is a terminal problem
// reconciling the Machine and will contain a more verbose string suitable
// for logging and human consumption.
//
// Deprecated: This field is deprecated and is going to be removed when support for v1beta1 will be dropped. Please see https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20240916-improve-status-in-CAPI-resources.md for more details.
//
// +optional
FailureMessage *string `json:"failureMessage,omitempty"`

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -262,12 +262,16 @@ spec:
FailureMessage will be set in the event that there is a terminal problem
reconciling the Machine and will contain a more verbose string suitable
for logging and human consumption.

Deprecated: This field is deprecated and is going to be removed when support for v1beta1 will be dropped. Please see https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20240916-improve-status-in-CAPI-resources.md for more details.
type: string
failureReason:
description: |-
FailureReason will be set in the event that there is a terminal problem
reconciling the Machine and will contain a succinct value suitable
for machine interpretation.

Deprecated: This field is deprecated and is going to be removed when support for v1beta1 will be dropped. Please see https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20240916-improve-status-in-CAPI-resources.md for more details.
type: string
instanceState:
description: InstanceState is the state of the server for this machine.
Expand Down
7 changes: 5 additions & 2 deletions controllers/hcloudmachine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -241,10 +241,13 @@ func (r *HCloudMachineReconciler) Reconcile(ctx context.Context, req reconcile.R
return r.reconcileDelete(ctx, machineScope)
}

if hcloudMachine.Status.FailureReason != nil {
// This machine will be removed.
_, exists := machine.Annotations[clusterv1.RemediateMachineAnnotation]
if exists {
// This hcloud machine will be removed.
log.Info("CAPI Machine has RemediateMachineAnnotation. Not reconciling this machine.")
return reconcile.Result{}, nil
}

return r.reconcileNormal(ctx, machineScope)
}

Expand Down
118 changes: 109 additions & 9 deletions controllers/hcloudremediation_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,14 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/patch"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"

infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1"
"github.com/syself/cluster-api-provider-hetzner/pkg/scope"
hcloudutil "github.com/syself/cluster-api-provider-hetzner/pkg/services/hcloud/util"
"github.com/syself/cluster-api-provider-hetzner/pkg/utils"
)
Expand Down Expand Up @@ -147,9 +151,8 @@ var _ = Describe("HCloudRemediationReconciler", func() {
},
},
Spec: infrav1.HCloudMachineSpec{
ImageName: "my-control-plane",
Type: "cpx31",
PlacementGroupName: &defaultPlacementGroupName,
ImageName: "my-control-plane",
Type: "cpx31",
},
}
Expect(testEnv.Create(ctx, hcloudMachine)).To(Succeed())
Expand Down Expand Up @@ -227,14 +230,18 @@ var _ = Describe("HCloudRemediationReconciler", func() {
Expect(testEnv.Create(ctx, hcloudRemediation)).To(Succeed())

By("checking if hcloudRemediation is in deleting phase and capiMachine has the MachineOwnerRemediatedCondition")
Eventually(func() bool {
Eventually(func() error {
if err := testEnv.Get(ctx, hcloudRemediationkey, hcloudRemediation); err != nil {
return false
return err
}

return hcloudRemediation.Status.Phase == infrav1.PhaseDeleting &&
isPresentAndFalseWithReason(capiMachineKey, capiMachine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason)
}, timeout).Should(BeTrue())
if hcloudRemediation.Status.Phase != infrav1.PhaseDeleting {
return fmt.Errorf("hcloudRemediation.Status.Phase is not infrav1.PhaseDeleting")
}
if !isPresentAndFalseWithReason(capiMachineKey, capiMachine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason) {
return fmt.Errorf("MachineOwnerRemediatedCondition not set")
}
return nil
}, timeout).Should(Succeed())
})

It("checks that, under normal conditions, a reboot is carried out and retryCount and lastRemediated are set", func() {
Expand Down Expand Up @@ -318,5 +325,98 @@ var _ = Describe("HCloudRemediationReconciler", func() {
isPresentAndFalseWithReason(capiMachineKey, capiMachine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason)
}, timeout).Should(BeTrue())
})
It("should delete machine if SetErrorAndRemediate() was called", func() {
By("Checking Environment")
capiMachineAgain, err := util.GetOwnerMachine(ctx, testEnv, hcloudMachine.ObjectMeta)
Expect(err).ShouldNot(HaveOccurred())
Expect(capiMachineAgain).ToNot(BeNil())
Expect(capiMachine.UID).To(Equal(capiMachineAgain.UID))
hcloudClient := testEnv.HCloudClientFactory.NewClient("dummy-token")

server, err := hcloudClient.CreateServer(ctx, hcloud.ServerCreateOpts{
Name: "myserver",
})
Expect(err).ShouldNot(HaveOccurred())
providerID := hcloudutil.ProviderIDFromServerID(int(server.ID))
hcloudMachine.Spec.ProviderID = &providerID
err = testEnv.Update(ctx, hcloudMachine)
Expect(err).ShouldNot(HaveOccurred())

By("Call SetErrorAndRemediateMachine")
Eventually(func() error {
err = testEnv.Get(ctx, client.ObjectKeyFromObject(hcloudMachine), hcloudMachine)
if err != nil {
return err
}
err = scope.SetErrorAndRemediateMachine(ctx, testEnv, capiMachine, hcloudMachine, "test-of-set-error-and-remediate")
if err != nil {
return err
}
err = testEnv.Status().Update(ctx, hcloudMachine)
if err != nil {
return err
}
return nil
}).Should(Succeed())

By("Wait until hcloud has condition set.")
Eventually(func() error {
err := testEnv.Get(ctx, client.ObjectKeyFromObject(hcloudMachine), hcloudMachine)
if err != nil {
return err
}
c := conditions.Get(hcloudMachine, infrav1.NoRemediateMachineAnnotationCondition)
if c == nil {
return fmt.Errorf("not set: NoRemediateMachineAnnotationCondition")
}
if c.Status != corev1.ConditionFalse {
return fmt.Errorf("status not set yet")
}
return nil
}).Should(Succeed())

By("Do the job of CAPI: Create a HCloudRemediation")
rem := &infrav1.HCloudRemediation{
ObjectMeta: metav1.ObjectMeta{
Name: hcloudMachine.Name,
Namespace: hcloudMachine.Namespace,
},
Spec: infrav1.HCloudRemediationSpec{
Strategy: &infrav1.RemediationStrategy{
Type: infrav1.RemediationTypeReboot,
RetryLimit: 5,
Timeout: &metav1.Duration{
Duration: time.Minute,
},
},
},
}

err = controllerutil.SetOwnerReference(capiMachine, rem, testEnv.GetScheme())
Expect(err).Should(Succeed())

err = testEnv.Create(ctx, rem)
Expect(err).ShouldNot(HaveOccurred())

By("Wait until remediation is done")
Eventually(func() error {
err := testEnv.Get(ctx, client.ObjectKeyFromObject(capiMachine), capiMachine)
if err != nil {
return err
}

c := conditions.Get(capiMachine, clusterv1.MachineOwnerRemediatedCondition)
if c == nil {
return fmt.Errorf("not set: MachineOwnerRemediatedCondition")
}
if c.Status != corev1.ConditionFalse {
return fmt.Errorf("status not set yet")
}
if c.Message != "Remediation finished (machine will be deleted): exit remediation because infra machine has condition set: RemediateMachineAnnotationIsSet: test-of-set-error-and-remediate" {
return fmt.Errorf("Message is not as expected: %q", c.Message)
}
return nil
}).Should(Succeed())
})
})
})
38 changes: 13 additions & 25 deletions pkg/scope/hcloudremediation.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,33 +72,26 @@ func NewHCloudRemediationScope(params HCloudRemediationScopeParams) (*HCloudReme
return nil, fmt.Errorf("failed to init patch helper: %w", err)
}

machinePatchHelper, err := patch.NewHelper(params.Machine, params.Client)
if err != nil {
return nil, fmt.Errorf("failed to init machine patch helper: %w", err)
}

return &HCloudRemediationScope{
Logger: params.Logger,
Client: params.Client,
HCloudClient: params.HCloudClient,
patchHelper: patchHelper,
machinePatchHelper: machinePatchHelper,
Machine: params.Machine,
HCloudMachine: params.HCloudMachine,
HCloudRemediation: params.HCloudRemediation,
Logger: params.Logger,
Client: params.Client,
HCloudClient: params.HCloudClient,
patchHelper: patchHelper,
Machine: params.Machine,
HCloudMachine: params.HCloudMachine,
HCloudRemediation: params.HCloudRemediation,
}, nil
}

// HCloudRemediationScope defines the basic context for an actuator to operate upon.
type HCloudRemediationScope struct {
logr.Logger
Client client.Client
patchHelper *patch.Helper
machinePatchHelper *patch.Helper
HCloudClient hcloudclient.Client
Machine *clusterv1.Machine
HCloudMachine *infrav1.HCloudMachine
HCloudRemediation *infrav1.HCloudRemediation
Client client.Client
patchHelper *patch.Helper
HCloudClient hcloudclient.Client
Machine *clusterv1.Machine
HCloudMachine *infrav1.HCloudMachine
HCloudRemediation *infrav1.HCloudRemediation
}

// Close closes the current scope persisting the cluster configuration and status.
Expand Down Expand Up @@ -126,8 +119,3 @@ func (m *HCloudRemediationScope) ServerIDFromProviderID() (int64, error) {
func (m *HCloudRemediationScope) PatchObject(ctx context.Context, opts ...patch.Option) error {
// Patch the HCloudRemediation via the scope's patch helper; opts are forwarded unchanged.
return m.patchHelper.Patch(ctx, m.HCloudRemediation, opts...)
}

// PatchMachine persists the machine spec and status.
// It patches m.Machine through the scope's machinePatchHelper and forwards opts unchanged;
// the error returned by the helper is passed back to the caller as is.
func (m *HCloudRemediationScope) PatchMachine(ctx context.Context, opts ...patch.Option) error {
return m.machinePatchHelper.Patch(ctx, m.Machine, opts...)
}
45 changes: 37 additions & 8 deletions pkg/scope/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,11 @@ import (

"k8s.io/apimachinery/pkg/types"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
capierrors "sigs.k8s.io/cluster-api/errors" //nolint:staticcheck // we will handle that, when we update to capi v1.11
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/patch"
"sigs.k8s.io/cluster-api/util/record"
"sigs.k8s.io/controller-runtime/pkg/client"

infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1"
secretutil "github.com/syself/cluster-api-provider-hetzner/pkg/secrets"
Expand Down Expand Up @@ -126,13 +127,41 @@ func (m *MachineScope) PatchObject(ctx context.Context) error {
return m.patchHelper.Patch(ctx, m.HCloudMachine)
}

// SetError sets the ErrorMessage and ErrorReason fields on the machine and logs
// the message. It assumes the reason is invalid configuration, since that is
// currently the only relevant MachineStatusError choice.
// CAPI will delete the machine and create a new one.
func (m *MachineScope) SetError(message string, reason capierrors.MachineStatusError) {
m.HCloudMachine.Status.FailureMessage = &message
m.HCloudMachine.Status.FailureReason = &reason
// SetErrorAndRemediate flags the machine for remediation after an unrecoverable error, for
// example when the hcloud server was deleted by hand in the hcloud UI.
//
// It sets the "cluster.x-k8s.io/remediate-machine" annotation on the corresponding CAPI machine,
// so that CAPI remediates that machine. Additionally, an event of type Warning is created and the
// NoRemediateMachineAnnotationCondition is set on the hcloud-machine.
func (m *MachineScope) SetErrorAndRemediate(ctx context.Context, message string) error {
	// All work is delegated to the package-level function, using this scope's client and objects.
	return SetErrorAndRemediateMachine(
		ctx,
		m.Client,
		m.Machine,
		m.HCloudMachine,
		message,
	)
}

// SetErrorAndRemediateMachine implements SetErrorAndRemediate. It is exported so that other code
// (for example tests) can call it without creating a MachineScope.
//
// It performs the following steps, in order:
//  1. Sets clusterv1.RemediateMachineAnnotation (with an empty value) on capiMachine and sends
//     the change to the API server as a merge patch via crClient. capiMachine is modified in place.
//  2. Records the warning event "HCloudMachineWillBeRemediated" for hcloudMachine.
//  3. Marks infrav1.NoRemediateMachineAnnotationCondition as false on hcloudMachine, with message
//     as the condition message. This function makes no API call for hcloudMachine, so the
//     condition only exists on the passed-in object until it is written elsewhere.
//
// If the patch fails, the wrapped error is returned and neither the event nor the condition is set.
func SetErrorAndRemediateMachine(ctx context.Context, crClient client.Client, capiMachine *clusterv1.Machine, hcloudMachine *infrav1.HCloudMachine, message string) error {
	// Snapshot the machine before modifying it, to serve as the patch base. The variable is
	// deliberately not named "patch": that would shadow the imported cluster-api patch package.
	machinePatch := client.MergeFrom(capiMachine.DeepCopy())

	// Modify only annotations on the in-memory copy.
	if capiMachine.Annotations == nil {
		capiMachine.Annotations = map[string]string{}
	}
	capiMachine.Annotations[clusterv1.RemediateMachineAnnotation] = ""

	// Apply the patch: only the diff against the snapshot (the annotations) is sent to the API server.
	if err := crClient.Patch(ctx, capiMachine, machinePatch); err != nil {
		return fmt.Errorf("patch failed in SetErrorAndRemediate: %w", err)
	}

	record.Warnf(hcloudMachine,
		"HCloudMachineWillBeRemediated",
		"HCloudMachine will be remediated: %s", message)

	// "%s" keeps message from being interpreted as a format string.
	conditions.MarkFalse(hcloudMachine, infrav1.NoRemediateMachineAnnotationCondition,
		infrav1.RemediateMachineAnnotationIsSetReason, clusterv1.ConditionSeverityInfo, "%s",
		message)

	return nil
}

// SetRegion sets the region field on the machine.
Expand Down
Loading