Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
74 commits
Select commit Hold shift + click to select a range
3d01d2d
:seedling: Remove usage of FailureReason and FailureMessage (baremetal)
guettli Nov 17, 2025
1fa1406
make linter happy.
guettli Nov 17, 2025
8c7c7a9
set condition.
guettli Nov 17, 2025
8c81c6c
feedback of AI.
guettli Nov 17, 2025
5b3a35d
set condition on host, too. Do not lose the message...
guettli Nov 17, 2025
98bcfff
cleanup, do not set condition from hbmm reconcile on hbmh.
guettli Nov 17, 2025
fb67a97
copy condition from hbmm to host. No need to read capi machine.
guettli Nov 17, 2025
a5521cd
delete via RemediateAnnotation does not work well. Code in Remediatio…
guettli Nov 17, 2025
7d834ca
revert last commit. Delete via remediation is ok.
guettli Nov 18, 2025
cda32ca
WIP: give msg to method, so that a more precise error gets created.
guettli Nov 18, 2025
700d1ec
show exit message of remediation on capi machine condition.
guettli Nov 20, 2025
f71d8a2
no need to try a reboot.
guettli Nov 20, 2025
7c57394
Merge branch 'main' into tg/remove-failure-reason--baremetal
guettli Nov 20, 2025
401a474
more logging, WIP
guettli Nov 20, 2025
25890c4
fixed test cases.
guettli Nov 20, 2025
fc375ce
:seedling: Fix hbmm delete unit tests.
guettli Nov 20, 2025
393ecd6
simplified code, so that AI understands it.
guettli Nov 20, 2025
8c1986a
fix failing test.
guettli Nov 20, 2025
7133120
deduplicated code, cleaned up PR.
guettli Nov 20, 2025
527de14
...
guettli Nov 20, 2025
826c0d4
adapt comments to no longer use failure reason.
guettli Nov 20, 2025
523dd7b
remove todo.
guettli Nov 20, 2025
49a275c
linting.
guettli Nov 20, 2025
0726e9a
make diff to main smaller.
guettli Nov 20, 2025
ef94736
tiny changes.
guettli Nov 20, 2025
0385349
Merge branch 'main' into tg/remove-failure-reason--baremetal
guettli Nov 20, 2025
12cf08b
:seedling: set GITHUB_TOKEN, so that Lychee does not get rate-limited.
guettli Nov 20, 2025
69642d9
Merge branch 'tg/set-github-token-to-avoid-rate-limiting' into tg/rem…
guettli Nov 20, 2025
2b851a9
Merge branch 'main' into tg/fix-hbmm-delete-test
guettli Nov 21, 2025
18d4d23
Add testEnv.Resetter to avoid flaky tests
guettli Nov 21, 2025
5d76f65
fixed hcloud test.
guettli Nov 21, 2025
404560a
make linter happy.
guettli Nov 21, 2025
ef05ede
fix typo.
guettli Nov 21, 2025
3ea2349
all unit-tests are fine again. Avoid Reset() on hcloud client.
guettli Nov 24, 2025
56dda26
do not run code in Describe(). Use BeforeEach().
guettli Nov 24, 2025
62f7a28
better docs.
guettli Nov 24, 2025
1101547
NewControllerResetter
guettli Nov 24, 2025
a0e24a6
use NewClient(t), not m.Test(t)
guettli Nov 24, 2025
b33f086
avoid m.Test(t), use NewClient(t)
guettli Nov 24, 2025
658bff1
revert to old Test(t), otherwise Cleanup will fail, because not all c…
guettli Nov 24, 2025
43c1e75
Merge branch 'tg/fix-hbmm-delete-test' into tg/remove-failure-reason-…
guettli Nov 24, 2025
0139e5f
remove FIt().
guettli Nov 24, 2025
598e51d
Merge branch 'tg/fix-hbmm-delete-test' into tg/remove-failure-reason-…
guettli Nov 24, 2025
0b1bb05
avoid global variable with bmMachineName.
guettli Nov 24, 2025
a38311f
do not use namespace of InfraRef.
guettli Nov 24, 2025
288665e
rename to ResetAndInitNamespace()
guettli Nov 25, 2025
b74bdd0
provide link to docs of interface.
guettli Nov 25, 2025
ba0f15e
comment.
guettli Nov 25, 2025
0ad280d
add comment to Test(t)
guettli Nov 26, 2025
29cec0b
feedback from janis.
guettli Nov 26, 2025
de8f4a9
WIP: extend test to wait until host is deprovisioned.
guettli Nov 26, 2025
fdf5e6b
:seedling: Less noise for unit-test
guettli Nov 26, 2025
29df3e5
use script for `make test-unit`. And do more nice output.
guettli Nov 26, 2025
3e05e2c
handle json errors.
guettli Nov 26, 2025
a6b27d4
Merge branch 'tg/less-noise-for-unit-tests' into tg/remove-failure-re…
guettli Nov 26, 2025
ec75c1b
Merge branch 'tg/less-noise-for-unit-tests' into tg/fix-hbmm-delete-test
guettli Nov 26, 2025
f02171b
Merge branch 'tg/fix-hbmm-delete-test' into tg/remove-failure-reason-…
guettli Nov 26, 2025
c617211
better json line detection: } at end needed.
guettli Nov 26, 2025
05e8b9b
Merge branch 'tg/less-noise-for-unit-tests' into tg/remove-failure-re…
guettli Nov 26, 2025
36a98e4
test passing now.
guettli Nov 26, 2025
7567f35
skip reconcile of hbmm if deletion happens.
guettli Nov 26, 2025
63c9e19
remove comment.
guettli Nov 26, 2025
9b8ff82
move baremetalutils code to pkg/services/baremetal/host/utils.go
guettli Nov 26, 2025
156f723
fix linting.
guettli Nov 26, 2025
5469266
make generate.
guettli Nov 26, 2025
8a5442e
Merge branch 'main' into tg/fix-hbmm-delete-test
guettli Nov 26, 2025
c6d202f
Merge branch 'tg/fix-hbmm-delete-test' into tg/remove-failure-reason-…
guettli Nov 26, 2025
6882d40
Merge branch 'main' into tg/remove-failure-reason--baremetal
guettli Nov 27, 2025
566dd86
renamed to RemediationSucceededCondition.
guettli Nov 27, 2025
2086a34
move check for rate-limit up.
guettli Nov 27, 2025
3b45854
Merge branch 'tg/pre--remove-failure-reason--baremetal--on-top-main' …
guettli Nov 27, 2025
de7c875
fix typo.
guettli Nov 27, 2025
6eba6e0
Merge branch 'tg/pre--remove-failure-reason--baremetal--on-top-main' …
guettli Nov 27, 2025
c4235a0
fix typo.
guettli Nov 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions api/v1beta1/conditions_const.go
Original file line number Diff line number Diff line change
Expand Up @@ -238,3 +238,15 @@ const (
// RebootSucceededCondition indicates that the machine got rebooted successfully.
RebootSucceededCondition clusterv1.ConditionType = "RebootSucceeded"
)

const (
// RemediationSucceededCondition reports whether remediation of the machine was requested. It is:
// - False when the corresponding CAPI Machine has the "cluster.x-k8s.io/remediate-machine" annotation set and will be remediated by CAPI soon.
// - True otherwise.
RemediationSucceededCondition clusterv1.ConditionType = "RemediationSucceeded"

// RemediationInProgressReason indicates that the CAPI machine has the
// "cluster.x-k8s.io/remediate-machine" annotation set. The CAPI machine and the corresponding
// infra-machine will be deleted by CAPI soon.
// It is used as the reason of RemediationSucceededCondition while that condition is False.
RemediationInProgressReason = "RemediationInProgress"
)
6 changes: 0 additions & 6 deletions api/v1beta1/hetznerbaremetalmachine_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -359,12 +359,6 @@ func (hbmm *HetznerBareMetalMachine) SetConditions(conditions clusterv1.Conditio
hbmm.Status.Conditions = conditions
}

// SetFailure stores the given failure reason and failure message in the status of the
// HetznerBareMetalMachine. Values that were stored before get overwritten.
func (hbmm *HetznerBareMetalMachine) SetFailure(reason string, message string) {
	hbmm.Status.FailureReason, hbmm.Status.FailureMessage = &reason, &message
}

// GetImageSuffix tests whether the suffix is known and outputs it if yes. Otherwise it returns an error.
func GetImageSuffix(url string) (string, error) {
if strings.HasPrefix(url, "oci://") {
Expand Down
29 changes: 0 additions & 29 deletions api/v1beta1/hetznerbaremetalmachine_types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,35 +176,6 @@ var _ = Describe("Test GetImageSuffix", func() {
)
})

var _ = Describe("Test SetFailure", func() {
	const (
		newFailureMessage = "bad failure message"
		newFailureReason  = "bad failure reason"
	)

	var bmMachine HetznerBareMetalMachine

	// Give every spec its own pristine machine. Creating the machine once in the
	// Describe body would share it between the specs, so that the "without existing
	// failure" spec could run against a machine that already has a failure set.
	BeforeEach(func() {
		bmMachine = HetznerBareMetalMachine{}
	})

	// expectNewFailure asserts that the new reason and message are stored on the machine.
	expectNewFailure := func() {
		Expect(bmMachine.Status.FailureMessage).ToNot(BeNil())
		Expect(bmMachine.Status.FailureReason).ToNot(BeNil())
		Expect(*bmMachine.Status.FailureMessage).To(Equal(newFailureMessage))
		Expect(*bmMachine.Status.FailureReason).To(Equal(newFailureReason))
	}

	It("sets new failure on the machine with existing failure", func() {
		failureMessage := "first message"
		failureReason := "first error"
		bmMachine.Status.FailureMessage = &failureMessage
		bmMachine.Status.FailureReason = &failureReason

		bmMachine.SetFailure(newFailureReason, newFailureMessage)

		expectNewFailure()
	})

	It("sets new failure on the machine without existing failure", func() {
		// Make sure the precondition of this spec really holds.
		Expect(bmMachine.Status.FailureMessage).To(BeNil())
		Expect(bmMachine.Status.FailureReason).To(BeNil())

		bmMachine.SetFailure(newFailureReason, newFailureMessage)

		expectNewFailure()
	})
})

var _ = Describe("Test HasHostAnnotation", func() {
type testCaseHasHostAnnotation struct {
annotations map[string]string
Expand Down
13 changes: 13 additions & 0 deletions controllers/hetznerbaremetalhost_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,19 @@ func (r *HetznerBareMetalHostReconciler) Reconcile(ctx context.Context, req ctrl
return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
}

remediateConditionOfHbmm := conditions.Get(hetznerBareMetalMachine, infrav1.RemediationSucceededCondition)
if remediateConditionOfHbmm != nil && remediateConditionOfHbmm.Status == corev1.ConditionFalse {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we not reconcile the host if the Machine is remediated? Then we also cannot deprovision anymore, can we?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, you are right. Thank you!

This is the wrong place for skipping Reconcile. This was moved to hbmm.

Mirroring the Condition to the host is still done.

Additionally, I added a test, so that this error will be noticed in the future.

// The hbmm of this host is in remediation. The host is still reconciled, so that it can get deprovisioned.
// Take the Condition of the hbmm and make it available on the hbmh.
msg := "hbmm has RemediationSucceededCondition=False."
log.Info(msg)
conditions.MarkFalse(bmHost, infrav1.RemediationSucceededCondition,
remediateConditionOfHbmm.Reason, remediateConditionOfHbmm.Severity,
"%s", remediateConditionOfHbmm.Message)
} else {
conditions.MarkTrue(bmHost, infrav1.RemediationSucceededCondition)
}

// Get Hetzner robot api credentials
secretManager := secretutil.NewSecretManager(log, r.Client, r.APIReader)
robotCreds, err := getAndValidateRobotCredentials(ctx, req.Namespace, hetznerCluster, secretManager)
Expand Down
8 changes: 8 additions & 0 deletions controllers/hetznerbaremetalmachine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"time"

"github.com/go-logr/logr"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
"k8s.io/klog/v2"
Expand Down Expand Up @@ -167,6 +168,13 @@ func (r *HetznerBareMetalMachineReconciler) Reconcile(ctx context.Context, req r
return r.reconcileDelete(ctx, machineScope)
}

remediateConditionOfHbmm := conditions.Get(hbmMachine, infrav1.RemediationSucceededCondition)
if remediateConditionOfHbmm != nil && remediateConditionOfHbmm.Status == corev1.ConditionFalse {
// The hbmm will be deleted. Do not reconcile it.
log.Info("hbmm has RemediationSucceededCondition=False. Waiting for deletion")
return reconcile.Result{}, nil
}

return r.reconcileNormal(ctx, machineScope)
}

Expand Down
101 changes: 94 additions & 7 deletions controllers/hetznerbaremetalmachine_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,15 @@ import (
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/utils/ptr"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/patch"
"sigs.k8s.io/controller-runtime/pkg/client"
fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1"
"github.com/syself/cluster-api-provider-hetzner/pkg/services/baremetal/baremetal"
robotmock "github.com/syself/cluster-api-provider-hetzner/pkg/services/baremetal/client/mocks/robot"
sshmock "github.com/syself/cluster-api-provider-hetzner/pkg/services/baremetal/client/mocks/ssh"
sshclient "github.com/syself/cluster-api-provider-hetzner/pkg/services/baremetal/client/ssh"
Expand Down Expand Up @@ -480,7 +482,7 @@ var _ = Describe("HetznerBareMetalMachineReconciler", func() {
}, timeout, time.Second).Should(BeTrue())
})

It("sets a failure reason when maintenance mode is set on the host", func() {
It("sets RemediateMachineAnnotation when maintenance mode is set on the host", func() {
By("making sure that machine is ready")

Eventually(func() bool {
Expand All @@ -500,14 +502,99 @@ var _ = Describe("HetznerBareMetalMachineReconciler", func() {

Expect(ph.Patch(ctx, host, patch.WithStatusObservedGeneration{})).To(Succeed())

By("checking that failure message is set on machine")
By("checking that RemediateMachineAnnotation is set on machine")

Eventually(func() bool {
Eventually(func() error {
if err := testEnv.Get(ctx, key, bmMachine); err != nil {
return false
return err
}
return bmMachine.Status.FailureMessage != nil && *bmMachine.Status.FailureMessage == baremetal.FailureMessageMaintenanceMode
}, timeout).Should(BeTrue())

capiMachine, err := util.GetOwnerMachine(ctx, testEnv, bmMachine.ObjectMeta)
if err != nil {
return err
}

_, exists := capiMachine.Annotations[clusterv1.RemediateMachineAnnotation]
if !exists {
return fmt.Errorf("RemediateMachineAnnotation not set on capi machine")
}

c := conditions.Get(bmMachine, infrav1.RemediationSucceededCondition)
if c == nil {
return fmt.Errorf("condition RemediationSucceededCondition does not exist")
}

if c.Status != corev1.ConditionFalse {
return fmt.Errorf("condition RemediationSucceededCondition should be False")
}

return nil
}, timeout).Should(Succeed())

By("Do the job of CAPI: Create a HetznerBareMetalRemediation")
rem := &infrav1.HetznerBareMetalRemediation{
ObjectMeta: metav1.ObjectMeta{
Name: bmMachine.Name,
Namespace: bmMachine.Namespace,
},
}
err = controllerutil.SetOwnerReference(capiMachine, rem, testEnv.Scheme())
Expect(err).To(BeNil())
err = testEnv.Create(ctx, rem)
Expect(err).To(BeNil())

By("Wait for MachineOwnerRemediatedCondition to be set on capiMachine")
Eventually(func() error {
capiMachine, err = util.GetOwnerMachine(ctx, testEnv, bmMachine.ObjectMeta)
if err != nil {
return err
}
if capiMachine == nil {
return fmt.Errorf("capiMachine is nil")
}

err = testEnv.Get(ctx, client.ObjectKeyFromObject(bmMachine), bmMachine)
Expect(err).ShouldNot(HaveOccurred())

Expect(capiMachine.Name).To(Equal(bmMachine.Name))
Expect(capiMachine.Spec.InfrastructureRef.Name).To(Equal(bmMachine.Name))

c := conditions.Get(capiMachine, clusterv1.MachineOwnerRemediatedCondition)
if c == nil {
return fmt.Errorf("condition MachineOwnerRemediatedCondition does not exist %+v", capiMachine.Status)
}

if c.Status != corev1.ConditionFalse {
return fmt.Errorf("condition MachineOwnerRemediatedCondition should be False")
}

if c.Message != "Remediation finished (machine will be deleted): exit remediation because infra machine has condition set: RemediationInProgress: host machine in maintenance mode" {
return fmt.Errorf("Unexpected message: %q", c.Message)
}

return nil
}, timeout).Should(Succeed())

By("Play role of CAPI: Delete capi and bm machine")
err = testEnv.Delete(ctx, capiMachine)
Expect(err).Should(Succeed())
err = testEnv.Delete(ctx, bmMachine)
Expect(err).Should(Succeed())

By("Wait for host to get deprovisioned")
Eventually(func() error {
err := testEnv.Get(ctx, client.ObjectKeyFromObject(host), host)
if err != nil {
return err
}
if host.Spec.Status.ProvisioningState != infrav1.StateDeprovisioning {
return fmt.Errorf("host.Spec.Status.ProvisioningState != infrav1.StateDeprovisioning. Is: %q", host.Spec.Status.ProvisioningState)
}
return nil
}).Should(Succeed())

// check that ResetKubeadm was called
osSSHClient.AssertCalled(GinkgoT(), "ResetKubeadm")
})

It("checks the hetznerBareMetalMachine status running phase", func() {
Expand Down
18 changes: 18 additions & 0 deletions pkg/baremetalutils/baremetalutils.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
/*
Copyright 2025 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package baremetalutils implements helper functions for working with baremetal.
package baremetalutils
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure right now - but if this change is also unrelated to the PR, then I would ask you to create another one

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I moved the code. Creating these helpers was needed, so that the implementation could re-use existing code.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

31 changes: 31 additions & 0 deletions pkg/scope/baremetalmachine.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/patch"
"sigs.k8s.io/cluster-api/util/record"
"sigs.k8s.io/controller-runtime/pkg/client"

infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1"
Expand Down Expand Up @@ -123,3 +124,33 @@ func (m *BareMetalMachineScope) IsControlPlane() bool {
func (m *BareMetalMachineScope) IsBootstrapReady() bool {
return m.Machine.Spec.Bootstrap.DataSecretName != nil
}

// SetErrorAndRemediate sets the "cluster.x-k8s.io/remediate-machine" annotation on the
// corresponding CAPI machine, so that CAPI will remediate that machine.
//
// After the annotation was patched successfully, an event of type Warning is recorded on the
// HetznerBareMetalMachine, and RemediationSucceededCondition is set to False on the hbmm with
// reason RemediationInProgressReason and the given message. The condition is only changed on
// the in-memory hbmm of the scope. This method does not write a condition to the
// HetznerBareMetalHost.
//
// If patching the CAPI machine fails, the error is returned, and neither the event nor the
// condition gets created.
func (m *BareMetalMachineScope) SetErrorAndRemediate(ctx context.Context, message string) error {
	capiMachine := m.Machine

	// Take a snapshot before modifying the machine. The patch is computed against it.
	// Do not name this variable "patch": that would shadow the imported "patch" package.
	patchBase := client.MergeFrom(capiMachine.DeepCopy())

	// Modify only the annotations of the in-memory object.
	if capiMachine.Annotations == nil {
		capiMachine.Annotations = map[string]string{}
	}
	capiMachine.Annotations[clusterv1.RemediateMachineAnnotation] = ""

	// Only the diff (the annotation) is sent to the API server.
	if err := m.Client.Patch(ctx, capiMachine, patchBase); err != nil {
		return err
	}

	record.Warnf(m.BareMetalMachine, "HetznerBareMetalMachineWillBeRemediated",
		"HetznerBareMetalMachine will be remediated: %s", message)

	conditions.MarkFalse(m.BareMetalMachine, infrav1.RemediationSucceededCondition,
		infrav1.RemediationInProgressReason, clusterv1.ConditionSeverityInfo, "%s", message)
	return nil
}
38 changes: 17 additions & 21 deletions pkg/services/baremetal/baremetal/baremetal.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ func (s *Service) update(ctx context.Context) error {
return fmt.Errorf("failed to get host: %w", err)
}
if host == nil {
s.scope.BareMetalMachine.SetFailure("UpdateError", "host not found")
err = errors.Join(s.scope.SetErrorAndRemediate(ctx, "Reconcile of hbmm: host not found"))
return fmt.Errorf("host not found for machine %s: %w", s.scope.Machine.Name, err)
}

Expand All @@ -232,30 +232,23 @@ func (s *Service) update(ctx context.Context) error {
}

// maintenance mode on the host is a fatal error for the machine object
if host.Spec.MaintenanceMode != nil && *host.Spec.MaintenanceMode && s.scope.BareMetalMachine.Status.FailureReason == nil {
s.scope.BareMetalMachine.SetFailure("UpdateError", FailureMessageMaintenanceMode)
record.Eventf(
s.scope.BareMetalMachine,
"BareMetalMachineSetFailure",
"set failure reason due to maintenance mode of underlying host",
)
if host.Spec.MaintenanceMode != nil && *host.Spec.MaintenanceMode {
err := s.scope.SetErrorAndRemediate(ctx, FailureMessageMaintenanceMode)
if err != nil {
return err
}
return nil
}

// if host has a fatal error, then it should be set on the machine object as well
if (host.Spec.Status.ErrorType == infrav1.FatalError || host.Spec.Status.ErrorType == infrav1.PermanentError) &&
s.scope.BareMetalMachine.Status.FailureReason == nil {
s.scope.BareMetalMachine.SetFailure("UpdateError", host.Spec.Status.ErrorMessage)
record.Eventf(s.scope.BareMetalMachine, "BareMetalMachineSetFailure", host.Spec.Status.ErrorMessage)
// if host has a fatal error, then it should be set on the hbmm object as well
if host.Spec.Status.ErrorType == infrav1.FatalError || host.Spec.Status.ErrorType == infrav1.PermanentError {
err := s.scope.SetErrorAndRemediate(ctx, host.Spec.Status.ErrorMessage)
if err != nil {
return err
}
return nil
}

// if host is healthy, the machine is healthy as well
if host.Spec.Status.ErrorType == infrav1.ErrorType("") {
s.scope.BareMetalMachine.Status.FailureMessage = nil
s.scope.BareMetalMachine.Status.FailureReason = nil
}

// ensure that the references are correctly set on host
s.setReferencesOnHost(host)

Expand Down Expand Up @@ -661,8 +654,11 @@ func (s *Service) setProviderID(ctx context.Context) error {
}

if host == nil {
s.scope.BareMetalMachine.SetFailure("UpdateError", "host not found")
return fmt.Errorf("host not found for machine %s: %w", s.scope.Machine.Name, err)
err := s.scope.SetErrorAndRemediate(ctx, "setProviderID failed: host not found")
if err != nil {
return err
}
return fmt.Errorf("host not found for machine %q", s.scope.Machine.Name)
}

if host.Spec.Status.ProvisioningState != infrav1.StateProvisioned {
Expand Down
9 changes: 9 additions & 0 deletions pkg/services/baremetal/remediation/remediation.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"fmt"
"time"

corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
Expand Down Expand Up @@ -68,6 +69,14 @@ func (s *Service) Reconcile(ctx context.Context) (reconcile.Result, error) {
return s.setOwnerRemediatedConditionToFailed(ctx,
"exit remediation because hbmm has no host annotation")
}
// if SetErrorAndRemediate() was used to stop provisioning, do not try to reboot server
infraMachineCondition := conditions.Get(s.scope.BareMetalMachine, infrav1.RemediationSucceededCondition)
if infraMachineCondition != nil && infraMachineCondition.Status == corev1.ConditionFalse {
return s.setOwnerRemediatedConditionToFailed(ctx,
fmt.Sprintf("exit remediation because infra machine has condition set: %s: %s",
infraMachineCondition.Reason,
infraMachineCondition.Message))
}

// if host is not provisioned, do not try to reboot server
if host.Spec.Status.ProvisioningState != infrav1.StateProvisioned {
Expand Down