diff --git a/pkg/services/errors/errors.go b/pkg/services/errors/errors.go index 15de94d4..3f9d93e3 100644 --- a/pkg/services/errors/errors.go +++ b/pkg/services/errors/errors.go @@ -5,16 +5,16 @@ import ( "strings" ) -type FailedTaskErr struct { +type HostsFailedErr struct { Hosts []string } -func (e *FailedTaskErr) Error() string { - return "task failed for hosts " + strings.Join(e.Hosts, ", ") +func (e *HostsFailedErr) Error() string { + return "hosts failed " + strings.Join(e.Hosts, ", ") } -func NewFailedTaskErr(hosts []string) error { - return &FailedTaskErr{Hosts: hosts} +func NewHostsFailedErr(hosts []string) error { + return &HostsFailedErr{Hosts: hosts} } var ( diff --git a/pkg/services/system_state_poll/common.go b/pkg/services/system_state_poll/common.go index 5faf0a59..b1df5678 100644 --- a/pkg/services/system_state_poll/common.go +++ b/pkg/services/system_state_poll/common.go @@ -2,7 +2,7 @@ package system_state_poll import ( "go-deploy/models/model" - errors2 "go-deploy/pkg/services/errors" + wErrors "go-deploy/pkg/services/errors" "k8s.io/client-go/kubernetes" "log" "sync" @@ -35,7 +35,7 @@ func ForEachHost(taskName string, hosts []model.Host, job func(worker int, host wg.Wait() if len(failedHosts) > 0 { - return errors2.NewFailedTaskErr(failedHosts) + return wErrors.NewHostsFailedErr(failedHosts) } return nil diff --git a/pkg/services/workers.go b/pkg/services/workers.go index 80ffe611..e4c189b8 100644 --- a/pkg/services/workers.go +++ b/pkg/services/workers.go @@ -4,8 +4,11 @@ import ( "context" "errors" "fmt" + "go-deploy/pkg/db/resources/host_repo" + "go-deploy/pkg/log" wErrors "go-deploy/pkg/services/errors" "go-deploy/utils" + "strings" "time" ) @@ -60,6 +63,17 @@ func PeriodicWorker(ctx context.Context, name string, work func() error, interva ReportUp(name) case <-tick: if err := work(); err != nil { + // If errors is HostsFailedErr, disable the hosts + var hostsFailedErr *wErrors.HostsFailedErr + if errors.As(err, &hostsFailedErr) { + deactivateDuration := 30 * time.Minute + log.Printf("Hosts [%s] failed. Deactivating them for %s", strings.Join(hostsFailedErr.Hosts, ", "), deactivateDuration.String()) + deactivationErr := DeactivateHosts(hostsFailedErr.Hosts, time.Now().Add(deactivateDuration)) + if err != nil { + utils.PrettyPrintError(fmt.Errorf("failed to disable hosts: %w", deactivationErr)) + } + } + // It's too verbose to print when no hosts or clusters are available, so we skip that if !errors.Is(err, wErrors.NoClustersErr) && !errors.Is(err, wErrors.NoHostsErr) { utils.PrettyPrintError(fmt.Errorf("%s failed (sleeping for extra %s): %w", name, errorSleep.String(), err)) @@ -74,3 +88,14 @@ func PeriodicWorker(ctx context.Context, name string, work func() error, interva } } } + +func DeactivateHosts(hosts []string, until time.Time) error { + for _, host := range hosts { + err := host_repo.New().DeactivateHost(host, until) + if err != nil { + return fmt.Errorf("failed to deactivate host %s: %w", host, err) + } + } + + return nil +}