Skip to content

Commit

Permalink
Merge pull request #663 from kthcloud/dev
Browse files Browse the repository at this point in the history
Temporarily deactivate hosts that failed when polling
  • Loading branch information
saffronjam authored Jun 9, 2024
2 parents 5dc3dbe + 7eeade9 commit c8caffa
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 7 deletions.
10 changes: 5 additions & 5 deletions pkg/services/errors/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@ import (
"strings"
)

type FailedTaskErr struct {
type HostsFailedErr struct {
Hosts []string
}

func (e *FailedTaskErr) Error() string {
return "task failed for hosts " + strings.Join(e.Hosts, ", ")
func (e *HostsFailedErr) Error() string {
return "hosts failed " + strings.Join(e.Hosts, ", ")
}

func NewFailedTaskErr(hosts []string) error {
return &FailedTaskErr{Hosts: hosts}
func NewHostsFailedErr(hosts []string) error {
return &HostsFailedErr{Hosts: hosts}
}

var (
Expand Down
4 changes: 2 additions & 2 deletions pkg/services/system_state_poll/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package system_state_poll

import (
"go-deploy/models/model"
errors2 "go-deploy/pkg/services/errors"
wErrors "go-deploy/pkg/services/errors"
"k8s.io/client-go/kubernetes"
"log"
"sync"
Expand Down Expand Up @@ -35,7 +35,7 @@ func ForEachHost(taskName string, hosts []model.Host, job func(worker int, host
wg.Wait()

if len(failedHosts) > 0 {
return errors2.NewFailedTaskErr(failedHosts)
return wErrors.NewHostsFailedErr(failedHosts)
}

return nil
Expand Down
25 changes: 25 additions & 0 deletions pkg/services/workers.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@ import (
"context"
"errors"
"fmt"
"go-deploy/pkg/db/resources/host_repo"
"go-deploy/pkg/log"
wErrors "go-deploy/pkg/services/errors"
"go-deploy/utils"
"strings"
"time"
)

Expand Down Expand Up @@ -60,6 +63,17 @@ func PeriodicWorker(ctx context.Context, name string, work func() error, interva
ReportUp(name)
case <-tick:
if err := work(); err != nil {
// If errors is HostsFailedErr, disable the hosts
var hostsFailedErr *wErrors.HostsFailedErr
if errors.As(err, &hostsFailedErr) {
deactivateDuration := 30 * time.Minute
log.Printf("Hosts [%s] failed. Deactivating them for %s", strings.Join(hostsFailedErr.Hosts, ", "), deactivateDuration.String())
deactivationErr := DeactivateHosts(hostsFailedErr.Hosts, time.Now().Add(deactivateDuration))
if err != nil {
utils.PrettyPrintError(fmt.Errorf("failed to disable hosts: %w", deactivationErr))
}
}

// It's too verbose to print when no hosts or clusters are available, so we skip that
if !errors.Is(err, wErrors.NoClustersErr) && !errors.Is(err, wErrors.NoHostsErr) {
utils.PrettyPrintError(fmt.Errorf("%s failed (sleeping for extra %s): %w", name, errorSleep.String(), err))
Expand All @@ -74,3 +88,14 @@ func PeriodicWorker(ctx context.Context, name string, work func() error, interva
}
}
}

func DeactivateHosts(hosts []string, until time.Time) error {
for _, host := range hosts {
err := host_repo.New().DeactivateHost(host, until)
if err != nil {
return fmt.Errorf("failed to deactivate host %s: %w", host, err)
}
}

return nil
}

0 comments on commit c8caffa

Please sign in to comment.