Skip to content

Commit

Permalink
chore(sysadvisor): disable some healthcheck rule (#539)
Browse files Browse the repository at this point in the history
  • Loading branch information
zzzzhhb committed Apr 10, 2024
1 parent 72afaf3 commit 76d9f46
Show file tree
Hide file tree
Showing 3 changed files with 4 additions and 8 deletions.
6 changes: 0 additions & 6 deletions pkg/agent/sysadvisor/plugin/inference/inference.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,6 @@ func NewInferencePlugin(pluginName string, conf *config.Configuration, extraConf
}

// Run starts the inference plugin's periodic model-result fetching.
//
// If at least one model result fetcher is configured, it first registers a
// heartbeat check under borweinfetcher.BorweinModelResultFetcherName. It then
// hands infp.fetchModelResult to wait.UntilWithContext with infp.period as the
// interval.
//
// NOTE(review): presumably this call blocks until ctx is cancelled, per
// wait.UntilWithContext semantics — confirm against the imported wait package.
func (infp *InferencePlugin) Run(ctx context.Context) {
// Only register the health check when there is something to fetch.
if len(infp.modelsResultFetchers) > 0 {
// Both duration arguments are 3*infp.period; their exact roles are defined
// by general.RegisterHeartbeatCheck — TODO confirm which is which.
general.RegisterHeartbeatCheck(borweinfetcher.BorweinModelResultFetcherName, 3*infp.period, general.HealthzCheckStateNotReady, 3*infp.period)
}
wait.UntilWithContext(ctx, infp.fetchModelResult, infp.period)
}

Expand All @@ -128,9 +125,6 @@ func (infp *InferencePlugin) fetchModelResult(ctx context.Context) {
go func(modelName string, fetcher modelresultfetcher.ModelResultFetcher) {
defer wg.Done()
err := fetcher.FetchModelResult(ctx, infp.metaReader, infp.metaWriter, infp.metaServer)
defer func() {
_ = general.UpdateHealthzStateByError(borweinfetcher.BorweinModelResultFetcherName, err)
}()
if err != nil {
general.Errorf("FetchModelResult for model: %s failed with error: %v", modelName, err)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import (
"time"
)

const healthCheckTolerationDuration = 15 * time.Second
const healthCheckTolerationDuration = 5 * time.Minute

// Reporter is used to report resource
type Reporter interface {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,9 @@ func (m *memoryBalancer) getBalanceInfo() (balanceInfo *BalanceInfo, err error)
}

if balanceInfo.MaxLatencyNuma.ReadLatency == 0 || balanceInfo.MaxBandwidthNuma.ReadWriteBandwidth == 0 {
err = fmt.Errorf("all numas read latency or bandwidth are 0")
// TODO: this condition always occurs on VM nodes, since malachite can't provide the related metrics for them;
// maybe we should disable this whole feature when running on a VM node.
general.Errorf("all numas read latency or bandwidth are 0")
return
}

Expand Down

0 comments on commit 76d9f46

Please sign in to comment.