Skip to content

Commit

Permalink
[CLD-7290] Add retry mechanism for Thanos queries to mitigate random network connectivity issues (#31)
Browse files (browse the repository at this point in the history)

Co-authored-by: Stylianos Rigas <stylianosrigas@Stylianoss-MacBook-Pro-Work.local>
  • Loading branch information
stylianosrigas and Stylianos Rigas authored Mar 14, 2024
1 parent 9dd5f85 commit 0f3be4c
Showing 1 changed file with 24 additions and 9 deletions.
33 changes: 24 additions & 9 deletions internal/elrond/elrond_soak.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,21 +44,36 @@ func querySLOMetrics(ring *model.Ring, url string, queryTime time.Time, logger *
}

v1api := v1.NewAPI(client)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
var results []pmodel.Vector

for _, installationGroup := range ring.InstallationGroups {
query := fmt.Sprintf("((slo:sli_error:ratio_rate5m{slo_service='%[1]s-ring-%[2]s'} > (14.4 * 0.005)) and ignoring(slo_window)(slo:sli_error:ratio_rate1h{slo_service='%[1]s-ring-%[2]s'} > (14.4 * 0.005))) or ignoring(slo_window)((slo:sli_error:ratio_rate30m{slo_service='%[1]s-ring-%[2]s'} > (6 * 0.005)) and ignoring(slo_window)(slo:sli_error:ratio_rate6h{slo_service='%[1]s-ring-%[2]s'} > (3.3 * 0.005))) or vector(0)", installationGroup.Name, installationGroup.ProvisionerGroupID)
logger.Infof("Running Thanos query %s", query)
result, warnings, err := v1api.Query(ctx, query, queryTime)
if err != nil {
return nil, errors.Wrap(err, "failed to query")
var lastErr error
// Retry mechanism for Thanos network connectivity issues.
for attempt := 0; attempt < 10; attempt++ {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
logger.Infof("Running Thanos query %s, attempt %d", query, attempt+1)
result, warnings, err := v1api.Query(ctx, query, queryTime)
cancel()

if err == nil {
if len(warnings) > 0 {
logger.Warnf("Encountered warnings obtaining metrics: %s", strings.Join(warnings, ", "))
}
results = append(results, result.(pmodel.Vector))
break
}

lastErr = err
logger.Warnf("Query failed: %v", err)
if attempt+1 < 10 {
time.Sleep(time.Second * time.Duration(2<<attempt)) // Exponential backoff
}
}

if len(warnings) > 0 {
return nil, errors.Errorf("encounted warnings obtaining metrics: %s", strings.Join(warnings, ", "))
if lastErr != nil {
return nil, errors.Wrap(lastErr, "failed to query after retries")
}
results = append(results, result.(pmodel.Vector))
}

return results, nil
Expand Down

0 comments on commit 0f3be4c

Please sign in to comment.