Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

monitoring PR handling failures. #42

Merged
merged 16 commits into from
Dec 13, 2024
2 changes: 2 additions & 0 deletions cmd/telefonistka/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ func serve() {
mainGhClientCache, _ := lru.New[string, githubapi.GhClientPair](128)
prApproverGhClientCache, _ := lru.New[string, githubapi.GhClientPair](128)

go githubapi.MainGhMetricsLoop(mainGhClientCache)

mux := http.NewServeMux()
mux.HandleFunc("/webhook", handleWebhook(githubWebhookSecret, mainGhClientCache, prApproverGhClientCache))
mux.Handle("/metrics", promhttp.Handler())
Expand Down
23 changes: 23 additions & 0 deletions docs/observability.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,13 @@
|telefonistka_github_github_rest_api_client_rate_remaining|gauge|The number of remaining requests the client can make this hour||
|telefonistka_github_github_rest_api_client_rate_limit|gauge|The number of requests per hour the client is currently limited to||
|telefonistka_webhook_server_webhook_hits_total|counter|The total number of validated webhook hits|`parsing`|
|telefonistka_github_open_prs|gauge|The number of open PRs|`repo_slug`|
|telefonistka_github_open_promotion_prs|gauge|The number of open promotion PRs|`repo_slug`|
Oded-B marked this conversation as resolved.
Show resolved Hide resolved
|telefonistka_github_open_prs_with_pending_telefonistka_checks|gauge|The number of open PRs with pending Telefonistka checks(excluding PRs with very recent commits)|`repo_slug`|
|telefonistka_github_commit_status_updates_total|counter|The total number of commit status updates, and their status (success/pending/failure)|`repo_slug`, `status`|

> [!NOTE]
> telefonistka_github_*_prs metrics are only supported on installtions that uses GitHub App authentication as it provides an easy way to query the relevant GH repos.
Example metrics snippet:

Oded-B marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -26,4 +33,20 @@ telefonistka_github_github_rest_api_client_rate_remaining 99668
# HELP telefonistka_webhook_server_webhook_hits_total The total number of validated webhook hits
# TYPE telefonistka_webhook_server_webhook_hits_total counter
telefonistka_webhook_server_webhook_hits_total{parsing="successful"} 8
# HELP telefonistka_github_commit_status_updates_total The total number of commit status updates, and their status (success/pending/failure)
# TYPE telefonistka_github_commit_status_updates_total counter
telefonistka_github_commit_status_updates_total{repo_slug="foo/bar2",status="error"} 1
telefonistka_github_commit_status_updates_total{repo_slug="foo/bar2",status="pending"} 1
# HELP telefonistka_github_open_promotion_prs The total number of open PRs with promotion label
# TYPE telefonistka_github_open_promotion_prs gauge
telefonistka_github_open_promotion_prs{repo_slug="foo/bar1"} 0
telefonistka_github_open_promotion_prs{repo_slug="foo/bar2"} 10
# HELP telefonistka_github_open_prs The total number of open PRs
# TYPE telefonistka_github_open_prs gauge
telefonistka_github_open_prs{repo_slug="foo/bar1"} 0
telefonistka_github_open_prs{repo_slug="foo/bar2"} 21
# HELP telefonistka_github_open_prs_with_pending_telefonistka_checks The total number of open PRs with pending Telefonistka checks(excluding PRs with very recent commits)
# TYPE telefonistka_github_open_prs_with_pending_telefonistka_checks gauge
telefonistka_github_open_prs_with_pending_telefonistka_checks{repo_slug="foo/bar1"} 0
telefonistka_github_open_prs_with_pending_telefonistka_checks{repo_slug="foo/bar2"} 0
```
2 changes: 2 additions & 0 deletions internal/pkg/githubapi/github.go
Original file line number Diff line number Diff line change
Expand Up @@ -876,6 +876,8 @@ func SetCommitStatus(ghPrClientDetails GhPrClientDetails, state string) {

_, resp, err := ghPrClientDetails.GhClientPair.v3Client.Repositories.CreateStatus(ctx, ghPrClientDetails.Owner, ghPrClientDetails.Repo, ghPrClientDetails.PrSHA, commitStatus)
prom.InstrumentGhCall(resp)
repoSlug := ghPrClientDetails.Owner + "/" + ghPrClientDetails.Repo
prom.IncCommitStatusUpdateCounter(repoSlug, state)
if err != nil {
ghPrClientDetails.PrLogger.Errorf("Failed to set commit status: err=%s\n%v", err, resp)
}
Expand Down
108 changes: 108 additions & 0 deletions internal/pkg/githubapi/pr_metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
package githubapi

import (
"context"
"time"

"github.com/google/go-github/v62/github"
lru "github.com/hashicorp/golang-lru/v2"
log "github.com/sirupsen/logrus"
prom "github.com/wayfair-incubator/telefonistka/internal/pkg/prometheus"
)

const (
timeToDefineStale = 20 * time.Minute
metricRefreshTime = 60 * time.Second
)

func MainGhMetricsLoop(mainGhClientCache *lru.Cache[string, GhClientPair]) {
for t := range time.Tick(metricRefreshTime) {
log.Debugf("Updating pr metrics at %v", t)
getPrMetrics(mainGhClientCache)
}
}

func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.Repository) (pc prom.PrCounters, err error) {
log.Debugf("Checking repo %s", repo.GetName())
ghOwner := repo.GetOwner().GetLogin()
prListOpts := &github.PullRequestListOptions{
State: "open",
}
prs := []*github.PullRequest{}

// paginate through PRs, there might be lots of them.
Oded-B marked this conversation as resolved.
Show resolved Hide resolved
for {
perPagePrs, resp, err := ghClient.v3Client.PullRequests.List(ctx, ghOwner, repo.GetName(), prListOpts)
_ = prom.InstrumentGhCall(resp)
if err != nil {
log.Errorf("error getting PRs for %s/%s: %v", ghOwner, repo.GetName(), err)
}
Comment on lines +37 to +39
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not checking the error first, and then calling the prom.InstrumentGhCall(resp)?
Another question if the output for prom.InstrumentGhCall(resp) is not important why do bother to assign it something?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The InstrumentGhCall function returns labels just for testing.
During runtime these labels are just used internally in the function.
I chose to instrument before error handling to ensure we still instrument even if we change the error handling to break out of that code path in the future, be I can change the order if you feel it makes it less readable

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I chose to instrument before error handling to ensure we still instrument even if we change the error handling to break out of that code path in the future, be I can change the order if you feel it makes it less readable

I would add that as a comment

prs = append(prs, perPagePrs...)
if resp.NextPage == 0 {
break
}
prListOpts.Page = resp.NextPage
}

for _, pr := range prs {
if DoesPrHasLabel(pr.Labels, "promotion") {
pc.OpenPromotionPrs++
}

log.Debugf("Checking PR %d", pr.GetNumber())
commitStatuses, resp, err := ghClient.v3Client.Repositories.GetCombinedStatus(ctx, ghOwner, repo.GetName(), pr.GetHead().GetSHA(), nil)
_ = prom.InstrumentGhCall(resp)
if err != nil {
log.Errorf("error getting statuses for %s/%s/%d: %v", ghOwner, repo.GetName(), pr.GetNumber(), err)
continue
}
if isPrStalePending(commitStatuses, timeToDefineStale) {
pc.PrWithStaleChecks++
}
}
pc.OpenPrs = len(prs)

return
}

// isPrStalePending checks if the a combinedStatus has a "telefonistka" context pending status that is older than timeToDefineStale and is in pending state
func isPrStalePending(commitStatuses *github.CombinedStatus, timeToDefineStale time.Duration) bool {
for _, status := range commitStatuses.Statuses {
if *status.Context == "telefonistka" &&
*status.State == "pending" &&
status.UpdatedAt.GetTime().Before(time.Now().Add(timeToDefineStale*-1)) {
log.Debugf("Adding status %s-%v-%s !!!", *status.Context, status.UpdatedAt.GetTime(), *status.State)
return true
} else {
log.Debugf("Ignoring status %s-%v-%s", *status.Context, status.UpdatedAt.GetTime(), *status.State)
}
}

return false
}

// getPrMetrics iterates through all clients , gets all repos and then all PRs and calculates metrics
// getPrMetrics assumes Telefonsitka uses a GitHub App style of authentication as it uses the Apps.ListRepos call
// When using personal access token authentication, Telefonistka is unaware of the "relevant" repos (at least it get a webhook from them).
func getPrMetrics(mainGhClientCache *lru.Cache[string, GhClientPair]) {
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
defer cancel()
for _, ghOwner := range mainGhClientCache.Keys() {
log.Debugf("Checking gh Owner %s", ghOwner)
ghClient, _ := mainGhClientCache.Get(ghOwner)
Oded-B marked this conversation as resolved.
Show resolved Hide resolved
repos, resp, err := ghClient.v3Client.Apps.ListRepos(ctx, nil)
_ = prom.InstrumentGhCall(resp)
Oded-B marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
log.Errorf("error getting repos for %s: %v", ghOwner, err)
continue
}
for _, repo := range repos.Repositories {
pc, err := getRepoPrMetrics(ctx, ghClient, repo)
if err != nil {
log.Errorf("error getting repos for %s: %v", ghOwner, err)
continue
}
prom.PublishPrMetrics(pc, repo.GetFullName())
}
}
}
89 changes: 89 additions & 0 deletions internal/pkg/githubapi/pr_metrics_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package githubapi

import (
"testing"
"time"

"github.com/google/go-github/v62/github"
)

func TestIsPrStalePending(t *testing.T) {
t.Parallel()
timeToDefineStale := 15 * time.Minute

currentTime := time.Now()
tests := map[string]struct {
input github.CombinedStatus
result bool
}{
"All Success": {
input: github.CombinedStatus{
Statuses: []*github.RepoStatus{
{
State: github.String("success"),
Context: github.String("telefonistka"),
UpdatedAt: &github.Timestamp{
Time: currentTime.Add(-10 * time.Minute),
},
},
{
State: github.String("success"),
Context: github.String("circleci"),
UpdatedAt: &github.Timestamp{
Time: currentTime.Add(-10 * time.Minute),
},
},
{
State: github.String("success"),
Context: github.String("foobar"),
UpdatedAt: &github.Timestamp{
Time: currentTime.Add(-10 * time.Minute),
},
},
},
},
result: false,
},
"Pending but not stale": {
input: github.CombinedStatus{
Statuses: []*github.RepoStatus{
{
State: github.String("pending"),
Context: github.String("telefonistka"),
UpdatedAt: &github.Timestamp{
Time: currentTime.Add(-1 * time.Minute),
},
},
},
},
result: false,
},

"Pending and stale": {
input: github.CombinedStatus{
Statuses: []*github.RepoStatus{
{
State: github.String("pending"),
Context: github.String("telefonistka"),
UpdatedAt: &github.Timestamp{
Time: currentTime.Add(-20 * time.Minute),
},
},
},
},
result: true,
},
}

for name, tc := range tests {
name := name
tc := tc
t.Run(name, func(t *testing.T) {
t.Parallel()
result := isPrStalePending(&tc.input, timeToDefineStale)
if result != tc.result {
t.Errorf("(%s)Expected %v, got %v", name, tc.result, result)
}
})
}
}
50 changes: 50 additions & 0 deletions internal/pkg/prometheus/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@ import (
"github.com/prometheus/client_golang/prometheus/promauto"
)

type PrCounters struct {
OpenPrs int
OpenPromotionPrs int
PrWithStaleChecks int
}

var (
webhookHitsVec = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "webhook_hits_total",
Expand Down Expand Up @@ -39,6 +45,34 @@ var (
Subsystem: "github",
}, []string{"api_group", "api_path", "repo_slug", "status", "method"})

ghOpenPrsGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "open_prs",
Help: "The total number of open PRs",
Namespace: "telefonistka",
Subsystem: "github",
}, []string{"repo_slug"})

ghOpenPromotionPrsGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "open_promotion_prs",
Help: "The total number of open PRs with promotion label",
Namespace: "telefonistka",
Subsystem: "github",
}, []string{"repo_slug"})

ghOpenPrsWithPendingCheckGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "open_prs_with_pending_telefonistka_checks",
Help: "The total number of open PRs with pending Telefonistka checks(excluding PRs with very recent commits)",
Namespace: "telefonistka",
Subsystem: "github",
}, []string{"repo_slug"})

commitStatusUpdates = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "commit_status_updates_total",
Help: "The total number of commit status updates, and their status (success/pending/failure)",
Namespace: "telefonistka",
Subsystem: "github",
}, []string{"repo_slug", "status"})

whUpstreamRequestsCountVec = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "upstream_requests_total",
Help: "The total number of requests forwarded upstream servers",
Expand All @@ -47,6 +81,22 @@ var (
}, []string{"status", "method", "url"})
)

func IncCommitStatusUpdateCounter(repoSlug string, status string) {
commitStatusUpdates.With(prometheus.Labels{
"repo_slug": repoSlug,
"status": status,
}).Inc()
}

func PublishPrMetrics(pc PrCounters, repoSlug string) {
metricLables := prometheus.Labels{
"repo_slug": repoSlug,
}
ghOpenPrsGauge.With(metricLables).Set(float64(pc.OpenPrs))
ghOpenPromotionPrsGauge.With(metricLables).Set(float64(pc.OpenPromotionPrs))
ghOpenPrsWithPendingCheckGauge.With(metricLables).Set(float64(pc.PrWithStaleChecks))
}

// This function instrument Webhook hits and parsing of their content
func InstrumentWebhookHit(parsing_status string) {
webhookHitsVec.With(prometheus.Labels{"parsing": parsing_status}).Inc()
Expand Down
Loading