forked from GLCharge/distributed-scheduler
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Feature: Runner metrics support (#17)
* Added observability stack * Moved docker composes to deployments/docker * Added runner metrics * Simplified logger setup * Added short description of observability for the scheduler. * Moved metrics definitions to constants instead of hardcoding in the string
- Loading branch information
Showing
13 changed files
with
263 additions
and
62 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
services: | ||
|
||
# Configure the Grafana observability stack | ||
grafana-lgtm-stack: | ||
image: grafana/otel-lgtm | ||
container_name: lgtm-stack | ||
hostname: lgtm-stack | ||
profiles: | ||
- observability | ||
ports: | ||
- "3000:3000" | ||
- "4317:4317" | ||
- "4318:4318" | ||
healthcheck: | ||
test: [ "CMD", "curl", "-f", "http://localhost:3000/api/health" ] | ||
start_period: 30s | ||
interval: 30s | ||
timeout: 10s | ||
retries: 5 | ||
volumes: | ||
- prometheus:/prometheus | ||
- loki:/data/loki | ||
- grafana:/var/lib/grafana | ||
|
||
promtail: | ||
image: grafana/promtail:latest | ||
container_name: promtail | ||
command: | ||
- "-config.file=/etc/promtail/promtail.yaml" | ||
profiles: | ||
- observability | ||
hostname: promtail | ||
restart: always | ||
depends_on: | ||
grafana-lgtm-stack: | ||
condition: service_healthy | ||
volumes: | ||
- ./observability/promtail/config.yaml:/etc/promtail/promtail.yaml | ||
- /var/run/docker.sock:/var/run/docker.sock:ro | ||
|
||
volumes: | ||
prometheus: | ||
grafana: | ||
loki: | ||
minio_loki: | ||
tempo_data: |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
server: | ||
http_listen_address: 0.0.0.0 | ||
http_listen_port: 9080 | ||
|
||
positions: | ||
filename: /tmp/positions.yaml | ||
|
||
client: | ||
url: http://lgtm-stack:3100/loki/api/v1/push | ||
|
||
scrape_configs: | ||
- job_name: docker | ||
docker_sd_configs: | ||
- host: unix:///var/run/docker.sock | ||
refresh_interval: 5s |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# Observability | ||
|
||
The scheduler currently supports logging and metrics. Both are exported via the OpenTelemetry protocol (gRPC) and can be
collected by any OpenTelemetry-compatible collector.
|
||
## Logging | ||
|
||
Logging can be configured via the `LOG_LEVEL` environment variable. The following levels are supported: | ||
|
||
- `debug` | ||
- `info` | ||
- `warn` | ||
- `error` | ||
|
||
## Metrics | ||
|
||
Metrics can be enabled by setting the `METRICS_ENABLED` environment variable to `true`. Metrics are exported via the
OpenTelemetry protocol (gRPC).
|
||
The following manager metrics are currently exported: | ||
|
||
- `http_requests_total`: The total number of HTTP requests received by the server. | ||
- `http_request_duration_seconds`: The duration of HTTP requests in seconds. | ||
- `http_errors_total`: The total number of failed HTTP requests. | ||
|
||
The following runner metrics are currently exported: | ||
|
||
- `scheduler_runner_jobs_total`: The total number of jobs that have been scheduled.
- `scheduler_runner_jobs_executed`: The total number of jobs that have been executed.
- `scheduler_runner_jobs_failed`: The total number of jobs that have failed.
- `scheduler_runner_job_retries`: The total number of job retries.
- `scheduler_runner_job_duration`: The duration of jobs in seconds.
- `scheduler_runner_jobs_in_execution`: The number of jobs currently in execution.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
package metrics | ||
|
||
import ( | ||
"context" | ||
|
||
"github.com/xBlaz3kx/DevX/observability" | ||
"go.opentelemetry.io/otel" | ||
"go.opentelemetry.io/otel/attribute" | ||
"go.opentelemetry.io/otel/metric" | ||
) | ||
|
||
// Names under which the runner instruments are registered with the
// OpenTelemetry meter. They are kept as constants so the names are defined in
// exactly one place.
const (
	// jobsTotal names the counter of all jobs.
	jobsTotal = "scheduler_runner_jobs_total"
	// jobsExecuted names the counter of executed jobs.
	jobsExecuted = "scheduler_runner_jobs_executed"
	// jobsFailed names the counter of failed jobs.
	jobsFailed = "scheduler_runner_jobs_failed"
	// jobRetries names the counter of job retries.
	jobRetries = "scheduler_runner_job_retries"
	// jobDuration names the histogram of job durations.
	jobDuration = "scheduler_runner_job_duration"
	// jobsInExecution names the instrument tracking jobs currently running.
	jobsInExecution = "scheduler_runner_jobs_in_execution"
)
|
||
// TODO: add attributes to the recorded metrics: job type/executor, instance ID, status, number of tries.
|
||
type RunnerMetrics struct { | ||
enabled bool | ||
|
||
jobsTotal metric.Int64Counter | ||
|
||
jobsExecuted metric.Int64Counter | ||
|
||
jobsFailed metric.Int64Counter | ||
|
||
jobRetries metric.Int64Counter | ||
|
||
jobDuration metric.Float64Histogram | ||
|
||
jobsInExecution metric.Int64Gauge | ||
} | ||
|
||
func NewRunnerMetrics(config observability.MetricsConfig) *RunnerMetrics { | ||
if !config.Enabled { | ||
return &RunnerMetrics{enabled: false} | ||
} | ||
|
||
meter := otel.GetMeterProvider().Meter("runner") | ||
|
||
jobsTotal, err := meter.Int64Counter(jobsTotal) | ||
must(err) | ||
|
||
jobsExecuted, err := meter.Int64Counter(jobsExecuted) | ||
must(err) | ||
|
||
jobsFailed, err := meter.Int64Counter(jobsFailed) | ||
must(err) | ||
|
||
jobRetries, err := meter.Int64Counter(jobRetries) | ||
must(err) | ||
|
||
jobDuration, err := meter.Float64Histogram(jobDuration) | ||
must(err) | ||
|
||
jobsInExecution, err := meter.Int64Gauge(jobsInExecution) | ||
must(err) | ||
|
||
return &RunnerMetrics{ | ||
enabled: true, | ||
jobsTotal: jobsTotal, | ||
jobsExecuted: jobsExecuted, | ||
jobsFailed: jobsFailed, | ||
jobRetries: jobRetries, | ||
jobDuration: jobDuration, | ||
jobsInExecution: jobsInExecution, | ||
} | ||
} | ||
|
||
func (r *RunnerMetrics) IncreaseJobsInExecution(ctx context.Context, numJobs int, attributes ...attribute.KeyValue) { | ||
if r.enabled { | ||
// Increase gauge metric for number of running jobs | ||
attrs := metric.WithAttributes(attributes...) | ||
r.jobsInExecution.Record(ctx, int64(numJobs), attrs) | ||
} | ||
} | ||
|
||
func (r *RunnerMetrics) DecreaseJobsInExecution(ctx context.Context, numJobs int, attributes ...attribute.KeyValue) { | ||
if r.enabled { | ||
jobs := int64(numJobs) | ||
// Increase gauge metric for number of running jobs | ||
attrs := metric.WithAttributes(attributes...) | ||
r.jobsInExecution.Record(ctx, -jobs, attrs) | ||
r.jobsTotal.Add(ctx, jobs, attrs) | ||
r.jobsExecuted.Add(ctx, jobs, attrs) | ||
} | ||
} | ||
|
||
func (r *RunnerMetrics) RecordJobDuration(ctx context.Context, duration float64, attributes ...attribute.KeyValue) { | ||
if r.enabled { | ||
attrs := metric.WithAttributes(attributes...) | ||
r.jobDuration.Record(ctx, duration, attrs) | ||
} | ||
} | ||
|
||
func (r *RunnerMetrics) IncrementJobRetries(ctx context.Context, attributes ...attribute.KeyValue) { | ||
if r.enabled { | ||
attrs := metric.WithAttributes(attributes...) | ||
r.jobRetries.Add(ctx, 1, attrs) | ||
} | ||
} | ||
|
||
func (r *RunnerMetrics) IncreaseFailedJobCount(ctx context.Context, attributes ...attribute.KeyValue) { | ||
if r.enabled { | ||
attrs := metric.WithAttributes(attributes...) | ||
r.jobsFailed.Add(ctx, 1, attrs) | ||
} | ||
} | ||
|
||
// must panics with err when err is non-nil and returns normally otherwise.
// It is used while creating the metric instruments, where a failure is
// treated as unrecoverable.
func must(err error) {
	if err != nil {
		panic(err)
	}
}
Oops, something went wrong.