Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions bin/celery_flower.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,7 @@

set -e

# Set defaults for OTEL
export OTEL_SERVICE_NAME="${OTEL_SERVICE_NAME:-objects-flower}"

exec celery --app objects --workdir src flower
5 changes: 5 additions & 0 deletions bin/celery_worker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@ if [[ "$ENABLE_COVERAGE" ]]; then
_binary="coverage run $_binary"
fi

# Set defaults for OTEL
export OTEL_SERVICE_NAME="${OTEL_SERVICE_NAME:-objects-worker-"${QUEUE}"}"

echo "Starting celery worker $WORKER_NAME with queue $QUEUE"
# unset this if NOT using a process pool
export _OTEL_DEFER_SETUP="true"
exec $_binary --workdir src --app "objects.celery" worker \
-Q $QUEUE \
-n $WORKER_NAME \
Expand Down
9 changes: 8 additions & 1 deletion bin/docker_start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,12 @@ ${SCRIPTPATH}/wait_for_db.sh

>&2 echo "Database is up."

# Set defaults for OTEL
export OTEL_SERVICE_NAME="${OTEL_SERVICE_NAME:-objects}"

# Apply database migrations
>&2 echo "Apply database migrations"
python src/manage.py migrate
OTEL_SDK_DISABLED=True python src/manage.py migrate

# Load any JSON fixtures present
if [ -d $fixtures_dir ]; then
Expand Down Expand Up @@ -50,6 +53,7 @@ fi
# Start server
>&2 echo "Starting server"
uwsgi \
--strict \
--ini "${SCRIPTPATH}/uwsgi.ini" \
--http :$uwsgi_port \
--http-keepalive \
Expand All @@ -59,6 +63,9 @@ uwsgi \
--static-map /media=/app/media \
--chdir src \
--enable-threads \
--single-interpreter \
--die-on-term \
--need-app \
--processes $uwsgi_processes \
--threads $uwsgi_threads \
--buffer-size=65535
7 changes: 5 additions & 2 deletions bin/setup_configuration.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,13 @@

set -e

# Set defaults for OTEL
export OTEL_SERVICE_NAME="${OTEL_SERVICE_NAME:-objects-setup-configuration}"

if [[ "${RUN_SETUP_CONFIG,,}" =~ ^(true|1|yes)$ ]]; then
# wait for required services
/wait_for_db.sh

src/manage.py migrate
src/manage.py setup_configuration --yaml-file setup_configuration/data.yaml
OTEL_SDK_DISABLED=True src/manage.py migrate
OTEL_SDK_DISABLED=True src/manage.py setup_configuration --yaml-file setup_configuration/data.yaml
fi
11 changes: 11 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,17 @@ services:
SUBPATH: ${SUBPATH}
DB_CONN_MAX_AGE: "0"
DB_POOL_ENABLED: True

# Enabling Open Telemetry requires the services in docker/docker-compose.observability.yaml
# to be up and running.
OTEL_SDK_DISABLED: ${OTEL_SDK_DISABLED:-true}
OTEL_RESOURCE_ATTRIBUTES: maykin.saas.client=maykin,maykin.saas.target=dev
OTEL_METRIC_EXPORT_INTERVAL: ${OTEL_METRIC_EXPORT_INTERVAL:-60000}
OTEL_EXPORTER_OTLP_ENDPOINT: ${OTEL_EXPORTER_OTLP_ENDPOINT:-http://otel-collector:4317} # gRPC
      # base64 of "otel:supersecret" (the collector's basic-auth credentials);
      # spaces and '=' in OTLP header values must be percent-encoded
OTEL_EXPORTER_OTLP_HEADERS: Authorization=Basic b3RlbDpzdXBlcnNlY3JldA==
OTEL_EXPORTER_OTLP_METRICS_INSECURE: ${OTEL_EXPORTER_OTLP_METRICS_INSECURE:-true}
_OTEL_ENABLE_CONTAINER_RESOURCE_DETECTOR: true
healthcheck:
test: ["CMD", "python", "-c", "import requests; exit(requests.head('http://localhost:8000/admin/').status_code not in [200, 302])"]
interval: 30s
Expand Down
47 changes: 27 additions & 20 deletions docker/docker-compose.observability.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,16 @@ services:
networks:
- objects-dev

prometheus:
image: prom/prometheus:latest
ports:
- "9090:9090"
volumes:
- ./observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- prometheus-tsdb:/prometheus
networks:
- objects-dev

promtail:
image: grafana/promtail:latest
volumes:
Expand All @@ -24,39 +34,36 @@ services:
networks:
- objects-dev

# open telemetry collector, receives metrics from the application instance(s)
otel-collector:
image: otel/opentelemetry-collector-contrib:0.131.0
command: --config=/etc/otel-collector-config.yaml
volumes:
- ./observability/otel/otel-collector-config.yml:/etc/otel-collector-config.yaml
ports:
- 4317:4317
- 4318:4318
- 8889:8889
networks:
- objects-dev

grafana:
image: grafana/grafana:latest
volumes:
- ./observability/grafana/datasources:/etc/grafana/provisioning/datasources
environment:
- GF_PATHS_PROVISIONING=/etc/grafana/provisioning
- GF_AUTH_ANONYMOUS_ENABLED=true
- GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
- GF_FEATURE_TOGGLES_ENABLE=alertingSimplifiedRouting,alertingQueryAndExpressionsStepMode
entrypoint:
- sh
- -euc
- |
mkdir -p /etc/grafana/provisioning/datasources
cat <<EOF > /etc/grafana/provisioning/datasources/ds.yaml
apiVersion: 1
datasources:
- name: Loki
type: loki
access: proxy
orgId: 1
url: http://loki:3100
basicAuth: false
isDefault: true
version: 1
editable: false
EOF
/run.sh
image: grafana/grafana:latest
ports:
- "3000:3000"
networks:
- objects-dev

volumes:
promtail-logs:
prometheus-tsdb:

networks:
objects-dev:
Expand Down
20 changes: 20 additions & 0 deletions docker/observability/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ You can now navigate to:
- http://localhost:3000 for Grafana
- http://localhost:3100/ready for Loki readiness
- http://localhost:3100/metrics for Loki metrics
- http://localhost:9090 for the Prometheus web interface

## Logging

Expand All @@ -48,3 +49,22 @@ You can filter application logs based on a request ID:
```logql
{job="docker", app="objects-api"} | json | __error__ = "" | request_id=`1e9e1b9d-4d34-4657-99e4-88673d824724`
```

## Metrics

Metrics can be sent using OTLP to the collector at http://localhost:4317 (gRPC).

The `maykin_common.otel` module takes care of setting everything up, just make sure to set the
environment variable `OTEL_SDK_DISABLED=false` in development (it's disabled by default).

The collector ingests the metrics, and they are then scraped by Prometheus. They're also printed to
stdout.

## Traces

Traces can be sent using OTLP to the collector at http://localhost:4317 (gRPC).

The `maykin_common.otel` module takes care of setting everything up, just make sure to set the
environment variable `OTEL_SDK_DISABLED=false` in development (it's disabled by default).

The collector ingests the traces and prints them out to stdout.
28 changes: 28 additions & 0 deletions docker/observability/grafana/datasources/ds.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
---

# Grafana datasource provisioning for the local observability stack.
# Mounted into the grafana container via docker/docker-compose.observability.yaml.
apiVersion: 1

datasources:
  # Loki: log aggregation backend; receives container logs shipped by promtail.
  - name: Loki
    type: loki
    access: proxy
    orgId: 1
    url: http://loki:3100
    basicAuth: false
    isDefault: true
    version: 1
    editable: false
  # Prometheus: scrapes the metrics endpoint exposed by the otel-collector.
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    jsonData:
      httpMethod: POST
      manageAlerts: false
      allowAsRecordingRulesTarget: false
      prometheusType: Prometheus
      prometheusVersion: 3.5.0
      cacheLevel: 'Low'
      disableRecordingRules: false
      timeInterval: 15s # Prometheus scrape interval
      incrementalQueryOverlapWindow: 10m
62 changes: 62 additions & 0 deletions docker/observability/otel/otel-collector-config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
---

# OpenTelemetry Collector configuration for the local observability stack.
# Receives OTLP metrics/traces/logs from the application containers and
# exposes metrics for Prometheus to scrape (see prometheus/prometheus.yml).

extensions:
  # Basic-auth credentials the application containers must present; these match
  # the base64 value in OTEL_EXPORTER_OTLP_HEADERS in docker-compose.yml.
  basicauth/server:
    htpasswd:
      inline: |
        otel:supersecret

receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
        auth:
          authenticator: basicauth/server
      http:
        endpoint: 0.0.0.0:4318
        auth:
          authenticator: basicauth/server

processors:
  batch:
    timeout: 1s
    send_batch_size: 1024
  memory_limiter:
    check_interval: 1s
    limit_mib: 1024
  transform:
    metric_statements:
      - context: datapoint
        statements:
          # Promote the SaaS resource attributes to datapoint attributes so they
          # survive the Prometheus export as labels. The keys MUST match the ones
          # set via OTEL_RESOURCE_ATTRIBUTES in docker-compose.yml
          # (maykin.saas.client / maykin.saas.target).
          - set(attributes["client"], resource.attributes["maykin.saas.client"])
          - set(attributes["target"], resource.attributes["maykin.saas.target"])

exporters:
  # NOTE: Prior to v0.86.0 use `logging` instead of `debug`.
  debug:
    verbosity: detailed
  # Exposes a metrics endpoint that prometheus can scrape
  prometheus:
    endpoint: 0.0.0.0:8889
    namespace: otel  # this becomes a prefix in the prometheus metrics: `otel_*`
    const_labels:
      environment: local_otel_collector
    resource_to_telemetry_conversion:
      enabled: true

service:
  extensions: [basicauth/server]
  pipelines:
    traces:
      receivers: [otlp]
      processors: [memory_limiter, batch]
      exporters: [debug]
    metrics:
      # `transform` must run before `batch` so the label promotion above is
      # applied to every datapoint prior to export.
      receivers: [otlp]
      processors: [memory_limiter, transform, batch]
      exporters: [debug, prometheus]
    logs:
      receivers: [otlp]
      processors: [memory_limiter, batch]
      exporters: [debug]
74 changes: 74 additions & 0 deletions docker/observability/prometheus/prometheus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
---

# Prometheus configuration for the local observability stack: scrapes itself
# and the otel-collector's Prometheus exporter (port 8889).
# From https://github.com/prometheus/prometheus/blob/main/documentation/examples/prometheus-otlp.yml
# and https://github.com/prometheus/prometheus/blob/main/documentation/examples/prometheus-docker.yml

global:
  scrape_interval: 15s
  evaluation_interval: 60s

# Native OTLP ingestion is currently disabled — metrics flow through the
# otel-collector and are scraped below instead. Uncomment to push OTLP
# directly to Prometheus.
# otlp:
#   # Recommended attributes to be promoted to labels.
#   promote_resource_attributes:
#     - service.instance.id
#     - service.name
#     - service.namespace
#     - cloud.availability_zone
#     - cloud.region
#     - container.name
#     - deployment.environment.name
#     - k8s.cluster.name
#     - k8s.container.name
#     - k8s.cronjob.name
#     - k8s.daemonset.name
#     - k8s.deployment.name
#     - k8s.job.name
#     - k8s.namespace.name
#     - k8s.pod.name
#     - k8s.replicaset.name
#     - k8s.statefulset.name
#   # Ingest OTLP data keeping all characters in metric/label names.
#   translation_strategy: NoUTF8EscapingWithSuffixes

# storage:
#   # OTLP is a push-based protocol, Out of order samples is a common scenario.
#   tsdb:
#     out_of_order_time_window: 30m

scrape_configs:
  # Make Prometheus scrape itself for metrics.
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]

  - job_name: 'otel-collector'
    static_configs:
      - targets: ["otel-collector:8889"]
    metrics_path: /metrics
    # NOTE: relabel rules run in order; the labeldrop at the end relies on the
    # exported_* labels having already been copied into job/instance above.
    metric_relabel_configs:
      # put the exported job (service name) in the job label
      - source_labels: ["exported_job"]
        regex: (.+)
        action: replace
        target_label: job
        replacement: $1
      # put the exported instance in the instance label
      - source_labels: ["exported_instance"]
        regex: (.*)
        action: replace
        target_label: instance
        replacement: $1
      # remove cardinality on instance when the metric is global scope, this
      # effectively de-duplicates them.
      - source_labels: ["scope"]
        regex: global
        target_label: instance
        replacement: global
        action: replace
      # drop irrelevant labels:
      # - service_instance_id is equal to instance
      # - exported_instance duplicates instance
      # - exported_job is already assigned to job
      # - telemetry_sdk_version increases cardinality, but has little use
      - action: labeldrop
        regex: "^(service_instance_id|exported_instance|exported_job|telemetry_sdk_version)$"
1 change: 1 addition & 0 deletions docs/installation/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,6 @@ this.
config
config_cli
deployment/index
observability/index
oidc
hardware
8 changes: 8 additions & 0 deletions docs/installation/observability/error_monitoring.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
.. _installation_observability_error_monitoring:

Error monitoring
================

Uncaught exceptions are automatically sent to `Sentry <https://sentry.io/>`_, if
configured. It's highly recommended to configure Sentry for proper insight into bugs.

Loading
Loading