From a0c4fa68372e389c7438d121c2a79a97608471db Mon Sep 17 00:00:00 2001 From: Tim de Beer Date: Thu, 20 Nov 2025 11:32:12 +0100 Subject: [PATCH 1/9] =?UTF-8?q?=E2=9E=95=20[maykinmedia/open-api-framework?= =?UTF-8?q?#152]=20Add=20OpenTelemetry=20packages?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements/base.in | 2 +- requirements/base.txt | 76 +++++++++++++++++++++++++++- requirements/ci.txt | 113 ++++++++++++++++++++++++++++++++++++++++++ requirements/dev.txt | 113 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 302 insertions(+), 2 deletions(-) diff --git a/requirements/base.in b/requirements/base.in index 3871a21d..e980947d 100644 --- a/requirements/base.in +++ b/requirements/base.in @@ -14,7 +14,7 @@ notifications-api-common[setup-configuration] zgw-consumers[setup-configuration] mozilla-django-oidc-db[setup-configuration] commonground-api-common[oas] -maykin-common[axes,mfa] +maykin-common[axes,mfa,otel] # celery dependencies # TODO this should be moved to open-api-framework once it is verified that this fixes diff --git a/requirements/base.txt b/requirements/base.txt index 7f5b5853..e348a858 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -225,10 +225,18 @@ glom==23.5.0 # via # -r requirements/base.in # mozilla-django-oidc-db +googleapis-common-protos==1.72.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +grpcio==1.74.0 + # via opentelemetry-exporter-otlp-proto-grpc humanize==4.9.0 # via flower idna==3.7 # via requests +importlib-metadata==8.7.0 + # via opentelemetry-api inflection==0.5.1 # via drf-spectacular iso639-lang==2.6.0 @@ -261,6 +269,57 @@ notifications-api-common==0.7.3 # commonground-api-common open-api-framework==0.13.1 # via -r requirements/base.in +opentelemetry-api==1.38.0 + # via + # maykin-common + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # 
opentelemetry-instrumentation + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-wsgi + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-exporter-otlp==1.38.0 + # via maykin-common +opentelemetry-exporter-otlp-proto-common==1.38.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-exporter-otlp-proto-grpc==1.38.0 + # via opentelemetry-exporter-otlp +opentelemetry-exporter-otlp-proto-http==1.38.0 + # via opentelemetry-exporter-otlp +opentelemetry-instrumentation==0.59b0 + # via + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-wsgi +opentelemetry-instrumentation-django==0.59b0 + # via maykin-common +opentelemetry-instrumentation-wsgi==0.59b0 + # via opentelemetry-instrumentation-django +opentelemetry-proto==1.38.0 + # via + # opentelemetry-exporter-otlp-proto-common + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-resource-detector-containerid==0.59b0 + # via maykin-common +opentelemetry-sdk==1.38.0 + # via + # maykin-common + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-resource-detector-containerid +opentelemetry-semantic-conventions==0.59b0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-wsgi + # opentelemetry-sdk +opentelemetry-util-http==0.59b0 + # via + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-wsgi orderedmultidict==1.0.1 # via furl oyaml==1.0 @@ -269,12 +328,17 @@ packaging==25.0 # via # django-csp # kombu + # opentelemetry-instrumentation phonenumberslite==8.13.30 # via django-two-factor-auth prometheus-client==0.20.0 # via flower prompt-toolkit==3.0.43 # via click-repl +protobuf==6.33.1 + # via + # googleapis-common-protos + # opentelemetry-proto psycopg==3.2.9 # via # -r requirements/base.in @@ -334,6 +398,7 @@ 
requests==2.32.4 # django-log-outgoing-requests # mozilla-django-oidc # open-api-framework + # opentelemetry-exporter-otlp-proto-http # requests-mock # zgw-consumers requests-mock==1.12.1 @@ -366,6 +431,11 @@ tqdm==4.67.1 typing-extensions==4.9.0 # via # mozilla-django-oidc-db + # opentelemetry-api + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-sdk + # opentelemetry-semantic-conventions # psycopg # psycopg-pool # pydantic @@ -394,9 +464,13 @@ webauthn==2.0.0 webencodings==0.5.1 # via bleach wrapt==1.14.1 - # via elastic-apm + # via + # elastic-apm + # opentelemetry-instrumentation zgw-consumers==1.0.0 # via # -r requirements/base.in # commonground-api-common # notifications-api-common +zipp==3.23.0 + # via importlib-metadata diff --git a/requirements/ci.txt b/requirements/ci.txt index 4866e13e..757cf185 100644 --- a/requirements/ci.txt +++ b/requirements/ci.txt @@ -397,6 +397,17 @@ glom==23.5.0 # -c requirements/base.txt # -r requirements/base.txt # mozilla-django-oidc-db +googleapis-common-protos==1.72.0 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +grpcio==1.74.0 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # opentelemetry-exporter-otlp-proto-grpc humanize==4.9.0 # via # -c requirements/base.txt @@ -410,6 +421,11 @@ idna==3.7 # yarl imagesize==1.4.1 # via sphinx +importlib-metadata==8.7.0 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # opentelemetry-api inflection==0.5.1 # via # -c requirements/base.txt @@ -475,6 +491,84 @@ open-api-framework==0.13.1 # via # -c requirements/base.txt # -r requirements/base.txt +opentelemetry-api==1.38.0 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-instrumentation + # opentelemetry-instrumentation-django 
+ # opentelemetry-instrumentation-wsgi + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-exporter-otlp==1.38.0 + # via + # -c requirements/base.txt + # -r requirements/base.txt +opentelemetry-exporter-otlp-proto-common==1.38.0 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-exporter-otlp-proto-grpc==1.38.0 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # opentelemetry-exporter-otlp +opentelemetry-exporter-otlp-proto-http==1.38.0 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # opentelemetry-exporter-otlp +opentelemetry-instrumentation==0.59b0 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-wsgi +opentelemetry-instrumentation-django==0.59b0 + # via + # -c requirements/base.txt + # -r requirements/base.txt +opentelemetry-instrumentation-wsgi==0.59b0 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # opentelemetry-instrumentation-django +opentelemetry-proto==1.38.0 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # opentelemetry-exporter-otlp-proto-common + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-resource-detector-containerid==0.59b0 + # via + # -c requirements/base.txt + # -r requirements/base.txt +opentelemetry-sdk==1.38.0 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-resource-detector-containerid +opentelemetry-semantic-conventions==0.59b0 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # opentelemetry-instrumentation + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-wsgi + # opentelemetry-sdk +opentelemetry-util-http==0.59b0 + # 
via + # -c requirements/base.txt + # -r requirements/base.txt + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-wsgi orderedmultidict==1.0.1 # via # -c requirements/base.txt @@ -491,6 +585,7 @@ packaging==25.0 # -r requirements/base.txt # django-csp # kombu + # opentelemetry-instrumentation # pytest # sphinx phonenumberslite==8.13.30 @@ -510,6 +605,12 @@ prompt-toolkit==3.0.43 # -c requirements/base.txt # -r requirements/base.txt # click-repl +protobuf==6.33.1 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # googleapis-common-protos + # opentelemetry-proto psycopg==3.2.9 # via # -c requirements/base.txt @@ -640,6 +741,7 @@ requests==2.32.4 # django-log-outgoing-requests # mozilla-django-oidc # open-api-framework + # opentelemetry-exporter-otlp-proto-http # requests-mock # sphinx # zgw-consumers @@ -737,6 +839,11 @@ typing-extensions==4.9.0 # -c requirements/base.txt # -r requirements/base.txt # mozilla-django-oidc-db + # opentelemetry-api + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-sdk + # opentelemetry-semantic-conventions # psycopg # psycopg-pool # pydantic @@ -800,6 +907,7 @@ wrapt==1.14.1 # -c requirements/base.txt # -r requirements/base.txt # elastic-apm + # opentelemetry-instrumentation # vcrpy yarl==1.9.4 # via vcrpy @@ -809,3 +917,8 @@ zgw-consumers==1.0.0 # -r requirements/base.txt # commonground-api-common # notifications-api-common +zipp==3.23.0 + # via + # -c requirements/base.txt + # -r requirements/base.txt + # importlib-metadata diff --git a/requirements/dev.txt b/requirements/dev.txt index ab8d3eb8..c8c28ff6 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -453,8 +453,19 @@ glom==23.5.0 # -c requirements/ci.txt # -r requirements/ci.txt # mozilla-django-oidc-db +googleapis-common-protos==1.72.0 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # opentelemetry-exporter-otlp-proto-grpc + # 
opentelemetry-exporter-otlp-proto-http gprof2dot==2024.6.6 # via django-silk +grpcio==1.74.0 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # opentelemetry-exporter-otlp-proto-grpc h11==0.16.0 # via httpcore httpcore==1.0.9 @@ -481,6 +492,11 @@ imagesize==1.4.1 # -c requirements/ci.txt # -r requirements/ci.txt # sphinx +importlib-metadata==8.7.0 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # opentelemetry-api inflection==0.5.1 # via # -c requirements/ci.txt @@ -570,6 +586,84 @@ open-api-framework==0.13.1 # via # -c requirements/ci.txt # -r requirements/ci.txt +opentelemetry-api==1.38.0 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-instrumentation + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-wsgi + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-exporter-otlp==1.38.0 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt +opentelemetry-exporter-otlp-proto-common==1.38.0 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-exporter-otlp-proto-grpc==1.38.0 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # opentelemetry-exporter-otlp +opentelemetry-exporter-otlp-proto-http==1.38.0 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # opentelemetry-exporter-otlp +opentelemetry-instrumentation==0.59b0 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-wsgi +opentelemetry-instrumentation-django==0.59b0 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt +opentelemetry-instrumentation-wsgi==0.59b0 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # opentelemetry-instrumentation-django +opentelemetry-proto==1.38.0 + 
# via + # -c requirements/ci.txt + # -r requirements/ci.txt + # opentelemetry-exporter-otlp-proto-common + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-resource-detector-containerid==0.59b0 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt +opentelemetry-sdk==1.38.0 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-resource-detector-containerid +opentelemetry-semantic-conventions==0.59b0 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # opentelemetry-instrumentation + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-wsgi + # opentelemetry-sdk +opentelemetry-util-http==0.59b0 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-wsgi orderedmultidict==1.0.1 # via # -c requirements/ci.txt @@ -587,6 +681,7 @@ packaging==25.0 # build # django-csp # kombu + # opentelemetry-instrumentation # pytest # sphinx phonenumberslite==8.13.30 @@ -618,6 +713,12 @@ prompt-toolkit==3.0.43 # -r requirements/ci.txt # click-repl # questionary +protobuf==6.33.1 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # googleapis-common-protos + # opentelemetry-proto psycopg==3.2.9 # via # -c requirements/ci.txt @@ -774,6 +875,7 @@ requests==2.32.4 # django-log-outgoing-requests # mozilla-django-oidc # open-api-framework + # opentelemetry-exporter-otlp-proto-http # requests-mock # sphinx # zgw-consumers @@ -926,6 +1028,11 @@ typing-extensions==4.9.0 # -r requirements/ci.txt # anyio # mozilla-django-oidc-db + # opentelemetry-api + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-sdk + # opentelemetry-semantic-conventions # psycopg # psycopg-pool # pydantic @@ -1009,6 +1116,7 @@ wrapt==1.14.1 # -c requirements/ci.txt # -r 
requirements/ci.txt # elastic-apm + # opentelemetry-instrumentation # vcrpy yarl==1.9.4 # via @@ -1021,3 +1129,8 @@ zgw-consumers==1.0.0 # -r requirements/ci.txt # commonground-api-common # notifications-api-common +zipp==3.23.0 + # via + # -c requirements/ci.txt + # -r requirements/ci.txt + # importlib-metadata From 4bfd6db94edc53cebbd64e25c600364e5bd9b54b Mon Sep 17 00:00:00 2001 From: Tim de Beer Date: Thu, 20 Nov 2025 11:40:38 +0100 Subject: [PATCH 2/9] =?UTF-8?q?=F0=9F=94=A7=20[maykinmedia/open-api-framew?= =?UTF-8?q?ork#152]=20Enable=20OpenTelemetry=20integration?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/celery_flower.sh | 3 +++ bin/celery_worker.sh | 5 +++++ bin/docker_start.sh | 5 ++++- bin/setup_configuration.sh | 7 +++++-- dotenv.example | 2 ++ src/objects/conf/ci.py | 2 ++ src/objects/conf/dev.py | 3 +++ src/objects/setup.py | 12 ++++++++++++ 8 files changed, 36 insertions(+), 3 deletions(-) diff --git a/bin/celery_flower.sh b/bin/celery_flower.sh index b8b3327b..f402ae9a 100755 --- a/bin/celery_flower.sh +++ b/bin/celery_flower.sh @@ -2,4 +2,7 @@ set -e +# Set defaults for OTEL +export OTEL_SERVICE_NAME="${OTEL_SERVICE_NAME:-objects-flower}" + exec celery --app objects --workdir src flower diff --git a/bin/celery_worker.sh b/bin/celery_worker.sh index ed1179c5..826fcfd1 100755 --- a/bin/celery_worker.sh +++ b/bin/celery_worker.sh @@ -14,7 +14,12 @@ if [[ "$ENABLE_COVERAGE" ]]; then _binary="coverage run $_binary" fi +# Set defaults for OTEL +export OTEL_SERVICE_NAME="${OTEL_SERVICE_NAME:-objects-worker-"${QUEUE}"}" + echo "Starting celery worker $WORKER_NAME with queue $QUEUE" +# unset this if NOT using a process pool +export _OTEL_DEFER_SETUP="true" exec $_binary --workdir src --app "objects.celery" worker \ -Q $QUEUE \ -n $WORKER_NAME \ diff --git a/bin/docker_start.sh b/bin/docker_start.sh index 2ad9cff3..00ca8669 100755 --- a/bin/docker_start.sh +++ b/bin/docker_start.sh @@ -20,9 +20,12 @@ 
${SCRIPTPATH}/wait_for_db.sh >&2 echo "Database is up." +# Set defaults for OTEL +export OTEL_SERVICE_NAME="${OTEL_SERVICE_NAME:-objects}" + # Apply database migrations >&2 echo "Apply database migrations" -python src/manage.py migrate +OTEL_SDK_DISABLED=True python src/manage.py migrate # Load any JSON fixtures present if [ -d $fixtures_dir ]; then diff --git a/bin/setup_configuration.sh b/bin/setup_configuration.sh index a133e964..d2f87dce 100755 --- a/bin/setup_configuration.sh +++ b/bin/setup_configuration.sh @@ -5,10 +5,13 @@ set -e +# Set defaults for OTEL +export OTEL_SERVICE_NAME="${OTEL_SERVICE_NAME:-objects-setup-configuration}" + if [[ "${RUN_SETUP_CONFIG,,}" =~ ^(true|1|yes)$ ]]; then # wait for required services /wait_for_db.sh - src/manage.py migrate - src/manage.py setup_configuration --yaml-file setup_configuration/data.yaml + OTEL_SDK_DISABLED=True src/manage.py migrate + OTEL_SDK_DISABLED=True src/manage.py setup_configuration --yaml-file setup_configuration/data.yaml fi diff --git a/dotenv.example b/dotenv.example index f9959903..50053069 100644 --- a/dotenv.example +++ b/dotenv.example @@ -8,3 +8,5 @@ DB_USER="objects" DB_PASSWORD="" DB_HOST="" DB_PORT="" + +OTEL_SERVICE_NAME=objects \ No newline at end of file diff --git a/src/objects/conf/ci.py b/src/objects/conf/ci.py index 8e000a0b..4ccbc9ad 100644 --- a/src/objects/conf/ci.py +++ b/src/objects/conf/ci.py @@ -9,6 +9,8 @@ os.environ.setdefault("SECRET_KEY", "dummy") os.environ.setdefault("IS_HTTPS", "no") os.environ.setdefault("ENVIRONMENT", "ci") +os.environ.setdefault("OTEL_SDK_DISABLED", "true") +os.environ.setdefault("OTEL_SERVICE_NAME", "objects-ci") from .base import * # noqa isort:skip diff --git a/src/objects/conf/dev.py b/src/objects/conf/dev.py index d4ec0b98..f7fe4855 100644 --- a/src/objects/conf/dev.py +++ b/src/objects/conf/dev.py @@ -17,6 +17,9 @@ os.environ.setdefault("DB_USER", "objects") os.environ.setdefault("DB_PASSWORD", "objects") 
+os.environ.setdefault("OTEL_SDK_DISABLED", "true") +os.environ.setdefault("OTEL_EXPORTER_OTLP_METRICS_INSECURE", "true") + from .base import * # noqa isort:skip # diff --git a/src/objects/setup.py b/src/objects/setup.py index b96a0054..6b2c3e07 100644 --- a/src/objects/setup.py +++ b/src/objects/setup.py @@ -11,11 +11,13 @@ """ import os +import warnings from django.conf import settings import structlog from dotenv import load_dotenv +from maykin_common.otel import setup_otel logger = structlog.stdlib.get_logger(__name__) @@ -26,6 +28,16 @@ def setup_env(): load_dotenv(dotenv_path) os.environ.setdefault("DJANGO_SETTINGS_MODULE", "objects.conf.dev") + if "OTEL_SERVICE_NAME" not in os.environ: + warnings.warn( + "No OTEL_SERVICE_NAME environment variable set, using a default. " + "You should set a (distinct) value for each component (web, worker...)", + RuntimeWarning, + stacklevel=2, + ) + os.environ.setdefault("OTEL_SERVICE_NAME", "objects") + + setup_otel() structlog.contextvars.bind_contextvars(source="app") From 83abc2fac01b25a30a5925844d6ae7318148eecf Mon Sep 17 00:00:00 2001 From: Tim de Beer Date: Thu, 20 Nov 2025 11:47:42 +0100 Subject: [PATCH 3/9] =?UTF-8?q?=F0=9F=90=B3=20[maykinmedia/open-api-framew?= =?UTF-8?q?ork#152]=20Add=20OTel=20collector=20and=20prometheus=20to=20obs?= =?UTF-8?q?ervability=20stack?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docker/docker-compose.observability.yaml | 47 +++++++----- docker/observability/README.md | 20 +++++ .../observability/grafana/datasources/ds.yml | 28 +++++++ .../otel/otel-collector-config.yml | 62 ++++++++++++++++ .../observability/prometheus/prometheus.yml | 74 +++++++++++++++++++ 5 files changed, 211 insertions(+), 20 deletions(-) create mode 100644 docker/observability/grafana/datasources/ds.yml create mode 100644 docker/observability/otel/otel-collector-config.yml create mode 100644 docker/observability/prometheus/prometheus.yml diff --git 
a/docker/docker-compose.observability.yaml b/docker/docker-compose.observability.yaml index 02f1d6d4..50dfea08 100644 --- a/docker/docker-compose.observability.yaml +++ b/docker/docker-compose.observability.yaml @@ -11,6 +11,16 @@ services: networks: - objects-dev + prometheus: + image: prom/prometheus:latest + ports: + - "9090:9090" + volumes: + - ./observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus-tsdb:/prometheus + networks: + - objects-dev + promtail: image: grafana/promtail:latest volumes: @@ -24,32 +34,28 @@ services: networks: - objects-dev + # open telemetry collector, receives metrics from the application instance(s) + otel-collector: + image: otel/opentelemetry-collector-contrib:0.131.0 + command: --config=/etc/otel-collector-config.yaml + volumes: + - ./observability/otel/otel-collector-config.yml:/etc/otel-collector-config.yaml + ports: + - 4317:4317 + - 4318:4318 + - 8889:8889 + networks: + - objects-dev + grafana: + image: grafana/grafana:latest + volumes: + - ./observability/grafana/datasources:/etc/grafana/provisioning/datasources environment: - GF_PATHS_PROVISIONING=/etc/grafana/provisioning - GF_AUTH_ANONYMOUS_ENABLED=true - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin - GF_FEATURE_TOGGLES_ENABLE=alertingSimplifiedRouting,alertingQueryAndExpressionsStepMode - entrypoint: - - sh - - -euc - - | - mkdir -p /etc/grafana/provisioning/datasources - cat < /etc/grafana/provisioning/datasources/ds.yaml - apiVersion: 1 - datasources: - - name: Loki - type: loki - access: proxy - orgId: 1 - url: http://loki:3100 - basicAuth: false - isDefault: true - version: 1 - editable: false - EOF - /run.sh - image: grafana/grafana:latest ports: - "3000:3000" networks: @@ -57,6 +63,7 @@ services: volumes: promtail-logs: + prometheus-tsdb: networks: objects-dev: diff --git a/docker/observability/README.md b/docker/observability/README.md index 9d82bcc9..1e80489d 100644 --- a/docker/observability/README.md +++ b/docker/observability/README.md @@ 
-23,6 +23,7 @@ You can now navigate to: - http://localhost:3000 for Grafana - http://localhost:3100/ready for Loki readiness - http://localhost:3100/metrics for Loki metrics +- http://localhost:9090 for the Prometheus web interface ## Logging @@ -48,3 +49,22 @@ You can filter application logs based on a request ID: ```logql {job="docker", app="objects-api"} | json | __error__ = "" | request_id=`1e9e1b9d-4d34-4657-99e4-88673d824724` ``` + +## Metrics + +Metrics can be sent using OTLP to the collector at http://localhost:4317 (gRPC). The collector requires basic auth (`otel:supersecret` in this development stack). + +The `maykin_common.otel` module takes care of setting everything up; just make sure to set the +environment variable `OTEL_SDK_DISABLED=false` in development (the SDK is disabled by default). + +The collector ingests the metrics, and they are then scraped by Prometheus. They're also printed to +stdout. + +## Traces + +Traces can be sent using OTLP to the collector at http://localhost:4317 (gRPC). The collector requires basic auth (`otel:supersecret` in this development stack). + +The `maykin_common.otel` module takes care of setting everything up; just make sure to set the +environment variable `OTEL_SDK_DISABLED=false` in development (the SDK is disabled by default). + +The collector ingests the traces and prints them out to stdout.
\ No newline at end of file diff --git a/docker/observability/grafana/datasources/ds.yml b/docker/observability/grafana/datasources/ds.yml new file mode 100644 index 00000000..1c33771b --- /dev/null +++ b/docker/observability/grafana/datasources/ds.yml @@ -0,0 +1,28 @@ +--- + +apiVersion: 1 + +datasources: +- name: Loki + type: loki + access: proxy + orgId: 1 + url: http://loki:3100 + basicAuth: false + isDefault: true + version: 1 + editable: false +- name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + jsonData: + httpMethod: POST + manageAlerts: false + allowAsRecordingRulesTarget: false + prometheusType: Prometheus + prometheusVersion: 3.5.0 + cacheLevel: 'Low' + disableRecordingRules: false + timeInterval: 15s # Prometheus scrape interval + incrementalQueryOverlapWindow: 10m diff --git a/docker/observability/otel/otel-collector-config.yml b/docker/observability/otel/otel-collector-config.yml new file mode 100644 index 00000000..010f0fc5 --- /dev/null +++ b/docker/observability/otel/otel-collector-config.yml @@ -0,0 +1,62 @@ +--- + +extensions: + basicauth/server: + htpasswd: + inline: | + otel:supersecret + +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + auth: + authenticator: basicauth/server + http: + endpoint: 0.0.0.0:4318 + auth: + authenticator: basicauth/server + +processors: + batch: + timeout: 1s + send_batch_size: 1024 + memory_limiter: + check_interval: 1s + limit_mib: 1024 + transform: + metric_statements: + - context: datapoint + statements: + - set(attributes["client"], resource.attributes["maykin.saas.client"]) + - set(attributes["target"], resource.attributes["maykin.saas.target"]) + +exporters: + # NOTE: Prior to v0.86.0 use `logging` instead of `debug`.
+ debug: + verbosity: detailed + # Exposes a metrics endpoint that prometheus can scrape + prometheus: + endpoint: 0.0.0.0:8889 + namespace: otel # this becomes a prefix in the prometheus metrics: `otel_*` + const_labels: + environment: local_otel_collector + resource_to_telemetry_conversion: + enabled: true + +service: + extensions: [basicauth/server] + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [debug] + metrics: + receivers: [otlp] + processors: [memory_limiter, transform, batch] + exporters: [debug, prometheus] + logs: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [debug] diff --git a/docker/observability/prometheus/prometheus.yml b/docker/observability/prometheus/prometheus.yml new file mode 100644 index 00000000..a66979bd --- /dev/null +++ b/docker/observability/prometheus/prometheus.yml @@ -0,0 +1,74 @@ +--- + +# From https://github.com/prometheus/prometheus/blob/main/documentation/examples/prometheus-otlp.yml +# and https://github.com/prometheus/prometheus/blob/main/documentation/examples/prometheus-docker.yml + +global: + scrape_interval: 15s + evaluation_interval: 60s + +# otlp: +# # Recommended attributes to be promoted to labels. +# promote_resource_attributes: +# - service.instance.id +# - service.name +# - service.namespace +# - cloud.availability_zone +# - cloud.region +# - container.name +# - deployment.environment.name +# - k8s.cluster.name +# - k8s.container.name +# - k8s.cronjob.name +# - k8s.daemonset.name +# - k8s.deployment.name +# - k8s.job.name +# - k8s.namespace.name +# - k8s.pod.name +# - k8s.replicaset.name +# - k8s.statefulset.name +# # Ingest OTLP data keeping all characters in metric/label names. +# translation_strategy: NoUTF8EscapingWithSuffixes + +# storage: +# # OTLP is a push-based protocol, Out of order samples is a common scenario. +# tsdb: +# out_of_order_time_window: 30m + +scrape_configs: + # Make Prometheus scrape itself for metrics. 
+ - job_name: "prometheus" + static_configs: + - targets: ["localhost:9090"] + + - job_name: 'otel-collector' + static_configs: + - targets: ["otel-collector:8889"] + metrics_path: /metrics + metric_relabel_configs: + # put the exported job (service name) in the job label + - source_labels: ["exported_job"] + regex: (.+) + action: replace + target_label: job + replacement: $1 + # put the exported instance in the instance label + - source_labels: ["exported_instance"] + regex: (.*) + action: replace + target_label: instance + replacement: $1 + # remove cardinality on instance when the metric is global scope, this + # effectively de-duplicates them. + - source_labels: ["scope"] + regex: global + target_label: instance + replacement: global + action: replace + # drop irrelevant labels: + # - service_instance_id is equal to instance + # - exported_instance duplicates instance + # - exported_job is already assigned to job + # - telemetry_sdk_version increases cardinality, but has little use + - action: labeldrop + regex: "^(service_instance_id|exported_instance|exported_job|telemetry_sdk_version)$" From 06a1222a973d0e0a06730b7f88723fa5cfecb519 Mon Sep 17 00:00:00 2001 From: Tim de Beer Date: Thu, 20 Nov 2025 11:51:02 +0100 Subject: [PATCH 4/9] =?UTF-8?q?=F0=9F=94=A8=20[maykinmedia/open-api-framew?= =?UTF-8?q?ork#152]=20Apply=20some=20best=20practices=20to=20uwsgi?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/docker_start.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bin/docker_start.sh b/bin/docker_start.sh index 00ca8669..08843289 100755 --- a/bin/docker_start.sh +++ b/bin/docker_start.sh @@ -53,6 +53,7 @@ fi # Start server >&2 echo "Starting server" uwsgi \ + --strict \ --ini "${SCRIPTPATH}/uwsgi.ini" \ --http :$uwsgi_port \ --http-keepalive \ @@ -62,6 +63,9 @@ uwsgi \ --static-map /media=/app/media \ --chdir src \ --enable-threads \ + --single-interpreter \ + --die-on-term \ + --need-app \ --processes 
$uwsgi_processes \ --threads $uwsgi_threads \ --buffer-size=65535 From 2a7912282d73d1becfa0816d4374892d7d0f4b2e Mon Sep 17 00:00:00 2001 From: Tim de Beer Date: Thu, 20 Nov 2025 11:52:07 +0100 Subject: [PATCH 5/9] =?UTF-8?q?=F0=9F=90=B3=20[maykinmedia/open-api-framew?= =?UTF-8?q?ork#152]=20Add=20OTel=20env=20vars=20to=20docker-compose?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docker-compose.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docker-compose.yml b/docker-compose.yml index 7c92a589..e427ba0a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -37,6 +37,17 @@ services: SUBPATH: ${SUBPATH} DB_CONN_MAX_AGE: "0" DB_POOL_ENABLED: True + + # Enabling Open Telemetry requires the services in docker/docker-compose.observability.yaml + # to be up and running. + OTEL_SDK_DISABLED: ${OTEL_SDK_DISABLED:-true} + OTEL_RESOURCE_ATTRIBUTES: maykin.saas.client=maykin,maykin.saas.target=dev + OTEL_METRIC_EXPORT_INTERVAL: ${OTEL_METRIC_EXPORT_INTERVAL:-60000} + OTEL_EXPORTER_OTLP_ENDPOINT: ${OTEL_EXPORTER_OTLP_ENDPOINT:-http://otel-collector:4317} # gRPC + # otel:supersecret, escape spaces and = with percent encoding + OTEL_EXPORTER_OTLP_HEADERS: Authorization=Basic b3RlbDpzdXBlcnNlY3JldA== + OTEL_EXPORTER_OTLP_METRICS_INSECURE: ${OTEL_EXPORTER_OTLP_METRICS_INSECURE:-true} + _OTEL_ENABLE_CONTAINER_RESOURCE_DETECTOR: true healthcheck: test: ["CMD", "python", "-c", "import requests; exit(requests.head('http://localhost:8000/admin/').status_code not in [200, 302])"] interval: 30s From 4e728722f72c3b219de43622893a70bf7af5760b Mon Sep 17 00:00:00 2001 From: Tim de Beer Date: Thu, 20 Nov 2025 11:57:07 +0100 Subject: [PATCH 6/9] =?UTF-8?q?=F0=9F=93=88=20[maykinmedia/open-api-framew?= =?UTF-8?q?ork#152]=20Add=20generic=20user=20metrics?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/objects/accounts/apps.py | 3 + src/objects/accounts/metrics.py | 63 
+++++++++++++++++++++ src/objects/accounts/signals.py | 63 +++++++++++++++++++++ src/objects/accounts/tests/test_metrics.py | 64 ++++++++++++++++++++++ 4 files changed, 193 insertions(+) create mode 100644 src/objects/accounts/metrics.py create mode 100644 src/objects/accounts/signals.py create mode 100644 src/objects/accounts/tests/test_metrics.py diff --git a/src/objects/accounts/apps.py b/src/objects/accounts/apps.py index f9c12645..913b5706 100644 --- a/src/objects/accounts/apps.py +++ b/src/objects/accounts/apps.py @@ -23,4 +23,7 @@ class AccountsConfig(AppConfig): name = "objects.accounts" def ready(self): + from . import metrics # noqa + from . import signals # noqa + post_migrate.connect(update_admin_index, sender=self) diff --git a/src/objects/accounts/metrics.py b/src/objects/accounts/metrics.py new file mode 100644 index 00000000..d42a8c46 --- /dev/null +++ b/src/objects/accounts/metrics.py @@ -0,0 +1,63 @@ +from collections.abc import Collection + +from django.db.models import Count, Q + +from opentelemetry import metrics + +from .models import User + +meter = metrics.get_meter("objects.accounts") + + +def count_users(options: metrics.CallbackOptions) -> Collection[metrics.Observation]: + counts: dict[str, int] = User.objects.aggregate( + total=Count("id"), + staff=Count("id", filter=Q(is_staff=True)), + superuser=Count("id", filter=Q(is_superuser=True)), + ) + return ( + metrics.Observation( + counts["total"], + {"scope": "global", "type": "all"}, + ), + metrics.Observation( + counts["staff"], + {"scope": "global", "type": "staff"}, + ), + metrics.Observation( + counts["superuser"], + {"scope": "global", "type": "superuser"}, + ), + ) + + +meter.create_observable_gauge( + name="objects.auth.user_count", + description="The number of application users in the database.", + unit=r"{user}", # no unit so that the _ratio suffix is not added + callbacks=[count_users], +) + +logins = meter.create_counter( + "objects.auth.logins", + unit="1", # unitless count + 
description="The number of successful user logins.", +) + +logouts = meter.create_counter( + "objects.auth.logouts", + unit="1", # unitless count + description="The number of user logouts.", +) + +login_failures = meter.create_counter( + "objects.auth.login_failures", + unit="1", # unitless count + description="The number of failed logins by users, including the admin.", +) + +user_lockouts = meter.create_counter( + "objects.auth.user_lockouts", + unit="1", # unitless count + description="The number of user lockouts because of failed logins.", +) diff --git a/src/objects/accounts/signals.py b/src/objects/accounts/signals.py new file mode 100644 index 00000000..80c43dbf --- /dev/null +++ b/src/objects/accounts/signals.py @@ -0,0 +1,63 @@ +from typing import Literal + +from django.contrib.auth.signals import ( + user_logged_in, + user_logged_out, + user_login_failed, +) +from django.dispatch import receiver +from django.http import HttpRequest + +from axes.signals import user_locked_out + +from .metrics import login_failures, logins, logouts, user_lockouts +from .models import User + + +@receiver(user_logged_in, dispatch_uid="user_logged_in.increment_counter") +def increment_logins_counter( + sender: type[User], request: HttpRequest | None, user: User, **kwargs +) -> None: + logins.add( + 1, + attributes={ + "username": user.username, + "http_target": request.path if request else "", + }, + ) + + +@receiver(user_logged_out, dispatch_uid="user_logged_out.increment_counter") +def increment_logouts_counter( + sender: type[User], request: HttpRequest | None, user: User | None, **kwargs +) -> None: + if user is None: + return + logouts.add(1, attributes={"username": user.username}) + + +@receiver(user_login_failed, dispatch_uid="user_login_failed.increment_counter") +def increment_login_failure_counter( + sender, request: HttpRequest | None = None, **kwargs +): + login_failures.add( + 1, + attributes={"http_target": request.path if request else ""}, + ) + + 
+@receiver(user_locked_out, dispatch_uid="user_locked_out.increment_counter") +def increment_user_locked_out_counter( + sender: Literal["axes"], + request: HttpRequest, + username: str, + ip_address: str, + **kwargs, +) -> None: + user_lockouts.add( + 1, + attributes={ + "http_target": request.path, + "username": username, + }, + ) diff --git a/src/objects/accounts/tests/test_metrics.py b/src/objects/accounts/tests/test_metrics.py new file mode 100644 index 00000000..f2ed34ee --- /dev/null +++ b/src/objects/accounts/tests/test_metrics.py @@ -0,0 +1,64 @@ +from unittest.mock import MagicMock, patch + +from django.contrib.auth import authenticate +from django.test import RequestFactory, TestCase, override_settings + +from maykin_common.tests.otel import MetricsAssertMixin +from opentelemetry.metrics import CallbackOptions + +from ..metrics import count_users, login_failures, user_lockouts +from .factories import UserFactory + + +class UserCountMetricTests(MetricsAssertMixin, TestCase): + def test_count_users_by_type(self): + UserFactory.create_batch(3) + UserFactory.create_batch(2, is_staff=True) + UserFactory.create_batch(4, is_staff=True, is_superuser=True) + + result = count_users(CallbackOptions()) + + counts_by_type = { + observation.attributes["type"]: observation.value + for observation in result + if observation.attributes + } + self.assertEqual( + counts_by_type, + { + "all": 3 + 2 + 4, + "staff": 2 + 4, + "superuser": 4, + }, + ) + self.assertMarkedGlobal(result) + + +class LoginFailuresMetricTests(TestCase): + @patch.object(login_failures, "add", wraps=login_failures.add) + def test_login_failures_tracked(self, mock_add: MagicMock): + request = RequestFactory().post("/admin/login/") + + # invalid credentials, no such user exists + authenticate(request=request, username="foo", password="bar") + + mock_add.assert_called_once_with(1, attributes={"http_target": "/admin/login/"}) + + +@override_settings(AXES_FAILURE_LIMIT=2) +class 
LockoutsMetricTests(TestCase): + @patch.object(user_lockouts, "add", wraps=user_lockouts.add) + def test_no_counter_increment_if_not_yet_locked_out(self, mock_add: MagicMock): + request = RequestFactory().post("/admin/login/") + + with self.subTest(attempt=1, lockout=False): + # invalid credentials, no such user exists + authenticate(request=request, username="foo", password="bar") + + self.assertFalse(mock_add.called) + + with self.subTest(attempt=2, lockout=True): + # invalid credentials, no such user exists + authenticate(request=request, username="foo", password="still wrong") + + self.assertTrue(mock_add.called) From 6abb3565373f4838ab7f68b6470b2adaab343936 Mon Sep 17 00:00:00 2001 From: Tim de Beer Date: Thu, 20 Nov 2025 14:07:33 +0100 Subject: [PATCH 7/9] =?UTF-8?q?=F0=9F=93=88=20[maykinmedia/open-api-framew?= =?UTF-8?q?ork#152]=20Add=20metrics=20for=20objects?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/objects/api/metrics.py | 19 +++++ src/objects/api/v2/views.py | 16 +++- src/objects/tests/v2/test_metrics.py | 115 +++++++++++++++++++++++++++ src/objects/utils/apps.py | 1 + 4 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 src/objects/api/metrics.py create mode 100644 src/objects/tests/v2/test_metrics.py diff --git a/src/objects/api/metrics.py b/src/objects/api/metrics.py new file mode 100644 index 00000000..712836e6 --- /dev/null +++ b/src/objects/api/metrics.py @@ -0,0 +1,19 @@ +from opentelemetry import metrics + +meter = metrics.get_meter("objects.api") + +objects_create_counter = meter.create_counter( + "objects.objects.creates", + description="Amount of objects created (via the API).", + unit="1", +) +objects_update_counter = meter.create_counter( + "objects.objects.updates", + description="Amount of objects updated (via the API).", + unit="1", +) +objects_delete_counter = meter.create_counter( + "objects.objects.deletes", + description="Amount of objects deleted (via the 
API).", + unit="1", +) diff --git a/src/objects/api/v2/views.py b/src/objects/api/v2/views.py index 732b67f8..0ebec27f 100644 --- a/src/objects/api/v2/views.py +++ b/src/objects/api/v2/views.py @@ -23,6 +23,11 @@ from ..filter_backends import OrderingBackend from ..kanalen import KANAAL_OBJECTEN +from ..metrics import ( + objects_create_counter, + objects_delete_counter, + objects_update_counter, +) from ..mixins import GeoMixin, ObjectNotificationMixin from ..pagination import DynamicPageSizePagination from ..serializers import ( @@ -144,8 +149,17 @@ def filter_queryset(self, queryset): # filter on the rest of query params return super().filter_queryset(queryset) + def perform_create(self, serializer): + super().perform_create(serializer) + objects_create_counter.add(1) + + def perform_update(self, serializer): + super().perform_update(serializer) + objects_update_counter.add(1) + def perform_destroy(self, instance): - instance.object.delete() + super().perform_destroy(instance) + objects_delete_counter.add(1) @extend_schema( description="Retrieve all RECORDs of an OBJECT.", diff --git a/src/objects/tests/v2/test_metrics.py b/src/objects/tests/v2/test_metrics.py new file mode 100644 index 00000000..17787540 --- /dev/null +++ b/src/objects/tests/v2/test_metrics.py @@ -0,0 +1,115 @@ +from typing import cast +from unittest.mock import MagicMock, patch + +import requests_mock +from freezegun import freeze_time +from rest_framework.test import APITestCase + +from objects.api.metrics import ( + objects_create_counter, + objects_delete_counter, + objects_update_counter, +) +from objects.core.models import ObjectType +from objects.core.tests.factories import ( + ObjectFactory, + ObjectRecordFactory, + ObjectTypeFactory, +) +from objects.token.constants import PermissionModes +from objects.token.tests.factories import PermissionFactory +from objects.utils.test import TokenAuthMixin + +from ..constants import GEO_WRITE_KWARGS +from ..utils import mock_objecttype, 
mock_objecttype_version, mock_service_oas_get +from .utils import reverse + +OBJECT_TYPES_API = "https://example.com/objecttypes/v1/" + + +@freeze_time("2024-08-31") +class ObjectMetricsTests(TokenAuthMixin, APITestCase): + @classmethod + def setUpTestData(cls): + super().setUpTestData() + cls.object_type = cast( + ObjectType, ObjectTypeFactory(service__api_root=OBJECT_TYPES_API) + ) + PermissionFactory.create( + object_type=cls.object_type, + mode=PermissionModes.read_and_write, + token_auth=cls.token_auth, + ) + + def create_object_with_record(self, diameter: int = 10): + obj = ObjectFactory.create(object_type=self.object_type) + ObjectRecordFactory.create( + object=obj, + version=1, + data={"diameter": diameter}, + start_at="2024-08-31", + ) + return obj + + @requests_mock.Mocker() + @patch.object(objects_create_counter, "add", wraps=objects_create_counter.add) + def test_objects_create_counter(self, m, mock_add: MagicMock): + mock_service_oas_get(m, OBJECT_TYPES_API, "objecttypes") + m.get( + f"{self.object_type.url}/versions/1", + json=mock_objecttype_version(self.object_type.url), + ) + m.get(self.object_type.url, json=mock_objecttype(self.object_type.url)) + + url = reverse("object-list") + data = { + "type": self.object_type.url, + "record": { + "typeVersion": 1, + "data": {"diameter": 10}, + "startAt": "2024-08-31", + }, + } + response = self.client.post(url, data, **GEO_WRITE_KWARGS) + self.assertEqual(response.status_code, 201) + mock_add.assert_called_once_with(1) + + @requests_mock.Mocker() + @patch.object(objects_update_counter, "add", wraps=objects_update_counter.add) + def test_objects_update_counter(self, m, mock_add: MagicMock): + mock_service_oas_get(m, OBJECT_TYPES_API, "objecttypes") + m.get( + f"{self.object_type.url}/versions/1", + json=mock_objecttype_version(self.object_type.url), + ) + m.get(self.object_type.url, json=mock_objecttype(self.object_type.url)) + + obj = self.create_object_with_record() + url = reverse("object-detail", 
args=[obj.uuid]) + data = { + "record": { + "typeVersion": 1, + "data": {"diameter": 20}, + "startAt": "2024-08-31", + } + } + response = self.client.patch(url, data, **GEO_WRITE_KWARGS) + self.assertEqual(response.status_code, 200) + mock_add.assert_called_once_with(1) + + @requests_mock.Mocker() + @patch.object(objects_delete_counter, "add", wraps=objects_delete_counter.add) + def test_objects_delete_counter(self, m, mock_add: MagicMock): + mock_service_oas_get(m, OBJECT_TYPES_API, "objecttypes") + m.get( + f"{self.object_type.url}/versions/1", + json=mock_objecttype_version(self.object_type.url), + ) + m.get(self.object_type.url, json=mock_objecttype(self.object_type.url)) + + obj = self.create_object_with_record() + url = reverse("object-detail", args=[obj.uuid]) + response = self.client.delete(url, **GEO_WRITE_KWARGS) + self.assertEqual(response.status_code, 204) + + mock_add.assert_called_once_with(1) diff --git a/src/objects/utils/apps.py b/src/objects/utils/apps.py index db0c2933..0f914f11 100644 --- a/src/objects/utils/apps.py +++ b/src/objects/utils/apps.py @@ -24,6 +24,7 @@ class UtilsConfig(AppConfig): def ready(self): from . 
import oas_extensions # noqa + from ..api import metrics # noqa unregister_camelize_filter_extension() From f40eeae40760408a6717ee27fa2dc772704bd427 Mon Sep 17 00:00:00 2001 From: Tim de Beer Date: Thu, 20 Nov 2025 14:16:28 +0100 Subject: [PATCH 8/9] =?UTF-8?q?=F0=9F=93=9D=20[maykinmedia/open-api-framew?= =?UTF-8?q?ork#152]=20add=20observability=20docs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/installation/index.rst | 1 + .../observability/error_monitoring.rst | 8 ++ docs/installation/observability/index.rst | 32 ++++++ .../observability}/logging.rst | 53 +++++++++ docs/installation/observability/metrics.rst | 108 ++++++++++++++++++ .../observability/otel_config.rst | 52 +++++++++ docs/installation/observability/tracing.rst | 15 +++ docs/manual/index.rst | 1 - src/objects/api/v2/views.py | 2 +- 9 files changed, 270 insertions(+), 2 deletions(-) create mode 100644 docs/installation/observability/error_monitoring.rst create mode 100644 docs/installation/observability/index.rst rename docs/{manual => installation/observability}/logging.rst (77%) create mode 100644 docs/installation/observability/metrics.rst create mode 100644 docs/installation/observability/otel_config.rst create mode 100644 docs/installation/observability/tracing.rst diff --git a/docs/installation/index.rst b/docs/installation/index.rst index 95ca5f28..a26f7668 100644 --- a/docs/installation/index.rst +++ b/docs/installation/index.rst @@ -19,5 +19,6 @@ this. config config_cli deployment/index + observability/index oidc hardware diff --git a/docs/installation/observability/error_monitoring.rst b/docs/installation/observability/error_monitoring.rst new file mode 100644 index 00000000..0679f0c0 --- /dev/null +++ b/docs/installation/observability/error_monitoring.rst @@ -0,0 +1,8 @@ +.. _installation_observability_error_monitoring: + +Error monitoring +================ + +Uncaught exceptions are automatically sent to `Sentry `_, if +configured. 
It's highly recommended to configure Sentry for proper insight into bugs. + diff --git a/docs/installation/observability/index.rst b/docs/installation/observability/index.rst new file mode 100644 index 00000000..798015b3 --- /dev/null +++ b/docs/installation/observability/index.rst @@ -0,0 +1,32 @@ +.. _installation_observability_index: + +============= +Observability +============= + +Observability is an umbrella term for a number of principles and technologies to get +insight in running (distributed) systems. It typically focuses on Metrics, Logging and +Tracing, which provide insight in: + +* what the application is doing, in particular as part of a larger system, such as + microservice environments +* performance of the system +* how the system is used + +Objects API operates in distributed environments, and to be able to fully trace a +customer request from start to end, observability tools are crucial. Below we provide +some additional context for infrastructure teams that wish to integrate Objects API in +their observability stack. + +.. toctree:: + :maxdepth: 1 + :caption: Contents + + logging + metrics + tracing + error_monitoring + otel_config + +.. seealso:: The base integration layer is provided through our shared library, which + includes some `architecture documentation `_. diff --git a/docs/manual/logging.rst b/docs/installation/observability/logging.rst similarity index 77% rename from docs/manual/logging.rst rename to docs/installation/observability/logging.rst index f36e6e82..ca394ead 100644 --- a/docs/manual/logging.rst +++ b/docs/installation/observability/logging.rst @@ -1,3 +1,56 @@ +.. _installation_observability_logging: + +======= +Logging +======= + +Logging is the practice of emitting log messages that describe what is happening in the +system, or "events" in short. Log events can have varying degrees of severity, such as +``debug``, ``info``, ``warning``, ``error`` or even ``critical``. By default, Objects API
By default, Objects API +emits logs with level ``info`` and higher. + +A collection of log events with a correlation ID (like a request or trace ID) allow one +to reconstruct the chain of events that took place which lead to a particular outcome. + +Objects API emits structured logs in JSON format (unless explicitly configured otherwise), +which should make log aggregation and analysis easier. + +We try to keep a consistent log message structure, where the following keys +are (usually) present: + +``source`` + The component in the application stack that produced the log entry. Typical + values are ``uwsgi`` and ``app``. + +``level`` + The severity level of the log message. One of ``debug``, ``info``, ``warning``, + ``error`` or ``critical``. + +``timestamp`` + The moment when the log entry was produced, a string in ISO-8601 format. Most of + the logs have microsecond precision, but some of them are limited to second + precision. + +``event`` + The event that occurred, e.g. ``request_started`` or ``spawned worker (PID 123)``. + This gives the semantic meaning to the log entry. + +Other keys that frequently occur are: + +``request_id`` + Present for application logs emitted during an HTTP request, makes it possible to + correlate multiple log entries for a single request. Not available in logs emitted + by background tasks or logs emitted before/after the Objects API app. + +.. tip:: Certain log aggregation solutions require you to configure "labels" to extract + for efficient querying. You can use the above summary of log context keys to configure + this according to your needs. + +.. note:: We can not 100% guarantee that every log message will always be JSON due to + limitations in third party software/packages that we use. Most (if not all) log + aggregation technologies support handling both structured and unstructured logs. + + .. 
_manual_logging: Logging diff --git a/docs/installation/observability/metrics.rst b/docs/installation/observability/metrics.rst new file mode 100644 index 00000000..90ddafb4 --- /dev/null +++ b/docs/installation/observability/metrics.rst @@ -0,0 +1,108 @@ +.. _installation_observability_metrics: + +======= +Metrics +======= + +Objects API produces application metrics (using Open Telemetry). + +.. note:: The exact metric names that show up may be transformed, e.g. Prometheus replaces + periods with underscores, and processing pipelines may add prefixes or suffixes. + +.. important:: Some metrics are defined as "global scope". + + These metrics are typically derived from application state introspection, e.g. by + performing database (read) queries to aggregate some information. Usually those + correspond to an `Asynchronous Gauge `_. + + Multiple replicas and/or instances of the same service will produce the same values + of the metrics. You need to apply some kind of aggregation to de-duplicate these + values. The attribute ``scope="global"`` acts as a marker for these type of metrics. + + With PromQL for example, you can use ``avg`` on the assumption that all values will + be equal, so the average will also be identical: + + .. code-block:: promql + + avg by (type) (otel_objects_auth_user_count{scope="global"}) + +Generic +======= + +``http.server.duration`` + Captures how long each HTTP request took, in ms. The metric produces histogram data. + +``http.server.request.duration`` (not active) + The future replacement of ``http.server.duration``, in seconds. Currently not + enabled, but the code is in the Open Telemetry SDK instrumentation already and could + possibly be opted-in to. + +Application specific +==================== + +Accounts +-------- + +``objects.auth.user_count`` + Reports the number of users in the database. This is a global metric, you must take + care in de-duplicating results. 
Additional attributes are: + + - ``scope`` - fixed, set to ``global`` to enable de-duplication. + - ``type`` - the user type. ``all``, ``staff`` or ``superuser``. + + Sample PromQL query: + + .. code-block:: promql + + max by (type) (last_over_time( + otel_objects_auth_user_count{scope="global"} + [1m] + )) + +``objects.auth.login_failures`` + A counter incremented every time a user login fails (typically because of invalid + credentials). Does not include the second factor, if enabled. Additional attributes: + + - ``http_target`` - the request path where the login failure occurred, if this + happened in a request context. + +``objects.auth.user_lockouts`` + A counter incremented every time a user is locked out because they reached the + maximum number of failed attempts. Additional attributes: + + - ``http_target`` - the request path where the login failure occurred, if this + happened in a request context. + - ``username`` - username of the user trying to log in. + +``objects.auth.logins`` + Counter incrementing on every successful login by a user. Additional attributes: + + - ``http_target`` - the request path where the login occurred, if this + happened in a request context. + - ``username`` - username of the user who logged in. + +``objects.auth.logouts`` + Counter incrementing every time a user logs out. Additional attributes: + + - ``username`` - username of the user who logged out. + +Objects +------- + +``objects.objects.creates`` + Reports the number of objects created via the API. + +``objects.objects.updates`` + Reports the number of objects updated via the API. + +``objects.objects.deletes`` + Reports the number of objects deleted via the API. + +The objects metrics show how many entities are created, updated, or deleted via the API, +helping to monitor load and the most frequent operations, and allow for various aggregations on the data. + + Sample PromQL query: + + ..
code-block:: promql + + sum by (otel_scope_name) (otel_objects_zaken_updates_total) diff --git a/docs/installation/observability/otel_config.rst b/docs/installation/observability/otel_config.rst new file mode 100644 index 00000000..23878b1b --- /dev/null +++ b/docs/installation/observability/otel_config.rst @@ -0,0 +1,52 @@ +.. _installation_observability_otel_config: + +============================ +Open Telemetry Configuration +============================ + +You should be able to use the standard Open Telemetry +`environment variables `_, +but we highlight some that you'd commonly want to specify for typical use cases. + +Disabling Open Telemetry +======================== + +Set ``OTEL_SDK_DISABLED=true`` to disable telemetry entirely. This does not affect the +(structured) logging to the container stdout/stderr. + +Configuring the Open Telemetry sink +=================================== + +Enabling Open Telemetry (enabled by default) requires you to have a "sink" to push the +telemetry data to. Objects API only supports the Open Telemetry Protocol (OTLP). You can +use any vendor that supports this protocol (over gRPC or HTTP/protobuf). + +.. tip:: We recommend the usage of the Open Telemetry + `Collector `_ as sink - you are then in + full control of how telemetry is processed and exported. + +**Environment variables you likely want to set** + +* ``OTEL_EXPORTER_OTLP_ENDPOINT``: network address where to send the metrics to. Examples + are: ``https://otel.example.com:4318`` or ``http://otel-collector.namespace.cluster.svc:4317``. + It defaults to ``localhost:4317``, which will **not** work in a container context. + +* ``OTEL_EXPORTER_OTLP_METRICS_INSECURE``: set to ``true`` if the endpoint is not protected + with TLS. + +* ``OTEL_EXPORTER_OTLP_HEADERS``: Any additional HTTP headers, e.g. when your collector + is username/password protected with Basic auth, you want something like: + ``Authorization=Basic ``.
+ +* ``OTEL_EXPORTER_OTLP_PROTOCOL``: controls the wire protocol for the OTLP data. Defaults to + ``grpc``. Available options: ``grpc`` and ``http/protobuf``. + +* ``OTEL_METRIC_EXPORT_INTERVAL``: controls how often (in milliseconds) the metrics are + exported. The exports run in a background thread and should not affect the performance + of the application. The default is every minute (``60000``). + +* ``_OTEL_ENABLE_CONTAINER_RESOURCE_DETECTOR=true``: enable this when not deploying on + Kubernetes, but in another container runtime like Docker or Podman. + + .. tip:: On Kubernetes, use the Collector + `attributes processor `_. diff --git a/docs/installation/observability/tracing.rst b/docs/installation/observability/tracing.rst new file mode 100644 index 00000000..0a4070aa --- /dev/null +++ b/docs/installation/observability/tracing.rst @@ -0,0 +1,15 @@ +.. _installation_observability_tracing: + +======= +Tracing +======= + +.. note:: A vendor-agnostic implementation is under development. Currently you can + already use Elastic APM. + +Tracing makes it possible to follow the flow of requests across system boundaries, +e.g. from one application to another. This makes it possible to pinpoint where errors +or performance degradations are situated exactly. Trace IDs also make it possible to +correlate the relevant log entries. + +.. note:: Better support for (distributed) traces is underway.
diff --git a/docs/manual/index.rst b/docs/manual/index.rst index a04875b3..d00424a4 100644 --- a/docs/manual/index.rst +++ b/docs/manual/index.rst @@ -7,5 +7,4 @@ Manual :maxdepth: 1 :caption: Further reading - logging scripts diff --git a/src/objects/api/v2/views.py b/src/objects/api/v2/views.py index 0ebec27f..d89c0762 100644 --- a/src/objects/api/v2/views.py +++ b/src/objects/api/v2/views.py @@ -158,7 +158,7 @@ def perform_update(self, serializer): objects_update_counter.add(1) def perform_destroy(self, instance): - super().perform_destroy(instance) + instance.object.delete() objects_delete_counter.add(1) @extend_schema( From 3ee993954e0c2b1dace6c03df0cd3c24792eb39d Mon Sep 17 00:00:00 2001 From: Tim de Beer Date: Mon, 24 Nov 2025 15:08:38 +0100 Subject: [PATCH 9/9] :recycle: [maykinmedia/open-api-framework#152] refactor naming --- docs/installation/observability/metrics.rst | 8 ++++---- src/objects/api/metrics.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/installation/observability/metrics.rst b/docs/installation/observability/metrics.rst index 90ddafb4..c94df222 100644 --- a/docs/installation/observability/metrics.rst +++ b/docs/installation/observability/metrics.rst @@ -89,13 +89,13 @@ Accounts Objects ------- -``objects.objects.creates`` +``objects.object.creates`` Reports the number of objects created via the API. -``objects.objects.updates`` +``objects.object.updates`` Reports the number of objects updated via the API. -``objects.objects.deletes`` +``objects.object.deletes`` Reports the number of objects deleted via the API. The objects metrics show how many entities are created, updated, or deleted via the API, @@ -105,4 +105,4 @@ helping to monitor load and the most frequent operations, and allow for various .. 
code-block:: promql - sum by (otel_scope_name) (otel_objects_zaken_updates_total) + sum by (otel_scope_name) (otel_objects_object_updates_total) diff --git a/src/objects/api/metrics.py b/src/objects/api/metrics.py index 712836e6..ad9de2f8 100644 --- a/src/objects/api/metrics.py +++ b/src/objects/api/metrics.py @@ -3,17 +3,17 @@ meter = metrics.get_meter("objects.api") objects_create_counter = meter.create_counter( - "objects.objects.creates", + "objects.object.creates", description="Amount of objects created (via the API).", unit="1", ) objects_update_counter = meter.create_counter( - "objects.objects.updates", + "objects.object.updates", description="Amount of objects updated (via the API).", unit="1", ) objects_delete_counter = meter.create_counter( - "objects.objects.deletes", + "objects.object.deletes", description="Amount of objects deleted (via the API).", unit="1", )