From 878a3dd8fee7b0978179560fc64adb330418c2f8 Mon Sep 17 00:00:00 2001 From: Ryan Crichton Date: Tue, 8 Nov 2022 12:36:20 +0200 Subject: [PATCH 1/2] Initial logging config --- .../roles/docker/files/docker-daemon.json | 3 +- monitoring/docker-compose.dev.yml | 12 + monitoring/docker-compose.yml | 70 ++ .../logging-universal-dashboard_rev1.json | 926 ++++++++++++++++++ monitoring/loki/loki-config-minio.yml | 20 + monitoring/loki/loki-config.yml | 65 ++ monitoring/package-metadata.json | 2 +- monitoring/promtail/promtail-config.yml | 47 + monitoring/swarm.sh | 5 +- 9 files changed, 1145 insertions(+), 5 deletions(-) create mode 100644 monitoring/grafana/dashboards/containers/logging-universal-dashboard_rev1.json create mode 100644 monitoring/loki/loki-config-minio.yml create mode 100644 monitoring/loki/loki-config.yml create mode 100644 monitoring/promtail/promtail-config.yml diff --git a/infrastructure/ansible/roles/docker/files/docker-daemon.json b/infrastructure/ansible/roles/docker/files/docker-daemon.json index b8ab8c9d..54d65077 100644 --- a/infrastructure/ansible/roles/docker/files/docker-daemon.json +++ b/infrastructure/ansible/roles/docker/files/docker-daemon.json @@ -2,6 +2,7 @@ "log-driver": "json-file", "log-opts": { "max-size": "10m", - "max-file": "3" + "max-file": "3", + "labels-regex": "^.+" } } diff --git a/monitoring/docker-compose.dev.yml b/monitoring/docker-compose.dev.yml index aaaf782a..751b8338 100644 --- a/monitoring/docker-compose.dev.yml +++ b/monitoring/docker-compose.dev.yml @@ -12,3 +12,15 @@ services: - target: 9090 published: 9090 mode: host + + loki: + ports: + - target: 3100 + published: 3100 + mode: host + + minio1: + ports: + - target: 9001 + published: 9001 + mode: host diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml index 5cb5ded9..e77d94a6 100644 --- a/monitoring/docker-compose.yml +++ b/monitoring/docker-compose.yml @@ -26,6 +26,8 @@ services: source: kminion-groups_rev1.json - target: 
/etc/grafana/provisioning/dashboards/applications/kminion-topic_rev1.json source: kminion-topic_rev1.json + - target: /etc/grafana/provisioning/dashboards/containers/logging-universal-dashboard_rev1.json + source: logging-universal-dashboard_rev1.json prometheus: image: prom/prometheus:v2.38.0 @@ -71,6 +73,56 @@ services: - KAFKA_BROKER_LIST=kafka:9092 - KAFKA_COMPRESSION=gzip + loki: + image: grafana/loki:2.6.1 + volumes: + - loki_data:/tmp/loki + configs: + - target: /etc/loki/loki-config.yml + source: loki-config.yml + command: -config.file=/etc/loki/loki-config.yml + deploy: + labels: + - prometheus-job-service=loki + - prometheus-address=loki:3100 + + promtail: + image: grafana/promtail:2.6.1 + volumes: + - /var/lib/docker/containers:/host/containers + - /var/log:/var/log:ro + configs: + - target: /etc/promtail/promtail-config.yml + source: promtail-config.yml + command: -config.file=/etc/promtail/promtail-config.yml + deploy: + mode: global + + minio1: + image: quay.io/minio/minio:RELEASE.2022-10-24T18-35-07Z + entrypoint: sh + command: -c 'mkdir -p /data1/loki /data2/loki && minio server --console-address ":9001" http://minio{1...1}/data{1...2}' + environment: + MINIO_ROOT_USER: minioadmin + MINIO_ROOT_PASSWORD: minioadmin + healthcheck: + test: + [ + "CMD", + "curl", + "-f", + "http://localhost:9000/minio/health/live" + ] + interval: 30s + timeout: 20s + retries: 3 + hostname: minio1 + volumes: + - minio_data1:/data1 + - minio_data2:/data2 + deploy: + replicas: 1 + configs: grafana.ini: file: ./grafana/grafana.ini @@ -112,12 +164,30 @@ configs: name: kminion-topic_rev1.json-${kminion_topic_rev1_json_DIGEST:?err} labels: name: grafana + logging-universal-dashboard_rev1.json: + file: ./grafana/dashboards/containers/logging-universal-dashboard_rev1.json + name: logging-universal-dashboard_rev1.json-${logging_universal_dashboard_rev1_json_DIGEST:?err} + labels: + name: grafana prometheus.yml: file: ./prometheus/prometheus.yml name: 
prometheus.yml-${prometheus_yml_DIGEST:?err} labels: name: prometheus + loki-config.yml: + file: ./loki/loki-config.yml + name: loki-config.yml-${loki_config_yml_DIGEST:?err} + labels: + name: loki + promtail-config.yml: + file: ./promtail/promtail-config.yml + name: promtail-config.yml-${promtail_config_yml_DIGEST:?err} + labels: + name: promtail volumes: prometheus_data: grafana_data: + loki_data: + minio_data1: + minio_data2: diff --git a/monitoring/grafana/dashboards/containers/logging-universal-dashboard_rev1.json b/monitoring/grafana/dashboards/containers/logging-universal-dashboard_rev1.json new file mode 100644 index 00000000..8d3bf322 --- /dev/null +++ b/monitoring/grafana/dashboards/containers/logging-universal-dashboard_rev1.json @@ -0,0 +1,926 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "Universal and flexible dashboard for logging", + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": 12611, + "graphTooltip": 0, + "id": 11, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "description": "Total Count of log lines in the specified time range", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "color": "rgb(31, 255, 7)", + "text": "0" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(31, 255, 7)", + "value": null + }, + { + "color": "rgb(31, 255, 7)", + "value": 10 + }, + { + "color": "rgb(31, 255, 7)", + "value": 50 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + 
"h": 3, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 11, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "sum" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "editorMode": "code", + "expr": "sum(count_over_time(({swarm_service_name=\"$service_name\", stream=~\"$stream\", swarm_task_name=~\"$task_name\"})[$__interval]))", + "hide": false, + "queryType": "range", + "refId": "A" + } + ], + "title": "Total Count of logs", + "type": "stat" + }, + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "description": "Total Count: of $searchable_pattern in the specified time range", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "color": "rgb(222, 15, 43)", + "text": "0" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(222, 15, 43)", + "value": null + }, + { + "color": "rgb(222, 15, 43)", + "value": 10 + }, + { + "color": "rgb(222, 15, 43)", + "value": 50 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 6, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "sum" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "editorMode": "code", + "expr": "sum(count_over_time(({swarm_service_name=\"$service_name\", stream=~\"$stream\", swarm_task_name=~\"$task_name\"} |~ 
\"(?i)$searchable_pattern\")[$__interval]))", + "hide": false, + "queryType": "range", + "refId": "A" + } + ], + "title": "Total Count: of \"$searchable_pattern\"", + "type": "stat" + }, + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "description": "Live logs is a like 'tail -f | grep' in a real time", + "gridPos": { + "h": 22, + "w": 24, + "x": 0, + "y": 3 + }, + "id": 2, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": true, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "editorMode": "code", + "expr": "{swarm_service_name=\"$service_name\", stream=~\"$stream\", swarm_task_name=~\"$task_name\"} |~ \"(?i)$searchable_pattern\"", + "hide": false, + "queryType": "range", + "refId": "A" + } + ], + "title": "Live logs (filtered by \"$searchable_pattern\")", + "type": "logs" + }, + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 7, + "x": 0, + "y": 25 + }, + "id": 19, + "links": [], + "maxDataPoints": 100, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "7.0.4", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "editorMode": "code", + "expr": "sum(count_over_time(({swarm_service_name=\"$service_name\", stream=~\"$stream\", 
swarm_task_name=~\"$task_name\"})[$__interval])) by (stream)", + "hide": false, + "queryType": "range", + "refId": "A" + } + ], + "title": "Total count of stderr / stdout pie", + "type": "piechart" + }, + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 7, + "y": 25 + }, + "id": 20, + "interval": "1m", + "links": [], + "maxDataPoints": "", + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "pieType": "donut", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "6.4.3", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "editorMode": "code", + "expr": "sum(count_over_time(({swarm_service_name=\"$service_name\", stream=~\"$stream\", swarm_task_name=~\"$task_name\"} |~ \"(?i)$searchable_pattern\")[$__interval])) by (swarm_task_name)", + "queryType": "range", + "refId": "A" + } + ], + "title": "Matched word: \"$searchable_pattern\" donut", + "type": "piechart" + }, + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "color": "#299c46", + "text": "0" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#299c46", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 10 + }, + { + "color": "#C4162A", + "value": 50 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 5, + "x": 19, + "y": 25 + }, + 
"id": 9, + "links": [], + "maxDataPoints": 100, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": false + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "editorMode": "code", + "expr": "sum(count_over_time(({swarm_service_name=\"$service_name\", stream=~\"$stream\", swarm_task_name=~\"$task_name\"} |~ \"(?i)$searchable_pattern\")[$__interval])) * 100 / sum(count_over_time(({swarm_service_name=\"$service_name\", stream=~\"$stream\", swarm_task_name=~\"$task_name\"})[$__interval]))", + "hide": false, + "queryType": "range", + "refId": "A" + } + ], + "title": "\"$searchable_pattern\" Percentage for specified time", + "type": "gauge" + }, + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Count", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 100, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 18, + "interval": "1m", + "links": [], + "maxDataPoints": "", + "options": { + "legend": { + "calcs": [ + "lastNotNull" + 
], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "editorMode": "code", + "expr": "sum(count_over_time(({swarm_service_name=\"$service_name\", stream=~\"$stream\", swarm_task_name=~\"$task_name\"} |~ \"(?i)$searchable_pattern\")[$__interval])) by (swarm_task_name)", + "queryType": "range", + "refId": "A" + } + ], + "title": "Matched word: \"$searchable_pattern\" historical", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 100, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "log": 10, + "type": "log" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 10, + "links": [], + "maxDataPoints": 100, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "editorMode": "code", + 
"expr": "sum(rate(({swarm_service_name=\"$service_name\", stream=~\"$stream\", swarm_task_name=~\"$task_name\"} |~ \"(?i)$searchable_pattern\")[30s])) by (swarm_task_name)", + "hide": false, + "queryType": "range", + "refId": "A" + } + ], + "title": "\"$searchable_pattern\" Rate per Task", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 6, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "always", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "{stream=\"stderr\"} stderr" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C4162A", + "mode": "fixed" + } + }, + { + "id": "custom.lineWidth", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "{stream=\"stdout\"} stdout" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#56A64B", + "mode": "fixed" + } + }, + { + "id": "custom.lineWidth", + "value": 2 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 41 + }, + "id": 7, + "links": [], + "maxDataPoints": 100, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + 
"mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "editorMode": "code", + "expr": "sum(count_over_time(({swarm_service_name=\"$service_name\", stream=~\"$stream\", swarm_task_name=~\"$task_name\"})[$__interval])) by (stream)", + "hide": false, + "queryType": "range", + "refId": "A" + } + ], + "title": "Count of stderr / stdout historical", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 37, + "style": "dark", + "tags": [ + "Loki", + "logging" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "instant_prometheus-kafka-adapter", + "value": "instant_prometheus-kafka-adapter" + }, + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "definition": "label_values({swarm_service_name=~\".+\"}, swarm_service_name)", + "hide": 0, + "includeAll": false, + "label": "Service", + "multi": false, + "name": "service_name", + "options": [], + "query": "label_values({swarm_service_name=~\".+\"}, swarm_service_name)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "loki", + "uid": "P00201832B18B88C3" + }, + "definition": "label_values({swarm_service_name=\"$service_name\"}, swarm_task_name)", + "hide": 0, + "includeAll": true, + "label": "Task", + "multi": true, + "name": "task_name", + "options": [], + "query": "label_values({swarm_service_name=\"$service_name\"}, swarm_task_name)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "loki", + "uid": 
"P00201832B18B88C3" + }, + "definition": "label_values({swarm_service_name=\"$service_name\"}, stream)", + "hide": 0, + "includeAll": true, + "label": "Stream", + "multi": false, + "name": "stream", + "options": [], + "query": "label_values({swarm_service_name=\"$service_name\"}, stream)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "error", + "value": "error" + }, + "hide": 0, + "label": "Search (case insensitive)", + "name": "searchable_pattern", + "options": [ + { + "selected": true, + "text": "error", + "value": "error" + } + ], + "query": "error", + "skipUrlSync": false, + "type": "textbox" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Container logs", + "uid": "fRIvzUZMf", + "version": 4, + "weekStart": "" +} diff --git a/monitoring/loki/loki-config-minio.yml b/monitoring/loki/loki-config-minio.yml new file mode 100644 index 00000000..46020dac --- /dev/null +++ b/monitoring/loki/loki-config-minio.yml @@ -0,0 +1,20 @@ +storage_config: + aws: + # Note: use a fully qualified domain name, like localhost. 
+ # full example: http://loki:supersecret@localhost.:9000 + s3: http<s>://<username>:<secret>@<fqdn>:<port> + s3forcepathstyle: true + boltdb_shipper: + active_index_directory: /loki/boltdb-shipper-active + cache_location: /loki/boltdb-shipper-cache + shared_store: s3 + +schema_config: + configs: + - from: 2020-07-01 + store: boltdb-shipper + object_store: aws + schema: v11 + index: + prefix: index_ + period: 24h diff --git a/monitoring/loki/loki-config.yml b/monitoring/loki/loki-config.yml new file mode 100644 index 00000000..83abdf9a --- /dev/null +++ b/monitoring/loki/loki-config.yml @@ -0,0 +1,65 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + grpc_server_max_concurrent_streams: 0 + +ingester: + wal: + enabled: true + dir: /tmp/wal + lifecycler: + address: 127.0.0.1 + ring: + kvstore: + store: inmemory + replication_factor: 1 + final_sleep: 0s + chunk_idle_period: 1h + max_chunk_age: 1h + chunk_target_size: 1048576 + chunk_retain_period: 30s + max_transfer_retries: 0 + +schema_config: + configs: + - from: 2020-07-01 + store: boltdb-shipper + object_store: aws + schema: v11 + index: + prefix: index_ + period: 24h + +storage_config: + boltdb_shipper: + active_index_directory: /loki/boltdb-shipper-active + cache_location: /loki/boltdb-shipper-cache + resync_interval: 30s + shared_store: s3 + aws: + # Note: use a fully qualified domain name, like localhost. 
+ # full example: http://loki:supersecret@localhost.:9000 + s3: http://minioadmin:minioadmin@minio1.:9000/loki + s3forcepathstyle: true + +compactor: + working_directory: /loki/boltdb-shipper-compactor + shared_store: s3 + +limits_config: + reject_old_samples: true + reject_old_samples_max_age: 168h + ingestion_rate_mb: 100 + ingestion_burst_size_mb: 150 + max_concurrent_tail_requests: 200 + max_cache_freshness_per_query: 10m + max_streams_per_user: 500 + +chunk_store_config: + max_look_back_period: 0s + +table_manager: + retention_deletes_enabled: false + retention_period: 0s diff --git a/monitoring/package-metadata.json b/monitoring/package-metadata.json index 6532c538..7c155e7f 100644 --- a/monitoring/package-metadata.json +++ b/monitoring/package-metadata.json @@ -4,7 +4,7 @@ "description": "A package for monitoring the platform services", "type": "infrastructure", "version": "0.0.1", - "dependencies": ["message-bus-kafka"], + "dependencies": [], "environmentVariables": { "STATEFUL_NODES": "single", "GF_SECURITY_ADMIN_USER": "admin", diff --git a/monitoring/promtail/promtail-config.yml b/monitoring/promtail/promtail-config.yml new file mode 100644 index 00000000..1cce8fe2 --- /dev/null +++ b/monitoring/promtail/promtail-config.yml @@ -0,0 +1,47 @@ +server: + http_listen_address: 0.0.0.0 + http_listen_port: 9080 + +positions: + filename: /tmp/positions.yaml + +clients: +- url: http://loki:3100/loki/api/v1/push + +scrape_configs: + +- job_name: containers + static_configs: + - targets: + - localhost + labels: + job: containerlogs + __path__: /host/containers/*/*log + + pipeline_stages: + - json: + expressions: + log: log + stream: stream + time: time + tag: attrs.tag + stack_name: attrs."com.docker.stack.namespace" + swarm_service_name: attrs."com.docker.swarm.service.name" + swarm_task_name: attrs."com.docker.swarm.task.name" + swarm_node_id: attrs."com.docker.swarm.node.id" + - regex: + expression: "^/host/containers/(?P<container_id>.{12}).+/.+-json.log$" + source: filename 
+ - timestamp: + format: RFC3339Nano + source: time + - labels: + stream: + container_id: + tag: + stack_name: + swarm_service_name: + swarm_task_name: + swarm_node_id: + - output: + source: log diff --git a/monitoring/swarm.sh b/monitoring/swarm.sh index 800707e7..c4174ff2 100644 --- a/monitoring/swarm.sh +++ b/monitoring/swarm.sh @@ -14,12 +14,11 @@ ROOT_PATH="${COMPOSE_FILE_PATH}/.." . "${ROOT_PATH}/utils/docker-utils.sh" . "${ROOT_PATH}/utils/log.sh" - if [[ "${MODE}" == "dev" ]]; then - log info "Running Message Bus Kafka package in DEV mode" + log info "Running Monitoring package in DEV mode" monitoring_dev_compose_param="-c ${COMPOSE_FILE_PATH}/docker-compose.dev.yml" else - log info "Running Message Bus Kafka package in PROD mode" + log info "Running Monitoring package in PROD mode" monitoring_dev_compose_param="" fi From 70fdc387d38338e88e6dc59f975ce82b917a5960 Mon Sep 17 00:00:00 2001 From: Ryan Crichton Date: Fri, 13 Jan 2023 15:28:14 +0200 Subject: [PATCH 2/2] WIP loki clustering --- monitoring/docker-compose.dev.yml | 12 +++++++++ monitoring/docker-compose.yml | 41 ++++++++++++++++++++++++---- monitoring/loki/loki-config.yml | 44 ++++++++++++------------------- 3 files changed, 65 insertions(+), 32 deletions(-) diff --git a/monitoring/docker-compose.dev.yml b/monitoring/docker-compose.dev.yml index 3da46417..24e952bf 100644 --- a/monitoring/docker-compose.dev.yml +++ b/monitoring/docker-compose.dev.yml @@ -19,6 +19,18 @@ services: published: 3100 mode: host + loki2: + ports: + - target: 3100 + published: 3101 + mode: host + + loki3: + ports: + - target: 3100 + published: 3102 + mode: host + minio1: ports: - target: 9001 diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml index 153162d1..dd2f3da2 100644 --- a/monitoring/docker-compose.yml +++ b/monitoring/docker-compose.yml @@ -97,18 +97,50 @@ services: - KAFKA_COMPRESSION=gzip loki: - image: grafana/loki:2.6.1 - volumes: - - loki_data:/tmp/loki + image: grafana/loki:2.7.1 
configs: - target: /etc/loki/loki-config.yml source: loki-config.yml - command: -config.file=/etc/loki/loki-config.yml + command: -config.file=/etc/loki/loki-config.yml -config.expand-env=true -print-config-stderr + environment: + - NODE_NAME=loki + - MEMBER1=loki2 + - MEMBER2=loki3 deploy: labels: - prometheus-job-service=loki - prometheus-address=loki:3100 + loki2: + image: grafana/loki:2.7.1 + configs: + - target: /etc/loki/loki-config.yml + source: loki-config.yml + command: -config.file=/etc/loki/loki-config.yml -config.expand-env=true -print-config-stderr + environment: + - NODE_NAME=loki2 + - MEMBER1=loki + - MEMBER2=loki3 + deploy: + labels: + - prometheus-job-service=loki2 + - prometheus-address=loki2:3100 + + loki3: + image: grafana/loki:2.7.1 + configs: + - target: /etc/loki/loki-config.yml + source: loki-config.yml + command: -config.file=/etc/loki/loki-config.yml -config.expand-env=true -print-config-stderr + environment: + - NODE_NAME=loki3 + - MEMBER1=loki2 + - MEMBER2=loki + deploy: + labels: + - prometheus-job-service=loki3 + - prometheus-address=loki3:3100 + promtail: image: grafana/promtail:2.6.1 volumes: @@ -211,6 +243,5 @@ configs: volumes: prometheus_data: grafana_data: - loki_data: minio_data1: minio_data2: diff --git a/monitoring/loki/loki-config.yml b/monitoring/loki/loki-config.yml index 83abdf9a..922f8fba 100644 --- a/monitoring/loki/loki-config.yml +++ b/monitoring/loki/loki-config.yml @@ -2,25 +2,28 @@ auth_enabled: false server: http_listen_port: 3100 - grpc_listen_port: 9096 - grpc_server_max_concurrent_streams: 0 + +# I think the issue could be due to the network interface that is being used +# common: +# ring: +# instance_interface_names: +# - eth0 +# - eth1 +# - lo +# instance_interface_names: +# - eth0 +# - eth1 + +memberlist: + node_name: ${NODE_NAME} + join_members: + - ${MEMBER1} + - ${MEMBER2} ingester: wal: enabled: true dir: /tmp/wal - lifecycler: - address: 127.0.0.1 - ring: - kvstore: - store: inmemory - 
replication_factor: 1 - final_sleep: 0s - chunk_idle_period: 1h - max_chunk_age: 1h - chunk_target_size: 1048576 - chunk_retain_period: 30s - max_transfer_retries: 0 schema_config: configs: @@ -39,8 +42,6 @@ storage_config: resync_interval: 30s shared_store: s3 aws: - # Note: use a fully qualified domain name, like localhost. - # full example: http://loki:supersecret@localhost.:9000 s3: http://minioadmin:minioadmin@minio1.:9000/loki s3forcepathstyle: true @@ -49,17 +50,6 @@ compactor: shared_store: s3 limits_config: - reject_old_samples: true - reject_old_samples_max_age: 168h ingestion_rate_mb: 100 ingestion_burst_size_mb: 150 max_concurrent_tail_requests: 200 - max_cache_freshness_per_query: 10m - max_streams_per_user: 500 - -chunk_store_config: - max_look_back_period: 0s - -table_manager: - retention_deletes_enabled: false - retention_period: 0s