diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 0000000..8910db8
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,145 @@
+---
+name: Build and Publish
+
+"on":
+  workflow_dispatch:
+  push:
+    branches:
+      - "**"
+    tags:
+      - "v*.*.*"
+  pull_request:
+    branches:
+      - "main"
+      - 'master'
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}
+
+jobs:
+  flake8:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out source repository
+        uses: actions/checkout@v4
+      - name: Set up Python environment
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+      - name: Install dependencies
+        run: pip install .
+      - name: flake8 Lint
+        uses: py-actions/flake8@v2
+
+  hadolint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: hadolint/hadolint-action@v3.1.0
+        with:
+          dockerfile: Dockerfile
+
+  yamllint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Run yamllint
+        uses: bewuethr/yamllint-action@v1
+
+  oci_image:
+    name: Build OCI Image
+    if: github.repository == 'lsst-it/gnocpush'
+    runs-on: ubuntu-latest
+    # GITHUB_TOKEN must be able to push packages to ghcr.io
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+          tags: |
+            type=schedule
+            type=ref,event=branch
+            type=ref,event=pr
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
+            type=semver,pattern={{major}}
+            type=sha
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to GitHub Container Registry
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+
+  helm-lint:
+    runs-on: ubuntu-latest
+    name: helm lint
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Install Helm
+        uses: azure/setup-helm@v4
+        env:
+          GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
+
+      - name: Run helm lint
+        run: cd charts/gnocpush && helm lint .
+
+
+  chart-release:
+    # only run when merged to main
+    if: github.ref == 'refs/heads/main'
+    name: Helm Chart Release
+    needs:
+      - oci_image
+    permissions:
+      contents: write
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Configure Git
+        run: |
+          git config user.name "$GITHUB_ACTOR"
+          git config user.email "$GITHUB_ACTOR@users.noreply.github.com"
+
+      - name: Install Helm
+        uses: azure/setup-helm@v4
+        env:
+          GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
+
+      - name: Run chart-releaser
+        uses: helm/chart-releaser-action@v1.6.0
+        env:
+          CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
diff --git a/.yamllint.yaml b/.yamllint.yaml
new file mode 100644
index 0000000..8ae08cf
--- /dev/null
+++ b/.yamllint.yaml
@@ -0,0 +1,18 @@
+---
+extends: default
+
+ignore: |
+  charts/gnocpush/templates/*
+
+rules:
+  # 80 chars should be enough, but don't fail if a line is longer
+  line-length: false
+  # do not obsess over comment formatting
+  comments-indentation: false
+  comments:
+    require-starting-space: false
+  indentation:
+    spaces: consistent
+    indent-sequences: consistent
+  # ignore {{ .foo }} go templates
+  braces: false
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..70e4247
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,6 @@
+FROM python:3.12
+
+WORKDIR /app
+COPY . .
+RUN pip install --no-cache-dir .
+ENTRYPOINT ["gnocpush"] diff --git a/README.md b/README.md index f52a07a..e252dd7 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,181 @@ # gnocpush +A simple service to accept webhook payloads from [Prometheus Alertmanager](https://prometheus.io/docs/alerting/latest/alertmanager/) and to push those alerts on to [GlobalNOC's Alertmon](https://alertmon-stage.grnoc.iu.edu/alertmon2/). + +## Alert format + +`gnocpush` expects that alerts have labels that match GlobalNOC's required parameter names. + +Note that group annotations/labels from the alert group are ignored. + +### Required annotations + +* `description` - A description of the alert. + +### Required labels + +* `node_name` - The name of the node that the alert is associated with. +* `service_name` - The name of the service that the alert is associated with. +* `severity` - The severity of the alert. One of: `Critical`, `Major`, `Minor`, `Unknown`, `OK` + +### Optional labels + +* `device` - The subcomponent of the node that is alarming. +* `start_time` - The time that the alert started. 
+ +### Example PrometheusRule + +```yaml +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + lsst.io/rule: "true" + name: net +spec: + groups: + - name: net.rules + rules: + - alert: lhn_interface_up + annotations: + description: '{{ $labels.instance }} - {{ $labels.ifName }}|{{ $labels.ifAlias }} is down' + expr: ifOperStatus{ifAlias=~".*LHN.*"} != 1 + for: 30s + labels: + severity: critical + node_name: '{{ $labels.instance }}' + device: '{{ $labels.ifName }}' + service_name: ifInErrors-{{ $labels.ifName}} + gnoc: "true" +``` + + +### Example Payload + +```json +{ + "receiver": "gnocpush", + "status": "firing", + "alerts": [ + { + "status": "firing", + "labels": { + "alertname": "ifInErrors", + "device": "Ethernet17/1", + "gnoc": "true", + "ifAlias": "rubinobs-br01 Et17/1 <--SCIENCE #1--> LS-DWDM Linecard001-Port2", + "ifDescr": "Ethernet17/1", + "ifIndex": "17001", + "ifName": "Ethernet17/1", + "instance": "new-rubinobs-br01", + "job": "snmp-network", + "node_name": "new-rubinobs-br01", + "prom": "dev/ruka", + "prometheus": "kube-prometheus-stack/kube-prometheus-stack-prometheus", + "service_name": "ifInErrors-Ethernet17/1", + "severity": "major", + "site": "dev" + }, + "annotations": { + "description": "new-rubinobs-br01 - Ethernet17/1|rubinobs-br01 Et17/1 <--SCIENCE #1--> LS-DWDM Linecard001-Port2 has 12.2k input errors" + }, + "startsAt": "2024-04-26T20:46:34.933Z", + "endsAt": "0001-01-01T00:00:00Z", + "generatorURL": "https://prometheus.example.org/graph?g0.expr=ifInErrors+%3E+1000&g0.tab=1", + "fingerprint": "46df8c14dbab758c" + } + ], + "groupLabels": { + "gnoc": "true" + }, + "commonLabels": { + "gnoc": "true", + "job": "snmp-network", + "prom": "dev/ruka", + "prometheus": "kube-prometheus-stack/kube-prometheus-stack-prometheus", + "site": "dev" + }, + "commonAnnotations": {}, + "externalURL": "https://alertmanager.example.org", + "version": "4", + "groupKey": "{}/{gnoc=\"true\"}:{gnoc=\"true\"}", + "truncatedAlerts": 
0 +} +``` + +## Alertmanager Configuration + +Note that `gnocpush` does not impose any alert grouping constraints. + +```yaml + config: + routes: + - receiver: gnocpush + continue: true + repeat_interval: 30s + group_interval: 30s + group_wait: 30s + group_by: + - gnoc + matchers: + - gnoc = "true" + receivers: + - name: gnocpush + webhook_configs: + - url: http://gnocpush.gnocpush:8080/alerts +``` + +## Deployment on Kubernetes + +```bash +helm upgrade --install \ + gnocpush ./charts/gnocpush \ + --create-namespace --namespace gnocpush \ + -f ./values.yaml +``` + +### Debugging a Kubernetes Deployment + +```bash +k logs alertmanager-kube-prometheus-stack-alertmanager-0 --tail=100 -f + +k logs -l app.kubernetes.io/instance=gnocpush -f +``` + +```bash +k -n gnocpush port-forward gnocpush-dc4d94d8-mqvqq 8080 +curl localhost:8080/metrics +``` + ## Development +### Local Development + ```bash virtualenv venv . venv/bin/activate pip install --editable . ``` -## URLs +### Testing with the OCI image + +```bash +docker run \ + -e GNOC_USER=$GNOC_USER \ + -e GNOC_PASS=$GNOC_PASS \ + -e GNOC_SERVER=$GNOC_SERVER \ + -e GNOC_REALM=$GNOC_REALM \ + --network=host ghcr.io/lsst-it/gnocpush +``` + +### Testing gnocpush with curl + +```bash +curl http://localhost:8080/alerts -v --json @- < alerts.json +``` + +## Useful GlobalNOC URLs ### Stage diff --git a/charts/gnocpush/.helmignore b/charts/gnocpush/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/charts/gnocpush/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/charts/gnocpush/Chart.yaml b/charts/gnocpush/Chart.yaml new file mode 100644 index 0000000..2b8d401 --- /dev/null +++ b/charts/gnocpush/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: gnocpush +description: Push Prometheus Alertmanager webhook payloads to Global NOC +type: application +version: 0.1.0 +appVersion: 0.1.0 diff --git a/charts/gnocpush/README.md b/charts/gnocpush/README.md new file mode 100644 index 0000000..8b4e3b6 --- /dev/null +++ b/charts/gnocpush/README.md @@ -0,0 +1,45 @@ +# gnocpush + +![Version: 0.1.0](https://img.shields.io/badge/Version-0.1.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 0.1.0](https://img.shields.io/badge/AppVersion-0.1.0-informational?style=flat-square) + +Push Prometheus Alertmanager webhook payloads to Global NOC + +## Values + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| affinity | object | `{}` | | +| autoscaling.enabled | bool | `false` | | +| autoscaling.maxReplicas | int | `100` | | +| autoscaling.minReplicas | int | `1` | | +| autoscaling.targetCPUUtilizationPercentage | int | `80` | | +| extraArgs | list | `[]` | | +| fullnameOverride | string | `""` | | +| image.pullPolicy | string | `"IfNotPresent"` | | +| image.repository | string | `"ghcr.io/lsst-it/gnocpush"` | | +| image.tag | string | `""` | | +| imagePullSecrets | list | `[]` | | +| ingress.annotations | object | `{}` | | +| ingress.className | string | `""` | | +| ingress.enabled | bool | `false` | | +| ingress.hosts[0].host | string | `"chart-example.local"` | | +| ingress.hosts[0].paths[0].path | string | `"/"` | | +| ingress.hosts[0].paths[0].pathType | string | `"ImplementationSpecific"` | | +| 
ingress.tls | list | `[]` | | +| nameOverride | string | `""` | | +| nodeSelector | object | `{}` | | +| podAnnotations | object | `{}` | | +| podSecurityContext | object | `{}` | | +| replicaCount | int | `1` | | +| resources | object | `{}` | | +| securityContext | object | `{}` | | +| service.port | int | `8080` | | +| service.type | string | `"ClusterIP"` | | +| serviceAccount.annotations | object | `{}` | | +| serviceAccount.create | bool | `false` | | +| serviceAccount.name | string | `""` | | +| serviceMonitor.enabled | bool | `true` | | +| serviceMonitor.labels | object | `{}` | | +| serviceMonitor.namespace | string | `""` | | +| tolerations | list | `[]` | | + diff --git a/charts/gnocpush/templates/NOTES.txt b/charts/gnocpush/templates/NOTES.txt new file mode 100644 index 0000000..598bc00 --- /dev/null +++ b/charts/gnocpush/templates/NOTES.txt @@ -0,0 +1,22 @@ +1. Get the application URL by running these commands: +{{- if .Values.ingress.enabled }} +{{- range $host := .Values.ingress.hosts }} + {{- range .paths }} + http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} + {{- end }} +{{- end }} +{{- else if contains "NodePort" .Values.service.type }} + export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "gnocpush.fullname" . }}) + export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:$NODE_PORT +{{- else if contains "LoadBalancer" .Values.service.type }} + NOTE: It may take a few minutes for the LoadBalancer IP to be available. + You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "gnocpush.fullname" . }}' + export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "gnocpush.fullname" . 
}} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") + echo http://$SERVICE_IP:{{ .Values.service.port }} +{{- else if contains "ClusterIP" .Values.service.type }} + export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "gnocpush.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") + export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") + echo "Visit http://127.0.0.1:8080 to use your application" + kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT +{{- end }} diff --git a/charts/gnocpush/templates/_helpers.tpl b/charts/gnocpush/templates/_helpers.tpl new file mode 100644 index 0000000..04c0f0b --- /dev/null +++ b/charts/gnocpush/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "gnocpush.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "gnocpush.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. 
+*/}} +{{- define "gnocpush.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "gnocpush.labels" -}} +helm.sh/chart: {{ include "gnocpush.chart" . }} +{{ include "gnocpush.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "gnocpush.selectorLabels" -}} +app.kubernetes.io/name: {{ include "gnocpush.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "gnocpush.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "gnocpush.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/charts/gnocpush/templates/deployment.yaml b/charts/gnocpush/templates/deployment.yaml new file mode 100644 index 0000000..c0e38b7 --- /dev/null +++ b/charts/gnocpush/templates/deployment.yaml @@ -0,0 +1,74 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "gnocpush.fullname" . }} + labels: + {{- include "gnocpush.labels" . | nindent 4 }} +spec: + {{- if not .Values.autoscaling.enabled }} + replicas: {{ .Values.replicaCount }} + {{- end }} + selector: + matchLabels: + {{- include "gnocpush.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "gnocpush.selectorLabels" . | nindent 8 }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "gnocpush.serviceAccountName" . 
}}
+      securityContext:
+        {{- toYaml .Values.podSecurityContext | nindent 8 }}
+      containers:
+        - name: {{ .Chart.Name }}
+          securityContext:
+            {{- toYaml .Values.securityContext | nindent 12 }}
+          image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          args:
+            - --listen=0.0.0.0:{{ .Values.service.port }}
+            {{- if .Values.extraArgs }}
+            {{- toYaml .Values.extraArgs | nindent 12 }}
+            {{- end }}
+          env:
+            {{- with .Values.extraEnv }}
+            {{- toYaml . | nindent 12 }}
+            {{- end }}
+          envFrom:
+            {{- with .Values.extraEnvFrom }}
+            {{- toYaml . | nindent 12 }}
+            {{- end }}
+          ports:
+            - name: http
+              containerPort: {{ .Values.service.port }}
+              protocol: TCP
+          livenessProbe:
+            httpGet:
+              path: /healthz
+              port: http
+          readinessProbe:
+            httpGet:
+              path: /healthz
+              port: http
+          resources:
+            {{- toYaml .Values.resources | nindent 12 }}
+      {{- with .Values.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.affinity }}
+      affinity:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
diff --git a/charts/gnocpush/templates/hpa.yaml b/charts/gnocpush/templates/hpa.yaml
new file mode 100644
index 0000000..0276a26
--- /dev/null
+++ b/charts/gnocpush/templates/hpa.yaml
@@ -0,0 +1,32 @@
+{{- if .Values.autoscaling.enabled }}
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "gnocpush.fullname" . }}
+  labels:
+    {{- include "gnocpush.labels" . | nindent 4 }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "gnocpush.fullname" . }}
+  minReplicas: {{ .Values.autoscaling.minReplicas }}
+  maxReplicas: {{ .Values.autoscaling.maxReplicas }}
+  metrics:
+    {{- if .Values.autoscaling.targetCPUUtilizationPercentage }}
+    - type: Resource
+      resource:
+        name: cpu
+        target:
+          type: Utilization
+          averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
+    {{- end }}
+    {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }}
+    - type: Resource
+      resource:
+        name: memory
+        target:
+          type: Utilization
+          averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
+    {{- end }}
+{{- end }}
diff --git a/charts/gnocpush/templates/ingress.yaml b/charts/gnocpush/templates/ingress.yaml
new file mode 100644
index 0000000..54e347f
--- /dev/null
+++ b/charts/gnocpush/templates/ingress.yaml
@@ -0,0 +1,61 @@
+{{- if .Values.ingress.enabled -}}
+{{- $fullName := include "gnocpush.fullname" . -}}
+{{- $svcPort := .Values.service.port -}}
+{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }}
+  {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }}
+  {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}}
+  {{- end }}
+{{- end }}
+{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}}
+apiVersion: networking.k8s.io/v1
+{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}}
+apiVersion: networking.k8s.io/v1beta1
+{{- else -}}
+apiVersion: extensions/v1beta1
+{{- end }}
+kind: Ingress
+metadata:
+  name: {{ $fullName }}
+  labels:
+    {{- include "gnocpush.labels" . | nindent 4 }}
+  {{- with .Values.ingress.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }}
+  ingressClassName: {{ .Values.ingress.className }}
+  {{- end }}
+  {{- if .Values.ingress.tls }}
+  tls:
+    {{- range .Values.ingress.tls }}
+    - hosts:
+        {{- range .hosts }}
+        - {{ .
| quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} + pathType: {{ .pathType }} + {{- end }} + backend: + {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} + service: + name: {{ $fullName }} + port: + number: {{ $svcPort }} + {{- else }} + serviceName: {{ $fullName }} + servicePort: {{ $svcPort }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/charts/gnocpush/templates/service.yaml b/charts/gnocpush/templates/service.yaml new file mode 100644 index 0000000..dfc3029 --- /dev/null +++ b/charts/gnocpush/templates/service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "gnocpush.fullname" . }} + labels: + {{- include "gnocpush.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: http + protocol: TCP + name: http + selector: + {{- include "gnocpush.selectorLabels" . | nindent 4 }} diff --git a/charts/gnocpush/templates/serviceaccount.yaml b/charts/gnocpush/templates/serviceaccount.yaml new file mode 100644 index 0000000..09a5924 --- /dev/null +++ b/charts/gnocpush/templates/serviceaccount.yaml @@ -0,0 +1,12 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "gnocpush.serviceAccountName" . }} + labels: + {{- include "gnocpush.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }}
+  {{- end }}
+{{- end }}
diff --git a/charts/gnocpush/templates/servicemonitor.yaml b/charts/gnocpush/templates/servicemonitor.yaml
new file mode 100644
index 0000000..1353725
--- /dev/null
+++ b/charts/gnocpush/templates/servicemonitor.yaml
@@ -0,0 +1,23 @@
+{{- if .Values.serviceMonitor.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "gnocpush.fullname" . }}
+  namespace: {{ .Values.serviceMonitor.namespace | default .Release.Namespace }}
+  labels:
+    {{- include "gnocpush.labels" . | nindent 4 }}
+    {{- with .Values.serviceMonitor.labels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+spec:
+  endpoints:
+    - interval: 15s
+      path: /metrics
+      port: http
+  namespaceSelector:
+    matchNames:
+      - {{ .Release.Namespace }}
+  selector:
+    matchLabels:
+      {{- include "gnocpush.selectorLabels" . | nindent 6 }}
+{{- end }}
diff --git a/charts/gnocpush/templates/tests/test-connection.yaml b/charts/gnocpush/templates/tests/test-connection.yaml
new file mode 100644
index 0000000..0f9aee2
--- /dev/null
+++ b/charts/gnocpush/templates/tests/test-connection.yaml
@@ -0,0 +1,16 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: "{{ include "gnocpush.fullname" . }}-test-connection"
+  labels:
+    {{- include "gnocpush.labels" . | nindent 4 }}
+  annotations:
+    "helm.sh/hook": test
+spec:
+  containers:
+    - name: wget
+      image: busybox
+      command: ['wget']
+      # the app serves no "/" route, so probe the health endpoint
+      args: ['{{ include "gnocpush.fullname" . }}:{{ .Values.service.port }}/healthz']
+  restartPolicy: Never
diff --git a/charts/gnocpush/values.yaml b/charts/gnocpush/values.yaml
new file mode 100644
index 0000000..d9dd82a
--- /dev/null
+++ b/charts/gnocpush/values.yaml
@@ -0,0 +1,89 @@
+# Default values for gnocpush.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+ +replicaCount: 1 + +image: + repository: ghcr.io/lsst-it/gnocpush + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: "" + +extraArgs: [] + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +serviceAccount: + # Specifies whether a service account should be created + create: false + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + +podAnnotations: {} + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + +service: + type: ClusterIP + port: 8080 + +ingress: + enabled: false + className: "" + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + hosts: + - host: chart-example.local + paths: + - path: / + pathType: ImplementationSpecific + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
+ # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 100 + targetCPUUtilizationPercentage: 80 + # targetMemoryUtilizationPercentage: 80 + +nodeSelector: {} + +tolerations: [] + +affinity: {} + +serviceMonitor: + enabled: true + namespace: "" + labels: {} diff --git a/gnocpush/__init__.py b/gnocpush/__init__.py index e69de29..2feda00 100644 --- a/gnocpush/__init__.py +++ b/gnocpush/__init__.py @@ -0,0 +1 @@ +from .pusher import Pusher # NOQA diff --git a/gnocpush/cli.py b/gnocpush/cli.py deleted file mode 100644 index df55a47..0000000 --- a/gnocpush/cli.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/env python3 - -import os -import requests -import sys -import yaml -import logging - -from dateutil import parser -from globalnoc_alertmon_agent import AlertMonAgent, Alert - -def sanitize_severity(severity): - s = 'Unknown' - - # match statement doesn't seem to have an easy way to do case insensitive matching, so we force everything to lower case. 
- match severity.lower(): - case 'critical': s = 'Critical' - case 'major' | 'warning': s ='Major' - case 'minor' | 'info': s = 'Minor' - case 'unknown': s = 'Unknown' - case 'ok': s ='Ok' - - log.debug(f'severity: {severity}, s: {s}') - return s - -def push_to_gnoc(alerts, agent): - # initialize the alertmon agent - for alert in alerts: - data = { - 'node_name': alert['labels'].get('pod', 'Unknown'), - 'service_name': alert['labels'].get('alertname', 'Unknown'), - 'severity': sanitize_severity(alert['labels'].get('severity', 'Unknown')), - 'description': alert['annotations'].get('description', 'Unknown'), - 'start_time': parser.isoparse(alert['startsAt']).timestamp() - } - - agent.add_alert(Alert( - start_time = data.get('start_time', None), - node_name = data.get('node_name'), - service_name = data.get('service_name'), - description = data.get('description'), - severity = data.get('severity') - )) - - agent.send_alerts() - - -def get_alertmanager_alerts(url): - r = requests.get(url) - return r.json()#['data']['alerts'] - -def main(): - config = {} - - logging.basicConfig(level=logging.DEBUG) - global log - log = logging.getLogger() - - try: - config['username'] = os.environ['GNOC_USERNAME'] - config['password'] = os.environ['GNOC_PASSWORD'] - config['server'] = os.environ['GNOC_SERVER'] - config['realm'] = os.environ['GNOC_REALM'] - config['alertmanager_url'] = os.environ['ALERTMANAGER_URL'] - except KeyError as e: - print(f"The {e} environment variable is not set.") - sys.exit(1) - - alerts = get_alertmanager_alerts(config['alertmanager_url']) - - agent = AlertMonAgent( - username = config['username'], - password = config['password'], - server = config['server'], - realm = config['realm'] - ) - - push_to_gnoc(alerts, agent) - -if __name__ == '__main__': - main() diff --git a/gnocpush/cli/gnocpush.py b/gnocpush/cli/gnocpush.py new file mode 100644 index 0000000..c5279b8 --- /dev/null +++ b/gnocpush/cli/gnocpush.py @@ -0,0 +1,102 @@ +#!/bin/env python3 + +import 
argparse +import json +import logging + +from flask import Flask, request, jsonify +from gnocpush.envdefault import EnvDefault +from gnocpush import Pusher +from prometheus_flask_exporter import PrometheusMetrics +from waitress import serve + +app = Flask(__name__) +app.config['MAX_CONTENT_LENGTH'] = 4 * 1024 * 1024 # 4 MB +metrics = PrometheusMetrics(app) + + +@app.route('/alerts', methods=['POST']) +def push_endpoint(): + # Get the data from the request + data = request.get_json() + + log.debug(f"Received data: {json.dumps(data)}") + + try: + yeeter.push(data['alerts']) + except Exception as e: + log.error(f"Failed to push alerts to GNOC: {str(e)}") + return jsonify(error=str(e)), 502 + + # Return a response + return {'status': 'success'} + + +@app.route('/healthz', methods=['GET']) +@metrics.do_not_track() +def healthz(): + return {'status': 'ok'} + + +def parse_args(): + """Parse command-line arguments""" + + parser = argparse.ArgumentParser( + prog='gnocpush', + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + '-u', '--user', action=EnvDefault, envvar='GNOC_USER', + help='Specify the GNOC username' + ) + parser.add_argument( + '-p', '--pass', action=EnvDefault, envvar='GNOC_PASS', + dest='password', + help='Specify the GNOC password' + ) + parser.add_argument( + '-s', '--server', action=EnvDefault, envvar='GNOC_SERVER', + help='Specify the GNOC server' + ) + parser.add_argument( + '-r', '--realm', action=EnvDefault, envvar='GNOC_REALM', + help='Specify the GNOC realm' + ) + parser.add_argument( + '-l', '--listen', + default='localhost:8080', + help='Specify the address:port to listen on' + ) + parser.add_argument( + '-v', '--verbose', + dest='debug', + action=argparse.BooleanOptionalAction, + help='Enable verbose logging' + ) + + return parser.parse_args() + + +def main(): + args = parse_args() + + log_level = logging.DEBUG if args.debug else logging.INFO + logging.basicConfig(level=log_level) + 
logging.getLogger('waitress').setLevel(log_level)
+
+    global log
+    log = logging.getLogger(__name__)
+
+    global yeeter
+    yeeter = Pusher({
+        'username': args.user,
+        'password': args.password,
+        'server': args.server,
+        'realm': args.realm
+    })
+
+    serve(app, listen=args.listen)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/gnocpush/cli/gnocscrape.py b/gnocpush/cli/gnocscrape.py
new file mode 100644
index 0000000..a0d5894
--- /dev/null
+++ b/gnocpush/cli/gnocscrape.py
@@ -0,0 +1,77 @@
+#!/bin/env python3
+
+import argparse
+import logging
+import requests
+
+from gnocpush.envdefault import EnvDefault
+from gnocpush import Pusher
+
+
+def get_alertmanager_alerts(url):
+    """Fetch and decode the JSON document served at `url`."""
+    r = requests.get(url, timeout=30)
+    # surface HTTP errors instead of trying to parse an error body
+    r.raise_for_status()
+    return r.json()
+
+
+def parse_args():
+    """Parse command-line arguments"""
+
+    parser = argparse.ArgumentParser(
+        prog='gnocscrape',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        '-u', '--user', action=EnvDefault, envvar='GNOC_USER',
+        help='Specify the GNOC username'
+    )
+    parser.add_argument(
+        '-p', '--pass', action=EnvDefault, envvar='GNOC_PASS',
+        dest='password',
+        help='Specify the GNOC password'
+    )
+    parser.add_argument(
+        '-s', '--server', action=EnvDefault, envvar='GNOC_SERVER',
+        help='Specify the GNOC server'
+    )
+    parser.add_argument(
+        '-r', '--realm', action=EnvDefault, envvar='GNOC_REALM',
+        help='Specify the GNOC realm'
+    )
+    parser.add_argument(
+        '-a', '--url', action=EnvDefault, envvar='ALERTMANAGER_URL',
+        help='Specify the Alertmanager URL to scrape alerts from'
+    )
+    parser.add_argument(
+        '-v', '--verbose',
+        dest='debug',
+        action=argparse.BooleanOptionalAction,
+        help='Enable verbose logging'
+    )
+
+    return parser.parse_args()
+
+
+def main():
+    """Scrape alerts from Alertmanager and push them to GNOC."""
+    args = parse_args()
+
+    log_level = logging.DEBUG if args.debug else logging.INFO
+    logging.basicConfig(level=log_level)
+    global log
+    log = logging.getLogger(__name__)
+
+    yeeter = Pusher({
+        'username': args.user,
+        'password': args.password,
+        'server': args.server,
+        'realm': args.realm
+    })
+    alerts = get_alertmanager_alerts(args.url)
+    yeeter.push(alerts)
+
+
+if
__name__ == '__main__': + main() diff --git a/gnocpush/envdefault.py b/gnocpush/envdefault.py new file mode 100644 index 0000000..211e704 --- /dev/null +++ b/gnocpush/envdefault.py @@ -0,0 +1,18 @@ +# copied from https://stackoverflow.com/a/10551190/21807529 + +import argparse +import os + + +class EnvDefault(argparse.Action): + def __init__(self, envvar, required=True, default=None, **kwargs): + if not default and envvar: + if envvar in os.environ: + default = os.environ[envvar] + if required and default: + required = False + super(EnvDefault, self).__init__(default=default, required=required, + **kwargs) + + def __call__(self, parser, namespace, values, option_string=None): + setattr(namespace, self.dest, values) diff --git a/gnocpush/pusher.py b/gnocpush/pusher.py new file mode 100644 index 0000000..2e95ea9 --- /dev/null +++ b/gnocpush/pusher.py @@ -0,0 +1,46 @@ +import logging + +from gnocpush.utils import sanitize_severity +from dateutil import parser +from globalnoc_alertmon_agent import AlertMonAgent, Alert + +log = logging.getLogger(__name__) + + +class Pusher: + def __init__(self, config): + self.config = config + self.agent = AlertMonAgent( + username = config['username'], + password = config['password'], + server = config['server'], + realm = config['realm'] + ) + + def push(self, alerts): + + for alert in alerts: + sev = sanitize_severity(alert['labels'].get('severity', 'Unknown')) + desc = alert['annotations'].get('description', 'Unknown') + + data = { + 'node_name': alert['labels'].get('node_name', 'Unknown'), + 'device': alert['labels'].get('device'), + 'service_name': alert['labels'].get('service_name', 'Unknown'), + 'severity': sev, + 'description': desc, + 'start_time': parser.isoparse(alert['startsAt']).timestamp() + } + + log.debug(f"Pushing alert: {data}") + + self.agent.add_alert(Alert( + start_time = data.get('start_time'), + node_name = data.get('node_name'), + device = data.get('device'), + service_name = data.get('service_name'), + 
log = logging.getLogger(__name__)


def sanitize_severity(severity):
    """Map a severity label onto one of the canonical severity names.

    The comparison is case-insensitive. Returns one of 'Critical', 'Major',
    'Minor', 'Unknown' or 'Ok'; any label that is not recognised yields
    'Unknown'.
    """
    # Lower-case once so every comparison below is case-insensitive.
    label = severity.lower()

    if label in ('critical', 'alert'):
        result = 'Critical'
    elif label in ('major', 'warning'):
        result = 'Major'
    elif label in ('minor', 'info'):
        result = 'Minor'
    elif label == 'ok':
        result = 'Ok'
    else:
        # Covers the literal 'unknown' as well as anything unrecognised.
        result = 'Unknown'

    log.debug(f'severity: {severity} -> {result}')

    return result
"kube-prometheus-stack/kube-prometheus-stack-prometheus", "service_name": "ifOperStatus-Ethernet26", "severity": "critical", "site": "dev"}, "annotations": {"description": "bdc-b05-lf1 - Ethernet26| is down"}, "startsAt": "2024-04-26T20:46:34.933Z", "endsAt": "0001-01-01T00:00:00Z", "generatorURL": "https://prometheus.ruka.dev.lsst.org/graph?g0.expr=ifOperStatus%7BifName%3D%22Ethernet26%22%2Cinstance%3D%22bdc-b05-lf1%22%7D+%21%3D+1&g0.tab=1", "fingerprint": "de0a6051f99c6608"}, {"status": "firing", "labels": {"alertname": "ifInErrors", "device": "Ethernet55/1", "gnoc": "true", "ifAlias": "-> bdc-sp01.ls Ethernet9/1 - Ansible", "ifDescr": "Ethernet55/1", "ifIndex": "55001", "ifName": "Ethernet55/1", "instance": "bdc-b05-lf1", "job": "snmp-network", "node_name": "bdc-b05-lf1", "prom": "dev/ruka", "prometheus": "kube-prometheus-stack/kube-prometheus-stack-prometheus", "service_name": "ifInErrors-Ethernet55/1", "severity": "major", "site": "dev"}, "annotations": {"description": "bdc-b05-lf1 - Ethernet55/1|-> bdc-sp01.ls Ethernet9/1 - Ansible has 1.862M input errors"}, "startsAt": "2024-04-26T20:46:34.933Z", "endsAt": "0001-01-01T00:00:00Z", "generatorURL": "https://prometheus.ruka.dev.lsst.org/graph?g0.expr=ifInErrors+%3E+1000&g0.tab=1", "fingerprint": "9b6d5bba6a9726a9"}, {"status": "firing", "labels": {"alertname": "ifInErrors", "device": "Ethernet56/1", "gnoc": "true", "ifAlias": "-> bdc-sp02.ls Ethernet9/1 - Ansible", "ifDescr": "Ethernet56/1", "ifIndex": "56001", "ifName": "Ethernet56/1", "instance": "bdc-b05-lf1", "job": "snmp-network", "node_name": "bdc-b05-lf1", "prom": "dev/ruka", "prometheus": "kube-prometheus-stack/kube-prometheus-stack-prometheus", "service_name": "ifInErrors-Ethernet56/1", "severity": "major", "site": "dev"}, "annotations": {"description": "bdc-b05-lf1 - Ethernet56/1|-> bdc-sp02.ls Ethernet9/1 - Ansible has 1.093M input errors"}, "startsAt": "2024-04-26T20:46:34.933Z", "endsAt": "0001-01-01T00:00:00Z", "generatorURL": 
"https://prometheus.ruka.dev.lsst.org/graph?g0.expr=ifInErrors+%3E+1000&g0.tab=1", "fingerprint": "ae9e2a96c8e91d2d"}, {"status": "firing", "labels": {"alertname": "ifInErrors", "device": "Port-Channel1", "gnoc": "true", "ifAlias": "bdc-lf09 Et55/56 Po1 <-> bdc-sp01/2 Et1/9 Po9", "ifDescr": "Port-Channel1", "ifIndex": "1000001", "ifName": "Port-Channel1", "instance": "bdc-b05-lf1", "job": "snmp-network", "node_name": "bdc-b05-lf1", "prom": "dev/ruka", "prometheus": "kube-prometheus-stack/kube-prometheus-stack-prometheus", "service_name": "ifInErrors-Port-Channel1", "severity": "major", "site": "dev"}, "annotations": {"description": "bdc-b05-lf1 - Port-Channel1|bdc-lf09 Et55/56 Po1 <-> bdc-sp01/2 Et1/9 Po9 has 2.955M input errors"}, "startsAt": "2024-04-26T20:46:34.933Z", "endsAt": "0001-01-01T00:00:00Z", "generatorURL": "https://prometheus.ruka.dev.lsst.org/graph?g0.expr=ifInErrors+%3E+1000&g0.tab=1", "fingerprint": "91254c1c455a81ff"}, {"status": "firing", "labels": {"alertname": "ifOutDiscards", "device": "Ethernet1", "gnoc": "true", "ifAlias": "gaw01.ls.lsst.org - Data Interface", "ifDescr": "Ethernet1", "ifIndex": "1", "ifName": "Ethernet1", "instance": "bdc-b05-lf1", "job": "snmp-network", "node_name": "bdc-b05-lf1", "prom": "dev/ruka", "prometheus": "kube-prometheus-stack/kube-prometheus-stack-prometheus", "service_name": "ifOutDiscards-Ethernet1", "severity": "major", "site": "dev"}, "annotations": {"description": "bdc-b05-lf1 - Ethernet1|gaw01.ls.lsst.org - Data Interface has 201.8k discarded output packets"}, "startsAt": "2024-04-26T20:46:34.933Z", "endsAt": "0001-01-01T00:00:00Z", "generatorURL": "https://prometheus.ruka.dev.lsst.org/graph?g0.expr=ifOutDiscards+%3E+1000&g0.tab=1", "fingerprint": "779f50e20b7c174b"}, {"status": "firing", "labels": {"alertname": "ifOutDiscards", "device": "Ethernet10", "gnoc": "true", "ifAlias": "luan05.ls.lsst.org - DATA Interface", "ifDescr": "Ethernet10", "ifIndex": "10", "ifName": "Ethernet10", "instance": 
"bdc-b05-lf1", "job": "snmp-network", "node_name": "bdc-b05-lf1", "prom": "dev/ruka", "prometheus": "kube-prometheus-stack/kube-prometheus-stack-prometheus", "service_name": "ifOutDiscards-Ethernet10", "severity": "major", "site": "dev"}, "annotations": {"description": "bdc-b05-lf1 - Ethernet10|luan05.ls.lsst.org - DATA Interface has 17.27k discarded output packets"}, "startsAt": "2024-04-26T20:46:34.933Z", "endsAt": "0001-01-01T00:00:00Z", "generatorURL": "https://prometheus.ruka.dev.lsst.org/graph?g0.expr=ifOutDiscards+%3E+1000&g0.tab=1", "fingerprint": "a7f0ba719fe6684e"}, {"status": "firing", "labels": {"alertname": "ifOutDiscards", "device": "Ethernet12", "gnoc": "true", "ifAlias": "vsphere05.ls.lsst.org - Trunk #1", "ifDescr": "Ethernet12", "ifIndex": "12", "ifName": "Ethernet12", "instance": "bdc-b05-lf1", "job": "snmp-network", "node_name": "bdc-b05-lf1", "prom": "dev/ruka", "prometheus": "kube-prometheus-stack/kube-prometheus-stack-prometheus", "service_name": "ifOutDiscards-Ethernet12", "severity": "major", "site": "dev"}, "annotations": {"description": "bdc-b05-lf1 - Ethernet12|vsphere05.ls.lsst.org - Trunk #1 has 14.37k discarded output packets"}, "startsAt": "2024-04-26T20:46:34.933Z", "endsAt": "0001-01-01T00:00:00Z", "generatorURL": "https://prometheus.ruka.dev.lsst.org/graph?g0.expr=ifOutDiscards+%3E+1000&g0.tab=1", "fingerprint": "a33e42ed83339c2e"}, {"status": "firing", "labels": {"alertname": "ifOutDiscards", "device": "Ethernet13", "gnoc": "true", "ifAlias": "vsphere06.ls.lsst.org - Trunk #1", "ifDescr": "Ethernet13", "ifIndex": "13", "ifName": "Ethernet13", "instance": "bdc-b05-lf1", "job": "snmp-network", "node_name": "bdc-b05-lf1", "prom": "dev/ruka", "prometheus": "kube-prometheus-stack/kube-prometheus-stack-prometheus", "service_name": "ifOutDiscards-Ethernet13", "severity": "major", "site": "dev"}, "annotations": {"description": "bdc-b05-lf1 - Ethernet13|vsphere06.ls.lsst.org - Trunk #1 has 5.512k discarded output packets"}, "startsAt": 
"2024-04-26T20:46:34.933Z", "endsAt": "0001-01-01T00:00:00Z", "generatorURL": "https://prometheus.ruka.dev.lsst.org/graph?g0.expr=ifOutDiscards+%3E+1000&g0.tab=1", "fingerprint": "060fd02657a3b53a"}, {"status": "firing", "labels": {"alertname": "ifOutDiscards", "device": "Ethernet2", "gnoc": "true", "ifAlias": "gaw02.ls.lsst.org - Data Interface", "ifDescr": "Ethernet2", "ifIndex": "2", "ifName": "Ethernet2", "instance": "bdc-b05-lf1", "job": "snmp-network", "node_name": "bdc-b05-lf1", "prom": "dev/ruka", "prometheus": "kube-prometheus-stack/kube-prometheus-stack-prometheus", "service_name": "ifOutDiscards-Ethernet2", "severity": "major", "site": "dev"}, "annotations": {"description": "bdc-b05-lf1 - Ethernet2|gaw02.ls.lsst.org - Data Interface has 9.516k discarded output packets"}, "startsAt": "2024-04-26T20:46:34.933Z", "endsAt": "0001-01-01T00:00:00Z", "generatorURL": "https://prometheus.ruka.dev.lsst.org/graph?g0.expr=ifOutDiscards+%3E+1000&g0.tab=1", "fingerprint": "5eed63008f20c699"}, {"status": "firing", "labels": {"alertname": "ifOutDiscards", "device": "Ethernet3", "gnoc": "true", "ifAlias": "gaw03.ls.lsst.org - Data Interface", "ifDescr": "Ethernet3", "ifIndex": "3", "ifName": "Ethernet3", "instance": "bdc-b05-lf1", "job": "snmp-network", "node_name": "bdc-b05-lf1", "prom": "dev/ruka", "prometheus": "kube-prometheus-stack/kube-prometheus-stack-prometheus", "service_name": "ifOutDiscards-Ethernet3", "severity": "major", "site": "dev"}, "annotations": {"description": "bdc-b05-lf1 - Ethernet3|gaw03.ls.lsst.org - Data Interface has 138.6k discarded output packets"}, "startsAt": "2024-04-26T20:46:34.933Z", "endsAt": "0001-01-01T00:00:00Z", "generatorURL": "https://prometheus.ruka.dev.lsst.org/graph?g0.expr=ifOutDiscards+%3E+1000&g0.tab=1", "fingerprint": "f8b6c043f6d229ab"}, {"status": "firing", "labels": {"alertname": "ifOutDiscards", "device": "Ethernet5", "gnoc": "true", "ifAlias": "gaw05.ls.lsst.org - Data Interface", "ifDescr": "Ethernet5", "ifIndex": "5", 
"ifName": "Ethernet5", "instance": "bdc-b05-lf1", "job": "snmp-network", "node_name": "bdc-b05-lf1", "prom": "dev/ruka", "prometheus": "kube-prometheus-stack/kube-prometheus-stack-prometheus", "service_name": "ifOutDiscards-Ethernet5", "severity": "major", "site": "dev"}, "annotations": {"description": "bdc-b05-lf1 - Ethernet5|gaw05.ls.lsst.org - Data Interface has 3.305M discarded output packets"}, "startsAt": "2024-04-26T20:46:34.933Z", "endsAt": "0001-01-01T00:00:00Z", "generatorURL": "https://prometheus.ruka.dev.lsst.org/graph?g0.expr=ifOutDiscards+%3E+1000&g0.tab=1", "fingerprint": "17ad9928d833e5c3"}, {"status": "firing", "labels": {"alertname": "ifOutDiscards", "device": "Ethernet6", "gnoc": "true", "ifAlias": "luan01.ls.lsst.org - DATA Interface", "ifDescr": "Ethernet6", "ifIndex": "6", "ifName": "Ethernet6", "instance": "bdc-b05-lf1", "job": "snmp-network", "node_name": "bdc-b05-lf1", "prom": "dev/ruka", "prometheus": "kube-prometheus-stack/kube-prometheus-stack-prometheus", "service_name": "ifOutDiscards-Ethernet6", "severity": "major", "site": "dev"}, "annotations": {"description": "bdc-b05-lf1 - Ethernet6|luan01.ls.lsst.org - DATA Interface has 40.83k discarded output packets"}, "startsAt": "2024-04-26T20:46:34.933Z", "endsAt": "0001-01-01T00:00:00Z", "generatorURL": "https://prometheus.ruka.dev.lsst.org/graph?g0.expr=ifOutDiscards+%3E+1000&g0.tab=1", "fingerprint": "6dc23a89a9a5877d"}, {"status": "firing", "labels": {"alertname": "ifOutDiscards", "device": "Ethernet7", "gnoc": "true", "ifAlias": "luan02.ls.lsst.org - DATA Interface", "ifDescr": "Ethernet7", "ifIndex": "7", "ifName": "Ethernet7", "instance": "bdc-b05-lf1", "job": "snmp-network", "node_name": "bdc-b05-lf1", "prom": "dev/ruka", "prometheus": "kube-prometheus-stack/kube-prometheus-stack-prometheus", "service_name": "ifOutDiscards-Ethernet7", "severity": "major", "site": "dev"}, "annotations": {"description": "bdc-b05-lf1 - Ethernet7|luan02.ls.lsst.org - DATA Interface has 157.7k discarded 
output packets"}, "startsAt": "2024-04-26T20:46:34.933Z", "endsAt": "0001-01-01T00:00:00Z", "generatorURL": "https://prometheus.ruka.dev.lsst.org/graph?g0.expr=ifOutDiscards+%3E+1000&g0.tab=1", "fingerprint": "892de32d70ac8093"}, {"status": "firing", "labels": {"alertname": "ifOutDiscards", "device": "Ethernet8", "gnoc": "true", "ifAlias": "luan03.ls.lsst.org - DATA Interface", "ifDescr": "Ethernet8", "ifIndex": "8", "ifName": "Ethernet8", "instance": "bdc-b05-lf1", "job": "snmp-network", "node_name": "bdc-b05-lf1", "prom": "dev/ruka", "prometheus": "kube-prometheus-stack/kube-prometheus-stack-prometheus", "service_name": "ifOutDiscards-Ethernet8", "severity": "major", "site": "dev"}, "annotations": {"description": "bdc-b05-lf1 - Ethernet8|luan03.ls.lsst.org - DATA Interface has 18.18k discarded output packets"}, "startsAt": "2024-04-26T20:46:34.933Z", "endsAt": "0001-01-01T00:00:00Z", "generatorURL": "https://prometheus.ruka.dev.lsst.org/graph?g0.expr=ifOutDiscards+%3E+1000&g0.tab=1", "fingerprint": "1f53b0e3bbda30a1"}, {"status": "firing", "labels": {"alertname": "ifOutDiscards", "device": "Ethernet9", "gnoc": "true", "ifAlias": "luan04.ls.lsst.org - DATA Interface", "ifDescr": "Ethernet9", "ifIndex": "9", "ifName": "Ethernet9", "instance": "bdc-b05-lf1", "job": "snmp-network", "node_name": "bdc-b05-lf1", "prom": "dev/ruka", "prometheus": "kube-prometheus-stack/kube-prometheus-stack-prometheus", "service_name": "ifOutDiscards-Ethernet9", "severity": "major", "site": "dev"}, "annotations": {"description": "bdc-b05-lf1 - Ethernet9|luan04.ls.lsst.org - DATA Interface has 16.87k discarded output packets"}, "startsAt": "2024-04-26T20:46:34.933Z", "endsAt": "0001-01-01T00:00:00Z", "generatorURL": "https://prometheus.ruka.dev.lsst.org/graph?g0.expr=ifOutDiscards+%3E+1000&g0.tab=1", "fingerprint": "83a4305355089c9b"}, {"status": "firing", "labels": {"alertname": "ifInErrors", "device": "Ethernet12/1", "gnoc": "true", "ifAlias": "rubinobs-br01 Et12/1 <-> bdc-a04-lf1 
Et1/54", "ifDescr": "Ethernet12/1", "ifIndex": "12001", "ifName": "Ethernet12/1", "instance": "new-rubinobs-br01", "job": "snmp-network", "node_name": "new-rubinobs-br01", "prom": "dev/ruka", "prometheus": "kube-prometheus-stack/kube-prometheus-stack-prometheus", "service_name": "ifInErrors-Ethernet12/1", "severity": "major", "site": "dev"}, "annotations": {"description": "new-rubinobs-br01 - Ethernet12/1|rubinobs-br01 Et12/1 <-> bdc-a04-lf1 Et1/54 has 95.75k input errors"}, "startsAt": "2024-04-26T20:46:34.933Z", "endsAt": "0001-01-01T00:00:00Z", "generatorURL": "https://prometheus.ruka.dev.lsst.org/graph?g0.expr=ifInErrors+%3E+1000&g0.tab=1", "fingerprint": "25e6ecdaa5411da5"}, {"status": "firing", "labels": {"alertname": "ifInErrors", "device": "Ethernet17/1", "gnoc": "true", "ifAlias": "rubinobs-br01 Et17/1 <--SCIENCE #1--> LS-DWDM Linecard001-Port2", "ifDescr": "Ethernet17/1", "ifIndex": "17001", "ifName": "Ethernet17/1", "instance": "new-rubinobs-br01", "job": "snmp-network", "node_name": "new-rubinobs-br01", "prom": "dev/ruka", "prometheus": "kube-prometheus-stack/kube-prometheus-stack-prometheus", "service_name": "ifInErrors-Ethernet17/1", "severity": "major", "site": "dev"}, "annotations": {"description": "new-rubinobs-br01 - Ethernet17/1|rubinobs-br01 Et17/1 <--SCIENCE #1--> LS-DWDM Linecard001-Port2 has 12.2k input errors"}, "startsAt": "2024-04-26T20:46:34.933Z", "endsAt": "0001-01-01T00:00:00Z", "generatorURL": "https://prometheus.ruka.dev.lsst.org/graph?g0.expr=ifInErrors+%3E+1000&g0.tab=1", "fingerprint": "46df8c14dbab758c"}, {"status": "firing", "labels": {"alertname": "ifOutDiscards", "device": "Ethernet1/1", "gnoc": "true", "ifAlias": "to LHN-PRIMARY-LINK", "ifDescr": "Ethernet1/1", "ifIndex": "1001", "ifName": "Ethernet1/1", "instance": "new-rubinobs-br01", "job": "snmp-network", "node_name": "new-rubinobs-br01", "prom": "dev/ruka", "prometheus": "kube-prometheus-stack/kube-prometheus-stack-prometheus", "service_name": 
"ifOutDiscards-Ethernet1/1", "severity": "major", "site": "dev"}, "annotations": {"description": "new-rubinobs-br01 - Ethernet1/1|to LHN-PRIMARY-LINK has 4.891k discarded output packets"}, "startsAt": "2024-04-26T20:46:34.933Z", "endsAt": "0001-01-01T00:00:00Z", "generatorURL": "https://prometheus.ruka.dev.lsst.org/graph?g0.expr=ifOutDiscards+%3E+1000&g0.tab=1", "fingerprint": "cd8482239be143fc"}, {"status": "firing", "labels": {"alertname": "ifOutDiscards", "device": "Ethernet13/1", "gnoc": "true", "ifAlias": "rubinobs-br01 Et13/1 <-SUMMIT-LINK--VRF_RUBIN-> sdc-a05-sp01 Et60/1", "ifDescr": "Ethernet13/1", "ifIndex": "13001", "ifName": "Ethernet13/1", "instance": "new-rubinobs-br01", "job": "snmp-network", "node_name": "new-rubinobs-br01", "prom": "dev/ruka", "prometheus": "kube-prometheus-stack/kube-prometheus-stack-prometheus", "service_name": "ifOutDiscards-Ethernet13/1", "severity": "major", "site": "dev"}, "annotations": {"description": "new-rubinobs-br01 - Ethernet13/1|rubinobs-br01 Et13/1 <-SUMMIT-LINK--VRF_RUBIN-> sdc-a05-sp01 Et60/1 has 1.512k discarded output packets"}, "startsAt": "2024-04-26T20:46:34.933Z", "endsAt": "0001-01-01T00:00:00Z", "generatorURL": "https://prometheus.ruka.dev.lsst.org/graph?g0.expr=ifOutDiscards+%3E+1000&g0.tab=1", "fingerprint": "a5112a7000ea9bb1"}, {"status": "firing", "labels": {"alertname": "ifOutDiscardsRate", "device": "Ethernet17/1", "gnoc": "true", "ifAlias": "rubinobs-br01 Et17/1 <--SCIENCE #1--> LS-DWDM Linecard001-Port2", "ifDescr": "Ethernet17/1", "ifIndex": "17001", "ifName": "Ethernet17/1", "instance": "new-rubinobs-br01", "job": "snmp-network", "node_name": "new-rubinobs-br01", "prom": "dev/ruka", "prometheus": "kube-prometheus-stack/kube-prometheus-stack-prometheus", "service_name": "ifOutDiscardsRate-Ethernet17/1", "severity": "major", "site": "dev"}, "annotations": {"description": "new-rubinobs-br01 - Ethernet17/1|rubinobs-br01 Et17/1 <--SCIENCE #1--> LS-DWDM Linecard001-Port2 has 560.2u discarded output 
packets in the last hour"}, "startsAt": "2024-04-29T17:25:34.933Z", "endsAt": "0001-01-01T00:00:00Z", "generatorURL": "https://prometheus.ruka.dev.lsst.org/graph?g0.expr=rate%28ifOutDiscards%5B1h%5D%29+%3E+0&g0.tab=1", "fingerprint": "8ee66bea73ebe91e"}], "groupLabels": {"gnoc": "true"}, "commonLabels": {"gnoc": "true", "job": "snmp-network", "prom": "dev/ruka", "prometheus": "kube-prometheus-stack/kube-prometheus-stack-prometheus", "site": "dev"}, "commonAnnotations": {}, "externalURL": "https://alertmanager.ruka.dev.lsst.org", "version": "4", "groupKey": "{}/{gnoc=\"true\"}:{gnoc=\"true\"}", "truncatedAlerts": 0}