Merge branch 'add-service-and-service-monitor-for-forwarder' of github.com:robusta-dev/robusta into add-service-and-service-monitor-for-forwarder

itisallgood committed Sep 20, 2024
2 parents 2a3833c + 8d713f6 commit 236d293
Showing 34 changed files with 697 additions and 440 deletions.
9 changes: 5 additions & 4 deletions Dockerfile
@@ -49,10 +49,6 @@ ENV PATH="/venv/bin:$PATH"
ENV PYTHONPATH=$PYTHONPATH:.:/app/src

WORKDIR /app
COPY --from=builder /app/venv /venv
COPY --from=builder /etc/robusta/playbooks/defaults /etc/robusta/playbooks/defaults
# Copy virtual environment and application files from the build stage
COPY --from=builder /app /app

# Install necessary packages for the runtime environment
RUN apt-get update \
@@ -67,6 +63,11 @@ RUN git config --global core.symlinks false
# Remove setuptools-65.5.1 installed from python:3.11-slim base image as a fix for CVE-2024-6345 until the image is updated
RUN rm -rf /usr/local/lib/python3.11/site-packages/setuptools-65.5.1.dist-info

COPY --from=builder /app/venv /venv
COPY --from=builder /etc/robusta/playbooks/defaults /etc/robusta/playbooks/defaults
# Copy virtual environment and application files from the build stage
COPY --from=builder /app /app

# Run the application
# -u disables stdout buffering https://stackoverflow.com/questions/107705/disable-output-buffering
CMD [ "python3", "-u", "-m", "robusta.runner.main"]
24 changes: 8 additions & 16 deletions README.md
@@ -1,8 +1,8 @@
<div id="top"></div>

<div align="center">
<h1 align="center">Robusta - Better Prometheus Alerts (and more) for Kubernetes</h1>
<h2 align="center">Enrich, Group, and Remediate your Alerts</h2>
<h1 align="center">Robusta - Better Prometheus Notifications for Kubernetes</h1>
<h2 align="center">Better grouping, enrichment, and remediation of your existing alerts</h2>
<p align="center">
<a href="#%EF%B8%8F-how-it-works"><strong>How it Works</strong></a> |
<a href="#-installing-robusta"><strong>Installation</strong></a> |
@@ -14,7 +14,9 @@

## What Can Robusta Do?

Robusta integrates with Prometheus (e.g. `kube-prometheus-stack` or Coralogix) by webhook and adds features like:
Compatible with kube-prometheus-stack, Prometheus Operator, and more.

Robusta integrates with Prometheus by webhook and adds features like:

* [**Smart Grouping**](https://docs.robusta.dev/master/configuration/notification-grouping.html) - reduce notification spam with Slack threads 🧵
* [**AI Investigation**](https://docs.robusta.dev/master/configuration/ai-analysis.html#ai-analysis) - Kickstart alert investigation with AI (optional)
@@ -30,23 +32,13 @@ Don't have Prometheus? You can use Robusta without Prometheus, or install our al

## 🛠️ How it works

Robusta is powered by a rule engine that takes incoming events (e.g. Prometheus alerts) and runs actions on them to gather more information or remediate problems.

Here is an example rule that adds Pod logs to the `KubePodCrashLooping` alert from Prometheus:

```yaml
triggers:
- on_prometheus_alert:
    alert_name: KubePodCrashLooping
actions:
- logs_enricher: {}
```
Robusta uses rules and AI to take Prometheus alerts and add extra information to them, such as pod logs, relevant graphs, possible remediations, and more.

The resulting alert looks like this in Slack:
Here is an example alert in Slack:

![](./docs/images/crash-report.png)

When performing auto-remediation, you can configure 100% automation, or semi-automatic mode that requires user confirmation:
Here is an example remediation action:

![](./docs/images/alert_on_hpa_reached_limit1.png)

10 changes: 9 additions & 1 deletion docs/conf.py
@@ -97,9 +97,17 @@
"configuration/additional-settings.html": "/master/setup-robusta/additional-settings.html",
"developer-guide/writing-playbooks.html": "/master/playbook-reference/defining-playbooks/index.html",
"user-guide/slack.html": "/master/configuration/sinks/slack.html",
"user-guide/elasticsearch.html": "/master/playbook-reference/triggers/elasticsearch.html"
"user-guide/elasticsearch.html": "/master/playbook-reference/triggers/elasticsearch.html",
"catalog/actions/python-troubleshooting.html": "/master/playbook-reference/actions/python-troubleshooting.html",
"configuration/notification-grouping.html": "/master/notification-routing/notification-grouping.html",
"tutorials/routing-by-namespace.html": "/master/notification-routing/routing-by-namespace.html",
"tutorials/routing-by-type.html": "/master/notification-routing/routing-by-type.html",
"tutorials/routing-exclusion.html": "/master/notification-routing/routing-exclusion.html",
"tutorials/routing-silencing.html": "/master/notification-routing/routing-silencing.html",
"configuration/configuring-sinks.html": "/master/notification-routing/configuring-sinks.html"
}


# for sphinx_jinja - see https://github.com/tardyp/sphinx-jinja
jinja_contexts = {}

266 changes: 177 additions & 89 deletions docs/configuration/ai-analysis.rst
@@ -24,103 +24,191 @@ Configuration

Only GPT-4o is officially supported. We highly recommend using GPT-4o to get the most accurate results!

In order to include ``Holmes GPT`` with your Robusta installation, add the following to your ``generated_values.yaml``.

In the examples below, we're assuming you created a Kubernetes ``secret`` named ``holmes-secrets`` to store sensitive variables.

To use Open AI (this is the default llm):

.. code-block:: yaml

    enableHolmesGPT: true
    holmes:
      additionalEnvVars:
      - name: MODEL
        value: gpt-4o
      - name: OPENAI_API_KEY
        valueFrom:
          secretKeyRef:
            name: holmes-secrets
            key: openAiKey

.. tab-set::

.. tab-item:: OpenAI
:name: open-ai

Create a secret with your OpenAI API key:

.. code-block:: bash

    kubectl create secret generic holmes-secrets -n robusta --from-literal=openAiKey='<API_KEY_GOES_HERE>'

Then add the following to your helm values (``generated_values.yaml`` file):

.. code-block:: yaml

    enableHolmesGPT: true
    holmes:
      additionalEnvVars:
      - name: MODEL
        value: gpt-4o
      - name: OPENAI_API_KEY
        valueFrom:
          secretKeyRef:
            name: holmes-secrets
            key: openAiKey
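
Optionally, you can sanity-check the stored key before or after the Helm upgrade. A minimal sketch, assuming the ``robusta`` namespace and the secret/key names used above:

.. code-block:: bash

    # Read the key back from the cluster to confirm the secret was created correctly
    kubectl get secret holmes-secrets -n robusta -o jsonpath='{.data.openAiKey}' | base64 -d; echo

    # Verify the key against the OpenAI API; a model list in the response means the key works
    curl -s https://api.openai.com/v1/models -H "Authorization: Bearer <API_KEY_GOES_HERE>" | head -c 300; echo
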
To use Azure Open AI:

.. code-block:: yaml

    enableHolmesGPT: true
    holmes:
      additionalEnvVars:
      - name: MODEL
        value: azure/my-azure-deployment # the name of your azure deployment
      - name: AZURE_API_VERSION
        value: 2024-02-15-preview
      - name: AZURE_API_BASE
        value: https://my-org.openai.azure.com/ # base url of your azure deployment
      - name: AZURE_API_KEY
        valueFrom:
          secretKeyRef:
            name: holmes-secrets
            key: azureOpenAiKey

To use Azure AI, follow the setup instructions below and then edit Robusta's Helm values. Do NOT skip the setup instructions, as they include a mandatory change to rate limits. Without this change, Holmes won't work.

.. details:: Mandatory Setup for Azure AI

The following steps cover how to obtain the correct AZURE_API_VERSION value and how to increase the token limit to prevent rate limiting.

1. Go to your Azure portal and choose `Azure OpenAI`

.. image:: /images/AzureAI/AzureAI_HolmesStep1.png
:width: 600px

2. Click your AI service

.. image:: /images/AzureAI/AzureAI_HolmesStep2.png
:width: 600px

3. Click Go to Azure Open AI Studio

.. image:: /images/AzureAI/AzureAI_HolmesStep3.png
:width: 600px

4. Choose Deployments

.. image:: /images/AzureAI/AzureAI_HolmesStep4.png
:width: 600px

5. Select your Deployment

.. image:: /images/AzureAI/AzureAI_HolmesStep5.png
:width: 600px

6. Click Open in Playground

.. image:: /images/AzureAI/AzureAI_HolmesStep6.png
:width: 600px

7. Go to View Code

.. image:: /images/AzureAI/AzureAI_HolmesStep7.png
:width: 600px

8. Choose Python and scroll to find the API VERSION. Copy this! You will need it for Robusta's Helm values.

.. image:: /images/AzureAI/AzureAI_HolmesStep8.png
:width: 600px

9. Go back to Deployments, and click Edit Deployment

.. image:: /images/AzureAI/AzureAI_HolmesStep9.png
:width: 600px

10. MANDATORY: Increase the token limit. Change this value to at least 450K tokens for Holmes to work properly. We recommend choosing the highest value available. (Holmes queries Azure AI infrequently but in bursts. Therefore the overall cost of using Holmes with Azure AI is very low, but you must increase the quota to avoid getting rate-limited on a single burst of requests.)

.. image:: /images/AzureAI/AzureAI_HolmesStep10.png
:width: 600px


To use AWS Bedrock:

.. code-block:: yaml

    enableHolmesGPT: true
    holmes:
      enablePostProcessing: true
      additionalEnvVars:
      - name: MODEL
        value: bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0 # your bedrock model
      - name: AWS_REGION_NAME
        value: us-east-1
      - name: AWS_ACCESS_KEY_ID
        valueFrom:
          secretKeyRef:
            name: holmes-secrets
            key: awsAccessKeyId
      - name: AWS_SECRET_ACCESS_KEY
        valueFrom:
          secretKeyRef:
            name: holmes-secrets
            key: awsSecretAccessKey

Do a Helm upgrade to apply the new values: ``helm upgrade robusta robusta/robusta --values=generated_values.yaml --set clusterName=<YOUR_CLUSTER_NAME>``


.. tab-item:: Azure AI
:name: azure-ai

Go into your Azure portal, **change the default rate-limit to the maximum**, and find the following parameters:

* API_VERSION
* DEPLOYMENT_NAME
* ENDPOINT
* API_KEY

.. details:: Step-By-Step Instructions for the Azure Portal

The following steps cover how to obtain the correct AZURE_API_VERSION value and how to increase the token limit to prevent rate limiting.

1. Go to your Azure portal and choose `Azure OpenAI`

.. image:: /images/AzureAI/AzureAI_HolmesStep1.png
:width: 600px

2. Click your AI service

.. image:: /images/AzureAI/AzureAI_HolmesStep2.png
:width: 600px

3. Click Go to Azure Open AI Studio

.. image:: /images/AzureAI/AzureAI_HolmesStep3.png
:width: 600px

4. Choose Deployments

.. image:: /images/AzureAI/AzureAI_HolmesStep4.png
:width: 600px

5. Select your Deployment - note the DEPLOYMENT_NAME!

.. image:: /images/AzureAI/AzureAI_HolmesStep5.png
:width: 600px

6. Click Open in Playground

.. image:: /images/AzureAI/AzureAI_HolmesStep6.png
:width: 600px

7. Go to View Code

.. image:: /images/AzureAI/AzureAI_HolmesStep7.png
:width: 600px

8. Choose Python and scroll to find the ENDPOINT, API_KEY, and API_VERSION. Copy them! You will need them for Robusta's Helm values.

.. image:: /images/AzureAI/AzureAI_HolmesStep8.png
:width: 600px

9. Go back to Deployments, and click Edit Deployment

.. image:: /images/AzureAI/AzureAI_HolmesStep9.png
:width: 600px

10. MANDATORY: Increase the token limit. Change this value to at least 450K tokens for Holmes to work properly. We recommend choosing the highest value available. (Holmes queries Azure AI infrequently but in bursts. Therefore the overall cost of using Holmes with Azure AI is very low, but you must increase the quota to avoid getting rate-limited on a single burst of requests.)

.. image:: /images/AzureAI/AzureAI_HolmesStep10.png
:width: 600px
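
Optionally, you can sanity-check the values you collected with a direct API call before wiring them into Robusta. A minimal sketch; every placeholder is a value from the portal steps above, and the ENDPOINT is the base URL (e.g. https://my-org.openai.azure.com):

.. code-block:: bash

    # A 200 response containing a chat completion confirms the endpoint, deployment name, API version, and key
    curl -s "<ENDPOINT>/openai/deployments/<DEPLOYMENT_NAME>/chat/completions?api-version=<API_VERSION>" \
      -H "Content-Type: application/json" \
      -H "api-key: <API_KEY>" \
      -d '{"messages":[{"role":"user","content":"ping"}]}'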


Create a secret with the Azure API key you found above:

.. code-block:: bash

    kubectl create secret generic holmes-secrets -n robusta --from-literal=azureOpenAiKey='<AZURE_API_KEY_GOES_HERE>'

Update your helm values (``generated_values.yaml`` file) with the following configuration:

.. code-block:: yaml

    enableHolmesGPT: true
    holmes:
      additionalEnvVars:
      - name: MODEL
        value: azure/<DEPLOYMENT_NAME> # replace with deployment name from the portal (e.g. avi-deployment), leave "azure/" prefix
      - name: AZURE_API_VERSION
        value: <API_VERSION> # replace with API version you found in the Azure portal
      - name: AZURE_API_BASE
        value: <AZURE_ENDPOINT> # fill in the base endpoint url of your azure deployment - e.g. https://my-org.openai.azure.com/
      - name: AZURE_API_KEY
        valueFrom:
          secretKeyRef:
            name: holmes-secrets
            key: azureOpenAiKey

Do a Helm upgrade to apply the new values: ``helm upgrade robusta robusta/robusta --values=generated_values.yaml --set clusterName=<YOUR_CLUSTER_NAME>``

.. tab-item:: AWS Bedrock
:name: aws-bedrock

You will need the following AWS parameters:

* BEDROCK_MODEL_NAME
* AWS_ACCESS_KEY_ID
* AWS_SECRET_ACCESS_KEY
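
Optionally, confirm the credentials work and that your chosen model is available in your region before configuring Robusta. A minimal sketch using the AWS CLI; the region and model family below are examples:

.. code-block:: bash

    # Confirm the credentials are valid
    aws sts get-caller-identity

    # List Bedrock model IDs available in the region and look for the model you plan to use
    aws bedrock list-foundation-models --region us-east-1 --query "modelSummaries[].modelId" --output text | tr '\t' '\n' | grep claude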

Create a secret with your AWS credentials:

.. code-block:: bash

    kubectl create secret generic holmes-secrets -n robusta --from-literal=awsAccessKeyId='<YOUR_AWS_ACCESS_KEY_ID>' --from-literal=awsSecretAccessKey='<YOUR_AWS_SECRET_ACCESS_KEY>'

Update your helm values (``generated_values.yaml`` file) with the following configuration:

.. code-block:: yaml

    enableHolmesGPT: true
    holmes:
      enablePostProcessing: true
      additionalEnvVars:
      - name: MODEL
        value: bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0 # your bedrock model - replace with your own exact model name
      - name: AWS_REGION_NAME
        value: us-east-1
      - name: AWS_ACCESS_KEY_ID
        valueFrom:
          secretKeyRef:
            name: holmes-secrets
            key: awsAccessKeyId
      - name: AWS_SECRET_ACCESS_KEY
        valueFrom:
          secretKeyRef:
            name: holmes-secrets
            key: awsSecretAccessKey

Do a Helm upgrade to apply the new values: ``helm upgrade robusta robusta/robusta --values=generated_values.yaml --set clusterName=<YOUR_CLUSTER_NAME>``
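
After the upgrade, you can confirm that the Holmes deployment picked up the new environment variables. A minimal sketch; the deployment name varies by install, so look it up first:

.. code-block:: bash

    # Find the Holmes deployment created by the chart
    kubectl get deployments -n robusta | grep -i holmes

    # List the env vars actually set on it and confirm MODEL plus the API key reference are present
    kubectl set env deployment/<HOLMES_DEPLOYMENT_NAME> --list -n robusta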


Test Holmes Integration
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In this section we will see Holmes in action by deploying a crashing pod and analyzing the alert with AI.

Before we proceed, you must follow the instructions above and configure Holmes.

1. Let's deploy a crashing pod to simulate an issue.

.. code-block:: bash

    kubectl apply -f https://raw.githubusercontent.com/robusta-dev/kubernetes-demos/main/crashpod/broken.yaml

2. Go to the **Timeline** in `platform.robusta.dev <https://platform.robusta.dev/>`_ and click on the ``CrashLoopBackOff`` alert

.. image:: /images/AI_Analysis_demo.png
:width: 1000px

3. Click the "Root Cause" tab on the top. This gives you the result of an investigation done by HolmesGPT based on the alert.

.. image:: /images/AI_Analysis_demo2.png
:width: 1000px
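
If the alert doesn't show up after a few minutes, first confirm the demo pod is actually crash-looping; a quick check, assuming the demo deployment is named ``crashpod`` as in the manifest path above:

.. code-block:: bash

    kubectl get pods -A | grep -i crashpod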

Additionally, your alerts on Slack will have an "Ask Holmes" button. Clicking it will give you results in the Slack channel itself. Note that due to technical limitations with Slack buttons, alerts analyzed from Slack are sent to the AI without alert labels. For the most accurate results, it is best to use the UI.
