Skip to content

Commit

Permalink
Merge branch 'Azure-Samples:main' into release-please-token
Browse files Browse the repository at this point in the history
  • Loading branch information
frtibble authored May 16, 2024
2 parents a04f85d + b8e34aa commit 6ca3f07
Show file tree
Hide file tree
Showing 21 changed files with 479 additions and 111 deletions.
5 changes: 2 additions & 3 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
FROM --platform=linux/amd64 mcr.microsoft.com/devcontainers/python:1-3.11-bullseye
# We need to force the container to be amd so that it works on a Mac. Without this the functions extension doesn't install.
FROM mcr.microsoft.com/devcontainers/python:3.11

# install git
RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
&& apt-get -y install --no-install-recommends git libgtk2.0-0 libgtk-3-0 libgbm-dev libnotify-dev libnss3 libxss1 libasound2 libxtst6 xauth xvfb
&& apt-get -y install --no-install-recommends git libgtk2.0-0 libgtk-3-0 libgbm-dev libnotify-dev libnss3 libxss1 libasound2 libxtst6 xauth xvfb
5 changes: 2 additions & 3 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@
"ghcr.io/devcontainers/features/azure-cli:1": {},
"ghcr.io/devcontainers/features/docker-outside-of-docker:1": {},
"ghcr.io/devcontainers/features/node:1": {},
"ghcr.io/jlaundry/devcontainer-features/azure-functions-core-tools:1": {
"version": "4.0.5530"
},
"ghcr.io/jlaundry/devcontainer-features/azure-functions-core-tools:1": {},
"ghcr.io/azure/azure-dev/azd:latest": {},
"ghcr.io/rchaganti/vsc-devcontainer-features/azurebicep:1.0.5": {}
},
Expand All @@ -28,6 +26,7 @@
"ms-python.python",
"ms-python.black-formatter",
"ms-python.vscode-pylance",
"ms-python.pylint",
"ms-toolsai.jupyter",
"ms-vscode.vscode-node-azure-pack",
"TeamsDevApp.ms-teams-vscode-extension",
Expand Down
1 change: 1 addition & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,5 @@
"python.testing.cwd": "${workspaceFolder}/code",
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"pylint.path" : [ "${interpreter}", "-m", "pylint" ]
}
8 changes: 7 additions & 1 deletion code/app.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,24 @@
"""
This module contains the entry point for the application.
"""

import os
import logging
from azure.monitor.opentelemetry import configure_azure_monitor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor

logging.captureWarnings(True)
logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO").upper())
# Raising the azure log level to WARN as it is too verbose - https://github.com/Azure/azure-sdk-for-python/issues/9422
# Raising the azure log level to WARN as it is too verbose -
# https://github.com/Azure/azure-sdk-for-python/issues/9422
logging.getLogger("azure").setLevel(os.environ.get("LOGLEVEL_AZURE", "WARN").upper())
# We cannot use EnvHelper here as Application Insights should be configured first
# for instrumentation to work correctly
if os.getenv("APPLICATIONINSIGHTS_ENABLED", "false").lower() == "true":
configure_azure_monitor()
HTTPXClientInstrumentor().instrument() # httpx is used by openai

# pylint: disable=wrong-import-position
from create_app import create_app # noqa: E402

app = create_app()
Expand Down
13 changes: 9 additions & 4 deletions code/backend/Admin.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
import streamlit as st
"""
This module contains the code for the Admin app of the Chat with your data Solution Accelerator.
"""

import os
import logging
import sys
import streamlit as st
from azure.monitor.opentelemetry import configure_azure_monitor

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

logging.captureWarnings(True)
logging.basicConfig(level=os.getenv("LOGLEVEL", "INFO").upper())
# Raising the azure log level to WARN as it is too verbose - https://github.com/Azure/azure-sdk-for-python/issues/9422
# Raising the azure log level to WARN as it is too verbose
# https://github.com/Azure/azure-sdk-for-python/issues/9422
logging.getLogger("azure").setLevel(os.environ.get("LOGLEVEL_AZURE", "WARN").upper())
# We cannot use EnvHelper here as Application Insights needs to be configured first
# for instrumentation to work correctly
Expand All @@ -26,14 +31,14 @@
menu_items=None,
)

mod_page_style = """
MOD_PAGE_STYLE = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
header {visibility: hidden;}
</style>
"""
st.markdown(mod_page_style, unsafe_allow_html=True)
st.markdown(MOD_PAGE_STYLE, unsafe_allow_html=True)


col1, col2, col3 = st.columns([1, 2, 1])
Expand Down
42 changes: 29 additions & 13 deletions code/backend/batch/batch_push_results.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
import os
import logging
import json
import azure.functions as func
from urllib.parse import urlparse
import azure.functions as func

from utilities.helpers.azure_blob_storage_client import AzureBlobStorageClient
from utilities.helpers.env_helper import EnvHelper
from utilities.helpers.embedders.embedder_factory import EmbedderFactory
from utilities.search.search import Search

bp_batch_push_results = func.Blueprint()
logger = logging.getLogger(__name__)
logger.setLevel(level=os.environ.get("LOGLEVEL", "INFO").upper())


def _get_file_name_from_message(msg: func.QueueMessage) -> str:
message_body = json.loads(msg.get_body().decode("utf-8"))
def _get_file_name_from_message(message_body) -> str:
return message_body.get(
"filename",
"/".join(
Expand All @@ -27,21 +27,37 @@ def _get_file_name_from_message(msg: func.QueueMessage) -> str:
arg_name="msg", queue_name="doc-processing", connection="AzureWebJobsStorage"
)
def batch_push_results(msg: func.QueueMessage) -> None:
do_batch_push_results(msg)
message_body = json.loads(msg.get_body().decode("utf-8"))
logger.debug("Process Document Event queue function triggered: %s", message_body)

event_type = message_body.get("eventType", "")
# We handle "" in this scenario for backwards compatibility
# This function is primarily triggered by an Event Grid queue message from the blob storage
# However, it can also be triggered using a legacy schema from BatchStartProcessing
if event_type in ("", "Microsoft.Storage.BlobCreated"):
_process_document_created_event(message_body)

elif event_type == "Microsoft.Storage.BlobDeleted":
_process_document_deleted_event(message_body)

else:
raise NotImplementedError(f"Unknown event type received: {event_type}")


def do_batch_push_results(msg: func.QueueMessage) -> None:
def _process_document_created_event(message_body) -> None:
env_helper: EnvHelper = EnvHelper()
logger.info(
"Python queue trigger function processed a queue item: %s",
msg.get_body().decode("utf-8"),
)

blob_client = AzureBlobStorageClient()
# Get the file name from the message
file_name = _get_file_name_from_message(msg)
# Generate the SAS URL for the file
file_name = _get_file_name_from_message(message_body)
file_sas = blob_client.get_blob_sas(file_name)
# Process the file

embedder = EmbedderFactory.create(env_helper)
embedder.embed_file(file_sas, file_name)


def _process_document_deleted_event(message_body) -> None:
    """Remove a deleted blob's indexed documents from the search index.

    Handles Microsoft.Storage.BlobDeleted events. Indexed source URLs are
    stored with a SAS-token placeholder suffix, so the same suffix is
    appended to the deleted blob's URL before delegating the deletion to
    the search handler.
    """
    env_helper: EnvHelper = EnvHelper()
    handler = Search.get_search_handler(env_helper)

    event_data = message_body.get("data", {})
    deleted_blob_url = event_data.get("url", "")
    handler.delete_by_source(deleted_blob_url + "_SAS_TOKEN_PLACEHOLDER_")
3 changes: 2 additions & 1 deletion code/backend/batch/function_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
logging.captureWarnings(True)
# Raising the azure log level to WARN as it is too verbose - https://github.com/Azure/azure-sdk-for-python/issues/9422
logging.getLogger("azure").setLevel(os.environ.get("LOGLEVEL_AZURE", "WARN").upper())
configure_azure_monitor()
if os.getenv("APPLICATIONINSIGHTS_ENABLED", "false").lower() == "true":
configure_azure_monitor()

app = func.FunctionApp(
http_auth_level=func.AuthLevel.FUNCTION
Expand Down
53 changes: 44 additions & 9 deletions code/backend/batch/utilities/helpers/embedders/push_embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

class PushEmbedder(EmbedderBase):
def __init__(self, blob_client: AzureBlobStorageClient, env_helper: EnvHelper):
self.env_helper = env_helper
self.llm_helper = LLMHelper()
self.azure_search_helper = AzureSearchHelper()
self.azure_computer_vision_client = AzureComputerVisionClient(env_helper)
Expand Down Expand Up @@ -59,13 +60,15 @@ def __embed(
in self.config.get_advanced_image_processing_image_types()
):
logger.warning("Advanced image processing is not supported yet")
image_vectors = self.azure_computer_vision_client.vectorize_image(
source_url
)
logger.info("Image vectors: " + str(image_vectors))

caption = self.__generate_image_caption(source_url)
caption_vector = self.llm_helper.generate_embeddings(caption)

image_vector = self.azure_computer_vision_client.vectorize_image(source_url)
documents_to_upload.append(
self.__create_image_document(source_url, image_vectors)
self.__create_image_document(
source_url, image_vector, caption, caption_vector
)
)
else:
documents: List[SourceDocument] = self.document_loading.load(
Expand All @@ -85,6 +88,32 @@ def __embed(
logger.error("Failed to upload documents to search index")
raise Exception(response)

def __generate_image_caption(self, source_url):
    """Generate a rich text caption for an image using the vision model.

    Sends a two-part chat request (system instructions plus a user turn
    containing a text instruction and the image URL) to the model named by
    AZURE_OPENAI_VISION_MODEL.

    Args:
        source_url: URL of the image, passed to the model as an image_url
            content part.

    Returns:
        The caption text from the first chat-completion choice.
    """
    model = self.env_helper.AZURE_OPENAI_VISION_MODEL
    # Typos fixed ("descriptons" -> "descriptions", "sentances" -> "sentences")
    # so the model receives a clean, unambiguous prompt.
    caption_system_message = """You are an assistant that generates rich descriptions of images.
You need to be accurate in the information you extract and detailed in the descriptions you generate.
Do not abbreviate anything and do not shorten sentences. Explain the image completely.
If you are provided with an image of a flow chart, describe the flow chart in detail.
If the image is mostly text, use OCR to extract the text as it is displayed in the image."""

    messages = [
        {"role": "system", "content": caption_system_message},
        {
            "role": "user",
            "content": [
                {
                    "text": "Describe this image in detail. Limit the response to 500 words.",
                    "type": "text",
                },
                {"image_url": source_url, "type": "image_url"},
            ],
        },
    ]

    response = self.llm_helper.get_chat_completion(messages, model)
    caption = response.choices[0].message.content
    return caption

def __convert_to_search_document(self, document: SourceDocument):
embedded_content = self.llm_helper.generate_embeddings(document.content)
metadata = {
Expand All @@ -111,7 +140,13 @@ def __generate_document_id(self, source_url: str) -> str:
hash_key = hashlib.sha1(f"{source_url}_1".encode("utf-8")).hexdigest()
return f"doc_{hash_key}"

def __create_image_document(self, source_url: str, image_vectors: List[float]):
def __create_image_document(
self,
source_url: str,
image_vector: List[float],
content: str,
content_vector: List[float],
):
parsed_url = urlparse(source_url)

file_url = parsed_url.scheme + "://" + parsed_url.netloc + parsed_url.path
Expand All @@ -127,9 +162,9 @@ def __create_image_document(self, source_url: str, image_vectors: List[float]):

return {
"id": document_id,
"content": "",
"content_vector": [],
"image_vector": image_vectors,
"content": content,
"content_vector": content_vector,
"image_vector": image_vector,
"metadata": json.dumps(
{
"id": document_id,
Expand Down
1 change: 1 addition & 0 deletions code/backend/batch/utilities/helpers/env_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def __load_config(self, **kwargs) -> None:
self.AZURE_OPENAI_MODEL_NAME = os.getenv(
"AZURE_OPENAI_MODEL_NAME", "gpt-35-turbo"
)
self.AZURE_OPENAI_VISION_MODEL = os.getenv("AZURE_OPENAI_VISION_MODEL", "gpt-4")
self.AZURE_OPENAI_TEMPERATURE = os.getenv("AZURE_OPENAI_TEMPERATURE", "0")
self.AZURE_OPENAI_TOP_P = os.getenv("AZURE_OPENAI_TOP_P", "1.0")
self.AZURE_OPENAI_MAX_TOKENS = os.getenv("AZURE_OPENAI_MAX_TOKENS", "1000")
Expand Down
4 changes: 2 additions & 2 deletions code/backend/batch/utilities/helpers/llm_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,9 @@ def get_chat_completion_with_functions(
function_call=function_call,
)

def get_chat_completion(self, messages: list[dict]):
def get_chat_completion(self, messages: list[dict], model: str | None = None):
return self.openai_client.chat.completions.create(
model=self.llm_model,
model=model or self.llm_model,
messages=messages,
)

Expand Down
26 changes: 24 additions & 2 deletions code/backend/batch/utilities/search/search_handler_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,35 @@ def get_files(self):
pass

@abstractmethod
def output_results(self, results, id_field):
def output_results(self, results):
pass

@abstractmethod
def delete_files(self, files, id_field):
def delete_files(self, files):
pass

@abstractmethod
def query_search(self, question) -> list[SourceDocument]:
pass

def delete_by_source(self, source) -> None:
    """Delete every indexed document whose source matches *source*.

    No-op when *source* is None or when the index lookup returns no
    result set.
    """
    if source is None:
        return

    matching_docs = self._get_documents_by_source(source)
    if matching_docs is not None:
        self.delete_files(self.output_results(matching_docs))

def _get_documents_by_source(self, source):
if source is None:
return None

return self.search_client.search(
"*",
select="id, title",
include_total_count=True,
filter=f"source eq '{source}'",
)
Loading

0 comments on commit 6ca3f07

Please sign in to comment.