Skip to content

Commit

Permalink
Merge branch 'Azure-Samples:main' into release-please-token
Browse files Browse the repository at this point in the history
  • Loading branch information
frtibble authored May 16, 2024
2 parents a04f85d + b8e34aa commit 6ca3f07
Show file tree
Hide file tree
Showing 21 changed files with 479 additions and 111 deletions.
5 changes: 2 additions & 3 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
FROM --platform=linux/amd64 mcr.microsoft.com/devcontainers/python:1-3.11-bullseye
# We need to force the container to be amd so that it works on a Mac. Without this the functions extension doesn't install.
FROM mcr.microsoft.com/devcontainers/python:3.11

# install git
RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
&& apt-get -y install --no-install-recommends git libgtk2.0-0 libgtk-3-0 libgbm-dev libnotify-dev libnss3 libxss1 libasound2 libxtst6 xauth xvfb
&& apt-get -y install --no-install-recommends git libgtk2.0-0 libgtk-3-0 libgbm-dev libnotify-dev libnss3 libxss1 libasound2 libxtst6 xauth xvfb
5 changes: 2 additions & 3 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@
"ghcr.io/devcontainers/features/azure-cli:1": {},
"ghcr.io/devcontainers/features/docker-outside-of-docker:1": {},
"ghcr.io/devcontainers/features/node:1": {},
"ghcr.io/jlaundry/devcontainer-features/azure-functions-core-tools:1": {
"version": "4.0.5530"
},
"ghcr.io/jlaundry/devcontainer-features/azure-functions-core-tools:1": {},
"ghcr.io/azure/azure-dev/azd:latest": {},
"ghcr.io/rchaganti/vsc-devcontainer-features/azurebicep:1.0.5": {}
},
Expand All @@ -28,6 +26,7 @@
"ms-python.python",
"ms-python.black-formatter",
"ms-python.vscode-pylance",
"ms-python.pylint",
"ms-toolsai.jupyter",
"ms-vscode.vscode-node-azure-pack",
"TeamsDevApp.ms-teams-vscode-extension",
Expand Down
1 change: 1 addition & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,5 @@
"python.testing.cwd": "${workspaceFolder}/code",
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"pylint.path" : [ "${interpreter}", "-m", "pylint" ]
}
8 changes: 7 additions & 1 deletion code/app.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,24 @@
"""
This module contains the entry point for the application.
"""

import os
import logging
from azure.monitor.opentelemetry import configure_azure_monitor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor

logging.captureWarnings(True)
logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO").upper())
# Raising the azure log level to WARN as it is too verbose - https://github.com/Azure/azure-sdk-for-python/issues/9422
# Raising the azure log level to WARN as it is too verbose -
# https://github.com/Azure/azure-sdk-for-python/issues/9422
logging.getLogger("azure").setLevel(os.environ.get("LOGLEVEL_AZURE", "WARN").upper())
# We cannot use EnvHelper here as Application Insights should be configured first
# for instrumentation to work correctly
if os.getenv("APPLICATIONINSIGHTS_ENABLED", "false").lower() == "true":
configure_azure_monitor()
HTTPXClientInstrumentor().instrument() # httpx is used by openai

# pylint: disable=wrong-import-position
from create_app import create_app # noqa: E402

app = create_app()
Expand Down
13 changes: 9 additions & 4 deletions code/backend/Admin.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
import streamlit as st
"""
This module contains the code for the Admin app of the Chat with your data Solution Accelerator.
"""

import os
import logging
import sys
import streamlit as st
from azure.monitor.opentelemetry import configure_azure_monitor

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

logging.captureWarnings(True)
logging.basicConfig(level=os.getenv("LOGLEVEL", "INFO").upper())
# Raising the azure log level to WARN as it is too verbose - https://github.com/Azure/azure-sdk-for-python/issues/9422
# Raising the azure log level to WARN as it is too verbose
# https://github.com/Azure/azure-sdk-for-python/issues/9422
logging.getLogger("azure").setLevel(os.environ.get("LOGLEVEL_AZURE", "WARN").upper())
# We cannot use EnvHelper here as Application Insights needs to be configured first
# for instrumentation to work correctly
Expand All @@ -26,14 +31,14 @@
menu_items=None,
)

mod_page_style = """
MOD_PAGE_STYLE = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
header {visibility: hidden;}
</style>
"""
st.markdown(mod_page_style, unsafe_allow_html=True)
st.markdown(MOD_PAGE_STYLE, unsafe_allow_html=True)


col1, col2, col3 = st.columns([1, 2, 1])
Expand Down
42 changes: 29 additions & 13 deletions code/backend/batch/batch_push_results.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
import os
import logging
import json
import azure.functions as func
from urllib.parse import urlparse
import azure.functions as func

from utilities.helpers.azure_blob_storage_client import AzureBlobStorageClient
from utilities.helpers.env_helper import EnvHelper
from utilities.helpers.embedders.embedder_factory import EmbedderFactory
from utilities.search.search import Search

bp_batch_push_results = func.Blueprint()
logger = logging.getLogger(__name__)
logger.setLevel(level=os.environ.get("LOGLEVEL", "INFO").upper())


def _get_file_name_from_message(msg: func.QueueMessage) -> str:
message_body = json.loads(msg.get_body().decode("utf-8"))
def _get_file_name_from_message(message_body) -> str:
return message_body.get(
"filename",
"/".join(
Expand All @@ -27,21 +27,37 @@ def _get_file_name_from_message(msg: func.QueueMessage) -> str:
arg_name="msg", queue_name="doc-processing", connection="AzureWebJobsStorage"
)
def batch_push_results(msg: func.QueueMessage) -> None:
do_batch_push_results(msg)
message_body = json.loads(msg.get_body().decode("utf-8"))
logger.debug("Process Document Event queue function triggered: %s", message_body)

event_type = message_body.get("eventType", "")
# We handle "" in this scenario for backwards compatibility
# This function is primarily triggered by an Event Grid queue message from the blob storage
# However, it can also be triggered using a legacy schema from BatchStartProcessing
if event_type in ("", "Microsoft.Storage.BlobCreated"):
_process_document_created_event(message_body)

elif event_type == "Microsoft.Storage.BlobDeleted":
_process_document_deleted_event(message_body)

else:
raise NotImplementedError(f"Unknown event type received: {event_type}")


def do_batch_push_results(msg: func.QueueMessage) -> None:
def _process_document_created_event(message_body) -> None:
env_helper: EnvHelper = EnvHelper()
logger.info(
"Python queue trigger function processed a queue item: %s",
msg.get_body().decode("utf-8"),
)

blob_client = AzureBlobStorageClient()
# Get the file name from the message
file_name = _get_file_name_from_message(msg)
# Generate the SAS URL for the file
file_name = _get_file_name_from_message(message_body)
file_sas = blob_client.get_blob_sas(file_name)
# Process the file

embedder = EmbedderFactory.create(env_helper)
embedder.embed_file(file_sas, file_name)


def _process_document_deleted_event(message_body) -> None:
    """Remove a deleted blob's indexed documents from the search index.

    Handles Microsoft.Storage.BlobDeleted events. Indexed source URLs are
    stored with a SAS-token placeholder suffix, so the same suffix is
    appended to the deleted blob's URL before delegating the deletion to
    the search handler.
    """
    env_helper: EnvHelper = EnvHelper()
    handler = Search.get_search_handler(env_helper)

    event_data = message_body.get("data", {})
    deleted_blob_url = event_data.get("url", "")
    handler.delete_by_source(deleted_blob_url + "_SAS_TOKEN_PLACEHOLDER_")
3 changes: 2 additions & 1 deletion code/backend/batch/function_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
logging.captureWarnings(True)
# Raising the azure log level to WARN as it is too verbose - https://github.com/Azure/azure-sdk-for-python/issues/9422
logging.getLogger("azure").setLevel(os.environ.get("LOGLEVEL_AZURE", "WARN").upper())
configure_azure_monitor()
if os.getenv("APPLICATIONINSIGHTS_ENABLED", "false").lower() == "true":
configure_azure_monitor()

app = func.FunctionApp(
http_auth_level=func.AuthLevel.FUNCTION
Expand Down
53 changes: 44 additions & 9 deletions code/backend/batch/utilities/helpers/embedders/push_embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

class PushEmbedder(EmbedderBase):
def __init__(self, blob_client: AzureBlobStorageClient, env_helper: EnvHelper):
self.env_helper = env_helper
self.llm_helper = LLMHelper()
self.azure_search_helper = AzureSearchHelper()
self.azure_computer_vision_client = AzureComputerVisionClient(env_helper)
Expand Down Expand Up @@ -59,13 +60,15 @@ def __embed(
in self.config.get_advanced_image_processing_image_types()
):
logger.warning("Advanced image processing is not supported yet")
image_vectors = self.azure_computer_vision_client.vectorize_image(
source_url
)
logger.info("Image vectors: " + str(image_vectors))

caption = self.__generate_image_caption(source_url)
caption_vector = self.llm_helper.generate_embeddings(caption)

image_vector = self.azure_computer_vision_client.vectorize_image(source_url)
documents_to_upload.append(
self.__create_image_document(source_url, image_vectors)
self.__create_image_document(
source_url, image_vector, caption, caption_vector
)
)
else:
documents: List[SourceDocument] = self.document_loading.load(
Expand All @@ -85,6 +88,32 @@ def __embed(
logger.error("Failed to upload documents to search index")
raise Exception(response)

def __generate_image_caption(self, source_url):
    """Generate a rich text caption for an image using the vision model.

    Sends a two-part chat request (system instructions plus a user turn
    containing a text instruction and the image URL) to the model named by
    AZURE_OPENAI_VISION_MODEL.

    Args:
        source_url: URL of the image, passed to the model as an image_url
            content part.

    Returns:
        The caption text from the first chat-completion choice.
    """
    model = self.env_helper.AZURE_OPENAI_VISION_MODEL
    # Typos fixed ("descriptons" -> "descriptions", "sentances" -> "sentences")
    # so the model receives a clean, unambiguous prompt.
    caption_system_message = """You are an assistant that generates rich descriptions of images.
You need to be accurate in the information you extract and detailed in the descriptions you generate.
Do not abbreviate anything and do not shorten sentences. Explain the image completely.
If you are provided with an image of a flow chart, describe the flow chart in detail.
If the image is mostly text, use OCR to extract the text as it is displayed in the image."""

    messages = [
        {"role": "system", "content": caption_system_message},
        {
            "role": "user",
            "content": [
                {
                    "text": "Describe this image in detail. Limit the response to 500 words.",
                    "type": "text",
                },
                {"image_url": source_url, "type": "image_url"},
            ],
        },
    ]

    response = self.llm_helper.get_chat_completion(messages, model)
    caption = response.choices[0].message.content
    return caption

def __convert_to_search_document(self, document: SourceDocument):
embedded_content = self.llm_helper.generate_embeddings(document.content)
metadata = {
Expand All @@ -111,7 +140,13 @@ def __generate_document_id(self, source_url: str) -> str:
hash_key = hashlib.sha1(f"{source_url}_1".encode("utf-8")).hexdigest()
return f"doc_{hash_key}"

def __create_image_document(self, source_url: str, image_vectors: List[float]):
def __create_image_document(
self,
source_url: str,
image_vector: List[float],
content: str,
content_vector: List[float],
):
parsed_url = urlparse(source_url)

file_url = parsed_url.scheme + "://" + parsed_url.netloc + parsed_url.path
Expand All @@ -127,9 +162,9 @@ def __create_image_document(self, source_url: str, image_vectors: List[float]):

return {
"id": document_id,
"content": "",
"content_vector": [],
"image_vector": image_vectors,
"content": content,
"content_vector": content_vector,
"image_vector": image_vector,
"metadata": json.dumps(
{
"id": document_id,
Expand Down
1 change: 1 addition & 0 deletions code/backend/batch/utilities/helpers/env_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def __load_config(self, **kwargs) -> None:
self.AZURE_OPENAI_MODEL_NAME = os.getenv(
"AZURE_OPENAI_MODEL_NAME", "gpt-35-turbo"
)
self.AZURE_OPENAI_VISION_MODEL = os.getenv("AZURE_OPENAI_VISION_MODEL", "gpt-4")
self.AZURE_OPENAI_TEMPERATURE = os.getenv("AZURE_OPENAI_TEMPERATURE", "0")
self.AZURE_OPENAI_TOP_P = os.getenv("AZURE_OPENAI_TOP_P", "1.0")
self.AZURE_OPENAI_MAX_TOKENS = os.getenv("AZURE_OPENAI_MAX_TOKENS", "1000")
Expand Down
4 changes: 2 additions & 2 deletions code/backend/batch/utilities/helpers/llm_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,9 @@ def get_chat_completion_with_functions(
function_call=function_call,
)

def get_chat_completion(self, messages: list[dict]):
def get_chat_completion(self, messages: list[dict], model: str | None = None):
return self.openai_client.chat.completions.create(
model=self.llm_model,
model=model or self.llm_model,
messages=messages,
)

Expand Down
26 changes: 24 additions & 2 deletions code/backend/batch/utilities/search/search_handler_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,35 @@ def get_files(self):
pass

@abstractmethod
def output_results(self, results, id_field):
def output_results(self, results):
pass

@abstractmethod
def delete_files(self, files, id_field):
def delete_files(self, files):
pass

@abstractmethod
def query_search(self, question) -> list[SourceDocument]:
pass

def delete_by_source(self, source) -> None:
    """Delete every indexed document whose source matches *source*.

    No-op when *source* is None or when the index lookup returns no
    result set.
    """
    if source is None:
        return

    matching_docs = self._get_documents_by_source(source)
    if matching_docs is not None:
        self.delete_files(self.output_results(matching_docs))

def _get_documents_by_source(self, source):
if source is None:
return None

return self.search_client.search(
"*",
select="id, title",
include_total_count=True,
filter=f"source eq '{source}'",
)
Loading

0 comments on commit 6ca3f07

Please sign in to comment.