Skip to content

Commit

Permalink
[Integrated Vectorization] Changes for Admin Configuration page (#798)
Browse files Browse the repository at this point in the history
* downloading the contents of url & uploading blob

* changes for downloading url content & uploading to blob

* Adding url as the metadata to the blob

* PR only for adding environment var

* removing changes unrelated to env variable

* Grouping integrated vec. with azure search

* updating description to match the usage

* grouping search variables

* Spike for resource creation

* Failing build due to poetry lock file

* pull resources creation

* Pull resource creation refactoring

* updating index field names

* fixing tests for when using IV

* changes for rbac IV

* role for storage & tests

* unit tests for helpers

* unit tests for helper classes

* code review changes

* tests update

* Explore Data changes

* decoupled explore & delete pages

* showing only the chunk number to match push model

* Deleting unused files

* Deleting unused files

* unused code

* unit tests for explore & delete

* Code review changes

* Chat changes for IV

* missed mapping

* Question Answer tool changes

* module load issue

* mapping chunk id as chunk number

* sync main

* test fix

* code review comments

* method name change

* Admin config changes

* Refactoring Document Processor

* unit test fix

* code review changes

* updating configHelper path

* test fix

* test fix

* unit test fix

* code review comments

* unit test fix

* config type

* loading env variables in Admin.py

* type declaration for env_helper

* moving env helper below app insights

* unused variable
  • Loading branch information
komalg1 authored May 7, 2024
1 parent c923176 commit ba0f096
Show file tree
Hide file tree
Showing 34 changed files with 453 additions and 285 deletions.
2 changes: 0 additions & 2 deletions .env.sample
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@ AZURE_SEARCH_CONVERSATIONS_LOG_INDEX=conversations-log
AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION=false
AZURE_SEARCH_INDEXER_NAME=
AZURE_SEARCH_DATASOURCE_NAME=
AZURE_SEARCH_IV_MAX_PAGE_LENGTH=2000
AZURE_SEARCH_IV_PAGE_OVERLAP_LENGTH=500
# Azure OpenAI for generating the answer and computing the embedding of the documents
AZURE_OPENAI_RESOURCE=
AZURE_OPENAI_API_KEY=
Expand Down
4 changes: 0 additions & 4 deletions code/backend/Admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,8 @@
import os
import logging
import sys
from dotenv import load_dotenv
from azure.monitor.opentelemetry import configure_azure_monitor


load_dotenv()

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

logging.captureWarnings(True)
Expand Down
15 changes: 5 additions & 10 deletions code/backend/batch/AddURLEmbeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,8 @@
import logging
import traceback
import azure.functions as func

from utilities.helpers.DocumentProcessorHelper import DocumentProcessor
from utilities.helpers.ConfigHelper import ConfigHelper

from utilities.helpers.embedders.EmbedderFactory import EmbedderFactory
from utilities.helpers.EnvHelper import EnvHelper

bp_add_url_embeddings = func.Blueprint()
logger = logging.getLogger(__name__)
Expand All @@ -14,6 +12,7 @@

@bp_add_url_embeddings.route(route="AddURLEmbeddings")
def add_url_embeddings(req: func.HttpRequest) -> func.HttpResponse:
env_helper: EnvHelper = EnvHelper()
logger.info("Python HTTP trigger function processed a request.")

# Get Url from request
Expand All @@ -28,12 +27,8 @@ def add_url_embeddings(req: func.HttpRequest) -> func.HttpResponse:
# Check if url is present, compute embeddings and add them to VectorStore
if url:
try:
config = ConfigHelper.get_active_config_or_default()
document_processor = DocumentProcessor()
processors = list(
filter(lambda x: x.document_type == "url", config.document_processors)
)
document_processor.process(source_url=url, processors=processors)
embedder = EmbedderFactory.create(env_helper)
embedder.embed_file(url, ".url")
except Exception:
return func.HttpResponse(
f"Error: {traceback.format_exc()}", status_code=500
Expand Down
21 changes: 3 additions & 18 deletions code/backend/batch/BatchPushResults.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,8 @@
from urllib.parse import urlparse

from utilities.helpers.AzureBlobStorageClient import AzureBlobStorageClient
from utilities.helpers.DocumentProcessorHelper import DocumentProcessor
from utilities.helpers.ConfigHelper import ConfigHelper
from utilities.helpers.EnvHelper import EnvHelper

from utilities.helpers.embedders.EmbedderFactory import EmbedderFactory

bp_batch_push_results = func.Blueprint()
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -39,24 +37,11 @@ def do_batch_push_results(msg: func.QueueMessage) -> None:
msg.get_body().decode("utf-8"),
)

document_processor = DocumentProcessor()
blob_client = AzureBlobStorageClient()
# Get the file name from the message
file_name = _get_file_name_from_message(msg)
# Generate the SAS URL for the file
file_sas = blob_client.get_blob_sas(file_name)
# Get file extension's processors
file_extension = file_name.split(".")[-1]

# Process the file
if env_helper.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION:
document_processor.process_using_integrated_vectorisation(source_url=file_sas)
else:
processors = list(
filter(
lambda x: x.document_type.lower() == file_extension.lower(),
ConfigHelper.get_active_config_or_default().document_processors,
)
)
document_processor.process(source_url=file_sas, processors=processors)
blob_client.upsert_blob_metadata(file_name, {"embeddings_added": "true"})
embedder = EmbedderFactory.create(env_helper)
embedder.embed_file(file_sas, file_name)
2 changes: 1 addition & 1 deletion code/backend/batch/GetConversationResponse.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from utilities.helpers.EnvHelper import EnvHelper
from utilities.helpers.OrchestratorHelper import Orchestrator
from utilities.helpers.ConfigHelper import ConfigHelper
from utilities.helpers.config.ConfigHelper import ConfigHelper


bp_get_conversation_response = func.Blueprint()
Expand Down
88 changes: 0 additions & 88 deletions code/backend/batch/utilities/helpers/DocumentProcessorHelper.py

This file was deleted.

6 changes: 0 additions & 6 deletions code/backend/batch/utilities/helpers/EnvHelper.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,6 @@ def __load_config(self, **kwargs) -> None:
self.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION = self.get_env_var_bool(
"AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION", "False"
)
self.AZURE_SEARCH_IV_MAX_PAGE_LENGTH = int(
os.getenv("AZURE_SEARCH_IV_MAX_PAGE_LENGTH", "800")
)
self.AZURE_SEARCH_IV_PAGE_OVERLAP_LENGTH = int(
os.getenv("AZURE_SEARCH_IV_PAGE_OVERLAP_LENGTH", "100")
)

self.AZURE_AUTH_TYPE = os.getenv("AZURE_AUTH_TYPE", "keys")
# Azure OpenAI
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@
import json
import logging
from string import Template
from .AzureBlobStorageClient import AzureBlobStorageClient
from ..document_chunking.Strategies import ChunkingSettings, ChunkingStrategy
from ..document_loading import LoadingSettings, LoadingStrategy
from .DocumentProcessorHelper import Processor
from .OrchestratorHelper import (
from ..AzureBlobStorageClient import AzureBlobStorageClient
from ...document_chunking.Strategies import ChunkingSettings, ChunkingStrategy
from ...document_loading import LoadingSettings, LoadingStrategy
from .EmbeddingConfig import EmbeddingConfig
from ..OrchestratorHelper import (
OrchestrationSettings,
OrchestrationStrategy,
)
from .EnvHelper import EnvHelper
from ..EnvHelper import EnvHelper

CONFIG_CONTAINER_NAME = "config"
CONFIG_FILE_NAME = "active.json"
Expand All @@ -24,7 +24,7 @@ def __init__(self, config: dict):
self.example = Example(config["example"])
self.logging = Logging(config["logging"])
self.document_processors = [
Processor(
EmbeddingConfig(
document_type=c["document_type"],
chunking=(
ChunkingSettings(c["chunking"])
Expand All @@ -49,6 +49,11 @@ def __init__(self, config: dict):
self.orchestrator = OrchestrationSettings(
config.get("orchestrator", self.default_orchestration_settings)
)
self.integrated_vectorization_config = (
IntegratedVectorizationConfig(config["integrated_vectorization_config"])
if self.env_helper.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION
else None
)

def get_available_document_types(self):
document_types = [
Expand Down Expand Up @@ -107,6 +112,14 @@ def __init__(self, logging: dict):
self.log_tokens = logging["log_tokens"]


class IntegratedVectorizationConfig:
    """Chunking settings for Azure AI Search integrated vectorization.

    Wraps the ``integrated_vectorization_config`` mapping from the active
    config (see default.json): page length and the overlap between pages.
    """

    def __init__(self, integrated_vectorization_config: dict):
        cfg = integrated_vectorization_config
        # Maximum characters per chunk produced by the split skill.
        self.max_page_length = cfg["max_page_length"]
        # Characters shared between consecutive chunks.
        self.page_overlap_length = cfg["page_overlap_length"]


class ConfigHelper:
_default_config = None

Expand Down Expand Up @@ -142,6 +155,11 @@ def _set_new_config_properties(config: dict, default_config: dict):
if config.get("example") is None:
config["example"] = default_config["example"]

if config.get("integrated_vectorization_config") is None:
config["integrated_vectorization_config"] = default_config[
"integrated_vectorization_config"
]

@staticmethod
def get_active_config_or_default():
env_helper = EnvHelper()
Expand Down Expand Up @@ -175,9 +193,7 @@ def get_default_config():
if ConfigHelper._default_config is None:
env_helper = EnvHelper()

config_file_path = os.path.join(
os.path.dirname(__file__), "config", "default.json"
)
config_file_path = os.path.join(os.path.dirname(__file__), "default.json")

with open(config_file_path) as f:
logger.info(f"Loading default config from {config_file_path}")
Expand Down
27 changes: 27 additions & 0 deletions code/backend/batch/utilities/helpers/config/EmbeddingConfig.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from ..DocumentLoadingHelper import LoadingSettings
from ..DocumentChunkingHelper import ChunkingSettings


class EmbeddingConfig(ChunkingSettings, LoadingSettings):
    """Per-document-type embedding configuration.

    Bundles how a given document type (e.g. "pdf", "url") is chunked and
    loaded, and whether advanced image processing is enabled for it.
    """

    def __init__(
        self,
        document_type: str,
        chunking: ChunkingSettings | None,
        loading: LoadingSettings | None,
        use_advanced_image_processing: bool,
    ):
        self.document_type = document_type
        self.chunking = chunking
        self.loading = loading
        self.use_advanced_image_processing = use_advanced_image_processing

    def __repr__(self):
        return (
            f"{type(self).__name__}(document_type={self.document_type!r}, "
            f"chunking={self.chunking!r}, loading={self.loading!r}, "
            f"use_advanced_image_processing={self.use_advanced_image_processing!r})"
        )

    def __eq__(self, other):
        # Return NotImplemented (not False) for unrelated types so Python can
        # try the reflected comparison; `==` still evaluates to False if both
        # sides decline. Also fixes the reversed isinstance(self, other.__class__)
        # check from the original.
        if not isinstance(other, EmbeddingConfig):
            return NotImplemented
        return (
            self.document_type == other.document_type
            and self.chunking == other.chunking
            and self.loading == other.loading
            and self.use_advanced_image_processing
            == other.use_advanced_image_processing
        )
4 changes: 4 additions & 0 deletions code/backend/batch/utilities/helpers/config/default.json
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,10 @@
}
}
],
"integrated_vectorization_config": {
"max_page_length": "800",
"page_overlap_length": "100"
},
"logging": {
"log_user_interactions": true,
"log_tokens": true
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from abc import ABC, abstractmethod


class EmbedderBase(ABC):
    """Abstract base class defining the embedder contract."""

    @abstractmethod
    def embed_file(self, source_url: str, file_name: str):
        """Process the file at *source_url*; *file_name* names the source blob."""
15 changes: 15 additions & 0 deletions code/backend/batch/utilities/helpers/embedders/EmbedderFactory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from ..EnvHelper import EnvHelper
from ..AzureBlobStorageClient import AzureBlobStorageClient
from .PushEmbedder import PushEmbedder
from .IntegratedVectorizationEmbedder import (
IntegratedVectorizationEmbedder,
)


class EmbedderFactory:
    """Selects the embedder implementation for the configured ingestion mode."""

    @staticmethod
    def create(env_helper: EnvHelper):
        # Integrated vectorization lets Azure AI Search pull and embed the
        # documents itself; otherwise we push embeddings via blob storage.
        use_integrated = env_helper.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION
        if use_integrated:
            return IntegratedVectorizationEmbedder(env_helper)
        return PushEmbedder(AzureBlobStorageClient())
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from .EmbedderBase import EmbedderBase
from ..EnvHelper import EnvHelper
from ..LLMHelper import LLMHelper
from ...integrated_vectorization.AzureSearchIndex import AzureSearchIndex
from ...integrated_vectorization.AzureSearchIndexer import AzureSearchIndexer
from ...integrated_vectorization.AzureSearchDatasource import AzureSearchDatasource
from ...integrated_vectorization.AzureSearchSkillset import AzureSearchSkillset
from ..config.ConfigHelper import ConfigHelper
import logging

logger = logging.getLogger(__name__)


class IntegratedVectorizationEmbedder(EmbedderBase):
    """Embedder for Azure AI Search integrated (pull-based) vectorization.

    Instead of computing embeddings locally, it provisions the search-side
    resources — datasource, index, skillset, and indexer — so the search
    service ingests and vectorizes the documents itself.
    """

    def __init__(self, env_helper: EnvHelper):
        self.env_helper = env_helper
        self.llm_helper: LLMHelper = LLMHelper()

    def embed_file(self, source_url: str, file_name: str):
        """Trigger pull-based ingestion for the blob at *source_url*.

        *file_name* is intentionally unused: the indexer discovers blobs
        through the datasource, not by name (parameter kept for the
        EmbedderBase interface).
        """
        self.process_using_integrated_vectorization(source_url=source_url)

    def process_using_integrated_vectorization(self, source_url: str):
        """Create or update datasource, index, skillset and indexer.

        Returns the indexer create/update result; re-raises any failure
        after logging it.
        """
        config = ConfigHelper.get_active_config_or_default()
        try:
            search_datasource = AzureSearchDatasource(self.env_helper)
            search_datasource.create_or_update_datasource()
            search_index = AzureSearchIndex(self.env_helper, self.llm_helper)
            search_index.create_or_update_index()
            search_skillset = AzureSearchSkillset(
                self.env_helper, config.integrated_vectorization_config
            )
            search_skillset_result = search_skillset.create_skillset()
            search_indexer = AzureSearchIndexer(self.env_helper)
            indexer_result = search_indexer.create_or_update_indexer(
                self.env_helper.AZURE_SEARCH_INDEXER_NAME,
                skillset_name=search_skillset_result.name,
            )
            return indexer_result
        except Exception as e:
            # Lazy %-formatting: message only built if the record is emitted.
            logger.error("Error processing %s: %s", source_url, e)
            # Bare `raise` preserves the original traceback (unlike `raise e`).
            raise
Loading

0 comments on commit ba0f096

Please sign in to comment.