Skip to content

Commit

Permalink
[Integrated Vectorization] Changes for Admin Configuration page (#798)
Browse files Browse the repository at this point in the history
* downloading the contents of url & uploading blob

* changes for downloading url content & uploading to blob

* Adding url as the metadata to the blob

* PR only for adding environment var

* removing changes unrelated to env variable

* Grouping integrated vec. with azure search

* updating description to match the usage

* grouping search variables

* Spike for resource creation

* Failing build due to poetry lock file

* pull resources creation

* Pull resource creation refactoring

* updating index field names

* fixing tests for when using IV

* changes for rbac IV

* role for storage & tests

* unit tests for helpers

* unit tests for helper classes

* code review changes

* tests update

* Explore Data changes

* decoupled explore & delete pages

* showing only the chunk number to match push model

* Deleting unused files

* Deleting unused files

* unused code

* unit tests for explore & delete

* Code review changes

* Chat changes for IV

* missed mapping

* Question Answer tool changes

* module load issue

* mapping chunk id as chunk number

* sync main

* test fix

* code review comments

* method name change

* Admin config changes

* Refactoring Document Processor

* unit test fix

* code review changes

* updating configHelper path

* test fix

* test fix

* unit test fix

* code review comments

* unit test fix

* config type

* loading env variables in Admin.py

* type declaration for env_helper

* moving env helper below app insights

* unused variable
  • Loading branch information
komalg1 authored May 7, 2024
1 parent c923176 commit ba0f096
Show file tree
Hide file tree
Showing 34 changed files with 453 additions and 285 deletions.
2 changes: 0 additions & 2 deletions .env.sample
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@ AZURE_SEARCH_CONVERSATIONS_LOG_INDEX=conversations-log
AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION=false
AZURE_SEARCH_INDEXER_NAME=
AZURE_SEARCH_DATASOURCE_NAME=
AZURE_SEARCH_IV_MAX_PAGE_LENGTH=2000
AZURE_SEARCH_IV_PAGE_OVERLAP_LENGTH=500
# Azure OpenAI for generating the answer and computing the embedding of the documents
AZURE_OPENAI_RESOURCE=
AZURE_OPENAI_API_KEY=
Expand Down
4 changes: 0 additions & 4 deletions code/backend/Admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,8 @@
import os
import logging
import sys
from dotenv import load_dotenv
from azure.monitor.opentelemetry import configure_azure_monitor


load_dotenv()

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

logging.captureWarnings(True)
Expand Down
15 changes: 5 additions & 10 deletions code/backend/batch/AddURLEmbeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,8 @@
import logging
import traceback
import azure.functions as func

from utilities.helpers.DocumentProcessorHelper import DocumentProcessor
from utilities.helpers.ConfigHelper import ConfigHelper

from utilities.helpers.embedders.EmbedderFactory import EmbedderFactory
from utilities.helpers.EnvHelper import EnvHelper

bp_add_url_embeddings = func.Blueprint()
logger = logging.getLogger(__name__)
Expand All @@ -14,6 +12,7 @@

@bp_add_url_embeddings.route(route="AddURLEmbeddings")
def add_url_embeddings(req: func.HttpRequest) -> func.HttpResponse:
env_helper: EnvHelper = EnvHelper()
logger.info("Python HTTP trigger function processed a request.")

# Get Url from request
Expand All @@ -28,12 +27,8 @@ def add_url_embeddings(req: func.HttpRequest) -> func.HttpResponse:
# Check if url is present, compute embeddings and add them to VectorStore
if url:
try:
config = ConfigHelper.get_active_config_or_default()
document_processor = DocumentProcessor()
processors = list(
filter(lambda x: x.document_type == "url", config.document_processors)
)
document_processor.process(source_url=url, processors=processors)
embedder = EmbedderFactory.create(env_helper)
embedder.embed_file(url, ".url")
except Exception:
return func.HttpResponse(
f"Error: {traceback.format_exc()}", status_code=500
Expand Down
21 changes: 3 additions & 18 deletions code/backend/batch/BatchPushResults.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,8 @@
from urllib.parse import urlparse

from utilities.helpers.AzureBlobStorageClient import AzureBlobStorageClient
from utilities.helpers.DocumentProcessorHelper import DocumentProcessor
from utilities.helpers.ConfigHelper import ConfigHelper
from utilities.helpers.EnvHelper import EnvHelper

from utilities.helpers.embedders.EmbedderFactory import EmbedderFactory

bp_batch_push_results = func.Blueprint()
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -39,24 +37,11 @@ def do_batch_push_results(msg: func.QueueMessage) -> None:
msg.get_body().decode("utf-8"),
)

document_processor = DocumentProcessor()
blob_client = AzureBlobStorageClient()
# Get the file name from the message
file_name = _get_file_name_from_message(msg)
# Generate the SAS URL for the file
file_sas = blob_client.get_blob_sas(file_name)
# Get file extension's processors
file_extension = file_name.split(".")[-1]

# Process the file
if env_helper.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION:
document_processor.process_using_integrated_vectorisation(source_url=file_sas)
else:
processors = list(
filter(
lambda x: x.document_type.lower() == file_extension.lower(),
ConfigHelper.get_active_config_or_default().document_processors,
)
)
document_processor.process(source_url=file_sas, processors=processors)
blob_client.upsert_blob_metadata(file_name, {"embeddings_added": "true"})
embedder = EmbedderFactory.create(env_helper)
embedder.embed_file(file_sas, file_name)
2 changes: 1 addition & 1 deletion code/backend/batch/GetConversationResponse.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from utilities.helpers.EnvHelper import EnvHelper
from utilities.helpers.OrchestratorHelper import Orchestrator
from utilities.helpers.ConfigHelper import ConfigHelper
from utilities.helpers.config.ConfigHelper import ConfigHelper


bp_get_conversation_response = func.Blueprint()
Expand Down
88 changes: 0 additions & 88 deletions code/backend/batch/utilities/helpers/DocumentProcessorHelper.py

This file was deleted.

6 changes: 0 additions & 6 deletions code/backend/batch/utilities/helpers/EnvHelper.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,6 @@ def __load_config(self, **kwargs) -> None:
self.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION = self.get_env_var_bool(
"AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION", "False"
)
self.AZURE_SEARCH_IV_MAX_PAGE_LENGTH = int(
os.getenv("AZURE_SEARCH_IV_MAX_PAGE_LENGTH", "800")
)
self.AZURE_SEARCH_IV_PAGE_OVERLAP_LENGTH = int(
os.getenv("AZURE_SEARCH_IV_PAGE_OVERLAP_LENGTH", "100")
)

self.AZURE_AUTH_TYPE = os.getenv("AZURE_AUTH_TYPE", "keys")
# Azure OpenAI
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@
import json
import logging
from string import Template
from .AzureBlobStorageClient import AzureBlobStorageClient
from ..document_chunking.Strategies import ChunkingSettings, ChunkingStrategy
from ..document_loading import LoadingSettings, LoadingStrategy
from .DocumentProcessorHelper import Processor
from .OrchestratorHelper import (
from ..AzureBlobStorageClient import AzureBlobStorageClient
from ...document_chunking.Strategies import ChunkingSettings, ChunkingStrategy
from ...document_loading import LoadingSettings, LoadingStrategy
from .EmbeddingConfig import EmbeddingConfig
from ..OrchestratorHelper import (
OrchestrationSettings,
OrchestrationStrategy,
)
from .EnvHelper import EnvHelper
from ..EnvHelper import EnvHelper

CONFIG_CONTAINER_NAME = "config"
CONFIG_FILE_NAME = "active.json"
Expand All @@ -24,7 +24,7 @@ def __init__(self, config: dict):
self.example = Example(config["example"])
self.logging = Logging(config["logging"])
self.document_processors = [
Processor(
EmbeddingConfig(
document_type=c["document_type"],
chunking=(
ChunkingSettings(c["chunking"])
Expand All @@ -49,6 +49,11 @@ def __init__(self, config: dict):
self.orchestrator = OrchestrationSettings(
config.get("orchestrator", self.default_orchestration_settings)
)
self.integrated_vectorization_config = (
IntegratedVectorizationConfig(config["integrated_vectorization_config"])
if self.env_helper.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION
else None
)

def get_available_document_types(self):
document_types = [
Expand Down Expand Up @@ -107,6 +112,14 @@ def __init__(self, logging: dict):
self.log_tokens = logging["log_tokens"]


class IntegratedVectorizationConfig:
    """Chunking settings for Azure AI Search integrated vectorization.

    Wraps the ``integrated_vectorization_config`` mapping from the active
    config (see default.json): page length and the overlap between pages.
    """

    def __init__(self, integrated_vectorization_config: dict):
        cfg = integrated_vectorization_config
        # Maximum characters per chunk produced by the split skill.
        self.max_page_length = cfg["max_page_length"]
        # Characters shared between consecutive chunks.
        self.page_overlap_length = cfg["page_overlap_length"]


class ConfigHelper:
_default_config = None

Expand Down Expand Up @@ -142,6 +155,11 @@ def _set_new_config_properties(config: dict, default_config: dict):
if config.get("example") is None:
config["example"] = default_config["example"]

if config.get("integrated_vectorization_config") is None:
config["integrated_vectorization_config"] = default_config[
"integrated_vectorization_config"
]

@staticmethod
def get_active_config_or_default():
env_helper = EnvHelper()
Expand Down Expand Up @@ -175,9 +193,7 @@ def get_default_config():
if ConfigHelper._default_config is None:
env_helper = EnvHelper()

config_file_path = os.path.join(
os.path.dirname(__file__), "config", "default.json"
)
config_file_path = os.path.join(os.path.dirname(__file__), "default.json")

with open(config_file_path) as f:
logger.info(f"Loading default config from {config_file_path}")
Expand Down
27 changes: 27 additions & 0 deletions code/backend/batch/utilities/helpers/config/EmbeddingConfig.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from ..DocumentLoadingHelper import LoadingSettings
from ..DocumentChunkingHelper import ChunkingSettings


class EmbeddingConfig(ChunkingSettings, LoadingSettings):
    """Per-document-type embedding configuration.

    Bundles how a given document type (e.g. "pdf", "url") is chunked and
    loaded, and whether advanced image processing is enabled for it.
    """

    def __init__(
        self,
        document_type: str,
        chunking: ChunkingSettings | None,
        loading: LoadingSettings | None,
        use_advanced_image_processing: bool,
    ):
        self.document_type = document_type
        self.chunking = chunking
        self.loading = loading
        self.use_advanced_image_processing = use_advanced_image_processing

    def __repr__(self):
        return (
            f"{type(self).__name__}(document_type={self.document_type!r}, "
            f"chunking={self.chunking!r}, loading={self.loading!r}, "
            f"use_advanced_image_processing={self.use_advanced_image_processing!r})"
        )

    def __eq__(self, other):
        # Return NotImplemented (not False) for unrelated types so Python can
        # try the reflected comparison; `==` still evaluates to False if both
        # sides decline. Also fixes the reversed isinstance(self, other.__class__)
        # check from the original.
        if not isinstance(other, EmbeddingConfig):
            return NotImplemented
        return (
            self.document_type == other.document_type
            and self.chunking == other.chunking
            and self.loading == other.loading
            and self.use_advanced_image_processing
            == other.use_advanced_image_processing
        )
4 changes: 4 additions & 0 deletions code/backend/batch/utilities/helpers/config/default.json
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,10 @@
}
}
],
"integrated_vectorization_config": {
"max_page_length": "800",
"page_overlap_length": "100"
},
"logging": {
"log_user_interactions": true,
"log_tokens": true
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from abc import ABC, abstractmethod


class EmbedderBase(ABC):
    """Abstract base class defining the embedder contract."""

    @abstractmethod
    def embed_file(self, source_url: str, file_name: str):
        """Process the file at *source_url*; *file_name* names the source blob."""
15 changes: 15 additions & 0 deletions code/backend/batch/utilities/helpers/embedders/EmbedderFactory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from ..EnvHelper import EnvHelper
from ..AzureBlobStorageClient import AzureBlobStorageClient
from .PushEmbedder import PushEmbedder
from .IntegratedVectorizationEmbedder import (
IntegratedVectorizationEmbedder,
)


class EmbedderFactory:
    """Selects the embedder implementation for the configured ingestion mode."""

    @staticmethod
    def create(env_helper: EnvHelper):
        # Integrated vectorization lets Azure AI Search pull and embed the
        # documents itself; otherwise we push embeddings via blob storage.
        use_integrated = env_helper.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION
        if use_integrated:
            return IntegratedVectorizationEmbedder(env_helper)
        return PushEmbedder(AzureBlobStorageClient())
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from .EmbedderBase import EmbedderBase
from ..EnvHelper import EnvHelper
from ..LLMHelper import LLMHelper
from ...integrated_vectorization.AzureSearchIndex import AzureSearchIndex
from ...integrated_vectorization.AzureSearchIndexer import AzureSearchIndexer
from ...integrated_vectorization.AzureSearchDatasource import AzureSearchDatasource
from ...integrated_vectorization.AzureSearchSkillset import AzureSearchSkillset
from ..config.ConfigHelper import ConfigHelper
import logging

logger = logging.getLogger(__name__)


class IntegratedVectorizationEmbedder(EmbedderBase):
    """Embedder for Azure AI Search integrated (pull-based) vectorization.

    Instead of computing embeddings locally, it provisions the search-side
    resources — datasource, index, skillset, and indexer — so the search
    service ingests and vectorizes the documents itself.
    """

    def __init__(self, env_helper: EnvHelper):
        self.env_helper = env_helper
        self.llm_helper: LLMHelper = LLMHelper()

    def embed_file(self, source_url: str, file_name: str):
        """Trigger pull-based ingestion for the blob at *source_url*.

        *file_name* is intentionally unused: the indexer discovers blobs
        through the datasource, not by name (parameter kept for the
        EmbedderBase interface).
        """
        self.process_using_integrated_vectorization(source_url=source_url)

    def process_using_integrated_vectorization(self, source_url: str):
        """Create or update datasource, index, skillset and indexer.

        Returns the indexer create/update result; re-raises any failure
        after logging it.
        """
        config = ConfigHelper.get_active_config_or_default()
        try:
            search_datasource = AzureSearchDatasource(self.env_helper)
            search_datasource.create_or_update_datasource()
            search_index = AzureSearchIndex(self.env_helper, self.llm_helper)
            search_index.create_or_update_index()
            search_skillset = AzureSearchSkillset(
                self.env_helper, config.integrated_vectorization_config
            )
            search_skillset_result = search_skillset.create_skillset()
            search_indexer = AzureSearchIndexer(self.env_helper)
            indexer_result = search_indexer.create_or_update_indexer(
                self.env_helper.AZURE_SEARCH_INDEXER_NAME,
                skillset_name=search_skillset_result.name,
            )
            return indexer_result
        except Exception as e:
            # Lazy %-formatting: message only built if the record is emitted.
            logger.error("Error processing %s: %s", source_url, e)
            # Bare `raise` preserves the original traceback (unlike `raise e`).
            raise
Loading

0 comments on commit ba0f096

Please sign in to comment.