feat: Set limit for advanced image processing images (#978)
cecheta authored May 28, 2024
1 parent 671da33 commit f604655
Showing 9 changed files with 117 additions and 8 deletions.
1 change: 1 addition & 0 deletions .vscode/settings.json
@@ -21,4 +21,5 @@
"python.testing.cwd": "${workspaceFolder}/code",
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"pylint.cwd": "${workspaceFolder}/code",
}
10 changes: 8 additions & 2 deletions code/backend/batch/utilities/helpers/env_helper.py
@@ -48,7 +48,7 @@ def __load_config(self, **kwargs) -> None:
"AZURE_SEARCH_INDEX_IS_PRECHUNKED", ""
)
self.AZURE_SEARCH_FILTER = os.getenv("AZURE_SEARCH_FILTER", "")
self.AZURE_SEARCH_TOP_K = int(os.getenv("AZURE_SEARCH_TOP_K", "5"))
self.AZURE_SEARCH_TOP_K = self.get_env_var_int("AZURE_SEARCH_TOP_K", 5)
self.AZURE_SEARCH_ENABLE_IN_DOMAIN = (
os.getenv("AZURE_SEARCH_ENABLE_IN_DOMAIN", "true").lower() == "true"
)
@@ -114,6 +114,9 @@ def __load_config(self, **kwargs) -> None:
self.USE_ADVANCED_IMAGE_PROCESSING = self.get_env_var_bool(
"USE_ADVANCED_IMAGE_PROCESSING", "False"
)
self.ADVANCED_IMAGE_PROCESSING_MAX_IMAGES = self.get_env_var_int(
"ADVANCED_IMAGE_PROCESSING_MAX_IMAGES", 1
)
self.AZURE_COMPUTER_VISION_ENDPOINT = os.getenv(
"AZURE_COMPUTER_VISION_ENDPOINT"
)
@@ -244,7 +247,10 @@ def get_env_var_bool(self, var_name: str, default: str = "True") -> bool:
def get_env_var_array(self, var_name: str, default: str = ""):
return os.getenv(var_name, default).split(",")

def get_env_var_float(self, var_name: str, default: int):
def get_env_var_int(self, var_name: str, default: int):
return int(os.getenv(var_name, default))

def get_env_var_float(self, var_name: str, default: float):
return float(os.getenv(var_name, default))

def is_auth_type_keys(self):
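
As an aside (not part of the commit), a minimal sketch of how the new `get_env_var_int` helper behaves with the `ADVANCED_IMAGE_PROCESSING_MAX_IMAGES` variable introduced above — the set/unset values shown are illustrative:

```python
import os


def get_env_var_int(var_name: str, default: int) -> int:
    # Same logic as the helper added in env_helper.py: read the variable and
    # coerce it to an int, falling back to the default when it is unset.
    # (A non-numeric value would raise ValueError.)
    return int(os.getenv(var_name, default))


os.environ["ADVANCED_IMAGE_PROCESSING_MAX_IMAGES"] = "3"
print(get_env_var_int("ADVANCED_IMAGE_PROCESSING_MAX_IMAGES", 1))  # -> 3

del os.environ["ADVANCED_IMAGE_PROCESSING_MAX_IMAGES"]
print(get_env_var_int("ADVANCED_IMAGE_PROCESSING_MAX_IMAGES", 1))  # -> 1 (default)
```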
2 changes: 1 addition & 1 deletion code/backend/batch/utilities/tools/question_answer_tool.py
@@ -186,7 +186,7 @@ def create_image_url_list(self, source_documents):
doc.source.replace("_SAS_TOKEN_PLACEHOLDER_", container_sas)
for doc in source_documents
if doc.title is not None and doc.title.split(".")[-1] in image_types
]
][: self.env_helper.ADVANCED_IMAGE_PROCESSING_MAX_IMAGES]

return image_urls

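Illustrative only — a simplified, standalone version of the capping behaviour added above, using plain `(title, url)` pairs instead of the repo's `SourceDocument` objects (the `IMAGE_TYPES` set below is assumed):

```python
IMAGE_TYPES = {"jpg", "jpeg", "png"}  # assumed; the real extension list lives elsewhere in the repo


def create_image_url_list(sources: list[tuple[str, str]], max_images: int) -> list[str]:
    # Collect URLs whose title has an image extension, then truncate to at
    # most `max_images`, mirroring the new slice in question_answer_tool.py.
    urls = [
        url
        for title, url in sources
        if title is not None and title.split(".")[-1] in IMAGE_TYPES
    ]
    return urls[:max_images]


sources = [
    ("notes.pdf", "https://example.com/notes.pdf"),
    ("diagram.jpg", "https://example.com/diagram.jpg"),
    ("photo.png", "https://example.com/photo.png"),
]
print(create_image_url_list(sources, max_images=1))  # -> ['https://example.com/diagram.jpg']
```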
1 change: 1 addition & 0 deletions code/tests/functional/app_config.py
@@ -74,6 +74,7 @@ class AppConfig:
"AZURE_SPEECH_RECOGNIZER_LANGUAGES": "en-US,es-ES",
"TIKTOKEN_CACHE_DIR": f"{os.path.dirname(os.path.realpath(__file__))}/resources",
"USE_ADVANCED_IMAGE_PROCESSING": "False",
"ADVANCED_IMAGE_PROCESSING_MAX_IMAGES": "1",
"USE_KEY_VAULT": "False",
# These values are set directly within EnvHelper, adding them here ensures
# that they are removed from the environment when remove_from_environment() runs
77 changes: 73 additions & 4 deletions code/tests/utilities/tools/test_question_answer_tool.py
@@ -42,6 +42,7 @@ def env_helper_mock():
env_helper.AZURE_SEARCH_USE_INTEGRATED_VECTORIZATION = False
env_helper.USE_ADVANCED_IMAGE_PROCESSING = False
env_helper.AZURE_OPENAI_VISION_MODEL = "mock vision model"
env_helper.ADVANCED_IMAGE_PROCESSING_MAX_IMAGES = 1

yield env_helper

@@ -83,7 +84,7 @@ def search_handler_mock():


@pytest.fixture(autouse=True)
def source_documents_mock():
def get_source_documents_mock():
with patch(
"backend.batch.utilities.tools.question_answer_tool.Search.get_source_documents"
) as mock:
@@ -106,11 +107,11 @@ def source_documents_mock():
),
]
mock.return_value = documents
yield documents
yield mock


def test_answer_question_returns_source_documents(
source_documents_mock: list[SourceDocument],
get_source_documents_mock: MagicMock,
):
# given
tool = QuestionAnswerTool()
@@ -121,7 +122,7 @@ def test_answer_question_returns_source_documents(
# then
assert len(answer.source_documents) == 2
assert isinstance(answer.source_documents[0], SourceDocument)
assert answer.source_documents == source_documents_mock
assert answer.source_documents == get_source_documents_mock.return_value


def test_answer_question_returns_answer():
@@ -350,3 +351,71 @@ def test_use_advanced_vision_processing(env_helper_mock, llm_helper_mock):
assert isinstance(answer, Answer)
assert answer.question == "mock question"
assert answer.answer == "mock content"


def test_limit_number_of_images_passed_to_llm(
get_source_documents_mock: MagicMock,
env_helper_mock: MagicMock,
llm_helper_mock: MagicMock,
):
# given
get_source_documents_mock.return_value = [
SourceDocument(
id="mock id",
content="mock content",
title="mock title",
source="mock source",
chunk=123,
offset=123,
page_number=123,
),
SourceDocument(
id="mock id 2",
content="mock content 2",
title="mock title 2.jpg",
source="mock source 2_SAS_TOKEN_PLACEHOLDER_",
chunk_id="mock chunk id 2",
),
SourceDocument(
id="mock id 3",
content="mock content 3",
title="mock title 3.jpg",
source="mock source 3_SAS_TOKEN_PLACEHOLDER_",
chunk_id="mock chunk id 3",
),
]
env_helper_mock.USE_ADVANCED_IMAGE_PROCESSING = True
tool = QuestionAnswerTool()

# when
tool.answer_question("mock question", [])

# then
llm_helper_mock.get_chat_completion.assert_called_once_with(
[
{"content": "mock answering system prompt", "role": "system"},
{
"content": 'Sources: {"retrieved_documents":[{"[doc1]":{"content":"mock example content"}}]}, Question: mock example user question',
"name": "example_user",
"role": "system",
},
{
"content": "mock example answer",
"name": "example_assistant",
"role": "system",
},
{"content": "mock azure openai system message", "role": "system"},
{
"content": [
{
"type": "text",
"text": 'Sources: {"retrieved_documents":[{"[doc1]":{"content":"mock content"}},{"[doc2]":{"content":"mock content 2"}},{"[doc3]":{"content":"mock content 3"}}]}, Question: mock question',
},
{"type": "image_url", "image_url": "mock source 2mock sas"},
],
"role": "user",
},
],
model="mock vision model",
temperature=0,
)
7 changes: 7 additions & 0 deletions docs/advanced_image_processing.md
@@ -38,4 +38,11 @@ Once enabled, advanced image processing will be enabled for all supported image

![image](./images/enable_advanced_image_processing.png)

The `ADVANCED_IMAGE_PROCESSING_MAX_IMAGES` environment variable can be used to control the maximum number of images passed to GPT-4 vision in a single request (default is `1`).
Increasing the number of images consumes more tokens and may result in throttled requests.

```bash
azd env set ADVANCED_IMAGE_PROCESSING_MAX_IMAGES 2
```

Advanced image processing is only used in the `custom` conversation flow and not the `byod` flow, as Azure OpenAI On Your Data only supports Ada embeddings. It is currently not possible to use advanced image processing when integrated vectorization is enabled.
7 changes: 7 additions & 0 deletions infra/main.bicep
@@ -110,6 +110,9 @@ param azureOpenAIModelCapacity int = 30
@description('Enables the use of a vision LLM and Computer Vision for embedding images')
param useAdvancedImageProcessing bool = false

@description('The maximum number of images to pass to the vision model in a single request')
param advancedImageProcessingMaxImages int = 1

@description('Azure OpenAI Vision Model Deployment Name')
param azureOpenAIVisionModel string = 'gpt-4'

@@ -554,6 +557,7 @@ module web './app/web.bicep' = if (hostingModel == 'code') {
AZURE_SPEECH_SERVICE_REGION: location
AZURE_SPEECH_RECOGNIZER_LANGUAGES: recognizedLanguages
USE_ADVANCED_IMAGE_PROCESSING: useAdvancedImageProcessing
ADVANCED_IMAGE_PROCESSING_MAX_IMAGES: advancedImageProcessingMaxImages
ORCHESTRATION_STRATEGY: orchestrationStrategy
CONVERSATION_FLOW: conversationFlow
LOGLEVEL: logLevel
@@ -627,6 +631,7 @@ module web_docker './app/web.bicep' = if (hostingModel == 'container') {
AZURE_SPEECH_SERVICE_REGION: location
AZURE_SPEECH_RECOGNIZER_LANGUAGES: recognizedLanguages
USE_ADVANCED_IMAGE_PROCESSING: useAdvancedImageProcessing
ADVANCED_IMAGE_PROCESSING_MAX_IMAGES: advancedImageProcessingMaxImages
ORCHESTRATION_STRATEGY: orchestrationStrategy
CONVERSATION_FLOW: conversationFlow
LOGLEVEL: logLevel
@@ -1097,3 +1102,5 @@ output ADMIN_WEBSITE_NAME string = hostingModel == 'code'
: adminweb_docker.outputs.WEBSITE_ADMIN_URI
output LOGLEVEL string = logLevel
output CONVERSATION_FLOW string = conversationFlow
output USE_ADVANCED_IMAGE_PROCESSING bool = useAdvancedImageProcessing
output ADVANCED_IMAGE_PROCESSING_MAX_IMAGES int = advancedImageProcessingMaxImages
1 change: 1 addition & 0 deletions infra/main.bicepparam
@@ -26,6 +26,7 @@ param azureOpenAIModelName = readEnvironmentVariable('AZURE_OPENAI_MODEL_NAME',
param azureOpenAIModelVersion = readEnvironmentVariable('AZURE_OPENAI_MODEL_VERSION', '0613')
param azureOpenAIModelCapacity = int(readEnvironmentVariable('AZURE_OPENAI_MODEL_CAPACITY', '30'))
param useAdvancedImageProcessing = bool(readEnvironmentVariable('USE_ADVANCED_IMAGE_PROCESSING', 'false'))
param advancedImageProcessingMaxImages = int(readEnvironmentVariable('ADVANCED_IMAGE_PROCESSING_MAX_IMAGES', '1'))
param azureOpenAIVisionModel = readEnvironmentVariable('AZURE_OPENAI_VISION_MODEL', 'gpt-4')
param azureOpenAIVisionModelName = readEnvironmentVariable('AZURE_OPENAI_VISION_MODEL_NAME', 'gpt-4')
param azureOpenAIVisionModelVersion = readEnvironmentVariable('AZURE_OPENAI_VISION_MODEL_VERSION', 'vision-preview')
19 changes: 18 additions & 1 deletion infra/main.json
@@ -5,7 +5,7 @@
"_generator": {
"name": "bicep",
"version": "0.27.1.19265",
"templateHash": "6027201902589320671"
"templateHash": "10484197901623589764"
}
},
"parameters": {
@@ -229,6 +229,13 @@
"description": "Enables the use of a vision LLM and Computer Vision for embedding images"
}
},
"advancedImageProcessingMaxImages": {
"type": "int",
"defaultValue": 1,
"metadata": {
"description": "The maximum number of images to pass to the vision model in a single request"
}
},
"azureOpenAIVisionModel": {
"type": "string",
"defaultValue": "gpt-4",
@@ -2031,6 +2038,7 @@
"AZURE_SPEECH_SERVICE_REGION": "[parameters('location')]",
"AZURE_SPEECH_RECOGNIZER_LANGUAGES": "[parameters('recognizedLanguages')]",
"USE_ADVANCED_IMAGE_PROCESSING": "[parameters('useAdvancedImageProcessing')]",
"ADVANCED_IMAGE_PROCESSING_MAX_IMAGES": "[parameters('advancedImageProcessingMaxImages')]",
"ORCHESTRATION_STRATEGY": "[parameters('orchestrationStrategy')]",
"CONVERSATION_FLOW": "[parameters('conversationFlow')]",
"LOGLEVEL": "[parameters('logLevel')]"
@@ -2984,6 +2992,7 @@
"AZURE_SPEECH_SERVICE_REGION": "[parameters('location')]",
"AZURE_SPEECH_RECOGNIZER_LANGUAGES": "[parameters('recognizedLanguages')]",
"USE_ADVANCED_IMAGE_PROCESSING": "[parameters('useAdvancedImageProcessing')]",
"ADVANCED_IMAGE_PROCESSING_MAX_IMAGES": "[parameters('advancedImageProcessingMaxImages')]",
"ORCHESTRATION_STRATEGY": "[parameters('orchestrationStrategy')]",
"CONVERSATION_FLOW": "[parameters('conversationFlow')]",
"LOGLEVEL": "[parameters('logLevel')]"
@@ -11102,6 +11111,14 @@
"CONVERSATION_FLOW": {
"type": "string",
"value": "[parameters('conversationFlow')]"
},
"USE_ADVANCED_IMAGE_PROCESSING": {
"type": "bool",
"value": "[parameters('useAdvancedImageProcessing')]"
},
"ADVANCED_IMAGE_PROCESSING_MAX_IMAGES": {
"type": "int",
"value": "[parameters('advancedImageProcessingMaxImages')]"
}
}
}
