From 3a14c89c054119f99675b8ff16422910238899e6 Mon Sep 17 00:00:00 2001
From: Mike Gvozdev <mv.gvozdev@gmail.com>
Date: Tue, 19 Aug 2025 14:04:59 -0400
Subject: [PATCH 1/7] update categorisation process

rebase onto main
---
 .../content-categoriser/categoriser.py        |  5 +++-
 schemas                                       |  2 +-
 utils/llm/prompts.py                          | 29 +++++++++++++++----
 3 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/preprocessors/content-categoriser/categoriser.py b/preprocessors/content-categoriser/categoriser.py
index 96defc40..c0d998a0 100644
--- a/preprocessors/content-categoriser/categoriser.py
+++ b/preprocessors/content-categoriser/categoriser.py
@@ -74,13 +74,16 @@ def categorise():
     base64_image = source.split(",")[1]
 
     graphic_category = llm_client.chat_completion(
-        prompt=CATEGORISER_PROMPT + POSSIBLE_CATEGORIES,
+        prompt=f"{CATEGORISER_PROMPT} {POSSIBLE_CATEGORIES}",
         image_base64=base64_image,
         temperature=0.0,
         json_schema=CATEGORISER_RESPONSE_SCHEMA,
         parse_json=True
     )
 
+    logging.debug(f"PROMPT: {CATEGORISER_PROMPT} {POSSIBLE_CATEGORIES}")
+    logging.debug(f"Schema: {CATEGORISER_RESPONSE_SCHEMA}")
+
     if graphic_category is None:
         logging.error("Failed to receive response from LLM.")
         return jsonify(
diff --git a/schemas b/schemas
index 768f9357..2a790c8f 160000
--- a/schemas
+++ b/schemas
@@ -1 +1 @@
-Subproject commit 768f935733686d7a56c7075cf4d412ade3fe9eeb
+Subproject commit 2a790c8fe10aa895b5f4cb13846140d80db339b2
diff --git a/utils/llm/prompts.py b/utils/llm/prompts.py
index 12b264cc..3e5fb12b 100644
--- a/utils/llm/prompts.py
+++ b/utils/llm/prompts.py
@@ -21,14 +21,33 @@
 ###
 
 # Content categoriser
+# CATEGORISER_PROMPT = """
+# Answer only in JSON with the following format:
+# '{"category": "YOUR_ANSWER"}.'
+# Which of the following categories best
+# describes this image, selecting from this enum:
+# """
+
+# POSSIBLE_CATEGORIES = "photograph, chart, text, other"
+
 CATEGORISER_PROMPT = """
-Answer only in JSON with the following format:
-'{"category": "YOUR_ANSWER"}.'
-Which of the following categories best
-describes this image, selecting from this enum:
+Your task is to categorise the content of an image.
+Answer only in JSON.
+Assign boolean values (true or false) to each of the following categories:
 """
 
-POSSIBLE_CATEGORIES = "photograph, chart, text, other"
+
+POSSIBLE_CATEGORIES = [
+        "photo",
+        "diagram",
+        "flow_diagram",
+        "contains_text",
+        "people",
+        "animals",
+        "collage",
+        "chart_or_graph",
+        "illustration"
+    ]
 ###
 
 # Followup

From b2f23b513d54f40b7e8e18a2bf983e1380747087 Mon Sep 17 00:00:00 2001
From: Mike Gvozdev <mv.gvozdev@gmail.com>
Date: Tue, 26 Aug 2025 13:51:54 -0400
Subject: [PATCH 2/7] parse categories based on updated schema

---
 preprocessors/content-categoriser/categoriser.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/preprocessors/content-categoriser/categoriser.py b/preprocessors/content-categoriser/categoriser.py
index c0d998a0..893287b3 100644
--- a/preprocessors/content-categoriser/categoriser.py
+++ b/preprocessors/content-categoriser/categoriser.py
@@ -36,6 +36,16 @@
 with open(DATA_SCHEMA, 'r') as f:
     CATEGORISER_RESPONSE_SCHEMA = json.load(f)
 
+categories_properties = (
+    CATEGORISER_RESPONSE_SCHEMA.get("properties", {})
+    .get("categories", {})
+    .get("properties", {})
+)
+POSSIBLE_CATEGORIES = list(categories_properties.keys())
+
+logging.debug(f"Data schema: {CATEGORISER_RESPONSE_SCHEMA}")
+logging.debug(f"Possible categories: {POSSIBLE_CATEGORIES}")
+
 PREPROCESSOR_NAME = "ca.mcgill.a11y.image.preprocessor.contentCategoriser"
 
 try:
@@ -73,7 +83,7 @@ def categorise():
     source = content["graphic"]
     base64_image = source.split(",")[1]
 
-    graphic_category = llm_client.chat_completion(
+    graphic_categories = llm_client.chat_completion(
         prompt=f"{CATEGORISER_PROMPT} {POSSIBLE_CATEGORIES}",
         image_base64=base64_image,
         temperature=0.0,
@@ -84,7 +94,7 @@ def categorise():
     logging.debug(f"PROMPT: {CATEGORISER_PROMPT} {POSSIBLE_CATEGORIES}")
     logging.debug(f"Schema: {CATEGORISER_RESPONSE_SCHEMA}")
 
-    if graphic_category is None:
+    if graphic_categories is None:
         logging.error("Failed to receive response from LLM.")
         return jsonify(
             {"error": "Failed to get graphic category from LLM"}

From 9c73c4c7231252585d77133a53a3dade308a2e61 Mon Sep 17 00:00:00 2001
From: Mike Gvozdev <mv.gvozdev@gmail.com>
Date: Tue, 26 Aug 2025 16:27:02 -0400
Subject: [PATCH 3/7] revert schemas

---
 schemas | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/schemas b/schemas
index 2a790c8f..768f9357 160000
--- a/schemas
+++ b/schemas
@@ -1 +1 @@
-Subproject commit 2a790c8fe10aa895b5f4cb13846140d80db339b2
+Subproject commit 768f935733686d7a56c7075cf4d412ade3fe9eeb

From 1a956d57f90eef6b418f15cac6c8eab384f5108a Mon Sep 17 00:00:00 2001
From: Mike Gvozdev <mv.gvozdev@gmail.com>
Date: Tue, 26 Aug 2025 16:37:12 -0400
Subject: [PATCH 4/7] remove unused prompts and excessive logging

---
 .../content-categoriser/categoriser.py        |  5 ----
 utils/llm/prompts.py                          | 24 -------------------
 2 files changed, 29 deletions(-)

diff --git a/preprocessors/content-categoriser/categoriser.py b/preprocessors/content-categoriser/categoriser.py
index 893287b3..4e4f49f2 100644
--- a/preprocessors/content-categoriser/categoriser.py
+++ b/preprocessors/content-categoriser/categoriser.py
@@ -42,8 +42,6 @@
     .get("properties", {})
 )
 POSSIBLE_CATEGORIES = list(categories_properties.keys())
-
-logging.debug(f"Data schema: {CATEGORISER_RESPONSE_SCHEMA}")
 logging.debug(f"Possible categories: {POSSIBLE_CATEGORIES}")
 
 PREPROCESSOR_NAME = "ca.mcgill.a11y.image.preprocessor.contentCategoriser"
@@ -91,9 +89,6 @@ def categorise():
         parse_json=True
     )
 
-    logging.debug(f"PROMPT: {CATEGORISER_PROMPT} {POSSIBLE_CATEGORIES}")
-    logging.debug(f"Schema: {CATEGORISER_RESPONSE_SCHEMA}")
-
     if graphic_categories is None:
         logging.error("Failed to receive response from LLM.")
         return jsonify(
diff --git a/utils/llm/prompts.py b/utils/llm/prompts.py
index 3e5fb12b..f9ecdbf0 100644
--- a/utils/llm/prompts.py
+++ b/utils/llm/prompts.py
@@ -18,36 +18,12 @@
 the graphic type is significant (like oil painting or aerial photo).
 Instead, start describing the graphic right away.
 """
-###
-
 # Content categoriser
-# CATEGORISER_PROMPT = """
-# Answer only in JSON with the following format:
-# '{"category": "YOUR_ANSWER"}.'
-# Which of the following categories best
-# describes this image, selecting from this enum:
-# """
-
-# POSSIBLE_CATEGORIES = "photograph, chart, text, other"
-
 CATEGORISER_PROMPT = """
 Your task is to categorise the content of an image.
 Answer only in JSON.
 Assign boolean values (true or false) to each of the following categories:
 """
-
-
-POSSIBLE_CATEGORIES = [
-        "photo",
-        "diagram",
-        "flow_diagram",
-        "contains_text",
-        "people",
-        "animals",
-        "collage",
-        "chart_or_graph",
-        "illustration"
-    ]
 ###
 
 # Followup

From 1643f7dc3e8d68dcb44976d884e5afc4c78f064d Mon Sep 17 00:00:00 2001
From: Mike Gvozdev <mv.gvozdev@gmail.com>
Date: Thu, 11 Sep 2025 13:05:49 -0400
Subject: [PATCH 5/7] cleanup after rebase

---
 preprocessors/content-categoriser/categoriser.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/preprocessors/content-categoriser/categoriser.py b/preprocessors/content-categoriser/categoriser.py
index 4e4f49f2..8c59752f 100644
--- a/preprocessors/content-categoriser/categoriser.py
+++ b/preprocessors/content-categoriser/categoriser.py
@@ -20,11 +20,8 @@
 import sys
 from datetime import datetime
 from config.logging_utils import configure_logging
-from utils.llm import (
-    LLMClient,
-    CATEGORISER_PROMPT,
-    POSSIBLE_CATEGORIES
-)
+from utils.llm import LLMClient, CATEGORISER_PROMPT
+
 from utils.validation import Validator
 import json
 
@@ -81,7 +78,7 @@ def categorise():
     source = content["graphic"]
     base64_image = source.split(",")[1]
 
-    graphic_categories = llm_client.chat_completion(
+    graphic_category = llm_client.chat_completion(
         prompt=f"{CATEGORISER_PROMPT} {POSSIBLE_CATEGORIES}",
         image_base64=base64_image,
         temperature=0.0,
@@ -89,7 +86,7 @@ def categorise():
         parse_json=True
     )
 
-    if graphic_categories is None:
+    if graphic_category is None:
         logging.error("Failed to receive response from LLM.")
         return jsonify(
             {"error": "Failed to get graphic category from LLM"}

From f1c697e6f253db40ac7c52e920a3868be6be82ee Mon Sep 17 00:00:00 2001
From: Mike Gvozdev <mv.gvozdev@gmail.com>
Date: Thu, 11 Sep 2025 13:09:40 -0400
Subject: [PATCH 6/7] remove fixed categories from the utility module

---
 utils/llm/__init__.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/utils/llm/__init__.py b/utils/llm/__init__.py
index c3173a37..4588284d 100644
--- a/utils/llm/__init__.py
+++ b/utils/llm/__init__.py
@@ -9,7 +9,6 @@
     BOUNDING_BOX_PROMPT_EXAMPLE,
     GRAPHIC_CAPTION_PROMPT,
     CATEGORISER_PROMPT,
-    POSSIBLE_CATEGORIES,
     FOLLOWUP_PROMPT
 )
 
@@ -20,6 +19,5 @@
     'BOUNDING_BOX_PROMPT_EXAMPLE',
     'GRAPHIC_CAPTION_PROMPT',
     'CATEGORISER_PROMPT',
-    'POSSIBLE_CATEGORIES',
     'FOLLOWUP_PROMPT'
 ]

From 28be6aed67e602570b825fe6625d3d1e827dca02 Mon Sep 17 00:00:00 2001
From: Mike Gvozdev <mv.gvozdev@gmail.com>
Date: Thu, 11 Sep 2025 14:34:42 -0400
Subject: [PATCH 7/7] add category check

---
 docker-compose.yml                                    |  2 +-
 preprocessors/content-categoriser/categoriser.py      |  2 --
 .../multistage-diagram-segmentation.py                | 11 +++++++++++
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index ca9cf249..2f0d5fbf 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -198,7 +198,7 @@ services:
       ca.mcgill.a11y.image.preprocessor: 1
       ca.mcgill.a11y.image.port: 5000
       ca.mcgill.a11y.image.cacheTimeout: 3600
-      ca.mcgill.a11y.image.required_dependencies: ""
+      ca.mcgill.a11y.image.required_dependencies: "content-categoriser"
       ca.mcgill.a11y.image.optional_dependencies: ""
     deploy:
       resources:
diff --git a/preprocessors/content-categoriser/categoriser.py b/preprocessors/content-categoriser/categoriser.py
index 8c59752f..92cfb8bc 100644
--- a/preprocessors/content-categoriser/categoriser.py
+++ b/preprocessors/content-categoriser/categoriser.py
@@ -92,8 +92,6 @@ def categorise():
             {"error": "Failed to get graphic category from LLM"}
         ), 500
 
-    logging.pii(f"Graphic category JSON: {graphic_category}")
-
     # data schema validation
     ok, _ = validator.check_data(graphic_category)
     if not ok:
diff --git a/preprocessors/multistage-diagram-segmentation/multistage-diagram-segmentation.py b/preprocessors/multistage-diagram-segmentation/multistage-diagram-segmentation.py
index a7a78e5c..2d9fc2cb 100644
--- a/preprocessors/multistage-diagram-segmentation/multistage-diagram-segmentation.py
+++ b/preprocessors/multistage-diagram-segmentation/multistage-diagram-segmentation.py
@@ -97,6 +97,17 @@ def process_diagram():
     if not ok:
         return jsonify({"error": "Invalid Preprocessor JSON format"}), 400
 
+    # If the content categoriser ran, skip (204) graphics it did not tag
+    # as a multistage diagram; without its output, processing continues.
+    preprocess_output = content["preprocessors"]
+    categoriser = "ca.mcgill.a11y.image.preprocessor.contentCategoriser"
+    if categoriser in preprocess_output:
+        categoriser_output = preprocess_output[categoriser]
+        categoriser_tags = categoriser_output["categories"]
+        if not categoriser_tags["multistage_diagram"]:
+            logging.info("Not a multistage diagram. Skipping...")
+            return "", 204
+
     request_uuid = content["request_uuid"]
     timestamp = time.time()