From af7dd593b135f6ed679915d2eb1d6864376d5879 Mon Sep 17 00:00:00 2001 From: Tadashi Date: Sat, 16 Nov 2024 09:40:26 +0700 Subject: [PATCH] fix: improve docling output parsing --- .../azureai_document_intelligence_loader.py | 3 + .../kotaemon/loaders/docling_loader.py | 73 +++++++++++++++---- libs/kotaemon/kotaemon/loaders/utils/adobe.py | 22 +++--- 3 files changed, 73 insertions(+), 25 deletions(-) diff --git a/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py b/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py index 1e1d9020d..0e66c849e 100644 --- a/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py +++ b/libs/kotaemon/kotaemon/loaders/azureai_document_intelligence_loader.py @@ -25,6 +25,9 @@ def crop_image(file_path: Path, bbox: list[float], page_number: int = 0) -> Imag """ left, upper, right, lower = bbox + left, right = min(left, right), max(left, right) + upper, lower = min(upper, lower), max(upper, lower) + img: Image.Image suffix = file_path.suffix.lower() if suffix == ".pdf": diff --git a/libs/kotaemon/kotaemon/loaders/docling_loader.py b/libs/kotaemon/kotaemon/loaders/docling_loader.py index dd18f76a9..2f2b43fed 100644 --- a/libs/kotaemon/kotaemon/loaders/docling_loader.py +++ b/libs/kotaemon/kotaemon/loaders/docling_loader.py @@ -1,4 +1,5 @@ import base64 +from collections import defaultdict from io import BytesIO from pathlib import Path from typing import List, Optional @@ -30,7 +31,7 @@ class DoclingReader(BaseReader): ), ) - figure_friednly_filetypes: list[str] = Param( + figure_friendly_filetypes: list[str] = Param( [".pdf", ".jpeg", ".jpg", ".png", ".bmp", ".tiff", ".heif", ".tif"], help=( "File types that we can reliably open and extract figures. " @@ -73,7 +74,7 @@ def load_data( for figure_obj in result_dict.get("pictures", []): if not self.vlm_endpoint: continue - if file_path.suffix.lower() not in self.figure_friednly_filetypes: + if file_path.suffix.lower() not in self.figure_friendly_filetypes: continue # retrieve extractive captions provided by docling @@ -81,20 +82,35 @@ def load_data( extractive_captions = [] for caption_ref in caption_refs: text_id = caption_ref.split("/")[-1] - extractive_captions.append( - result_dict["texts"][text_id]["prov"][0]["text"] - ) + try: + caption_text = result_dict["texts"][int(text_id)]["text"] + extractive_captions.append(caption_text) + except (ValueError, TypeError, IndexError) as e: + print(e) + continue # read & crop image page_number = figure_obj["prov"][0]["page_no"] - page_width = result_dict["pages"][page_number]["size"]["width"] - page_height = result_dict["pages"][page_number]["size"]["height"] - bbox_obj = figure_obj["prov"][0]["bbox"] - bbox = [bbox_obj["l"], bbox_obj["t"], bbox_obj["r"], bbox_obj["b"]] - if bbox_obj["coord_origin"] == "BOTTOMLEFT": - bbox = self._convert_bbox_bl_tl(bbox, page_width, page_height) - img = crop_image(file_path, bbox, page_number - 1) + try: + page_number_text = str(page_number) + page_width = result_dict["pages"][page_number_text]["size"]["width"] + page_height = result_dict["pages"][page_number_text]["size"]["height"] + + bbox_obj = figure_obj["prov"][0]["bbox"] + bbox: list[float] = [ + bbox_obj["l"], + bbox_obj["t"], + bbox_obj["r"], + bbox_obj["b"], + ] + if bbox_obj["coord_origin"] == "BOTTOMLEFT": + bbox = self._convert_bbox_bl_tl(bbox, page_width, page_height) + + img = crop_image(file_path, bbox, page_number - 1) + except KeyError as e: + print(e, list(result_dict["pages"].keys())) + continue # convert img to base64 img_bytes = BytesIO() @@ -136,6 +152,21 @@ def load_data( for table_obj in result_dict.get("tables", []): # convert the tables into markdown format markdown_table = self._parse_table(table_obj) + caption_refs = [caption["$ref"] for caption in table_obj["captions"]] + + extractive_captions = [] + for caption_ref in caption_refs: + text_id = caption_ref.split("/")[-1] + try: + caption_text = result_dict["texts"][int(text_id)]["text"] + extractive_captions.append(caption_text) + except (ValueError, TypeError, IndexError) as e: + print(e) + continue + # join the extractive and generative captions + caption = "\n".join(extractive_captions) + markdown_table = f"{caption}\n{markdown_table}" + page_number = table_obj["prov"][0].get("page_no", 1) table_metadata = { @@ -156,11 +187,16 @@ def load_data( # join plain text elements texts = [] + page_number_to_text = defaultdict(list) + for text_obj in result_dict["texts"]: page_number = text_obj["prov"][0].get("page_no", 1) + page_number_to_text[page_number].append(text_obj["text"]) + + for page_number, txts in page_number_to_text.items(): texts.append( Document( - text=text_obj["text"], + text="\n".join(txts), metadata={ "page_label": page_number, "file_name": file_name, @@ -173,11 +209,16 @@ def load_data( return texts + tables + figures def _convert_bbox_bl_tl( - self, bbox: list[int], page_width: int, page_height: int - ) -> list[int]: + self, bbox: list[float], page_width: int, page_height: int + ) -> list[float]: """Convert bbox from bottom-left to top-left""" x0, y0, x1, y1 = bbox - return [x0, page_height - y1, x1, page_height - y0] + return [ + x0 / page_width, + (page_height - y1) / page_height, + x1 / page_width, + (page_height - y0) / page_height, + ] def _parse_table(self, table_obj: dict) -> str: """Convert docling table object to markdown table""" diff --git a/libs/kotaemon/kotaemon/loaders/utils/adobe.py b/libs/kotaemon/kotaemon/loaders/utils/adobe.py index bd0b975a2..24b3d9d63 100644 --- a/libs/kotaemon/kotaemon/loaders/utils/adobe.py +++ b/libs/kotaemon/kotaemon/loaders/utils/adobe.py @@ -203,17 +203,21 @@ def parse_figure_paths(file_paths: List[Path]) -> Union[bytes, str]: def generate_single_figure_caption(vlm_endpoint: str, figure: str) -> str: + output = "" + """Summarize a single figure using GPT-4V""" if figure: - output = generate_gpt4v( - endpoint=vlm_endpoint, - prompt="Provide a short 2 sentence summary of this image?", - images=figure, - ) - if "sorry" in output.lower(): - output = "" - else: - output = "" + try: + output = generate_gpt4v( + endpoint=vlm_endpoint, + prompt="Provide a short 2 sentence summary of this image?", + images=figure, + ) + if "sorry" in output.lower(): + output = "" + except Exception as e: + print(f"Error generating caption: {e}") + return output