Skip to content

Commit

Permalink
fix: improve docling output parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
taprosoft committed Nov 16, 2024
1 parent 8560a69 commit af7dd59
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ def crop_image(file_path: Path, bbox: list[float], page_number: int = 0) -> Imag
"""
left, upper, right, lower = bbox

left, right = min(left, right), max(left, right)
upper, lower = min(upper, lower), max(upper, lower)

img: Image.Image
suffix = file_path.suffix.lower()
if suffix == ".pdf":
Expand Down
73 changes: 57 additions & 16 deletions libs/kotaemon/kotaemon/loaders/docling_loader.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import base64
from collections import defaultdict
from io import BytesIO
from pathlib import Path
from typing import List, Optional
Expand Down Expand Up @@ -30,7 +31,7 @@ class DoclingReader(BaseReader):
),
)

figure_friednly_filetypes: list[str] = Param(
figure_friendly_filetypes: list[str] = Param(
[".pdf", ".jpeg", ".jpg", ".png", ".bmp", ".tiff", ".heif", ".tif"],
help=(
"File types that we can reliably open and extract figures. "
Expand Down Expand Up @@ -73,28 +74,43 @@ def load_data(
for figure_obj in result_dict.get("pictures", []):
if not self.vlm_endpoint:
continue
if file_path.suffix.lower() not in self.figure_friednly_filetypes:
if file_path.suffix.lower() not in self.figure_friendly_filetypes:
continue

# retrieve extractive captions provided by docling
caption_refs = [caption["$ref"] for caption in figure_obj["captions"]]
extractive_captions = []
for caption_ref in caption_refs:
text_id = caption_ref.split("/")[-1]
extractive_captions.append(
result_dict["texts"][text_id]["prov"][0]["text"]
)
try:
caption_text = result_dict["texts"][int(text_id)]["text"]
extractive_captions.append(caption_text)
except (ValueError, TypeError, IndexError) as e:
print(e)
continue

# read & crop image
page_number = figure_obj["prov"][0]["page_no"]
page_width = result_dict["pages"][page_number]["size"]["width"]
page_height = result_dict["pages"][page_number]["size"]["height"]

bbox_obj = figure_obj["prov"][0]["bbox"]
bbox = [bbox_obj["l"], bbox_obj["t"], bbox_obj["r"], bbox_obj["b"]]
if bbox_obj["coord_origin"] == "BOTTOMLEFT":
bbox = self._convert_bbox_bl_tl(bbox, page_width, page_height)
img = crop_image(file_path, bbox, page_number - 1)
try:
page_number_text = str(page_number)
page_width = result_dict["pages"][page_number_text]["size"]["width"]
page_height = result_dict["pages"][page_number_text]["size"]["height"]

bbox_obj = figure_obj["prov"][0]["bbox"]
bbox: list[float] = [
bbox_obj["l"],
bbox_obj["t"],
bbox_obj["r"],
bbox_obj["b"],
]
if bbox_obj["coord_origin"] == "BOTTOMLEFT":
bbox = self._convert_bbox_bl_tl(bbox, page_width, page_height)

img = crop_image(file_path, bbox, page_number - 1)
except KeyError as e:
print(e, list(result_dict["pages"].keys()))
continue

# convert img to base64
img_bytes = BytesIO()
Expand Down Expand Up @@ -136,6 +152,21 @@ def load_data(
for table_obj in result_dict.get("tables", []):
# convert the tables into markdown format
markdown_table = self._parse_table(table_obj)
caption_refs = [caption["$ref"] for caption in table_obj["captions"]]

extractive_captions = []
for caption_ref in caption_refs:
text_id = caption_ref.split("/")[-1]
try:
caption_text = result_dict["texts"][int(text_id)]["text"]
extractive_captions.append(caption_text)
except (ValueError, TypeError, IndexError) as e:
print(e)
continue
# join the extractive captions and prepend them to the markdown table
caption = "\n".join(extractive_captions)
markdown_table = f"{caption}\n{markdown_table}"

page_number = table_obj["prov"][0].get("page_no", 1)

table_metadata = {
Expand All @@ -156,11 +187,16 @@ def load_data(

# join plain text elements
texts = []
page_number_to_text = defaultdict(list)

for text_obj in result_dict["texts"]:
page_number = text_obj["prov"][0].get("page_no", 1)
page_number_to_text[page_number].append(text_obj["text"])

for page_number, txts in page_number_to_text.items():
texts.append(
Document(
text=text_obj["text"],
text="\n".join(txts),
metadata={
"page_label": page_number,
"file_name": file_name,
Expand All @@ -173,11 +209,16 @@ def load_data(
return texts + tables + figures

def _convert_bbox_bl_tl(
self, bbox: list[int], page_width: int, page_height: int
) -> list[int]:
self, bbox: list[float], page_width: int, page_height: int
) -> list[float]:
"""Convert bbox from bottom-left to top-left"""
x0, y0, x1, y1 = bbox
return [x0, page_height - y1, x1, page_height - y0]
return [
x0 / page_width,
(page_height - y1) / page_height,
x1 / page_width,
(page_height - y0) / page_height,
]

def _parse_table(self, table_obj: dict) -> str:
"""Convert docling table object to markdown table"""
Expand Down
22 changes: 13 additions & 9 deletions libs/kotaemon/kotaemon/loaders/utils/adobe.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,17 +203,21 @@ def parse_figure_paths(file_paths: List[Path]) -> Union[bytes, str]:


def generate_single_figure_caption(vlm_endpoint: str, figure: str) -> str:
    """Summarize a single figure using GPT-4V.

    Args:
        vlm_endpoint: Endpoint forwarded to ``generate_gpt4v``.
        figure: Image payload forwarded as ``images``. When it is empty no
            request is made.

    Returns:
        The generated summary, or an empty string when ``figure`` is empty,
        when the request raises, or when the reply contains "sorry".
    """
    if not figure:
        return ""

    try:
        output = generate_gpt4v(
            endpoint=vlm_endpoint,
            prompt="Provide a short 2 sentence summary of this image?",
            images=figure,
        )
        # Replies containing "sorry" are discarded (presumably model
        # refusals -- confirm against generate_gpt4v's behavior).
        if "sorry" in output.lower():
            return ""
        return output
    except Exception as e:
        # Best-effort: a failed caption request should not abort the caller,
        # so report the error and fall back to an empty caption.
        print(f"Error generating caption: {e}")
        return ""


Expand Down

0 comments on commit af7dd59

Please sign in to comment.