-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into feat/dynamic-llm
- Loading branch information
Showing
32 changed files
with
1,721 additions
and
292 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,20 +1,23 @@ | ||
from .adobe_loader import AdobeReader | ||
from .base import AutoReader, BaseReader | ||
from .composite_loader import DirectoryReader | ||
from .docx_loader import DocxReader | ||
from .excel_loader import PandasExcelReader | ||
from .html_loader import HtmlReader | ||
from .mathpix_loader import MathpixPDFReader | ||
from .ocr_loader import OCRReader | ||
from .ocr_loader import ImageReader, OCRReader | ||
from .unstructured_loader import UnstructuredReader | ||
|
||
__all__ = [ | ||
"AutoReader", | ||
"BaseReader", | ||
"PandasExcelReader", | ||
"MathpixPDFReader", | ||
"ImageReader", | ||
"OCRReader", | ||
"DirectoryReader", | ||
"UnstructuredReader", | ||
"DocxReader", | ||
"HtmlReader", | ||
"AdobeReader", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,186 @@ | ||
import logging | ||
import os | ||
import re | ||
from collections import defaultdict | ||
from pathlib import Path | ||
from typing import Any, Dict, List, Optional | ||
|
||
from decouple import config | ||
from llama_index.readers.base import BaseReader | ||
|
||
from kotaemon.base import Document | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
DEFAULT_VLM_ENDPOINT = ( | ||
"{0}openai/deployments/{1}/chat/completions?api-version={2}".format( | ||
config("AZURE_OPENAI_ENDPOINT", default=""), | ||
"gpt-4-vision", | ||
config("OPENAI_API_VERSION", default=""), | ||
) | ||
) | ||
|
||
|
||
class AdobeReader(BaseReader): | ||
"""Read PDF using the Adobe's PDF Services. | ||
Be able to extract text, table, and figure with high accuracy | ||
Example: | ||
```python | ||
>> from kotaemon.loaders import AdobeReader | ||
>> reader = AdobeReader() | ||
>> documents = reader.load_data("path/to/pdf") | ||
``` | ||
Args: | ||
endpoint: URL to the Vision Language Model endpoint. If not provided, | ||
will use the default `kotaemon.loaders.adobe_loader.DEFAULT_VLM_ENDPOINT` | ||
max_figures_to_caption: an int decides how many figured will be captioned. | ||
The rest will be ignored (are indexed without captions). | ||
""" | ||
|
||
def __init__( | ||
self, | ||
vlm_endpoint: Optional[str] = None, | ||
max_figures_to_caption: int = 100, | ||
*args: Any, | ||
**kwargs: Any, | ||
) -> None: | ||
"""Init params""" | ||
super().__init__(*args) | ||
self.table_regex = r"/Table(\[\d+\])?$" | ||
self.figure_regex = r"/Figure(\[\d+\])?$" | ||
self.vlm_endpoint = vlm_endpoint or DEFAULT_VLM_ENDPOINT | ||
self.max_figures_to_caption = max_figures_to_caption | ||
|
||
def load_data( | ||
self, file: Path, extra_info: Optional[Dict] = None, **kwargs | ||
) -> List[Document]: | ||
"""Load data by calling to the Adobe's API | ||
Args: | ||
file (Path): Path to the PDF file | ||
Returns: | ||
List[Document]: list of documents extracted from the PDF file, | ||
includes 3 types: text, table, and image | ||
""" | ||
from .utils.adobe import ( | ||
generate_figure_captions, | ||
load_json, | ||
parse_figure_paths, | ||
parse_table_paths, | ||
request_adobe_service, | ||
) | ||
|
||
filename = file.name | ||
filepath = str(Path(file).resolve()) | ||
output_path = request_adobe_service(file_path=str(file), output_path="") | ||
results_path = os.path.join(output_path, "structuredData.json") | ||
|
||
if not os.path.exists(results_path): | ||
logger.exception("Fail to parse the document.") | ||
return [] | ||
|
||
data = load_json(results_path) | ||
|
||
texts = defaultdict(list) | ||
tables = [] | ||
figures = [] | ||
|
||
elements = data["elements"] | ||
for item_id, item in enumerate(elements): | ||
page_number = item.get("Page", -1) + 1 | ||
item_path = item["Path"] | ||
item_text = item.get("Text", "") | ||
|
||
file_paths = [ | ||
Path(output_path) / path for path in item.get("filePaths", []) | ||
] | ||
prev_item = elements[item_id - 1] | ||
title = prev_item.get("Text", "") | ||
|
||
if re.search(self.table_regex, item_path): | ||
table_content = parse_table_paths(file_paths) | ||
if not table_content: | ||
continue | ||
table_caption = ( | ||
table_content.replace("|", "").replace("---", "") | ||
+ f"\n(Table in Page {page_number}. {title})" | ||
) | ||
tables.append((page_number, table_content, table_caption)) | ||
|
||
elif re.search(self.figure_regex, item_path): | ||
figure_caption = ( | ||
item_text + f"\n(Figure in Page {page_number}. {title})" | ||
) | ||
figure_content = parse_figure_paths(file_paths) | ||
if not figure_content: | ||
continue | ||
figures.append([page_number, figure_content, figure_caption]) | ||
|
||
else: | ||
if item_text and "Table" not in item_path and "Figure" not in item_path: | ||
texts[page_number].append(item_text) | ||
|
||
# get figure caption using GPT-4V | ||
figure_captions = generate_figure_captions( | ||
self.vlm_endpoint, | ||
[item[1] for item in figures], | ||
self.max_figures_to_caption, | ||
) | ||
for item, caption in zip(figures, figure_captions): | ||
# update figure caption | ||
item[2] += " " + caption | ||
|
||
# Wrap elements with Document | ||
documents = [] | ||
|
||
# join plain text elements | ||
for page_number, txts in texts.items(): | ||
documents.append( | ||
Document( | ||
text="\n".join(txts), | ||
metadata={ | ||
"page_label": page_number, | ||
"file_name": filename, | ||
"file_path": filepath, | ||
}, | ||
) | ||
) | ||
|
||
# table elements | ||
for page_number, table_content, table_caption in tables: | ||
documents.append( | ||
Document( | ||
text=table_caption, | ||
metadata={ | ||
"table_origin": table_content, | ||
"type": "table", | ||
"page_label": page_number, | ||
"file_name": filename, | ||
"file_path": filepath, | ||
}, | ||
metadata_template="", | ||
metadata_seperator="", | ||
) | ||
) | ||
|
||
# figure elements | ||
for page_number, figure_content, figure_caption in figures: | ||
documents.append( | ||
Document( | ||
text=figure_caption, | ||
metadata={ | ||
"image_origin": figure_content, | ||
"type": "image", | ||
"page_label": page_number, | ||
"file_name": filename, | ||
"file_path": filepath, | ||
}, | ||
metadata_template="", | ||
metadata_seperator="", | ||
) | ||
) | ||
return documents |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.