Skip to content

Commit

Permalink
Enable MHTML reader (#44)
Browse files Browse the repository at this point in the history
* Enable mhtml loader

* Use default supported file types

* Add tests and bump version
  • Loading branch information
trducng authored Apr 23, 2024
1 parent fbe983c commit 456f020
Show file tree
Hide file tree
Showing 6 changed files with 797 additions and 5 deletions.
9 changes: 9 additions & 0 deletions libs/kotaemon/kotaemon/indices/ingests/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
from kotaemon.loaders import (
AdobeReader,
DirectoryReader,
HtmlReader,
MathpixPDFReader,
MhtmlReader,
OCRReader,
PandasExcelReader,
UnstructuredReader,
Expand All @@ -20,6 +22,13 @@
".docx": UnstructuredReader,
".xls": UnstructuredReader,
".doc": UnstructuredReader,
".html": HtmlReader,
".mhtml": MhtmlReader,
".png": UnstructuredReader,
".jpeg": UnstructuredReader,
".jpg": UnstructuredReader,
".tiff": UnstructuredReader,
".tif": UnstructuredReader,
}


Expand Down
3 changes: 2 additions & 1 deletion libs/kotaemon/kotaemon/loaders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from .composite_loader import DirectoryReader
from .docx_loader import DocxReader
from .excel_loader import PandasExcelReader
from .html_loader import HtmlReader
from .html_loader import HtmlReader, MhtmlReader
from .mathpix_loader import MathpixPDFReader
from .ocr_loader import ImageReader, OCRReader
from .unstructured_loader import UnstructuredReader
Expand All @@ -19,5 +19,6 @@
"UnstructuredReader",
"DocxReader",
"HtmlReader",
"MhtmlReader",
"AdobeReader",
]
80 changes: 78 additions & 2 deletions libs/kotaemon/kotaemon/loaders/html_loader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import email
from pathlib import Path
from typing import List, Optional
from typing import Optional

from llama_index.readers.base import BaseReader

Expand Down Expand Up @@ -33,7 +34,7 @@ def __init__(self, page_break_pattern: Optional[str] = None, *args, **kwargs):

def load_data(
self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs
) -> List[Document]:
) -> list[Document]:
"""Load data using Html reader
Args:
Expand Down Expand Up @@ -70,3 +71,78 @@ def load_data(
]

return documents


class MhtmlReader(BaseReader):
    """Parse `MHTML` files with `BeautifulSoup`.

    An MHTML file is a MIME message. Every `text/html` part found in the
    message is converted to plain text, and the texts of all parts are
    concatenated into a single document.
    """

    def __init__(
        self,
        open_encoding: Optional[str] = None,
        bs_kwargs: Optional[dict] = None,
        get_text_separator: str = "",
    ) -> None:
        """Initialize the reader.

        Args:
            open_encoding: The encoding to use when opening the file.
                `None` lets `open` pick its default.
            bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
                Defaults to `{"features": "lxml"}`; note that the lxml
                parser is a separate package from beautifulsoup4.
            get_text_separator: The separator to use when getting the text
                from the soup.

        Raises:
            ImportError: If `beautifulsoup4` is not installed.
        """
        try:
            import bs4  # noqa:F401
        except ImportError as exc:
            raise ImportError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            ) from exc

        self.open_encoding = open_encoding
        if bs_kwargs is None:
            bs_kwargs = {"features": "lxml"}
        self.bs_kwargs = bs_kwargs
        self.get_text_separator = get_text_separator

    @staticmethod
    def _decode_html_part(part: "email.message.Message") -> str:
        """Return the decoded text of a single MIME part.

        The transfer encoding (base64 / quoted-printable) is undone by
        `get_payload(decode=True)`; the resulting bytes are then decoded with
        the charset declared on the part, falling back to UTF-8 when the part
        declares none or declares a codec Python does not know.
        """
        payload = part.get_payload(decode=True)
        if payload is None:
            return ""

        charset = part.get_content_charset() or "utf-8"
        try:
            return payload.decode(charset, errors="replace")
        except LookupError:
            # Unknown codec name in the Content-Type header.
            return payload.decode("utf-8", errors="replace")

    def load_data(
        self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs
    ) -> list[Document]:
        """Load MHTML document into document objects.

        Args:
            file_path: Path to the MHTML file.
            extra_info: Extra metadata merged into the document metadata.
                Its keys take precedence over the generated `source` and
                `title` keys.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            A single-element list holding one `Document` whose text is the
            text of all `text/html` parts joined by blank lines. The metadata
            always contains `source` and `title`; when several HTML parts
            exist, the title of the last one is kept.
        """
        from bs4 import BeautifulSoup

        extra_info = extra_info or {}
        # Start from a complete metadata dict so that a file without any
        # text/html part still reports where it came from.
        metadata: dict = {"source": str(file_path), "title": "", **extra_info}
        pages: list[str] = []

        with open(file_path, "r", encoding=self.open_encoding) as f:
            message = email.message_from_string(f.read())

        # walk() yields the message itself and every nested sub-part, so HTML
        # inside an inner multipart container is found as well.
        for part in message.walk():
            if part.get_content_type() != "text/html":
                continue

            soup = BeautifulSoup(self._decode_html_part(part), **self.bs_kwargs)
            text = soup.get_text(self.get_text_separator)

            # `.string` is None for an empty <title> or one with nested tags;
            # avoid turning that into the literal string "None".
            title_tag = soup.title
            if title_tag is not None and title_tag.string is not None:
                title = str(title_tag.string)
            else:
                title = ""

            metadata = {
                "source": str(file_path),
                "title": title,
                **extra_info,
            }

            # Drop blank lines and separate the remaining ones by an empty line.
            lines = [line for line in text.split("\n") if line.strip()]
            text = "\n\n".join(lines)
            if text:
                pages.append(text)

        return [Document(text="\n\n".join(pages), metadata=metadata)]
3 changes: 2 additions & 1 deletion libs/kotaemon/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ packages.find.exclude = ["tests*", "env*"]
# metadata and dependencies
[project]
name = "kotaemon"
version = "0.3.10"
version = "0.3.11"
requires-python = ">= 3.10"
description = "Kotaemon core library for AI development."
dependencies = [
Expand Down Expand Up @@ -63,6 +63,7 @@ adv = [
"llama-cpp-python",
"pdfservices-sdk @ git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements",
"fastembed",
"beautifulsoup4",
]
dev = [
"ipython",
Expand Down
Loading

0 comments on commit 456f020

Please sign in to comment.