From 456f020caf5f51126035d3414a2881b0c0d5e54f Mon Sep 17 00:00:00 2001 From: "Duc Nguyen (john)" Date: Tue, 23 Apr 2024 14:16:24 +0700 Subject: [PATCH] Enable MHTML reader (#44) * Enable mhtml loader * Use default supported file types * Add tests and bump version --- .../kotaemon/indices/ingests/files.py | 9 + libs/kotaemon/kotaemon/loaders/__init__.py | 3 +- libs/kotaemon/kotaemon/loaders/html_loader.py | 80 +- libs/kotaemon/pyproject.toml | 3 +- libs/kotaemon/tests/resources/dummy.mhtml | 690 ++++++++++++++++++ libs/kotaemon/tests/test_reader.py | 17 +- 6 files changed, 797 insertions(+), 5 deletions(-) create mode 100644 libs/kotaemon/tests/resources/dummy.mhtml diff --git a/libs/kotaemon/kotaemon/indices/ingests/files.py b/libs/kotaemon/kotaemon/indices/ingests/files.py index 0f80cb272..e5e325e02 100644 --- a/libs/kotaemon/kotaemon/indices/ingests/files.py +++ b/libs/kotaemon/kotaemon/indices/ingests/files.py @@ -9,7 +9,9 @@ from kotaemon.loaders import ( AdobeReader, DirectoryReader, + HtmlReader, MathpixPDFReader, + MhtmlReader, OCRReader, PandasExcelReader, UnstructuredReader, @@ -20,6 +22,13 @@ ".docx": UnstructuredReader, ".xls": UnstructuredReader, ".doc": UnstructuredReader, + ".html": HtmlReader, + ".mhtml": MhtmlReader, + ".png": UnstructuredReader, + ".jpeg": UnstructuredReader, + ".jpg": UnstructuredReader, + ".tiff": UnstructuredReader, + ".tif": UnstructuredReader, } diff --git a/libs/kotaemon/kotaemon/loaders/__init__.py b/libs/kotaemon/kotaemon/loaders/__init__.py index a59d71315..f0e7d0f7f 100644 --- a/libs/kotaemon/kotaemon/loaders/__init__.py +++ b/libs/kotaemon/kotaemon/loaders/__init__.py @@ -3,7 +3,7 @@ from .composite_loader import DirectoryReader from .docx_loader import DocxReader from .excel_loader import PandasExcelReader -from .html_loader import HtmlReader +from .html_loader import HtmlReader, MhtmlReader from .mathpix_loader import MathpixPDFReader from .ocr_loader import ImageReader, OCRReader from .unstructured_loader import UnstructuredReader @@ -19,5 +19,6 @@ "UnstructuredReader", "DocxReader", "HtmlReader", + "MhtmlReader", "AdobeReader", ] diff --git a/libs/kotaemon/kotaemon/loaders/html_loader.py b/libs/kotaemon/kotaemon/loaders/html_loader.py index 1295cfca3..c939c8a60 100644 --- a/libs/kotaemon/kotaemon/loaders/html_loader.py +++ b/libs/kotaemon/kotaemon/loaders/html_loader.py @@ -1,5 +1,6 @@ +import email from pathlib import Path -from typing import List, Optional +from typing import Optional from llama_index.readers.base import BaseReader @@ -33,7 +34,7 @@ def __init__(self, page_break_pattern: Optional[str] = None, *args, **kwargs): def load_data( self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs - ) -> List[Document]: + ) -> list[Document]: """Load data using Html reader Args: @@ -70,3 +71,78 @@ def load_data( ] return documents + + +class MhtmlReader(BaseReader): + """Parse `MHTML` files with `BeautifulSoup`.""" + + def __init__( + self, + open_encoding: Optional[str] = None, + bs_kwargs: Optional[dict] = None, + get_text_separator: str = "", + ) -> None: + """initialize with path, and optionally, file encoding to use, and any kwargs + to pass to the BeautifulSoup object. + + Args: + file_path: Path to file to load. + open_encoding: The encoding to use when opening the file. + bs_kwargs: Any kwargs to pass to the BeautifulSoup object. + get_text_separator: The separator to use when getting the text + from the soup. + """ + try: + import bs4 # noqa:F401 + except ImportError: + raise ImportError( + "beautifulsoup4 package not found, please install it with " + "`pip install beautifulsoup4`" + ) + + self.open_encoding = open_encoding + if bs_kwargs is None: + bs_kwargs = {"features": "lxml"} + self.bs_kwargs = bs_kwargs + self.get_text_separator = get_text_separator + + def load_data( + self, file_path: Path | str, extra_info: Optional[dict] = None, **kwargs + ) -> list[Document]: + """Load MHTML document into document objects.""" + + from bs4 import BeautifulSoup + + extra_info = extra_info or {} + metadata: dict = extra_info + page = [] + with open(file_path, "r", encoding=self.open_encoding) as f: + message = email.message_from_string(f.read()) + parts = message.get_payload() + + if not isinstance(parts, list): + parts = [message] + + for part in parts: + if part.get_content_type() == "text/html": + html = part.get_payload(decode=True).decode() + + soup = BeautifulSoup(html, **self.bs_kwargs) + text = soup.get_text(self.get_text_separator) + + if soup.title: + title = str(soup.title.string) + else: + title = "" + + metadata = { + "source": str(file_path), + "title": title, + **extra_info, + } + lines = [line for line in text.split("\n") if line.strip()] + text = "\n\n".join(lines) + if text: + page.append(text) + + return [Document(text="\n\n".join(page), metadata=metadata)] diff --git a/libs/kotaemon/pyproject.toml b/libs/kotaemon/pyproject.toml index 2fd5ad75f..43678b4f8 100644 --- a/libs/kotaemon/pyproject.toml +++ b/libs/kotaemon/pyproject.toml @@ -11,7 +11,7 @@ packages.find.exclude = ["tests*", "env*"] # metadata and dependencies [project] name = "kotaemon" -version = "0.3.10" +version = "0.3.11" requires-python = ">= 3.10" description = "Kotaemon core library for AI development." dependencies = [ @@ -63,6 +63,7 @@ adv = [ "llama-cpp-python", "pdfservices-sdk @ git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements", "fastembed", + "beautifulsoup4", ] dev = [ "ipython", diff --git a/libs/kotaemon/tests/resources/dummy.mhtml b/libs/kotaemon/tests/resources/dummy.mhtml new file mode 100644 index 000000000..86c1be6b8 --- /dev/null +++ b/libs/kotaemon/tests/resources/dummy.mhtml @@ -0,0 +1,690 @@ +MIME-Version: 1.0 +Content-Type: multipart/related; boundary="----=_NextPart_01CF5AE5.5C24CD00" + +This document is a Single File Web Page, also known as a Web Archive file. If you are seeing this message, your browser or editor doesn't support Web Archive files. Please download a browser that supports Web Archive, such as Windows® Internet Explorer®. + +------=_NextPart_01CF5AE5.5C24CD00 +Content-Location: file:///C:/D16BB227/testing.htm +Content-Transfer-Encoding: quoted-printable +Content-Type: text/html; charset="us-ascii" + + + + + + + + + + + + + + + + + + + +
+ +
+ +

This is a test.

+ +
+ +

This is bold,= + italic, and underlined.= +

+ +

asdakl fskljf +sklf jkslaf; djks dlkfa sk +sdjkl ksjkl jsjk skdjjks i +w ie sjkfksd fjisdf jks fjs +kdj fsk dfjskd +fjskd fjsd kfjsk f jskdf jskd +fjsk dfjskdf jsifj sifj sk +fjks fjksd fjskdf kjs jdfksk +fdjs fksj fks dfjs dfks +fdjsk fjskdfjskdf <= +span +class=3DSpellE>sjkf skjf sjkdf +skfjsfjk s

+ +

The end.

+ +
+ + + + + +------=_NextPart_01CF5AE5.5C24CD00 +Content-Location: file:///C:/D16BB227/testing_files/themedata.thmx +Content-Transfer-Encoding: base64 +Content-Type: application/vnd.ms-officetheme + +UEsDBBQABgAIAAAAIQCCirwT+gAAABwCAAATAAAAW0NvbnRlbnRfVHlwZXNdLnhtbKyRy2rDMBBF +94X+g9C22HK6KKXYzqJJd30s0g8Y5LEtao+ENAnJ33fsuFC6CC10IxBizpl7Va6P46AOGJPzVOlV +XmiFZH3jqKv0++4pu9cqMVADgyes9AmTXtfXV+XuFDApmaZU6Z45PBiTbI8jpNwHJHlpfRyB5Ro7 +E8B+QIfmtijujPXESJzxxNB1+SoLRNegeoPILzCKx7Cg8Pv5DCSAmAtYq8czYVqi0hDC4CywRDAH +an7oM9+2zmLj7X4UaT6DF9jNBDO/XGD1P+ov5wZb2A+stkfp4lx/xCH9LdtSay6Tc/7Uu5AuGC6X +t7Rh5r+tPwEAAP//AwBQSwMEFAAGAAgAAAAhAKXWp+fAAAAANgEAAAsAAABfcmVscy8ucmVsc4SP +z2rDMAyH74W9g9F9UdLDGCV2L6WQQy+jfQDhKH9oIhvbG+vbT8cGCrsIhKTv96k9/q6L+eGU5yAW +mqoGw+JDP8to4XY9v3+CyYWkpyUIW3hwhqN727VfvFDRozzNMRulSLYwlRIPiNlPvFKuQmTRyRDS +SkXbNGIkf6eRcV/XH5ieGeA2TNP1FlLXN2Cuj6jJ/7PDMMyeT8F/ryzlRQRuN5RMaeRioagv41O9 +kKhlqtQe0LW4+db9AQAA//8DAFBLAwQUAAYACAAAACEAa3mWFoMAAACKAAAAHAAAAHRoZW1lL3Ro +ZW1lL3RoZW1lTWFuYWdlci54bWwMzE0KwyAQQOF9oXeQ2TdjuyhFYrLLrrv2AEOcGkHHoNKf29fl +44M3zt8U1ZtLDVksnAcNimXNLoi38Hwspxuo2kgcxSxs4ccV5ul4GMm0jRPfSchzUX0j1ZCFrbXd +INa1K9Uh7yzdXrkkaj2LR1fo0/cp4kXrKyYKAjj9AQAA//8DAFBLAwQUAAYACAAAACEAlrWt4pYG +AABQGwAAFgAAAHRoZW1lL3RoZW1lL3RoZW1lMS54bWzsWU9v2zYUvw/YdyB0b2MndhoHdYrYsZst +TRvEboceaYmW2FCiQNJJfRva44ABw7phhxXYbYdhW4EW2KX7NNk6bB3Qr7BHUpLFWF6SNtiKrT4k +Evnj+/8eH6mr1+7HDB0SISlP2l79cs1DJPF5QJOw7d0e9i+teUgqnASY8YS0vSmR3rWN99+7itdV +RGKCYH0i13Hbi5RK15eWpA/DWF7mKUlgbsxFjBW8inApEPgI6MZsablWW12KMU08lOAYyN4aj6lP +0FCT9DZy4j0Gr4mSesBnYqBJE2eFwQYHdY2QU9llAh1i1vaAT8CPhuS+8hDDUsFE26uZn7e0cXUJ +r2eLmFqwtrSub37ZumxBcLBseIpwVDCt9xutK1sFfQNgah7X6/W6vXpBzwCw74OmVpYyzUZ/rd7J +aZZA9nGedrfWrDVcfIn+ypzMrU6n02xlsliiBmQfG3P4tdpqY3PZwRuQxTfn8I3OZre76uANyOJX +5/D9K63Vhos3oIjR5GAOrR3a72fUC8iYs+1K+BrA12oZfIaCaCiiS7MY80QtirUY3+OiDwANZFjR +BKlpSsbYhyju4ngkKNYM8DrBpRk75Mu5Ic0LSV/QVLW9D1MMGTGj9+r596+eP0XHD54dP/jp+OHD +4wc/WkLOqm2chOVVL7/97M/HH6M/nn7z8tEX1XhZxv/6wye//Px5NRDSZybOiy+f/PbsyYuvPv39 +u0cV8E2BR2X4kMZEopvkCO3zGBQzVnElJyNxvhXDCNPyis0klDjBmksF/Z6KHPTNKWaZdxw5OsS1 +4B0B5aMKeH1yzxF4EImJohWcd6LYAe5yzjpcVFphR/MqmXk4ScJq5mJSxu1jfFjFu4sTx7+9SQp1 +Mw9LR/FuRBwx9xhOFA5JQhTSc/yAkArt7lLq2HWX+oJLPlboLkUdTCtNMqQjJ5pmi7ZpDH6ZVukM +/nZss3sHdTir0nqLHLpIyArMKoQfEuaY8TqeKBxXkRzimJUNfgOrqErIwVT4ZVxPKvB0SBhHvYBI +WbXmlgB9S07fwVCxKt2+y6axixSKHlTRvIE5LyO3+EE3wnFahR3QJCpjP5AHEKIY7XFVBd/lbobo +d/ADTha6+w4ljrtPrwa3aeiINAsQPTMR2pdQqp0KHNPk78oxo1CPbQxcXDmGAvji68cVkfW2FuJN +2JOqMmH7RPldhDtZdLtcBPTtr7lbeJLsEQjz+Y3nXcl9V3K9/3zJXZTPZy20s9oKZVf3DbYpNi1y +vLBDHlPGBmrKyA1pmmQJ+0TQh0G9zpwOSXFiSiN4zOq6gwsFNmuQ4OojqqJBhFNosOueJhLKjHQo +UcolHOzMcCVtjYcmXdljYVMfGGw9kFjt8sAOr+jh/FxQkDG7TWgOnzmjFU3grMxWrmREQe3XYVbX +Qp2ZW92IZkqdw61QGXw4rxoMFtaEBgRB2wJWXoXzuWYNBxPMSKDtbvfe3C3GCxfpIhnhgGQ+0nrP ++6hunJTHirkJgNip8JE+5J1itRK3lib7BtzO4qQyu8YCdrn33sRLeQTPvKTz9kQ6sqScnCxBR22v +1VxuesjHadsbw5kWHuMUvC51z4dZCBdDvhI27E9NZpPlM2+2csXcJKjDNYW1+5zCTh1IhVRbWEY2 +NMxUFgIs0Zys/MtNMOtFKWAj/TWkWFmDYPjXpAA7uq4l4zHxVdnZpRFtO/ualVI+UUQMouAIjdhE +7GNwvw5V0CegEq4mTEXQL3CPpq1tptzinCVd+fbK4Ow4ZmmEs3KrUzTPZAs3eVzIYN5K4oFulbIb +5c6vikn5C1KlHMb/M1X0fgI3BSuB9oAP17gCI52vbY8LFXGoQmlE/b6AxsHUDogWuIuFaQgquEw2 +/wU51P9tzlkaJq3hwKf2aYgEhf1IRYKQPShLJvpOIVbP9i5LkmWETESVxJWpFXtEDgkb6hq4qvd2 +D0UQ6qaaZGXA4E7Gn/ueZdAo1E1OOd+cGlLsvTYH/unOxyYzKOXWYdPQ5PYvRKzYVe16szzfe8uK +6IlZm9XIswKYlbaCVpb2rynCObdaW7HmNF5u5sKBF+c1hsGiIUrhvgfpP7D/UeEz+2VCb6hDvg+1 +FcGHBk0Mwgai+pJtPJAukHZwBI2THbTBpElZ02atk7ZavllfcKdb8D1hbC3ZWfx9TmMXzZnLzsnF +izR2ZmHH1nZsoanBsydTFIbG+UHGOMZ80ip/deKje+DoLbjfnzAlTTDBNyWBofUcmDyA5LcczdKN +vwAAAP//AwBQSwMEFAAGAAgAAAAhAA3RkJ+2AAAAGwEAACcAAAB0aGVtZS90aGVtZS9fcmVscy90 +aGVtZU1hbmFnZXIueG1sLnJlbHOEj00KwjAUhPeCdwhvb9O6EJEm3YjQrdQDhOQ1DTY/JFHs7Q2u +LAguh2G+mWm7l53JE2My3jFoqhoIOumVcZrBbbjsjkBSFk6J2TtksGCCjm837RVnkUsoTSYkUigu +MZhyDidKk5zQilT5gK44o49W5CKjpkHIu9BI93V9oPGbAXzFJL1iEHvVABmWUJr/s/04GolnLx8W +Xf5RQXPZhQUoosbM4CObqkwEylu6usTfAAAA//8DAFBLAQItABQABgAIAAAAIQCCirwT+gAAABwC +AAATAAAAAAAAAAAAAAAAAAAAAABbQ29udGVudF9UeXBlc10ueG1sUEsBAi0AFAAGAAgAAAAhAKXW +p+fAAAAANgEAAAsAAAAAAAAAAAAAAAAAKwEAAF9yZWxzLy5yZWxzUEsBAi0AFAAGAAgAAAAhAGt5 +lhaDAAAAigAAABwAAAAAAAAAAAAAAAAAFAIAAHRoZW1lL3RoZW1lL3RoZW1lTWFuYWdlci54bWxQ +SwECLQAUAAYACAAAACEAlrWt4pYGAABQGwAAFgAAAAAAAAAAAAAAAADRAgAAdGhlbWUvdGhlbWUv +dGhlbWUxLnhtbFBLAQItABQABgAIAAAAIQAN0ZCftgAAABsBAAAnAAAAAAAAAAAAAAAAAJsJAAB0 +aGVtZS90aGVtZS9fcmVscy90aGVtZU1hbmFnZXIueG1sLnJlbHNQSwUGAAAAAAUABQBdAQAAlgoA +AAAA + +------=_NextPart_01CF5AE5.5C24CD00 +Content-Location: file:///C:/D16BB227/testing_files/colorschememapping.xml +Content-Transfer-Encoding: quoted-printable +Content-Type: text/xml + + + +------=_NextPart_01CF5AE5.5C24CD00 +Content-Location: file:///C:/D16BB227/testing_files/filelist.xml +Content-Transfer-Encoding: quoted-printable +Content-Type: text/xml; charset="utf-8" + + + + + + + +------=_NextPart_01CF5AE5.5C24CD00-- diff --git a/libs/kotaemon/tests/test_reader.py b/libs/kotaemon/tests/test_reader.py index 1ead0d8d3..0aa2f2b48 100644 --- a/libs/kotaemon/tests/test_reader.py +++ b/libs/kotaemon/tests/test_reader.py @@ -4,7 +4,13 @@ from llama_index.node_parser import SimpleNodeParser from kotaemon.base import Document -from kotaemon.loaders import AutoReader, DocxReader, HtmlReader, UnstructuredReader +from kotaemon.loaders import ( + AutoReader, + DocxReader, + HtmlReader, + MhtmlReader, + UnstructuredReader, +) def test_docx_reader(): @@ -61,3 +67,12 @@ def test_unstructured_pdf_reader(): documents = reader.load_data(input_path, split_documents=True) # check document reader output assert len(documents) == 1 + + +def test_mhtml_reader(): + reader = MhtmlReader() + input_path = Path(__file__).parent / "resources" / "dummy.mhtml" + docs = reader.load_data(input_path) + + assert len(docs) == 1 + assert docs[0].text.startswith("This is a test")