diff --git a/docs/source/dev/ords_architecture.rst b/docs/source/dev/ords_architecture.rst
index 8ee6d53e..39fa6ea9 100644
--- a/docs/source/dev/ords_architecture.rst
+++ b/docs/source/dev/ords_architecture.rst
@@ -561,7 +561,7 @@ for multiprocessing tasks.
 
     import asyncio
     import openai
-    from langchain.text_splitter import RecursiveCharacterTextSplitter
+    from langchain_text_splitters.character import RecursiveCharacterTextSplitter
     from elm.ords.extraction.ordinance import OrdinanceValidator
     from elm.ords.services.provider import RunningAsyncServices
     from elm.ords.services.openai import OpenAIService
@@ -616,7 +616,7 @@ for multiprocessing tasks.
 
     import asyncio
    import openai
-    from langchain.text_splitter import RecursiveCharacterTextSplitter
+    from langchain_text_splitters.character import RecursiveCharacterTextSplitter
     from elm.ords.extraction.ordinance import OrdinanceExtractor
     from elm.ords.services.provider import RunningAsyncServices
     from elm.ords.services.openai import OpenAIService
diff --git a/elm/ords/process.py b/elm/ords/process.py
index 0123d399..91bb0aa5 100644
--- a/elm/ords/process.py
+++ b/elm/ords/process.py
@@ -10,7 +10,7 @@
 
 import openai
 import pandas as pd
-from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_text_splitters.character import RecursiveCharacterTextSplitter
 
 from elm import ApiBase
 from elm.utilities import validate_azure_api_params
@@ -137,11 +137,11 @@ async def process_counties_with_openai(
         By default, ``4000``.
     text_splitter_chunk_size : int, optional
         Chunk size input to
-        `langchain.text_splitter.RecursiveCharacterTextSplitter`.
+        `langchain_text_splitters.character.RecursiveCharacterTextSplitter`.
         By default, ``3000``.
     text_splitter_chunk_overlap : int, optional
         Chunk overlap input to
-        `langchain.text_splitter.RecursiveCharacterTextSplitter`.
+        `langchain_text_splitters.character.RecursiveCharacterTextSplitter`.
         By default, ``300``.
     num_urls_to_check_per_county : int, optional
         Number of unique Google search result URL's to check for
diff --git a/elm/utilities/parse.py b/elm/utilities/parse.py
index 0a4d7e90..0ac65fa8 100644
--- a/elm/utilities/parse.py
+++ b/elm/utilities/parse.py
@@ -8,6 +8,7 @@
 import html2text
 import numpy as np
 import pandas as pd
+from bs4 import BeautifulSoup
 
 logger = logging.getLogger(__name__)
 
@@ -60,6 +61,22 @@ def remove_blank_pages(pages):
     return [page for page in pages if any(page.strip())]
 
 
+def resembles_html(text):
+    """Check if text resembles HTML
+
+    Parameters
+    ----------
+    text : str
+        Input text, which may be plain text or HTML.
+
+    Returns
+    -------
+    bool
+        ``True`` if the text resembles HTML, ``False`` otherwise.
+    """
+    return bool(BeautifulSoup(text, "html.parser").find())
+
+
 def html_to_text(html, ignore_links=True):
     """Call to `HTML2Text` class with basic args.
 
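
Note: `resembles_html` treats any input that `html.parser` can turn into at
least one tag as HTML. A minimal sketch of the intended behavior (the sample
strings below are invented, not from the ELM test suite):

    from elm.utilities.parse import resembles_html

    # Plain prose produces no tags, so the check returns False.
    assert not resembles_html("Wind turbines require a 500 ft setback.")

    # Any parseable tag means the text still looks like HTML.
    assert resembles_html("<p>Wind turbines require a 500 ft setback.</p>")

    # Entity-escaped markup contains no tags yet, so it also maps to False.
    assert not resembles_html("&lt;p&gt;500 ft setback&lt;/p&gt;")

This is what drives the `HTMLDocument` change below: assuming `html2text`
unescapes character references (its default behavior), a page that wraps
escaped markup still resembles HTML after one conversion pass, so
`_cleaned_text` re-runs the conversion (up to `NUM_HTML_PARSE_ATTEMPTS` times)
until the check comes back `False`.
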
diff --git a/elm/version.py b/elm/version.py
index 9c04c46a..f2720333 100644
--- a/elm/version.py
+++ b/elm/version.py
@@ -2,4 +2,4 @@
 ELM version number
 """
 
-__version__ = "0.0.28"
+__version__ = "0.0.29"
diff --git a/elm/web/document.py b/elm/web/document.py
index 0bc93a1e..d57bdd65 100644
--- a/elm/web/document.py
+++ b/elm/web/document.py
@@ -13,6 +13,7 @@
     clean_headers,
     html_to_text,
     remove_blank_pages,
+    resembles_html,
     format_html_tables,
     read_pdf,
     read_pdf_ocr,
@@ -263,6 +264,7 @@ class HTMLDocument(BaseDocument):
     """Default :func:`~elm.utilities.parse.format_html_tables` arguments"""
     WRITE_KWARGS = {"mode": "w", "encoding": "utf-8"}
     FILE_EXTENSION = "txt"
+    NUM_HTML_PARSE_ATTEMPTS = 3
 
     def __init__(
         self,
@@ -311,10 +313,17 @@ def __init__(
     def _cleaned_text(self):
        """Compute cleaned text from document"""
         text = combine_pages(self.pages)
-        text = html_to_text(text, self.ignore_html_links)
-        text = format_html_tables(text, **self.html_table_to_markdown_kwargs)
+        for ind in range(self.NUM_HTML_PARSE_ATTEMPTS):
+            if ind > 0 and not resembles_html(text):
+                break
+            text = self._process_html_text(text)
         return text
 
+    def _process_html_text(self, text):
+        """Process HTML text to plain text with formatted tables"""
+        text = html_to_text(text, self.ignore_html_links)
+        return format_html_tables(text, **self.html_table_to_markdown_kwargs)
+
     def _raw_pages(self):
         """Get raw pages from document"""
         if self.text_splitter is None:
diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py
index 9cfc9bf7..91c8a47a 100644
--- a/elm/web/file_loader.py
+++ b/elm/web/file_loader.py
@@ -216,28 +216,39 @@ async def _fetch_doc(self, url):
         async with aiohttp.ClientSession() as session:
             try:
                 logger.debug("Fetching content from %r", url)
-                url_bytes = await self._fetch_content_with_retry(url, session)
+                out = await self._fetch_content_with_retry(url, session)
             except ELMRuntimeError:
                 logger.exception("Could not fetch content from %r", url)
                 return PDFDocument(pages=[]), None
 
+        raw_content, ct, charset = out
         logger.debug("Got content from %r", url)
-        doc = await self.pdf_read_coroutine(url_bytes, **self.pdf_read_kwargs)
+        doc = await self.pdf_read_coroutine(raw_content,
+                                            **self.pdf_read_kwargs)
         if not doc.empty:
-            return doc, url_bytes
+            return doc, raw_content
 
         logger.debug("PDF read failed; fetching HTML content from %r", url)
         doc = await self._fetch_html_using_pw_with_retry(url)
         if not doc.empty:
             return doc, doc.text
 
-        if self.pdf_ocr_read_coroutine:
+        if "text" in ct:
+            logger.debug("HTML read with playwright failed; fetching HTML "
+                         "content from response with content type %r and "
+                         "charset %r for %r", ct, charset, url)
+            doc = await self._try_load_doc_from_response_text(raw_content,
+                                                              charset)
+            if not doc.empty:
+                return doc, doc.text
+
+        elif self.pdf_ocr_read_coroutine:
             logger.debug("HTML read failed; fetching OCR content from %r", url)
             doc = await self.pdf_ocr_read_coroutine(
-                url_bytes, **self.pdf_read_kwargs
+                raw_content, **self.pdf_read_kwargs
             )
 
-        return doc, url_bytes
+        return doc, raw_content
 
     async def _fetch_html_using_pw_with_retry(self, url):
         """Fetch HTML content with several retry attempts"""
@@ -277,7 +288,19 @@ async def _fetch_html_using_pw_with_retry(self, url):
     async def _fetch_content_with_retry(self, url, session):
         """Fetch content from URL with several retry attempts"""
         async with session.get(url, **self.get_kwargs) as response:
-            return await response.read()
+            body = await response.read()
+            ct = response.content_type.casefold()
+            charset = response.charset or "utf-8"
+            return body, ct, charset
+
+    async def _try_load_doc_from_response_text(self, raw_content, charset):
+        """Try to load document by decoding response text"""
+        try:
+            text = raw_content.decode(charset)
+        except Exception:
+            return HTMLDocument(pages=[])
+
+        return await self.html_read_coroutine(text, **self.html_read_kwargs)
 
     async def _cache_doc(self, doc, raw_content):
         """Cache doc if user provided a coroutine"""
diff --git a/examples/ordinance_gpt/parse_pdf.py b/examples/ordinance_gpt/parse_pdf.py
index 82362e5f..911025ec 100644
--- a/examples/ordinance_gpt/parse_pdf.py
+++ b/examples/ordinance_gpt/parse_pdf.py
@@ -2,7 +2,7 @@
 from functools import partial
 
 import openai
-from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_text_splitters.character import RecursiveCharacterTextSplitter
 from rex import init_logger
 
 from elm.base import ApiBase
diff --git a/examples/web_information_retrieval/example_search_retrieval_wiki.ipynb b/examples/web_information_retrieval/example_search_retrieval_wiki.ipynb
index c7bbacdd..043fea7b 100644
--- a/examples/web_information_retrieval/example_search_retrieval_wiki.ipynb
+++ b/examples/web_information_retrieval/example_search_retrieval_wiki.ipynb
@@ -253,13 +253,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "from functools import partial\n",
    "from elm import ApiBase\n",
-    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
+    "from langchain_text_splitters.character import RecursiveCharacterTextSplitter\n",
    "from elm.ords.utilities import RTS_SEPARATORS\n",
    "\n",
    "model = \"gpt-4\"\n",
diff --git a/requirements.txt b/requirements.txt
index b65e220a..907e39c2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,7 +10,7 @@ google-api-python-client
 google-search-results
 html2text
 httpx
-langchain
+langchain-text-splitters
 lxml
 matplotlib
 networkx
diff --git a/tests/ords/test_integrated.py b/tests/ords/test_integrated.py
index 834caf69..d1c36925 100644
--- a/tests/ords/test_integrated.py
+++ b/tests/ords/test_integrated.py
@@ -31,6 +31,8 @@ class MockResponse:
 
     def __init__(self, read_return):
         self.read_return = read_return
+        self.content_type = "application/pdf"
+        self.charset = "utf-8"
 
     async def read(self):
         return self.read_return
diff --git a/tests/ords/validation/test_validation_location.py b/tests/ords/validation/test_validation_location.py
index bee03762..80da1a4c 100644
--- a/tests/ords/validation/test_validation_location.py
+++ b/tests/ords/validation/test_validation_location.py
@@ -6,7 +6,7 @@
 
 import pytest
 import openai
-from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_text_splitters.character import RecursiveCharacterTextSplitter
 
 from elm import TEST_DATA_DIR, ApiBase
 from elm.web.document import PDFDocument, HTMLDocument
diff --git a/tests/web/test_web_file_loader.py b/tests/web/test_web_file_loader.py
index 8a708b8b..29bbbf55 100644
--- a/tests/web/test_web_file_loader.py
+++ b/tests/web/test_web_file_loader.py
@@ -27,6 +27,8 @@ class MockResponse:
     def __init__(self, read_return):
         """Store the desired read response."""
         self.read_return = read_return
+        self.content_type = "application/pdf"
+        self.charset = "utf-8"
 
     async def read(self):
         """Return what class was initialized with."""
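
For reference, `_fetch_content_with_retry` now returns a
`(body, content_type, charset)` triple instead of bare bytes, and
`_fetch_doc` only attempts the decode fallback for text-like responses. A
self-contained sketch of that contract (the URL is a placeholder;
`content_type` and `charset` are real `aiohttp.ClientResponse` attributes,
but `fetch_with_metadata` itself is illustrative, not ELM API):

    import asyncio

    import aiohttp

    async def fetch_with_metadata(url):
        """Mirror the new (body, content type, charset) fetch contract."""
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                body = await response.read()
                ct = response.content_type.casefold()
                charset = response.charset or "utf-8"  # header may omit charset
                return body, ct, charset

    body, ct, charset = asyncio.run(fetch_with_metadata("https://example.com"))
    if "text" in ct:  # same guard as the new branch in _fetch_doc
        print(body.decode(charset, errors="replace")[:80])

The decode can still fail when a server mislabels its charset, which is why
`_try_load_doc_from_response_text` wraps `raw_content.decode(charset)` and
falls back to an empty `HTMLDocument` rather than raising.
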