From d1b6c67647f3e5eb17c063135ca194bbb922b6d9 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 17 Oct 2025 14:05:02 -0600 Subject: [PATCH 01/12] Add `resembles_html` function --- elm/utilities/parse.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/elm/utilities/parse.py b/elm/utilities/parse.py index 0a4d7e90..0ac65fa8 100644 --- a/elm/utilities/parse.py +++ b/elm/utilities/parse.py @@ -8,6 +8,7 @@ import html2text import numpy as np import pandas as pd +from bs4 import BeautifulSoup logger = logging.getLogger(__name__) @@ -60,6 +61,22 @@ def remove_blank_pages(pages): return [page for page in pages if any(page.strip())] +def resembles_html(text): + """Check if text resembles HTML + + Parameters + ---------- + text : str + Input text which may be plaintext or HTML. + + Returns + ------- + bool + ``True`` if the text resembles HTML, ``False`` otherwise. + """ + return bool(BeautifulSoup(text, 'html.parser').find()) + + def html_to_text(html, ignore_links=True): """Call to `HTML2Text` class with basic args. From 7cb14d48c5f468d8a789f80fcc579e2215b93c5d Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 17 Oct 2025 14:05:25 -0600 Subject: [PATCH 02/12] Attempt to re-process html if it's still detected in text --- elm/web/document.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/elm/web/document.py b/elm/web/document.py index 0bc93a1e..d57bdd65 100644 --- a/elm/web/document.py +++ b/elm/web/document.py @@ -13,6 +13,7 @@ clean_headers, html_to_text, remove_blank_pages, + resembles_html, format_html_tables, read_pdf, read_pdf_ocr, @@ -263,6 +264,7 @@ class HTMLDocument(BaseDocument): """Default :func:`~elm.utilities.parse.format_html_tables` arguments""" WRITE_KWARGS = {"mode": "w", "encoding": "utf-8"} FILE_EXTENSION = "txt" + NUM_HTML_PARSE_ATTEMPTS = 3 def __init__( self, @@ -311,10 +313,17 @@ def __init__( def _cleaned_text(self): """Compute cleaned text from document""" text = combine_pages(self.pages) - text = html_to_text(text, self.ignore_html_links) - text = format_html_tables(text, **self.html_table_to_markdown_kwargs) + for ind in range(self.NUM_HTML_PARSE_ATTEMPTS): + if ind > 0 and not resembles_html(text): + break + text = self._process_html_text(text) return text + def _process_html_text(self, text): + """Process HTML text to plain text with formatted tables""" + text = html_to_text(text, self.ignore_html_links) + return format_html_tables(text, **self.html_table_to_markdown_kwargs) + def _raw_pages(self): """Get raw pages from document""" if self.text_splitter is None: From 76084d06b5666fe64f3a013e2ec5c1dc879be815 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 17 Oct 2025 14:05:42 -0600 Subject: [PATCH 03/12] Bump version --- elm/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/elm/version.py b/elm/version.py index 9c04c46a..f2720333 100644 --- a/elm/version.py +++ b/elm/version.py @@ -2,4 +2,4 @@ ELM version number """ -__version__ = "0.0.28" +__version__ = "0.0.29" From 48cc7ec19d7a6ebd6e52da67a346bdd2cc854885 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 17 Oct 2025 15:12:32 -0600 Subject: [PATCH 04/12] Refactor method --- elm/web/file_loader.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index 9cfc9bf7..eb88d761 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -216,7 +216,7 @@ async def _fetch_doc(self, url): async with aiohttp.ClientSession() as session: try: logger.debug("Fetching content from %r", url) - url_bytes = await self._fetch_content_with_retry(url, session) + out = await self._fetch_content_with_retry(url, session) except ELMRuntimeError: logger.exception("Could not fetch content from %r", url) return PDFDocument(pages=[]), None @@ -277,7 +277,10 @@ async def _fetch_html_using_pw_with_retry(self, url): async def _fetch_content_with_retry(self, url, session): """Fetch content from URL with several retry attempts""" async with session.get(url, **self.get_kwargs) as response: - return await response.read() + body = await response.read() + ct = response.content_type + charset = response.charset or 'utf-8' + return body, ct, charset async def _cache_doc(self, doc, raw_content): """Cache doc if user provided a coroutine""" From ec45cd537de574d74ae230e343545bff540b39f4 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 17 Oct 2025 15:13:06 -0600 Subject: [PATCH 05/12] Unpack response --- elm/web/file_loader.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index eb88d761..3e61ea29 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -221,10 +221,12 @@ async def _fetch_doc(self, url): logger.exception("Could not fetch content from %r", url) return PDFDocument(pages=[]), None + raw_content, ct, charset = out logger.debug("Got content from %r", url) - doc = await self.pdf_read_coroutine(url_bytes, **self.pdf_read_kwargs) + doc = await self.pdf_read_coroutine(raw_content, + **self.pdf_read_kwargs) if not doc.empty: - return doc, url_bytes + return doc, raw_content logger.debug("PDF read failed; fetching HTML content from %r", url) doc = await self._fetch_html_using_pw_with_retry(url) @@ -237,7 +239,7 @@ async def _fetch_doc(self, url): url_bytes, **self.pdf_read_kwargs ) - return doc, url_bytes + return doc, raw_content async def _fetch_html_using_pw_with_retry(self, url): """Fetch HTML content with several retry attempts""" From 95d3f23084e84a2e0f88cefc371d6df4a966febb Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 17 Oct 2025 15:13:15 -0600 Subject: [PATCH 06/12] variable update --- elm/web/file_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index 3e61ea29..2550170f 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -236,7 +236,7 @@ async def _fetch_doc(self, url): if self.pdf_ocr_read_coroutine: logger.debug("HTML read failed; fetching OCR content from %r", url) doc = await self.pdf_ocr_read_coroutine( - url_bytes, **self.pdf_read_kwargs + raw_content, **self.pdf_read_kwargs ) return doc, raw_content From 949bcfd4ac555c305aaecd383953a403b6a3eb65 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 17 Oct 2025 15:13:28 -0600 Subject: [PATCH 07/12] Add method to load doc based on text content --- elm/web/file_loader.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index 2550170f..4d36977f 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -284,6 +284,15 @@ async def _fetch_content_with_retry(self, url, session): charset = response.charset or 'utf-8' return body, ct, charset + async def _try_load_doc_from_response_text(self, raw_content, charset): + """Try to load document by decoding response text""" + try: + text = raw_content.decode(charset) + except Exception: + return HTMLDocument(pages=[]) + + return await self.html_read_coroutine(text, **self.html_read_kwargs) + async def _cache_doc(self, doc, raw_content): """Cache doc if user provided a coroutine""" if doc.empty or not raw_content: From 00a7b4979f6c0c4b62f29cd223f851d9b194103c Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 17 Oct 2025 15:14:25 -0600 Subject: [PATCH 08/12] Add branch to load html text from content if PW loading fails --- elm/web/file_loader.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index 4d36977f..86e2deb4 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -233,6 +233,15 @@ async def _fetch_doc(self, url): if not doc.empty: return doc, doc.text + if "text" in ct: + logger.debug("HTML read with playwright failed; fetching HTML " + "content from response with content type %r and " + "charset %r for %r", ct, charset, url) + doc = await self._try_load_doc_from_response_text(raw_content, + charset) + if not doc.empty: + return doc, doc.text + if self.pdf_ocr_read_coroutine: logger.debug("HTML read failed; fetching OCR content from %r", url) doc = await self.pdf_ocr_read_coroutine( From c57d2e0e4535bc04a628386dd48d1ba780ce8715 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 17 Oct 2025 15:17:43 -0600 Subject: [PATCH 09/12] lowercase content type --- elm/web/file_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index 86e2deb4..43070ce1 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -289,7 +289,7 @@ async def _fetch_content_with_retry(self, url, session): """Fetch content from URL with several retry attempts""" async with session.get(url, **self.get_kwargs) as response: body = await response.read() - ct = response.content_type + ct = response.content_type.casefold() charset = response.charset or 'utf-8' return body, ct, charset From 5bd7e823713b1c69e994af93f3697570d7152b15 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 17 Oct 2025 15:53:50 -0600 Subject: [PATCH 10/12] Update dep --- docs/source/dev/ords_architecture.rst | 4 ++-- elm/ords/process.py | 6 +++--- examples/ordinance_gpt/parse_pdf.py | 2 +- .../example_search_retrieval_wiki.ipynb | 4 ++-- requirements.txt | 2 +- tests/ords/validation/test_validation_location.py | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/source/dev/ords_architecture.rst b/docs/source/dev/ords_architecture.rst index 8ee6d53e..39fa6ea9 100644 --- a/docs/source/dev/ords_architecture.rst +++ b/docs/source/dev/ords_architecture.rst @@ -561,7 +561,7 @@ for multiprocessing tasks. import asyncio import openai - from langchain.text_splitter import RecursiveCharacterTextSplitter + from langchain_text_splitters.character import RecursiveCharacterTextSplitter from elm.ords.extraction.ordinance import OrdinanceValidator from elm.ords.services.provider import RunningAsyncServices from elm.ords.services.openai import OpenAIService @@ -616,7 +616,7 @@ for multiprocessing tasks. import asyncio import openai - from langchain.text_splitter import RecursiveCharacterTextSplitter + from langchain_text_splitters.character import RecursiveCharacterTextSplitter from elm.ords.extraction.ordinance import OrdinanceExtractor from elm.ords.services.provider import RunningAsyncServices from elm.ords.services.openai import OpenAIService diff --git a/elm/ords/process.py b/elm/ords/process.py index 0123d399..91bb0aa5 100644 --- a/elm/ords/process.py +++ b/elm/ords/process.py @@ -10,7 +10,7 @@ import openai import pandas as pd -from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_text_splitters.character import RecursiveCharacterTextSplitter from elm import ApiBase from elm.utilities import validate_azure_api_params @@ -137,11 +137,11 @@ async def process_counties_with_openai( By default, ``4000``. text_splitter_chunk_size : int, optional Chunk size input to - `langchain.text_splitter.RecursiveCharacterTextSplitter`. + `langchain_text_splitters.character.RecursiveCharacterTextSplitter`. By default, ``3000``. text_splitter_chunk_overlap : int, optional Chunk overlap input to - `langchain.text_splitter.RecursiveCharacterTextSplitter`. + `langchain_text_splitters.character.RecursiveCharacterTextSplitter`. By default, ``300``. num_urls_to_check_per_county : int, optional Number of unique Google search result URL's to check for diff --git a/examples/ordinance_gpt/parse_pdf.py b/examples/ordinance_gpt/parse_pdf.py index 82362e5f..911025ec 100644 --- a/examples/ordinance_gpt/parse_pdf.py +++ b/examples/ordinance_gpt/parse_pdf.py @@ -2,7 +2,7 @@ from functools import partial import openai -from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_text_splitters.character import RecursiveCharacterTextSplitter from rex import init_logger from elm.base import ApiBase diff --git a/examples/web_information_retrieval/example_search_retrieval_wiki.ipynb b/examples/web_information_retrieval/example_search_retrieval_wiki.ipynb index c7bbacdd..043fea7b 100644 --- a/examples/web_information_retrieval/example_search_retrieval_wiki.ipynb +++ b/examples/web_information_retrieval/example_search_retrieval_wiki.ipynb @@ -253,13 +253,13 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from functools import partial\n", "from elm import ApiBase\n", - "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "from langchain_text_splitters.character import RecursiveCharacterTextSplitter\n", "from elm.ords.utilities import RTS_SEPARATORS\n", "\n", "model = \"gpt-4\"\n", diff --git a/requirements.txt b/requirements.txt index b65e220a..907e39c2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ google-api-python-client google-search-results html2text httpx -langchain +langchain-text-splitters lxml matplotlib networkx diff --git a/tests/ords/validation/test_validation_location.py b/tests/ords/validation/test_validation_location.py index bee03762..80da1a4c 100644 --- a/tests/ords/validation/test_validation_location.py +++ b/tests/ords/validation/test_validation_location.py @@ -6,7 +6,7 @@ import pytest import openai -from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_text_splitters.character import RecursiveCharacterTextSplitter from elm import TEST_DATA_DIR, ApiBase from elm.web.document import PDFDocument, HTMLDocument From c4ec26e10f3a6f85ab79c531b2064338e74319b4 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 17 Oct 2025 16:16:12 -0600 Subject: [PATCH 11/12] Fix tests --- tests/ords/test_integrated.py | 2 ++ tests/web/test_web_file_loader.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tests/ords/test_integrated.py b/tests/ords/test_integrated.py index 834caf69..d1c36925 100644 --- a/tests/ords/test_integrated.py +++ b/tests/ords/test_integrated.py @@ -31,6 +31,8 @@ class MockResponse: def __init__(self, read_return): self.read_return = read_return + self.content_type = "application/pdf" + self.charset = "utf-8" async def read(self): return self.read_return diff --git a/tests/web/test_web_file_loader.py b/tests/web/test_web_file_loader.py index 8a708b8b..29bbbf55 100644 --- a/tests/web/test_web_file_loader.py +++ b/tests/web/test_web_file_loader.py @@ -27,6 +27,8 @@ class MockResponse: def __init__(self, read_return): """Store the desired read response.""" self.read_return = read_return + self.content_type = "application/pdf" + self.charset = "utf-8" async def read(self): """Return what class was initialized with.""" From 8642f6ba3b6b8d201d684527e0ed043bab50ede1 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Fri, 17 Oct 2025 16:35:13 -0600 Subject: [PATCH 12/12] Don't try OCR if "text" in content type --- elm/web/file_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index 43070ce1..91c8a47a 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -242,7 +242,7 @@ async def _fetch_doc(self, url): if not doc.empty: return doc, doc.text - if self.pdf_ocr_read_coroutine: + elif self.pdf_ocr_read_coroutine: logger.debug("HTML read failed; fetching OCR content from %r", url) doc = await self.pdf_ocr_read_coroutine( raw_content, **self.pdf_read_kwargs