Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/source/dev/ords_architecture.rst
Original file line number Diff line number Diff line change
Expand Up @@ -561,7 +561,7 @@ for multiprocessing tasks.

import asyncio
import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
from elm.ords.extraction.ordinance import OrdinanceValidator
from elm.ords.services.provider import RunningAsyncServices
from elm.ords.services.openai import OpenAIService
Expand Down Expand Up @@ -616,7 +616,7 @@ for multiprocessing tasks.

import asyncio
import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
from elm.ords.extraction.ordinance import OrdinanceExtractor
from elm.ords.services.provider import RunningAsyncServices
from elm.ords.services.openai import OpenAIService
Expand Down
6 changes: 3 additions & 3 deletions elm/ords/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import openai
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters.character import RecursiveCharacterTextSplitter

from elm import ApiBase
from elm.utilities import validate_azure_api_params
Expand Down Expand Up @@ -137,11 +137,11 @@ async def process_counties_with_openai(
By default, ``4000``.
text_splitter_chunk_size : int, optional
Chunk size input to
`langchain.text_splitter.RecursiveCharacterTextSplitter`.
`langchain_text_splitters.character.RecursiveCharacterTextSplitter`.
By default, ``3000``.
text_splitter_chunk_overlap : int, optional
Chunk overlap input to
`langchain.text_splitter.RecursiveCharacterTextSplitter`.
`langchain_text_splitters.character.RecursiveCharacterTextSplitter`.
By default, ``300``.
num_urls_to_check_per_county : int, optional
Number of unique Google search result URL's to check for
Expand Down
17 changes: 17 additions & 0 deletions elm/utilities/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import html2text
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -60,6 +61,22 @@ def remove_blank_pages(pages):
return [page for page in pages if any(page.strip())]


def resembles_html(text):
    """Determine whether the input text looks like HTML markup.

    Parameters
    ----------
    text : str
        Input text which may be plaintext or HTML.

    Returns
    -------
    bool
        ``True`` if the text resembles HTML, ``False`` otherwise.
    """
    # BeautifulSoup only yields a tag when the input parses as markup;
    # for plain text, ``find()`` returns ``None``.
    first_tag = BeautifulSoup(text, 'html.parser').find()
    return first_tag is not None


def html_to_text(html, ignore_links=True):
"""Call to `HTML2Text` class with basic args.

Expand Down
2 changes: 1 addition & 1 deletion elm/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
ELM version number
"""

__version__ = "0.0.28"
__version__ = "0.0.29"
13 changes: 11 additions & 2 deletions elm/web/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
clean_headers,
html_to_text,
remove_blank_pages,
resembles_html,
format_html_tables,
read_pdf,
read_pdf_ocr,
Expand Down Expand Up @@ -263,6 +264,7 @@ class HTMLDocument(BaseDocument):
"""Default :func:`~elm.utilities.parse.format_html_tables` arguments"""
WRITE_KWARGS = {"mode": "w", "encoding": "utf-8"}
FILE_EXTENSION = "txt"
NUM_HTML_PARSE_ATTEMPTS = 3

def __init__(
self,
Expand Down Expand Up @@ -311,10 +313,17 @@ def __init__(
def _cleaned_text(self):
    """Compute cleaned plain text from the document's raw pages.

    The combined page text is pushed through the HTML processing step
    up to ``NUM_HTML_PARSE_ATTEMPTS`` times; after the first pass,
    processing repeats only while the intermediate result still
    resembles HTML (e.g. nested or escaped markup).
    """
    text = combine_pages(self.pages)
    attempts_left = self.NUM_HTML_PARSE_ATTEMPTS
    while attempts_left > 0:
        # First pass is unconditional; subsequent passes run only if
        # the previous output still looks like HTML.
        text = self._process_html_text(text)
        attempts_left -= 1
        if not resembles_html(text):
            break
    return text

def _process_html_text(self, text):
    """Convert HTML text to plain text with tables formatted.

    Links are kept or dropped per ``self.ignore_html_links``; tables
    are rendered using the instance's table-to-markdown options.
    """
    plain = html_to_text(text, self.ignore_html_links)
    return format_html_tables(plain, **self.html_table_to_markdown_kwargs)

def _raw_pages(self):
"""Get raw pages from document"""
if self.text_splitter is None:
Expand Down
37 changes: 30 additions & 7 deletions elm/web/file_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,28 +216,39 @@ async def _fetch_doc(self, url):
async with aiohttp.ClientSession() as session:
try:
logger.debug("Fetching content from %r", url)
url_bytes = await self._fetch_content_with_retry(url, session)
out = await self._fetch_content_with_retry(url, session)
except ELMRuntimeError:
logger.exception("Could not fetch content from %r", url)
return PDFDocument(pages=[]), None

raw_content, ct, charset = out
logger.debug("Got content from %r", url)
doc = await self.pdf_read_coroutine(url_bytes, **self.pdf_read_kwargs)
doc = await self.pdf_read_coroutine(raw_content,
**self.pdf_read_kwargs)
if not doc.empty:
return doc, url_bytes
return doc, raw_content

logger.debug("PDF read failed; fetching HTML content from %r", url)
doc = await self._fetch_html_using_pw_with_retry(url)
if not doc.empty:
return doc, doc.text

if self.pdf_ocr_read_coroutine:
if "text" in ct:
logger.debug("HTML read with playwright failed; fetching HTML "
"content from response with content type %r and "
"charset %r for %r", ct, charset, url)
doc = await self._try_load_doc_from_response_text(raw_content,
charset)
if not doc.empty:
return doc, doc.text

elif self.pdf_ocr_read_coroutine:
logger.debug("HTML read failed; fetching OCR content from %r", url)
doc = await self.pdf_ocr_read_coroutine(
url_bytes, **self.pdf_read_kwargs
raw_content, **self.pdf_read_kwargs
)

return doc, url_bytes
return doc, raw_content

async def _fetch_html_using_pw_with_retry(self, url):
"""Fetch HTML content with several retry attempts"""
Expand Down Expand Up @@ -277,7 +288,19 @@ async def _fetch_html_using_pw_with_retry(self, url):
async def _fetch_content_with_retry(self, url, session):
    """Fetch raw body, content type, and charset for a URL.

    Returns
    -------
    tuple
        ``(body, content_type, charset)``: the raw response bytes, the
        lower-cased MIME type, and the declared charset (falling back
        to ``'utf-8'`` when the response does not declare one).
    """
    # NOTE(review): retry behavior is presumably applied by a decorator
    # not visible in this hunk — confirm against the full class.
    async with session.get(url, **self.get_kwargs) as response:
        payload = await response.read()
        content_type = response.content_type.casefold()
        encoding = response.charset if response.charset else 'utf-8'
        return payload, content_type, encoding

async def _try_load_doc_from_response_text(self, raw_content, charset):
    """Try to load a document by decoding raw response bytes.

    Parameters
    ----------
    raw_content : bytes
        Raw response body to decode.
    charset : str
        Character encoding to decode the body with.

    Returns
    -------
    HTMLDocument
        Document parsed from the decoded text, or an empty
        ``HTMLDocument`` if the bytes cannot be decoded.
    """
    try:
        text = raw_content.decode(charset)
    except (UnicodeDecodeError, LookupError):
        # UnicodeDecodeError: bytes invalid for this charset.
        # LookupError: unknown/unsupported charset name.
        # Narrowed from bare ``except Exception`` so unrelated
        # failures are no longer silently swallowed.
        return HTMLDocument(pages=[])

    return await self.html_read_coroutine(text, **self.html_read_kwargs)

async def _cache_doc(self, doc, raw_content):
"""Cache doc if user provided a coroutine"""
Expand Down
2 changes: 1 addition & 1 deletion examples/ordinance_gpt/parse_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from functools import partial

import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters.character import RecursiveCharacterTextSplitter

from rex import init_logger
from elm.base import ApiBase
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -253,13 +253,13 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from functools import partial\n",
"from elm import ApiBase\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"from langchain_text_splitters.character import RecursiveCharacterTextSplitter\n",
"from elm.ords.utilities import RTS_SEPARATORS\n",
"\n",
"model = \"gpt-4\"\n",
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ google-api-python-client
google-search-results
html2text
httpx
langchain
langchain-text-splitters
lxml
matplotlib
networkx
Expand Down
2 changes: 2 additions & 0 deletions tests/ords/test_integrated.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
class MockResponse:
    def __init__(self, read_return):
        # Payload returned by the mocked ``read()`` coroutine.
        self.read_return = read_return
        # Mimic aiohttp response metadata consumed by the file loader.
        self.content_type = "application/pdf"
        self.charset = "utf-8"

    async def read(self):
        """Return the payload this mock was initialized with."""
        return self.read_return
Expand Down
2 changes: 1 addition & 1 deletion tests/ords/validation/test_validation_location.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import pytest
import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters.character import RecursiveCharacterTextSplitter

from elm import TEST_DATA_DIR, ApiBase
from elm.web.document import PDFDocument, HTMLDocument
Expand Down
2 changes: 2 additions & 0 deletions tests/web/test_web_file_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ class MockResponse:
    def __init__(self, read_return):
        """Store the desired read response."""
        self.read_return = read_return
        # Mimic aiohttp response metadata consumed by the file loader.
        self.content_type = "application/pdf"
        self.charset = "utf-8"

async def read(self):
"""Return what class was initialized with."""
Expand Down
Loading