Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/source/dev/ords_architecture.rst
Original file line number Diff line number Diff line change
Expand Up @@ -561,7 +561,7 @@ for multiprocessing tasks.

import asyncio
import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
from elm.ords.extraction.ordinance import OrdinanceValidator
from elm.ords.services.provider import RunningAsyncServices
from elm.ords.services.openai import OpenAIService
Expand Down Expand Up @@ -616,7 +616,7 @@ for multiprocessing tasks.

import asyncio
import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
from elm.ords.extraction.ordinance import OrdinanceExtractor
from elm.ords.services.provider import RunningAsyncServices
from elm.ords.services.openai import OpenAIService
Expand Down
6 changes: 3 additions & 3 deletions elm/ords/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import openai
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters.character import RecursiveCharacterTextSplitter

from elm import ApiBase
from elm.utilities import validate_azure_api_params
Expand Down Expand Up @@ -137,11 +137,11 @@ async def process_counties_with_openai(
By default, ``4000``.
text_splitter_chunk_size : int, optional
Chunk size input to
`langchain.text_splitter.RecursiveCharacterTextSplitter`.
`langchain_text_splitters.character.RecursiveCharacterTextSplitter`.
By default, ``3000``.
text_splitter_chunk_overlap : int, optional
Chunk overlap input to
`langchain.text_splitter.RecursiveCharacterTextSplitter`.
`langchain_text_splitters.character.RecursiveCharacterTextSplitter`.
By default, ``300``.
num_urls_to_check_per_county : int, optional
Number of unique Google search result URL's to check for
Expand Down
17 changes: 17 additions & 0 deletions elm/utilities/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import html2text
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -60,6 +61,22 @@ def remove_blank_pages(pages):
return [page for page in pages if any(page.strip())]


def resembles_html(text):
    """Determine whether the input text looks like HTML markup.

    Parameters
    ----------
    text : str
        Input text which may be plaintext or HTML.

    Returns
    -------
    bool
        ``True`` if the text resembles HTML, ``False`` otherwise.
    """
    # BeautifulSoup only yields a tag when the input parses as markup;
    # for plain text, ``find()`` returns ``None``.
    first_tag = BeautifulSoup(text, 'html.parser').find()
    return first_tag is not None


def html_to_text(html, ignore_links=True):
"""Call to `HTML2Text` class with basic args.

Expand Down
2 changes: 1 addition & 1 deletion elm/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
ELM version number
"""

__version__ = "0.0.28"
__version__ = "0.0.29"
13 changes: 11 additions & 2 deletions elm/web/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
clean_headers,
html_to_text,
remove_blank_pages,
resembles_html,
format_html_tables,
read_pdf,
read_pdf_ocr,
Expand Down Expand Up @@ -263,6 +264,7 @@ class HTMLDocument(BaseDocument):
"""Default :func:`~elm.utilities.parse.format_html_tables` arguments"""
WRITE_KWARGS = {"mode": "w", "encoding": "utf-8"}
FILE_EXTENSION = "txt"
NUM_HTML_PARSE_ATTEMPTS = 3

def __init__(
self,
Expand Down Expand Up @@ -311,10 +313,17 @@ def __init__(
def _cleaned_text(self):
    """Compute cleaned plain text from the document's raw pages.

    The combined page text is pushed through the HTML processing step
    up to ``NUM_HTML_PARSE_ATTEMPTS`` times; after the first pass,
    processing repeats only while the intermediate result still
    resembles HTML (e.g. nested or escaped markup).
    """
    text = combine_pages(self.pages)
    attempts_left = self.NUM_HTML_PARSE_ATTEMPTS
    while attempts_left > 0:
        # First pass is unconditional; subsequent passes run only if
        # the previous output still looks like HTML.
        text = self._process_html_text(text)
        attempts_left -= 1
        if not resembles_html(text):
            break
    return text

def _process_html_text(self, text):
    """Convert HTML text to plain text with tables formatted.

    Links are kept or dropped per ``self.ignore_html_links``; tables
    are rendered using the instance's table-to-markdown options.
    """
    plain = html_to_text(text, self.ignore_html_links)
    return format_html_tables(plain, **self.html_table_to_markdown_kwargs)

def _raw_pages(self):
"""Get raw pages from document"""
if self.text_splitter is None:
Expand Down
37 changes: 30 additions & 7 deletions elm/web/file_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,28 +216,39 @@ async def _fetch_doc(self, url):
async with aiohttp.ClientSession() as session:
try:
logger.debug("Fetching content from %r", url)
url_bytes = await self._fetch_content_with_retry(url, session)
out = await self._fetch_content_with_retry(url, session)
except ELMRuntimeError:
logger.exception("Could not fetch content from %r", url)
return PDFDocument(pages=[]), None

raw_content, ct, charset = out
logger.debug("Got content from %r", url)
doc = await self.pdf_read_coroutine(url_bytes, **self.pdf_read_kwargs)
doc = await self.pdf_read_coroutine(raw_content,
**self.pdf_read_kwargs)
if not doc.empty:
return doc, url_bytes
return doc, raw_content

logger.debug("PDF read failed; fetching HTML content from %r", url)
doc = await self._fetch_html_using_pw_with_retry(url)
if not doc.empty:
return doc, doc.text

if self.pdf_ocr_read_coroutine:
if "text" in ct:
logger.debug("HTML read with playwright failed; fetching HTML "
"content from response with content type %r and "
"charset %r for %r", ct, charset, url)
doc = await self._try_load_doc_from_response_text(raw_content,
charset)
if not doc.empty:
return doc, doc.text

elif self.pdf_ocr_read_coroutine:
logger.debug("HTML read failed; fetching OCR content from %r", url)
doc = await self.pdf_ocr_read_coroutine(
url_bytes, **self.pdf_read_kwargs
raw_content, **self.pdf_read_kwargs
)

return doc, url_bytes
return doc, raw_content

async def _fetch_html_using_pw_with_retry(self, url):
"""Fetch HTML content with several retry attempts"""
Expand Down Expand Up @@ -277,7 +288,19 @@ async def _fetch_html_using_pw_with_retry(self, url):
async def _fetch_content_with_retry(self, url, session):
    """Fetch raw body, content type, and charset for a URL.

    Returns
    -------
    tuple
        ``(body, content_type, charset)``: the raw response bytes, the
        lower-cased MIME type, and the declared charset (falling back
        to ``'utf-8'`` when the response does not declare one).
    """
    # NOTE(review): retry behavior is presumably applied by a decorator
    # not visible in this hunk — confirm against the full class.
    async with session.get(url, **self.get_kwargs) as response:
        payload = await response.read()
        content_type = response.content_type.casefold()
        encoding = response.charset if response.charset else 'utf-8'
        return payload, content_type, encoding

async def _try_load_doc_from_response_text(self, raw_content, charset):
    """Try to load a document by decoding raw response bytes.

    Parameters
    ----------
    raw_content : bytes
        Raw response body to decode.
    charset : str
        Character encoding to decode the body with.

    Returns
    -------
    HTMLDocument
        Document parsed from the decoded text, or an empty
        ``HTMLDocument`` if the bytes cannot be decoded.
    """
    try:
        text = raw_content.decode(charset)
    except (UnicodeDecodeError, LookupError):
        # UnicodeDecodeError: bytes invalid for this charset.
        # LookupError: unknown/unsupported charset name.
        # Narrowed from bare ``except Exception`` so unrelated
        # failures are no longer silently swallowed.
        return HTMLDocument(pages=[])

    return await self.html_read_coroutine(text, **self.html_read_kwargs)

async def _cache_doc(self, doc, raw_content):
"""Cache doc if user provided a coroutine"""
Expand Down
2 changes: 1 addition & 1 deletion examples/ordinance_gpt/parse_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from functools import partial

import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters.character import RecursiveCharacterTextSplitter

from rex import init_logger
from elm.base import ApiBase
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -253,13 +253,13 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from functools import partial\n",
"from elm import ApiBase\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"from langchain_text_splitters.character import RecursiveCharacterTextSplitter\n",
"from elm.ords.utilities import RTS_SEPARATORS\n",
"\n",
"model = \"gpt-4\"\n",
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ google-api-python-client
google-search-results
html2text
httpx
langchain
langchain-text-splitters
lxml
matplotlib
networkx
Expand Down
2 changes: 2 additions & 0 deletions tests/ords/test_integrated.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
class MockResponse:
    def __init__(self, read_return):
        # Payload returned by the mocked ``read()`` coroutine.
        self.read_return = read_return
        # Mimic aiohttp response metadata consumed by the file loader.
        self.content_type = "application/pdf"
        self.charset = "utf-8"

    async def read(self):
        """Return the payload this mock was initialized with."""
        return self.read_return
Expand Down
2 changes: 1 addition & 1 deletion tests/ords/validation/test_validation_location.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import pytest
import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters.character import RecursiveCharacterTextSplitter

from elm import TEST_DATA_DIR, ApiBase
from elm.web.document import PDFDocument, HTMLDocument
Expand Down
2 changes: 2 additions & 0 deletions tests/web/test_web_file_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ class MockResponse:
    def __init__(self, read_return):
        """Store the desired read response."""
        self.read_return = read_return
        # Mimic aiohttp response metadata consumed by the file loader.
        self.content_type = "application/pdf"
        self.charset = "utf-8"

async def read(self):
"""Return what class was initialized with."""
Expand Down
Loading