diff --git a/elm/version.py b/elm/version.py
index 6f80db09..b7605d28 100644
--- a/elm/version.py
+++ b/elm/version.py
@@ -2,4 +2,4 @@
 ELM version number
 """
 
-__version__ = "0.0.31"
+__version__ = "0.0.32"
diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py
index 91c8a47a..a3a148b2 100644
--- a/elm/web/file_loader.py
+++ b/elm/web/file_loader.py
@@ -2,6 +2,8 @@
 """ELM Web file loader class."""
 import asyncio
 import logging
+from pathlib import Path
+from abc import ABC, abstractmethod
 
 import aiohttp
 from fake_useragent import UserAgent
@@ -29,7 +31,148 @@ async def _read_html_doc(text, **kwargs):
     return HTMLDocument([text], **kwargs)
 
 
-class AsyncFileLoader:
+async def _read_pdf_file(pdf_fp, **kwargs):
+    """Default read PDF file function (runs in main thread)"""
+    verbose = kwargs.pop("verbose", True)
+    with open(pdf_fp, "rb") as fh:
+        pdf_bytes = fh.read()
+    pages = read_pdf(pdf_bytes, verbose=verbose)
+    return PDFDocument(pages, **kwargs), pdf_bytes
+
+
+async def _read_html_file(html_fp, **kwargs):
+    """Default read HTML file function (UTF-8; runs in main thread)"""
+    with open(html_fp, "r", encoding="utf-8") as fh:
+        text = fh.read()
+    return HTMLDocument([text], **kwargs), text
+
+
+class BaseAsyncFileLoader(ABC):
+    """Base class for async file loading"""
+
+    def __init__(
+        self,
+        pdf_read_coroutine,
+        html_read_coroutine,
+        pdf_read_kwargs=None,
+        html_read_kwargs=None,
+        pdf_ocr_read_coroutine=None,
+        file_cache_coroutine=None,
+        **__,  # consume any extra kwargs
+    ):
+        """
+
+        Parameters
+        ----------
+        pdf_read_coroutine : callable
+            PDF file read coroutine. Must be an async function.
+            Must return a :obj:`elm.web.document.PDFDocument`.
+        html_read_coroutine : callable
+            HTML file read coroutine. Must be an async function.
+            Must return a :obj:`elm.web.document.HTMLDocument`.
+        pdf_read_kwargs : dict, optional
+            Keyword-value argument pairs to pass to the
+            `pdf_read_coroutine`. By default, ``None``.
+        html_read_kwargs : dict, optional
+            Keyword-value argument pairs to pass to the
+            `html_read_coroutine`. By default, ``None``.
+        pdf_ocr_read_coroutine : callable, optional
+            PDF OCR file read coroutine. Must be an async function.
+            Should accept PDF bytes as the first argument and kwargs as
+            the rest. Must return a :obj:`elm.web.document.PDFDocument`.
+            If ``None``, PDF OCR parsing is not attempted, and any
+            scanned PDF URL's will return a blank document.
+            By default, ``None``.
+        file_cache_coroutine : callable, optional
+            File caching coroutine. Can be used to cache files
+            downloaded by this class. Must accept an
+            :obj:`~elm.web.document.Document` instance as the first
+            argument and the file content to be written as the second
+            argument. If this method is not provided, no document
+            caching is performed. By default, ``None``.
+        """
+        self.pdf_read_coroutine = pdf_read_coroutine
+        self.html_read_coroutine = html_read_coroutine
+        self.pdf_read_kwargs = pdf_read_kwargs or {}
+        self.html_read_kwargs = html_read_kwargs or {}
+        self.pdf_ocr_read_coroutine = pdf_ocr_read_coroutine
+        self.file_cache_coroutine = file_cache_coroutine
+
+    async def fetch_all(self, *sources):
+        """Fetch documents for all requested sources.
+
+        Parameters
+        ----------
+        *sources
+            Iterable of sources (as strings) used to fetch the
+            documents.
+
+        Returns
+        -------
+        list
+            List of documents, one per requested source.
+        """
+        outer_task_name = asyncio.current_task().get_name()
+        fetches = [
+            asyncio.create_task(self.fetch(source), name=outer_task_name)
+            for source in sources
+        ]
+        return await asyncio.gather(*fetches)
+
+    async def fetch(self, source):
+        """Fetch a document for the given source.
+
+        Parameters
+        ----------
+        source : str
+            Source used to load the document.
+
+        Returns
+        -------
+        :class:`elm.web.document.Document`
+            Document instance containing text, if the load was
+            successful.
+        """
+        try:
+            doc, raw = await self._fetch_doc_with_url_in_metadata(source)
+        except KeyboardInterrupt:
+            raise
+        except Exception as e:
+            msg = ("Encountered error of type %r while fetching document from "
+                   "%s:")
+            err_type = type(e)
+            logger.exception(msg, err_type, source)
+            return HTMLDocument(pages=[])
+
+        doc = await self._cache_doc(doc, raw)
+        return doc
+
+    async def _fetch_doc_with_url_in_metadata(self, source):
+        """Fetch doc contents and add source to metadata"""
+        doc, raw_content = await self._fetch_doc(source)
+        doc.attrs["source"] = source
+        return doc, raw_content
+
+    async def _cache_doc(self, doc, raw_content):
+        """Cache doc if user provided a coroutine"""
+        if doc.empty or not raw_content:
+            return doc
+
+        if not self.file_cache_coroutine:
+            return doc
+
+        cache_fn = await self.file_cache_coroutine(doc, raw_content)
+        if cache_fn is not None:
+            doc.attrs["cache_fn"] = cache_fn
+        return doc
+
+    @abstractmethod
+    async def _fetch_doc(self, source):
+        """Fetch documents given a source"""
+        raise NotImplementedError
+
+
+class AsyncWebFileLoader(BaseAsyncFileLoader):
     """Async web file (PDF or HTML) loader
 
     Purpose:
@@ -132,18 +275,20 @@ def __init__(
             of attempts will always be 2, even if the user provides a
             value smaller than this. By default, ``3``.
         """
+        super().__init__(
+            pdf_read_coroutine=pdf_read_coroutine or _read_pdf_doc,
+            html_read_coroutine=html_read_coroutine or _read_html_doc,
+            pdf_read_kwargs=pdf_read_kwargs,
+            html_read_kwargs=html_read_kwargs,
+            pdf_ocr_read_coroutine=pdf_ocr_read_coroutine,
+            file_cache_coroutine=file_cache_coroutine
+        )
         self.pw_launch_kwargs = pw_launch_kwargs or {}
-        self.pdf_read_kwargs = pdf_read_kwargs or {}
-        self.html_read_kwargs = html_read_kwargs or {}
         self.get_kwargs = {
             "headers": self._header_from_template(header_template),
             "ssl": None if verify_ssl else False,
             **(aget_kwargs or {}),
         }
-        self.pdf_read_coroutine = pdf_read_coroutine or _read_pdf_doc
-        self.html_read_coroutine = html_read_coroutine or _read_html_doc
-        self.pdf_ocr_read_coroutine = pdf_ocr_read_coroutine
-        self.file_cache_coroutine = file_cache_coroutine
         self.browser_semaphore = browser_semaphore
         self.uss = use_scrapling_stealth
         self.num_pw_html_retries = num_pw_html_retries
@@ -156,60 +301,6 @@ def _header_from_template(self, header_template):
             headers["User-Agent"] = UserAgent().random
         return headers
 
-    async def fetch_all(self, *urls):
-        """Fetch documents for all requested URL's.
-
-        Parameters
-        ----------
-        *urls
-            Iterable of URL's (as strings) to fetch.
-
-        Returns
-        -------
-        list
-            List of documents, one per requested URL.
-        """
-        outer_task_name = asyncio.current_task().get_name()
-        fetches = [
-            asyncio.create_task(self.fetch(url), name=outer_task_name)
-            for url in urls
-        ]
-        return await asyncio.gather(*fetches)
-
-    async def fetch(self, url):
-        """Fetch a document for the given URL.
-
-        Parameters
-        ----------
-        url : str
-            URL for the document to pull down.
-
-        Returns
-        -------
-        :class:`elm.web.document.Document`
-            Document instance containing text, if the fetch was
-            successful.
-        """
-        try:
-            doc, raw_content = await self._fetch_doc_with_url_in_metadata(url)
-        except KeyboardInterrupt:
-            raise
-        except Exception as e:
-            msg = ("Encountered error of type %r while fetching document from "
-                   "%s:")
-            err_type = type(e)
-            logger.exception(msg, err_type, url)
-            return HTMLDocument(pages=[])
-
-        doc = await self._cache_doc(doc, raw_content)
-        return doc
-
-    async def _fetch_doc_with_url_in_metadata(self, url):
-        """Fetch doc contents and add URL to metadata"""
-        doc, raw_content = await self._fetch_doc(url)
-        doc.attrs["source"] = url
-        return doc, raw_content
-
     async def _fetch_doc(self, url):
         """Fetch a doc by trying pdf read, then HTML read, then PDF OCR"""
 
@@ -302,15 +393,110 @@ async def _try_load_doc_from_response_text(self, raw_content, charset):
 
         return await self.html_read_coroutine(text, **self.html_read_kwargs)
 
-    async def _cache_doc(self, doc, raw_content):
-        """Cache doc if user provided a coroutine"""
-        if doc.empty or not raw_content:
-            return doc
 
-        if not self.file_cache_coroutine:
-            return doc
+class AsyncLocalFileLoader(BaseAsyncFileLoader):
+    """Async local file (PDF or HTML) loader"""
 
-        cache_fn = await self.file_cache_coroutine(doc, raw_content)
-        if cache_fn is not None:
-            doc.attrs["cache_fn"] = cache_fn
-        return doc
+    def __init__(
+        self,
+        pdf_read_kwargs=None,
+        html_read_kwargs=None,
+        pdf_read_coroutine=None,
+        html_read_coroutine=None,
+        pdf_ocr_read_coroutine=None,
+        file_cache_coroutine=None,
+        doc_attrs=None,
+        **__,  # consume any extra kwargs
+    ):
+        """
+
+        Parameters
+        ----------
+        pdf_read_kwargs : dict, optional
+            Keyword-value argument pairs to pass to the
+            `pdf_read_coroutine`. By default, ``None``.
+        html_read_kwargs : dict, optional
+            Keyword-value argument pairs to pass to the
+            `html_read_coroutine`. By default, ``None``.
+        pdf_read_coroutine : callable, optional
+            PDF file read coroutine. Must be an async function. Should
+            accept a PDF filepath as the first argument and kwargs as
+            the rest. Must return a :obj:`elm.web.document.PDFDocument`
+            along with the raw PDF bytes (for caching purposes).
+            If ``None``, a default function that runs in the main thread
+            is used. By default, ``None``.
+        html_read_coroutine : callable, optional
+            HTML file read coroutine. Must be an async function. Should
+            accept an HTML filepath as the first argument and kwargs as
+            the rest. Must return a :obj:`elm.web.document.HTMLDocument`
+            along with the raw text (for caching purposes).
+            If ``None``, a default function that runs in the main thread
+            is used. By default, ``None``.
+        pdf_ocr_read_coroutine : callable, optional
+            PDF OCR file read coroutine. Must be an async function.
+            Should accept a PDF filepath as the first argument and
+            kwargs as the rest. Must return a
+            :obj:`elm.web.document.PDFDocument` along with the raw PDF
+            bytes (for caching purposes).
+            If ``None``, PDF OCR parsing is not attempted, and any
+            scanned PDF files will return a blank document.
+            By default, ``None``.
+        file_cache_coroutine : callable, optional
+            File caching coroutine. Can be used to cache files
+            loaded by this class. Must accept an
+            :obj:`~elm.web.document.Document` instance as the first
+            argument and the file content to be written as the second
+            argument. If this method is not provided, no document
+            caching is performed. By default, ``None``.
+        doc_attrs : dict, optional
+            Additional document attributes to add to each loaded
+            document. By default, ``None``.
+        """
+        super().__init__(
+            pdf_read_coroutine=pdf_read_coroutine or _read_pdf_file,
+            html_read_coroutine=html_read_coroutine or _read_html_file,
+            pdf_read_kwargs=pdf_read_kwargs,
+            html_read_kwargs=html_read_kwargs,
+            pdf_ocr_read_coroutine=pdf_ocr_read_coroutine,
+            file_cache_coroutine=file_cache_coroutine
+        )
+        self.doc_attrs = doc_attrs or {}
+
+    async def _fetch_doc(self, source):
+        """Load a doc by reading file based on extension"""
+        fp = Path(source)
+        if fp.suffix.lower() == ".pdf":
+            logger.debug("Trying to read PDF file: %r", source)
+            doc, raw = await self.pdf_read_coroutine(fp,
+                                                     **self.pdf_read_kwargs)
+            if not doc.empty:
+                return doc, raw
+            elif self.pdf_ocr_read_coroutine:
+                logger.debug("PDF read failed; fetching OCR content from %r",
+                             source)
+                doc, raw = await self.pdf_ocr_read_coroutine(
+                    fp, **self.pdf_read_kwargs)
+                if not doc.empty:
+                    return doc, raw
+
+        if fp.suffix.lower() in {".txt", ".html", ".htm"}:
+            logger.debug("Trying to read HTML file: %r", source)
+            doc, raw = await self.html_read_coroutine(fp,
+                                                      **self.html_read_kwargs)
+            if not doc.empty:
+                return doc, raw
+
+        logger.error("Failed to read file: %r", source)
+        return PDFDocument(pages=[]), None
+
+    async def _fetch_doc_with_url_in_metadata(self, source):
+        """Fetch doc contents and add source to metadata"""
+        doc, raw_content = await self._fetch_doc(source)
+        for key, value in self.doc_attrs.items():
+            doc.attrs[key] = value
+        doc.attrs["source_fp"] = source
+        return doc, raw_content
+
+
+class AsyncFileLoader(AsyncWebFileLoader):
+    """Alias for AsyncWebFileLoader (for backward compatibility)"""
diff --git a/elm/web/search/dux.py b/elm/web/search/dux.py
index 0ebffa1a..4dcdaf4c 100644
--- a/elm/web/search/dux.py
+++ b/elm/web/search/dux.py
@@ -38,7 +38,7 @@ def __init__(self, region="us-en", safesearch="moderate", timelimit=None,
             By default, ``None``.
         page : int, default=1
             The page of results to return. By default, ``1``.
-        backend : str or iter of str, optional
+        backend : str, optional
             Option for DuxDistributedGlobalSearch backend:
 
                 - auto: Randomly select 3 search engines to use
@@ -52,8 +52,8 @@ def __init__(self, region="us-en", safesearch="moderate", timelimit=None,
                 - yandex: Yandex
                 - duckduckgo: Duckduckgo
 
-            Can also be a list or tuple of a combination of these.
-            By default, ``("google", "bing", "yahoo", "duckduckgo")``.
+            Can also be a comma-separated combination of these.
+            By default, ``"all"``.
         timeout : int, optional
             Timeout for HTTP requests, in seconds. By default, ``10``.
         verify : bool, optional
@@ -81,4 +81,3 @@ async def _search(self, query, num_results=10):
 
         return list(filter(None, (info.get('href', "").replace("+", "%20")
                                   for info in results)))
-
diff --git a/tests/ords/test_integrated.py b/tests/ords/test_integrated.py
index d1c36925..71051427 100644
--- a/tests/ords/test_integrated.py
+++ b/tests/ords/test_integrated.py
@@ -221,7 +221,7 @@ async def test_async_file_loader_with_temp_cache(monkeypatch):
 
     async with RunningAsyncServices([TempFileCache()]):
         loader = AsyncFileLoader(file_cache_coroutine=TempFileCache.call)
-        doc = await loader.fetch(url="Whatcom")
+        doc = await loader.fetch("Whatcom")
         assert doc.text == truth.text
         assert doc.attrs["source"] == "Whatcom"
         cached_fp = doc.attrs["cache_fn"]
diff --git a/tests/web/test_web_file_loader.py b/tests/web/test_web_file_loader.py
index 29bbbf55..cc2f2451 100644
--- a/tests/web/test_web_file_loader.py
+++ b/tests/web/test_web_file_loader.py
@@ -13,7 +13,7 @@ import pdftotext
 
 
 from elm import TEST_DATA_DIR
-from elm.web.file_loader import AsyncFileLoader
+from elm.web.file_loader import AsyncWebFileLoader, AsyncLocalFileLoader
 from elm.web.document import PDFDocument, HTMLDocument
 import elm.web.html_pw
 
@@ -67,8 +67,8 @@ async def test_async_file_loader_basic_pdf(monkeypatch):
         raising=True,
     )
 
-    loader = AsyncFileLoader()
-    doc = await loader.fetch(url="gpt-4")
+    loader = AsyncWebFileLoader()
+    doc = await loader.fetch("gpt-4")
 
     with open(GPT4_DOC_PATH, "rb") as fh:
         pdf = pdftotext.PDF(fh, physical=True)
@@ -97,8 +97,8 @@ async def test_async_file_loader_basic_html(monkeypatch):
         raising=True,
     )
 
-    loader = AsyncFileLoader()
-    doc = await loader.fetch(url="Whatcom")
+    loader = AsyncWebFileLoader()
+    doc = await loader.fetch("Whatcom")
 
     with open(WHATCOM_DOC_PATH, "r", encoding="utf-8") as fh:
         content = fh.read()
@@ -112,7 +112,7 @@ async def test_async_file_loader_basic_html(monkeypatch):
 
 @pytest.mark.asyncio
 async def test_async_file_loader_fetch_all(monkeypatch, tmp_path):
-    """Test `AsyncFileLoader.fetch_all` function for basic docs"""
+    """Test `AsyncWebFileLoader.fetch_all` function for basic docs"""
 
     monkeypatch.setattr(
         aiohttp.ClientSession,
@@ -146,7 +146,7 @@ async def _cache_file(doc, content):
     assert not list(tmp_path.glob("*"))
 
     with ThreadPoolExecutor() as pool:
-        loader = AsyncFileLoader(file_cache_coroutine=_cache_file)
+        loader = AsyncWebFileLoader(file_cache_coroutine=_cache_file)
         docs = await loader.fetch_all("gpt-4", "Whatcom")
 
     assert len(docs) == 2
@@ -178,5 +178,41 @@ async def _cache_file(doc, content):
         assert truth_html.text == fh.read()
 
 
+@pytest.mark.asyncio
+async def test_async_local_file_loader_basic_html():
+    """Test `AsyncLocalFileLoader` for a basic HTML doc"""
+
+    loader = AsyncLocalFileLoader(doc_attrs={"test_1": 1})
+    doc = await loader.fetch(WHATCOM_DOC_PATH)
+
+    with open(WHATCOM_DOC_PATH, "r", encoding="utf-8") as fh:
+        content = fh.read()
+
+    truth = HTMLDocument([content])
+
+    assert doc.text == truth.text
+    assert doc.attrs["source_fp"] == WHATCOM_DOC_PATH
+    assert doc.attrs["test_1"] == 1
+    assert "cache_fn" not in doc.attrs
+
+
+@pytest.mark.asyncio
+async def test_async_local_file_loader_basic_pdf():
+    """Test `AsyncLocalFileLoader` for a basic PDF doc"""
+
+    loader = AsyncLocalFileLoader()
+    doc = await loader.fetch(GPT4_DOC_PATH)
+
+    with open(GPT4_DOC_PATH, "rb") as fh:
+        pdf = pdftotext.PDF(fh, physical=True)
+
+    truth = PDFDocument(pdf)
+
+    assert doc.text == truth.text
+    assert doc.attrs["source_fp"] == GPT4_DOC_PATH
+    assert "test_1" not in doc.attrs
+    assert "cache_fn" not in doc.attrs
+
+
 if __name__ == "__main__":
     pytest.main(["-q", "--show-capture=all", Path(__file__), "-rapP"])