From 5f1df96d0ea4c3c1943c6d3075aa698ef7289e64 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Mon, 27 Oct 2025 18:13:49 -0600 Subject: [PATCH 01/12] Fix docstring --- elm/web/search/dux.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/elm/web/search/dux.py b/elm/web/search/dux.py index 0ebffa1a..a9c77dbe 100644 --- a/elm/web/search/dux.py +++ b/elm/web/search/dux.py @@ -38,7 +38,7 @@ def __init__(self, region="us-en", safesearch="moderate", timelimit=None, By default, ``None``. page : int, default=1 The page of results to return. By default, ``1``. - backend : str or iter of str, optional + backend : str, optional Option for DuxDistributedGlobalSearch backend: - auto: Randomly select 3 search engines to use @@ -52,8 +52,8 @@ def __init__(self, region="us-en", safesearch="moderate", timelimit=None, - yandex: Yandex - duckduckgo: Duckduckgo - Can also be a list or tuple of a combination of these. - By default, ``("google", "bing", "yahoo", "duckduckgo")``. + Can also be a comma-separated combination of these. + By default, ``"all``. timeout : int, optional Timeout for HTTP requests, in seconds. By default, ``10``. 
verify : bool, optional From c3707fae37d1bf497572ec4015a584ee5be2b31a Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Mon, 27 Oct 2025 18:14:01 -0600 Subject: [PATCH 02/12] Bump version --- elm/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/elm/version.py b/elm/version.py index 6f80db09..b7605d28 100644 --- a/elm/version.py +++ b/elm/version.py @@ -2,4 +2,4 @@ ELM version number """ -__version__ = "0.0.31" +__version__ = "0.0.32" From 9924744daecc1e951cf4b753cb32e3629c3ea9c1 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Mon, 27 Oct 2025 19:31:43 -0600 Subject: [PATCH 03/12] Refactor to `BaseAsyncFileLoader` --- elm/web/file_loader.py | 215 ++++++++++++++++++++---------- tests/web/test_web_file_loader.py | 4 +- 2 files changed, 145 insertions(+), 74 deletions(-) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index 91c8a47a..0a9096d7 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -2,6 +2,7 @@ """ELM Web file loader class.""" import asyncio import logging +from abc import ABC, abstractmethod import aiohttp from fake_useragent import UserAgent @@ -29,7 +30,138 @@ async def _read_html_doc(text, **kwargs): return HTMLDocument([text], **kwargs) -class AsyncFileLoader: +class BaseAsyncFileLoader(ABC): + """Base class for async file loading""" + + def __init__( + self, + pdf_read_kwargs=None, + html_read_kwargs=None, + pdf_read_coroutine=None, + html_read_coroutine=None, + pdf_ocr_read_coroutine=None, + file_cache_coroutine=None, + **__, # consume any extra kwargs + ): + """ + + Parameters + ---------- + pdf_read_kwargs : dict, optional + Keyword-value argument pairs to pass to the + `pdf_read_coroutine`. By default, ``None``. + html_read_kwargs : dict, optional + Keyword-value argument pairs to pass to the + `html_read_coroutine`. By default, ``None``. + pdf_read_coroutine : callable, optional + PDF file read coroutine. Must by an async function. 
Should + accept PDF bytes as the first argument and kwargs as the + rest. Must return a :obj:`elm.web.document.PDFDocument`. + If ``None``, a default function that runs in the main thread + is used. By default, ``None``. + html_read_coroutine : callable, optional + HTML file read coroutine. Must by an async function. Should + accept HTML text as the first argument and kwargs as the + rest. Must return a :obj:`elm.web.document.HTMLDocument`. + If ``None``, a default function that runs in the main thread + is used. By default, ``None``. + pdf_ocr_read_coroutine : callable, optional + PDF OCR file read coroutine. Must by an async function. + Should accept PDF bytes as the first argument and kwargs as + the rest. Must return a :obj:`elm.web.document.PDFDocument`. + If ``None``, PDF OCR parsing is not attempted, and any + scanned PDF URL's will return a blank document. + By default, ``None``. + file_cache_coroutine : callable, optional + File caching coroutine. Can be used to cache files + downloaded by this class. Must accept an + :obj:`~elm.web.document.Document` instance as the first + argument and the file content to be written as the second + argument. If this method is not provided, no document + caching is performed. By default, ``None``. + """ + self.pdf_read_kwargs = pdf_read_kwargs or {} + self.html_read_kwargs = html_read_kwargs or {} + self.pdf_read_coroutine = pdf_read_coroutine or _read_pdf_doc + self.html_read_coroutine = html_read_coroutine or _read_html_doc + self.pdf_ocr_read_coroutine = pdf_ocr_read_coroutine + self.file_cache_coroutine = file_cache_coroutine + + async def fetch_all(self, *sources): + """Fetch documents for all requested sources. + + Parameters + ---------- + *sources + Iterable of sources (as strings) used to fetch the + documents. + + Returns + ------- + list + List of documents, one per requested sources. 
+ """ + outer_task_name = asyncio.current_task().get_name() + fetches = [ + asyncio.create_task(self.fetch(source), name=outer_task_name) + for source in sources + ] + return await asyncio.gather(*fetches) + + async def fetch(self, source): + """Fetch a document for the given source. + + Parameters + ---------- + source : str + Source used to load the document. + + Returns + ------- + :class:`elm.web.document.Document` + Document instance containing text, if the load was + successful. + """ + try: + doc, raw = await self._fetch_doc_with_url_in_metadata(source) + except KeyboardInterrupt: + raise + except Exception as e: + msg = ("Encountered error of type %r while fetching document from " + "%s:") + err_type = type(e) + logger.exception(msg, err_type, source) + return HTMLDocument(pages=[]) + + doc = await self._cache_doc(doc, raw) + return doc + + async def _fetch_doc_with_url_in_metadata(self, source): + """Fetch doc contents and add source to metadata""" + doc, raw_content = await self._fetch_doc(source) + doc.attrs["source"] = source + return doc, raw_content + + async def _cache_doc(self, doc, raw_content): + """Cache doc if user provided a coroutine""" + if doc.empty or not raw_content: + return doc + + if not self.file_cache_coroutine: + return doc + + cache_fn = await self.file_cache_coroutine(doc, raw_content) + if cache_fn is not None: + doc.attrs["cache_fn"] = cache_fn + return doc + + @abstractmethod + async def _fetch_doc(self, source): + """Fetch documents given a source""" + raise NotImplementedError + + +class AsyncWebFileLoader(BaseAsyncFileLoader): """Async web file (PDF or HTML) loader Purpose: @@ -132,18 +264,20 @@ def __init__( of attempts will always be 2, even if the user provides a value smaller than this. By default, ``3``. 
""" + + super().__init__(pdf_read_kwargs=pdf_read_kwargs, + html_read_kwargs=html_read_kwargs, + pdf_read_coroutine=pdf_read_coroutine, + html_read_coroutine=html_read_coroutine, + pdf_ocr_read_coroutine=pdf_ocr_read_coroutine, + file_cache_coroutine=file_cache_coroutine) + self.pw_launch_kwargs = pw_launch_kwargs or {} - self.pdf_read_kwargs = pdf_read_kwargs or {} - self.html_read_kwargs = html_read_kwargs or {} self.get_kwargs = { "headers": self._header_from_template(header_template), "ssl": None if verify_ssl else False, **(aget_kwargs or {}), } - self.pdf_read_coroutine = pdf_read_coroutine or _read_pdf_doc - self.html_read_coroutine = html_read_coroutine or _read_html_doc - self.pdf_ocr_read_coroutine = pdf_ocr_read_coroutine - self.file_cache_coroutine = file_cache_coroutine self.browser_semaphore = browser_semaphore self.uss = use_scrapling_stealth self.num_pw_html_retries = num_pw_html_retries @@ -156,60 +290,6 @@ def _header_from_template(self, header_template): headers["User-Agent"] = UserAgent().random return headers - async def fetch_all(self, *urls): - """Fetch documents for all requested URL's. - - Parameters - ---------- - *urls - Iterable of URL's (as strings) to fetch. - - Returns - ------- - list - List of documents, one per requested URL. - """ - outer_task_name = asyncio.current_task().get_name() - fetches = [ - asyncio.create_task(self.fetch(url), name=outer_task_name) - for url in urls - ] - return await asyncio.gather(*fetches) - - async def fetch(self, url): - """Fetch a document for the given URL. - - Parameters - ---------- - url : str - URL for the document to pull down. - - Returns - ------- - :class:`elm.web.document.Document` - Document instance containing text, if the fetch was - successful. 
- """ - try: - doc, raw_content = await self._fetch_doc_with_url_in_metadata(url) - except KeyboardInterrupt: - raise - except Exception as e: - msg = ("Encountered error of type %r while fetching document from " - "%s:") - err_type = type(e) - logger.exception(msg, err_type, url) - return HTMLDocument(pages=[]) - - doc = await self._cache_doc(doc, raw_content) - return doc - - async def _fetch_doc_with_url_in_metadata(self, url): - """Fetch doc contents and add URL to metadata""" - doc, raw_content = await self._fetch_doc(url) - doc.attrs["source"] = url - return doc, raw_content - async def _fetch_doc(self, url): """Fetch a doc by trying pdf read, then HTML read, then PDF OCR""" @@ -302,15 +382,6 @@ async def _try_load_doc_from_response_text(self, raw_content, charset): return await self.html_read_coroutine(text, **self.html_read_kwargs) - async def _cache_doc(self, doc, raw_content): - """Cache doc if user provided a coroutine""" - if doc.empty or not raw_content: - return doc - - if not self.file_cache_coroutine: - return doc - cache_fn = await self.file_cache_coroutine(doc, raw_content) - if cache_fn is not None: - doc.attrs["cache_fn"] = cache_fn - return doc +class AsyncFileLoader(AsyncWebFileLoader): + """Alias for AsyncWebFileLoader (for backward compatibility)""" diff --git a/tests/web/test_web_file_loader.py b/tests/web/test_web_file_loader.py index 29bbbf55..cc2bf3ca 100644 --- a/tests/web/test_web_file_loader.py +++ b/tests/web/test_web_file_loader.py @@ -68,7 +68,7 @@ async def test_async_file_loader_basic_pdf(monkeypatch): ) loader = AsyncFileLoader() - doc = await loader.fetch(url="gpt-4") + doc = await loader.fetch("gpt-4") with open(GPT4_DOC_PATH, "rb") as fh: pdf = pdftotext.PDF(fh, physical=True) @@ -98,7 +98,7 @@ async def test_async_file_loader_basic_html(monkeypatch): ) loader = AsyncFileLoader() - doc = await loader.fetch(url="Whatcom") + doc = await loader.fetch("Whatcom") with open(WHATCOM_DOC_PATH, "r", encoding="utf-8") as fh: content = 
fh.read() From 8d6a2f6ebb0248da928d03aba6b3090d1bf8e47b Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Mon, 27 Oct 2025 20:23:14 -0600 Subject: [PATCH 04/12] Re-order args --- elm/web/file_loader.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index 0a9096d7..b32872e5 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -35,10 +35,10 @@ class BaseAsyncFileLoader(ABC): def __init__( self, + pdf_read_coroutine, + html_read_coroutine, pdf_read_kwargs=None, html_read_kwargs=None, - pdf_read_coroutine=None, - html_read_coroutine=None, pdf_ocr_read_coroutine=None, file_cache_coroutine=None, **__, # consume any extra kwargs @@ -47,24 +47,18 @@ def __init__( Parameters ---------- + pdf_read_coroutine : callable + PDF file read coroutine. Must be an async function. + Must return a :obj:`elm.web.document.PDFDocument`. + html_read_coroutine : callable + HTML file read coroutine. Must be an async function. + Must return a :obj:`elm.web.document.HTMLDocument`. pdf_read_kwargs : dict, optional Keyword-value argument pairs to pass to the `pdf_read_coroutine`. By default, ``None``. html_read_kwargs : dict, optional Keyword-value argument pairs to pass to the `html_read_coroutine`. By default, ``None``. - pdf_read_coroutine : callable, optional - PDF file read coroutine. Must by an async function. Should - accept PDF bytes as the first argument and kwargs as the - rest. Must return a :obj:`elm.web.document.PDFDocument`. - If ``None``, a default function that runs in the main thread - is used. By default, ``None``. - html_read_coroutine : callable, optional - HTML file read coroutine. Must by an async function. Should - accept HTML text as the first argument and kwargs as the - rest. Must return a :obj:`elm.web.document.HTMLDocument`. - If ``None``, a default function that runs in the main thread - is used. By default, ``None``. 
pdf_ocr_read_coroutine : callable, optional PDF OCR file read coroutine. Must by an async function. Should accept PDF bytes as the first argument and kwargs as From deb142aed76a1aed06a89079ea251e76140ef533 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Mon, 27 Oct 2025 20:23:25 -0600 Subject: [PATCH 05/12] No more defaults --- elm/web/file_loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index b32872e5..1bc98597 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -74,10 +74,10 @@ def __init__( argument. If this method is not provided, no document caching is performed. By default, ``None``. """ + self.pdf_read_coroutine = pdf_read_coroutine + self.html_read_coroutine = html_read_coroutine self.pdf_read_kwargs = pdf_read_kwargs or {} self.html_read_kwargs = html_read_kwargs or {} - self.pdf_read_coroutine = pdf_read_coroutine or _read_pdf_doc - self.html_read_coroutine = html_read_coroutine or _read_html_doc self.pdf_ocr_read_coroutine = pdf_ocr_read_coroutine self.file_cache_coroutine = file_cache_coroutine From b1f7a98d993efc5b382c047386914bac5b382f26 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Mon, 27 Oct 2025 20:23:40 -0600 Subject: [PATCH 06/12] Add file read methods --- elm/web/file_loader.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index 1bc98597..593e56dc 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -30,6 +30,23 @@ async def _read_html_doc(text, **kwargs): return HTMLDocument([text], **kwargs) +async def _read_pdf_file(pdf_fp, **kwargs): + """Default read PDF file function (runs in main thread)""" + verbose = kwargs.pop("verbose", True) + with open(pdf_fp, "rb") as fh: + pdf_bytes = fh.read() + pages = read_pdf(pdf_bytes, verbose=verbose) + return PDFDocument(pages, **kwargs), pdf_bytes + + +async def _read_html_file(html_fp, **kwargs): + """Default read HTML function 
(runs in main thread)""" + with open(html_fp, "r") as fh: + text = fh.read() + return HTMLDocument([text], **kwargs), text + + + class BaseAsyncFileLoader(ABC): """Base class for async file loading""" From 305c5b54849b0c2197b373ce1a8652e579232456 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Mon, 27 Oct 2025 20:23:49 -0600 Subject: [PATCH 07/12] Re-order inputs --- elm/web/file_loader.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index 593e56dc..38ca2c78 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -275,13 +275,14 @@ def __init__( of attempts will always be 2, even if the user provides a value smaller than this. By default, ``3``. """ - - super().__init__(pdf_read_kwargs=pdf_read_kwargs, - html_read_kwargs=html_read_kwargs, - pdf_read_coroutine=pdf_read_coroutine, - html_read_coroutine=html_read_coroutine, - pdf_ocr_read_coroutine=pdf_ocr_read_coroutine, - file_cache_coroutine=file_cache_coroutine) + super().__init__( + pdf_read_coroutine=pdf_read_coroutine or _read_pdf_doc, + html_read_coroutine=html_read_coroutine or _read_html_doc, + pdf_read_kwargs=pdf_read_kwargs, + html_read_kwargs=html_read_kwargs, + pdf_ocr_read_coroutine=pdf_ocr_read_coroutine, + file_cache_coroutine=file_cache_coroutine + ) self.pw_launch_kwargs = pw_launch_kwargs or {} self.get_kwargs = { From cef430cb8988f7b3327db0e780c235dce965890b Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Mon, 27 Oct 2025 20:24:03 -0600 Subject: [PATCH 08/12] Add `AsyncLocalFileLoader` --- elm/web/file_loader.py | 104 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index 38ca2c78..10056dd7 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -2,6 +2,7 @@ """ELM Web file loader class.""" import asyncio import logging +from pathlib import Path from abc import ABC, abstractmethod import aiohttp @@ -395,5 
+396,108 @@ async def _try_load_doc_from_response_text(self, raw_content, charset): return await self.html_read_coroutine(text, **self.html_read_kwargs) +class AsyncLocalFileLoader(BaseAsyncFileLoader): + """Async local file (PDF or HTML) loader""" + + def __init__( + self, + pdf_read_kwargs=None, + html_read_kwargs=None, + pdf_read_coroutine=None, + html_read_coroutine=None, + pdf_ocr_read_coroutine=None, + file_cache_coroutine=None, + doc_attrs=None, + **__, # consume any extra kwargs + ): + """ + + Parameters + ---------- + pdf_read_kwargs : dict, optional + Keyword-value argument pairs to pass to the + `pdf_read_coroutine`. By default, ``None``. + html_read_kwargs : dict, optional + Keyword-value argument pairs to pass to the + `html_read_coroutine`. By default, ``None``. + pdf_read_coroutine : callable, optional + PDF file read coroutine. Must be an async function. Should + accept a PDF filepath as the first argument and kwargs as + the rest. Must return a :obj:`elm.web.document.PDFDocument` + along with the raw PDF bytes (for caching purposes). + If ``None``, a default function that runs in the main thread + is used. By default, ``None``. + html_read_coroutine : callable, optional + HTML file read coroutine. Must be an async function. Should + accept an HTML filepath as the first argument and kwargs as + the rest. Must return a :obj:`elm.web.document.HTMLDocument` + along with the raw text (for caching purposes). + If ``None``, a default function that runs in the main thread + is used. By default, ``None``. + pdf_ocr_read_coroutine : callable, optional + PDF OCR file read coroutine. Must be an async function. + Should accept a PDF filepath as the first argument and + kwargs as the rest. Must return a + :obj:`elm.web.document.PDFDocument` along with the raw PDF + bytes (for caching purposes). + If ``None``, PDF OCR parsing is not attempted, and any + scanned PDF files will return a blank document. + By default, ``None``. 
+ file_cache_coroutine : callable, optional + File caching coroutine. Can be used to cache files + downloaded by this class. Must accept an + :obj:`~elm.web.document.Document` instance as the first + argument and the file content to be written as the second + argument. If this method is not provided, no document + caching is performed. By default, ``None``. + doc_attrs : dict, optional + Additional document attributes to add to each loaded + document. By default, ``None``. + """ + super().__init__( + pdf_read_coroutine=pdf_read_coroutine or _read_pdf_file, + html_read_coroutine=html_read_coroutine or _read_html_file, + pdf_read_kwargs=pdf_read_kwargs, + html_read_kwargs=html_read_kwargs, + pdf_ocr_read_coroutine=pdf_ocr_read_coroutine, + file_cache_coroutine=file_cache_coroutine + ) + self.doc_attrs = doc_attrs or {} + + async def _fetch_doc(self, source): + """Load a doc by reading file base don extension""" + fp = Path(source) + if fp.suffix.lower() == ".pdf": + logger.debug("Trying to read PDF file: %r", source) + doc, raw = await self.pdf_read_coroutine(fp, + **self.pdf_read_kwargs) + if not doc.empty: + return doc, raw + elif self.pdf_ocr_read_coroutine: + logger.debug("PDF read failed; fetching OCR content from %r", + source) + doc, raw = await self.pdf_ocr_read_coroutine( + fp, **self.pdf_read_kwargs) + if not doc.empty: + return doc, raw + + if fp.suffix.lower() == ".txt": + logger.debug("Trying to read HTML file: %r", source) + doc = await self.html_read_coroutine(fp, **self.html_read_kwargs) + if not doc.empty: + return doc, raw + + logger.error("Failed to read file file: %r", source) + return PDFDocument(pages=[]), None + + async def _fetch_doc_with_url_in_metadata(self, source): + """Fetch doc contents and add source to metadata""" + doc, raw_content = await self._fetch_doc(source) + for key, value in self.doc_attrs.items(): + doc.attrs[key] = value + doc.attrs["source_fp"] = source + return doc, raw_content + + class 
AsyncFileLoader(AsyncWebFileLoader): """Alias for AsyncWebFileLoader (for backward compatibility)""" From 65a3bdbc4f72b650058df4b67c01a7f276e9fcfb Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Mon, 27 Oct 2025 20:26:50 -0600 Subject: [PATCH 09/12] Fix bug --- elm/web/file_loader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index 10056dd7..2ac797a4 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -483,7 +483,8 @@ async def _fetch_doc(self, source): if fp.suffix.lower() == ".txt": logger.debug("Trying to read HTML file: %r", source) - doc = await self.html_read_coroutine(fp, **self.html_read_kwargs) + doc, raw = await self.html_read_coroutine(fp, + **self.html_read_kwargs) if not doc.empty: return doc, raw From 12a6d269a4ca7886c9d9b366ecd250f2b50d22ef Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Mon, 27 Oct 2025 20:28:24 -0600 Subject: [PATCH 10/12] Add tests for `AsyncLocalFileLoader` --- tests/web/test_web_file_loader.py | 46 +++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/tests/web/test_web_file_loader.py b/tests/web/test_web_file_loader.py index cc2bf3ca..cc2f2451 100644 --- a/tests/web/test_web_file_loader.py +++ b/tests/web/test_web_file_loader.py @@ -13,7 +13,7 @@ import pdftotext from elm import TEST_DATA_DIR -from elm.web.file_loader import AsyncFileLoader +from elm.web.file_loader import AsyncWebFileLoader, AsyncLocalFileLoader from elm.web.document import PDFDocument, HTMLDocument import elm.web.html_pw @@ -67,7 +67,7 @@ async def test_async_file_loader_basic_pdf(monkeypatch): raising=True, ) - loader = AsyncFileLoader() + loader = AsyncWebFileLoader() doc = await loader.fetch("gpt-4") with open(GPT4_DOC_PATH, "rb") as fh: @@ -97,7 +97,7 @@ async def test_async_file_loader_basic_html(monkeypatch): raising=True, ) - loader = AsyncFileLoader() + loader = AsyncWebFileLoader() doc = await loader.fetch("Whatcom") with 
open(WHATCOM_DOC_PATH, "r", encoding="utf-8") as fh: @@ -112,7 +112,7 @@ async def test_async_file_loader_basic_html(monkeypatch): @pytest.mark.asyncio async def test_async_file_loader_fetch_all(monkeypatch, tmp_path): - """Test `AsyncFileLoader.fetch_all` function for basic docs""" + """Test `AsyncWebFileLoader.fetch_all` function for basic docs""" monkeypatch.setattr( aiohttp.ClientSession, @@ -146,7 +146,7 @@ async def _cache_file(doc, content): assert not list(tmp_path.glob("*")) with ThreadPoolExecutor() as pool: - loader = AsyncFileLoader(file_cache_coroutine=_cache_file) + loader = AsyncWebFileLoader(file_cache_coroutine=_cache_file) docs = await loader.fetch_all("gpt-4", "Whatcom") assert len(docs) == 2 @@ -178,5 +178,41 @@ async def _cache_file(doc, content): assert truth_html.text == fh.read() +@pytest.mark.asyncio +async def test_async_local_file_loader_basic_html(): + """Test `AsyncLocalFileLoader` for a basic HTML doc""" + + loader = AsyncLocalFileLoader(doc_attrs={"test_1": 1}) + doc = await loader.fetch(WHATCOM_DOC_PATH) + + with open(WHATCOM_DOC_PATH, "r", encoding="utf-8") as fh: + content = fh.read() + + truth = HTMLDocument([content]) + + assert doc.text == truth.text + assert doc.attrs["source_fp"] == WHATCOM_DOC_PATH + assert doc.attrs["test_1"] == 1 + assert "cache_fn" not in doc.attrs + + +@pytest.mark.asyncio +async def test_async_local_file_loader_basic_pdf(): + """Test `AsyncLocalFileLoader` for a basic PDF doc""" + + loader = AsyncLocalFileLoader() + doc = await loader.fetch(GPT4_DOC_PATH) + + with open(GPT4_DOC_PATH, "rb") as fh: + pdf = pdftotext.PDF(fh, physical=True) + + truth = PDFDocument(pdf) + + assert doc.text == truth.text + assert doc.attrs["source_fp"] == GPT4_DOC_PATH + assert "test_1" not in doc.attrs + assert "cache_fn" not in doc.attrs + + if __name__ == "__main__": pytest.main(["-q", "--show-capture=all", Path(__file__), "-rapP"]) From 33fb90b7cf857fa76d60557aca0daa0f58f405ef Mon Sep 17 00:00:00 2001 From: ppinchuk Date: 
Mon, 27 Oct 2025 20:36:51 -0600 Subject: [PATCH 11/12] Formatting updates --- elm/web/file_loader.py | 6 ++---- elm/web/search/dux.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index 2ac797a4..a3a148b2 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -47,7 +47,6 @@ async def _read_html_file(html_fp, **kwargs): return HTMLDocument([text], **kwargs), text - class BaseAsyncFileLoader(ABC): """Base class for async file loading""" @@ -284,7 +283,6 @@ def __init__( pdf_ocr_read_coroutine=pdf_ocr_read_coroutine, file_cache_coroutine=file_cache_coroutine ) - self.pw_launch_kwargs = pw_launch_kwargs or {} self.get_kwargs = { "headers": self._header_from_template(header_template), @@ -465,7 +463,7 @@ def __init__( self.doc_attrs = doc_attrs or {} async def _fetch_doc(self, source): - """Load a doc by reading file base don extension""" + """Load a doc by reading file based on extension""" fp = Path(source) if fp.suffix.lower() == ".pdf": logger.debug("Trying to read PDF file: %r", source) @@ -488,7 +486,7 @@ async def _fetch_doc(self, source): if not doc.empty: return doc, raw - logger.error("Failed to read file file: %r", source) + logger.error("Failed to read file: %r", source) return PDFDocument(pages=[]), None async def _fetch_doc_with_url_in_metadata(self, source): diff --git a/elm/web/search/dux.py b/elm/web/search/dux.py index a9c77dbe..abebbfb9 100644 --- a/elm/web/search/dux.py +++ b/elm/web/search/dux.py @@ -53,7 +53,7 @@ def __init__(self, region="us-en", safesearch="moderate", timelimit=None, - duckduckgo: Duckduckgo Can also be a comma-separated combination of these. - By default, ``"all``. + By default, ``"all"``. timeout : int, optional Timeout for HTTP requests, in seconds. By default, ``10``. 
verify : bool, optional From 4dd4e0ebf5669dc0ae22a830e6a864197cb1cf3b Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Mon, 27 Oct 2025 21:23:35 -0600 Subject: [PATCH 12/12] Fix tests --- elm/web/search/dux.py | 1 - tests/ords/test_integrated.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/elm/web/search/dux.py b/elm/web/search/dux.py index abebbfb9..4dcdaf4c 100644 --- a/elm/web/search/dux.py +++ b/elm/web/search/dux.py @@ -81,4 +81,3 @@ async def _search(self, query, num_results=10): return list(filter(None, (info.get('href', "").replace("+", "%20") for info in results))) - diff --git a/tests/ords/test_integrated.py b/tests/ords/test_integrated.py index d1c36925..71051427 100644 --- a/tests/ords/test_integrated.py +++ b/tests/ords/test_integrated.py @@ -221,7 +221,7 @@ async def test_async_file_loader_with_temp_cache(monkeypatch): async with RunningAsyncServices([TempFileCache()]): loader = AsyncFileLoader(file_cache_coroutine=TempFileCache.call) - doc = await loader.fetch(url="Whatcom") + doc = await loader.fetch("Whatcom") assert doc.text == truth.text assert doc.attrs["source"] == "Whatcom" cached_fp = doc.attrs["cache_fn"]