From 5f1df96d0ea4c3c1943c6d3075aa698ef7289e64 Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Mon, 27 Oct 2025 18:13:49 -0600
Subject: [PATCH 01/12] Fix docstring

---
 elm/web/search/dux.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/elm/web/search/dux.py b/elm/web/search/dux.py
index 0ebffa1a..a9c77dbe 100644
--- a/elm/web/search/dux.py
+++ b/elm/web/search/dux.py
@@ -38,7 +38,7 @@ def __init__(self, region="us-en", safesearch="moderate", timelimit=None,
             By default, ``None``.
         page : int, default=1
             The page of results to return. By default, ``1``.
-        backend : str or iter of str, optional
+        backend : str, optional
             Option for DuxDistributedGlobalSearch backend:
 
                 - auto: Randomly select 3 search engines to use
@@ -52,8 +52,8 @@ def __init__(self, region="us-en", safesearch="moderate", timelimit=None,
                 - yandex: Yandex
                 - duckduckgo: Duckduckgo
 
-            Can also be a list or tuple of a combination of these.
-            By default, ``("google", "bing", "yahoo", "duckduckgo")``.
+            Can also be a comma-separated combination of these.
+            By default, ``"all``.
         timeout : int, optional
             Timeout for HTTP requests, in seconds. By default, ``10``.
         verify : bool, optional

From c3707fae37d1bf497572ec4015a584ee5be2b31a Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Mon, 27 Oct 2025 18:14:01 -0600
Subject: [PATCH 02/12] Bump version

---
 elm/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/elm/version.py b/elm/version.py
index 6f80db09..b7605d28 100644
--- a/elm/version.py
+++ b/elm/version.py
@@ -2,4 +2,4 @@
 ELM version number
 """
 
-__version__ = "0.0.31"
+__version__ = "0.0.32"

From 9924744daecc1e951cf4b753cb32e3629c3ea9c1 Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Mon, 27 Oct 2025 19:31:43 -0600
Subject: [PATCH 03/12] Refactor to `BaseAsyncFileLoader`

---
 elm/web/file_loader.py            | 215 ++++++++++++++++++++----------
 tests/web/test_web_file_loader.py |   4 +-
 2 files changed, 145 insertions(+), 74 deletions(-)

diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py
index 91c8a47a..0a9096d7 100644
--- a/elm/web/file_loader.py
+++ b/elm/web/file_loader.py
@@ -2,6 +2,7 @@
 """ELM Web file loader class."""
 import asyncio
 import logging
+from abc import ABC, abstractmethod
 
 import aiohttp
 from fake_useragent import UserAgent
@@ -29,7 +30,138 @@ async def _read_html_doc(text, **kwargs):
     return HTMLDocument([text], **kwargs)
 
 
-class AsyncFileLoader:
+class BaseAsyncFileLoader(ABC):
+    """Base class for async file loading"""
+
+    def __init__(
+        self,
+        pdf_read_kwargs=None,
+        html_read_kwargs=None,
+        pdf_read_coroutine=None,
+        html_read_coroutine=None,
+        pdf_ocr_read_coroutine=None,
+        file_cache_coroutine=None,
+        **__,  # consume any extra kwargs
+    ):
+        """
+
+        Parameters
+        ----------
+        pdf_read_kwargs : dict, optional
+            Keyword-value argument pairs to pass to the
+            `pdf_read_coroutine`. By default, ``None``.
+        html_read_kwargs : dict, optional
+            Keyword-value argument pairs to pass to the
+            `html_read_coroutine`. By default, ``None``.
+        pdf_read_coroutine : callable, optional
+            PDF file read coroutine. Must by an async function. Should
+            accept PDF bytes as the first argument and kwargs as the
+            rest. Must return a :obj:`elm.web.document.PDFDocument`.
+            If ``None``, a default function that runs in the main thread
+            is used. By default, ``None``.
+        html_read_coroutine : callable, optional
+            HTML file read coroutine. Must by an async function. Should
+            accept HTML text as the first argument and kwargs as the
+            rest. Must return a :obj:`elm.web.document.HTMLDocument`.
+            If ``None``, a default function that runs in the main thread
+            is used. By default, ``None``.
+        pdf_ocr_read_coroutine : callable, optional
+            PDF OCR file read coroutine. Must by an async function.
+            Should accept PDF bytes as the first argument and kwargs as
+            the rest. Must return a :obj:`elm.web.document.PDFDocument`.
+            If ``None``, PDF OCR parsing is not attempted, and any
+            scanned PDF URL's will return a blank document.
+            By default, ``None``.
+        file_cache_coroutine : callable, optional
+            File caching coroutine. Can be used to cache files
+            downloaded by this class. Must accept an
+            :obj:`~elm.web.document.Document` instance as the first
+            argument and the file content to be written as the second
+            argument. If this method is not provided, no document
+            caching is performed. By default, ``None``.
+        """
+        self.pdf_read_kwargs = pdf_read_kwargs or {}
+        self.html_read_kwargs = html_read_kwargs or {}
+        self.pdf_read_coroutine = pdf_read_coroutine or _read_pdf_doc
+        self.html_read_coroutine = html_read_coroutine or _read_html_doc
+        self.pdf_ocr_read_coroutine = pdf_ocr_read_coroutine
+        self.file_cache_coroutine = file_cache_coroutine
+
+    async def fetch_all(self, *sources):
+        """Fetch documents for all requested sources.
+
+        Parameters
+        ----------
+        *sources
+            Iterable of sources (as strings) used to fetch the
+            documents.
+
+        Returns
+        -------
+        list
+            List of documents, one per requested source.
+        """
+        outer_task_name = asyncio.current_task().get_name()
+        fetches = [
+            asyncio.create_task(self.fetch(source), name=outer_task_name)
+            for source in sources
+        ]
+        return await asyncio.gather(*fetches)
+
+    async def fetch(self, source):
+        """Fetch a document for the given source.
+
+        Parameters
+        ----------
+        source : str
+            Source used to load the document.
+
+        Returns
+        -------
+        :class:`elm.web.document.Document`
+            Document instance containing text, if the load was
+            successful.
+        """
+        try:
+            doc, raw = await self._fetch_doc_with_url_in_metadata(source)
+        except KeyboardInterrupt:
+            raise
+        except Exception as e:
+            msg = ("Encountered error of type %r while fetching document from "
+                   "%s:")
+            err_type = type(e)
+            logger.exception(msg, err_type, source)
+            return HTMLDocument(pages=[])
+
+        doc = await self._cache_doc(doc, raw)
+        return doc
+
+    async def _fetch_doc_with_url_in_metadata(self, source):
+        """Fetch doc contents and add source to metadata"""
+        doc, raw_content = await self._fetch_doc(source)
+        doc.attrs["source"] = source
+        return doc, raw_content
+
+    async def _cache_doc(self, doc, raw_content):
+        """Cache doc if user provided a coroutine"""
+        if doc.empty or not raw_content:
+            return doc
+
+        if not self.file_cache_coroutine:
+            return doc
+
+        cache_fn = await self.file_cache_coroutine(doc, raw_content)
+        if cache_fn is not None:
+            doc.attrs["cache_fn"] = cache_fn
+        return doc
+
+    @abstractmethod
+    async def _fetch_doc(self, source):
+        """Fetch documents given a source"""
+        raise NotImplementedError
+
+
+class AsyncWebFileLoader(BaseAsyncFileLoader):
     """Async web file (PDF or HTML) loader
 
     Purpose:
@@ -132,18 +264,20 @@ def __init__(
             of attempts will always be 2, even if the user provides a
             value smaller than this. By default, ``3``.
         """
+
+        super().__init__(pdf_read_kwargs=pdf_read_kwargs,
+                         html_read_kwargs=html_read_kwargs,
+                         pdf_read_coroutine=pdf_read_coroutine,
+                         html_read_coroutine=html_read_coroutine,
+                         pdf_ocr_read_coroutine=pdf_ocr_read_coroutine,
+                         file_cache_coroutine=file_cache_coroutine)
+
         self.pw_launch_kwargs = pw_launch_kwargs or {}
-        self.pdf_read_kwargs = pdf_read_kwargs or {}
-        self.html_read_kwargs = html_read_kwargs or {}
         self.get_kwargs = {
             "headers": self._header_from_template(header_template),
             "ssl": None if verify_ssl else False,
             **(aget_kwargs or {}),
         }
-        self.pdf_read_coroutine = pdf_read_coroutine or _read_pdf_doc
-        self.html_read_coroutine = html_read_coroutine or _read_html_doc
-        self.pdf_ocr_read_coroutine = pdf_ocr_read_coroutine
-        self.file_cache_coroutine = file_cache_coroutine
         self.browser_semaphore = browser_semaphore
         self.uss = use_scrapling_stealth
         self.num_pw_html_retries = num_pw_html_retries
@@ -156,60 +290,6 @@ def _header_from_template(self, header_template):
             headers["User-Agent"] = UserAgent().random
         return headers
 
-    async def fetch_all(self, *urls):
-        """Fetch documents for all requested URL's.
-
-        Parameters
-        ----------
-        *urls
-            Iterable of URL's (as strings) to fetch.
-
-        Returns
-        -------
-        list
-            List of documents, one per requested URL.
-        """
-        outer_task_name = asyncio.current_task().get_name()
-        fetches = [
-            asyncio.create_task(self.fetch(url), name=outer_task_name)
-            for url in urls
-        ]
-        return await asyncio.gather(*fetches)
-
-    async def fetch(self, url):
-        """Fetch a document for the given URL.
-
-        Parameters
-        ----------
-        url : str
-            URL for the document to pull down.
-
-        Returns
-        -------
-        :class:`elm.web.document.Document`
-            Document instance containing text, if the fetch was
-            successful.
-        """
-        try:
-            doc, raw_content = await self._fetch_doc_with_url_in_metadata(url)
-        except KeyboardInterrupt:
-            raise
-        except Exception as e:
-            msg = ("Encountered error of type %r while fetching document from "
-                   "%s:")
-            err_type = type(e)
-            logger.exception(msg, err_type, url)
-            return HTMLDocument(pages=[])
-
-        doc = await self._cache_doc(doc, raw_content)
-        return doc
-
-    async def _fetch_doc_with_url_in_metadata(self, url):
-        """Fetch doc contents and add URL to metadata"""
-        doc, raw_content = await self._fetch_doc(url)
-        doc.attrs["source"] = url
-        return doc, raw_content
-
     async def _fetch_doc(self, url):
         """Fetch a doc by trying pdf read, then HTML read, then PDF OCR"""
 
@@ -302,15 +382,6 @@ async def _try_load_doc_from_response_text(self, raw_content, charset):
 
         return await self.html_read_coroutine(text, **self.html_read_kwargs)
 
-    async def _cache_doc(self, doc, raw_content):
-        """Cache doc if user provided a coroutine"""
-        if doc.empty or not raw_content:
-            return doc
-
-        if not self.file_cache_coroutine:
-            return doc
 
-        cache_fn = await self.file_cache_coroutine(doc, raw_content)
-        if cache_fn is not None:
-            doc.attrs["cache_fn"] = cache_fn
-        return doc
+class AsyncFileLoader(AsyncWebFileLoader):
+    """Alias for AsyncWebFileLoader (for backward compatibility)"""
diff --git a/tests/web/test_web_file_loader.py b/tests/web/test_web_file_loader.py
index 29bbbf55..cc2bf3ca 100644
--- a/tests/web/test_web_file_loader.py
+++ b/tests/web/test_web_file_loader.py
@@ -68,7 +68,7 @@ async def test_async_file_loader_basic_pdf(monkeypatch):
     )
 
     loader = AsyncFileLoader()
-    doc = await loader.fetch(url="gpt-4")
+    doc = await loader.fetch("gpt-4")
 
     with open(GPT4_DOC_PATH, "rb") as fh:
         pdf = pdftotext.PDF(fh, physical=True)
@@ -98,7 +98,7 @@ async def test_async_file_loader_basic_html(monkeypatch):
     )
 
     loader = AsyncFileLoader()
-    doc = await loader.fetch(url="Whatcom")
+    doc = await loader.fetch("Whatcom")
 
     with open(WHATCOM_DOC_PATH, "r", encoding="utf-8") as fh:
         content = fh.read()

From 8d6a2f6ebb0248da928d03aba6b3090d1bf8e47b Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Mon, 27 Oct 2025 20:23:14 -0600
Subject: [PATCH 04/12] Re-order args

---
 elm/web/file_loader.py | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py
index 0a9096d7..b32872e5 100644
--- a/elm/web/file_loader.py
+++ b/elm/web/file_loader.py
@@ -35,10 +35,10 @@ class BaseAsyncFileLoader(ABC):
 
     def __init__(
         self,
+        pdf_read_coroutine,
+        html_read_coroutine,
         pdf_read_kwargs=None,
         html_read_kwargs=None,
-        pdf_read_coroutine=None,
-        html_read_coroutine=None,
         pdf_ocr_read_coroutine=None,
         file_cache_coroutine=None,
         **__,  # consume any extra kwargs
@@ -47,24 +47,18 @@ def __init__(
 
         Parameters
         ----------
+        pdf_read_coroutine : callable
+            PDF file read coroutine. Must be an async function.
+            Must return a :obj:`elm.web.document.PDFDocument`.
+        html_read_coroutine : callable
+            HTML file read coroutine. Must be an async function.
+            Must return a :obj:`elm.web.document.HTMLDocument`.
         pdf_read_kwargs : dict, optional
             Keyword-value argument pairs to pass to the
             `pdf_read_coroutine`. By default, ``None``.
         html_read_kwargs : dict, optional
             Keyword-value argument pairs to pass to the
             `html_read_coroutine`. By default, ``None``.
-        pdf_read_coroutine : callable, optional
-            PDF file read coroutine. Must by an async function. Should
-            accept PDF bytes as the first argument and kwargs as the
-            rest. Must return a :obj:`elm.web.document.PDFDocument`.
-            If ``None``, a default function that runs in the main thread
-            is used. By default, ``None``.
-        html_read_coroutine : callable, optional
-            HTML file read coroutine. Must by an async function. Should
-            accept HTML text as the first argument and kwargs as the
-            rest. Must return a :obj:`elm.web.document.HTMLDocument`.
-            If ``None``, a default function that runs in the main thread
-            is used. By default, ``None``.
         pdf_ocr_read_coroutine : callable, optional
             PDF OCR file read coroutine. Must by an async function.
             Should accept PDF bytes as the first argument and kwargs as

From deb142aed76a1aed06a89079ea251e76140ef533 Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Mon, 27 Oct 2025 20:23:25 -0600
Subject: [PATCH 05/12] No more defaults

---
 elm/web/file_loader.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py
index b32872e5..1bc98597 100644
--- a/elm/web/file_loader.py
+++ b/elm/web/file_loader.py
@@ -74,10 +74,10 @@ def __init__(
             argument. If this method is not provided, no document
             caching is performed. By default, ``None``.
         """
+        self.pdf_read_coroutine = pdf_read_coroutine
+        self.html_read_coroutine = html_read_coroutine
         self.pdf_read_kwargs = pdf_read_kwargs or {}
         self.html_read_kwargs = html_read_kwargs or {}
-        self.pdf_read_coroutine = pdf_read_coroutine or _read_pdf_doc
-        self.html_read_coroutine = html_read_coroutine or _read_html_doc
         self.pdf_ocr_read_coroutine = pdf_ocr_read_coroutine
         self.file_cache_coroutine = file_cache_coroutine
 

From b1f7a98d993efc5b382c047386914bac5b382f26 Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Mon, 27 Oct 2025 20:23:40 -0600
Subject: [PATCH 06/12] Add file read methods

---
 elm/web/file_loader.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py
index 1bc98597..593e56dc 100644
--- a/elm/web/file_loader.py
+++ b/elm/web/file_loader.py
@@ -30,6 +30,23 @@ async def _read_html_doc(text, **kwargs):
     return HTMLDocument([text], **kwargs)
 
 
+async def _read_pdf_file(pdf_fp, **kwargs):
+    """Default read PDF file function (runs in main thread)"""
+    verbose = kwargs.pop("verbose", True)
+    with open(pdf_fp, "rb") as fh:
+        pdf_bytes = fh.read()
+    pages = read_pdf(pdf_bytes, verbose=verbose)
+    return PDFDocument(pages, **kwargs), pdf_bytes
+
+
+async def _read_html_file(html_fp, **kwargs):
+    """Default HTML file reader (main thread); returns doc and raw text"""
+    with open(html_fp, "r", encoding="utf-8") as fh:
+        text = fh.read()
+    return HTMLDocument([text], **kwargs), text
+
+
+
 class BaseAsyncFileLoader(ABC):
     """Base class for async file loading"""
 

From 305c5b54849b0c2197b373ce1a8652e579232456 Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Mon, 27 Oct 2025 20:23:49 -0600
Subject: [PATCH 07/12] Re-order inputs

---
 elm/web/file_loader.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py
index 593e56dc..38ca2c78 100644
--- a/elm/web/file_loader.py
+++ b/elm/web/file_loader.py
@@ -275,13 +275,14 @@ def __init__(
             of attempts will always be 2, even if the user provides a
             value smaller than this. By default, ``3``.
         """
-
-        super().__init__(pdf_read_kwargs=pdf_read_kwargs,
-                         html_read_kwargs=html_read_kwargs,
-                         pdf_read_coroutine=pdf_read_coroutine,
-                         html_read_coroutine=html_read_coroutine,
-                         pdf_ocr_read_coroutine=pdf_ocr_read_coroutine,
-                         file_cache_coroutine=file_cache_coroutine)
+        super().__init__(
+            pdf_read_coroutine=pdf_read_coroutine or _read_pdf_doc,
+            html_read_coroutine=html_read_coroutine or _read_html_doc,
+            pdf_read_kwargs=pdf_read_kwargs,
+            html_read_kwargs=html_read_kwargs,
+            pdf_ocr_read_coroutine=pdf_ocr_read_coroutine,
+            file_cache_coroutine=file_cache_coroutine
+        )
 
         self.pw_launch_kwargs = pw_launch_kwargs or {}
         self.get_kwargs = {

From cef430cb8988f7b3327db0e780c235dce965890b Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Mon, 27 Oct 2025 20:24:03 -0600
Subject: [PATCH 08/12] Add `AsyncLocalFileLoader`

---
 elm/web/file_loader.py | 104 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)

diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py
index 38ca2c78..10056dd7 100644
--- a/elm/web/file_loader.py
+++ b/elm/web/file_loader.py
@@ -2,6 +2,7 @@
 """ELM Web file loader class."""
 import asyncio
 import logging
+from pathlib import Path
 from abc import ABC, abstractmethod
 
 import aiohttp
@@ -395,5 +396,108 @@ async def _try_load_doc_from_response_text(self, raw_content, charset):
         return await self.html_read_coroutine(text, **self.html_read_kwargs)
 
 
+class AsyncLocalFileLoader(BaseAsyncFileLoader):
+    """Async local file (PDF or HTML) loader"""
+
+    def __init__(
+        self,
+        pdf_read_kwargs=None,
+        html_read_kwargs=None,
+        pdf_read_coroutine=None,
+        html_read_coroutine=None,
+        pdf_ocr_read_coroutine=None,
+        file_cache_coroutine=None,
+        doc_attrs=None,
+        **__,  # consume any extra kwargs
+    ):
+        """
+
+        Parameters
+        ----------
+        pdf_read_kwargs : dict, optional
+            Keyword-value argument pairs to pass to the
+            `pdf_read_coroutine`. By default, ``None``.
+        html_read_kwargs : dict, optional
+            Keyword-value argument pairs to pass to the
+            `html_read_coroutine`. By default, ``None``.
+        pdf_read_coroutine : callable, optional
+            PDF file read coroutine. Must be an async function. Should
+            accept a PDF filepath as the first argument and kwargs as
+            the rest. Must return a :obj:`elm.web.document.PDFDocument`
+            along with the raw PDF bytes (for caching purposes).
+            If ``None``, a default function that runs in the main thread
+            is used. By default, ``None``.
+        html_read_coroutine : callable, optional
+            HTML file read coroutine. Must be an async function. Should
+            accept an HTML filepath as the first argument and kwargs as
+            the rest. Must return a :obj:`elm.web.document.HTMLDocument`
+            along with the raw text (for caching purposes).
+            If ``None``, a default function that runs in the main thread
+            is used. By default, ``None``.
+        pdf_ocr_read_coroutine : callable, optional
+            PDF OCR file read coroutine. Must be an async function.
+            Should accept a PDF filepath as the first argument and
+            kwargs as the rest. Must return a
+            :obj:`elm.web.document.PDFDocument` along with the raw PDF
+            bytes (for caching purposes).
+            If ``None``, PDF OCR parsing is not attempted, and any
+            scanned PDF files will return a blank document.
+            By default, ``None``.
+        file_cache_coroutine : callable, optional
+            File caching coroutine. Can be used to cache files
+            loaded by this class. Must accept an
+            :obj:`~elm.web.document.Document` instance as the first
+            argument and the file content to be written as the second
+            argument. If this method is not provided, no document
+            caching is performed. By default, ``None``.
+        doc_attrs : dict, optional
+            Additional document attributes to add to each loaded
+            document. By default, ``None``.
+        """
+        super().__init__(
+            pdf_read_coroutine=pdf_read_coroutine or _read_pdf_file,
+            html_read_coroutine=html_read_coroutine or _read_html_file,
+            pdf_read_kwargs=pdf_read_kwargs,
+            html_read_kwargs=html_read_kwargs,
+            pdf_ocr_read_coroutine=pdf_ocr_read_coroutine,
+            file_cache_coroutine=file_cache_coroutine
+        )
+        self.doc_attrs = doc_attrs or {}
+
+    async def _fetch_doc(self, source):
+        """Load a doc by reading file base don extension"""
+        fp = Path(source)
+        if fp.suffix.lower() == ".pdf":
+            logger.debug("Trying to read PDF file: %r", source)
+            doc, raw = await self.pdf_read_coroutine(fp,
+                                                     **self.pdf_read_kwargs)
+            if not doc.empty:
+                return doc, raw
+            elif self.pdf_ocr_read_coroutine:
+                logger.debug("PDF read failed; fetching OCR content from %r",
+                             source)
+                doc, raw = await self.pdf_ocr_read_coroutine(
+                    fp, **self.pdf_read_kwargs)
+                if not doc.empty:
+                    return doc, raw
+
+        if fp.suffix.lower() == ".txt":
+            logger.debug("Trying to read HTML file: %r", source)
+            doc = await self.html_read_coroutine(fp, **self.html_read_kwargs)
+            if not doc.empty:
+                return doc, raw
+
+        logger.error("Failed to read file file: %r", source)
+        return PDFDocument(pages=[]), None
+
+    async def _fetch_doc_with_url_in_metadata(self, source):
+        """Fetch doc contents and add source to metadata"""
+        doc, raw_content = await self._fetch_doc(source)
+        for key, value in self.doc_attrs.items():
+            doc.attrs[key] = value
+        doc.attrs["source_fp"] = source
+        return doc, raw_content
+
+
 class AsyncFileLoader(AsyncWebFileLoader):
     """Alias for AsyncWebFileLoader (for backward compatibility)"""

From 65a3bdbc4f72b650058df4b67c01a7f276e9fcfb Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Mon, 27 Oct 2025 20:26:50 -0600
Subject: [PATCH 09/12] Fix bug

---
 elm/web/file_loader.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py
index 10056dd7..2ac797a4 100644
--- a/elm/web/file_loader.py
+++ b/elm/web/file_loader.py
@@ -483,7 +483,8 @@ async def _fetch_doc(self, source):
 
         if fp.suffix.lower() == ".txt":
             logger.debug("Trying to read HTML file: %r", source)
-            doc = await self.html_read_coroutine(fp, **self.html_read_kwargs)
+            doc, raw = await self.html_read_coroutine(fp,
+                                                      **self.html_read_kwargs)
             if not doc.empty:
                 return doc, raw
 

From 12a6d269a4ca7886c9d9b366ecd250f2b50d22ef Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Mon, 27 Oct 2025 20:28:24 -0600
Subject: [PATCH 10/12] Add tests for `AsyncLocalFileLoader`

---
 tests/web/test_web_file_loader.py | 46 +++++++++++++++++++++++++++----
 1 file changed, 41 insertions(+), 5 deletions(-)

diff --git a/tests/web/test_web_file_loader.py b/tests/web/test_web_file_loader.py
index cc2bf3ca..cc2f2451 100644
--- a/tests/web/test_web_file_loader.py
+++ b/tests/web/test_web_file_loader.py
@@ -13,7 +13,7 @@
 import pdftotext
 
 from elm import TEST_DATA_DIR
-from elm.web.file_loader import AsyncFileLoader
+from elm.web.file_loader import AsyncWebFileLoader, AsyncLocalFileLoader
 from elm.web.document import PDFDocument, HTMLDocument
 import elm.web.html_pw
 
@@ -67,7 +67,7 @@ async def test_async_file_loader_basic_pdf(monkeypatch):
         raising=True,
     )
 
-    loader = AsyncFileLoader()
+    loader = AsyncWebFileLoader()
     doc = await loader.fetch("gpt-4")
 
     with open(GPT4_DOC_PATH, "rb") as fh:
@@ -97,7 +97,7 @@ async def test_async_file_loader_basic_html(monkeypatch):
         raising=True,
     )
 
-    loader = AsyncFileLoader()
+    loader = AsyncWebFileLoader()
     doc = await loader.fetch("Whatcom")
 
     with open(WHATCOM_DOC_PATH, "r", encoding="utf-8") as fh:
@@ -112,7 +112,7 @@ async def test_async_file_loader_basic_html(monkeypatch):
 
 @pytest.mark.asyncio
 async def test_async_file_loader_fetch_all(monkeypatch, tmp_path):
-    """Test `AsyncFileLoader.fetch_all` function for basic docs"""
+    """Test `AsyncWebFileLoader.fetch_all` function for basic docs"""
 
     monkeypatch.setattr(
         aiohttp.ClientSession,
@@ -146,7 +146,7 @@ async def _cache_file(doc, content):
     assert not list(tmp_path.glob("*"))
 
     with ThreadPoolExecutor() as pool:
-        loader = AsyncFileLoader(file_cache_coroutine=_cache_file)
+        loader = AsyncWebFileLoader(file_cache_coroutine=_cache_file)
         docs = await loader.fetch_all("gpt-4", "Whatcom")
 
     assert len(docs) == 2
@@ -178,5 +178,41 @@ async def _cache_file(doc, content):
         assert truth_html.text == fh.read()
 
 
+@pytest.mark.asyncio
+async def test_async_local_file_loader_basic_html():
+    """Test `AsyncLocalFileLoader` for a basic HTML doc"""
+
+    loader = AsyncLocalFileLoader(doc_attrs={"test_1": 1})
+    doc = await loader.fetch(WHATCOM_DOC_PATH)
+
+    with open(WHATCOM_DOC_PATH, "r", encoding="utf-8") as fh:
+        content = fh.read()
+
+    truth = HTMLDocument([content])
+
+    assert doc.text == truth.text
+    assert doc.attrs["source_fp"] == WHATCOM_DOC_PATH
+    assert doc.attrs["test_1"] == 1
+    assert "cache_fn" not in doc.attrs
+
+
+@pytest.mark.asyncio
+async def test_async_local_file_loader_basic_pdf():
+    """Test `AsyncLocalFileLoader` for a basic PDF doc"""
+
+    loader = AsyncLocalFileLoader()
+    doc = await loader.fetch(GPT4_DOC_PATH)
+
+    with open(GPT4_DOC_PATH, "rb") as fh:
+        pdf = pdftotext.PDF(fh, physical=True)
+
+    truth = PDFDocument(pdf)
+
+    assert doc.text == truth.text
+    assert doc.attrs["source_fp"] == GPT4_DOC_PATH
+    assert "test_1" not in doc.attrs
+    assert "cache_fn" not in doc.attrs
+
+
 if __name__ == "__main__":
     pytest.main(["-q", "--show-capture=all", Path(__file__), "-rapP"])

From 33fb90b7cf857fa76d60557aca0daa0f58f405ef Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Mon, 27 Oct 2025 20:36:51 -0600
Subject: [PATCH 11/12] Formatting updates

---
 elm/web/file_loader.py | 6 ++----
 elm/web/search/dux.py  | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py
index 2ac797a4..a3a148b2 100644
--- a/elm/web/file_loader.py
+++ b/elm/web/file_loader.py
@@ -47,7 +47,6 @@ async def _read_html_file(html_fp, **kwargs):
     return HTMLDocument([text], **kwargs), text
 
 
-
 class BaseAsyncFileLoader(ABC):
     """Base class for async file loading"""
 
@@ -284,7 +283,6 @@ def __init__(
             pdf_ocr_read_coroutine=pdf_ocr_read_coroutine,
             file_cache_coroutine=file_cache_coroutine
         )
-
         self.pw_launch_kwargs = pw_launch_kwargs or {}
         self.get_kwargs = {
             "headers": self._header_from_template(header_template),
@@ -465,7 +463,7 @@ def __init__(
         self.doc_attrs = doc_attrs or {}
 
     async def _fetch_doc(self, source):
-        """Load a doc by reading file base don extension"""
+        """Load a doc by reading file based on extension"""
         fp = Path(source)
         if fp.suffix.lower() == ".pdf":
             logger.debug("Trying to read PDF file: %r", source)
@@ -488,7 +486,7 @@ async def _fetch_doc(self, source):
             if not doc.empty:
                 return doc, raw
 
-        logger.error("Failed to read file file: %r", source)
+        logger.error("Failed to read file: %r", source)
         return PDFDocument(pages=[]), None
 
     async def _fetch_doc_with_url_in_metadata(self, source):
diff --git a/elm/web/search/dux.py b/elm/web/search/dux.py
index a9c77dbe..abebbfb9 100644
--- a/elm/web/search/dux.py
+++ b/elm/web/search/dux.py
@@ -53,7 +53,7 @@ def __init__(self, region="us-en", safesearch="moderate", timelimit=None,
                 - duckduckgo: Duckduckgo
 
             Can also be a comma-separated combination of these.
-            By default, ``"all``.
+            By default, ``"all"``.
         timeout : int, optional
             Timeout for HTTP requests, in seconds. By default, ``10``.
         verify : bool, optional

From 4dd4e0ebf5669dc0ae22a830e6a864197cb1cf3b Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Mon, 27 Oct 2025 21:23:35 -0600
Subject: [PATCH 12/12] Fix tests

---
 elm/web/search/dux.py         | 1 -
 tests/ords/test_integrated.py | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/elm/web/search/dux.py b/elm/web/search/dux.py
index abebbfb9..4dcdaf4c 100644
--- a/elm/web/search/dux.py
+++ b/elm/web/search/dux.py
@@ -81,4 +81,3 @@ async def _search(self, query, num_results=10):
 
         return list(filter(None, (info.get('href', "").replace("+", "%20")
                                   for info in results)))
-
diff --git a/tests/ords/test_integrated.py b/tests/ords/test_integrated.py
index d1c36925..71051427 100644
--- a/tests/ords/test_integrated.py
+++ b/tests/ords/test_integrated.py
@@ -221,7 +221,7 @@ async def test_async_file_loader_with_temp_cache(monkeypatch):
 
     async with RunningAsyncServices([TempFileCache()]):
         loader = AsyncFileLoader(file_cache_coroutine=TempFileCache.call)
-        doc = await loader.fetch(url="Whatcom")
+        doc = await loader.fetch("Whatcom")
         assert doc.text == truth.text
         assert doc.attrs["source"] == "Whatcom"
         cached_fp = doc.attrs["cache_fn"]