From 84928cba2a7c0c5f38935b0813893aba3a4b16ab Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Thu, 16 Oct 2025 13:11:26 -0600
Subject: [PATCH 01/14] Add new param with default

---
 elm/web/file_loader.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py
index 0659dc88..ef767111 100644
--- a/elm/web/file_loader.py
+++ b/elm/web/file_loader.py
@@ -63,6 +63,7 @@ def __init__(
         file_cache_coroutine=None,
         browser_semaphore=None,
         use_scrapling_stealth=False,
+        num_pw_html_retries=3,
         **__,  # consume any extra kwargs
     ):
         """

From 4ebd1e45c35d61fe611233b7c0abf71692af6fc7 Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Thu, 16 Oct 2025 13:13:39 -0600
Subject: [PATCH 02/14] Add docstring

---
 elm/web/file_loader.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py
index ef767111..d74706f8 100644
--- a/elm/web/file_loader.py
+++ b/elm/web/file_loader.py
@@ -63,7 +63,6 @@ def __init__(
         file_cache_coroutine=None,
         browser_semaphore=None,
         use_scrapling_stealth=False,
-        num_pw_html_retries=3,
         **__,  # consume any extra kwargs
     ):
         """
@@ -124,6 +123,12 @@ def __init__(
             Option to use scrapling stealth scripts instead of
             tf-playwright-stealth. By default, ``False``.
         """
+        num_pw_html_retries : int, default=3
+            Number of attempts to load HTML content. This is useful
+            because the playwright parameters are stochastic, and
+            sometimes a combination of them can fail to load HTML. The
+            default value is likely a good balance between processing
+            attempts and retrieval success. By default, ``3``.
         self.pw_launch_kwargs = pw_launch_kwargs or {}
         self.pdf_read_kwargs = pdf_read_kwargs or {}
         self.html_read_kwargs = html_read_kwargs or {}

From 9840c4951a7f253696133d3cb9a249e402d86b68 Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Thu, 16 Oct 2025 13:14:05 -0600
Subject: [PATCH 03/14] Fix docstring

---
 elm/web/file_loader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py
index d74706f8..77268716 100644
--- a/elm/web/file_loader.py
+++ b/elm/web/file_loader.py
@@ -122,13 +122,13 @@ def __init__(
         use_scrapling_stealth : bool, default=False
             Option to use scrapling stealth scripts instead of
             tf-playwright-stealth. By default, ``False``.
-        """
         num_pw_html_retries : int, default=3
             Number of attempts to load HTML content. This is useful
             because the playwright parameters are stochastic, and
             sometimes a combination of them can fail to load HTML. The
             default value is likely a good balance between processing
             attempts and retrieval success. By default, ``3``.
+        """
         self.pw_launch_kwargs = pw_launch_kwargs or {}
         self.pdf_read_kwargs = pdf_read_kwargs or {}
         self.html_read_kwargs = html_read_kwargs or {}

From 4ac53d0d572b29335241659e81510de03130a947 Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Thu, 16 Oct 2025 13:14:22 -0600
Subject: [PATCH 04/14] Store param as instance attr

---
 elm/web/file_loader.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py
index 77268716..6dbda040 100644
--- a/elm/web/file_loader.py
+++ b/elm/web/file_loader.py
@@ -143,6 +143,7 @@ def __init__(
         self.file_cache_coroutine = file_cache_coroutine
         self.browser_semaphore = browser_semaphore
         self.uss = use_scrapling_stealth
+        self.num_pw_html_retries = num_pw_html_retries
 
     def _header_from_template(self, header_template):
         """Compile header from user or default template"""

From 6f33f3ba25dbf9756dbde2e938e84f01b9991126 Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Thu, 16 Oct 2025 13:15:07 -0600
Subject: [PATCH 05/14] Add method to retry HTML fetch

---
 elm/web/file_loader.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py
index 6dbda040..30f358bf 100644
--- a/elm/web/file_loader.py
+++ b/elm/web/file_loader.py
@@ -240,6 +240,21 @@ async def _fetch_doc(self, url):
 
         return doc, url_bytes
 
+    async def _fetch_html_using_pw_with_retry(self, url):
+        """Fetch HTML content with several retry attempts"""
+        for attempt in range(self.num_pw_html_retries):
+            text = await load_html_with_pw(url, self.browser_semaphore,
+                                        timeout=self.PAGE_LOAD_TIMEOUT,
+                                        use_scrapling_stealth=self.uss,
+                                        **self.pw_launch_kwargs)
+            doc = await self.html_read_coroutine(text, **self.html_read_kwargs)
+            if not doc.empty:
+                return doc
+
+            logger.debug("HTML read failed; retrying %r (attempt %d of %d)",
+                         url, attempt + 1, self.num_pw_html_retries)
+        return doc
+
     @async_retry_with_exponential_backoff(
         base_delay=2,
         exponential_base=1.5,

From 174597c965368a363b0309c2db63512461e1a419 Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Thu, 16 Oct 2025 13:15:25 -0600
Subject: [PATCH 06/14] Use new method

---
 elm/web/file_loader.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py
index 30f358bf..eb7dd25b 100644
--- a/elm/web/file_loader.py
+++ b/elm/web/file_loader.py
@@ -224,11 +224,7 @@ async def _fetch_doc(self, url):
             return doc, url_bytes
 
         logger.debug("PDF read failed; fetching HTML content from %r", url)
-        text = await load_html_with_pw(url, self.browser_semaphore,
-                                       timeout=self.PAGE_LOAD_TIMEOUT,
-                                       use_scrapling_stealth=self.uss,
-                                       **self.pw_launch_kwargs)
-        doc = await self.html_read_coroutine(text, **self.html_read_kwargs)
+        doc = await self._fetch_html_using_pw_with_retry(url)
         if not doc.empty:
             return doc, doc.text
 

From 4db79a08f0e6075e88d770499cd5fae610712a34 Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Thu, 16 Oct 2025 13:15:35 -0600
Subject: [PATCH 07/14] Re-add `num_pw_html_retries` param to `__init__`

---
 elm/web/file_loader.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py
index eb7dd25b..4d82061d 100644
--- a/elm/web/file_loader.py
+++ b/elm/web/file_loader.py
@@ -63,6 +63,7 @@ def __init__(
         file_cache_coroutine=None,
         browser_semaphore=None,
         use_scrapling_stealth=False,
+        num_pw_html_retries=3,
         **__,  # consume any extra kwargs
     ):
         """

From 5b332a108a5e61122313c68c91e00f37fc1e2a3d Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Thu, 16 Oct 2025 13:15:47 -0600
Subject: [PATCH 08/14] Bump version

---
 elm/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/elm/version.py b/elm/version.py
index 480f51ac..9c04c46a 100644
--- a/elm/version.py
+++ b/elm/version.py
@@ -2,4 +2,4 @@
 ELM version number
 """
 
-__version__ = "0.0.27"
+__version__ = "0.0.28"

From b854712a1f94174d14f5d1445b75fc44a5d0158e Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Thu, 16 Oct 2025 13:19:34 -0600
Subject: [PATCH 09/14] Formatting fix

---
 elm/web/file_loader.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py
index 4d82061d..c632e47c 100644
--- a/elm/web/file_loader.py
+++ b/elm/web/file_loader.py
@@ -241,9 +241,9 @@ async def _fetch_html_using_pw_with_retry(self, url):
         """Fetch HTML content with several retry attempts"""
         for attempt in range(self.num_pw_html_retries):
             text = await load_html_with_pw(url, self.browser_semaphore,
-                                        timeout=self.PAGE_LOAD_TIMEOUT,
-                                        use_scrapling_stealth=self.uss,
-                                        **self.pw_launch_kwargs)
+                                           timeout=self.PAGE_LOAD_TIMEOUT,
+                                           use_scrapling_stealth=self.uss,
+                                           **self.pw_launch_kwargs)
             doc = await self.html_read_coroutine(text, **self.html_read_kwargs)
             if not doc.empty:
                 return doc

From 0e7a1ae7021ba6ef8bfc121f86101c877483890b Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Thu, 16 Oct 2025 14:20:49 -0600
Subject: [PATCH 10/14] `load_state` now a param

---
 elm/web/html_pw.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/elm/web/html_pw.py b/elm/web/html_pw.py
index 0257187a..374579dd 100644
--- a/elm/web/html_pw.py
+++ b/elm/web/html_pw.py
@@ -59,7 +59,7 @@ async def load_html_with_pw(url, browser_semaphore=None, # pragma: no cover
 
 async def _load_html( url, browser_semaphore=None, # pragma: no cover
                      timeout=90_000, use_scrapling_stealth=False,
-                     **pw_launch_kwargs):
+                     load_state="networkidle", **pw_launch_kwargs):
     """Load html using playwright"""
     logger.trace("`_load_html` pw_launch_kwargs=%r", pw_launch_kwargs)
     logger.trace("browser_semaphore=%r", browser_semaphore)
@@ -81,8 +81,9 @@ async def _load_html( url, browser_semaphore=None, # pragma: no cover
         async with pw_page(**page_kwargs) as page:
             logger.trace("Navigating to: %r", url)
             await page.goto(url)
-            logger.trace("Waiting for load with timeout: %r", timeout)
-            await page.wait_for_load_state("networkidle", timeout=timeout)
+            logger.trace("Waiting for load state %r with timeout: %r",
+                         load_state, timeout)
+            await page.wait_for_load_state(load_state, timeout=timeout)
             text = await page.content()
 
     return text

From b8b3671b2e9858b11b016f63e1749babc26d8c9f Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Thu, 16 Oct 2025 14:24:13 -0600
Subject: [PATCH 11/14] Pass parameter down

---
 elm/web/html_pw.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/elm/web/html_pw.py b/elm/web/html_pw.py
index 374579dd..a7ecf83d 100644
--- a/elm/web/html_pw.py
+++ b/elm/web/html_pw.py
@@ -20,7 +20,7 @@
 
 async def load_html_with_pw(url, browser_semaphore=None, # pragma: no cover
                             timeout=90_000, use_scrapling_stealth=False,
-                            **pw_launch_kwargs):
+                            load_state="networkidle", **pw_launch_kwargs):
     """Extract HTML from URL using Playwright.
 
     Parameters
@@ -38,6 +38,19 @@ async def load_html_with_pw(url, browser_semaphore=None, # pragma: no cover
     use_scrapling_stealth : bool, default=False
         Option to use scrapling stealth scripts instead of
         tf-playwright-stealth. By default, ``False``.
+    load_state : str, default="networkidle"
+        The load state to wait for. One of:
+
+            - "load" - consider navigation to be finished when the load
+                       event is fired.
+            - "domcontentloaded" - consider navigation to be finished
+                                   when the ``DOMContentLoaded`` event
+                                   is fired.
+            - "networkidle" - consider navigation to be finished when
+                              there are no network connections for at
+                              least 500 ms.
+
+        By default, ``"networkidle"``.
     **pw_launch_kwargs
         Keyword-value argument pairs to pass to
         :meth:`async_playwright.chromium.launch`.
@@ -51,13 +64,14 @@ async def load_html_with_pw(url, browser_semaphore=None, # pragma: no cover
         text = await _load_html(url, browser_semaphore=browser_semaphore,
                                 timeout=timeout,
                                 use_scrapling_stealth=use_scrapling_stealth,
+                                load_state=load_state,
                                 **pw_launch_kwargs)
     except (PlaywrightError, PlaywrightTimeoutError):
         text = ""
     return text
 
 
-async def _load_html( url, browser_semaphore=None, # pragma: no cover
+async def _load_html(url, browser_semaphore=None, # pragma: no cover
                      timeout=90_000, use_scrapling_stealth=False,
                      load_state="networkidle", **pw_launch_kwargs):
     """Load html using playwright"""

From b7919b49e50e3bd652f92b63fbdf7c7e6f351756 Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Thu, 16 Oct 2025 14:27:52 -0600
Subject: [PATCH 12/14] Final retry uses `domcontentloaded` load state

---
 elm/web/file_loader.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py
index c632e47c..201c9514 100644
--- a/elm/web/file_loader.py
+++ b/elm/web/file_loader.py
@@ -250,7 +250,14 @@ async def _fetch_html_using_pw_with_retry(self, url):
 
             logger.debug("HTML read failed; retrying %r (attempt %d of %d)",
                          url, attempt + 1, self.num_pw_html_retries)
-        return doc
+
+        logger.debug("Attempting HTML read with load_state='domcontentloaded'")
+        text = await load_html_with_pw(url, self.browser_semaphore,
+                                       timeout=self.PAGE_LOAD_TIMEOUT,
+                                       use_scrapling_stealth=self.uss,
+                                       load_state="domcontentloaded",
+                                       **self.pw_launch_kwargs)
+        return await self.html_read_coroutine(text, **self.html_read_kwargs)
 
     @async_retry_with_exponential_backoff(
         base_delay=2,

From 0f5633ab7d7ee605fc3d9d9fc554786adad870bd Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Thu, 16 Oct 2025 14:39:40 -0600
Subject: [PATCH 13/14] Refactor retry logic

---
 elm/web/file_loader.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py
index 201c9514..d3359d6a 100644
--- a/elm/web/file_loader.py
+++ b/elm/web/file_loader.py
@@ -239,7 +239,11 @@ async def _fetch_doc(self, url):
 
     async def _fetch_html_using_pw_with_retry(self, url):
         """Fetch HTML content with several retry attempts"""
-        for attempt in range(self.num_pw_html_retries):
+        num_attempts = max(1, int(self.num_pw_html_retries) - 1)
+        max_attempts = num_attempts + 1
+        for attempt in range(num_attempts):
+            logger.debug("HTML read for %r (attempt %d of %d)",
+                         url, attempt + 1, max_attempts)
             text = await load_html_with_pw(url, self.browser_semaphore,
                                            timeout=self.PAGE_LOAD_TIMEOUT,
                                            use_scrapling_stealth=self.uss,
@@ -248,10 +252,9 @@ async def _fetch_html_using_pw_with_retry(self, url):
             if not doc.empty:
                 return doc
 
-            logger.debug("HTML read failed; retrying %r (attempt %d of %d)",
-                         url, attempt + 1, self.num_pw_html_retries)
-
-        logger.debug("Attempting HTML read with load_state='domcontentloaded'")
+        logger.debug("HTML read for %r (attempt %d of %d) with "
+                     "load_state='domcontentloaded'",
+                     url, max_attempts, max_attempts)
         text = await load_html_with_pw(url, self.browser_semaphore,
                                        timeout=self.PAGE_LOAD_TIMEOUT,
                                        use_scrapling_stealth=self.uss,

From bd5bea78885b53cb09bec2ab331e33677b62781a Mon Sep 17 00:00:00 2001
From: ppinchuk <pinchukpaul@gmail.com>
Date: Thu, 16 Oct 2025 14:40:24 -0600
Subject: [PATCH 14/14] Docstring clarification

---
 elm/web/file_loader.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py
index d3359d6a..9cfc9bf7 100644
--- a/elm/web/file_loader.py
+++ b/elm/web/file_loader.py
@@ -128,7 +128,9 @@ def __init__(
             because the playwright parameters are stochastic, and
             sometimes a combination of them can fail to load HTML. The
             default value is likely a good balance between processing
-            attempts and retrieval success. By default, ``3``.
+            attempts and retrieval success. Note that the minimum number
+            of attempts will always be 2, even if the user provides a
+            value smaller than this. By default, ``3``.
         """
         self.pw_launch_kwargs = pw_launch_kwargs or {}
         self.pdf_read_kwargs = pdf_read_kwargs or {}