Skip to content
2 changes: 1 addition & 1 deletion elm/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
ELM version number
"""

__version__ = "0.0.27"
__version__ = "0.0.28"
41 changes: 36 additions & 5 deletions elm/web/file_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def __init__(
file_cache_coroutine=None,
browser_semaphore=None,
use_scrapling_stealth=False,
num_pw_html_retries=3,
**__, # consume any extra kwargs
):
"""
Expand Down Expand Up @@ -122,6 +123,14 @@ def __init__(
use_scrapling_stealth : bool, default=False
Option to use scrapling stealth scripts instead of
tf-playwright-stealth. By default, ``False``.
num_pw_html_retries : int, default=3
Number of attempts to load HTML content. This is useful
because the playwright parameters are stochastic, and
sometimes a combination of them can fail to load HTML. The
default value is likely a good balance between processing
attempts and retrieval success. Note that the minimum number
of attempts will always be 2, even if the user provides a
value smaller than this. By default, ``3``.
"""
self.pw_launch_kwargs = pw_launch_kwargs or {}
self.pdf_read_kwargs = pdf_read_kwargs or {}
Expand All @@ -137,6 +146,7 @@ def __init__(
self.file_cache_coroutine = file_cache_coroutine
self.browser_semaphore = browser_semaphore
self.uss = use_scrapling_stealth
self.num_pw_html_retries = num_pw_html_retries

def _header_from_template(self, header_template):
"""Compile header from user or default template"""
Expand Down Expand Up @@ -217,11 +227,7 @@ async def _fetch_doc(self, url):
return doc, url_bytes

logger.debug("PDF read failed; fetching HTML content from %r", url)
text = await load_html_with_pw(url, self.browser_semaphore,
timeout=self.PAGE_LOAD_TIMEOUT,
use_scrapling_stealth=self.uss,
**self.pw_launch_kwargs)
doc = await self.html_read_coroutine(text, **self.html_read_kwargs)
doc = await self._fetch_html_using_pw_with_retry(url)
if not doc.empty:
return doc, doc.text

Expand All @@ -233,6 +239,31 @@ async def _fetch_doc(self, url):

return doc, url_bytes

async def _fetch_html_using_pw_with_retry(self, url):
    """Fetch HTML content with several retry attempts"""
    # Number of "regular" (networkidle) attempts; the final
    # domcontentloaded pass below guarantees at least 2 total attempts
    # even when ``num_pw_html_retries`` is 1 or less.
    regular_attempts = max(1, int(self.num_pw_html_retries) - 1)
    total_attempts = regular_attempts + 1

    for attempt_index in range(1, regular_attempts + 1):
        logger.debug("HTML read for %r (attempt %d of %d)",
                     url, attempt_index, total_attempts)
        html_text = await load_html_with_pw(
            url, self.browser_semaphore,
            timeout=self.PAGE_LOAD_TIMEOUT,
            use_scrapling_stealth=self.uss,
            **self.pw_launch_kwargs)
        doc = await self.html_read_coroutine(html_text,
                                             **self.html_read_kwargs)
        if not doc.empty:
            return doc

    # Final fallback: only wait for the DOM to parse instead of full
    # network idle, which can succeed on pages that never go quiet.
    logger.debug("HTML read for %r (attempt %d of %d) with "
                 "load_state='domcontentloaded'",
                 url, total_attempts, total_attempts)
    html_text = await load_html_with_pw(
        url, self.browser_semaphore,
        timeout=self.PAGE_LOAD_TIMEOUT,
        use_scrapling_stealth=self.uss,
        load_state="domcontentloaded",
        **self.pw_launch_kwargs)
    return await self.html_read_coroutine(html_text,
                                          **self.html_read_kwargs)

@async_retry_with_exponential_backoff(
base_delay=2,
exponential_base=1.5,
Expand Down
25 changes: 20 additions & 5 deletions elm/web/html_pw.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

async def load_html_with_pw(url, browser_semaphore=None, # pragma: no cover
timeout=90_000, use_scrapling_stealth=False,
**pw_launch_kwargs):
load_state="networkidle", **pw_launch_kwargs):
"""Extract HTML from URL using Playwright.

Parameters
Expand All @@ -38,6 +38,19 @@ async def load_html_with_pw(url, browser_semaphore=None, # pragma: no cover
use_scrapling_stealth : bool, default=False
Option to use scrapling stealth scripts instead of
tf-playwright-stealth. By default, ``False``.
load_state : str, default="networkidle"
The load state to wait for. One of:

- "load" - consider navigation to be finished when the load
event is fired.
- "domcontentloaded" - consider navigation to be finished
when the ``DOMContentLoaded`` event
is fired.
- "networkidle" - consider navigation to be finished when
there are no network connections for at
least 500 ms.

By default, ``"networkidle"``.
**pw_launch_kwargs
Keyword-value argument pairs to pass to
:meth:`async_playwright.chromium.launch`.
Expand All @@ -51,15 +64,16 @@ async def load_html_with_pw(url, browser_semaphore=None, # pragma: no cover
text = await _load_html(url, browser_semaphore=browser_semaphore,
timeout=timeout,
use_scrapling_stealth=use_scrapling_stealth,
load_state=load_state,
**pw_launch_kwargs)
except (PlaywrightError, PlaywrightTimeoutError):
text = ""
return text


async def _load_html( url, browser_semaphore=None, # pragma: no cover
async def _load_html(url, browser_semaphore=None, # pragma: no cover
timeout=90_000, use_scrapling_stealth=False,
**pw_launch_kwargs):
load_state="networkidle", **pw_launch_kwargs):
"""Load html using playwright"""
logger.trace("`_load_html` pw_launch_kwargs=%r", pw_launch_kwargs)
logger.trace("browser_semaphore=%r", browser_semaphore)
Expand All @@ -81,8 +95,9 @@ async def _load_html( url, browser_semaphore=None, # pragma: no cover
async with pw_page(**page_kwargs) as page:
logger.trace("Navigating to: %r", url)
await page.goto(url)
logger.trace("Waiting for load with timeout: %r", timeout)
await page.wait_for_load_state("networkidle", timeout=timeout)
logger.trace("Waiting for load state %r with timeout: %r",
load_state, timeout)
await page.wait_for_load_state(load_state, timeout=timeout)
text = await page.content()

return text
Loading