From 84928cba2a7c0c5f38935b0813893aba3a4b16ab Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Thu, 16 Oct 2025 13:11:26 -0600 Subject: [PATCH 01/14] Add new param with default --- elm/web/file_loader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index 0659dc88..ef767111 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -63,6 +63,7 @@ def __init__( file_cache_coroutine=None, browser_semaphore=None, use_scrapling_stealth=False, + num_pw_html_retries=3, **__, # consume any extra kwargs ): """ From 4ebd1e45c35d61fe611233b7c0abf71692af6fc7 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Thu, 16 Oct 2025 13:13:39 -0600 Subject: [PATCH 02/14] Add docstring --- elm/web/file_loader.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index ef767111..d74706f8 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -63,7 +63,6 @@ def __init__( file_cache_coroutine=None, browser_semaphore=None, use_scrapling_stealth=False, - num_pw_html_retries=3, **__, # consume any extra kwargs ): """ @@ -124,6 +123,12 @@ def __init__( Option to use scrapling stealth scripts instead of tf-playwright-stealth. By default, ``False``. """ + num_pw_html_retries : int, default=3 + Number of attempts to load HTML content. This is useful + because the playwright parameters are stochastic, and + sometimes a combination of them can fail to load HTML. The + default value is likely a good balance between processing + attempts and retrieval success. By default, ``3``. 
self.pw_launch_kwargs = pw_launch_kwargs or {} self.pdf_read_kwargs = pdf_read_kwargs or {} self.html_read_kwargs = html_read_kwargs or {} From 9840c4951a7f253696133d3cb9a249e402d86b68 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Thu, 16 Oct 2025 13:14:05 -0600 Subject: [PATCH 03/14] Fix docstring --- elm/web/file_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index d74706f8..77268716 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -122,13 +122,13 @@ def __init__( use_scrapling_stealth : bool, default=False Option to use scrapling stealth scripts instead of tf-playwright-stealth. By default, ``False``. - """ num_pw_html_retries : int, default=3 Number of attempts to load HTML content. This is useful because the playwright parameters are stochastic, and sometimes a combination of them can fail to load HTML. The default value is likely a good balance between processing attempts and retrieval success. By default, ``3``. 
+ """ self.pw_launch_kwargs = pw_launch_kwargs or {} self.pdf_read_kwargs = pdf_read_kwargs or {} self.html_read_kwargs = html_read_kwargs or {} From 4ac53d0d572b29335241659e81510de03130a947 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Thu, 16 Oct 2025 13:14:22 -0600 Subject: [PATCH 04/14] Store param as instance attr --- elm/web/file_loader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index 77268716..6dbda040 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -143,6 +143,7 @@ def __init__( self.file_cache_coroutine = file_cache_coroutine self.browser_semaphore = browser_semaphore self.uss = use_scrapling_stealth + self.num_pw_html_retries = num_pw_html_retries def _header_from_template(self, header_template): """Compile header from user or default template""" From 6f33f3ba25dbf9756dbde2e938e84f01b9991126 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Thu, 16 Oct 2025 13:15:07 -0600 Subject: [PATCH 05/14] Add method to retry HTML fetch --- elm/web/file_loader.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index 6dbda040..30f358bf 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -240,6 +240,21 @@ async def _fetch_doc(self, url): return doc, url_bytes + async def _fetch_html_using_pw_with_retry(self, url): + """Fetch HTML content with several retry attempts""" + for attempt in range(self.num_pw_html_retries): + text = await load_html_with_pw(url, self.browser_semaphore, + timeout=self.PAGE_LOAD_TIMEOUT, + use_scrapling_stealth=self.uss, + **self.pw_launch_kwargs) + doc = await self.html_read_coroutine(text, **self.html_read_kwargs) + if not doc.empty: + return doc + + logger.debug("HTML read failed; retrying %r (attempt %d of %d)", + url, attempt + 1, self.num_pw_html_retries) + return doc + @async_retry_with_exponential_backoff( base_delay=2, exponential_base=1.5, From 
174597c965368a363b0309c2db63512461e1a419 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Thu, 16 Oct 2025 13:15:25 -0600 Subject: [PATCH 06/14] Use new method --- elm/web/file_loader.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index 30f358bf..eb7dd25b 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -224,11 +224,7 @@ async def _fetch_doc(self, url): return doc, url_bytes logger.debug("PDF read failed; fetching HTML content from %r", url) - text = await load_html_with_pw(url, self.browser_semaphore, - timeout=self.PAGE_LOAD_TIMEOUT, - use_scrapling_stealth=self.uss, - **self.pw_launch_kwargs) - doc = await self.html_read_coroutine(text, **self.html_read_kwargs) + doc = await self._fetch_html_using_pw_with_retry(url) if not doc.empty: return doc, doc.text From 4db79a08f0e6075e88d770499cd5fae610712a34 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Thu, 16 Oct 2025 13:15:35 -0600 Subject: [PATCH 07/14] Updates --- elm/web/file_loader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index eb7dd25b..4d82061d 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -63,6 +63,7 @@ def __init__( file_cache_coroutine=None, browser_semaphore=None, use_scrapling_stealth=False, + num_pw_html_retries=3, **__, # consume any extra kwargs ): """ From 5b332a108a5e61122313c68c91e00f37fc1e2a3d Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Thu, 16 Oct 2025 13:15:47 -0600 Subject: [PATCH 08/14] Bump version --- elm/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/elm/version.py b/elm/version.py index 480f51ac..9c04c46a 100644 --- a/elm/version.py +++ b/elm/version.py @@ -2,4 +2,4 @@ ELM version number """ -__version__ = "0.0.27" +__version__ = "0.0.28" From b854712a1f94174d14f5d1445b75fc44a5d0158e Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Thu, 16 Oct 2025 13:19:34 -0600 Subject: [PATCH 09/14] 
Formatting fix --- elm/web/file_loader.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index 4d82061d..c632e47c 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -241,9 +241,9 @@ async def _fetch_html_using_pw_with_retry(self, url): """Fetch HTML content with several retry attempts""" for attempt in range(self.num_pw_html_retries): text = await load_html_with_pw(url, self.browser_semaphore, - timeout=self.PAGE_LOAD_TIMEOUT, - use_scrapling_stealth=self.uss, - **self.pw_launch_kwargs) + timeout=self.PAGE_LOAD_TIMEOUT, + use_scrapling_stealth=self.uss, + **self.pw_launch_kwargs) doc = await self.html_read_coroutine(text, **self.html_read_kwargs) if not doc.empty: return doc From 0e7a1ae7021ba6ef8bfc121f86101c877483890b Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Thu, 16 Oct 2025 14:20:49 -0600 Subject: [PATCH 10/14] `load_state` now a param --- elm/web/html_pw.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/elm/web/html_pw.py b/elm/web/html_pw.py index 0257187a..374579dd 100644 --- a/elm/web/html_pw.py +++ b/elm/web/html_pw.py @@ -59,7 +59,7 @@ async def load_html_with_pw(url, browser_semaphore=None, # pragma: no cover async def _load_html( url, browser_semaphore=None, # pragma: no cover timeout=90_000, use_scrapling_stealth=False, - **pw_launch_kwargs): + load_state="networkidle", **pw_launch_kwargs): """Load html using playwright""" logger.trace("`_load_html` pw_launch_kwargs=%r", pw_launch_kwargs) logger.trace("browser_semaphore=%r", browser_semaphore) @@ -81,8 +81,9 @@ async def _load_html( url, browser_semaphore=None, # pragma: no cover async with pw_page(**page_kwargs) as page: logger.trace("Navigating to: %r", url) await page.goto(url) - logger.trace("Waiting for load with timeout: %r", timeout) - await page.wait_for_load_state("networkidle", timeout=timeout) + logger.trace("Waiting for load state %r with timeout: %r", + load_state, 
timeout) + await page.wait_for_load_state(load_state, timeout=timeout) text = await page.content() return text From b8b3671b2e9858b11b016f63e1749babc26d8c9f Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Thu, 16 Oct 2025 14:24:13 -0600 Subject: [PATCH 11/14] Pass parameter down --- elm/web/html_pw.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/elm/web/html_pw.py b/elm/web/html_pw.py index 374579dd..a7ecf83d 100644 --- a/elm/web/html_pw.py +++ b/elm/web/html_pw.py @@ -20,7 +20,7 @@ async def load_html_with_pw(url, browser_semaphore=None, # pragma: no cover timeout=90_000, use_scrapling_stealth=False, - **pw_launch_kwargs): + load_state="networkidle", **pw_launch_kwargs): """Extract HTML from URL using Playwright. Parameters @@ -38,6 +38,19 @@ async def load_html_with_pw(url, browser_semaphore=None, # pragma: no cover use_scrapling_stealth : bool, default=False Option to use scrapling stealth scripts instead of tf-playwright-stealth. By default, ``False``. + load_state : str, default="networkidle" + The load state to wait for. One of: + + - "load" - consider navigation to be finished when the load + event is fired. + - "domcontentloaded" - consider navigation to be finished + when the ``DOMContentLoaded`` event + is fired. + - "networkidle" - consider navigation to be finished when + there are no network connections for at + least 500 ms. + + By default, ``"networkidle"``. **pw_launch_kwargs Keyword-value argument pairs to pass to :meth:`async_playwright.chromium.launch`. 
@@ -51,13 +64,14 @@ async def load_html_with_pw(url, browser_semaphore=None, # pragma: no cover text = await _load_html(url, browser_semaphore=browser_semaphore, timeout=timeout, use_scrapling_stealth=use_scrapling_stealth, + load_state=load_state, **pw_launch_kwargs) except (PlaywrightError, PlaywrightTimeoutError): text = "" return text -async def _load_html( url, browser_semaphore=None, # pragma: no cover +async def _load_html(url, browser_semaphore=None, # pragma: no cover timeout=90_000, use_scrapling_stealth=False, load_state="networkidle", **pw_launch_kwargs): """Load html using playwright""" From b7919b49e50e3bd652f92b63fbdf7c7e6f351756 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Thu, 16 Oct 2025 14:27:52 -0600 Subject: [PATCH 12/14] Final retry uses `domcontentloaded` load state --- elm/web/file_loader.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index c632e47c..201c9514 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -250,7 +250,14 @@ async def _fetch_html_using_pw_with_retry(self, url): logger.debug("HTML read failed; retrying %r (attempt %d of %d)", url, attempt + 1, self.num_pw_html_retries) - return doc + + logger.debug("Attempting HTML read with load_state='domcontentloaded'") + text = await load_html_with_pw(url, self.browser_semaphore, + timeout=self.PAGE_LOAD_TIMEOUT, + use_scrapling_stealth=self.uss, + load_state="domcontentloaded", + **self.pw_launch_kwargs) + return await self.html_read_coroutine(text, **self.html_read_kwargs) @async_retry_with_exponential_backoff( base_delay=2, From 0f5633ab7d7ee605fc3d9d9fc554786adad870bd Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Thu, 16 Oct 2025 14:39:40 -0600 Subject: [PATCH 13/14] refactor retry logic --- elm/web/file_loader.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index 201c9514..d3359d6a 100644 --- 
a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -239,7 +239,11 @@ async def _fetch_doc(self, url): async def _fetch_html_using_pw_with_retry(self, url): """Fetch HTML content with several retry attempts""" - for attempt in range(self.num_pw_html_retries): + num_attempts = max(1, int(self.num_pw_html_retries) - 1) + max_attempts = num_attempts + 1 + for attempt in range(num_attempts): + logger.debug("HTML read for %r (attempt %d of %d)", + url, attempt + 1, max_attempts) text = await load_html_with_pw(url, self.browser_semaphore, timeout=self.PAGE_LOAD_TIMEOUT, use_scrapling_stealth=self.uss, @@ -248,10 +252,9 @@ async def _fetch_html_using_pw_with_retry(self, url): if not doc.empty: return doc - logger.debug("HTML read failed; retrying %r (attempt %d of %d)", - url, attempt + 1, self.num_pw_html_retries) - - logger.debug("Attempting HTML read with load_state='domcontentloaded'") + logger.debug("HTML read for %r (attempt %d of %d) with " + "load_state='domcontentloaded'", + url, max_attempts, max_attempts) text = await load_html_with_pw(url, self.browser_semaphore, timeout=self.PAGE_LOAD_TIMEOUT, use_scrapling_stealth=self.uss, From bd5bea78885b53cb09bec2ab331e33677b62781a Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Thu, 16 Oct 2025 14:40:24 -0600 Subject: [PATCH 14/14] Docstring clarification --- elm/web/file_loader.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index d3359d6a..9cfc9bf7 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -128,7 +128,9 @@ def __init__( because the playwright parameters are stochastic, and sometimes a combination of them can fail to load HTML. The default value is likely a good balance between processing - attempts and retrieval success. By default, ``3``. + attempts and retrieval success. Note that the minimum number + of attempts will always be 2, even if the user provides a + value smaller than this. By default, ``3``. 
""" self.pw_launch_kwargs = pw_launch_kwargs or {} self.pdf_read_kwargs = pdf_read_kwargs or {}