diff --git a/elm/version.py b/elm/version.py
index 480f51ac..9c04c46a 100644
--- a/elm/version.py
+++ b/elm/version.py
@@ -2,4 +2,4 @@
 ELM version number
 """
 
-__version__ = "0.0.27"
+__version__ = "0.0.28"
diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py
index 0659dc88..9cfc9bf7 100644
--- a/elm/web/file_loader.py
+++ b/elm/web/file_loader.py
@@ -63,6 +63,7 @@ def __init__(
         file_cache_coroutine=None,
         browser_semaphore=None,
         use_scrapling_stealth=False,
+        num_pw_html_retries=3,
         **__,  # consume any extra kwargs
     ):
         """
@@ -122,6 +123,14 @@ def __init__(
         use_scrapling_stealth : bool, default=False
             Option to use scrapling stealth scripts instead of
             tf-playwright-stealth. By default, ``False``.
+        num_pw_html_retries : int, default=3
+            Number of attempts to load HTML content. This is useful
+            because the playwright parameters are stochastic, and
+            sometimes a combination of them can fail to load HTML. The
+            default value is likely a good balance between processing
+            attempts and retrieval success. Note that the minimum number
+            of attempts will always be 2, even if the user provides a
+            value smaller than this. By default, ``3``.
         """
         self.pw_launch_kwargs = pw_launch_kwargs or {}
         self.pdf_read_kwargs = pdf_read_kwargs or {}
@@ -137,6 +146,7 @@
         self.file_cache_coroutine = file_cache_coroutine
         self.browser_semaphore = browser_semaphore
         self.uss = use_scrapling_stealth
+        self.num_pw_html_retries = num_pw_html_retries
 
     def _header_from_template(self, header_template):
         """Compile header from user or default template"""
@@ -217,11 +227,7 @@ async def _fetch_doc(self, url):
             return doc, url_bytes
 
         logger.debug("PDF read failed; fetching HTML content from %r", url)
-        text = await load_html_with_pw(url, self.browser_semaphore,
-                                       timeout=self.PAGE_LOAD_TIMEOUT,
-                                       use_scrapling_stealth=self.uss,
-                                       **self.pw_launch_kwargs)
-        doc = await self.html_read_coroutine(text, **self.html_read_kwargs)
+        doc = await self._fetch_html_using_pw_with_retry(url)
 
         if not doc.empty:
             return doc, doc.text
@@ -233,6 +239,31 @@
 
         return doc, url_bytes
 
+    async def _fetch_html_using_pw_with_retry(self, url):
+        """Fetch HTML content with several retry attempts"""
+        num_attempts = max(1, int(self.num_pw_html_retries) - 1)
+        max_attempts = num_attempts + 1
+        for attempt in range(num_attempts):
+            logger.debug("HTML read for %r (attempt %d of %d)",
+                         url, attempt + 1, max_attempts)
+            text = await load_html_with_pw(url, self.browser_semaphore,
+                                           timeout=self.PAGE_LOAD_TIMEOUT,
+                                           use_scrapling_stealth=self.uss,
+                                           **self.pw_launch_kwargs)
+            doc = await self.html_read_coroutine(text, **self.html_read_kwargs)
+            if not doc.empty:
+                return doc
+
+        logger.debug("HTML read for %r (attempt %d of %d) with "
+                     "load_state='domcontentloaded'",
+                     url, max_attempts, max_attempts)
+        text = await load_html_with_pw(url, self.browser_semaphore,
+                                       timeout=self.PAGE_LOAD_TIMEOUT,
+                                       use_scrapling_stealth=self.uss,
+                                       load_state="domcontentloaded",
+                                       **self.pw_launch_kwargs)
+        return await self.html_read_coroutine(text, **self.html_read_kwargs)
+
     @async_retry_with_exponential_backoff(
         base_delay=2,
         exponential_base=1.5,
diff --git a/elm/web/html_pw.py b/elm/web/html_pw.py
index 0257187a..a7ecf83d 100644
--- a/elm/web/html_pw.py
+++ b/elm/web/html_pw.py
@@ -20,7 +20,7 @@
 
 async def load_html_with_pw(url, browser_semaphore=None,  # pragma: no cover
                             timeout=90_000, use_scrapling_stealth=False,
-                            **pw_launch_kwargs):
+                            load_state="networkidle", **pw_launch_kwargs):
     """Extract HTML from URL using Playwright.
 
     Parameters
@@ -38,6 +38,19 @@ async def load_html_with_pw(url, browser_semaphore=None,  # pragma: no cover
     use_scrapling_stealth : bool, default=False
         Option to use scrapling stealth scripts instead of
         tf-playwright-stealth. By default, ``False``.
+    load_state : str, default="networkidle"
+        The load state to wait for. One of:
+
+        - "load" - consider navigation to be finished when the load
+                   event is fired.
+        - "domcontentloaded" - consider navigation to be finished
+                               when the ``DOMContentLoaded`` event
+                               is fired.
+        - "networkidle" - consider navigation to be finished when
+                          there are no network connections for at
+                          least 500 ms.
+
+        By default, ``"networkidle"``.
     **pw_launch_kwargs
         Keyword-value argument pairs to pass to
         :meth:`async_playwright.chromium.launch`.
@@ -51,15 +64,16 @@ async def load_html_with_pw(url, browser_semaphore=None,  # pragma: no cover
         text = await _load_html(url, browser_semaphore=browser_semaphore,
                                 timeout=timeout,
                                 use_scrapling_stealth=use_scrapling_stealth,
+                                load_state=load_state,
                                 **pw_launch_kwargs)
     except (PlaywrightError, PlaywrightTimeoutError):
         text = ""
     return text
 
 
-async def _load_html( url, browser_semaphore=None,  # pragma: no cover
+async def _load_html(url, browser_semaphore=None,  # pragma: no cover
                      timeout=90_000, use_scrapling_stealth=False,
-                     **pw_launch_kwargs):
+                     load_state="networkidle", **pw_launch_kwargs):
     """Load html using playwright"""
     logger.trace("`_load_html` pw_launch_kwargs=%r", pw_launch_kwargs)
     logger.trace("browser_semaphore=%r", browser_semaphore)
@@ -81,8 +95,9 @@ async def _load_html( url, browser_semaphore=None,  # pragma: no cover
 
     async with pw_page(**page_kwargs) as page:
         logger.trace("Navigating to: %r", url)
         await page.goto(url)
-        logger.trace("Waiting for load with timeout: %r", timeout)
-        await page.wait_for_load_state("networkidle", timeout=timeout)
+        logger.trace("Waiting for load state %r with timeout: %r",
+                     load_state, timeout)
+        await page.wait_for_load_state(load_state, timeout=timeout)
         text = await page.content()
     return text
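
Note on the new retry flow: the control flow that _fetch_html_using_pw_with_retry adds above can be sketched in isolation as below. This is a minimal sketch, not part of ELM; fetch_html is a hypothetical stand-in for load_html_with_pw that simulates the stochastic failures described in the new docstring, and only the retry/fallback logic mirrors the diff.

    import asyncio
    import random


    async def fetch_html(url, load_state="networkidle"):
        """Hypothetical stand-in for ``load_html_with_pw``.

        Simulates stochastic stealth parameters: under "networkidle"
        the load "fails" (returns empty text) most of the time.
        """
        if load_state == "networkidle" and random.random() < 0.7:
            return ""
        return f"<html>{url} loaded with load_state={load_state!r}</html>"


    async def fetch_html_with_retry(url, num_retries=3):
        """Mirror the diff's control flow for HTML retries."""
        # At least one "networkidle" attempt, plus one final fallback
        # attempt, so the minimum total is always 2 (as the docstring says)
        num_attempts = max(1, int(num_retries) - 1)
        max_attempts = num_attempts + 1
        for attempt in range(num_attempts):
            print(f"attempt {attempt + 1} of {max_attempts} (networkidle)")
            text = await fetch_html(url)
            if text:  # analogous to ``not doc.empty`` in the diff
                return text
        # Final attempt relaxes the load state so a page that never
        # reaches network idle can still be read
        print(f"attempt {max_attempts} of {max_attempts} (domcontentloaded)")
        return await fetch_html(url, load_state="domcontentloaded")


    if __name__ == "__main__":
        print(asyncio.run(fetch_html_with_retry("https://example.com")))

As in the diff, passing num_retries of 1 or less still yields two total attempts, and the fallback to "domcontentloaded" only runs once, after every "networkidle" attempt has returned empty content.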