Skip to content
2 changes: 1 addition & 1 deletion elm/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
ELM version number
"""

__version__ = "0.0.27"
__version__ = "0.0.28"
41 changes: 36 additions & 5 deletions elm/web/file_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def __init__(
file_cache_coroutine=None,
browser_semaphore=None,
use_scrapling_stealth=False,
num_pw_html_retries=3,
**__, # consume any extra kwargs
):
"""
Expand Down Expand Up @@ -122,6 +123,14 @@ def __init__(
use_scrapling_stealth : bool, default=False
Option to use scrapling stealth scripts instead of
tf-playwright-stealth. By default, ``False``.
num_pw_html_retries : int, default=3
Number of attempts to load HTML content. This is useful
because the playwright parameters are stochastic, and
sometimes a combination of them can fail to load HTML. The
default value is likely a good balance between processing
attempts and retrieval success. Note that the minimum number
of attempts will always be 2, even if the user provides a
value smaller than this. By default, ``3``.
"""
self.pw_launch_kwargs = pw_launch_kwargs or {}
self.pdf_read_kwargs = pdf_read_kwargs or {}
Expand All @@ -137,6 +146,7 @@ def __init__(
self.file_cache_coroutine = file_cache_coroutine
self.browser_semaphore = browser_semaphore
self.uss = use_scrapling_stealth
self.num_pw_html_retries = num_pw_html_retries

def _header_from_template(self, header_template):
"""Compile header from user or default template"""
Expand Down Expand Up @@ -217,11 +227,7 @@ async def _fetch_doc(self, url):
return doc, url_bytes

logger.debug("PDF read failed; fetching HTML content from %r", url)
text = await load_html_with_pw(url, self.browser_semaphore,
timeout=self.PAGE_LOAD_TIMEOUT,
use_scrapling_stealth=self.uss,
**self.pw_launch_kwargs)
doc = await self.html_read_coroutine(text, **self.html_read_kwargs)
doc = await self._fetch_html_using_pw_with_retry(url)
if not doc.empty:
return doc, doc.text

Expand All @@ -233,6 +239,31 @@ async def _fetch_doc(self, url):

return doc, url_bytes

async def _fetch_html_using_pw_with_retry(self, url):
    """Fetch HTML content with several retry attempts"""
    # Number of "regular" (networkidle) attempts; the final
    # domcontentloaded pass below guarantees at least 2 total attempts
    # even when ``num_pw_html_retries`` is 1 or less.
    regular_attempts = max(1, int(self.num_pw_html_retries) - 1)
    total_attempts = regular_attempts + 1

    for attempt_index in range(1, regular_attempts + 1):
        logger.debug("HTML read for %r (attempt %d of %d)",
                     url, attempt_index, total_attempts)
        html_text = await load_html_with_pw(
            url, self.browser_semaphore,
            timeout=self.PAGE_LOAD_TIMEOUT,
            use_scrapling_stealth=self.uss,
            **self.pw_launch_kwargs)
        doc = await self.html_read_coroutine(html_text,
                                             **self.html_read_kwargs)
        if not doc.empty:
            return doc

    # Final fallback: only wait for the DOM to parse instead of full
    # network idle, which can succeed on pages that never go quiet.
    logger.debug("HTML read for %r (attempt %d of %d) with "
                 "load_state='domcontentloaded'",
                 url, total_attempts, total_attempts)
    html_text = await load_html_with_pw(
        url, self.browser_semaphore,
        timeout=self.PAGE_LOAD_TIMEOUT,
        use_scrapling_stealth=self.uss,
        load_state="domcontentloaded",
        **self.pw_launch_kwargs)
    return await self.html_read_coroutine(html_text,
                                          **self.html_read_kwargs)

@async_retry_with_exponential_backoff(
base_delay=2,
exponential_base=1.5,
Expand Down
25 changes: 20 additions & 5 deletions elm/web/html_pw.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

async def load_html_with_pw(url, browser_semaphore=None, # pragma: no cover
timeout=90_000, use_scrapling_stealth=False,
**pw_launch_kwargs):
load_state="networkidle", **pw_launch_kwargs):
"""Extract HTML from URL using Playwright.

Parameters
Expand All @@ -38,6 +38,19 @@ async def load_html_with_pw(url, browser_semaphore=None, # pragma: no cover
use_scrapling_stealth : bool, default=False
Option to use scrapling stealth scripts instead of
tf-playwright-stealth. By default, ``False``.
load_state : str, default="networkidle"
The load state to wait for. One of:

- "load" - consider navigation to be finished when the load
event is fired.
- "domcontentloaded" - consider navigation to be finished
when the ``DOMContentLoaded`` event
is fired.
- "networkidle" - consider navigation to be finished when
there are no network connections for at
least 500 ms.

By default, ``"networkidle"``.
**pw_launch_kwargs
Keyword-value argument pairs to pass to
:meth:`async_playwright.chromium.launch`.
Expand All @@ -51,15 +64,16 @@ async def load_html_with_pw(url, browser_semaphore=None, # pragma: no cover
text = await _load_html(url, browser_semaphore=browser_semaphore,
timeout=timeout,
use_scrapling_stealth=use_scrapling_stealth,
load_state=load_state,
**pw_launch_kwargs)
except (PlaywrightError, PlaywrightTimeoutError):
text = ""
return text


async def _load_html( url, browser_semaphore=None, # pragma: no cover
async def _load_html(url, browser_semaphore=None, # pragma: no cover
timeout=90_000, use_scrapling_stealth=False,
**pw_launch_kwargs):
load_state="networkidle", **pw_launch_kwargs):
"""Load html using playwright"""
logger.trace("`_load_html` pw_launch_kwargs=%r", pw_launch_kwargs)
logger.trace("browser_semaphore=%r", browser_semaphore)
Expand All @@ -81,8 +95,9 @@ async def _load_html( url, browser_semaphore=None, # pragma: no cover
async with pw_page(**page_kwargs) as page:
logger.trace("Navigating to: %r", url)
await page.goto(url)
logger.trace("Waiting for load with timeout: %r", timeout)
await page.wait_for_load_state("networkidle", timeout=timeout)
logger.trace("Waiting for load state %r with timeout: %r",
load_state, timeout)
await page.wait_for_load_state(load_state, timeout=timeout)
text = await page.content()

return text
Loading