From 997470d4eb889a4b256e95af65a0b2356a2448ac Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Tue, 15 Jul 2025 12:19:01 -0600 Subject: [PATCH 1/3] Add page limit to crawl --- elm/web/website_crawl.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/elm/web/website_crawl.py b/elm/web/website_crawl.py index 2f1242f5..5057a49b 100644 --- a/elm/web/website_crawl.py +++ b/elm/web/website_crawl.py @@ -377,7 +377,7 @@ def __init__(self, validator, file_loader_kwargs=None, browser_config_kwargs=None, crawl_strategy_kwargs=None, crawler_config_kwargs=None, cte_kwargs=None, extra_url_filters=None, include_external=False, - url_scorer=None, max_pages=100): + url_scorer=None, max_pages=100, page_limit=None): """ Parameters @@ -428,9 +428,16 @@ def __init__(self, validator, file_loader_kwargs=None, :meth:`ELMLinkScorer.score` method to score the URLs. By default, ``None``. max_pages : int, optional - Maximum number of pages to crawl. By default, ``100``. + Maximum number of **successful** pages to crawl. + By default, ``100``. + page_limit : int, optional + Maximum number of pages to crawl regardless of success + status. If ``None``, a page limit of 2 * `max_pages` is + used. To set no limit (not recommended), use ``math.inf``. + By default, ``None``. """ self.validator = validator + self.page_limit = page_limit or 2 * max_pages flk = {"verify_ssl": False} flk.update(file_loader_kwargs or {}) @@ -513,7 +520,12 @@ async def run(self, base_url, termination_callback=None, async with AsyncWebCrawler(config=self.browser_config) as crawler: crawl_results = await crawler.arun(base_url, config=self.config) async with aclosing(crawl_results) as agen: - async for result in agen: + async for ind, result in enumerate(agen): + if ind >= self.page_limit: + logger.debug("Exiting crawl due to page limit") + break + if not result.success: + continue results.append(result) logger.debug("Crawled %s", result.url) if on_result_hook: From 6c2a9881cb2cf6775c76f86d55580b091014ddc0 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Tue, 15 Jul 2025 12:19:53 -0600 Subject: [PATCH 2/3] Bump version --- elm/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/elm/version.py b/elm/version.py index 268a372a..806a233f 100644 --- a/elm/version.py +++ b/elm/version.py @@ -2,4 +2,4 @@ ELM version number """ -__version__ = "0.0.22" +__version__ = "0.0.23" From 27e4a68d8198c4e3da2033ed872d0eb28a026afd Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Tue, 15 Jul 2025 12:54:38 -0600 Subject: [PATCH 3/3] Track page count manually --- elm/web/website_crawl.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/elm/web/website_crawl.py b/elm/web/website_crawl.py index 5057a49b..e95745ba 100644 --- a/elm/web/website_crawl.py +++ b/elm/web/website_crawl.py @@ -517,11 +517,13 @@ async def run(self, base_url, termination_callback=None, out_docs = [] should_stop = (termination_callback or ELMWebsiteCrawlingStrategy.found_enough_docs) + page_count = 0 async with AsyncWebCrawler(config=self.browser_config) as crawler: crawl_results = await crawler.arun(base_url, config=self.config) async with aclosing(crawl_results) as agen: - async for ind, result in enumerate(agen): - if ind >= self.page_limit: + async for result in agen: + page_count += 1 + if page_count > self.page_limit: logger.debug("Exiting crawl due to page limit") break if not result.success: