Skip to content

Commit

Permalink
feat: Added max_depth feature for PlaywrightCrawler
Browse files Browse the repository at this point in the history
  • Loading branch information
akash47angadi committed Oct 4, 2024
1 parent bb9055a commit 0c024d3
Showing 1 changed file with 16 additions and 1 deletion.
17 changes: 16 additions & 1 deletion src/crawlee/playwright_crawler/_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def __init__(
browser_pool: BrowserPool | None = None,
browser_type: BrowserType | None = None,
headless: bool | None = None,
max_depth: int | None = None, # Add max_depth here
**kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext]],
) -> None:
"""Create a new instance.
Expand All @@ -58,8 +59,10 @@ def __init__(
This option should not be used if `browser_pool` is provided.
headless: Whether to run the browser in headless mode.
This option should not be used if `browser_pool` is provided.
max_depth: The maximum depth for link enqueuing. Stops further requests after this depth.
kwargs: Additional arguments to be forwarded to the underlying `BasicCrawler`.
"""
self.max_depth = max_depth # Store max_depth
if browser_pool:
# Raise an exception if browser_pool is provided together with headless or browser_type arguments.
if headless is not None or browser_type is not None:
Expand Down Expand Up @@ -122,6 +125,14 @@ async def enqueue_links(
**kwargs: Unpack[AddRequestsKwargs],
) -> None:
"""The `PlaywrightCrawler` implementation of the `EnqueueLinksFunction` function."""
# Check the current depth from metadata
current_depth = context.request.metadata.get('depth', 0)

# Stop further requests if the depth exceeds max_depth
if self.max_depth is not None and current_depth >= self.max_depth:
context.log.info(f'Max depth of {self.max_depth} reached. Not enqueueing further links.')
return

kwargs.setdefault('strategy', EnqueueStrategy.SAME_HOSTNAME)

requests = list[BaseRequestData]()
Expand All @@ -144,7 +155,11 @@ async def enqueue_links(
link_user_data.setdefault('label', label)

try:
request = BaseRequestData.from_url(url, user_data=link_user_data)
request = BaseRequestData.from_url(
url,
user_data=link_user_data,
metadata={'depth': current_depth + 1} # Increment the depth
)
except ValidationError as exc:
context.log.debug(
f'Skipping URL "{url}" due to invalid format: {exc}. '
Expand Down

0 comments on commit 0c024d3

Please sign in to comment.