Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

feat: Add support for pre_navigation_hooks in PlaywrightCrawler #602

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
4 changes: 2 additions & 2 deletions src/crawlee/playwright_crawler/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
try:
from ._playwright_crawler import PlaywrightCrawler
from ._playwright_crawler import PlaywrightCrawler, PlaywrightHook
from ._playwright_crawling_context import PlaywrightCrawlingContext
except ImportError as exc:
raise ImportError(
"To import anything from this subpackage, you need to install the 'playwright' extra."
"For example, if you use pip, run `pip install 'crawlee[playwright]'`.",
) from exc

__all__ = ['PlaywrightCrawler', 'PlaywrightCrawlingContext']
__all__ = ['PlaywrightCrawler', 'PlaywrightCrawlingContext', 'PlaywrightHook']
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the reason to export this type?

61 changes: 35 additions & 26 deletions src/crawlee/playwright_crawler/_playwright_crawler.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import logging
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, List, Callable, Awaitable

from pydantic import ValidationError
from typing_extensions import Unpack
Expand All @@ -18,10 +18,11 @@

if TYPE_CHECKING:
from collections.abc import AsyncGenerator

from crawlee._types import AddRequestsKwargs, BasicCrawlingContext
from crawlee.browsers._types import BrowserType
from playwright.async_api import Page

PlaywrightHook = Callable[[PlaywrightCrawlingContext, dict], Awaitable[None]]

class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext]):
"""A crawler that leverages the [Playwright](https://playwright.dev/python/) browser automation library.
Expand All @@ -48,6 +49,7 @@ def __init__(
browser_pool: BrowserPool | None = None,
browser_type: BrowserType | None = None,
headless: bool | None = None,
pre_navigation_hooks: List[PlaywrightHook] | None = None,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a perfectly valid way of passing the hooks, but have you considered adding a decorator instead/as well? We did a similar change for request handlers because Python does not have anonymous functions (that can contain more than one statement).

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will work on adding a decorator also mate

**kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext]],
) -> None:
"""Create a new instance.
Expand All @@ -58,22 +60,20 @@ def __init__(
This option should not be used if `browser_pool` is provided.
headless: Whether to run the browser in headless mode.
This option should not be used if `browser_pool` is provided.
pre_navigation_hooks: A list of functions to be executed before each page navigation.
kwargs: Additional arguments to be forwarded to the underlying `BasicCrawler`.
"""
if browser_pool:
# Raise an exception if browser_pool is provided together with headless or browser_type arguments.
if headless is not None or browser_type is not None:
raise ValueError(
'You cannot provide `headless` or `browser_type` arguments when `browser_pool` is provided.'
)

# If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
else:
browser_pool = BrowserPool.with_default_plugin(headless=headless, browser_type=browser_type)

self._browser_pool = browser_pool
self._pre_navigation_hooks = pre_navigation_hooks or []

# Compose the context pipeline with the Playwright-specific context enhancer.
kwargs['_context_pipeline'] = (
ContextPipeline().compose(self._make_http_request).compose(self._handle_blocked_request)
)
Expand Down Expand Up @@ -101,17 +101,37 @@ async def _make_http_request(
if self._browser_pool is None:
raise ValueError('Browser pool is not initialized.')

# Create a new browser page
crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info)

async with crawlee_page.page:
# Navigate to the URL and get response.
response = await crawlee_page.page.goto(context.request.url)
goto_options = {}

playwright_context = PlaywrightCrawlingContext(
request=context.request,
session=context.session,
add_requests=context.add_requests,
send_request=context.send_request,
push_data=context.push_data,
proxy_info=context.proxy_info,
get_key_value_store=context.get_key_value_store,
log=context.log,
page=crawlee_page.page,
infinite_scroll=lambda: infinite_scroll(crawlee_page.page),
response=None,
enqueue_links=None, # We'll set this later
Comment on lines +120 to +121
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a no-go. First, it won't pass the type checker. Second, nullable attributes lead to errors, mental overhead and hard to read code.

Instead, we should add one more parent class to PlaywrightCrawlingContext that only adds the page attribute. Then, we should split the _make_http_request into two stages - one that opens the page and executes prenavigation hooks, and second that does the navigation. The context pipeline (see line 78) should be responsible for chaining these.

Does that make sense?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes will work on that

)

# Execute pre-navigation hooks
for hook in self._pre_navigation_hooks:
await hook(playwright_context, goto_options)

# Navigate to the URL and get response
response = await crawlee_page.page.goto(context.request.url, **goto_options)

if response is None:
raise SessionError(f'Failed to load the URL: {context.request.url}')

# Set the loaded URL to the actual URL after redirection.
# Set the loaded URL to the actual URL after redirection
context.request.loaded_url = crawlee_page.page.url

async def enqueue_links(
Expand Down Expand Up @@ -157,20 +177,11 @@ async def enqueue_links(

await context.add_requests(requests, **kwargs)

yield PlaywrightCrawlingContext(
request=context.request,
session=context.session,
add_requests=context.add_requests,
send_request=context.send_request,
push_data=context.push_data,
proxy_info=context.proxy_info,
get_key_value_store=context.get_key_value_store,
log=context.log,
page=crawlee_page.page,
infinite_scroll=lambda: infinite_scroll(crawlee_page.page),
response=response,
enqueue_links=enqueue_links,
)
# Update the PlaywrightCrawlingContext with the response and enqueue_links function
playwright_context.response = response
playwright_context.enqueue_links = enqueue_links

yield playwright_context

async def _handle_blocked_request(
self,
Expand All @@ -190,15 +201,13 @@ async def _handle_blocked_request(
if self._retry_on_blocked:
status_code = crawling_context.response.status

# Check if the session is blocked based on the HTTP status code.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please don't delete random comments.

if crawling_context.session and crawling_context.session.is_blocked_status_code(status_code=status_code):
raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}.')

matched_selectors = [
selector for selector in RETRY_CSS_SELECTORS if (await crawling_context.page.query_selector(selector))
]

# Check if the session is blocked based on the response content
if matched_selectors:
raise SessionError(
'Assuming the session is blocked - '
Expand Down
Loading