From f5347c7f22ace9dba6349a87dc48ece93cdf18a0 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 16 Sep 2025 21:16:46 +0000 Subject: [PATCH 1/7] add `rq_name` and `rq_alias` params for `add_requests` and `enqueue_links` methods --- src/crawlee/_types.py | 41 +++++++++- src/crawlee/crawlers/_basic/_basic_crawler.py | 26 ++++++- .../crawlers/_basic/test_basic_crawler.py | 48 ++++++++++++ .../test_beautifulsoup_crawler.py | 78 +++++++++++++++++++ .../crawlers/_parsel/test_parsel_crawler.py | 76 ++++++++++++++++++ .../_playwright/test_playwright_crawler.py | 74 ++++++++++++++++++ 6 files changed, 337 insertions(+), 6 deletions(-) diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index 51f9d357e7..b21be6b6dc 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -180,6 +180,12 @@ class AddRequestsKwargs(EnqueueLinksKwargs): requests: Sequence[str | Request] """Requests to be added to the `RequestManager`.""" + rq_name: str | None + """Name of the `RequestQueue` to add the requests to. Only one of `rq_name` or `rq_alias` can be provided.""" + + rq_alias: str | None + """Alias of the `RequestQueue` to add the requests to. Only one of `rq_name` or `rq_alias` can be provided.""" + class PushDataKwargs(TypedDict): """Keyword arguments for dataset's `push_data` method.""" @@ -261,10 +267,16 @@ def __init__(self, *, key_value_store_getter: GetKeyValueStoreFunction) -> None: async def add_requests( self, requests: Sequence[str | Request], + rq_name: str | None = None, + rq_alias: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> None: """Track a call to the `add_requests` context helper.""" - self.add_requests_calls.append(AddRequestsKwargs(requests=requests, **kwargs)) + if rq_name is not None and rq_alias is not None: + raise ValueError('Only one of rq_name or rq_alias can be provided.') + self.add_requests_calls.append( + AddRequestsKwargs(requests=requests, rq_name=rq_name, rq_alias=rq_alias, **kwargs) + ) async def push_data( self, @@ -311,12 +323,19 @@ class AddRequestsFunction(Protocol): def __call__( self, requests: Sequence[str | Request], + rq_name: str | None = None, + rq_alias: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> Coroutine[None, None, None]: """Call dunder method. Args: - requests: Requests to be added to the `RequestManager`. + requests: Requests to be added to the `RequestManager` or, if `rq_name` or `rq_alias` is specified, to + the corresponding `RequestQueue`. + rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_name` or `rq_alias` can + be provided. + rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_name` or `rq_alias` can + be provided. **kwargs: Additional keyword arguments. """ @@ -344,12 +363,19 @@ def __call__( label: str | None = None, user_data: dict[str, Any] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, + rq_name: str | None = None, + rq_alias: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> Coroutine[None, None, None]: ... @overload def __call__( - self, *, requests: Sequence[str | Request] | None = None, **kwargs: Unpack[EnqueueLinksKwargs] + self, + *, + requests: Sequence[str | Request] | None = None, + rq_name: str | None = None, + rq_alias: str | None = None, + **kwargs: Unpack[EnqueueLinksKwargs], ) -> Coroutine[None, None, None]: ... 
def __call__( @@ -360,6 +386,8 @@ def __call__( user_data: dict[str, Any] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, requests: Sequence[str | Request] | None = None, + rq_name: str | None = None, + rq_alias: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> Coroutine[None, None, None]: """Call enqueue links function. @@ -376,7 +404,12 @@ def __call__( - Modified `RequestOptions` to update the request configuration, - `'skip'` to exclude the request from being enqueued, - `'unchanged'` to use the original request options without modification. - requests: Requests to be added to the `RequestManager`. + requests: Requests to be added to the `RequestManager` or, if `rq_name` or `rq_alias` is specified, to + the corresponding `RequestQueue`. + rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_name` or `rq_alias` can + be provided. + rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_name` or `rq_alias` can + be provided. **kwargs: Additional keyword arguments. """ diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 0a6d4eae87..d27110ae27 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -944,6 +944,8 @@ async def enqueue_links( transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, requests: Sequence[str | Request] | None = None, + rq_name: str | None = None, + rq_alias: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> None: kwargs.setdefault('strategy', 'same-hostname') @@ -955,7 +957,9 @@ async def enqueue_links( '`transform_request_function` arguments when `requests` is provided.' ) # Add directly passed requests. - await context.add_requests(requests or list[str | Request](), **kwargs) + await context.add_requests( + requests or list[str | Request](), rq_name=rq_name, rq_alias=rq_alias, **kwargs + ) else: # Add requests from extracted links. await context.add_requests( @@ -965,6 +969,8 @@ async def enqueue_links( user_data=user_data, transform_request_function=transform_request_function, ), + rq_name=rq_name, + rq_alias=rq_alias, **kwargs, ) @@ -1241,10 +1247,26 @@ async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> """Commit request handler result for the input `context`. 
Result is taken from `_context_result_map`.""" result = self._context_result_map[context] - request_manager = await self.get_request_manager() + base_request_manager = await self.get_request_manager() + origin = context.request.loaded_url or context.request.url for add_requests_call in result.add_requests_calls: + rq_name = add_requests_call.get('rq_name') + rq_alias = add_requests_call.get('rq_alias') + + if rq_name and rq_alias: + raise ValueError('You cannot provide both `rq_name` and `rq_alias` arguments.') + if rq_name or rq_alias: + request_manager: RequestManager | RequestQueue = await RequestQueue.open( + name=rq_name, + alias=rq_alias, + storage_client=self._service_locator.get_storage_client(), + configuration=self._service_locator.get_configuration(), + ) + else: + request_manager = base_request_manager + requests = list[Request]() base_url = url if (url := add_requests_call.get('base_url')) else origin diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 62ede11e67..3459f27730 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -1549,3 +1549,51 @@ def listener(event_data: EventCrawlerStatusData) -> None: event_manager.off(event=Event.CRAWLER_STATUS, listener=listener) assert status_message_listener.called + + +@pytest.mark.parametrize( + ('queue_name', 'queue_alias'), + [ + pytest.param('named-queue', None, id='with rq_name'), + pytest.param(None, 'alias-queue', id='with rq_alias'), + ], +) +async def test_add_requests_with_rq_param(queue_name: str | None, queue_alias: str | None) -> None: + crawler = BasicCrawler() + rq = await RequestQueue.open(name=queue_name, alias=queue_alias) + visit_urls = set() + + check_requests = [ + Request.from_url('https://a.placeholder.com'), + Request.from_url('https://b.placeholder.com'), + Request.from_url('https://c.placeholder.com'), + ] + + @crawler.router.default_handler + async def handler(context: BasicCrawlingContext) -> None: + visit_urls.add(context.request.url) + await context.add_requests(check_requests, rq_name=queue_name, rq_alias=queue_alias) + + await crawler.run(['https://start.placeholder.com']) + + requests_from_queue = [] + while request := await rq.fetch_next_request(): + requests_from_queue.append(request) + + assert requests_from_queue == check_requests + assert visit_urls == {'https://start.placeholder.com'} + + +async def test_add_requests_error_with_rq_alias_and_rq_name() -> None: + crawler = BasicCrawler() + + @crawler.router.default_handler + async def handler(context: BasicCrawlingContext) -> None: + with pytest.raises(ValueError, match='Only one of rq_name or rq_alias can be provided'): + await context.add_requests( + [Request.from_url('https://a.placeholder.com')], + rq_name='named-queue', + rq_alias='alias-queue', + ) + + await crawler.run(['https://start.placeholder.com']) diff --git a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py index 37f2d1b8ed..e5e7113c52 100644 --- a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py +++ b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py @@ -3,8 +3,11 @@ from typing import TYPE_CHECKING from unittest import mock +import pytest + from crawlee import ConcurrencySettings, Glob, HttpHeaders, RequestTransformAction, SkippedReason from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext +from crawlee.storages import
RequestQueue if TYPE_CHECKING: from yarl import URL @@ -198,3 +201,78 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: assert len(extracted_links) == 1 assert extracted_links[0] == str(server_url / 'page_1') + + +@pytest.mark.parametrize( + ('queue_name', 'queue_alias'), + [ + pytest.param('named-queue', None, id='with rq_name'), + pytest.param(None, 'alias-queue', id='with rq_alias'), + ], +) +async def test_enqueue_links_with_rq_param( + server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None +) -> None: + crawler = BeautifulSoupCrawler(http_client=http_client) + rq = await RequestQueue.open(name=queue_name, alias=queue_alias) + visit_urls: set[str] = set() + + @crawler.router.default_handler + async def handler(context: BeautifulSoupCrawlingContext) -> None: + visit_urls.add(context.request.url) + await context.enqueue_links(rq_name=queue_name, rq_alias=queue_alias) + + await crawler.run([str(server_url / 'start_enqueue')]) + + requests_from_queue: list[str] = [] + while request := await rq.fetch_next_request(): + requests_from_queue.append(request.url) + + assert set(requests_from_queue) == {str(server_url / 'page_1'), str(server_url / 'sub_index')} + assert visit_urls == {str(server_url / 'start_enqueue')} + + +@pytest.mark.parametrize( + ('queue_name', 'queue_alias'), + [ + pytest.param('named-queue', None, id='with rq_name'), + pytest.param(None, 'alias-queue', id='with rq_alias'), + ], +) +async def test_enqueue_links_requests_with_rq_param( + server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None +) -> None: + crawler = BeautifulSoupCrawler(http_client=http_client) + rq = await RequestQueue.open(name=queue_name, alias=queue_alias) + visit_urls: set[str] = set() + + check_requests: list[str] = [ + 'https://a.placeholder.com', + 'https://b.placeholder.com', + 'https://c.placeholder.com', + ] + + @crawler.router.default_handler + async def handler(context: BeautifulSoupCrawlingContext) -> None: + visit_urls.add(context.request.url) + await context.enqueue_links(requests=check_requests, rq_name=queue_name, rq_alias=queue_alias, strategy='all') + + await crawler.run([str(server_url / 'start_enqueue')]) + + requests_from_queue: list[str] = [] + while request := await rq.fetch_next_request(): + requests_from_queue.append(request.url) + + assert set(requests_from_queue) == set(check_requests) + assert visit_urls == {str(server_url / 'start_enqueue')} + + +async def test_enqueue_links_error_with_rq_alias_and_rq_name(server_url: URL, http_client: HttpClient) -> None: + crawler = BeautifulSoupCrawler(http_client=http_client) + + @crawler.router.default_handler + async def handler(context: BeautifulSoupCrawlingContext) -> None: + with pytest.raises(ValueError, match='Cannot use both `rq_name` and `rq_alias`'): + await context.enqueue_links(rq_name='named-queue', rq_alias='alias-queue') + + await crawler.run([str(server_url / 'start_enqueue')]) diff --git a/tests/unit/crawlers/_parsel/test_parsel_crawler.py b/tests/unit/crawlers/_parsel/test_parsel_crawler.py index 909563d822..c2161a13d5 100644 --- a/tests/unit/crawlers/_parsel/test_parsel_crawler.py +++ b/tests/unit/crawlers/_parsel/test_parsel_crawler.py @@ -8,6 +8,7 @@ from crawlee import ConcurrencySettings, Glob, HttpHeaders, Request, RequestTransformAction, SkippedReason from crawlee.crawlers import ParselCrawler +from crawlee.storages import RequestQueue if TYPE_CHECKING: from yarl import URL @@ -294,3 +295,78 @@ async def 
request_handler(context: ParselCrawlingContext) -> None: assert len(extracted_links) == 1 assert extracted_links[0] == str(server_url / 'page_1') + + +@pytest.mark.parametrize( + ('queue_name', 'queue_alias'), + [ + pytest.param('named-queue', None, id='with rq_name'), + pytest.param(None, 'alias-queue', id='with rq_alias'), + ], +) +async def test_enqueue_links_with_rq_param( + server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None +) -> None: + crawler = ParselCrawler(http_client=http_client) + rq = await RequestQueue.open(name=queue_name, alias=queue_alias) + visit_urls: set[str] = set() + + @crawler.router.default_handler + async def handler(context: ParselCrawlingContext) -> None: + visit_urls.add(context.request.url) + await context.enqueue_links(rq_name=queue_name, rq_alias=queue_alias) + + await crawler.run([str(server_url / 'start_enqueue')]) + + requests_from_queue: list[str] = [] + while request := await rq.fetch_next_request(): + requests_from_queue.append(request.url) + + assert set(requests_from_queue) == {str(server_url / 'page_1'), str(server_url / 'sub_index')} + assert visit_urls == {str(server_url / 'start_enqueue')} + + +@pytest.mark.parametrize( + ('queue_name', 'queue_alias'), + [ + pytest.param('named-queue', None, id='with rq_name'), + pytest.param(None, 'alias-queue', id='with rq_alias'), + ], +) +async def test_enqueue_links_requests_with_rq_param( + server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None +) -> None: + crawler = ParselCrawler(http_client=http_client) + rq = await RequestQueue.open(name=queue_name, alias=queue_alias) + visit_urls: set[str] = set() + + check_requests: list[str] = [ + 'https://a.placeholder.com', + 'https://b.placeholder.com', + 'https://c.placeholder.com', + ] + + @crawler.router.default_handler + async def handler(context: ParselCrawlingContext) -> None: + visit_urls.add(context.request.url) + await context.enqueue_links(requests=check_requests, rq_name=queue_name, rq_alias=queue_alias, strategy='all') + + await crawler.run([str(server_url / 'start_enqueue')]) + + requests_from_queue: list[str] = [] + while request := await rq.fetch_next_request(): + requests_from_queue.append(request.url) + + assert set(requests_from_queue) == set(check_requests) + assert visit_urls == {str(server_url / 'start_enqueue')} + + +async def test_enqueue_links_error_with_rq_alias_and_rq_name(server_url: URL, http_client: HttpClient) -> None: + crawler = ParselCrawler(http_client=http_client) + + @crawler.router.default_handler + async def handler(context: ParselCrawlingContext) -> None: + with pytest.raises(ValueError, match='Cannot use both `rq_name` and `rq_alias`'): + await context.enqueue_links(rq_name='named-queue', rq_alias='alias-queue') + + await crawler.run([str(server_url / 'start_enqueue')]) diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index 2f52cac163..7ab4bd76eb 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -37,6 +37,7 @@ from crawlee.sessions import Session, SessionPool from crawlee.statistics import Statistics from crawlee.statistics._error_snapshotter import ErrorSnapshotter +from crawlee.storages import RequestQueue from tests.unit.server_endpoints import GENERIC_RESPONSE, HELLO_WORLD if TYPE_CHECKING: @@ -784,3 +785,76 @@ async def 
test_reduced_logs_from_playwright_navigation_timeout(caplog: pytest.Lo break assert found_playwright_message, 'Expected log message about request handler error was not found.' + + +@pytest.mark.parametrize( + ('queue_name', 'queue_alias'), + [ + pytest.param('named-queue', None, id='with rq_name'), + pytest.param(None, 'alias-queue', id='with rq_alias'), + ], +) +async def test_enqueue_links_with_rq_param(server_url: URL, queue_name: str | None, queue_alias: str | None) -> None: + crawler = PlaywrightCrawler() + rq = await RequestQueue.open(name=queue_name, alias=queue_alias) + visit_urls: set[str] = set() + + @crawler.router.default_handler + async def handler(context: PlaywrightCrawlingContext) -> None: + visit_urls.add(context.request.url) + await context.enqueue_links(rq_name=queue_name, rq_alias=queue_alias) + + await crawler.run([str(server_url / 'start_enqueue')]) + + requests_from_queue: list[str] = [] + while request := await rq.fetch_next_request(): + requests_from_queue.append(request.url) + + assert set(requests_from_queue) == {str(server_url / 'page_1'), str(server_url / 'sub_index')} + assert visit_urls == {str(server_url / 'start_enqueue')} + + +@pytest.mark.parametrize( + ('queue_name', 'queue_alias'), + [ + pytest.param('named-queue', None, id='with rq_name'), + pytest.param(None, 'alias-queue', id='with rq_alias'), + ], +) +async def test_enqueue_links_requests_with_rq_param( + server_url: URL, queue_name: str | None, queue_alias: str | None +) -> None: + crawler = PlaywrightCrawler() + rq = await RequestQueue.open(name=queue_name, alias=queue_alias) + visit_urls: set[str] = set() + + check_requests: list[str] = [ + 'https://a.placeholder.com', + 'https://b.placeholder.com', + 'https://c.placeholder.com', + ] + + @crawler.router.default_handler + async def handler(context: PlaywrightCrawlingContext) -> None: + visit_urls.add(context.request.url) + await context.enqueue_links(requests=check_requests, rq_name=queue_name, rq_alias=queue_alias, strategy='all') + + await crawler.run([str(server_url / 'start_enqueue')]) + + requests_from_queue: list[str] = [] + while request := await rq.fetch_next_request(): + requests_from_queue.append(request.url) + + assert set(requests_from_queue) == set(check_requests) + assert visit_urls == {str(server_url / 'start_enqueue')} + + +async def test_enqueue_links_error_with_rq_alias_and_rq_name(server_url: URL) -> None: + crawler = PlaywrightCrawler() + + @crawler.router.default_handler + async def handler(context: PlaywrightCrawlingContext) -> None: + with pytest.raises(ValueError, match='Cannot use both `rq_name` and `rq_alias`'): + await context.enqueue_links(rq_name='named-queue', rq_alias='alias-queue') + + await crawler.run([str(server_url / 'start_enqueue')]) From 7689b967d43d8fb2e7afbc4bca83d15a8daa4461 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Sat, 20 Sep 2025 16:43:46 +0000 Subject: [PATCH 2/7] add rq_id --- src/crawlee/_types.py | 37 ++++++++++---- src/crawlee/crawlers/_basic/_basic_crawler.py | 14 +++-- .../crawlers/_basic/test_basic_crawler.py | 36 +++++++++---- .../test_beautifulsoup_crawler.py | 49 +++++++++++++----- .../crawlers/_parsel/test_parsel_crawler.py | 49 +++++++++++++----- .../_playwright/test_playwright_crawler.py | 51 ++++++++++++++----- 6 files changed, 175 insertions(+), 61 deletions(-) diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index b21be6b6dc..4ee66cc78e 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -181,10 +181,15 @@ class AddRequestsKwargs(EnqueueLinksKwargs): 
"""Requests to be added to the `RequestManager`.""" rq_name: str | None - """Name of the `RequestQueue` to add the requests to. Only one of `rq_name` or `rq_alias` can be provided.""" + """Name of the `RequestQueue` to add the requests to. Only one of `rq_name`, `rq_alias` or `rq_id` can be provided. + """ rq_alias: str | None - """Alias of the `RequestQueue` to add the requests to. Only one of `rq_name` or `rq_alias` can be provided.""" + """Alias of the `RequestQueue` to add the requests to. Only one of `rq_alias`, `rq_name` or `rq_id` can be provided. + """ + + rq_id: str | None + """ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.""" class PushDataKwargs(TypedDict): @@ -269,13 +274,15 @@ async def add_requests( requests: Sequence[str | Request], rq_name: str | None = None, rq_alias: str | None = None, + rq_id: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> None: """Track a call to the `add_requests` context helper.""" - if rq_name is not None and rq_alias is not None: - raise ValueError('Only one of rq_name or rq_alias can be provided.') + specified_params = sum(1 for param in [rq_name, rq_alias, rq_id] if param is not None) + if specified_params > 1: + raise ValueError('Only one of `rq_name`, `rq_alias` or `rq_id` can be provided.') self.add_requests_calls.append( - AddRequestsKwargs(requests=requests, rq_name=rq_name, rq_alias=rq_alias, **kwargs) + AddRequestsKwargs(requests=requests, rq_name=rq_name, rq_alias=rq_alias, rq_id=rq_id, **kwargs) ) async def push_data( @@ -325,6 +332,7 @@ def __call__( requests: Sequence[str | Request], rq_name: str | None = None, rq_alias: str | None = None, + rq_id: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> Coroutine[None, None, None]: """Call dunder method. @@ -332,10 +340,12 @@ def __call__( Args: requests: Requests to be added to the `RequestManager` or, if `rq_name` or `rq_alias` is specified, to the corresponding `RequestQueue`. - rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_name` or `rq_alias` can - be provided. - rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_name` or `rq_alias` can + rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_name`, `rq_alias` or `rq_id` can be provided. + rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_alias`, `rq_name` or `rq_id` + can be provided. + rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be + provided. **kwargs: Additional keyword arguments. """ @@ -365,6 +375,7 @@ def __call__( transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, rq_name: str | None = None, rq_alias: str | None = None, + rq_id: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> Coroutine[None, None, None]: ... @@ -375,6 +386,7 @@ def __call__( requests: Sequence[str | Request] | None = None, rq_name: str | None = None, rq_alias: str | None = None, + rq_id: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> Coroutine[None, None, None]: ... @@ -388,6 +400,7 @@ def __call__( requests: Sequence[str | Request] | None = None, rq_name: str | None = None, rq_alias: str | None = None, + rq_id: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> Coroutine[None, None, None]: """Call enqueue links function. 
@@ -406,10 +419,12 @@ def __call__( - `'unchanged'` to use the original request options without modification. requests: Requests to be added to the `RequestManager` or, if `rq_name` or `rq_alias` is specified, to the corresponding `RequestQueue`. - rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_name` or `rq_alias` can - be provided. - rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_name` or `rq_alias` can + rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_name`, `rq_alias` or `rq_id` can be provided. + rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_alias`, `rq_name` or `rq_id` + can be provided. + rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be + provided. **kwargs: Additional keyword arguments. """ diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index d27110ae27..b1a19f079a 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -946,6 +946,7 @@ async def enqueue_links( requests: Sequence[str | Request] | None = None, rq_name: str | None = None, rq_alias: str | None = None, + rq_id: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> None: kwargs.setdefault('strategy', 'same-hostname') @@ -958,7 +959,7 @@ async def enqueue_links( ) # Add directly passed requests. await context.add_requests( - requests or list[str | Request](), rq_name=rq_name, rq_alias=rq_alias, **kwargs + requests or list[str | Request](), rq_name=rq_name, rq_alias=rq_alias, rq_id=rq_id, **kwargs ) else: # Add requests from extracted links. @@ -971,6 +972,7 @@ async def enqueue_links( ), rq_name=rq_name, rq_alias=rq_alias, + rq_id=rq_id, **kwargs, ) @@ -1254,11 +1256,13 @@ async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> for add_requests_call in result.add_requests_calls: rq_name = add_requests_call.get('rq_name') rq_alias = add_requests_call.get('rq_alias') - - if rq_name and rq_alias: - raise ValueError('You cannot provide both `rq_name` and `rq_alias` arguments.') - if rq_name or rq_alias: + rq_id = add_requests_call.get('rq_id') + specified_params = sum(1 for param in [rq_name, rq_alias, rq_id] if param is not None) + if specified_params > 1: + raise ValueError('You can only provide one of `rq_id`, `rq_name` or `rq_alias` arguments.') + if rq_name or rq_alias or rq_id: request_manager: RequestManager | RequestQueue = await RequestQueue.open( + id=rq_id, name=rq_name, alias=rq_alias, storage_client=self._service_locator.get_storage_client(), diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 3459f27730..2bba70a8af 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -1552,15 +1552,21 @@ def listener(event_data: EventCrawlerStatusData) -> None: @pytest.mark.parametrize( - ('queue_name', 'queue_alias'), + ('queue_name', 'queue_alias', 'by_id'), [ - pytest.param('named-queue', None, id='with rq_name'), - pytest.param(None, 'alias-queue', id='with rq_alias'), + pytest.param('named-queue', None, False, id='with rq_name'), + pytest.param(None, 'alias-queue', False, id='with rq_alias'), + pytest.param('id-queue', None, True, id='with rq_id'), ], ) -async def test_add_requests_with_rq_param(queue_name: str | None, queue_alias: str | None) -> None: +async def 
test_add_requests_with_rq_param(queue_name: str | None, queue_alias: str | None, *, by_id: bool) -> None: crawler = BasicCrawler() rq = await RequestQueue.open(name=queue_name, alias=queue_alias) + if by_id: + queue_id = rq.id + queue_name = None + else: + queue_id = None visit_urls = set() check_requests = [ @@ -1572,7 +1578,7 @@ async def test_add_requests_with_rq_param(queue_name: str | None, queue_alias: s @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: visit_urls.add(context.request.url) - await context.add_requests(check_requests, rq_name=queue_name, rq_alias=queue_alias) + await context.add_requests(check_requests, rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id) await crawler.run(['https://start.placeholder.com']) @@ -1584,16 +1590,28 @@ async def handler(context: BasicCrawlingContext) -> None: assert visit_urls == {'https://start.placeholder.com'} -async def test_add_requests_error_with_rq_alias_and_rq_name() -> None: +@pytest.mark.parametrize( + ('queue_name', 'queue_alias', 'queue_id'), + [ + pytest.param('named-queue', 'alias-queue', None, id='rq_name and rq_alias'), + pytest.param('named-queue', None, 'id-queue', id='rq_name and rq_id'), + pytest.param(None, 'alias-queue', 'id-queue', id='rq_alias and rq_id'), + pytest.param('named-queue', 'alias-queue', 'id-queue', id='rq_name and rq_alias and rq_id'), + ], +) +async def test_add_requests_error_with_multi_params( + queue_name: str | None, queue_alias: str | None, queue_id: str | None +) -> None: crawler = BasicCrawler() @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: - with pytest.raises(ValueError, match='Only one of rq_name or rq_alias can be provided'): + with pytest.raises(ValueError, match='Only one of `rq_name`, `rq_alias` or `rq_id` can be provided'): await context.add_requests( [Request.from_url('https://a.placeholder.com')], - rq_name='named-queue', - rq_alias='alias-queue', + rq_name=queue_name, + rq_alias=queue_alias, + rq_id=queue_id, ) await crawler.run(['https://start.placeholder.com']) diff --git a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py index e5e7113c52..120e9a0b7b 100644 --- a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py +++ b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py @@ -204,23 +204,29 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: @pytest.mark.parametrize( - ('queue_name', 'queue_alias'), + ('queue_name', 'queue_alias', 'by_id'), [ - pytest.param('named-queue', None, id='with rq_name'), - pytest.param(None, 'alias-queue', id='with rq_alias'), + pytest.param('named-queue', None, False, id='with rq_name'), + pytest.param(None, 'alias-queue', False, id='with rq_alias'), + pytest.param('id-queue', None, True, id='with rq_id'), ], ) async def test_enqueue_links_with_rq_param( - server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None + server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None, *, by_id: bool ) -> None: crawler = BeautifulSoupCrawler(http_client=http_client) rq = await RequestQueue.open(name=queue_name, alias=queue_alias) + if by_id: + queue_name = None + queue_id = rq.id + else: + queue_id = None visit_urls: set[str] = set() @crawler.router.default_handler async def handler(context: BeautifulSoupCrawlingContext) -> None: visit_urls.add(context.request.url) - await
context.enqueue_links(rq_name=queue_name, rq_alias=queue_alias) + await context.enqueue_links(rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id) await crawler.run([str(server_url / 'start_enqueue')]) @@ -233,17 +239,23 @@ async def handler(context: BeautifulSoupCrawlingContext) -> None: @pytest.mark.parametrize( - ('queue_name', 'queue_alias'), + ('queue_name', 'queue_alias', 'by_id'), [ - pytest.param('named-queue', None, id='with rq_name'), - pytest.param(None, 'alias-queue', id='with rq_alias'), + pytest.param('named-queue', None, False, id='with rq_name'), + pytest.param(None, 'alias-queue', False, id='with rq_alias'), + pytest.param('id-queue', None, True, id='with rq_id'), ], ) async def test_enqueue_links_requests_with_rq_param( - server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None + server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None, *, by_id: bool ) -> None: crawler = BeautifulSoupCrawler(http_client=http_client) rq = await RequestQueue.open(name=queue_name, alias=queue_alias) + if by_id: + queue_name = None + queue_id = rq.id + else: + queue_id = None visit_urls: set[str] = set() check_requests: list[str] = [ @@ -255,7 +267,9 @@ async def test_enqueue_links_requests_with_rq_param( @crawler.router.default_handler async def handler(context: BeautifulSoupCrawlingContext) -> None: visit_urls.add(context.request.url) - await context.enqueue_links(requests=check_requests, rq_name=queue_name, rq_alias=queue_alias, strategy='all') + await context.enqueue_links( + requests=check_requests, rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id, strategy='all' + ) await crawler.run([str(server_url / 'start_enqueue')]) @@ -267,12 +281,23 @@ async def handler(context: BeautifulSoupCrawlingContext) -> None: assert visit_urls == {str(server_url / 'start_enqueue')} -async def test_enqueue_links_error_with_rq_alias_and_rq_name(server_url: URL, http_client: HttpClient) -> None: +@pytest.mark.parametrize( + ('queue_name', 'queue_alias', 'queue_id'), + [ + pytest.param('named-queue', 'alias-queue', None, id='rq_name and rq_alias'), + pytest.param('named-queue', None, 'id-queue', id='rq_name and rq_id'), + pytest.param(None, 'alias-queue', 'id-queue', id='rq_alias and rq_id'), + pytest.param('named-queue', 'alias-queue', 'id-queue', id='rq_name and rq_alias and rq_id'), + ], +) +async def test_enqueue_links_error_with_multi_params( + server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None, queue_id: str | None +) -> None: crawler = BeautifulSoupCrawler(http_client=http_client) @crawler.router.default_handler async def handler(context: BeautifulSoupCrawlingContext) -> None: with pytest.raises(ValueError, match='Cannot use both `rq_name` and `rq_alias`'): - await context.enqueue_links(rq_name='named-queue', rq_alias='alias-queue') + await context.enqueue_links(rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id) await crawler.run([str(server_url / 'start_enqueue')]) diff --git a/tests/unit/crawlers/_parsel/test_parsel_crawler.py b/tests/unit/crawlers/_parsel/test_parsel_crawler.py index c2161a13d5..c5c3d62c6e 100644 --- a/tests/unit/crawlers/_parsel/test_parsel_crawler.py +++ b/tests/unit/crawlers/_parsel/test_parsel_crawler.py @@ -298,23 +298,29 @@ async def request_handler(context: ParselCrawlingContext) -> None: @pytest.mark.parametrize( - ('queue_name', 'queue_alias'), + ('queue_name', 'queue_alias', 'by_id'), [ - pytest.param('named-queue', None, id='with rq_name'), - 
pytest.param(None, 'alias-queue', id='with rq_alias'), + pytest.param('named-queue', None, False, id='with rq_name'), + pytest.param(None, 'alias-queue', False, id='with rq_alias'), + pytest.param('id-queue', None, True, id='with rq_id'), ], ) async def test_enqueue_links_with_rq_param( - server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None + server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None, *, by_id: bool ) -> None: crawler = ParselCrawler(http_client=http_client) rq = await RequestQueue.open(name=queue_name, alias=queue_alias) + if by_id: + queue_name = None + queue_id = rq.id + else: + queue_id = None visit_urls: set[str] = set() @crawler.router.default_handler async def handler(context: ParselCrawlingContext) -> None: visit_urls.add(context.request.url) - await context.enqueue_links(rq_name=queue_name, rq_alias=queue_alias) + await context.enqueue_links(rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id) await crawler.run([str(server_url / 'start_enqueue')]) @@ -327,17 +333,23 @@ async def handler(context: ParselCrawlingContext) -> None: @pytest.mark.parametrize( - ('queue_name', 'queue_alias'), + ('queue_name', 'queue_alias', 'by_id'), [ - pytest.param('named-queue', None, id='with rq_name'), - pytest.param(None, 'alias-queue', id='with rq_alias'), + pytest.param('named-queue', None, False, id='with rq_name'), + pytest.param(None, 'alias-queue', False, id='with rq_alias'), + pytest.param('id-queue', None, True, id='with rq_id'), ], ) async def test_enqueue_links_requests_with_rq_param( - server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None + server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None, *, by_id: bool ) -> None: crawler = ParselCrawler(http_client=http_client) rq = await RequestQueue.open(name=queue_name, alias=queue_alias) + if by_id: + queue_name = None + queue_id = rq.id + else: + queue_id = None visit_urls: set[str] = set() check_requests: list[str] = [ @@ -349,7 +361,9 @@ async def test_enqueue_links_requests_with_rq_param( @crawler.router.default_handler async def handler(context: ParselCrawlingContext) -> None: visit_urls.add(context.request.url) - await context.enqueue_links(requests=check_requests, rq_name=queue_name, rq_alias=queue_alias, strategy='all') + await context.enqueue_links( + requests=check_requests, rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id, strategy='all' + ) await crawler.run([str(server_url / 'start_enqueue')]) @@ -361,12 +375,23 @@ async def handler(context: ParselCrawlingContext) -> None: assert visit_urls == {str(server_url / 'start_enqueue')} -async def test_enqueue_links_error_with_rq_alias_and_rq_name(server_url: URL, http_client: HttpClient) -> None: +@pytest.mark.parametrize( + ('queue_name', 'queue_alias', 'queue_id'), + [ + pytest.param('named-queue', 'alias-queue', None, id='rq_name and rq_alias'), + pytest.param('named-queue', None, 'id-queue', id='rq_name and rq_id'), + pytest.param(None, 'alias-queue', 'id-queue', id='rq_alias and rq_id'), + pytest.param('named-queue', 'alias-queue', 'id-queue', id='rq_name and rq_alias and rq_id'), + ], +) +async def test_enqueue_links_error_with_multi_params( + server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None, queue_id: str | None +) -> None: crawler = ParselCrawler(http_client=http_client) @crawler.router.default_handler async def handler(context: ParselCrawlingContext) -> None: with 
pytest.raises(ValueError, match='Cannot use both `rq_name` and `rq_alias`'): - await context.enqueue_links(rq_name='named-queue', rq_alias='alias-queue') + await context.enqueue_links(rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id) await crawler.run([str(server_url / 'start_enqueue')]) diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index 7ab4bd76eb..9f5ae5d58e 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -788,21 +788,29 @@ async def test_reduced_logs_from_playwright_navigation_timeout(caplog: pytest.Lo @pytest.mark.parametrize( - ('queue_name', 'queue_alias'), + ('queue_name', 'queue_alias', 'by_id'), [ - pytest.param('named-queue', None, id='with rq_name'), - pytest.param(None, 'alias-queue', id='with rq_alias'), + pytest.param('named-queue', None, False, id='with rq_name'), + pytest.param(None, 'alias-queue', False, id='with rq_alias'), + pytest.param('id-queue', None, True, id='with rq_id'), ], ) -async def test_enqueue_links_with_rq_param(server_url: URL, queue_name: str | None, queue_alias: str | None) -> None: +async def test_enqueue_links_with_rq_param( + server_url: URL, queue_name: str | None, queue_alias: str | None, *, by_id: bool +) -> None: crawler = PlaywrightCrawler() rq = await RequestQueue.open(name=queue_name, alias=queue_alias) + if by_id: + queue_name = None + queue_id = rq.id + else: + queue_id = None visit_urls: set[str] = set() @crawler.router.default_handler async def handler(context: PlaywrightCrawlingContext) -> None: visit_urls.add(context.request.url) - await context.enqueue_links(rq_name=queue_name, rq_alias=queue_alias) + await context.enqueue_links(rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id) await crawler.run([str(server_url / 'start_enqueue')]) @@ -815,17 +823,23 @@ async def handler(context: PlaywrightCrawlingContext) -> None: @pytest.mark.parametrize( - ('queue_name', 'queue_alias'), + ('queue_name', 'queue_alias', 'by_id'), [ - pytest.param('named-queue', None, id='with rq_name'), - pytest.param(None, 'alias-queue', id='with rq_alias'), + pytest.param('named-queue', None, False, id='with rq_name'), + pytest.param(None, 'alias-queue', False, id='with rq_alias'), + pytest.param('id-queue', None, True, id='with rq_id'), ], ) async def test_enqueue_links_requests_with_rq_param( - server_url: URL, queue_name: str | None, queue_alias: str | None + server_url: URL, queue_name: str | None, queue_alias: str | None, *, by_id: bool ) -> None: crawler = PlaywrightCrawler() rq = await RequestQueue.open(name=queue_name, alias=queue_alias) + if by_id: + queue_name = None + queue_id = rq.id + else: + queue_id = None visit_urls: set[str] = set() check_requests: list[str] = [ @@ -837,7 +851,9 @@ async def test_enqueue_links_requests_with_rq_param( @crawler.router.default_handler async def handler(context: PlaywrightCrawlingContext) -> None: visit_urls.add(context.request.url) - await context.enqueue_links(requests=check_requests, rq_name=queue_name, rq_alias=queue_alias, strategy='all') + await context.enqueue_links( + requests=check_requests, rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id, strategy='all' + ) await crawler.run([str(server_url / 'start_enqueue')]) @@ -849,12 +865,23 @@ async def handler(context: PlaywrightCrawlingContext) -> None: assert visit_urls == {str(server_url / 'start_enqueue')} -async def 
test_enqueue_links_error_with_rq_alias_and_rq_name(server_url: URL) -> None: +@pytest.mark.parametrize( + ('queue_name', 'queue_alias', 'queue_id'), + [ + pytest.param('named-queue', 'alias-queue', None, id='rq_name and rq_alias'), + pytest.param('named-queue', None, 'id-queue', id='rq_name and rq_id'), + pytest.param(None, 'alias-queue', 'id-queue', id='rq_alias and rq_id'), + pytest.param('named-queue', 'alias-queue', 'id-queue', id='rq_name and rq_alias and rq_id'), + ], +) +async def test_enqueue_links_error_with_multi_params( + server_url: URL, queue_name: str | None, queue_alias: str | None, queue_id: str | None +) -> None: crawler = PlaywrightCrawler() @crawler.router.default_handler async def handler(context: PlaywrightCrawlingContext) -> None: with pytest.raises(ValueError, match='Cannot use both `rq_name` and `rq_alias`'): - await context.enqueue_links(rq_name='named-queue', rq_alias='alias-queue') + await context.enqueue_links(rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id) await crawler.run([str(server_url / 'start_enqueue')]) From 4da169323eb9893a89b9c0cced6e43cd394a41b5 Mon Sep 17 00:00:00 2001 From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com> Date: Sun, 21 Sep 2025 19:43:49 +0300 Subject: [PATCH 3/7] Update src/crawlee/_types.py Co-authored-by: Vlada Dusek --- src/crawlee/_types.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index 4ee66cc78e..93c0ec0b16 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -181,8 +181,7 @@ class AddRequestsKwargs(EnqueueLinksKwargs): """Requests to be added to the `RequestManager`.""" rq_name: str | None - """Name of the `RequestQueue` to add the requests to. Only one of `rq_name`, `rq_alias` or `rq_id` can be provided. - """ + """Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.""" rq_alias: str | None """Alias of the `RequestQueue` to add the requests to. Only one of `rq_alias`, `rq_name` or `rq_id` can be provided. From 7e4e829f0a4a121c0716dc0e3992cb17440e7465 Mon Sep 17 00:00:00 2001 From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com> Date: Sun, 21 Sep 2025 19:44:46 +0300 Subject: [PATCH 4/7] Apply suggestion from @vdusek Co-authored-by: Vlada Dusek --- src/crawlee/_types.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index 93c0ec0b16..e2ef98eec9 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -184,8 +184,7 @@ class AddRequestsKwargs(EnqueueLinksKwargs): """Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.""" rq_alias: str | None - """Alias of the `RequestQueue` to add the requests to. Only one of `rq_alias`, `rq_name` or `rq_id` can be provided. - """ + """Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.""" rq_id: str | None """ID of the `RequestQueue` to add the requests to. 
Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.""" From cca6d1fe1128ab27f362fc0858774c0ba19d2387 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Sun, 21 Sep 2025 20:32:35 +0000 Subject: [PATCH 5/7] change order --- src/crawlee/_types.py | 44 ++++++++++--------- src/crawlee/crawlers/_basic/_basic_crawler.py | 12 ++--- .../crawlers/_basic/test_basic_crawler.py | 8 ++-- .../test_beautifulsoup_crawler.py | 8 ++-- .../crawlers/_parsel/test_parsel_crawler.py | 10 ++--- .../_playwright/test_playwright_crawler.py | 10 ++--- 6 files changed, 47 insertions(+), 45 deletions(-) diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index e2ef98eec9..c7cb43aa63 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -180,14 +180,16 @@ class AddRequestsKwargs(EnqueueLinksKwargs): requests: Sequence[str | Request] """Requests to be added to the `RequestManager`.""" + rq_id: str | None + """ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.""" + rq_name: str | None - """Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.""" + """Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided. + """ rq_alias: str | None - """Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.""" - - rq_id: str | None - """ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.""" + """Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided. + """ class PushDataKwargs(TypedDict): @@ -270,17 +272,17 @@ def __init__(self, *, key_value_store_getter: GetKeyValueStoreFunction) -> None: async def add_requests( self, requests: Sequence[str | Request], + rq_id: str | None = None, rq_name: str | None = None, rq_alias: str | None = None, - rq_id: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> None: """Track a call to the `add_requests` context helper.""" - specified_params = sum(1 for param in [rq_name, rq_alias, rq_id] if param is not None) + specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None) if specified_params > 1: - raise ValueError('Only one of `rq_name`, `rq_alias` or `rq_id` can be provided.') + raise ValueError('Only one of `rq_id`, `rq_name` or `rq_alias` can be provided.') self.add_requests_calls.append( - AddRequestsKwargs(requests=requests, rq_name=rq_name, rq_alias=rq_alias, rq_id=rq_id, **kwargs) + AddRequestsKwargs(requests=requests, rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs) ) async def push_data( @@ -328,9 +330,9 @@ class AddRequestsFunction(Protocol): def __call__( self, requests: Sequence[str | Request], + rq_id: str | None = None, rq_name: str | None = None, rq_alias: str | None = None, - rq_id: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> Coroutine[None, None, None]: """Call dunder method. @@ -338,12 +340,12 @@ def __call__( Args: requests: Requests to be added to the `RequestManager` or, if `rq_name` or `rq_alias` is specified, to the corresponding `RequestQueue`. - rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_name`, `rq_alias` or `rq_id` can - be provided. - rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_alias`, `rq_name` or `rq_id` - can be provided. 
rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided. + rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` + can be provided. + rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` + can be provided. **kwargs: Additional keyword arguments. """ @@ -371,9 +373,9 @@ def __call__( label: str | None = None, user_data: dict[str, Any] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, + rq_id: str | None = None, rq_name: str | None = None, rq_alias: str | None = None, - rq_id: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> Coroutine[None, None, None]: ... @@ -382,9 +384,9 @@ def __call__( self, *, requests: Sequence[str | Request] | None = None, + rq_id: str | None = None, rq_name: str | None = None, rq_alias: str | None = None, - rq_id: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> Coroutine[None, None, None]: ... @@ -396,9 +398,9 @@ def __call__( user_data: dict[str, Any] | None = None, transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, requests: Sequence[str | Request] | None = None, + rq_id: str | None = None, rq_name: str | None = None, rq_alias: str | None = None, - rq_id: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> Coroutine[None, None, None]: """Call enqueue links function. @@ -417,12 +419,12 @@ def __call__( - `'unchanged'` to use the original request options without modification. requests: Requests to be added to the `RequestManager` or, if `rq_name` or `rq_alias` is specified, to the corresponding `RequestQueue`. - rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_name`, `rq_alias` or `rq_id` can - be provided. - rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_alias`, `rq_name` or `rq_id` - can be provided. rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided. + rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` + can be provided. + rq_alias: Alias of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` + can be provided. **kwargs: Additional keyword arguments. """ diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index b1a19f079a..1d384c0455 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -944,9 +944,9 @@ async def enqueue_links( transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None, requests: Sequence[str | Request] | None = None, + rq_id: str | None = None, rq_name: str | None = None, rq_alias: str | None = None, - rq_id: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> None: kwargs.setdefault('strategy', 'same-hostname') @@ -959,7 +959,7 @@ async def enqueue_links( ) # Add directly passed requests. await context.add_requests( - requests or list[str | Request](), rq_name=rq_name, rq_alias=rq_alias, rq_id=rq_id, **kwargs + requests or list[str | Request](), rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, **kwargs ) else: # Add requests from extracted links. 
@@ -970,9 +970,9 @@ async def enqueue_links( user_data=user_data, transform_request_function=transform_request_function, ), + rq_id=rq_id, rq_name=rq_name, rq_alias=rq_alias, - rq_id=rq_id, **kwargs, ) @@ -1254,13 +1254,13 @@ async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> origin = context.request.loaded_url or context.request.url for add_requests_call in result.add_requests_calls: + rq_id = add_requests_call.get('rq_id') rq_name = add_requests_call.get('rq_name') rq_alias = add_requests_call.get('rq_alias') - rq_id = add_requests_call.get('rq_id') - specified_params = sum(1 for param in [rq_name, rq_alias, rq_id] if param is not None) + specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None) if specified_params > 1: raise ValueError('You can only provide one of `rq_id`, `rq_name` or `rq_alias` arguments.') - if rq_name or rq_alias or rq_id: + if rq_id or rq_name or rq_alias: request_manager: RequestManager | RequestQueue = await RequestQueue.open( id=rq_id, name=rq_name, diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 2bba70a8af..6f4ccceb8d 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -1578,7 +1578,7 @@ async def test_add_requests_with_rq_param(queue_name: str | None, queue_alias: s @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: visit_urls.add(context.request.url) - await context.add_requests(check_requests, rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id) + await context.add_requests(check_requests, rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias) await crawler.run(['https://start.placeholder.com']) @@ -1600,18 +1600,18 @@ async def handler(context: BasicCrawlingContext) -> None: ], ) async def test_add_requests_error_with_multi_params( - queue_name: str | None, queue_alias: str | None, queue_id: str | None + queue_id: str | None, queue_name: str | None, queue_alias: str | None ) -> None: crawler = BasicCrawler() @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: - with pytest.raises(ValueError, match='Only one of `rq_name`, `rq_alias` or `rq_id` can be provided'): + with pytest.raises(ValueError, match='Only one of `rq_id`, `rq_name` or `rq_alias` can be provided'): await context.add_requests( [Request.from_url('https://a.placeholder.com')], + rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias, - rq_id=queue_id, ) await crawler.run(['https://start.placeholder.com']) diff --git a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py index 120e9a0b7b..ee08be5a32 100644 --- a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py +++ b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py @@ -226,7 +226,7 @@ async def test_enqueue_links_with_rq_param( @crawler.router.default_handler async def handler(context: BeautifulSoupCrawlingContext) -> None: visit_urls.add(context.request.url) - await context.enqueue_links(rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id) + await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias) await crawler.run([str(server_url / 'start_enqueue')]) @@ -282,7 +282,7 @@ async def handler(context: BeautifulSoupCrawlingContext) -> None: @pytest.mark.parametrize( - ('queue_name', 'queue_alias', 'queue_id'), + ('queue_id', 'queue_name',
'queue_alias'), [ pytest.param('named-queue', 'alias-queue', None, id='rq_name and rq_alias'), pytest.param('named-queue', None, 'id-queue', id='rq_name and rq_id'), @@ -291,13 +291,13 @@ async def handler(context: BeautifulSoupCrawlingContext) -> None: ], ) async def test_enqueue_links_error_with_multi_params( - server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None, queue_id: str | None + server_url: URL, http_client: HttpClient, queue_id: str | None, queue_name: str | None, queue_alias: str | None ) -> None: crawler = BeautifulSoupCrawler(http_client=http_client) @crawler.router.default_handler async def handler(context: BeautifulSoupCrawlingContext) -> None: with pytest.raises(ValueError, match='Cannot use both `rq_name` and `rq_alias`'): - await context.enqueue_links(rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id) + await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias) await crawler.run([str(server_url / 'start_enqueue')]) diff --git a/tests/unit/crawlers/_parsel/test_parsel_crawler.py b/tests/unit/crawlers/_parsel/test_parsel_crawler.py index c5c3d62c6e..7d06bbdd0e 100644 --- a/tests/unit/crawlers/_parsel/test_parsel_crawler.py +++ b/tests/unit/crawlers/_parsel/test_parsel_crawler.py @@ -320,7 +320,7 @@ async def test_enqueue_links_with_rq_param( @crawler.router.default_handler async def handler(context: ParselCrawlingContext) -> None: visit_urls.add(context.request.url) - await context.enqueue_links(rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id) + await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias) await crawler.run([str(server_url / 'start_enqueue')]) @@ -362,7 +362,7 @@ async def test_enqueue_links_requests_with_rq_param( async def handler(context: ParselCrawlingContext) -> None: visit_urls.add(context.request.url) await context.enqueue_links( - requests=check_requests, rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id, strategy='all' + requests=check_requests, rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias, strategy='all' ) await crawler.run([str(server_url / 'start_enqueue')]) @@ -376,7 +376,7 @@ async def handler(context: ParselCrawlingContext) -> None: @pytest.mark.parametrize( - ('queue_name', 'queue_alias', 'queue_id'), + ('queue_id', 'queue_name', 'queue_alias'), [ pytest.param('named-queue', 'alias-queue', None, id='rq_name and rq_alias'), pytest.param('named-queue', None, 'id-queue', id='rq_name and rq_id'), @@ -385,13 +385,13 @@ async def handler(context: ParselCrawlingContext) -> None: ], ) async def test_enqueue_links_error_with_multi_params( - server_url: URL, http_client: HttpClient, queue_name: str | None, queue_alias: str | None, queue_id: str | None + server_url: URL, http_client: HttpClient, queue_id: str | None, queue_name: str | None, queue_alias: str | None ) -> None: crawler = ParselCrawler(http_client=http_client) @crawler.router.default_handler async def handler(context: ParselCrawlingContext) -> None: with pytest.raises(ValueError, match='Cannot use both `rq_name` and `rq_alias`'): - await context.enqueue_links(rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id) + await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias) await crawler.run([str(server_url / 'start_enqueue')]) diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index 9f5ae5d58e..54b8765c8d 100644 --- 
a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -810,7 +810,7 @@ async def test_enqueue_links_with_rq_param( @crawler.router.default_handler async def handler(context: PlaywrightCrawlingContext) -> None: visit_urls.add(context.request.url) - await context.enqueue_links(rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id) + await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias) await crawler.run([str(server_url / 'start_enqueue')]) @@ -852,7 +852,7 @@ async def test_enqueue_links_requests_with_rq_param( async def handler(context: PlaywrightCrawlingContext) -> None: visit_urls.add(context.request.url) await context.enqueue_links( - requests=check_requests, rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id, strategy='all' + requests=check_requests, rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias, strategy='all' ) await crawler.run([str(server_url / 'start_enqueue')]) @@ -866,7 +866,7 @@ async def handler(context: PlaywrightCrawlingContext) -> None: @pytest.mark.parametrize( - ('queue_name', 'queue_alias', 'queue_id'), + ('queue_id', 'queue_name', 'queue_alias'), [ pytest.param('named-queue', 'alias-queue', None, id='rq_name and rq_alias'), pytest.param('named-queue', None, 'id-queue', id='rq_name and rq_id'), @@ -875,13 +875,13 @@ async def handler(context: PlaywrightCrawlingContext) -> None: ], ) async def test_enqueue_links_error_with_multi_params( - server_url: URL, queue_name: str | None, queue_alias: str | None, queue_id: str | None + server_url: URL, queue_id: str | None, queue_name: str | None, queue_alias: str | None ) -> None: crawler = PlaywrightCrawler() @crawler.router.default_handler async def handler(context: PlaywrightCrawlingContext) -> None: with pytest.raises(ValueError, match='Cannot use both `rq_name` and `rq_alias`'): - await context.enqueue_links(rq_name=queue_name, rq_alias=queue_alias, rq_id=queue_id) + await context.enqueue_links(rq_id=queue_id, rq_name=queue_name, rq_alias=queue_alias) await crawler.run([str(server_url / 'start_enqueue')]) From 67ffa6214077c7ef2e2db64c90ab039046bef8a3 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Sun, 21 Sep 2025 20:36:33 +0000 Subject: [PATCH 6/7] fix docs --- src/crawlee/_types.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index c7cb43aa63..684f64d02e 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -338,8 +338,7 @@ def __call__( """Call dunder method. Args: - requests: Requests to be added to the `RequestManager` or, if `rq_name` or `rq_alias` is specified, to - the corresponding `RequestQueue`. + requests: Requests to be added to the `RequestManager`. rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided. rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` @@ -417,8 +416,7 @@ def __call__( - Modified `RequestOptions` to update the request configuration, - `'skip'` to exclude the request from being enqueued, - `'unchanged'` to use the original request options without modification. - requests: Requests to be added to the `RequestManager` or, if `rq_name` or `rq_alias` is specified, to - the corresponding `RequestQueue`. + requests: Requests to be added to the `RequestManager`. rq_id: ID of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias` can be provided. 
             rq_name: Name of the `RequestQueue` to add the requests to. Only one of `rq_id`, `rq_name` or `rq_alias`

From 424ea7d00bd5019e6980d2a5ceabbb8f07e13fad Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Sun, 21 Sep 2025 20:46:54 +0000
Subject: [PATCH 7/7] drop rq after tests

---
 tests/unit/crawlers/_basic/test_basic_crawler.py           | 2 ++
 .../crawlers/_beautifulsoup/test_beautifulsoup_crawler.py  | 4 ++++
 tests/unit/crawlers/_parsel/test_parsel_crawler.py         | 4 ++++
 tests/unit/crawlers/_playwright/test_playwright_crawler.py | 4 ++++
 4 files changed, 14 insertions(+)

diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py
index 6f4ccceb8d..7f864afbd4 100644
--- a/tests/unit/crawlers/_basic/test_basic_crawler.py
+++ b/tests/unit/crawlers/_basic/test_basic_crawler.py
@@ -1589,6 +1589,8 @@ async def handler(context: BasicCrawlingContext) -> None:
     assert requests_from_queue == check_requests
     assert visit_urls == {'https://start.placeholder.com'}
 
+    await rq.drop()
+
 
 @pytest.mark.parametrize(
     ('queue_name', 'queue_alias', 'queue_id'),
diff --git a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
index ee08be5a32..efe58665dd 100644
--- a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
+++ b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
@@ -237,6 +237,8 @@ async def handler(context: BeautifulSoupCrawlingContext) -> None:
     assert set(requests_from_queue) == {str(server_url / 'page_1'), str(server_url / 'sub_index')}
     assert visit_urls == {str(server_url / 'start_enqueue')}
 
+    await rq.drop()
+
 
 @pytest.mark.parametrize(
     ('queue_name', 'queue_alias', 'by_id'),
@@ -280,6 +282,8 @@ async def handler(context: BeautifulSoupCrawlingContext) -> None:
     assert set(requests_from_queue) == set(check_requests)
     assert visit_urls == {str(server_url / 'start_enqueue')}
 
+    await rq.drop()
+
 
 @pytest.mark.parametrize(
     ('queue_id', 'queue_name', 'queue_alias'),
diff --git a/tests/unit/crawlers/_parsel/test_parsel_crawler.py b/tests/unit/crawlers/_parsel/test_parsel_crawler.py
index 7d06bbdd0e..5f74b7b262 100644
--- a/tests/unit/crawlers/_parsel/test_parsel_crawler.py
+++ b/tests/unit/crawlers/_parsel/test_parsel_crawler.py
@@ -331,6 +331,8 @@ async def handler(context: ParselCrawlingContext) -> None:
     assert set(requests_from_queue) == {str(server_url / 'page_1'), str(server_url / 'sub_index')}
     assert visit_urls == {str(server_url / 'start_enqueue')}
 
+    await rq.drop()
+
 
 @pytest.mark.parametrize(
     ('queue_name', 'queue_alias', 'by_id'),
@@ -374,6 +376,8 @@ async def handler(context: ParselCrawlingContext) -> None:
     assert set(requests_from_queue) == set(check_requests)
     assert visit_urls == {str(server_url / 'start_enqueue')}
 
+    await rq.drop()
+
 
 @pytest.mark.parametrize(
     ('queue_id', 'queue_name', 'queue_alias'),
diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py
index 54b8765c8d..0bde7d55fd 100644
--- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py
+++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py
@@ -821,6 +821,8 @@ async def handler(context: PlaywrightCrawlingContext) -> None:
     assert set(requests_from_queue) == {str(server_url / 'page_1'), str(server_url / 'sub_index')}
     assert visit_urls == {str(server_url / 'start_enqueue')}
 
+    await rq.drop()
+
 
 @pytest.mark.parametrize(
     ('queue_name', 'queue_alias', 'by_id'),
@@ -864,6 +866,8 @@ async def handler(context: PlaywrightCrawlingContext) -> None:
     assert set(requests_from_queue) == set(check_requests)
     assert visit_urls == {str(server_url / 'start_enqueue')}
 
+    await rq.drop()
+
 
 @pytest.mark.parametrize(
     ('queue_id', 'queue_name', 'queue_alias'),