From 90684d7103658946f26320f1d78a67b7f54927d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 15 Jun 2023 13:56:46 +0200 Subject: [PATCH 1/4] =?UTF-8?q?Remove=20ItemProvider=E2=80=99s=20Response?= =?UTF-8?q?=20dependency?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scrapy_poet/api.py | 9 ++++++++- scrapy_poet/page_input_providers.py | 5 +++-- tests/test_providers.py | 15 +++++++++++---- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/scrapy_poet/api.py b/scrapy_poet/api.py index 5a5b95db..f66fd2d9 100644 --- a/scrapy_poet/api.py +++ b/scrapy_poet/api.py @@ -27,7 +27,14 @@ def parse(self, response: DummyResponse): :class:`~.DummyResponse` to your parser instead. """ - def __init__(self, url: str, request=Optional[Request]): + def __init__(self, url: Optional[str] = None, request: Optional[Request] = None): + if url is None: + if request is None: + raise ValueError( + "One of the parameters, url or request, must have a " + "non-default value." + ) + url = request.url super().__init__(url=url, request=request) diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index ce390bed..daa226da 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -44,6 +44,7 @@ ) from web_poet.pages import is_injectable +from scrapy_poet.api import DummyResponse from scrapy_poet.downloader import create_scrapy_downloader from scrapy_poet.injection_errors import ( MalformedProvidedClassesError, @@ -365,7 +366,6 @@ async def __call__( self, to_provide: Set[Callable], request: Request, - response: Response, ) -> List[Any]: results = [] for cls in to_provide: @@ -392,9 +392,10 @@ async def __call__( externally_provided=self.injector.is_class_provided_by_any_provider, ) + dummy_response = DummyResponse(request=request) try: deferred_or_future = maybe_deferred_to_future( - self.injector.build_instances(request, response, plan) + self.injector.build_instances(request, dummy_response, plan) ) # RecursionError NOT raised when ``AsyncioSelectorReactor`` is used. # Could be related: https://github.com/python/cpython/issues/93837 diff --git a/tests/test_providers.py b/tests/test_providers.py index 5d018c77..c36ca257 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -248,13 +248,20 @@ def test_page_params_provider(settings): assert results[0] == expected_data -def test_item_provider_cache(settings): +def test_item_provider(settings): """Note that the bulk of the tests for the ``ItemProvider`` alongside the - ``Injector`` is tested in ``tests/test_web_poet_rules.py``. + ``Injector`` is tested in ``tests/test_web_poet_rules.py``.""" + crawler = get_crawler(Spider, settings) + injector = Injector(crawler) + provider = ItemProvider(injector) + request = scrapy.http.Request("https://example.com") - We'll only test its caching behavior here if its properly garbage collected. - """ + # The fact that no exception is raised below proves that a Response + # parameter is not required by ItemProvider. 
+ provider(set(), request) + +def test_item_provider_cache(settings): crawler = get_crawler(Spider, settings) injector = Injector(crawler) provider = ItemProvider(injector) From f4b6c743f02730fa048034bc60facf7276f621f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 8 Nov 2023 15:25:21 +0100 Subject: [PATCH 2/4] Fix ItemProvider test call --- tests/test_providers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_providers.py b/tests/test_providers.py index 43f8c02e..b900206d 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -238,7 +238,7 @@ def test_item_provider(settings): # The fact that no exception is raised below proves that a Response # parameter is not required by ItemProvider. - provider(set(), request) + provider(set(), request, {}) def test_item_provider_cache(settings): From cace09ceb1e651502263f66bee7b12a372689b9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 8 Nov 2023 17:03:41 +0100 Subject: [PATCH 3/4] WIP --- scrapy_poet/downloadermiddlewares.py | 6 ++- scrapy_poet/injection.py | 15 ++----- scrapy_poet/page_input_providers.py | 67 +++++++++++++++++++++++++--- tests/test_injection.py | 4 +- tests/test_providers.py | 42 ++++++++++++----- tests/test_scrapy_dependencies.py | 2 +- tests/test_web_poet_rules.py | 4 +- 7 files changed, 105 insertions(+), 35 deletions(-) diff --git a/scrapy_poet/downloadermiddlewares.py b/scrapy_poet/downloadermiddlewares.py index d7340f1c..3aa7be5f 100644 --- a/scrapy_poet/downloadermiddlewares.py +++ b/scrapy_poet/downloadermiddlewares.py @@ -18,9 +18,10 @@ from .page_input_providers import ( HttpClientProvider, HttpResponseProvider, - ItemProvider, PageParamsProvider, + RequestItemProvider, RequestUrlProvider, + ResponseItemProvider, ResponseUrlProvider, ) from .utils import create_registry_instance, is_min_scrapy_version @@ -34,7 +35,8 @@ PageParamsProvider: 700, RequestUrlProvider: 800, ResponseUrlProvider: 900, - ItemProvider: 2000, + RequestItemProvider: 2000, + ResponseItemProvider: 2100, } InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware") diff --git a/scrapy_poet/injection.py b/scrapy_poet/injection.py index 92de6a12..84504924 100644 --- a/scrapy_poet/injection.py +++ b/scrapy_poet/injection.py @@ -29,7 +29,10 @@ NonCallableProviderError, UndeclaredProvidedTypeError, ) -from scrapy_poet.page_input_providers import PageObjectInputProvider +from scrapy_poet.page_input_providers import ( + SCRAPY_PROVIDED_CLASSES, + PageObjectInputProvider, +) from scrapy_poet.utils import is_min_scrapy_version from .utils import create_registry_instance, get_scrapy_data_path @@ -394,16 +397,6 @@ def is_callback_requiring_scrapy_response( return True -SCRAPY_PROVIDED_CLASSES = { - Spider, - Request, - Response, - Crawler, - Settings, - StatsCollector, -} - - def is_provider_requiring_scrapy_response(provider): """Check whether injectable provider makes use of a valid :class:`scrapy.http.Response`. 
diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index 6a71eb8f..a6fc5394 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -16,9 +16,11 @@ from weakref import WeakKeyDictionary import andi -from scrapy import Request +from scrapy import Request, Spider from scrapy.crawler import Crawler from scrapy.http import Response +from scrapy.settings import Settings +from scrapy.statscollectors import StatsCollector from scrapy.utils.defer import maybe_deferred_to_future from web_poet import ( HttpClient, @@ -39,6 +41,15 @@ ProviderDependencyDeadlockError, ) +SCRAPY_PROVIDED_CLASSES = { + Spider, + Request, + Response, + Crawler, + Settings, + StatsCollector, +} + class PageObjectInputProvider: """ @@ -229,7 +240,7 @@ def __call__(self, to_provide: Set[Callable], response: Response): return [ResponseUrl(url=response.url)] -class ItemProvider(PageObjectInputProvider): +class ResponseItemProvider(PageObjectInputProvider): name = "item" template_deadlock_msg = ( @@ -241,6 +252,7 @@ class ItemProvider(PageObjectInputProvider): def __init__(self, injector): super().__init__(injector) + self._injector = injector self.registry = self.injector.registry # The key that's used here is the ``scrapy.Request`` instance to ensure @@ -256,11 +268,32 @@ def __init__(self, injector): # Similar to ``_cached_instances`` above, the key is ``scrapy.Request``. self._build_instances_call_counter = WeakKeyDictionary() + def _requires_scrapy_response(self, injectable): + plan = andi.plan( + injectable, + is_injectable=is_injectable, + externally_provided=SCRAPY_PROVIDED_CLASSES, + ) + for dependency, _ in plan.dependencies: + for provider in self._injector.providers: + if provider.is_provided(dependency): + if self._injector.is_provider_requiring_scrapy_response[provider]: + return True + continue + raise ValueError(f"{injectable}: {plan.dependencies}") + # Gives ValueError: : [], why is HttpResponse not detected?! + return False + def provided_classes(self, cls): """If the item is in any of the ``to_return`` in the rules, then it can be provided by using the corresponding page object in ``use``. """ - return isclass(cls) and self.registry.search(to_return=cls) + if not isclass(cls): + return False + rules = self.registry.search(to_return=cls) + if not rules: + return False + return self._requires_scrapy_response(rules[0].use) def update_cache(self, request: Request, mapping: Dict[Type, Any]) -> None: if request not in self._cached_instances: @@ -288,6 +321,7 @@ async def __call__( self, to_provide: Set[Callable], request: Request, + response: Response, prev_instances: Dict, ) -> List[Any]: results = [] @@ -314,11 +348,10 @@ async def __call__( externally_provided=self.injector.is_class_provided_by_any_provider, ) - dummy_response = DummyResponse(request=request) try: deferred_or_future = maybe_deferred_to_future( self.injector.build_instances( - request, dummy_response, plan, prev_instances + request, response, plan, prev_instances ) ) # RecursionError NOT raised when ``AsyncioSelectorReactor`` is used. 
@@ -348,6 +381,30 @@ async def __call__(
         return results
 
 
+class RequestItemProvider(ResponseItemProvider):
+    async def __call__(
+        self,
+        to_provide: Set[Callable],
+        request: Request,
+        prev_instances: Dict,
+    ) -> List[Any]:
+        response = DummyResponse(request=request)
+        return await super().__call__(
+            to_provide=to_provide,
+            request=request,
+            response=response,
+            prev_instances=prev_instances,
+        )
+
+    def provided_classes(self, cls):
+        if not isclass(cls):
+            return False
+        rules = self.registry.search(to_return=cls)
+        if not rules:
+            return False
+        return not self._requires_scrapy_response(rules[0].use)
+
+
 class ScrapyPoetStatCollector(StatCollector):
     def __init__(self, stats):
         self._stats = stats
diff --git a/tests/test_injection.py b/tests/test_injection.py
index 9a0f0eba..21e9d0b8 100644
--- a/tests/test_injection.py
+++ b/tests/test_injection.py
@@ -343,7 +343,7 @@ class TestInjectorStats:
             ),
             (
                 {"item": TestItem},
-                set(),  # there must be no stats as ItemProvider is not enabled
+                set(),  # there must be no stats as item providers are not enabled
             ),
         ),
     )
@@ -369,7 +369,7 @@ def callback_factory():
     def test_po_provided_via_item(self, injector):
         rules = [ApplyRule(Patterns(include=()), use=TestItemPage, to_return=TestItem)]
         registry = RulesRegistry(rules=rules)
-        providers = {"scrapy_poet.page_input_providers.ItemProvider": 10}
+        providers = {"scrapy_poet.page_input_providers.ResponseItemProvider": 10}
         injector = get_injector_for_testing(providers, registry=registry)
 
         def callback(response: DummyResponse, item: TestItem):
diff --git a/tests/test_providers.py b/tests/test_providers.py
index b900206d..e107817d 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -8,16 +8,18 @@
 from scrapy.settings import Settings
 from scrapy.utils.test import get_crawler
 from twisted.python.failure import Failure
-from web_poet import HttpClient, HttpResponse
+from web_poet import HttpClient, HttpResponse, ItemPage, WebPage
+from web_poet.rules import ApplyRule, RulesRegistry
 from web_poet.serialization import SerializedLeafData, register_serialization
 
 from scrapy_poet import HttpResponseProvider
 from scrapy_poet.injection import Injector
 from scrapy_poet.page_input_providers import (
     HttpClientProvider,
-    ItemProvider,
     PageObjectInputProvider,
     PageParamsProvider,
+    RequestItemProvider,
+    ResponseItemProvider,
     StatsProvider,
 )
 from scrapy_poet.utils.mockserver import get_ephemeral_port
@@ -228,23 +230,39 @@ def test_page_params_provider(settings):
     assert results[0] == expected_data
 
 
-def test_item_provider(settings):
-    """Note that the bulk of the tests for the ``ItemProvider`` alongside the
-    ``Injector`` is tested in ``tests/test_web_poet_rules.py``."""
+def test_item_providers(settings):
+    """We have 2 item providers, one that does not require a Scrapy response
+    and one that does, and only one of them should be used for any given
+    injectable based on whether or not its dependency tree requires a Scrapy
+    response."""
+
+    class RequestItem:
+        pass
+
+    class ResponseItem:
+        pass
+
     crawler = get_crawler(Spider, settings)
-    injector = Injector(crawler)
-    provider = ItemProvider(injector)
-    request = scrapy.http.Request("https://example.com")
+    request_rule = ApplyRule(for_patterns="*", use=ItemPage, to_return=RequestItem)
+    response_rule = ApplyRule(for_patterns="*", use=WebPage, to_return=ResponseItem)
+    registry = RulesRegistry(rules=[request_rule, response_rule])
+    injector = Injector(crawler, registry=registry)
+
+    request_provider = RequestItemProvider(injector)
+    # assert request_provider.provided_classes(RequestItem) is True
+    assert request_provider.provided_classes(ResponseItem) is False
+
+    response_provider = ResponseItemProvider(injector)
+    assert response_provider.provided_classes(RequestItem) is False
+    assert response_provider.provided_classes(ResponseItem) is True
 
-    # The fact that no exception is raised below proves that a Response
-    # parameter is not required by ItemProvider.
-    provider(set(), request, {})
+    # TODO: Test chains, i.e. (request item | response item) → page → (request item | response item).
 
 
 def test_item_provider_cache(settings):
     crawler = get_crawler(Spider, settings)
     injector = Injector(crawler)
-    provider = ItemProvider(injector)
+    provider = ResponseItemProvider(injector)
 
     assert len(provider._cached_instances) == 0
diff --git a/tests/test_scrapy_dependencies.py b/tests/test_scrapy_dependencies.py
index 186d9765..4549efb2 100644
--- a/tests/test_scrapy_dependencies.py
+++ b/tests/test_scrapy_dependencies.py
@@ -5,8 +5,8 @@
 from scrapy.http import Request
 from web_poet.pages import WebPage
 
-from scrapy_poet.injection import SCRAPY_PROVIDED_CLASSES
 from scrapy_poet.page_input_providers import (
+    SCRAPY_PROVIDED_CLASSES,
     HttpResponseProvider,
     PageObjectInputProvider,
 )
diff --git a/tests/test_web_poet_rules.py b/tests/test_web_poet_rules.py
index afab22d2..292352cd 100644
--- a/tests/test_web_poet_rules.py
+++ b/tests/test_web_poet_rules.py
@@ -74,7 +74,7 @@ class PageObjectCounterMixin:
     to produce the same item. This is extremely wasteful should there be any
     additional requests used to produce the item.
 
-    ``ItemProvider`` should cache up such instances and prevent them from being
+    Item providers should cache such instances and prevent them from being
     built again.
     """
 
@@ -432,7 +432,7 @@ async def name(self) -> str:
 )
 @inlineCallbacks
 def test_item_using_asyncio() -> None:
-    """This ensures that ``ItemProvider`` works properly for page objects using
+    """This ensures that item providers work properly for page objects using
     the ``asyncio`` functionalities.
     """
     item, deps = yield crawl_item_and_deps(DelayedProduct)

From b41b3062339f035d5d126b5cb0a763053f1ef765 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Wed, 8 Nov 2023 17:43:08 +0100
Subject: [PATCH 4/4] WIP

---
 scrapy_poet/injection.py            | 23 ++++++++++++++---------
 scrapy_poet/page_input_providers.py | 23 +++++++++++++----------
 tests/test_injection.py             |  7 ++++---
 tests/test_providers.py             | 13 ++++++++-----
 4 files changed, 39 insertions(+), 27 deletions(-)

diff --git a/scrapy_poet/injection.py b/scrapy_poet/injection.py
index 84504924..9bfdeb77 100644
--- a/scrapy_poet/injection.py
+++ b/scrapy_poet/injection.py
@@ -3,6 +3,7 @@
 import os
 import pprint
 import warnings
+from functools import partial
 from typing import Any, Callable, Dict, List, Mapping, Optional, Set, cast
 
 import andi
@@ -120,7 +121,7 @@ def discover_callback_providers(
         result = set()
         for cls, _ in plan:
             for provider in self.providers:
-                if provider.is_provided(cls):
+                if provider.is_provided(cls, request):
                     result.add(provider)
         return result
 
@@ -146,7 +147,9 @@ def build_plan(self, request: Request) -> andi.Plan:
         return andi.plan(
             callback,
             is_injectable=is_injectable,
-            externally_provided=self.is_class_provided_by_any_provider,
+            externally_provided=partial(
+                self.is_class_provided_by_any_provider, request=request
+            ),
             # Ignore the type since andi.plan expects overrides to be
             # Callable[[Callable], Optional[Callable]] but the registry
             # returns the typing for ``dict.get()`` method.
@@ -196,7 +199,7 @@ objs: List[Any]
         for provider in self.providers:
             provided_classes = {
-                cls for cls in dependencies_set if provider.is_provided(cls)
+                cls for cls in dependencies_set if provider.is_provided(cls, request)
             }
 
             # ignore already provided types if provider doesn't need to use them
@@ -304,7 +307,7 @@ def check_all_providers_are_callable(providers):
 
 def is_class_provided_by_any_provider_fn(
     providers: List[PageObjectInputProvider],
-) -> Callable[[Callable], bool]:
+) -> Callable[[Callable, Request], bool]:
     """
-    Return a function of type ``Callable[[Type], bool]`` that return True
+    Return a function of type ``Callable[[Type, Request], bool]`` that returns True
     if the given type is provided by any of the registered providers.
@@ -314,9 +317,11 @@ def is_class_provided_by_any_provider_fn(
     joined together for efficiency.
     """
     sets_of_types: Set[Callable] = set()  # caching all sets found
-    individual_is_callable: List[Callable[[Callable], bool]] = [
-        sets_of_types.__contains__
-    ]
+
+    def in_set(type, request):
+        return type in sets_of_types
+
+    individual_is_callable: List[Callable[[Callable, Request], bool]] = [in_set]
 
     for provider in providers:
         provided_classes = provider.provided_classes
@@ -331,9 +336,9 @@ def is_class_provided_by_any_provider_fn(
                 f"or 'callable'"
             )
 
-    def is_provided_fn(type: Callable) -> bool:
+    def is_provided_fn(type: Callable, request: Request) -> bool:
         for is_provided in individual_is_callable:
-            if is_provided(type):
+            if is_provided(type, request):
                 return True
         return False
 
diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py
index a6fc5394..f5fc4fb8 100644
--- a/scrapy_poet/page_input_providers.py
+++ b/scrapy_poet/page_input_providers.py
@@ -10,6 +10,7 @@
 """
 import asyncio
 from dataclasses import make_dataclass
+from functools import partial
 from inspect import isclass
 from typing import Any, Callable, ClassVar, Dict, List, Optional, Set, Type, Union
 from warnings import warn
@@ -124,7 +125,7 @@ def __call__(self, to_provide, response: Response):
     # these previous instances before returning them to the Injector.
     allow_prev_instances: bool = False
 
-    def is_provided(self, type_: Callable) -> bool:
+    def is_provided(self, type_: Callable, request: Request) -> bool:
         """
         Return ``True`` if the given type is provided by this provider based
         on the value of the attribute ``provided_classes``
         """
         if isinstance(self.provided_classes, Set):
             return type_ in self.provided_classes
         elif callable(self.provided_classes):
-            return self.provided_classes(type_)
+            return self.provided_classes(type_, request)
         else:
             raise MalformedProvidedClassesError(
                 f"Unexpected type {type_!r} for 'provided_classes' attribute of"
@@ -268,11 +269,14 @@ def __init__(self, injector):
 
         # Similar to ``_cached_instances`` above, the key is ``scrapy.Request``.
self._build_instances_call_counter = WeakKeyDictionary() - def _requires_scrapy_response(self, injectable): + def _requires_scrapy_response(self, injectable, request): plan = andi.plan( injectable, is_injectable=is_injectable, - externally_provided=SCRAPY_PROVIDED_CLASSES, + externally_provided=partial( + self._injector.is_class_provided_by_any_provider, request=request + ), + overrides=self._injector.registry.overrides_for(request.url).get, # type: ignore[arg-type] ) for dependency, _ in plan.dependencies: for provider in self._injector.providers: @@ -280,11 +284,10 @@ def _requires_scrapy_response(self, injectable): if self._injector.is_provider_requiring_scrapy_response[provider]: return True continue - raise ValueError(f"{injectable}: {plan.dependencies}") - # Gives ValueError: : [], why is HttpResponse not detected?! + return False - def provided_classes(self, cls): + def provided_classes(self, cls, request): """If the item is in any of the ``to_return`` in the rules, then it can be provided by using the corresponding page object in ``use``. """ @@ -293,7 +296,7 @@ def provided_classes(self, cls): rules = self.registry.search(to_return=cls) if not rules: return False - return self._requires_scrapy_response(rules[0].use) + return self._requires_scrapy_response(rules[0].use, request) def update_cache(self, request: Request, mapping: Dict[Type, Any]) -> None: if request not in self._cached_instances: @@ -396,13 +399,13 @@ async def __call__( prev_instances=prev_instances, ) - def provided_classes(self, cls): + def provided_classes(self, cls, request): if not isclass(cls): return False rules = self.registry.search(to_return=cls) if not rules: return False - return not self._requires_scrapy_response(rules[0].use) + return not self._requires_scrapy_response(rules[0].use, request) class ScrapyPoetStatCollector(StatCollector): diff --git a/tests/test_injection.py b/tests/test_injection.py index 21e9d0b8..814dccf6 100644 --- a/tests/test_injection.py +++ b/tests/test_injection.py @@ -101,9 +101,10 @@ class WrapCls(Injectable): class TestInjector: def test_constructor(self): injector = get_injector_for_testing(get_providers_for_testing()) - assert injector.is_class_provided_by_any_provider(ClsReqResponse) - assert injector.is_class_provided_by_any_provider(Cls1) - assert not injector.is_class_provided_by_any_provider(ClsNoProvided) + request = Request("https://example.com") + assert injector.is_class_provided_by_any_provider(ClsReqResponse, request) + assert injector.is_class_provided_by_any_provider(Cls1, request) + assert not injector.is_class_provided_by_any_provider(ClsNoProvided, request) for provider in injector.providers: assert ( diff --git a/tests/test_providers.py b/tests/test_providers.py index e107817d..ba4a5632 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -13,6 +13,7 @@ from web_poet.serialization import SerializedLeafData, register_serialization from scrapy_poet import HttpResponseProvider +from scrapy_poet.downloadermiddlewares import DEFAULT_PROVIDERS from scrapy_poet.injection import Injector from scrapy_poet.page_input_providers import ( HttpClientProvider, @@ -246,17 +247,19 @@ class ResponseItem: request_rule = ApplyRule(for_patterns="*", use=ItemPage, to_return=RequestItem) response_rule = ApplyRule(for_patterns="*", use=WebPage, to_return=ResponseItem) registry = RulesRegistry(rules=[request_rule, response_rule]) - injector = Injector(crawler, registry=registry) + injector = Injector(crawler, registry=registry, default_providers=DEFAULT_PROVIDERS) 
+    request = scrapy.http.Request("https://example.com")
 
     request_provider = RequestItemProvider(injector)
-    # assert request_provider.provided_classes(RequestItem) is True
-    assert request_provider.provided_classes(ResponseItem) is False
+    assert request_provider.provided_classes(RequestItem, request) is True
+    assert request_provider.provided_classes(ResponseItem, request) is False
 
     response_provider = ResponseItemProvider(injector)
-    assert response_provider.provided_classes(RequestItem) is False
-    assert response_provider.provided_classes(ResponseItem) is True
+    assert response_provider.provided_classes(RequestItem, request) is False
+    assert response_provider.provided_classes(ResponseItem, request) is True
 
     # TODO: Test chains, i.e. (request item | response item) → page → (request item | response item).
+    # TODO: Test overrides that change which item provider is used for one URL but not for another.
 
 
 def test_item_provider_cache(settings):
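
To make the intent of the RequestItemProvider / ResponseItemProvider split
concrete, here is a minimal usage sketch. It assumes this WIP series is
applied and that scrapy-poet is enabled in the project settings; the
Breadcrumb item, BreadcrumbPage page object, and ExampleSpider are
hypothetical names used only for illustration, not part of the patches.
Because BreadcrumbPage depends only on RequestUrl, which is known before
downloading, its item should be matched by RequestItemProvider, while an
item backed by a page object needing HttpResponse would go through
ResponseItemProvider instead.

    import attrs
    import scrapy
    from web_poet import ItemPage, RequestUrl, field, handle_urls

    from scrapy_poet import DummyResponse


    @attrs.define
    class Breadcrumb:  # hypothetical item class
        url: str


    @handle_urls("example.com")
    @attrs.define
    class BreadcrumbPage(ItemPage[Breadcrumb]):
        # RequestUrl is known before the download happens, so this page
        # object's dependency tree does not require a Scrapy response.
        request_url: RequestUrl

        @field
        def url(self) -> str:
            return str(self.request_url)


    class ExampleSpider(scrapy.Spider):
        name = "example"
        start_urls = ["https://example.com"]

        def parse(self, response: DummyResponse, item: Breadcrumb):
            # With this series applied, Breadcrumb is served by
            # RequestItemProvider, and since the callback annotates its
            # response as DummyResponse, scrapy-poet should be able to keep
            # its existing optimization of skipping the download entirely.
            yield {"breadcrumb_url": item.url}

The per-request provided_classes/is_provided signatures introduced in PATCH
4/4 are what make this possible: whether an item class is "provided" by the
request-based or the response-based provider can now depend on the rule
(and any overrides) that apply to the specific request URL.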