diff --git a/scrapy_poet/commands.py b/scrapy_poet/commands.py
index 033f7718..e64ee544 100644
--- a/scrapy_poet/commands.py
+++ b/scrapy_poet/commands.py
@@ -1,7 +1,7 @@
 import datetime
 import logging
 from pathlib import Path
-from typing import Dict, Optional, Type
+from typing import Optional, Type

 import andi
 import scrapy
@@ -38,10 +38,9 @@ def build_instances_from_providers(
         request: Request,
         response: Response,
         plan: andi.Plan,
-        prev_instances: Optional[Dict] = None,
     ):
         instances = yield super().build_instances_from_providers(
-            request, response, plan, prev_instances
+            request, response, plan
         )
         if request.meta.get("savefixture", False):
             saved_dependencies.extend(instances.values())
diff --git a/scrapy_poet/downloadermiddlewares.py b/scrapy_poet/downloadermiddlewares.py
index 9e9e5f5f..8f969963 100644
--- a/scrapy_poet/downloadermiddlewares.py
+++ b/scrapy_poet/downloadermiddlewares.py
@@ -18,7 +18,6 @@ from .page_input_providers import (
     HttpClientProvider,
     HttpResponseProvider,
-    ItemProvider,
     PageParamsProvider,
     RequestUrlProvider,
     ResponseUrlProvider,
     StatsProvider,
@@ -36,7 +35,6 @@
     RequestUrlProvider: 800,
     ResponseUrlProvider: 900,
     StatsProvider: 1000,
-    ItemProvider: 2000,
 }

 InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware")
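With ItemProvider gone from DEFAULT_PROVIDERS, projects that listed it in their own
settings can drop the entry. A minimal sketch of the corresponding (hypothetical)
project settings change; Scrapy's build_component_list, which scrapy-poet uses to
assemble the provider list, skips entries whose value is None:

    # settings.py of a hypothetical project that had pinned ItemProvider
    SCRAPY_POET_PROVIDERS = {
        # Remove this entry, or disable it explicitly with None:
        "scrapy_poet.page_input_providers.ItemProvider": None,
    }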
+ """ + + @functools.lru_cache(maxsize=None) # to minimize the registry queries + def mapping_fn(item_cls: Callable) -> Optional[Callable]: + page_object_cls: Optional[Type[ItemPage]] = self.registry.page_cls_for_item( + request.url, cast(type, item_cls) + ) + if not page_object_cls: + return None + + async def item_factory(page: page_object_cls) -> item_cls: # type: ignore[valid-type] + return await page.to_item() # type: ignore[attr-defined] + + return item_factory + + return mapping_fn + @inlineCallbacks def build_instances( self, request: Request, response: Response, plan: andi.Plan, - prev_instances: Optional[Dict] = None, ): """Build the instances dict from a plan including external dependencies.""" # First we build the external dependencies using the providers @@ -163,14 +188,20 @@ def build_instances( request, response, plan, - prev_instances, ) # All the remaining dependencies are internal so they can be built just # following the andi plan. for cls, kwargs_spec in plan.dependencies: if cls not in instances.keys(): - instances[cls] = cls(**kwargs_spec.kwargs(instances)) - cls_fqn = get_fq_class_name(cast(type, cls)) + result_cls: type = cast(type, cls) + if isinstance(cls, andi.CustomBuilder): + result_cls = cls.result_class_or_fn + instances[result_cls] = yield deferred_from_coro( + cls.factory(**kwargs_spec.kwargs(instances)) + ) + else: + instances[result_cls] = cls(**kwargs_spec.kwargs(instances)) + cls_fqn = get_fq_class_name(result_cls) self.crawler.stats.inc_value(f"poet/injector/{cls_fqn}") return instances @@ -181,10 +212,9 @@ def build_instances_from_providers( request: Request, response: Response, plan: andi.Plan, - prev_instances: Optional[Dict] = None, ): """Build dependencies handled by registered providers""" - instances: Dict[Callable, Any] = prev_instances or {} + instances: Dict[Callable, Any] = {} scrapy_provided_dependencies = self.available_dependencies_for_providers( request, response ) @@ -194,22 +224,11 @@ def build_instances_from_providers( provided_classes = { cls for cls in dependencies_set if provider.is_provided(cls) } - - # ignore already provided types if provider doesn't need to use them - if not provider.allow_prev_instances: - provided_classes -= instances.keys() + provided_classes -= instances.keys() # ignore already provided types if not provided_classes: continue - # If dependency instances were already made by previously invoked - # providers, don't try to build them again since it may result in - # incorrect values (e.g. PO modifying an item > 2 times). - required_deps = set(plan.dependencies[-1][1].values()) - built_deps = set(instances.keys()) - if required_deps and required_deps == built_deps: - continue - objs, fingerprint = [], None cache_hit = False if self.cache: @@ -245,8 +264,6 @@ def build_instances_from_providers( externally_provided=scrapy_provided_dependencies, full_final_kwargs=False, ).final_kwargs(scrapy_provided_dependencies) - if provider.allow_prev_instances: - kwargs.update({"prev_instances": instances}) try: # Invoke the provider to get the data objs = yield maybeDeferred_coro( diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index f81c0651..d5d70fd0 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -8,29 +8,12 @@ different providers in order to acquire data from multiple external sources, for example, from scrapy-playwright or from an API for automatic extraction. 
""" -import asyncio -from dataclasses import make_dataclass -from inspect import isclass -from typing import ( - Any, - Callable, - ClassVar, - Dict, - FrozenSet, - List, - Optional, - Set, - Type, - Union, -) +from typing import Any, Callable, ClassVar, FrozenSet, List, Set, Union from warnings import warn -from weakref import WeakKeyDictionary -import andi from scrapy import Request from scrapy.crawler import Crawler from scrapy.http import Response -from scrapy.utils.defer import maybe_deferred_to_future from web_poet import ( HttpClient, HttpRequest, @@ -43,13 +26,9 @@ Stats, ) from web_poet.page_inputs.stats import StatCollector, StatNum -from web_poet.pages import is_injectable from scrapy_poet.downloader import create_scrapy_downloader -from scrapy_poet.injection_errors import ( - MalformedProvidedClassesError, - ProviderDependencyDeadlockError, -) +from scrapy_poet.injection_errors import MalformedProvidedClassesError class PageObjectInputProvider: @@ -119,12 +98,6 @@ def __call__(self, to_provide, response: Response): provided_classes: Union[Set[Callable], Callable[[Callable], bool]] name: ClassVar[str] = "" # It must be a unique name. Used by the cache mechanism - # If set to True, the Injector will not skip the Provider when the dependency has - # been built. Instead, the Injector will pass the previously built instances (by - # the other providers) to the Provider. The Provider can then choose to modify - # these previous instances before returning them to the Injector. - allow_prev_instances: bool = False - def is_provided(self, type_: Callable) -> bool: """ Return ``True`` if the given type is provided by this provider based @@ -265,122 +238,21 @@ def __call__(self, to_provide: Set[Callable], response: Response): class ItemProvider(PageObjectInputProvider): + provided_classes = set() name = "item" - template_deadlock_msg = ( - "Deadlock detected! A loop has been detected to " - "trying to resolve this plan: {plan}" - ) - - allow_prev_instances: bool = True - def __init__(self, injector): super().__init__(injector) - self.registry = self.injector.registry - - # The key that's used here is the ``scrapy.Request`` instance to ensure - # that the cached instances under it are properly garbage collected - # after processing such request. - self._cached_instances = WeakKeyDictionary() - - # This is only used when the reactor is ``AsyncioSelectorReactor`` since - # the ``asyncio.Future`` that it uses doesn't trigger a RecursionError - # unlike Twisted's Deferred. So we use this as a soft-proxy to recursion - # depth to check how many calls to ``self.injector.build_instances`` are - # made. - # Similar to ``_cached_instances`` above, the key is ``scrapy.Request``. - self._build_instances_call_counter = WeakKeyDictionary() - - def provided_classes(self, cls): - """If the item is in any of the ``to_return`` in the rules, then it can - be provided by using the corresponding page object in ``use``. 
- """ - return isclass(cls) and self.registry.search(to_return=cls) - - def update_cache(self, request: Request, mapping: Dict[Type, Any]) -> None: - if request not in self._cached_instances: - self._cached_instances[request] = {} - self._cached_instances[request].update(mapping) - - def get_from_cache(self, request: Request, cls: Callable) -> Optional[Any]: - return self._cached_instances.get(request, {}).get(cls) - - def check_if_deadlock(self, request: Request) -> bool: - """Should only be used when ``AsyncioSelectorReactor`` is the reactor.""" - if request not in self._build_instances_call_counter: - self._build_instances_call_counter[request] = 0 - self._build_instances_call_counter[request] += 1 - - # If there are more than 100 calls to ``injector.build_instances()`` - # for a given request, it might be a deadlock. This limit is large - # enough since the dependency tree for item dependencies needing page - # objects and/or items wouldn't reach this far. - if self._build_instances_call_counter[request] > 100: - return True - return False + msg = "The ItemProvider now does nothing and you should disable it." + warn(msg, DeprecationWarning, stacklevel=2) async def __call__( self, to_provide: Set[Callable], request: Request, response: Response, - prev_instances: Dict, ) -> List[Any]: - results = [] - for cls in to_provide: - if item := self.get_from_cache(request, cls): - results.append(item) - continue - - page_object_cls = self.registry.page_cls_for_item(request.url, cls) - if not page_object_cls: - warn( - f"Can't find appropriate page object for {cls} item for " - f"url: '{request.url}'. Check the ApplyRules you're using." - ) - continue - - # https://github.com/scrapinghub/andi/issues/23#issuecomment-1331682180 - fake_call_signature = make_dataclass( - "FakeCallSignature", [("page_object", page_object_cls)] - ) - plan = andi.plan( - fake_call_signature, - is_injectable=is_injectable, - externally_provided=self.injector.is_class_provided_by_any_provider, - ) - - try: - deferred_or_future = maybe_deferred_to_future( - self.injector.build_instances( - request, response, plan, prev_instances - ) - ) - # RecursionError NOT raised when ``AsyncioSelectorReactor`` is used. - # Could be related: https://github.com/python/cpython/issues/93837 - - # Need to check before awaiting on the ``asyncio.Future`` - # before it gets stuck on a potential deadlock. 
-                if asyncio.isfuture(deferred_or_future):
-                    if self.check_if_deadlock(request):
-                        raise ProviderDependencyDeadlockError(
-                            self.template_deadlock_msg.format(plan=plan)
-                        )
-
-                po_instances = await deferred_or_future
-            except RecursionError:
-                raise ProviderDependencyDeadlockError(
-                    self.template_deadlock_msg.format(plan=plan)
-                )
-
-            page_object = po_instances[page_object_cls]
-            item = await page_object.to_item()
-
-            self.update_cache(request, po_instances)
-            self.update_cache(request, {type(item): item})
-
-            results.append(item)
-        return results
+        return []


 class ScrapyPoetStatCollector(StatCollector):
diff --git a/setup.py b/setup.py
index e2726246..0db70547 100755
--- a/setup.py
+++ b/setup.py
@@ -21,7 +21,7 @@
     package_data={"scrapy_poet": ["VERSION"]},
     python_requires=">=3.8",
     install_requires=[
-        "andi >= 0.5.0",
+        "andi >= 0.6.0",
         "attrs >= 21.3.0",
         "parsel >= 1.5.0",
         "scrapy >= 2.6.0",
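The andi floor moves to 0.6.0 because custom builders first appeared there. The
deprecated ItemProvider is now inert, which can be observed directly; a sketch
mirroring the new test in test_providers.py further down:

    import warnings

    from scrapy import Spider
    from scrapy.utils.test import get_crawler

    from scrapy_poet.injection import Injector
    from scrapy_poet.page_input_providers import ItemProvider

    crawler = get_crawler(Spider)
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        provider = ItemProvider(Injector(crawler))

    assert any(issubclass(w.category, DeprecationWarning) for w in caught)
    assert provider.provided_classes == set()  # it no longer provides anything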
diff --git a/tests/test_injection.py b/tests/test_injection.py
index 730d3541..65690173 100644
--- a/tests/test_injection.py
+++ b/tests/test_injection.py
@@ -11,7 +11,7 @@
 from scrapy.http import Response
 from url_matcher import Patterns
 from url_matcher.util import get_domain
-from web_poet import Injectable, ItemPage, RulesRegistry
+from web_poet import Injectable, ItemPage, RulesRegistry, field
 from web_poet.mixins import ResponseShortcutsMixin
 from web_poet.rules import ApplyRule

@@ -463,6 +463,74 @@ def callback(
         with pytest.raises(ValueError, match="Different instances of Cls1 requested"):
             yield from injector.build_instances(request, response, plan)

+    @inlineCallbacks
+    def test_build_callback_dependencies_minimize_provider_calls(self):
+        """Test that build_callback_dependencies does not call any given
+        provider more times than it needs when one provided class is requested
+        directly while another is a page object dependency requested through
+        an item."""
+
+        class ExpensiveDependency1:
+            pass
+
+        class ExpensiveDependency2:
+            pass
+
+        class ExpensiveProvider(PageObjectInputProvider):
+            provided_classes = {ExpensiveDependency1, ExpensiveDependency2}
+
+            def __init__(self, injector):
+                super().__init__(injector)
+                self.call_count = 0
+
+            def __call__(self, to_provide):
+                self.call_count += 1
+                if self.call_count > 1:
+                    raise RuntimeError(
+                        "The expensive dependency provider has been called "
+                        "more than once."
+                    )
+                return [cls() for cls in to_provide]
+
+        @attr.define
+        class MyItem(Injectable):
+            exp: ExpensiveDependency2
+            i: int
+
+        @attr.define
+        class MyPage(ItemPage[MyItem]):
+            expensive: ExpensiveDependency2
+
+            @field
+            def i(self):
+                return 42
+
+            @field
+            def exp(self):
+                return self.expensive
+
+        def callback(
+            expensive: ExpensiveDependency1,
+            item: MyItem,
+        ):
+            pass
+
+        providers = {
+            ExpensiveProvider: 2,
+        }
+        injector = get_injector_for_testing(providers)
+        injector.registry.add_rule(ApplyRule("", use=MyPage, to_return=MyItem))
+        response = get_response_for_testing(callback)
+
+        # This would raise RuntimeError if expectations are not met.
+        kwargs = yield from injector.build_callback_dependencies(
+            response.request, response
+        )
+
+        # Make sure the test does not simply pass because some dependencies were
+        # not injected at all.
+        assert set(kwargs.keys()) == {"expensive", "item"}
+

 class Html(Injectable):
     url = "http://example.com"
@@ -539,7 +607,7 @@ class TestInjectorStats:
         ),
         (
             {"item": TestItem},
-            set(),  # there must be no stats as ItemProvider is not enabled
+            set(),  # there must be no stats as TestItem is not in the registry
         ),
     ),
 )
@@ -562,11 +630,10 @@ def callback_factory():
         assert set(poet_stats) == expected

     @inlineCallbacks
-    def test_po_provided_via_item(self, injector):
+    def test_po_provided_via_item(self):
         rules = [ApplyRule(Patterns(include=()), use=TestItemPage, to_return=TestItem)]
         registry = RulesRegistry(rules=rules)
-        providers = {"scrapy_poet.page_input_providers.ItemProvider": 10}
-        injector = get_injector_for_testing(providers, registry=registry)
+        injector = get_injector_for_testing({}, registry=registry)

         def callback(response: DummyResponse, item: TestItem):
             pass
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 0ce9acdf..2b6cd0ed 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -2,6 +2,7 @@
 from unittest import mock

 import attr
+import pytest
 import scrapy
 from pytest_twisted import ensureDeferred, inlineCallbacks
 from scrapy import Request, Spider
@@ -267,32 +268,13 @@ def test_page_params_provider(settings):
     assert results[0] == expected_data


-def test_item_provider_cache(settings):
-    """Note that the bulk of the tests for the ``ItemProvider`` alongside the
-    ``Injector`` is tested in ``tests/test_web_poet_rules.py``.
-
-    We'll only test its caching behavior here if its properly garbage collected.
-    """
-
+def test_item_provider_deprecated(settings):
     crawler = get_crawler(Spider, settings)
     injector = Injector(crawler)
-    provider = ItemProvider(injector)
-
-    assert len(provider._cached_instances) == 0
-
-    def inside():
-        request = Request("https://example.com")
-        provider.update_cache(request, {Name: Name("test")})
-        assert len(provider._cached_instances) == 1
-
-        cached_instance = provider.get_from_cache(request, Name)
-        assert isinstance(cached_instance, Name)
-
-    # The cache should be empty after the ``inside`` scope has finished which
-    # means that the corresponding ``request`` and the contents under it are
-    # garbage collected.
-    inside()
-    assert len(provider._cached_instances) == 0
+    msg = "The ItemProvider now does nothing and you should disable it."
+    with pytest.warns(DeprecationWarning, match=msg):
+        provider = ItemProvider(injector)
+    assert len(provider.provided_classes) == 0
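For reference, the provider contract these tests lean on: a provider declares
``provided_classes`` and receives only the still-missing classes in ``to_provide``
(the injector now pre-filters classes that are already built, per the injection.py
change above). A sketch with a hypothetical dependency class:

    from typing import Callable, Set

    import attrs

    from scrapy_poet.page_input_providers import PageObjectInputProvider


    @attrs.define
    class ApiToken:  # hypothetical dependency class
        value: str = "secret"


    class ApiTokenProvider(PageObjectInputProvider):
        provided_classes = {ApiToken}

        def __call__(self, to_provide: Set[Callable]):
            # Called once per request, with only the classes still needed.
            return [ApiToken() for _ in to_provide]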
diff --git a/tests/test_web_poet_rules.py b/tests/test_web_poet_rules.py
index 185c7929..cefd7441 100644
--- a/tests/test_web_poet_rules.py
+++ b/tests/test_web_poet_rules.py
@@ -73,9 +73,6 @@ class PageObjectCounterMixin:
     For example, a PO could have its ``.to_item()`` method called multiple
     times to produce the same item. This is extremely wasteful should there be
     any additional requests used to produce the item.
-
-    ``ItemProvider`` should cache up such instances and prevent them from being
-    built again.
     """

     instances: Dict[Type, Any] = defaultdict(list)
@@ -146,6 +143,19 @@ def assert_warning_tokens(caught_warnings, expected_warning_tokens):
     assert all(results)


+@inlineCallbacks
+def assert_no_item(page: type) -> None:
+    # Starting Scrapy 2.7, there's better support for async callbacks. This
+    # means that errors aren't suppressed.
+    if is_min_scrapy_version("2.7.0"):
+        expected_msg = r"parse\(\) missing 1 required keyword-only argument: 'item'"
+        with pytest.raises(TypeError, match=expected_msg):
+            yield crawl_item_and_deps(page)
+    else:
+        item = yield crawl_item_and_deps(page)
+        assert item == (None, [{}])
+
+
 @handle_urls(URL)
 class UrlMatchPage(ItemPage):
     async def to_item(self) -> dict:
@@ -397,16 +407,7 @@ def test_basic_item_but_no_page_object() -> None:
     assigned to it in any of the given ``ApplyRule``, it would result to an
     error in the spider callback since
     """
-
-    # Starting Scrapy 2.7, there's better support for async callbacks. This
-    # means that errors aren't suppressed.
-    if is_min_scrapy_version("2.7.0"):
-        expected_msg = r"parse\(\) missing 1 required keyword-only argument: 'item'"
-        with pytest.raises(TypeError, match=expected_msg):
-            yield crawl_item_and_deps(ItemButNoPageObject)
-    else:
-        item = yield crawl_item_and_deps(ItemButNoPageObject)
-        assert item == (None, [{}])
+    yield assert_no_item(ItemButNoPageObject)


 @attrs.define
@@ -428,7 +429,7 @@ async def name(self) -> str:
 )
 @inlineCallbacks
 def test_item_using_asyncio() -> None:
-    """This ensures that ``ItemProvider`` works properly for page objects using
+    """This ensures that the injector works properly for page objects using
     the ``asyncio`` functionalities.
     """
     item, deps = yield crawl_item_and_deps(DelayedProduct)
@@ -457,19 +458,8 @@ def name(self) -> str:
 def test_basic_item_with_page_object_but_different_url() -> None:
     """If an item has been requested and a page object can produce it, but
     the URL pattern is different, the item won't be produced at all.
-
-    For these cases, a warning should be issued since the user might have written
-    some incorrect URL Pattern for the ``ApplyRule``.
     """
-    msg = (
-        "Can't find appropriate page object for item for url: "
-    )
-    with warnings.catch_warnings(record=True) as caught_warnings:
-        item, deps = yield crawl_item_and_deps(ItemWithPageObjectButForDifferentUrl)
-        assert any(True for w in caught_warnings if msg in str(w.message))
-
-    assert item is None
-    assert not deps
+    yield assert_no_item(ItemWithPageObjectButForDifferentUrl)


 @attrs.define
@@ -800,10 +790,6 @@ def name(self) -> str:
         return "replaced product name"


-@pytest.mark.xfail(
-    reason="This currently causes an ``UndeclaredProvidedTypeError`` since the "
-    "ItemProvide has received a different type of item class from the page object."
-)
 @inlineCallbacks
 def test_item_to_return_in_handle_urls() -> None:
     """Even if ``@handle_urls`` could derive the value for the ``to_return``
@@ -816,14 +802,8 @@ class vs the class that is actually returned. Using the ``to_return``
     """
     item, deps = yield crawl_item_and_deps(ReplacedProduct)
     assert item == Product(name="replaced product name")
-    assert_deps(deps, {"page": ReplacedProductPage})
-
+    assert_deps(deps, {"item": Product})

-@inlineCallbacks
-def test_item_to_return_in_handle_urls_other() -> None:
-    """Remaining tests for ``test_item_to_return_in_handle_urls()`` which are
-    not expected to be xfail.
-    """
     # Requesting the underlying item class from the PO should still work.
     item, deps = yield crawl_item_and_deps(Product)
     assert item == Product(name="product name")
@@ -859,10 +839,6 @@ def name(self) -> str:
         return "subclass replaced product name"


-@pytest.mark.xfail(
-    reason="This currently causes an ``UndeclaredProvidedTypeError`` since the "
-    "ItemProvide has received a different type of item class from the page object."
-)
 @inlineCallbacks
 def test_item_to_return_in_handle_urls_subclass() -> None:
     """Same case as with the ``test_item_to_return_in_handle_urls()`` case above
@@ -870,14 +846,8 @@ def test_item_to_return_in_handle_urls_subclass() -> None:
     """
     item, deps = yield crawl_item_and_deps(SubclassReplacedProduct)
     assert item == ParentReplacedProduct(name="subclass replaced product name")
-    assert_deps(deps, {"page": SubclassReplacedProductPage})
-
+    assert_deps(deps, {"item": ParentReplacedProduct})

-@inlineCallbacks
-def test_item_to_return_in_handle_urls_subclass_others() -> None:
-    """Remaining tests for ``test_item_to_return_in_handle_urls_subclass()``
-    which are not expected to be xfail.
-    """
     # Requesting the underlying item class from the parent PO should still work.
     item, deps = yield crawl_item_and_deps(ParentReplacedProduct)
     assert item == ParentReplacedProduct(name="parent replaced product name")
@@ -906,10 +876,6 @@ def name(self) -> str:
         return "standalone product name"


-@pytest.mark.xfail(
-    reason="This currently causes an ``UndeclaredProvidedTypeError`` since the "
-    "ItemProvide has received a different type of item class from the page object."
-)
 @inlineCallbacks
 def test_item_to_return_standalone() -> None:
     """Same case as with ``test_item_to_return_in_handle_urls()`` above but the
@@ -917,14 +883,7 @@ def test_item_to_return_standalone() -> None:
     """
     item, deps = yield crawl_item_and_deps(StandaloneProduct)
     assert item == {"name": "standalone product name"}
-    assert_deps(deps, {"page": StandaloneProductPage})
-
-
-@inlineCallbacks
-def test_item_to_return_standalone_others() -> None:
-    """Remaining tests for ``test_item_to_return_standalone()``
-    which are not expected to be xfail.
-    """
+    assert_deps(deps, {"item": dict})

     # calling the actual page object should still work
     item, deps = yield crawl_item_and_deps(StandaloneProductPage)
@@ -1368,7 +1327,7 @@ class EggItem:

 @handle_urls(URL)
 @attrs.define
-class ChickenDeadlockPage(ItemPage[ChickenItem]):
+class ChickenCyclePage(ItemPage[ChickenItem]):
     other_injected_item: EggItem

     @field
@@ -1382,7 +1341,7 @@ def other(self) -> str:

 @handle_urls(URL)
 @attrs.define
-class EggDeadlockPage(ItemPage[EggItem]):
+class EggCyclePage(ItemPage[EggItem]):
     other_injected_item: ChickenItem

     @field
@@ -1395,30 +1354,30 @@ def other(self) -> str:


 @inlineCallbacks
-def test_page_object_with_item_dependency_deadlock_a(caplog) -> None:
-    """Items with page objects which depend on each other resulting in a deadlock
+def test_page_object_with_item_dependency_cycle_a(caplog) -> None:
+    """Items with page objects which depend on each other resulting in a plan cycle
     should have a corresponding error raised.
""" yield crawl_item_and_deps(ChickenItem) - assert "ProviderDependencyDeadlockError" in caplog.text + assert "Cyclic dependency found" in caplog.text @inlineCallbacks -def test_page_object_with_item_dependency_deadlock_b(caplog) -> None: +def test_page_object_with_item_dependency_cycle_b(caplog) -> None: yield crawl_item_and_deps(EggItem) - assert "ProviderDependencyDeadlockError" in caplog.text + assert "Cyclic dependency found" in caplog.text @inlineCallbacks -def test_page_object_with_item_dependency_deadlock_c(caplog) -> None: - yield crawl_item_and_deps(ChickenDeadlockPage) - assert "ProviderDependencyDeadlockError" in caplog.text +def test_page_object_with_item_dependency_cycle_c(caplog) -> None: + yield crawl_item_and_deps(ChickenCyclePage) + assert "Cyclic dependency found" in caplog.text @inlineCallbacks -def test_page_object_with_item_dependency_deadlock_d(caplog) -> None: - yield crawl_item_and_deps(EggDeadlockPage) - assert "ProviderDependencyDeadlockError" in caplog.text +def test_page_object_with_item_dependency_cycle_d(caplog) -> None: + yield crawl_item_and_deps(EggCyclePage) + assert "Cyclic dependency found" in caplog.text @attrs.define @@ -1435,7 +1394,7 @@ class Egg2Item: @handle_urls(URL) @attrs.define -class Chicken2DeadlockPage(ItemPage[Chicken2Item]): +class Chicken2CyclePage(ItemPage[Chicken2Item]): other_injected_item: Egg2Item @field @@ -1449,8 +1408,8 @@ def other(self) -> str: @handle_urls(URL) @attrs.define -class Egg2DeadlockPage(ItemPage[Egg2Item]): - other_injected_page: Chicken2DeadlockPage +class Egg2CyclePage(ItemPage[Egg2Item]): + other_injected_page: Chicken2CyclePage @field def name(self) -> str: @@ -1463,30 +1422,30 @@ async def other(self) -> str: @inlineCallbacks -def test_page_object_with_item_dependency_deadlock_2_a(caplog) -> None: - """Same with ``test_page_object_with_item_dependency_deadlock()`` but one +def test_page_object_with_item_dependency_cycle_2_a(caplog) -> None: + """Same with ``test_page_object_with_item_dependency_cycle()`` but one of the page objects requires a page object instead of an item. 
""" yield crawl_item_and_deps(Chicken2Item) - assert "ProviderDependencyDeadlockError" in caplog.text + assert "Cyclic dependency found" in caplog.text @inlineCallbacks -def test_page_object_with_item_dependency_deadlock_2_b(caplog) -> None: +def test_page_object_with_item_dependency_cycle_2_b(caplog) -> None: yield crawl_item_and_deps(Egg2Item) - assert "ProviderDependencyDeadlockError" in caplog.text + assert "Cyclic dependency found" in caplog.text @inlineCallbacks -def test_page_object_with_item_dependency_deadlock_2_c(caplog) -> None: - yield crawl_item_and_deps(Chicken2DeadlockPage) - assert "ProviderDependencyDeadlockError" in caplog.text +def test_page_object_with_item_dependency_cycle_2_c(caplog) -> None: + yield crawl_item_and_deps(Chicken2CyclePage) + assert "Cyclic dependency found" in caplog.text @inlineCallbacks -def test_page_object_with_item_dependency_deadlock_2_d(caplog) -> None: - yield crawl_item_and_deps(Egg2DeadlockPage) - assert "ProviderDependencyDeadlockError" in caplog.text +def test_page_object_with_item_dependency_cycle_2_d(caplog) -> None: + yield crawl_item_and_deps(Egg2CyclePage) + assert "Cyclic dependency found" in caplog.text @attrs.define @@ -1539,7 +1498,7 @@ def test_page_object_returning_item_which_is_also_a_dep_but_no_provider_item( but there's no provider for the original item """ yield crawl_item_and_deps(Mobius) - assert "ProviderDependencyDeadlockError" in caplog.text + assert "NonProvidableError" in caplog.text @inlineCallbacks @@ -1550,7 +1509,7 @@ def test_page_object_returning_item_which_is_also_a_dep_but_no_provider_po( but tests the PO instead of the item. """ yield crawl_item_and_deps(MobiusPage) - assert "ProviderDependencyDeadlockError" in caplog.text + assert "NonProvidableError" in caplog.text @attrs.define @@ -1599,11 +1558,14 @@ def test_page_object_returning_item_which_is_also_a_dep_2() -> None: assert item == Kangaroo(name="(modified by Joey) data from KangarooProvider") assert_deps(deps, {"item": Kangaroo}) - # calling the actual page objects should still work + # both page objects are called item, deps = yield crawl_item_and_deps(KangarooPage, override_settings=settings) - assert item == Kangaroo(name="(modified) data from KangarooProvider") + assert item == Kangaroo( + name="(modified) (modified by Joey) data from KangarooProvider" + ) assert_deps(deps, {"page": KangarooPage}) + # calling the actual page object should still work item, deps = yield crawl_item_and_deps(JoeyPage, override_settings=settings) assert item == Kangaroo(name="(modified by Joey) data from KangarooProvider") assert_deps(deps, {"page": JoeyPage}) @@ -1700,10 +1662,10 @@ def test_created_apply_rules() -> None: ), ApplyRule(URL, use=ProductDeepDependencyPage, to_return=MainProductC), ApplyRule(URL, use=ProductDuplicateDeepDependencyPage, to_return=MainProductD), - ApplyRule(URL, use=ChickenDeadlockPage, to_return=ChickenItem), - ApplyRule(URL, use=EggDeadlockPage, to_return=EggItem), - ApplyRule(URL, use=Chicken2DeadlockPage, to_return=Chicken2Item), - ApplyRule(URL, use=Egg2DeadlockPage, to_return=Egg2Item), + ApplyRule(URL, use=ChickenCyclePage, to_return=ChickenItem), + ApplyRule(URL, use=EggCyclePage, to_return=EggItem), + ApplyRule(URL, use=Chicken2CyclePage, to_return=Chicken2Item), + ApplyRule(URL, use=Egg2CyclePage, to_return=Egg2Item), ApplyRule(URL, use=MobiusPage, to_return=Mobius), ApplyRule(URL, use=KangarooPage, to_return=Kangaroo), ApplyRule(Patterns([URL], priority=600), use=JoeyPage, to_return=Kangaroo), diff --git a/tox.ini b/tox.ini index 
diff --git a/tox.ini b/tox.ini
index b3830667..9b499072 100644
--- a/tox.ini
+++ b/tox.ini
@@ -17,7 +17,7 @@ commands =
 [pinned]
 deps =
     {[testenv]deps}
-    andi==0.5.0
+    andi==0.6.0
     attrs==21.3.0
     parsel==1.5.0
     sqlitedict==1.5.0
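End to end, the patch means a callback can keep depending on items directly while
the injector plans everything in one pass. An illustrative, hypothetical project
snippet (assuming scrapy-poet is enabled in the project settings):

    import attrs
    import scrapy
    from web_poet import WebPage, field, handle_urls

    from scrapy_poet import DummyResponse


    @attrs.define
    class Product:
        name: str


    @handle_urls("example.com")
    class ProductPage(WebPage[Product]):
        @field
        def name(self) -> str:
            return self.css("h1::text").get("")


    class ProductSpider(scrapy.Spider):
        name = "products"
        start_urls = ["https://example.com/product/1"]

        def parse(self, response: DummyResponse, item: Product):
            # The injector builds ProductPage and awaits to_item() through the
            # andi plan; no ItemProvider is involved anymore.
            yield item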