Remove ItemProvider’s Response dependency #151

Closed · wants to merge 5 commits
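In short, this draft splits ItemProvider in two: a RequestItemProvider for items whose page objects never touch a downloaded response (so the download can be skipped via DummyResponse), and a ResponseItemProvider for everything else. A sketch of the spider-side behavior this would enable, assuming scrapy-poet is enabled and the rule is registered; the item, page object, and spider names are illustrative, not part of the PR:

import attrs
import scrapy
from web_poet import ItemPage, RequestUrl, field, handle_urls

from scrapy_poet import DummyResponse


@attrs.define
class UrlItem:
    url: str


@handle_urls("example.com")
@attrs.define
class UrlPage(ItemPage[UrlItem]):
    # The page object depends only on the request URL, so its dependency
    # tree never needs a downloaded response.
    request_url: RequestUrl

    @field
    def url(self):
        return str(self.request_url)


class ExampleSpider(scrapy.Spider):
    name = "example"
    start_urls = ["https://example.com"]

    def parse(self, response: DummyResponse, item: UrlItem):
        # With the split providers, RequestItemProvider could build UrlItem
        # without downloading the page at all.
        yield item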
9 changes: 8 additions & 1 deletion scrapy_poet/api.py
@@ -27,7 +27,14 @@ def parse(self, response: DummyResponse):
     :class:`~.DummyResponse` to your parser instead.
     """
 
-    def __init__(self, url: str, request: Optional[Request] = None):
+    def __init__(self, url: Optional[str] = None, request: Optional[Request] = None):
+        if url is None:
+            if request is None:
+                raise ValueError(
+                    "One of the parameters, url or request, must have a "
+                    "non-default value."
+                )
+            url = request.url
         super().__init__(url=url, request=request)


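With this change a DummyResponse can be built from a request alone, which the new RequestItemProvider below relies on. A quick sketch of the new contract:

from scrapy import Request

from scrapy_poet.api import DummyResponse

request = Request("https://example.com")
response = DummyResponse(request=request)  # url falls back to request.url
assert response.url == "https://example.com"

try:
    DummyResponse()  # neither url nor request given
except ValueError:
    pass  # "One of the parameters, url or request, must have a non-default value."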
6 changes: 4 additions & 2 deletions scrapy_poet/downloadermiddlewares.py
@@ -18,9 +18,10 @@
 from .page_input_providers import (
     HttpClientProvider,
     HttpResponseProvider,
-    ItemProvider,
     PageParamsProvider,
+    RequestItemProvider,
     RequestUrlProvider,
+    ResponseItemProvider,
     ResponseUrlProvider,
 )
 from .utils import create_registry_instance, is_min_scrapy_version
@@ -34,7 +35,8 @@
     PageParamsProvider: 700,
     RequestUrlProvider: 800,
     ResponseUrlProvider: 900,
-    ItemProvider: 2000,
+    RequestItemProvider: 2000,
+    ResponseItemProvider: 2100,
 }
 
 InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware")
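The numbers are ordering priorities, as elsewhere in Scrapy component settings; RequestItemProvider gets the lower value, so the downloadless path is consulted first. Assuming the SCRAPY_POET_PROVIDERS setting keeps following Scrapy's component-list conventions (where None disables an entry), a project could then tune the split, e.g.:

# settings.py -- a sketch; MyItemProvider is hypothetical.
SCRAPY_POET_PROVIDERS = {
    # Run a custom item provider after the two built-in ones:
    "myproject.providers.MyItemProvider": 2200,
    # Disable response-based item building entirely:
    "scrapy_poet.page_input_providers.ResponseItemProvider": None,
}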
38 changes: 18 additions & 20 deletions scrapy_poet/injection.py
@@ -3,6 +3,7 @@
 import os
 import pprint
 import warnings
+from functools import partial
 from typing import Any, Callable, Dict, List, Mapping, Optional, Set, cast
 
 import andi
@@ -29,7 +30,10 @@
     NonCallableProviderError,
     UndeclaredProvidedTypeError,
 )
-from scrapy_poet.page_input_providers import PageObjectInputProvider
+from scrapy_poet.page_input_providers import (
+    SCRAPY_PROVIDED_CLASSES,
+    PageObjectInputProvider,
+)
 from scrapy_poet.utils import is_min_scrapy_version
 
 from .utils import create_registry_instance, get_scrapy_data_path
@@ -117,7 +121,7 @@ def discover_callback_providers(
         result = set()
         for cls, _ in plan:
             for provider in self.providers:
-                if provider.is_provided(cls):
+                if provider.is_provided(cls, request):
                     result.add(provider)
 
         return result
@@ -143,7 +147,9 @@ def build_plan(self, request: Request) -> andi.Plan:
         return andi.plan(
             callback,
             is_injectable=is_injectable,
-            externally_provided=self.is_class_provided_by_any_provider,
+            externally_provided=partial(
+                self.is_class_provided_by_any_provider, request=request
+            ),
             # Ignore the type since andi.plan expects overrides to be
             # Callable[[Callable], Optional[Callable]] but the registry
             # returns the typing for ``dict.get()`` method.
@@ -193,7 +199,7 @@ def build_instances_from_providers(
         objs: List[Any]
         for provider in self.providers:
             provided_classes = {
-                cls for cls in dependencies_set if provider.is_provided(cls)
+                cls for cls in dependencies_set if provider.is_provided(cls, request)
             }
 
             # ignore already provided types if provider doesn't need to use them
@@ -301,7 +307,7 @@ def check_all_providers_are_callable(providers):
 
 def is_class_provided_by_any_provider_fn(
     providers: List[PageObjectInputProvider],
-) -> Callable[[Callable], bool]:
+) -> Callable[[Callable, Request], bool]:
     """
     Return a function of type ``Callable[[Type], bool]`` that return
     True if the given type is provided by any of the registered providers.
@@ -311,9 +317,11 @@ def is_class_provided_by_any_provider_fn(
     joined together for efficiency.
     """
     sets_of_types: Set[Callable] = set()  # caching all sets found
-    individual_is_callable: List[Callable[[Callable], bool]] = [
-        sets_of_types.__contains__
-    ]
+
+    def in_set(type, request):
+        return type in sets_of_types
+
+    individual_is_callable: List[Callable[[Callable], bool]] = [in_set]
     for provider in providers:
         provided_classes = provider.provided_classes
@@ -328,9 +336,9 @@
                 f"or 'callable'"
             )
 
-    def is_provided_fn(type: Callable) -> bool:
+    def is_provided_fn(type: Callable, request: Request) -> bool:
         for is_provided in individual_is_callable:
-            if is_provided(type):
+            if is_provided(type, request):
                 return True
         return False
@@ -394,16 +402,6 @@ def is_callback_requiring_scrapy_response(
     return True
 
 
-SCRAPY_PROVIDED_CLASSES = {
-    Spider,
-    Request,
-    Response,
-    Crawler,
-    Settings,
-    StatsCollector,
-}
-
-
 def is_provider_requiring_scrapy_response(provider):
     """Check whether injectable provider makes use of a valid
     :class:`scrapy.http.Response`.
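Since andi.plan invokes externally_provided with a single argument, functools.partial is used above to bind the request ahead of time. A standalone illustration; the predicate below is a stand-in, not the Injector method:

from functools import partial

from scrapy import Request


def is_class_provided_by_any_provider(cls, request):
    # Stand-in: the real method asks every registered provider whether it
    # can build ``cls`` for this specific request.
    return cls is str


request = Request("https://example.com")
externally_provided = partial(is_class_provided_by_any_provider, request=request)

assert externally_provided(str) is True   # same as calling with (str, request)
assert externally_provided(int) is False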
73 changes: 67 additions & 6 deletions scrapy_poet/page_input_providers.py
@@ -10,15 +10,18 @@
 """
 import asyncio
 from dataclasses import make_dataclass
+from functools import partial
 from inspect import isclass
 from typing import Any, Callable, ClassVar, Dict, List, Optional, Set, Type, Union
 from warnings import warn
 from weakref import WeakKeyDictionary
 
 import andi
-from scrapy import Request
+from scrapy import Request, Spider
 from scrapy.crawler import Crawler
 from scrapy.http import Response
+from scrapy.settings import Settings
+from scrapy.statscollectors import StatsCollector
 from scrapy.utils.defer import maybe_deferred_to_future
 from web_poet import (
     HttpClient,
@@ -32,12 +35,22 @@
 from web_poet.page_inputs.stats import StatCollector, StatNum
 from web_poet.pages import is_injectable
 
+from scrapy_poet.api import DummyResponse
 from scrapy_poet.downloader import create_scrapy_downloader
 from scrapy_poet.injection_errors import (
     MalformedProvidedClassesError,
     ProviderDependencyDeadlockError,
 )
 
+SCRAPY_PROVIDED_CLASSES = {
+    Spider,
+    Request,
+    Response,
+    Crawler,
+    Settings,
+    StatsCollector,
+}
+
 
 class PageObjectInputProvider:
     """
@@ -112,15 +125,15 @@ def __call__(self, to_provide, response: Response):
     # these previous instances before returning them to the Injector.
     allow_prev_instances: bool = False
 
-    def is_provided(self, type_: Callable) -> bool:
+    def is_provided(self, type_: Callable, request: Request) -> bool:
         """
         Return ``True`` if the given type is provided by this provider based
         on the value of the attribute ``provided_classes``
         """
         if isinstance(self.provided_classes, Set):
             return type_ in self.provided_classes
         elif callable(self.provided_classes):
-            return self.provided_classes(type_)
+            return self.provided_classes(type_, request)
         else:
             raise MalformedProvidedClassesError(
                 f"Unexpected type {type_!r} for 'provided_classes' attribute of"
@@ -228,7 +241,7 @@ def __call__(self, to_provide: Set[Callable], response: Response):
         return [ResponseUrl(url=response.url)]
 
 
-class ItemProvider(PageObjectInputProvider):
+class ResponseItemProvider(PageObjectInputProvider):
     name = "item"
 
     template_deadlock_msg = (
@@ -240,6 +253,7 @@ class ItemProvider(PageObjectInputProvider):
 
     def __init__(self, injector):
         super().__init__(injector)
+        self._injector = injector
         self.registry = self.injector.registry
 
         # The key that's used here is the ``scrapy.Request`` instance to ensure
@@ -255,11 +269,34 @@ def __init__(self, injector):
         # Similar to ``_cached_instances`` above, the key is ``scrapy.Request``.
         self._build_instances_call_counter = WeakKeyDictionary()
 
-    def provided_classes(self, cls):
+    def _requires_scrapy_response(self, injectable, request):
+        plan = andi.plan(
+            injectable,
+            is_injectable=is_injectable,
+            externally_provided=partial(
+                self._injector.is_class_provided_by_any_provider, request=request
+            ),
+            overrides=self._injector.registry.overrides_for(request.url).get,  # type: ignore[arg-type]
+        )
+        for dependency, _ in plan.dependencies:
+            for provider in self._injector.providers:
+                if provider.is_provided(dependency, request):
+                    if self._injector.is_provider_requiring_scrapy_response[provider]:
+                        return True
+                    continue
+
+        return False
+
+    def provided_classes(self, cls, request):
         """If the item is in any of the ``to_return`` in the rules, then it can
         be provided by using the corresponding page object in ``use``.
         """
-        return isclass(cls) and self.registry.search(to_return=cls)
+        if not isclass(cls):
+            return False
+        rules = self.registry.search(to_return=cls)
+        if not rules:
+            return False
+        return self._requires_scrapy_response(rules[0].use, request)
 
     def update_cache(self, request: Request, mapping: Dict[Type, Any]) -> None:
         if request not in self._cached_instances:
@@ -347,6 +384,30 @@ async def __call__(
         return results
 
 
+class RequestItemProvider(ResponseItemProvider):
+    async def __call__(
+        self,
+        to_provide: Set[Callable],
+        request: Request,
+        prev_instances: Dict,
+    ) -> List[Any]:
+        response = DummyResponse(request=request)
+        return await super().__call__(
+            to_provide=to_provide,
+            request=request,
+            response=response,
+            prev_instances=prev_instances,
+        )
+
+    def provided_classes(self, cls, request):
+        if not isclass(cls):
+            return False
+        rules = self.registry.search(to_return=cls)
+        if not rules:
+            return False
+        return not self._requires_scrapy_response(rules[0].use, request)
+
+
 class ScrapyPoetStatCollector(StatCollector):
     def __init__(self, stats):
         self._stats = stats
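The net effect: for a given item class, the matching rule's page object decides which provider owns it. A sketch mirroring the new test in tests/test_providers.py below, assuming ItemPage has no response dependency while WebPage depends on HttpResponse:

from web_poet import ItemPage, WebPage
from web_poet.rules import ApplyRule, RulesRegistry


class RequestItem:
    pass


class ResponseItem:
    pass


registry = RulesRegistry(
    rules=[
        # ItemPage needs no response, so RequestItem can skip the download:
        ApplyRule(for_patterns="*", use=ItemPage, to_return=RequestItem),
        # WebPage depends on HttpResponse, so ResponseItem needs one:
        ApplyRule(for_patterns="*", use=WebPage, to_return=ResponseItem),
    ]
)
# Given an Injector built with this registry:
#   RequestItemProvider.provided_classes(RequestItem, request)   -> True
#   ResponseItemProvider.provided_classes(ResponseItem, request) -> True
# and each provider returns False for the other item class.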
11 changes: 6 additions & 5 deletions tests/test_injection.py
@@ -101,9 +101,10 @@ class WrapCls(Injectable):
 class TestInjector:
     def test_constructor(self):
         injector = get_injector_for_testing(get_providers_for_testing())
-        assert injector.is_class_provided_by_any_provider(ClsReqResponse)
-        assert injector.is_class_provided_by_any_provider(Cls1)
-        assert not injector.is_class_provided_by_any_provider(ClsNoProvided)
+        request = Request("https://example.com")
+        assert injector.is_class_provided_by_any_provider(ClsReqResponse, request)
+        assert injector.is_class_provided_by_any_provider(Cls1, request)
+        assert not injector.is_class_provided_by_any_provider(ClsNoProvided, request)
 
         for provider in injector.providers:
             assert (
@@ -343,7 +344,7 @@ class TestInjectorStats:
             ),
             (
                 {"item": TestItem},
-                set(),  # there must be no stats as ItemProvider is not enabled
+                set(),  # there must be no stats as item providers are not enabled
             ),
         ),
     )
@@ -369,7 +370,7 @@ def callback_factory():
     def test_po_provided_via_item(self, injector):
         rules = [ApplyRule(Patterns(include=()), use=TestItemPage, to_return=TestItem)]
         registry = RulesRegistry(rules=rules)
-        providers = {"scrapy_poet.page_input_providers.ItemProvider": 10}
+        providers = {"scrapy_poet.page_input_providers.ResponseItemProvider": 10}
         injector = get_injector_for_testing(providers, registry=registry)
 
         def callback(response: DummyResponse, item: TestItem):
44 changes: 36 additions & 8 deletions tests/test_providers.py
@@ -8,16 +8,19 @@
 from scrapy.settings import Settings
 from scrapy.utils.test import get_crawler
 from twisted.python.failure import Failure
-from web_poet import HttpClient, HttpResponse
+from web_poet import HttpClient, HttpResponse, ItemPage, WebPage
+from web_poet.rules import ApplyRule, RulesRegistry
 from web_poet.serialization import SerializedLeafData, register_serialization
 
 from scrapy_poet import HttpResponseProvider
+from scrapy_poet.downloadermiddlewares import DEFAULT_PROVIDERS
+from scrapy_poet.injection import Injector
 from scrapy_poet.page_input_providers import (
     HttpClientProvider,
-    ItemProvider,
     PageObjectInputProvider,
     PageParamsProvider,
+    RequestItemProvider,
+    ResponseItemProvider,
     StatsProvider,
 )
 from scrapy_poet.utils.mockserver import get_ephemeral_port
@@ -228,16 +231,41 @@ def test_page_params_provider(settings):
     assert results[0] == expected_data
 
 
-def test_item_provider_cache(settings):
-    """Note that the bulk of the tests for the ``ItemProvider`` alongside the
-    ``Injector`` is tested in ``tests/test_web_poet_rules.py``.
+def test_item_providers(settings):
+    """We have 2 item providers, one that does not require a Scrapy response
+    and one that does, and only one of them should be used for any given
+    injectable based on whether or not its dependency tree requires a Scrapy
+    response."""
 
-    We'll only test its caching behavior here if its properly garbage collected.
-    """
+    class RequestItem:
+        pass
+
+    class ResponseItem:
+        pass
+
+    crawler = get_crawler(Spider, settings)
+    request_rule = ApplyRule(for_patterns="*", use=ItemPage, to_return=RequestItem)
+    response_rule = ApplyRule(for_patterns="*", use=WebPage, to_return=ResponseItem)
+    registry = RulesRegistry(rules=[request_rule, response_rule])
+    injector = Injector(crawler, registry=registry, default_providers=DEFAULT_PROVIDERS)
+    request = Request("https://example.com")
+
+    request_provider = RequestItemProvider(injector)
+    assert request_provider.provided_classes(RequestItem, request) is True
+    assert request_provider.provided_classes(ResponseItem, request) is False
+
+    response_provider = ResponseItemProvider(injector)
+    assert response_provider.provided_classes(RequestItem, request) is False
+    assert response_provider.provided_classes(ResponseItem, request) is True
+
+    # TODO: Test chain, i.e. request item | response item. → page → request item | response item.
+    # TODO: Test overrides that should change things for one URL but not for another.
+
+
+def test_item_provider_cache(settings):
     crawler = get_crawler(Spider, settings)
     injector = Injector(crawler)
-    provider = ItemProvider(injector)
+    provider = ResponseItemProvider(injector)
 
     assert len(provider._cached_instances) == 0
2 changes: 1 addition & 1 deletion tests/test_scrapy_dependencies.py
@@ -5,8 +5,8 @@
 from scrapy.http import Request
 from web_poet.pages import WebPage
 
-from scrapy_poet.injection import SCRAPY_PROVIDED_CLASSES
 from scrapy_poet.page_input_providers import (
+    SCRAPY_PROVIDED_CLASSES,
     HttpResponseProvider,
     PageObjectInputProvider,
 )