diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py
index 19371666..fb04554f 100644
--- a/scrapy_zyte_api/providers.py
+++ b/scrapy_zyte_api/providers.py
@@ -1,5 +1,4 @@
 from typing import Any, Callable, Dict, List, Sequence, Set
-from weakref import WeakKeyDictionary
 
 from andi.typeutils import is_typing_annotated, strip_annotated
 from scrapy import Request
@@ -51,30 +50,35 @@ class ZyteApiProvider(PageObjectInputProvider):
         JobPosting,
     }
 
-    def __init__(self, injector):
-        super().__init__(injector)
-        self._cached_instances: WeakKeyDictionary[Request, Dict] = WeakKeyDictionary()
-
     def is_provided(self, type_: Callable) -> bool:
         return super().is_provided(strip_annotated(type_))
 
     def update_cache(self, request: Request, mapping: Dict[Any, Any]) -> None:
-        if request not in self._cached_instances:
-            self._cached_instances[request] = {}
-        self._cached_instances[request].update(mapping)
+        if request not in self.injector.weak_cache:
+            self.injector.weak_cache[request] = {}
+        self.injector.weak_cache[request].update(mapping)
 
     async def __call__(  # noqa: C901
         self, to_provide: Set[Callable], request: Request, crawler: Crawler
     ) -> Sequence[Any]:
         """Makes a Zyte API request to provide BrowserResponse and/or item dependencies."""
-        # TODO what if ``response`` is already from Zyte API and contains something we need
         results: List[Any] = []
 
         for cls in list(to_provide):
-            item = self._cached_instances.get(request, {}).get(cls)
+            item = self.injector.weak_cache.get(request, {}).get(cls)
             if item:
                 results.append(item)
                 to_provide.remove(cls)
+            elif cls == AnyResponse:
+                http_response = self.injector.weak_cache.get(request, {}).get(
+                    HttpResponse
+                )
+                if http_response:
+                    any_response = AnyResponse(response=http_response)
+                    results.append(any_response)
+                    self.update_cache(request, {AnyResponse: any_response})
+                    to_provide.remove(cls)
+
         if not to_provide:
             return results
@@ -170,7 +174,7 @@ async def __call__(  # noqa: C901
             self.update_cache(request, {BrowserResponse: browser_response})
 
         if AnyResponse in to_provide:
-            any_response = None
+            any_response = None  # type: ignore[assignment]
 
             if "browserHtml" in api_response.raw_api_response:
                 any_response = AnyResponse(
diff --git a/setup.py b/setup.py
index a5c15648..f79c380a 100644
--- a/setup.py
+++ b/setup.py
@@ -31,7 +31,8 @@ def get_version():
         # Sync with [testenv:pinned-provider] @ tox.ini
         "provider": [
             "andi>=0.6.0",
-            "scrapy-poet>=0.19.0",
+            # "scrapy-poet>=0.19.0",
+            "scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@weak-cache#egg=scrapy-poet",
             # "web-poet>=0.15.1",
             "web-poet @ git+https://git@github.com/scrapinghub/web-poet@response#egg=web-poet",
             "zyte-common-items>=0.8.0",
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 9eb4e1bc..b0fd0d6a 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -551,6 +551,10 @@ def parse_(self, response: DummyResponse, page: SomePage):
     assert type(item["page"].product) == Product
 
 
+# The issue here is that HttpResponseProvider runs earlier than ScrapyZyteAPI.
+# HttpResponseProvider doesn't know that it should not run, since ScrapyZyteAPI
+# could provide HttpResponse in any case.
+@pytest.mark.xfail(reason="Not supported yet", raises=AssertionError, strict=True)
 @ensureDeferred
 async def test_provider_any_response_product_extract_from_http_response_2(mockserver):
     @attrs.define
@@ -688,10 +692,15 @@ def parse_(self, response: DummyResponse, page: SomePage):
     params = crawler.engine.downloader.handlers._handlers["http"].params
     assert len(params) == 1
-    assert params[0].keys() == {"url", "HttpResponseBody"}
+    assert params[0].keys() == {
+        "url",
+        "httpResponseBody",
+        "customHttpRequestHeaders",
+        "httpResponseHeaders",
+    }
 
     assert type(item["page"].response) == AnyResponse
-    assert type(item["page"].response.response) == BrowserResponse
+    assert type(item["page"].response.response) == HttpResponse
     assert type(item["page"].http_response) == HttpResponse
@@ -715,10 +724,15 @@ def parse_(self, response: DummyResponse, page: SomePage):
     params = crawler.engine.downloader.handlers._handlers["http"].params
     assert len(params) == 2
-    assert params[0].keys() == {"url", "HttpResponseBody"}
-    assert params[1].keys() == {"url", "BrowserHtml"}
+    assert params[0].keys() == {
+        "url",
+        "httpResponseBody",
+        "customHttpRequestHeaders",
+        "httpResponseHeaders",
+    }
+    assert params[1].keys() == {"url", "browserHtml"}
 
     assert type(item["page"].response) == AnyResponse
-    assert type(item["page"].response.response) == BrowserResponse
+    assert type(item["page"].response.response) == HttpResponse
     assert type(item["page"].browser_response) == BrowserResponse
     assert type(item["page"].http_response) == HttpResponse
diff --git a/tox.ini b/tox.ini
index 3a4d38fe..d0da523d 100644
--- a/tox.ini
+++ b/tox.ini
@@ -89,7 +89,8 @@ deps =
     # scrapy-poet >= 0.4.0 depends on scrapy >= 2.6.0
     {[testenv:pinned-scrapy-2x6]deps}
     andi==0.6.0
-    scrapy-poet==0.19.0
+    #scrapy-poet==0.19.0
+    scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@weak-cache#egg=scrapy-poet
     #web-poet==0.15.1
     web-poet @ git+https://git@github.com/scrapinghub/web-poet@response#egg=web-poet
     zyte-common-items==0.8.0