Skip to content

Commit b69580e

Browse files
committed
use new weak_cache in scrapy_poet's Injector to handle more cases
1 parent 5f1d104 commit b69580e

File tree

4 files changed

+37
-17
lines changed

4 files changed

+37
-17
lines changed

scrapy_zyte_api/providers.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
from typing import Any, Callable, Dict, List, Sequence, Set
2-
from weakref import WeakKeyDictionary
32

43
from andi.typeutils import is_typing_annotated, strip_annotated
54
from scrapy import Request
@@ -51,30 +50,35 @@ class ZyteApiProvider(PageObjectInputProvider):
5150
JobPosting,
5251
}
5352

54-
def __init__(self, injector):
55-
super().__init__(injector)
56-
self._cached_instances: WeakKeyDictionary[Request, Dict] = WeakKeyDictionary()
57-
5853
def is_provided(self, type_: Callable) -> bool:
5954
return super().is_provided(strip_annotated(type_))
6055

6156
def update_cache(self, request: Request, mapping: Dict[Any, Any]) -> None:
62-
if request not in self._cached_instances:
63-
self._cached_instances[request] = {}
64-
self._cached_instances[request].update(mapping)
57+
if request not in self.injector.weak_cache:
58+
self.injector.weak_cache[request] = {}
59+
self.injector.weak_cache[request].update(mapping)
6560

6661
async def __call__( # noqa: C901
6762
self, to_provide: Set[Callable], request: Request, crawler: Crawler
6863
) -> Sequence[Any]:
6964
"""Makes a Zyte API request to provide BrowserResponse and/or item dependencies."""
70-
# TODO what if ``response`` is already from Zyte API and contains something we need
7165
results: List[Any] = []
7266

7367
for cls in list(to_provide):
74-
item = self._cached_instances.get(request, {}).get(cls)
68+
item = self.injector.weak_cache.get(request, {}).get(cls)
7569
if item:
7670
results.append(item)
7771
to_provide.remove(cls)
72+
elif cls == AnyResponse:
73+
http_response = self.injector.weak_cache.get(request, {}).get(
74+
HttpResponse
75+
)
76+
if http_response:
77+
any_response = AnyResponse(response=http_response)
78+
results.append(any_response)
79+
self.update_cache(request, {AnyResponse: any_response})
80+
to_provide.remove(cls)
81+
7882
if not to_provide:
7983
return results
8084

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ def get_version():
3131
# Sync with [testenv:pinned-provider] @ tox.ini
3232
"provider": [
3333
"andi>=0.6.0",
34-
"scrapy-poet>=0.19.0",
34+
# "scrapy-poet>=0.19.0",
35+
"scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@weak-cache#egg=scrapy-poet",
3536
# "web-poet>=0.15.1",
3637
"web-poet @ git+https://git@github.com/scrapinghub/web-poet@response#egg=web-poet",
3738
"zyte-common-items>=0.8.0",

tests/test_providers.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -551,6 +551,10 @@ def parse_(self, response: DummyResponse, page: SomePage):
551551
assert type(item["page"].product) == Product
552552

553553

554+
# The issue here is that HttpResponseProvider runs earlier than ScrapyZyteAPI.
555+
# HttpResponseProvider doesn't know that it should not run since ScrapyZyteAPI
556+
# could provide HttpResponse in anycase.
557+
@pytest.mark.xfail(reason="Not supported yet", raises=AssertionError, strict=True)
554558
@ensureDeferred
555559
async def test_provider_any_response_product_extract_from_http_response_2(mockserver):
556560
@attrs.define
@@ -688,10 +692,15 @@ def parse_(self, response: DummyResponse, page: SomePage):
688692
params = crawler.engine.downloader.handlers._handlers["http"].params
689693

690694
assert len(params) == 1
691-
assert params[0].keys() == {"url", "HttpResponseBody"}
695+
assert params[0].keys() == {
696+
"url",
697+
"httpResponseBody",
698+
"customHttpRequestHeaders",
699+
"httpResponseHeaders",
700+
}
692701

693702
assert type(item["page"].response) == AnyResponse
694-
assert type(item["page"].response.response) == BrowserResponse
703+
assert type(item["page"].response.response) == HttpResponse
695704
assert type(item["page"].http_response) == HttpResponse
696705

697706

@@ -715,10 +724,15 @@ def parse_(self, response: DummyResponse, page: SomePage):
715724
params = crawler.engine.downloader.handlers._handlers["http"].params
716725

717726
assert len(params) == 2
718-
assert params[0].keys() == {"url", "HttpResponseBody"}
719-
assert params[1].keys() == {"url", "BrowserHtml"}
727+
assert params[0].keys() == {
728+
"url",
729+
"httpResponseBody",
730+
"customHttpRequestHeaders",
731+
"httpResponseHeaders",
732+
}
733+
assert params[1].keys() == {"url", "browserHtml"}
720734

721735
assert type(item["page"].response) == AnyResponse
722-
assert type(item["page"].response.response) == BrowserResponse
736+
assert type(item["page"].response.response) == HttpResponse
723737
assert type(item["page"].browser_response) == BrowserResponse
724738
assert type(item["page"].http_response) == HttpResponse

tox.ini

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,8 @@ deps =
8989
# scrapy-poet >= 0.4.0 depends on scrapy >= 2.6.0
9090
{[testenv:pinned-scrapy-2x6]deps}
9191
andi==0.6.0
92-
scrapy-poet==0.19.0
92+
#scrapy-poet==0.19.0
93+
scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@weak-cache#egg=scrapy-poet
9394
#web-poet==0.15.1
9495
web-poet @ git+https://git@github.com/scrapinghub/web-poet@response#egg=web-poet
9596
zyte-common-items==0.8.0

0 commit comments

Comments
 (0)