Skip to content

Commit

Permalink
use new weak_ref in scrapy_poet's Injector to handle more cases
Browse files Browse the repository at this point in the history
  • Loading branch information
BurnzZ committed Jan 18, 2024
1 parent 5f1d104 commit 3cb2290
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 18 deletions.
26 changes: 15 additions & 11 deletions scrapy_zyte_api/providers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from typing import Any, Callable, Dict, List, Sequence, Set
from weakref import WeakKeyDictionary

from andi.typeutils import is_typing_annotated, strip_annotated
from scrapy import Request
Expand Down Expand Up @@ -51,30 +50,35 @@ class ZyteApiProvider(PageObjectInputProvider):
JobPosting,
}

def __init__(self, injector):
super().__init__(injector)
self._cached_instances: WeakKeyDictionary[Request, Dict] = WeakKeyDictionary()

def is_provided(self, type_: Callable) -> bool:
return super().is_provided(strip_annotated(type_))

def update_cache(self, request: Request, mapping: Dict[Any, Any]) -> None:
if request not in self._cached_instances:
self._cached_instances[request] = {}
self._cached_instances[request].update(mapping)
if request not in self.injector.weak_cache:
self.injector.weak_cache[request] = {}
self.injector.weak_cache[request].update(mapping)

Check warning on line 59 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L57-L59

Added lines #L57 - L59 were not covered by tests

async def __call__( # noqa: C901
self, to_provide: Set[Callable], request: Request, crawler: Crawler
) -> Sequence[Any]:
"""Makes a Zyte API request to provide BrowserResponse and/or item dependencies."""
# TODO what if ``response`` is already from Zyte API and contains something we need
results: List[Any] = []

for cls in list(to_provide):
item = self._cached_instances.get(request, {}).get(cls)
item = self.injector.weak_cache.get(request, {}).get(cls)

Check warning on line 68 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L68

Added line #L68 was not covered by tests
if item:
results.append(item)
to_provide.remove(cls)
elif cls == AnyResponse:
http_response = self.injector.weak_cache.get(request, {}).get(

Check warning on line 73 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L72-L73

Added lines #L72 - L73 were not covered by tests
HttpResponse
)
if http_response:
any_response = AnyResponse(response=http_response)
results.append(any_response)
self.update_cache(request, {AnyResponse: any_response})
to_provide.remove(cls)

Check warning on line 80 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L76-L80

Added lines #L76 - L80 were not covered by tests

if not to_provide:
return results

Expand Down Expand Up @@ -170,7 +174,7 @@ async def __call__( # noqa: C901
self.update_cache(request, {BrowserResponse: browser_response})

Check warning on line 174 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L173-L174

Added lines #L173 - L174 were not covered by tests

if AnyResponse in to_provide:
any_response = None
any_response = None # type: ignore[assignment]

Check warning on line 177 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L176-L177

Added lines #L176 - L177 were not covered by tests

if "browserHtml" in api_response.raw_api_response:
any_response = AnyResponse(

Check warning on line 180 in scrapy_zyte_api/providers.py

View check run for this annotation

Codecov / codecov/patch

scrapy_zyte_api/providers.py#L179-L180

Added lines #L179 - L180 were not covered by tests
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ def get_version():
# Sync with [testenv:pinned-provider] @ tox.ini
"provider": [
"andi>=0.6.0",
"scrapy-poet>=0.19.0",
# "scrapy-poet>=0.19.0",
"scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@weak-cache#egg=scrapy-poet",
# "web-poet>=0.15.1",
"web-poet @ git+https://git@github.com/scrapinghub/web-poet@response#egg=web-poet",
"zyte-common-items>=0.8.0",
Expand Down
24 changes: 19 additions & 5 deletions tests/test_providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,6 +551,10 @@ def parse_(self, response: DummyResponse, page: SomePage):
assert type(item["page"].product) == Product


# The issue here is that HttpResponseProvider runs earlier than ScrapyZyteAPI.
# HttpResponseProvider doesn't know that it should not run, since ScrapyZyteAPI
# could provide HttpResponse in any case.
@pytest.mark.xfail(reason="Not supported yet", raises=AssertionError, strict=True)
@ensureDeferred
async def test_provider_any_response_product_extract_from_http_response_2(mockserver):
@attrs.define
Expand Down Expand Up @@ -688,10 +692,15 @@ def parse_(self, response: DummyResponse, page: SomePage):
params = crawler.engine.downloader.handlers._handlers["http"].params

assert len(params) == 1
assert params[0].keys() == {"url", "HttpResponseBody"}
assert params[0].keys() == {
"url",
"httpResponseBody",
"customHttpRequestHeaders",
"httpResponseHeaders",
}

assert type(item["page"].response) == AnyResponse
assert type(item["page"].response.response) == BrowserResponse
assert type(item["page"].response.response) == HttpResponse
assert type(item["page"].http_response) == HttpResponse


Expand All @@ -715,10 +724,15 @@ def parse_(self, response: DummyResponse, page: SomePage):
params = crawler.engine.downloader.handlers._handlers["http"].params

assert len(params) == 2
assert params[0].keys() == {"url", "HttpResponseBody"}
assert params[1].keys() == {"url", "BrowserHtml"}
assert params[0].keys() == {
"url",
"httpResponseBody",
"customHttpRequestHeaders",
"httpResponseHeaders",
}
assert params[1].keys() == {"url", "browserHtml"}

assert type(item["page"].response) == AnyResponse
assert type(item["page"].response.response) == BrowserResponse
assert type(item["page"].response.response) == HttpResponse
assert type(item["page"].browser_response) == BrowserResponse
assert type(item["page"].http_response) == HttpResponse
3 changes: 2 additions & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ deps =
# scrapy-poet >= 0.4.0 depends on scrapy >= 2.6.0
{[testenv:pinned-scrapy-2x6]deps}
andi==0.6.0
scrapy-poet==0.19.0
#scrapy-poet==0.19.0
scrapy-poet @ git+https://git@github.com/scrapinghub/scrapy-poet@weak-cache#egg=scrapy-poet
#web-poet==0.15.1
web-poet @ git+https://git@github.com/scrapinghub/web-poet@response#egg=web-poet
zyte-common-items==0.8.0
Expand Down

0 comments on commit 3cb2290

Please sign in to comment.