|
4 | 4 |
|
5 | 5 | pytest.importorskip("scrapy_poet")
|
6 | 6 |
|
| 7 | +import asyncio |
| 8 | + |
7 | 9 | import attrs
|
8 | 10 | from pytest_twisted import ensureDeferred
|
9 | 11 | from scrapy import Request, Spider
|
10 | 12 | from scrapy_poet import DummyResponse
|
| 13 | +from scrapy_poet.injection import Injector |
11 | 14 | from scrapy_poet.utils.testing import HtmlResource, crawl_single_item
|
12 | 15 | from scrapy_poet.utils.testing import create_scrapy_settings as _create_scrapy_settings
|
13 |
| -from twisted.internet import reactor |
| 16 | +from twisted.internet import defer, reactor |
14 | 17 | from twisted.web.client import Agent, readBody
|
15 |
| -from web_poet import BrowserHtml, BrowserResponse, ItemPage, field, handle_urls |
| 18 | +from web_poet import ( |
| 19 | + AnyResponse, |
| 20 | + BrowserHtml, |
| 21 | + BrowserResponse, |
| 22 | + HttpResponse, |
| 23 | + ItemPage, |
| 24 | + field, |
| 25 | + handle_urls, |
| 26 | +) |
16 | 27 | from zyte_common_items import BasePage, Product
|
17 | 28 |
|
18 | 29 | from scrapy_zyte_api._annotations import ExtractFrom
|
19 | 30 | from scrapy_zyte_api.providers import ZyteApiProvider
|
20 | 31 |
|
21 |
| -from . import SETTINGS |
| 32 | +from . import SETTINGS, get_crawler |
22 | 33 | from .mockserver import get_ephemeral_port
|
23 | 34 |
|
24 | 35 |
|
@@ -263,3 +274,110 @@ def parse_(self, response: DummyResponse, page: AnnotatedProductPage): # type:
|
263 | 274 | item, _, _ = await crawl_single_item(AnnotatedZyteAPISpider, HtmlResource, settings)
|
264 | 275 | assert item is None
|
265 | 276 | assert "Multiple different extractFrom specified for product" in caplog.text
|
| 277 | + |
| 278 | + |
| 279 | +@defer.inlineCallbacks |
| 280 | +def run_provider(server, to_provide, settings_dict=None, request_meta=None): |
| 281 | + class AnyResponseSpider(Spider): |
| 282 | + name = "any_response" |
| 283 | + |
| 284 | + request = Request(server.urljoin("/some-page"), meta=request_meta) |
| 285 | + settings = create_scrapy_settings() |
| 286 | + settings["ZYTE_API_URL"] = server.urljoin("/") |
| 287 | + if settings_dict: |
| 288 | + settings.update(settings_dict) |
| 289 | + crawler = get_crawler(settings, AnyResponseSpider) |
| 290 | + yield from crawler.engine.open_spider(crawler.spider) |
| 291 | + injector = Injector(crawler) |
| 292 | + provider = ZyteApiProvider(injector) |
| 293 | + |
| 294 | + coro = provider(to_provide, request, crawler) |
| 295 | + results = yield defer.Deferred.fromFuture(asyncio.ensure_future(coro)) |
| 296 | + |
| 297 | + return results |
| 298 | + |
| 299 | + |
| 300 | +@defer.inlineCallbacks |
| 301 | +def test_provider_any_response(mockserver): |
| 302 | + # Use only one instance of the mockserver for faster tests. |
| 303 | + def provide(*args, **kwargs): |
| 304 | + return run_provider(mockserver, *args, **kwargs) |
| 305 | + |
| 306 | + results = yield provide(set()) |
| 307 | + assert results == [] |
| 308 | + |
| 309 | + # Having only AnyResponse without any of the other responses to re-use |
| 310 | + # does not result in anything. |
| 311 | + results = yield provide( |
| 312 | + { |
| 313 | + AnyResponse, |
| 314 | + } |
| 315 | + ) |
| 316 | + assert results == [] |
| 317 | + |
| 318 | + # As above, AnyResponse is not provided since no response is available; only Product is returned. |
| 319 | + results = yield provide({AnyResponse, Product}) |
| 320 | + assert len(results) == 1 |
| 321 | + assert type(results[0]) == Product |
| 322 | + |
| 323 | + # AnyResponse should re-use BrowserResponse if available. |
| 324 | + results = yield provide({AnyResponse, BrowserResponse}) |
| 325 | + assert len(results) == 2 |
| 326 | + assert type(results[0]) == BrowserResponse |
| 327 | + assert type(results[1]) == AnyResponse |
| 328 | + assert id(results[0]) == id(results[1].response) |
| 329 | + |
| 330 | + # AnyResponse should re-use BrowserHtml if available. |
| 331 | + results = yield provide({AnyResponse, BrowserHtml}) |
| 332 | + assert len(results) == 2 |
| 333 | + assert type(results[0]) == BrowserHtml |
| 334 | + assert type(results[1]) == AnyResponse |
| 335 | + assert results[0] == results[1].response.html # diff instance due to casting |
| 336 | + |
| 337 | + results = yield provide({AnyResponse, BrowserResponse, BrowserHtml}) |
| 338 | + assert len(results) == 3 |
| 339 | + assert type(results[0]) == BrowserHtml |
| 340 | + assert type(results[1]) == BrowserResponse |
| 341 | + assert type(results[2]) == AnyResponse |
| 342 | + assert results[0] == results[1].html # diff instance due to casting |
| 343 | + assert results[0] == results[2].response.html |
| 344 | + |
| 345 | + # NOTE: This is hard to test in this setup, and the result would be empty. |
| 346 | + # This will be tested in a spider-setup instead so that HttpResponseProvider |
| 347 | + # can participate. |
| 348 | + # results = yield provide({AnyResponse, HttpResponse}) |
| 349 | + # assert results == [] |
| 350 | + |
| 351 | + # For the following cases, the extraction source isn't available in `to_provide` |
| 352 | + # but is specified in the `*.extractFrom` parameter. |
| 353 | + |
| 354 | + settings_dict = { |
| 355 | + "ZYTE_API_PROVIDER_PARAMS": {"productOptions": {"extractFrom": "browserHtml"}} |
| 356 | + } |
| 357 | + |
| 358 | + results = yield provide({AnyResponse, Product}, settings_dict) |
| 359 | + assert len(results) == 2 |
| 360 | + assert type(results[0]) == AnyResponse |
| 361 | + assert type(results[0].response) == BrowserResponse |
| 362 | + assert type(results[1]) == Product |
| 363 | + |
| 364 | + results = yield provide({AnyResponse, BrowserHtml, Product}, settings_dict) |
| 365 | + assert len(results) == 3 |
| 366 | + assert type(results[0]) == BrowserHtml |
| 367 | + assert type(results[1]) == AnyResponse |
| 368 | + assert type(results[1].response) == BrowserResponse |
| 369 | + assert type(results[2]) == Product |
| 370 | + assert results[0] == results[1].response.html # diff instance due to casting |
| 371 | + |
| 372 | + settings_dict = { |
| 373 | + "ZYTE_API_PROVIDER_PARAMS": { |
| 374 | + "productOptions": {"extractFrom": "httpResponseBody"} |
| 375 | + } |
| 376 | + } |
| 377 | + request_meta = {"zyte_api": {"httpResponseBody": True, "httpResponseHeaders": True}} |
| 378 | + |
| 379 | + results = yield provide({AnyResponse, Product}, settings_dict, request_meta) |
| 380 | + assert len(results) == 2 |
| 381 | + assert type(results[0]) == AnyResponse |
| 382 | + assert type(results[0].response) == HttpResponse |
| 383 | + assert type(results[1]) == Product |
0 commit comments