|
27 | 27 | from zyte_common_items import BasePage, Product
|
28 | 28 |
|
29 | 29 | from scrapy_zyte_api._annotations import ExtractFrom
|
| 30 | +from scrapy_zyte_api.handler import ScrapyZyteAPIDownloadHandler |
30 | 31 | from scrapy_zyte_api.providers import ZyteApiProvider
|
31 | 32 |
|
32 | 33 | from . import SETTINGS, get_crawler
|
@@ -382,3 +383,342 @@ def provide(*args, **kwargs):
|
382 | 383 | assert type(results[0]) == AnyResponse
|
383 | 384 | assert type(results[0].response) == HttpResponse
|
384 | 385 | assert type(results[1]) == Product
|
| 386 | + |
| 387 | + |
class RecordingHandler(ScrapyZyteAPIDownloadHandler):
    """Download handler that remembers the parameters of every logged request.

    Each ``params`` value handed to :meth:`_log_request` is appended to
    ``self.params`` in call order. Tests inspect that list to see what was
    requested and use its length as the number of requests made.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # One entry per _log_request() call, oldest first.
        self.params: list = []

    def _log_request(self, params):
        # NOTE(review): this overrides the parent hook without calling
        # super(), so whatever the parent does in _log_request is skipped --
        # presumably intentional for tests; confirm.
        self.params.append(params)
| 400 | + |
| 401 | + |
def provider_settings(server):
    """Build the settings shared by the provider tests.

    Starts from ``create_scrapy_settings()``, points ``ZYTE_API_URL`` at the
    root of *server*, turns transparent mode on, registers
    ``ZyteApiProvider`` as a scrapy-poet provider, and installs
    ``RecordingHandler`` as the ``http`` download handler so tests can read
    back the recorded request parameters.
    """
    config = create_scrapy_settings()
    overrides = {
        "ZYTE_API_URL": server.urljoin("/"),
        "ZYTE_API_TRANSPARENT_MODE": True,
        "SCRAPY_POET_PROVIDERS": {ZyteApiProvider: 1100},
    }
    for key, value in overrides.items():
        config[key] = value
    # Only the "http" handler is replaced; other schemes keep their handlers.
    config["DOWNLOAD_HANDLERS"]["http"] = RecordingHandler
    return config
| 409 | + |
| 410 | + |
@ensureDeferred
async def test_provider_any_response_only(mockserver):
    """A page that depends only on ``AnyResponse`` causes a single request.

    The one recorded request is expected to carry nothing but ``url``.
    """

    @attrs.define
    class SomePage(BasePage):
        response: AnyResponse

    class ZyteAPISpider(Spider):
        def start_requests(self):
            yield Request(self.url, callback=self.parse_)

        def parse_(self, response: DummyResponse, page: SomePage):
            yield {"page": page}

    item, _url, crawler = await crawl_single_item(
        ZyteAPISpider, HtmlResource, provider_settings(mockserver)
    )
    handler = crawler.engine.downloader.handlers._handlers["http"]
    recorded = handler.params

    assert len(recorded) == 1
    assert recorded[0].keys() == {"url"}
    # NOTE(review): the crawl is expected to produce no item here --
    # confirm this is the intended outcome rather than a placeholder.
    assert item is None
| 431 | + |
| 432 | + |
@ensureDeferred
async def test_provider_any_response_product(mockserver):
    """``AnyResponse`` together with ``Product`` is served by one request.

    The single recorded request is expected to carry ``url`` and ``product``.
    """

    @attrs.define
    class SomePage(BasePage):
        response: AnyResponse
        product: Product

    class ZyteAPISpider(Spider):
        def start_requests(self):
            yield Request(self.url, callback=self.parse_)

        def parse_(self, response: DummyResponse, page: SomePage):
            yield {"page": page}

    item, _url, crawler = await crawl_single_item(
        ZyteAPISpider, HtmlResource, provider_settings(mockserver)
    )
    handler = crawler.engine.downloader.handlers._handlers["http"]
    recorded = handler.params

    assert len(recorded) == 1
    assert recorded[0].keys() == {"url", "product"}
    # NOTE(review): the crawl is expected to produce no item here --
    # confirm this is the intended outcome rather than a placeholder.
    assert item is None
| 454 | + |
| 455 | + |
@ensureDeferred
async def test_provider_any_response_product_extract_from_browser_html(mockserver):
    """``Product`` extracted from browser HTML shares its request with ``AnyResponse``.

    With ``productOptions.extractFrom`` set to ``"browserHtml"``, one request
    is expected, and ``AnyResponse`` should wrap a ``BrowserResponse``.
    """

    @attrs.define
    class SomePage(BasePage):
        response: AnyResponse
        product: Product

    class ZyteAPISpider(Spider):
        def start_requests(self):
            yield Request(self.url, callback=self.parse_)

        def parse_(self, response: DummyResponse, page: SomePage):
            yield {"page": page}

    settings = provider_settings(mockserver)
    settings["ZYTE_API_PROVIDER_PARAMS"] = {
        "productOptions": {"extractFrom": "browserHtml"}
    }
    item, _url, crawler = await crawl_single_item(
        ZyteAPISpider, HtmlResource, settings
    )
    params = crawler.engine.downloader.handlers._handlers["http"].params

    assert len(params) == 1
    assert params[0].keys() == {"url", "product", "browserHtml", "productOptions"}

    # Exact-type checks use identity (`is`), not equality, on type objects.
    assert type(item["page"].response) is AnyResponse
    assert type(item["page"].response.response) is BrowserResponse
    assert type(item["page"].product) is Product
| 483 | + |
| 484 | + |
@ensureDeferred
async def test_provider_any_response_product_extract_from_browser_html_2(mockserver):
    """``AnyResponse`` reuses the ``BrowserResponse`` that is also injected directly.

    Same scenario as the browser-HTML product test, but the page additionally
    asks for ``BrowserResponse``; the wrapped response and the directly
    injected one must be the very same object.
    """

    @attrs.define
    class SomePage(BasePage):
        response: AnyResponse
        browser_response: BrowserResponse
        product: Product

    class ZyteAPISpider(Spider):
        def start_requests(self):
            yield Request(self.url, callback=self.parse_)

        def parse_(self, response: DummyResponse, page: SomePage):
            yield {"page": page}

    settings = provider_settings(mockserver)
    settings["ZYTE_API_PROVIDER_PARAMS"] = {
        "productOptions": {"extractFrom": "browserHtml"}
    }
    item, _url, crawler = await crawl_single_item(
        ZyteAPISpider, HtmlResource, settings
    )
    params = crawler.engine.downloader.handlers._handlers["http"].params

    assert len(params) == 1
    assert params[0].keys() == {"url", "product", "browserHtml", "productOptions"}

    # Exact-type checks use identity (`is`), not equality, on type objects.
    assert type(item["page"].response) is AnyResponse
    assert type(item["page"].response.response) is BrowserResponse
    assert type(item["page"].browser_response) is BrowserResponse
    assert type(item["page"].product) is Product

    # Object identity is what is being tested, so state it with `is`.
    assert item["page"].browser_response is item["page"].response.response
| 516 | + |
| 517 | + |
@ensureDeferred
async def test_provider_any_response_product_extract_from_http_response(mockserver):
    """``Product`` extracted from the HTTP body shares its request with ``AnyResponse``.

    With ``productOptions.extractFrom`` set to ``"httpResponseBody"``, one
    request is expected, and ``AnyResponse`` should wrap an ``HttpResponse``.
    """

    @attrs.define
    class SomePage(BasePage):
        response: AnyResponse
        product: Product

    class ZyteAPISpider(Spider):
        def start_requests(self):
            yield Request(self.url, callback=self.parse_)

        def parse_(self, response: DummyResponse, page: SomePage):
            yield {"page": page}

    settings = provider_settings(mockserver)
    settings["ZYTE_API_PROVIDER_PARAMS"] = {
        "productOptions": {"extractFrom": "httpResponseBody"}
    }
    item, _url, crawler = await crawl_single_item(
        ZyteAPISpider, HtmlResource, settings
    )
    params = crawler.engine.downloader.handlers._handlers["http"].params

    assert len(params) == 1
    assert params[0].keys() == {
        "url",
        "product",
        "httpResponseBody",
        "productOptions",
        "httpResponseHeaders",
        "customHttpRequestHeaders",
    }

    # Exact-type checks use identity (`is`), not equality, on type objects.
    assert type(item["page"].response) is AnyResponse
    assert type(item["page"].response.response) is HttpResponse
    assert type(item["page"].product) is Product
| 552 | + |
| 553 | + |
@ensureDeferred
async def test_provider_any_response_product_extract_from_http_response_2(mockserver):
    """HTTP-body product extraction with ``HttpResponse`` also requested directly.

    Same scenario as the HTTP-body product test, but the page additionally
    asks for ``HttpResponse``; still only one request is expected.
    """

    @attrs.define
    class SomePage(BasePage):
        response: AnyResponse
        http_response: HttpResponse
        product: Product

    class ZyteAPISpider(Spider):
        def start_requests(self):
            yield Request(self.url, callback=self.parse_)

        def parse_(self, response: DummyResponse, page: SomePage):
            yield {"page": page}

    settings = provider_settings(mockserver)
    settings["ZYTE_API_PROVIDER_PARAMS"] = {
        "productOptions": {"extractFrom": "httpResponseBody"}
    }
    item, _url, crawler = await crawl_single_item(
        ZyteAPISpider, HtmlResource, settings
    )
    params = crawler.engine.downloader.handlers._handlers["http"].params

    assert len(params) == 1
    assert params[0].keys() == {
        "url",
        "product",
        "httpResponseBody",
        "productOptions",
        "httpResponseHeaders",
        "customHttpRequestHeaders",
    }

    # Exact-type checks use identity (`is`), not equality, on type objects.
    assert type(item["page"].response) is AnyResponse
    assert type(item["page"].response.response) is HttpResponse
    assert type(item["page"].product) is Product
    assert type(item["page"].http_response) is HttpResponse
| 590 | + |
| 591 | + |
@ensureDeferred
async def test_provider_any_response_browser_html(mockserver):
    """``AnyResponse`` plus ``BrowserHtml`` is served by one ``browserHtml`` request.

    ``AnyResponse`` is expected to wrap a ``BrowserResponse``.
    """

    @attrs.define
    class SomePage(BasePage):
        response: AnyResponse
        html: BrowserHtml

    class ZyteAPISpider(Spider):
        def start_requests(self):
            yield Request(self.url, callback=self.parse_)

        def parse_(self, response: DummyResponse, page: SomePage):
            yield {"page": page}

    settings = provider_settings(mockserver)
    item, _url, crawler = await crawl_single_item(
        ZyteAPISpider, HtmlResource, settings
    )
    params = crawler.engine.downloader.handlers._handlers["http"].params

    assert len(params) == 1
    assert params[0].keys() == {"url", "browserHtml"}

    # Exact-type checks use identity (`is`), not equality, on type objects.
    assert type(item["page"].response) is AnyResponse
    assert type(item["page"].response.response) is BrowserResponse
    assert type(item["page"].html) is BrowserHtml
| 616 | + |
| 617 | + |
@ensureDeferred
async def test_provider_any_response_browser_response(mockserver):
    """``AnyResponse`` plus ``BrowserResponse`` is served by one ``browserHtml`` request.

    ``AnyResponse`` is expected to wrap a ``BrowserResponse``.
    """

    @attrs.define
    class SomePage(BasePage):
        response: AnyResponse
        browser_response: BrowserResponse

    class ZyteAPISpider(Spider):
        def start_requests(self):
            yield Request(self.url, callback=self.parse_)

        def parse_(self, response: DummyResponse, page: SomePage):
            yield {"page": page}

    settings = provider_settings(mockserver)
    item, _url, crawler = await crawl_single_item(
        ZyteAPISpider, HtmlResource, settings
    )
    params = crawler.engine.downloader.handlers._handlers["http"].params

    assert len(params) == 1
    assert params[0].keys() == {"url", "browserHtml"}

    # Exact-type checks use identity (`is`), not equality, on type objects.
    assert type(item["page"].response) is AnyResponse
    assert type(item["page"].response.response) is BrowserResponse
    assert type(item["page"].browser_response) is BrowserResponse
| 642 | + |
| 643 | + |
@ensureDeferred
async def test_provider_any_response_browser_html_response(mockserver):
    """``AnyResponse``, ``BrowserResponse`` and ``BrowserHtml`` share one request.

    A single ``browserHtml`` request is expected to satisfy all three
    dependencies, with ``AnyResponse`` wrapping a ``BrowserResponse``.
    """

    @attrs.define
    class SomePage(BasePage):
        response: AnyResponse
        browser_response: BrowserResponse
        html: BrowserHtml

    class ZyteAPISpider(Spider):
        def start_requests(self):
            yield Request(self.url, callback=self.parse_)

        def parse_(self, response: DummyResponse, page: SomePage):
            yield {"page": page}

    settings = provider_settings(mockserver)
    item, _url, crawler = await crawl_single_item(
        ZyteAPISpider, HtmlResource, settings
    )
    params = crawler.engine.downloader.handlers._handlers["http"].params

    assert len(params) == 1
    assert params[0].keys() == {"url", "browserHtml"}

    # Exact-type checks use identity (`is`), not equality, on type objects.
    assert type(item["page"].response) is AnyResponse
    assert type(item["page"].response.response) is BrowserResponse
    assert type(item["page"].browser_response) is BrowserResponse
    assert type(item["page"].html) is BrowserHtml
| 670 | + |
| 671 | + |
@ensureDeferred
async def test_provider_any_response_http_response(mockserver):
    """``AnyResponse`` plus ``HttpResponse`` is served by one HTTP-body request.

    ``AnyResponse`` is expected to wrap the ``HttpResponse``, as in the other
    HTTP-body tests in this module.
    """

    @attrs.define
    class SomePage(BasePage):
        response: AnyResponse
        http_response: HttpResponse

    class ZyteAPISpider(Spider):
        def start_requests(self):
            yield Request(self.url, callback=self.parse_)

        def parse_(self, response: DummyResponse, page: SomePage):
            yield {"page": page}

    settings = provider_settings(mockserver)
    item, _url, crawler = await crawl_single_item(
        ZyteAPISpider, HtmlResource, settings
    )
    params = crawler.engine.downloader.handlers._handlers["http"].params

    assert len(params) == 1
    # Zyte API parameter names are lower camelCase ("httpResponseBody").
    # NOTE(review): the product tests that request httpResponseBody also
    # record "httpResponseHeaders" (and "customHttpRequestHeaders") --
    # confirm whether those keys are expected here as well.
    assert params[0].keys() == {"url", "httpResponseBody"}

    # Exact-type checks use identity (`is`), not equality, on type objects.
    assert type(item["page"].response) is AnyResponse
    # Only an HTTP body is requested, so the wrapped response is HttpResponse.
    assert type(item["page"].response.response) is HttpResponse
    assert type(item["page"].http_response) is HttpResponse
| 696 | + |
| 697 | + |
@ensureDeferred
async def test_provider_any_response_browser_http_response(mockserver):
    """Requesting both ``BrowserResponse`` and ``HttpResponse`` takes two requests.

    One request is expected for the HTTP body and one for the browser HTML;
    ``AnyResponse`` is expected to wrap the ``BrowserResponse``.
    """

    @attrs.define
    class SomePage(BasePage):
        response: AnyResponse
        browser_response: BrowserResponse
        http_response: HttpResponse

    class ZyteAPISpider(Spider):
        def start_requests(self):
            yield Request(self.url, callback=self.parse_)

        def parse_(self, response: DummyResponse, page: SomePage):
            yield {"page": page}

    settings = provider_settings(mockserver)
    item, _url, crawler = await crawl_single_item(
        ZyteAPISpider, HtmlResource, settings
    )
    params = crawler.engine.downloader.handlers._handlers["http"].params

    assert len(params) == 2
    # Zyte API parameter names are lower camelCase.
    # NOTE(review): the product tests that request httpResponseBody also
    # record "httpResponseHeaders" (and "customHttpRequestHeaders") --
    # confirm whether those keys are expected in the first request.
    assert params[0].keys() == {"url", "httpResponseBody"}
    assert params[1].keys() == {"url", "browserHtml"}

    # Exact-type checks use identity (`is`), not equality, on type objects.
    assert type(item["page"].response) is AnyResponse
    assert type(item["page"].response.response) is BrowserResponse
    assert type(item["page"].browser_response) is BrowserResponse
    assert type(item["page"].http_response) is HttpResponse
0 commit comments