From 5a5d89e96fcda9456d22b9ef8826b81816ef7417 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 16 Nov 2023 09:49:05 +0100 Subject: [PATCH] Add HttpRequestProvider --- scrapy_poet/page_input_providers.py | 25 ++++++++++++++++++ setup.py | 2 +- tests/test_providers.py | 41 ++++++++++++++++++++++++++++- 3 files changed, 66 insertions(+), 2 deletions(-) diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index 672909f0..ec5d4eb8 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -22,6 +22,8 @@ from scrapy.utils.defer import maybe_deferred_to_future from web_poet import ( HttpClient, + HttpRequest, + HttpRequestHeaders, HttpResponse, HttpResponseHeaders, PageParams, @@ -144,6 +146,29 @@ def __init__(self, injector): # injection breaks the method overriding rules and mypy then complains. +class HttpRequestProvider(PageObjectInputProvider): + """This class provides :class:`web_poet.HttpRequest + ` instances. + """ + + provided_classes = {HttpRequest} + name = "request_data" + + def __call__(self, to_provide: Set[Callable], request: Request): + """Builds a :class:`web_poet.HttpRequest + ` instance using a + :class:`scrapy.http.Request` instance. + """ + return [ + HttpRequest( + url=RequestUrl(request.url), + method=request.method, + headers=HttpRequestHeaders.from_bytes_dict(request.headers), + body=request.body, + ) + ] + + class HttpResponseProvider(PageObjectInputProvider): """This class provides :class:`web_poet.HttpResponse ` instances. 
diff --git a/setup.py b/setup.py index 640d3a3e..13a04f10 100755 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ "time_machine >= 2.2.0", "twisted >= 18.9.0", "url-matcher >= 0.2.0", - "web-poet >= 0.15", + "web-poet @ git+https://github.com/Gallaecio/web-poet.git@request-headers-from-bytes", # https://github.com/scrapinghub/web-poet/pull/191 ], classifiers=[ "Development Status :: 3 - Alpha", diff --git a/tests/test_providers.py b/tests/test_providers.py index 892d9583..0ce9acdf 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -8,13 +8,21 @@ from scrapy.settings import Settings from scrapy.utils.test import get_crawler from twisted.python.failure import Failure -from web_poet import HttpClient, HttpResponse +from web_poet import ( + HttpClient, + HttpRequest, + HttpRequestBody, + HttpRequestHeaders, + HttpResponse, + RequestUrl, +) from web_poet.serialization import SerializedLeafData, register_serialization from scrapy_poet import HttpResponseProvider from scrapy_poet.injection import Injector from scrapy_poet.page_input_providers import ( HttpClientProvider, + HttpRequestProvider, ItemProvider, PageObjectInputProvider, PageParamsProvider, @@ -204,6 +212,37 @@ async def test_http_client_provider(settings): assert results[0]._request_downloader == mock_factory.return_value +@ensureDeferred +async def test_http_request_provider(settings): + crawler = get_crawler(Spider, settings) + injector = Injector(crawler) + provider = HttpRequestProvider(injector) + + empty_scrapy_request = scrapy.http.Request("https://example.com") + (empty_request,) = provider(set(), empty_scrapy_request) + assert isinstance(empty_request, HttpRequest) + assert isinstance(empty_request.url, RequestUrl) + assert str(empty_request.url) == "https://example.com" + assert empty_request.method == "GET" + assert isinstance(empty_request.headers, HttpRequestHeaders) + assert empty_request.headers == HttpRequestHeaders() + assert isinstance(empty_request.body, HttpRequestBody) + 
assert empty_request.body == HttpRequestBody() + + full_scrapy_request = scrapy.http.Request( + "https://example.com", method="POST", body=b"a", headers={"a": "b"} + ) + (full_request,) = provider(set(), full_scrapy_request) + assert isinstance(full_request, HttpRequest) + assert isinstance(full_request.url, RequestUrl) + assert str(full_request.url) == "https://example.com" + assert full_request.method == "POST" + assert isinstance(full_request.headers, HttpRequestHeaders) + assert full_request.headers == HttpRequestHeaders([("a", "b")]) + assert isinstance(full_request.body, HttpRequestBody) + assert full_request.body == HttpRequestBody(b"a") + + def test_page_params_provider(settings): crawler = get_crawler(Spider, settings) injector = Injector(crawler)