From 5a5d89e96fcda9456d22b9ef8826b81816ef7417 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Thu, 16 Nov 2023 09:49:05 +0100
Subject: [PATCH 1/3] Add HttpRequestProvider

---
 scrapy_poet/page_input_providers.py | 25 ++++++++++++++++++
 setup.py                            |  2 +-
 tests/test_providers.py             | 41 ++++++++++++++++++++++++++++-
 3 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py
index 672909f0..ec5d4eb8 100644
--- a/scrapy_poet/page_input_providers.py
+++ b/scrapy_poet/page_input_providers.py
@@ -22,6 +22,8 @@
 from scrapy.utils.defer import maybe_deferred_to_future
 from web_poet import (
     HttpClient,
+    HttpRequest,
+    HttpRequestHeaders,
     HttpResponse,
     HttpResponseHeaders,
     PageParams,
@@ -144,6 +146,29 @@ def __init__(self, injector):
     # injection breaks the method overriding rules and mypy then complains.
 
 
+class HttpRequestProvider(PageObjectInputProvider):
+    """This class provides :class:`web_poet.HttpRequest
+    <web_poet.page_inputs.http.HttpRequest>` instances.
+    """
+
+    provided_classes = {HttpRequest}
+    name = "request_data"
+
+    def __call__(self, to_provide: Set[Callable], request: Request):
+        """Builds a :class:`web_poet.HttpRequest
+        <web_poet.page_inputs.http.HttpRequest>` instance using a
+        :class:`scrapy.http.Response` instance.
+        """
+        return [
+            HttpRequest(
+                url=RequestUrl(request.url),
+                method=request.method,
+                headers=HttpRequestHeaders.from_bytes_dict(request.headers),
+                body=request.body,
+            )
+        ]
+
+
 class HttpResponseProvider(PageObjectInputProvider):
     """This class provides :class:`web_poet.HttpResponse
     <web_poet.page_inputs.http.HttpResponse>` instances.
diff --git a/setup.py b/setup.py
index 640d3a3e..13a04f10 100755
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@
         "time_machine >= 2.2.0",
         "twisted >= 18.9.0",
         "url-matcher >= 0.2.0",
-        "web-poet >= 0.15",
+        "web-poet @ git+https://github.com/Gallaecio/web-poet.git@request-headers-from-bytes",  # https://github.com/scrapinghub/web-poet/pull/191
     ],
     classifiers=[
         "Development Status :: 3 - Alpha",
diff --git a/tests/test_providers.py b/tests/test_providers.py
index 892d9583..0ce9acdf 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -8,13 +8,21 @@
 from scrapy.settings import Settings
 from scrapy.utils.test import get_crawler
 from twisted.python.failure import Failure
-from web_poet import HttpClient, HttpResponse
+from web_poet import (
+    HttpClient,
+    HttpRequest,
+    HttpRequestBody,
+    HttpRequestHeaders,
+    HttpResponse,
+    RequestUrl,
+)
 from web_poet.serialization import SerializedLeafData, register_serialization
 
 from scrapy_poet import HttpResponseProvider
 from scrapy_poet.injection import Injector
 from scrapy_poet.page_input_providers import (
     HttpClientProvider,
+    HttpRequestProvider,
     ItemProvider,
     PageObjectInputProvider,
     PageParamsProvider,
@@ -204,6 +212,37 @@ async def test_http_client_provider(settings):
     assert results[0]._request_downloader == mock_factory.return_value
 
 
+@ensureDeferred
+async def test_http_request_provider(settings):
+    crawler = get_crawler(Spider, settings)
+    injector = Injector(crawler)
+    provider = HttpRequestProvider(injector)
+
+    empty_scrapy_request = scrapy.http.Request("https://example.com")
+    (empty_request,) = provider(set(), empty_scrapy_request)
+    assert isinstance(empty_request, HttpRequest)
+    assert isinstance(empty_request.url, RequestUrl)
+    assert str(empty_request.url) == "https://example.com"
+    assert empty_request.method == "GET"
+    assert isinstance(empty_request.headers, HttpRequestHeaders)
+    assert empty_request.headers == HttpRequestHeaders()
+    assert isinstance(empty_request.body, HttpRequestBody)
+    assert empty_request.body == HttpRequestBody()
+
+    full_scrapy_request = scrapy.http.Request(
+        "https://example.com", method="POST", body=b"a", headers={"a": "b"}
+    )
+    (full_request,) = provider(set(), full_scrapy_request)
+    assert isinstance(full_request, HttpRequest)
+    assert isinstance(full_request.url, RequestUrl)
+    assert str(full_request.url) == "https://example.com"
+    assert full_request.method == "POST"
+    assert isinstance(full_request.headers, HttpRequestHeaders)
+    assert full_request.headers == HttpRequestHeaders([("a", "b")])
+    assert isinstance(full_request.body, HttpRequestBody)
+    assert full_request.body == HttpRequestBody(b"a")
+
+
 def test_page_params_provider(settings):
     crawler = get_crawler(Spider, settings)
     injector = Injector(crawler)

From db2d0b4f5aa5182a677d2bd33a7600508dec7b3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Mon, 20 Nov 2023 12:56:57 +0100
Subject: [PATCH 2/3] Update scrapy_poet/page_input_providers.py

Co-authored-by: Kevin Lloyd Bernal
---
 scrapy_poet/page_input_providers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py
index ec5d4eb8..28960f12 100644
--- a/scrapy_poet/page_input_providers.py
+++ b/scrapy_poet/page_input_providers.py
@@ -157,7 +157,7 @@ class HttpRequestProvider(PageObjectInputProvider):
     def __call__(self, to_provide: Set[Callable], request: Request):
         """Builds a :class:`web_poet.HttpRequest
         <web_poet.page_inputs.http.HttpRequest>` instance using a
-        :class:`scrapy.http.Response` instance.
+        :class:`scrapy.http.Request` instance.
         """
         return [
             HttpRequest(

From 87f94f111901905744bf8e2270987d1866cdb2c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Tue, 21 Nov 2023 14:49:36 +0100
Subject: [PATCH 3/3] Require web-poet 0.15.1

---
 setup.py | 2 +-
 tox.ini  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 13a04f10..52fc431b 100755
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@
         "time_machine >= 2.2.0",
         "twisted >= 18.9.0",
         "url-matcher >= 0.2.0",
-        "web-poet @ git+https://github.com/Gallaecio/web-poet.git@request-headers-from-bytes",  # https://github.com/scrapinghub/web-poet/pull/191
+        "web-poet >= 0.15.1",
     ],
     classifiers=[
         "Development Status :: 3 - Alpha",
diff --git a/tox.ini b/tox.ini
index 1ca5cefe..9e539733 100644
--- a/tox.ini
+++ b/tox.ini
@@ -23,7 +23,7 @@ deps =
     sqlitedict==1.5.0
    time_machine==2.2.0
    url-matcher==0.2.0
-   web-poet==0.15.0
+   web-poet==0.15.1
    # https://github.com/john-kurkowski/tldextract/issues/305
    tldextract<3.6
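
Usage sketch (not part of the patch series above): the patches add HttpRequestProvider but do not appear to register it among scrapy-poet's default providers, so a project would enable it through the SCRAPY_POET_PROVIDERS setting. Assuming scrapy-poet itself is already enabled in the project settings, a page object can then declare web_poet.HttpRequest as a dependency and receive it built from the outgoing scrapy.Request. The RequestPage and ExampleSpider names below are hypothetical, chosen only for illustration:

    import attrs
    import scrapy
    from web_poet import HttpRequest, WebPage

    from scrapy_poet.page_input_providers import HttpRequestProvider


    @attrs.define
    class RequestPage(WebPage):
        # Injected by HttpRequestProvider from the scrapy.Request that
        # fetched this page (URL, method, headers, body).
        request: HttpRequest


    class ExampleSpider(scrapy.Spider):
        name = "example"
        start_urls = ["https://example.com"]
        custom_settings = {
            # Hypothetical priority; any unused value works.
            "SCRAPY_POET_PROVIDERS": {HttpRequestProvider: 600},
        }

        async def parse(self, response, page: RequestPage):
            # page.request mirrors the request that produced `response`.
            yield {"method": page.request.method, "url": str(page.request.url)}

This mirrors what test_http_request_provider asserts: the provider maps scrapy.Request.url, method, headers, and body onto RequestUrl, str, HttpRequestHeaders, and HttpRequestBody respectively.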