Commit a39ef5c

Implement a request fingerprinter that accounts for dependencies

1 parent aba2b74 commit a39ef5c

7 files changed: +334 −0 lines changed

README.rst

Lines changed: 1 addition & 0 deletions

@@ -63,6 +63,7 @@ Add the following inside Scrapy's ``settings.py`` file:
     SPIDER_MIDDLEWARES = {
         "scrapy_poet.RetryMiddleware": 275,
     }
+    REQUEST_FINGERPRINTER_CLASS = "scrapy_poet.ScrapyPoetRequestFingerprinter"
 
 Developing
 ==========

docs/intro/install.rst

Lines changed: 1 addition & 0 deletions

@@ -32,6 +32,7 @@ of your Scrapy project:
     SPIDER_MIDDLEWARES = {
         "scrapy_poet.RetryMiddleware": 275,
     }
+    REQUEST_FINGERPRINTER_CLASS = "scrapy_poet.ScrapyPoetRequestFingerprinter"
 
 Things that are good to know
 ============================

docs/settings.rst

Lines changed: 15 additions & 0 deletions

@@ -107,3 +107,18 @@ Sets the class, or its import path, that will be used as an adapter in the
 generated test fixtures.
 
 More info at :ref:`fixture-adapter`.
+
+
+SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS
+--------------------------------------------
+
+Defaults to the default value of the ``REQUEST_FINGERPRINTER_CLASS`` setting
+for the version of Scrapy currently installed (e.g.
+``"scrapy.utils.request.RequestFingerprinter"``).
+
+Assign a request fingerprinter class to this setting to use a custom request
+fingerprinter class for requests.
+
+This class is used to generate a base fingerprint for a request. If that
+request uses dependency injection, that fingerprint is then modified to
+account for the requested dependencies. Otherwise, the fingerprint is used as
+is.
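For illustration, a minimal sketch of how the two settings might be combined in a project's ``settings.py``; the ``myproject.fingerprinters.HostOnlyFingerprinter`` path and class are hypothetical:

    REQUEST_FINGERPRINTER_CLASS = "scrapy_poet.ScrapyPoetRequestFingerprinter"

    # Hypothetical custom base class; it only needs a fingerprint(request) ->
    # bytes method. The base fingerprint it returns is extended with dependency
    # information for requests that use injection.
    SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS = (
        "myproject.fingerprinters.HostOnlyFingerprinter"
    )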

scrapy_poet/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -1,3 +1,4 @@
+from ._request_fingerprinter import ScrapyPoetRequestFingerprinter
 from .api import DummyResponse, callback_for
 from .downloadermiddlewares import InjectionMiddleware
 from .page_input_providers import HttpResponseProvider, PageObjectInputProvider

scrapy_poet/_request_fingerprinter.py

Lines changed: 91 additions & 0 deletions

@@ -0,0 +1,91 @@
from typing import TYPE_CHECKING

try:
    from scrapy.utils.request import RequestFingerprinter  # NOQA
except ImportError:
    if not TYPE_CHECKING:
        ScrapyPoetRequestFingerprinter = None
else:
    import hashlib
    import json
    from functools import cached_property
    from typing import Callable, Dict, List, Optional
    from weakref import WeakKeyDictionary

    from scrapy import Request
    from scrapy.crawler import Crawler
    from scrapy.settings.default_settings import REQUEST_FINGERPRINTER_CLASS
    from scrapy.utils.misc import create_instance, load_object

    from scrapy_poet import InjectionMiddleware
    from scrapy_poet.injection import get_callback

    class ScrapyPoetRequestFingerprinter:
        @classmethod
        def from_crawler(cls, crawler):
            return cls(crawler)

        def __init__(self, crawler: Crawler) -> None:
            settings = crawler.settings
            self._fallback_request_fingerprinter = create_instance(
                load_object(
                    settings.get(
                        "SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS",
                        REQUEST_FINGERPRINTER_CLASS,
                    )
                ),
                settings=crawler.settings,
                crawler=crawler,
            )
            self._callback_cache: Dict[Callable, bytes] = {}
            self._request_cache: "WeakKeyDictionary[Request, bytes]" = (
                WeakKeyDictionary()
            )
            self._crawler: Crawler = crawler

        @cached_property
        def _injector(self):
            middlewares = self._crawler.engine.downloader.middleware.middlewares
            for middleware in middlewares:
                if isinstance(middleware, InjectionMiddleware):
                    return middleware.injector
            raise RuntimeError(
                "scrapy_poet.InjectionMiddleware not found at run time, has it "
                "been configured in the DOWNLOADER_MIDDLEWARES setting?"
            )

        def _get_deps(self, request: Request) -> Optional[List[str]]:
            """Return a JSON-serializable structure that uniquely identifies
            the dependencies requested by the request, or None if dependency
            injection is not required."""
            plan = self._injector.build_plan(request)
            root_deps = plan[-1][1]
            if not root_deps:
                return None
            return [repr(cls) for cls in root_deps.values()]

        def fingerprint_deps(self, request: Request) -> Optional[bytes]:
            """Return a fingerprint based on dependencies requested through
            scrapy-poet injection, or None if no injection was requested."""
            callback = get_callback(request, self._crawler.spider)
            if callback in self._callback_cache:
                return self._callback_cache[callback]

            deps = self._get_deps(request)
            if deps is None:
                return None

            deps_key = json.dumps(deps, sort_keys=True).encode()
            self._callback_cache[callback] = hashlib.sha1(deps_key).digest()
            return self._callback_cache[callback]

        def fingerprint(self, request: Request) -> bytes:
            if request in self._request_cache:
                return self._request_cache[request]
            fingerprint = self._fallback_request_fingerprinter.fingerprint(request)
            deps_fingerprint = self.fingerprint_deps(request)
            if deps_fingerprint is None:
                return fingerprint
            fingerprints = fingerprint + deps_fingerprint
            self._request_cache[request] = hashlib.sha1(fingerprints).digest()
            return self._request_cache[request]
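To make the hashing scheme concrete, a standalone sketch of what ``fingerprint()`` computes for a request with injected dependencies; the request bytes and the dependency ``repr()`` value below are illustrative stand-ins, not values the class is guaranteed to produce:

    import hashlib
    import json

    # Stand-in for the base fingerprint returned by the fallback fingerprinter.
    base_fingerprint = hashlib.sha1(b"GET https://toscrape.com").digest()

    # repr() strings of the dependencies resolved for the request's callback,
    # serialized deterministically so ordering cannot change the result.
    deps = ["<class 'web_poet.pages.WebPage'>"]
    deps_key = json.dumps(deps, sort_keys=True).encode()
    deps_fingerprint = hashlib.sha1(deps_key).digest()

    # Final fingerprint: SHA1 over the concatenation. A request that injects
    # no dependencies keeps its base fingerprint unchanged.
    fingerprint = hashlib.sha1(base_fingerprint + deps_fingerprint).digest()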

tests/__init__.py

Lines changed: 36 additions & 0 deletions

@@ -1,8 +1,44 @@
 import os
 
+from scrapy import Spider
+from scrapy.crawler import Crawler
+from scrapy.utils.test import get_crawler as _get_crawler
+
 # Note that tox.ini should only set the REACTOR env variable when running
 # pytest with "--reactor=asyncio".
 if os.environ.get("REACTOR", "") == "asyncio":
     from scrapy.utils.reactor import install_reactor
 
     install_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
+
+
+def get_download_handler(crawler, schema):
+    return crawler.engine.downloader.handlers._get_handler(schema)
+
+
+def setup_crawler_engine(crawler: Crawler):
+    """Run the crawl steps until engine setup, so that crawler.engine is not
+    None.
+
+    https://github.com/scrapy/scrapy/blob/8fbebfa943c3352f5ba49f46531a6ccdd0b52b60/scrapy/crawler.py#L116-L122
+    """
+
+    crawler.crawling = True
+    crawler.spider = crawler._create_spider()
+    crawler.engine = crawler._create_engine()
+
+    handler = get_download_handler(crawler, "https")
+    if hasattr(handler, "engine_started"):
+        handler.engine_started()
+
+
+class DummySpider(Spider):
+    name = "dummy"
+
+
+def get_crawler(settings=None, spider_cls=DummySpider, setup_engine=True):
+    settings = settings or {}
+    crawler = _get_crawler(settings_dict=settings, spidercls=spider_cls)
+    if setup_engine:
+        setup_crawler_engine(crawler)
+    return crawler
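As a usage sketch, assuming it runs inside this test package where the import resolves:

    from tests import DummySpider, get_crawler

    # The engine is set up by default, so components that need crawler.engine,
    # such as the fingerprinter's injector lookup, work without starting a crawl.
    crawler = get_crawler()
    assert crawler.engine is not None
    assert isinstance(crawler.spider, DummySpider)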

tests/test_request_fingerprinting.py

Lines changed: 189 additions & 0 deletions

@@ -0,0 +1,189 @@
import sys

import pytest
from packaging.version import Version
from scrapy import __version__ as SCRAPY_VERSION

if Version(SCRAPY_VERSION) < Version("2.7"):
    pytest.skip("Skipping tests for Scrapy < 2.7", allow_module_level=True)

from importlib.metadata import version as package_version

from scrapy import Request, Spider
from web_poet import ItemPage, WebPage

from scrapy_poet import ScrapyPoetRequestFingerprinter

from . import get_crawler as _get_crawler

ANDI_VERSION = Version(package_version("andi"))

SETTINGS = {
    "DOWNLOADER_MIDDLEWARES": {
        "scrapy_poet.InjectionMiddleware": 543,
    },
    "REQUEST_FINGERPRINTER_CLASS": ScrapyPoetRequestFingerprinter,
}


def get_crawler(spider_cls=None, settings=None):
    settings = SETTINGS if settings is None else settings
    kwargs = {}
    if spider_cls is not None:
        kwargs["spider_cls"] = spider_cls
    return _get_crawler(settings=settings, **kwargs)


def test_no_deps_vs_dep():
    class TestSpider(Spider):
        name = "test_spider"

        async def parse_page(self, response, page: WebPage):
            pass

    crawler = get_crawler(spider_cls=TestSpider)
    fingerprinter = crawler.request_fingerprinter
    request1 = Request("https://toscrape.com")
    fingerprint1 = fingerprinter.fingerprint(request1)
    request2 = Request("https://toscrape.com", callback=crawler.spider.parse_page)
    fingerprint2 = fingerprinter.fingerprint(request2)
    assert fingerprint1 != fingerprint2


def test_same_deps():
    class TestSpider(Spider):
        name = "test_spider"

        async def parse_page(self, response, page: WebPage):
            pass

    crawler = get_crawler(spider_cls=TestSpider)
    fingerprinter = crawler.request_fingerprinter
    request1 = Request("https://toscrape.com", callback=crawler.spider.parse_page)
    fingerprint1 = fingerprinter.fingerprint(request1)
    request2 = Request("https://toscrape.com", callback=crawler.spider.parse_page)
    fingerprint2 = fingerprinter.fingerprint(request2)
    assert fingerprint1 == fingerprint2


def test_same_deps_different_callbacks():
    class TestSpider(Spider):
        name = "test_spider"

        async def parse_a(self, response, a: WebPage):
            pass

        async def parse_b(self, response, b: WebPage):
            pass

    crawler = get_crawler(spider_cls=TestSpider)
    fingerprinter = crawler.request_fingerprinter
    request1 = Request("https://toscrape.com", callback=crawler.spider.parse_a)
    fingerprint1 = fingerprinter.fingerprint(request1)
    request2 = Request("https://toscrape.com", callback=crawler.spider.parse_b)
    fingerprint2 = fingerprinter.fingerprint(request2)
    assert fingerprint1 == fingerprint2


def test_different_deps():
    class TestSpider(Spider):
        name = "test_spider"

        async def parse_item(self, response, item: ItemPage):
            pass

        async def parse_web(self, response, web: WebPage):
            pass

    crawler = get_crawler(spider_cls=TestSpider)
    fingerprinter = crawler.request_fingerprinter
    request1 = Request("https://toscrape.com", callback=crawler.spider.parse_item)
    fingerprint1 = fingerprinter.fingerprint(request1)
    request2 = Request("https://toscrape.com", callback=crawler.spider.parse_web)
    fingerprint2 = fingerprinter.fingerprint(request2)
    assert fingerprint1 != fingerprint2


@pytest.mark.skipif(
    sys.version_info < (3, 9), reason="No Annotated support in Python < 3.9"
)
@pytest.mark.skipif(
    ANDI_VERSION <= Version("0.4.1"),
    reason="https://github.com/scrapinghub/andi/pull/25",
)
def test_different_annotations():
    from typing import Annotated

    class TestSpider(Spider):
        name = "test_spider"

        async def parse_a(self, response, a: Annotated[WebPage, "a"]):
            pass

        async def parse_b(self, response, b: Annotated[WebPage, "b"]):
            pass

    crawler = get_crawler(spider_cls=TestSpider)
    fingerprinter = crawler.request_fingerprinter
    request1 = Request("https://toscrape.com", callback=crawler.spider.parse_a)
    fingerprint1 = fingerprinter.fingerprint(request1)
    request2 = Request("https://toscrape.com", callback=crawler.spider.parse_b)
    fingerprint2 = fingerprinter.fingerprint(request2)
    assert fingerprint1 != fingerprint2


def test_fallback_default():
    class TestSpider(Spider):
        name = "test_spider"

        async def parse_page(self, response, page: WebPage):
            pass

    crawler = get_crawler(spider_cls=TestSpider)
    fingerprinter = crawler.request_fingerprinter
    fallback_fingerprinter = (
        crawler.request_fingerprinter._fallback_request_fingerprinter
    )

    request1 = Request("https://toscrape.com")
    fingerprint1 = fingerprinter.fingerprint(request1)
    fallback_fingerprint = fallback_fingerprinter.fingerprint(request1)
    assert fingerprint1 == fallback_fingerprint

    request2 = Request("https://toscrape.com", callback=crawler.spider.parse_page)
    fingerprint2 = fingerprinter.fingerprint(request2)
    assert fallback_fingerprint == fallback_fingerprinter.fingerprint(request2)
    assert fingerprint2 != fallback_fingerprint


def test_fallback_custom():
    class TestSpider(Spider):
        name = "test_spider"

        async def parse_page(self, response, page: WebPage):
            pass

    class CustomFingerprinter:
        def fingerprint(self, request):
            return b"foo"

    settings = {
        **SETTINGS,
        "SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS": CustomFingerprinter,
    }
    crawler = get_crawler(spider_cls=TestSpider, settings=settings)
    fingerprinter = crawler.request_fingerprinter

    request = Request("https://example.com")
    assert fingerprinter.fingerprint(request) == b"foo"
    request = Request("https://example.com", callback=crawler.spider.parse_page)
    assert fingerprinter.fingerprint(request) != b"foo"


def test_missing_middleware():
    settings = {**SETTINGS, "DOWNLOADER_MIDDLEWARES": {}}
    crawler = get_crawler(settings=settings)
    fingerprinter = crawler.request_fingerprinter
    request = Request("https://example.com")
    with pytest.raises(RuntimeError):
        fingerprinter.fingerprint(request)
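The practical effect these tests pin down can be summarized with a short sketch; the spider below is hypothetical and assumes the settings shown in the README:

    import scrapy
    from web_poet import WebPage

    class ExampleSpider(scrapy.Spider):  # hypothetical, for illustration only
        name = "example"

        async def parse_plain(self, response):
            ...

        async def parse_with_page(self, response, page: WebPage):
            ...

    # With ScrapyPoetRequestFingerprinter enabled, two requests for the same
    # URL whose callbacks declare different dependencies get different
    # fingerprints, so the duplicate filter no longer collapses them into one.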
