Implement a request fingerprinter that accounts for dependencies #172

Merged: 20 commits, Jan 11, 2024

1 change: 1 addition & 0 deletions README.rst
@@ -63,6 +63,7 @@ Add the following inside Scrapy's ``settings.py`` file:
SPIDER_MIDDLEWARES = {
    "scrapy_poet.RetryMiddleware": 275,
}
REQUEST_FINGERPRINTER_CLASS = "scrapy_poet.ScrapyPoetRequestFingerprinter"

Developing
==========
1 change: 1 addition & 0 deletions docs/intro/install.rst
@@ -32,6 +32,7 @@ of your Scrapy project:
SPIDER_MIDDLEWARES = {
    "scrapy_poet.RetryMiddleware": 275,
}
REQUEST_FINGERPRINTER_CLASS = "scrapy_poet.ScrapyPoetRequestFingerprinter"

Things that are good to know
============================
15 changes: 15 additions & 0 deletions docs/settings.rst
@@ -107,3 +107,18 @@ Sets the class, or its import path, that will be used as an adapter in the
generated test fixtures.

More info at :ref:`fixture-adapter`.


SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS
--------------------------------------------

The default value is the default value of the ``REQUEST_FINGERPRINTER_CLASS``
setting for the version of Scrapy currently installed (e.g.
``"scrapy.utils.request.RequestFingerprinter"``).

You can assign a request fingerprinter class to this setting to use a custom
base fingerprinter for requests.

This class is used to generate a base fingerprint for a request. If that
request uses dependency injection, that fingerprint is then modified to account
for requested dependencies. Otherwise, the fingerprint is used as is.
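
For illustration, a minimal settings sketch that keeps scrapy-poet's
dependency-aware fingerprinting while delegating the base fingerprint to a
custom class (the ``myproject.fingerprinting.CustomFingerprinter`` path below
is a hypothetical example, not part of this PR):

    # settings.py (sketch; the base class path is hypothetical)
    REQUEST_FINGERPRINTER_CLASS = "scrapy_poet.ScrapyPoetRequestFingerprinter"
    SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS = (
        "myproject.fingerprinting.CustomFingerprinter"
    )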
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -4,6 +4,8 @@ line-length = 88
[tool.isort]
profile = "black"
multi_line_output = 3
# scrapy_poet/__init__.py: Automatic sorting causes circular dependencies.
skip = ["scrapy_poet/__init__.py"]

[[tool.mypy.overrides]]
module = [
1 change: 1 addition & 0 deletions scrapy_poet/__init__.py
@@ -2,3 +2,4 @@
from .downloadermiddlewares import InjectionMiddleware
from .page_input_providers import HttpResponseProvider, PageObjectInputProvider
from .spidermiddlewares import RetryMiddleware
from ._request_fingerprinter import ScrapyPoetRequestFingerprinter
102 changes: 102 additions & 0 deletions scrapy_poet/_request_fingerprinter.py
@@ -0,0 +1,102 @@
try:
    from scrapy.utils.request import RequestFingerprinter  # NOQA
except ImportError:
    from typing import TYPE_CHECKING

    if not TYPE_CHECKING:
        ScrapyPoetRequestFingerprinter = None
else:
    import hashlib
    import json
    from functools import cached_property
    from typing import Callable, Dict, List, Optional, get_args, get_origin
    from weakref import WeakKeyDictionary

    from scrapy import Request
    from scrapy.crawler import Crawler
    from scrapy.settings.default_settings import REQUEST_FINGERPRINTER_CLASS
    from scrapy.utils.misc import create_instance, load_object

    from scrapy_poet import InjectionMiddleware
    from scrapy_poet.injection import get_callback

    def _serialize_dep(cls):
        try:
            from typing import Annotated
        except ImportError:
            pass
        else:
            if get_origin(cls) is Annotated:
                annotated, *annotations = get_args(cls)
                return f"{_serialize_dep(annotated)}{repr(annotations)}"
        return f"{cls.__module__}.{cls.__qualname__}"

    class ScrapyPoetRequestFingerprinter:
        @classmethod
        def from_crawler(cls, crawler):
            return cls(crawler)

        def __init__(self, crawler: Crawler) -> None:
            settings = crawler.settings
            self._base_request_fingerprinter = create_instance(
                load_object(
                    settings.get(
                        "SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS",
                        REQUEST_FINGERPRINTER_CLASS,
                    )
                ),
                settings=crawler.settings,
                crawler=crawler,
            )
            self._callback_cache: Dict[Callable, bytes] = {}
            self._request_cache: "WeakKeyDictionary[Request, bytes]" = (
                WeakKeyDictionary()
            )
            self._crawler: Crawler = crawler

        @cached_property
        def _injector(self):
            middlewares = self._crawler.engine.downloader.middleware.middlewares
            for middleware in middlewares:
                if isinstance(middleware, InjectionMiddleware):
                    return middleware.injector
            raise RuntimeError(
                "scrapy_poet.InjectionMiddleware not found at run time, has it "
                "been configured in the DOWNLOADER_MIDDLEWARES setting?"
            )

        def _get_deps(self, request: Request) -> Optional[List[str]]:
            """Return a JSON-serializable structure that uniquely identifies the
            dependencies requested by the request, or None if dependency injection
            is not required."""
            plan = self._injector.build_plan(request)
            root_deps = plan[-1][1]
            if not root_deps:
                return None
            return sorted([_serialize_dep(cls) for cls in root_deps.values()])

        def fingerprint_deps(self, request: Request) -> Optional[bytes]:
            """Return a fingerprint based on dependencies requested through
            scrapy-poet injection, or None if no injection was requested."""
            callback = get_callback(request, self._crawler.spider)
            if callback in self._callback_cache:
                return self._callback_cache[callback]

            deps = self._get_deps(request)
            if deps is None:
                return None

            deps_key = json.dumps(deps, sort_keys=True).encode()
            self._callback_cache[callback] = hashlib.sha1(deps_key).digest()
            return self._callback_cache[callback]

        def fingerprint(self, request: Request) -> bytes:
            if request in self._request_cache:
                return self._request_cache[request]
            fingerprint = self._base_request_fingerprinter.fingerprint(request)
            deps_fingerprint = self.fingerprint_deps(request)
            if deps_fingerprint is None:
                return fingerprint
            fingerprints = fingerprint + deps_fingerprint
            self._request_cache[request] = hashlib.sha1(fingerprints).digest()
            return self._request_cache[request]
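
To make the combination step concrete, here is a small standalone sketch (an
illustration under assumptions, not code from this PR): the dependency list is
a made-up example of what ``_get_deps()`` could return, and the base
fingerprint is a stand-in for the output of the base request fingerprinter.

    import hashlib
    import json

    # Stand-in for what the base request fingerprinter returns for a request.
    base_fingerprint = hashlib.sha1(b"GET https://example.com/product/1").digest()

    # Hypothetical serialized dependency list, JSON-encoded as in fingerprint_deps().
    deps = ["my_project.pages.ProductPage"]
    deps_fingerprint = hashlib.sha1(json.dumps(deps, sort_keys=True).encode()).digest()

    # With dependencies, the two digests are concatenated and hashed again;
    # without dependencies, the base fingerprint would be returned unchanged.
    final_fingerprint = hashlib.sha1(base_fingerprint + deps_fingerprint).digest()
    assert final_fingerprint != base_fingerprint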
34 changes: 33 additions & 1 deletion scrapy_poet/utils/testing.py
@@ -3,11 +3,12 @@
from typing import Dict
from unittest import mock

from scrapy import signals
from scrapy import Spider, signals
from scrapy.crawler import Crawler
from scrapy.exceptions import CloseSpider
from scrapy.settings import Settings
from scrapy.utils.python import to_bytes
from scrapy.utils.test import get_crawler as _get_crawler
from twisted.internet import reactor
from twisted.internet.defer import inlineCallbacks
from twisted.internet.task import deferLater
@@ -151,6 +152,10 @@ def crawl_single_item(
    return item, url, crawler


def get_download_handler(crawler, schema):
    return crawler.engine.downloader.handlers._get_handler(schema)


def make_crawler(spider_cls, settings):
    if not getattr(spider_cls, "name", None):

@@ -163,6 +168,33 @@ class Spider(spider_cls):
    return Crawler(spider_cls, settings)


def setup_crawler_engine(crawler: Crawler):
    """Run the crawl steps until engine setup, so that crawler.engine is not
    None.

    https://github.com/scrapy/scrapy/blob/8fbebfa943c3352f5ba49f46531a6ccdd0b52b60/scrapy/crawler.py#L116-L122
    """

    crawler.crawling = True
    crawler.spider = crawler._create_spider()
    crawler.engine = crawler._create_engine()

    handler = get_download_handler(crawler, "https")
    if hasattr(handler, "engine_started"):
        handler.engine_started()


class DummySpider(Spider):
    name = "dummy"


def get_crawler(settings=None, spider_cls=DummySpider, setup_engine=True):
    settings = settings or {}
    crawler = _get_crawler(settings_dict=settings, spidercls=spider_cls)
    if setup_engine:
        setup_crawler_engine(crawler)
    return crawler


class CollectorPipeline:
    def open_spider(self, spider):
        spider.collected_items = []
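
A possible usage sketch for these test helpers (an assumption about intended
use, not code from this PR): obtain a crawler whose engine is already created,
which is what ``ScrapyPoetRequestFingerprinter._injector`` needs in order to
find the ``InjectionMiddleware``.

    from scrapy_poet.utils.testing import get_crawler

    # setup_engine defaults to True, so setup_crawler_engine() runs and
    # crawler.engine (and its downloader middlewares) are available.
    crawler = get_crawler(
        settings={
            "DOWNLOADER_MIDDLEWARES": {"scrapy_poet.InjectionMiddleware": 543},
            "REQUEST_FINGERPRINTER_CLASS": "scrapy_poet.ScrapyPoetRequestFingerprinter",
        }
    )
    assert crawler.engine is not None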