Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SessionConfig.process_request #246

Merged
merged 1 commit into from
Feb 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion docs/usage/session.rst
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,8 @@ Overriding session configs
For spiders that target a single website, using settings and request metadata
keys for :ref:`session initialization <session-init>` and :ref:`session
checking <session-check>` should do the job. However, for broad-crawl spiders,
:doc:`multi-website spiders <zyte-spider-templates:index>`, or for code
:doc:`multi-website spiders <zyte-spider-templates:index>`, for modifying
session-using requests based on session initialization responses, or for code
reusability purposes, you might want to define different session configs for
different websites.

Expand Down Expand Up @@ -353,6 +354,10 @@ to tell whether a request is a :ref:`session initialization request

.. autofunction:: scrapy_zyte_api.is_session_init_request

To get the session ID of a given request, use:

.. autofunction:: scrapy_zyte_api.get_request_session_id

Classes decorated with :func:`~scrapy_zyte_api.session_config` are registered
into :data:`~scrapy_zyte_api.session_config_registry`:

Expand Down
1 change: 1 addition & 0 deletions scrapy_zyte_api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
LocationSessionConfig,
ScrapyZyteAPISessionDownloaderMiddleware,
SessionConfig,
get_request_session_id,
is_session_init_request,
session_config,
)
Expand Down
138 changes: 113 additions & 25 deletions scrapy_zyte_api/_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,25 @@
ZYTE_API_META_KEYS = ("zyte_api", "zyte_api_automap", "zyte_api_provider")


def get_request_session_id(request: Request) -> Optional[str]:
    """Return the session ID of *request*, or ``None`` if it does not have a
    session ID assigned."""
    # A session ID may live under any of the supported Zyte API meta keys;
    # the first non-empty ID found wins.
    for meta_key in ZYTE_API_META_KEYS:
        if meta_key in request.meta:
            found = request.meta[meta_key].get("session", {}).get("id", None)
            if found:
                return found
    # Reaching this point means no meta key carried a session ID, which is
    # unexpected for a request that went through session assignment.
    logger.warning(
        f"Request {request} had no session ID assigned, unexpectedly. "
        f"If you are sure this issue is not caused by your own code, "
        f"please report this at "
        f"https://github.com/scrapy-plugins/scrapy-zyte-api/issues/new "
        f"providing a minimal, reproducible example."
    )
    return None


def is_session_init_request(request):
"""Return ``True`` if the request is a :ref:`session initialization request
<session-init>` or ``False`` otherwise."""
Expand Down Expand Up @@ -209,6 +228,61 @@ def enabled(self, request: Request) -> bool:
"""
return request.meta.get("zyte_api_session_enabled", self._enabled)

def process_request(self, request: Request) -> Optional[Request]:
    """Process *request* after it has been assigned a session.

    Return ``None`` to send the request as is, or return a new request
    object to replace the original request.

    The default implementation does not modify the request.

    You can combine this method and :meth:`check` to modify requests based
    on session initialization responses. For example:

    #.  In :meth:`__init__`, create a dictionary to store session data:

        .. code-block:: python

            def __init__(self, crawler):
                super().__init__(crawler)
                self.session_data = {}

    #.  In :meth:`check`, store data from the session initialization
        response in ``session_data``:

        .. code-block:: python

            def check(self, response: Response, request: Request) -> bool:
                if scrapy_zyte_api.is_session_init_request(request):
                    session_id = scrapy_zyte_api.get_request_session_id(request)
                    self.session_data[session_id] = {
                        "csrf_token": response.css(".csrf-token::text").get(),
                    }
                return super().check(response, request)

    #.  In :meth:`process_request`, read the session data and act
        accordingly, either modifying the request in place where possible,
        e.g.:

        .. code-block:: python

            def process_request(self, request: Request) -> Optional[Request]:
                session_id = scrapy_zyte_api.get_request_session_id(request)
                csrf_token = self.session_data[session_id]["csrf_token"]
                request.headers["CSRF-Token"] = csrf_token
                return None

        Or returning an entirely new request, e.g.:

        .. code-block:: python

            def process_request(self, request: Request) -> Optional[Request]:
                session_id = scrapy_zyte_api.get_request_session_id(request)
                csrf_token = self.session_data[session_id]["csrf_token"]
                new_url = w3lib.url.add_or_replace_parameter(
                    request.url, "csrf_token", csrf_token
                )
                return request.replace(url=new_url)
    """
    return None

def pool(self, request: Request) -> str:
"""Return the ID of the session pool to use for *request*.

Expand Down Expand Up @@ -766,22 +840,6 @@ def is_init_request(self, request: Request) -> bool:
"""
return request.meta.get(SESSION_INIT_META_KEY, False)

def _get_request_session_id(self, request: Request) -> Optional[str]:
    """Return the session ID of *request*, or ``None`` (after logging a
    warning) if it has no session ID assigned.

    Kept for backward compatibility; the lookup logic — including the
    warning emitted when no ID is found — now lives in the module-level
    :func:`get_request_session_id`, so it is defined in a single place.
    """
    return get_request_session_id(request)

def _start_session_refresh(self, session_id: str, request: Request, pool: str):
try:
self._pools[pool].remove(session_id)
Expand All @@ -799,11 +857,20 @@ def _start_session_refresh(self, session_id: str, request: Request, pool: str):
pass

def _start_request_session_refresh(self, request: Request, pool: str):
    """Trigger a refresh of the session assigned to *request* in *pool*.

    Does nothing when the request carries no session ID.
    """
    sid = get_request_session_id(request)
    if sid is not None:
        self._start_session_refresh(sid, request, pool)

@staticmethod
def allow_new_session_assignments(request):
# Since a response has been received or an exception raised, allow new
# session assignments for this request, e.g. if a new request based on
# this one (e.g. requests.replace()) is returned by the
# process_response or process_exception methods of a later downloader
# middleware.
request.meta.pop("_zyte_api_session_assigned", None)

async def check(self, response: Response, request: Request) -> bool:
"""Check the response for signs of session expiration, update the
internal session pool accordingly, and return ``False`` if the session
Expand Down Expand Up @@ -838,16 +905,23 @@ async def check(self, response: Response, request: Request) -> bool:
self._start_request_session_refresh(request, pool)
return False

async def assign(self, request: Request):
"""Assign a working session to *request*."""
async def assign(self, request: Request) -> Optional[Request]:
"""Assign a working session to *request*.

If the session config creates a new request instead of modifying the
request in place, return that new request, to replace the received
request.
"""
assert self._crawler.stats
with self._fatal_error_handler:
if self.is_init_request(request):
return
if self.is_init_request(request) or request.meta.get(
"_zyte_api_session_assigned", False
):
return None
session_config = self._get_session_config(request)
if not session_config.enabled(request):
self._crawler.stats.inc_value("scrapy-zyte-api/sessions/use/disabled")
return
return None
session_id = await self._next(request)
# Note: If there is a session set already (e.g. a request being
# retried), it is overridden.
Expand All @@ -870,6 +944,13 @@ async def assign(self, request: Request):
request.meta[meta_key] = {}
request.meta[meta_key]["session"] = {"id": session_id}
request.meta.setdefault("dont_merge_cookies", True)
# Mark this request as having a session assigned already, so that
# if a later downloader middleware process_request call returns a
# new request object (with a shallow copy of its meta), a new call
# to the process_request method of the session management
# middleware does not assign a new session again.
request.meta.setdefault("_zyte_api_session_assigned", True)
return session_config.process_request(request)

def is_enabled(self, request: Request) -> bool:
session_config = self._get_session_config(request)
Expand All @@ -882,7 +963,7 @@ def handle_error(self, request: Request):
self._crawler.stats.inc_value(
f"scrapy-zyte-api/sessions/pools/{pool}/use/failed"
)
session_id = self._get_request_session_id(request)
session_id = get_request_session_id(request)
if session_id is not None:
self._errors[session_id] += 1
if self._errors[session_id] < self._max_errors:
Expand All @@ -909,14 +990,19 @@ def __init__(self, crawler: Crawler):
self._crawler = crawler
self._sessions = _SessionManager(crawler)

async def process_request(self, request: Request, spider: Spider) -> None:
await self._sessions.assign(request)
async def process_request(
    self, request: Request, spider: Spider
) -> Optional[Request]:
    """Assign a working session to *request*.

    Returns the replacement request produced by the session config, if
    any, or ``None`` to let *request* proceed unchanged.
    """
    replacement = await self._sessions.assign(request)
    return replacement

async def process_response(
self, request: Request, response: Response, spider: Spider
) -> Union[Request, Response, None]:
if isinstance(response, DummyResponse):
return response

self._sessions.allow_new_session_assignments(request)

passed = await self._sessions.check(response, request)
if not passed:
new_request_or_none = get_retry_request(
Expand All @@ -939,6 +1025,8 @@ async def process_exception(
):
return None

self._sessions.allow_new_session_assignments(request)

if exception.parsed.type == "/problem/session-expired":
self._sessions.handle_expiration(request)
reason = "session_expired"
Expand Down
Loading