Skip to content

Commit c1ba9c0

Browse files
VMRuiz, Gallaecio, wRAR
authored
Add LocationSessionConfig (#215)
Co-authored-by: Adrián Chaves <adrian@chaves.io> Co-authored-by: Andrey Rakhmatullin <wrar@wrar.name>
1 parent cf55072 commit c1ba9c0

File tree

4 files changed

+306
-0
lines changed

4 files changed

+306
-0
lines changed

docs/usage/session.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,15 @@ To define a different session config for a given URL pattern, install
338338

339339
.. autofunction:: scrapy_zyte_api.session_config
340340

341+
If you only need to override the :meth:`SessionConfig.check
342+
<scrapy_zyte_api.SessionConfig.check>` or :meth:`SessionConfig.params
343+
<scrapy_zyte_api.SessionConfig.params>` methods for scenarios involving a
344+
location, you may subclass :class:`~scrapy_zyte_api.LocationSessionConfig`
345+
instead:
346+
347+
.. autoclass:: scrapy_zyte_api.LocationSessionConfig
348+
:members: location_check, location_params
349+
341350
If in a session config implementation or in any other Scrapy component you need
342351
to tell whether a request is a :ref:`session initialization request
343352
<session-init>` or not, use :func:`~scrapy_zyte_api.is_session_init_request`:

scrapy_zyte_api/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
)
1818
from ._session import SESSION_DEFAULT_RETRY_POLICY as _SESSION_DEFAULT_RETRY_POLICY
1919
from ._session import (
20+
LocationSessionConfig,
2021
ScrapyZyteAPISessionDownloaderMiddleware,
2122
SessionConfig,
2223
is_session_init_request,

scrapy_zyte_api/_session.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,8 @@ def params(self, request: Request) -> Dict[str, Any]:
348348
The returned parameters do not need to include :http:`request:url`. If
349349
missing, it is picked from the request :ref:`triggering a session
350350
initialization request <pool-size>`.
351+
352+
.. seealso:: :class:`~scrapy_zyte_api.LocationSessionConfig`
351353
"""
352354
if location := self.location(request):
353355
return {
@@ -372,6 +374,8 @@ def check(self, response: Response, request: Request) -> bool:
372374
If you need to tell whether *request* is a :ref:`session initialization
373375
request <session-init>` or not, use
374376
:func:`~scrapy_zyte_api.is_session_init_request`.
377+
378+
.. seealso:: :class:`~scrapy_zyte_api.LocationSessionConfig`
375379
"""
376380
if self._checker:
377381
return self._checker.check(response, request)
@@ -966,3 +970,40 @@ async def process_exception(
966970
spider=spider,
967971
reason=reason,
968972
)
973+
974+
975+
class LocationSessionConfig(SessionConfig):
    """:class:`~scrapy_zyte_api.SessionConfig` subclass that reduces the
    boilerplate of location-specific session configs, i.e. session configs
    that should keep the default behavior unless a location is set.

    It offers counterparts to some :class:`~scrapy_zyte_api.SessionConfig`
    methods. Those counterparts are only called when a location is set, and
    they receive that location as a parameter.
    """

    def params(self, request: Request) -> Dict[str, Any]:
        # Route to the location-aware hook only when self.location() yields a
        # truthy value; otherwise keep the parent implementation.
        location = self.location(request)
        if location:
            return self.location_params(request, location)
        return super().params(request)

    def check(self, response: Response, request: Request) -> bool:
        # Same dispatch as params(): the location-aware hook is used only
        # when self.location() yields a truthy value.
        location = self.location(request)
        if location:
            return self.location_check(response, request, location)
        return super().check(response, request)

    def location_params(
        self, request: Request, location: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Like :meth:`SessionConfig.params
        <scrapy_zyte_api.SessionConfig.params>`, but it is only called when a
        location is set, and it receives that *location* as a parameter.

        Unless overridden, it returns the same as
        :meth:`SessionConfig.params <scrapy_zyte_api.SessionConfig.params>`.
        """
        return super().params(request)

    def location_check(
        self, response: Response, request: Request, location: Dict[str, Any]
    ) -> bool:
        """Like :meth:`SessionConfig.check
        <scrapy_zyte_api.SessionConfig.check>`, but it is only called when a
        location is set, and it receives that *location* as a parameter.

        Unless overridden, it returns the same as
        :meth:`SessionConfig.check <scrapy_zyte_api.SessionConfig.check>`.
        """
        return super().check(response, request)

tests/test_sessions.py

Lines changed: 255 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from scrapy_zyte_api import (
1818
SESSION_AGGRESSIVE_RETRY_POLICY,
1919
SESSION_DEFAULT_RETRY_POLICY,
20+
LocationSessionConfig,
2021
SessionConfig,
2122
is_session_init_request,
2223
session_config,
@@ -2080,6 +2081,260 @@ class CustomSessionConfig(SessionConfig):
20802081
pass
20812082

20822083

2084+
@ensureDeferred
async def test_location_session_config(mockserver):
    """If a location is configured, the location_* methods of a
    LocationSessionConfig subclass are called with that location, and their
    results are the ones used for session initialization and validation."""
    pytest.importorskip("web_poet")

    @session_config(
        [
            "postal-code-10001.example",
            "postal-code-10001-fail.example",
            "postal-code-10001-alternative.example",
        ]
    )
    class CustomSessionConfig(LocationSessionConfig):

        def location_params(
            self, request: Request, location: Dict[str, Any]
        ) -> Dict[str, Any]:
            assert location == {"postalCode": "10002"}
            return {
                "actions": [
                    {
                        "action": "setLocation",
                        "address": {"postalCode": "10001"},
                    }
                ]
            }

        def location_check(
            self, response: Response, request: Request, location: Dict[str, Any]
        ) -> bool:
            assert location == {"postalCode": "10002"}
            domain = urlparse_cached(request).netloc
            return "fail" not in domain

        def pool(self, request: Request) -> str:
            domain = urlparse_cached(request).netloc
            if domain == "postal-code-10001-alternative.example":
                return "postal-code-10001.example"
            return domain

    settings = {
        "RETRY_TIMES": 0,
        "ZYTE_API_URL": mockserver.urljoin("/"),
        "ZYTE_API_SESSION_ENABLED": True,
        # We set a location to force the location-specific methods of the
        # session config class to be called, but we set the wrong location so
        # that the test would not pass were it not for our custom
        # implementation which ignores the input location and instead sets the
        # right one.
        "ZYTE_API_SESSION_LOCATION": {"postalCode": "10002"},
        "ZYTE_API_SESSION_MAX_BAD_INITS": 1,
    }

    class TestSpider(Spider):
        name = "test"
        start_urls = [
            "https://postal-code-10001.example",
            "https://postal-code-10001-alternative.example",
            "https://postal-code-10001-fail.example",
        ]

        def start_requests(self):
            for url in self.start_urls:
                yield Request(
                    url,
                    meta={
                        "zyte_api_automap": {
                            "actions": [
                                {
                                    "action": "setLocation",
                                    "address": {"postalCode": "10001"},
                                }
                            ]
                        },
                    },
                )

        def parse(self, response):
            pass

    try:
        crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
        await crawler.crawl()

        session_stats = {
            k: v
            for k, v in crawler.stats.get_stats().items()
            if k.startswith("scrapy-zyte-api/sessions")
        }
        assert session_stats == {
            "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/check-passed": 2,
            "scrapy-zyte-api/sessions/pools/postal-code-10001.example/use/check-passed": 2,
            "scrapy-zyte-api/sessions/pools/postal-code-10001-fail.example/init/check-failed": 1,
        }
    finally:
        # Clean up the session config registry, otherwise we could affect
        # other tests. Done in a ``finally`` clause so that the registry is
        # reset even if the assertion above fails.
        session_config_registry.__init__()  # type: ignore[misc]

    # Check the cleanup: without the custom session config, the default
    # location handling is used with the (wrong) configured location, so
    # every session initialization is expected to fail.
    crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
    await crawler.crawl()

    session_stats = {
        k: v
        for k, v in crawler.stats.get_stats().items()
        if k.startswith("scrapy-zyte-api/sessions")
    }
    assert session_stats == {
        "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/failed": 1,
        "scrapy-zyte-api/sessions/pools/postal-code-10001-alternative.example/init/failed": 1,
        "scrapy-zyte-api/sessions/pools/postal-code-10001-fail.example/init/failed": 1,
    }
2195+
2196+
2197+
@ensureDeferred
async def test_location_session_config_no_methods(mockserver):
    """If no location_* methods are defined, LocationSessionConfig works the
    same as SessionConfig."""
    pytest.importorskip("web_poet")

    @session_config(
        [
            "postal-code-10001.example",
            "postal-code-10001-alternative.example",
        ]
    )
    class CustomSessionConfig(LocationSessionConfig):

        def pool(self, request: Request) -> str:
            domain = urlparse_cached(request).netloc
            if domain == "postal-code-10001-alternative.example":
                return "postal-code-10001.example"
            return domain

    settings = {
        "RETRY_TIMES": 0,
        "ZYTE_API_URL": mockserver.urljoin("/"),
        "ZYTE_API_SESSION_ENABLED": True,
        "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"},
        "ZYTE_API_SESSION_MAX_BAD_INITS": 1,
    }

    class TestSpider(Spider):
        name = "test"
        start_urls = [
            "https://postal-code-10001.example",
            "https://postal-code-10001-alternative.example",
        ]

        def start_requests(self):
            for url in self.start_urls:
                yield Request(
                    url,
                    meta={
                        "zyte_api_automap": {
                            "actions": [
                                {
                                    "action": "setLocation",
                                    "address": {"postalCode": "10001"},
                                }
                            ]
                        },
                    },
                )

        def parse(self, response):
            pass

    try:
        crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
        await crawler.crawl()

        session_stats = {
            k: v
            for k, v in crawler.stats.get_stats().items()
            if k.startswith("scrapy-zyte-api/sessions")
        }
        assert session_stats == {
            "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/check-passed": 2,
            "scrapy-zyte-api/sessions/pools/postal-code-10001.example/use/check-passed": 2,
        }
    finally:
        # Clean up the session config registry, otherwise we could affect
        # other tests. Done in a ``finally`` clause so that the registry is
        # reset even if the assertion above fails.
        session_config_registry.__init__()  # type: ignore[misc]
2268+
2269+
2270+
@ensureDeferred
async def test_location_session_config_no_location(mockserver):
    """If no location is configured, the methods are never called."""
    pytest.importorskip("web_poet")

    @session_config(["postal-code-10001.example", "a.example"])
    class CustomSessionConfig(LocationSessionConfig):

        # Both hooks raise explicitly instead of using ``assert False``,
        # which is removed when Python runs with ``-O`` and would turn these
        # guards into silent ``return None`` statements.
        def location_params(
            self, request: Request, location: Dict[str, Any]
        ) -> Dict[str, Any]:
            raise AssertionError("location_params() called without a location")

        def location_check(
            self, response: Response, request: Request, location: Dict[str, Any]
        ) -> bool:
            raise AssertionError("location_check() called without a location")

    settings = {
        "RETRY_TIMES": 0,
        "ZYTE_API_URL": mockserver.urljoin("/"),
        "ZYTE_API_SESSION_ENABLED": True,
        "ZYTE_API_SESSION_MAX_BAD_INITS": 1,
    }

    class TestSpider(Spider):
        name = "test"
        start_urls = ["https://postal-code-10001.example", "https://a.example"]

        def start_requests(self):
            for url in self.start_urls:
                yield Request(
                    url,
                    meta={
                        "zyte_api_automap": {
                            "actions": [
                                {
                                    "action": "setLocation",
                                    "address": {"postalCode": "10001"},
                                }
                            ]
                        },
                    },
                )

        def parse(self, response):
            pass

    try:
        crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
        await crawler.crawl()

        session_stats = {
            k: v
            for k, v in crawler.stats.get_stats().items()
            if k.startswith("scrapy-zyte-api/sessions")
        }
        assert session_stats == {
            "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/failed": 1,
            "scrapy-zyte-api/sessions/pools/a.example/init/check-passed": 1,
            "scrapy-zyte-api/sessions/pools/a.example/use/check-passed": 1,
        }
    finally:
        # Clean up the session config registry, otherwise we could affect
        # other tests. Done in a ``finally`` clause so that the registry is
        # reset even if the assertion above fails.
        session_config_registry.__init__()  # type: ignore[misc]
2336+
2337+
20832338
@ensureDeferred
20842339
async def test_session_refresh(mockserver):
20852340
"""If a response does not pass a session validity check, the session is

0 commit comments

Comments
 (0)