 from scrapy_zyte_api import (
     SESSION_AGGRESSIVE_RETRY_POLICY,
     SESSION_DEFAULT_RETRY_POLICY,
+    LocationSessionConfig,
     SessionConfig,
     is_session_init_request,
     session_config,
@@ -2080,6 +2081,260 @@ class CustomSessionConfig(SessionConfig):
         pass
 
 
+@ensureDeferred
+async def test_location_session_config(mockserver):
+    pytest.importorskip("web_poet")
+
+    @session_config(
+        [
+            "postal-code-10001.example",
+            "postal-code-10001-fail.example",
+            "postal-code-10001-alternative.example",
+        ]
+    )
+    class CustomSessionConfig(LocationSessionConfig):
+
+        def location_params(
+            self, request: Request, location: Dict[str, Any]
+        ) -> Dict[str, Any]:
+            assert location == {"postalCode": "10002"}
+            return {
+                "actions": [
+                    {
+                        "action": "setLocation",
+                        "address": {"postalCode": "10001"},
+                    }
+                ]
+            }
+
+        def location_check(
+            self, response: Response, request: Request, location: Dict[str, Any]
+        ) -> bool:
+            assert location == {"postalCode": "10002"}
+            domain = urlparse_cached(request).netloc
+            return "fail" not in domain
+
+        def pool(self, request: Request) -> str:
+            domain = urlparse_cached(request).netloc
+            if domain == "postal-code-10001-alternative.example":
+                return "postal-code-10001.example"
+            return domain
+
+    settings = {
+        "RETRY_TIMES": 0,
+        "ZYTE_API_URL": mockserver.urljoin("/"),
+        "ZYTE_API_SESSION_ENABLED": True,
+        # Setting a location forces the location-specific methods of the
+        # session config class to be called. We deliberately set the wrong
+        # location, so the test can only pass because our custom
+        # implementation ignores the input location and sets the right one
+        # instead.
+        "ZYTE_API_SESSION_LOCATION": {"postalCode": "10002"},
+        "ZYTE_API_SESSION_MAX_BAD_INITS": 1,
+    }
+
+    class TestSpider(Spider):
+        name = "test"
+        start_urls = [
+            "https://postal-code-10001.example",
+            "https://postal-code-10001-alternative.example",
+            "https://postal-code-10001-fail.example",
+        ]
+
+        def start_requests(self):
+            for url in self.start_urls:
+                yield Request(
+                    url,
+                    meta={
+                        "zyte_api_automap": {
+                            "actions": [
+                                {
+                                    "action": "setLocation",
+                                    "address": {"postalCode": "10001"},
+                                }
+                            ]
+                        },
+                    },
+                )
+
+        def parse(self, response):
+            pass
+
+    crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
+    await crawler.crawl()
+
+    session_stats = {
+        k: v
+        for k, v in crawler.stats.get_stats().items()
+        if k.startswith("scrapy-zyte-api/sessions")
+    }
+    assert session_stats == {
+        "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/check-passed": 2,
+        "scrapy-zyte-api/sessions/pools/postal-code-10001.example/use/check-passed": 2,
+        "scrapy-zyte-api/sessions/pools/postal-code-10001-fail.example/init/check-failed": 1,
+    }
+
+    # Clean up the session config registry to avoid affecting other tests,
+    # and crawl again to verify that the cleanup worked.
+
+    session_config_registry.__init__()  # type: ignore[misc]
+
+    crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
+    await crawler.crawl()
+
+    session_stats = {
+        k: v
+        for k, v in crawler.stats.get_stats().items()
+        if k.startswith("scrapy-zyte-api/sessions")
+    }
+    assert session_stats == {
+        "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/failed": 1,
+        "scrapy-zyte-api/sessions/pools/postal-code-10001-alternative.example/init/failed": 1,
+        "scrapy-zyte-api/sessions/pools/postal-code-10001-fail.example/init/failed": 1,
+    }
+
+
+@ensureDeferred
+async def test_location_session_config_no_methods(mockserver):
+    """If no location_* methods are defined, LocationSessionConfig works the
+    same as SessionConfig."""
+    pytest.importorskip("web_poet")
+
+    @session_config(
+        [
+            "postal-code-10001.example",
+            "postal-code-10001-alternative.example",
+        ]
+    )
+    class CustomSessionConfig(LocationSessionConfig):
+
+        def pool(self, request: Request) -> str:
+            domain = urlparse_cached(request).netloc
+            if domain == "postal-code-10001-alternative.example":
+                return "postal-code-10001.example"
+            return domain
+
+    settings = {
+        "RETRY_TIMES": 0,
+        "ZYTE_API_URL": mockserver.urljoin("/"),
+        "ZYTE_API_SESSION_ENABLED": True,
+        "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"},
+        "ZYTE_API_SESSION_MAX_BAD_INITS": 1,
+    }
+
+    class TestSpider(Spider):
+        name = "test"
+        start_urls = [
+            "https://postal-code-10001.example",
+            "https://postal-code-10001-alternative.example",
+        ]
+
+        def start_requests(self):
+            for url in self.start_urls:
+                yield Request(
+                    url,
+                    meta={
+                        "zyte_api_automap": {
+                            "actions": [
+                                {
+                                    "action": "setLocation",
+                                    "address": {"postalCode": "10001"},
+                                }
+                            ]
+                        },
+                    },
+                )
+
+        def parse(self, response):
+            pass
+
+    crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
+    await crawler.crawl()
+
+    session_stats = {
+        k: v
+        for k, v in crawler.stats.get_stats().items()
+        if k.startswith("scrapy-zyte-api/sessions")
+    }
+    assert session_stats == {
+        "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/check-passed": 2,
+        "scrapy-zyte-api/sessions/pools/postal-code-10001.example/use/check-passed": 2,
+    }
+
+    # Clean up the session config registry; otherwise we could affect other
+    # tests.
+
+    session_config_registry.__init__()  # type: ignore[misc]
+
+
+@ensureDeferred
+async def test_location_session_config_no_location(mockserver):
+    """If no location is configured, the location_* methods are never called."""
+    pytest.importorskip("web_poet")
+
+    @session_config(["postal-code-10001.example", "a.example"])
+    class CustomSessionConfig(LocationSessionConfig):
+
+        def location_params(
+            self, request: Request, location: Dict[str, Any]
+        ) -> Dict[str, Any]:
+            assert False
+
+        def location_check(
+            self, response: Response, request: Request, location: Dict[str, Any]
+        ) -> bool:
+            assert False
+
+    settings = {
+        "RETRY_TIMES": 0,
+        "ZYTE_API_URL": mockserver.urljoin("/"),
+        "ZYTE_API_SESSION_ENABLED": True,
+        "ZYTE_API_SESSION_MAX_BAD_INITS": 1,
+    }
+
+    class TestSpider(Spider):
+        name = "test"
+        start_urls = ["https://postal-code-10001.example", "https://a.example"]
+
+        def start_requests(self):
+            for url in self.start_urls:
+                yield Request(
+                    url,
+                    meta={
+                        "zyte_api_automap": {
+                            "actions": [
+                                {
+                                    "action": "setLocation",
+                                    "address": {"postalCode": "10001"},
+                                }
+                            ]
+                        },
+                    },
+                )
+
+        def parse(self, response):
+            pass
+
+    crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
+    await crawler.crawl()
+
+    session_stats = {
+        k: v
+        for k, v in crawler.stats.get_stats().items()
+        if k.startswith("scrapy-zyte-api/sessions")
+    }
+    assert session_stats == {
+        "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/failed": 1,
+        "scrapy-zyte-api/sessions/pools/a.example/init/check-passed": 1,
+        "scrapy-zyte-api/sessions/pools/a.example/use/check-passed": 1,
+    }
+
+    # Clean up the session config registry; otherwise we could affect other
+    # tests.
+
+    session_config_registry.__init__()  # type: ignore[misc]
+
+
 @ensureDeferred
 async def test_session_refresh(mockserver):
     """If a response does not pass a session validity check, the session is
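A minimal standalone sketch of the API that test_location_session_config exercises above: a LocationSessionConfig subclass registered with @session_config. The ShopSessionConfig name, the shop.example domain, and the body of location_check are hypothetical illustrations, not part of this commit:

    from typing import Any, Dict

    from scrapy import Request
    from scrapy.http import Response

    from scrapy_zyte_api import LocationSessionConfig, session_config


    @session_config(["shop.example"])  # hypothetical domain
    class ShopSessionConfig(LocationSessionConfig):

        def location_params(
            self, request: Request, location: Dict[str, Any]
        ) -> Dict[str, Any]:
            # Map the configured ZYTE_API_SESSION_LOCATION into the actions
            # used to initialize sessions for this pool.
            return {
                "actions": [
                    {"action": "setLocation", "address": location},
                ]
            }

        def location_check(
            self, response: Response, request: Request, location: Dict[str, Any]
        ) -> bool:
            # Placeholder check: assume a correctly located page echoes the
            # postal code somewhere in its body.
            postal_code = str(location.get("postalCode", ""))
            return postal_code.encode() in response.body

As the third test above shows, these hooks only run when a location is configured; and as the second test shows, a LocationSessionConfig that does not override them behaves like a plain SessionConfig.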