Skip to content

Commit 702cc63

Browse files
authored
Implement a parameter map (#151)
1 parent dc09ac3 commit 702cc63

File tree

5 files changed

+383
-72
lines changed

5 files changed

+383
-72
lines changed

docs/reference/fingerprint-params.rst

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,26 +11,45 @@ fingerprints for Zyte API requests based on the following Zyte API parameters:
1111

1212
For URLs that include a URL fragment, like ``https://example.com#foo``, URL
1313
canonicalization keeps the URL fragment if :http:`request:browserHtml` or
14-
:http:`request:screenshot` are enabled.
14+
:http:`request:screenshot` are enabled, or if extractFrom_ is set to
15+
``browserHtml``.
16+
17+
.. _extractFrom: https://docs.zyte.com/zyte-api/usage/extract.html#extraction-source
1518

1619
- Request attribute parameters (:http:`request:httpRequestBody`,
17-
:http:`request:httpRequestMethod`)
20+
:http:`request:httpRequestText`, :http:`request:httpRequestMethod`), except
21+
headers
22+
23+
Equivalent :http:`request:httpRequestBody` and
24+
:http:`request:httpRequestText` values generate the same signature.
1825

1926
- Output parameters (:http:`request:browserHtml`,
2027
:http:`request:httpResponseBody`, :http:`request:httpResponseHeaders`,
21-
:http:`request:screenshot`)
28+
:http:`request:responseCookies`, :http:`request:screenshot`, and
29+
:ref:`automatic extraction outputs <zyte-api-extract-fields>` like
30+
:http:`request:product`)
2231

2332
- Rendering option parameters (:http:`request:actions`,
24-
:http:`request:javascript`, :http:`request:screenshotOptions`)
33+
:http:`request:device`, :http:`request:javascript`,
34+
:http:`request:screenshotOptions`, :http:`request:viewport`, and automatic
35+
extraction options like :http:`request:productOptions`)
2536

2637
- :http:`request:geolocation`
2738

39+
- :http:`request:echoData`
40+
2841
The following Zyte API parameters are *not* taken into account for request
2942
fingerprinting:
3043

3144
- Request header parameters (:http:`request:customHttpRequestHeaders`,
3245
:http:`request:requestHeaders`)
3346

34-
- Metadata parameters (:http:`request:echoData`, :http:`request:jobId`)
47+
- Request cookie parameters (:http:`request:cookieManagement`,
48+
:http:`request:requestCookies`)
49+
50+
- Session handling parameters (:http:`request:sessionContext`,
51+
:http:`request:sessionContextParameters`)
52+
53+
- :http:`request:jobId`
3554

36-
- Experimental parameters (:http:`request:experimental`)
55+
- Experimental parameters (:http:`experimental.* <request:experimental>`)

scrapy_zyte_api/_params.py

Lines changed: 269 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,22 +15,274 @@
1515

1616
logger = getLogger(__name__)
1717

18+
_NoDefault = object()
19+
20+
def _param_spec(
    default: Any,
    *,
    extract_type: bool = False,
    browser: bool = False,
    fingerprint: bool = True,
) -> Dict[str, Any]:
    """Build the handling record of a single root Zyte API request param.

    :param default: Default value of the parameter, or ``_NoDefault`` when
        the parameter has none.
    :param extract_type: Stored as ``is_extract_type``.
    :param browser: Stored as ``requires_browser_rendering``.
    :param fingerprint: Stored as ``changes_fingerprint``.
    :return: A new dict holding the four handling keys.
    """
    return {
        "default": default,
        "is_extract_type": extract_type,
        "requires_browser_rendering": browser,
        "changes_fingerprint": fingerprint,
    }


# Map of all known root Zyte API request params and how they need to be
# handled. Sorted by appearance in
# https://docs.zyte.com/zyte-api/usage/reference.html.
#
# Unless stated otherwise, a parameter is not an extract type, does not
# require browser rendering, and changes the request fingerprint.
_REQUEST_PARAMS: Dict[str, Dict[str, Any]] = {
    "url": _param_spec(_NoDefault),
    "requestHeaders": _param_spec({}, fingerprint=False),
    "httpRequestMethod": _param_spec("GET"),
    "httpRequestBody": _param_spec(""),
    "httpRequestText": _param_spec(""),
    "customHttpRequestHeaders": _param_spec([], fingerprint=False),
    "httpResponseBody": _param_spec(False),
    "httpResponseHeaders": _param_spec(False),
    "browserHtml": _param_spec(False, browser=True),
    "screenshot": _param_spec(False, browser=True),
    # Does not require browser rendering on its own.
    "screenshotOptions": _param_spec({}),
    # The *Options params below are not extract types on their own.
    "article": _param_spec(False, extract_type=True),
    "articleOptions": _param_spec({}),
    "articleList": _param_spec(False, extract_type=True),
    "articleListOptions": _param_spec({}),
    "articleNavigation": _param_spec(False, extract_type=True),
    "articleNavigationOptions": _param_spec({}),
    "jobPosting": _param_spec(False, extract_type=True),
    "jobPostingOptions": _param_spec({}),
    "product": _param_spec(False, extract_type=True),
    "productOptions": _param_spec({}),
    "productList": _param_spec(False, extract_type=True),
    "productListOptions": _param_spec({}),
    "productNavigation": _param_spec(False, extract_type=True),
    "productNavigationOptions": _param_spec({}),
    "geolocation": _param_spec(None),
    # Does not require browser rendering on its own.
    "javascript": _param_spec(None),
    # Does not require browser rendering on its own.
    "actions": _param_spec([]),
    "jobId": _param_spec(None, fingerprint=False),
    "echoData": _param_spec(None),
    "viewport": _param_spec({}),
    # Treated like headers.
    "sessionContext": _param_spec([], fingerprint=False),
    # Treated like sessionContext.
    "sessionContextParameters": _param_spec({}, fingerprint=False),
    # Treated like viewport.
    "device": _param_spec("auto"),
    # Treated like headers.
    "cookieManagement": _param_spec("auto", fingerprint=False),
    # Treated like headers.
    "requestCookies": _param_spec([], fingerprint=False),
    "responseCookies": _param_spec(False),
    "experimental": _param_spec({}, fingerprint=False),
}
253+
254+
# Names of the request params flagged as requiring browser rendering.
_BROWSER_KEYS = {
    name
    for name, spec in _REQUEST_PARAMS.items()
    if spec["requires_browser_rendering"]
}
18257
_EXTRACT_KEYS = {
19-
"article",
20-
"articleList",
21-
"articleNavigation",
22-
"product",
23-
"productList",
24-
"productNavigation",
258+
key for key, value in _REQUEST_PARAMS.items() if value["is_extract_type"]
259+
}
260+
_BROWSER_OR_EXTRACT_KEYS = _BROWSER_KEYS | _EXTRACT_KEYS
261+
# Default value of every request param that has one; params whose default is
# the _NoDefault sentinel are left out.
_DEFAULT_API_PARAMS = {
    key: value["default"]
    for key, value in _REQUEST_PARAMS.items()
    # Sentinels are compared by identity, so the result never depends on how
    # a default value implements equality.
    if value["default"] is not _NoDefault
}
26-
_BROWSER_KEYS = _EXTRACT_KEYS | {"browserHtml", "screenshot"}
27-
_DEFAULT_API_PARAMS = {key: False for key in _BROWSER_KEYS}
28266

29267
_DEFAULT_ACCEPT_ENCODING = ", ".join(
30268
encoding.decode() for encoding in ACCEPTED_ENCODINGS
31269
)
32270

33271

272+
def _uses_browser(api_params: Dict[str, Any]) -> bool:
    """Return whether *api_params* is known to trigger browser rendering.

    That is the case when any key of ``_BROWSER_KEYS`` has a truthy value
    (falling back to its default from ``_REQUEST_PARAMS`` when missing), or
    when the ``<key>Options`` dict of any key of ``_EXTRACT_KEYS`` sets
    ``extractFrom`` to ``"browserHtml"``.

    :param api_params: Zyte API request parameters.
    :return: ``True`` if browser rendering is certain, ``False`` otherwise.
    """
    browser_output_requested = any(
        api_params.get(name, _REQUEST_PARAMS[name]["default"])
        for name in _BROWSER_KEYS
    )
    if browser_output_requested:
        return True
    # Note: A False result could be a “maybe”, e.g. if no extractFrom is
    # specified, an extract key could be triggering browser rendering.
    return any(
        api_params.get(f"{name}Options", {}).get("extractFrom", None)
        == "browserHtml"
        for name in _EXTRACT_KEYS
    )
284+
285+
34286
def _iter_headers(
35287
*,
36288
api_params: Dict[str, Any],
@@ -149,7 +401,7 @@ def _set_request_headers_from_request(
149401
api_params.pop("customHttpRequestHeaders")
150402

151403
if (
152-
(not response_body or any(api_params.get(k) for k in _BROWSER_KEYS))
404+
(not response_body or any(api_params.get(k) for k in _BROWSER_OR_EXTRACT_KEYS))
153405
and request_headers is not False
154406
or request_headers is True
155407
):
@@ -167,7 +419,7 @@ def _set_http_response_body_from_request(
167419
api_params: Dict[str, Any],
168420
request: Request,
169421
):
170-
if not any(api_params.get(k) for k in _BROWSER_KEYS):
422+
if not any(api_params.get(k) for k in _BROWSER_OR_EXTRACT_KEYS):
171423
api_params.setdefault("httpResponseBody", True)
172424
elif api_params.get("httpResponseBody") is False:
173425
logger.warning(
@@ -319,14 +571,20 @@ def _set_http_request_body_from_request(
319571
api_params["httpRequestBody"] = base64_body
320572

321573

574+
_Undefined = object()
575+
576+
322577
def _unset_unneeded_api_params(
323578
*,
324579
api_params: Dict[str, Any],
325580
default_params: Dict[str, Any],
326581
request: Request,
327582
):
328583
for param, default_value in _DEFAULT_API_PARAMS.items():
329-
if api_params.get(param) != default_value:
584+
value = api_params.get(param, _Undefined)
585+
if value is _Undefined:
586+
continue
587+
if value != default_value:
330588
continue
331589
if param not in default_params or default_params.get(param) == default_value:
332590
logger.warning(

0 commit comments

Comments
 (0)