15
15
16
16
# Module-level logger, obtained by passing this module's __name__ to getLogger.
logger = getLogger (__name__ )
17
17
18
# Sentinel used as the "default" of parameters that have no default value
# (see "url" below).
_NoDefault = object()

# Map of all known root Zyte API request params and how they need to be
# handled. Sorted by appearance in
# https://docs.zyte.com/zyte-api/usage/reference.html.
#
# Each row of the table below reads:
#   (name, default, is_extract_type, requires_browser_rendering,
#    changes_fingerprint)
_REQUEST_PARAMS: Dict[str, Dict[str, Any]] = {
    name: {
        "default": default,
        "is_extract_type": is_extract_type,
        "requires_browser_rendering": requires_browser_rendering,
        "changes_fingerprint": changes_fingerprint,
    }
    for (
        name,
        default,
        is_extract_type,
        requires_browser_rendering,
        changes_fingerprint,
    ) in (
        ("url", _NoDefault, False, False, True),
        ("requestHeaders", {}, False, False, False),
        ("httpRequestMethod", "GET", False, False, True),
        ("httpRequestBody", "", False, False, True),
        ("httpRequestText", "", False, False, True),
        ("customHttpRequestHeaders", [], False, False, False),
        ("httpResponseBody", False, False, False, True),
        ("httpResponseHeaders", False, False, False, True),
        ("browserHtml", False, False, True, True),
        ("screenshot", False, False, True, True),
        # requires_browser_rendering: not on its own.
        ("screenshotOptions", {}, False, False, True),
        ("article", False, True, False, True),
        # is_extract_type: not on its own (applies to every *Options row
        # of an extract type below).
        ("articleOptions", {}, False, False, True),
        ("articleList", False, True, False, True),
        ("articleListOptions", {}, False, False, True),
        ("articleNavigation", False, True, False, True),
        ("articleNavigationOptions", {}, False, False, True),
        ("jobPosting", False, True, False, True),
        ("jobPostingOptions", {}, False, False, True),
        ("product", False, True, False, True),
        ("productOptions", {}, False, False, True),
        ("productList", False, True, False, True),
        ("productListOptions", {}, False, False, True),
        ("productNavigation", False, True, False, True),
        ("productNavigationOptions", {}, False, False, True),
        ("geolocation", None, False, False, True),
        # requires_browser_rendering: not on its own.
        ("javascript", None, False, False, True),
        # requires_browser_rendering: not on its own.
        ("actions", [], False, False, True),
        ("jobId", None, False, False, False),
        ("echoData", None, False, False, True),
        ("viewport", {}, False, False, True),
        # changes_fingerprint: treated like headers.
        ("sessionContext", [], False, False, False),
        # changes_fingerprint: treated like sessionContext.
        ("sessionContextParameters", {}, False, False, False),
        # changes_fingerprint: treated like viewport.
        ("device", "auto", False, False, True),
        # changes_fingerprint: treated like headers.
        ("cookieManagement", "auto", False, False, False),
        # changes_fingerprint: treated like headers.
        ("requestCookies", [], False, False, False),
        ("responseCookies", False, False, False, True),
        ("experimental", {}, False, False, False),
    )
}
253
+
254
# Keys of _REQUEST_PARAMS flagged with "requires_browser_rendering".
_BROWSER_KEYS = {
    key for key, value in _REQUEST_PARAMS.items() if value["requires_browser_rendering"]
}
# Keys of _REQUEST_PARAMS flagged with "is_extract_type".
_EXTRACT_KEYS = {
    key for key, value in _REQUEST_PARAMS.items() if value["is_extract_type"]
}
_BROWSER_OR_EXTRACT_KEYS = _BROWSER_KEYS | _EXTRACT_KEYS
# Default value of every parameter that declares one. _NoDefault is a sentinel
# object, so it is matched by identity rather than by equality.
_DEFAULT_API_PARAMS = {
    key: value["default"]
    for key, value in _REQUEST_PARAMS.items()
    if value["default"] is not _NoDefault
}
28
266
29
267
# Comma-separated list of the decoded entries of ACCEPTED_ENCODINGS.
_DEFAULT_ACCEPT_ENCODING = ", ".join(
    [accepted.decode() for accepted in ACCEPTED_ENCODINGS]
)
32
270
33
271
272
def _uses_browser(api_params: Dict[str, Any]) -> bool:
    """Return ``True`` if *api_params* are known to need browser rendering.

    That is the case when:

    - any key in ``_BROWSER_KEYS`` has a truthy value in *api_params* (a
      missing key falls back to its default from ``_REQUEST_PARAMS``), or
    - the ``<key>Options`` dict of any key in ``_EXTRACT_KEYS`` sets
      ``extractFrom`` to ``"browserHtml"``.

    ``False`` means "not known to use a browser", not "known not to": see the
    note before the final ``return``.
    """
    for key in _BROWSER_KEYS:
        if api_params.get(key, _REQUEST_PARAMS[key]["default"]):
            return True
    for key in _EXTRACT_KEYS:
        # Options live under "<key>Options" (e.g. "productOptions"). A missing
        # entry or an explicit None is treated as "no options" so that the
        # lookup below cannot fail on a None value.
        options = api_params.get(f"{key}Options") or {}
        if options.get("extractFrom") == "browserHtml":
            return True
    # Note: This could be a “maybe”, e.g. if no extractFrom is specified, an
    # extract key could be triggering browser rendering.
    return False
284
+
285
+
34
286
def _iter_headers (
35
287
* ,
36
288
api_params : Dict [str , Any ],
@@ -149,7 +401,7 @@ def _set_request_headers_from_request(
149
401
api_params .pop ("customHttpRequestHeaders" )
150
402
151
403
if (
152
- (not response_body or any (api_params .get (k ) for k in _BROWSER_KEYS ))
404
+ (not response_body or any (api_params .get (k ) for k in _BROWSER_OR_EXTRACT_KEYS ))
153
405
and request_headers is not False
154
406
or request_headers is True
155
407
):
@@ -167,7 +419,7 @@ def _set_http_response_body_from_request(
167
419
api_params : Dict [str , Any ],
168
420
request : Request ,
169
421
):
170
- if not any (api_params .get (k ) for k in _BROWSER_KEYS ):
422
+ if not any (api_params .get (k ) for k in _BROWSER_OR_EXTRACT_KEYS ):
171
423
api_params .setdefault ("httpResponseBody" , True )
172
424
elif api_params .get ("httpResponseBody" ) is False :
173
425
logger .warning (
@@ -319,14 +571,20 @@ def _set_http_request_body_from_request(
319
571
api_params ["httpRequestBody" ] = base64_body
320
572
321
573
574
+ _Undefined = object ()
575
+
576
+
322
577
def _unset_unneeded_api_params (
323
578
* ,
324
579
api_params : Dict [str , Any ],
325
580
default_params : Dict [str , Any ],
326
581
request : Request ,
327
582
):
328
583
for param , default_value in _DEFAULT_API_PARAMS .items ():
329
- if api_params .get (param ) != default_value :
584
+ value = api_params .get (param , _Undefined )
585
+ if value is _Undefined :
586
+ continue
587
+ if value != default_value :
330
588
continue
331
589
if param not in default_params or default_params .get (param ) == default_value :
332
590
logger .warning (
0 commit comments