forked from davidjohnbarton/crawler-google-places
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathINPUT_SCHEMA.json
461 lines (461 loc) · 18.7 KB
/
INPUT_SCHEMA.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
{
"title": "Google Maps Scraper",
"description": "Provide either a list of search keywords or search/place URLs. It is better to set location in the Geolocation settings below rather than add into the search terms. You will get more results and they will be accurate.",
"type": "object",
"schemaVersion": 1,
"properties": {
"startUrls": {
"title": "Start URLs",
"type": "array",
"description": "List of URLs to be crawled. They can be search URLs or place URLs. The only valid format for URLs contains either /maps/search or /maps/place",
"editor": "requestListSources"
},
"searchStringsArray": {
"title": "Search Terms",
"type": "array",
"description": "Array of strings to be searched. It is also possible to fill in Google Place IDs in the format place_id:ChIJp4JiUCNP0xQR1JaSjpW_Hms. Setting geolocation input fields is more accurate than including location in the search string.",
"editor": "stringList",
"prefill": [
"restaurant New York"
]
},
"maxCrawledPlaces": {
"title": "Max crawled places",
"type": "integer",
"description": "Maximum number of places you scrape per whole run. If you want to scrape all available, set this to `9999999`.",
"prefill": 10,
"minimum": 0
},
"maxCrawledPlacesPerSearch": {
"title": "Max crawled places per search",
"type": "integer",
"description": "Maximum number of places you scrape per search term. If you want to scrape all available, set this to `9999999`.",
"minimum": 0
},
"language": {
"title": "Language",
"description": "Force showing results in this language.",
"enum": [
"en",
"af",
"az",
"id",
"ms",
"bs",
"ca",
"cs",
"da",
"de",
"et",
"es",
"es-419",
"eu",
"fil",
"fr",
"gl",
"hr",
"zu",
"is",
"it",
"sw",
"lv",
"lt",
"hu",
"nl",
"no",
"uz",
"pl",
"pt-BR",
"pt-PT",
"ro",
"sq",
"sk",
"sl",
"fi",
"sv",
"vi",
"tr",
"el",
"bg",
"ky",
"kk",
"mk",
"mn",
"ru",
"sr",
"uk",
"ka",
"hy",
"iw",
"ur",
"ar",
"fa",
"am",
"ne",
"hi",
"mr",
"bn",
"pa",
"gu",
"ta",
"te",
"kn",
"ml",
"si",
"th",
"lo",
"my",
"km",
"ko",
"ja",
"zh-CN",
"zh-TW"
],
"enumTitles": [
"English",
"Afrikaans",
"azərbaycan",
"BahasaIndonesia",
"BahasaMelayu",
"bosanski",
"català",
"Čeština",
"Dansk",
"Deutsch (Deutschland)",
"eesti",
"Español (España)",
"Español (Latinoamérica)",
"euskara",
"Filipino",
"Français (France)",
"galego",
"Hrvatski",
"isiZulu",
"íslenska",
"Italiano",
"Kiswahili",
"latviešu",
"lietuvių",
"magyar",
"Nederlands",
"norsk",
"oʻzbekcha",
"polski",
"Português (Brasil)",
"Português (Portugal)",
"română",
"shqip",
"Slovenčina",
"slovenščina",
"Suomi",
"Svenska",
"TiếngViệt",
"Türkçe",
"Ελληνικά",
"български",
"кыргызча",
"қазақтілі",
"македонски",
"монгол",
"Русский",
"српски (ћирилица)",
"Українська",
"ქართული",
"հայերեն",
"עברית",
"اردو",
"العربية",
"فارسی",
"አማርኛ",
"नेपाली",
"हिन्दी",
"मराठी",
"বাংলা",
"ਪੰਜਾਬੀ",
"ગુજરાતી",
"தமிழ்",
"తెలుగు",
"ಕನ್ನಡ",
"മലയാളം",
"සිංහල",
"ไทย",
"ລາວ",
"ဗမာ",
"ខ្មែរ",
"한국어",
"日本語",
"简体中文",
"繁體中文"
],
"type": "string",
"editor": "select",
"default": "en",
"example": "en",
"prefill": "en"
},
"allPlacesNoSearchAction": {
"title": "Scrape all places (no search term)",
"description": "Will scrape all places seen on the map. This depends on a lot of the zoom so for more places, increase the zoom. Doesn't work together with URLs",
"enum": [
"",
"all_places_no_search_mouse",
"all_places_no_search_ocr"
],
"enumTitles": [
"Not applied (normal search or direct places)",
"Scrape by Moving mouse (slow)",
"Scrape by OCR tool (fast but you need 256 MB of extra memory to call another actor)"
],
"type": "string",
"editor": "select",
"default": "",
"example": "",
"prefill": ""
},
"exportPlaceUrls": {
"title": "Export place URLs only (skips place details)",
"type": "boolean",
"description": "Scraper will return only place URLs without crawling their details.",
"default": false,
"sectionCaption": "Output configuration",
"sectionDescription": "You can set what extra information you would like extracted. For maximum efficiency, the default setup doesn't include reviews or images. If you need these, just increase the maximum count for them."
},
"includeHistogram": {
"title": "Include popular times",
"type": "boolean",
"description": "If checked, the crawler scrapes popular times for all places. You can speed up crawling if you disable this.",
"default": false
},
"includeOpeningHours": {
"title": "Include opening hours",
"type": "boolean",
"description": "If checked, the crawler scrapes opening hours for all places. You can speed up crawling if you disable this.",
"default": false
},
"includePeopleAlsoSearch": {
"title": "Include people also search",
"type": "boolean",
"description": "If checked, the crawler scrapes \"people also search\" for all places. You can speed up crawling if you disable this.",
"default": false
},
"additionalInfo": {
"title": "Additional Place Info",
"type": "boolean",
"description": "Extract additional information about each place, e.g. Service Options, Highlights, Offerings, etc. You can speed up crawling if you disable this.",
"default": false
},
"oneReviewPerRow": {
"title": "One review per row",
"type": "boolean",
"description": "If checked, the scraper will produce one row of results per each review. The information about the place is copied to each row.",
"default": false
},
"maxImages": {
"title": "Number of images (Slow for more than 1)",
"type": "integer",
"description": "Max number of images per place to scrape. If you fill in 0 or nothing, no images will be scraped. For all images, just put 99999",
"prefill": 0,
"unit": "images per place"
},
"maxReviews": {
"title": "Number of reviews (slow)",
"type": "integer",
"description": "Max number of reviews per place to scrape. If you fill in 0 or nothing, no reviews will be scraped. For all reviews, just put 99999",
"prefill": 0,
"unit": "reviews per place"
},
"reviewsStartDate": {
"title": "Reviews start date (scrape only newer than)",
"type": "string",
"description": "Format should be YYYY-MM-DD, e.g. 2022-02-20",
"editor": "textfield"
},
"reviewsSort": {
"title": "Sort reviews by",
"description": "Define how reviews should be sorted.",
"type": "string",
"editor": "select",
"default": "newest",
"enum": [
"newest",
"mostRelevant",
"highestRanking",
"lowestRanking"
],
"enumTitles": [
"Newest",
"Most relevant",
"Highest ranking",
"Lowest ranking"
]
},
"reviewsTranslation": {
"title": "Reviews translation",
"type": "string",
"description": "Google automatically adds translated text to the original. You can adjust this behavior here.",
"editor": "select",
"default": "originalAndTranslated",
"enum": [
"originalAndTranslated",
"onlyOriginal",
"onlyTranslated"
],
"enumTitles": [
"Original & translated (Google's default)",
"Only original",
"Only translated"
]
},
"scrapeReviewerName": {
"title": "Reviewer name",
"type": "boolean",
"description": "Extract Reviewer name",
"default": true,
"sectionCaption": "Personal data (reviews only)",
"sectionDescription": "All of these fields contain personal data. Personal data is protected by the GDPR in the European Union and by other regulations around the world. You should not scrape personal data unless you have a legitimate reason to do so. If you're unsure whether your reason is legitimate, consult your lawyers."
},
"scrapeReviewerId": {
"title": "Reviewer ID",
"type": "boolean",
"description": "Extract Reviewer ID",
"default": true
},
"scrapeReviewerUrl": {
"title": "Reviewer URL",
"type": "boolean",
"description": "Extract Reviewer URL",
"default": true
},
"scrapeReviewId": {
"title": "Review ID",
"type": "boolean",
"description": "Extract Review ID",
"default": true
},
"scrapeReviewUrl": {
"title": "Review URL",
"type": "boolean",
"description": "Extract Review URL",
"default": true
},
"scrapeResponseFromOwnerText": {
"title": "Response from owner",
"type": "boolean",
"description": "Extract Response from owner",
"default": true
},
"country": {
"title": "Country",
"type": "string",
"description": "Set the country where the search should be performed, e.g. 'USA'.",
"editor": "textfield",
"sectionCaption": "Geolocation",
"sectionDescription": "Here you can set where places should be searched. Prefer using any combination of country, state, county, city or postal code to get maximum results. This scraper automatically chooses appropriate map zoom and splits the map into many smaller searches to catch as many places as possible. Currently, the scraper doesn't work well for full country searching of sparsely populated countries like USA or Russia. For these, prefer searching city by city or focus on populated states. Latitude and longitude or polygon don't have this functionality and will yield less results (but are faster)."
},
"state": {
"title": "State",
"type": "string",
"description": "Set a state where the search should be performed, e.g. 'Massachusetts', 'England', 'Berlin', etc.",
"editor": "textfield"
},
"county": {
"title": "County",
"type": "string",
"description": "Set the county where the search should be performed, e.g. 'washington'.",
"editor": "textfield"
},
"city": {
"title": "City",
"type": "string",
"description": "Set the city where the search should be performed, e.g. 'Pittsfield'.",
"editor": "textfield"
},
"postalCode": {
"title": "Postal code",
"type": "string",
"description": "Set the postal code where the search should be performed, e.g. 10001. Select a country as well to ensure the correct postal code is used.",
"editor": "textfield"
},
"lat": {
"title": "Viewport point latitude",
"type": "string",
"description": "Use in combination with longitude and zoom to set the viewport to search on. Do not combine with country/state/city/postal parameters (prefer those)",
"editor": "textfield"
},
"lng": {
"title": "Viewport point longitude",
"type": "string",
"description": "Use in combination with latitude and zoom to set the viewport to search on. Do not combine with country/state/city/postal parameters (prefer those)",
"editor": "textfield"
},
"zoom": {
"title": "Viewport zoom level",
"type": "integer",
"description": "You don't need to set this parameter. A good value is chosen automatically based on geolocation, see <a href='https://apify.com/drobnikj/crawler-google-places#automatic-zooming' target='_blank' rel='noopener'>readme</a> for more info.",
"minimum": 1,
"maximum": 21
},
"maxAutomaticZoomOut": {
"title": "Max automatic zoom out",
"type": "integer",
"description": "Parameter to stop searching once Google zooms out too far. It counts how far it zoomed out compared to the first page. Keep in mind that `zoom: 1` is the whole world and `zoom: 21` is a tiny street. So usually you want `maxAutomaticZoomOut` to be between `0` and `5`. Also, keep in mind that Google zooms a bit differently in each run.",
"minimum": 0,
"maximum": 10
},
"customGeolocation": {
"title": "Custom geolocation",
"type": "object",
"editor": "json",
"description": "Custom geolocation to define exact search area if other geolocation parameters don't work well. See see <a href='https://apify.com/drobnikj/crawler-google-places#custom-geolocation' target='_blank' rel='noopener'>readme</a> for details."
},
"proxyConfig": {
"title": "Proxy configuration",
"type": "object",
"description": "You need to use Apify proxy or custom proxies. Automatic proxy works very well for Google Maps.",
"default": {
"useApifyProxy": true
},
"prefill": {
"useApifyProxy": true
},
"editor": "proxy",
"sectionCaption": "Browser, page and proxy settings",
"sectionDescription": "How the browsers will behave during the scraping process."
},
"maxConcurrency": {
"title": "Max concurrency",
"type": "integer",
"description": "Maximum number of pages that will be processed in parallel. This is usually limited by the memory assigned to the run.",
"minimum": 0,
"maximum": 1000,
"default": 100
},
"maxPageRetries": {
"title": "Max page retries",
"type": "integer",
"description": "Maximum number of retries when page scraping fails. Set to 0 if you are getting repeated errors where retrying does not help (and report to support@apify.com or create new issue)",
"minimum": 0,
"maximum": 1000,
"default": 6
},
"pageLoadTimeoutSec": {
"title": "Page load timeout seconds",
"type": "integer",
"description": "Maximum time in seconds to wait for the page to load.",
"minimum": 0,
"maximum": 1000,
"default": 60
},
"maxPagesPerBrowser": {
"title": "Max opened pages per browser",
"type": "integer",
"description": "How many pages can be opened at once for each browser. Having more pages in one browser is faster but can lead to increased blocking.",
"minimum": 0,
"maximum": 100,
"default": 1
}
},
"required": [
"proxyConfig"
]
}