forked from zytedata/zyte-smartproxy-headless-proxy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
config.toml
124 lines (107 loc) · 4.78 KB
/
config.toml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# This is an example configuration file for crawlera-headless-proxy.
# All options here are optional; to run the proxy, all you need is to
# provide an API key. That can be done with an environment variable or a
# command line parameter, but of course it can also be set in this config.
# Whether to run this tool in debug mode or not. Basically, this makes
# the tool more verbose on stderr.
debug = false
# This tool talks with Crawlera and Crawlera has its own TLS certificate
# authority. This option defines whether we need to verify the TLS
# certificate given by Crawlera or not.
#
# This certificate is placed here:
# https://doc.scrapinghub.com/_downloads/crawlera-ca.crt
#
# SHA1 checksum of this certificate is 5798e59f6f7ecad3c0e1284f42b07dcaa63fbd37
dont_verify_crawlera_cert = false
# Do not use automatic session management.
# Basically, Crawlera works better with browsers only if you use sessions. If
# you want to implement your own session management, please set this option
# to true. Usually you want to have automatic session management.
#
# Here is how to use Crawlera and sessions:
# https://doc.scrapinghub.com/crawlera.html#x-crawlera-session
#
# ATTENTION!
# For a better experience with session management, we recommend running
# 1 instance of headless proxy per browser, but then please pay attention
# to the 'concurrent_connections' option.
no_auto_sessions = false
# Set the limit of concurrent connections based on your billing plan.
# If your plan does not have any limitations, it makes sense to set this
# option to 0 (disable internal throttling).
#
# If this parameter is > 0, headless proxy is going to throttle your
# requests before they reach Crawlera. This really helps you to avoid
# 429 errors caused by exceeding this number and brings a better overall
# experience.
concurrent_connections = 0
# Which port crawlera-headless-proxy should listen on. Usually people like to
# set it to 3128.
bind_port = 3128
# Which IP crawlera-headless-proxy should listen on. As usual, 0.0.0.0
# makes it listen on every interface.
bind_ip = "127.0.0.1"
# Which IP the crawlera-headless-proxy proxy API should listen on. Please
# remember that this is not the HTTP proxy interface you should set in your
# browser; this is an internal thing for getting stats etc.
#
# Default (empty value) will set this IP to the same value as bind_ip.
proxy_api_ip = ""
# Which port the crawlera-headless-proxy proxy API should listen on. Please
# remember that this is not the HTTP proxy interface port.
proxy_api_port = 3130
# Which port Crawlera listens on. In 99.999% of cases it is 8010 and you
# do not need to change that.
crawlera_port = 8010
# Which host Crawlera is placed on.
crawlera_host = "proxy.crawlera.com"
# The API key for accessing Crawlera.
api_key = ""
# Path to your own TLS CA certificate if you do not want to use
# crawlera-headless-proxy's own certificate.
# tls_ca_certificate = "/path/to/your/own/ca/certificate"
# Path to your own TLS private key if you do not want to use
# crawlera-headless-proxy's own private key.
# tls_private_key = "/path/to/your/own/tls/private/key"
# Adblock-compatible filter lists.
# Usually you do not want to spend resources (and concurrent connections)
# on advertisements, trackers and other spyware. To filter such requests
# out before they even go to Crawlera, specify the lists here. Each entry
# can be an HTTP(S) link or a file path.
#
# Documentation on how to build your own filter list:
# https://adblockplus.org/en/filters
adblock_lists = [
    "https://fanboy.co.nz/r/fanboy-ultimate.txt",
    "https://fanboy.co.nz/fanboy-antifonts.txt",
    "https://fanboy.co.nz/fanboy-antifacebook.txt",
    "https://s3.amazonaws.com/lists.disconnect.me/simple_malware.txt",
]
# A list of regular expressions to match hostpath part of URL for direct
# access bypassing Crawlera.
#
# Example: https://www.example.com/images/cat.png?foo=111&bar=lala#segment
#
# HostPath is the full combination of the host and path parts of the URL. In
# our example, it would be 'www.example.com/images/cat.png'. The regular
# expression must match the whole hostpath, so in the example above 'cat'
# won't work, but '.*?cat.*?' will.
#
# Regular expressions are case sensitive.
# direct_access_hostpath_regexps = [
# '.*?\.(?:txt|css|eot|gif|ico|jpe?g|js|less|mkv|mp4|mpe?g|png|ttf|webm|webp|woff2?)$'
# ]
# A list of Crawlera XHeaders to propagate to real Crawlera from this
# headless proxy.
#
# As you are going to use a headless browser, we would recommend you
# to use at least the X-Crawlera-Cookies:disable and
# X-Crawlera-Profile:desktop headers.
# You can specify such headers in full and short form. For example,
# `X-Crawlera-Cookies` and `cookies` define the same header.
#
# A full list can be found here:
# https://doc.scrapinghub.com/crawlera.html#request-headers
[xheaders]
# cookies = "disable"
# profile = "desktop"