-
Notifications
You must be signed in to change notification settings - Fork 36
/
config.toml
137 lines (119 loc) · 5.47 KB
/
config.toml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# This is an example of configuration file for crawlera-headless-proxy
# All options here are optional, basically to run the proxy all you
# need is to provide API key. It is doable with environment variable
# or command line parameter. But of course, it is settable with config.
# Should we run this tool in debug mode or not. Basically, this makes
# the tool more verbose on stderr.
debug = false
# This tool talks with Crawlera and Crawlera has its own TLS certificate
# authority. This option defines if we need to verify the TLS certificate
# given by Crawlera or not.
#
# This certificate is placed here:
# https://doc.scrapinghub.com/_downloads/crawlera-ca.crt
#
# SHA1 checksum of this certificate is 5798e59f6f7ecad3c0e1284f42b07dcaa63fbd37
dont_verify_crawlera_cert = false
# Do not use automatic session management.
# Basically, Crawlera works better with browsers only if you use sessions. If
# you want to implement your own session management, please keep this option
# as 'true'. Usually you want to have automatic session management.
#
# Here is how to use Crawlera and sessions:
# https://doc.scrapinghub.com/crawlera.html#x-crawlera-session
#
# ATTENTION!
# If you want to have a better experience in session management, we would
# recommend you to have 1 instance of headless proxy per browser but then
# please pay attention to 'concurrent_connections' option.
no_auto_sessions = false
# Set the limit of concurrent connections based on your billing plan.
# If your plan does not have any limitations, it makes sense to set this
# option to 0 (disable internal throttling).
#
# If this parameter is > 0, headless proxy is going to throttle your
# requests before they reach Crawlera. This really helps you avoid
# 429 errors if you exceed this number and brings a better overall experience.
concurrent_connections = 0
# Which port crawlera-headless-proxy should listen on. Usually people like to
# set it to 3128.
bind_port = 3128
# Which IP crawlera-headless-proxy should listen on. As usual, 0.0.0.0 helps
# to listen on every interface.
bind_ip = "127.0.0.1"
# Which IP should crawlera-headless-proxy proxy API listen on. Please
# remember that this is not HTTP Proxy interface you should set in your
# browser, this is internal thing for getting stats etc.
#
# Default (empty value) will set this IP to the same value as bind_ip.
proxy_api_ip = ""
# Which port crawlera-headless-proxy proxy API should listen on. Please
# remember that this is not the HTTP proxy interface port.
proxy_api_port = 3130
# Which port Crawlera listens on. In 99.999% of cases it is 8010 and you
# do not need to change that.
crawlera_port = 8010
# Which host is Crawlera placed on.
crawlera_host = "proxy.crawlera.com"
# What is API key for accessing Crawlera.
api_key = ""
# Path to your own TLS CA certificate if you do not want to use
# crawlera-headless-proxy's own certificate.
# tls_ca_certificate = "/path/to/your/own/ca/certificate"
# Path to your own TLS private key if you do not want to use
# crawlera-headless-proxy's own private key.
# tls_private_key = "/path/to/your/own/tls/private/key"
# The list of adblock-compatible filters.
# Usually you do not want to spend resources (and concurrent connections) on
# advertisement, different trackers and other spyware. If you want to filter
# these requests before they even go to Crawlera, please specify the lists
# here. They can be HTTP(S) links or file paths.
#
# Here is documentation on how to build your own filter list:
# https://adblockplus.org/en/filters
adblock_lists = [
    "https://fanboy.co.nz/r/fanboy-ultimate.txt",
    "https://fanboy.co.nz/fanboy-antifonts.txt",
    "https://fanboy.co.nz/fanboy-antifacebook.txt",
    "https://s3.amazonaws.com/lists.disconnect.me/simple_malware.txt",
]
# A list of regular expressions to match hostpath part of URL for direct
# access bypassing Crawlera.
#
# Example: https://www.example.com/images/cat.png?foo=111&bar=lala#segment
#
# HostPath is a full combination of host and path parameters of URL. In
# our example, it would be 'www.example.com/images/cat.png'. Regular
# expression searches for full match, so in example above 'cat' won't
# work, but '.*?cat.*?' - will.
#
# Regular expressions are case sensitive.
# direct_access_except_hostpath_regexps takes priority over this.
# direct_access_hostpath_regexps = [
# '.*?\.(?:txt|json|css|less|js|mjs|cjs|gif|ico|jpe?g|svg|png|webp|mkv|mp4|mpe?g|webm|eot|ttf|woff2?)$'
# ]
# List of regular expressions for URLs that should go through Crawlera
# irrespective of the direct_access_hostpath_regexps.
# If a match is found within these regexps, direct_access check is skipped
# for those and requests to those URLs will always go through crawlera.
#
# Example: direct_access_except_hostpath_regexps = ['.*example.*']
#
# This will make all requests containing 'example' go through crawlera.
# Even the requests like example.com/dummy.txt will be proxied through crawlera
# even though the direct_access_hostpath_regexps have '.txt' in them
# direct_access_except_hostpath_regexps = ['.*example.*']
# A list of Crawlera XHeaders to propagate to real Crawlera from this
# headless proxy.
#
# As you are going to use headless browser, we would recommend you
# to use at least X-Crawlera-Cookies:disable and X-Crawlera-Profile:desktop
# headers.
# You can specify such headers in full and short form. For example,
# `X-Crawlera-Cookies` and `cookies` define the same header.
#
# A full list can be found here:
# https://doc.scrapinghub.com/crawlera.html#request-headers
[xheaders]
# Sample entries in short form (`cookies` stands for `X-Crawlera-Cookies`).
# Uncomment a line to have that header propagated to Crawlera.
# cookies = "disable"
# profile = "desktop"