-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape-for-websites.js
350 lines (308 loc) · 12.8 KB
/
scrape-for-websites.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
/*
* This file scrapes, from a number of sources, the potential websites of
* corporations that received R&D awards from the US Federal Government
* and/or that applied for a patent, somehow connected to these awards,
* at the USPTO
*
* Author: Carlo Bottai
* Copyright (c) 2020 - TU/e and EPFL
* License: See the LICENSE file.
* Date: 2020-10-15
*/
'use strict';
/***************/
/* MODULES */
/***************/
const fs = require('fs');
const args = require('minimist')(
process.argv,
{ boolean: ['sbir', 'proxy', 'timestamp'] });
const puppeteer = require('puppeteer-extra');
const stealthPlugin = require('puppeteer-extra-plugin-stealth')();
puppeteer.use(stealthPlugin);
const blockResourcesPlugin = require('puppeteer-extra-plugin-block-resources')();
puppeteer.use(blockResourcesPlugin);
const UserAgents = require('user-agents');
const chalk = require('chalk');
const performance = require('perf_hooks').performance;
const os = require('os');
const got = require('got');
/**********************/
/* CUSTOM VARIABLES */
/* AND FUNCTIONS */
/**********************/
// Key under which the detected websites are stored in each output record
const resultsLabel = 'scraped_websites';
// Scraper settings are read once, at startup, from the JSON file scraper.conf
const scraper_config = JSON.parse(fs.readFileSync('scraper.conf', 'utf8'));
const useHeadless = scraper_config['USE_HEADLESS'];
// Browser executable: an explicit CHROME_PATH setting wins; otherwise fall
// back to a per-platform default location
const chromePath = (() => {
  const configuredPath = scraper_config['CHROME_PATH'];
  if (configuredPath) {
    return configuredPath;
  }
  if (os.platform() === 'win32') {
    return 'C:\\Program\ Files\ \(x86\)\\Google\\Chrome\\Application\\chrome.exe';
  }
  return fs.existsSync('/usr/bin/google-chrome-stable') ?
    '/usr/bin/google-chrome-stable' :
    '/usr/bin/chromium-browser';
})();
const useMobile = scraper_config['USE_MOBILE'];
// Desktop sessions borrow a random desktop profile (user agent + viewport size)
// from the user-agents package; mobile sessions use a fixed Android profile
const desktopProfile = useMobile ?
  null :
  UserAgents.random({ deviceCategory: 'desktop' }).data;
const setUserAgent = useMobile ?
  'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) ' +
  'AppleWebKit/537.36 (KHTML, like Gecko) ' +
  'Chrome/75.0.3765.0 Safari/537.36' :
  desktopProfile.userAgent;
// Only the width and height differ between the two modes
const setDefaultViewport = {
  width: useMobile ? 800 : desktopProfile.viewportWidth,
  height: useMobile ? 1280 : desktopProfile.viewportHeight,
  deviceScaleFactor: 1,
  isMobile: useMobile,
  hasTouch: useMobile,
  isLandscape: false,
};
// Command-line switches handed to the browser at launch, grouped by topic.
// The final array keeps the switches in their original order; proxy switches
// may be appended to it later.
const userAgentSwitches = [
  `--use-mobile-user-agent=${useMobile}`,
  `--user-agent=${setUserAgent}`,
];
const environmentSwitches = [
  '--ignore-certificate-errors',
  '--no-sandbox',
  '--disable-setuid-sandbox',
  '--disable-dev-shm-usage',
];
const renderingSwitches = [
  '--disable-accelerated-2d-canvas',
  '--disable-gpu',
];
const windowSwitches = [
  '--window-position=0,0',
  '--start-fullscreen',
  '--hide-scrollbars',
];
const browser_config = [
  ...userAgentSwitches,
  ...environmentSwitches,
  ...renderingSwitches,
  ...windowSwitches,
];
// Optional proxy settings, read from the JSON file proxy.conf when it exists
// (the relative path is resolved against the current working directory)
let proxy_config = null;
if (fs.existsSync('proxy.conf')) {
  proxy_config = JSON.parse(fs.readFileSync('proxy.conf', 'utf8'));
}
// Function built from the PROXY_STATUS setting; it is later called with the
// body of the proxy-rotation response and is expected to return a
// [ok, message] pair
let proxy_status;
if (args['proxy']) {
  // Fail with an explicit message instead of a TypeError on `null`
  if (!proxy_config) {
    throw new Error(
      'The --proxy option requires a proxy.conf file, but none was found');
  }
  // Keep the last two dot-separated labels of the proxy address, but always
  // drop at least the first label
  const addressLabels = proxy_config['PROXY_ADDRESS'].split('.');
  proxy_config['PROXY_DOMAIN'] = addressLabels
    .slice(Math.max(addressLabels.length - 2, 1))
    .join('.');
  browser_config.push(
    '--proxy-server=' +
    `${proxy_config['PROXY_ADDRESS']}:` +
    `${proxy_config['PROXY_PORT']}`);
  // NOTE(review): the double quotes become part of the switch value because
  // the browser is not started through a shell -- confirm that the bypass
  // rule still matches as intended
  browser_config.push(
    '--proxy-bypass-list=' +
    `"*.${proxy_config['PROXY_DOMAIN']}"`);
  // SECURITY: this compiles and later runs arbitrary code taken from
  // proxy.conf; only use a proxy.conf that comes from a trusted source
  proxy_status = new Function('res', proxy_config['PROXY_STATUS']);
}
// When --timestamp is passed, load log-timestamp and give it a formatter
// that returns the current local date and time; otherwise leave it unset
const timestampRequested = args['timestamp'] == true;
const log_timestamp = timestampRequested
  ? require('log-timestamp')(() => new Date().toLocaleString())
  : null;
// Project-local helpers used by the main script
const utils = require('./utils.js');
const {
  getDataAlreadyScraped,
  isAlreadyScraped,
  getOrganizations,
  cleanURL,
  randomSleep,
  printMsAsMinAndSec,
  sleepFor,
  scrapeWebsites,
} = utils;
/*****************/
/* MAIN SCRIPT */
/*****************/
/*
 * Main loop. Reads the input file (-i), one JSON record per line, and for each
 * record that is not yet in the output file (-o) looks up candidate websites
 * and appends the record, with the results stored under `resultsLabel`, to
 * the output file. After each pass the output file is backed up and rewritten
 * without the records that ended in 'ERROR'; passes are repeated until every
 * input record was already present in the output at the start of a pass.
 */
(async () => {
  const path_in = args['i'];
  const path_out = args['o'];

  // Arithmetic mean of a non-empty list of numbers
  const mean = (values) => values.reduce((a, b) => a + b, 0) / values.length;

  // Save a timestamped copy of the raw output file next to it, then rewrite
  // the output file keeping only the records that carry a non-ERROR result
  const consolidateResults = () => {
    // Nothing has been written yet (e.g., empty input): nothing to clean up
    if (!fs.existsSync(path_out)) {
      return;
    }
    const path = require('path');
    const rawResults = fs.readFileSync(path_out, 'utf8');
    // path.parse keeps multi-dot names intact ('a.b.jsonl' -> 'a.b' + '.jsonl')
    const { dir, name, ext } = path.parse(path_out);
    const now = new Date();
    const year = now.getFullYear().toString().slice(2, 4);
    const stamp =
      `${year}${now.getMonth() + 1}${now.getDate()}` +
      `${now.getHours()}${now.getMinutes()}${now.getSeconds()}`;
    fs.writeFileSync(path.join(dir, `${name}_${stamp}${ext}`), rawResults);
    const keptRecords = rawResults
      .split('\n')
      .filter(el => el)
      .map(el => JSON.parse(el))
      .filter(el => (el[resultsLabel] ? el[resultsLabel][0] !== 'ERROR' : false));
    const cleaned = keptRecords.map(el => JSON.stringify(el)).join('\n') + '\n';
    fs.writeFileSync(path_out, cleaned);
  };

  // True while at least one record still has to be (re)scraped
  let needsAnotherPass;
  do {
    // One flag per input line: was the record already in the output file
    // when this pass reached it?
    const allScraped = [];
    const f_in = fs.readFileSync(path_in, 'utf-8').split('\n').filter(el => el);
    const dataAlreadyScraped = fs.existsSync(path_out) ?
      getDataAlreadyScraped(path_out) :
      null;
    try {
      const browser = await puppeteer.launch({
        headless: useHeadless,
        executablePath: chromePath,
        defaultViewport: setDefaultViewport,
        args: browser_config,
      });
      try {
        if (useMobile) {
          const blockedInMobileMode = [
            'stylesheet', 'image', 'media', 'font', 'xhr', 'other'];
          for (const resourceType of blockedInMobileMode) {
            blockResourcesPlugin.blockedTypes.add(resourceType);
          }
        }
        const avgTimePerIteration = [];
        const avgTotTimePerIteration = [];
        let roundsSinceLastProxyRotation = 0;
        for (const [idx, line] of f_in.entries()) {
          const loopStart = performance.now();
          const data = JSON.parse(line);
          const alreadyScraped = await isAlreadyScraped(data, dataAlreadyScraped);
          allScraped.push(alreadyScraped);
          if (alreadyScraped) {
            continue;
          }
          const [organization, organizationWOLegalEntity] = await getOrganizations(data);
          let detectedResults;
          if (organization === 'ERROR') {
            detectedResults = ['ERROR'];
          } else if (!organization) {
            detectedResults = [null];
          } else {
            try {
              // Ask the proxy to rotate before the first lookup and then
              // once every 10 lookups
              if (args['proxy'] && roundsSinceLastProxyRotation % 10 === 0) {
                const proxy_rotate = await got(proxy_config['PROXY_ROTATE']);
                await sleepFor(10000);
                const [proxy_ok, proxy_msg] = proxy_status(proxy_rotate.body);
                if (proxy_ok) {
                  console.log(chalk.green('PROXY STATUS: OK'));
                  console.log(chalk.green(proxy_msg));
                } else {
                  // Report the status message, or the raw response when the
                  // status function did not provide one
                  console.error(chalk.red(proxy_msg || proxy_rotate.body));
                }
              }
              roundsSinceLastProxyRotation += 1;
              const scrapers = [];
              if (data.award_recipient && args['sbir'] === true) {
                scrapers.push({
                  who: organizationWOLegalEntity,
                  where: 'SBIR',
                  how: 'SBIR',
                  mobile: useMobile,
                });
              }
              scrapers.push(
                { who: organization, where: 'Bloomberg', how: 'Bing', mobile: useMobile },
                { who: organization, where: 'Anywhere', how: 'Google', mobile: useMobile });
              // With at least as many CPU cores as scrapers, start all the
              // lookups together so that a slow source (many SBIR/Bloomberg
              // pages to explore) does not delay the others; otherwise run
              // them one after the other
              let scrapedPerSource;
              if (os.cpus().length >= scrapers.length) {
                scrapedPerSource = await Promise.all(
                  scrapers.map(scraper => scrapeWebsites(browser, scraper)));
              } else {
                scrapedPerSource = [];
                for (const scraper of scrapers) {
                  scrapedPerSource.push(await scrapeWebsites(browser, scraper));
                }
              }
              // Flatten one level, normalize the URLs and drop duplicates
              detectedResults = [].concat(...scrapedPerSource);
              detectedResults = detectedResults.map(url => cleanURL(url));
              detectedResults = [...new Set(detectedResults)];
              if (detectedResults.includes('ERROR')) {
                // A single failed source invalidates the whole record
                detectedResults = ['ERROR'];
              } else if (detectedResults.length > 1) {
                // Remove null (if it's not the only element)
                detectedResults = detectedResults.filter(el => el);
              }
            } catch (e) {
              console.error(chalk.red(`Error: ${e}`));
              const msSleep = randomSleep(90, 150);
              await sleepFor(msSleep);
              detectedResults = ['ERROR'];
            }
          }
          data[resultsLabel] = detectedResults;
          fs.appendFileSync(path_out, `${JSON.stringify(data)}\n`, 'utf8');
          // Throttle only after an actual lookup was attempted
          if (organization && organization !== 'ERROR') {
            const iterationTime = performance.now() - loopStart;
            avgTimePerIteration.push(iterationTime);
            const currentAvgTimePerIteration = mean(avgTimePerIteration);
            // The goal is to scrape at the configured SCRAPING_RATE, +- 30.
            // The rate is multiplied by 1000 before being compared with
            // millisecond timings, so it is presumably expressed in seconds.
            // If an iteration takes less than a random wait centered around
            // this target rate, the script waits for longer.
            const scraping_rate = scraper_config['SCRAPING_RATE'];
            let randomWait = randomSleep(
              scraping_rate - 30, scraping_rate + 30, iterationTime);
            avgTotTimePerIteration.push(iterationTime + randomWait);
            const currentAvgTotTimePerIteration = mean(avgTotTimePerIteration);
            // If the average is above the upper limit, reduce the waiting
            // time by the current average minus the scraping rate (plus 5%),
            // but still wait 30 s after an iteration shorter than 30 s
            if (currentAvgTotTimePerIteration > (scraping_rate + 30) * 1000) {
              const scraping_rate_5pc = (scraping_rate * 1000) + (scraping_rate * 50);
              randomWait -= Math.max(0, currentAvgTotTimePerIteration - scraping_rate_5pc);
              randomWait = Math.max(iterationTime < 30 * 1000 ? 30 * 1000 : 0, randomWait);
            }
            console.log(
              `Loop iteration took ${printMsAsMinAndSec(iterationTime)} ` +
              `(avg: ${printMsAsMinAndSec(currentAvgTimePerIteration)})`);
            console.log(
              `Wait for other ${printMsAsMinAndSec(randomWait)} so to be undetectable ` +
              `(avg tot time: ${printMsAsMinAndSec(currentAvgTotTimePerIteration)})`);
            // No need to wait after the last record
            if (idx < f_in.length - 1) {
              await sleepFor(randomWait);
            }
          }
        }
      } finally {
        // Always release the browser, even when the pass is aborted by an error
        await browser.close();
      }
      consolidateResults();
      needsAnotherPass = !allScraped.every(el => el);
      if (needsAnotherPass) {
        console.log(chalk.red.italic(
          '\nThe scraping is going to restart in 10 sec.\n' +
          'Stop the Systemd daemon (or press CTRL+C) to stop it\n'));
        await sleepFor(10000);
      } else {
        console.log(chalk.green.italic(
          '\nThe scraping has been successfully completed'));
      }
    } catch (e) {
      console.error(chalk.red(`Error: ${e}`));
      // Retry the pass, but pause first so that a persistent failure does not
      // turn into a tight loop of browser launches
      needsAnotherPass = true;
      await sleepFor(10000);
    }
  } while (needsAnotherPass);
  process.exit();
})();