-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.js
134 lines (120 loc) · 3.76 KB
/
scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
// Third-party dependencies: Puppeteer (browser automation), axios (HTTP
// client) and the AWS SDK (DynamoDB access).
const puppeteer = require("puppeteer");
const axios = require("axios");
const AWS = require("aws-sdk");
// Disabled alternative: load the AWS configuration from a local JSON file.
//AWS.config.loadFromPath("./config.json");
// Set the AWS region on the global SDK configuration before any client is
// created.
AWS.config.update({ region: "eu-central-1" });
// Shared DynamoDB DocumentClient used by the scraper for reading and writing
// link items.
const docClient = new AWS.DynamoDB.DocumentClient();
/**
 * Scrolls the given page downwards in fixed steps so that lazy-loaded content
 * gets a chance to render.
 *
 * The scrolling routine is handed to `page.evaluate`, so it must stay
 * self-contained: it may only touch its own locals and page globals
 * (`document`, `window`, timers).
 *
 * @param {object} page - Page-like object exposing `evaluate(fn)`.
 * @returns {Promise<void>} Settles once the scrolled distance has reached the
 *   document's current scroll height.
 */
async function autoScroll(page) {
  const scrollToBottom = () =>
    new Promise(finish => {
      const stepPx = 300;
      let scrolledPx = 0;
      const ticker = setInterval(() => {
        // Re-read the height on every tick: it grows as lazy content loads.
        const pageHeight = document.body.scrollHeight;
        window.scrollBy(0, stepPx);
        scrolledPx += stepPx;
        if (scrolledPx >= pageHeight) {
          clearInterval(ticker);
          finish();
        }
      }, 50);
    });

  await page.evaluate(scrollToBottom);
}
// Markup fragments used to detect a paywalled article in the fetched HTML.
// Deliberately has no `g` flag: `.test()` on a shared global regex is stateful.
const PAYWALL_PATTERN = /pyfe-overlay|paywall-component="paywall"|class="paywall-container"/;

// Offset (in seconds, one week) added to the current time for the `TTL`
// attribute of every stored link. Presumably the table's DynamoDB TTL is
// configured on that attribute - verify against the table definition.
const LINK_TTL_SECONDS = 7 * 24 * 60 * 60;

/**
 * Reads all items of the DynamoDB table, following `LastEvaluatedKey`
 * pagination until the scan is exhausted.
 *
 * Best effort: a failing scan is logged and the items read so far (possibly
 * none) are returned, so the scrape itself still runs.
 *
 * @param {string} tableName - DynamoDB table holding previously seen links.
 * @returns {Promise<object[]>} Stored link items; never rejects.
 */
const loadSeenLinks = async tableName => {
  const items = [];
  try {
    let startKey;
    do {
      const params = { TableName: tableName };
      if (startKey) {
        params.ExclusiveStartKey = startKey;
      }
      const result = await docClient.scan(params).promise();
      for (const item of result.Items || []) {
        items.push(item);
      }
      startKey = result.LastEvaluatedKey;
    } while (startKey);
  } catch (error) {
    console.log("Failed to load previously seen links: ", error);
  }
  return items;
};

/**
 * Opens the page in a headless browser, scrolls it to trigger lazy loading and
 * returns the hrefs of all matching article links.
 *
 * @param {string} pageToScrape - URL of the page to open.
 * @returns {Promise<string[]>} Matching hrefs (may contain duplicates).
 */
const collectPageLinks = async pageToScrape => {
  const browser = await puppeteer.launch({
    headless: true,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
    ignoreHTTPSErrors: true,
    dumpio: false
  });
  try {
    const page = await browser.newPage();
    await page.goto(pageToScrape);
    await autoScroll(page);
    // The callback runs inside the page, so it cannot reference anything from
    // this module: the regex has to stay inline.
    return await page.$$eval("a", anchors =>
      anchors
        .map(a => a.href)
        .filter(
          link =>
            // Skip non-string hrefs instead of throwing on them.
            typeof link === "string" &&
            // Keep links containing delfi.ee or postimees.ee, but drop them if
            // they contain adform, twitter, facebook, etc.
            /^(?=.*(delfi\.ee|postimees\.ee))(?!.*(adform\.net|twitter\.com|facebook\.com|linkedin\.com|mailto|chrome-extension)).+$/.test(
              link
            )
        )
    );
  } finally {
    // Close the browser on success AND on failure, and before any article is
    // fetched, to avoid zombie browsers crashing the server.
    await browser.close();
  }
};

/**
 * Fetches an article and checks its body for paywall markup.
 *
 * @param {string} url - Article URL.
 * @returns {Promise<boolean|null>} `true`/`false` when the page could be
 *   fetched, `null` when the request failed.
 */
const checkPaywalled = async url => {
  try {
    const response = await axios.get(url);
    return PAYWALL_PATTERN.test(response.data);
  } catch (error) {
    // A failed fetch is an expected outcome; `null` marks "unknown".
    return null;
  }
};

/**
 * Stores one link item in DynamoDB. Failures are logged, not thrown, so one
 * bad write does not abort the remaining ones.
 *
 * @param {string} tableName - Target DynamoDB table.
 * @param {object} link - Item to store.
 * @returns {Promise<void>}
 */
const saveLink = async (tableName, link) => {
  try {
    await docClient.put({ TableName: tableName, Item: link }).promise();
    console.log("Success");
  } catch (error) {
    console.log("Error", error);
  }
};

/**
 * Scrapes a news front page for article links, checks every link that is not
 * yet stored in DynamoDB for a paywall and stores the result.
 *
 * Each new link is written as `{ Url, Paywalled, TTL }`, where `Paywalled` is
 * `true`, `false`, or `null` when the article could not be fetched.
 *
 * @param {string} pageToScrape - URL of the page whose links are collected.
 * @param {string} tableName - DynamoDB table holding the seen links.
 * @returns {Promise<void>} Resolves once all lookups and writes have finished.
 *   Never rejects: errors are logged to the console.
 */
const scraper = async (pageToScrape, tableName) => {
  try {
    // The table scan and the browser session are independent, so run them
    // concurrently, but wait for both before comparing.
    const [seenLinks, mainHrefs] = await Promise.all([
      loadSeenLinks(tableName),
      collectPageLinks(pageToScrape)
    ]);

    const uniqueLinks = [...new Set(mainHrefs)];
    console.log("Unique links on mainpage: ", uniqueLinks.length);

    // New links are the unique hrefs on the page not yet stored in DynamoDB.
    const seenUrls = new Set(seenLinks.map(link => link.Url));
    const newLinks = uniqueLinks.filter(url => !seenUrls.has(url));
    console.log("Previously seen links: ", seenLinks.length);
    console.log("New links: ", newLinks.length);

    // Might need changing in future if delfi or postimees implements rate limiter
    // Can be replaced with puppeteer going through links one by one (slow)
    const results = await Promise.all(
      newLinks.map(async url => {
        return {
          Url: url,
          Paywalled: await checkPaywalled(url),
          TTL: Math.round(Date.now() / 1000) + LINK_TTL_SECONDS
        };
      })
    );

    await Promise.all(results.map(link => saveLink(tableName, link)));

    console.log(
      "Paywalled links: ",
      seenLinks.concat(results).filter(link => link.Paywalled === true).length
    );
  } catch (error) {
    console.log(error);
  }
};
module.exports = scraper;