-
Notifications
You must be signed in to change notification settings - Fork 0
/
reddit_scraper.js
72 lines (53 loc) · 2.19 KB
/
reddit_scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
const puppeteer = require('puppeteer');
/**
 * Build the old.reddit.com listing URL for a subreddit.
 *
 * @param {string} reddit - Subreddit name, without the leading "r/".
 * @returns {string} URL of the form `https://old.reddit.com/r/<reddit>/`.
 */
const SUBREDDIT_URL = function (reddit) {
  return `https://old.reddit.com/r/${reddit}/`;
};
/**
 * Puppeteer-backed scraper for a subreddit listing on old.reddit.com.
 *
 * Holds the browser and page handles as shared state: call `initialize`
 * first, then `getResults` to read the posts from the loaded page.
 */
const self = {
  /** Puppeteer browser handle; null until `initialize` has run. */
  browser: null,
  /** Puppeteer page handle for the subreddit; null until `initialize` has run. */
  page: null,

  /**
   * Launch a headless browser and navigate a new page to the subreddit.
   *
   * @param {string} reddit - Subreddit name, passed to SUBREDDIT_URL.
   * @returns {Promise<void>} Resolves once navigation has completed.
   */
  initialize: async (reddit) => {
    self.browser = await puppeteer.launch({ headless: true });
    self.page = await self.browser.newPage();
    // Go to the subreddit with Puppeteer, waiting for its 'networkidle0'
    // condition before treating the navigation as finished.
    await self.page.goto(SUBREDDIT_URL(reddit), { waitUntil: 'networkidle0' });
  },

  /**
   * Collect the posts listed on the currently loaded subreddit page.
   *
   * Only the fields that end up in the result are read from the page; each
   * `$eval` rejects when its selector matches nothing, so looking up unused
   * fields would needlessly abort the scrape for posts that lack them.
   *
   * @param {*} tag - Value copied verbatim into every result row.
   * @param {*} sub - Value copied verbatim into every result row.
   * @returns {Promise<Array<Array>>} One `[title, postURL, tag, author, sub]`
   *   row per post, in page order. Site-relative post URLs (starting with
   *   "/r") are made absolute against https://old.reddit.com.
   */
  getResults: async (tag, sub) => {
    const elements = await self.page.$$('#siteTable > div[class*="thing"]');
    const results = [];

    for (const element of elements) {
      // The three lookups are independent, so issue them together.
      const [title, href, author] = await Promise.all([
        element.$eval('p[class="title"]', (node) => node.innerText.trim()),
        element.$eval('a[class*="title"]', (node) => node.getAttribute('href')),
        element.$eval('p[class="tagline "] > a[class*="author"]', (node) => node.innerText.trim()),
      ]);

      // Self posts link with a site-relative path; external links are already absolute.
      const postURL = href.startsWith('/r') ? `https://old.reddit.com${href}` : href;

      results.push([title, postURL, tag, author, sub]);
    }

    return results;
  },
};
module.exports = self;