-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.js
145 lines (125 loc) · 4.2 KB
/
scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
// get core
const express = require('express');
// filesystem
const fs = require('fs');
// scraping
const requestP = require('request-promise');
const cheerio = require('cheerio');
// middleware
const cors = require('cors')
const morgan = require('morgan')
// utils
const slugify = require('./util/slugify.js');
// create the express application that this module exports
const app = express();
// port value: the PORT environment variable in production, 8000 otherwise
const port = process.env.NODE_ENV !== 'production' ? 8000 : process.env.PORT;
// register middleware in order: morgan ('combined' format) first, then cors
app.use(morgan('combined'));
app.use(cors());
// scrape destinations
// moby dick = http://www.gutenberg.org/files/2701/2701-h/2701-h.htm
// alice in wonderland = https://www.gutenberg.org/files/11/11-h/11-h.htm
// through the looking glass = https://www.gutenberg.org/files/12/12-h/12-h.htm
// the url passed as an argument
const bookURL = process.argv[2];

// directory (relative to the current working directory) that receives the output files
const OUTPUT_DIR = 'content';

/**
 * Drop everything up to the last ". " separator of a chapter heading,
 * e.g. "CHAPTER 6. The Street." becomes "The Street.".
 * Headings without that separator are returned unchanged.
 *
 * @param {string} headingText - trimmed heading text
 * @returns {string} the heading without its leading label
 */
function stripChapterLabel(headingText) {
    const parts = headingText.split('. ');
    return parts.length > 1 ? parts[parts.length - 1] : headingText;
}

/**
 * Collect every <h2> whose text contains "CHAPTER", numbered from 1 in document order.
 *
 * @param {Function} $ - the cheerio instance loaded with the book page
 * @returns {Array<{identifier: number, content: string}>} chapter titles
 */
function collectChapterTitles($) {
    const titles = [];
    // a regular function is required here: cheerio binds `this` to the current element
    $('h2').each(function () {
        const text = $(this).text().trim();
        // an empty heading can never contain "CHAPTER", so no separate empty check is needed
        if (text.includes('CHAPTER')) {
            titles.push({
                identifier: titles.length + 1,
                content: stripChapterLabel(text)
            });
        }
    });
    return titles;
}

/**
 * Collect every non-empty <p>, numbered from 1 in document order.
 *
 * @param {Function} $ - the cheerio instance loaded with the book page
 * @returns {Array<{identifier: number, content: string}>} paragraphs
 */
function collectParagraphs($) {
    const paragraphs = [];
    $('p').each(function () {
        const text = $(this).text().trim();
        if (text !== '') {
            paragraphs.push({
                identifier: paragraphs.length + 1,
                content: text
            });
        }
    });
    return paragraphs;
}

/**
 * Serialise items as one JSON document per line with no separating commas
 * (the comma-less pseudo json used for the mLab import).
 *
 * @param {Array<Object>} items - records to serialise
 * @returns {string} newline-joined JSON strings
 */
function toJsonLines(items) {
    return items.map(item => JSON.stringify(item)).join('\n');
}

/**
 * Create a directory, treating "already exists" as success.
 *
 * @param {string} dir - directory path
 * @returns {Promise<void>} rejects with the fs error for any failure other than EEXIST
 */
function ensureDirectory(dir) {
    return new Promise((resolve, reject) => {
        fs.mkdir(dir, err => {
            if (err && err.code !== 'EEXIST') {
                reject(err);
                return;
            }
            resolve();
        });
    });
}

/**
 * Write one output file and log the success message only if the write really succeeded.
 *
 * @param {string} filePath - destination path
 * @param {string} data - file contents
 * @param {string} successMessage - message logged after a successful write
 * @returns {Promise<void>} rejects with the fs error when the write fails
 */
function writeOutputFile(filePath, data, successMessage) {
    return new Promise((resolve, reject) => {
        fs.writeFile(filePath, data, err => {
            if (err) {
                reject(err);
                return;
            }
            console.log(successMessage);
            resolve();
        });
    });
}

// only scrape when a usable url was passed on the command line
if (bookURL && bookURL !== '/') {
    // set scrape url && requestP options (instruct cheerio)
    const options = {
        uri: bookURL,
        transform: body => cheerio.load(body)
    };

    requestP(options)
        .then($ => {
            // the book title comes from the page's <h1>; the slug stays empty when there is none
            const bookName = $('h1').text().trim();
            const bookNameSlug = bookName !== '' ? slugify(bookName) : '';

            const titles = collectChapterTitles($);
            const paragraphs = collectParagraphs($);
            console.log('Done scraping');

            const base = `${OUTPUT_DIR}/${bookNameSlug}`;
            const outputs = [
                // pretty-printed json files
                { file: `${base}-titles.json`, data: JSON.stringify(titles, null, 4), done: 'Titles json file written!' },
                { file: `${base}-paragraphs.json`, data: JSON.stringify(paragraphs, null, 4), done: 'Paragraphs json file written!' },
                // one-record-per-line txt files
                { file: `${base}-titles.txt`, data: toJsonLines(titles), done: 'Titles string file written!' },
                { file: `${base}-paragraphs.txt`, data: toJsonLines(paragraphs), done: 'Paragraphs string file written!' }
            ];

            // returning the promise lets a failed mkdir or write reach the catch below
            // instead of being ignored
            return ensureDirectory(OUTPUT_DIR).then(() => Promise.all(
                outputs.map(output => writeOutputFile(output.file, output.data, output.done))
            ));
        })
        .catch(err => {
            console.log(err);
        });
    console.log('working...');
}
// Last-resort reporting: print any exception that nothing else caught.
// NOTE(review): with this listener registered Node no longer exits on an
// uncaught exception, so the process keeps running afterwards — confirm that is intended.
process.on('uncaughtException', function (err) {
    console.error(err);
    // thrown values are not always Error objects (`throw undefined` is legal),
    // so only read `.stack` when there is a value carrying one
    if (err != null && err.stack) {
        console.error(err.stack);
    }
});
// expose the configured express app to any module that requires this file
module.exports = app;