-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrap.js
142 lines (111 loc) · 4.04 KB
/
scrap.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
/**
* Word scraper using PhantomJS.
* This lets Google Books do the hard OCR work and simply retrieves the result.
*
*/
const phantom = require('phantom');
const fs = require('fs');
const sharp = require('sharp');
const axios = require('axios');
const path = require('path');
const rimraf = require('rimraf');
/* Required to allow deleting temp files after use */
sharp.cache(false);
const GOOGLE_API_URL = 'https://www.googleapis.com/books/v1/volumes?q=';
/**
 * Searches Google Books previews for a keyword and saves the highlighted
 * region of the preview page image as `./result/<keyword>.jpg`.
 *
 * Typical usage: `await scrapper.init()`, then `await scrapper.search([...])`,
 * and finally `await scrapper.close()` to stop the PhantomJS process.
 */
class WordScrapper {
  constructor() {
    // Both fields are populated by init().
    this.instance = null;
    this.initialRequest = null;
  }

  /**
   * Starts the PhantomJS instance and performs an initial request to Google
   * whose `set-cookie` headers are replayed later by downloadFrom().
   *
   * @returns {Promise<void>}
   */
  async init() {
    console.log('Initializing...');
    // Initialize PhantomJS instance
    this.instance = await phantom.create();
    // Retrieve Google NID cookie
    this.initialRequest = await axios({
      method: 'GET',
      url: 'https://google.com',
    });
    console.log('Complete.');
  }

  /**
   * Stops the PhantomJS instance created by init(). Safe to call more than
   * once, or without a prior init().
   *
   * @returns {Promise<void>}
   */
  async close() {
    if (this.instance) {
      const instance = this.instance;
      this.instance = null;
      await instance.exit();
    }
  }

  /**
   * Searches each keyword in turn (sequentially, not in parallel).
   *
   * @param {Iterable<string>} keywords - Keywords to look for.
   * @returns {Promise<void>}
   */
  async search(keywords) {
    for (const keyword of keywords) {
      console.log(`Searching for ${keyword}...`);
      await this.searchKeyword(keyword);
    }
  }

  /**
   * Queries the books API for one keyword and tries each returned book until
   * retrieveKeyword() reports success.
   *
   * @param {string} keyword - Raw (unencoded) search keyword.
   * @returns {Promise<void>}
   */
  async searchKeyword(keyword) {
    // Encode the keyword so spaces, `&`, `#`, etc. cannot corrupt the query string.
    const books = await axios(GOOGLE_API_URL + encodeURIComponent(keyword));
    // The response may carry no `items` array at all; treat that as "no books".
    const items = (books.data && books.data.items) || [];
    for (const [index, value] of items.entries()) {
      const previewLink = value.volumeInfo && value.volumeInfo.previewLink;
      if (!previewLink) {
        continue;
      }
      console.log(`Scrapping book number ${index} : ${previewLink}`);
      const wordFound = await this.retrieveKeyword(previewLink, keyword, index);
      if (wordFound) {
        return;
      }
    }
  }

  /**
   * Opens a book preview page, locates the highlighted box, downloads the page
   * image and crops the highlighted part into `./result/<keyword>.jpg`.
   *
   * @param {string} url - Preview page URL.
   * @param {string} keyword - Keyword being searched; used for the output file name.
   * @param {number} index - Index of the book; used for the temp file name.
   * @returns {Promise<boolean>} true when the cropped picture was written.
   */
  async retrieveKeyword(url, keyword, index) {
    const page = await this.instance.createPage();
    try {
      const status = await page.open(url);
      if (status !== 'success') {
        console.log(`Error: cannot open ${url} (status: ${status}).`);
        return false;
      }

      // This callback is executed inside the PhantomJS page, so it must stay
      // ES5 and return plain JSON-serializable data rather than a DOM node.
      const img = await page.evaluate(function () {
        var el = document.querySelector('.pageImageDisplay > div > img');
        return el ? { src: el.src, width: el.width } : null;
      });
      if (!img || !img.src) {
        console.log('Error: cannot find the page image.');
        return false;
      }

      const highlightBox = await this.getHighlightBox(page);
      if (!highlightBox) {
        return false;
      }

      const imgWidth = parseInt(img.width, 10);
      const tempPath = path.resolve(__dirname, 'temp', `temp_${index}.jpg`);
      let success = false;
      try {
        await this.downloadFrom(img.src, tempPath);
        try {
          // Resize image to fit Google's, then extract the highlighted part
          await sharp(tempPath)
            .resize(imgWidth)
            .extract(highlightBox)
            .toFile(`./result/${keyword}.jpg`);
          success = true;
        } catch (err) {
          console.log('Error: cannot resize or crop the picture.', err && err.message);
        }
      } finally {
        // Delete exactly the file that was written (same absolute path).
        await this.removeTempFile(tempPath);
      }
      return success;
    } finally {
      // Always release the PhantomJS page, whatever happened above.
      try {
        await page.close();
      } catch (err) {
        console.log('Error: cannot close the page.', err && err.message);
      }
    }
  }

  /**
   * Internal helper: best-effort deletion of a temporary file. Never rejects;
   * a missing file is not treated as an error.
   *
   * @param {string} filePath - Absolute path of the file to delete.
   * @returns {Promise<void>}
   */
  removeTempFile(filePath) {
    return new Promise((resolve) => {
      fs.unlink(filePath, (err) => {
        if (!err) {
          console.log('Temporary file deleted.');
        } else if (err.code !== 'ENOENT') {
          console.log('Error: cannot delete the temporary file.', err.message);
        }
        resolve();
      });
    });
  }

  /**
   * Streams `url` into the file at `path`.
   *
   * @param {string} url - Resource to download.
   * @param {string} path - Destination file path.
   * @returns {Promise<void>} Resolves once the file has been fully written and
   *   closed; rejects with the underlying error on a read or write failure.
   */
  async downloadFrom(url, path) {
    // A Cookie request header is `name=value; name2=value2`, so keep only the
    // leading name=value pair of every Set-Cookie line and drop its attributes.
    const setCookie = (this.initialRequest
      && this.initialRequest.headers
      && this.initialRequest.headers['set-cookie']) || [];
    const cookie = [].concat(setCookie)
      .map((line) => String(line).split(';')[0].trim())
      .filter((pair) => pair.length > 0)
      .join('; ');

    const response = await axios({
      method: 'GET',
      url,
      responseType: 'stream',
      headers: cookie ? { Cookie: cookie } : {},
    });

    return new Promise((resolve, reject) => {
      const writer = fs.createWriteStream(path);
      // Wait for the writer (not the reader) so the data is flushed and the
      // descriptor is closed before the file is read or deleted.
      writer.on('close', resolve);
      writer.on('error', reject);
      response.data.on('error', (err) => {
        // reject() runs before the writer's 'close', so the later resolve is a no-op.
        reject(err);
        writer.destroy();
      });
      response.data.pipe(writer);
    });
  }

  /**
   * Reads the position and size of the highlighted box from the preview page.
   *
   * @param {object} page - PhantomJS page that already has the preview loaded.
   * @returns {Promise<{height: number, width: number, left: number, top: number}|undefined>}
   *   The box in pixels, or undefined when no usable highlight is found.
   */
  async getHighlightBox(page) {
    let style = null;
    try {
      // Runs inside the PhantomJS page: ES5 only, and only plain strings are
      // returned because DOM nodes cannot be passed back to Node.
      style = await page.evaluate(function () {
        var root = document.querySelectorAll('.pageImageDisplay')[0];
        var divs = root ? root.querySelectorAll('div') : [];
        var highlight = divs.length > 7 ? divs[7].querySelector('div') : null;
        if (!highlight) {
          return null;
        }
        return {
          height: highlight.style.height,
          width: highlight.style.width,
          left: highlight.style.left,
          top: highlight.style.top,
        };
      });
    } catch (err) {
      style = null;
    }

    const box = style
      ? {
        height: parseInt(style.height, 10),
        width: parseInt(style.width, 10),
        left: parseInt(style.left, 10),
        top: parseInt(style.top, 10),
      }
      : null;

    // A box with any non-numeric side cannot be used for cropping.
    if (!box || Object.values(box).some((side) => Number.isNaN(side))) {
      console.log('Error: chances are there is no highlighted text on this cover.');
      return undefined;
    }
    return box;
  }
}
// Expose the WordScrapper class as this module's export.
module.exports = WordScrapper;