-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathindex.mjs
196 lines (156 loc) · 7.1 KB
/
index.mjs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import puppeteer from 'puppeteer';
import commander from 'commander';
import filenamify from 'filenamify';
import fs from 'fs-extra';
import metadata from './package.json';
// Screenshot
import GIFEncoder from 'gifencoder';
import inkjet from 'inkjet';
// Command-line interface: registers the `search` sub-command and dispatches
// it to `scrape` with the positional arguments and parsed options.
commander
  .version(metadata.version)
  .command('search [query] [zip]')
  .description('Searches Bazos.cz using the given search query within the given Czech Republic ZIP code area.')
  .option('-f, --from [amount]', 'price from amount (CZK)')
  .option('-t, --to [amount]', 'price to amount (CZK)')
  .option('-w, --window', 'to run in non-headless mode (windowed)')
  .option('-r, --record', 'record trace / screenshots')
  // `scrape` is async, so a failure surfaces as a rejected promise. Handle it
  // here: report the error and mark the process as failed instead of leaving
  // an unhandled rejection behind.
  .action((...args) => scrape(...args).catch(error => {
    console.error(error);
    process.exitCode = 1;
  }))
;

commander.parse(process.argv);
/**
 * Searches Bazos.cz and stores every matching post in a JSON file.
 *
 * Drives a Puppeteer-controlled browser: fills in the search form on the home
 * page, walks through all result pages and writes the collected posts to a
 * JSON file named after the search parameters. When `record` is set, the
 * screenshots captured in the browser trace are additionally stitched into
 * `screenshot.gif`. The browser is always closed, even when a step fails.
 *
 * @param {string} query - Search phrase typed into the `#hledat` field.
 * @param {string} zip - ZIP code typed into the `#hlokalita` field.
 * @param {object} options - Options object (passed in by commander).
 * @param {string} [options.from] - Minimum price, typed into `cenaod` when truthy.
 * @param {string} [options.to] - Maximum price, typed into `cenado` when truthy.
 * @param {boolean} [options.window] - Run the browser non-headless (windowed).
 * @param {boolean} [options.record] - Record a trace and produce `screenshot.gif`.
 * @returns {Promise<void>}
 * @throws {Error} When the number of collected posts disagrees with the counts
 *   reported by the site, or when an expected page element is missing.
 */
async function scrape(query, zip, { from: priceMin, to: priceMax, window, record }) {
  console.log(`Searching for '${query}' in Czech Republic ZIP code area ${zip} priced between ${priceMin} and ${priceMax} CZK.`);
  const browser = await puppeteer.launch({ headless: !window, args: record ? ['--window-size=800,600' /* Match default viewport */] : [] });
  try {
    const page = (await browser.pages())[0];
    await page.bringToFront();

    // Only stop (and process) the trace if it was actually started.
    let isTracing = false;
    if (record) {
      await page.tracing.start({ screenshots: true });
      isTracing = true;
    }

    try {
      // Speed up browsing and clean up screenshots by blocking 3rd party networking
      await page.setRequestInterception(true);
      page.on('request', request => {
        const url = new URL(request.url());
        if (url.hostname !== 'bazos.cz' && url.hostname !== 'www.bazos.cz') {
          request.abort();
        } else {
          request.continue();
        }
      });

      await page.goto('https://bazos.cz');

      // Dismiss the cookie banner to make the screenshot animation less jumpy
      if (record) {
        const cookieButton = await page.waitForSelector('.fucking-eu-cookies button');
        await cookieButton.click();
      }

      await submitSearch(page, { query, zip, priceMin, priceMax, window });

      const start = new Date();
      const results = await collectAllPosts(page);
      const end = new Date();

      await fs.writeJSON(filenamify(`${query}-in-${zip}-from-${priceMin || 'any'}-czk-to-${priceMax || 'any'}-czk.json`), { start, end, results }, { spaces: 2 });
    } finally {
      if (isTracing) {
        await saveScreenshotAnimation(page);
      }
    }
  } finally {
    // Runs even when tracing/GIF encoding fails so the browser never leaks.
    await browser.close();
  }
}

/** Focuses the element matching `selector` and types `text` into it. */
async function typeIntoField(page, selector, text) {
  const input = await page.$(selector);
  if (!input) {
    throw new Error(`The '${selector}' field was not found on the page.`);
  }

  await input.focus();
  await page.keyboard.type(text);
}

/** Fills in the search form on the current page, submits it and waits for the results page. */
async function submitSearch(page, { query, zip, priceMin, priceMax, window }) {
  await typeIntoField(page, '#hledat', query);

  // Discard the autocomplete prompt.
  if (window) {
    await page.waitForSelector('#vysledek');
    await page.evaluate(() => document.getElementById('vysledek').remove());
  }

  await typeIntoField(page, '#hlokalita', zip);

  // Discard the autocomplete prompt.
  if (window) {
    await page.waitForSelector('#vysledekpsc');
    await page.evaluate(() => document.getElementById('vysledekpsc').remove());
  }

  if (priceMin) {
    await typeIntoField(page, 'input[name=cenaod]', priceMin);
  }

  if (priceMax) {
    await typeIntoField(page, 'input[name=cenado]', priceMax);
  }

  const submitInput = await page.$('input[name=Submit]');

  // Start waiting for the navigation before clicking: if the click were
  // awaited first, the navigation could finish before the wait is registered
  // and the wait would then hang until it times out.
  await Promise.all([page.waitForNavigation(), submitInput.click()]);
}

/** Walks all result pages starting at the current one and returns the posts found on them. */
async function collectAllPosts(page) {
  const results = [];
  let total;
  let nextPageHref;
  do {
    // Remove the advertisement banner to prevent jump and make screenshots nice
    // Note that this is done this way because request interception doesn't seem to work
    // https://github.com/GoogleChrome/puppeteer/issues/4702
    //await page.addStyleTag({ content: '#adcontainer1 { display: none !important; }' });

    // The summary cell is split on whitespace and hyphens; the numbers at
    // positions 1, 2 and 5 are the first post, the last post and the total.
    const summaryText = await page.$eval('table.listainzerat > tbody > tr > td', listaTd => listaTd.textContent);
    const summaryTextParts = summaryText.trim().split(/[\s-]/g);
    const firstPostNo = Number(summaryTextParts[1]);
    const lastPostNo = Number(summaryTextParts[2]);
    total = Number(summaryTextParts[5]);
    if ([firstPostNo, lastPostNo, total].some(number => Number.isNaN(number))) {
      throw new Error(`Unable to parse the result summary '${summaryText.trim()}'.`);
    }

    const pageTotal = lastPostNo - firstPostNo + 1;
    console.log(`Showing posts #${firstPostNo}-${lastPostNo} (${pageTotal} on the page) out of ${total} total results.`);

    // This callback runs inside the page, so it can only use what is passed
    // to it as an argument (no Node-side helpers or closures).
    const pageResults = await page.evaluate(pageTotal => {
      const results = Array.from(document.querySelectorAll('span.vypis'), vypisSpan => {
        const nadpisA = vypisSpan.querySelector('span.nadpis > a');
        const velikostSpan = vypisSpan.querySelector('span.velikost10');
        const popisDiv = vypisSpan.querySelector('div.popis');
        const cenaSpan = vypisSpan.querySelector('span.cena > b');

        const title = nadpisA.textContent;
        const url = nadpisA.href;

        // Drop the 4 leading characters and the trailing one, then read the
        // dot-separated day, month and year.
        const [year, month, day] = velikostSpan.lastChild.textContent.slice(4, -1).split('.').map(n => Number(n.trim())).reverse();
        const description = popisDiv.textContent;

        // Strip every whitespace character (not just the first one) so that
        // amounts with several thousands separators parse as well. Text that
        // is not a number yields NaN.
        const price = Number(cenaSpan.textContent.slice(2, -3).replace(/\s/g, ''));
        return { title, url, year, month, day, description, price };
      });

      if (results.length !== pageTotal) {
        throw new Error(`Expected to collect ${pageTotal} posts on the page but got ${results.length}.`);
      }

      const strankovaniA = document.querySelector('p.strankovani > a:last-child');
      return { results, nextPageHref: strankovaniA && strankovaniA.href };
    }, pageTotal);

    results.push(...pageResults.results);
    nextPageHref = pageResults.nextPageHref;
    if (nextPageHref) {
      await page.goto(nextPageHref);
      console.log(`Collected ${pageResults.results.length} results on the page, ${results.length} total so far.\nAdvancing to the further page.`);
    } else {
      console.log(`Collected ${pageResults.results.length} results on the page, ${results.length} total.\nQuitting as this is the final page.`);
    }
  } while (nextPageHref);

  if (results.length !== total) {
    throw new Error(`Expected to collect ${total} posts but got ${results.length}.`);
  }

  return results;
}

/** Decodes a base64-encoded trace snapshot into `{ width, height, data }` using inkjet. */
function decodeSnapshot(snapshot) {
  return new Promise((resolve, reject) => inkjet.decode(Buffer.from(snapshot, 'base64'), (error, decoded) => {
    if (error) {
      reject(error);
    } else {
      resolve(decoded);
    }
  }));
}

/** Stops the page trace and encodes its screenshot snapshots into `screenshot.gif`. */
async function saveScreenshotAnimation(page) {
  console.log('Collecting and parsing the trace data.');
  const trace = JSON.parse(String(await page.tracing.stop()));

  // Not every trace event is guaranteed to carry `args`, let alone a snapshot.
  const snapshotTraceEvents = trace.traceEvents.filter(traceEvent => traceEvent.args && traceEvent.args.snapshot);
  if (snapshotTraceEvents.length === 0) {
    console.log('The trace contains no screenshots, skipping the screenshot animation.');
    return;
  }

  // The encoder is created lazily because its dimensions come from the first frame.
  let gifEncoder;
  for (const traceEvent of snapshotTraceEvents) {
    console.log('Encoding a snapshot to the screenshot animation.');
    const { width, height, data } = await decodeSnapshot(traceEvent.args.snapshot);
    if (!gifEncoder) {
      gifEncoder = new GIFEncoder(width, height);
      gifEncoder.createReadStream().pipe(fs.createWriteStream('screenshot.gif'));
      gifEncoder.start();
      gifEncoder.setRepeat(0); // Repeat
      gifEncoder.setDelay(50);
      gifEncoder.setQuality(10); // Best?
    }

    gifEncoder.addFrame(data);
  }

  console.log('Finishing up the screenshot animation.');
  gifEncoder.finish();
}