-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathwarcChecker.js
42 lines (28 loc) · 1.12 KB
/
warcChecker.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
/* eslint-disable max-len */
/* eslint-disable no-console */
const { WARCParser, WARCRecord, WARCSerializer } = require("warcio");
const fs = require('fs');
/**
 * Copies every `resource` record of a WARC file into `out.warc.gz`.
 *
 * Each record parsed from `filename` has its WARC type, target URI and
 * WARC-Record-ID header logged to the console. Records whose type is
 * `resource` are serialized with gzip enabled and appended to `out.warc.gz`
 * in the current working directory (the file is created or truncated).
 *
 * @param {string} filename - Path of the WARC file to read.
 * @returns {Promise<void>} Resolves once the output file has been completely
 *   flushed; rejects if reading, serializing or writing fails.
 */
async function readWARC(filename) {
  console.log(`Processing ${filename}`);

  // Open the input first: an invalid `filename` throws here, before the
  // existing output file gets truncated by createWriteStream.
  const nodeStream = fs.createReadStream(filename);
  const stream = fs.createWriteStream("out.warc.gz");

  // Settles when the output is fully flushed ('finish') or has failed ('error').
  // Registering the 'error' listener up front also keeps a write failure from
  // becoming an uncaught exception.
  const outputDone = new Promise((resolve, reject) => {
    stream.once("finish", resolve);
    stream.once("error", reject);
  });
  // The rejection is surfaced through the awaits below; this no-op handler only
  // avoids an "unhandled rejection" report when another error aborts the loop
  // before `outputDone` is awaited.
  outputDone.catch(() => {});

  try {
    const parser = new WARCParser(nodeStream);
    for await (const record of parser) {
      console.log(`warcType: ${record.warcType}`);
      console.log(`warcTargetURI: ${record.warcTargetURI}`);
      console.log(record.warcHeaders.headers.get("WARC-Record-ID"));

      if (record.warcType !== "resource") {
        continue;
      }

      const serializedRecord = await WARCSerializer.serialize(record, { gzip: true });
      if (!stream.write(serializedRecord)) {
        // The write buffer is full: wait until it drains (or the stream fails)
        // before serializing more records, so memory use stays bounded.
        await Promise.race([
          new Promise((resolve) => stream.once("drain", resolve)),
          outputDone,
        ]);
      }
    }
  } finally {
    // Always release both file handles, even when parsing or writing failed.
    stream.end();
    nodeStream.destroy();
  }

  // Do not report success until everything has reached the output file.
  await outputDone;
}
// Script entry point: process the WARC file named by the first CLI argument.
if (process.argv[2] === undefined) {
  // Bail out before readWARC runs, so a bare invocation does not touch the
  // output file.
  console.error("Usage: node warcChecker.js <warc-file>");
  process.exitCode = 1;
} else {
  // readWARC is async: without a handler a failure would only show up as an
  // unhandled promise rejection. Report it and signal failure via exit code.
  readWARC(process.argv[2]).catch((err) => {
    console.error(err);
    process.exitCode = 1;
  });
}