-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.js
106 lines (87 loc) · 2.62 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
const fs = require('fs')
const { pull, collect } = require('pull-stream')
const pullify = require('async-iterator-to-pull-stream')
const iterator = require('pull-stream-to-async-iterator')
const paramap = require('pull-paramap')
const file = require('pull-file')
const write = require('pull-write-file')
const { WARCParser, WARCRecord, WARCSerializer, CDXIndexer } = require('warcio')
exports.getWarcRecord = function (warcPath, offset, length) {
let sent
const options = {
start: offset,
end: offset + length,
}
return function read(end, cb) {
return end
? cb(end)
: sent
? cb(true)
: pull(
file(warcPath, options),
collect(async (err, chunks) => {
if (err) {
return cb(err)
}
const parser = new WARCParser(chunks)
const record = await parser.parse()
sent = true
cb(null, record)
})
)
}
}
exports.readWarcFile = function (warcPath) {
const reader = fs.createReadStream(warcPath)
const parser = new WARCParser(reader)
return pullify(parser)
}
exports.writeWarcFile = function (warcPath, cb) {
const warcVersion = 'WARC/1.1'
const info = {
software: 'warcio.js/pull-warc in nodejs',
format: 'WARC File Format 1.1',
isPartOf: warcPath,
}
const filename = warcPath.split('/').pop()
let started
return pull(
paramap(async (serializedWARCRecord, _cb) => {
const records = []
if (!started) {
started = true
const warcinfo = await WARCRecord.createWARCInfo(
{ filename, warcVersion },
info
)
const serializedWARCInfo = await WARCSerializer.serialize(warcinfo, {})
records.push(Uint8Array.from(serializedWARCInfo))
}
records.push(Uint8Array.from(serializedWARCRecord))
return _cb(null, Buffer.concat(records))
}),
write(warcPath, cb)
)
}
exports.serializeWarcRecord = () => {
return paramap(async (opts, cb) => {
let contents
if (!opts.warcVersion) {
opts.warcVersion = 'WARC/1.1'
}
if (opts.contents) {
contents = iterator(opts.contents)
}
const record = await WARCRecord.create(opts, contents)
const serializedWARCRecord = await WARCSerializer.serialize(record, {
digest: { algo: 'sha-1', prefix: 'sha1:', base32: true },
})
return cb(null, serializedWARCRecord)
})
}
exports.CDXIndexerStream = function (warcPath, opts) {
const indexer = new CDXIndexer(opts)
const reader = fs.createReadStream(warcPath)
const files = [{ reader, filename: warcPath }]
return pullify(indexer.iterIndex(files))
}