-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinArchive.js
177 lines (171 loc) · 4.85 KB
/
inArchive.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
const WDAPDCrawler = require('./lib/archiveCrawler')
const program = require('commander')
const fs = require('fs-extra')
const path = require('path')
const normalURL = require('normalize-url')
const Promise = require('bluebird')
const filName = require('filenamify-url')
/**
 * Run a URL through `normalURL` with a fixed option set.
 *
 * Only `normalizeProtocol` is enabled; every other option passed here
 * (`normalizeHttps`, `stripFragment`, `stripWWW`, `removeTrailingSlash`,
 * `removeDirectoryIndex`) is explicitly switched off.
 *
 * @param {string} url - URL to normalize (presumably a string; the value is
 *   handed to `normalURL` unchanged).
 * @returns {*} Whatever `normalURL` returns for that input.
 */
function ensureURL (url) {
  const normalizeOptions = {
    normalizeProtocol: true,
    normalizeHttps: false,
    stripFragment: false,
    stripWWW: false,
    removeTrailingSlash: false,
    removeDirectoryIndex: false
  }
  return normalURL(url, normalizeOptions)
}
// Command-line interface definition. Three options are declared (-t/--tab,
// -s/--seedlist, -d/--dump) and then the process arguments are parsed.
program.version('0.0.1')
program.option('-t, --tab', 'Crawl Using Another Tab')
program.option('-s, --seedlist <slp>', 'Seed List Path')
program.option('-d, --dump <where>', 'Data Dump Location')
program.parse(process.argv)
/**
 * Crawl the seed list one URL at a time, driven by the crawler's events.
 *
 * Flow, as implemented below:
 *  - Load the seed list, keep entries with `c >= 2953`, and take the first one.
 *  - On `network-idle`: wait 10 s, ask the crawler where it is, and either
 *    follow a single redirect, record the seed as bad (`why: 'redirection'`),
 *    or dump the crawler to `sldir3Dump/<c>-<filename>.json`; then move on.
 *  - On `navigation-timedout`: record the seed as bad (`why: 'navigation'`)
 *    and move on.
 *  - When no seeds remain, the crawler is closed.
 *
 * NOTE(review): the seed list path, the `c >= 2953` threshold (presumably a
 * resume point) and the dump directory are hard-coded; the `--seedlist` and
 * `--dump` CLI options are not consulted here — TODO confirm this is intended.
 *
 * @returns {Promise<void>} Resolves once the first navigation has been issued;
 *   the rest of the crawl proceeds from the event handlers.
 * @throws {Error} If no seed is left after filtering.
 */
async function doIt () {
  let seedList = await fs.readJSON('/data/pyProjects/tenkTM/web.archive.org.json')
  let crawler
  if (program.tab !== undefined) {
    console.log('tabbedd')
    crawler = new WDAPDCrawler({newTab: true})
  } else {
    crawler = new WDAPDCrawler()
  }

  seedList = seedList.filter(it => it.c >= 2953)
  let curSeed = seedList.shift()
  if (!curSeed) {
    // Fail with a clear message instead of a TypeError on `curSeed.c`.
    throw new Error('No seeds left to crawl after filtering the seed list')
  }
  console.log(curSeed.c, seedList.length)
  curSeed.uri = ensureURL(curSeed.uri)
  await crawler.init()

  // Number of redirects already followed for the current seed.
  let threeXXX = 0
  // Re-entrancy guard: true while a 'network-idle' event is being processed.
  let working = false

  // Stop the crawler, logging (not propagating) any failure so the crawl can continue.
  const stopCrawler = async () => {
    try {
      await crawler.stop()
    } catch (error) {
      console.error(error)
    }
  }

  // Append the current seed, tagged with the reason it was abandoned, to the bad-URL log.
  const recordBadSeed = async why => {
    curSeed.why = why
    try {
      await fs.appendFile('baddUrls.log', `${JSON.stringify(curSeed)}\n`, 'utf8')
    } catch (error) {
      console.error(error)
    }
  }

  // Advance to the next seed and navigate to it, or close the crawler when none remain.
  const crawlNextSeed = () => {
    curSeed = seedList.shift()
    if (!curSeed) {
      // Checked before touching `curSeed.c`: shift() yields undefined on an empty list.
      crawler.close()
      return
    }
    console.log(curSeed.c, seedList.length)
    curSeed.uri = ensureURL(curSeed.uri)
    console.log(curSeed.uri)
    // Release the guard only once the next navigation is about to start.
    working = false
    crawler.goto(curSeed.uri)
  }

  crawler.on('network-idle', async () => {
    if (working || !curSeed) {
      return
    }
    working = true
    console.log('idle network')
    await Promise.delay(10000)

    let where
    let wasError = false
    try {
      where = (await crawler.whereAreWe()).result.value
    } catch (error) {
      // Best effort: if the location cannot be determined the page is still dumped.
      console.error(error)
      wasError = true
    }

    if (!wasError && where !== curSeed.uri) {
      // The browser ended up somewhere other than the requested URL.
      await stopCrawler()
      if (threeXXX < 1) {
        // Follow the first redirect by re-crawling the landing URL.
        console.log('3xx', curSeed.uri)
        curSeed.uri = where
        threeXXX += 1
        working = false
        crawler.goto(where)
      } else {
        // Redirected again: give up on this seed.
        threeXXX = 0
        await recordBadSeed('redirection')
        crawlNextSeed()
      }
      return
    }

    threeXXX = 0
    await crawler.getExtraInfo()
    await stopCrawler()
    try {
      await fs.writeJSON(`sldir3Dump/${curSeed.c}-${filName(curSeed.uri)}.json`, crawler)
    } catch (error) {
      console.error(error)
    }
    crawlNextSeed()
  })

  crawler.on('navigation-timedout', async () => {
    if (!curSeed) {
      return
    }
    working = false
    console.log('nav timeout')
    await stopCrawler()
    await recordBadSeed('navigation')
    crawlNextSeed()
  })

  crawler.on('navigated', () => {
    console.log('navigated')
  })

  await Promise.delay(5000)
  console.log(curSeed.uri)
  crawler.goto(curSeed.uri)
}
// Script entry point: start the crawl. A rejection from doIt() is logged and
// also reflected in the exit status, so callers can detect the failure
// (previously the process still exited with status 0).
doIt().catch(error => {
  console.error(error)
  process.exitCode = 1
})