This repository has been archived by the owner on Aug 7, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
index.js
86 lines (78 loc) · 2.21 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
/*----------
Put the group URL inside the double quotes of `group`.
Put your cookie string inside the single quotes of `cookie`.
compress: whether to save the archive as a compressed file instead of a
folder; true for yes, false for no (default: true).
------------
Examples:
const group = "https://www.douban.com/group/114514";
const cookie = 'll="114514"; bid=fx-abcdef; ......';
const compress = true;
----------*/
// User configuration section
// Base URL of the group to archive; the scrape starts at `${group}/discussion`.
const group = "";
// Cookie string sent as the `Cookie` request header.
const cookie = '';
// When true, the zip helper is called with the archive timestamp after scraping.
const compress = true;
// End of user configuration section
// Do not modify anything below unless you know what you are doing
import scrape from "website-scraper";
import zip from "./zip.js";
// Capture the current time and turn its ISO-8601 form into a name that is safe
// to use as a directory: ":" and "T" become "-", and the trailing "Z" is
// dropped (e.g. "2021-04-01T12:34:56.789Z" -> "2021-04-01-12-34-56.789").
const time = new Date();
const timenow = time
  .toISOString()
  .replace(/[:TZ]/g, (ch) => (ch === "Z" ? "" : "-"));
/**
 * Scraper plugin that pauses for a random 0-10 seconds before every request,
 * so pages are not fetched fast enough to trigger a captcha.
 */
class MyPlugin {
  /**
   * Registers the throttling handler for the "beforeRequest" action.
   *
   * @param {Function} registerAction - Called as
   *   `registerAction(actionName, handler)` to hook into the scraper.
   */
  apply(registerAction) {
    // Sleeps for a random delay, then hands the request options back unchanged.
    const delayThenContinue = async ({ requestOptions }) => {
      const delayMs = Math.round(Math.random() * 10000);
      await new Promise((done) => {
        setTimeout(done, delayMs);
      });
      return { requestOptions };
    };

    registerAction("beforeRequest", delayThenContinue);
  }
}
// Options passed to website-scraper.
const options = {
  // Entry point: the group's discussion list.
  urls: [`${group}/discussion`],
  /**
   * URL allow-list: decides whether a discovered URL should be downloaded.
   *
   * @param {string} url - Candidate URL found while scraping.
   * @returns {boolean} true to download the URL, false to skip it.
   */
  urlFilter: (url) => {
    // Reject the "new topic" page first. It lives under the group URL, so it
    // would otherwise be accepted by the group-prefix rule below. A literal
    // substring test is used so characters such as "." in the group URL are
    // not interpreted as regular-expression syntax.
    if (url.includes(`${group}/new_topic`)) {
      return false;
    }

    const allowedPrefixes = [
      group, // pages of the group itself
      "https://img1.doubanio.com", // image hosts
      "https://img2.doubanio.com",
      "https://img3.doubanio.com",
      "https://www.douban.com/group/topic/", // individual topics
    ];
    return allowedPrefixes.some((prefix) => url.startsWith(prefix));
  },
  recursive: true, // follow links found in downloaded pages
  maxRecursiveDepth: 1, // only one level deep
  filenameGenerator: "bySiteStructure",
  directory: `./archive/${timenow}`,
  request: {
    headers: {
      // Browser-like User-Agent so requests look like they come from Firefox.
      "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0",
      Referer: "https://www.douban.com/group/",
      Cookie: cookie,
    },
  },
  plugins: [new MyPlugin()], // random 0-10 second wait before each request
};
// Run the scraper with the options above and wait for it to finish.
const result = await scrape(options);
// Print whatever the scraper resolved with.
console.log(result);
// When compression is enabled, pass the archive timestamp to the zip helper.
// NOTE(review): the return value of zip() is not awaited; if the helper is
// asynchronous, its failures would not be surfaced here — confirm in ./zip.js.
if (compress) {
zip(timenow);
}