// index.js — forked from dwqs/area-puppeteer
const puppeteer = require('puppeteer');
const awaitTo = require('async-await-error-handling');
const ora = require('ora');
const chalk = require('chalk');
const path = require('path');
const fs = require('fs');
const { timeout, writeFileSync } = require('./utils');
// Two separately controlled ora spinners with the same yellow styling:
// spinner1 is used while scraping city-level data, spinner2 while scraping
// county-level data.
const spinner1 = ora({ color: 'yellow' });
const spinner2 = ora({ color: 'yellow' });
// Province lookup table: the '86' entry of ./provinces, indexed below by
// province codes such as '710000'.
const provinces = require('./provinces')['86'];
// Two-character province prefixes to scrape (populated further down).
const pcodes = [];
// URL template; the '#{route}' placeholder is replaced with the page route.
const target = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/#{route}.html';
// Reuse city data from an earlier run when cities.js sits next to this script;
// otherwise start with an empty list.
let cities = fs.existsSync(path.resolve(__dirname, 'cities.js'))
  ? require('./cities.js')
  : [];
// County-level records collected so far.
let areas = [];
// Most recently requested URL; reported in failure messages.
let url = '';
let type = 0; // 0: scraping city-level data, 1: scraping county-level data
// Target currently being scraped
let curCity = '';
let curPCode = '';
// Record the first two characters of every province code, skipping '710000'
// and '910000' (per the original note: filter out Hong Kong, Macau and Taiwan).
for (const code of Object.keys(provinces)) {
  if (code === '710000' || code === '910000') {
    continue;
  }
  pcodes.push(code.slice(0, 2));
}
/**
 * Scrape the city-level rows of one province page and append them to the
 * module-level `cities` list.
 *
 * Side effects: sets the module-level `url` to the page being requested (it is
 * reported in failure messages), updates `spinner1.text`, and reassigns
 * `cities` to a new array containing the previous entries plus the new rows.
 *
 * @param {object} page - Page object exposing `goto(url)` and
 *   `evaluate(fn, ...args)` (the Puppeteer page created by the caller).
 * @param {string} pcode - Two-character province prefix, e.g. '11'.
 * @returns {Promise<void>} Resolves once the rows have been appended; rejects
 *   if navigation or in-page evaluation fails, in which case `cities` is left
 *   unchanged.
 */
async function getCitiesByPCode (page, pcode) {
  url = target.replace('#{route}', pcode);
  const parentCode = `${pcode}0000`;
  // Update the status line before navigating so that it names the page being
  // loaded while the (slow) navigation is in progress.
  spinner1.text = chalk.blue(`正在抓取${provinces[parentCode]}的市级数据:${url}`);
  await page.goto(url);
  // Only this page's rows cross the browser boundary. Passing the accumulated
  // list in and out on every call would re-serialize all earlier results.
  const rows = await page.evaluate((provinceCode) => {
    // Each row's innerText is split on tabs; the first two fields are used.
    return [...document.querySelectorAll('.citytable .citytr')].map(el => {
      const t = el.innerText.split('\t');
      return {
        code: t[0],
        text: t[1],
        parentCode: provinceCode
      };
    });
  }, parentCode);
  cities = cities.concat(rows);
}
/**
 * Scrape the county-level rows listed under one city and append them to the
 * module-level `areas` list.
 *
 * Side effects: sets the module-level `url` to the page being requested (it is
 * reported in failure messages), updates `spinner2.text`, and reassigns
 * `areas` to a new array containing the previous entries plus the new rows.
 *
 * @param {object} page - Page object exposing `goto(url)` and
 *   `evaluate(fn, ...args)` (the Puppeteer page created by the caller).
 * @param {{code: string, text: string, parentCode: string}} city - City record
 *   as produced by getCitiesByPCode; the first two and first four characters
 *   of `code` form the page route.
 * @returns {Promise<void>} Resolves once the rows have been appended; rejects
 *   if navigation or in-page evaluation fails, in which case `areas` is left
 *   unchanged.
 */
async function getAreasByCCode (page, city) {
  const provincePart = city.code.slice(0, 2);
  const cityPart = city.code.slice(0, 4);
  url = target.replace('#{route}', `${provincePart}/${cityPart}`);
  // Update the status line before navigating so that it names the page being
  // loaded while the (slow) navigation is in progress.
  spinner2.text = chalk.blue(`正在抓取 ${provinces[city.parentCode]}/${city.text} 的县区数据:${url}`);
  await page.goto(url);
  // Only this page's rows cross the browser boundary. Passing the accumulated
  // list in and out on every call would re-serialize all earlier results.
  const rows = await page.evaluate((cityCode) => {
    let list = [...document.querySelectorAll('.countytable .countytr')];
    if (!list.length) {
      // Fallback for pages without county rows (the original note cites
      // Danzhou, Hainan): read the town table instead.
      list = [...document.querySelectorAll('.towntable .towntr')];
    }
    // Each row's innerText is split on tabs; the first two fields are used.
    return list.map(el => {
      const t = el.innerText.split('\t');
      return {
        code: t[0],
        text: t[1],
        parentCode: `${cityCode}`
      };
    });
  }, city.code);
  areas = areas.concat(rows);
}
/**
 * Last-resort handler for promise rejections nobody caught (for example a
 * scrape that fails again on its retry): print the URL that was being
 * requested together with the failure detail, then exit with status 1.
 */
process.on('unhandledRejection', (err) => {
  // A rejection reason is not guaranteed to be an Error; it may even be
  // null/undefined, in which case reading `.message` would throw inside this
  // handler. Fall back to the stringified reason in that case.
  const detail = err != null && err.message !== undefined ? err.message : String(err);
  console.log('\n', chalk.red(`抓取数据失败,失败链接: ${url}\n`), detail);
  process.exit(1);
});
/**
 * Entry point. Runs two phases against a single browser page:
 *   1. Unless city data was already loaded, call getCitiesByPCode for every
 *      entry of `pcodes` and hand the result to writeFileSync as 'cities.js'.
 *   2. Call getAreasByCCode for every entry of `cities` and hand the result to
 *      writeFileSync as 'areas.js'.
 * Each page is attempted at most twice; a failure on the second attempt is not
 * caught here and rejects the returned promise.
 * NOTE(review): `timeout(ms)` from ./utils is presumably a delay used to pace
 * requests — confirm against ./utils.
 */
(async () => {
  spinner1.start(chalk.blue('开始抓取市区数据....'));
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();

    if (!cities.length) {
      for (const pcode of pcodes) {
        await timeout(1500);
        const [err] = await awaitTo(getCitiesByPCode(page, pcode));
        if (err) {
          // This retry mainly covers failures caused by slow navigation
          // (Navigation Timeout Exceeded).
          console.log('\n', chalk.red(`抓取数据失败,失败链接: ${url},错误信息: ${err.message},正在重试....\n`));
          await getCitiesByPCode(page, pcode);
        }
      }
      writeFileSync('cities.js', cities);
      spinner1.succeed(chalk.green('市区数据抓取完毕,开始抓取县区数据....'));
    } else {
      spinner1.succeed(chalk.green('市区数据已经抓取过,开始抓取县区数据....'));
    }

    type = 1;
    console.log('\n');
    spinner2.start(chalk.blue('正在抓取县区数据....'));

    for (const city of cities) {
      await timeout(3000);
      const [err] = await awaitTo(getAreasByCCode(page, city));
      if (err) {
        // This retry mainly covers failures caused by slow navigation
        // (Navigation Timeout Exceeded).
        console.log('\n', chalk.red(`抓取数据失败,失败链接: ${url},错误信息: ${err.message},正在重试....\n`));
        await getAreasByCCode(page, city);
      }
    }
    writeFileSync('areas.js', areas);
    spinner2.succeed(chalk.green('县区数据抓取完毕'));
  } finally {
    // Close the browser on every path, including when a retry fails, so the
    // launched browser is shut down before the error propagates.
    await browser.close();
  }
})();