Skip to content

Commit bcb21cf

Browse files
authored
feat(route/cna): Preserve web crawling method. (#13531)
* feat(route/cna): Preserve web crawling method. * Add doc. * . * Update website/docs/routes/traditional-media.mdx ---------
1 parent 2a24c91 commit bcb21cf

File tree

5 files changed

+74
-0
lines changed

5 files changed

+74
-0
lines changed

lib/v2/cna/maintainer.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
// Maps each route path pattern under this namespace to the list of names
// maintaining that route (presumably GitHub usernames; confirm against project docs).
module.exports = {
22
'/:id?': ['nczitzk'],
3+
'/web/:id?': ['dzx-dzx'],
34
};

lib/v2/cna/radar.js

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,12 @@ module.exports = {
88
source: ['/list/:id', '/topic/newstopic/:id'],
99
target: (params) => `/cna/${params.id.replace('.aspx', '')}`,
1010
},
11+
{
12+
title: '分类(网页爬虫方法)',
13+
docs: 'https://docs.rsshub.app/routes/traditional-media#zhong-yang-tong-xun-she',
14+
source: ['/list/:id'],
15+
target: (params) => `/cna/web/${params.id.replace('.aspx', '')}`,
16+
},
1117
],
1218
},
1319
};

lib/v2/cna/router.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
module.exports = (router) => {
22
router.get('/:id?', require('./'));
3+
router.get('/web/:id?', require('./web/'));
34
};

lib/v2/cna/web/index.js

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
const got = require('@/utils/got');
2+
const cheerio = require('cheerio');
3+
const { parseDate } = require('@/utils/parse-date');
4+
const timezone = require('@/utils/timezone');
5+
6+
module.exports = async (ctx) => {
7+
const id = ctx.params.id || 'aall';
8+
9+
let rootUrl;
10+
11+
if (/^\d+$/.test(id)) {
12+
rootUrl = `https://www.cna.com.tw/topic/newstopic/${id}.aspx`;
13+
} else {
14+
rootUrl = `https://www.cna.com.tw/list/${id}.aspx`;
15+
}
16+
const response = await got({
17+
method: 'get',
18+
url: rootUrl,
19+
});
20+
21+
const $ = cheerio.load(response.data);
22+
const list = $('*:is(.pcBox .caItem, .mainList li a div) h2')
23+
.slice(0, ctx.query.limit ? parseInt(ctx.query.limit) : 10)
24+
.toArray()
25+
.map((item) => {
26+
item = $(item);
27+
return {
28+
title: item.text(),
29+
link: item.parents('a').attr('href'),
30+
pubDate: timezone(parseDate(item.next().text()), +8),
31+
};
32+
});
33+
34+
const items = await Promise.all(
35+
list.map((item) =>
36+
ctx.cache.tryGet(item.link, async () => {
37+
const detailResponse = await got({
38+
method: 'get',
39+
url: item.link,
40+
});
41+
const content = cheerio.load(detailResponse.data);
42+
const topImage = content('.fullPic').html();
43+
44+
item.description = (topImage === null ? '' : topImage) + content('.paragraph').eq(0).html();
45+
item.category = [
46+
...content("meta[property='article:tag']")
47+
.get()
48+
.map((e) => e.attribs.content),
49+
content('.active > a').text(),
50+
];
51+
52+
return item;
53+
})
54+
)
55+
);
56+
57+
ctx.state.data = {
58+
title: $('title').text(),
59+
link: rootUrl,
60+
item: items,
61+
};
62+
};

website/docs/routes/traditional-media.mdx

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2831,6 +2831,10 @@ category 对应的关键词有
28312831

28322832
</Route>
28332833

2834+
### 分类(网页爬虫方法) {#zhong-yang-tong-xun-she-fen-lei-wang-ye-pa-chong-fang-fa}
2835+
2836+
<Route author="dzx-dzx" example="/cna/web/aall" path="/cna/web/:id?" paramsDesc={['分类 id，见上表。此参数默认为 aall']} />
2837+
28342838
## 组织人事报 {#zu-zhi-ren-shi-bao}
28352839

28362840
### 电子报 {#zu-zhi-ren-shi-bao-dian-zi-bao}

0 commit comments

Comments
 (0)