feat(route/cna): Preserve web crawling method. (#13531)

dzx-dzx · web-flow · commit bcb21cf5cd7d · 2023-12-02T23:02:30.000+08:00
* feat(route/cna): Preserve web crawling method.

* Add doc.

* .

* Update website/docs/routes/traditional-media.mdx

---------
diff --git a/lib/v2/cna/maintainer.js b/lib/v2/cna/maintainer.js
@@ -1,3 +1,4 @@
 module.exports = {
     '/:id?': ['nczitzk'],
+    '/web/:id?': ['dzx-dzx'],
 };
diff --git a/lib/v2/cna/radar.js b/lib/v2/cna/radar.js
@@ -8,6 +8,12 @@ module.exports = {
                 source: ['/list/:id', '/topic/newstopic/:id'],
                 target: (params) => `/cna/${params.id.replace('.aspx', '')}`,
             },
+            {
+                title: '分类(网页爬虫方法)',
+                docs: 'https://docs.rsshub.app/routes/traditional-media#zhong-yang-tong-xun-she',
+                source: ['/list/:id'],
+                target: (params) => `/cna/web/${params.id.replace('.aspx', '')}`,
+            },
         ],
     },
 };
diff --git a/lib/v2/cna/router.js b/lib/v2/cna/router.js
@@ -1,3 +1,4 @@
 module.exports = (router) => {
     router.get('/:id?', require('./'));
+    router.get('/web/:id?', require('./web/'));
 };
diff --git a/lib/v2/cna/web/index.js b/lib/v2/cna/web/index.js
@@ -0,0 +1,78 @@
+const got = require('@/utils/got');
+const cheerio = require('cheerio');
+const { parseDate } = require('@/utils/parse-date');
+const timezone = require('@/utils/timezone');
+
+// Number of entries returned when the request carries no usable `limit`.
+const DEFAULT_LIMIT = 10;
+
+/**
+ * Route handler for `/cna/web/:id?`: scrapes a CNA listing page and each linked
+ * article page, then publishes the resulting feed on `ctx.state.data`.
+ *
+ * @param {object} ctx - Request context; reads `ctx.params.id` and `ctx.query.limit`, uses `ctx.cache.tryGet`.
+ * @returns {Promise<void>}
+ */
+module.exports = async (ctx) => {
+    const id = ctx.params.id || 'aall';
+
+    // All-digit ids address topic pages; anything else is treated as a category list.
+    const rootUrl = /^\d+$/.test(id) ? `https://www.cna.com.tw/topic/newstopic/${id}.aspx` : `https://www.cna.com.tw/list/${id}.aspx`;
+
+    // A missing, unparsable or non-positive `limit` falls back to the default;
+    // previously an unparsable value became NaN and silently produced an empty feed.
+    const requestedLimit = Number.parseInt(ctx.query.limit, 10);
+    const limit = requestedLimit > 0 ? requestedLimit : DEFAULT_LIMIT;
+
+    const response = await got({
+        method: 'get',
+        url: rootUrl,
+    });
+
+    const $ = cheerio.load(response.data);
+    const list = $('*:is(.pcBox .caItem, .mainList li a div) h2')
+        .toArray()
+        .map((element) => $(element))
+        // A heading without an enclosing link can neither be fetched nor used as a cache key.
+        .filter((heading) => heading.parents('a').attr('href'))
+        .slice(0, limit)
+        .map((heading) => ({
+            title: heading.text(),
+            // Resolve against the listing URL so a relative href still yields a fetchable link.
+            link: new URL(heading.parents('a').attr('href'), rootUrl).href,
+            pubDate: timezone(parseDate(heading.next().text()), +8),
+        }));
+
+    const items = await Promise.all(
+        list.map((item) =>
+            ctx.cache.tryGet(item.link, async () => {
+                const detailResponse = await got({
+                    method: 'get',
+                    url: item.link,
+                });
+                const content = cheerio.load(detailResponse.data);
+
+                // `.html()` yields null when the selector matches nothing; fall back to ''
+                // so the description never contains the literal text "null".
+                const topImage = content('.fullPic').html() || '';
+                const body = content('.paragraph').eq(0).html() || '';
+
+                item.description = topImage + body;
+                item.category = [
+                    ...content("meta[property='article:tag']")
+                        .get()
+                        .map((e) => e.attribs.content),
+                    content('.active > a').text(),
+                ].filter(Boolean); // drop empty entries, e.g. when no active section link exists
+
+                return item;
+            })
+        )
+    );
+
+    ctx.state.data = {
+        title: $('title').text(),
+        link: rootUrl,
+        item: items,
+    };
+};
diff --git a/website/docs/routes/traditional-media.mdx b/website/docs/routes/traditional-media.mdx
@@ -2831,6 +2831,10 @@ category 对应的关键词有
 
 </Route>
 
+### 分类(网页爬虫方法) {#zhong-yang-tong-xun-she-fen-lei-wang-ye-pa-chong-fang-fa}
+
+<Route author="dzx-dzx" example="/cna/web/aall" path="/cna/web/:id?" paramsDesc={['分类 id，见上表。此参数默认为 aall']} />
+
 ## 组织人事报 {#zu-zhi-ren-shi-bao}
 
 ### 电子报 {#zu-zhi-ren-shi-bao-dian-zi-bao}