1
1
const parser = require ( '@/utils/rss-parser' ) ;
2
2
const cheerio = require ( 'cheerio' ) ;
3
3
const got = require ( '@/utils/got' ) ;
4
+ const dayjs = require ( 'dayjs' ) ;
5
+ dayjs . extend ( require ( 'dayjs/plugin/utc' ) ) ;
6
+ dayjs . extend ( require ( 'dayjs/plugin/timezone' ) ) ;
7
+
8
// Cache key prefixes; each one namespaces a different kind of cached value.
// Legacy scan.php URL -> URL reported by the response for that request.
const redirectCacheKey = 'phoronix:redirect';
// Listing page URL -> feed skeleton parsed from that page (or the '404' sentinel).
const webArticlesCacheKey = 'phoronix:web-articles';
// Article URL -> fully scraped feed item.
const articleCacheKey = 'phoronix:articles';

const baseUrl = 'https://www.phoronix.com';
// Site-wide RSS endpoint. No whitespace may appear between the origin and the path.
const rssUrl = `${baseUrl}/rss.php`;
6
14
7
- module . exports = async ( ctx ) => {
8
- const { page, queryOrItem } = ctx . params ;
9
- const rssUrl = new URL ( '/rss.php' , baseUrl ) ;
10
- rssUrl . searchParams . set ( 'page' , page ) ;
15
/**
 * Fetch the site-wide RSS feed and wrap it in the feed object shape used by this route.
 *
 * @returns {Promise<object>} Feed metadata (`title`, `link`, `description`, `language`,
 *   static branding URLs and categories) plus `item`, the entries parsed from the RSS feed.
 */
const feedFetch = async () => {
    const feed = await parser.parseURL(rssUrl);
    return {
        title: feed.title,
        link: feed.link,
        description: feed.description,
        item: feed.items,
        language: feed.language,
        icon: 'https://www.phoronix.com/android-chrome-192x192.png',
        image: 'https://www.phoronix.com/android-chrome-192x192.png',
        logo: 'https://www.phoronix.com/phxcms7-css/phoronix.png',
        // Copied from the web page metadata
        category: [
            'Linux Hardware Reviews',
            'Linux hardware benchmarks',
            'Linux Hardware',
            'Linux benchmarking',
            'Desktop Linux',
            'GNU/Linux benchmarks',
            'Open Source AMD',
            'Linux How To',
            'X.Org drivers',
            'Ubuntu hardware',
            'Phoronix Test Suite',
        ],
    };
};
42
+
43
/**
 * Build a feed skeleton from an already-fetched listing page.
 *
 * @param {{ body: string, url: string }} response - HTTP response; `body` is parsed as HTML
 *   and `url` is used as the feed link.
 * @returns {object} Feed metadata read from the page plus `item`, an array of `{ link }`
 *   stubs (absolute article URLs) that still have to be scraped individually.
 */
const webFetchCb = (response) => {
    const $ = cheerio.load(response.body);

    // De-duplicate every href under #main first, then keep only article paths.
    // Anchors without an href yield `undefined`, which the filter drops.
    const articlePaths = [
        ...new Set(
            $('#main a')
                .toArray()
                .map((e) => e.attribs.href)
        ),
    ].filter((link) => link && (link.startsWith('/review/') || link.startsWith('/news/')));

    return {
        title: $('title').text(),
        link: response.url,
        description: $('meta[name="Description"]').attr('content'),
        item: articlePaths.map((link) => ({ link: `${baseUrl}${link}` })),
        language: 'en-us',
        icon: 'https://www.phoronix.com/android-chrome-192x192.png',
        image: 'https://www.phoronix.com/android-chrome-192x192.png',
        logo: 'https://www.phoronix.com/phxcms7-css/phoronix.png',
        // A page without a keywords meta tag yields an empty list instead of throwing.
        category: $('meta[name="keywords"]').attr('content')?.split(', ') ?? [],
    };
};
65
+
66
/**
 * Fetch and parse a listing page through the cache.
 *
 * @param {object} ctx - Request context; only `ctx.cache.tryGet` is used here.
 * @param {string} url - Absolute URL of the listing page.
 * @returns {Promise<object|string>} The feed skeleton produced by `webFetchCb`, or the
 *   string sentinel `'404'` when the request fails with an HTTP 404.
 * @throws Any other request error is rethrown unchanged.
 */
const webFetch = (ctx, url) =>
    ctx.cache.tryGet(`${webArticlesCacheKey}:${url}`, async () => {
        try {
            return webFetchCb(await got(url));
        } catch (error) {
            // Optional chaining: an HTTPError carrying no response must surface as itself,
            // not be masked by a TypeError raised while inspecting it.
            if (error.name === 'HTTPError' && error.response?.statusCode === 404) {
                return '404';
            }
            throw error;
        }
    });
77
+
78
/**
 * Resolve a legacy `/scan.php` style request to its listing page and return that page's feed.
 *
 * The URL reported by the response to the legacy request is cached, so later calls can go
 * straight to the cached listing page without requesting scan.php again.
 *
 * @param {object} ctx - Request context; `ctx.cache.tryGet` and `ctx.cache.set` are used.
 * @param {string} page - Value for the `page` query parameter.
 * @param {string} [queryOrItem] - Sent as `item` when `page` is `'category'`, otherwise as `q`.
 * @returns {Promise<object|string>} Feed skeleton, or whatever `webFetch` yields for the
 *   cached target URL (which can be the `'404'` sentinel).
 */
const legacyFetch = async (ctx, page, queryOrItem) => {
    const legacyUrl = new URL('/scan.php', baseUrl);
    legacyUrl.searchParams.set('page', page);
    if (queryOrItem) {
        legacyUrl.searchParams.set(page === 'category' ? 'item' : 'q', queryOrItem);
    }
    const legacyHref = legacyUrl.toString();

    // `response` is only assigned when the cache misses and the request is actually made.
    let response;
    const webUrl = await ctx.cache.tryGet(`${redirectCacheKey}:${legacyHref}`, async () => {
        response = await got(legacyHref);
        return response.url;
    });

    if (response) {
        // The page body is already in hand: parse it and seed the listing cache so that
        // webFetch() does not have to download the same page again.
        const feed = webFetchCb(response);
        // Awaited so that a failing cache write is reported instead of left floating.
        await ctx.cache.set(`${webArticlesCacheKey}:${webUrl}`, feed);
        return feed;
    }
    return webFetch(ctx, webUrl);
};
20
101
21
- const items = await Promise . all (
22
- feed . items . map ( ( item ) =>
23
- ctx . cache . tryGet ( item . link , async ( ) => {
102
/**
 * Fetch a listing page by its path, falling back to the legacy lookup when that path 404s.
 *
 * @param {object} ctx - Request context, passed through to `webFetch` / `legacyFetch`.
 * @param {string} category - First path segment of the listing page.
 * @param {string} [topic] - Optional second path segment.
 * @returns {Promise<object|string>} The feed from `webFetch`, or the result of
 *   `legacyFetch(ctx, category, topic)` when `webFetch` reports a 404.
 */
const tryFetch = async (ctx, category, topic) => {
    const webUrl = topic ? `${baseUrl}/${category}/${topic}` : `${baseUrl}/${category}`;
    const feed = await webFetch(ctx, webUrl);
    // Compared via String() so the fallback still triggers if the sentinel comes back as the
    // number 404 rather than the string '404'.
    // NOTE(review): presumably possible if the cache layer JSON-parses stored values on read;
    // verify against the cache implementation.
    if (String(feed) === '404') {
        return legacyFetch(ctx, category, topic);
    }
    return feed;
};
110
+
111
+ module . exports = async ( ctx ) => {
112
+ const { category, topic } = ctx . params ;
113
+ let feed ;
114
+ switch ( category ) {
115
+ case 'category' :
116
+ case 'news_topic' :
117
+ feed = await legacyFetch ( ctx , category , topic ) ;
118
+ break ;
119
+ case 'rss' :
120
+ feed = await feedFetch ( ) ;
121
+ break ;
122
+ default :
123
+ feed = category ? await tryFetch ( ctx , category , topic ) : await feedFetch ( ) ;
124
+ break ;
125
+ }
126
+
127
+ feed . item = await Promise . all (
128
+ feed . item . map ( ( item ) =>
129
+ ctx . cache . tryGet ( `${ articleCacheKey } :${ item . link } ` , async ( ) => {
24
130
const response = await got ( item . link ) ;
25
131
const html = response . body ;
26
132
const $ = cheerio . load ( html ) ;
27
133
const content = $ ( '.content' ) ;
28
134
29
135
// Author
30
136
const authorSelector = $ ( '.author > a' ) ;
31
- // thel last 2 are the category and comments
137
+ // the last 2 are the category and comments
32
138
const author = authorSelector
33
139
. slice ( 0 , authorSelector . length - 2 )
34
140
. toArray ( )
35
141
. map ( ( e ) => $ ( e ) . text ( ) ) ;
142
+ const category = [ ] ;
143
+ if ( item . link . includes ( '/news/' ) ) {
144
+ category . push ( 'News' ) ;
145
+ } else if ( item . link . includes ( '/review/' ) ) {
146
+ category . push ( 'Review' ) ;
147
+ }
148
+ const categorySelector = authorSelector . eq ( - 2 ) ;
149
+ if ( categorySelector . length ) {
150
+ category . push ( categorySelector . text ( ) ) ;
151
+ }
152
+ let pubDate ;
153
+ if ( ! item . pubDate ) {
154
+ // the text next to the category is the date
155
+ let pubDateReadable = categorySelector . length && categorySelector [ 0 ] . nextSibling ?. nodeValue ;
156
+ if ( pubDateReadable ) {
157
+ pubDateReadable = pubDateReadable . replace ( / o n | a t | \. / g, '' ) . trim ( ) ;
158
+ if ( / \d { 4 } $ / . test ( pubDateReadable ) ) {
159
+ // Only date, no time
160
+ // Michael Larabel lives in Indiana, USA, so we assume TZ=America/Indiana/Indianapolis
161
+ // https://www.phoronix.com/review/phoronix_office_2014
162
+ // Here we use the trick to take daylight saving into account.
163
+ pubDate = dayjs
164
+ // If we don't append "UTC" at the end,
165
+ // dayjs.utc() may still parse the date in the platform (local) timezone.
166
+ // E.g., if the platform timezone is UTC+8, then:
167
+ // > dayjs.utc('2 Dec 2023').toString()
168
+ // 'Fri, 01 Dec 2023 16:00:00 GMT'
169
+ // > dayjs.utc('2 Dec 2023 UTC').toString()
170
+ // 'Sat, 02 Dec 2023 00:00:00 GMT'
171
+ // Append "UTC" at the end to explicitly prohibit the weird behavior.
172
+ . utc ( `${ pubDateReadable } 08:00 UTC` )
173
+ . tz ( 'America/Indiana/Indianapolis' , true ) ;
174
+ } else {
175
+ // date, time, and timezone (including daylight saving)
176
+ pubDate = dayjs ( pubDateReadable ) ;
177
+ }
178
+ if ( ! pubDate . isValid ( ) ) {
179
+ pubDate = pubDateReadable ;
180
+ }
181
+ }
182
+ }
36
183
37
184
// Maybe it's paginated
38
185
const links = $ ( '.pagination > a' )
@@ -55,53 +202,35 @@ module.exports = async (ctx) => {
55
202
content . append ( pages ) ;
56
203
}
57
204
58
- // Summary
59
- const summary = $ ( '.content > p:nth-child(1)' ) ;
60
-
61
- // High res images
62
- content . find ( 'img' ) . each ( ( _ , img ) => {
63
- if ( img . attribs . src . endsWith ( '_med' ) ) {
64
- img . attribs . src = img . attribs . src . replace ( '_med' , '_show' ) ;
205
+ const images = content . find ( 'img' ) ;
206
+ // Remove topic image
207
+ const topicImage = images . first ( ) ;
208
+ if ( topicImage . attr ( 'src' ) ?. startsWith ( '/assets/categories/' ) ) {
209
+ const topicImageContainer = topicImage . parent ( ) ;
210
+ if ( ! topicImageContainer . text ( ) . trim ( ) ) {
211
+ topicImageContainer . remove ( ) ;
212
+ } else {
213
+ topicImage . remove ( ) ;
65
214
}
215
+ }
216
+ // High-res images
217
+ images . each ( ( _ , img ) => {
218
+ img . attribs . src = img . attribs . src . replace ( / _ m e d $ / , '' ) ;
66
219
} ) ;
67
220
68
221
return {
69
- title : item . title ,
70
- id : item . guid ,
71
- pubDate : item . pubDate ,
222
+ title : item . title || $ ( 'article h1' ) . text ( ) ,
223
+ pubDate : item . pubDate || pubDate ,
72
224
author : author . join ( ', ' ) ,
73
225
link : item . link ,
74
- summary : summary . html ( ) ,
226
+ summary : $ ( 'meta[name="twitter:description"]' ) . attr ( 'content' ) ,
75
227
description : content . html ( ) ,
76
- icon : 'https://www.phoronix.com/android-chrome-192x192.png' ,
77
- logo : 'https://www.phoronix.com/phxcms7-css/phoronix.png' ,
228
+ image : $ ( 'meta[name="twitter:image"]' ) . attr ( 'content' ) ,
229
+ category : item . category || category ,
78
230
} ;
79
231
} )
80
232
)
81
233
) ;
82
234
83
- ctx . state . data = {
84
- title : feed . title ,
85
- link : feed . link ,
86
- description : feed . description ,
87
- item : items ,
88
- language : feed . language ,
89
- icon : 'https://www.phoronix.com/android-chrome-192x192.png' ,
90
- image : 'https://www.phoronix.com/android-chrome-192x192.png' ,
91
- logo : 'https://www.phoronix.com/phxcms7-css/phoronix.png' ,
92
- // Copied from thier web page metadata
93
- category : [
94
- 'Linux Hardware Reviews' ,
95
- 'Linux hardware benchmarks' ,
96
- 'Linux Hardware' ,
97
- 'Linux benchmarking' ,
98
- 'Desktop Linux' ,
99
- 'GNU/Linux benchmarks' ,
100
- 'Open Source AMD' ,
101
- 'Linux How To' ,
102
- 'X.Org drivers' ,
103
- 'Ubuntu hardware' ,
104
- 'Phoronix Test Suite' ,
105
- ] ,
106
- } ;
235
+ ctx . state . data = feed ;
107
236
} ;
0 commit comments