Skip to content

Commit 31425dd

Browse files
committed
backporting some changes from readability
1 parent 7155da8 commit 31425dd

File tree

4 files changed

+20
-21
lines changed

4 files changed

+20
-21
lines changed

package.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
   1     1    {
   2     2    "name": "cheer-reader",
   3        - "version": "0.1.1",
         3  + "version": "0.1.2",
   4     4    "description": "A cheerio port of readability.js",
   5     5    "main": "index.js",
   6     6    "types": "dist/index.d.ts",

src/index.ts

Lines changed: 5 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -957,29 +957,18 @@ export class Readability {
 957   957    parseSuccessful = false
 958   958    $page.html(pageCacheHtml)
 959   959
       960  + this.attempts.push({
       961  + articleContent: $articleContent,
       962  + textLength,
       963  + })
       964  +
 960   965    if (this.flagIsActive(FLAG_STRIP_UNLIKELYS)) {
 961   966    this.removeFlag(FLAG_STRIP_UNLIKELYS)
 962        - this.attempts.push({
 963        - articleContent: $articleContent,
 964        - textLength,
 965        - })
 966   967    } else if (this.flagIsActive(FLAG_WEIGHT_CLASSES)) {
 967   968    this.removeFlag(FLAG_WEIGHT_CLASSES)
 968        - this.attempts.push({
 969        - articleContent: $articleContent,
 970        - textLength,
 971        - })
 972   969    } else if (this.flagIsActive(FLAG_CLEAN_CONDITIONALLY)) {
 973   970    this.removeFlag(FLAG_CLEAN_CONDITIONALLY)
 974        - this.attempts.push({
 975        - articleContent: $articleContent,
 976        - textLength,
 977        - })
 978   971    } else {
 979        - this.attempts.push({
 980        - articleContent: $articleContent,
 981        - textLength,
 982        - })
 983   972    // No luck after removing flags, just return the longest text we found during the different loops
 984   973    this.attempts.sort(function (a, b) {
 985   974    return b.textLength - a.textLength

src/regexes.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@ export const okMaybeItsACandidate =
   8     8    export const positive =
   9     9    /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i
  10    10    export const negative =
  11        - /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i
        11  + /-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|widget/i
  12    12    export const extraneous =
  13    13    /print|archive|comment|discuss|e[-]?mail|share|reply|all|login|sign|single|utility/i
  14    14    export const replaceFonts = /<(\/?)font[^>]*>/gi

src/unescapeHtmlEntities.ts

Lines changed: 13 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -18,8 +18,18 @@ export function unescapeHtmlEntities(str: string | null | undefined) {
  18    18    const index = tag as keyof typeof HTML_ESCAPE_MAP
  19    19    return HTML_ESCAPE_MAP[index]
  20    20    })
  21        - .replace(/&#(?:x([0-9a-z]{1,4})|([0-9]{1,4}));/gi, (_, hex, numStr) => {
  22        - const num = parseInt(hex || numStr, hex ? 16 : 10)
  23        - return String.fromCharCode(num)
        21  + .replace(/&#(?:x([0-9a-f]+)|([0-9]+));/gi, function (_, hex, numStr) {
        22  + let num = parseInt(hex || numStr, hex ? 16 : 10)
        23  +
        24  + // these character references are replaced by a conforming HTML parser
        25  + if (
        26  + num == 0 ||
        27  + num > 0x10ffff ||
        28  + (num >= 0xd800 && num <= 0xdfff)
        29  + ) {
        30  + num = 0xfffd
        31  + }
        32  +
        33  + return String.fromCodePoint(num)
  24    34    })
  25    35    }

0 commit comments

Comments
 (0)