Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions html2text/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from ._typing import OutCallback
from .elements import AnchorElement, ListElement
from .utils import (
control_character_replacements,
dumb_css_parser,
element_style,
escape_md,
Expand Down Expand Up @@ -917,13 +918,14 @@ def charref(self, name: str) -> str:
else:
c = int(name)

if not 0 < c < 0x110000 or 0xD800 <= c < 0xE000: # invalid or surrogate
c = 0xFFFD # REPLACEMENT CHARACTER
c = control_character_replacements.get(c, c)

if not self.unicode_snob and c in unifiable_n:
return unifiable_n[c]
else:
try:
return chr(c)
except ValueError: # invalid unicode
return ""
return chr(c)

def entityref(self, c: str) -> str:
if not self.unicode_snob and c in config.UNIFIABLE:
Expand Down
31 changes: 31 additions & 0 deletions html2text/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,37 @@
if k != "nbsp"
}

# Replacement table for numeric character references whose code point falls in
# the 0x80-0x9F range. Each pair is (referenced code point, code point to emit
# instead). Code points in that range with no entry here (0x81, 0x8D, 0x8F,
# 0x90, 0x9D) have no replacement defined in this table.
# https://html.spec.whatwg.org/multipage/parsing.html#character-reference-code
control_character_replacements = dict(
    (
        (0x80, 0x20AC),  # EURO SIGN (€)
        (0x82, 0x201A),  # SINGLE LOW-9 QUOTATION MARK (‚)
        (0x83, 0x0192),  # LATIN SMALL LETTER F WITH HOOK (ƒ)
        (0x84, 0x201E),  # DOUBLE LOW-9 QUOTATION MARK („)
        (0x85, 0x2026),  # HORIZONTAL ELLIPSIS (…)
        (0x86, 0x2020),  # DAGGER (†)
        (0x87, 0x2021),  # DOUBLE DAGGER (‡)
        (0x88, 0x02C6),  # MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
        (0x89, 0x2030),  # PER MILLE SIGN (‰)
        (0x8A, 0x0160),  # LATIN CAPITAL LETTER S WITH CARON (Š)
        (0x8B, 0x2039),  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
        (0x8C, 0x0152),  # LATIN CAPITAL LIGATURE OE (Œ)
        (0x8E, 0x017D),  # LATIN CAPITAL LETTER Z WITH CARON (Ž)
        (0x91, 0x2018),  # LEFT SINGLE QUOTATION MARK (‘)
        (0x92, 0x2019),  # RIGHT SINGLE QUOTATION MARK (’)
        (0x93, 0x201C),  # LEFT DOUBLE QUOTATION MARK (“)
        (0x94, 0x201D),  # RIGHT DOUBLE QUOTATION MARK (”)
        (0x95, 0x2022),  # BULLET (•)
        (0x96, 0x2013),  # EN DASH (–)
        (0x97, 0x2014),  # EM DASH (—)
        (0x98, 0x02DC),  # SMALL TILDE (˜)
        (0x99, 0x2122),  # TRADE MARK SIGN (™)
        (0x9A, 0x0161),  # LATIN SMALL LETTER S WITH CARON (š)
        (0x9B, 0x203A),  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
        (0x9C, 0x0153),  # LATIN SMALL LIGATURE OE (œ)
        (0x9E, 0x017E),  # LATIN SMALL LETTER Z WITH CARON (ž)
        (0x9F, 0x0178),  # LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
    )
)


def hn(tag: str) -> int:
if tag[0] == "h" and len(tag) == 2:
Expand Down
4 changes: 4 additions & 0 deletions test/invalid_unicode.html
Original file line number Diff line number Diff line change
@@ -1 +1,5 @@
B&#3291685;r

&#x80;&#x82;&#x83;&#x84;&#x85;&#x86;&#x87;&#x88;&#x89;&#x8a;&#x8b;&#x8c;&#x8e;&#x91;&#x92;&#x93;&#x94;&#x95;&#x96;&#x97;&#x98;&#x99;&#x9a;&#x9b;&#x9c;&#x9e;&#x9f;

&#0;&#xd800;&#xdfff;&#x110000;
2 changes: 1 addition & 1 deletion test/invalid_unicode.md
Original file line number Diff line number Diff line change
@@ -1 +1 @@
Br
B�r €‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ����