diff --git a/html2text/__init__.py b/html2text/__init__.py
index a937dc7..5471f9f 100644
--- a/html2text/__init__.py
+++ b/html2text/__init__.py
@@ -12,6 +12,7 @@
from ._typing import OutCallback
from .elements import AnchorElement, ListElement
from .utils import (
+ control_character_replacements,
dumb_css_parser,
element_style,
escape_md,
@@ -917,13 +918,14 @@ def charref(self, name: str) -> str:
else:
c = int(name)
+ if not 0 < c < 0x110000 or 0xD800 <= c < 0xE000: # invalid or surrogate
+ c = 0xFFFD # REPLACEMENT CHARACTER
+ c = control_character_replacements.get(c, c)
+
if not self.unicode_snob and c in unifiable_n:
return unifiable_n[c]
else:
- try:
- return chr(c)
- except ValueError: # invalid unicode
- return ""
+ return chr(c)
def entityref(self, c: str) -> str:
if not self.unicode_snob and c in config.UNIFIABLE:
diff --git a/html2text/utils.py b/html2text/utils.py
index 366748b..8f77668 100644
--- a/html2text/utils.py
+++ b/html2text/utils.py
@@ -9,6 +9,37 @@
if k != "nbsp"
}
+# https://html.spec.whatwg.org/multipage/parsing.html#character-reference-code
+control_character_replacements = {  # charref code point -> replacement code point
+ 0x80: 0x20AC, # EURO SIGN (€)
+ 0x82: 0x201A, # SINGLE LOW-9 QUOTATION MARK (‚)
+ 0x83: 0x0192, # LATIN SMALL LETTER F WITH HOOK (ƒ)
+ 0x84: 0x201E, # DOUBLE LOW-9 QUOTATION MARK („)
+ 0x85: 0x2026, # HORIZONTAL ELLIPSIS (…)
+ 0x86: 0x2020, # DAGGER (†)
+ 0x87: 0x2021, # DOUBLE DAGGER (‡)
+ 0x88: 0x02C6, # MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
+ 0x89: 0x2030, # PER MILLE SIGN (‰)
+ 0x8A: 0x0160, # LATIN CAPITAL LETTER S WITH CARON (Š)
+ 0x8B: 0x2039, # SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
+ 0x8C: 0x0152, # LATIN CAPITAL LIGATURE OE (Œ)
+ 0x8E: 0x017D, # LATIN CAPITAL LETTER Z WITH CARON (Ž)
+ 0x91: 0x2018, # LEFT SINGLE QUOTATION MARK (‘)
+ 0x92: 0x2019, # RIGHT SINGLE QUOTATION MARK (’)
+ 0x93: 0x201C, # LEFT DOUBLE QUOTATION MARK (“)
+ 0x94: 0x201D, # RIGHT DOUBLE QUOTATION MARK (”)
+ 0x95: 0x2022, # BULLET (•)
+ 0x96: 0x2013, # EN DASH (–)
+ 0x97: 0x2014, # EM DASH (—)
+ 0x98: 0x02DC, # SMALL TILDE (˜)
+ 0x99: 0x2122, # TRADE MARK SIGN (™)
+ 0x9A: 0x0161, # LATIN SMALL LETTER S WITH CARON (š)
+ 0x9B: 0x203A, # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
+ 0x9C: 0x0153, # LATIN SMALL LIGATURE OE (œ)
+ 0x9E: 0x017E, # LATIN SMALL LETTER Z WITH CARON (ž)
+ 0x9F: 0x0178, # LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
+}
+
def hn(tag: str) -> int:
if tag[0] == "h" and len(tag) == 2:
diff --git a/test/invalid_unicode.html b/test/invalid_unicode.html
index 3dd8b18..f25e754 100644
--- a/test/invalid_unicode.html
+++ b/test/invalid_unicode.html
@@ -1 +1,5 @@
Br
+
+
+
+
diff --git a/test/invalid_unicode.md b/test/invalid_unicode.md
index b028e67..ef04228 100644
--- a/test/invalid_unicode.md
+++ b/test/invalid_unicode.md
@@ -1 +1 @@
-Br
+B�r €‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ����