diff --git a/CONTRIBUTORS.rst b/CONTRIBUTORS.rst index 75fc9a9c..934e811b 100644 --- a/CONTRIBUTORS.rst +++ b/CONTRIBUTORS.rst @@ -29,3 +29,4 @@ bug report! * `Aaron Swartz `_ * `Jakub Wilk `_ * `Nestor Rodriguez `_ +* `Rong Zhang `_ diff --git a/changelog.d/20231227_020825_rongronggg9_utf-8_errors_replace.rst b/changelog.d/20231227_020825_rongronggg9_utf-8_errors_replace.rst new file mode 100644 index 00000000..727dbe51 --- /dev/null +++ b/changelog.d/20231227_020825_rongronggg9_utf-8_errors_replace.rst @@ -0,0 +1,7 @@ +Fixed +----- + +* If the metadata of a feed explicitly indicates that the encoding is UTF-8, +try decoding it with ``errors="replace"`` when decoding fails. This prevents +feeds from being decoded with wrong encodings when they are mostly UTF-8 but +contain a few invalid bytes. diff --git a/feedparser/__init__.py b/feedparser/__init__.py index 9fa58e96..cca1dc0a 100644 --- a/feedparser/__init__.py +++ b/feedparser/__init__.py @@ -28,6 +28,7 @@ from .api import parse from .datetimes import registerDateHandler from .exceptions import ( + CharacterEncodingErrorsReplace, CharacterEncodingOverride, CharacterEncodingUnknown, FeedparserError, @@ -64,6 +65,7 @@ "registerDateHandler", "FeedParserDict", "FeedparserError", + "CharacterEncodingErrorsReplace", "CharacterEncodingOverride", "CharacterEncodingUnknown", "NonXMLContentType", diff --git a/feedparser/encodings.py b/feedparser/encodings.py index 01f228d1..35c3b908 100644 --- a/feedparser/encodings.py +++ b/feedparser/encodings.py @@ -47,6 +47,7 @@ def lazy_chardet_encoding(data): from .exceptions import ( + CharacterEncodingErrorsReplace, CharacterEncodingOverride, CharacterEncodingUnknown, FeedparserError, @@ -218,6 +219,21 @@ def convert_to_utf8( http_content_type = http_headers.get("content-type") or "" http_content_type, http_encoding = parse_content_type(http_content_type) + # Some UTF-8 documents may contain invalid characters, resulting in + # falling back to lazy_chardet_encoding or iso-8859-2. 
+ # In such a case, lazy_chardet_encoding may not be able to detect the + # encoding correctly, and iso-8859-2 is clearly a wrong guess. + + # Therefore, we use the utf_8_confident flag to allow decoding UTF-8 documents with + # errors='replace'. + + # Considering the fact that UTF-8 is the most popular encoding, + # the flag can be safely set if any metadata of the document explicitly + # indicates that the encoding is UTF-8. + + # 1st pass: adhere to HTTP encoding (Content-Type) + utf_8_confident = http_encoding == "utf-8" + acceptable_content_type = 0 application_content_types = ( "application/xml", @@ -232,6 +248,11 @@ def convert_to_utf8( and http_content_type.endswith("+xml") ): acceptable_content_type = 1 + # 2nd pass: adhere to the declared XML encoding + # (only when the HTTP headers declare no encoding) + utf_8_confident = utf_8_confident or ( + xml_encoding == "utf-8" and not http_encoding + ) rfc3023_encoding = http_encoding or xml_encoding or "utf-8" elif http_content_type in text_content_types or ( http_content_type.startswith("text/") and http_content_type.endswith("+xml") @@ -298,7 +319,18 @@ def convert_to_utf8( try: text = data.decode(proposed_encoding) except (UnicodeDecodeError, LookupError): - continue + if proposed_encoding != "utf-8" or not utf_8_confident: + continue + # try utf-8 with errors='replace' if we are confident + try: + text = data.decode("utf-8", errors="replace") + error = CharacterEncodingErrorsReplace( + "document explicitly declared its encoding as utf-8, " + "but has encoding errors, " + "which has been replaced with � (U+FFFD)" + ) + except (UnicodeDecodeError, LookupError): + continue known_encoding = True if not json: diff --git a/feedparser/exceptions.py b/feedparser/exceptions.py index 49ca2858..73eba7cf 100644 --- a/feedparser/exceptions.py +++ b/feedparser/exceptions.py @@ -28,6 +28,7 @@ __all__ = [ "FeedparserError", + "CharacterEncodingErrorsReplace", "CharacterEncodingOverride", "CharacterEncodingUnknown", "NonXMLContentType", @@ -39,6 
+40,10 @@ class FeedparserError(Exception): pass +class CharacterEncodingErrorsReplace(FeedparserError): + pass + + class CharacterEncodingOverride(FeedparserError): pass diff --git a/tests/encoding/bozo_http_application_xml_encoding_utf-8_errors_replace.xml b/tests/encoding/bozo_http_application_xml_encoding_utf-8_errors_replace.xml new file mode 100644 index 00000000..bdd6b50b --- /dev/null +++ b/tests/encoding/bozo_http_application_xml_encoding_utf-8_errors_replace.xml @@ -0,0 +1,14 @@ + + + + + + + + + + diff --git a/tests/encoding/bozo_http_charset_utf-8_errors_replace.xml b/tests/encoding/bozo_http_charset_utf-8_errors_replace.xml new file mode 100644 index 00000000..5c724fe1 --- /dev/null +++ b/tests/encoding/bozo_http_charset_utf-8_errors_replace.xml @@ -0,0 +1,14 @@ + + + + + + + + + +