diff --git a/CONTRIBUTORS.rst b/CONTRIBUTORS.rst
index 75fc9a9c..934e811b 100644
--- a/CONTRIBUTORS.rst
+++ b/CONTRIBUTORS.rst
@@ -29,3 +29,4 @@ bug report!
* `Aaron Swartz `_
* `Jakub Wilk `_
* `Nestor Rodriguez `_
+* `Rong Zhang `_
diff --git a/changelog.d/20231227_020825_rongronggg9_utf-8_errors_replace.rst b/changelog.d/20231227_020825_rongronggg9_utf-8_errors_replace.rst
new file mode 100644
index 00000000..727dbe51
--- /dev/null
+++ b/changelog.d/20231227_020825_rongronggg9_utf-8_errors_replace.rst
@@ -0,0 +1,7 @@
+Fixed
+-----
+
+* If the metadata of a feed explicitly indicates that the encoding is UTF-8,
+retry decoding with ``errors="replace"`` when strict decoding fails. This
+prevents feeds from being decoded with a wrong encoding when they are mostly
+UTF-8 but contain a few invalid bytes.
diff --git a/feedparser/__init__.py b/feedparser/__init__.py
index 9fa58e96..cca1dc0a 100644
--- a/feedparser/__init__.py
+++ b/feedparser/__init__.py
@@ -28,6 +28,7 @@
from .api import parse
from .datetimes import registerDateHandler
from .exceptions import (
+ CharacterEncodingErrorsReplace,
CharacterEncodingOverride,
CharacterEncodingUnknown,
FeedparserError,
@@ -64,6 +65,7 @@
"registerDateHandler",
"FeedParserDict",
"FeedparserError",
+ "CharacterEncodingErrorsReplace",
"CharacterEncodingOverride",
"CharacterEncodingUnknown",
"NonXMLContentType",
diff --git a/feedparser/encodings.py b/feedparser/encodings.py
index 01f228d1..35c3b908 100644
--- a/feedparser/encodings.py
+++ b/feedparser/encodings.py
@@ -47,6 +47,7 @@ def lazy_chardet_encoding(data):
from .exceptions import (
+ CharacterEncodingErrorsReplace,
CharacterEncodingOverride,
CharacterEncodingUnknown,
FeedparserError,
@@ -218,6 +219,21 @@ def convert_to_utf8(
http_content_type = http_headers.get("content-type") or ""
http_content_type, http_encoding = parse_content_type(http_content_type)
+ # Some UTF-8 documents contain a few invalid byte sequences, which makes
+ # strict decoding fail and fall back to lazy_chardet_encoding or iso-8859-2.
+ # In such a case, lazy_chardet_encoding may not detect the encoding
+ # correctly, and iso-8859-2 is clearly a wrong guess.
+
+ # Therefore, we use a flag (utf_8_confident) to allow decoding UTF-8
+ # documents with errors='replace'.
+
+ # Since UTF-8 is by far the most popular encoding, the flag can safely be
+ # set whenever any metadata of the document explicitly indicates that the
+ # encoding is UTF-8.
+
+ # 1st pass: adhere to HTTP encoding (Content-Type)
+ utf_8_confident = http_encoding == "utf-8"
+
acceptable_content_type = 0
application_content_types = (
"application/xml",
@@ -232,6 +248,11 @@ def convert_to_utf8(
and http_content_type.endswith("+xml")
):
acceptable_content_type = 1
+ # 2nd pass: adhere to the declared XML encoding
+ # (but not in the inconsistent case)
+ utf_8_confident = utf_8_confident or (
+ xml_encoding == "utf-8" and not http_encoding
+ )
rfc3023_encoding = http_encoding or xml_encoding or "utf-8"
elif http_content_type in text_content_types or (
http_content_type.startswith("text/") and http_content_type.endswith("+xml")
@@ -298,7 +319,18 @@ def convert_to_utf8(
try:
text = data.decode(proposed_encoding)
except (UnicodeDecodeError, LookupError):
- continue
+ if proposed_encoding != "utf-8" or not utf_8_confident:
+ continue
+ # try utf-8 with errors='replace' if we are confident
+ try:
+ text = data.decode("utf-8", errors="replace")
+ error = CharacterEncodingErrorsReplace(
+ "document explicitly declared its encoding as utf-8, "
+ "but has encoding errors, "
+ "which has been replaced with � (U+FFFD)"
+ )
+ except (UnicodeDecodeError, LookupError):
+ continue
known_encoding = True
if not json:
diff --git a/feedparser/exceptions.py b/feedparser/exceptions.py
index 49ca2858..73eba7cf 100644
--- a/feedparser/exceptions.py
+++ b/feedparser/exceptions.py
@@ -28,6 +28,7 @@
__all__ = [
"FeedparserError",
+ "CharacterEncodingErrorsReplace",
"CharacterEncodingOverride",
"CharacterEncodingUnknown",
"NonXMLContentType",
@@ -39,6 +40,10 @@ class FeedparserError(Exception):
pass
+class CharacterEncodingErrorsReplace(FeedparserError):
+ pass
+
+
class CharacterEncodingOverride(FeedparserError):
pass
diff --git a/tests/encoding/bozo_http_application_xml_encoding_utf-8_errors_replace.xml b/tests/encoding/bozo_http_application_xml_encoding_utf-8_errors_replace.xml
new file mode 100644
index 00000000..bdd6b50b
--- /dev/null
+++ b/tests/encoding/bozo_http_application_xml_encoding_utf-8_errors_replace.xml
@@ -0,0 +1,14 @@
+
+
+
+
+
+-
+
+
+
+
diff --git a/tests/encoding/bozo_http_charset_utf-8_errors_replace.xml b/tests/encoding/bozo_http_charset_utf-8_errors_replace.xml
new file mode 100644
index 00000000..5c724fe1
--- /dev/null
+++ b/tests/encoding/bozo_http_charset_utf-8_errors_replace.xml
@@ -0,0 +1,14 @@
+
+
+
+
+
+-
+
+
+
+