diff --git a/docs/character-encoding.rst b/docs/character-encoding.rst index b8fb217f..17659774 100644 --- a/docs/character-encoding.rst +++ b/docs/character-encoding.rst @@ -99,6 +99,13 @@ sets the ``bozo`` bit to ``1`` and sets ``bozo_exception`` to ``feedparser.CharacterEncodingUnknown``. In this case, parsed values will be strings, not Unicode strings. +.. note:: + + Python exceptions cannot always be safely serialized between processes. + When parsing runs in a process started by ``multiprocessing``, + ``bozo_exception`` will therefore contain a string representation of the + exception instead of the exception object itself. + Handling Incorrectly-Declared Media Types ----------------------------------------- diff --git a/feedparser/api.py b/feedparser/api.py index 1c9cf15d..50800421 100644 --- a/feedparser/api.py +++ b/feedparser/api.py @@ -27,6 +27,7 @@ # POSSIBILITY OF SUCH DAMAGE. import io +import multiprocessing import urllib.error import urllib.parse import xml.sax @@ -374,3 +375,10 @@ def _parse_file_inplace( result["namespaces"] = {} else: result["namespaces"] = feed_parser.namespaces_in_use + + if "bozo_exception" in result and _is_multiprocessing(): + result["bozo_exception"] = str(result["bozo_exception"]) + + +def _is_multiprocessing(): + return multiprocessing.parent_process() is not None diff --git a/tests/test_multiprocessing.py b/tests/test_multiprocessing.py new file mode 100644 index 00000000..ae7bee19 --- /dev/null +++ b/tests/test_multiprocessing.py @@ -0,0 +1,72 @@ +from concurrent.futures import ProcessPoolExecutor + +import pytest + +import feedparser + +base_feed_str = ( + b"<?xml version='1.0' encoding='UTF-8'?>\n" + b'<rss version="2.0">\n' + b"<channel>\n" + b"<title>Foo</title>\n" + b"<link>https://foo.com/</link>\n" + b"<item>" + b"<title>Title 1</title>" + b"<link>https://foo.com/1</link>" + b"<pubDate>Thu, 05 Jun 2025 18:27:58 -0000</pubDate>" + b"</item>\n" + b"</channel>\n" + b"</rss>\n" +) + + +def _parse_and_return_full(raw_feed: bytes): + return feedparser.parse(raw_feed) + + +@pytest.mark.parametrize( + "feed, expected_title, expected_bozo_exception, expected_items", + [ + ( + base_feed_str, + "Foo", + None, + [ + 
dict( + title="Title 1", + link="https://foo.com/1", + published="Thu, 05 Jun 2025 18:27:58 -0000", + ) + ], + ), + ( + b"\n" + base_feed_str, + "Foo", + "XML or text declaration not at start of entity", + [ + dict( + title="Title 1", + link="https://foo.com/1", + published="Thu, 05 Jun 2025 18:27:58 -0000", + ) + ], + ), + ], + ids=["correct_feed", "leading_newline_feed"], +) +def test_multiprocessing_parse( + feed, expected_title, expected_bozo_exception, expected_items +): + with ProcessPoolExecutor(1) as pool: + future = pool.submit(_parse_and_return_full, feed) + result = future.result() + + assert result["feed"]["title"] == expected_title + if expected_bozo_exception: + assert expected_bozo_exception in result.get("bozo_exception") + else: + assert result.get("bozo_exception") is None + for observed, expected in zip(result["entries"], expected_items, strict=True): + assert observed["published"] == expected["published"] + assert observed["link"] == expected["link"] + assert observed["title"] == expected["title"]