From 78b55b0681af8a32b046498f6492c92b22d37c0d Mon Sep 17 00:00:00 2001 From: Beat Bolli Date: Mon, 22 Apr 2024 11:46:15 +0200 Subject: [PATCH 1/4] JSONFeed: change item.content into a list The spec allows an item to have both text and HTML elements, and the feedparser content key supports different content types. Adjust the code to look at both source elements and add all that are present. Signed-off-by: Beat Bolli --- feedparser/parsers/json.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/feedparser/parsers/json.py b/feedparser/parsers/json.py index 36f714a5..fb66b7e8 100644 --- a/feedparser/parsers/json.py +++ b/feedparser/parsers/json.py @@ -85,16 +85,21 @@ def parse_entry(self, e): if src in e: entry[dst] = e[src] + content = [] if "content_text" in e: - entry["content"] = c = FeedParserDict() + c = FeedParserDict() c["value"] = e["content_text"] c["type"] = "text" - elif "content_html" in e: - entry["content"] = c = FeedParserDict() + content.append(c) + if "content_html" in e: + c = FeedParserDict() c["value"] = sanitize_html( e["content_html"], self.encoding, "application/json" ) c["type"] = "html" + content.append(c) + if content: + entry["content"] = content if "date_published" in e: entry["published"] = e["date_published"] From 97c648c2bac42ea11c8f2cbf6ddd78ca52ab975c Mon Sep 17 00:00:00 2001 From: Beat Bolli Date: Mon, 22 Apr 2024 12:24:10 +0200 Subject: [PATCH 2/4] JSONfeed: prefer HTML over text content HTML generally has more useful information than text, so let's move it first in the content list. Users who want text can still iterate over the list and pick the text type. 
Signed-off-by: Beat Bolli --- feedparser/parsers/json.py | 10 +++++----- tests/json/html_first.json | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+), 5 deletions(-) create mode 100644 tests/json/html_first.json diff --git a/feedparser/parsers/json.py b/feedparser/parsers/json.py index fb66b7e8..ef6633e8 100644 --- a/feedparser/parsers/json.py +++ b/feedparser/parsers/json.py @@ -86,11 +86,6 @@ def parse_entry(self, e): entry[dst] = e[src] content = [] - if "content_text" in e: - c = FeedParserDict() - c["value"] = e["content_text"] - c["type"] = "text" - content.append(c) if "content_html" in e: c = FeedParserDict() c["value"] = sanitize_html( @@ -98,6 +93,11 @@ def parse_entry(self, e): ) c["type"] = "html" content.append(c) + if "content_text" in e: + c = FeedParserDict() + c["value"] = e["content_text"] + c["type"] = "text" + content.append(c) if content: entry["content"] = content diff --git a/tests/json/html_first.json b/tests/json/html_first.json new file mode 100644 index 00000000..b7339cd3 --- /dev/null +++ b/tests/json/html_first.json @@ -0,0 +1,18 @@ +{ + "__TEST__": "Description: basic JSON tests Expect: not bozo and items[0].content[0].type == 'html' -->", + "version": "https://jsonfeed.org/version/1", + "title": "html_preferred", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "icon": "https://example.org/feed.png", + "author": { "name": "me" }, + "items": [ + { + "id": "1", + "author": { "name": "you", "url": "http://example.net/~you" }, + "content_text": "Hello, world!\n", + "content_html": "

Hello, world!

\n\n", + "url": "https://example.org/initial-post" + } + ] +} From 6a68b8dca380fd3f8ece5746f59e3313d49cf7f9 Mon Sep 17 00:00:00 2001 From: Beat Bolli Date: Mon, 22 Apr 2024 12:28:10 +0200 Subject: [PATCH 3/4] JSONfeed: validate the presence of content_html or content_text The spec says that at least one element is mandatory. Verify this and raise an exception if both are missing. Signed-off-by: Beat Bolli --- feedparser/parsers/json.py | 8 +++++--- tests/json/no_content.json | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) create mode 100644 tests/json/no_content.json diff --git a/feedparser/parsers/json.py b/feedparser/parsers/json.py index ef6633e8..f90f29ab 100644 --- a/feedparser/parsers/json.py +++ b/feedparser/parsers/json.py @@ -85,7 +85,7 @@ def parse_entry(self, e): if src in e: entry[dst] = e[src] - content = [] + entry["content"] = content = [] if "content_html" in e: c = FeedParserDict() c["value"] = sanitize_html( @@ -98,8 +98,10 @@ def parse_entry(self, e): c["value"] = e["content_text"] c["type"] = "text" content.append(c) - if content: - entry["content"] = content + if not content: + raise ValueError( + f"item {entry['id']=} has neither 'content_text' nor 'content_html'" + ) if "date_published" in e: entry["published"] = e["date_published"] diff --git a/tests/json/no_content.json b/tests/json/no_content.json new file mode 100644 index 00000000..296a84ac --- /dev/null +++ b/tests/json/no_content.json @@ -0,0 +1,17 @@ +{ + "__TEST__": "Description: basic JSON tests Expect: bozo and 'neither' in str(bozo_exception) -->", + "version": "https://jsonfeed.org/version/1", + "title": "no content", + "home_page_url": "https://example.org/", + "feed_url": "https://example.org/feed.json", + "icon": "https://example.org/feed.png", + "author": { "name": "me" }, + "items": [ + { + "id": "1", + "author": { "name": "you", "url": "http://example.net/~you" }, + "summary": "Hello, world!\n", + "url": "https://example.org/initial-post" + } + ] +} 
From 52209c95894aaa7bee976966fef70d00e6e052a5 Mon Sep 17 00:00:00 2001 From: Beat Bolli Date: Mon, 22 Apr 2024 12:59:34 +0200 Subject: [PATCH 4/4] Add a changelog.d fragment for the three preceding commits Signed-off-by: Beat Bolli --- changelog.d/20240422_124749_bbolli_jsonfeed_content.rst | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 changelog.d/20240422_124749_bbolli_jsonfeed_content.rst diff --git a/changelog.d/20240422_124749_bbolli_jsonfeed_content.rst b/changelog.d/20240422_124749_bbolli_jsonfeed_content.rst new file mode 100644 index 00000000..eb9e011b --- /dev/null +++ b/changelog.d/20240422_124749_bbolli_jsonfeed_content.rst @@ -0,0 +1,5 @@ +Fixed +----- + +* JSONfeed item content is now a list that prefers HTML content. +* A JSONfeed item without content sets the bozo flag.