From a9ef96c31b50654b63655f19a05f88d25b2fb14e Mon Sep 17 00:00:00 2001 From: Ivan Habunek Date: Sat, 4 Nov 2023 07:26:45 +0100 Subject: [PATCH 01/15] Cleanup formatting --- toot/tui/richtext.py | 12 +++++------- toot/tui/stubs/stub_hyperlink.py | 7 +------ toot/tui/stubs/stub_text_embed.py | 12 ++---------- 3 files changed, 8 insertions(+), 23 deletions(-) diff --git a/toot/tui/richtext.py b/toot/tui/richtext.py index e74dff86..b1a4c68a 100644 --- a/toot/tui/richtext.py +++ b/toot/tui/richtext.py @@ -1,16 +1,14 @@ -""" -richtext -""" -from typing import List, Tuple import re import urwid import unicodedata -from .constants import PALETTE + from bs4 import BeautifulSoup from bs4.element import NavigableString, Tag -from .stubs.urwidgets import TextEmbed, Hyperlink, parse_text, has_urwidgets -from urwid.util import decompose_tagmarkup +from toot.tui.constants import PALETTE +from toot.tui.stubs.urwidgets import TextEmbed, Hyperlink, parse_text, has_urwidgets from toot.utils import urlencode_url +from typing import List, Tuple +from urwid.util import decompose_tagmarkup class ContentParser: diff --git a/toot/tui/stubs/stub_hyperlink.py b/toot/tui/stubs/stub_hyperlink.py index a2831ba4..aa0488de 100644 --- a/toot/tui/stubs/stub_hyperlink.py +++ b/toot/tui/stubs/stub_hyperlink.py @@ -4,12 +4,7 @@ class Hyperlink(urwid.WidgetWrap): - def __init__( - self, - uri, - attr, - text, - ): + def __init__(self, uri, attr, text): pass def render(self, size, focus): diff --git a/toot/tui/stubs/stub_text_embed.py b/toot/tui/stubs/stub_text_embed.py index bf587d29..622b5f7f 100644 --- a/toot/tui/stubs/stub_text_embed.py +++ b/toot/tui/stubs/stub_text_embed.py @@ -4,9 +4,7 @@ class TextEmbed(urwid.Text): - def get_text( - self, - ): + def get_text(self): return None def render(self, size, focus): @@ -19,11 +17,5 @@ def set_wrap_mode(self, mode): pass -def parse_text( - text, - patterns, - repl, - *repl_args, - **repl_kwargs, -): +def parse_text(text, patterns, repl, *repl_args, 
**repl_kwargs): return None From d91c73520e4d662c00ad8ab191aa4a0bcd9d5a1e Mon Sep 17 00:00:00 2001 From: Ivan Habunek Date: Sat, 4 Nov 2023 07:38:47 +0100 Subject: [PATCH 02/15] Better function name --- toot/output.py | 4 ++-- toot/utils/__init__.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/toot/output.py b/toot/output.py index 6fd59a2a..83898f8d 100644 --- a/toot/output.py +++ b/toot/output.py @@ -6,7 +6,7 @@ from functools import lru_cache from toot import settings from toot.entities import Instance, Notification, Poll, Status -from toot.utils import get_text, parse_html +from toot.utils import get_text, html_to_paragraphs from toot.wcstring import wc_wrap from typing import List from wcwidth import wcswidth @@ -321,7 +321,7 @@ def print_status(status: Status, width: int = 80): def print_html(text, width=80): first = True - for paragraph in parse_html(text): + for paragraph in html_to_paragraphs(text): if not first: print_out("") for line in paragraph: diff --git a/toot/utils/__init__.py b/toot/utils/__init__.py index 43af3732..f0fda9e4 100644 --- a/toot/utils/__init__.py +++ b/toot/utils/__init__.py @@ -36,7 +36,7 @@ def get_text(html): return unicodedata.normalize('NFKC', text) -def parse_html(html): +def html_to_paragraphs(html): """Attempt to convert html to plain text while keeping line breaks. Returns a list of paragraphs, each being a list of lines. """ @@ -55,7 +55,7 @@ def format_content(content): Returns a generator yielding lines of content. 
""" - paragraphs = parse_html(content) + paragraphs = html_to_paragraphs(content) first = True From 199a96625b3ca60f2f6f8e16495c79b49a0c23f9 Mon Sep 17 00:00:00 2001 From: Ivan Habunek Date: Sat, 4 Nov 2023 07:40:56 +0100 Subject: [PATCH 03/15] Extract parsing html --- toot/tui/richtext.py | 5 ++--- toot/utils/__init__.py | 12 +++++++----- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/toot/tui/richtext.py b/toot/tui/richtext.py index b1a4c68a..66c1f271 100644 --- a/toot/tui/richtext.py +++ b/toot/tui/richtext.py @@ -2,11 +2,10 @@ import urwid import unicodedata -from bs4 import BeautifulSoup from bs4.element import NavigableString, Tag from toot.tui.constants import PALETTE from toot.tui.stubs.urwidgets import TextEmbed, Hyperlink, parse_text, has_urwidgets -from toot.utils import urlencode_url +from toot.utils import parse_html, urlencode_url from typing import List, Tuple from urwid.util import decompose_tagmarkup @@ -23,7 +22,7 @@ def html_to_widgets(self, html, recovery_attempt=False) -> List[urwid.Widget]: """Convert html to urwid widgets""" widgets: List[urwid.Widget] = [] html = unicodedata.normalize("NFKC", html) - soup = BeautifulSoup(html.replace("'", "'"), "html.parser") + soup = parse_html(html) first_tag = True for e in soup.body or soup: if isinstance(e, NavigableString): diff --git a/toot/utils/__init__.py b/toot/utils/__init__.py index f0fda9e4..c4afa7ff 100644 --- a/toot/utils/__init__.py +++ b/toot/utils/__init__.py @@ -23,17 +23,19 @@ def str_bool_nullable(b): return None if b is None else str_bool(b) -def get_text(html): - """Converts html to text, strips all tags.""" - +def parse_html(html: str) -> BeautifulSoup: # Ignore warnings made by BeautifulSoup, if passed something that looks like # a file (e.g. a dot which matches current dict), it will warn that the file # should be opened instead of passing a filename. 
with warnings.catch_warnings(): warnings.simplefilter("ignore") - text = BeautifulSoup(html.replace(''', "'"), "html.parser").get_text() + return BeautifulSoup(html.replace("'", "'"), "html.parser") - return unicodedata.normalize('NFKC', text) + +def get_text(html): + """Converts html to text, strips all tags.""" + text = parse_html(html).get_text() + return unicodedata.normalize("NFKC", text) def html_to_paragraphs(html): From a8b4c7971682d81164e2de8cb4c9c83fd208ba3e Mon Sep 17 00:00:00 2001 From: Ivan Habunek Date: Mon, 6 Nov 2023 09:36:30 +0100 Subject: [PATCH 04/15] Eliminate constructor --- toot/tui/richtext.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/toot/tui/richtext.py b/toot/tui/richtext.py index 66c1f271..e46de6b1 100644 --- a/toot/tui/richtext.py +++ b/toot/tui/richtext.py @@ -10,13 +10,11 @@ from urwid.util import decompose_tagmarkup -class ContentParser: - def __init__(self): - self.palette_names = [] - for p in PALETTE: - self.palette_names.append(p[0]) +STYLE_NAMES = [p[0] for p in PALETTE] + - """Parse a limited subset of HTML and create urwid widgets.""" +class ContentParser: + """Parse a limited subset of HTML and create urwid widgets.""" def html_to_widgets(self, html, recovery_attempt=False) -> List[urwid.Widget]: """Convert html to urwid widgets""" @@ -187,7 +185,7 @@ def get_urwid_attr_name(self, tag) -> str: style_name = "class_" + "_".join(clss) # return the class name, only if we # find it as a defined palette name - if style_name in self.palette_names: + if style_name in STYLE_NAMES: return style_name # fallback to returning the tag name From 2aba3f93f990194bc82bb6ded8b710293e14b107 Mon Sep 17 00:00:00 2001 From: Ivan Habunek Date: Mon, 6 Nov 2023 09:56:12 +0100 Subject: [PATCH 05/15] Extract block tags --- toot/tui/richtext.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/toot/tui/richtext.py b/toot/tui/richtext.py index e46de6b1..f47cbdee 100644 --- 
a/toot/tui/richtext.py +++ b/toot/tui/richtext.py @@ -12,6 +12,9 @@ STYLE_NAMES = [p[0] for p in PALETTE] +# NOTE: update this list if Mastodon starts supporting more block tags +BLOCK_TAGS = ["p", "pre", "li", "blockquote", "h1", "h2", "h3", "h4", "h5", "h6"] + class ContentParser: """Parse a limited subset of HTML and create urwid widgets.""" @@ -21,6 +24,7 @@ def html_to_widgets(self, html, recovery_attempt=False) -> List[urwid.Widget]: widgets: List[urwid.Widget] = [] html = unicodedata.normalize("NFKC", html) soup = parse_html(html) + first_tag = True for e in soup.body or soup: if isinstance(e, NavigableString): @@ -37,23 +41,7 @@ def html_to_widgets(self, html, recovery_attempt=False) -> List[urwid.Widget]: # if our HTML starts with a tag, but not a block tag # the HTML is out of spec. Attempt a fix by wrapping the # HTML with

- if ( - first_tag - and not recovery_attempt - and name - not in ( - "p", - "pre", - "li", - "blockquote", - "h1", - "h2", - "h3", - "h4", - "h5", - "h6", - ) # NOTE: update this list if Mastodon starts supporting more block tags - ): + if (first_tag and not recovery_attempt and name not in BLOCK_TAGS): return self.html_to_widgets(f"

{html}

", recovery_attempt=True) # First, look for a custom tag handler method in this class From ce6faccb991f163bff628c1eb9bd6ec65d7d9f06 Mon Sep 17 00:00:00 2001 From: Ivan Habunek Date: Mon, 6 Nov 2023 17:43:02 +0100 Subject: [PATCH 06/15] Extract render method --- toot/tui/richtext.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/toot/tui/richtext.py b/toot/tui/richtext.py index f47cbdee..22e4b2b0 100644 --- a/toot/tui/richtext.py +++ b/toot/tui/richtext.py @@ -44,10 +44,7 @@ def html_to_widgets(self, html, recovery_attempt=False) -> List[urwid.Widget]: if (first_tag and not recovery_attempt and name not in BLOCK_TAGS): return self.html_to_widgets(f"

{html}

", recovery_attempt=True) - # First, look for a custom tag handler method in this class - # If that fails, fall back to inline_tag_to_text handler - method = getattr(self, "_" + name, self.inline_tag_to_text) - markup = method(e) # either returns a Widget, or plain text + markup = self.render(name, e) first_tag = False if not isinstance(markup, urwid.Widget): @@ -77,8 +74,7 @@ def process_inline_tag_children(self, tag) -> List: markups = [] for child in tag.children: if isinstance(child, Tag): - method = getattr(self, "_" + child.name, self.inline_tag_to_text) - markup = method(child) + markup = self.render(child.name, child) markups.append(markup) else: markups.append(child) @@ -133,8 +129,7 @@ def process_block_tag_children(self, tag) -> List[urwid.Widget]: if isinstance(child, Tag): # child is a nested tag; process using custom method # or default to inline_tag_to_text - method = getattr(self, "_" + child.name, self.inline_tag_to_text) - result = method(child) + result = self.render(child.name, child) if isinstance(result, urwid.Widget): found_nested_widget = True child_widgets.append(result) @@ -207,6 +202,12 @@ def get_best_anchor_attr(self, attrib_list) -> str: return "a" + def render(self, attr, content): + # First, look for a custom tag handler method in this class + # If that fails, fall back to inline_tag_to_text handler + method = getattr(self, f"_{attr}", self.inline_tag_to_text) + return method(content) + def _a(self, tag) -> Tuple: """anchor tag handler""" @@ -298,8 +299,7 @@ def _ol(self, tag) -> urwid.Widget: pass for li in tag.find_all("li", recursive=False): - method = getattr(self, "_li", self.inline_tag_to_text) - markup = method(li) + markup = self.render("li", li) # li value= attribute will change the item number # it also overrides any ol start= attribute @@ -396,8 +396,7 @@ def _ul(self, tag) -> urwid.Widget: widgets = [] for li in tag.find_all("li", recursive=False): - method = getattr(self, "_li", self.inline_tag_to_text) - markup = 
method(li) + markup = self.render("li", li) if not isinstance(markup, urwid.Widget): txt = self.text_to_widget("li", ["\N{bullet} ", markup]) From a5444533381cd7dace785f2fbb4cd849d2447113 Mon Sep 17 00:00:00 2001 From: Ivan Habunek Date: Mon, 6 Nov 2023 17:51:03 +0100 Subject: [PATCH 07/15] Remove magic lookup Having the choice explicit makes the code easier to read. --- toot/tui/richtext.py | 77 +++++++++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 33 deletions(-) diff --git a/toot/tui/richtext.py b/toot/tui/richtext.py index 22e4b2b0..f1829def 100644 --- a/toot/tui/richtext.py +++ b/toot/tui/richtext.py @@ -202,13 +202,43 @@ def get_best_anchor_attr(self, attrib_list) -> str: return "a" - def render(self, attr, content): - # First, look for a custom tag handler method in this class - # If that fails, fall back to inline_tag_to_text handler - method = getattr(self, f"_{attr}", self.inline_tag_to_text) - return method(content) + def render(self, attr: str, content: str): + if attr in ["a"]: + return self.render_anchor(content) - def _a(self, tag) -> Tuple: + if attr in ["blockquote"]: + return self.render_blockquote(content) + + if attr in ["br"]: + return self.render_br(content) + + if attr in ["em"]: + return self.render_em(content) + + if attr in ["ol"]: + return self.render_ol(content) + + if attr in ["pre"]: + return self.render_pre(content) + + if attr in ["span"]: + return self.render_span(content) + + if attr in ["b", "strong"]: + return self.render_strong(content) + + if attr in ["ul"]: + return self.render_ul(content) + + # Glitch-soc and Pleroma allow

...

in content + # Mastodon (PR #23913) does not; header tags are converted to

+ if attr in ["p", "div", "li", "h1", "h2", "h3", "h4", "h5", "h6"]: + return self.basic_block_tag_handler(content) + + # Fall back to inline_tag_to_text handler + return self.inline_tag_to_text(content) + + def render_anchor(self, tag) -> Tuple: """anchor tag handler""" markups = self.process_inline_tag_children(tag) @@ -245,7 +275,7 @@ def _a(self, tag) -> Tuple: return (attr, title) - def _blockquote(self, tag) -> urwid.Widget: + def render_blockquote(self, tag) -> urwid.Widget: widget_list = self.process_block_tag_children(tag) blockquote_widget = urwid.LineBox( urwid.Padding( @@ -267,10 +297,10 @@ def _blockquote(self, tag) -> urwid.Widget: ) return urwid.Pile([urwid.AttrMap(blockquote_widget, "blockquote")]) - def _br(self, tag) -> Tuple: + def render_br(self, tag) -> Tuple: return ("br", "\n") - def _em(self, tag) -> Tuple: + def render_em(self, tag) -> Tuple: # to simplify the number of palette entries # translate EM to I (italic) markups = self.process_inline_tag_children(tag) @@ -284,7 +314,7 @@ def _em(self, tag) -> Tuple: return ("i", markups) - def _ol(self, tag) -> urwid.Widget: + def render_ol(self, tag) -> urwid.Widget: """ordered list tag handler""" widgets = [] @@ -325,7 +355,7 @@ def _ol(self, tag) -> urwid.Widget: return urwid.Pile(widgets) - def _pre(self, tag) -> urwid.Widget: + def render_pre(self, tag) -> urwid.Widget: #
<pre> tag spec says that text should not wrap,
         # but horizontal screen space is at a premium
         # and we have no horizontal scroll bar, so allow
@@ -344,7 +374,7 @@ def _pre(self, tag) -> urwid.Widget:
         )
         return urwid.Pile([urwid.AttrMap(pre_widget, "pre")])
 
-    def _span(self, tag) -> Tuple:
+    def render_span(self, tag) -> Tuple:
         markups = self.process_inline_tag_children(tag)
 
         if not markups:
@@ -376,7 +406,7 @@ def _span(self, tag) -> Tuple:
             # fallback
             return ("span", markups)
 
-    def _strong(self, tag) -> Tuple:
+    def render_strong(self, tag) -> Tuple:
         # to simplify the number of palette entries
         # translate STRONG to B (bold)
         markups = self.process_inline_tag_children(tag)
@@ -390,7 +420,7 @@ def _strong(self, tag) -> Tuple:
 
         return ("b", markups)
 
-    def _ul(self, tag) -> urwid.Widget:
+    def render_ul(self, tag) -> urwid.Widget:
         """unordered list tag handler"""
 
         widgets = []
@@ -411,25 +441,6 @@ def _ul(self, tag) -> urwid.Widget:
 
         return urwid.Pile(widgets)
 
-    # These tags are handled identically to others
-    # the only difference being the tag name used for
-    # urwid attribute mapping
-
-    _b = _strong
-
-    _div = basic_block_tag_handler
-
-    _i = _em
-
-    _li = basic_block_tag_handler
-
-    # Glitch-soc and Pleroma allow <h1>...<h6> in content
-    # Mastodon (PR #23913) does not; header tags are converted to <p><strong></strong></p>
- - _h1 = _h2 = _h3 = _h4 = _h5 = _h6 = basic_block_tag_handler - - _p = basic_block_tag_handler - def flatten(data): if isinstance(data, tuple): From 073dd3025cf2381dfb85b561914a2cd2d539b8ff Mon Sep 17 00:00:00 2001 From: Ivan Habunek Date: Mon, 6 Nov 2023 18:14:21 +0100 Subject: [PATCH 08/15] Remove the ContentParser class, use functions instead It did not help, just added to the indent. --- toot/tui/overlays.py | 8 +- toot/tui/poll.py | 5 +- toot/tui/richtext.py | 720 ++++++++++++++++++++++--------------------- toot/tui/timeline.py | 6 +- 4 files changed, 371 insertions(+), 368 deletions(-) diff --git a/toot/tui/overlays.py b/toot/tui/overlays.py index 530921a4..58eb4572 100644 --- a/toot/tui/overlays.py +++ b/toot/tui/overlays.py @@ -7,7 +7,7 @@ from toot import api from toot.tui.utils import highlight_keys from toot.tui.widgets import Button, EditBox, SelectableText -from toot.tui.richtext import ContentParser +from toot.tui.richtext import html_to_widgets class StatusSource(urwid.Padding): @@ -255,8 +255,6 @@ def setup_listbox(self): super().__init__(walker) def generate_contents(self, account, relationship=None, last_action=None): - parser = ContentParser() - if self.last_action and not self.last_action.startswith("Confirm"): yield Button(f"Confirm {self.last_action}", on_press=take_action, user_data=self) yield Button("Cancel", on_press=cancel_action, user_data=self) @@ -282,7 +280,7 @@ def generate_contents(self, account, relationship=None, last_action=None): if account["note"]: yield urwid.Divider() - widgetlist = parser.html_to_widgets(account["note"]) + widgetlist = html_to_widgets(account["note"]) for line in widgetlist: yield (line) @@ -317,7 +315,7 @@ def generate_contents(self, account, relationship=None, last_action=None): yield urwid.Divider() yield urwid.Text([("bold", f"{name.rstrip(':')}"), ":"]) - widgetlist = parser.html_to_widgets(field["value"]) + widgetlist = html_to_widgets(field["value"]) for line in widgetlist: yield (line) diff --git 
a/toot/tui/poll.py b/toot/tui/poll.py index c92cc07d..e738fc73 100644 --- a/toot/tui/poll.py +++ b/toot/tui/poll.py @@ -4,7 +4,7 @@ from toot.exceptions import ApiError from toot.utils.datetime import parse_datetime from .widgets import Button, CheckBox, RadioButton -from .richtext import ContentParser +from .richtext import html_to_widgets class Poll(urwid.ListBox): @@ -86,8 +86,7 @@ def generate_poll_detail(self): def generate_contents(self, status): yield urwid.Divider() - parser = ContentParser() - widgetlist = parser.html_to_widgets(status.data["content"]) + widgetlist = html_to_widgets(status.data["content"]) for line in widgetlist: yield (line) diff --git a/toot/tui/richtext.py b/toot/tui/richtext.py index f1829def..b4e5b03f 100644 --- a/toot/tui/richtext.py +++ b/toot/tui/richtext.py @@ -16,430 +16,438 @@ BLOCK_TAGS = ["p", "pre", "li", "blockquote", "h1", "h2", "h3", "h4", "h5", "h6"] -class ContentParser: - """Parse a limited subset of HTML and create urwid widgets.""" - - def html_to_widgets(self, html, recovery_attempt=False) -> List[urwid.Widget]: - """Convert html to urwid widgets""" - widgets: List[urwid.Widget] = [] - html = unicodedata.normalize("NFKC", html) - soup = parse_html(html) - - first_tag = True - for e in soup.body or soup: - if isinstance(e, NavigableString): - if first_tag and not recovery_attempt: - # if our first "tag" is a navigable string - # the HTML is out of spec, doesn't start with a tag, - # we see this in content from Pixelfed servers. - # attempt a fix by wrapping the HTML with

- return self.html_to_widgets(f"

{html}

", recovery_attempt=True) - else: - continue - else: - name = e.name - # if our HTML starts with a tag, but not a block tag - # the HTML is out of spec. Attempt a fix by wrapping the - # HTML with

- if (first_tag and not recovery_attempt and name not in BLOCK_TAGS): - return self.html_to_widgets(f"

{html}

", recovery_attempt=True) - - markup = self.render(name, e) - first_tag = False - - if not isinstance(markup, urwid.Widget): - # plaintext, so create a padded text widget - txt = self.text_to_widget("", markup) - markup = urwid.Padding( - txt, - align="left", - width=("relative", 100), - min_width=None, - ) - widgets.append(markup) - # separate top level widgets with a blank line - widgets.append(urwid.Divider(" ")) - return widgets[:-1] # but suppress the last blank line - - def inline_tag_to_text(self, tag) -> Tuple: - """Convert html tag to plain text with tag as attributes recursively""" - markups = self.process_inline_tag_children(tag) - if not markups: - return (tag.name, "") - return (tag.name, markups) - - def process_inline_tag_children(self, tag) -> List: - """Recursively retrieve all children - and convert to a list of markup text""" - markups = [] - for child in tag.children: - if isinstance(child, Tag): - markup = self.render(child.name, child) - markups.append(markup) +def html_to_widgets(html, recovery_attempt=False) -> List[urwid.Widget]: + """Convert html to urwid widgets""" + widgets: List[urwid.Widget] = [] + html = unicodedata.normalize("NFKC", html) + soup = parse_html(html) + + first_tag = True + for e in soup.body or soup: + if isinstance(e, NavigableString): + if first_tag and not recovery_attempt: + # if our first "tag" is a navigable string + # the HTML is out of spec, doesn't start with a tag, + # we see this in content from Pixelfed servers. + # attempt a fix by wrapping the HTML with

+ return html_to_widgets(f"

{html}

", recovery_attempt=True) else: - markups.append(child) - return markups - - def text_to_widget(self, attr, markup) -> urwid.Widget: - if not has_urwidgets: - return urwid.Text((attr, markup)) - - TRANSFORM = { - # convert http[s] URLs to Hyperlink widgets for nesting in a TextEmbed widget - re.compile(r"(^.+)\x03(.+$)"): lambda g: ( - len(g[1]), - urwid.Filler(Hyperlink(g[2], anchor_attr, g[1])), - ), - } - markup_list = [] - - for run in markup: - if isinstance(run, tuple): - txt, attr_list = decompose_tagmarkup(run) - # find anchor titles with an ETX separator followed by href - m = re.match(r"(^.+)\x03(.+$)", txt) - if m: - anchor_attr = self.get_best_anchor_attr(attr_list) - markup_list.append( - parse_text( - txt, - TRANSFORM, - lambda pattern, groups, span: TRANSFORM[pattern](groups), - ) + continue + else: + name = e.name + # if our HTML starts with a tag, but not a block tag + # the HTML is out of spec. Attempt a fix by wrapping the + # HTML with

+ if (first_tag and not recovery_attempt and name not in BLOCK_TAGS): + return html_to_widgets(f"

{html}

", recovery_attempt=True) + + markup = render(name, e) + first_tag = False + + if not isinstance(markup, urwid.Widget): + # plaintext, so create a padded text widget + txt = text_to_widget("", markup) + markup = urwid.Padding( + txt, + align="left", + width=("relative", 100), + min_width=None, + ) + widgets.append(markup) + # separate top level widgets with a blank line + widgets.append(urwid.Divider(" ")) + return widgets[:-1] # but suppress the last blank line + + +def inline_tag_to_text(tag) -> Tuple: + """Convert html tag to plain text with tag as attributes recursively""" + markups = process_inline_tag_children(tag) + if not markups: + return (tag.name, "") + return (tag.name, markups) + + +def process_inline_tag_children(tag) -> List: + """Recursively retrieve all children + and convert to a list of markup text""" + markups = [] + for child in tag.children: + if isinstance(child, Tag): + markup = render(child.name, child) + markups.append(markup) + else: + markups.append(child) + return markups + + +def text_to_widget(attr, markup) -> urwid.Widget: + if not has_urwidgets: + return urwid.Text((attr, markup)) + + TRANSFORM = { + # convert http[s] URLs to Hyperlink widgets for nesting in a TextEmbed widget + re.compile(r"(^.+)\x03(.+$)"): lambda g: ( + len(g[1]), + urwid.Filler(Hyperlink(g[2], anchor_attr, g[1])), + ), + } + markup_list = [] + + for run in markup: + if isinstance(run, tuple): + txt, attr_list = decompose_tagmarkup(run) + # find anchor titles with an ETX separator followed by href + m = re.match(r"(^.+)\x03(.+$)", txt) + if m: + anchor_attr = get_best_anchor_attr(attr_list) + markup_list.append( + parse_text( + txt, + TRANSFORM, + lambda pattern, groups, span: TRANSFORM[pattern](groups), ) - else: - markup_list.append(run) + ) else: markup_list.append(run) - - return TextEmbed(markup_list) - - def process_block_tag_children(self, tag) -> List[urwid.Widget]: - """Recursively retrieve all children - and convert to a list of widgets - any inline 
tags containing text will be - converted to Text widgets""" - - pre_widget_markups = [] - post_widget_markups = [] - child_widgets = [] - found_nested_widget = False - - for child in tag.children: - if isinstance(child, Tag): - # child is a nested tag; process using custom method - # or default to inline_tag_to_text - result = self.render(child.name, child) - if isinstance(result, urwid.Widget): - found_nested_widget = True - child_widgets.append(result) - else: - if not found_nested_widget: - pre_widget_markups.append(result) - else: - post_widget_markups.append(result) + else: + markup_list.append(run) + + return TextEmbed(markup_list) + + +def process_block_tag_children(tag) -> List[urwid.Widget]: + """Recursively retrieve all children + and convert to a list of widgets + any inline tags containing text will be + converted to Text widgets""" + + pre_widget_markups = [] + post_widget_markups = [] + child_widgets = [] + found_nested_widget = False + + for child in tag.children: + if isinstance(child, Tag): + # child is a nested tag; process using custom method + # or default to inline_tag_to_text + result = render(child.name, child) + if isinstance(result, urwid.Widget): + found_nested_widget = True + child_widgets.append(result) else: - # child is text; append to the appropriate markup list if not found_nested_widget: - pre_widget_markups.append(child) + pre_widget_markups.append(result) else: - post_widget_markups.append(child) - - widget_list = [] - if len(pre_widget_markups): - widget_list.append(self.text_to_widget(tag.name, pre_widget_markups)) + post_widget_markups.append(result) + else: + # child is text; append to the appropriate markup list + if not found_nested_widget: + pre_widget_markups.append(child) + else: + post_widget_markups.append(child) - if len(child_widgets): - widget_list += child_widgets + widget_list = [] + if len(pre_widget_markups): + widget_list.append(text_to_widget(tag.name, pre_widget_markups)) - if len(post_widget_markups): - 
widget_list.append(self.text_to_widget(tag.name, post_widget_markups)) + if len(child_widgets): + widget_list += child_widgets - return widget_list + if len(post_widget_markups): + widget_list.append(text_to_widget(tag.name, post_widget_markups)) - def get_urwid_attr_name(self, tag) -> str: - """Get the class name and translate to a - name suitable for use as an urwid - text attribute name""" + return widget_list - if "class" in tag.attrs: - clss = tag.attrs["class"] - if len(clss) > 0: - style_name = "class_" + "_".join(clss) - # return the class name, only if we - # find it as a defined palette name - if style_name in STYLE_NAMES: - return style_name - # fallback to returning the tag name - return tag.name +def get_urwid_attr_name(tag) -> str: + """Get the class name and translate to a + name suitable for use as an urwid + text attribute name""" - # Tag handlers start here. - # Tags not explicitly listed are "supported" by - # rendering as text. - # Inline tags return a list of marked up text for urwid.Text - # Block tags return urwid.Widget + if "class" in tag.attrs: + clss = tag.attrs["class"] + if len(clss) > 0: + style_name = "class_" + "_".join(clss) + # return the class name, only if we + # find it as a defined palette name + if style_name in STYLE_NAMES: + return style_name - def basic_block_tag_handler(self, tag) -> urwid.Widget: - """default for block tags that need no special treatment""" - return urwid.Pile(self.process_block_tag_children(tag)) + # fallback to returning the tag name + return tag.name - def get_best_anchor_attr(self, attrib_list) -> str: - if not attrib_list: - return "" - flat_al = list(flatten(attrib_list)) - for a in flat_al[0]: - # ref: https://docs.joinmastodon.org/spec/activitypub/ - # these are the class names (translated to attrib names) - # that we can support for display +def basic_block_tag_handler(tag) -> urwid.Widget: + """default for block tags that need no special treatment""" + return 
urwid.Pile(process_block_tag_children(tag)) - try: - if a[0] in ["class_hashtag", "class_mention_hashtag", "class_mention"]: - return a[0] - except KeyError: - continue - return "a" +def get_best_anchor_attr(attrib_list) -> str: + if not attrib_list: + return "" + flat_al = list(flatten(attrib_list)) - def render(self, attr: str, content: str): - if attr in ["a"]: - return self.render_anchor(content) + for a in flat_al[0]: + # ref: https://docs.joinmastodon.org/spec/activitypub/ + # these are the class names (translated to attrib names) + # that we can support for display - if attr in ["blockquote"]: - return self.render_blockquote(content) + try: + if a[0] in ["class_hashtag", "class_mention_hashtag", "class_mention"]: + return a[0] + except KeyError: + continue - if attr in ["br"]: - return self.render_br(content) + return "a" - if attr in ["em"]: - return self.render_em(content) - if attr in ["ol"]: - return self.render_ol(content) +def render(attr: str, content: str): + if attr in ["a"]: + return render_anchor(content) - if attr in ["pre"]: - return self.render_pre(content) + if attr in ["blockquote"]: + return render_blockquote(content) - if attr in ["span"]: - return self.render_span(content) + if attr in ["br"]: + return render_br(content) - if attr in ["b", "strong"]: - return self.render_strong(content) + if attr in ["em"]: + return render_em(content) - if attr in ["ul"]: - return self.render_ul(content) + if attr in ["ol"]: + return render_ol(content) - # Glitch-soc and Pleroma allow

...

in content - # Mastodon (PR #23913) does not; header tags are converted to

- if attr in ["p", "div", "li", "h1", "h2", "h3", "h4", "h5", "h6"]: - return self.basic_block_tag_handler(content) + if attr in ["pre"]: + return render_pre(content) - # Fall back to inline_tag_to_text handler - return self.inline_tag_to_text(content) + if attr in ["span"]: + return render_span(content) - def render_anchor(self, tag) -> Tuple: - """anchor tag handler""" + if attr in ["b", "strong"]: + return render_strong(content) - markups = self.process_inline_tag_children(tag) - if not markups: - return (tag.name, "") + if attr in ["ul"]: + return render_ul(content) - href = tag.attrs["href"] - title, attrib_list = decompose_tagmarkup(markups) - if not attrib_list: - attrib_list = [tag] - if href and has_urwidgets: - # only if we have urwidgets loaded for OCS 8 hyperlinks: - # urlencode the path and query portions of the URL - href = urlencode_url(href) - # use ASCII ETX (end of record) as a - # delimiter between the title and the HREF - title += f"\x03{href}" + # Glitch-soc and Pleroma allow

...

in content + # Mastodon (PR #23913) does not; header tags are converted to

+ if attr in ["p", "div", "li", "h1", "h2", "h3", "h4", "h5", "h6"]: + return basic_block_tag_handler(content) - attr = self.get_best_anchor_attr(attrib_list) + # Fall back to inline_tag_to_text handler + return inline_tag_to_text(content) - if attr == "a": - # didn't find an attribute to use - # in the child markup, so let's - # try the anchor tag's own attributes - attr = self.get_urwid_attr_name(tag) +def render_anchor(tag) -> Tuple: + """anchor tag handler""" - # hashtag anchors have a class of "mention hashtag" - # or "hashtag" - # we'll return style "class_mention_hashtag" - # or "class_hashtag" - # in that case; see corresponding palette entry - # in constants.py controlling hashtag highlighting + markups = process_inline_tag_children(tag) + if not markups: + return (tag.name, "") - return (attr, title) + href = tag.attrs["href"] + title, attrib_list = decompose_tagmarkup(markups) + if not attrib_list: + attrib_list = [tag] + if href and has_urwidgets: + # only if we have urwidgets loaded for OCS 8 hyperlinks: + # urlencode the path and query portions of the URL + href = urlencode_url(href) + # use ASCII ETX (end of record) as a + # delimiter between the title and the HREF + title += f"\x03{href}" - def render_blockquote(self, tag) -> urwid.Widget: - widget_list = self.process_block_tag_children(tag) - blockquote_widget = urwid.LineBox( - urwid.Padding( - urwid.Pile(widget_list), - align="left", - width=("relative", 100), - min_width=None, - left=1, - right=1, - ), - tlcorner="", - tline="", - lline="│", - trcorner="", - blcorner="", - rline="", - bline="", - brcorner="", - ) - return urwid.Pile([urwid.AttrMap(blockquote_widget, "blockquote")]) - - def render_br(self, tag) -> Tuple: - return ("br", "\n") - - def render_em(self, tag) -> Tuple: - # to simplify the number of palette entries - # translate EM to I (italic) - markups = self.process_inline_tag_children(tag) - if not markups: - return ("i", "") - - # special case processing for bold and italic - for 
parent in tag.parents: - if parent.name == "b" or parent.name == "strong": - return ("bi", markups) - - return ("i", markups) - - def render_ol(self, tag) -> urwid.Widget: - """ordered list tag handler""" - - widgets = [] - list_item_num = 1 - increment = -1 if tag.has_attr("reversed") else 1 - - # get ol start= attribute if present - if tag.has_attr("start") and len(tag.attrs["start"]) > 0: - try: - list_item_num = int(tag.attrs["start"]) - except ValueError: - pass - - for li in tag.find_all("li", recursive=False): - markup = self.render("li", li) - - # li value= attribute will change the item number - # it also overrides any ol start= attribute + attr = get_best_anchor_attr(attrib_list) - if li.has_attr("value") and len(li.attrs["value"]) > 0: - try: - list_item_num = int(li.attrs["value"]) - except ValueError: - pass - - if not isinstance(markup, urwid.Widget): - txt = self.text_to_widget("li", [str(list_item_num), ". ", markup]) - # 1. foo, 2. bar, etc. - widgets.append(txt) - else: - txt = self.text_to_widget("li", [str(list_item_num), ". "]) - columns = urwid.Columns( - [txt, ("weight", 9999, markup)], dividechars=1, min_width=3 - ) - widgets.append(columns) + if attr == "a": + # didn't find an attribute to use + # in the child markup, so let's + # try the anchor tag's own attributes - list_item_num += increment + attr = get_urwid_attr_name(tag) - return urwid.Pile(widgets) + # hashtag anchors have a class of "mention hashtag" + # or "hashtag" + # we'll return style "class_mention_hashtag" + # or "class_hashtag" + # in that case; see corresponding palette entry + # in constants.py controlling hashtag highlighting - def render_pre(self, tag) -> urwid.Widget: - #
 tag spec says that text should not wrap,
-        # but horizontal screen space is at a premium
-        # and we have no horizontal scroll bar, so allow
-        # wrapping.
+    return (attr, title)
 
-        widget_list = [urwid.Divider(" ")]
-        widget_list += self.process_block_tag_children(tag)
 
-        pre_widget = urwid.Padding(
+def render_blockquote(tag) -> urwid.Widget:
+    widget_list = process_block_tag_children(tag)
+    blockquote_widget = urwid.LineBox(
+        urwid.Padding(
             urwid.Pile(widget_list),
             align="left",
             width=("relative", 100),
             min_width=None,
             left=1,
             right=1,
-        )
-        return urwid.Pile([urwid.AttrMap(pre_widget, "pre")])
+        ),
+        tlcorner="",
+        tline="",
+        lline="│",
+        trcorner="",
+        blcorner="",
+        rline="",
+        bline="",
+        brcorner="",
+    )
+    return urwid.Pile([urwid.AttrMap(blockquote_widget, "blockquote")])
 
-    def render_span(self, tag) -> Tuple:
-        markups = self.process_inline_tag_children(tag)
 
-        if not markups:
-            return (tag.name, "")
+def render_br(tag) -> Tuple:
+    return ("br", "\n")
 
-        # span inherits its parent's class definition
-        # unless it has a specific class definition
-        # of its own
 
-        if "class" in tag.attrs:
-            # uncomment the following code to hide all HTML marked
-            # invisible (generally, the http:// prefix of URLs)
-            # could be a user preference, it's only advisable if
-            # the terminal supports OCS 8 hyperlinks (and that's not
-            # automatically detectable)
+def render_em(tag) -> Tuple:
+    # to simplify the number of palette entries
+    # translate EM to I (italic)
+    markups = process_inline_tag_children(tag)
+    if not markups:
+        return ("i", "")
 
-            # if "invisible" in tag.attrs["class"]:
-            #     return (tag.name, "")
+    # special case processing for bold and italic
+    for parent in tag.parents:
+        if parent.name == "b" or parent.name == "strong":
+            return ("bi", markups)
 
-            style_name = self.get_urwid_attr_name(tag)
+    return ("i", markups)
 
-            if style_name != "span":
-                # unique class name matches an entry in our palette
-                return (style_name, markups)
 
-        if tag.parent:
-            return (self.get_urwid_attr_name(tag.parent), markups)
+def render_ol(tag) -> urwid.Widget:
+    """ordered list tag handler"""
+
+    widgets = []
+    list_item_num = 1
+    increment = -1 if tag.has_attr("reversed") else 1
+
+    # get ol start= attribute if present
+    if tag.has_attr("start") and len(tag.attrs["start"]) > 0:
+        try:
+            list_item_num = int(tag.attrs["start"])
+        except ValueError:
+            pass
+
+    for li in tag.find_all("li", recursive=False):
+        markup = render("li", li)
+
+        # li value= attribute will change the item number
+        # it also overrides any ol start= attribute
+
+        if li.has_attr("value") and len(li.attrs["value"]) > 0:
+            try:
+                list_item_num = int(li.attrs["value"])
+            except ValueError:
+                pass
+
+        if not isinstance(markup, urwid.Widget):
+            txt = text_to_widget("li", [str(list_item_num), ". ", markup])
+            # 1. foo, 2. bar, etc.
+            widgets.append(txt)
         else:
-            # fallback
-            return ("span", markups)
+            txt = text_to_widget("li", [str(list_item_num), ". "])
+            columns = urwid.Columns(
+                [txt, ("weight", 9999, markup)], dividechars=1, min_width=3
+            )
+            widgets.append(columns)
 
-    def render_strong(self, tag) -> Tuple:
-        # to simplify the number of palette entries
-        # translate STRONG to B (bold)
-        markups = self.process_inline_tag_children(tag)
-        if not markups:
-            return ("b", "")
+        list_item_num += increment
 
-        # special case processing for bold and italic
-        for parent in tag.parents:
-            if parent.name == "i" or parent.name == "em":
-                return ("bi", markups)
+    return urwid.Pile(widgets)
 
-        return ("b", markups)
 
-    def render_ul(self, tag) -> urwid.Widget:
-        """unordered list tag handler"""
+def render_pre(tag) -> urwid.Widget:
+    # <pre> tag spec says that text should not wrap,
+    # but horizontal screen space is at a premium
+    # and we have no horizontal scroll bar, so allow
+    # wrapping.
 
-        widgets = []
+    widget_list = [urwid.Divider(" ")]
+    widget_list += process_block_tag_children(tag)
 
-        for li in tag.find_all("li", recursive=False):
-            markup = self.render("li", li)
+    pre_widget = urwid.Padding(
+        urwid.Pile(widget_list),
+        align="left",
+        width=("relative", 100),
+        min_width=None,
+        left=1,
+        right=1,
+    )
+    return urwid.Pile([urwid.AttrMap(pre_widget, "pre")])
 
-            if not isinstance(markup, urwid.Widget):
-                txt = self.text_to_widget("li", ["\N{bullet} ", markup])
-                # * foo, * bar, etc.
-                widgets.append(txt)
-            else:
-                txt = self.text_to_widget("li", ["\N{bullet} "])
-                columns = urwid.Columns(
-                    [txt, ("weight", 9999, markup)], dividechars=1, min_width=3
-                )
-                widgets.append(columns)
 
-        return urwid.Pile(widgets)
+def render_span(tag) -> Tuple:
+    markups = process_inline_tag_children(tag)
+
+    if not markups:
+        return (tag.name, "")
+
+    # span inherits its parent's class definition
+    # unless it has a specific class definition
+    # of its own
+
+    if "class" in tag.attrs:
+        # uncomment the following code to hide all HTML marked
+        # invisible (generally, the http:// prefix of URLs)
+        # could be a user preference, it's only advisable if
+        # the terminal supports OCS 8 hyperlinks (and that's not
+        # automatically detectable)
+
+        # if "invisible" in tag.attrs["class"]:
+        #     return (tag.name, "")
+
+        style_name = get_urwid_attr_name(tag)
+
+        if style_name != "span":
+            # unique class name matches an entry in our palette
+            return (style_name, markups)
+
+    if tag.parent:
+        return (get_urwid_attr_name(tag.parent), markups)
+    else:
+        # fallback
+        return ("span", markups)
+
+
+def render_strong(tag) -> Tuple:
+    # to simplify the number of palette entries
+    # translate STRONG to B (bold)
+    markups = process_inline_tag_children(tag)
+    if not markups:
+        return ("b", "")
+
+    # special case processing for bold and italic
+    for parent in tag.parents:
+        if parent.name == "i" or parent.name == "em":
+            return ("bi", markups)
+
+    return ("b", markups)
+
+
+def render_ul(tag) -> urwid.Widget:
+    """unordered list tag handler"""
+
+    widgets = []
+
+    for li in tag.find_all("li", recursive=False):
+        markup = render("li", li)
+
+        if not isinstance(markup, urwid.Widget):
+            txt = text_to_widget("li", ["\N{bullet} ", markup])
+            # * foo, * bar, etc.
+            widgets.append(txt)
+        else:
+            txt = text_to_widget("li", ["\N{bullet} "])
+            columns = urwid.Columns(
+                [txt, ("weight", 9999, markup)], dividechars=1, min_width=3
+            )
+            widgets.append(columns)
+
+    return urwid.Pile(widgets)
 
 
 def flatten(data):
diff --git a/toot/tui/timeline.py b/toot/tui/timeline.py
index 1fef40c2..b278d085 100644
--- a/toot/tui/timeline.py
+++ b/toot/tui/timeline.py
@@ -6,6 +6,7 @@
 from typing import List, Optional
 
 from toot.tui import app
+from toot.tui.richtext import html_to_widgets
 from toot.utils.datetime import parse_datetime, time_ago
 from toot.utils.language import language_name
 
@@ -13,7 +14,6 @@
 from toot.tui.scroll import Scrollable, ScrollBar
 from toot.tui.utils import highlight_keys
 from toot.tui.widgets import SelectableText, SelectableColumns
-from toot.tui.richtext import ContentParser
 from toot.utils import urlencode_url
 from toot.tui.stubs.urwidgets import Hyperlink, TextEmbed, parse_text, has_urwidgets
 
@@ -356,9 +356,7 @@ def content_generator(self, status, reblogged_by):
             yield ("pack", urwid.Text(("content_warning", "Marked as sensitive. Press S to view.")))
         else:
             content = status.original.translation if status.original.show_translation else status.data["content"]
-
-            parser = ContentParser()
-            widgetlist = parser.html_to_widgets(content)
+            widgetlist = html_to_widgets(content)
 
             for line in widgetlist:
                 yield (line)

From f50dea1175d88ff5541418d8da71c4df0061379d Mon Sep 17 00:00:00 2001
From: Ivan Habunek 
Date: Thu, 16 Nov 2023 11:08:38 +0100
Subject: [PATCH 09/15] Simplify text_to_widget

This was doing double regex matching, calling parse_text was not needed
at all.
---
 toot/tui/richtext.py | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/toot/tui/richtext.py b/toot/tui/richtext.py
index b4e5b03f..ae463ae9 100644
--- a/toot/tui/richtext.py
+++ b/toot/tui/richtext.py
@@ -4,7 +4,7 @@
 
 from bs4.element import NavigableString, Tag
 from toot.tui.constants import PALETTE
-from toot.tui.stubs.urwidgets import TextEmbed, Hyperlink, parse_text, has_urwidgets
+from toot.tui.stubs.urwidgets import TextEmbed, Hyperlink, has_urwidgets
 from toot.utils import parse_html, urlencode_url
 from typing import List, Tuple
 from urwid.util import decompose_tagmarkup
@@ -80,33 +80,26 @@ def process_inline_tag_children(tag) -> List:
     return markups
 
 
+URL_PATTERN = re.compile(r"(^.+)\x03(.+$)")
+
+
 def text_to_widget(attr, markup) -> urwid.Widget:
     if not has_urwidgets:
         return urwid.Text((attr, markup))
 
-    TRANSFORM = {
-        # convert http[s] URLs to Hyperlink widgets for nesting in a TextEmbed widget
-        re.compile(r"(^.+)\x03(.+$)"): lambda g: (
-            len(g[1]),
-            urwid.Filler(Hyperlink(g[2], anchor_attr, g[1])),
-        ),
-    }
     markup_list = []
-
     for run in markup:
         if isinstance(run, tuple):
             txt, attr_list = decompose_tagmarkup(run)
             # find anchor titles with an ETX separator followed by href
-            m = re.match(r"(^.+)\x03(.+$)", txt)
-            if m:
+            match = URL_PATTERN.match(txt)
+            if match:
+                label, url = match.groups()
                 anchor_attr = get_best_anchor_attr(attr_list)
-                markup_list.append(
-                    parse_text(
-                        txt,
-                        TRANSFORM,
-                        lambda pattern, groups, span: TRANSFORM[pattern](groups),
-                    )
-                )
+                markup_list.append((
+                    len(label),
+                    urwid.Filler(Hyperlink(url, anchor_attr, label)),
+                ))
             else:
                 markup_list.append(run)
         else:

From f96b1b722cd429c4c7e69108e66df16673929bad Mon Sep 17 00:00:00 2001
From: Ivan Habunek 
Date: Thu, 16 Nov 2023 11:11:25 +0100
Subject: [PATCH 10/15] Move richtext to its own module

This is the first step towards easier stubbing
---
 toot/tui/richtext/__init__.py       | 1 +
 toot/tui/{ => richtext}/richtext.py | 0
 2 files changed, 1 insertion(+)
 create mode 100644 toot/tui/richtext/__init__.py
 rename toot/tui/{ => richtext}/richtext.py (100%)

diff --git a/toot/tui/richtext/__init__.py b/toot/tui/richtext/__init__.py
new file mode 100644
index 00000000..ba857ab9
--- /dev/null
+++ b/toot/tui/richtext/__init__.py
@@ -0,0 +1 @@
+from .richtext import html_to_widgets
diff --git a/toot/tui/richtext.py b/toot/tui/richtext/richtext.py
similarity index 100%
rename from toot/tui/richtext.py
rename to toot/tui/richtext/richtext.py

From e5ac82bb010bee0a634538ce2f2a5ba0edf4dcea Mon Sep 17 00:00:00 2001
From: Ivan Habunek 
Date: Thu, 16 Nov 2023 11:35:44 +0100
Subject: [PATCH 11/15] Add fallback for html_to_widgets

Remove has_urwidgets since we no longer need to worry if we have
urwidgets in the richtext module.
---
 toot/tui/richtext/__init__.py | 16 +++++++++++++++-
 toot/tui/richtext/richtext.py |  8 ++------
 toot/tui/utils.py             | 12 ++++++++++++
 3 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/toot/tui/richtext/__init__.py b/toot/tui/richtext/__init__.py
index ba857ab9..6359c24e 100644
--- a/toot/tui/richtext/__init__.py
+++ b/toot/tui/richtext/__init__.py
@@ -1 +1,15 @@
-from .richtext import html_to_widgets
+import urwid
+
+from toot.tui.utils import highlight_hashtags
+from toot.utils import format_content
+from typing import List
+
+try:
+    from .richtext import html_to_widgets
+except ImportError:
+    # Fallback if urwidgets are not available
+    def html_to_widgets(html: str) -> List[urwid.Widget]:
+        return [
+            urwid.Text(highlight_hashtags(line))
+            for line in format_content(html)
+        ]
diff --git a/toot/tui/richtext/richtext.py b/toot/tui/richtext/richtext.py
index ae463ae9..9db7e73a 100644
--- a/toot/tui/richtext/richtext.py
+++ b/toot/tui/richtext/richtext.py
@@ -4,10 +4,10 @@
 
 from bs4.element import NavigableString, Tag
 from toot.tui.constants import PALETTE
-from toot.tui.stubs.urwidgets import TextEmbed, Hyperlink, has_urwidgets
 from toot.utils import parse_html, urlencode_url
 from typing import List, Tuple
 from urwid.util import decompose_tagmarkup
+from urwidgets import Hyperlink, TextEmbed
 
 
 STYLE_NAMES = [p[0] for p in PALETTE]
@@ -84,9 +84,6 @@ def process_inline_tag_children(tag) -> List:
 
 
 def text_to_widget(attr, markup) -> urwid.Widget:
-    if not has_urwidgets:
-        return urwid.Text((attr, markup))
-
     markup_list = []
     for run in markup:
         if isinstance(run, tuple):
@@ -242,8 +239,7 @@ def render_anchor(tag) -> Tuple:
     title, attrib_list = decompose_tagmarkup(markups)
     if not attrib_list:
         attrib_list = [tag]
-    if href and has_urwidgets:
-        # only if we have urwidgets loaded for OCS 8 hyperlinks:
+    if href:
         # urlencode the path and query portions of the URL
         href = urlencode_url(href)
         # use ASCII ETX (end of record) as a
diff --git a/toot/tui/utils.py b/toot/tui/utils.py
index 0ccff9d7..734ae325 100644
--- a/toot/tui/utils.py
+++ b/toot/tui/utils.py
@@ -35,6 +35,18 @@ def _gen():
     return list(_gen())
 
 
+def highlight_hashtags(line):
+    hline = []
+
+    for p in re.split(HASHTAG_PATTERN, line):
+        if p.startswith("#"):
+            hline.append(("hashtag", p))
+        else:
+            hline.append(p)
+
+    return hline
+
+
 def show_media(paths):
     """
     Attempt to open an image viewer to show given media files.

From d6ff3cc3a80f83c834e6cb57602e1e52a4d3df80 Mon Sep 17 00:00:00 2001
From: Ivan Habunek 
Date: Thu, 16 Nov 2023 11:46:54 +0100
Subject: [PATCH 12/15] Extract url_to_widget, add fallback

---
 toot/tui/richtext/__init__.py |  5 ++++-
 toot/tui/richtext/richtext.py |  5 +++++
 toot/tui/timeline.py          | 23 +++--------------------
 3 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/toot/tui/richtext/__init__.py b/toot/tui/richtext/__init__.py
index 6359c24e..07e31c8e 100644
--- a/toot/tui/richtext/__init__.py
+++ b/toot/tui/richtext/__init__.py
@@ -5,7 +5,7 @@
 from typing import List
 
 try:
-    from .richtext import html_to_widgets
+    from .richtext import html_to_widgets, url_to_widget
 except ImportError:
     # Fallback if urwidgets are not available
     def html_to_widgets(html: str) -> List[urwid.Widget]:
@@ -13,3 +13,6 @@ def html_to_widgets(html: str) -> List[urwid.Widget]:
             urwid.Text(highlight_hashtags(line))
             for line in format_content(html)
         ]
+
+    def url_to_widget(url: str):
+        return urwid.Text(("link", url))
diff --git a/toot/tui/richtext/richtext.py b/toot/tui/richtext/richtext.py
index 9db7e73a..71897c4b 100644
--- a/toot/tui/richtext/richtext.py
+++ b/toot/tui/richtext/richtext.py
@@ -59,6 +59,11 @@ def html_to_widgets(html, recovery_attempt=False) -> List[urwid.Widget]:
     return widgets[:-1]  # but suppress the last blank line
 
 
+def url_to_widget(url: str):
+    widget = len(url), urwid.Filler(Hyperlink(url, "link", url))
+    return TextEmbed(widget)
+
+
 def inline_tag_to_text(tag) -> Tuple:
     """Convert html tag to plain text with tag as attributes recursively"""
     markups = process_inline_tag_children(tag)
diff --git a/toot/tui/timeline.py b/toot/tui/timeline.py
index b278d085..93421ce4 100644
--- a/toot/tui/timeline.py
+++ b/toot/tui/timeline.py
@@ -1,12 +1,11 @@
 import logging
-import re
 import urwid
 import webbrowser
 
 from typing import List, Optional
 
 from toot.tui import app
-from toot.tui.richtext import html_to_widgets
+from toot.tui.richtext import html_to_widgets, url_to_widget
 from toot.utils.datetime import parse_datetime, time_ago
 from toot.utils.language import language_name
 
@@ -14,8 +13,6 @@
 from toot.tui.scroll import Scrollable, ScrollBar
 from toot.tui.utils import highlight_keys
 from toot.tui.widgets import SelectableText, SelectableColumns
-from toot.utils import urlencode_url
-from toot.tui.stubs.urwidgets import Hyperlink, TextEmbed, parse_text, has_urwidgets
 
 logger = logging.getLogger("toot")
 
@@ -320,20 +317,6 @@ def __init__(self, timeline: Timeline, status: Optional[Status]):
             if status else ())
         return super().__init__(widget_list)
 
-    def linkify_content(self, text) -> urwid.Widget:
-        if not has_urwidgets:
-            return urwid.Text(("link", text))
-        TRANSFORM = {
-            # convert http[s] URLs to Hyperlink widgets for nesting in a TextEmbed widget
-            re.compile(r'(https?://[^\s]+)'):
-                lambda g: (len(g[1]), urwid.Filler(Hyperlink(urlencode_url(g[1]), "link", g[1]))),
-        }
-        markup_list = []
-
-        markup_list.append(parse_text(text, TRANSFORM,
-            lambda pattern, groups, span: TRANSFORM[pattern](groups)))
-        return TextEmbed(markup_list, align='left')
-
     def content_generator(self, status, reblogged_by):
         if reblogged_by:
             text = "♺ {} boosted".format(reblogged_by.display_name or reblogged_by.username)
@@ -368,7 +351,7 @@ def content_generator(self, status, reblogged_by):
                     yield ("pack", urwid.Text([("bold", "Media attachment"), " (", m["type"], ")"]))
                     if m["description"]:
                         yield ("pack", urwid.Text(m["description"]))
-                    yield ("pack", self.linkify_content(m["url"]))
+                    yield ("pack", url_to_widget(m["url"]))
 
             poll = status.original.data.get("poll")
             if poll:
@@ -428,7 +411,7 @@ def card_generator(self, card):
         if card["description"]:
             yield urwid.Text(card["description"].strip())
             yield urwid.Text("")
-        yield self.linkify_content(card["url"])
+        yield url_to_widget(card["url"])
 
     def poll_generator(self, poll):
         for idx, option in enumerate(poll["options"]):

From 57cfd41613822bfa54a2e83639096795c0ed28ca Mon Sep 17 00:00:00 2001
From: Ivan Habunek 
Date: Thu, 16 Nov 2023 11:49:48 +0100
Subject: [PATCH 13/15] Remove old stubs

---
 .flake8                           |  1 -
 toot/tui/stubs/stub_hyperlink.py  | 25 -------------------------
 toot/tui/stubs/stub_text_embed.py | 21 ---------------------
 toot/tui/stubs/urwidgets.py       |  8 --------
 toot/tui/urwidgets.py             |  8 --------
 5 files changed, 63 deletions(-)
 delete mode 100644 toot/tui/stubs/stub_hyperlink.py
 delete mode 100644 toot/tui/stubs/stub_text_embed.py
 delete mode 100644 toot/tui/stubs/urwidgets.py
 delete mode 100644 toot/tui/urwidgets.py

diff --git a/.flake8 b/.flake8
index cc916ad8..6efbecd1 100644
--- a/.flake8
+++ b/.flake8
@@ -1,5 +1,4 @@
 [flake8]
 exclude=build,tests,tmp,venv,toot/tui/scroll.py
 ignore=E128,W503
-per-file-ignores=toot/tui/stubs/urwidgets.py:F401
 max-line-length=120
diff --git a/toot/tui/stubs/stub_hyperlink.py b/toot/tui/stubs/stub_hyperlink.py
deleted file mode 100644
index aa0488de..00000000
--- a/toot/tui/stubs/stub_hyperlink.py
+++ /dev/null
@@ -1,25 +0,0 @@
-__all__ = ("Hyperlink",)
-
-import urwid
-
-
-class Hyperlink(urwid.WidgetWrap):
-    def __init__(self, uri, attr, text):
-        pass
-
-    def render(self, size, focus):
-        return None
-
-
-class HyperlinkCanvas(urwid.Canvas):
-    def __init__(self, uri: str, text_canv: urwid.TextCanvas):
-        pass
-
-    def cols(self):
-        return 0
-
-    def content(self, *args, **kwargs):
-        yield [None]
-
-    def rows(self):
-        return 0
diff --git a/toot/tui/stubs/stub_text_embed.py b/toot/tui/stubs/stub_text_embed.py
deleted file mode 100644
index 622b5f7f..00000000
--- a/toot/tui/stubs/stub_text_embed.py
+++ /dev/null
@@ -1,21 +0,0 @@
-__all__ = ("parse_text", "TextEmbed")
-
-import urwid
-
-
-class TextEmbed(urwid.Text):
-    def get_text(self):
-        return None
-
-    def render(self, size, focus):
-        return None
-
-    def set_text(self, markup):
-        pass
-
-    def set_wrap_mode(self, mode):
-        pass
-
-
-def parse_text(text, patterns, repl, *repl_args, **repl_kwargs):
-    return None
diff --git a/toot/tui/stubs/urwidgets.py b/toot/tui/stubs/urwidgets.py
deleted file mode 100644
index 92737d39..00000000
--- a/toot/tui/stubs/urwidgets.py
+++ /dev/null
@@ -1,8 +0,0 @@
-# If urwidgets is loaded use it; otherwise use our stubs
-try:
-    from urwidgets import Hyperlink, TextEmbed, parse_text
-    has_urwidgets = True
-except ImportError:
-    from .stub_hyperlink import Hyperlink
-    from .stub_text_embed import TextEmbed, parse_text
-    has_urwidgets = False
diff --git a/toot/tui/urwidgets.py b/toot/tui/urwidgets.py
deleted file mode 100644
index ee731a82..00000000
--- a/toot/tui/urwidgets.py
+++ /dev/null
@@ -1,8 +0,0 @@
-# If urwidgets is loaded use it; otherwise use our stubs
-try:
-    from urwidgets import Hyperlink, TextEmbed, parse_text  # noqa: F401
-    has_urwidgets = True
-except ImportError:
-    from .stub_hyperlink import Hyperlink  # noqa: F401
-    from .stub_text_embed import TextEmbed, parse_text  # noqa: F401
-    has_urwidgets = False

From bc542b5e37d2af3de0b4c71bdffa4803096f3e41 Mon Sep 17 00:00:00 2001
From: Ivan Habunek 
Date: Thu, 16 Nov 2023 11:51:11 +0100
Subject: [PATCH 14/15] Add richtext package

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 0739408b..c0574c67 100644
--- a/setup.py
+++ b/setup.py
@@ -31,7 +31,7 @@
         'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
         'Programming Language :: Python :: 3',
     ],
-    packages=['toot', 'toot.tui', 'toot.utils'],
+    packages=['toot', 'toot.tui', 'toot.tui.richtext', 'toot.utils'],
     python_requires=">=3.7",
     install_requires=[
         "requests>=2.13,<3.0",

From 414d9e8ff2bbae999af92696c02e89eae95dc800 Mon Sep 17 00:00:00 2001
From: Ivan Habunek 
Date: Thu, 16 Nov 2023 12:29:37 +0100
Subject: [PATCH 15/15] Start testing richtext

---
 tests/tui/test_rich_text.py | 45 +++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 tests/tui/test_rich_text.py

diff --git a/tests/tui/test_rich_text.py b/tests/tui/test_rich_text.py
new file mode 100644
index 00000000..68453fa8
--- /dev/null
+++ b/tests/tui/test_rich_text.py
@@ -0,0 +1,45 @@
+from urwid import Divider, Filler, Pile
+from toot.tui.richtext import url_to_widget
+from urwidgets import Hyperlink, TextEmbed
+
+from toot.tui.richtext.richtext import html_to_widgets
+
+
+def test_url_to_widget():
+    url = "http://foo.bar"
+    embed_widget = url_to_widget(url)
+    assert isinstance(embed_widget, TextEmbed)
+
+    [(filler, length)] = embed_widget.embedded
+    assert length == len(url)
+    assert isinstance(filler, Filler)
+
+    link_widget: Hyperlink = filler.base_widget
+    assert isinstance(link_widget, Hyperlink)
+
+    assert link_widget.attrib == "link"
+    assert link_widget.text == url
+    assert link_widget.uri == url
+
+
+def test_html_to_widgets():
+    html = """
+    <p>foo</p>
+    <p>foo <b>bar</b> <i>baz</i></p>
+    """.strip()
+
+    [foo, divider, bar] = html_to_widgets(html)
+
+    assert isinstance(foo, Pile)
+    assert isinstance(divider, Divider)
+    assert isinstance(bar, Pile)
+
+    [foo_embed] = foo.widget_list
+    assert foo_embed.embedded == []
+    assert foo_embed.attrib == []
+    assert foo_embed.text == "foo"
+
+    [bar_embed] = bar.widget_list
+    assert bar_embed.embedded == []
+    assert bar_embed.attrib == [(None, 4), ("b", 3), (None, 1), ("i", 3)]
+    assert bar_embed.text == "foo bar baz"