diff --git a/README.md b/README.md index 2bd7369..b60ab72 100644 --- a/README.md +++ b/README.md @@ -50,12 +50,16 @@ pdm add telegramify-markdown -G mermaid - If you are developing an *LLM application* or need to send potentially **super-long text**, please check:[playground/telegramify_case.py](https://github.com/sudoskys/telegramify-markdown/blob/main/playground/telegramify_case.py) -We have two main functions: `markdownify` and `telegramify`. +- If you want to write TelegramV2 format text directly in bot, please check:[playground/standardize_case.py](https://github.com/sudoskys/telegramify-markdown/blob/main/playground/standardize_case.py) -`markdownify`: Just converts raw Markdown text to Telegram's MarkdownV2 format. +We have three main functions: `markdownify`, `telegramify`, and `standardize`. + +`markdownify`: Just converts raw Markdown text to Telegram's MarkdownV2 format, used for LLM like ChatGPT. `telegramify`: Spilt long text into multiple chunks, convert format and use Interpreter to render code block to File, -Image etc. +Image etc, used for LLM bot developers who want do more with Telegram's MarkdownV2 format. + +`standardize`: Convert unstandardized Telegram's MarkdownV2 format to standardized format(convenient for bot developers write something directly in bot). > `Interpreter` can be easily customized to inspect the rendering process in `telegramify`. @@ -155,6 +159,47 @@ print(converted) please check: [playground/telegramify_case.py](https://github.com/sudoskys/telegramify-markdown/blob/main/playground/telegramify_case.py) +### `standardize` + +```python +import telegramify_markdown +from telegramify_markdown.customize import get_runtime_config + +# Customize symbols (optional) +markdown_symbol = get_runtime_config().markdown_symbol +markdown_symbol.head_level_1 = "๐Ÿ“Œ" # Customize the first level title symbol +markdown_symbol.link = "๐Ÿ”—" # Customize the link symbol + +# Telegram MarkdownV2 format text +telegram_v2 = r""" +# Title +*bold \*text* +_italic \*text_ +__underline__ +~strikethrough~ +||spoiler|| +*bold _italic bold ~italic bold strikethrough ||italic bold strikethrough spoiler||~ __underline italic bold___ bold* + +```python +pre-formatted fixed-width code block written in the Python programming language +``` + +>Block quotation started +>Block quotation continued +>The last line of the block quotation +""" + +# Standardize processing +converted = telegramify_markdown.standardize(telegram_v2) + +# Send to Telegram +bot.send_message( + chat_id, + converted, + parse_mode="MarkdownV2" # Must use MarkdownV2 parsing mode +) +``` + ## ๐Ÿ”จ Supported Input - [x] Headings (Levels 1-6) diff --git a/playground/_types.py b/feature-test/_types.py similarity index 100% rename from playground/_types.py rename to feature-test/_types.py diff --git a/playground/max_line.py b/feature-test/debug_maxline.py similarity index 74% rename from playground/max_line.py rename to feature-test/debug_maxline.py index 2018f08..ab861bb 100644 --- a/playground/max_line.py +++ b/feature-test/debug_maxline.py @@ -1,7 +1,7 @@ import telegramify_markdown -from telegramify_markdown import customize +from telegramify_markdown.customize import get_runtime_config -customize.strict_markdown = True +get_runtime_config().strict_markdown = True md = """ [Treating Otitis Externa in Dogs | Today's Veterinary Practice](https://todaysveterinarypractice.com/dermatology/treating-otitis-externa-in-dogs/) diff --git a/playground/inspect_markdownify.py b/feature-test/inspect_markdownify.py similarity index 100% rename from playground/inspect_markdownify.py rename to feature-test/inspect_markdownify.py diff --git a/playground/re_t.py b/feature-test/inspect_regex.py similarity index 100% rename from playground/re_t.py rename to feature-test/inspect_regex.py diff --git a/playground/inspect_telegramify.py b/feature-test/inspect_telegramify.py similarity index 82% rename from playground/inspect_telegramify.py rename to feature-test/inspect_telegramify.py index 76f6652..fb32135 100644 --- a/playground/inspect_telegramify.py +++ b/feature-test/inspect_telegramify.py @@ -7,11 +7,13 @@ from telebot import TeleBot import telegramify_markdown -from telegramify_markdown.customize import markdown_symbol +from telegramify_markdown.customize import get_runtime_config from telegramify_markdown.type import ContentTypes +markdown_symbol = get_runtime_config().markdown_symbol + tips = """ -telegramify_markdown.telegramify +telegramify_markdown.telegramify The stability of telegramify_markdown.telegramify is unproven, please keep good log records. @@ -25,7 +27,7 @@ markdown_symbol.head_level_1 = "๐Ÿ“Œ" # If you want, Customizing the head level 1 symbol markdown_symbol.link = "๐Ÿ”—" # If you want, Customizing the link symbol -md = pathlib.Path(__file__).parent.joinpath("t_longtext.md").read_text(encoding="utf-8") +md = pathlib.Path(__file__).parent.parent.joinpath('playground').joinpath("t_longtext.md").read_text(encoding="utf-8") async def main(): diff --git a/feature-test/latex_render_case2.py b/feature-test/latex_render_case2.py new file mode 100644 index 0000000..7e323c7 --- /dev/null +++ b/feature-test/latex_render_case2.py @@ -0,0 +1,15 @@ + +import matplotlib.pyplot as plt + +from matplotlib import rcParams +rcParams['text.usetex'] = True + +txte = r"The \emph{characteristic polynomial} $\chi(\lambda)$ of the $3 \times 3$~matrix \\ $\left( \begin{array}{ccc} a & b & c \\ d & e & f \\g & h & i \end{array} \right) $ \\is given by the formula\\ $ \chi(\lambda) = \left| \begin{array}{ccc} \lambda - a & -b & -c \\ -d & \lambda - e & -f \\ -g & -h & \lambda - i \end{array} \right|. $" + + +plt.text(0.0, 0.0, txte, fontsize=14) +ax = plt.gca() +ax.axes.get_xaxis().set_visible(False) +ax.axes.get_yaxis().set_visible(False) + +plt.show() \ No newline at end of file diff --git a/playground/markdownify_case.py b/playground/markdownify_case.py index 5e9954e..efeaaab 100644 --- a/playground/markdownify_case.py +++ b/playground/markdownify_case.py @@ -6,8 +6,12 @@ import telegramify_markdown +customize = telegramify_markdown.customize.get_runtime_config() # Get the global Customize singleton instance + # Customize the markdownify -telegramify_markdown.customize.strict_markdown = False # we need send underline text +customize.strict_markdown = False # treat `__` as underline instead of bold +print("strict_markdown:", customize.strict_markdown) + # Test html tags html_t = telegramify_markdown.markdownify( "Hello, World! HTML: <strong>Hello, World!</strong>", @@ -68,28 +72,27 @@ '\_', '\*', '\[', '\]', '\(', '\)', '\~', '\`', '\>', '\#', '\+', '\-', '\=', '\|', '\{', '\}', '\.', '\!' _ , * , [ , ] , ( , ) , ~ , ` , > , # , + , - , = , | , { , } , . , ! We will remove the \ symbol from the original text. -**bold text** *bold text* _italic text_ __underline__ -~no valid strikethrough~ -~~strikethrough~~ +~strikethrough~ ||spoiler|| *bold _italic bold ~~italic bold strikethrough ||italic bold strikethrough spoiler||~~ __underline italic bold___ bold* -__underline italic bold__ +*bold & _italic & __underline & ~~strikethrough & ||spoiler||~~__**_* +__underline _italic *bold*_**__ [link](https://www.google.com) - [ ] Uncompleted task list item - [x] Completed task list item > Quote ->Multiline Quote In Markdown it's not possible to send multiline quote in telegram without using code block or html tag but telegramify_markdown can do it. +>Multiline Quote In Markdown it's not possible to send multiline quote in telegram without using code block or html tag but telegramify_markdown can do it. --- Text Text Text -> If you quote is too long, it will be automatically set in expandable citation. +> If you quote is too long, it will be automatically set in expandable citation. > This is the second line of the quote. > `This is the third line of the quote.` > This is the fourth line of the quote. @@ -124,7 +127,7 @@ latex_escape=True ) print(converted) -# export Markdown to Telegram MarkdownV2 style. + load_dotenv() telegram_bot_token = os.getenv("TELEGRAM_BOT_TOKEN", None) chat_id = os.getenv("TELEGRAM_CHAT_ID", None) @@ -132,5 +135,62 @@ bot.send_message( chat_id, converted, - parse_mode="MarkdownV2" # IMPORTANT: Need Send in MarkdownV2 Mode. + parse_mode="MarkdownV2" # IMPORTANT: Must be sent with "MarkdownV2" parse mode ) + +MARKDOWN_SYNTAX = r""" +__bold__ +**bold text** +_italic text_ +*italic text* +~~strikethrough~~ +~not a strikethrough~ +||spoiler|| +""" +# **bold & _italic & __underline & ~strikethrough & ||spoiler||~__****_** + +TELEGRAM_SYNTAX = r""" +__underline__ +*bold text* +_italic text_ +~strikethrough~ +~~not a strikethrough~~ +||spoiler|| +""" +# *bold & _italic & __underline & ~~strikethrough & ||spoiler||~~__**_* + + +SYNTAX: dict[str, dict[str, list[str]]] = { + "MARKDOWN": { + "underline": [], # Not supported in Markdown + "bold": ["__", "**"], + "italic": ["_", "*"], + "strike": ["~~"], + }, + "TELEGRAM": { + "underline": ["__"], + "bold": ["*"], + "italic": ["_"], + "strike": ["~"], + } +} + +def generate_expected_strings(syntax: str, func: callable) -> str: + strings = [] + for key, values in SYNTAX[syntax].items(): + for token in values: + wrap = lambda x: token + x + token + input = wrap("text") + output = func(input).strip() + print(f" {wrap(key):14} => {func(wrap(key)).strip()}") + strings.append(func(f"- Given `{input}`, which becomes `{output}`, we get: {wrap(key)}")) + return "".join(strings) + + +def generate_output_string(syntax: str, func: callable) -> str: + case = telegramify_markdown.markdownify(f"\n# {syntax} syntax") + print("\n" + case.replace("\\", "").strip()) + return case + generate_expected_strings(syntax, func) + +bot.send_message(chat_id, generate_output_string("MARKDOWN", telegramify_markdown.markdownify) + telegramify_markdown.markdownify(MARKDOWN_SYNTAX), parse_mode="MarkdownV2") +bot.send_message(chat_id, generate_output_string("TELEGRAM", telegramify_markdown.standardize) + telegramify_markdown.standardize(TELEGRAM_SYNTAX), parse_mode="MarkdownV2") diff --git a/playground/use_case.py b/playground/simple_case.py similarity index 88% rename from playground/use_case.py rename to playground/simple_case.py index f374441..324af4d 100644 --- a/playground/use_case.py +++ b/playground/simple_case.py @@ -1,7 +1,7 @@ -import re - import telegramify_markdown -from telegramify_markdown.customize import markdown_symbol +from telegramify_markdown.customize import get_runtime_config + +markdown_symbol = get_runtime_config().markdown_symbol markdown_symbol.head_level_1 = "๐Ÿ“Œ" # If you want, Customizing the head level 1 symbol markdown_symbol.link = "๐Ÿ”—" # If you want, Customizing the link symbol @@ -18,7 +18,7 @@ >1231 """ -test_md = """ +test_md = r""" **bold text** ||spoiler|| """ @@ -34,7 +34,7 @@ $ f(x) = \frac{1}{x} $ """ -emoji=""" +emoji = r""" [inline URL](http://www.example.com/) [inline mention of a user](tg://user?id=123456789) ![๐Ÿ‘](tg://emoji?id=5368324170671202286) @@ -45,5 +45,3 @@ converted = telegramify_markdown.markdownify(emoji) print(converted) - - diff --git a/playground/standardize_case.py b/playground/standardize_case.py new file mode 100644 index 0000000..3dd58cf --- /dev/null +++ b/playground/standardize_case.py @@ -0,0 +1,57 @@ +import os + +from dotenv import load_dotenv +from telebot import TeleBot + +import telegramify_markdown +from telegramify_markdown.customize import get_runtime_config + +markdown_symbol = get_runtime_config().markdown_symbol +markdown_symbol.head_level_1 = "๐Ÿ“Œ" # If you want, Customizing the head level 1 symbol +markdown_symbol.link = "๐Ÿ”—" # If you want, Customizing the link symbol +telegram_v2=r""" +# Title +*bold \*text* +_italic \*text_ +__underline__ +~strikethrough~ +||spoiler|| +*bold _italic bold ~italic bold strikethrough ||italic bold strikethrough spoiler||~ __underline italic bold___ bold* +[inline URL](http://www.example.com/) +[inline mention of a user](tg://user?id=123456789) +![๐Ÿ‘](tg://emoji?id=5368324170671202286) +`inline fixed-width code` +``` +pre-formatted fixed-width code block +``` +```python +pre-formatted fixed-width code block written in the Python programming language +``` + +>Block quotation started +>Block quotation continued +>Block quotation continued +>Block quotation continued +>The last line of the block quotation + + +**>The expandable block quotation started right after the previous block quotation +>It is separated from the previous block quotation by an empty bold entity +>Expandable block quotation continued +>Hidden by default part of the expandable block quotation started +>Expandable block quotation continued +>The last line of the expandable block quotation with the expandability mark|| +""" + +converted = telegramify_markdown.standardize(telegram_v2) + +# Compare Line +load_dotenv() +telegram_bot_token = os.getenv("TELEGRAM_BOT_TOKEN", None) +chat_id = os.getenv("TELEGRAM_CHAT_ID", None) +bot = TeleBot(telegram_bot_token) +bot.send_message( + chat_id, + converted, + parse_mode="MarkdownV2" # IMPORTANT: Must be sent with "MarkdownV2" parse mode +) diff --git a/playground/t_all_test.md b/playground/t_text.md similarity index 100% rename from playground/t_all_test.md rename to playground/t_text.md diff --git a/playground/telegramify_case.py b/playground/telegramify_case.py index 2b55c9e..b6d4761 100644 --- a/playground/telegramify_case.py +++ b/playground/telegramify_case.py @@ -7,12 +7,12 @@ from telebot import TeleBot import telegramify_markdown -from telegramify_markdown.customize import markdown_symbol +from telegramify_markdown.customize import get_runtime_config from telegramify_markdown.interpreters import BaseInterpreter, MermaidInterpreter from telegramify_markdown.type import ContentTypes tips = """ -telegramify_markdown.telegramify +telegramify_markdown.telegramify The stability of telegramify_markdown.telegramify is unproven, please keep good log records. @@ -25,8 +25,8 @@ bot = TeleBot(telegram_bot_token) # Customizing global rendering options -markdown_symbol.head_level_1 = "๐Ÿ“Œ" # If you want, Customizing the head level 1 symbol -markdown_symbol.link = "๐Ÿ”—" # If you want, Customizing the link symbol +get_runtime_config().markdown_symbol.head_level_1 = "๐Ÿ“Œ" # If you want, Customizing the head level 1 symbol +get_runtime_config().markdown_symbol.link = "๐Ÿ”—" # If you want, Customizing the link symbol md = pathlib.Path(__file__).parent.joinpath("t_longtext.md").read_text(encoding="utf-8") diff --git a/pyproject.toml b/pyproject.toml index af10755..2a7887f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "telegramify-markdown" -version = "0.4.3" +version = "0.4.4" description = "Makes it easy to send Markdown in Telegram MarkdownV2 style" authors = [ { name = "sudoskys", email = "coldlando@hotmail.com" }, diff --git a/src/telegramify_markdown/__init__.py b/src/telegramify_markdown/__init__.py index 2e2c54a..8f10e1b 100644 --- a/src/telegramify_markdown/__init__.py +++ b/src/telegramify_markdown/__init__.py @@ -7,12 +7,10 @@ from mistletoe.span_token import SpanToken # noqa from . import customize -from .interpreters import Text, File, Photo, BaseInterpreter, MermaidInterpreter +from .interpreters import BaseInterpreter, MermaidInterpreter from .latex_escape.const import LATEX_SYMBOLS, NOT_MAP, LATEX_STYLES from .latex_escape.helper import LatexToUnicodeHelper -from .logger import logger -from .mime import get_filename -from .render import TelegramMarkdownRenderer, escape_markdown +from .render import TelegramMarkdownRenderer, escape_markdown, TelegramMarkdownFormatter from .type import Text, File, Photo, ContentTypes __all__ = [ @@ -78,7 +76,7 @@ def _update_text(token: Union[SpanToken, BlockToken]): pass else: if hasattr(token, "content"): - token.content = escape_markdown(token.content, unescape_html=customize.unescape_html) + token.content = escape_markdown(token.content, unescape_html=customize.get_runtime_config().unescape_html) def _update_block(token: BlockToken): @@ -133,15 +131,15 @@ async def telegramify( content = escape_latex(content) document = mistletoe.Document(content) document2 = mistletoe.Document(content) - # ๅชๆ›ดๆ–ฐ็ฌฌไธ€ไธชๆ–‡ๆกฃ๏ผŒๅ› ไธบๆˆ‘ไปฌ่ฆๅ€’ๆŸฅ็ฌฌไบŒไธชๆ–‡ๆกฃ็š„ๅ†…ๅฎน + # Only update the first document, because we need to check the content of the second document _update_block(document) - # ่งฃ็ฆป Token + # Disconnect the Token tokens = list(document.children) tokens2 = list(document2.children) if len(tokens) != len(tokens2): raise ValueError("Token length mismatch") - # ๅฏนๅ†…ๅฎน่ฟ›่กŒๅˆ†ๅ—ๆธฒๆŸ“ + # Split the content into blocks def is_over_max_word_count(doc_t: List[Tuple[Any, Any]]): doc = mistletoe.Document(lines=[]) doc.children = [___token for ___token, ___token2 in doc_t] @@ -160,9 +158,9 @@ def render_lines(lines: str): _stack = [] _packed = [] - # ๆญฅ่ฟ›ๆŽจ้€ + # Step by step push for _token, _token2 in zip(tokens, tokens2): - # ่ฎก็ฎ—ๅฆ‚ๆžœๆŽจ้€ๅฝ“ๅ‰ Token ๆ˜ฏๅฆไผš่ถ…่ฟ‡ๆœ€ๅคงๅญ—ๆ•ฐ้™ๅˆถ + # Calculate if pushing the current token will exceed the maximum word count limit if is_over_max_word_count(_stack + [(_token, _token2)]): _packed.append(_stack) _stack = [(_token, _token2)] @@ -200,6 +198,35 @@ def render_lines(lines: str): )) return _rendered +def standardize( + content: str, + *, + max_line_length: int = None, + normalize_whitespace=False, + latex_escape: bool = True, +) -> str: + """ + Convert Unstandardized Telegram MarkdownV2 Syntax to Standardized Telegram MarkdownV2 Syntax. + Used for replace the Telegram MarkdownV2 Syntax Builder. + + **Showcase** https://github.com/sudoskys/telegramify-markdown/blob/main/playground/standardize_case.py + + :param content: The markdown content to convert. + :param max_line_length: The maximum length of a line. + :param normalize_whitespace: Whether to normalize whitespace. + :param latex_escape: Whether to make LaTeX content readable in Telegram. + :return: The Telegram markdown formatted content. **Need Send in MarkdownV2 Mode.** + """ + with TelegramMarkdownFormatter( + max_line_length=max_line_length, + normalize_whitespace=normalize_whitespace + ) as renderer: + if latex_escape: + content = escape_latex(content) + document = mistletoe.Document(content) + _update_block(document) + result = renderer.render(document) + return result def markdownify( content: str, @@ -209,7 +236,7 @@ def markdownify( latex_escape: bool = True, ) -> str: """ - Convert markdown str to Telegram Markdown format. + Convert Standardized Markdown to Standardized Telegram MarkdownV2 Syntax. **Showcase** https://github.com/sudoskys/telegramify-markdown/blob/main/playground/markdownify_case.py @@ -219,7 +246,6 @@ def markdownify( :param latex_escape: Whether to make LaTeX content readable in Telegram. :return: The Telegram markdown formatted content. **Need Send in MarkdownV2 Mode.** """ - _rendered = [] with TelegramMarkdownRenderer( max_line_length=max_line_length, normalize_whitespace=normalize_whitespace diff --git a/src/telegramify_markdown/customize.py b/src/telegramify_markdown/customize.py index 9f9110b..f61ba4f 100644 --- a/src/telegramify_markdown/customize.py +++ b/src/telegramify_markdown/customize.py @@ -1,28 +1,75 @@ -class Symbol(object): - head_level_1 = "\N{PUSHPIN}" - # "๐Ÿ“Œ" - head_level_2 = "\N{PENCIL}" - # "โœ๏ธ" - head_level_3 = "\N{BOOKS}" - # "๐Ÿ“š" - head_level_4 = "\N{BOOKMARK}" - # "๐Ÿ”–" - image = "\N{FRAME WITH PICTURE}" - # "๐Ÿ–ผ" - link = "\N{LINK SYMBOL}" - # "๐Ÿ”—" - task_completed = "\N{WHITE HEAVY CHECK MARK}" - # "โœ…" - task_uncompleted = "\N{BALLOT BOX WITH CHECK}" - # "โ˜‘๏ธ" - - -# NOTE: Settings that are not part of global rendering **are not allowed** to be stored here!! -# Prioritize function parameter passing to ensure definability - -# Markdown options -markdown_symbol = Symbol() -# Rendering options -cite_expandable = True +def singleton(cls): + """Singleton pattern decorator""" + instances = {} + + def get_instance(*args, **kwargs): + if cls not in instances: + instances[cls] = cls(*args, **kwargs) + return instances[cls] + + return get_instance + + +class Symbol: + def __init__(self): + self.head_level_1: str = "\N{PUSHPIN}" # ๐Ÿ“Œ + self.head_level_2: str = "\N{PENCIL}" # โœ๏ธ + self.head_level_3: str = "\N{BOOKS}" # ๐Ÿ“š + self.head_level_4: str = "\N{BOOKMARK}" # ๐Ÿ”– + self.image: str = "\N{FRAME WITH PICTURE}" # ๐Ÿ–ผ + self.link: str = "\N{LINK SYMBOL}" # ๐Ÿ”— + self.task_completed: str = "\N{WHITE HEAVY CHECK MARK}" # โœ… + self.task_uncompleted: str = "\N{BALLOT BOX WITH CHECK}" # โ˜‘๏ธ + + +@singleton +class RenderConfig: + def __init__(self): + self._markdown_symbol = Symbol() + self._cite_expandable = True + self._strict_markdown = True + self._unescape_html = False + + @property + def markdown_symbol(self) -> Symbol: + return self._markdown_symbol + + @property + def cite_expandable(self) -> bool: + return self._cite_expandable + + @cite_expandable.setter + def cite_expandable(self, value: bool): + self._cite_expandable = value + + @property + def unescape_html(self) -> bool: + return self._unescape_html + + @unescape_html.setter + def unescape_html(self, value: bool): + self._unescape_html = value + + +# Global accessor function for accessing the CustomConfig singleton +def get_runtime_config() -> RenderConfig: + return RenderConfig() + + +# NOTE: Deprecated global variables included below for backward compatibility. +# These should be removed in the next major version bump following the deprecation warning period. +import inspect # noqa: E402 +import warnings # noqa: E402 +warnings.simplefilter("default", DeprecationWarning) + +warnings.warn_explicit("markdown_symbol is deprecated, and will be removed in a future release. Use get_config().markdown_symbol instead", DeprecationWarning, filename=__file__, lineno=inspect.currentframe().f_lineno + 1) +markdown_symbol = get_runtime_config().markdown_symbol + +warnings.warn_explicit("cite_expandable is deprecated, and will be removed in a future release. Use get_config().cite_expandable instead", DeprecationWarning, filename=__file__, lineno=inspect.currentframe().f_lineno + 1) +cite_expandable = get_runtime_config().cite_expandable + +warnings.warn_explicit("strict_markdown is deprecated, and will be removed in a future release. Use get_config().strict_markdown instead", DeprecationWarning, filename=__file__, lineno=inspect.currentframe().f_lineno + 1) strict_markdown = True -unescape_html = False + +warnings.warn_explicit("unescape_html is deprecated, and will be removed in a future release. Use get_config().unescape_html instead", DeprecationWarning, filename=__file__, lineno=inspect.currentframe().f_lineno + 1) +unescape_html = get_runtime_config().unescape_html diff --git a/src/telegramify_markdown/latex_escape/helper.py b/src/telegramify_markdown/latex_escape/helper.py index c1598ce..5ef3176 100644 --- a/src/telegramify_markdown/latex_escape/helper.py +++ b/src/telegramify_markdown/latex_escape/helper.py @@ -1,12 +1,10 @@ import re from logging import getLogger - -logger = getLogger(__name__) - from telegramify_markdown.latex_escape.const import ( COMBINING, CombiningType, NOT_MAP, SUBSCRIPTS, SUPERSCRIPTS, LATEX_STYLES, FRAC_MAP, LATEX_SYMBOLS ) +logger = getLogger(__name__) class LatexToUnicodeHelper: @staticmethod @@ -151,11 +149,15 @@ def translate_escape(name): return LATEX_SYMBOLS.get(name, name) def parse(self, latex): - # ่งฃๆžๅนถ่ฝฌๆข LaTeX ๅญ—็ฌฆไธฒไธบ Unicode + # Parse and convert LaTeX string to Unicode result, i = [], 0 while i < len(latex): if latex[i] == '\\': command, i = self.parse_command(latex, i) + # Check if it is a mixed fraction format (a number followed directly by \frac) + if command == "\\frac" and result and result[-1] and result[-1][-1].isdigit(): + # Add a space between the number and the fraction + result[-1] = result[-1] + " " handled, i = self.handle_command(command, latex, i) result.append(handled) elif latex[i] == '{': diff --git a/src/telegramify_markdown/render.py b/src/telegramify_markdown/render.py index b906d3d..96ecd87 100644 --- a/src/telegramify_markdown/render.py +++ b/src/telegramify_markdown/render.py @@ -5,15 +5,32 @@ from telegramify_markdown import markdown from mistletoe import span_token, block_token -from mistletoe.markdown_renderer import MarkdownRenderer, LinkReferenceDefinition, Fragment - -from .customize import markdown_symbol, strict_markdown, cite_expandable +from mistletoe.markdown_renderer import ( + MarkdownRenderer, + LinkReferenceDefinition, + Fragment, +) +from .customize import get_runtime_config class Spoiler(span_token.SpanToken): + """ + Spoiler token. ("||some text||") + This is an inline token. Its children are inline (span) tokens. + """ + pattern = re.compile(r"(? str: # First pass to escape all markdown special characters escaped_content = re.sub(r"([_*\[\]()~`>\#\+\-=|{}\.!\\])", r"\\\1", content) # Second pass to remove double escaping - final_content = re.sub(r"\\\\([_*\[\]()~`>\#\+\-=|{}\.!\\])", r"\\\1", escaped_content) + final_content = re.sub( + r"\\\\([_*\[\]()~`>\#\+\-=|{}\.!\\])", r"\\\1", escaped_content + ) return final_content @@ -92,22 +112,21 @@ def validate_telegram_emoji(url: str) -> bool: class TelegramMarkdownRenderer(MarkdownRenderer): - def __init__(self, *extras, **kwargs): super().__init__( *chain( ( Spoiler, - TaskListItem, + TaskListItem ), - extras + extras, ) ) self.render_map["Spoiler"] = self.render_spoiler self.render_map["TaskListItem"] = self.render_task_list_item def render_quote( - self, token: block_token.Quote, max_line_length: int + self, token: block_token.Quote, max_line_length: int ) -> Iterable[str]: def add_expanded_suffix(iterable: Iterable[str]) -> Iterable[str]: iterator = iter(iterable) @@ -127,7 +146,7 @@ def add_expanded_suffix(iterable: Iterable[str]) -> Iterable[str]: lines, counter = tee(lines) total_characters = sum(len(s) for s in counter) # NOTE: Remove the space after the > , but it is not standard markdown - append_expanded_cite = cite_expandable and total_characters > 200 + append_expanded_cite = get_runtime_config().cite_expandable and total_characters > 200 if append_expanded_cite: first_line_prefix = "**>" lines = add_expanded_suffix(lines) @@ -136,23 +155,25 @@ def add_expanded_suffix(iterable: Iterable[str]) -> Iterable[str]: yield from self.prefix_lines( lines or [""], first_line_prefix=first_line_prefix, - following_line_prefix=">" + following_line_prefix=">", ) def render_heading( - self, token: block_token.Heading, max_line_length: int + self, token: block_token.Heading, max_line_length: int ) -> Iterable[str]: # note: no word wrapping, because atx headings always fit on a single line. line = "" if token.level == 1: - line += markdown_symbol.head_level_1 + line += get_runtime_config().markdown_symbol.head_level_1 elif token.level == 2: - line += markdown_symbol.head_level_2 + line += get_runtime_config().markdown_symbol.head_level_2 elif token.level == 3: - line += markdown_symbol.head_level_3 + line += get_runtime_config().markdown_symbol.head_level_3 elif token.level == 4: - line += markdown_symbol.head_level_4 - text = next(self.span_to_lines(token.children, max_line_length=max_line_length), "") + line += get_runtime_config().markdown_symbol.head_level_4 + text = next( + self.span_to_lines(token.children, max_line_length=max_line_length), "" + ) if text: line += " " + text if token.closing_sequence: @@ -160,13 +181,11 @@ def render_heading( return [markdown.bold(line)] def render_fenced_code_block( - self, token: block_token.BlockCode, max_line_length: int + self, token: block_token.BlockCode, max_line_length: int ) -> Iterable[str]: indentation = " " * token.indentation yield indentation + token.delimiter + token.info_string - yield from self.prefix_lines( - token.content[:-1].split("\n"), indentation - ) + yield from self.prefix_lines(token.content[:-1].split("\n"), indentation) yield indentation + token.delimiter def render_inline_code(self, token: span_token.InlineCode) -> Iterable[Fragment]: @@ -174,67 +193,57 @@ def render_inline_code(self, token: span_token.InlineCode) -> Iterable[Fragment] return self.embed_span( Fragment(token.delimiter + token.padding + "\n"), token.children, - Fragment(token.padding + token.delimiter) + Fragment(token.padding + token.delimiter), ) return self.embed_span( Fragment(token.delimiter + token.padding), token.children, - Fragment(token.padding + token.delimiter) + Fragment(token.padding + token.delimiter), ) def render_block_code( - self, token: block_token.BlockCode, - max_line_length: int + self, token: block_token.BlockCode, max_line_length: int ) -> Iterable[str]: return [markdown.code(token.content)] def render_setext_heading( - self, token: block_token.SetextHeading, - max_line_length: int + self, token: block_token.SetextHeading, max_line_length: int ) -> Iterable[str]: yield from self.span_to_lines(token.children, max_line_length=max_line_length) yield markdown.escape("โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€") def render_emphasis(self, token: span_token.Emphasis) -> Iterable[Fragment]: - return super().render_emphasis(token) + return self.embed_span(Fragment("_"), token.children) def render_strong(self, token: span_token.Strong) -> Iterable[Fragment]: - if strict_markdown: - # Telegram strong: *text* - # Markdown strong: **text** or __text__ - return self.embed_span(Fragment('*'), token.children) - else: - # bold - if token.delimiter == "*": - return self.embed_span(Fragment(token.delimiter * 1), token.children) - # underline - return self.embed_span(Fragment(token.delimiter * 2), token.children) + return self.embed_span(Fragment("*"), token.children) def render_strikethrough( - self, token: span_token.Strikethrough + self, token: span_token.Strikethrough ) -> Iterable[Fragment]: return self.embed_span(Fragment("~"), token.children) def render_spoiler(self, token: Spoiler) -> Iterable[Fragment]: return self.embed_span(Fragment("||"), token.children) - def render_task_list_item(self, - token: TaskListItem, - max_line_length: int - ) -> Iterable[str]: - symbol = markdown_symbol.task_completed if token.checked else markdown_symbol.task_uncompleted + def render_task_list_item( + self, token: TaskListItem, max_line_length: int + ) -> Iterable[str]: + symbol = ( + get_runtime_config().markdown_symbol.task_completed + if token.checked + else get_runtime_config().markdown_symbol.task_uncompleted + ) if self.normalize_whitespace: indentation = 0 else: indentation = len(token.indentation) - lines = self.span_to_lines( - token.children, max_line_length=max_line_length - ) + lines = self.span_to_lines(token.children, max_line_length=max_line_length) space = " " * indentation return self.prefix_lines(lines or [""], f"{space}{symbol} ") def render_list_item( - self, token: block_token.ListItem, max_line_length: int + self, token: block_token.ListItem, max_line_length: int ) -> Iterable[str]: token_origin = str(token.leader).strip() if token_origin.endswith("."): @@ -246,14 +255,15 @@ def render_list_item( return super().render_list_item(token, max_line_length) def render_link_reference_definition( - self, token: LinkReferenceDefinition + self, token: LinkReferenceDefinition ) -> Iterable[Fragment]: yield from ( Fragment( - markdown_symbol.link + markdown.link( + get_runtime_config().markdown_symbol.link + + markdown.link( content=token.title if token.title else token.label, url=token.dest, - escape=True + escape=True, ) ), ) @@ -261,27 +271,25 @@ def render_link_reference_definition( def render_image(self, token: span_token.Image) -> Iterable[Fragment]: # tg://emoji?id=5368324170671202286 is a special case if not validate_telegram_emoji(token.src): - yield Fragment(markdown_symbol.image) + yield Fragment(get_runtime_config().markdown_symbol.image) yield from self.render_link_or_image(token, token.src) def render_link(self, token: span_token.Link) -> Iterable[Fragment]: return self.render_link_or_image(token, token.target) def render_link_or_image( - self, token: span_token.SpanToken, target: str + self, token: span_token.SpanToken, target: str ) -> Iterable[Fragment]: - title = next(self.span_to_lines(token.children, max_line_length=self.max_line_length), "") + title = next( + self.span_to_lines(token.children, max_line_length=self.max_line_length), "" + ) if token.dest_type == "uri" or token.dest_type == "angle_uri": # "[" description "](" dest_part [" " title] ")" # "[" description "](" dest_part [" " title] ")" if validate_telegram_emoji(target): - yield Fragment( - f'![{title}]({target})' - ) + yield Fragment(f"![{title}]({target})") else: - yield Fragment( - markdown.link(href=target, body=title) - ) + yield Fragment(markdown.link(href=target, body=title)) elif token.dest_type == "full": # "[" description "][" label "]" yield from ( @@ -291,24 +299,55 @@ def render_link_or_image( ) elif token.dest_type == "collapsed": # "[" description "][]" - yield Fragment(markdown.escape("[]")), + yield (Fragment(markdown.escape("[]")),) else: # "[" description "]" pass def render_auto_link(self, token: span_token.AutoLink) -> Iterable[Fragment]: - yield Fragment(markdown.escape("<") + token.children[0].content + markdown.escape(">")) + yield Fragment( + markdown.escape("<") + token.children[0].content + markdown.escape(">") + ) def render_escape_sequence( - self, token: span_token.EscapeSequence + self, token: span_token.EscapeSequence ) -> Iterable[Fragment]: - # ๆธฒๆŸ“่ฝฌไน‰ๅญ—็ฌฆ # because the escape_markdown already happened in the parser, we can skip it here. yield Fragment("" + token.children[0].content) def render_table( - self, token: block_token.Table, max_line_length: int + self, token: block_token.Table, max_line_length: int ) -> Iterable[str]: # note: column widths are not preserved; they are automatically adjusted to fit the contents. fs = super().render_table(token, max_line_length) return [markdown.code(markdown.escape("\n".join(fs)))] + + +class TelegramMarkdownFormatter(TelegramMarkdownRenderer): + def __init__(self, *extras, **kwargs): + super().__init__( + *chain( + ( + Spoiler, + TelegramStrikethrough, + TaskListItem, + ), + extras, + ) + ) + self.render_map["Spoiler"] = self.render_spoiler + self.render_map["TelegramStrikethrough"] = self.render_telegram_strikethrough + self.render_map["TaskListItem"] = self.render_task_list_item + + def render_telegram_strikethrough( + self, token: TelegramStrikethrough + ) -> Iterable[Fragment]: + return self.embed_span(Fragment("~"), token.children) + + def render_emphasis(self, token: span_token.Emphasis) -> Iterable[Fragment]: + return self.embed_span(Fragment(token.delimiter), token.children) + + def render_strong(self, token: span_token.Strong) -> Iterable[Fragment]: + if token.delimiter == "_": + return self.embed_span(Fragment(token.delimiter * 2), token.children) + return self.embed_span(Fragment(token.delimiter), token.children) diff --git a/tests/exp_test.py b/tests/exp_test.py index 050f430..6ac9549 100644 --- a/tests/exp_test.py +++ b/tests/exp_test.py @@ -5,7 +5,6 @@ import telegramify_markdown from server import server_t -from telegramify_markdown import markdownify load_dotenv() @@ -13,7 +12,12 @@ class TestCase(unittest.IsolatedAsyncioTestCase): async def test_markdownify(self): md = pathlib.Path(__file__).parent.joinpath("exp1.md").read_text(encoding="utf-8") - converted = markdownify(md) + converted = telegramify_markdown.markdownify(md) + self.assertEqual(server_t(converted), True) + + def test_standardize(self): + md = pathlib.Path(__file__).parent.joinpath("exp1.md").read_text(encoding="utf-8") + converted = telegramify_markdown.standardize(md) self.assertEqual(server_t(converted), True) async def test_telegramify(self):