2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "telegramify-markdown"
version = "0.5.3"
version = "0.5.4"
description = "Makes it easy to send Markdown in Telegram MarkdownV2 style"
authors = [
{ name = "sudoskys", email = "coldlando@hotmail.com" },
11 changes: 1 addition & 10 deletions src/telegramify_markdown/interpreters.py
@@ -16,7 +16,7 @@
SentType,
ContentTrace,
)
from telegramify_markdown.word_count import count_markdown, hard_split_markdown
from telegramify_markdown.word_count import count_markdown

if TYPE_CHECKING:
try:
@@ -236,15 +236,6 @@ def _hard_split(self, text: str, max_word_count: int) -> List[str]:
return chunks


if __name__ == "__main__":
# Test case
content = "".join([f"[a](http://example.com/{'a'*50})" for _ in range(10)])
interpreter = BaseInterpreter()
chunks = interpreter._hard_split(content, 50)
print(f"Hard split chunks count: {len(chunks)}")
assert len(chunks) < 5


class TextInterpreter(BaseInterpreter):
"""
Pure text interpreter, only return text type
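Note on the inline check removed above: it exercised `BaseInterpreter._hard_split` with link-heavy input. If that coverage is still wanted, a minimal pytest-style sketch could look like the following (assuming `BaseInterpreter` is importable from `telegramify_markdown.interpreters` as shown in this diff; the test name and placement are hypothetical, not part of this PR):

```python
# Hypothetical pytest relocation of the inline check removed from interpreters.py.
# Assumes BaseInterpreter._hard_split keeps the signature shown in the hunk header above.
from telegramify_markdown.interpreters import BaseInterpreter


def test_hard_split_link_heavy_content():
    # Ten links with long URLs but a single visible character each.
    content = "".join(f"[a](http://example.com/{'a' * 50})" for _ in range(10))
    chunks = BaseInterpreter()._hard_split(content, 50)
    # URLs do not count toward the effective length, so few chunks are expected.
    assert len(chunks) < 5
```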
81 changes: 42 additions & 39 deletions src/telegramify_markdown/word_count.py
@@ -1,56 +1,59 @@
import re
from typing import List

# Known limitation: The regex uses negative lookbehind (?<!\\) to skip escaped brackets.
# However, this does NOT correctly handle double backslash cases like `\\[` which should
# be treated as a valid link start (the backslash itself is escaped, not the bracket).
# This edge case is intentionally left unhandled for simplicity, as it's rare in practice.
_MARKDOWN_LINK_PATTERN = re.compile(
r"""
(?<!\\)\[ # match [, but not \[
(.*?) # url description (captured group \1)
(?<!\\)\] # match ], but not \]
\(
.*? # url content (not counted by Telegram)
(?<!\\)\) # match ), but not \)
""",
re.VERBOSE,
)


def count_markdown(md: str) -> int:
md = re.sub(r'''
(?<!\\)\[ # match [, but not match \[ (we just assume there won't be `\\[`)
(.*?) # url description, \1
(?<!\\)\] # similar as above
\(
.*? # url content
(?<!\\)\)
''',
r'[\1]()', # remove URL, because URL doesn't count as word count in Telegram
md, flags=re.X)
"""
Count the effective length of markdown text for Telegram.
Telegram does not count URL characters in links toward the message length limit.

:param md: Markdown text to count
:return: Effective character count
"""
# Replace [desc](url) with [desc]() to remove URL from count
md = _MARKDOWN_LINK_PATTERN.sub(r"[\1]()", md)
return len(md)

def hard_split_markdown(text: str, max_word_count: int) -> List[str]:
assert max_word_count > 0
"""
Hard split markdown text based on effective character count.
Uses iterative approximation to find optimal split points.

Note: This function is kept for API compatibility but is not currently used
internally. The BaseInterpreter._hard_split method provides similar functionality
with additional optimization for the interpreter context.

:param text: Text to split
:param max_word_count: Maximum effective character count per chunk
:return: List of text chunks
"""
if max_word_count <= 0:
raise ValueError("max_word_count must be positive")
chunks = []
while text:
limit = len(text)
round = 0
# Iteratively reduce limit until chunk fits within max_word_count
while limit > 0:
round += 1
c = count_markdown(text[:limit])
if c <= max_word_count:
break
limit -= (c - max_word_count)
# print(round, limit, c)

limit -= c - max_word_count
chunks.append(text[:limit])
text = text[limit:]

return chunks


if __name__ == "__main__":
# Test plain text split
assert len(hard_split_markdown("a" * 200, 100)) == 2

# Test expansion with large max_word_count
# [a](http://b) -> 13 chars, 5 visible
content = "[a](http://b)" * 100 # 1300 chars, 500 visible
# max_word_count = 1000.
# Should fit in 1 chunk (since 500 < 1000)
# But strictly by length, 1300 > 1000, so it would split if not for count_markdown logic
chunks = hard_split_markdown(content, 1000)
assert len(chunks) == 1
assert len(chunks[0]) == 1300

content = "[a](http://b)" * 10000 # 130000 chars, 50000 visible
chunks = hard_split_markdown(content, 500)
print(len(chunks))
assert len(chunks) == 100

print("hard_split_markdown passed")
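For readers of this diff, a small usage sketch of the new `count_markdown`: the numbers follow directly from replacing `[desc](url)` with `[desc]()` before measuring, and the `\\[` case is the limitation called out in the comment block above.

```python
from telegramify_markdown.word_count import count_markdown

# The URL part of a link is stripped before counting, mirroring Telegram's rules:
# "[a](http://example.com)" is measured as "[a]()", i.e. 5 characters instead of 23.
assert count_markdown("[a](http://example.com)") == 5

# Text without links is counted as-is.
assert count_markdown("plain text") == 10

# Documented limitation: a doubled backslash before "[" trips the negative lookbehind,
# so the link is not collapsed and all 25 raw characters are counted.
assert count_markdown(r"\\[a](http://example.com)") == 25
```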
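And a sketch of what "iterative approximation" means in `hard_split_markdown`, traced for one link-heavy input; only the first shrink step is spelled out, and the closing assertions are invariants that should hold given the loop shown above.

```python
from telegramify_markdown.word_count import count_markdown, hard_split_markdown

# "[a](http://b)" is 13 raw characters but only 5 effective ones ("[a]()").
text = "[a](http://b)" * 100   # 1300 raw characters, 500 effective
budget = 300

# Inner-loop trace for the first chunk:
#   limit = 1300 -> count_markdown(text[:1300]) = 500, overshoot 200, so limit drops to 1100
#   ... the limit keeps shrinking by the current overshoot until the prefix fits the budget.
chunks = hard_split_markdown(text, budget)

# Each chunk respects the effective-character budget,
# and the split is lossless (chunks concatenate back to the input).
assert all(count_markdown(chunk) <= budget for chunk in chunks)
assert "".join(chunks) == text
```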