2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "telegramify-markdown"
version = "0.5.3"
version = "0.5.4"
description = "Makes it easy to send Markdown in Telegram MarkdownV2 style"
authors = [
{ name = "sudoskys", email = "coldlando@hotmail.com" },
11 changes: 1 addition & 10 deletions src/telegramify_markdown/interpreters.py
@@ -16,7 +16,7 @@
SentType,
ContentTrace,
)
from telegramify_markdown.word_count import count_markdown, hard_split_markdown
from telegramify_markdown.word_count import count_markdown

if TYPE_CHECKING:
try:
@@ -236,15 +236,6 @@ def _hard_split(self, text: str, max_word_count: int) -> List[str]:
return chunks


if __name__ == "__main__":
# Test case
content = "".join([f"[a](http://example.com/{'a'*50})" for _ in range(10)])
interpreter = BaseInterpreter()
chunks = interpreter._hard_split(content, 50)
print(f"Hard split chunks count: {len(chunks)}")
assert len(chunks) < 5


class TextInterpreter(BaseInterpreter):
"""
Pure text interpreter, only return text type
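Note on the inline check removed above: it exercised `BaseInterpreter._hard_split` with link-heavy input. If that coverage is still wanted, a minimal pytest-style sketch could look like the following (assuming `BaseInterpreter` is importable from `telegramify_markdown.interpreters` as shown in this diff; the test name and placement are hypothetical, not part of this PR):

```python
# Hypothetical pytest relocation of the inline check removed from interpreters.py.
# Assumes BaseInterpreter._hard_split keeps the signature shown in the hunk header above.
from telegramify_markdown.interpreters import BaseInterpreter


def test_hard_split_link_heavy_content():
    # Ten links with long URLs but a single visible character each.
    content = "".join(f"[a](http://example.com/{'a' * 50})" for _ in range(10))
    chunks = BaseInterpreter()._hard_split(content, 50)
    # URLs do not count toward the effective length, so few chunks are expected.
    assert len(chunks) < 5
```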
81 changes: 42 additions & 39 deletions src/telegramify_markdown/word_count.py
@@ -1,56 +1,59 @@
import re
from typing import List

# Known limitation: The regex uses negative lookbehind (?<!\\) to skip escaped brackets.
# However, this does NOT correctly handle double backslash cases like `\\[` which should
# be treated as a valid link start (the backslash itself is escaped, not the bracket).
# This edge case is intentionally left unhandled for simplicity, as it's rare in practice.
_MARKDOWN_LINK_PATTERN = re.compile(
r"""
(?<!\\)\[ # match [, but not \[
(.*?) # url description (captured group \1)
(?<!\\)\] # match ], but not \]
\(
.*? # url content (not counted by Telegram)
(?<!\\)\) # match ), but not \)
""",
re.VERBOSE,
)


def count_markdown(md: str) -> int:
md = re.sub(r'''
(?<!\\)\[ # match [, but not match \[ (we just assume there won't be `\\[`)
(.*?) # url description, \1
(?<!\\)\] # similar as above
\(
.*? # url content
(?<!\\)\)
''',
r'[\1]()', # remove URL, because URL doesn't count as word count in Telegram
md, flags=re.X)
"""
Count the effective length of markdown text for Telegram.
Telegram does not count URL characters in links toward the message length limit.

:param md: Markdown text to count
:return: Effective character count
"""
# Replace [desc](url) with [desc]() to remove URL from count
md = _MARKDOWN_LINK_PATTERN.sub(r"[\1]()", md)
return len(md)

def hard_split_markdown(text: str, max_word_count: int) -> List[str]:
assert max_word_count > 0
"""
Hard split markdown text based on effective character count.
Uses iterative approximation to find optimal split points.

Note: This function is kept for API compatibility but is not currently used
internally. The BaseInterpreter._hard_split method provides similar functionality
with additional optimization for the interpreter context.

:param text: Text to split
:param max_word_count: Maximum effective character count per chunk
:return: List of text chunks
"""
if max_word_count <= 0:
raise ValueError("max_word_count must be positive")
chunks = []
while text:
limit = len(text)
round = 0
# Iteratively reduce limit until chunk fits within max_word_count
while limit > 0:
round += 1
c = count_markdown(text[:limit])
if c <= max_word_count:
break
limit -= (c - max_word_count)
# print(round, limit, c)

limit -= c - max_word_count
chunks.append(text[:limit])
text = text[limit:]

return chunks


if __name__ == "__main__":
# Test plain text split
assert len(hard_split_markdown("a" * 200, 100)) == 2

# Test expansion with large max_word_count
# [a](http://b) -> 13 chars, 5 visible
content = "[a](http://b)" * 100 # 1300 chars, 500 visible
# max_word_count = 1000.
# Should fit in 1 chunk (since 500 < 1000)
# But strictly by length, 1300 > 1000, so it would split if not for count_markdown logic
chunks = hard_split_markdown(content, 1000)
assert len(chunks) == 1
assert len(chunks[0]) == 1300

content = "[a](http://b)" * 10000 # 130000 chars, 50000 visible
chunks = hard_split_markdown(content, 500)
print(len(chunks))
assert len(chunks) == 100

print("hard_split_markdown passed")
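For readers of this diff, a small usage sketch of the new `count_markdown`: the numbers follow directly from replacing `[desc](url)` with `[desc]()` before measuring, and the `\\[` case is the limitation called out in the comment block above.

```python
from telegramify_markdown.word_count import count_markdown

# The URL part of a link is stripped before counting, mirroring Telegram's rules:
# "[a](http://example.com)" is measured as "[a]()", i.e. 5 characters instead of 23.
assert count_markdown("[a](http://example.com)") == 5

# Text without links is counted as-is.
assert count_markdown("plain text") == 10

# Documented limitation: a doubled backslash before "[" trips the negative lookbehind,
# so the link is not collapsed and all 25 raw characters are counted.
assert count_markdown(r"\\[a](http://example.com)") == 25
```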
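And a sketch of what "iterative approximation" means in `hard_split_markdown`, traced for one link-heavy input; only the first shrink step is spelled out, and the closing assertions are invariants that should hold given the loop shown above.

```python
from telegramify_markdown.word_count import count_markdown, hard_split_markdown

# "[a](http://b)" is 13 raw characters but only 5 effective ones ("[a]()").
text = "[a](http://b)" * 100   # 1300 raw characters, 500 effective
budget = 300

# Inner-loop trace for the first chunk:
#   limit = 1300 -> count_markdown(text[:1300]) = 500, overshoot 200, so limit drops to 1100
#   ... the limit keeps shrinking by the current overshoot until the prefix fits the budget.
chunks = hard_split_markdown(text, budget)

# Each chunk respects the effective-character budget,
# and the split is lossless (chunks concatenate back to the input).
assert all(count_markdown(chunk) <= budget for chunk in chunks)
assert "".join(chunks) == text
```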