Skip to content

Commit

Permalink
Merge pull request #4 from DoodleBears/I3-fix-split-sign-inf-loop
Browse files (browse the repository at this point in the history)
  • Loading branch information
DoodleBears authored Jul 6, 2024
2 parents: ecfad44 + 107f494; commit b466113
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 18 deletions.
20 changes: 4 additions & 16 deletions split_lang/split/splitter.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -223,22 +223,10 @@ def _smart_merge(
substr_list: List[SubString],
lang_section_type: LangSectionType,
):
is_concat_complete = False
while is_concat_complete is False:
substr_list = self._smart_concat_logic(
substr_list,
lang_section_type=lang_section_type,
)
is_concat_complete = True

for index, block in enumerate(substr_list):
if block.lang == "x":
is_concat_complete = False
break
if index < len(substr_list) - 1:
if substr_list[index].lang == substr_list[index + 1].lang:
is_concat_complete = False
break
substr_list = self._smart_concat_logic(
substr_list,
lang_section_type=lang_section_type,
)
return substr_list

# MARK: _init_substr_lang
Expand Down
2 changes: 1 addition & 1 deletion split_lang/split/utils.py
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
import re

PUNCTUATION = r""",.;:!?,。!?;:、·([{<(【《〈「『“‘)]}>)】》〉」』”’"""
PUNCTUATION = r""",.;:!?,。!?;:、·([{<(【《〈「『“‘)]}>)】》〉」』”’"-_——#$%&……¥'*+<=>?@[\]^_`{|}~"""

chinese_char_pattern = re.compile(r"[\u4e00-\u9fff]")
hangul_pattern = re.compile(r"[\uac00-\ud7af]")
Expand Down
4 changes: 3 additions & 1 deletion tests/data/correct_split_merge_punc.txt
Original file line number | Diff line number | Diff line change
Expand Up @@ -40,4 +40,6 @@ The shirt is 9.15 dollars.
The shirt is 233 dollars.
我是 |VGroupChatBot,|一个旨在支持多人通信的助手,通过可视化消息来帮助团队成员更好地交流。我可以帮助团队成员更好地整理和共享信息,特别是在讨论、会议和|Brainstorming|等情况下。你好我的名字是|西野くまです|my name is bob|很高兴认识你|どうぞよろしくお願いいたします「こんにちは」|是什么意思。
我的名字是|西野くまです。|I am from Tokyo, |日本の首都。|今天的天气非常好
我给你送的|手紙|你读了吗?
我给你送的|手紙|你读了吗?
lang-|split
I have 10, |€

0 comments on commit b466113

Please sign in to comment.