From 107f4945c34e5369dc534daba30c578c47a08765 Mon Sep 17 00:00:00 2001 From: DoodleBears Date: Sun, 7 Jul 2024 05:22:43 +0900 Subject: [PATCH] fix(splitter): remote inf loop, add more sign to punctuation --- split_lang/split/splitter.py | 20 ++++---------------- split_lang/split/utils.py | 2 +- tests/data/correct_split_merge_punc.txt | 4 +++- 3 files changed, 8 insertions(+), 18 deletions(-) diff --git a/split_lang/split/splitter.py b/split_lang/split/splitter.py index 2d77dd5..89e4023 100644 --- a/split_lang/split/splitter.py +++ b/split_lang/split/splitter.py @@ -223,22 +223,10 @@ def _smart_merge( substr_list: List[SubString], lang_section_type: LangSectionType, ): - is_concat_complete = False - while is_concat_complete is False: - substr_list = self._smart_concat_logic( - substr_list, - lang_section_type=lang_section_type, - ) - is_concat_complete = True - - for index, block in enumerate(substr_list): - if block.lang == "x": - is_concat_complete = False - break - if index < len(substr_list) - 1: - if substr_list[index].lang == substr_list[index + 1].lang: - is_concat_complete = False - break + substr_list = self._smart_concat_logic( + substr_list, + lang_section_type=lang_section_type, + ) return substr_list # MARK: _init_substr_lang diff --git a/split_lang/split/utils.py b/split_lang/split/utils.py index 3128381..c322dcb 100644 --- a/split_lang/split/utils.py +++ b/split_lang/split/utils.py @@ -1,6 +1,6 @@ import re -PUNCTUATION = r""",.;:!?,。!?;:、·([{<(【《〈「『“‘)]}>)】》〉」』”’""" +PUNCTUATION = r""",.;:!?,。!?;:、·([{<(【《〈「『“‘)]}>)】》〉」』”’"-_——#$%&……¥'*+<=>?@[\]^_`{|}~""" chinese_char_pattern = re.compile(r"[\u4e00-\u9fff]") hangul_pattern = re.compile(r"[\uac00-\ud7af]") diff --git a/tests/data/correct_split_merge_punc.txt b/tests/data/correct_split_merge_punc.txt index c48ee01..926d4a6 100644 --- a/tests/data/correct_split_merge_punc.txt +++ b/tests/data/correct_split_merge_punc.txt @@ -40,4 +40,6 @@ The shirt is 9.15 dollars. The shirt is 233 dollars. 我是 |VGroupChatBot,|一个旨在支持多人通信的助手,通过可视化消息来帮助团队成员更好地交流。我可以帮助团队成员更好地整理和共享信息,特别是在讨论、会议和|Brainstorming|等情况下。你好我的名字是|西野くまです|my name is bob|很高兴认识你|どうぞよろしくお願いいたします「こんにちは」|是什么意思。 我的名字是|西野くまです。|I am from Tokyo, |日本の首都。|今天的天气非常好 -我给你送的|手紙|你读了吗? \ No newline at end of file +我给你送的|手紙|你读了吗? +lang-|split +I have 10, |€ \ No newline at end of file