fix(splitter): merge sections across newlines (\n) at the section stage

DoodleBears · DoodleBears · commit 4f7718c92944 · 2024-10-18T02:38:38.000+09:00
diff --git a/split_lang/model.py b/split_lang/model.py
@@ -8,6 +8,7 @@ class LangSectionType(Enum):
     ZH_JA = "zh_ja"
     KO = "ko"
     PUNCTUATION = "punctuation"
+    NEWLINE = "newline"
     DIGIT = "digit"
     OTHERS = "others"
     ALL = "all"
diff --git a/split_lang/split/splitter.py b/split_lang/split/splitter.py
@@ -25,10 +25,9 @@ def __init__(
         default_lang: str = DEFAULT_LANG,
         punctuation: str = PUNCTUATION,
         not_merge_punctuation: str = "",
-        special_merge_for_zh_ja: bool = True,
         merge_across_punctuation: bool = True,
-        merge_across_newline: bool = True,
         merge_across_digit: bool = True,
+        merge_across_newline: bool = True,
         debug: bool = True,
         log_level: int = logging.INFO,
     ) -> None:
@@ -51,10 +50,9 @@ def __init__(
         self.debug = debug
         self.punctuation = punctuation
         self.not_merge_punctuation = not_merge_punctuation
-        self.merge_across_newline = merge_across_newline
-        self.special_merge_for_zh_ja = special_merge_for_zh_ja
         self.merge_across_punctuation = merge_across_punctuation
         self.merge_across_digit = merge_across_digit
+        self.merge_across_newline = merge_across_newline
         self.log_level = log_level
         logging.basicConfig(
             level=self.log_level,
@@ -79,17 +77,18 @@ def split_by_lang(
         sections = self._split(pre_split_section=pre_split_section)
 
         if self.merge_across_punctuation:  # 合并跨标点符号的 SubString
-            after_merge_punctuation_sections = (
-                self._merge_substrings_across_punctuation_based_on_sections(
-                    sections=sections
-                )
+            sections = self._merge_substrings_across_punctuation_based_on_sections(
+                sections=sections
             )
 
         if self.merge_across_digit:  # 合并跨数字的 SubString
-            after_merge_digit_sections = (
-                self._merge_substrings_across_digit_based_on_sections(
-                    sections=after_merge_punctuation_sections
-                )
+            sections = self._merge_substrings_across_digit_based_on_sections(
+                sections=sections
+            )
+
+        if self.merge_across_newline:
+            sections = self._merge_substrings_across_newline_based_on_sections(
+                sections=sections
             )
 
         substrings: List[SubString] = []
@@ -148,8 +147,12 @@ def add_substring(lang_section_type: LangSectionType):
                     current_lang = LangSectionType.PUNCTUATION
             elif char.isspace():
                 # concat space to current text
-                add_substring(current_lang)
-                current_lang = LangSectionType.PUNCTUATION
+                if char == "\n":
+                    add_substring(current_lang)
+                    current_lang = LangSectionType.NEWLINE
+                else:
+                    add_substring(current_lang)
+                    current_lang = LangSectionType.PUNCTUATION
             else:
                 if current_lang != LangSectionType.OTHERS:
                     add_substring(current_lang)
@@ -193,6 +196,16 @@ def _split(
                         length=section_len,
                     )
                 )
+            elif section.lang_section_type is LangSectionType.NEWLINE:
+                # NOTE: a newline section becomes a single standalone SubString
+                section.substrings.append(
+                    SubString(
+                        text=section.text,
+                        lang="newline",
+                        index=section_index,
+                        length=section_len,
+                    )
+                )
             else:
                 substrings_with_lang: List[SubString] = []
                 if section.lang_section_type is LangSectionType.ZH_JA:
@@ -212,6 +225,7 @@ def _split(
                             length=section_len,
                         )
                     ]
+
                 else:
                     temp_substrings = self._parse_without_zh_ja(section.text)
                     substrings_with_lang = self._init_substr_lang(
@@ -232,7 +246,10 @@ def _split(
 
         # MARK: smart merge substring together
         for section in pre_split_section:
-            if section.lang_section_type is LangSectionType.PUNCTUATION:
+            if (
+                section.lang_section_type is LangSectionType.PUNCTUATION
+                or section.lang_section_type is LangSectionType.NEWLINE
+            ):
                 # print(section.text)
                 continue
             smart_concat_result = self._smart_merge(
@@ -660,6 +677,98 @@ def _special_merge_for_zh_ja(
         new_substrings = self._merge_substrings(substrings=new_substrings)
         return new_substrings
 
+    def _merge_substrings_across_newline_based_on_sections(
+        self,
+        sections: List[SubStringSection],
+    ) -> List[SubStringSection]:
+        new_sections: List[SubStringSection] = [sections[0]]
+        # NOTE: merge each newline section into an adjacent non-punctuation section
+        for index, _ in enumerate(sections):
+            if index == 0:
+                continue
+            if index >= len(sections):
+                break
+
+            prev_section = new_sections[-1]
+            current_section = sections[index]
+            if (
+                current_section.lang_section_type != LangSectionType.PUNCTUATION
+                and prev_section.lang_section_type == LangSectionType.NEWLINE
+            ):
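+                # NOTE: previous section is a newline and the current one is not punctuation: fold the current section into it (it takes the current type)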
+                prev_section.lang_section_type = current_section.lang_section_type
+                prev_section.text += current_section.text
+                prev_section.substrings.extend(current_section.substrings)
+                for index, substr in enumerate(prev_section.substrings):
+                    if index == 0:
+                        continue
+                    else:
+                        substr.index = (
+                            new_sections[-1].substrings[index - 1].index
+                            + new_sections[-1].substrings[index - 1].length
+                        )
+
+            elif (
+                current_section.lang_section_type == LangSectionType.NEWLINE
+                and prev_section.lang_section_type != LangSectionType.PUNCTUATION
+            ):
+                # NOTE: current section is a newline and the previous one is not punctuation: append it to the previous section
+                prev_section.text += current_section.text
+                prev_section.substrings.extend(current_section.substrings)
+                prev_section.substrings[-1].index = (
+                    prev_section.substrings[-2].index
+                    + prev_section.substrings[-2].length
+                )
+            else:
+                new_sections.append(current_section)
+        # NOTE: merge consecutive sections that have the same type
+        new_sections_merged: List[SubStringSection] = [new_sections[0]]
+        for index, _ in enumerate(new_sections):
+            if index == 0:
+                continue
+            if (
+                new_sections_merged[-1].lang_section_type
+                == new_sections[index].lang_section_type
+            ):
+                new_sections_merged[-1].text += new_sections[index].text
+                new_sections_merged[-1].substrings.extend(
+                    new_sections[index].substrings
+                )
+            else:
+                new_sections_merged.append(new_sections[index])
+        # NOTE: recompute each substring's index as the preceding substring's index + length
+        for section_index, section in enumerate(new_sections_merged):
+            if section_index == 0:
+                for substr_index, substr in enumerate(section.substrings):
+                    if substr_index == 0:
+                        continue
+                    else:
+                        substr.index = (
+                            section.substrings[substr_index - 1].index
+                            + section.substrings[substr_index - 1].length
+                        )
+            else:
+                for substr_index, substr in enumerate(section.substrings):
+                    if substr_index == 0:
+                        substr.index = (
+                            new_sections_merged[section_index - 1].substrings[-1].index
+                            + new_sections_merged[section_index - 1]
+                            .substrings[-1]
+                            .length
+                        )
+                    else:
+                        substr.index = (
+                            section.substrings[substr_index - 1].index
+                            + section.substrings[substr_index - 1].length
+                        )
+        if self.debug:
+            logger.debug(
+                "---------------------------------after_merge_newline_sections:"
+            )
+            for section in new_sections_merged:
+                logger.debug(section)
+        return new_sections_merged
+
     # MARK: _merge_substrings_across_digit_based_on_sections
     def _merge_substrings_across_digit_based_on_sections(
         self,
@@ -799,7 +908,11 @@ def _merge_substrings_across_punctuation_based_on_sections(
             # 如果前一个 section 和当前的 section 类型不同，且其中一个是 punctuation，则合并
             if current_section.lang_section_type != prev_section.lang_section_type:
                 # 如果前一个 section 是 punctuation，且第一个元素不是 not_merge_punctuation，则合并
-                if prev_section.lang_section_type == LangSectionType.PUNCTUATION:
+                if (
+                    prev_section.lang_section_type == LangSectionType.PUNCTUATION
+                    and prev_section.substrings[0].text
+                    not in self.not_merge_punctuation
+                ):
                     # 将前一个 punctuation section 和当前的 section 合并
                     prev_section.text += current_section.text
                     prev_section.lang_section_type = current_section.lang_section_type
diff --git a/tests/test_split.py b/tests/test_split.py
@@ -1,3 +1,5 @@
+import logging
+
 from split_lang import LangSplitter
 
 texts = [
@@ -113,10 +115,10 @@
 そして、この先、私達３人の関係は壊れていくことにこの時は気づかなかった…""",
 ]
 
-lang_splitter = LangSplitter()
+lang_splitter = LangSplitter(log_level=logging.DEBUG)
 
 
-def test_split():
+def test_split_step_by_step():
     for text in texts:
         pre_split_sections = lang_splitter.pre_split(
             text=text,
@@ -143,12 +145,32 @@ def test_split():
                 sections=after_merge_punctuation_sections,
             )
         )
+
         # for section in after_merge_digit_sections:
         #     print(section)
 
+        after_merge_newline_sections = (
+            lang_splitter._merge_substrings_across_newline_based_on_sections(
+                sections=after_merge_digit_sections,
+            )
+        )
+        # for section in after_merge_newline_sections:
+        #     print(section)
+
+
+def test_split():
+    print("===========test_split===========")
+    lang_splitter.merge_across_punctuation = True
+    lang_splitter.not_merge_punctuation = ["。"]
+    for text in texts:
+        substrings = lang_splitter.split_by_lang(text=text)
+        for substr in substrings:
+            print(substr)
+
 
 def main():
-    test_split()
+    test_split_step_by_step()
+    # test_split()
     pass