Skip to content

Commit

Permalink
Merge pull request #24 from DoodleBears/23-fix-long-text-split
Browse files (browse the repository at this point in the history)
  • Loading branch information
DoodleBears authored Oct 17, 2024
2 parents afef386 + 638d41f commit b1885b9
Show file tree
Hide file tree
Showing 12 changed files with 974 additions and 447 deletions.
152 changes: 142 additions & 10 deletions split-lang-demo.ipynb

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions split_lang/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,19 @@
"en": "en",
"hr": "en",
}
ZH_JA_LANG_MAP = {
"zh": "zh",
"yue": "zh", # 粤语
"wuu": "zh", # 吴语
"zh-cn": "zh",
"zh-tw": "x",
"ja": "ja",
}
NO_ZH_JA_LANG_MAP = {
"de": "de",
"fr": "fr",
"en": "en",
"hr": "en",
}

DEFAULT_LANG = "x"
4 changes: 2 additions & 2 deletions split_lang/detect_lang/detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from wordfreq import word_frequency

from ..model import LangSectionType
from ..split.utils import contains_ja
from ..split.utils import contains_ja_kana

logger = logging.getLogger(__name__)

Expand All @@ -19,7 +19,7 @@ def fast_lang_detect(text: str) -> str:
# For example '衬衫' cannot be detected by `langdetect`, and `fast_langdetect` will detect it as 'en'
def detect_lang_combined(text: str, lang_section_type: LangSectionType) -> str:
if lang_section_type is LangSectionType.ZH_JA:
if contains_ja(text):
if contains_ja_kana(text):
return "ja"
return fast_lang_detect(text)
return fast_lang_detect(text)
Expand Down
4 changes: 0 additions & 4 deletions split_lang/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,6 @@ class SubString(BaseModel):
"""index of `text` of original string"""
length: int
"""length of `text`"""
is_punctuation: bool
"""if `text` is punctuation"""
is_digit: bool
"""if `text` is punctuation"""


class SubStringSection(BaseModel):
Expand Down
2 changes: 1 addition & 1 deletion split_lang/split/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from .splitter import LangSplitter
from .utils import PUNCTUATION, contains_hangul, contains_zh_ja, contains_ja
from .utils import PUNCTUATION, contains_hangul, contains_ja_kana, contains_zh_ja
Loading

0 comments on commit b1885b9

Please sign in to comment.