Skip to content

Commit

Permalink
fix(split-lang-demo): update to 1.3.0 with new syntax
Browse files Browse the repository at this point in the history
  • Loading branch information
DoodleBears committed Jul 6, 2024
1 parent a5a24df commit ecfad44
Showing 1 changed file with 104 additions and 33 deletions.
137 changes: 104 additions & 33 deletions split-lang-demo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,46 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 21,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: split-lang==1.3.0 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (1.3.0)\n",
"Requirement already satisfied: fast-langdetect in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from split-lang==1.3.0) (0.1.1)\n",
"Requirement already satisfied: lingua-language-detector in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from split-lang==1.3.0) (2.0.2)\n",
"Requirement already satisfied: pydantic in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from split-lang==1.3.0) (2.7.4)\n",
"Requirement already satisfied: budoux in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from split-lang==1.3.0) (0.6.2)\n",
"Requirement already satisfied: fasttext-wheel>=0.9.2 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from fast-langdetect->split-lang==1.3.0) (0.9.2)\n",
"Requirement already satisfied: requests>=2.31.0 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from fast-langdetect->split-lang==1.3.0) (2.32.3)\n",
"Requirement already satisfied: robust-downloader>=0.0.2 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from fast-langdetect->split-lang==1.3.0) (0.0.2)\n",
"Requirement already satisfied: numpy<2.0.0,>=1.26.4 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from fast-langdetect->split-lang==1.3.0) (1.26.4)\n",
"Requirement already satisfied: annotated-types>=0.4.0 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from pydantic->split-lang==1.3.0) (0.7.0)\n",
"Requirement already satisfied: pydantic-core==2.18.4 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from pydantic->split-lang==1.3.0) (2.18.4)\n",
"Requirement already satisfied: typing-extensions>=4.6.1 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from pydantic->split-lang==1.3.0) (4.12.2)\n",
"Requirement already satisfied: pybind11>=2.2 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from fasttext-wheel>=0.9.2->fast-langdetect->split-lang==1.3.0) (2.13.1)\n",
"Requirement already satisfied: setuptools>=0.7.0 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from fasttext-wheel>=0.9.2->fast-langdetect->split-lang==1.3.0) (70.2.0)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from requests>=2.31.0->fast-langdetect->split-lang==1.3.0) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from requests>=2.31.0->fast-langdetect->split-lang==1.3.0) (3.7)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from requests>=2.31.0->fast-langdetect->split-lang==1.3.0) (2.2.2)\n",
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from requests>=2.31.0->fast-langdetect->split-lang==1.3.0) (2024.6.2)\n",
"Requirement already satisfied: tqdm in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from robust-downloader>=0.0.2->fast-langdetect->split-lang==1.3.0) (4.66.4)\n",
"Requirement already satisfied: colorlog in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from robust-downloader>=0.0.2->fast-langdetect->split-lang==1.3.0) (6.8.2)\n",
"Requirement already satisfied: colorama in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from colorlog->robust-downloader>=0.0.2->fast-langdetect->split-lang==1.3.0) (0.4.6)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%%capture\n",
"%pip install split-lang"
"%pip install split-lang==1.3.0"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 22,
"metadata": {},
"outputs": [
{
Expand All @@ -21,15 +50,16 @@
"text": [
"0|zh:你喜欢看\n",
"1|ja:アニメ\n",
"2|zh:吗我也喜欢看\n"
"2|zh:\n"
]
}
],
"source": [
"from split_lang import split_by_lang\n",
"text = \"你喜欢看アニメ吗我也喜欢看\"\n",
"from split_lang import LangSplitter\n",
"lang_splitter = LangSplitter()\n",
"text = \"你喜欢看アニメ吗\"\n",
"\n",
"substr = split_by_lang(\n",
"substr = lang_splitter.split_by_lang(\n",
" text=text,\n",
")\n",
"for index, item in enumerate(substr):\n",
Expand All @@ -38,7 +68,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 23,
"metadata": {},
"outputs": [
{
Expand All @@ -47,15 +77,50 @@
"text": [
"0|zh:你喜欢看\n",
"1|ja:アニメ\n",
"2|zh:吗我也喜欢看\n",
"2|zh:吗?我也喜欢看\n",
"----------------------\n",
"0|en:Please star this project on GitHub, Thanks you. I love you\n",
"1|zh:请加星这个项目,谢谢你。我爱你\n",
"2|ja:この項目をスターしてください、ありがとうございます!愛してる\n",
"----------------------\n",
"0.005999565124511719\n"
]
}
],
"source": [
"from split_lang import LangSplitter\n",
"lang_splitter = LangSplitter(merge_across_punctuation=True)\n",
"import time\n",
"texts = [\n",
" \"你喜欢看アニメ吗?我也喜欢看\",\n",
" \"Please star this project on GitHub, Thanks you. I love you请加星这个项目,谢谢你。我爱你この項目をスターしてください、ありがとうございます!愛してる\",\n",
"]\n",
"time1 = time.time()\n",
"for text in texts:\n",
" substr = lang_splitter.split_by_lang(\n",
" text=text,\n",
" )\n",
" for index, item in enumerate(substr):\n",
" print(f\"{index}|{item.lang}:{item.text}\")\n",
" print(\"----------------------\")\n",
"time2 = time.time()\n",
"print(time2 - time1)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0|zh:你喜欢看\n",
"1|ja:アニメ\n",
"2|zh:吗我也喜欢看\n",
"2|zh:吗\n",
"3|punctuation:?\n",
"4|zh:我也喜欢看\n",
"----------------------\n",
"0|en:Please star this project on GitHub\n",
"1|punctuation:, \n",
Expand All @@ -73,44 +138,50 @@
"13|punctuation:!\n",
"14|ja:愛してる\n",
"----------------------\n",
"0.15833711624145508\n",
"0.1587212085723877\n"
"0.007001399993896484\n"
]
}
],
"source": [
"from split_lang import split_by_lang\n",
"import time\n",
"texts = [\n",
" \"你喜欢看アニメ吗我也喜欢看\",\n",
" \"Please star this project on GitHub, Thanks you. I love you请加星这个项目,谢谢你。我爱你この項目をスターしてください、ありがとうございます!愛してる\",\n",
"]\n",
"lang_splitter.merge_across_punctuation = False\n",
"time1 = time.time()\n",
"for text in texts:\n",
" substr = split_by_lang(\n",
" substr = lang_splitter.split_by_lang(\n",
" text=text,\n",
" threshold=4.9e-5,\n",
" merge_across_punctuation=True,\n",
" )\n",
" for index, item in enumerate(substr):\n",
" print(f\"{index}|{item.lang}:{item.text}\")\n",
" print(\"----------------------\")\n",
"time2 = time.time()\n",
"\n",
"print(time2 - time1)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0|zh:衬衫的价格是\n",
"1|digit:9.15\n",
"2|zh:便士\n"
]
}
],
"source": [
"lang_splitter.merge_across_digit = False\n",
"texts = [\n",
" \"衬衫的价格是9.15便士\",\n",
"]\n",
"for text in texts:\n",
" substr = split_by_lang(\n",
" substr = lang_splitter.split_by_lang(\n",
" text=text,\n",
" threshold=4.9e-5,\n",
" merge_across_punctuation=False,\n",
" merge_across_digit=False,\n",
" )\n",
" for index, item in enumerate(substr):\n",
" print(f\"{index}|{item.lang}:{item.text}\")\n",
" print(\"----------------------\")\n",
"time3 = time.time()\n",
"\n",
"print(time2 - time1)\n",
"print(time3 - time2)"
" print(f\"{index}|{item.lang}:{item.text}\")"
]
}
],
Expand Down

0 comments on commit ecfad44

Please sign in to comment.