From ecfad44677677105e65eff0ff35ba55098267d83 Mon Sep 17 00:00:00 2001 From: DoodleBears Date: Sun, 7 Jul 2024 03:09:33 +0900 Subject: [PATCH] fix(split-lang-demo): update to 1.3.0 with new syntax --- split-lang-demo.ipynb | 137 ++++++++++++++++++++++++++++++++---------- 1 file changed, 104 insertions(+), 33 deletions(-) diff --git a/split-lang-demo.ipynb b/split-lang-demo.ipynb index 8b4fb8f..b8a9921 100644 --- a/split-lang-demo.ipynb +++ b/split-lang-demo.ipynb @@ -2,17 +2,46 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: split-lang==1.3.0 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (1.3.0)\n", + "Requirement already satisfied: fast-langdetect in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from split-lang==1.3.0) (0.1.1)\n", + "Requirement already satisfied: lingua-language-detector in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from split-lang==1.3.0) (2.0.2)\n", + "Requirement already satisfied: pydantic in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from split-lang==1.3.0) (2.7.4)\n", + "Requirement already satisfied: budoux in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from split-lang==1.3.0) (0.6.2)\n", + "Requirement already satisfied: fasttext-wheel>=0.9.2 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from fast-langdetect->split-lang==1.3.0) (0.9.2)\n", + "Requirement already satisfied: requests>=2.31.0 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from fast-langdetect->split-lang==1.3.0) (2.32.3)\n", + "Requirement already satisfied: robust-downloader>=0.0.2 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from fast-langdetect->split-lang==1.3.0) (0.0.2)\n", + "Requirement already satisfied: numpy<2.0.0,>=1.26.4 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from fast-langdetect->split-lang==1.3.0) (1.26.4)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from pydantic->split-lang==1.3.0) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.18.4 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from pydantic->split-lang==1.3.0) (2.18.4)\n", + "Requirement already satisfied: typing-extensions>=4.6.1 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from pydantic->split-lang==1.3.0) (4.12.2)\n", + "Requirement already satisfied: pybind11>=2.2 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from fasttext-wheel>=0.9.2->fast-langdetect->split-lang==1.3.0) (2.13.1)\n", + "Requirement already satisfied: setuptools>=0.7.0 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from fasttext-wheel>=0.9.2->fast-langdetect->split-lang==1.3.0) (70.2.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from requests>=2.31.0->fast-langdetect->split-lang==1.3.0) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from requests>=2.31.0->fast-langdetect->split-lang==1.3.0) (3.7)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from requests>=2.31.0->fast-langdetect->split-lang==1.3.0) (2.2.2)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from requests>=2.31.0->fast-langdetect->split-lang==1.3.0) (2024.6.2)\n", + "Requirement already satisfied: tqdm in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from robust-downloader>=0.0.2->fast-langdetect->split-lang==1.3.0) (4.66.4)\n", + "Requirement already satisfied: colorlog in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from robust-downloader>=0.0.2->fast-langdetect->split-lang==1.3.0) (6.8.2)\n", + "Requirement already satisfied: colorama in c:\\users\\admin\\.conda\\envs\\melotts\\lib\\site-packages (from colorlog->robust-downloader>=0.0.2->fast-langdetect->split-lang==1.3.0) (0.4.6)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ "%%capture\n", - "%pip install split-lang" + "%pip install split-lang==1.3.0" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -21,15 +50,16 @@ "text": [ "0|zh:你喜欢看\n", "1|ja:アニメ\n", - "2|zh:吗我也喜欢看\n" + "2|zh:吗\n" ] } ], "source": [ - "from split_lang import split_by_lang\n", - "text = \"你喜欢看アニメ吗我也喜欢看\"\n", + "from split_lang import LangSplitter\n", + "lang_splitter = LangSplitter()\n", + "text = \"你喜欢看アニメ吗\"\n", "\n", - "substr = split_by_lang(\n", + "substr = lang_splitter.split_by_lang(\n", " text=text,\n", ")\n", "for index, item in enumerate(substr):\n", @@ -38,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -47,15 +77,50 @@ "text": [ "0|zh:你喜欢看\n", "1|ja:アニメ\n", - "2|zh:吗我也喜欢看\n", + "2|zh:吗?我也喜欢看\n", "----------------------\n", "0|en:Please star this project on GitHub, Thanks you. I love you\n", "1|zh:请加星这个项目,谢谢你。我爱你\n", "2|ja:この項目をスターしてください、ありがとうございます!愛してる\n", "----------------------\n", + "0.005999565124511719\n" + ] + } + ], + "source": [ + "from split_lang import LangSplitter\n", + "lang_splitter = LangSplitter(merge_across_punctuation=True)\n", + "import time\n", + "texts = [\n", + " \"你喜欢看アニメ吗?我也喜欢看\",\n", + " \"Please star this project on GitHub, Thanks you. I love you请加星这个项目,谢谢你。我爱你この項目をスターしてください、ありがとうございます!愛してる\",\n", + "]\n", + "time1 = time.time()\n", + "for text in texts:\n", + " substr = lang_splitter.split_by_lang(\n", + " text=text,\n", + " )\n", + " for index, item in enumerate(substr):\n", + " print(f\"{index}|{item.lang}:{item.text}\")\n", + " print(\"----------------------\")\n", + "time2 = time.time()\n", + "print(time2 - time1)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ "0|zh:你喜欢看\n", "1|ja:アニメ\n", - "2|zh:吗我也喜欢看\n", + "2|zh:吗\n", + "3|punctuation:?\n", + "4|zh:我也喜欢看\n", "----------------------\n", "0|en:Please star this project on GitHub\n", "1|punctuation:, \n", @@ -73,44 +138,50 @@ "13|punctuation:!\n", "14|ja:愛してる\n", "----------------------\n", - "0.15833711624145508\n", - "0.1587212085723877\n" + "0.007001399993896484\n" ] } ], "source": [ - "from split_lang import split_by_lang\n", - "import time\n", - "texts = [\n", - " \"你喜欢看アニメ吗我也喜欢看\",\n", - " \"Please star this project on GitHub, Thanks you. I love you请加星这个项目,谢谢你。我爱你この項目をスターしてください、ありがとうございます!愛してる\",\n", - "]\n", + "lang_splitter.merge_across_punctuation = False\n", "time1 = time.time()\n", "for text in texts:\n", - " substr = split_by_lang(\n", + " substr = lang_splitter.split_by_lang(\n", " text=text,\n", - " threshold=4.9e-5,\n", - " merge_across_punctuation=True,\n", " )\n", " for index, item in enumerate(substr):\n", " print(f\"{index}|{item.lang}:{item.text}\")\n", " print(\"----------------------\")\n", "time2 = time.time()\n", - "\n", + "print(time2 - time1)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0|zh:衬衫的价格是\n", + "1|digit:9.15\n", + "2|zh:便士\n" + ] + } + ], + "source": [ + "lang_splitter.merge_across_digit = False\n", + "texts = [\n", + " \"衬衫的价格是9.15便士\",\n", + "]\n", "for text in texts:\n", - " substr = split_by_lang(\n", + " substr = lang_splitter.split_by_lang(\n", " text=text,\n", - " threshold=4.9e-5,\n", - " merge_across_punctuation=False,\n", - " merge_across_digit=False,\n", " )\n", " for index, item in enumerate(substr):\n", - " print(f\"{index}|{item.lang}:{item.text}\")\n", - " print(\"----------------------\")\n", - "time3 = time.time()\n", - "\n", - "print(time2 - time1)\n", - "print(time3 - time2)" + " print(f\"{index}|{item.lang}:{item.text}\")" ] } ],