From 555c4d59bd0284a1e33170cce8f06b2343e6aef1 Mon Sep 17 00:00:00 2001 From: DoodleBears Date: Mon, 1 Jul 2024 02:57:57 +0900 Subject: [PATCH] fix: add jupyter file demo split-lang-demo.ipynb --- split-lang-demo.ipynb | 86 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 split-lang-demo.ipynb diff --git a/split-lang-demo.ipynb b/split-lang-demo.ipynb new file mode 100644 index 0000000..678c0b1 --- /dev/null +++ b/split-lang-demo.ipynb @@ -0,0 +1,86 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\admin\\.conda\\envs\\melotts\\lib\\site-packages\\wtpsplit\\__init__.py:48: DeprecationWarning: You are using WtP, the old sentence segmentation model. It is highly encouraged to use SaT instead due to strongly improved performance and efficiency. See https://github.com/segment-any-text/wtpsplit for more info. To ignore this warning, set ignore_legacy_warning=True.\n", + " warnings.warn(\n", + "c:\\Users\\admin\\.conda\\envs\\melotts\\lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n", + "2024-07-01 02:55:01,151 DEBUG [urllib3.connectionpool]: Starting new HTTPS connection (1): huggingface.co:443\n", + "2024-07-01 02:55:01,760 DEBUG [urllib3.connectionpool]: https://huggingface.co:443 \"HEAD /benjamin/wtp-bert-mini/resolve/main/mixtures.skops HTTP/11\" 302 0\n", + "2024-07-01 02:55:02,348 DEBUG [urllib3.connectionpool]: https://huggingface.co:443 \"HEAD /benjamin/wtp-bert-mini/resolve/main/config.json HTTP/11\" 200 0\n", + "c:\\Users\\admin\\.conda\\envs\\melotts\\lib\\site-packages\\sklearn\\base.py:376: InconsistentVersionWarning: Trying to unpickle estimator LogisticRegression from version 1.2.2 when using version 1.5.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n", + "https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "import langsplit" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0|zh:你喜欢看\n", + "1|ja:アニメ\n", + "2|zh:吗\n", + "3|punctuation:?\n", + "----------------------\n" + ] + } + ], + "source": [ + "from langsplit import split_by_lang\n", + "\n", + "texts = [\n", + " \"你喜欢看アニメ吗?\",\n", + "]\n", + "\n", + "for text in texts:\n", + " substr = split_by_lang(\n", + " text=text,\n", + " threshold=4.9e-5,\n", + " default_lang=\"en\",\n", + " )\n", + " for index, item in enumerate(substr):\n", + " print(f\"{index}|{item.lang}:{item.text}\")\n", + " print(\"----------------------\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "melotts", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}