diff --git a/split-lang-demo.ipynb b/split-lang-demo.ipynb
new file mode 100644
index 0000000..678c0b1
--- /dev/null
+++ b/split-lang-demo.ipynb
@@ -0,0 +1,80 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# langsplit demo\n",
+    "\n",
+    "Run `split_by_lang` on a mixed-language string and print the `lang` and `text` of every returned item."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langsplit import split_by_lang"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0|zh:你喜欢看\n",
+      "1|ja:アニメ\n",
+      "2|zh:吗\n",
+      "3|punctuation:?\n",
+      "----------------------\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Arguments forwarded to split_by_lang; values are the ones used for the saved output below.\n",
+    "THRESHOLD = 4.9e-5\n",
+    "DEFAULT_LANG = \"en\"\n",
+    "\n",
+    "texts = [\n",
+    "    \"你喜欢看アニメ吗?\",\n",
+    "]\n",
+    "\n",
+    "for text in texts:\n",
+    "    substrings = split_by_lang(\n",
+    "        text=text,\n",
+    "        threshold=THRESHOLD,\n",
+    "        default_lang=DEFAULT_LANG,\n",
+    "    )\n",
+    "    for index, item in enumerate(substrings):\n",
+    "        print(f\"{index}|{item.lang}:{item.text}\")\n",
+    "    print(\"----------------------\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "melotts",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}