-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: add jupyter file demo split-lang-demo.ipynb
- Loading branch information
1 parent
87b6f0f
commit 555c4d5
Showing
1 changed file
with
86 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"c:\\Users\\admin\\.conda\\envs\\melotts\\lib\\site-packages\\wtpsplit\\__init__.py:48: DeprecationWarning: You are using WtP, the old sentence segmentation model. It is highly encouraged to use SaT instead due to strongly improved performance and efficiency. See https://github.com/segment-any-text/wtpsplit for more info. To ignore this warning, set ignore_legacy_warning=True.\n", | ||
" warnings.warn(\n", | ||
"c:\\Users\\admin\\.conda\\envs\\melotts\\lib\\site-packages\\huggingface_hub\\file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", | ||
" warnings.warn(\n", | ||
"2024-07-01 02:55:01,151 DEBUG [urllib3.connectionpool]: Starting new HTTPS connection (1): huggingface.co:443\n", | ||
"2024-07-01 02:55:01,760 DEBUG [urllib3.connectionpool]: https://huggingface.co:443 \"HEAD /benjamin/wtp-bert-mini/resolve/main/mixtures.skops HTTP/11\" 302 0\n", | ||
"2024-07-01 02:55:02,348 DEBUG [urllib3.connectionpool]: https://huggingface.co:443 \"HEAD /benjamin/wtp-bert-mini/resolve/main/config.json HTTP/11\" 200 0\n", | ||
"c:\\Users\\admin\\.conda\\envs\\melotts\\lib\\site-packages\\sklearn\\base.py:376: InconsistentVersionWarning: Trying to unpickle estimator LogisticRegression from version 1.2.2 when using version 1.5.0. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:\n", | ||
"https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations\n", | ||
" warnings.warn(\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import langsplit" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"0|zh:你喜欢看\n", | ||
"1|ja:アニメ\n", | ||
"2|zh:吗\n", | ||
"3|punctuation:?\n", | ||
"----------------------\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from langsplit import split_by_lang\n", | ||
"\n", | ||
"# Sample inputs; each string is split independently.\n", | ||
"texts = [\n", | ||
"    \"你喜欢看アニメ吗?\",\n", | ||
"]\n", | ||
"\n", | ||
"for text in texts:\n", | ||
"    # NOTE(review): the meaning of `threshold` and `default_lang` is defined by langsplit - confirm against its docs.\n", | ||
"    substr = split_by_lang(\n", | ||
"        text=text,\n", | ||
"        threshold=4.9e-5,\n", | ||
"        default_lang=\"en\",\n", | ||
"    )\n", | ||
"    # One line per returned piece: position, its `lang` attribute, and its `text` attribute.\n", | ||
"    for index, item in enumerate(substr):\n", | ||
"        print(f\"{index}|{item.lang}:{item.text}\")\n", | ||
"    # Separator printed once per input text, after all of its pieces.\n", | ||
"    print(\"----------------------\")" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "melotts", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.14" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |