Skip to content

Commit

Permalink
fix: add jupyter file demo split-lang-demo.ipynb
Browse files Browse the repository at this point in the history
  • Loading branch information
DoodleBears committed Jun 30, 2024
1 parent 87b6f0f commit 555c4d5
Showing 1 changed file with 86 additions and 0 deletions.
86 changes: 86 additions & 0 deletions split-lang-demo.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# NOTE: on the recorded run this import emitted a wtpsplit DeprecationWarning\n",
"# (legacy WtP model; SaT is recommended) and contacted huggingface.co to\n",
"# resolve benjamin/wtp-bert-mini. The raw stderr log was cleared because it\n",
"# contained machine-specific absolute paths and DEBUG-level network noise.\n",
"import langsplit"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0|zh:你喜欢看\n",
"1|ja:アニメ\n",
"2|zh:吗\n",
"3|punctuation:?\n",
"----------------------\n"
]
}
],
"source": [
"from langsplit import split_by_lang\n",
"\n",
"# Mixed-language sample inputs; each entry is split independently.\n",
"texts = [\n",
"    \"你喜欢看アニメ吗?\",\n",
"]\n",
"\n",
"for text in texts:\n",
"    # threshold / default_lang are passed straight through to split_by_lang;\n",
"    # see the library documentation for their exact semantics.\n",
"    substr = split_by_lang(\n",
"        text=text,\n",
"        threshold=4.9e-5,\n",
"        default_lang=\"en\",\n",
"    )\n",
"    # One output line per detected segment: <index>|<lang>:<text>\n",
"    for index, item in enumerate(substr):\n",
"        print(f\"{index}|{item.lang}:{item.text}\")\n",
"    # Separator is printed once per input text, after all of its segments.\n",
"    print(\"----------------------\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "melotts",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit 555c4d5

Please sign in to comment.