From 1dbb526150b4d08400c97b72c5e79c9af0c3c6da Mon Sep 17 00:00:00 2001
From: DoodleBears
Date: Mon, 1 Jul 2024 04:03:03 +0900
Subject: [PATCH] doc(README): add Google Colab link

---
 README.md             | 17 +++++++++++------
 setup.py              |  1 +
 split-lang-demo.ipynb |  3 ++-
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 6ea8d97..2e9decd 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,9 @@
 [![Downloads](https://static.pepy.tech/badge/split-lang)](https://pepy.tech/project/split-lang)
 [![Downloads](https://static.pepy.tech/badge/split-lang/month)](https://pepy.tech/project/split-lang)
 
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DoodleBears/split-lang/blob/main/split-lang-demo.ipynb)
+
+
 [![Open Source Love](https://badges.frapsoft.com/os/mit/mit.svg?v=102)](https://github.com/ellerbrock/open-source-badge/)
 [![wakatime](https://wakatime.com/badge/user/5728d95a-5cfb-4acb-b600-e34c2fc231b6/project/e06e0a00-9ba1-453d-8c62-a0b2604aaaad.svg)](https://wakatime.com/badge/user/5728d95a-5cfb-4acb-b600-e34c2fc231b6/project/e06e0a00-9ba1-453d-8c62-a0b2604aaaad)
 
@@ -12,7 +15,7 @@ Splitting sentences by concatenating over-split substrings based on their langua
 powered by [`wtpsplit`](https://github.com/segment-any-text/wtpsplit) and [`fast-langdetect`](https://github.com/LlmKira/fast-langdetect) and [`langdetect`](https://github.com/Mimino666/langdetect)
 
-## Idea
+## 1.1. Idea
 
 **Stage 1**: rule-based split using punctuation
 - `hello, how are you` -> `hello` | `,` | `how are you`
 
@@ -39,15 +42,15 @@ Vielen Dank merci beaucoup for your help.
 ```
 
 - [1. `split-lang`](#1-split-lang)
-  - [Idea](#idea)
+  - [1.1. Idea](#11-idea)
 - [2. Motivation](#2-motivation)
 - [3. Usage](#3-usage)
   - [3.1. Installation](#31-installation)
   - [3.2. Basic](#32-basic)
     - [3.2.1. `split_by_lang`](#321-split_by_lang)
   - [3.3. Advanced](#33-advanced)
-    - [`threshold`](#threshold)
-    - [3.3.1. usage of `lang_map` (for better result)](#331-usage-of-lang_map-for-better-result)
+    - [3.3.1. `threshold`](#331-threshold)
+    - [3.3.2. usage of `lang_map` (for better result)](#332-usage-of-lang_map-for-better-result)
 
 # 3. Usage
 
@@ -65,6 +68,8 @@ pip install split-lang
 ## 3.2. Basic
 ### 3.2.1. `split_by_lang`
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DoodleBears/split-lang/blob/main/split-lang-demo.ipynb)
+
 
 ```python
 from split_lang import split_by_lang
 
@@ -92,7 +97,7 @@ for text in texts:
 
 ## 3.3. Advanced
 
-### `threshold`
+### 3.3.1. `threshold`
 the threshold used in `wtpsplit`, default to 1e-4, the smaller the more substring you will get in `wtpsplit` stage
 
 
@@ -100,7 +105,7 @@
 
 > Check GitHub Repo `tests/split_acc.py` to find best threshold for your use case
 
-### 3.3.1. usage of `lang_map` (for better result)
+### 3.3.2. usage of `lang_map` (for better result)
 
 > [!IMPORTANT]
 > Add lang code for your usecase if other languages are needed
diff --git a/setup.py b/setup.py
index cd6cd2a..8fcfc60 100644
--- a/setup.py
+++ b/setup.py
@@ -20,6 +20,7 @@ def read(*relpath):
     url="https://github.com/DoodleBears/langsplit",
     author="DoodleBear",
     author_email="yangmufeng233@gmail.com",
+    license="MIT",
     packages=find_packages(),
     install_requires=[
         "langdetect",
diff --git a/split-lang-demo.ipynb b/split-lang-demo.ipynb
index 62a4b64..0523b3c 100644
--- a/split-lang-demo.ipynb
+++ b/split-lang-demo.ipynb
@@ -6,7 +6,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install split-lang --upgrade"
+    "%pip install split-lang --upgrade\n",
+    "%pip install numpy==1.26.0"
    ]
  },
 {
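For context on the `threshold` setting renumbered above (README section 3.3.1): the excerpted README text states it is used by the `wtpsplit` stage, defaults to `1e-4`, and that smaller values produce more substrings. The sketch below illustrates how that knob might be passed to `split_by_lang`; the `threshold` keyword and the `.lang`/`.text` attributes on returned segments are assumptions drawn from the README excerpts in this patch, not behavior the patch itself confirms.

```python
# Minimal sketch, not part of the patch above.
# Assumptions (from the README excerpts, not verified here): split_by_lang()
# accepts a `threshold` keyword forwarded to the wtpsplit stage, and each
# returned segment exposes `.lang` and `.text` attributes.
from split_lang import split_by_lang

# Sample sentence taken from the README excerpt quoted in the hunk headers above.
texts = ["Vielen Dank merci beaucoup for your help."]

for text in texts:
    # Smaller threshold -> more (finer-grained) substrings from the wtpsplit stage.
    segments = split_by_lang(text=text, threshold=1e-4)
    for index, segment in enumerate(segments):
        print(f"{index}|{segment.lang}:{segment.text}")
```

The linked Colab notebook (`split-lang-demo.ipynb`) walks through the same call interactively, which is why the badge added by this patch points there.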