fix

llm-jp · Sep 22, 2024 · 192c995 · 192c995
1 parent a6b13e1
commit 192c995
Show file tree

Hide file tree

Showing 4 changed files with 36 additions and 23 deletions.
diff --git a/README.md b/README.md
@@ -1,4 +1,6 @@
 # text2dataset
+[![pypi](https://img.shields.io/pypi/v/text2dataset.svg)](https://pypi.python.org/pypi/text2dataset)
+
 Easily turn large English text datasets into Japanese text datasets using open LLMs.
 
 A tool that converts a `datasets.Dataset` by translating the data in the "txt" column with an open LLM such as gemma2, run via vLLM, and adds a new "txt_ja" column containing the Japanese translation.
@@ -35,22 +37,32 @@ $ python src/text2dataset/main.py \
 ```
 
 ### Example
+You can use the `Translator` class to translate texts into Japanese.
 ```python
->>> from datasets import load_dataset
->>> load_dataset("parquet", data_files="/path/to/input", split="train")
-DatasetDict({
-    train: Dataset({
-        features: ['__key__', '__url__', 'jpg', 'json', 'txt'],
-        num_rows: 1000
-    })
-})
->>> load_dataset("parquet", data_files="/path/to/output")
-DatasetDict({
-    train: Dataset({
-        features: ['__key__', '__url__', 'jpg', 'json', 'txt', 'txt_ja'],
-        num_rows: 1000
-    })
-})
+from datasets import load_dataset
+from text2dataset.translator import Translator
+
+ds = load_dataset("Abirate/english_quotes", split="train")
+ds = ds.select(range(10))
+print(ds.column_names)
+# ['quote', 'author', 'tags']
+print("\n".join(ds["quote"][:5]))
+# “Be yourself; everyone else is already taken.”
+# “I'm selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can't handle me at my worst, then you sure as hell don't deserve me at my best.”
+# “Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.”
+# “So many books, so little time.”
+# “A room without books is like a body without a soul.”
+translator = Translator(model_id="google/gemma-2-9b-it", tensor_parallel_size=1, pipeline_parallel_size=1)
+translated = translator.translate(ds["quote"])
+ds = ds.add_column("quote_ja", translated)
+print(ds.column_names)
+# ['quote', 'author', 'tags', 'quote_ja']
+print("\n".join(ds["quote_ja"][:5]))
+#
+# 自分のことは、自己中心的で、衝動的で、少し不安定。失敗することもあるし、制御不能な時もあるし、扱いにくい時もある。でも、私が最悪な時をあなたが処理できないなら、最高の私をあなたが望む資格はない。
+# **宇宙と人間の愚かさ、どちらが無限大か分からない。**
+# 本がたくさん、時間が足りない。
+# 書籍のない部屋は、魂のない体と同じ。
 ```
 
 ## Areas for Improvement

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "text2dataset"
-version = "0.1.2"
+version = "0.1.6"
 description = "Easily turn large English text datasets into Japanese text datasets using open LLMs"
 authors = [
     { name = "speed1313", email = "speedtry13@icloud.com" }
@@ -13,11 +13,13 @@ dependencies = [
 ]
 readme = "README.md"
 requires-python = ">= 3.12.1"
-license = "MIT"
-url="https://github.com/llm-jp/text2dataset"
+license = {file = "LICENSE"}
+
+[project.urls]
+Repository = "https://github.com/llm-jp/text2dataset"
 
 [project.scripts]
-hello = "text2dataset:hello"
+text2dataset = "text2dataset.main:main"
 
 [build-system]
 requires = ["hatchling"]

diff --git a/src/text2dataset/__init__.py b/src/text2dataset/__init__.py
@@ -1,2 +0,0 @@
-def hello():
-    return "Hello from llm-translator!"

diff --git a/src/text2dataset/translator.py b/src/text2dataset/translator.py
@@ -54,7 +54,8 @@ def translate(self, text_list: list[str]) -> list[str]:
     model_id = "google/gemma-2-9b-it"
     tensor_parallel_size = 1
     pipeline_parallel_size = 1
-    translator = MockTranslator(model_id, tensor_parallel_size, pipeline_parallel_size)
-    text_list = ["Hello, how are you?"]
+    #translator = MockTranslator(model_id, tensor_parallel_size, pipeline_parallel_size)
+    translator = Translator(model_id, tensor_parallel_size, pipeline_parallel_size)
+    text_list = ["Hello, how are you?", "“Be yourself; everyone else is already taken.”"]
     translated = translator.translate(text_list)
     print(translated)
Original file line number	Diff line number	Diff line change
		@@ -1,2 +0,0 @@
		def hello():
		return "Hello from llm-translator!"