Commit f32f52f

feat: add --endpoint option to example (#197)

* feat: introduce KonohaAPITokenizer
* feat: add --endpoint option to example
* chore: with_postag is no longer available

1 parent f6c4fd0 commit f32f52f

File tree

* example/tokenize_demo.py
* src/konoha/word_tokenizer.py
* src/konoha/word_tokenizers/__init__.py
* src/konoha/word_tokenizers/konoha_api_tokenizer.py

4 files changed: +20 -6 lines changed
example/tokenize_demo.py

Lines changed: 7 additions & 6 deletions

@@ -1,22 +1,23 @@
+import argparse
+
 from konoha import SentenceTokenizer
 from konoha import WordTokenizer


 if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--endpoint", type=str, default=None)
+    args = parser.parse_args()
+
     sentence_tokenizer = SentenceTokenizer()
     tokenizers = ["MeCab", "KyTea", "Janome", "nagisa", "Character"]
-    tokenizers_support_postag = ["MeCab", "KyTea", "Janome", "nagisa"]

     word_tokenizers = []
     for word_tokenizer_name in tokenizers:
         try:
-            _tokenizer = WordTokenizer(word_tokenizer_name)
+            _tokenizer = WordTokenizer(word_tokenizer_name, endpoint=args.endpoint)
             word_tokenizers.append(_tokenizer)

-            if word_tokenizer_name in tokenizers_support_postag:
-                _tokenizer = WordTokenizer(word_tokenizer_name)
-                word_tokenizers.append(_tokenizer)
-
         except (ImportError, RuntimeError):
             print("Skip: ", word_tokenizer_name)

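Taken together, the demo now has two modes: with no --endpoint flag it builds each backend locally as before, and with an endpoint string every WordTokenizer delegates to the remote KonohaAPITokenizer. A minimal sketch of both modes, assuming a local MeCab install and a running Konoha server; the localhost URL is a placeholder, not something this commit defines:

    from konoha import WordTokenizer

    # Default: endpoint is None, so tokenization runs against the
    # locally installed backend (the pre-commit behavior).
    local_tokenizer = WordTokenizer("MeCab", endpoint=None)

    # Any string endpoint routes construction to KonohaAPITokenizer.
    # "http://localhost:8000" is an assumed placeholder URL.
    remote_tokenizer = WordTokenizer("MeCab", endpoint="http://localhost:8000")
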
src/konoha/word_tokenizer.py

Lines changed: 3 additions & 0 deletions

@@ -10,6 +10,7 @@
 from konoha.data.resource import Resource
 from konoha.data.token import Token
 from konoha.word_tokenizers.tokenizer import BaseTokenizer
+from konoha.word_tokenizers import KonohaAPITokenizer


 class WordTokenizer:
@@ -42,6 +43,8 @@ def __init__(

         if not isinstance(endpoint, str):
             self._setup_tokenizer()
+        else:
+            self._tokenizer = KonohaAPITokenizer(tokenizer)

     def _setup_tokenizer(self) -> None:
         if self._tokenizer_name == "character":

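The dispatch is deliberately simple: anything other than a string (in practice the default, None) keeps the existing local _setup_tokenizer() path, while a string endpoint replaces the backend with KonohaAPITokenizer. A condensed, hypothetical sketch of just that branch, not the full class:

    from typing import Optional

    from konoha.word_tokenizers import KonohaAPITokenizer


    class WordTokenizerSketch:
        """Hypothetical stand-in; the real WordTokenizer does far more setup."""

        def __init__(self, tokenizer: str, endpoint: Optional[str] = None) -> None:
            self._tokenizer_name = tokenizer
            if not isinstance(endpoint, str):
                # Local path: build MeCab/KyTea/Janome/... in-process.
                self._setup_tokenizer()
            else:
                # Remote path: a string endpoint short-circuits local setup.
                self._tokenizer = KonohaAPITokenizer(tokenizer)

        def _setup_tokenizer(self) -> None:
            # Placeholder for the per-backend construction the real class does.
            pass
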
src/konoha/word_tokenizers/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -1,6 +1,7 @@
 from .character_tokenizer import CharacterTokenizer # NOQA
 from .janome_tokenizer import JanomeTokenizer # NOQA
 from .kytea_tokenizer import KyTeaTokenizer # NOQA
+from .konoha_api_tokenizer import KonohaAPITokenizer # NOQA
 from .mecab_tokenizer import MeCabTokenizer # NOQA
 from .nagisa_tokenizer import NagisaTokenizer # NOQA
 from .sentencepiece_tokenizer import SentencepieceTokenizer # NOQA
src/konoha/word_tokenizers/konoha_api_tokenizer.py

Lines changed: 9 additions & 0 deletions

@@ -0,0 +1,9 @@
+from konoha.word_tokenizers.tokenizer import BaseTokenizer
+
+
+class KonohaAPITokenizer(BaseTokenizer):
+    def __init__(self, tokenizer: str):
+        super().__init__(name=f"{tokenizer} (remote)")
+
+    def tokenize(self, text: str):
+        pass

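Note that tokenize is left as a stub in this commit; KonohaAPITokenizer only records the backend name with a "(remote)" suffix. Purely as a hypothetical illustration of where this could go, a remote call might look like the following; the request route and JSON payload shape are invented here, not defined by konoha:

    import json
    from urllib import request


    def tokenize_remote(endpoint: str, tokenizer: str, text: str):
        # Hypothetical sketch: this commit leaves KonohaAPITokenizer.tokenize
        # unimplemented, and the payload/route below are assumptions.
        payload = json.dumps({"tokenizer": tokenizer, "text": text}).encode("utf-8")
        req = request.Request(
            endpoint,  # e.g. "http://localhost:8000/api/v1/tokenize" (assumed)
            data=payload,
            headers={"Content-Type": "application/json"},
        )
        with request.urlopen(req) as response:
            return json.loads(response.read().decode("utf-8"))
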