
Commit d0d6bfa

Merge pull request #248 from ikawaha/develop
Release candidate
2 parents (92102e0 + 16d2de7), commit d0d6bfa

File tree: 4 files changed, +373 -28 lines


README.md (76 additions, 12 deletions)

@@ -21,7 +21,7 @@ The dictionary/statistical models such as MeCab-IPADIC, UniDic (unidic-mecab) an
 
 |dict| source | package |
 |:---|:---|:---|
-|MeCab IPADIC| mecab-ipadic-2.7.0-20070801 | [github.com/ikawaha/kagome-dict/ipa](https://github.com/ikawaha/kagome-dict/tree/master/ipa)|
+|MeCab IPADIC| mecab-ipadic-2.7.0-20070801 | [github.com/ikawaha/kagome-dict/ipa](https://github.com/ikawaha/kagome-dict/tree/master/ipa)|
 |UniDIC| unidic-mecab-2.1.2_src | [github.com/ikawaha/kagome-dict/uni](https://github.com/ikawaha/kagome-dict/tree/master/uni) |
 
 **Experimental Features**
@@ -131,26 +131,32 @@ The commands are:
    [tokenize] - command line tokenize (*default)
    server - run tokenize server
    lattice - lattice viewer
+   sentence - tiny sentence splitter
    version - show version
 
-tokenize [-file input_file] [-dict dic_file] [-userdict userdic_file] [-sysdict (ipa|uni)] [-simple false] [-mode (normal|search|extended)]
+tokenize [-file input_file] [-dict dic_file] [-userdict userdic_file] [-sysdict (ipa|uni)] [-simple false] [-mode (normal|search|extended)] [-split] [-json]
   -dict string
-        dict
+        dict
   -file string
-        input file
+        input file
+  -json
+        outputs in JSON format
   -mode string
-        tokenize mode (normal|search|extended) (default "normal")
+        tokenize mode (normal|search|extended) (default "normal")
   -simple
-        display abbreviated dictionary contents
+        display abbreviated dictionary contents
+  -split
+        use tiny sentence splitter
   -sysdict string
-        system dict type (ipa|uni) (default "ipa")
+        system dict type (ipa|uni) (default "ipa")
   -udict string
-        user dict
+        user dict
 ```
 
 ### Tokenize command
 
 ```shellsession
+% # interactive mode
 % kagome
 すもももももももものうち
 すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
@@ -163,6 +169,64 @@ tokenize [-file input_file] [-dict dic_file] [-userdict userdic_file] [-sysdict
 EOS
 ```
 
+```shellsession
+% # piped standard input
+% echo "すもももももももものうち" | kagome
+すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
+も 助詞,係助詞,*,*,*,*,も,モ,モ
+もも 名詞,一般,*,*,*,*,もも,モモ,モモ
+も 助詞,係助詞,*,*,*,*,も,モ,モ
+もも 名詞,一般,*,*,*,*,もも,モモ,モモ
+の 助詞,連体化,*,*,*,*,の,ノ,ノ
+うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
+EOS
+```
+
+```shellsession
+% # JSON output
+% echo "猫" | kagome -json | jq .
+[
+  {
+    "id": 286994,
+    "start": 0,
+    "end": 1,
+    "surface": "猫",
+    "class": "KNOWN",
+    "pos": [
+      "名詞",
+      "一般",
+      "*",
+      "*"
+    ],
+    "base_form": "猫",
+    "reading": "ネコ",
+    "pronunciation": "ネコ",
+    "features": [
+      "名詞",
+      "一般",
+      "*",
+      "*",
+      "*",
+      "*",
+      "猫",
+      "ネコ",
+      "ネコ"
+    ]
+  }
+]
+```
+
+```shellsession
+% echo "私ははにわよわわわんわん" | kagome -json | jq -r '.[].pronunciation'
+ワタシ
+
+ハニワ
+
+
+
+ワンワン
+```
+
 ### Server command
 
 **API**
@@ -171,15 +235,15 @@ Start a server and try to access the "/tokenize" endpoint.
 
 ```shellsession
 % kagome server &
-% curl -XPUT localhost:6060/tokenize -d'{"sentence":"すもももももももものうち", "mode":"normal"}' | jq .
+% curl -XPUT localhost:6060/tokenize -d'{"sentence":"すもももももももものうち", "mode":"normal"}' | jq .
 ```
 
-**Web App**
+**Web App**
 
 [![demo](https://img.shields.io/badge/demo-heroku_deployed-blue.svg)](https://kagome.herokuapp.com/)
 
 
-Start a server and access `http://localhost:6060`.
+Start a server and access `http://localhost:6060`.
 (To draw a lattice, the demo application uses graphviz. You need graphviz installed.)
 
 ```shellsession
@@ -204,7 +268,7 @@ A debug tool of tokenize process outputs a lattice in graphviz dot format.
 # Building to WebAssembly
 
 You can see how kagome wasm works in the [demo site](http://ikawaha.github.io/kagome/).
-The source code can be found in `./sample/wasm`.
+The source code can be found in `./sample/wasm`.
 
 # Licence
 
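For reference, `-json` emits the tokens as a single JSON array of objects with the fields shown above (`id`, `start`, `end`, `surface`, `class`, `pos`, `base_form`, `reading`, `pronunciation`, `features`). Below is a minimal consumer sketch in Go; it is not part of this commit, and the file name `consume_json.go` and struct are illustrative only:

```go
// consume_json.go: decode the output of `kagome -json` from stdin and print
// each token's surface form with its pronunciation. The struct mirrors the
// field names documented in the README example above.
package main

import (
	"encoding/json"
	"fmt"
	"os"
)

type token struct {
	ID            int      `json:"id"`
	Start         int      `json:"start"`
	End           int      `json:"end"`
	Surface       string   `json:"surface"`
	Class         string   `json:"class"`
	POS           []string `json:"pos"`
	BaseForm      string   `json:"base_form"`
	Reading       string   `json:"reading"`
	Pronunciation string   `json:"pronunciation"`
	Features      []string `json:"features"`
}

func main() {
	var tokens []token
	if err := json.NewDecoder(os.Stdin).Decode(&tokens); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	for _, t := range tokens {
		fmt.Printf("%s\t%s\n", t.Surface, t.Pronunciation)
	}
}
```

Usage: `echo "猫" | kagome -json | go run consume_json.go`.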

cmd/tokenize/cmd.go (6 additions, 16 deletions)

@@ -7,7 +7,6 @@ import (
 	"io"
 	"io/ioutil"
 	"os"
-	"strings"
 
 	"github.com/ikawaha/kagome-dict/dict"
 	"github.com/ikawaha/kagome-dict/ipa"
@@ -20,7 +19,8 @@ import (
 const (
 	CommandName  = "tokenize"
 	Description  = `command line tokenize`
-	usageMessage = "%s [-file input_file] [-dict dic_file] [-userdict userdic_file] [-sysdict (ipa|uni)] [-simple false] [-mode (normal|search|extended)] [-split]\n"
+	usageMessage = "%s [-file input_file] [-dict dic_file] [-userdict userdic_file]" +
+		" [-sysdict (ipa|uni)] [-simple false] [-mode (normal|search|extended)] [-split] [-json]\n"
 )
 
 // ErrorWriter writes to stderr
@@ -37,6 +37,7 @@ type option struct {
 	simple  bool
 	mode    string
 	split   bool
+	json    bool
 	flagSet *flag.FlagSet
 }
 
@@ -56,6 +57,7 @@ func newOption(w io.Writer, eh flag.ErrorHandling) (o *option) {
 	o.flagSet.BoolVar(&o.simple, "simple", false, "display abbreviated dictionary contents")
 	o.flagSet.StringVar(&o.mode, "mode", "normal", "tokenize mode (normal|search|extended)")
 	o.flagSet.BoolVar(&o.split, "split", false, "use tiny sentence splitter")
+	o.flagSet.BoolVar(&o.json, "json", false, "outputs in JSON format")
 
 	return
 }
@@ -155,20 +157,8 @@ func command(opt *option) error {
 	if opt.split {
 		s.Split(filter.ScanSentences)
 	}
-	for s.Scan() {
-		sen := s.Text()
-		tokens := t.Analyze(sen, mode)
-		for i, size := 1, len(tokens); i < size; i++ {
-			tok := tokens[i]
-			c := tok.Features()
-			if tok.Class == tokenizer.DUMMY {
-				fmt.Printf("%s\n", tok.Surface)
-			} else {
-				fmt.Printf("%s\t%v\n", tok.Surface, strings.Join(c, ","))
-			}
-		}
-	}
-	return s.Err()
+
+	return PrintScannedTokens(s, t, mode, opt)
 }
 
 // Run receives the slice of args and executes the tokenize tool
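The change above replaces the inline scan-and-print loop in `command` with a call to `PrintScannedTokens` (defined in the new file below). For orientation, here is a standalone sketch of the same flow against the kagome v2 library packages this file imports. `Analyze`, `filter.ScanSentences`, and `tokenizer.DUMMY` appear in the diff itself; the exact constructor signatures (`tokenizer.New`, `ipa.Dict`) are assumptions to check against the pinned module versions:

```go
// A minimal sketch of the scan-analyze-print flow that command() delegates
// to PrintScannedTokens: build a tokenizer, scan stdin sentence by sentence,
// and print each token with its features.
package main

import (
	"bufio"
	"fmt"
	"log"
	"os"
	"strings"

	"github.com/ikawaha/kagome-dict/ipa"
	"github.com/ikawaha/kagome/v2/filter"
	"github.com/ikawaha/kagome/v2/tokenizer"
)

func main() {
	// Build a tokenizer with the IPA system dictionary (the -sysdict ipa default).
	t, err := tokenizer.New(ipa.Dict())
	if err != nil {
		log.Fatal(err)
	}

	s := bufio.NewScanner(os.Stdin)
	// Equivalent of the -split flag: split the input into sentences.
	s.Split(filter.ScanSentences)

	for s.Scan() {
		for _, tok := range t.Analyze(s.Text(), tokenizer.Normal) {
			if tok.Class == tokenizer.DUMMY {
				continue // skip BOS/EOS dummy tokens
			}
			fmt.Printf("%s\t%s\n", tok.Surface, strings.Join(tok.Features(), ","))
		}
	}
	if err := s.Err(); err != nil {
		log.Fatal(err)
	}
}
```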

cmd/tokenize/print_scanned_tokens.go (116 additions, 0 deletions; new file)

@@ -0,0 +1,116 @@
+package tokenize
+
+import (
+	"bufio"
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	"github.com/ikawaha/kagome/v2/tokenizer"
+)
+
+// tokenedJSON is a struct to output the tokens in JSON format.
+type tokenedJSON struct {
+	ID            int      `json:"id"`
+	Start         int      `json:"start"`
+	End           int      `json:"end"`
+	Surface       string   `json:"surface"`
+	Class         string   `json:"class"`
+	POS           []string `json:"pos"`
+	BaseForm      string   `json:"base_form"`
+	Reading       string   `json:"reading"`
+	Pronunciation string   `json:"pronunciation"`
+	Features      []string `json:"features"`
+}
+
+// Variables for dependency injection and/or mocking in tests.
+var (
+	JSONMarshal = json.Marshal
+	FmtPrintF   = fmt.Printf
+)
+
+func fmtPrintF(format string, a ...interface{}) {
+	_, _ = FmtPrintF(format, a...)
+}
+
+// parseTokenToJSON converts the token to JSON in the same format as the
+// server mode response.
+func parseTokenToJSON(tok tokenizer.Token) ([]byte, error) {
+	j := tokenedJSON{
+		ID:       tok.ID,
+		Start:    tok.Start,
+		End:      tok.End,
+		Surface:  tok.Surface,
+		Class:    fmt.Sprintf("%v", tok.Class),
+		POS:      tok.POS(),
+		Features: tok.Features(),
+	}
+
+	j.BaseForm, _ = tok.BaseForm()
+	j.Reading, _ = tok.Reading()
+	j.Pronunciation, _ = tok.Pronunciation()
+
+	return JSONMarshal(j)
+}
+
+// printTokensAsDefault prints the tokenized text in the default format.
+// The default format is: [Surface]\t[Features in CSV]\n
+func printTokensAsDefault(s *bufio.Scanner, t *tokenizer.Tokenizer, mode tokenizer.TokenizeMode) error {
+	for s.Scan() {
+		sen := s.Text()
+		tokens := t.Analyze(sen, mode)
+
+		for i, size := 1, len(tokens); i < size; i++ {
+			tok := tokens[i]
+			c := tok.Features()
+			if tok.Class == tokenizer.DUMMY {
+				fmtPrintF("%s\n", tok.Surface)
+			} else {
+				fmtPrintF("%s\t%v\n", tok.Surface, strings.Join(c, ","))
+			}
+		}
+	}
+
+	return s.Err()
+}
+
+// printTokensInJSON prints the tokenized text in JSON format.
+func printTokensInJSON(s *bufio.Scanner, t *tokenizer.Tokenizer, mode tokenizer.TokenizeMode) (err error) {
+	var buff []byte
+
+	fmtPrintF("[\n") // Begin array bracket
+
+	for s.Scan() {
+		sen := s.Text()
+		tokens := t.Analyze(sen, mode)
+
+		for _, tok := range tokens {
+			if tok.ID == tokenizer.BosEosID {
+				continue
+			}
+
+			if len(buff) > 0 {
+				fmtPrintF("%s,\n", buff) // Print the buffered element followed by a comma
+			}
+
+			if buff, err = parseTokenToJSON(tok); err != nil {
+				return err
+			}
+		}
+	}
+
+	if s.Err() == nil {
+		fmtPrintF("%s\n", buff) // Print the last element without a trailing comma to close the array
+		fmtPrintF("]\n")        // End array bracket
+	}
+
+	return s.Err()
+}
+
+// PrintScannedTokens scans the input, tokenizes it, and prints the tokens.
+func PrintScannedTokens(s *bufio.Scanner, t *tokenizer.Tokenizer, mode tokenizer.TokenizeMode, opt *option) error {
+	if opt.json {
+		return printTokensInJSON(s, t, mode)
+	}
+
+	return printTokensAsDefault(s, t, mode)
+}
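A note on the design: `printTokensInJSON` streams a valid JSON array without collecting every token in memory. It buffers only the previously marshaled element and emits it with a trailing comma once the next element proves it was not the last; the exported `JSONMarshal` and `FmtPrintF` variables are seams for injecting failures or capturing output in tests. A generic, illustrative sketch of that delayed-comma pattern:

```go
// Stream a JSON array element by element: hold back the previous element and
// print "elem," only after seeing that another element follows, so the final
// element is printed without a trailing comma. Assumes at least one element,
// mirroring the buff handling in printTokensInJSON above.
package main

import "fmt"

func main() {
	items := []string{`{"n":1}`, `{"n":2}`, `{"n":3}`}

	fmt.Println("[")
	var prev string
	for _, cur := range items {
		if prev != "" {
			fmt.Printf("%s,\n", prev) // safe: cur proves prev is not the last
		}
		prev = cur
	}
	fmt.Printf("%s\n", prev) // last element: no trailing comma
	fmt.Println("]")
}
```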
