
Commit d0d6bfa

Merge pull request #248 from ikawaha/develop
Release candidate
2 parents (92102e0 + 16d2de7), commit d0d6bfa

File tree: 4 files changed, +373 -28 lines


README.md (76 additions, 12 deletions)

@@ -21,7 +21,7 @@ The dictionary/statistical models such as MeCab-IPADIC, UniDic (unidic-mecab) an
 
 |dict| source | package |
 |:---|:---|:---|
-|MeCab IPADIC| mecab-ipadic-2.7.0-20070801 | [github.com/ikawaha/kagome-dict/ipa](https://github.com/ikawaha/kagome-dict/tree/master/ipa)|
+|MeCab IPADIC| mecab-ipadic-2.7.0-20070801 | [github.com/ikawaha/kagome-dict/ipa](https://github.com/ikawaha/kagome-dict/tree/master/ipa)|
 |UniDIC| unidic-mecab-2.1.2_src | [github.com/ikawaha/kagome-dict/uni](https://github.com/ikawaha/kagome-dict/tree/master/uni) |
 
 **Experimental Features**
@@ -131,26 +131,32 @@ The commands are:
    [tokenize] - command line tokenize (*default)
    server - run tokenize server
    lattice - lattice viewer
+   sentence - tiny sentence splitter
    version - show version
 
-tokenize [-file input_file] [-dict dic_file] [-userdict userdic_file] [-sysdict (ipa|uni)] [-simple false] [-mode (normal|search|extended)]
+tokenize [-file input_file] [-dict dic_file] [-userdict userdic_file] [-sysdict (ipa|uni)] [-simple false] [-mode (normal|search|extended)] [-split] [-json]
   -dict string
-        dict
+        dict
   -file string
-        input file
+        input file
+  -json
+        outputs in JSON format
   -mode string
-        tokenize mode (normal|search|extended) (default "normal")
+        tokenize mode (normal|search|extended) (default "normal")
   -simple
-        display abbreviated dictionary contents
+        display abbreviated dictionary contents
+  -split
+        use tiny sentence splitter
   -sysdict string
-        system dict type (ipa|uni) (default "ipa")
+        system dict type (ipa|uni) (default "ipa")
   -udict string
-        user dict
+        user dict
 ```
 
 ### Tokenize command
 
 ```shellsession
+% # interactive mode
 % kagome
 すもももももももものうち
 すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
@@ -163,6 +169,64 @@ tokenize [-file input_file] [-dict dic_file] [-userdict userdic_file] [-sysdict
 EOS
 ```
 
+```shellsession
+% # piped standard input
+% echo "すもももももももものうち" | kagome
+すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
+も 助詞,係助詞,*,*,*,*,も,モ,モ
+もも 名詞,一般,*,*,*,*,もも,モモ,モモ
+も 助詞,係助詞,*,*,*,*,も,モ,モ
+もも 名詞,一般,*,*,*,*,もも,モモ,モモ
+の 助詞,連体化,*,*,*,*,の,ノ,ノ
+うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
+EOS
+```
+
+```shellsession
+% # JSON output
+% echo "猫" | kagome -json | jq .
+[
+  {
+    "id": 286994,
+    "start": 0,
+    "end": 1,
+    "surface": "猫",
+    "class": "KNOWN",
+    "pos": [
+      "名詞",
+      "一般",
+      "*",
+      "*"
+    ],
+    "base_form": "猫",
+    "reading": "ネコ",
+    "pronunciation": "ネコ",
+    "features": [
+      "名詞",
+      "一般",
+      "*",
+      "*",
+      "*",
+      "*",
+      "猫",
+      "ネコ",
+      "ネコ"
+    ]
+  }
+]
+```
+
+```shellsession
+% echo "私ははにわよわわわんわん" | kagome -json | jq -r '.[].pronunciation'
+ワタシ
+
+ハニワ
+
+
+
+ワンワン
+```
+
 ### Server command
 
 **API**
@@ -171,15 +235,15 @@ Start a server and try to access the "/tokenize" endpoint.
 
 ```shellsession
 % kagome server &
-% curl -XPUT localhost:6060/tokenize -d'{"sentence":"すもももももももものうち", "mode":"normal"}' | jq .
+% curl -XPUT localhost:6060/tokenize -d'{"sentence":"すもももももももものうち", "mode":"normal"}' | jq .
 ```
 
-**Web App**
+**Web App**
 
 [![demo](https://img.shields.io/badge/demo-heroku_deployed-blue.svg)](https://kagome.herokuapp.com/)
 
 
-Start a server and access `http://localhost:6060`.
+Start a server and access `http://localhost:6060`.
 (To draw a lattice, the demo application uses graphviz. You need graphviz installed.)
 
 ```shellsession
@@ -204,7 +268,7 @@ A debug tool of tokenize process outputs a lattice in graphviz dot format.
 # Building to WebAssembly
 
 You can see how kagome wasm works in the [demo site](http://ikawaha.github.io/kagome/).
-The source code can be found in `./sample/wasm`.
+The source code can be found in `./sample/wasm`.
 
 # Licence
 
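For reference, `-json` emits the tokens as a single JSON array of objects with the fields shown above (`id`, `start`, `end`, `surface`, `class`, `pos`, `base_form`, `reading`, `pronunciation`, `features`). Below is a minimal consumer sketch in Go; it is not part of this commit, and the file name `consume_json.go` and struct are illustrative only:

```go
// consume_json.go: decode the output of `kagome -json` from stdin and print
// each token's surface form with its pronunciation. The struct mirrors the
// field names documented in the README example above.
package main

import (
	"encoding/json"
	"fmt"
	"os"
)

type token struct {
	ID            int      `json:"id"`
	Start         int      `json:"start"`
	End           int      `json:"end"`
	Surface       string   `json:"surface"`
	Class         string   `json:"class"`
	POS           []string `json:"pos"`
	BaseForm      string   `json:"base_form"`
	Reading       string   `json:"reading"`
	Pronunciation string   `json:"pronunciation"`
	Features      []string `json:"features"`
}

func main() {
	var tokens []token
	if err := json.NewDecoder(os.Stdin).Decode(&tokens); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	for _, t := range tokens {
		fmt.Printf("%s\t%s\n", t.Surface, t.Pronunciation)
	}
}
```

Usage: `echo "猫" | kagome -json | go run consume_json.go`.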

cmd/tokenize/cmd.go (6 additions, 16 deletions)

@@ -7,7 +7,6 @@ import (
 	"io"
 	"io/ioutil"
 	"os"
-	"strings"
 
 	"github.com/ikawaha/kagome-dict/dict"
 	"github.com/ikawaha/kagome-dict/ipa"
@@ -20,7 +19,8 @@ import (
 const (
 	CommandName  = "tokenize"
 	Description  = `command line tokenize`
-	usageMessage = "%s [-file input_file] [-dict dic_file] [-userdict userdic_file] [-sysdict (ipa|uni)] [-simple false] [-mode (normal|search|extended)] [-split]\n"
+	usageMessage = "%s [-file input_file] [-dict dic_file] [-userdict userdic_file]" +
+		" [-sysdict (ipa|uni)] [-simple false] [-mode (normal|search|extended)] [-split] [-json]\n"
 )
 
 // ErrorWriter writes to stderr
@@ -37,6 +37,7 @@ type option struct {
 	simple  bool
 	mode    string
 	split   bool
+	json    bool
 	flagSet *flag.FlagSet
 }
 
@@ -56,6 +57,7 @@ func newOption(w io.Writer, eh flag.ErrorHandling) (o *option) {
 	o.flagSet.BoolVar(&o.simple, "simple", false, "display abbreviated dictionary contents")
 	o.flagSet.StringVar(&o.mode, "mode", "normal", "tokenize mode (normal|search|extended)")
 	o.flagSet.BoolVar(&o.split, "split", false, "use tiny sentence splitter")
+	o.flagSet.BoolVar(&o.json, "json", false, "outputs in JSON format")
 
 	return
 }
@@ -155,20 +157,8 @@ func command(opt *option) error {
 	if opt.split {
 		s.Split(filter.ScanSentences)
 	}
-	for s.Scan() {
-		sen := s.Text()
-		tokens := t.Analyze(sen, mode)
-		for i, size := 1, len(tokens); i < size; i++ {
-			tok := tokens[i]
-			c := tok.Features()
-			if tok.Class == tokenizer.DUMMY {
-				fmt.Printf("%s\n", tok.Surface)
-			} else {
-				fmt.Printf("%s\t%v\n", tok.Surface, strings.Join(c, ","))
-			}
-		}
-	}
-	return s.Err()
+
+	return PrintScannedTokens(s, t, mode, opt)
 }
 
 // Run receives the slice of args and executes the tokenize tool
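The change above replaces the inline scan-and-print loop in `command` with a call to `PrintScannedTokens` (defined in the new file below). For orientation, here is a standalone sketch of the same flow against the kagome v2 library packages this file imports. `Analyze`, `filter.ScanSentences`, and `tokenizer.DUMMY` appear in the diff itself; the exact constructor signatures (`tokenizer.New`, `ipa.Dict`) are assumptions to check against the pinned module versions:

```go
// A minimal sketch of the scan-analyze-print flow that command() delegates
// to PrintScannedTokens: build a tokenizer, scan stdin sentence by sentence,
// and print each token with its features.
package main

import (
	"bufio"
	"fmt"
	"log"
	"os"
	"strings"

	"github.com/ikawaha/kagome-dict/ipa"
	"github.com/ikawaha/kagome/v2/filter"
	"github.com/ikawaha/kagome/v2/tokenizer"
)

func main() {
	// Build a tokenizer with the IPA system dictionary (the -sysdict ipa default).
	t, err := tokenizer.New(ipa.Dict())
	if err != nil {
		log.Fatal(err)
	}

	s := bufio.NewScanner(os.Stdin)
	// Equivalent of the -split flag: split the input into sentences.
	s.Split(filter.ScanSentences)

	for s.Scan() {
		for _, tok := range t.Analyze(s.Text(), tokenizer.Normal) {
			if tok.Class == tokenizer.DUMMY {
				continue // skip BOS/EOS dummy tokens
			}
			fmt.Printf("%s\t%s\n", tok.Surface, strings.Join(tok.Features(), ","))
		}
	}
	if err := s.Err(); err != nil {
		log.Fatal(err)
	}
}
```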

cmd/tokenize/print_scanned_tokens.go (116 additions, 0 deletions; new file)

@@ -0,0 +1,116 @@
+package tokenize
+
+import (
+	"bufio"
+	"encoding/json"
+	"fmt"
+	"strings"
+
+	"github.com/ikawaha/kagome/v2/tokenizer"
+)
+
+// tokenedJSON is a struct to output the tokens in JSON format.
+type tokenedJSON struct {
+	ID            int      `json:"id"`
+	Start         int      `json:"start"`
+	End           int      `json:"end"`
+	Surface       string   `json:"surface"`
+	Class         string   `json:"class"`
+	POS           []string `json:"pos"`
+	BaseForm      string   `json:"base_form"`
+	Reading       string   `json:"reading"`
+	Pronunciation string   `json:"pronunciation"`
+	Features      []string `json:"features"`
+}
+
+// Variables for dependency injection and/or mocking in tests.
+var (
+	JSONMarshal = json.Marshal
+	FmtPrintF   = fmt.Printf
+)
+
+func fmtPrintF(format string, a ...interface{}) {
+	_, _ = FmtPrintF(format, a...)
+}
+
+// parseTokenToJSON converts the token to JSON in the same format as the
+// server mode response.
+func parseTokenToJSON(tok tokenizer.Token) ([]byte, error) {
+	j := tokenedJSON{
+		ID:       tok.ID,
+		Start:    tok.Start,
+		End:      tok.End,
+		Surface:  tok.Surface,
+		Class:    fmt.Sprintf("%v", tok.Class),
+		POS:      tok.POS(),
+		Features: tok.Features(),
+	}
+
+	j.BaseForm, _ = tok.BaseForm()
+	j.Reading, _ = tok.Reading()
+	j.Pronunciation, _ = tok.Pronunciation()
+
+	return JSONMarshal(j)
+}
+
+// printTokensAsDefault prints the tokenized text in the default format.
+// The default format is: [Surface]\t[Features in CSV]\n
+func printTokensAsDefault(s *bufio.Scanner, t *tokenizer.Tokenizer, mode tokenizer.TokenizeMode) error {
+	for s.Scan() {
+		sen := s.Text()
+		tokens := t.Analyze(sen, mode)
+
+		for i, size := 1, len(tokens); i < size; i++ {
+			tok := tokens[i]
+			c := tok.Features()
+			if tok.Class == tokenizer.DUMMY {
+				fmtPrintF("%s\n", tok.Surface)
+			} else {
+				fmtPrintF("%s\t%v\n", tok.Surface, strings.Join(c, ","))
+			}
+		}
+	}
+
+	return s.Err()
+}
+
+// printTokensInJSON prints the tokenized text in JSON format.
+func printTokensInJSON(s *bufio.Scanner, t *tokenizer.Tokenizer, mode tokenizer.TokenizeMode) (err error) {
+	var buff []byte
+
+	fmtPrintF("[\n") // Begin array bracket
+
+	for s.Scan() {
+		sen := s.Text()
+		tokens := t.Analyze(sen, mode)
+
+		for _, tok := range tokens {
+			if tok.ID == tokenizer.BosEosID {
+				continue
+			}
+
+			if len(buff) > 0 {
+				fmtPrintF("%s,\n", buff) // Print the buffered element followed by a comma
+			}
+
+			if buff, err = parseTokenToJSON(tok); err != nil {
+				return err
+			}
+		}
+	}
+
+	if s.Err() == nil {
+		fmtPrintF("%s\n", buff) // Print the last element without a trailing comma to close the array
+		fmtPrintF("]\n")        // End array bracket
+	}
+
+	return s.Err()
+}
+
+// PrintScannedTokens scans the input, tokenizes it, and prints the tokens.
+func PrintScannedTokens(s *bufio.Scanner, t *tokenizer.Tokenizer, mode tokenizer.TokenizeMode, opt *option) error {
+	if opt.json {
+		return printTokensInJSON(s, t, mode)
+	}
+
+	return printTokensAsDefault(s, t, mode)
+}
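A note on the design: `printTokensInJSON` streams a valid JSON array without collecting every token in memory. It buffers only the previously marshaled element and emits it with a trailing comma once the next element proves it was not the last; the exported `JSONMarshal` and `FmtPrintF` variables are seams for injecting failures or capturing output in tests. A generic, illustrative sketch of that delayed-comma pattern:

```go
// Stream a JSON array element by element: hold back the previous element and
// print "elem," only after seeing that another element follows, so the final
// element is printed without a trailing comma. Assumes at least one element,
// mirroring the buff handling in printTokensInJSON above.
package main

import "fmt"

func main() {
	items := []string{`{"n":1}`, `{"n":2}`, `{"n":3}`}

	fmt.Println("[")
	var prev string
	for _, cur := range items {
		if prev != "" {
			fmt.Printf("%s,\n", prev) // safe: cur proves prev is not the last
		}
		prev = cur
	}
	fmt.Printf("%s\n", prev) // last element: no trailing comma
	fmt.Println("]")
}
```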
