diff --git a/README.md b/README.md
index 1a68676..2103ab9 100644
--- a/README.md
+++ b/README.md
@@ -1,29 +1,37 @@
# jdepp-python
-Python binding for J.DepP(C++ implementation of Japanese Dependency Parsers)
+
+

+
-## Status
+Python binding for J.DepP(C++ implementation of Japanese Dependency Parsers)
-W.I.P.
+## Install
-## Build configuration
+```
+$ python -m pip install jdepp
+```
-* MeCab style POS format: `FEATURE_SEP ','`
-* See `jdepp/typedf.h` for more info about ifdef macros.
+### Precompiled model files
-## Precompiled model files
+Note that `pip install` does not install the model (dictionary) files.
You can get precompiled model files(MeCab POS tagging + train with KNBC copus) from
https://github.com/lighttransport/jdepp-python/releases/tag/v0.1.0
-Model file is licensed under 3-clause BSD license.
+The precompiled KNBC model file is licensed under the 3-clause BSD license.
+
+### Build configuration
+
+* MeCab style POS format: `FEATURE_SEP ','`
+* See `jdepp/typedf.h` for more info about ifdef macros.
## Example
Download precompiled model file.
-```
+```bash
$ wget https://github.com/lighttransport/jdepp-python/releases/download/v0.1.0/knbc-mecab-jumandic-2ndpoly.tar.gz
$ tar xvf knbc-mecab-jumandic-2ndpoly.tar.gz
```
@@ -49,9 +57,39 @@ input_postagged = """吾輩 名詞,普通名詞,*,*,吾輩,わがはい,代表
EOS
"""
-parser.parse_from_postagged(input_postagged)
+sent = parser.parse_from_postagged(input_postagged)
+print(sent)
```
+### Print in tree
+
+```py
+print(jdepp.to_tree(str(sent)))
+```
+
+```
+# S-ID: 1; J.DepP
+ 0: 吾輩は━━┓
+ 1: 猫である。━━┓
+ 2: 名前は━━┫
+ 3: まだ━━┫
+ 4: ない。EOS
+```
+
+### Graphviz dot export
+
+
+`jdepp.to_dot` is provided to export the dependency graph as DOT (Graphviz) text.
+
+```py
+dot_text = jdepp.to_dot(str(sentence))
+
+# feed output text to graphviz viewer, e.g. https://dreampuf.github.io/GraphvizOnline/
+```
+
+
+

+
## POS tagged input format
@@ -61,7 +99,7 @@ MeCab style. surface + TAB + feature(comma separated 7 fields)
You can use jagger-python for POS tagging.
-```
+```py
import jagger
import jdepp
@@ -84,6 +122,8 @@ parser.load_model(jdepp_model_path)
parser.parse_from_postagged(pos_tagged_input)
```
+
+
## Build standalone C++ app + training a model
If you just want to use J.DepP from cli(e.g. batch processing),
diff --git a/example/simple_jdepp.py b/example/simple_jdepp.py
index c07130b..6f03f27 100644
--- a/example/simple_jdepp.py
+++ b/example/simple_jdepp.py
@@ -1 +1,75 @@
import jdepp
+
+parser = jdepp.Jdepp()
+
+model_path = "model/knbc/"
+parser.load_model(model_path)
+
+# NOTE: MeCab format: surface + TAB + feature (comma-separated 7 fields)
+input_postagged = """吾輩 名詞,普通名詞,*,*,吾輩,わがはい,代表表記:我が輩/わがはい カテゴリ:人
+は 助詞,副助詞,*,*,は,は,*
+猫 名詞,普通名詞,*,*,猫,ねこ,*
+である 判定詞,*,判定詞,デアル列基本形,だ,である,*
+。 特殊,句点,*,*,。,。,*
+名前 名詞,普通名詞,*,*,名前,なまえ,*
+は 助詞,副助詞,*,*,は,は,*
+まだ 副詞,*,*,*,まだ,まだ,*
+ない 形容詞,*,イ形容詞アウオ段,基本形,ない,ない,*
+。 特殊,句点,*,*,。,。,*
+EOS
+"""
+
+sent = parser.parse_from_postagged(input_postagged)
+print(sent)
+
+print("--------------------")
+
+# Print each bunsetsu (phrase chunk) of the parsed sentence.
+for chunk in sent.chunks():
+    print("Chunk", chunk)
+
+print("--------------------")
+# Print each bunsetsu as one string (concatenate the surface form of its tokens).
+
+ss = []
+for chunk in sent.chunks():
+
+    s = ""
+    for token in chunk.tokens():
+        s += token.surface()
+
+    ss.append(s)
+
+print("|".join(ss))
+
+
+print("--------------------")
+
+# Print the tokens contained in each chunk.
+for chunk in sent.chunks():
+    print("Chunk ID:", chunk.id)
+
+    for token in chunk.tokens():
+        print(token)
+
+    print("")
+
+print("--------------------")
+
+# Show dependency edges: dependent chunk id -> head chunk id.
+for chunk in sent.chunks():
+    if len(chunk.dependents()):
+        for dep_chunk in chunk.dependents():
+            print(dep_chunk.id, "->", chunk.id)
+
+    print("")
+
+print("--------------------")
+
+# Print the dependency structure as an ASCII tree.
+print(jdepp.to_tree(str(sent)))
+
+print("--------------------")
+
+# Export the dependency graph as Graphviz dot text.
+print(jdepp.to_dot(str(sent)))
diff --git a/imgs/fastest-jdepp.png b/imgs/fastest-jdepp.png
new file mode 100755
index 0000000..69a853e
Binary files /dev/null and b/imgs/fastest-jdepp.png differ
diff --git a/imgs/wagahai-dot.png b/imgs/wagahai-dot.png
new file mode 100755
index 0000000..2c4bbb7
Binary files /dev/null and b/imgs/wagahai-dot.png differ
diff --git a/jdepp/jdepp_tools.py b/jdepp/jdepp_tools.py
index a921c15..a6a182c 100644
--- a/jdepp/jdepp_tools.py
+++ b/jdepp/jdepp_tools.py
@@ -64,7 +64,8 @@ def to_tree(lines, verbose: bool = False, prob: bool = False, morph: bool = Fals
ww ={'Na':1, 'W':2, 'F':2, 'H':1, 'A':2, 'N':1}
if isinstance(lines, str):
- lines = lines.split('\n')
+ # make List[str]
+ lines = [line + '\n' for line in lines.split('\n')]
result = ""
#for line in iter (sys.stdin.readline, ""): # no buffering
@@ -94,7 +95,7 @@ def to_tree(lines, verbose: bool = False, prob: bool = False, morph: bool = Fals
if not quiet or wrong:
text = treeify (binfo)
- result += hader
+ result += header
result += text
result += line
binfo[:] = []
@@ -108,25 +109,92 @@ def to_tree(lines, verbose: bool = False, prob: bool = False, morph: bool = Fals
# End jdepp/to_tree.py ---------------
-# Export as dot(graphviz)
+# Export as dot(graphviz), based on to_tree.py
+# Copyright: Naoki Yoshinaga
# Copyright: Light Transport Entertainment, Inc.
# License: BSD
-def dottify(lines, graph_name: str = 'jdepp', label_name: str = 'dependency'):
+
+def dottyfy (binfo, graph_name: str = "jdepp", label_name = "# S-ID; 1", prob: bool = False):
+ # TODO: better layouting by considering binfo.width
+ # TODO: show probability
+ # TODO: styles for node and edge.
s = ''
+ s += 'digraph ' + graph_name + ' {\n'
- s += 'digraph ' + graph_name + '{\n'
+ s += '\n'
+ s += ' graph [\n'
+ s += ' charset = "UTF-8";\n'
+ s += ' label = "{}";\n'.format(label_name)
+ s += ' ];\n'
+ s += '\n'
- s += '\ngraph [\n'
- s += ' charset = "UTF-8";\n'
- s += ' label = "{}";\n'.format(label_name)
- s += '];\n'
+ s += '\n'
+ s += ' node [ shape = record ];\n'
s += '\n'
- s += 'node [\n'
+ # define nodes
+ for b in binfo:
+ s += " bunsetsu{} [label=\"{}\"];\n".format(b.id, b.morph)
- s += ']\n'
+ s += '\n'
+
+ # define edges
+ for b in binfo:
+ if b.head < 0:
+ # root
+ continue
+
+ s += " bunsetsu{} -> bunsetsu{};\n".format(b.id, b.head)
- s += '}\n'
+
+ s += '\n}\n'
return s
+
+
+def to_dot(lines, morph: bool = True, ignore: str = "", prob: bool = False):
+    # Parse J.DepP parsed output and return it as a Graphviz dot string (None on failure).
+    binfo = []
+    header = ''
+    text = ''
+    charset = ''
+    wrong = False
+    pat_s = re.compile (r'[\t\s]')
+    pat_i = re.compile (re.escape (ignore))
+    tag = set (["D", "A", "P", "I"])
+    ww ={'Na':1, 'W':2, 'F':2, 'H':1, 'A':2, 'N':1}
+
+    if isinstance(lines, str):
+        # make List[str]
+        lines = [line + '\n' for line in lines.split('\n')]
+
+    for line in lines:
+        if line[:7] == '# S-ID:' or (ignore and pat_i.match (line)):
+            header += line
+        elif line[:-1] == 'EOS': # EOS
+            for line_ in text[:-1].split ('\n'):
+                if line_[0] == '*':
+                    gold, auto = line_[2:].split (' ', 3)[-2:] # [1:3]
+                    p = ""
+                    pos = auto.find ('@')
+                    if pos != -1:
+                        if prob: p = "%.2f" % float (auto[pos + 1:])
+                        auto = auto[:pos]
+                    fail = gold[-1] in tag and auto[:-1] != gold[:-1]
+                    wrong |= fail
+                    binfo.append (Binfo (len (binfo), int (auto[:-1]), p, fail, gold))
+                else:
+                    if binfo[-1].morph and morph:
+                        binfo[-1].morph += "|"
+                    binfo[-1].morph += pat_s.split (line_, 1)[0]
+            for b in binfo:
+                b.len = sum (ww[width (x)] for x in b.morph)
+            if not wrong:
+                return dottyfy (binfo)
+
+            return None # fail
+        else:
+            text += line
+
+    return None # failed to parse line
diff --git a/jdepp/python-binding-jdepp.cc b/jdepp/python-binding-jdepp.cc
index 2843e47..26e3827 100644
--- a/jdepp/python-binding-jdepp.cc
+++ b/jdepp/python-binding-jdepp.cc
@@ -658,7 +658,10 @@ class PyJdepp {
// This approach is redundunt and not memory-efficient,
// but this make Python binding easier(we don't need to consider lifetime of Python/C++ object)
const char *str = sent->print_tostr(pdep::RAW, /* print_prob */false);
- pysent.set_str(std::string(str));
+
+ // Assume single sentence in input text(i.e. one `EOS` line)
+ std::string header = "# S-ID: " + std::to_string(1) + "; J.DepP\n";
+ pysent.set_str(header + std::string(str));
std::vector py_chunks;