Merge branch 'main' of github.com:lighttransport/jdepp-python into main

lighttransport · Jan 20, 2024 · 3978d2d · 3978d2d
2 parents 377e4fb + f260ca8
commit 3978d2d
Show file tree

Hide file tree

Showing 6 changed files with 209 additions and 24 deletions.
diff --git a/README.md b/README.md
@@ -1,29 +1,37 @@
 # jdepp-python
 
-Python binding for J.DepP(C++ implementation of Japanese Dependency Parsers)
+<div align="center">
+  <img src="imgs/fastest-jdepp.png" width="300"/>
+</div>
 
-## Status
+Python binding for J.DepP(C++ implementation of Japanese Dependency Parsers)
 
-W.I.P.
+## Install
 
-## Build configuration
+```
+$ python -m pip install jdepp
+```
 
-* MeCab style POS format: `FEATURE_SEP ','`
-* See `jdepp/typedf.h` for more info about ifdef macros.
+### Precompiled model files
 
-## Precompiled model files
+pip install does not install the model(dictionary).
 
 You can get precompiled model files (MeCab POS tagging + trained with the KNBC corpus) from
 
 https://github.com/lighttransport/jdepp-python/releases/tag/v0.1.0
 
-Model file is licensed under 3-clause BSD license.
+Precompiled KNBC model file is licensed under 3-clause BSD license.
+
+### Build configuration
+
+* MeCab style POS format: `FEATURE_SEP ','`
+* See `jdepp/typedf.h` for more info about ifdef macros.
 
 ## Example
 
 Download precompiled model file.
 
-```
+```bash
 $ wget https://github.com/lighttransport/jdepp-python/releases/download/v0.1.0/knbc-mecab-jumandic-2ndpoly.tar.gz
 $ tar xvf knbc-mecab-jumandic-2ndpoly.tar.gz
 ```
@@ -49,9 +57,39 @@ input_postagged = """吾輩	名詞,普通名詞,*,*,吾輩,わがはい,代表
 EOS
 """
 
-parser.parse_from_postagged(input_postagged)
+sent = parser.parse_from_postagged(input_postagged)
+print(sent)
 ```
 
+### Print in tree
+
+```py
+print(jdepp.to_tree(str(sent)))
+```
+
+```
+# S-ID: 1; J.DepP
+  0:　　吾輩は━━┓　　　
+  1:　　　猫である。━━┓
+  2:　　　　　名前は━━┫
+  3:　　　　　　まだ━━┫
+  4:　　　　　　　　ない。EOS
+```
+
+### Graphviz dot export
+
+
+`jdepp.to_dot` exports the dependency graph in dot (Graphviz) format.
+
+```py
+dot_text = jdepp.to_dot(str(sent))
+
+# feed output text to graphviz viewer, e.g. https://dreampuf.github.io/GraphvizOnline/
+```
+
+<div align="center">
+  <img src="imgs/wagahai-dot.png" width="400"/>
+</div>
 
 ## POS tagged input format
 
@@ -61,7 +99,7 @@ MeCab style. surface + TAB + feature(comma separated 7 fields)
 
 You can use jagger-python for POS tagging.
 
-```
+```py
 import jagger
 import jdepp
 
@@ -84,6 +122,8 @@ parser.load_model(jdepp_model_path)
 parser.parse_from_postagged(pos_tagged_input)
 ```
 
+
+
 ## Build standalone C++ app + training a model
 
 If you just want to use J.DepP from cli(e.g. batch processing),

diff --git a/example/simple_jdepp.py b/example/simple_jdepp.py
@@ -1 +1,75 @@
 import jdepp
+
+# Create the parser object provided by the jdepp binding.
+parser = jdepp.Jdepp()
+
+# Path handed to load_model(); presumably the directory where the
+# precompiled KNBC model archive was extracted -- adjust to your setup.
+model_path = "model/knbc/"
+parser.load_model(model_path)
+
+# NOTE: Mecab format: surface + TAB + feature(comma separated 7 fields)
+# One token per line; the text ends with an `EOS` line.
+input_postagged = """吾輩	名詞,普通名詞,*,*,吾輩,わがはい,代表表記:我が輩/わがはい カテゴリ:人
+は	助詞,副助詞,*,*,は,は,*
+猫	名詞,普通名詞,*,*,猫,ねこ,*
+である	判定詞,*,判定詞,デアル列基本形,だ,である,*
+。	特殊,句点,*,*,。,。,*
+名前	名詞,普通名詞,*,*,名前,なまえ,*
+は	助詞,副助詞,*,*,は,は,*
+まだ	副詞,*,*,*,まだ,まだ,*
+ない	形容詞,*,イ形容詞アウオ段,基本形,ない,ない,*
+。	特殊,句点,*,*,。,。,*
+EOS
+"""
+
+# Visual divider printed between the sections of the demo output.
+SEPARATOR = "--------------------"
+
+# Parse the POS-tagged text and print the raw parse result.
+sent = parser.parse_from_postagged(input_postagged)
+print(sent)
+
+print(SEPARATOR)
+
+# Print every bunsetsu (chunk) of the sentence.
+for bunsetsu in sent.chunks():
+    print("Chunk", bunsetsu)
+
+print(SEPARATOR)
+
+# Rebuild each bunsetsu string by concatenating the surfaces of its tokens.
+surfaces = [
+    "".join(tok.surface() for tok in bunsetsu.tokens())
+    for bunsetsu in sent.chunks()
+]
+print("|".join(surfaces))
+
+print(SEPARATOR)
+
+# Print the tokens that belong to each chunk.
+for bunsetsu in sent.chunks():
+    print("Chunk ID:", bunsetsu.id)
+    for tok in bunsetsu.tokens():
+        print(tok)
+    print("")
+
+print(SEPARATOR)
+
+# Show dependents: every chunk that points at a given head chunk.
+for head in sent.chunks():
+    children = head.dependents()
+    if len(children) == 0:
+        continue
+    for child in children:
+        print(child.id, "->", head.id)
+    print("")
+
+print(SEPARATOR)
+
+# Print the parse as a text tree.
+print(jdepp.to_tree(str(sent)))
+
+print(SEPARATOR)
+
+# Export the parse as Graphviz dot.
+print(jdepp.to_dot(str(sent)))
diff --git a/imgs/fastest-jdepp.png b/imgs/fastest-jdepp.png
diff --git a/imgs/wagahai-dot.png b/imgs/wagahai-dot.png
diff --git a/jdepp/jdepp_tools.py b/jdepp/jdepp_tools.py
@@ -64,7 +64,8 @@ def to_tree(lines, verbose: bool = False, prob: bool = False, morph: bool = Fals
     ww ={'Na':1, 'W':2, 'F':2, 'H':1, 'A':2, 'N':1}
 
     if isinstance(lines, str):
-        lines = lines.split('\n')
+        # make List[str]
+        lines = [line + '\n' for line in lines.split('\n')]
 
     result = ""
     #for line in iter (sys.stdin.readline, ""): # no buffering
@@ -94,7 +95,7 @@ def to_tree(lines, verbose: bool = False, prob: bool = False, morph: bool = Fals
 
             if not quiet or wrong:
                 text = treeify (binfo)
-                result += hader
+                result += header
                 result += text
                 result += line
             binfo[:] = []
@@ -108,25 +109,92 @@ def to_tree(lines, verbose: bool = False, prob: bool = False, morph: bool = Fals
 
 # End jdepp/to_tree.py ---------------
 
-# Export as dot(graphviz)
+# Export as dot(graphviz), based on to_tree.py
+# Copyright: Naoki Yoshinaga <ynaga@tkl.iis.u-tokyo.ac.jp>
 # Copyright: Light Transport Entertainment, Inc.
 # License: BSD
-def dottify(lines, graph_name: str = 'jdepp', label_name: str = 'dependency'):
+
+def dottyfy (binfo, graph_name: str = "jdepp", label_name = "# S-ID; 1", prob: bool = False):
+    # TODO: better layouting by considering binfo.width
+    # TODO: show probability
+    # TODO: styles for node and edge.
 
     s = ''
+    s += 'digraph ' + graph_name + ' {\n'
 
-    s += 'digraph ' + graph_name + '{\n'
+    s += '\n'
+    s += '  graph [\n'
+    s += '    charset = "UTF-8";\n'
+    s += '    label = "{}";\n'.format(label_name)
+    s += '  ];\n'
+    s += '\n'
 
-    s += '\ngraph [\n'
-    s += '  charset = "UTF-8";\n'
-    s += '  label = "{}";\n'.format(label_name)
-    s += '];\n'
+    s += '\n'
+    s += '  node [ shape = record ];\n'
     s += '\n'
 
-    s += 'node [\n'
+    # define nodes
+    for b in binfo:
+        s += "  bunsetsu{} [label=\"{}\"];\n".format(b.id, b.morph)
 
-    s += ']\n'
+    s += '\n'
+
+    # define edges
+    for b in binfo:
+        if b.head < 0:
+            # root
+            continue
+
+        s += "  bunsetsu{} -> bunsetsu{};\n".format(b.id, b.head)
 
-    s += '}\n'
+
+    s += '\n}\n'
 
     return s
+
+
+def to_dot(lines, morph: bool = True, ignore: str = "", prob: bool = False):
+    """Convert J.DepP parser output into a Graphviz dot description.
+
+    Only the first sentence (everything up to the first ``EOS`` line) is
+    converted.
+
+    Args:
+        lines: Parser output, either a single string or an iterable of
+            newline-terminated lines.
+        morph: When true, the morphemes of a bunsetsu are joined with ``|``.
+        ignore: Literal prefix; lines starting with it are skipped like
+            ``# S-ID:`` header lines.
+        prob: When true, keep the value given after ``@`` in a chunk header,
+            formatted with two decimals.
+
+    Returns:
+        The dot text produced by ``dottyfy``, or ``None`` when no ``EOS``
+        line is found, the input is malformed, or a chunk's parsed head
+        disagrees with its gold annotation.
+    """
+
+    binfo = []
+    buffered = []
+    wrong = False
+    pat_s = re.compile (r'[\t\s]')
+    pat_i = re.compile (re.escape (ignore))
+    tag = set (["D", "A", "P", "I"])
+    ww = {'Na':1, 'W':2, 'F':2, 'H':1, 'A':2, 'N':1}
+
+    if isinstance(lines, str):
+        # make List[str]
+        lines = [line + '\n' for line in lines.split('\n')]
+
+    for line in lines:
+        if line[:7] == '# S-ID:' or (ignore and pat_i.match (line)):
+            # Header / ignored lines carry no parse content.
+            continue
+        if line[:-1] != 'EOS':
+            buffered.append (line)
+            continue
+
+        # EOS: the buffered lines hold one complete sentence.
+        text = ''.join (buffered)
+        for line_ in text[:-1].split ('\n'):
+            if not line_:
+                # Blank line (or empty sentence): nothing to record.
+                continue
+            if line_[0] == '*':
+                # Chunk header: the last two fields are gold and auto heads.
+                gold, auto = line_[2:].split (' ', 3)[-2:] # [1:3]
+                p = ""
+                pos = auto.find ('@')
+                if pos != -1:
+                    # `prob` used to be an undefined name here (NameError).
+                    if prob:
+                        p = "%.2f" % float (auto[pos + 1:])
+                    auto = auto[:pos]
+                fail = gold[-1] in tag and auto[:-1] != gold[:-1]
+                wrong |= fail
+                binfo.append (Binfo (len (binfo), int (auto[:-1]), p, fail, gold))
+            else:
+                if not binfo:
+                    # Morpheme line before any chunk header: malformed input.
+                    return None
+                if binfo[-1].morph and morph:
+                    binfo[-1].morph += "|"
+                binfo[-1].morph += pat_s.split (line_, 1)[0]
+        for b in binfo:
+            b.len = sum (ww[width (x)] for x in b.morph)
+        if not wrong:
+            return dottyfy (binfo)
+
+        return None # fail
+
+    return None # failed to parse line
diff --git a/jdepp/python-binding-jdepp.cc b/jdepp/python-binding-jdepp.cc
@@ -658,7 +658,10 @@ class PyJdepp {
     // This approach is redundant and not memory-efficient,
     // but it makes the Python binding easier (we don't need to consider the lifetime of Python/C++ objects)
     const char *str = sent->print_tostr(pdep::RAW, /* print_prob */false);
-    pysent.set_str(std::string(str));
+
+    // Assume single sentence in input text(i.e. one `EOS` line)
+    std::string header = "# S-ID: " + std::to_string(1) + "; J.DepP\n";
+    pysent.set_str(header + std::string(str));
 
     std::vector<PyChunk> py_chunks;