diff --git a/README.md b/README.md
index 1a68676..2103ab9 100644
--- a/README.md
+++ b/README.md
@@ -1,29 +1,37 @@
 # jdepp-python

-Python binding for J.DepP(C++ implementation of Japanese Dependency Parsers)
+
+
+
-## Status
+Python binding for J.DepP (C++ implementation of Japanese Dependency Parsers)

-W.I.P.
+## Install

-## Build configuration
+```
+$ python -m pip install jdepp
+```

-* MeCab style POS format: `FEATURE_SEP ','`
-* See `jdepp/typedf.h` for more info about ifdef macros.
+### Precompiled model files

-## Precompiled model files
+pip install does not install the model (dictionary). You can get precompiled model files (MeCab POS tagging, trained on the KNBC corpus) from https://github.com/lighttransport/jdepp-python/releases/tag/v0.1.0

-Model file is licensed under 3-clause BSD license.
+The precompiled KNBC model file is licensed under the 3-clause BSD license.
+
+### Build configuration
+
+* MeCab style POS format: `FEATURE_SEP ','`
+* See `jdepp/typedf.h` for more info about ifdef macros.

 ## Example

 Download precompiled model file.

-```
+```bash
 $ wget https://github.com/lighttransport/jdepp-python/releases/download/v0.1.0/knbc-mecab-jumandic-2ndpoly.tar.gz
 $ tar xvf knbc-mecab-jumandic-2ndpoly.tar.gz
 ```
@@ -49,9 +57,39 @@ input_postagged = """吾輩 名詞,普通名詞,*,*,吾輩,わがはい,代表
 EOS
 """

-parser.parse_from_postagged(input_postagged)
+sent = parser.parse_from_postagged(input_postagged)
+print(sent)
 ```

+### Print as a tree
+
+```py
+print(jdepp.to_tree(str(sent)))
+```
+
+```
+# S-ID: 1; J.DepP
+ 0:  吾輩は━━┓
+ 1:   猫である。━━┓
+ 2:     名前は━━┫
+ 3:      まだ━━┫
+ 4:        ない。EOS
+```
+
+### Graphviz dot export
+
+`jdepp.to_dot` is provided to export the dependency graph as dot (Graphviz).
+
+```py
+dot_text = jdepp.to_dot(str(sent))
+
+# feed the output text to a Graphviz viewer, e.g. https://dreampuf.github.io/GraphvizOnline/
+```
+
+
 ## POS tagged input format

@@ -61,7 +99,7 @@ MeCab style. surface + TAB + feature(comma separated 7 fields)

 You can use jagger-python for POS tagging.

-```
+```py
 import jagger
 import jdepp
@@ -84,6 +122,8 @@
 parser.load_model(jdepp_model_path)
 parser.parse_from_postagged(pos_tagged_input)
 ```
+
+
 ## Build standalone C++ app + training a model

 If you just want to use J.DepP from cli(e.g. batch processing),
diff --git a/example/simple_jdepp.py b/example/simple_jdepp.py
index c07130b..6f03f27 100644
--- a/example/simple_jdepp.py
+++ b/example/simple_jdepp.py
@@ -1 +1,75 @@
 import jdepp
+
+parser = jdepp.Jdepp()
+
+model_path = "model/knbc/"
+parser.load_model(model_path)
+
+# NOTE: MeCab format: surface + TAB + feature(comma separated 7 fields)
+input_postagged = """吾輩 名詞,普通名詞,*,*,吾輩,わがはい,代表表記:我が輩/わがはい カテゴリ:人
+は 助詞,副助詞,*,*,は,は,*
+猫 名詞,普通名詞,*,*,猫,ねこ,*
+である 判定詞,*,判定詞,デアル列基本形,だ,である,*
+。 特殊,句点,*,*,。,。,*
+名前 名詞,普通名詞,*,*,名前,なまえ,*
+は 助詞,副助詞,*,*,は,は,*
+まだ 副詞,*,*,*,まだ,まだ,*
+ない 形容詞,*,イ形容詞アウオ段,基本形,ない,ない,*
+。 特殊,句点,*,*,。,。,*
+EOS
+"""
+
+sent = parser.parse_from_postagged(input_postagged)
+print(sent)
+
+print("--------------------")
+
+# Print each bunsetsu(chunk)
+for chunk in sent.chunks():
+    print("Chunk", chunk)
+
+print("--------------------")
+# Print bunsetsu string(concat the surface of tokens)
+
+ss = []
+for chunk in sent.chunks():
+
+    s = ""
+    for token in chunk.tokens():
+        s += token.surface()
+
+    ss.append(s)
+
+print("|".join(ss))
+
+
+print("--------------------")
+
+# Print token in each chunk
+for chunk in sent.chunks():
+    print("Chunk ID:", chunk.id)
+
+    for token in chunk.tokens():
+        print(token)
+
+    print("")
+
+print("--------------------")
+
+# Show dependents
+for chunk in sent.chunks():
+    if len(chunk.dependents()):
+        for dep_chunk in chunk.dependents():
+            print(dep_chunk.id, "->", chunk.id)
+
+        print("")
+
+print("--------------------")
+
+# print tree
+print(jdepp.to_tree(str(sent)))
+
+print("--------------------")
+
+# export as graphviz dot
+print(jdepp.to_dot(str(sent)))
diff --git a/imgs/fastest-jdepp.png b/imgs/fastest-jdepp.png
new file mode 100755
index 0000000..69a853e
Binary files /dev/null and b/imgs/fastest-jdepp.png differ
diff --git a/imgs/wagahai-dot.png b/imgs/wagahai-dot.png
new file mode 100755
index 0000000..2c4bbb7
Binary files /dev/null and b/imgs/wagahai-dot.png differ
diff --git a/jdepp/jdepp_tools.py b/jdepp/jdepp_tools.py
index a921c15..a6a182c 100644
--- a/jdepp/jdepp_tools.py
+++ b/jdepp/jdepp_tools.py
@@ -64,7 +64,8 @@ def to_tree(lines, verbose: bool = False, prob: bool = False, morph: bool = Fals
     ww ={'Na':1, 'W':2, 'F':2, 'H':1, 'A':2, 'N':1}

     if isinstance(lines, str):
-        lines = lines.split('\n')
+        # make List[str]
+        lines = [line + '\n' for line in lines.split('\n')]

     result = ""
     #for line in iter (sys.stdin.readline, ""): # no buffering
@@ -94,7 +95,7 @@ def to_tree(lines, verbose: bool = False, prob: bool = False, morph: bool = Fals
             if not quiet or wrong:
                 text = treeify (binfo)
-                result += hader
+                result += header
                 result += text
                 result += line
             binfo[:] = []
@@ -108,25 +109,92 @@
 # End jdepp/to_tree.py ---------------

-# Export as dot(graphviz)
+# Export as dot(graphviz), based on to_tree.py
+# Copyright: Naoki Yoshinaga
 # Copyright: Light Transport Entertainment, Inc.
 # License: BSD
-def dottify(lines, graph_name: str = 'jdepp', label_name: str = 'dependency'):
+
+def dottyfy (binfo, graph_name: str = "jdepp", label_name = "# S-ID; 1", prob: bool = False):
+    # TODO: better layouting by considering binfo.width
+    # TODO: show probability
+    # TODO: styles for node and edge.
     s = ''
+    s += 'digraph ' + graph_name + ' {\n'
-    s += 'digraph ' + graph_name + '{\n'
+    s += '\n'
+    s += '  graph [\n'
+    s += '    charset = "UTF-8";\n'
+    s += '    label = "{}";\n'.format(label_name)
+    s += '  ];\n'
+    s += '\n'
-    s += '\ngraph [\n'
-    s += '  charset = "UTF-8";\n'
-    s += '  label = "{}";\n'.format(label_name)
-    s += '];\n'
+    s += '\n'
+    s += '  node [ shape = record ];\n'
     s += '\n'
-    s += 'node [\n'
+    # define nodes
+    for b in binfo:
+        s += "  bunsetsu{} [label=\"{}\"];\n".format(b.id, b.morph)
-    s += ']\n'
+    s += '\n'
+
+    # define edges
+    for b in binfo:
+        if b.head < 0:
+            # root
+            continue
+
+        s += "  bunsetsu{} -> bunsetsu{};\n".format(b.id, b.head)
-    s += '}\n'
+
+    s += '\n}\n'

     return s
+
+
+def to_dot(lines, morph: bool = True, ignore: str = "", prob: bool = False):
+
+    binfo = []
+    header = ''
+    text = ''
+    charset = ''
+    wrong = False
+    pat_s = re.compile (r'[\t\s]')
+    pat_i = re.compile (re.escape (ignore))
+    tag = set (["D", "A", "P", "I"])
+    ww ={'Na':1, 'W':2, 'F':2, 'H':1, 'A':2, 'N':1}
+
+    if isinstance(lines, str):
+        # make List[str]
+        lines = [line + '\n' for line in lines.split('\n')]
+
+    for line in lines:
+        if line[:7] == '# S-ID:' or (ignore and pat_i.match (line)):
+            header += line
+        elif line[:-1] == 'EOS': # EOS
+            for line_ in text[:-1].split ('\n'):
+                if line_[0] == '*':
+                    gold, auto = line_[2:].split (' ', 3)[-2:] # [1:3]
+                    p = ""
+                    pos = auto.find ('@')
+                    if pos != -1:
+                        if prob: p = "%.2f" % float (auto[pos + 1:])
+                        auto = auto[:pos]
+                    fail = gold[-1] in tag and auto[:-1] != gold[:-1]
+                    wrong |= fail
+                    binfo.append (Binfo (len (binfo), int (auto[:-1]), p, fail, gold))
+                else:
+                    if binfo[-1].morph and morph:
+                        binfo[-1].morph += "|"
+                    binfo[-1].morph += pat_s.split (line_, 1)[0]
+            for b in binfo:
+                b.len = sum (ww[width (x)] for x in b.morph)
+            if not wrong:
+                return dottyfy (binfo)
+
+            return None # fail
+        else:
+            text += line
+
+    return None # failed to parse line
diff --git a/jdepp/python-binding-jdepp.cc b/jdepp/python-binding-jdepp.cc
index 2843e47..26e3827 100644
--- a/jdepp/python-binding-jdepp.cc
+++ b/jdepp/python-binding-jdepp.cc
@@ -658,7 +658,10 @@ class PyJdepp {
     // This approach is redundunt and not memory-efficient,
     // but this make Python binding easier(we don't need to consider lifetime of Python/C++ object)
     const char *str = sent->print_tostr(pdep::RAW, /* print_prob */false);
-    pysent.set_str(std::string(str));
+
+    // Assume single sentence in input text(i.e. one `EOS` line)
+    std::string header = "# S-ID: " + std::to_string(1) + "; J.DepP\n";
+    pysent.set_str(header + std::string(str));

     std::vector py_chunks;
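
Reviewer note (not part of the patch): the README addition points readers at an online Graphviz viewer for the `jdepp.to_dot` output. A sketch along the following lines should also work for rendering locally. It assumes the third-party `graphviz` Python package and the Graphviz `dot` binary are installed, and that the precompiled KNBC model has been unpacked to `model/knbc/`; the output file name `wagahai` is arbitrary. Only `jdepp.Jdepp`, `load_model`, `parse_from_postagged`, and `jdepp.to_dot` come from this patch.

```py
# Sketch: render jdepp.to_dot output to a PNG locally.
# Assumes `pip install graphviz` plus the Graphviz `dot` binary.
import jdepp
from graphviz import Source

parser = jdepp.Jdepp()
parser.load_model("model/knbc/")

# POS-tagged input in MeCab format (shortened from example/simple_jdepp.py).
input_postagged = """吾輩 名詞,普通名詞,*,*,吾輩,わがはい,代表表記:我が輩/わがはい カテゴリ:人
は 助詞,副助詞,*,*,は,は,*
猫 名詞,普通名詞,*,*,猫,ねこ,*
である 判定詞,*,判定詞,デアル列基本形,だ,である,*
。 特殊,句点,*,*,。,。,*
EOS
"""

sent = parser.parse_from_postagged(input_postagged)
dot_text = jdepp.to_dot(str(sent))

# Writes wagahai.png next to the script; view=True would also open it.
Source(dot_text).render("wagahai", format="png", cleanup=True)
```

The same dot text can still be pasted into the online viewer mentioned in the README if a local Graphviz install is not available.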