Commit

Merge pull request #132 from FengZiYjun/v0.3.1

fastNLP V0.3.1

FengZiYjun committed Feb 6, 2019
2 parents 3fa95b6 + b66d7b8 commit 13faa2b
Showing 70 changed files with 3,957 additions and 4,832 deletions.
5 changes: 5 additions & 0 deletions codecov.yml
@@ -0,0 +1,5 @@
ignore:
- "reproduction" # ignore folders and all its contents
- "setup.py"
- "docs"
- "tutorials"
5 changes: 3 additions & 2 deletions docs/source/tutorials/fastnlp_10tmin_tutorial.rst
@@ -1,7 +1,8 @@

fastNLP Hands-on Tutorial
fastNLP 10-Minute Tutorial
===============

The original tutorial is at https://github.com/fastnlp/fastNLP/blob/master/tutorials/fastnlp_10min_tutorial.ipynb

fastNLP provides convenient utilities for data preprocessing and for training and testing models.

DataSet & Instance
2 changes: 2 additions & 0 deletions docs/source/tutorials/fastnlp_1_minute_tutorial.rst
@@ -2,6 +2,8 @@
FastNLP 1-Minute Tutorial
=====================

The original tutorial is at https://github.com/fastnlp/fastNLP/blob/master/tutorials/fastnlp_1min_tutorial.ipynb

step 1
------

5 changes: 5 additions & 0 deletions docs/source/tutorials/fastnlp_advanced_tutorial.rst
@@ -0,0 +1,5 @@
fastNLP Advanced Tutorial
===============

The original tutorial is at https://github.com/fastnlp/fastNLP/blob/master/tutorials/fastnlp_advanced_tutorial/advance_tutorial.ipynb

5 changes: 5 additions & 0 deletions docs/source/tutorials/fastnlp_developer_guide.rst
@@ -0,0 +1,5 @@
fastNLP Developer Guide
===============

The original guide is at https://github.com/fastnlp/fastNLP/blob/master/tutorials/tutorial_for_developer.md

1 change: 1 addition & 0 deletions docs/source/user/installation.rst
@@ -5,6 +5,7 @@ Installation
.. contents::
:local:

Make sure your environment satisfies the requirements listed in https://github.com/fastnlp/fastNLP/blob/master/requirements.txt .

Run the following commands to install fastNLP package:

2 changes: 2 additions & 0 deletions docs/source/user/quickstart.rst
@@ -6,4 +6,6 @@ Quickstart

../tutorials/fastnlp_1_minute_tutorial
../tutorials/fastnlp_10tmin_tutorial
../tutorials/fastnlp_advanced_tutorial
../tutorials/fastnlp_developer_guide

21 changes: 11 additions & 10 deletions fastNLP/api/README.md
@@ -18,26 +18,27 @@ print(cws.predict(text))
# ['编者 按 : 7月 12日 , 英国 航空 航天 系统 公司 公布 了 该 公司 研制 的 第一 款 高 科技 隐形 无人 机雷电 之 神 。', '这 款 飞行 从 外型 上 来 看 酷似 电影 中 的 太空 飞行器 , 据 英国 方面 介绍 , 可以 实现 洲际 远程 打击 。', '那么 这 款 无人 机 到底 有 多 厉害 ?']
```
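
This release also adds a `test()` entry point to each predictor (see `examples.py`). A minimal sketch, assuming a local CoNLL-style sample file like the one used there:

```python
from fastNLP.api import CWS

cws = CWS(device='cpu')
# returns a metric dictionary such as {"F1": ..., "precision": ..., "recall": ...}
print(cws.test("../../test/data_for_tests/zh_sample.conllx"))
```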

### Chinese Word Segmentation + POS Tagging
### POS Tagging
```python
text = ['编者按:7月12日,英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。',
'这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。',
'那么这款无人机到底有多厉害?']
# The input must already be segmented into words
text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司',
'研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'],
['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']]
from fastNLP.api import POS
pos = POS(device='cpu')
print(pos.predict(text))
# [['编者/NN', '按/P', ':/PU', '7月/NT', '12日/NR', ',/PU', '英国/NR', '航空/NN', '航天/NN', '系统/NN', '公司/NN', '公布/VV', '了/AS', '该/DT', '公司/NN', '研制/VV', '的/DEC', '第一/OD', '款高/NN', '科技/NN', '隐形/NN', '无/VE', '人机/NN', '雷电/NN', '之/DEG', '神/NN', '。/PU'], ['这/DT', '款/NN', '飞行/VV', '从/P', '外型/NN', '上/LC', '来/MSP', '看/VV', '酷似/VV', '电影/NN', '中/LC', '的/DEG', '太空/NN', '飞行器/NN', ',/PU', '据/P', '英国/NR', '方面/NN', '介绍/VV', ',/PU', '可以/VV', '实现/VV', '洲际/NN', '远程/NN', '打击/NN', '。/PU'], ['那么/AD', '这/DT', '款/NN', '无/VE', '人机/NN', '到底/AD', '有/VE', '多/CD', '厉害/NN', '?/PU']]
# [['编者/NN', '按:/NN', '7月/NT', '12日/NT', ',/PU', '英国/NR', '航空/NN', '航天/NN', '系统/NN', '公司/NN', '公布/VV', '了/AS', '该/DT', '公司/NN', '研制/VV', '的/DEC', '第一款/NN', '高科技/NN', '隐形/AD', '无人机/VV', '雷电之神/NN', '。/PU'], ['那么/AD', '这/DT', '款/NN', '无人机/VV', '到底/AD', '有/VE', '多/AD', '厉害/VA', '?/PU']]
```
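
Each predicted token is a single `word/tag` string. A small helper (hypothetical, not part of the fastNLP API) for recovering `(word, tag)` pairs:

```python
def split_tags(tagged_sentences):
    """Turn 'word/tag' strings back into (word, tag) tuples."""
    # rsplit keeps words that themselves contain '/' intact
    return [[tuple(token.rsplit('/', 1)) for token in sentence]
            for sentence in tagged_sentences]

print(split_tags([['那么/AD', '厉害/VA']]))
# [[('那么', 'AD'), ('厉害', 'VA')]]
```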

### Chinese Word Segmentation + POS Tagging + Syntactic Parsing
### Syntactic Parsing
```python
text = ['编者按:7月12日,英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。',
'这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。',
'那么这款无人机到底有多厉害?']
text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司',
'研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'],
['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']]
from fastNLP.api import Parser
parser = Parser(device='cpu')
print(parser.predict(text))
# [['12/nsubj', '12/prep', '2/punct', '5/nn', '2/pobj', '12/punct', '11/nn', '11/nn', '11/nn', '11/nn', '2/pobj', '0/root', '12/asp', '15/det', '16/nsubj', '21/rcmod', '16/cpm', '21/nummod', '21/nn', '21/nn', '22/top', '12/ccomp', '24/nn', '26/assmod', '24/assm', '22/dobj', '12/punct'], ['2/det', '8/xsubj', '8/mmod', '8/prep', '6/lobj', '4/plmod', '8/prtmod', '0/root', '8/ccomp', '11/lobj', '14/assmod', '11/assm', '14/nn', '9/dobj', '8/punct', '22/prep', '18/nn', '19/nsubj', '16/pccomp', '22/punct', '22/mmod', '8/dep', '25/nn', '25/nn', '22/dobj', '8/punct'], ['4/advmod', '3/det', '4/nsubj', '0/root', '4/dobj', '7/advmod', '4/conj', '9/nummod', '7/dobj', '4/punct']]
# [['2/nn', '4/nn', '4/nn', '20/tmod', '11/punct', '10/nn', '10/nn', '10/nn', '10/nn', '11/nsubj', '20/dep', '11/asp', '14/det', '15/nsubj', '18/rcmod', '15/cpm', '18/nn', '11/dobj', '20/advmod', '0/root', '20/dobj', '20/punct'], ['4/advmod', '3/det', '8/xsubj', '8/dep', '8/advmod', '8/dep', '8/advmod', '0/root', '8/punct']]
```
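
Each parser output token is a `head_index/label` string; head indices are 1-based, with 0 marking the root. A hedged sketch (the helper name is illustrative) pairing each word with its predicted arc:

```python
def decode_arcs(words, parses):
    """Yield (position, word, head_index, label) for one sentence."""
    for i, (word, parse) in enumerate(zip(words, parses), start=1):
        head, label = parse.split('/', 1)
        yield i, word, int(head), label

words = ['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']
parses = ['4/advmod', '3/det', '8/xsubj', '8/dep', '8/advmod',
          '8/dep', '8/advmod', '0/root', '8/punct']
for position, word, head, label in decode_arcs(words, parses):
    print(position, word, '->', head, label)  # e.g. 8 厉害 -> 0 root
```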

See `examples.py` for complete examples.
129 changes: 86 additions & 43 deletions fastNLP/api/api.py
@@ -9,19 +9,17 @@

from fastNLP.api.utils import load_url
from fastNLP.api.processor import ModelProcessor
from reproduction.chinese_word_segment.cws_io.cws_reader import ConllCWSReader
from reproduction.pos_tag_model.pos_reader import ZhConllPOSReader
from reproduction.Biaffine_parser.util import ConllxDataLoader, add_seg_tag
from fastNLP.io.dataset_loader import ConllCWSReader, ConllxDataLoader
from fastNLP.core.instance import Instance
from fastNLP.api.pipeline import Pipeline
from fastNLP.core.metrics import SpanFPreRecMetric
from fastNLP.api.processor import IndexerProcessor

# TODO add pretrain urls
model_urls = {
"cws": "http://123.206.98.91:8888/download/cws_crf_1_11-457fc899.pkl",
"pos": "http://123.206.98.91:8888/download/pos_tag_model_20190108-f3c60ee5.pkl",
"parser": "http://123.206.98.91:8888/download/biaffine_parser-3a2f052c.pkl"
"cws": "http://123.206.98.91:8888/download/cws_lstm_ctb9_1_20-09908656.pkl",
"pos": "http://123.206.98.91:8888/download/pos_tag_model_20190119-43f8b435.pkl",
"parser": "http://123.206.98.91:8888/download/parser_20190204-c72ca5c0.pkl"
}
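
A hedged sketch of how these checkpoints are presumably consumed, via the load_url helper imported above. Assumption: load_url mirrors torch.utils.model_zoo.load_url (download to a local cache, then deserialize) and accepts a map_location keyword; verify against fastNLP/api/utils.py.

from fastNLP.api.utils import load_url

# Assumed signature, modeled on torch.utils.model_zoo.load_url:
cws_checkpoint = load_url(model_urls["cws"], map_location="cpu")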


@@ -31,6 +29,16 @@ def __init__(self):
self._dict = None

def predict(self, *args, **kwargs):
"""Do prediction for the given input.
"""
raise NotImplementedError

def test(self, file_path):
"""Test performance over the given data set.
:param str file_path:
:return: a dictionary of metric values
"""
raise NotImplementedError

def load(self, path, device):
@@ -69,12 +77,11 @@ def predict(self, content):
if not hasattr(self, "pipeline"):
raise ValueError("You have to load model first.")

sentence_list = []
sentence_list = content
# 1. Check the input type
if isinstance(content, str):
sentence_list.append(content)
elif isinstance(content, list):
sentence_list = content
for sentence in sentence_list:
if not all((type(obj) == str for obj in sentence)):
raise ValueError("Input must be list of list of string.")

# 2. Build the dataset
dataset = DataSet()
@@ -83,36 +90,28 @@
# 3. Apply the pipeline
self.pipeline(dataset)

def decode_tags(ins):
pred_tags = ins["tag"]
chars = ins["words"]
words = []
start_idx = 0
for idx, tag in enumerate(pred_tags):
if tag[0] == "S":
words.append(chars[start_idx:idx + 1] + "/" + tag[2:])
start_idx = idx + 1
elif tag[0] == "E":
words.append("".join(chars[start_idx:idx + 1]) + "/" + tag[2:])
start_idx = idx + 1
return words

dataset.apply(decode_tags, new_field_name="tag_output")

output = dataset.field_arrays["tag_output"].content
def merge_tag(words_list, tags_list):
rtn = []
for words, tags in zip(words_list, tags_list):
rtn.append([w + "/" + t for w, t in zip(words, tags)])
return rtn

output = dataset.field_arrays["tag"].content
if isinstance(content, str):
return output[0]
elif isinstance(content, list):
return output
return merge_tag(content, output)
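
A quick illustration of what merge_tag produces, as a self-contained copy of the function above for a runnable demo:

def merge_tag(words_list, tags_list):
    rtn = []
    for words, tags in zip(words_list, tags_list):
        rtn.append([w + "/" + t for w, t in zip(words, tags)])
    return rtn

print(merge_tag([['那么', '厉害']], [['AD', 'VA']]))
# [['那么/AD', '厉害/VA']]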

def test(self, file_path):
test_data = ZhConllPOSReader().load(file_path)
test_data = ConllxDataLoader().load(file_path)

tag_vocab = self._dict["tag_vocab"]
pipeline = self._dict["pipeline"]
save_dict = self._dict
tag_vocab = save_dict["tag_vocab"]
pipeline = save_dict["pipeline"]
index_tag = IndexerProcessor(vocab=tag_vocab, field_name="tag", new_added_field_name="truth", is_input=False)
pipeline.pipeline = [index_tag] + pipeline.pipeline

test_data.rename_field("pos_tags", "tag")
pipeline(test_data)
test_data.set_target("truth")
prediction = test_data.field_arrays["predict"].content
@@ -226,7 +225,7 @@ def test(self, filepath):
rec = eval_res['BMESF1PreRecMetric']['rec']
# print("f1:{:.2f}, pre:{:.2f}, rec:{:.2f}".format(f1, pre, rec))

return f1, pre, rec
return {"F1": f1, "precision": pre, "recall": rec}


class Parser(API):
@@ -251,6 +250,7 @@ def predict(self, content):
dataset.add_field('wp', pos_out)
dataset.apply(lambda x: ['<BOS>'] + [w.split('/')[0] for w in x['wp']], new_field_name='words')
dataset.apply(lambda x: ['<BOS>'] + [w.split('/')[1] for w in x['wp']], new_field_name='pos')
dataset.rename_field("words", "raw_words")

# 3. Apply the pipeline
self.pipeline(dataset)
@@ -260,39 +260,82 @@ def predict(self, content):
# output like: [['2/top', '0/root', '4/nn', '2/dep']]
return dataset.field_arrays['output'].content

def test(self, filepath):
data = ConllxDataLoader().load(filepath)
ds = DataSet()
for ins1, ins2 in zip(add_seg_tag(data), data):
ds.append(Instance(words=ins1[0], tag=ins1[1],
gold_words=ins2[0], gold_pos=ins2[1],
gold_heads=ins2[2], gold_head_tags=ins2[3]))
def load_test_file(self, path):
def get_one(sample):
sample = list(map(list, zip(*sample)))
if len(sample) == 0:
return None
for w in sample[7]:
if w == '_':
print('Error Sample {}'.format(sample))
return None
# return word_seq, pos_seq, head_seq, head_tag_seq
return sample[1], sample[3], list(map(int, sample[6])), sample[7]

datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.split('\t'))
if len(sample) > 0:
datalist.append(sample)

data = [get_one(sample) for sample in datalist]
data_list = list(filter(lambda x: x is not None, data))
return data_list
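
load_test_file expects CoNLL-X-style input: one tab-separated token per line, blank lines between sentences, and '#' comment lines skipped; column indices 1, 3, 6, and 7 carry the word, POS tag, head index, and dependency label. A hypothetical two-token fragment, transposed the same way get_one does:

# Hypothetical CoNLL-X fragment (columns: ID, FORM, LEMMA, CPOSTAG,
# POSTAG, FEATS, HEAD, DEPREL, PHEAD, PDEPREL):
sample = ("1\t这\t_\tDT\tDT\t_\t2\tdet\t_\t_\n"
          "2\t款\t_\tM\tM\t_\t0\troot\t_\t_\n")

rows = [line.split('\t') for line in sample.splitlines()]
cols = list(map(list, zip(*rows)))  # transpose, as get_one does
words, pos = cols[1], cols[3]
heads, labels = list(map(int, cols[6])), cols[7]
print(words, pos, heads, labels)
# ['这', '款'] ['DT', 'M'] [2, 0] ['det', 'root']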

def test(self, filepath):
data = self.load_test_file(filepath)

def convert(data):
BOS = '<BOS>'
dataset = DataSet()
for sample in data:
word_seq = [BOS] + sample[0]
pos_seq = [BOS] + sample[1]
heads = [0] + sample[2]
head_tags = [BOS] + sample[3]
dataset.append(Instance(raw_words=word_seq,
pos=pos_seq,
gold_heads=heads,
arc_true=heads,
tags=head_tags))
return dataset

ds = convert(data)
pp = self.pipeline
for p in pp:
if p.field_name == 'word_list':
p.field_name = 'gold_words'
elif p.field_name == 'pos_list':
p.field_name = 'gold_pos'
# ds.rename_field("words", "raw_words")
# ds.rename_field("tag", "pos")
pp(ds)
head_cor, label_cor, total = 0, 0, 0
for ins in ds:
head_gold = ins['gold_heads']
head_pred = ins['heads']
head_pred = ins['arc_pred']
length = len(head_gold)
total += length
for i in range(length):
head_cor += 1 if head_pred[i] == head_gold[i] else 0
uas = head_cor / total
print('uas:{:.2f}'.format(uas))
# print('uas:{:.2f}'.format(uas))

for p in pp:
if p.field_name == 'gold_words':
p.field_name = 'word_list'
elif p.field_name == 'gold_pos':
p.field_name = 'pos_list'

return uas
return {"UAS": round(uas, 5)}
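
UAS (unlabeled attachment score) is simply the fraction of tokens whose predicted head matches the gold head, exactly what the loop above computes. A toy check:

gold_heads = [2, 0, 2, 3]
pred_heads = [2, 0, 4, 3]
uas = sum(g == p for g, p in zip(gold_heads, pred_heads)) / len(gold_heads)
print(round(uas, 5))  # 0.75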


class Analyzer:
27 changes: 27 additions & 0 deletions fastNLP/api/examples.py
@@ -15,15 +15,42 @@ def chinese_word_segmentation():
print(cws.predict(text))


def chinese_word_segmentation_test():
cws = CWS(device='cpu')
print(cws.test("../../test/data_for_tests/zh_sample.conllx"))


def pos_tagging():
# The input must already be segmented into words
text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司',
'研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'],
['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']]
pos = POS(device='cpu')
print(pos.predict(text))


def pos_tagging_test():
pos = POS(device='cpu')
print(pos.test("../../test/data_for_tests/zh_sample.conllx"))


def syntactic_parsing():
text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司',
'研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'],
['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']]
parser = Parser(device='cpu')
print(parser.predict(text))


def syntactic_parsing_test():
parser = Parser(device='cpu')
print(parser.test("../../test/data_for_tests/zh_sample.conllx"))


if __name__ == "__main__":
# chinese_word_segmentation()
# chinese_word_segmentation_test()
# pos_tagging()
# pos_tagging_test()
syntactic_parsing()
# syntactic_parsing_test()