
Commit e6d7e31 (parent: 5eee7ce)

Author: Yh Tian
Commit message: implement the predicting function

8 files changed (+176, -101 lines)


README.md (+14, -1)

@@ -78,9 +78,22 @@ Here are some important parameters:
 * `--feature_flag`: use `pos`, `chunk`, or `dep` knowledge
 * `--model_name`: the name of the model to save
 
+## Predicting
+
+`run_sample.sh` contains the command line to segment and tag the sentences in an input file ([./sample_data/sentence.txt](./sample_data/sentence.txt)).
+
+Here are some important parameters:
+
+* `--do_predict`: segment and tag the sentences using a pre-trained TwASP model.
+* `--input_file`: the file containing the sentences to be segmented and tagged. Each line contains one sentence; you can refer to [a sample input file](./sample_data/sentence.txt) for the input format.
+* `--output_file`: the path of the output file. Words are separated by a space; the POS label of each word is attached to it by an underscore ("_").
+* `--eval_model`: the pre-trained TwASP model used to segment and tag the sentences in the input file.
+
+To run a pre-trained TwASP model, you need to install SCT and BNP to obtain the auto-analyzed syntactic knowledge. See [data_preprocessing](./data_preprocessing) for more information on downloading the two toolkits.
+
 ## To-do List
 
-* Implement `predict` function in `twasp_main.py`
+* Regular maintenance
 
 You can leave comments in the `Issues` section, if you want us to implement any functions.
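
For illustration only (not part of this commit), the output format described above (space-separated words, each with its POS tag attached by an underscore) could be read back with a few lines of Python; the sample sentence and tags below are made up:

```python
# Illustrative sketch, not part of this commit: reading one line of the
# predicted output, where words are separated by spaces and each word's
# POS tag is attached with an underscore, e.g. "中国_NR 人民_NN".
def parse_output_line(line):
    pairs = []
    for token in line.strip().split(' '):
        # rsplit keeps words that themselves contain an underscore intact
        word, pos = token.rsplit('_', 1)
        pairs.append((word, pos))
    return pairs

print(parse_output_line('中国_NR 人民_NN'))  # [('中国', 'NR'), ('人民', 'NN')]
```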

run_sample.sh (+2)

@@ -6,3 +6,5 @@ python twasp_main.py --do_train --train_data_path=./sample_data/train.tsv --eval
 # test
 python twasp_main.py --do_test --eval_data_path=./sample_data/test.tsv --eval_model=./models/model_name/model.pt
 
+# predict
+python twasp_main.py --do_predict --input_file=./sample_data/sentence.txt --output_file=./sample_data/sentence.txt.out --eval_model=./models/model_name/model.pt

sample_data/sentence.txt (+5)

@@ -0,0 +1,5 @@
+共同创造美好的新世纪——二○○一年新年贺词
+(二○○○年十二月三十一日)(附图片1张)
+女士们,先生们,同志们,朋友们:
+2001年新年钟声即将敲响。人类社会前进的航船就要驶入21世纪的新航程。中国人民进入了向现代化建设第三步战略目标迈进的新征程。
+在这个激动人心的时刻,我很高兴通过中国国际广播电台、中央人民广播电台和中央电视台,向全国各族人民,向香港特别行政区同胞、澳门特别行政区同胞和台湾同胞、海外侨胞,向世界各国的朋友们,致以新世纪第一个新年的祝贺!

twasp_eval.py (+28, -18)

@@ -1,39 +1,49 @@
-from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
+from seqeval.metrics import f1_score, precision_score, recall_score
+
 
 def eval_sentence(y_pred, y, sentence, word2id):
     words = sentence.split(' ')
-    seg_true = []
+
+    if y is not None:
+        seg_true = []
+        word_true = ''
+        y_word = []
+        y_pos = []
+        for y_label in y:
+            y_word.append(y_label[0])
+            y_pos.append(y_label[2:])
+
+        for i in range(len(y_word)):
+            word_true += words[i]
+            if y_word[i] in ['S', 'E']:
+                pos_tag_true = y_pos[i]
+                word_pos_true = word_true + '_' + pos_tag_true
+                if word_true not in word2id:
+                    word_pos_true = '*' + word_pos_true + '*'
+                seg_true.append(word_pos_true)
+                word_true = ''
+
+        seg_true_str = ' '.join(seg_true)
+    else:
+        seg_true_str = None
+
     seg_pred = []
-    word_true = ''
     word_pred = ''
 
-    y_word = []
-    y_pos = []
     y_pred_word = []
     y_pred_pos = []
-    for y_label, y_pred_label in zip(y, y_pred):
-        y_word.append(y_label[0])
-        y_pos.append(y_label[2:])
+    for y_pred_label in y_pred:
         y_pred_word.append(y_pred_label[0])
         y_pred_pos.append(y_pred_label[2:])
 
-    for i in range(len(y_word)):
-        word_true += words[i]
+    for i in range(len(y_pred_word)):
         word_pred += words[i]
-        if y_word[i] in ['S', 'E']:
-            pos_tag_true = y_pos[i]
-            word_pos_true = word_true + '_' + pos_tag_true
-            if word_true not in word2id:
-                word_pos_true = '*' + word_pos_true + '*'
-            seg_true.append(word_pos_true)
-            word_true = ''
         if y_pred_word[i] in ['S', 'E']:
            pos_tag_pred = y_pred_pos[i]
            word_pos_pred = word_pred + '_' + pos_tag_pred
            seg_pred.append(word_pos_pred)
            word_pred = ''
 
-    seg_true_str = ' '.join(seg_true)
     seg_pred_str = ' '.join(seg_pred)
     return seg_true_str, seg_pred_str
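
A minimal usage sketch of the revised `eval_sentence` in the prediction path, assuming the module above is importable; the sentence and tags are made up, and the gold side is skipped by passing `y=None`:

```python
from twasp_eval import eval_sentence  # assumes the file shown above is on the path

y_pred = ['B-NR', 'E-NR', 'S-DEG']    # one predicted label per character (made-up values)
sentence = '中 国 的'                  # characters joined by single spaces
seg_true_str, seg_pred_str = eval_sentence(y_pred, None, sentence, word2id={})

print(seg_true_str)  # None, because no gold labels were supplied
print(seg_pred_str)  # 中国_NR 的_DEG
```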

twasp_helper.py (+20, -78)

@@ -42,6 +42,19 @@ def read_tsv(file_path):
     return sentence_list, label_list
 
 
+def read_sentence(file_path):
+    sentence = []
+    with open(file_path, 'r', encoding='utf8') as f:
+        lines = f.readlines()
+    for line in lines:
+        line = line.strip()
+        if line == '':
+            continue
+        sentence.append([char for char in line])
+
+    return sentence, None
+
+
 def get_word2id(train_path):
     word2id = {'<PAD>': 0}
     word = ''
@@ -97,7 +110,7 @@ def merge_results(results):
     return merged
 
 
-def request_features_from_stanford(data_path):
+def request_features_from_stanford(data_path, do_predict=False):
     data_dir = data_path[:data_path.rfind('/')]
     flag = data_path[data_path.rfind('/') + 1: data_path.rfind('.')]
 
@@ -107,7 +120,10 @@ def request_features_from_stanford(data_path):
 
     print('Requesting Stanford results for %s' % str(data_path))
 
-    all_sentences, _ = read_tsv(data_path)
+    if do_predict:
+        all_sentences, _ = read_sentence(data_path)
+    else:
+        all_sentences, _ = read_tsv(data_path)
     sentences_str = []
     for sentence in all_sentences:
         sentences_str.append(''.join(sentence))
@@ -126,13 +142,13 @@
         f.write('\n')
 
 
-def request_features_from_berkeley(data_path):
+def request_features_from_berkeley(data_path, do_predict=False):
     data_dir = data_path[:data_path.rfind('/')]
     flag = data_path[data_path.rfind('/') + 1: data_path.rfind('.')]
 
     if not os.path.exists(path.join(data_dir, flag + '.stanford.json')):
         print('Do not find the Stanford data file\nRequesting Stanford segmentation results for %s' % str(data_path))
-        request_features_from_stanford(data_path, flag)
+        request_features_from_stanford(data_path, do_predict=do_predict)
     else:
         print('The Stanford data file for %s already exists!' % str(data_path))
     if os.path.exists(path.join(data_dir, flag + '.berkeley.json')):
@@ -164,14 +180,7 @@ def request_features_from_berkeley(data_path):
         pos_tags = parse_tree.pos()
 
         for i, (bt, (w, pos)) in enumerate(zip(berkeley_data['tokens'], pos_tags)):
-            # w = w_pos[0]
-            # pos = w_pos[1]
-            # try:
             assert bt['word'] == w
-            # except AssertionError:
-            #     print('error in sentence: %s' % ''.join(word_list))
-            #     print('word error: excepted %s, get %s' % (bt['word'], w))
-            # else:
             berkeley_data['tokens'][i]['pos'] = pos
         berkeley_all_data.append(berkeley_data)
 
@@ -455,70 +464,3 @@ def renew_ngram_by_freq(all_sentences, ngram2count, min_feq, ngram_len=10):
                 new_ngram2count[n_gram] += 1
     new_ngram_dict = {gram: c for gram, c in new_ngram2count.items() if c > min_feq}
     return new_ngram_dict
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument("--dataset",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-
-    args = parser.parse_args()
-    base_min_freq = 1
-    av_threshold = 2
-
-    min_freq = base_min_freq
-
-    print('min freq: %d' % min_freq)
-
-    data_dir = path.join(DATA_DIR, args.dataset)
-
-    print(data_dir)
-
-    # getlabels(data_dir)
-
-    # get_word2id(data_dir)
-
-    # be(data_dir, 0, 10)
-
-    # oov_stat(data_dir, 'train')
-    # oov_stat(data_dir, 'dev')
-    # oov_stat(data_dir, 'test')
-    # request_features_from_stanford(data_dir, 'train')
-    # request_features_from_stanford(data_dir, 'dev')
-    # request_features_from_stanford(data_dir, 'test')
-
-    # request_features_from_stanford(data_dir, 'bc')
-    # request_features_from_stanford(data_dir, 'bn')
-    # request_features_from_stanford(data_dir, 'cs')
-    # request_features_from_stanford(data_dir, 'df')
-    # request_features_from_stanford(data_dir, 'mz')
-    # request_features_from_stanford(data_dir, 'nw')
-    # request_features_from_stanford(data_dir, 'sc')
-    # request_features_from_stanford(data_dir, 'wb')
-
-    # request_features_from_stanford('./data/POS/demo', 'demo')
-
-    # sfp = stanford_feature_processor(data_dir)
-    # sfp._pre_processing()
-    # sfp.read_features('train')
-    # sfp.read_features('test')
-    # sfp.feature_stat()
-
-    # bek = berkeley_feature_processor(data_dir)
-    # bek.request_knoledge('train')
-    # bek.request_knoledge('dev')
-    # bek.request_knoledge('test')
-    # bek.request_knoledge('demo')
-    # bek._pre_processing()
-    # bek.feature_stat()
-
-    # attentionn_gram_stat(data_dir, 0, 10)
-
-    print('')
-
-    # exit()
-
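
A small sanity-check sketch for the new `read_sentence` helper, assuming the repository layout above; it simply turns every non-empty line of the input file into a list of characters and returns no labels:

```python
from twasp_helper import read_sentence  # assumes the file shown above is on the path

sentences, labels = read_sentence('./sample_data/sentence.txt')
print(labels)            # None: prediction input carries no gold labels
print(sentences[2][:3])  # first three characters of the third sentence, e.g. ['女', '士', '们']
```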

twasp_main.py (+87, -2)

@@ -480,8 +480,93 @@ def test(args):
 
 
 def predict(args):
-    # In progressing
-    return None
+
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        n_gpu = torch.cuda.device_count()
+    else:
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        n_gpu = 1
+        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
+        torch.distributed.init_process_group(backend='nccl')
+    print("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
+        device, n_gpu, bool(args.local_rank != -1), args.fp16))
+
+    joint_model_checkpoint = torch.load(args.eval_model)
+    joint_model = TwASP.from_spec(joint_model_checkpoint['spec'], joint_model_checkpoint['state_dict'], args)
+
+    if joint_model.use_attention:
+        if joint_model.source == 'stanford':
+            request_features_from_stanford(args.input_file, do_predict=True)
+        elif joint_model.source == 'berkeley':
+            request_features_from_berkeley(args.input_file, do_predict=True)
+        else:
+            raise ValueError('Invalid source %s. '
+                             'Source must be one of \'stanford\' or \'berkeley\' if attentions are used.'
+                             % joint_model.source)
+
+    eval_examples = joint_model.load_data(args.input_file, do_predict=True)
+    convert_examples_to_features = joint_model.convert_examples_to_features
+    feature2input = joint_model.feature2input
+    num_labels = joint_model.num_labels
+    word2id = joint_model.word2id
+    label_map = {v: k for k, v in joint_model.labelmap.items()}
+    label_map[0] = 'O'
+
+    if args.fp16:
+        joint_model.half()
+    joint_model.to(device)
+    if args.local_rank != -1:
+        try:
+            from apex.parallel import DistributedDataParallel as DDP
+        except ImportError:
+            raise ImportError(
+                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
+
+        joint_model = DDP(joint_model)
+    elif n_gpu > 1:
+        joint_model = torch.nn.DataParallel(joint_model)
+
+    joint_model.to(device)
+
+    joint_model.eval()
+    y_pred = []
+
+    for start_index in tqdm(range(0, len(eval_examples), args.eval_batch_size)):
+        eval_batch_examples = eval_examples[start_index: min(start_index + args.eval_batch_size,
+                                                             len(eval_examples))]
+        eval_features = convert_examples_to_features(eval_batch_examples)
+
+        feature_ids, input_ids, input_mask, l_mask, label_ids, ngram_ids, ngram_positions, \
+        segment_ids, valid_ids, word_ids, word_matching_matrix = feature2input(device, eval_features)
+
+        with torch.no_grad():
+            _, tag_seq = joint_model(input_ids, segment_ids, input_mask, label_ids, valid_ids, l_mask,
+                                     word_ids, feature_ids, word_matching_matrix, word_matching_matrix,
+                                     ngram_ids, ngram_positions)
+
+        logits = tag_seq.to('cpu').numpy()
+        label_ids = label_ids.to('cpu').numpy()
+
+        for i, label in enumerate(label_ids):
+            temp = []
+            for j, m in enumerate(label):
+                if j == 0:
+                    continue
+                elif label_ids[i][j] == num_labels - 1:
+                    y_pred.append(temp)
+                    break
+                else:
+                    temp.append(label_map[logits[i][j]])
+
+    print('write results to %s' % str(args.output_file))
+    with open(args.output_file, 'w') as writer:
+        for i in range(len(y_pred)):
+            sentence = eval_examples[i].text_a
+            _, seg_pred_str = eval_sentence(y_pred[i], None, sentence, word2id)
+            writer.write('%s\n' % seg_pred_str)
+
 
 
 def main():
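
To make the decoding loop inside `predict` easier to follow, here is a standalone sketch with made-up arrays; in the real code `logits` and `label_ids` come from the model and `feature2input`, and the loop stops at the id reserved for the sentence-end label:

```python
# Standalone sketch of the per-sentence decoding loop in predict();
# all values below are made-up stand-ins for the real model outputs.
label_map = {1: 'B-NR', 2: 'E-NR', 3: 'S-DEG', 4: '[SEP]'}
num_labels = 5                    # stand-in for joint_model.num_labels
logits = [[0, 1, 2, 3, 4, 0]]     # predicted label ids; position 0 is skipped
label_ids = [[0, 1, 1, 1, 4, 0]]  # num_labels - 1 marks the end of the sentence

y_pred = []
for i, label in enumerate(label_ids):
    temp = []
    for j, _ in enumerate(label):
        if j == 0:
            continue                             # skip the first (sentence-start) position
        elif label_ids[i][j] == num_labels - 1:  # end-of-sentence id reached
            y_pred.append(temp)
            break
        else:
            temp.append(label_map[logits[i][j]])

print(y_pred)  # [['B-NR', 'E-NR', 'S-DEG']]
```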

twasp_model.py (+19, -2)

@@ -189,8 +189,12 @@ def from_spec(cls, spec, model, args):
         res.load_state_dict(model)
         return res
 
-    def load_data(self, data_path):
-        lines = readfile(data_path)
+    def load_data(self, data_path, do_predict=False):
+
+        if do_predict:
+            lines = read_sentence(data_path)
+        else:
+            lines = readfile(data_path)
 
         flag = data_path[data_path.rfind('/')+1: data_path.rfind('.')]
 
@@ -654,3 +658,16 @@ def readfile(filename):
             label = []
     return data
 
+
+def read_sentence(filename):
+    data = []
+    with open(filename, 'r', encoding='utf8') as f:
+        lines = f.readlines()
+    for line in lines:
+        line = line.strip()
+        if line == '':
+            continue
+        sentence = [char for char in line]
+        label = ['<UNK>' for _ in sentence]
+        data.append((sentence, label))
+    return data
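
And a quick sketch of what the new `read_sentence` in `twasp_model.py` produces (assuming the repository layout above): character lists paired with placeholder `<UNK>` labels, so that prediction input can flow through the same data pipeline as labelled data:

```python
from twasp_model import read_sentence  # assumes the file shown above is on the path

data = read_sentence('./sample_data/sentence.txt')
chars, labels = data[2]   # third sentence of the sample input file
print(chars[:3])          # e.g. ['女', '士', '们']
print(labels[:3])         # ['<UNK>', '<UNK>', '<UNK>'], one placeholder per character
```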

updates.md (+1)

@@ -1,3 +1,4 @@
 # Important Updates
 
+* July 14, 2020: Implement the `predict` function in `twasp_main.py`. You can use that function to segment and tag the sentences in an input file with a pre-trained TwASP model. See [run_sample.sh](./run_sample.sh) for the usage and [./sample_data/sentence.txt](./sample_data/sentence.txt) for the input format. If you run pre-trained TwASP models that use the Stanford CoreNLP Toolkit v3.9.2 or the Berkeley Neural Parser, you need to download these toolkits before running. See [data_preprocessing](./data_preprocessing) for more information on installing the toolkits.
 * July 7, 2020: the release of [pre-trained TwASP models](./models).
