
Commit 786e97a

Author: Yh Tian (committed)
update the way to save model
1 parent e3ee0c0 commit 786e97a

File tree

6 files changed: +160 -120 lines changed
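This commit makes the saved checkpoint self-contained: hyper-parameters are gathered once with `TwASP.init_hyper_parameters(args)` and handed to the model, the `feature2input` helper and the data-split `flag` handling move into the model and the feature-request helpers, and `test()` rebuilds the model from `checkpoint['spec']` and `checkpoint['state_dict']` via `TwASP.from_spec(...)`. Below is a minimal, hypothetical sketch of that spec/state_dict save-and-load pattern; the `save_model`/`load_model` names and the exact contents of `spec` are assumptions for illustration, not code from this repository.

```python
import torch

def save_model(model, path):
    # Hypothetical helper: store the model's rebuild recipe ("spec",
    # e.g. hyper-parameters and vocabularies) next to the learned weights.
    torch.save({'spec': model.spec, 'state_dict': model.state_dict()}, path)

def load_model(model_cls, path, args):
    # Mirrors the pattern used in test(): torch.load, then from_spec.
    checkpoint = torch.load(path, map_location='cpu')
    return model_cls.from_spec(checkpoint['spec'], checkpoint['state_dict'], args)
```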

README.md

+5-5
@@ -9,14 +9,14 @@ We will keep updating this repository these days.
 If you use or extend our work, please cite our paper at ACL2020.
 
 ```
-@inproceedings{tian-etal-2020-improving,
-    title = "Improving {C}hinese Word Segmentation with Wordhood Memory Networks",
-    author = "Tian, Yuanhe and Song, Yan and Xia, Fei and Zhang, Tong and Wang, Yonggang",
+@inproceedings{tian-etal-2020-joint,
+    title = "Joint Chinese Word Segmentation and Part-of-speech Tagging via Two-way Attentions of Auto-analyzed Knowledge",
+    author = "Tian, Yuanhe and Song, Yan and Ao, Xiang and Xia, Fei and Quan, Xiaojun and Zhang, Tong and Wang, Yonggang",
     booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
     month = jul,
     year = "2020",
     address = "Online",
-    pages = "8274--8285",
+    pages = "8286--8296",
 }
 ```

@@ -42,7 +42,7 @@ Run `run_sample.sh` to train a model on the small sample data under the `sample_
 
 We use [CTB5](https://catalog.ldc.upenn.edu/LDC2005T01), [CTB6](https://catalog.ldc.upenn.edu/LDC2007T36), [CTB7](https://catalog.ldc.upenn.edu/LDC2010T07), [CTB9](https://catalog.ldc.upenn.edu/LDC2016T13), and [Universal Dependencies 2.4](https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-2988) (UD) in our paper.
 
-To obtain and pre-process the data, you can go to `data_preprocessing` directory and run `getdata.sh`. This script will download and process the official data from UD. For CTB5 (LDC05T01), CTB6 (LDC07T36), CTB7 (LDC10T07), and CTB9 (LDC2016T13), you need to obtain the official data yourself, and then put the raw data directory under the `data_preprocessing` directory.
+To obtain and pre-process the data, you can go to `data_preprocessing` directory and run `getdata.sh`. This script will download and process the official data from UD. For CTB5 (LDC05T01), CTB6 (LDC07T36), CTB7 (LDC10T07), and CTB9 (LDC2016T13), you need to obtain the official data yourself, and then put the raw data folder under the `data_preprocessing` directory.
 
 The script will also download the [Stanford CoreNLP Toolkit v3.9.2](https://stanfordnlp.github.io/CoreNLP/history.html) (SCT) and [Berkeley Neural Parser](https://github.com/nikitakit/self-attentive-parser) (BNP) to obtain the auto-analyzed syntactic knowledge. You can refer to their website for more information.
get_syninfo.py

+2-2
@@ -42,14 +42,14 @@
         if os.path.exists(out_file) and not args.overwrite:
             print('File already exists: %s' % str(out_file))
             continue
-        request_features_from_stanford(input_file, flag)
+        request_features_from_stanford(input_file)
 
     elif args.toolkit == 'BNP':
         out_file = os.path.join(input_dir, flag + '.berkeley.json')
         if os.path.exists(out_file) and not args.overwrite:
             print('File already exists: %s' % str(out_file))
             continue
-        request_features_from_berkeley(input_file, flag)
+        request_features_from_berkeley(input_file)
     else:
         raise ValueError('Invalid type of toolkit name: %s. Should be one of \'SCT\' and \'BNP\'.' % args.toolkit)

run_sample.sh

+1-1
@@ -1,7 +1,7 @@
 mkdir logs
 
 # train
-python twasp_main.py --do_train --train_data_path=./sample_data/train.tsv --eval_data_path=./sample_data/test.tsv --use_bert --bert_model=/path/to/bert/model --use_attention --max_seq_length=300 --max_ngram_size=300 --train_batch_size=2 --eval_batch_size=2 --num_train_epochs=3 --warmup_proportion=0.1 --learning_rate=1e-5 --patient=15 --source=stanford --feature_flag=pos --model_name=sample_model
+python twasp_main.py --do_train --train_data_path=./sample_data/train.tsv --eval_data_path=./sample_data/dev.tsv --use_bert --bert_model=/path/to/bert/model --use_attention --max_seq_length=300 --max_ngram_size=300 --train_batch_size=2 --eval_batch_size=2 --num_train_epochs=3 --warmup_proportion=0.1 --learning_rate=1e-5 --patient=15 --source=stanford --feature_flag=pos --model_name=sample_model
 
 # test
 python twasp_main.py --do_test --eval_data_path=./sample_data/test.tsv --eval_model=./models/model_name/model.pt

twasp_helper.py

+5-2
@@ -97,8 +97,10 @@ def merge_results(results):
     return merged
 
 
-def request_features_from_stanford(data_path, flag):
+def request_features_from_stanford(data_path):
     data_dir = data_path[:data_path.rfind('/')]
+    flag = data_path[data_path.rfind('/') + 1: data_path.rfind('.')]
+
     if os.path.exists(path.join(data_dir, flag + '.stanford.json')):
         print('The Stanford data file for %s already exists!' % str(data_path))
         return None
@@ -124,8 +126,9 @@ def request_features_from_stanford(data_path, flag):
         f.write('\n')
 
 
-def request_features_from_berkeley(data_path, flag):
+def request_features_from_berkeley(data_path):
     data_dir = data_path[:data_path.rfind('/')]
+    flag = data_path[data_path.rfind('/') + 1: data_path.rfind('.')]
 
     if not os.path.exists(path.join(data_dir, flag + '.stanford.json')):
         print('Do not find the Stanford data file\nRequesting Stanford segmentation results for %s' % str(data_path))
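With this change, callers no longer pass the data-split `flag`; `request_features_from_stanford` and `request_features_from_berkeley` recover it from the file name itself. A small standalone sketch of the same slicing, using a hypothetical path for illustration:

```python
data_path = './sample_data/train.tsv'  # hypothetical example path

# Same slicing as the added lines: the directory before the last '/',
# and the file stem between the last '/' and the last '.'.
data_dir = data_path[:data_path.rfind('/')]
flag = data_path[data_path.rfind('/') + 1: data_path.rfind('.')]

print(data_dir)  # ./sample_data
print(flag)      # train -> used to name train.stanford.json / train.berkeley.json
```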

twasp_main.py

+25-65
@@ -26,8 +26,8 @@ def train(args):
     if args.use_bert and args.use_zen:
         raise ValueError('We cannot use both BERT and ZEN')
 
-    if not os.path.exists('./logs/'):
-        os.mkdir('./logs')
+    if not os.path.exists('./logs'):
+        os.mkdir('logs')
 
     now_time = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
     log_file_name = './logs/log-' + now_time
@@ -88,12 +88,12 @@ def train(args):
 
     if args.use_attention:
         if args.source == 'stanford':
-            request_features_from_stanford(args.train_data_path, flag='train')
-            request_features_from_stanford(args.eval_data_path, flag='test')
+            request_features_from_stanford(args.train_data_path)
+            request_features_from_stanford(args.eval_data_path)
             processor = stanford_feature_processor()
         elif args.source == 'berkeley':
-            request_features_from_berkeley(args.train_data_path, flag='train')
-            request_features_from_berkeley(args.eval_data_path, flag='test')
+            request_features_from_berkeley(args.train_data_path)
+            request_features_from_berkeley(args.eval_data_path)
             processor = berkeley_feature_processor()
         else:
             raise ValueError('Source must be one of \'stanford\' or \'berkeley\' if attentions are used.')
@@ -103,12 +103,14 @@ def train(args):
         gram2id = None
         feature2id = None
 
-    joint_model = TwASP(word2id, gram2id, feature2id, label_map, processor, args)
+    hpara = TwASP.init_hyper_parameters(args)
+    joint_model = TwASP(word2id, gram2id, feature2id, label_map, processor, hpara, args)
 
-    train_examples = joint_model.load_data(args.train_data_path, flag='train')
-    eval_examples = joint_model.load_data(args.eval_data_path, flag='test')
+    train_examples = joint_model.load_data(args.train_data_path)
+    eval_examples = joint_model.load_data(args.eval_data_path)
     num_labels = joint_model.num_labels
     convert_examples_to_features = joint_model.convert_examples_to_features
+    feature2input = joint_model.feature2input
 
     total_params = sum(p.numel() for p in joint_model.parameters() if p.requires_grad)
     logger.info('# of trainable parameters: %d' % total_params)
@@ -194,7 +196,7 @@ def train(args):
                 continue
             train_features = convert_examples_to_features(batch_examples)
             feature_ids, input_ids, input_mask, l_mask, label_ids, ngram_ids, ngram_positions, \
-                segment_ids, valid_ids, word_ids, word_matching_matrix = feature2input(args, device, train_features)
+                segment_ids, valid_ids, word_ids, word_matching_matrix = feature2input(device, train_features)
 
             loss, _ = joint_model(input_ids, segment_ids, input_mask, label_ids, valid_ids, l_mask, word_ids,
                                   feature_ids, word_matching_matrix, word_matching_matrix, ngram_ids, ngram_positions)
@@ -237,7 +239,7 @@ def train(args):
                 eval_features = convert_examples_to_features(eval_batch_examples)
 
                 feature_ids, input_ids, input_mask, l_mask, label_ids, ngram_ids, ngram_positions, \
-                    segment_ids, valid_ids, word_ids, word_matching_matrix = feature2input(args, device, eval_features)
+                    segment_ids, valid_ids, word_ids, word_matching_matrix = feature2input(device, eval_features)
 
                 with torch.no_grad():
                     _, tag_seq = joint_model(input_ids, segment_ids, input_mask, label_ids, valid_ids, l_mask,
@@ -365,48 +367,6 @@ def train(args):
             f.write('\n')
 
 
-def feature2input(args, device, feature):
-    all_input_ids = torch.tensor([f.input_ids for f in feature], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in feature], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in feature], dtype=torch.long)
-    all_label_ids = torch.tensor([f.label_id for f in feature], dtype=torch.long)
-    all_valid_ids = torch.tensor([f.valid_ids for f in feature], dtype=torch.long)
-    all_lmask_ids = torch.tensor([f.label_mask for f in feature], dtype=torch.long)
-
-    input_ids = all_input_ids.to(device)
-    input_mask = all_input_mask.to(device)
-    segment_ids = all_segment_ids.to(device)
-    label_ids = all_label_ids.to(device)
-    valid_ids = all_valid_ids.to(device)
-    l_mask = all_lmask_ids.to(device)
-    if args.use_attention:
-        all_word_ids = torch.tensor([f.word_ids for f in feature], dtype=torch.long)
-        all_feature_ids = torch.tensor([f.syn_feature_ids for f in feature], dtype=torch.long)
-        all_word_matching_matrix = torch.tensor([f.word_matching_matrix for f in feature],
-                                                dtype=torch.float)
-
-        word_ids = all_word_ids.to(device)
-        feature_ids = all_feature_ids.to(device)
-        word_matching_matrix = all_word_matching_matrix.to(device)
-    else:
-        word_ids = None
-        feature_ids = None
-        word_matching_matrix = None
-    if args.use_zen:
-        all_ngram_ids = torch.tensor([f.ngram_ids for f in feature], dtype=torch.long)
-        all_ngram_positions = torch.tensor([f.ngram_positions for f in feature], dtype=torch.long)
-        # all_ngram_lengths = torch.tensor([f.ngram_lengths for f in train_features], dtype=torch.long)
-        # all_ngram_seg_ids = torch.tensor([f.ngram_seg_ids for f in train_features], dtype=torch.long)
-        # all_ngram_masks = torch.tensor([f.ngram_masks for f in train_features], dtype=torch.long)
-
-        ngram_ids = all_ngram_ids.to(device)
-        ngram_positions = all_ngram_positions.to(device)
-    else:
-        ngram_ids = None
-        ngram_positions = None
-    return feature_ids, input_ids, input_mask, l_mask, label_ids, ngram_ids, ngram_positions, segment_ids, valid_ids, word_ids, word_matching_matrix
-
-
 def test(args):
 
     if args.local_rank == -1 or args.no_cuda:
@@ -422,21 +382,23 @@ def test(args):
         device, n_gpu, bool(args.local_rank != -1), args.fp16))
 
     joint_model_checkpoint = torch.load(args.eval_model)
-    joint_model = TwASP.from_spec(joint_model_checkpoint['spec'], joint_model_checkpoint['state_dict'])
+    joint_model = TwASP.from_spec(joint_model_checkpoint['spec'], joint_model_checkpoint['state_dict'], args)
 
     if joint_model.use_attention:
-        if joint_model.spec['args'].source == 'stanford':
-            request_features_from_stanford(args.eval_data_path, flag='test')
-        elif joint_model.spec['args'].source == 'berkeley':
-            request_features_from_berkeley(args.eval_data_path, flag='test')
+        if joint_model.source == 'stanford':
+            request_features_from_stanford(args.eval_data_path)
+        elif joint_model.source == 'berkeley':
+            request_features_from_berkeley(args.eval_data_path)
         else:
-            raise ValueError('Source must be one of \'stanford\' or \'berkeley\' if attentions are used.')
+            raise ValueError('Invalid source %s. '
+                             'Source must be one of \'stanford\' or \'berkeley\' if attentions are used.'
+                             % joint_model.source)
 
-    eval_examples = joint_model.load_data(args.eval_data_path, flag='test')
+    eval_examples = joint_model.load_data(args.eval_data_path)
     convert_examples_to_features = joint_model.convert_examples_to_features
+    feature2input = joint_model.feature2input
     num_labels = joint_model.num_labels
     word2id = joint_model.word2id
-    model_args = joint_model.spec['args']
     label_map = {v: k for k, v in joint_model.labelmap.items()}
     label_map[0] = 'O'
@@ -457,8 +419,6 @@ def test(args):
     joint_model.to(device)
 
     joint_model.eval()
-    eval_loss, eval_accuracy = 0, 0
-    nb_eval_steps, nb_eval_examples = 0, 0
     y_true = []
     y_pred = []
@@ -468,7 +428,7 @@ def test(args):
             eval_features = convert_examples_to_features(eval_batch_examples)
 
             feature_ids, input_ids, input_mask, l_mask, label_ids, ngram_ids, ngram_positions, \
-                segment_ids, valid_ids, word_ids, word_matching_matrix = feature2input(model_args, device, eval_features)
+                segment_ids, valid_ids, word_ids, word_matching_matrix = feature2input(device, eval_features)
 
             with torch.no_grad():
                 _, tag_seq = joint_model(input_ids, segment_ids, input_mask, label_ids, valid_ids, l_mask,
@@ -520,7 +480,7 @@ def test(args):
 
 
 def predict(args):
-
+    # In progressing
     return None
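The module-level `feature2input(args, device, feature)` removed above is now a method on the model, so `train()` and `test()` simply bind `feature2input = joint_model.feature2input` and call it as `feature2input(device, features)`; whether attention or ZEN tensors are built follows the model's own saved hyper-parameters instead of the command-line `args`. A rough, abbreviated sketch of that shape (the class name `TwASPSketch` and the reduced return value are illustrative assumptions, not the repository's implementation):

```python
import torch

class TwASPSketch(torch.nn.Module):
    # Stand-in for TwASP, only to show feature2input as a bound method.
    def __init__(self, use_attention, use_zen):
        super().__init__()
        self.use_attention = use_attention  # restored from the saved spec, not read from args
        self.use_zen = use_zen

    def feature2input(self, device, features):
        # Base inputs are always built from the feature objects.
        input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long).to(device)
        # Optional inputs depend on the model's own configuration.
        word_ids = (torch.tensor([f.word_ids for f in features], dtype=torch.long).to(device)
                    if self.use_attention else None)
        ngram_ids = (torch.tensor([f.ngram_ids for f in features], dtype=torch.long).to(device)
                     if self.use_zen else None)
        return input_ids, word_ids, ngram_ids
```

A caller would then write, for example, `input_ids, word_ids, ngram_ids = joint_model.feature2input(device, batch_features)`.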