-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdeploy.py
86 lines (70 loc) · 3.86 KB
/
deploy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import sys,os,json,pickle
# Absolute, symlink-resolved path of this script file.
full_path = os.path.realpath(__file__)
# sys.path entries must be directories (or archives); inserting the path of
# the .py file itself has no effect, so insert the directory that contains it.
# NOTE(review): presumably the bionlp package sits next to this script - confirm.
sys.path.insert(0, os.path.dirname(full_path))
###### code above this line is used to treat bioNLP as a local python package. #####
import logging
# Configure the root logger at INFO level.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by trainer() for its progress messages.
logger = logging.getLogger(__name__)
from bionlp.taggers.rnn_feature.tagger import rnn_train
from bionlp.preprocess.dataset_preprocess import encode_data_format,decode_training_data
from bionlp.preprocess.extract_data import file_extractor,annotated_file_extractor
from bionlp.data.token import Token as Token
from bionlp.data.sentence import Sentence as Sentence
from bionlp.data.document import Document as Document
from bionlp.data.dataset import Dataset as Dataset
from bionlp.modifiers.crf_modifiers import add_BIO
from bionlp.modifiers.rnn_modifiers import add_surface_feature_list,add_umls_type,construct_umls_rnn_features
from bionlp.utils.crf_arguments import deploy_arguments
from bionlp.evaluate.evaluation import get_Approx_Metrics,get_Exact_Metrics,evaluator
from bionlp.evaluate.postprocess import prepare_document_report
def trainer(config_params):
    """Run the tagging pipeline over the dataset at ``deploy_params['input']``.

    The steps, in order:

    1. Extract the documents, with gold annotations unless
       ``config_params['noeval']`` is truthy.
    2. Encode them, adding UMLS types when UMLS is enabled and the encoded
       dataset does not already list ``'UMLS_TYPE'`` as active.
    3. Load the ``'w2i'`` and ``'umls_vocab'`` entries from the pickled file
       at ``config_params['model']``.
    4. Add surface features, UMLS RNN features (when UMLS is enabled) and
       BIO tags (when ``config_params['bio'] == 1``).
    5. Decode the dataset and hand it to ``rnn_train``.
    6. Write a document report unless ``config_params['output-dir']`` equals
       the string ``'None'``.
    7. Run the exact and approximate evaluators unless ``noeval`` is truthy.

    Args:
        config_params: dict read for the keys ``'noeval'``, ``'umls'``,
            ``'model'``, ``'bio'`` and ``'output-dir'``; it is also passed
            whole to ``rnn_train``.

    Returns:
        None.

    Note:
        The input location is read from the module-level ``deploy_params``,
        which is only bound when this file is executed as a script.
    """
    input_path = deploy_params['input']
    umls_setting = config_params['umls']
    skip_eval = config_params['noeval']

    logger.info('Loading new input dataset from {0}'.format(input_path))
    if not skip_eval:
        logger.info('Evaluation on the input dataset is on ( -noeval 0). I will search for gold standard annotation in .json files ')
        raw_text, documents = annotated_file_extractor(input_path, umls_setting)
    else:
        raw_text, documents = file_extractor(input_path, umls_setting)

    encoded_documents = encode_data_format(documents, raw_text, umls_setting)
    if umls_setting != 0 and 'UMLS_TYPE' not in encoded_documents.active:
        logger.info('UMLS is on, but the dataset does not have umls tags. Adding and caching ..')
        encoded_documents = add_umls_type(encoded_documents)
    # Drop the raw inputs once they have been encoded.
    del raw_text
    del documents

    logger.info('Gathering stored vocabularies')
    # NOTE: pickle can execute arbitrary code while loading; only point
    # config_params['model'] at model files from a trusted source.
    # The context manager closes the handle even if unpickling fails.
    with open(config_params['model'], 'rb') as model_handle:
        model_file = pickle.load(model_handle)
    w2i = model_file['w2i']
    umls_vocab_dict = model_file['umls_vocab']

    encoded_documents = add_surface_feature_list(encoded_documents)
    if umls_setting != 0:
        encoded_documents, umls_vocab_dict = construct_umls_rnn_features(encoded_documents, umls_vocab_dict)
    logger.info('Dataset Tags \nActive {0}\nPassive {1}\nDelayed {2}'.format(encoded_documents.active, encoded_documents.passive, encoded_documents.delayed))

    if config_params['bio'] == 1:
        encoded_documents = add_BIO(encoded_documents)

    ##### Decoding Data into training format #####
    dataset = decode_training_data(encoded_documents)
    texts, label, pred = rnn_train(dataset, config_params, w2i, umls_vocab_dict)

    # Compare by value: identity ("is not") against a string literal depends
    # on interning and is flagged with a SyntaxWarning on Python 3.8+.
    if config_params['output-dir'] != 'None':
        prepare_document_report(texts, label, pred, encoded_documents, config_params['output-dir'])

    if not skip_eval:
        evaluator(label, pred, get_Exact_Metrics)
        evaluator(label, pred, get_Approx_Metrics)
def main(config_params):
    """Fill in the deploy-time settings, then run ``trainer``.

    Sets the ``'dependency'``, ``'trainable'``, ``'dataset_percentage'`` and
    ``'error-analysis'`` keys on ``config_params`` in place, prints the
    parameters as indented JSON, calls ``trainer`` and prints the parameters
    once more afterwards.

    Args:
        config_params: dict of model parameters; mutated in place.

    Returns:
        None.

    Note:
        ``'error-analysis'`` is copied from the module-level
        ``deploy_params['output']``; ``deploy_params`` is only bound when
        this file is executed as a script.
    """
    config_params['dependency'] = {}
    config_params['trainable'] = False
    config_params['dataset_percentage'] = 100  # Using hardcoded dataset percentage
    config_params['error-analysis'] = deploy_params['output']

    # Parenthesised single-argument print emits the same text on Python 2
    # and is also valid syntax on Python 3.
    print("Using the parameters :")
    print(json.dumps(config_params, indent=2))

    trainer(config_params)

    # The parameters are printed again after the run, as in the original flow.
    print("Using the parameters :")
    print(json.dumps(config_params, indent=2))
if __name__ == "__main__":
    # Command-line options; trainer() and main() read this module-level name.
    deploy_params = deploy_arguments()
    # NOTE: pickle can execute arbitrary code while loading; only load model
    # files from a trusted source.
    # The context manager closes the handle even if unpickling fails.
    with open(deploy_params['model'], 'rb') as model_handle:
        nn_v_d = pickle.load(model_handle)
    # Start from the parameters stored with the model, then overlay the
    # deploy-time options.
    config_params = nn_v_d['params']
    config_params['model'] = deploy_params['model']
    config_params['noeval'] = deploy_params['noeval']
    config_params['output-dir'] = deploy_params['outputdir']
    main(config_params)