-
Notifications
You must be signed in to change notification settings - Fork 3
/
NERTraining.py
88 lines (74 loc) · 2.4 KB
/
NERTraining.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm
import json
def convert_to_data(path):
try:
training_data = []
lines=[]
with open(path, 'r',encoding="utf-8") as f:
lines = f.readlines()
for line in lines:
data = json.loads(line)
text = data['content']
entities = []
for annotation in data['annotation']:
point = annotation['points'][0]
labels = annotation['label']
if not isinstance(labels, list):
labels = [labels]
for label in labels:
entities.append((point['start'], point['end'] + 1 ,label))
training_data.append((text, {"entities" : entities}))
return training_data
except Exception as e:
logging.exception("Unable to process " + dataturks_JSON_FilePath + "\n" + "error = " + str(e))
return None
TRAIN_DATA=convert_to_data("./traindata.json")
model = None
output_dir=Path("./model")
n_iter=100
if model is not None:
nlp = spacy.load(model)
print("Loaded model '%s'" % model)
else:
nlp = spacy.blank('en')
print("Created blank 'en' model")
#set up the pipeline
if 'ner' not in nlp.pipe_names:
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner, last=True)
else:
ner = nlp.get_pipe('ner')
for _, annotations in TRAIN_DATA:
for ent in annotations.get('entities'):
ner.add_label(ent[2])
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes): # only train NER
optimizer = nlp.begin_training()
for itn in range(n_iter):
random.shuffle(TRAIN_DATA)
losses = {}
for text, annotations in tqdm(TRAIN_DATA):
try:
nlp.update(
[text],
[annotations],
drop=0.5,
sgd=optimizer,
losses=losses)
except:
continue
print(losses)
for text, _ in TRAIN_DATA:
doc = nlp(text)
print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
if output_dir is not None:
output_dir = Path(output_dir)
if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
print("Saved model to", output_dir)