-
Notifications
You must be signed in to change notification settings - Fork 0
/
stanzaNER.py
39 lines (32 loc) · 1.03 KB
/
stanzaNER.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# Written by Seungil Lee, Nov 21 2021
# https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da
# https://medium.com/@b.terryjack/nlp-pretrained-named-entity-recognition-7caa5cd28d7b
import stanza
import csv
# Shared English Stanza pipeline (tokenizer + NER), built once when the
# module is loaded so every call to ner() reuses the same instance.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,ner',
)
def ner(document):
    """Group the named entities found in *document* by entity type.

    Runs the module-level ``nlp`` pipeline over the text and visits every
    entity of every sentence in the result.

    Args:
        document: Text to analyse; passed unchanged to ``nlp``.

    Returns:
        A plain ``dict`` mapping each entity type (``ent.type``) to the
        ``set`` of distinct entity texts (``ent.text``) seen with that type.
        The dict is empty when no entities are found.
    """
    entities_by_type = {}
    for sentence in nlp(document).sentences:
        for entity in sentence.ents:
            # setdefault creates the set the first time a type is seen.
            entities_by_type.setdefault(entity.type, set()).add(entity.text)
    return entities_by_type
def main(src_path='./data/2017.tsv', dst_path='./data/2017_ner.tsv'):
    """Copy a TSV file, replacing one column with named-entity results.

    Reads ``src_path`` as tab-separated values. The first row is treated as
    the header and copied unchanged (if it is non-empty). For every later
    row, the value at column index 1 is passed to ``ner`` and the result is
    stored at column index 9 before the row is written to ``dst_path``.

    Args:
        src_path: Path of the input TSV file. Defaults to the path the
            script originally hard-coded.
        dst_path: Path of the output TSV file; it is overwritten. Defaults
            to the path the script originally hard-coded.

    Raises:
        IndexError: If a non-blank data row has fewer than 10 columns.
        OSError: If either file cannot be opened.
    """
    text_col = 1  # column whose text is analysed
    ner_col = 9   # column overwritten with the ner() result

    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires: otherwise rows end in '\r\r\n' on Windows and
    # newlines inside quoted fields are misread.
    # The with-statement closes both files even if ner() or indexing raises.
    with open(src_path, 'r', newline='') as raw, \
            open(dst_path, 'w', newline='') as new:
        reader = csv.reader(raw, delimiter='\t')
        writer = csv.writer(new, delimiter='\t')

        headers = next(reader, None)
        if headers:
            writer.writerow(headers)

        for row in reader:
            # csv.reader yields [] for a blank line; pass it through
            # untouched instead of failing on row[1].
            if row:
                # The dict returned by ner() is written as its str() form.
                row[ner_col] = ner(row[text_col])
            writer.writerow(row)
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()