-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathner.py
79 lines (59 loc) · 2.79 KB
/
ner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
"""
Wrapper to use Stanford NER
https://nlp.stanford.edu/software/CRF-NER.shtml
Requires: java 1.8
This module assumes that the folder with the Stanford distribution is located in STANFORD_PATH,
resolved relative to the current working directory (stanford-core by default)
The output format can be changed from the parameter -outputFormat
Available options are: inlineXML, xml, tsv, tabbedEntities, slashTags
See https://nlp.stanford.edu/software/crf-faq.html#j
"""
from os import getcwd, remove, listdir
from os.path import join, basename, isfile
import subprocess
# Folder holding the Stanford distribution; detect() joins it onto the current working directory.
STANFORD_PATH = "stanford-core"

# Legacy configuration for the stand-alone CRF classifier, kept for reference only:
# THREE_CLASS_CLASSIFIER = "classifiers/english.all.3class.distsim.crf.ser.gz"
# FOUR_CLASS_CLASSIFIER = "classifiers/english.conll.4class.distsim.crf.ser.gz"
# SEVEN_CLASS_CLASSIFIER = "english.muc.7class.distsim.crf.ser.gz"
#
# classifiers = {
# 3: THREE_CLASS_CLASSIFIER,
# 4: FOUR_CLASS_CLASSIFIER,
# 7: SEVEN_CLASS_CLASSIFIER
# }
#
# NER_COMMAND = "java -mx600m -cp stanford-ner.jar:lib/* edu.stanford.nlp.ie.crf.CRFClassifier " \
# "-loadClassifier {classifier} -outputFormat {output_format} -textFile {text_file}"

# Command template for the CoreNLP pipeline. The {output_format} and {file_list}
# placeholders are filled in with str.format before the command is executed.
STANFORD_COMMAND = (
    'java -cp "*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP '
    '-annotators tokenize,ssplit,pos,lemma,ner,parse '
    '-outputFormat {output_format} '
    '-filelist {file_list}'
)

# Name of the temporary file that lists the documents to process.
TMP_FILE_LIST = "filelist.tmp.txt"
def write_filelist(files, stanford_ner_path):
    """
    Write the temporary file list, one path per line.
    Every entry of files is joined onto the current working directory before
    being written; the list is saved as TMP_FILE_LIST inside stanford_ner_path.
    :param files: iterable of file names
    :param stanford_ner_path: folder in which the file list is created
    """
    base_dir = getcwd()
    list_path = join(stanford_ner_path, TMP_FILE_LIST)
    with open(list_path, 'w+') as handle:
        handle.writelines(join(base_dir, name) + "\n" for name in files)
def detect(files, output_format="xml", output_dir=None):
    """
    Run the Stanford pipeline over the given files and move the results to output_dir.

    :param files: <list> paths of the documents to process
    :param output_format: value passed to -outputFormat; also used as the extension
        of the expected result files
    :param output_dir: folder the result files are moved to (created if missing).
        Defaults to the current working directory; a relative path is resolved
        against the Stanford folder.
    :return: list of files with NER result (file names only, without directory)
    """
    # Function-scope imports keep this block independent of the file-level imports.
    import os
    import shlex
    import shutil

    # Materialise once: the sequence is consumed twice (file list + output names).
    files = list(files)
    if not files:
        # Nothing to annotate: skip launching the JVM altogether.
        return []

    stanford_ner_path = join(getcwd(), STANFORD_PATH)
    write_filelist(files, stanford_ner_path)
    print("Processing NER")
    command = STANFORD_COMMAND.format(output_format=output_format, file_list=TMP_FILE_LIST)
    try:
        # Split into an argument vector and run without a shell, so nothing in
        # the formatted command is interpreted as shell syntax.
        status = subprocess.run(shlex.split(command), stderr=subprocess.STDOUT,
                                cwd=stanford_ner_path).returncode
    except FileNotFoundError:
        # Same status a shell reports when the executable cannot be found.
        status = 127
        print("Could not start the Stanford command: java executable not found")
    finally:
        # Always clean up the temporary file list, even if the run failed.
        remove(join(stanford_ner_path, TMP_FILE_LIST))
    if status != 0:
        print("Stanford command exited with status {}".format(status))

    output_files = [basename(file_name) + "." + output_format for file_name in files]
    print(output_files)

    output_dir = getcwd() if output_dir is None else output_dir
    # Joining keeps absolute paths untouched and resolves relative ones against
    # the Stanford folder, which is where the results are produced.
    output_dir = join(stanford_ner_path, output_dir)
    os.makedirs(output_dir, exist_ok=True)
    for name in output_files:
        produced = join(stanford_ner_path, name)
        if not isfile(produced):
            # Best effort: report the missing result and keep moving the others.
            print("Missing NER output: {}".format(produced))
            continue
        # An explicit destination file name lets an existing result be replaced.
        shutil.move(produced, join(output_dir, name))
    return output_files
if __name__ == "__main__":
    # Run the pipeline over every regular file of the TAC 2015 source documents
    # folder and store the results under named_entities/tac_eval_15.
    source_dir = join(getcwd(), "dataset/tac15/source_docs/")
    candidates = (join(source_dir, entry) for entry in listdir(source_dir))
    documents = [path for path in candidates if isfile(path)]
    detect(documents, output_dir=join(getcwd(), "named_entities/tac_eval_15"))