-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmedtxt_to_conll.py
117 lines (96 loc) · 3.56 KB
/
medtxt_to_conll.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from bs4 import BeautifulSoup, NavigableString
# One row per supported annotation tag:
#   (tag name, name of the attribute read as the token "detail", label name).
# The string "None" is a placeholder attribute name; it is stored and looked
# up like any other attribute name.
_TAG_TABLE = (
    ("d", "certainty", "disease"),
    ("a", "None", "anatomical-parts"),
    ("f", "None", "feature"),
    ("c", "None", "change"),
    ("timex3", "type", "time"),
    ("t-test", "state", "test-test"),
    ("t-key", "None", "test-key"),
    ("t-val", "None", "test-value"),
    ("m-key", "state", "medicine-key"),
    ("m-val", "None", "medicine-value"),
    ("r", "state", "remedy"),
    ("cc", "state", "clinical-context"),
    ("p", "None", "pending"),
)

# tag name -> (detail attribute name, label key); the label key equals the tag.
TAG_ATTRIBUTE_MAP = {tag: (attribute, tag) for tag, attribute, _ in _TAG_TABLE}

# label key -> human-readable label name used in the B-/I- output column.
LABEL_NAME = {tag: label for tag, _, label in _TAG_TABLE}
# Inserts a space before '.', ',' and '?' so that whitespace splitting turns
# each of them into its own token (single pass, same result as three chained
# str.replace calls).
_PUNCTUATION_SPACING = str.maketrans({".": " .", ",": " ,", "?": " ?"})


def parse_node(node, current_label="O", current_detail="O", results=None):
    """Recursively flatten a parsed XML node into (token, detail, label) rows.

    Text nodes are stripped, punctuation ('.', ',', '?') is split off, and the
    text is whitespace-tokenized. Tokens outside any annotation tag are
    emitted as ``(token, "O", "O")``. Inside an annotation tag, the first
    token of each text node gets ``B-`` prefixes and the rest ``I-`` prefixes,
    applied to ``current_detail`` and ``LABEL_NAME[current_label]``.

    ``articles`` / ``article`` elements are pure containers: their children
    inherit the label and detail passed in. Any other element must be a key
    of ``TAG_ATTRIBUTE_MAP``; it sets the label and reads the detail from the
    mapped attribute (``"O"`` when the attribute is absent).

    Args:
        node: A BeautifulSoup tag or ``NavigableString``.
        current_label: Label key in effect for this node ("O" = unlabeled).
        current_detail: Detail value in effect for this node.
        results: List to append rows to. A new list is created when omitted.

    Returns:
        The ``results`` list, with this subtree's rows appended.

    Raises:
        KeyError: If an element's tag name is not in ``TAG_ATTRIBUTE_MAP``, or
            a text node is tokenized under a label missing from ``LABEL_NAME``.
    """
    # A list literal as the default would be shared by every call that omits
    # `results`, leaking tokens from one call into the next.
    if results is None:
        results = []

    # Text nodes are handled first so that nothing tag-specific is read
    # from a plain string.
    if isinstance(node, NavigableString):
        tokens = node.strip().translate(_PUNCTUATION_SPACING).split()
        if current_label == "O":
            results.extend((token, "O", "O") for token in tokens)
        elif tokens:
            label_name = LABEL_NAME[current_label]
            for index, token in enumerate(tokens):
                prefix = "B-" if index == 0 else "I-"
                results.append(
                    (token, prefix + current_detail, prefix + label_name)
                )
        return results

    tag_name = node.name
    if tag_name not in ("articles", "article"):
        # Annotation tag: override the label/detail for the whole subtree.
        attr_key, current_label = TAG_ATTRIBUTE_MAP[tag_name]
        current_detail = node.get(attr_key, "O")

    for child in node.children:
        parse_node(child, current_label, current_detail, results)
    return results
def convert_xml_to_conll(xml_str):
    """Convert an XML document string into per-article token rows.

    The string is parsed with BeautifulSoup's "xml" parser and every
    ``<article>`` element is flattened independently with ``parse_node``,
    starting from the unlabeled state ("O", "O").

    Args:
        xml_str: The XML document as a string.

    Returns:
        A list with one entry per ``<article>`` element, in document order;
        each entry is the list of rows produced by ``parse_node`` for it.
    """
    document = BeautifulSoup(xml_str, "xml")
    # parse_node returns the very list it was handed, so a fresh list per
    # article keeps the articles' rows separate.
    return [
        parse_node(article_tag, "O", "O", [])
        for article_tag in document.find_all("article")
    ]
def articles_to_conll_text(article_results):
    """Render per-article token rows as CoNLL-style text.

    Each article starts with a ``-DOCSTART- -X- O`` line followed by a blank
    line. Every row is written as ``"<word> <detail> <label>"`` on its own
    line, and a blank line is written after each ``"."`` token. Exactly one
    blank line separates the end of one article from the next ``-DOCSTART-``
    line, whether or not the article ended with ``"."``.

    Args:
        article_results: Iterable of articles, each an iterable of
            ``(word, detail, label)`` tuples.

    Returns:
        The rendered text; an empty string when there are no articles.
    """
    # Collect lines without their newline; "" stands for a blank line.
    # Joining once at the end avoids repeated string concatenation.
    lines = []
    for article_result in article_results:
        # Add a separating blank line unless the previous article already
        # ended with one (i.e. its last token was ".").
        if lines and lines[-1] != "":
            lines.append("")
        lines.append("-DOCSTART- -X- O")
        lines.append("")
        for word, detail, label in article_result:
            lines.append(f"{word} {detail} {label}")
            if word == ".":
                lines.append("")
    return "".join(line + "\n" for line in lines)


if __name__ == "__main__":
    with open("./data/MedTxt-CR-EN-training-pub.xml", encoding="utf-8") as f:
        original_data = f.read()
    article_results = convert_xml_to_conll(original_data)

    # The first 80% of the articles go to the train file, the rest to test.
    train_rate = 0.8
    train_num = int(len(article_results) * train_rate)

    # Both files use the same formatter so their layout is identical.
    train_text = articles_to_conll_text(article_results[:train_num])
    test_text = articles_to_conll_text(article_results[train_num:])

    with open("./data/MedTxt_train.txt", "w", encoding="utf-8") as f_out:
        f_out.write(train_text)
    with open("./data/MedTxt_test.txt", "w", encoding="utf-8") as f_out:
        f_out.write(test_text)