-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_abbr_dictionary.py
197 lines (182 loc) · 8.68 KB
/
create_abbr_dictionary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# This script goes through a list of encoded abbreviations
# and their expansions and makes a dictionary out of them.
# This dictionary is later used by another script which
# adds expansions to unexpanded abbreviations in a text.
# The starting point is a match list of all already
# existing expanded abbreviations in a large material.
# They have been encoded as <choice><abbr>Dr</abbr><expan>Doctor</expan></choice>.
# I exported this list from "Find in Files" in the
# Oxygen XML Editor, but I couldn't use the list as such
# with BeautifulSoup since it contained XML that was not
# well-formed (matches contained context that didn't take XML
# into consideration, so tags/attributes/values were chopped off
# right in the middle, or opening/closing tags were missing).
# So this script first makes a new XML file containing only
# matches and well-formed XML.
# With BeautifulSoup we can then get the contents of <abbr> and <expan>
# and create the dictionary.
# This script can also easily be modified into e.g. making a
# dictionary of editors' corrections (in TEI XML encoded as
# <choice><orig>Docton</orig><corr>Doctor</corr></choice> or
# <choice><orig>e. g.</orig><reg>e.g.</reg></choice>).
# Once the dictionary has been created: if there are multiple
# expansions for an abbreviation, these should be checked manually.
# When that is done, use option UPDATE_DICTIONARY to tidy up
# the existing dict and create the final product.
import re
from bs4 import BeautifulSoup
import json
# the match list exported from the XML editor (not well-formed XML)
SOURCE_FILE = "documents/abbr/match_list.xml"
# the generated file containing only the matched <choice> elements
OUTPUT_FILE = "documents/expan/abbr_and_expan.xml"
# the JSON file holding the abbreviation - expansion dictionary
ABBR_DICTIONARY = "dictionaries/abbr_dictionary.json"
# False: create the dictionary from the match list
# True: tidy up the existing dictionary once it has been manually checked
UPDATE_DICTIONARY = False
# read an xml file and return its content
# either as a string or as a soup object
# depending on what it's going to be used for
def read_xml(filename):
    """Read an XML file and return its content.

    The file is decoded as UTF-8; a leading byte order mark is dropped.
    If ``filename`` equals ``OUTPUT_FILE`` the text is parsed with
    BeautifulSoup's "xml" parser and the soup object is returned.
    For any other file the raw text is returned as a string.
    """
    with open(filename, mode="r", encoding="utf-8-sig") as xml_file:
        raw_text = xml_file.read()
    # every file except the generated one is handed back as plain text
    if filename != OUTPUT_FILE:
        return raw_text
    soup = BeautifulSoup(raw_text, "xml")
    print("We have soup.")
    return soup
# find all <choice> containing both <abbr> and <expan>
# in the match list which also contains lots of other stuff
def find_abbr_and_expan(file_content):
    """Collect all complete abbreviation/expansion <choice> elements.

    A match is a <choice> whose content is exactly one <abbr> followed
    by one <expan>, or the other way round. Everything else in the
    match list is ignored.

    Returns a single string with one match per line (the matches are
    joined with a newline); the string is empty if nothing matched.
    """
    pair_pattern = r"<choice><abbr>.*?</abbr><expan>.*?</expan></choice>|<choice><expan>.*?</expan><abbr>.*?</abbr></choice>"
    # findall gives a list of the matched strings, in document order
    found_pairs = re.findall(pair_pattern, file_content)
    return "\n".join(found_pairs)
# create a new xml file containing only
# all the relevant <choice> elements
# this is the file we'll use when making the dictionary
def write_to_file(abbr_string, filename):
    """Write the matched <choice> elements to a new XML file.

    The matches are wrapped in a single <body> root element so that
    the resulting file is well-formed. The file is written as UTF-8
    with a byte order mark and overwrites any existing file.

    abbr_string: the matches as one string
    filename: path of the file to create
    """
    # the with statement closes the file, no explicit close() needed
    with open(filename, "w", encoding="utf-8-sig") as output_file:
        output_file.write("<body>" + abbr_string + "</body>")
    print("Abbreviations and expansions written to file", filename)
# create the dictionary of abbreviations and their expansions
def _record_expansion(abbr_dict, abbr_key, expansion, max_alternatives):
    # store one abbr - expan pair in abbr_dict, using abbr_key itself or
    # abbr_key + "_1" etc.; returns False only if there was no free key
    if abbr_key not in abbr_dict:
        abbr_dict[abbr_key] = expansion
        return True
    if abbr_dict[abbr_key] == expansion:
        # already recorded, nothing to add
        return True
    # the candidate key is always built from the untouched abbr_key,
    # so an abbreviation that itself contains "_" + digit stays intact
    for number in range(1, max_alternatives + 1):
        candidate = f"{abbr_key}_{number}"
        if candidate not in abbr_dict:
            abbr_dict[candidate] = expansion
            return True
        if abbr_dict[candidate] == expansion:
            return True
    return False


def create_abbr_dictionary(abbr_soup, abbr_dict, max_alternatives=4):
    """Create the dictionary of abbreviations and their expansions.

    Goes through each <choice> and its children <abbr> and <expan>.
    The abbreviation (with any tags inside it, such as <hi>) is the
    key and the text of the expansion is the value. The same abbr may
    have several expans, e.g. B.C. = "Before Christ" or
    "British Columbia". Since abbr is the key, alternative expansions
    are recorded under abbr + "_1", abbr + "_2" etc. These multiple
    expans should be checked later in order to decide which to keep.

    abbr_soup: soup object containing the <choice> elements
    abbr_dict: dictionary to add the pairs to (it is modified in place)
    max_alternatives: highest number used for alternative expansions;
        further expansions for the same abbr are reported and skipped

    Returns a new dictionary with the entries of abbr_dict sorted by
    key (NB uppercase and lowercase are sorted separately).
    """
    # just checking that every <choice> in the xml file
    # is actually used for the dictionary, hence the counter
    pair_count = 0
    for choice in abbr_soup.find_all("choice"):
        # start fresh for every <choice>, so that nothing is carried
        # over from the previous one if a child is missing
        abbr_content = None
        expan_content = None
        for child in choice.children:
            if child.name == "abbr":
                # we can't just use .get_text() for the abbr contents,
                # because we need to preserve all the tags inside abbr
                # so convert the tag into a string and remove
                # the <abbr> tags with ordinary replacement
                abbr_content = str(child).replace("<abbr>", "").replace("</abbr>", "")
            elif child.name == "expan":
                # the contents of <expan> contain no tags
                # so just get the string
                expan_content = str(child.get_text())
        if abbr_content is None or expan_content is None:
            # not a complete abbreviation - expansion pair
            continue
        pair_count += 1
        if not _record_expansion(abbr_dict, abbr_content, expan_content, max_alternatives):
            print(f"No free key for alternative expansion of {abbr_content!r}: {expan_content!r}")
    print(pair_count)
    return dict(sorted(abbr_dict.items(), key=lambda item: item[0]))
# save dictionary as a file
def write_dict_to_file(dictionary, filename):
    """Save a dictionary as a JSON file.

    Non-ASCII characters are written as they are (not escaped) and
    the file is encoded as UTF-8. An existing file is overwritten.
    """
    serialized = json.dumps(dictionary, ensure_ascii=False)
    with open(filename, mode="w", encoding="utf-8") as json_file:
        json_file.write(serialized)
    print("Dictionary written to file", filename)
# get dictionary content from file
def read_dict_from_file(filename):
    """Load the content of a JSON file and return it.

    The file is decoded as UTF-8; a leading byte order mark is dropped.
    """
    with open(filename, encoding="utf-8-sig") as json_file:
        raw_json = json_file.read()
    return json.loads(raw_json)
# remove entries containing underscore + a digit
# they represent alternative expansions for an abbreviation
# and were added when the dict was originally created
# but if they have been checked there's no need to keep them
# in the dictionary any longer
def update_dictionary(dictionary_to_update):
    """Tidy up a manually checked dictionary.

    Every entry whose key contains an underscore followed by one of
    the digits 1-4 is deleted from dictionary_to_update (the
    dictionary passed in is modified in place).

    Returns a new dictionary with the remaining entries sorted by
    key, ignoring case.
    """
    highest_suffix = 4
    markers = [f"_{number}" for number in range(1, highest_suffix + 1)]
    # collect the keys first, the dict can't change size while we loop over it
    obsolete_keys = [
        key for key in dictionary_to_update
        if any(marker in key for marker in markers)
    ]
    for key in obsolete_keys:
        del dictionary_to_update[key]
    remaining = list(dictionary_to_update.items())
    remaining.sort(key=lambda pair: pair[0].casefold())
    return dict(remaining)
def main():
    """Run the script in one of its two modes.

    UPDATE_DICTIONARY False: read the match list, save the matched
    abbr - expan pairs as a new XML file, and create the sorted
    dictionary from that file. Afterwards: go through alternative
    expansions manually and decide which of them to keep.

    UPDATE_DICTIONARY True: tidy up and sort the existing dictionary
    anew once it's been manually checked.
    """
    if UPDATE_DICTIONARY:
        dictionary_to_update = read_dict_from_file(ABBR_DICTIONARY)
        # create and sort the new and updated dictionary
        sorted_updated_dict = update_dictionary(dictionary_to_update)
        write_dict_to_file(sorted_updated_dict, ABBR_DICTIONARY)
        return
    # read the match list
    file_content = read_xml(SOURCE_FILE)
    # find the abbreviations and their expansions
    abbr_string = find_abbr_and_expan(file_content)
    # save the abbr - expan pairs
    write_to_file(abbr_string, OUTPUT_FILE)
    # make the newly created file into a soup object
    abbr_soup = read_xml(OUTPUT_FILE)
    abbr_dict = {}
    # create and sort the dictionary
    sorted_abbr_dict = create_abbr_dictionary(abbr_soup, abbr_dict)
    write_dict_to_file(sorted_abbr_dict, ABBR_DICTIONARY)


# only run when executed as a script, not when imported as a module
if __name__ == "__main__":
    main()