-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
516 lines (435 loc) · 21.9 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
import re
import pandas as pd
import numpy as np
class NounParser:
def __init__(self):
self.consonants = {"b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "r", "s", "t", "v", "w", "y", "z", "ʔ", "ɲ", "ŋ", "ɣ"}
self.exceptions = ["nd", "nt", "ŋg", "ŋk", "ɲj", "mb","mp" ]
self.vowels = {"ɛ̌", "ɔ̌", "ɛ̂", "ɔ̂", "ɛ́", "ɔ́", "ɛ̀", "ɔ̀", "ì", "í", "ǔ", "û",'ê', 'ě', 'è', 'é', 'ô', 'ò', 'ǒ', 'ó', 'ǎ',
'â', 'à', 'á', 'ɛ̌', 'ɛ̂', 'ɛ́', 'ɛ̀', 'û', 'ǔ', 'ú', 'ù', 'ǐ', 'î', 'í', 'ì', ' ̀ɔ ', 'ɔ́', 'ɔ̌', 'ɔ̂',
"i", "e", "ɛ", "u", "o", "ɔ", "a"}
self.replacements = {"i":"i", 'í': 'í', 'ì': 'ì', 'ǐ': 'í','î': 'ì', "e": "e", 'é': 'é', 'è': 'è','ě': 'é', 'ê': 'è','ɛ':'ɛ',
'ɛ́':'ɛ́', 'ɛ̀': 'ɛ̀', "ɛ̌":'ɛ́', 'ɛ̂': 'ɛ̀',"u": "u", 'ú': 'ú', 'ù': 'ù',"ǔ":'ú', "û": 'ù',"o": "o", 'ó': 'ó',
'ò': 'ò', 'ǒ': 'ó', 'ô': 'ò','ɔ': 'ɔ', 'ɔ́': 'ɔ́','ɔ̀': 'ɔ̀', 'ɔ̌':'ɔ́', 'ɔ̂': ' ̀ɔ ', "a": "a", 'á': 'á',
'à':'à', 'ǎ': 'á', 'â':'à'}
self.replacement_exceptions={'ǐ':'ì','î':'í','ě':'è','ê':'é',"ɛ̌":'ɛ̀','ɛ̂':'ɛ́',"ǔ":'ù',"û":'ú','ɔ̌':'ɔ̀','ɔ̂':'ɔ́','ǒ':'ò',
'ô':'ó','ǎ':'à','â':'á'}
self.replacement_exceptions_1={'ǐ':'ì','î':'í','ě':'è','ê':'é',"ɛ̌":'ɛ̀','ɛ̂':'ɛ́',"ǔ":'ù',"û":'ú','ɔ̌':'ɔ̀','ɔ̂':'ɔ́','ǒ':'ò','ô':'ó','ǎ':'à','â':'á'}
self.replacement_exceptions_2={'ǐ':'í','î':'ì','ě':'é','ê':'è',"ɛ̌":'ɛ́','ɛ̂':'ɛ̀',"ǔ":'ú',"û":'ù','ɔ̌':'ɔ́','ɔ̂':'ɔ̀','ǒ':'ó','ô':'ò','ǎ':'á','â':'à'}
self.extra_material={ 'ǎ': 'á', 'ê': 'è', 'ô': 'ò', 'ě': 'é','ǐ': 'í', 'î': 'ì', 'ǒ': 'ó', 'â': 'à', 'ɛ́': 'ɛ́', 'ɛ̀': 'ɛ̀',
'ǔ': 'ú', 'í': 'í','ì': 'ì'}
def parse_off_final_nasals(self, item):
if item and item[-1] =="ⁿ":
return item[:-1] + "-" + item[-1:]
else:
return item
def existing_parses(self, word):
"""
reparses existing parses:
Jeff's parses:
hyphens in word-final syllables are eliminated.
hyphens elsewhere indicate word boundaries
parse from orthography profile:
spaces are eliminated
"""
new_word="" #Jeff's parses
for index, letter in enumerate(word):
idx=len(word)-index
if letter== "-" and idx <=4:
new_word += ""
elif letter== "-" and idx >3:
new_word += " "
else:
new_word += str(letter)
final_word_1="" #eliminating orthography profile spaces
for letter in new_word:
if letter==" ":
final_word_1 += ""
else:
final_word_1 += letter
final_word_2=""
for letter in final_word_1:
if letter=="#":
final_word_2 += " "
else:
final_word_2 += letter
return final_word_2
def parse_noun_durationals(self, item):
"""
parses durational marking in nouns
"""
new_word = ""
for index, letter in enumerate(item): #segments that preceed a durational and which have contour tones
if (letter in self.replacement_exceptions_1.keys() and
index < len(item) - 1 and item[index + 1] == ":"):
new_word += str(self.replacement_exceptions_1.get(letter, letter))
elif (letter == ":" and #replacing durationals that are preceeded by a contour tone
index > 0 and
item[index - 1] in self.replacement_exceptions_1.keys()):
new_word += str(self.replacement_exceptions_2.get(item[index - 1], letter))
elif letter ==":" and item[index-1] not in self.replacement_exceptions_1.keys(): #replacing durationals that are preceded by level tones
new_word += str(self.replacements.get(item[index-1],f"{ item[index-2]+ item[index-1]}" ))
else:
new_word +=letter
return new_word
def cvcv_segmentation(self, word, indexes=[3, 5, 7, 9, 11]):
"""
Parses syllables to follow a cvcv basic structure
"""
consonant_count = 0
new_word = ""
for letter in word:
if letter in self.consonants and letter != "ⁿ":
consonant_count += 1 #continue consonant count
if consonant_count in indexes:
if word[word.index(letter)-1] != "-":
new_word += "-" + letter #parsing
else:
new_word += letter
else:
new_word += letter
elif letter == " ": #start recount at word boundaries
consonant_count=0
new_word +=letter
else:
new_word += letter #skipping over vowels
for i in self.exceptions:
with_hyphen = i[0] + "-" + i[1]
if with_hyphen in new_word: #cases where there is a hyphen between nasalized consonants
idx = new_word.index(with_hyphen)
new_word = new_word[:idx] + "-" + i[0]+i[1] + new_word[idx + 3:]
return new_word.replace("--", "-")
def hyphen_space(self, word):
"""
remove hyphens after morphemic boundaries indicated by a space and hyphens occuring word initially
"""
if "-" in word and (word[word.index("-")-1]=="#" or word[word.index("-") + 1] ==" "):#eliminates hyphens after space
word=word.replace("-", "")
elif word.find("-")==0: #elinates hyphens that occur at the begining of a word
word=word[1:]
return word
def nasalized_stops(self, word):
"""
Parses 'consonant ensembles i.e. nasalized consonants'
"""
for i in self.exceptions:
with_hyphen = i[0] + "-" + i[1]
if with_hyphen in word: #cases where there is a hyphen between the nasalized consonants
idx = word.index(with_hyphen)
if word[idx-1] != "-":
word = word[:idx] + "-" + word[idx + 3:]
remainder = word[word.index(i) + 2:] if i in word and len(word) > idx + 3 else None
word = word[:word.index(i) + 2] + self.cvcv_segmentation(remainder, indexes=[1, 3, 5, 7, 9]) if remainder is not None else word
else:
word = word[:idx] + word[idx + 3:]
remainder = word[word.index(i) + 2:] if i in word and len(word) > idx + 3 else None
word = word[:word.index(i) + 2] + self.cvcv_segmentation(remainder, indexes=[1, 3, 5, 7, 9]) if remainder is not None else word
else: #cases where nasalized consonants have no hyphen in-between
if i in word:
idx = word.index(i)
word = word[:idx] + "-" + i + word[idx + 2:]
if len(word) > idx + 2:
remainder = word[idx + 3:] if i in word and len(word) > idx + 3 else None
word = word[:idx + len(i) + 1] + self.cvcv_segmentation(remainder, indexes=[1, 3, 5, 7, 9]) if remainder is not None else word
else:
word=word
return self.hyphen_space(word.replace("--", "-"))
def identified_suffixes(self, word):
"""
This function parses suffixes that have been identified. These are the suffixes identified so far:
'g'+ vowel
'm'+vowel
"""
new_word=""
for i in range(len(word)):
letter = word[i]
if i >= len(word) - 3: # Checking if the letter is within the last three alphabets
if letter == "g" and word[i - 1] != "ŋ":#checking if "g" is not a nasalized consonant
new_word += f"-{letter}"
elif letter == "m":
new_word += f"-{letter}"
else:
new_word += letter
else:
new_word += letter
return self.hyphen_space(new_word.replace("--", "-"))
class VerbParser(NounParser):
def __init__(self):
super().__init__()
def consonant_count(self, item):
"""
counts number of consonants in words
"""
if item is None:
return 0
consonant_count=0
for letter in item:
if letter != "ⁿ" and letter in self.consonants:
consonant_count +=1
return consonant_count
def parse_verb_durationals(self, item):
"""
parses durational marking in nouns
"""
if item.endswith(":"):
new_word = ""
for index, letter in enumerate(item): #segments that preceed a durational and which have contour tones
if (letter in self.replacement_exceptions_1.keys() and
index < len(item) - 1 and item[index + 1] == ":"):
new_word += str(self.replacement_exceptions_1.get(letter, letter))
elif (letter == ":" and #replacing durationals that are preceeded by a contour tone
index > 0 and
item[index - 1] in self.replacement_exceptions_1.keys()):
if index +1 == len(item):
new_word += f"-{str(self.replacement_exceptions_2.get(item[index - 1], letter))}"
else:
new_word += str(self.replacement_exceptions_2.get(item[index - 1], letter))
elif letter ==":" and item[index-1] not in self.replacement_exceptions_1.keys(): #replacing durationals that are preceded by level tones
if index +1 == len(item):
new_word += f"-{str(self.replacements.get(item[index-1],f"{ item[index-2]+ item[index-1]}" ))}"
else:
new_word += str(self.replacements.get(item[index-1],f"{ item[index-2]+ item[index-1]}" ))
else:
new_word +=letter
else:
new_word=self.parse_noun_durationals(item)
return new_word
def post_editing_short_strings(self, word):
"""
post edits short words that have been "over-parsed" as a result of issues encountered with character fonts
"""
if word is None:
return None
consonant_count=self.consonant_count(word)
if consonant_count <=2 and word.count("-") >1:
word=word.replace("-", "", 1)
return word
def hyphen_space(self, word):
"""
special utility function to remove hyphens after morphemic boundaries indicated by a space and hyphens occurring word initially
"""
new_word=""
for index, letter in enumerate(word):
if letter== "-" and index+1 < len(word) and (word[index+1]== " " or word[index-1]== " ") :
new_word += ""
else:
new_word += letter
return new_word
def maintaining_nasality_on_segment(self, word):
"""
ensures that nasality remains on segment that is marked for it, and also parses sounds occuring after the nasal marker
"""
new_word=""
for index, letter in enumerate(word):
if letter == "-" and index + 1 < len(word) and word[index + 1]== "ⁿ": #zǐǐ-ⁿ zǐ-ǐⁿ can be solved from here but the fonts do not permit this
new_word +=""
else:
new_word += letter
word=""
for index, letter in enumerate(new_word):
idx=len(new_word)-index
if letter == "ⁿ" and 3 >= idx > 1:
if new_word[index]!= "-":
word += f"{letter}-"
else:
word +=letter
if "-" not in word:
new_word=""
for index, letter in enumerate(word):
if letter not in self.consonants and index+1 < len(word) and "ⁿ" in word[index: index+2]:
new_word += f"-{letter}"
else:
new_word +=letter
return self.hyphen_space(new_word)
else:
return self.hyphen_space(word)
def verify_exceptions(self, word):
"""
verifies that consonant ensembles are not parsed as different consonants and reparses long words with consonant ensembles
"""
for i in self.exceptions:
with_hyphen = i[0] + "-" + i[1]
if with_hyphen in word:
idx = word.index(with_hyphen)
word = word[:idx] + i + "-" + word[idx + 3:]
word = word.replace("--", "-")
return self.maintaining_nasality_on_segment(word)
def vowel_tone_hyphen(self, word):
"""
handles cases in which despite parse_durationals, it is difficult to extract vowels
"""
if len(word) >= 4:
if word[-1] not in self.vowels.union(self.consonants) and word[-2] == "-":
return self.verify_exceptions(word[:-3] + "-" + word[-3] + word[-1])
return self.verify_exceptions(word)
def syllabic_vowels(self, word):
"""
isolates syllabic vowels and parses them off
"""
final_word = word
for alphabet in self.vowels:
if alphabet in word[2:-2]:
try:
index = word.index(alphabet)
if word[index - 1] == "-" and word[index + 1] in self.consonants:
final_word = final_word.replace(word[index], f"{alphabet}-")
except ValueError:
pass
return self.vowel_tone_hyphen(final_word.rstrip('-').replace("--", "-"))
def post_coda(self, word):
"""
parses vowels sounds that occur after codas
"""
for i in range(1, len(word) - 3):
if word[i] in self.consonants:
if i + 1 < len(word) and word[i + 1] == "-":
try:
if i + 2 < len(word) and i + 3 < len(word) and word[i + 2] in self.vowels and word[
i + 3] in self.consonants:
material = word[:i + 2] + "-" + word[i + 2:]
return self.syllabic_vowels(material)
else:
return self.syllabic_vowels(word)
except IndexError:
return self.syllabic_vowels(word)
return self.syllabic_vowels(word)
def special_mid_forms(self, item):
"""
takes words of between 3-4 alphabets long that have special characters and parses them
"""
if "-" not in item[-3:] and item[-1] not in self.consonants:
return self.post_coda(item[:-1] + "-" + item[-1:])
return self.post_coda(item)
def special_long_words(self, word):
"""
takes word of length 5 and parses them into CVC: this is because of the font-recognition issues encountered
"""
current_syllable = ''
consonant_count = 0
for i in range(len(word)):
if word[i] != "ⁿ" and word[i] in self.consonants:
consonant_count += 1
if consonant_count == 2 and i < len(word):
word = word[:i + 1] + '-' + word[i + 1:]
consonant_count = 0
return self.post_coda(word)
def long_words(self, word):
"""
parses long words according to a cvc priority structure and also takes care of idiosyncratic words
"""
num=[2,4,6,8]
new_word="" #word parsed according to cvc priority
consonant_count=0
for index, letter in enumerate(word):
if letter == "#" or letter== "ⁿ":
consonant_count=0
new_word +=letter
elif letter!= "ⁿ" and letter in self.consonants: #just to be extra sure
consonant_count +=1
if consonant_count in num:
new_word += f"{letter}-"
elif consonant_count==3 and word[index-1] != " ":
new_word +=f"-{letter}"
else:
new_word += letter
else:
new_word +=letter
if " " in new_word and "-" not in new_word[new_word.index(" "):]: #parsing words such as "gúr-ɔ́ gùrɔ́" which have second parts not responding to code above so they become "gúr-ɔ́ gùr-ɔ́"
consonant_count=self.consonant_count(new_word[new_word.index(" "):])
if consonant_count >=2 and new_word[-1] not in self.consonants:
conso=[x for x in new_word[new_word.index(" "):] if x in self.consonants]
idx=new_word.index(conso[1])
new_word=new_word[:idx+1] + "-" + new_word[idx+1:]
new_word=new_word[:-1].replace("--", "-") if new_word[-1] == '-' else new_word.replace("--", "-")
return self.verify_exceptions(new_word)
def segment_cvcs(self, item):
"""
main segmentation function to be implemented after parse_durationals has been implemented
"""
if len(item) >= 5:
return self.long_words(item)
elif 3 <= len(item) <= 4:
return self.special_mid_forms(item) #calling special_mid_forms function
else:
return self.post_coda(item) #directly sending words to post_coda function for parsing
class AdjectiveNumeralParser(VerbParser):
def __init__(self):
super().__init__()
def isolating_suffixes(self,item):
"""
parses suffixes of adjectives and numerals i.e. vowels and 'y'
"""
consonants=self.consonant_count(item)
if consonants >=2:
new_word=""
idxs=[index for index,letter in enumerate(item) if letter in self.consonants and letter != "ⁿ"] #usual cases
for index, letter in enumerate(item):
if index==idxs[-1] and item[index-1] != "-":
if index+1 >len(item) :
new_word += f"-{letter}"
elif index+1 < len(item) and item[index+1]=="ⁿ":
new_word += f"-{letter}"
else:
new_word += f"{letter}-"
else:
new_word += letter
elif consonants <2 and len(item)>=3:
new_word=""
for letter in item:
if letter in self.consonants and letter != "ⁿ" and item[-1] != letter:
new_word += f"{letter}-"
else:
new_word+=letter
else:
new_word=item
if new_word.endswith("-") and new_word.index("-")+1 < len(new_word) and new_word[new_word.index("-")-1]=="y": #the case of 'y'
new_word=new_word[:new_word.index("-")-1] + "-" + new_word[new_word.index("-")-1:new_word.index("-")]
elif new_word.endswith("-"):
new_word=new_word[:-1]
else:
new_word=new_word
return new_word
def y_suffixes(self, item):
"""
further parses y suffixes due to font instabilities
"""
if item.endswith("y") and "-" not in item:
return item[:item.index("y")] + "-" + item[item.index("y"):]
else:
return item
def replace_hyphens_keep_last(self, item):
"""
corrects overgeneralizations from isolating suffixes method and makes sure only suffixes are parsed
"""
parts=item.rsplit("-",1)
if len(parts)==2:
first_part,last_part=parts
if last_part[-1] in self.consonants and last_part[-1] != "y":
return first_part.replace("-", "")+ "-" + last_part
else:
return first_part.replace("-", "")+ "-" + last_part
else:
return item
def switch_hyphen_position(self, item):
"""
corrects parses where hyphen occurs on the opposite side of expected position
"""
if "-" in item:
hyphen_index = item.index("-")
if len(item) > hyphen_index + 1 and item[hyphen_index + 1] != "y" and item[hyphen_index + 1] in self.consonants:
if len(item) > hyphen_index + 2 and item[hyphen_index+2] in self.vowels:
word = item[:hyphen_index] + item[hyphen_index + 1] + "-" + item[-1:]
return word
return item
def miscellaneous(self, input_string):
"""
fourre tout method to handle adhoc cases of wrong parse as a result of font instability
"""
x = [i for i in range(len(input_string)) if i > 0 and i < len(input_string) - 3] #handles cases such as in "núm-ɔ̀ẁ"
if "-" in input_string and input_string.index("-") in x and input_string[-1] != "y":
return input_string[:input_string.index("-")] + input_string[input_string.index("-") + 1:]
y = [index for index, letter in enumerate(input_string) if letter == "ⁿ"] #handles cases such as in "dííⁿ díí-wⁿ"
if "ⁿ" in input_string[:-2] and "-" in input_string[:y[-1]] and input_string[y[-1]-1] in self.consonants and input_string[y[-1]-1] != "y":
return input_string[:input_string.index("-")] + input_string[input_string.index("-") + 1:]
else:
return input_string