-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwords.py
197 lines (168 loc) · 7.32 KB
/
words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import pandas as pd
import eng_to_ipa as ipa
import subprocess
import openpyxl
import argparse
import random
# Word object that stores the word itself, its IPA transcription, the IPA transcription in list form, and FREQcount
# FREQcount and the word are provided by SUBTLEX
class Words:
    """A single vocabulary entry together with its IPA data.

    Every constructor argument is stored unchanged on the instance under the
    attribute of the same name: ``WORD``, ``IPA``, ``IPA_LIST`` and
    ``FREQcount``.
    """

    def __init__(self, WORD, IPA, IPA_LIST, FREQcount):
        # Word text and its IPA transcription as one string.
        self.WORD, self.IPA = WORD, IPA
        # The transcription as a list, and the frequency count.
        self.IPA_LIST, self.FREQcount = IPA_LIST, FREQcount
# This is for updating the dataset. If you are not selecting all words to examine, it repeatedly halves the list by trimming the words below the median frequency
# Once the dataset can no longer be halved, words are randomly removed from the second half of the list until we reach the desired size
def update_list(size, words):
    """Trim ``words`` in place down to ``size`` entries, keeping frequent ones.

    The list is first sorted by ``FREQcount``, highest first. While more than
    twice ``size`` entries remain, everything below the median frequency is
    cut off. Any remaining surplus is then removed at random from the entries
    that follow the median; only if that tail is too small to reach ``size``
    does the random removal reach further up the list.

    If ``words`` already holds ``size`` entries or fewer, it is only sorted.

    Args:
        size: Desired number of entries; must not be negative.
        words: List of objects exposing ``WORD`` and ``FREQcount``. Mutated
            in place and left sorted by descending ``FREQcount``.

    Raises:
        ValueError: If ``size`` is negative.
    """
    if size < 0:
        raise ValueError("size must not be negative")

    words.sort(key=lambda x: x.FREQcount, reverse=True)
    # Debug output kept from the original; guarded so that lists with fewer
    # than four entries no longer raise IndexError.
    if len(words) > 3:
        print(words[3].WORD)

    # Phase 1: keep halving while the upper half alone is still too large.
    while len(words) // 2 > size:
        mid = len(words) // 2
        # words[mid] and words[-mid - 1] are the two middle entries for an
        # even length and the same entry for an odd length.
        median = (words[mid].FREQcount + words[-mid - 1].FREQcount) / 2
        if words[mid].FREQcount < median:
            cut = mid
        elif mid + 1 < len(words) and words[mid + 1].FREQcount < median:
            cut = mid + 1
        else:
            cut = mid + 2
        if cut >= len(words):
            # Ties around the median on a very short list: the cut would
            # remove nothing, so stop instead of looping forever.
            break
        del words[cut:]

    print(f"Words after first for loop {len(words)}")

    # Phase 2: remove the remaining surplus at random.
    if len(words) > size:
        mid = len(words) // 2
        median = (words[mid].FREQcount + words[-mid - 1].FREQcount) / 2
        boundary = mid if words[mid].FREQcount >= median else mid + 1
        # Normally only entries after the median boundary are candidates.
        # Clamp to ``size`` so the candidate range can never run empty
        # before the target length is reached.
        lower = min(boundary + 1, size)
        while len(words) > size:
            words.pop(random.randrange(lower, len(words)))

    print(len(words))
# This uses SUBTLEX-US-Copy which contains the IPA forms generated by our project, and grabs all information from the xlsx file
def add_words_to_list_from_file(words):
    """Append one ``Words`` entry to ``words`` per row of the workbook.

    Reads ``SUBTLEX-US-Copy.xlsx`` from the current working directory and
    uses its ``Word``, ``IPA``, ``IPA-List`` and ``FREQcount`` columns.
    ``Word`` and ``IPA`` are stringified and stripped, ``IPA-List`` is
    additionally split on whitespace, and ``FREQcount`` is converted to
    ``int``.

    Args:
        words: List that receives the new ``Words`` objects, in sheet order.
    """
    df = pd.read_excel('SUBTLEX-US-Copy.xlsx')
    # Walk the four columns in lockstep rather than doing a label lookup per
    # cell; values and order are the same as indexing row by row.
    rows = zip(df['Word'], df['IPA'], df['IPA-List'], df['FREQcount'])
    for raw_word, raw_ipa, raw_ipa_list, raw_freq in rows:
        words.append(
            Words(
                str(raw_word).strip(),
                str(raw_ipa).strip(),
                str(raw_ipa_list).strip().split(),
                int(raw_freq),
            )
        )
# This uses the original SUBTLEX File and generates IPA transcriptions for each word then adds it to our dataset
# ~12 hours to run
def add_words_to_list(words):
    """Transcribe every word in the workbook to IPA and append it to ``words``.

    Reads the ``Word`` and ``FREQcount`` columns of ``SUBTLEX-US-Copy.xlsx``.
    Each word is converted with ``ipa.convert`` and the stress marks ``ˈ`` and
    ``ˌ`` are removed. When the result ends in ``*``, ``ipa_translator.sh`` is
    run with the word as its argument, and the first line of
    ``ipa_translation.txt`` is used instead. That line has its first
    character dropped, spaces removed and ``>`` turned into a space.

    Args:
        words: List that receives one ``Words`` object per row; its
            ``IPA_LIST`` is the transcription split into single characters.
    """
    df = pd.read_excel('SUBTLEX-US-Copy.xlsx')
    for raw_word, raw_freq in zip(df['Word'], df['FREQcount']):
        WORD = str(raw_word).strip()
        IPA = str(ipa.convert(WORD)).strip()
        IPA = IPA.replace("ˈ", "").replace("ˌ", "")
        # endswith() is safe on an empty transcription, unlike indexing the
        # last character. NOTE(review): the trailing "*" is presumably
        # eng_to_ipa's marker for a word it could not transcribe - confirm.
        if IPA.endswith("*"):
            # The script's result is read from the file below, so the
            # CompletedProcess return value is not needed.
            subprocess.run(['bash', 'ipa_translator.sh', WORD])
            # NOTE(review): opened with the platform default encoding; confirm
            # it matches what ipa_translator.sh writes.
            with open('ipa_translation.txt') as f:
                lines = f.readlines()
            IPA = lines[0][1:].replace(" ", "")
            IPA = IPA.replace(">", " ")
            # strip() also removes the trailing newline.
            IPA = IPA.strip()
        FREQcount = int(raw_freq)
        words.append(Words(WORD, IPA, list(IPA), FREQcount))
# Some phonemes are transcribed with two characters. This merges each such pair into a single entry in IPA_LIST
def update_ipa(words):
    """Merge two-symbol phonemes in every entry's ``IPA_LIST``.

    Adjacent symbols are joined into one list element when they form one of
    ``oʊ``, ``ɔɪ``, ``aɪ`` or ``aʊ``. The pair ``ər`` is joined only when it
    is followed by another symbol and that symbol is not in the vowel table.
    Each entry's ``IPA_LIST`` is replaced by the merged list; an empty
    ``IPA_LIST`` stays empty.

    Args:
        words: Iterable of objects with an ``IPA_LIST`` sequence of strings.
    """
    two_character_phonemes = {"oʊ", "ɔɪ", "aɪ", "aʊ"}
    vowels = {"ɑ", "æ", "ə", "ʌ", "ɔ", "a", "aɪ", "aʊ", "ɛ", "e", "ɪ", "i", "o", "ʊ", "u"}

    for entry in words:
        symbols = entry.IPA_LIST
        last = len(symbols) - 1
        merged = []
        for k, symbol in enumerate(symbols):
            if k > 0:
                pair = symbols[k - 1] + symbol
                # NOTE(review): a word-final "ər" is deliberately left split
                # here because the original condition excludes the last
                # position - confirm that this is intended.
                joins_er = (
                    pair == "ər"
                    and k != last
                    and symbols[k + 1] not in vowels
                )
                if joins_er or pair in two_character_phonemes:
                    # Replace the previously emitted symbol with the pair.
                    merged[-1] = pair
                    continue
            merged.append(symbol)
        entry.IPA_LIST = merged
# This function writes the words that have no POS tag in SUBTLEX to a text file and returns how many there are
def words_without_pos(words):
    """Write every entry whose POS is ``nan`` to a report and count them.

    The report is written to ``words-without-pos.txt`` in the current working
    directory, replacing any existing file. It has a header line followed by
    one line per matching entry, holding the word and its frequency.

    Args:
        words: Iterable of objects exposing ``POS``, ``word`` and ``FREQ``.
            NOTE(review): the ``Words`` class in this file defines ``WORD``
            and ``FREQcount`` and has no ``POS``; this function appears to
            target an earlier entry shape - confirm against callers.

    Returns:
        The number of entries whose ``str(POS)`` equals ``'nan'``.
    """
    total = 0
    # The context manager closes the file even if an entry lacks an attribute.
    with open("words-without-pos.txt", "w+") as f:
        f.write(f'{"WORD":<100} \t FREQ \n')
        for entry in words:
            if str(entry.POS) == 'nan':
                f.write(f'{entry.word:<100} \t {entry.FREQ} \n')
                total += 1
    return total
# currently ignoring nan POS
def total_pos(words, total_nan, total_words=74286):
    """Return the share of words that carry a POS tag.

    Counts the entries whose ``str(POS)`` is not ``'nan'`` and divides that
    count by ``total_words - total_nan``. The result is also printed.

    Args:
        words: Iterable of objects exposing ``POS``.
        total_nan: Number of entries without a POS tag; subtracted from
            ``total_words`` to form the denominator.
        total_words: Size of the full word list. Defaults to 74286, the
            constant the original code used.

    Returns:
        The tagged count divided by ``total_words - total_nan``.

    Raises:
        ZeroDivisionError: If ``total_words`` equals ``total_nan``.
    """
    tagged = sum(1 for entry in words if str(entry.POS) != 'nan')
    denominator = total_words - total_nan
    average = tagged / denominator
    # Print the denominator actually used. The old message hard-coded 74095,
    # which only matches the default corpus size when total_nan is 191.
    print(f"{tagged} / {denominator} POS. Averaging {average}")
    return average
# currently ignoring nan POS
def total_pos_reading_from_file():
    """Print the average number of POS tags per tagged word in the workbook.

    Reads ``SUBTLEX-US-Copy.xlsx`` and splits each ``All_PoS_SUBTLEX`` cell on
    ``.``. Cells whose text is exactly ``nan`` are skipped. Every other cell
    counts as one word and contributes its number of ``.``-separated parts.
    Two lines are printed; nothing is returned.
    """
    df = pd.read_excel('SUBTLEX-US-Copy.xlsx')
    # One tag count per row that actually has a POS entry.
    tag_counts = [
        len(tags)
        for tags in (
            str(df['All_PoS_SUBTLEX'][row]).split(".")
            for row in range(len(df['Word']))
        )
        if tags != ["nan"]
    ]
    total = sum(tag_counts)
    num_words = len(tag_counts)
    print("Using file reading... ")
    print(f"{total} / {num_words} POS. Averaging {total / num_words}")
# Does not include nan values
def frequency_distribution(freq_words):
    """Write how often each POS string occurs to ``frequency-distribution.txt``.

    Reads the ``All_PoS_SUBTLEX`` column of ``SUBTLEX-US-Copy.xlsx`` and
    counts each distinct cell text, skipping ``nan``. A combined value such as
    ``X.Y`` is counted as its own key, separate from ``Y.X``. The counts are
    written most frequent first, followed by a closing line that reports
    ``freq_words``.

    Args:
        freq_words: Value reported in the closing "average POS per word" line.
    """
    frequency = {}
    df = pd.read_excel('SUBTLEX-US-Copy.xlsx')
    for cell in df['All_PoS_SUBTLEX']:
        POS = str(cell)
        if POS != 'nan':  # drop this check to include nan in the distribution
            frequency[POS] = frequency.get(POS, 0) + 1

    # The sort is stable, so keys with equal counts stay in first-seen order.
    sort_frequency = sorted(frequency.items(), key=lambda x: x[1], reverse=True)

    # The context manager closes the file even if a write fails.
    with open("frequency-distribution.txt", "w+") as f:
        for pos, count in sort_frequency:
            f.write(f'{pos:<100} \t {count} \n')
        f.write(f'\nThe average POS per word is: {freq_words}')