# words.py

import pandas as pd
import eng_to_ipa as ipa
import subprocess
import openpyxl
import argparse
import random


# Word object that stores the word itself, its IPA transcription, the IPA transcription as a list, and its FREQcount.
# FREQcount and the word are provided by SUBTLEX.
class Words:
    """One vocabulary entry: the word, its IPA forms and its frequency count.

    Attributes are stored exactly as passed to the constructor:
    ``WORD``, ``IPA``, ``IPA_LIST`` and ``FREQcount``.
    """

    def __init__(self, WORD, IPA, IPA_LIST, FREQcount):
        # Plain record: no validation or conversion is applied here.
        self.WORD, self.IPA, self.IPA_LIST, self.FREQcount = (
            WORD,
            IPA,
            IPA_LIST,
            FREQcount,
        )

# Updates the dataset. When only a subset of the words is wanted, this repeatedly drops the words below the median frequency, roughly halving the list.
# Once the list can no longer be halved, words are removed at random from the second half of the list until the desired size is reached.
def update_list(size, words):
    """Trim ``words`` in place down to ``size`` entries, keeping frequent words.

    The list is sorted by ``FREQcount`` (highest first).  While half of the
    list is still larger than ``size``, the entries below the median
    frequency are dropped.  The remaining surplus is then removed one entry
    at a time, chosen at random from the low-frequency tail.

    Args:
        size: Target number of entries; must not be negative.
        words: List of objects exposing ``WORD`` and ``FREQcount``.  It is
            sorted and shortened in place.

    Raises:
        ValueError: If ``size`` is negative.

    A list that already holds ``size`` entries or fewer is only sorted.
    Progress messages are printed to stdout.
    """
    if size < 0:
        raise ValueError(f"size must be non-negative, got {size}")

    words.sort(key=lambda entry: entry.FREQcount, reverse=True)
    # Legacy debug output; guarded so short lists no longer raise IndexError.
    if len(words) > 3:
        print(words[3].WORD)

    # Phase 1: keep (roughly) the upper half while that still exceeds `size`.
    while len(words) // 2 > size:
        mid = len(words) // 2
        median = (words[mid].FREQcount + words[-mid - 1].FREQcount) / 2
        if words[mid].FREQcount < median:
            keep = mid
        elif mid + 1 < len(words) and words[mid + 1].FREQcount < median:
            keep = mid + 1
        else:
            keep = mid + 2
        if keep >= len(words):
            # Nothing would be deleted (tiny list or tied frequencies);
            # leave the rest to the random phase instead of looping forever.
            break
        del words[keep:]

    print(f"Words after first for loop {len(words)}")
    if len(words) <= size:
        # Already small enough: a list cannot be grown by trimming.
        return

    # Phase 2: random removal from the tail behind the median entry.
    mid = len(words) // 2
    median = (words[mid].FREQcount + words[-mid - 1].FREQcount) / 2
    anchor = mid if words[mid].FREQcount >= median else mid + 1
    # Entries up to the anchor are protected, but never more than `size`
    # of them, otherwise the target length could not be reached.
    first_removable = min(anchor + 1, size)

    while len(words) > size:
        element = random.randrange(first_removable, len(words))
        words.pop(element)
        print(len(words))

# This uses SUBTLEX-US-Copy which contains the IPA forms generated by our project, and grabs all information from the xlsx file    
def add_words_to_list_from_file(words):
    """Append one ``Words`` entry per row of 'SUBTLEX-US-Copy.xlsx' to ``words``.

    Reads the 'Word', 'IPA', 'IPA-List' and 'FREQcount' columns.  The text
    columns are converted with ``str`` and stripped, 'IPA-List' is
    additionally split on whitespace, and 'FREQcount' is converted to
    ``int``.

    Args:
        words: List that receives the new ``Words`` objects (mutated in place).
    """
    sheet = pd.read_excel('SUBTLEX-US-Copy.xlsx')
    rows = zip(sheet['Word'], sheet['IPA'], sheet['IPA-List'], sheet['FREQcount'])
    for raw_word, raw_ipa, raw_ipa_list, raw_freq in rows:
        spelling = str(raw_word).strip()
        transcription = str(raw_ipa).strip()
        phonemes = str(raw_ipa_list).strip().split()
        count = int(raw_freq)
        words.append(Words(spelling, transcription, phonemes, count))


# This uses the original SUBTLEX File and generates IPA transcriptions for each word then adds it to our dataset
# ~12 hours to run
def add_words_to_list(words):
    """Transcribe every word of 'SUBTLEX-US-Copy.xlsx' to IPA and append it.

    Each word is converted with ``ipa.convert`` and the stress marks
    "ˈ" and "ˌ" are removed.  When the result ends in "*", the external
    script 'ipa_translator.sh' is run for the word and the transcription
    is taken from the first line of 'ipa_translation.txt' instead.

    Args:
        words: List that receives one ``Words`` object per spreadsheet row
            (mutated in place).  ``IPA_LIST`` is the transcription split
            into single characters.

    Raises:
        IndexError: If the fallback file 'ipa_translation.txt' is empty.
    """
    df = pd.read_excel('SUBTLEX-US-Copy.xlsx')
    for raw_word, raw_freq in zip(df['Word'], df['FREQcount']):
        WORD = str(raw_word).strip()
        IPA = str(ipa.convert(WORD)).strip()
        IPA = IPA.replace("ˈ", "").replace("ˌ", "")

        # endswith() is also safe for an empty transcription, which the
        # previous last-character index lookup was not.
        # NOTE(review): a trailing "*" presumably marks a word eng_to_ipa
        # could not transcribe - confirm against the library docs.
        if IPA.endswith("*"):
            # NOTE(review): the exit status is not checked, so a failed run
            # silently reuses whatever the file already contains - confirm
            # whether that best-effort behaviour is intended.
            subprocess.run(['bash', 'ipa_translator.sh', WORD])
            with open('ipa_translation.txt') as f:
                first_line = f.readlines()[0]
            # Drop the leading character and all spaces, then turn the ">"
            # separators into spaces and trim the ends.
            IPA = first_line[1:].replace(" ", "").replace(">", " ").strip()

        FREQcount = int(raw_freq)
        words.append(Words(WORD, IPA, list(IPA), FREQcount))

#Some Phonetic transcriptions are two characters. This combines the characters as one entity in our IPA_LIST
def update_ipa(words):
    """Merge two-character phonemes in every entry's ``IPA_LIST``.

    Adjacent elements that together form "oʊ", "ɔɪ", "aɪ" or "aʊ" are
    combined into one list element.  "ər" is combined only when it is not at
    the end of the list and the element that follows is not a vowel.

    Args:
        words: Sequence of objects with an ``IPA_LIST`` attribute.  Each
            non-empty ``IPA_LIST`` is replaced by a new, merged list; empty
            lists are left untouched.
    """
    two_character_phonemes = ("oʊ", "ɔɪ", "aɪ", "aʊ")
    vowels = ("ɑ", "æ", "ə", "ʌ", "ɔ", "a", "aɪ", "aʊ", "ɛ", "e", "ɪ", "i", "o", "ʊ", "u")

    for entry in words:
        phones = entry.IPA_LIST
        if not phones:
            # Nothing to merge; previously this raised IndexError.
            continue

        merged = [phones[0]]
        for k in range(1, len(phones)):
            pair = phones[k - 1] + phones[k]
            is_r_coloured = (
                pair == "ər"
                and k + 1 < len(phones)
                and phones[k + 1] not in vowels
            )
            if is_r_coloured or pair in two_character_phonemes:
                # The previous element is the first half of the pair:
                # replace it with the combined phoneme.
                merged[-1] = pair
            else:
                merged.append(phones[k])

        entry.IPA_LIST = merged
                
                
# This function counts the words without POS tags in SUBTLEX and writes them to a file
def words_without_pos(words):
    """Write every entry whose POS is 'nan' to 'words-without-pos.txt'.

    The file is overwritten and starts with a header line; each following
    line holds the entry's ``word`` (left-aligned to 100 columns) and its
    ``FREQ``.

    Args:
        words: Sequence of objects with ``POS``, ``word`` and ``FREQ``
            attributes.
            NOTE(review): the ``Words`` class in this file exposes ``WORD``
            and ``FREQcount`` and has no ``POS`` - presumably this helper
            targets an older record layout; confirm against callers.

    Returns:
        The number of entries written.
    """
    total = 0
    # The context manager closes the file even if an attribute lookup fails.
    with open("words-without-pos.txt", "w") as f:
        f.write(f'{"WORD":<100} \t FREQ \n')
        for entry in words:
            if str(entry.POS) == 'nan':
                f.write(f'{entry.word:<100} \t {entry.FREQ} \n')
                total += 1
    return total


# currently ignoring nan POS
def total_pos(words, total_nan, total_words=74286):
    """Return the number of POS-tagged entries per word that has a POS.

    Entries whose ``POS`` stringifies to 'nan' are ignored.  The count of
    the remaining entries is divided by ``total_words - total_nan`` and the
    result is printed and returned.

    Args:
        words: Sequence of objects with a ``POS`` attribute.
        total_nan: Number of words without any POS tag.
        total_words: Size of the whole vocabulary.  Defaults to 74286
            (presumably the SUBTLEX-US word count - confirm).

    Returns:
        ``tagged / (total_words - total_nan)`` as a float.

    Raises:
        ZeroDivisionError: If ``total_words`` equals ``total_nan``.
    """
    tagged = sum(1 for entry in words if str(entry.POS) != 'nan')
    denominator = total_words - total_nan
    average = tagged / denominator
    # Report the denominator that is actually used; the old message always
    # printed 74095, which is only right when total_nan is 191.
    print(f"{tagged} / {denominator} POS. Averaging {average}")
    return average


# currently ignoring nan POS
def total_pos_reading_from_file():
    """Print the average number of POS tags per word in 'SUBTLEX-US-Copy.xlsx'.

    The 'All_PoS_SUBTLEX' cell of each row is stringified and split on ".";
    every piece counts as one tag.  Rows whose cell is just 'nan' are left
    out of both the tag total and the word count.
    """
    sheet = pd.read_excel('SUBTLEX-US-Copy.xlsx')
    tag_total = 0
    tagged_words = 0
    for row in range(len(sheet['Word'])):
        tags = str(sheet['All_PoS_SUBTLEX'][row]).split(".")
        if tags == ["nan"]:
            # No POS information for this word.
            continue
        tagged_words += 1
        tag_total += len(tags)
    print("Using file reading... ")
    print(f"{tag_total} / {tagged_words} POS. Averaging {tag_total / tagged_words}")


# Does not include nan values
def frequency_distribution(freq_words):
    """Write the POS-combination frequency table to 'frequency-distribution.txt'.

    Counts how often each distinct 'All_PoS_SUBTLEX' value occurs in
    'SUBTLEX-US-Copy.xlsx' (cells that stringify to 'nan' are skipped) and
    writes the values, most frequent first, followed by a closing line with
    ``freq_words``.  Combinations are compared as whole strings, so "X.Y"
    and "Y.X" are counted separately.

    Args:
        freq_words: Average number of POS tags per word; only echoed into
            the last line of the output file.
    """
    df = pd.read_excel('SUBTLEX-US-Copy.xlsx')
    frequency = {}
    for cell in df['All_PoS_SUBTLEX']:
        POS = str(cell)
        if POS != 'nan':  # drop this check to include nan in the distribution
            frequency[POS] = frequency.get(POS, 0) + 1

    ranked = sorted(frequency.items(), key=lambda item: item[1], reverse=True)
    # The context manager guarantees the file is closed even if a write fails.
    with open("frequency-distribution.txt", "w") as f:
        for pos, count in ranked:
            f.write(f'{pos:<100} \t {count} \n')
        f.write(f'\nThe average POS per word is: {freq_words}')