From 215c736bcd21f68f3690ac6e5ac0a03a896dac0a Mon Sep 17 00:00:00 2001 From: Rajat Tandon Date: Sun, 16 May 2021 16:57:53 -0700 Subject: [PATCH] Filter_Group_Users_2020 --- Filter_Group_Users_2020.py | 250 ++++++++++++++----------------------- 1 file changed, 97 insertions(+), 153 deletions(-) diff --git a/Filter_Group_Users_2020.py b/Filter_Group_Users_2020.py index 6b7c0ee..b3fac16 100644 --- a/Filter_Group_Users_2020.py +++ b/Filter_Group_Users_2020.py @@ -18,11 +18,7 @@ CHUNKSIZE = 10000 #===============================================================# PATH_TO_AA_LIST = "data/AA.txt" -PATH_TO_FRATERNITY_LIST = "data/FRATERNITY.txt" PATH_TO_GAMBLING_LIST = "data/GAMBLING.txt" -PATH_TO_ADULT_LIST = "data/ADULT.txt" -PATH_TO_DRUGS_LIST = "data/DRUGS.txt" -PATH_TO_ALCOHOLICS_LIST = "data/ALCOHOLICS.txt" PATH_TO_STOPWORDS_LIST = "data/STOPWORDS.txt" #===============================================================# USERS_FILE = "checkpoint_part1/PARTIAL_OUTPUT.txt" @@ -52,12 +48,7 @@ #===============================================================# aa=set() -fraternity = set() -fraternity_small = set() gambling = set() -adult = set() -drugs = set() -alcoholics = set() filter_users = {} @@ -69,6 +60,21 @@ transactions = 0 pattern = re.compile(r"(.)\1{2,}") + +#SINGLE WORDS SEPARATE PROCESSING# +aa_more = set() +aa_more.add(" AA ") +aa_more.add("Awakening") +aa_more.add("awakening") +aa_more.add("Gratitude") +aa_more.add("GRATITUDE") +aa_more.add("gratitude") +aa_more.add("AWAKENING") +aa_more.add("Reflections") +aa_more.add("reflections") +aa_more.add("Sobriety") +aa_more.add("SOBRIETY") +aa_more.add("sobriety") #===============================================================# f = open(sys.argv[1]) @@ -142,33 +148,10 @@ def remove_special(tokens): for l in fp: gambling.add(''.join(convert_letters(l.strip()))) -with open(PATH_TO_ADULT_LIST,'r') as fp: - for l in fp: - adult.add(''.join(convert_letters(l.strip()))) - -with open(PATH_TO_DRUGS_LIST,'r') as fp: - for l in fp: - drugs.add(''.join(convert_letters(l.strip()))) - -with open(PATH_TO_ALCOHOLICS_LIST,'r') as fp: - for l in fp: - alcoholics.add(''.join(convert_letters(l.strip()))) - with open(PATH_TO_AA_LIST,'r') as fp: for l in fp: aa.add(''.join(convert_letters(l.strip()))) -with open(PATH_TO_FRATERNITY_LIST,'r') as fp: - for l in fp: - fraternity_small.add(''.join(convert_letters(l.strip()))) - -with open(PATH_TO_FRATERNITY_LIST,'r') as fp: - for l in fp: - fraternity.add(l.strip()) - - - - #===============================================================# """ @@ -219,207 +202,168 @@ def preprocessing(origtokens): outputfile.close() - if(transactions < current or len(row) != 11): + if(transactions < current or len(row) < 11): continue tusername = row[6] - name = row[10] + name = row[10].strip() firstname = lastname = name ## IDENTIFIED FROM Unames - note = row[1] + note = row[1] + + lfirstname = firstname.lower() + llastname = lastname.lower() + lname = name.lower() + + + #print(tusername,lastname,firstname,name,note) parts = tusername.split("-") flag = 0 for username in parts: - if( username == "AA" or name == "AA" or " AA " in name or name.startswith("AA ") or name.endswith("AA")): - if(username not in filter_users): - filter_users[username] = {} - filter_users[username]['C'] = set() - filter_users[username]['C'].add("AA-U") + lusername = username.lower() + if(firstname == "AA" or lastname == "AA" or username == "AA" or name == "AA" or " AA " in name or name.startswith("AA ") or name.endswith("AA")): + if(tusername not in filter_users): + filter_users[tusername] = {} + filter_users[tusername]['C'] = set() + filter_users[tusername]['C'].add("AA-U") flag = 1 break - - elif(firstname in aa or lastname in aa or username in aa or name in aa): - if(username not in filter_users): - filter_users[username] = {} - filter_users[username]['C'] = set() - filter_users[username]['C'].add("AA-U") + + elif(firstname in aa or lastname in aa or username in aa or name in aa or firstname in aa_more or lastname in aa_more or username in aa_more or name in aa_more): + if(tusername not in filter_users): + filter_users[tusername] = {} + filter_users[tusername]['C'] = set() + filter_users[tusername]['C'].add("AA-U") flag = 1 break - - elif(firstname in fraternity or lastname in fraternity or username in fraternity or name in fraternity): - if(username not in filter_users): - filter_users[username] = {} - filter_users[username]['C'] = set() - filter_users[username]['C'].add("F-U") - flag = 1 + elif(lfirstname in aa or llastname in aa or lusername in aa or lname in aa or lfirstname in aa_more or llastname in aa_more or lusername in aa_more or lname in aa_more): + if(tusername not in filter_users): + filter_users[tusername] = {} + filter_users[tusername]['C'] = set() + filter_users[tusername]['C'].add("AA-U") + flag = 1 break - - + elif(firstname in gambling or lastname in gambling or username in gambling or name in gambling): - if(username not in filter_users): - filter_users[username] = {} - filter_users[username]['C'] = set() - filter_users[username]['C'].add("G-U") - flag = 1 - break - - elif(firstname in adult or lastname in adult or username in adult or name in adult): - if(username not in filter_users): - filter_users[username] = {} - filter_users[username]['C'] = set() - filter_users[username]['C'].add("A-U") - flag = 1 - break - - elif(firstname in drugs or lastname in drugs or username in drugs or name in drugs): - if(username not in filter_users): - filter_users[username] = {} - filter_users[username]['C'] = set() - filter_users[username]['C'].add("D-U") - flag = 1 - break - - elif(firstname in alcoholics or lastname in alcoholics or username in alcoholics or name in alcoholics): - if(username not in filter_users): - filter_users[username] = {} - filter_users[username]['C'] = set() - filter_users[username]['C'].add("AL-U") - flag = 1 + if(tusername not in filter_users): + filter_users[tusername] = {} + filter_users[tusername]['C'] = set() + filter_users[tusername]['C'].add("G-U") + flag = 1 break - - if(flag == 1): - continue - - for l in fraternity: - if(len(l) > 3 and (firstname in l or lastname in l or tusername in l or name in l)): - if(username not in filter_users): + elif(lfirstname in gambling or llastname in gambling or lusername in gambling or lname in gambling): + if(tusername not in filter_users): filter_users[tusername] = {} filter_users[tusername]['C'] = set() - filter_users[tusername]['C'].add("F-U") + filter_users[tusername]['C'].add("G-U") flag = 1 break - if(flag == 1): - continue - note = note.lower() - origtokens = nltk.word_tokenize(note) - origtokens = preprocessing(origtokens) - - if(len(origtokens) > 30): + if(flag == 1): continue - - - - for l in aa: - if( l in name): - if(username not in filter_users): + + parts = name.split(" ") + for username in parts: + lusername = username.lower() + if(firstname == "AA" or lastname == "AA" or username == "AA" or name == "AA" or " AA " in name or name.startswith("AA ") or name.endswith("AA")): + if(tusername not in filter_users): filter_users[tusername] = {} filter_users[tusername]['C'] = set() filter_users[tusername]['C'].add("AA-U") flag = 1 break - if(l in note): - if(username not in filter_users): + + elif(firstname in aa or lastname in aa or username in aa or name in aa or firstname in aa_more or lastname in aa_more or username in aa_more or name in aa_more): + if(tusername not in filter_users): filter_users[tusername] = {} filter_users[tusername]['C'] = set() - filter_users[tusername]['C'].add("AA-N") + filter_users[tusername]['C'].add("AA-U") flag = 1 break - if(flag == 1): - continue - - - - - for l in origtokens: - - if(l in fraternity_small): - if(username not in filter_users): + elif(lfirstname in aa or llastname in aa or lusername in aa or lname in aa or lfirstname in aa_more or llastname in aa_more or lusername in aa_more or lname in aa_more): + if(tusername not in filter_users): filter_users[tusername] = {} filter_users[tusername]['C'] = set() - filter_users[tusername]['C'].add("F-N") + filter_users[tusername]['C'].add("AA-U") flag = 1 break - elif(l in gambling): - if(username not in filter_users): + elif(firstname in gambling or lastname in gambling or username in gambling or name in gambling): + if(tusername not in filter_users): filter_users[tusername] = {} filter_users[tusername]['C'] = set() - filter_users[tusername]['C'].add("G-N") + filter_users[tusername]['C'].add("G-U") flag = 1 break - - elif(l in adult): - if(username not in filter_users): + + elif(lfirstname in gambling or llastname in gambling or lusername in gambling or lname in gambling): + if(tusername not in filter_users): filter_users[tusername] = {} filter_users[tusername]['C'] = set() - filter_users[tusername]['C'].add("A-N") + filter_users[tusername]['C'].add("G-U") flag = 1 break - elif(l in drugs): - if(username not in filter_users): - filter_users[tusername] = {} - filter_users[tusername]['C'] = set() - filter_users[tusername]['C'].add("D-N") - flag = 1 - break + if(flag == 1): + continue + + note = note.lower() + origtokens = nltk.word_tokenize(note) + origtokens = preprocessing(origtokens) + + if(len(origtokens) > 30): + continue + name = name.lower() - elif(l in alcoholics): + for l in aa: + if( l in name): if(username not in filter_users): filter_users[tusername] = {} filter_users[tusername]['C'] = set() - filter_users[tusername]['C'].add("AL-N") + filter_users[tusername]['C'].add("AA-U") flag = 1 break - - - if(flag == 1): - continue - - for l in gambling: - if(len(l) > 3 and (firstname in l or lastname in l or tusername in l or name in l)): + if(l in note): if(username not in filter_users): filter_users[tusername] = {} filter_users[tusername]['C'] = set() - filter_users[tusername]['C'].add("G-U") + filter_users[tusername]['C'].add("AA-N") flag = 1 break - if(flag == 1): continue - for l in adult: - if(len(l) > 3 and (firstname in l or lastname in l or tusername in l or name in l)): + for l in origtokens: + + if(l in gambling): if(username not in filter_users): filter_users[tusername] = {} filter_users[tusername]['C'] = set() - filter_users[tusername]['C'].add("A-U") + filter_users[tusername]['C'].add("G-N") flag = 1 break - if(flag == 1): continue - for l in alcoholics: - if(len(l) > 3 and (firstname in l or lastname in l or tusername in l or name in l)): + for l in gambling: + if(len(l) > 3 and ((len(firstname) > 3 and firstname in l) or (len(lastname) > 3 and lastname in l) or (len(tusername) > 3 and tusername in l) or (len(name) > 3 and name in l))): if(username not in filter_users): filter_users[tusername] = {} filter_users[tusername]['C'] = set() - filter_users[tusername]['C'].add("AL-U") + filter_users[tusername]['C'].add("G-U") flag = 1 break - - + if(flag == 1): + continue except: continue