Skip to content

Commit

Permalink
Filter_Group_Users_2020
Browse files Browse the repository at this point in the history
  • Loading branch information
rajattan committed May 16, 2021
1 parent 3960698 commit 215c736
Showing 1 changed file with 97 additions and 153 deletions.
250 changes: 97 additions & 153 deletions Filter_Group_Users_2020.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,7 @@
# NOTE(review): the consumer of CHUNKSIZE is outside this excerpt —
# presumably the number of rows handled per batch; confirm.
CHUNKSIZE = 10000
#===============================================================#
# Keyword list files, read line by line. The loaders visible further down
# (AA, fraternity, adult, drugs, alcoholics) fill the module-level sets of
# the matching names from these paths.
PATH_TO_AA_LIST = "data/AA.txt"
PATH_TO_FRATERNITY_LIST = "data/FRATERNITY.txt"
PATH_TO_GAMBLING_LIST = "data/GAMBLING.txt"
PATH_TO_ADULT_LIST = "data/ADULT.txt"
PATH_TO_DRUGS_LIST = "data/DRUGS.txt"
PATH_TO_ALCOHOLICS_LIST = "data/ALCOHOLICS.txt"
# NOTE(review): the code that reads the stopword list is not visible here.
PATH_TO_STOPWORDS_LIST = "data/STOPWORDS.txt"
#===============================================================#
# NOTE(review): looks like the output of an earlier pipeline stage that this
# script consumes — confirm against the code that opens it.
USERS_FILE = "checkpoint_part1/PARTIAL_OUTPUT.txt"
Expand Down Expand Up @@ -52,12 +48,7 @@
#===============================================================#

# Keyword sets, one per category. Except for `fraternity`, the visible
# loaders store each entry after passing it through convert_letters.
aa=set()
# `fraternity` keeps the raw stripped lines of the fraternity list, while
# `fraternity_small` keeps the convert_letters-normalised form of the same file.
fraternity = set()
fraternity_small = set()
gambling = set()
adult = set()
drugs = set()
alcoholics = set()

# Flagged users: maps a username to a dict whose 'C' entry is a set of
# category tags (e.g. "AA-U", "G-U", "AA-N", "G-N") added by the main loop.
filter_users = {}

Expand All @@ -69,6 +60,21 @@
# Transaction counter; the main loop compares it against `current`.
transactions = 0

# Matches any single character repeated three or more times in a row.
pattern = re.compile(r"(.)\1{2,}")

# Extra single-word AA markers, kept apart from the list loaded from file.
# Each word is listed in the exact casings that should match, because the
# main loop tests these with plain set membership.
aa_more = {
    " AA ",
    "Awakening", "awakening", "AWAKENING",
    "Gratitude", "gratitude", "GRATITUDE",
    "Reflections", "reflections",
    "Sobriety", "sobriety", "SOBRIETY",
}
#===============================================================#
f = open(sys.argv[1])

Expand Down Expand Up @@ -142,33 +148,10 @@ def remove_special(tokens):
for l in fp:
gambling.add(''.join(convert_letters(l.strip())))

# Load the keyword list files into their sets. Unless noted otherwise, each
# line is stripped, passed through convert_letters (defined elsewhere in
# this file), and the returned pieces are joined back into a single string.

# Adult keywords.
with open(PATH_TO_ADULT_LIST,'r') as fp:
for l in fp:
adult.add(''.join(convert_letters(l.strip())))

# Drug keywords.
with open(PATH_TO_DRUGS_LIST,'r') as fp:
for l in fp:
drugs.add(''.join(convert_letters(l.strip())))

# Alcohol keywords.
with open(PATH_TO_ALCOHOLICS_LIST,'r') as fp:
for l in fp:
alcoholics.add(''.join(convert_letters(l.strip())))

# AA keywords.
with open(PATH_TO_AA_LIST,'r') as fp:
for l in fp:
aa.add(''.join(convert_letters(l.strip())))

# Fraternity keywords, normalised form.
with open(PATH_TO_FRATERNITY_LIST,'r') as fp:
for l in fp:
fraternity_small.add(''.join(convert_letters(l.strip())))

# Same fraternity file read a second time, keeping the raw stripped lines
# (no convert_letters) in `fraternity`.
with open(PATH_TO_FRATERNITY_LIST,'r') as fp:
for l in fp:
fraternity.add(l.strip())





#===============================================================#
"""
Expand Down Expand Up @@ -219,207 +202,168 @@ def preprocessing(origtokens):
outputfile.close()


if(transactions < current or len(row) != 11):
if(transactions < current or len(row) < 11):
continue
tusername = row[6]
name = row[10]
name = row[10].strip()
firstname = lastname = name
## IDENTIFIED FROM Unames
note = row[1]
note = row[1]

lfirstname = firstname.lower()
llastname = lastname.lower()
lname = name.lower()



#print(tusername,lastname,firstname,name,note)
parts = tusername.split("-")
flag = 0
for username in parts:
if( username == "AA" or name == "AA" or " AA " in name or name.startswith("AA ") or name.endswith("AA")):
if(username not in filter_users):
filter_users[username] = {}
filter_users[username]['C'] = set()
filter_users[username]['C'].add("AA-U")
lusername = username.lower()
if(firstname == "AA" or lastname == "AA" or username == "AA" or name == "AA" or " AA " in name or name.startswith("AA ") or name.endswith("AA")):
if(tusername not in filter_users):
filter_users[tusername] = {}
filter_users[tusername]['C'] = set()
filter_users[tusername]['C'].add("AA-U")
flag = 1
break

elif(firstname in aa or lastname in aa or username in aa or name in aa):
if(username not in filter_users):
filter_users[username] = {}
filter_users[username]['C'] = set()
filter_users[username]['C'].add("AA-U")
elif(firstname in aa or lastname in aa or username in aa or name in aa or firstname in aa_more or lastname in aa_more or username in aa_more or name in aa_more):
if(tusername not in filter_users):
filter_users[tusername] = {}
filter_users[tusername]['C'] = set()
filter_users[tusername]['C'].add("AA-U")
flag = 1
break


elif(firstname in fraternity or lastname in fraternity or username in fraternity or name in fraternity):
if(username not in filter_users):
filter_users[username] = {}
filter_users[username]['C'] = set()
filter_users[username]['C'].add("F-U")
flag = 1
elif(lfirstname in aa or llastname in aa or lusername in aa or lname in aa or lfirstname in aa_more or llastname in aa_more or lusername in aa_more or lname in aa_more):
if(tusername not in filter_users):
filter_users[tusername] = {}
filter_users[tusername]['C'] = set()
filter_users[tusername]['C'].add("AA-U")
flag = 1
break



elif(firstname in gambling or lastname in gambling or username in gambling or name in gambling):
if(username not in filter_users):
filter_users[username] = {}
filter_users[username]['C'] = set()
filter_users[username]['C'].add("G-U")
flag = 1
break

elif(firstname in adult or lastname in adult or username in adult or name in adult):
if(username not in filter_users):
filter_users[username] = {}
filter_users[username]['C'] = set()
filter_users[username]['C'].add("A-U")
flag = 1
break

elif(firstname in drugs or lastname in drugs or username in drugs or name in drugs):
if(username not in filter_users):
filter_users[username] = {}
filter_users[username]['C'] = set()
filter_users[username]['C'].add("D-U")
flag = 1
break

elif(firstname in alcoholics or lastname in alcoholics or username in alcoholics or name in alcoholics):
if(username not in filter_users):
filter_users[username] = {}
filter_users[username]['C'] = set()
filter_users[username]['C'].add("AL-U")
flag = 1
if(tusername not in filter_users):
filter_users[tusername] = {}
filter_users[tusername]['C'] = set()
filter_users[tusername]['C'].add("G-U")
flag = 1
break

if(flag == 1):
continue


for l in fraternity:
if(len(l) > 3 and (firstname in l or lastname in l or tusername in l or name in l)):
if(username not in filter_users):
elif(lfirstname in gambling or llastname in gambling or lusername in gambling or lname in gambling):
if(tusername not in filter_users):
filter_users[tusername] = {}
filter_users[tusername]['C'] = set()
filter_users[tusername]['C'].add("F-U")
filter_users[tusername]['C'].add("G-U")
flag = 1
break

if(flag == 1):
continue

note = note.lower()
origtokens = nltk.word_tokenize(note)
origtokens = preprocessing(origtokens)

if(len(origtokens) > 30):
if(flag == 1):
continue


for l in aa:
if( l in name):
if(username not in filter_users):

parts = name.split(" ")
for username in parts:
lusername = username.lower()
if(firstname == "AA" or lastname == "AA" or username == "AA" or name == "AA" or " AA " in name or name.startswith("AA ") or name.endswith("AA")):
if(tusername not in filter_users):
filter_users[tusername] = {}
filter_users[tusername]['C'] = set()
filter_users[tusername]['C'].add("AA-U")
flag = 1
break
if(l in note):
if(username not in filter_users):

elif(firstname in aa or lastname in aa or username in aa or name in aa or firstname in aa_more or lastname in aa_more or username in aa_more or name in aa_more):
if(tusername not in filter_users):
filter_users[tusername] = {}
filter_users[tusername]['C'] = set()
filter_users[tusername]['C'].add("AA-N")
filter_users[tusername]['C'].add("AA-U")
flag = 1
break

if(flag == 1):
continue




for l in origtokens:

if(l in fraternity_small):
if(username not in filter_users):
elif(lfirstname in aa or llastname in aa or lusername in aa or lname in aa or lfirstname in aa_more or llastname in aa_more or lusername in aa_more or lname in aa_more):
if(tusername not in filter_users):
filter_users[tusername] = {}
filter_users[tusername]['C'] = set()
filter_users[tusername]['C'].add("F-N")
filter_users[tusername]['C'].add("AA-U")
flag = 1
break

elif(l in gambling):
if(username not in filter_users):
elif(firstname in gambling or lastname in gambling or username in gambling or name in gambling):
if(tusername not in filter_users):
filter_users[tusername] = {}
filter_users[tusername]['C'] = set()
filter_users[tusername]['C'].add("G-N")
filter_users[tusername]['C'].add("G-U")
flag = 1
break
elif(l in adult):
if(username not in filter_users):

elif(lfirstname in gambling or llastname in gambling or lusername in gambling or lname in gambling):
if(tusername not in filter_users):
filter_users[tusername] = {}
filter_users[tusername]['C'] = set()
filter_users[tusername]['C'].add("A-N")
filter_users[tusername]['C'].add("G-U")
flag = 1
break


elif(l in drugs):
if(username not in filter_users):
filter_users[tusername] = {}
filter_users[tusername]['C'] = set()
filter_users[tusername]['C'].add("D-N")
flag = 1
break
if(flag == 1):
continue

note = note.lower()
origtokens = nltk.word_tokenize(note)
origtokens = preprocessing(origtokens)

if(len(origtokens) > 30):
continue


name = name.lower()

elif(l in alcoholics):
for l in aa:
if( l in name):
if(username not in filter_users):
filter_users[tusername] = {}
filter_users[tusername]['C'] = set()
filter_users[tusername]['C'].add("AL-N")
filter_users[tusername]['C'].add("AA-U")
flag = 1
break


if(flag == 1):
continue

for l in gambling:
if(len(l) > 3 and (firstname in l or lastname in l or tusername in l or name in l)):
if(l in note):
if(username not in filter_users):
filter_users[tusername] = {}
filter_users[tusername]['C'] = set()
filter_users[tusername]['C'].add("G-U")
filter_users[tusername]['C'].add("AA-N")
flag = 1
break


if(flag == 1):
continue

for l in adult:
if(len(l) > 3 and (firstname in l or lastname in l or tusername in l or name in l)):
for l in origtokens:

if(l in gambling):
if(username not in filter_users):
filter_users[tusername] = {}
filter_users[tusername]['C'] = set()
filter_users[tusername]['C'].add("A-U")
filter_users[tusername]['C'].add("G-N")
flag = 1
break


if(flag == 1):
continue

for l in alcoholics:
if(len(l) > 3 and (firstname in l or lastname in l or tusername in l or name in l)):
for l in gambling:
if(len(l) > 3 and ((len(firstname) > 3 and firstname in l) or (len(lastname) > 3 and lastname in l) or (len(tusername) > 3 and tusername in l) or (len(name) > 3 and name in l))):
if(username not in filter_users):
filter_users[tusername] = {}
filter_users[tusername]['C'] = set()
filter_users[tusername]['C'].add("AL-U")
filter_users[tusername]['C'].add("G-U")
flag = 1
break


if(flag == 1):
continue
except:
continue

Expand Down

0 comments on commit 215c736

Please sign in to comment.