# -*- coding: utf-8 -*-
"""Spam Classifier Final.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1Pa81askeFCOOx9tF5paCvZ6OdP0osgBr
**Spam Classifier Final**
"""
import os
import math
import nltk
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # wordnet data required by the lemmatizer in newer NLTK releases
# Modal verbs and mail-header terms that carry no spam/ham signal
neutral_words = ['could', 'might', 'would', 'may', 'shall', 'www', 'http', 'email', 'sent', 'send', 'subject']
special_characters = ['+', '-', '_', '?', '<=', '>=', '>', '<', '(', ')', '{', '}', '[', ']', '"', ';', ':', '!', '*', '@', '#', '$', '%', '&', '~', ',', '.', '\\', '/']
# Splits an email into word tokens, dollar amounts, and remaining non-space symbol runs
def make_tokens(email):
    return RegexpTokenizer(r'\w+|\$[\d\.]+|\S+').tokenize(email)
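# Illustrative only (hypothetical input): words, dollar amounts, and runs of
# other symbols come out as separate tokens, e.g.
#   make_tokens("Win $1000 now! reply@spam.com")
#   -> ['Win', '$1000', 'now', '!', 'reply', '@spam.com']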
# Removes mail IDs (tokens containing both '@' and '.')
def remove_mail_id(email):
    modified_email = []
    for m in email:
        if '@' not in m or '.' not in m:
            modified_email.append(m)
    return modified_email
# Lemmatizes each token; a single WordNetLemmatizer is reused rather than
# constructed once per token
lemmatizer = WordNetLemmatizer()
def lemmatize(email):
    for i in range(len(email)):
        email[i] = lemmatizer.lemmatize(email[i])
    return email
# Removes tokens that start with a formatting character
def remove_formatting(test_point):
    format_words = ['\\', '{', '}', '.', ',', ';', ':']
    modified_test_point = []
    for word in test_point:
        if word and word[0] not in format_words:  # guard against empty tokens
            modified_test_point.append(word)
    return modified_test_point
# Removes digits and punctuation characters from within each token
def remove_numbers_punctuations(test_point):
    punctuations = set(string.punctuation)
    modified_test_point = []
    for word in test_point:
        modified_test_point.append(''.join([i for i in word if not i.isdigit() and i not in punctuations]))
    return modified_test_point
# Removes stopwords, the listed special characters, and tokens shorter than 3 characters
def remove_stopwords(test_point):
    updated_stop_words = set(special_characters + neutral_words + list(stopwords.words('english')))
    modified_test_point = []
    for word in test_point:
        if word not in updated_stop_words and len(word) > 2:
            # str.translate needs a table built by str.maketrans; the original
            # passed string.punctuation directly, which is a no-op in Python 3
            modified_test_point.append(word.translate(str.maketrans('', '', string.punctuation)).lower())
    return modified_test_point
def train_model():
    # Importing the dataset; latin-1 avoids decode errors on raw mail bytes
    dataset = pd.read_csv("training_dataset.csv", encoding="latin-1")
    # Dropping rows containing NA values, then re-indexing so that positional
    # access below stays contiguous (dropna leaves gaps in the index)
    dataset.dropna(axis=0, how='any', inplace=True)
    dataset.reset_index(drop=True, inplace=True)
    rows, cols = dataset.shape
    n_ham = 0   # Number of ham mails
    n_spam = 0  # Number of spam mails
    # Maps "word: [no. of ham mails containing the word, no. of spam mails containing the word]"
    dictionary = {}
    for i in range(rows):
        if dataset.iloc[i, 1] == 1:
            n_spam += 1
        else:
            n_ham += 1
        email = dataset.iloc[i, 0]
        content = list(set(lemmatize(remove_stopwords(remove_numbers_punctuations(remove_formatting(remove_mail_id(make_tokens(email))))))))
        for word in content:
            if word not in dictionary:
                dictionary[word] = [0, 0]
            if dataset.iloc[i, 1] == 0:
                dictionary[word][0] += 1
            else:
                dictionary[word][1] += 1
    # Laplace smoothing: add 1 to each word's count in both categories so no
    # class-conditional probability is ever zero
    for word in dictionary:
        dictionary[word][0] += 1
        dictionary[word][1] += 1
    # Keep only words whose ham and spam counts differ enough (ratio > 1.7 in
    # either direction) to be informative for classification
    filtered_dictionary = {}
    for word in dictionary:
        if (dictionary[word][0] / dictionary[word][1]) > 1.7 or (dictionary[word][1] / dictionary[word][0]) > 1.7:
            filtered_dictionary[word] = [dictionary[word][0], dictionary[word][1]]
    # Probability table mapping "word: [P(word | ham), P(word | spam)]", estimated
    # as (smoothed mail count for the class) / (total mails in the class + 1)
    probability_table = {}
    for word in filtered_dictionary:
        probability_table[word] = [filtered_dictionary[word][0] / (n_ham + 1), filtered_dictionary[word][1] / (n_spam + 1)]
    return probability_table
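# Illustrative only (made-up numbers): the trained model is a plain dict such as
#   {'free': [0.004, 0.310], 'meeting': [0.120, 0.015], ...}
# where each word maps to its estimated frequency in ham and in spam mails.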
# Opens and reads every mail file in the given directory
def get_test_mails(dir_name):
    test_mails = []
    for file_name in os.listdir(dir_name):
        path = os.path.join(dir_name, file_name)
        if not os.path.isfile(path):
            continue
        with open(path, encoding='utf-8') as f:  # closes the handle the original leaked
            mail = f.read()
        name = os.path.splitext(file_name)[0]  # robust version of the original name[:-4]
        test_mails.append([name, mail])
    return test_mails
# Preprocesses raw mail content into a deduplicated list of cleaned tokens
def pre_process(content):
    return list(set(lemmatize(remove_stopwords(remove_numbers_punctuations(remove_formatting(remove_mail_id(make_tokens(content))))))))
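# Illustrative only (hypothetical input): the pipeline tokenizes, drops mail
# IDs and stopwords, strips digits and punctuation, lowercases, lemmatizes,
# and deduplicates, so e.g.
#   pre_process("Win $1000 now!! Limited offer, reply to win@spam.com")
# yields (in arbitrary set order) ['win', 'limited', 'offer', 'reply'].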
# Runs the Naive Bayes decision rule on preprocessed content. Probabilities are
# accumulated in log space to avoid the overflow/underflow that comes from
# multiplying many small probabilities (the original multiplied from a large
# constant starting value as a workaround)
def get_label(probability_table, content):
    log_prob_ham = log_prob_spam = math.log(0.5)  # equal class priors
    for word in content:
        if word in probability_table:
            log_prob_ham += math.log(probability_table[word][0])
            log_prob_spam += math.log(probability_table[word][1])
    # P(class | words) is proportional to P(class) * product of P(word | class)
    label = 1  # spam
    if log_prob_ham >= log_prob_spam:
        label = 0  # ham
    return label
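# Illustrative only (toy table with made-up numbers): the label follows
# whichever class scores higher on the words the model knows.
#   toy_table = {'free': [0.01, 0.40], 'meeting': [0.30, 0.02]}
#   get_label(toy_table, ['free', 'offer'])      # -> 1 (spam)
#   get_label(toy_table, ['meeting', 'agenda'])  # -> 0 (ham)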
# Invoke this function to classify every mail in the test folder
def classify():
    # Train the model
    model = train_model()
    test_mails = get_test_mails('test')
    # Write one "name,label" row per test mail; the with-block closes the file
    with open("output.csv", 'w') as f:
        f.write("Email,Label\n")
        for mail in test_mails:
            name = mail[0]
            content = mail[1]
            pre_processed_content = pre_process(content)
            label = get_label(model, pre_processed_content)
            f.write(name + ',' + str(label) + '\n')
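# Added entry point (not in the original notebook export) so the script can be
# run directly; it assumes training_dataset.csv and a test/ folder sit next to
# this file.
if __name__ == "__main__":
    classify()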