# -*- coding: utf-8 -*-
"""Spam Classifier Final.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1Pa81askeFCOOx9tF5paCvZ6OdP0osgBr
**Spam Classifier Final**
"""
import os
import math
import nltk
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # wordnet data required by the lemmatizer in newer NLTK releases
# Modal verbs and mail-header terms that carry no spam/ham signal
neutral_words = ['could', 'might', 'would', 'may', 'shall', 'www', 'http', 'email', 'sent', 'send', 'subject']
special_characters = ['+', '-', '_', '?', '<=', '>=', '>', '<', '(', ')', '{', '}', '[', ']', '"', ';', ':', '!', '*', '@', '#', '$', '%', '&', '~', ',', '.', '\\', '/']
# Splits an email into word tokens, dollar amounts, and remaining non-space symbol runs
def make_tokens(email):
    return RegexpTokenizer(r'\w+|\$[\d\.]+|\S+').tokenize(email)
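# Illustrative only (hypothetical input): words, dollar amounts, and runs of
# other symbols come out as separate tokens, e.g.
#   make_tokens("Win $1000 now! reply@spam.com")
#   -> ['Win', '$1000', 'now', '!', 'reply', '@spam.com']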
# Removes mail IDs (tokens containing both '@' and '.')
def remove_mail_id(email):
    modified_email = []
    for m in email:
        if '@' not in m or '.' not in m:
            modified_email.append(m)
    return modified_email
# Lemmatizes each token; a single WordNetLemmatizer is reused rather than
# constructed once per token
lemmatizer = WordNetLemmatizer()
def lemmatize(email):
    for i in range(len(email)):
        email[i] = lemmatizer.lemmatize(email[i])
    return email
# Removes tokens that start with a formatting character
def remove_formatting(test_point):
    format_words = ['\\', '{', '}', '.', ',', ';', ':']
    modified_test_point = []
    for word in test_point:
        if word and word[0] not in format_words:  # guard against empty tokens
            modified_test_point.append(word)
    return modified_test_point
# Removes digits and punctuation characters from within each token
def remove_numbers_punctuations(test_point):
    punctuations = set(string.punctuation)
    modified_test_point = []
    for word in test_point:
        modified_test_point.append(''.join([i for i in word if not i.isdigit() and i not in punctuations]))
    return modified_test_point
# Removes stopwords, the listed special characters, and tokens shorter than 3 characters
def remove_stopwords(test_point):
    updated_stop_words = set(special_characters + neutral_words + list(stopwords.words('english')))
    modified_test_point = []
    for word in test_point:
        if word not in updated_stop_words and len(word) > 2:
            # str.translate needs a table built by str.maketrans; the original
            # passed string.punctuation directly, which is a no-op in Python 3
            modified_test_point.append(word.translate(str.maketrans('', '', string.punctuation)).lower())
    return modified_test_point
def train_model():
    # Importing the dataset; latin-1 avoids decode errors on raw mail bytes
    dataset = pd.read_csv("training_dataset.csv", encoding="latin-1")
    # Dropping rows containing NA values, then re-indexing so that positional
    # access below stays contiguous (dropna leaves gaps in the index)
    dataset.dropna(axis=0, how='any', inplace=True)
    dataset.reset_index(drop=True, inplace=True)
    rows, cols = dataset.shape
    n_ham = 0   # Number of ham mails
    n_spam = 0  # Number of spam mails
    # Maps "word: [no. of ham mails containing the word, no. of spam mails containing the word]"
    dictionary = {}
    for i in range(rows):
        if dataset.iloc[i, 1] == 1:
            n_spam += 1
        else:
            n_ham += 1
        email = dataset.iloc[i, 0]
        content = list(set(lemmatize(remove_stopwords(remove_numbers_punctuations(remove_formatting(remove_mail_id(make_tokens(email))))))))
        for word in content:
            if word not in dictionary:
                dictionary[word] = [0, 0]
            if dataset.iloc[i, 1] == 0:
                dictionary[word][0] += 1
            else:
                dictionary[word][1] += 1
    # Laplace smoothing: add 1 to each word's count in both categories so no
    # class-conditional probability is ever zero
    for word in dictionary:
        dictionary[word][0] += 1
        dictionary[word][1] += 1
    # Keep only words whose ham and spam counts differ enough (ratio > 1.7 in
    # either direction) to be informative for classification
    filtered_dictionary = {}
    for word in dictionary:
        if (dictionary[word][0] / dictionary[word][1]) > 1.7 or (dictionary[word][1] / dictionary[word][0]) > 1.7:
            filtered_dictionary[word] = [dictionary[word][0], dictionary[word][1]]
    # Probability table mapping "word: [P(word | ham), P(word | spam)]", estimated
    # as (smoothed mail count for the class) / (total mails in the class + 1)
    probability_table = {}
    for word in filtered_dictionary:
        probability_table[word] = [filtered_dictionary[word][0] / (n_ham + 1), filtered_dictionary[word][1] / (n_spam + 1)]
    return probability_table
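# Illustrative only (made-up numbers): the trained model is a plain dict such as
#   {'free': [0.004, 0.310], 'meeting': [0.120, 0.015], ...}
# where each word maps to its estimated frequency in ham and in spam mails.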
# Opens and reads every mail file in the given directory
def get_test_mails(dir_name):
    test_mails = []
    for file_name in os.listdir(dir_name):
        path = os.path.join(dir_name, file_name)
        if not os.path.isfile(path):
            continue
        with open(path, encoding='utf-8') as f:  # closes the handle the original leaked
            mail = f.read()
        name = os.path.splitext(file_name)[0]  # robust version of the original name[:-4]
        test_mails.append([name, mail])
    return test_mails
# Preprocesses raw mail content into a deduplicated list of cleaned tokens
def pre_process(content):
    return list(set(lemmatize(remove_stopwords(remove_numbers_punctuations(remove_formatting(remove_mail_id(make_tokens(content))))))))
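# Illustrative only (hypothetical input): the pipeline tokenizes, drops mail
# IDs and stopwords, strips digits and punctuation, lowercases, lemmatizes,
# and deduplicates, so e.g.
#   pre_process("Win $1000 now!! Limited offer, reply to win@spam.com")
# yields (in arbitrary set order) ['win', 'limited', 'offer', 'reply'].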
# Runs the Naive Bayes decision rule on preprocessed content. Probabilities are
# accumulated in log space to avoid the overflow/underflow that comes from
# multiplying many small probabilities (the original multiplied from a large
# constant starting value as a workaround)
def get_label(probability_table, content):
    log_prob_ham = log_prob_spam = math.log(0.5)  # equal class priors
    for word in content:
        if word in probability_table:
            log_prob_ham += math.log(probability_table[word][0])
            log_prob_spam += math.log(probability_table[word][1])
    # P(class | words) is proportional to P(class) * product of P(word | class)
    label = 1  # spam
    if log_prob_ham >= log_prob_spam:
        label = 0  # ham
    return label
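# Illustrative only (toy table with made-up numbers): the label follows
# whichever class scores higher on the words the model knows.
#   toy_table = {'free': [0.01, 0.40], 'meeting': [0.30, 0.02]}
#   get_label(toy_table, ['free', 'offer'])      # -> 1 (spam)
#   get_label(toy_table, ['meeting', 'agenda'])  # -> 0 (ham)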
# Invoke this function to classify every mail in the test folder
def classify():
    # Train the model
    model = train_model()
    test_mails = get_test_mails('test')
    # Write one "name,label" row per test mail; the with-block closes the file
    with open("output.csv", 'w') as f:
        f.write("Email,Label\n")
        for mail in test_mails:
            name = mail[0]
            content = mail[1]
            pre_processed_content = pre_process(content)
            label = get_label(model, pre_processed_content)
            f.write(name + ',' + str(label) + '\n')
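# Added entry point (not in the original notebook export) so the script can be
# run directly; it assumes training_dataset.csv and a test/ folder sit next to
# this file.
if __name__ == "__main__":
    classify()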