forked from ajtulloch/NaiveBayesSpamFilter
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMessage.py
131 lines (99 loc) · 4.28 KB
/
Message.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python
# encoding: utf-8
"""
Message.py
Created by Andrew Tulloch on 2010-04-27.
Copyright (c) 2010 Andrew Tulloch. All rights reserved.
"""
from utils import *
import math
import Stemmer
#------------------------------------------------------------------------------
class Message():
    """A single e-mail message, pre-processed for spam classification.

    Attributes
    subject - subject text after stemming and numeric filtering
    body - body text after stemming and numeric filtering
    subject_word_count - word --> count lookup for the subject,
                         as produced by utils.counter
    body_word_count - word --> count lookup for the body,
                      as produced by utils.counter
    filename - name of the file the message was loaded from
    spam - "Spam" or "Not Spam", derived from the filename
    tf_idf_scorelist - result of the most recent tf_idf() call
                       (only exists once tf_idf() has been called)"""

    def __init__(self, filename):
        """Load ./Data/<filename>, pre-process it and count its words.

        The first line supplies the subject, with its first nine
        characters dropped (presumably a "Subject: " prefix -- confirm
        against the data set).  The third line supplies the body; any
        further lines are ignored.  A file too short to contain one of
        these yields an empty string for that field rather than raising
        IndexError.
        """
        # The context manager closes the handle even if reading fails.
        with open("./Data/" + filename, 'r') as handle:
            data = handle.readlines()

        self.subject = data[0][9:].strip() if data else ""
        self.body = data[2].strip() if len(data) > 2 else ""

        # Stem first, then collapse numbers, so that the word counts
        # below are taken over the fully processed text.
        self.stem_data()
        self.numeric_filter()

        self.subject_word_count = counter(self.subject.split())
        self.body_word_count = counter(self.body.split())

        # spam_class() reads self.filename, so it must be set first.
        self.filename = filename
        self.spam = self.spam_class()

    def spam_class(self):
        """Classify the message from its filename.

        Returns "Spam" when the filename starts with 'spmsg',
        otherwise "Not Spam".
        """
        if self.filename.startswith('spmsg'):
            return "Spam"
        return "Not Spam"

    def stem_data(self):
        """Replace every word of the subject and body with its stem.

        Uses the 'english' stemmer of the Stemmer module.  Words are
        split on whitespace and re-joined with single spaces.
        """
        stemmer = Stemmer.Stemmer('english')

        def stem_string(string):
            """Return string with each word replaced by its stem."""
            return " ".join(stemmer.stemWord(word) for word in string.split())

        self.body = stem_string(self.body)
        self.subject = stem_string(self.subject)

    def numeric_filter(self):
        """Replace all-digit words in the subject and body with "NUMERIC".

        e.g. "call 112 now" ---> "call NUMERIC now"
        Words that merely contain digits (e.g. "4u") are left alone.
        Words are split on whitespace and re-joined with single spaces.
        """
        def num_filter_string(string):
            """Return string with each all-digit word replaced."""
            return " ".join("NUMERIC" if word.isdigit() else word
                            for word in string.split())

        self.body = num_filter_string(self.body)
        self.subject = num_filter_string(self.subject)

    def tf_idf(self, corpus):
        """Score the message against every feature word of a corpus.

        corpus must provide:
            top200 - iterable of (document frequency, word) pairs
            type - "subject" to score the subject; any other value
                   scores the body
            length - value divided by each document frequency
                     (presumably the number of documents -- confirm)

        Returns a list of [word, score] pairs in the order of
        corpus.top200, and also stores it as self.tf_idf_scorelist.
        Words absent from the message score 0.
        """
        if corpus.type == "subject":
            word_count = self.subject_word_count
        else:
            word_count = self.body_word_count

        self.tf_idf_scorelist = scores = []
        for document_frequency, word in corpus.top200:
            if word not in word_count:
                # Word does not appear in the message: tf-idf == 0
                scores.append([word, 0])
                continue
            idf = math.log10(corpus.length / float(document_frequency))
            # NOTE(review): the constant 1.0/200 offset looks tied to the
            # 200 selected features -- confirm before changing it.
            scores.append([word, word_count[word] * idf + 1.0/200])
        return scores
#------------------------------------------------------------------------------
def testing():
    """Placeholder hook run when the module is executed as a script.

    Currently does nothing and returns None.
    """
    return None
#------------------------------------------------------------------------------
# Run the testing() hook only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    testing()