-
Notifications
You must be signed in to change notification settings - Fork 1
/
task_2_msg_len_plot.py
58 lines (49 loc) · 1.77 KB
/
task_2_msg_len_plot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from collections import Counter
import csv
from matplotlib import pyplot as plt
import numpy as np
# Path of the CSV corpus that main() opens (relative to the working directory).
DATA_FILE_NAME = 'sms-spam-corpus.csv'
# Message-type labels. main() compares the type column against SPAM and
# treats every other row as HAM; both are also passed to draw_chart() as the
# chart label.
SPAM = 'spam'
HAM = 'ham'
# CSV column holding the message type of a row.
MSG_TYPE_KEY = 'v1'
# CSV column holding the message text whose length is measured.
MSG_KEY = 'v2'
def calculate_avg(d):
    """Return the weighted average of the keys of *d*.

    Every key is weighted by its associated value, i.e. the result is
    ``sum(key * weight) / sum(weight)``. The weights may be raw counts or
    fractions; only their relative size matters.

    Args:
        d: Mapping of a numeric value to its weight.

    Returns:
        The weighted mean, or ``0.0`` when *d* is empty or its weights sum
        to zero (there is nothing to average in that case).
    """
    weighted_sum = 0
    total_weight = 0
    for value, weight in d.items():
        weighted_sum += value * weight
        total_weight += weight
    # Guard the division: an empty mapping would otherwise raise
    # ZeroDivisionError.
    if not total_weight:
        return 0.0
    return weighted_sum / total_weight
def _relative_frequencies(lengths):
    """Map each distinct length to the fraction of messages that have it.

    Keys are inserted in ``Counter.most_common()`` order, so the most
    frequent length comes first. An empty input yields an empty dict.
    """
    total = len(lengths)
    return {
        length: count / total
        for length, count in Counter(lengths).most_common()
    }


def main():
    """Chart the message-length distribution of spam and ham messages.

    Reads ``DATA_FILE_NAME`` as CSV, measures the character length of the
    ``MSG_KEY`` column of every row, groups the lengths by message type and
    passes the relative frequency of each length to ``draw_chart`` — first
    for spam, then for ham.
    """
    spam_lengths = []
    ham_lengths = []
    # newline='' lets the csv module do its own newline handling, which it
    # needs to parse quoted fields containing line breaks correctly.
    # NOTE(review): the file is decoded with the platform default encoding —
    # confirm that matches the corpus.
    with open(DATA_FILE_NAME, newline='') as csv_file:
        for row in csv.DictReader(csv_file):
            msg_len = len(row[MSG_KEY])
            # Every row that is not labelled as spam is counted as ham.
            if row[MSG_TYPE_KEY] == SPAM:
                spam_lengths.append(msg_len)
            else:
                ham_lengths.append(msg_len)

    # The file is closed at this point; build both distributions before any
    # chart is shown.
    spam_distribution = _relative_frequencies(spam_lengths)
    ham_distribution = _relative_frequencies(ham_lengths)
    draw_chart(spam_distribution, SPAM)
    draw_chart(ham_distribution, HAM)
def draw_chart(dict, msg_type):
    """Show a bar chart of a message-length distribution and print its mean.

    One bar is drawn per key of *dict*, in the mapping's iteration order;
    the keys are used as tick labels and the values as bar heights. After
    the chart has been shown, the weighted average of the keys (as computed
    by ``calculate_avg``) is printed.

    Args:
        dict: Mapping of message length to bar height. The name shadows the
            builtin but is kept because it is part of the signature.
        msg_type: Label used in the chart title and in the printed summary.
    """
    plt.style.use('fivethirtyeight')
    plt.title('{} message length chart'.format(msg_type))

    lengths = list(dict)
    heights = list(dict.values())
    # Bars sit at consecutive integer positions; the lengths themselves are
    # only attached as tick labels, so the x axis is not numerically scaled.
    x_indexes = np.arange(len(lengths))
    plt.xticks(x_indexes, lengths)
    # NOTE(review): the caller visible in this file passes fractions rather
    # than raw counts, so the "count" wording of the labels below is loose —
    # confirm the intended wording before changing these strings.
    plt.bar(x_indexes, heights, color='#006a71', label='count of msg by length')
    plt.xlabel("message length")
    plt.ylabel("count of messages")
    plt.show()

    # The value is the weighted mean of the keys (message lengths), so the
    # summary reports it as a length rather than a count.
    print('Average {} message length = {}'.format(msg_type, calculate_avg(dict)))
# Run the analysis only when the file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()