-
Notifications
You must be signed in to change notification settings - Fork 1
/
spam.py
128 lines (82 loc) · 2.93 KB
/
spam.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 23 01:41:48 2017
@author: Tushar
"""
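# Spam vs. ham SMS classification: compare several scikit-learn classifiers
# on bag-of-words (term count) features.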
import pandas as pd
import numpy as np
# Load the labelled messages; the first column of each row holds the class label.
df = pd.read_csv("spam.csv")
x = df.iloc[:, :].values

# Class balance over the whole dataset: every row whose label is not exactly
# 'ham' is counted as spam.
countham = sum(1 for row in x if row[0] == 'ham')
countspam = len(x) - countham
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. Column v2 supplies the message data
# and column v1 the labels; the fixed random_state keeps the split repeatable.
data_train, data_test, labels_train, labels_test = train_test_split(
    df["v2"], df["v1"], test_size=0.2, random_state=0
)
# Class balance of the training labels: every label that is not exactly 'ham'
# is counted as spam.
x = labels_train.iloc[:].values
ltrainham = sum(1 for label in x if label == 'ham')
ltrainspam = len(x) - ltrainham
# Class balance of the held-out test labels: every label that is not exactly
# 'ham' is counted as spam.
# These tallies get their own ltest* names; writing them into ltrainham and
# ltrainspam (as before) silently discarded the training-set counts.
x = labels_test.iloc[:].values
ltestham = sum(1 for label in x if label == 'ham')
ltestspam = len(x) - ltestham
# Sanity check: show the first rows of the training messages and of their labels.
print (data_train.head())
print (labels_train.head())
from sklearn.feature_extraction.text import CountVectorizer
# Build term-count features. The vocabulary is fitted on the training messages
# only; the test messages are then encoded with that same fitted vectorizer,
# so both matrices share one column layout.
vectorizer = CountVectorizer()
data_train_count = vectorizer.fit_transform(data_train)
data_test_count = vectorizer.transform(data_test)
import matplotlib.pyplot as plt
# Term statistics for the training set: total occurrences of each vocabulary
# term and its share of all counted occurrences.
# get_feature_names() was removed in scikit-learn 1.2, so prefer its
# replacement get_feature_names_out() and fall back only on older releases
# that do not provide it yet.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# Sum the sparse matrix column-wise directly rather than calling toarray()
# first, which would allocate a dense documents-by-terms array just to add up
# its columns. asarray/ravel flattens the result to a plain 1-D array.
occurrences = np.asarray(data_train_count.sum(axis=0)).ravel()

word_freq_df = pd.DataFrame({'term': terms, 'occurrences': occurrences})
word_freq_df['frequency'] = (
    word_freq_df['occurrences'] / word_freq_df['occurrences'].sum()
)

# Occurrence count per term, plotted against the row index of word_freq_df.
plt.plot(word_freq_df.occurrences)
plt.show()

print(data_train_count.shape, labels_train.shape, data_test_count.shape)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Logistic regression on the term-count features; fit() returns the estimator
# itself, so construction and training are chained.
classifier = LogisticRegression(random_state=0).fit(data_train_count, labels_train)
y_pred2 = classifier.predict(data_test_count)

# Evaluate on the held-out set: confusion matrix of true vs. predicted labels
# and the score reported by the estimator.
cm = confusion_matrix(labels_test, y_pred2)
sclog = classifier.score(data_test_count, labels_test)

# DataFrame views of the true and predicted labels (presumably for manual
# inspection). check keeps the index of labels_test, while check2 gets a
# fresh 0..n-1 index because y_pred2 is a plain array.
check2 = pd.DataFrame(y_pred2)
check = pd.DataFrame(labels_test)
from sklearn.neighbors import KNeighborsClassifier
# k-nearest neighbours with k=2, using the Minkowski metric with p=2.
# NOTE(review): an even k allows tied votes between the two classes - confirm
# that n_neighbors=2 is intended.
classifier1 = KNeighborsClassifier(n_neighbors=2, metric='minkowski', p=2)
classifier1.fit(data_train_count, labels_train)

# Predict the held-out set, then record the confusion matrix and score.
y_pred3 = classifier1.predict(data_test_count)
cm1 = confusion_matrix(labels_test, y_pred3)
scKN = classifier1.score(data_test_count, labels_test)

# Predicted labels as a DataFrame (presumably for manual inspection).
check3 = pd.DataFrame(y_pred3)
from sklearn.tree import DecisionTreeClassifier
# Single decision tree splitting on the entropy criterion; the fixed
# random_state keeps the fitted tree repeatable between runs.
classifier2 = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier2.fit(data_train_count, labels_train)

# Predict the held-out set, then record the confusion matrix and score.
y_pred4 = classifier2.predict(data_test_count)
cm2 = confusion_matrix(labels_test, y_pred4)
scDT = classifier2.score(data_test_count, labels_test)

# Predicted labels as a DataFrame (presumably for manual inspection).
check4 = pd.DataFrame(y_pred4)
from sklearn.ensemble import RandomForestClassifier
# Random forest of 55 entropy-criterion trees; the fixed random_state keeps
# the fitted ensemble repeatable between runs.
classifier3 = RandomForestClassifier(
    n_estimators=55, criterion='entropy', random_state=0
)
classifier3.fit(data_train_count, labels_train)
y_pred5 = classifier3.predict(data_test_count)

# Predicted labels as a DataFrame (presumably for manual inspection).
check5 = pd.DataFrame(y_pred5)

# The confusion matrix must be built from this model's own predictions
# (y_pred5); it previously reused y_pred4, so cm3 merely duplicated the
# decision tree's matrix.
cm3 = confusion_matrix(labels_test, y_pred5)
scRF = classifier3.score(data_test_count, labels_test)