-
Notifications
You must be signed in to change notification settings - Fork 0
/
Saici.ai.py
377 lines (185 loc) · 6.09 KB
/
Saici.ai.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
#!/usr/bin/env python
# coding: utf-8
# ### Import Necessary Libraries
import pickle
import re
import warnings

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')
# ### Import Dataset [Train, Test, Dev]
# Load the three tab-separated splits. The bare .head(5) expressions only
# render output when run inside a Jupyter cell.

### Training split
train_df = pd.read_csv('train.tsv', sep='\t')
train_df.head(5)

### Test split
test_df = pd.read_csv('test.tsv', sep='\t')
test_df.head(5)

### Dev (validation) split
dev_df = pd.read_csv('dev.tsv', sep='\t')
dev_df.head(5)
# ### Train Dataset Analysis & Preprocessing
# NOTE: the bare expressions below are notebook-style display cells; under
# plain `python` they evaluate and discard their value.
### Data shape (rows, columns)
train_df.shape
### Column dtypes and non-null counts
train_df.info()
### Summary statistics of the dataset
train_df.describe()
### Class balance of the sentiment label
train_df['label'].value_counts()
### Plot the label distribution for the train dataset
plt.style.use('fivethirtyeight')
sns.countplot(data=train_df,x='label')
### Drop duplicate rows (mutates train_df in place)
train_df.drop_duplicates(inplace = True)
train_df
train_df.describe()
# Re-plot and re-count the labels after deduplication to see the effect
plt.style.use('fivethirtyeight')
sns.countplot(data=train_df,x='label')
train_df['label'].value_counts()
### Count missing values per column
train_df.isnull().sum()
# ### Test Dataset Analysis and Preprocessing
# Same notebook-style EDA as the train split above.
### Test dataset shape (rows, columns)
test_df.shape
### Column dtypes and non-null counts
test_df.info()
### Summary statistics of the test dataset
test_df.describe()
### Class balance of the sentiment label
test_df['label'].value_counts()
### Plot the label distribution for the test dataset
plt.style.use('fivethirtyeight')
sns.countplot(data=test_df,x='label')
### Drop duplicate rows (mutates test_df in place)
test_df.drop_duplicates(inplace = True)
test_df
test_df.describe()
### Re-plot the label distribution after dropping duplicate rows
plt.style.use('fivethirtyeight')
sns.countplot(data=test_df,x='label')
test_df['label'].value_counts()
### Count missing values per column
test_df.isnull().sum()
# ### Dev (Validation) Dataset Analysis and Preprocessing
# Same notebook-style EDA as the train and test splits above.
### Data shape (rows, columns)
dev_df.shape
### Column dtypes and non-null counts
dev_df.info()
### Summary statistics of the dev dataset
dev_df.describe()
### Class balance of the sentiment label
dev_df['label'].value_counts()
### Plot the label distribution for the dev dataset
plt.style.use('fivethirtyeight')
sns.countplot(data=dev_df,x='label')
### Drop duplicate rows (mutates dev_df in place)
dev_df.drop_duplicates(inplace = True)
dev_df
dev_df.describe()
### Re-plot the label distribution after dropping duplicate rows
plt.style.use('fivethirtyeight')
sns.countplot(data=dev_df,x='label')
dev_df['label'].value_counts()
### Count missing values per column
dev_df.isnull().sum()
# ### Indentify Train , Test and Another dataet value
X_train=train_df['text_a'].values
Y_train=train_df['label'].values
X_test=test_df['text_a'].values
Y_test=test_df['label'].values
X_dev=dev_df['text_a'].values
Y_dev=dev_df['label'].values
# ### Analysis & Preprocessing Train and Test Dataset
# Shapes of the feature/label arrays for a quick sanity check.
(X_train.shape, Y_train.shape), (X_test.shape, Y_test.shape)

# Summary statistics of the text column in each split.
train_df.iloc[:, 1].describe()
test_df.iloc[:, 1].describe()

# Histogram of whitespace-token counts per review: train first, then test.
X_train_len = [len(str(review).split()) for review in X_train]
plt.hist(X_train_len)
X_test_len = [len(str(review).split()) for review in X_test]
plt.hist(X_test_len)
# ### Vectorise the text with a bag-of-words model
# Fit the vocabulary on the training split only, then reuse it to transform
# both splits so their feature spaces match.
# (Removed the unused keras-style leftovers vocab_size / embedding_dimension /
# turnc / oov_tok: nothing in this file ever read them.)
vect = CountVectorizer(stop_words=None)
vect.fit(X_train)
vect.vocabulary_
# BUG FIX: CountVectorizer.get_feature_names() was removed in scikit-learn
# 1.2; get_feature_names_out() is the supported replacement.
vect.get_feature_names_out()
# transform both splits with the train-fitted vocabulary
X_train_transformed = vect.transform(X_train)
X_test_tranformed = vect.transform(X_test)  # name typo kept: later cells reference it
print(X_test[:1])
print(X_test_tranformed)
# ### Bernoulli naive Bayes baseline
from sklearn.naive_bayes import BernoulliNB

# Instantiate and fit on the bag-of-words training matrix.
bnb = BernoulliNB()
bnb.fit(X_train_transformed, Y_train)

# Hard class predictions on the test split...
y_pred_class = bnb.predict(X_test_tranformed)
# ...and the per-class probabilities (used later for the ROC curve).
y_pred_proba = bnb.predict_proba(X_test_tranformed)
bnb
# ### Multinomial naive Bayes with a shared vocabulary
# BUG FIX: the original called cv.fit_transform on every split, giving each
# matrix its own incompatible vocabulary, and refit the model on each split
# before scoring it on the same data — which only measures training accuracy.
# Fit the vocabulary and the model on the training split once, then transform
# and score the held-out splits with them.
from sklearn.naive_bayes import MultinomialNB

cv = CountVectorizer()
x_train = cv.fit_transform(X_train)  # learn the vocabulary on train only
x_test = cv.transform(X_test)        # reuse the train vocabulary
x_dev = cv.transform(X_dev)

model = MultinomialNB()
model.fit(x_train, Y_train)

### Train dataset accuracy (in-sample)
model.score(x_train, Y_train)
### Test dataset accuracy (held-out)
model.score(x_test, Y_test)
### Dev dataset accuracy (held-out)
model.score(x_dev, Y_dev)
# ### Confusion-matrix evaluation of the Bernoulli NB test predictions
# BUG FIX: `metrics` was never imported, so every metrics.* call below raised
# NameError; it now comes from the top-of-file `from sklearn import metrics`.
# (Also removed a duplicate bare confusion_matrix expression and duplicated
# precision printouts.)
confusion = metrics.confusion_matrix(Y_test, y_pred_class)
print(confusion)

# Binary confusion-matrix layout: [row = actual, column = predicted].
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
TP = confusion[1, 1]

sensitivity = TP / float(FN + TP)  # recall of the positive class
print("sensitivity", sensitivity)
specificity = TN / float(TN + FP)  # recall of the negative class
print("specificity", specificity)
precision = TP / float(TP + FP)
print("precision", precision)

print("PRECISION SCORE :", metrics.precision_score(Y_test, y_pred_class))
print("RECALL SCORE :", metrics.recall_score(Y_test, y_pred_class))
print("F1 SCORE :", metrics.f1_score(Y_test, y_pred_class))
y_pred_proba
# ### ROC curve for the Bernoulli NB test probabilities
from sklearn.metrics import confusion_matrix as sk_confusion_matrix
from sklearn.metrics import roc_curve, auc

# Use the probability of the positive class (column 1) as the score.
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_test, y_pred_proba[:, 1])
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)

import matplotlib.pyplot as plt  # already imported at the top; harmless re-import
# BUG FIX: get_ipython() only exists inside IPython/Jupyter; calling it
# unconditionally crashed this exported script under plain `python`.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass  # not running under IPython; the inline backend is not needed

plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC')
plt.plot(false_positive_rate, true_positive_rate)
# ### Check Model With Predict
# Run two raw Chinese hotel reviews through the trained vectorizer + model.
review = ['机器背面似乎被撕了张什么标签,残胶还在。但是又看不出是什么标签不见了,该有的都在,怪','地理位置佳,在市中心。酒店服务好、早餐品种丰富。我住的商务数码房电脑宽带速度满意,房间还算干净,离湖南路小吃街近。']
cv_review = cv.transform(review)
model.predict(cv_review)

# ### Generate Pickle File
# BUG FIX: the original passed bare open(...) handles to pickle and never
# closed them; `with` guarantees the files are flushed and closed.
# Saving model
with open('saici_task.pkl', 'wb') as fh:
    pickle.dump(model, fh)
# Testing the model by loading it back from disk
with open('saici_task.pkl', 'rb') as fh:
    model1 = pickle.load(fh)
# ### Thank you Saici.ai Team