-
Notifications
You must be signed in to change notification settings - Fork 0
/
navie_bayes.py
74 lines (61 loc) · 2.81 KB
/
navie_bayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords
# English stop-word set from NLTK, built once when the module is imported.
# NOTE(review): not referenced anywhere in the visible part of this file -- confirm it is still needed.
stop_words = set(stopwords.words('english'))
from sklearn.pipeline import Pipeline
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import argparse
from sklearn.metrics import precision_recall_fscore_support, precision_score, recall_score
from sklearn.metrics import roc_auc_score
def get_text(file_name, encoding="utf-8"):
    """
    Read a text file and return its lines, lower-cased.

    Every line is returned exactly as read (including its trailing newline, if
    any), only converted to lower case.

    :param file_name: path of the text file to read
    :param encoding: text encoding used to decode the file; defaults to UTF-8 so
        the result does not depend on the platform's locale encoding
    :return: a list with one lower-cased string per line of the file
    """
    # Iterate the file object directly so the lines are streamed instead of
    # being materialised by readlines() and then copied into a second list.
    with open(file_name, "r", encoding=encoding) as f:
        return [line.lower() for line in f]
def get_result(max_length, data_dir="data/processed"):
    """
    Train one Naive Bayes classifier per label column and print test metrics.

    The train/test texts are converted to index sequences with a Keras
    ``Tokenizer(num_words=5000)`` fitted on the training texts, cut or padded to
    ``max_length`` tokens, converted back to strings and vectorised with TF-IDF.
    A separate ``MultinomialNB`` is fitted for every column of the 2-D label
    matrix. The element-wise accuracy, the macro-averaged
    precision/recall/F-score/support tuple and the ROC AUC are printed.

    :param max_length: number of tokens every text is cut or padded to
    :param data_dir: directory containing X_train.txt, X_test.txt, y_train.txt
        and y_test.txt
    :return: None; all results are printed to stdout
    """
    x_test = get_text(data_dir + "/X_test.txt")
    x_train = get_text(data_dir + "/X_train.txt")
    y_test = np.loadtxt(data_dir + "/y_test.txt", dtype=int)
    y_train = np.loadtxt(data_dir + "/y_train.txt", dtype=int)

    # word index from 1
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(x_train)
    x_train_cut_num = tokenizer.texts_to_sequences(x_train)
    x_test_cut_num = tokenizer.texts_to_sequences(x_test)

    # NOTE(review): short texts are padded with word index 4, which maps back to
    # a real vocabulary word in sequences_to_texts -- confirm this is intended.
    x_train_cut_num_pad = pad_sequences(x_train_cut_num, padding="post", maxlen=max_length, value=4)
    x_test_cut_num_pad = pad_sequences(x_test_cut_num, padding="post", maxlen=max_length, value=4)
    x_train_cut_text = tokenizer.sequences_to_texts(x_train_cut_num_pad)
    x_test_cut_text = tokenizer.sequences_to_texts(x_test_cut_num_pad)

    # The TF-IDF features do not depend on the label column, so they are fitted
    # once here instead of being refitted for every label inside the loop.
    vectorizer = TfidfVectorizer()
    train_features = vectorizer.fit_transform(x_train_cut_text)
    test_features = vectorizer.transform(x_test_cut_text)

    nb_total = 0
    nb_result_list = []
    for i in range(y_test.shape[1]):
        classifier = MultinomialNB(fit_prior=True, class_prior=None)
        classifier.fit(train_features, y_train[:, i])
        nb_predict = classifier.predict(test_features)
        nb_result_list.append(nb_predict)
        # Count the correct predictions for this label column.
        nb_total += np.sum(y_test[:, i] == nb_predict)

    # nb_result_list holds one (n_samples,) prediction vector per label, i.e. it
    # stacks to (n_labels, n_samples). It must be transposed -- not reshaped --
    # to line up with y_test's (n_samples, n_labels) layout; a reshape would
    # scramble which prediction belongs to which sample/label.
    nb_result_matrix = np.array(nb_result_list).T
    total_num = y_test.shape[0] * y_test.shape[1]

    print("navie bayes accuracy: ")
    print(nb_total / total_num)
    print("F1 score: ")
    print(precision_recall_fscore_support(y_test, nb_result_matrix, average='macro'))
    print("roc score: ")
    # NOTE(review): the ROC AUC is computed from hard class predictions rather
    # than predicted probabilities -- confirm this is the intended metric.
    print(roc_auc_score(y_test, nb_result_matrix))
def main():
    """Parse the ``max_length`` command-line argument and run the evaluation."""
    max_length_help = (
        "the truncation length of the text or query, usually longer "
        "than the average length of the text"
    )
    cli = argparse.ArgumentParser()
    cli.add_argument("max_length", type=int, help=max_length_help)
    options = cli.parse_args()
    get_result(options.max_length)


if __name__ == "__main__":
    main()