-
Notifications
You must be signed in to change notification settings - Fork 2
/
save.py
66 lines (51 loc) · 1.91 KB
/
save.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# -*- coding: utf-8 -*-
import os
import sys
import pickle
import random
import jieba
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score
from run import txt2data, delete_end_symbol
def get_features(max_features):
    """Fit a TF-IDF vectorizer on the corpus and return a train/validation split.

    Reads the module-level globals ``document`` and ``label`` (assigned in
    the ``__main__`` section of this script), so both must exist before this
    function is called.

    Side effect: the fitted vectorizer is pickled to
    ``model/vectorizer.pickle``; the ``model`` directory is created if it is
    missing.

    Args:
        max_features (int): Maximum number of features kept by the
            vectorizer (hyperparameter).

    Returns:
        tuple: ``(x_train, x_val, y_train, y_val)`` produced by
        ``train_test_split`` with ``test_size=0.3`` and a fixed
        ``random_state`` of 2022.
    """
    vectorizer = TfidfVectorizer(
        # Pattern accepts tokens of one or more word characters.
        token_pattern=r"(?u)\b\w+\b",
        max_features=max_features,
        # ngram_range=(1, 3),
    )
    vectorizer.fit(document)

    # Make sure the output directory exists, and close the file handle
    # deterministically instead of leaking it to the garbage collector.
    os.makedirs("model", exist_ok=True)
    with open("model/vectorizer.pickle", "wb") as vectorizer_file:
        pickle.dump(vectorizer, vectorizer_file)

    tfidf = vectorizer.transform(document)
    x_train, x_val, y_train, y_val = train_test_split(
        tfidf,
        label,
        test_size=0.3,
        shuffle=True,
        random_state=2022,
    )
    return x_train, x_val, y_train, y_val
def save_model():
    """Train a voting ensemble, pickle it, and print its validation macro-F1.

    Reads the module-level globals ``x_train``, ``y_train``, ``x_val`` and
    ``y_val`` (assigned in the ``__main__`` section of this script), so they
    must exist before this function is called.

    Side effects: the fitted ensemble is pickled to ``model/model.pickle``
    (the ``model`` directory is created if it is missing) and the macro
    averaged F1 score on the validation split is printed to stdout.

    Returns:
        None
    """
    # VotingClassifier is used with its default voting mode here; the
    # printed label below calls it "VotingHard".
    ensemble = VotingClassifier(
        estimators=[
            ('svc', LinearSVC()),
            ('tree', DecisionTreeClassifier()),
            ('ada', AdaBoostClassifier(random_state=2022)),
        ]
    )
    ensemble.fit(x_train, y_train)

    # Make sure the output directory exists, and close the file handle
    # deterministically instead of leaking it to the garbage collector.
    os.makedirs("model", exist_ok=True)
    with open("model/model.pickle", "wb") as model_file:
        pickle.dump(ensemble, model_file)

    val_pre = ensemble.predict(x_val)
    f1 = f1_score(y_val, val_pre, average='macro')
    print('TF-IDF + VotingHard : %.4f' % f1)
if __name__ == "__main__":
    # NOTE: the names assigned below are module-level globals that
    # get_features() and save_model() read directly, so neither the names
    # nor the order of these statements may change.

    # Load the raw corpus and labels, then post-process the documents with
    # delete_end_symbol (p=0.5).
    document, label = txt2data("data/all_data.txt", participle=True)
    document = delete_end_symbol(document, p=0.5)

    # Vectorize (at most 500 features) and split into train/validation.
    (x_train, x_val, y_train, y_val) = get_features(500)

    # Fit the ensemble, persist it, and report the validation score.
    save_model()