-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathnmf-train-save.py
133 lines (107 loc) · 5.48 KB
/
nmf-train-save.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# (setq python-shell-interpreter "~/python-environments/ml/bin/python")
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
import csv
import numpy as np
def save_model(dictionary, feature_names, filename):
    """Write a trained model to ``filename`` as a numpy ``.npz`` archive.

    The archive holds two arrays, stored under the keys ``dictionary`` and
    ``feature_names``. Use load_model to read them back.
    """
    arrays = {"dictionary": dictionary, "feature_names": feature_names}
    # Hand numpy an open file object so the data lands at exactly
    # ``filename``.
    with open(filename, "wb") as out:
        np.savez(out, **arrays)
def load_model(filename):
    """Read back a model that was written by save_model.

    Returns a tuple ``(dictionary, feature_names)`` of numpy arrays.
    """
    with open(filename, "rb") as handle:
        archive = np.load(handle)
        # Pull both arrays out while the file is still open: the npz
        # archive reads its members lazily on access.
        topic_matrix = archive["dictionary"]
        vocabulary = archive["feature_names"]
    return topic_matrix, vocabulary
def print_top_words(dictionary, feature_names, n_top_words):
    """Print the n_top_words highest-weighted words of every topic.

    Each row of ``dictionary`` is one topic; ``feature_names[i]`` is the
    word that belongs to column ``i``. One line is printed per topic,
    followed by a single blank line at the end.
    """
    for topic_idx, weights in enumerate(dictionary):
        # argsort is ascending, so reverse it to get the heaviest words first
        ranked = weights.argsort()[::-1]
        top_words = [feature_names[col] for col in ranked[:n_top_words]]
        header = "Topic #%d: " % topic_idx
        print(header + " ".join(top_words))
    print()
def train_model(data, n_features, n_components):
    """Train a tf-idf + NMF topic model on data.

    data is a list of strings. They are converted into a tf-idf weighted
    bag of words representation (English stop words removed, at most
    n_features words kept) and then factorised with NMF into n_components
    topics.

    Returns a tuple ``(dictionary, features)``: the NMF components matrix
    (np array, one row per topic) and a list of the words that label the
    columns of that matrix.
    """
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                 max_features=n_features,
                                 stop_words='english')
    tfidf = vectorizer.fit_transform(data)

    # Regularisation strength, expressed in the pre-1.0 scikit-learn ``alpha``
    # convention that this script was written against.
    alpha = .1
    n_samples, n_terms = tfidf.shape
    try:
        # scikit-learn >= 1.0: ``alpha`` was replaced by alpha_W / alpha_H
        # (and removed in 1.2). Per the NMF documentation the new penalties
        # are multiplied by n_features and n_samples respectively, so divide
        # to keep the original strength.
        nmf = NMF(n_components=n_components, random_state=1,
                  alpha_W=alpha / n_terms, alpha_H=alpha / n_samples,
                  l1_ratio=.5)
    except TypeError:
        # Older scikit-learn that only knows the single ``alpha`` keyword.
        nmf = NMF(n_components=n_components, random_state=1,
                  alpha=alpha, l1_ratio=.5)
    nmf.fit(tfidf)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and convert to a plain list of str so the result is the
    # same type on every version.
    if hasattr(vectorizer, "get_feature_names_out"):
        features = vectorizer.get_feature_names_out().tolist()
    else:
        features = vectorizer.get_feature_names()
    dictionary = nmf.components_
    return dictionary, features
def train_save_model(data, n_features, n_components, filename):
    """Train a model with train_model and store it with save_model.

    data is the list of strings to train on, n_features the number of
    words in the tfidf model and n_components the number of nmf topics.
    The model is written to filename (readable again with load_model) and
    also returned as ``(dictionary, features)``.
    """
    topic_matrix, vocabulary = train_model(data, n_features, n_components)
    save_model(topic_matrix, vocabulary, filename)
    return topic_matrix, vocabulary
def load_shakes(filename, encoding="utf8"):
    """Load the Shakespeare data set, one list entry per line of the file.

    Data from https://www.kaggle.com/kingburrito666/shakespeare-plays#alllines.txt

    filename is the path of the text file. encoding is the text encoding
    used to decode it; it defaults to utf8 (the same encoding load_delta
    uses) so the result does not depend on the platform's default locale.

    Returns a list of strings with leading/trailing whitespace stripped.
    """
    with open(filename, "r", encoding=encoding) as f:
        return [line.strip() for line in f]
def load_delta(filename, method="append"):
    """Load the delta data from a tab-separated file with a header row.

    Column 1 of each row is used as the question and column 2 as the
    answer. Depending on method, each entry of the result is either
    question + answer separated with a space ('append'), the question
    ('question'), or the answer ('answer').

    Returns a list of strings, or None (after printing an error message)
    if method is not one of the three accepted values. Blank lines in the
    file are skipped.
    """
    extractors = {
        "append": lambda row: row[1] + " " + row[2],
        "question": lambda row: row[1],
        "answer": lambda row: row[2],
    }
    # Validate up front so a bad method is reported even when the file
    # contains no data rows.
    if method not in extractors:
        print("Error: method must be either 'append', 'question', or 'answer'")
        return None
    extract = extractors[method]

    # newline="" lets the csv module do its own newline handling, as the
    # csv documentation requires for files passed to csv.reader.
    with open(filename, "r", encoding="utf8", newline="") as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader, None)  # ignore headers
        # csv.reader yields [] for a blank line; skip those rows
        return [extract(row) for row in reader if row]
def load_news():
    """Return the documents of the 20 newsgroups data set as a list.

    The data is fetched through sklearn's fetch_20newsgroups, shuffled
    with a fixed random_state of 1, with headers, footers and quotes
    removed from every post.
    """
    stripped_parts = ('headers', 'footers', 'quotes')
    return fetch_20newsgroups(shuffle=True, random_state=1,
                              remove=stripped_parts).data
def main(base_dir="C:/Users/wxwyl/Desktop/wylcode/nmf-train-save"):
    """Train, save and print a topic model for each of the three data sets.

    base_dir is the project directory. The Shakespeare and Delta input
    files are read from ``base_dir/data`` and the trained models are
    written to ``base_dir`` as ``news-nmf.npz``, ``shakes-nmf.npz`` and
    ``delta-nmf.npz``. The default keeps the location the script was
    originally hard-coded to.
    """
    # number of top words to use in tfidf representation of data
    n_features = 1000
    # number of topics in NMF
    n_components = 10
    # number of words printed for each topic
    n_top_words = 10

    data_dir = base_dir + "/data"
    # (label used in the progress message, zero-argument loader, model file)
    # Loaders are wrapped so each data set is only read when its turn comes.
    jobs = [
        ("20 news groups", load_news, "news-nmf.npz"),
        ("Shakespeare",
         lambda: load_shakes(data_dir + "/alllines.txt"),
         "shakes-nmf.npz"),
        ("Delta",
         lambda: load_delta(data_dir + "/delta-train.tsv"),
         "delta-nmf.npz"),
    ]
    for label, load_data, model_name in jobs:
        print("Loading, training, and saving %s model" % label)
        data = load_data()
        # train model and save it
        dictionary, features = train_save_model(
            data, n_features, n_components, base_dir + "/" + model_name)
        print_top_words(dictionary, features, n_top_words)
if __name__ == "__main__":
    # Only run the training pipeline when executed as a script, so the
    # helper functions can be imported without side effects.
    main()
#c = np.load("C:/Users/wxwyl/Desktop/wylcode/nmf-train-save/delta-nmf.npz")["dictionary"]
#d = np.load("C:/Users/wxwyl/Desktop/wylcode/nmf-train-save/delta-nmf.npz")["feature_names"]
#print(len(c[0]),len(c))
#print(len(d))
#print(c)
#print(d)