-
Notifications
You must be signed in to change notification settings - Fork 18
/
main_tf.py
67 lines (43 loc) · 2.16 KB
/
main_tf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from lib import features_word2vec, model_lstm_tf
import pandas as pd
import os
# The next steps:
# 1. Check
# Pipeline overview — data_prep() will ingest/clean the data and save:
# 1. cleaned text translated to array of word indices: imdb_indices.pickle
# 2. word2vec model, where the indices/word vecs are stored: 300features_40minwords_10context
# 3. word embeddings: this is the index to wordvec mapping derived from 2.
# ingestion clean data
# create word embedding
# create word indices that can be mapped to word embedding

# Input data (Kaggle "Bag of Words Meets Bags of Popcorn", TSV format).
labeled_data_path = "./data/labeledTrainData.tsv"
unlabeled_data_path = "./data/unlabeledTrainData.tsv"
# Artifacts produced by data_prep(); their presence is checked in __main__
# to decide whether the preprocessing step needs to run.
model_path = "./model/300features_40minwords_10context"
embedding_path = "./model/embedding_weights.pkl"
text2indices_path = "./model/imdb_indices.pickle"
def data_prep():
    """Ingest the IMDB review data and build the word2vec artifacts.

    Trains a word2vec model on the combined labeled + unlabeled reviews,
    then converts the labeled reviews into padded index sequences.

    Returns:
        model: the trained word2vec model (also saved to ``model_path``).
        embedding_weights: index -> word-vector matrix derived from the model.
        features: per-review arrays of word indices, padded/truncated to
            length 500 (also written to ``text2indices_path``).
    """
    # Use the Kaggle Bag of Words vs Bag of Popcorn data:
    # https://www.kaggle.com/c/word2vec-nlp-tutorial/data
    data = pd.read_csv(labeled_data_path, header=0,
                       delimiter="\t", quoting=3, encoding="utf-8")
    data2 = pd.read_csv(unlabeled_data_path, header=0,
                        delimiter="\t", quoting=3, encoding="utf-8")

    # Labeled and unlabeled reviews are combined to train word2vec.
    # DataFrame.append was removed in pandas 2.0 — use pd.concat instead.
    combined = pd.concat([data, data2], ignore_index=True)

    model = features_word2vec.get_word2vec_model(
        combined, "review", num_features=300, downsampling=1e-3,
        model_path=model_path)
    embedding_weights = features_word2vec.create_embedding_weights(model)

    # Only the labeled reviews are converted to index sequences.
    # Use the module-level constant instead of a duplicated literal so the
    # existence check in __main__ always matches the file actually written.
    features = features_word2vec.get_indices_word2vec(
        data, "review", model, maxLength=500,
        writeIndexFileName=text2indices_path, padLeft=True)
    return model, embedding_weights, features
if __name__ == '__main__':
    # Regenerate the word2vec / embedding / index artifacts whenever any of
    # the expected files is missing.
    artifacts = (model_path, embedding_path, text2indices_path)
    if not all(os.path.isfile(path) for path in artifacts):
        data_prep()

    #model1 = model_lstm_tf.LstmTFModel(useAttention=True, restore = False)
    #model1.train_epochs(1)
    #model1.test()

    # NOTE(review): restore=True assumes a saved LSTM checkpoint already
    # exists — data_prep() only creates the word2vec artifacts; confirm the
    # first run does not need restore=False (see commented-out lines above).
    model2 = model_lstm_tf.LstmTFModel(useAttention=True, restore = True)
    model2.train_epochs(5)
    model2.test()
    model2.plot_attention()