encoder.py
from __future__ import print_function

from csv import DictReader

import numpy as np
import tensorflow as tf
from keras.models import Model
from keras.layers import Input, LSTM, Concatenate
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle

from word2vec_gen import *

kTARGET_FIELD = 'correctAnswer'
kTEXT_FIELD = 'question'
kID_FIELD = 'id'
kA = 'answerA'
kB = 'answerB'
kC = 'answerC'
kD = 'answerD'
batch_size = 64 # Batch size for training.
epochs = 100 # Number of epochs to train for.
latent_dim = 256 # Latent dimensionality of the encoding space.
num_samples = 10000 # Number of samples to train on.
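
# NOTE: `word2vec_gen` is not part of this file. The commented sketch below is a
# hedged guess at the interface this script assumes: a `w2v` class whose
# `sv(word)` method returns a fixed-length (250-d, judging from the encoder
# input shape) embedding. It is illustrative only and kept commented out so it
# does not shadow the real import above.
#
# class w2v(object):
#     def __init__(self, path="word2vec.bin"):  # hypothetical model path
#         from gensim.models import KeyedVectors
#         self.kv = KeyedVectors.load_word2vec_format(path, binary=True)
#
#     def sv(self, word):
#         # Vector for `word`, or zeros for out-of-vocabulary tokens.
#         return self.kv[word] if word in self.kv else np.zeros(250)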
def formVector(data, w2v_model):
    # Build one (question vectors, concatenated answer vectors) pair per CSV row.
    x_train = []
    y_train = []
    for x in data:
        # One embedding per whitespace token of the question text.
        question_vectors = [w2v_model.sv(token) for token in x[kTEXT_FIELD].split()]
        # Stack the four answer embeddings into a single flat target vector.
        y = np.hstack((w2v_model.sv(x[kA]),
                       w2v_model.sv(x[kB]),
                       w2v_model.sv(x[kC]),
                       w2v_model.sv(x[kD])))
        x_train.append(question_vectors)
        y_train.append(y)
    return x_train, y_train
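
# Shapes produced by formVector (hedged; assumes w2v_model.sv returns a 250-d
# vector, as implied by Input(shape=(1, 250)) below):
#   x_train[i] -> list of per-token 250-d question vectors (variable length)
#   y_train[i] -> one concatenated 1000-d vector for answers A-D (4 * 250)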
if __name__ == "__main__":
    n = 0.7  # Fraction of rows used for training; the rest is held out.
    train = list(DictReader(open("data/filtered_train.csv", 'r')))
    train = shuffle(train)
    print("Total length: ", len(train))

    # Split the shuffled rows into train and held-out test sets.
    test = train[-int(len(train) * (1.0 - n)):]
    train = train[:int(len(train) * n)]

    # Word2vec wrapper used to embed question tokens and answer strings.
    w2v_model = w2v()
    x_train, y_train = formVector(train, w2v_model)
    x_test, y_test = formVector(test, w2v_model)

    input_texts = x_train
    target_texts = y_train
    max_encoder_seq_length = max([len(txt) for txt in input_texts])
    max_decoder_seq_length = max([len(txt) for txt in target_texts])
    # print('input len', max_encoder_seq_length)
    # Encoder: an LSTM over 250-d word vectors. We keep the final hidden and
    # cell states; `encoder_outputs` is the last hidden state itself.
    encoder_inputs = Input(shape=(1, 250))
    encoder = LSTM(latent_dim, return_state=True)
    encoder_outputs, state_h, state_c = encoder(encoder_inputs)
    encoder_states = [state_h, state_c]
    # Concatenate hidden and cell state into one fixed-length question encoding.
    concat_states = Concatenate(axis=-1)([state_h, state_c])
    print(concat_states)

    # Evaluating `concat_states` would require feeding `encoder_inputs`, so this
    # debugging block is kept commented out.
    # sess = tf.Session()
    # with sess.as_default():
    #     print(type(concat_states.eval()))
    #     print("x_train_before", x_train)

    # x_train = np.array(concat_states)
    y_train = np.array(y_train)
    model = Model(encoder_inputs, encoder_outputs)
    # print("x_train", x_train)
    # print("y_train", y_train[1])

    # Run training.
    model.compile(optimizer='rmsprop', loss='binary_crossentropy')
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2)
    # Leftover logistic-regression baseline and debugging code, kept commented out:
    # lr = LogisticRegression(C=1, penalty='l2', fit_intercept=True)
    # print(x_train.shape)
    # lr.fit(x_train, y_train)
    # sess = tf.Session()
    # with sess.as_default():
    #     print(type(concat_states.eval()))
    #     print("x_train_before", x_train)
    # print("x train val:", x_train[1])