# logistic_regression_train.py
################
### PREAMBLE ###
################
from __future__ import division
import tensorflow as tf
import numpy as np
import tarfile
import os
import matplotlib.pyplot as plt
import time
###################
### IMPORT DATA ###
###################
def csv_to_numpy_array(filePath, delimiter):
    return np.genfromtxt(filePath, delimiter=delimiter, dtype=None)

def import_data():
    if "data" not in os.listdir(os.getcwd()):
        # Untar directory of data if we haven't already
        tarObject = tarfile.open("data.tar.gz")
        tarObject.extractall()
        tarObject.close()
        print("Extracted tar to current directory")
    else:
        # we've already extracted the files
        pass
    print("loading training data")
    trainX = csv_to_numpy_array("data/trainX.csv", delimiter="\t")
    trainY = csv_to_numpy_array("data/trainY.csv", delimiter="\t")
    print("loading test data")
    testX = csv_to_numpy_array("data/testX.csv", delimiter="\t")
    testY = csv_to_numpy_array("data/testY.csv", delimiter="\t")
    return trainX, trainY, testX, testY

trainX, trainY, testX, testY = import_data()
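# A quick, optional sanity check on the loaded arrays (not in the original
# script): the row counts of X and Y must agree, or the feed_dicts below fail.
assert trainX.shape[0] == trainY.shape[0], "trainX/trainY row mismatch"
assert testX.shape[0] == testY.shape[0], "testX/testY row mismatch"
assert trainX.shape[1] == testX.shape[1], "train/test feature-count mismatch"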
#########################
### GLOBAL PARAMETERS ###
#########################
## DATA SET PARAMETERS
# Get our dimensions for our different variables and placeholders:
# numFeatures = the number of words extracted from each email
numFeatures = trainX.shape[1]
# numLabels = number of classes we are predicting (here just 2: Ham or Spam)
numLabels = trainY.shape[1]
## TRAINING SESSION PARAMETERS
# number of times we iterate through training data
# tensorboard shows that accuracy plateaus at ~25k epochs
numEpochs = 25000
# a smarter learning rate for gradientOptimizer: the base rate of 0.0008
# decays by 5% every trainX.shape[0] training steps. Note: global_step must
# be a variable that the optimizer increments (see training_OP below); a
# constant here would freeze the rate.
globalStep = tf.Variable(0, trainable=False, name="global_step")
learningRate = tf.train.exponential_decay(learning_rate=0.0008,
                                          global_step=globalStep,
                                          decay_steps=trainX.shape[0],
                                          decay_rate=0.95,
                                          staircase=True)
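# As a worked example of the schedule above: with staircase=True the rate is
# 0.0008 * 0.95 ** floor(globalStep / trainX.shape[0]), so it stays at 0.0008
# for the first trainX.shape[0] steps, then drops to 0.00076, and so on.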
####################
### PLACEHOLDERS ###
####################
# X = X-matrix / feature-matrix / data-matrix... It's a tensor to hold our email
# data. 'None' here means that we can hold any number of emails
X = tf.placeholder(tf.float32, [None, numFeatures])
# yGold = Y-matrix / label-matrix / labels... This will be our correct answers
# matrix. Every row has either [1,0] for SPAM or [0,1] for HAM. 'None' here
# means that we can hold any number of emails
yGold = tf.placeholder(tf.float32, [None, numLabels])
#################
### VARIABLES ###
#################
# Values are randomly sampled from a Gaussian with a standard deviation of:
# sqrt(6 / (numInputNodes + numOutputNodes + 1))
weights = tf.Variable(tf.random_normal([numFeatures, numLabels],
                                       mean=0,
                                       stddev=np.sqrt(6 / (numFeatures +
                                                           numLabels + 1))),
                      name="weights")
bias = tf.Variable(tf.random_normal([1, numLabels],
                                    mean=0,
                                    stddev=np.sqrt(6 / (numFeatures + numLabels + 1))),
                   name="bias")
######################
### PREDICTION OPS ###
######################
# INITIALIZE our weights and biases
init_OP = tf.initialize_all_variables()
# PREDICTION ALGORITHM i.e. FEEDFORWARD ALGORITHM
apply_weights_OP = tf.matmul(X, weights, name="apply_weights")
add_bias_OP = tf.add(apply_weights_OP, bias, name="add_bias")
activation_OP = tf.nn.sigmoid(add_bias_OP, name="activation")
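# In equation form, the three ops above compute the classic logistic
# regression hypothesis, applied element-wise across both label columns:
#     yHat = sigmoid(X . W + b)
# so every row of activation_OP holds two values in (0, 1), one per class.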
#####################
### EVALUATION OP ###
#####################
# COST FUNCTION i.e. SQUARED ERROR COST
# (tf.nn.l2_loss computes sum(t**2) / 2, half the summed squared error,
# not a mean)
cost_OP = tf.nn.l2_loss(activation_OP - yGold, name="squared_error_cost")
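# An aside, not used below: the textbook loss for logistic regression is
# sigmoid cross-entropy on the pre-sigmoid logits (add_bias_OP here).
# Assuming the TF 0.x positional signature (logits, targets), a drop-in
# alternative would be:
# cost_OP = tf.reduce_mean(
#     tf.nn.sigmoid_cross_entropy_with_logits(add_bias_OP, yGold),
#     name="xentropy_cost")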
#######################
### OPTIMIZATION OP ###
#######################
# OPTIMIZATION ALGORITHM i.e. GRADIENT DESCENT
training_OP = tf.train.GradientDescentOptimizer(learningRate).minimize(
    cost_OP, global_step=globalStep)
###########################
### GRAPH LIVE UPDATING ###
###########################
epoch_values=[]
accuracy_values=[]
cost_values=[]
# Turn on interactive plotting
plt.ion()
# Create the main, super plot
fig = plt.figure()
# Create two subplots on their own axes and give titles
ax1 = plt.subplot(211)
ax1.set_title("TRAINING ACCURACY", fontsize=18)
ax2 = plt.subplot(212)
ax2.set_title("TRAINING COST", fontsize=18)
plt.tight_layout()
#####################
### RUN THE GRAPH ###
#####################
# Create a tensorflow session
sess = tf.Session()
# Initialize all tensorflow variables
sess.run(init_OP)
## Ops for visualization
# argmax(activation_OP, 1) gives the label our model thought was most likely
# argmax(yGold, 1) is the correct label
correct_predictions_OP = tf.equal(tf.argmax(activation_OP,1),tf.argmax(yGold,1))
# False is 0 and True is 1, what was our average?
accuracy_OP = tf.reduce_mean(tf.cast(correct_predictions_OP, "float"))
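# For example, a prediction row of [0.8, 0.2] argmaxes to index 0 (SPAM); if
# the matching yGold row is [1, 0], that email scores 1.0 (correct), and
# accuracy_OP averages those 0/1 scores over the whole batch.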
# Summary op for regression output
activation_summary_OP = tf.histogram_summary("output", activation_OP)
# Summary op for accuracy
accuracy_summary_OP = tf.scalar_summary("accuracy", accuracy_OP)
# Summary op for cost
cost_summary_OP = tf.scalar_summary("cost", cost_OP)
# Summary ops to check how variables (W, b) are updating after each iteration
weightSummary = tf.histogram_summary("weights", weights)
biasSummary = tf.histogram_summary("biases", bias)
# Merge all summaries
all_summary_OPS = tf.merge_all_summaries()
# Summary writer
writer = tf.train.SummaryWriter("summary_logs", sess.graph_def)
# Initialize reporting variables
cost = 0
diff = 1

# Training epochs
for i in range(numEpochs):
    if i > 1 and diff < .0001:
        print("change in cost %g; convergence." % diff)
        break
    else:
        # Run training step
        step = sess.run(training_OP, feed_dict={X: trainX, yGold: trainY})
        # Report occasional stats
        if i % 10 == 0:
            # Add epoch to epoch_values
            epoch_values.append(i)
            # Generate accuracy stats on training data
            summary_results, train_accuracy, newCost = sess.run(
                [all_summary_OPS, accuracy_OP, cost_OP],
                feed_dict={X: trainX, yGold: trainY}
            )
            # Add accuracy to live graphing variable
            accuracy_values.append(train_accuracy)
            # Add cost to live graphing variable
            cost_values.append(newCost)
            # Write summary stats to writer
            writer.add_summary(summary_results, i)
            # Re-assign values for variables
            diff = abs(newCost - cost)
            cost = newCost
            # Generate print statements
            print("step %d, training accuracy %g" % (i, train_accuracy))
            print("step %d, cost %g" % (i, newCost))
            print("step %d, change in cost %g" % (i, diff))
            # Plot progress to our two subplots
            accuracyLine, = ax1.plot(epoch_values, accuracy_values)
            costLine, = ax2.plot(epoch_values, cost_values)
            fig.canvas.draw()
            time.sleep(1)
# How well do we perform on held-out test data?
print("final accuracy on test set: %s" %str(sess.run(accuracy_OP,
feed_dict={X: testX,
yGold: testY})))
##############################
### SAVE TRAINED VARIABLES ###
##############################
# Create Saver
saver = tf.train.Saver()
# Save variables to .ckpt file
# saver.save(sess, "trained_variables.ckpt")
############################
### MAKE NEW PREDICTIONS ###
############################
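# A minimal sketch of scoring one new email, assuming `newEmailFeatures` is a
# hypothetical (1, numFeatures) float array built by the same word-count
# pipeline as trainX. Run it before sess.close() below, or first restore the
# .ckpt saved above into a fresh session.
# probabilities = sess.run(activation_OP, feed_dict={X: newEmailFeatures})
# prediction = np.argmax(probabilities, 1)  # 0 = SPAM, 1 = HAM, per yGold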
# Close tensorflow session
sess.close()
# To view tensorboard:
# 1. run: tensorboard --logdir=/path/to/log-directory
# 2. open your browser to http://localhost:6006/
# See tutorial here for graph visualization:
# https://www.tensorflow.org/versions/0.6.0/how_tos/graph_viz/index.html