import numpy as np
from sklearn.datasets import fetch_mldata
notice = '''
**************************************************************************
The primary purpose of this convolutional net, along with the other neural
networks in this repository, is to facilitate learning; being fit for the
task at hand is only a secondary objective.
I implemented this NN in order to learn the math behind convolutional
and pooling layers. The architecture of this NN, especially given the
fully connected layer between the max pooling layer and the softmax layer
(which further increases the number of parameters), is not great.
Nonetheless, this convolutional neural network, implemented solely using
basic matrix operations, works and can learn. The best result I achieved
on my laptop was a 6.98% classification error on the test set after
training for 2 epochs. The training took around 4 hours.
If you execute the script, the default values will run the network for
one full epoch. The running time depends on your hardware; it might take
as much as two hours or longer.
**************************************************************************
'''
print(notice)
################################
# Hyperparameters
################################
# Count of hidden units in the fully connected layer
no_of_hidden_units = 30
################################
# Preparing the data set
################################
# Fetching the dataset and performing minor normalization
# to help with training
print('\nFetching MNIST dataset. Please wait...\n')
dataset = fetch_mldata('MNIST original', data_home='datasets')
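# Note: fetch_mldata was removed in scikit-learn 0.22; on newer versions,
# fetch_openml('mnist_784', version=1, as_frame=False) is the closest
# replacement (it returns the labels as strings, which the .astype(int)
# calls below already handle).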
dataset.data = dataset.data / 255
# Shuffling the ids to prepare for creation of train and test sets
ids = np.arange(len(dataset.data))
np.random.shuffle(ids)
# The full dataset consists of 70000 labelled examples.
# We will use 60000 examples for training and 10000 for our test set.
n_rows_train = 60000
n_rows_test = len(dataset.target) - n_rows_train
data_train = dataset.data[ids[:n_rows_train], :]
targets_train = np.zeros((n_rows_train, 10))
targets_train[np.arange(n_rows_train), dataset.target[ids[:n_rows_train]].astype(int)] = 1
data_test = dataset.data[ids[n_rows_train:], :]
targets_test = np.zeros((n_rows_test, 10))
targets_test[np.arange(n_rows_test), dataset.target[ids[n_rows_train:]].astype(int)] = 1
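# A quick sanity check on the split above (a minimal sketch): verifying the
# shapes and that each target row is a valid one-hot vector.
assert data_train.shape == (n_rows_train, 784)
assert data_test.shape == (n_rows_test, 784)
assert np.all(targets_train.sum(axis=1) == 1)
assert np.all(targets_test.sum(axis=1) == 1)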
################################
# Weights initialization
################################
print('Initializing weights\n')
# Initializing a set of weights for each of the 6 feature maps
c1_weights = np.random.random((6, 5, 5)) - 0.5
# Initializing weights for the fully connected layer before softmax
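# (865 inputs = 6 pooled maps of 12x12 = 864 activations, plus 1 bias unit)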
f3_weights = np.random.random((865, no_of_hidden_units)) - 0.5
# Initializing weights connecting the fully connected layer to softmax group
softmax_weights = np.random.random((no_of_hidden_units + 1, 10)) - 0.5
def reset_weights():
    global c1_weights, f3_weights, softmax_weights
    # Initializing a set of weights for each of the 6 feature maps
    c1_weights = np.random.random((6, 5, 5)) - 0.5
    # Initializing weights for the fully connected layer before softmax
    f3_weights = np.random.random((865, no_of_hidden_units)) - 0.5
    # Initializing weights connecting the fully connected layer to softmax group
    softmax_weights = np.random.random((no_of_hidden_units + 1, 10)) - 0.5
################################
# Function definitions
################################
def forward_pass(image):
    # Runs the network forward on a single example, returning the layer
    # states along the way, including the final output
    image = image.reshape((28, 28))
    # numpy offers less trivial and faster ways to perform convolutions,
    # but the point of this repo is to stay as close to the metal as
    # feasible, hence sticking to basic numpy operations
    c1 = np.zeros((6, 24, 24))
    for c in range(6):
        for i in range(24):
            for j in range(24):
                c1[c, i, j] = np.sum(image[i:i+5, j:j+5] * c1_weights[c, :, :])
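    # For reference, a vectorized equivalent of the loops above (a sketch,
    # assuming numpy >= 1.20 for sliding_window_view; deliberately not used,
    # to keep the main path close to basic operations):
    #   windows = np.lib.stride_tricks.sliding_window_view(image, (5, 5))
    #   c1 = np.einsum('ijkl,ckl->cij', windows, c1_weights)  # (6, 24, 24)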
    s2 = np.zeros((6, 12, 12))
    s2_idx = np.zeros((6, 12, 12, 2))
    for s in range(6):
        for i in range(12):
            for j in range(12):
                max_ = np.max(c1[s, 2*i:2*i+2, 2*j:2*j+2])
                s2[s, i, j] = max_
                # There will be situations, mainly when all four entries are
                # equal to 0, where there will be multiple entries == max_.
                # In such a case, we will pick the first one returned.
                max_idx = np.where(c1[s, 2*i:2*i+2, 2*j:2*j+2] == max_)
                s2_idx[s, i, j, 0] = max_idx[0][0] + 2*i
                s2_idx[s, i, j, 1] = max_idx[1][0] + 2*j
    # Introducing the non-linearity at the max pooling layer
    s2 = sigmoid(s2)
    # Feeding forward through the fully connected layer
    z_f3 = np.c_[1, s2.reshape((1, -1))].dot(f3_weights)
    f3 = sigmoid(z_f3)
    # Connecting the fully connected layer to the softmax group
    z_softmax = np.c_[1, f3].dot(softmax_weights)
    # Avoiding overflow
    z_softmax -= z_softmax.max()
    hypothesis = np.exp(z_softmax) / np.sum(np.exp(z_softmax))
    return c1, s2, s2_idx, f3, hypothesis
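# Example usage (a minimal sketch): running the forward pass on one training
# image and inspecting the shapes of the returned layer states.
#   c1, s2, s2_idx, f3, hypothesis = forward_pass(data_train[0, :])
#   # c1:         (6, 24, 24) feature maps from the 5x5 convolutions
#   # s2:         (6, 12, 12) sigmoid-activated 2x2 max-pooled maps
#   # s2_idx:     (6, 12, 12, 2) coordinates of each pooling maximum in c1
#   # f3:         (1, no_of_hidden_units) hidden-layer activations
#   # hypothesis: (1, 10) softmax class probabilities (sums to 1)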
def gradient(image, target):
    c1, s2, s2_idx, f3, hypothesis = forward_pass(image)
    s2_unrolled = s2.reshape((-1, 1))
    image = image.reshape((28, 28))
    delta_softmax = hypothesis - target
    softmax_weights_grad = np.outer(
        np.r_[np.atleast_2d(1), f3.T],
        delta_softmax
    )
    f3_grad = delta_softmax.dot(softmax_weights[1:, :].T)
    delta_f3 = f3_grad * f3 * (1 - f3)
    f3_weights_grad = np.outer(
        np.r_[np.atleast_2d(1), s2_unrolled],
        delta_f3
    )
    s2_grad = f3_weights[1:, :].dot(delta_f3.T)
    delta_s2 = s2_grad * s2_unrolled * (1 - s2_unrolled)
    delta_s2 = delta_s2.reshape((6, 12, 12))
    c1_weights_grad = np.zeros(c1_weights.shape)
    delta_c1 = np.zeros(c1.shape)
    # Routing each pooled delta back to the position of its maximum in c1
    for c in range(delta_s2.shape[0]):
        for i in range(delta_s2.shape[1]):
            for j in range(delta_s2.shape[2]):
                x, y = s2_idx[c, i, j].astype(int)
                delta_c1[c, x, y] = delta_s2[c, i, j]
    # Accumulating the convolution weight gradients across all positions
    for c in range(c1.shape[0]):
        for i in range(c1.shape[1]):
            for j in range(c1.shape[2]):
                c1_weights_grad[c] += image[i:i+5, j:j+5] * delta_c1[c, i, j]
    return c1_weights_grad, f3_weights_grad, softmax_weights_grad
def numerical_gradient(image, target):
    # Central-difference estimate of the gradients, used to verify backprop
    delta = 1e-4
    c1_weights_grad = np.zeros(c1_weights.shape)
    for c in range(c1_weights.shape[0]):
        for i in range(c1_weights.shape[1]):
            for j in range(c1_weights.shape[2]):
                c1_weights[c][i][j] -= delta
                cost_minus = cost(forward_pass(image)[-1], target)
                c1_weights[c][i][j] += 2 * delta
                cost_plus = cost(forward_pass(image)[-1], target)
                c1_weights_grad[c][i][j] = (cost_plus - cost_minus) / (2 * delta)
                c1_weights[c][i][j] -= delta
    f3_weights_grad = np.zeros(f3_weights.shape)
    for i in range(f3_weights.shape[0]):
        for j in range(f3_weights.shape[1]):
            f3_weights[i][j] -= delta
            cost_minus = cost(forward_pass(image)[-1], target)
            f3_weights[i][j] += 2 * delta
            cost_plus = cost(forward_pass(image)[-1], target)
            f3_weights_grad[i][j] = (cost_plus - cost_minus) / (2 * delta)
            f3_weights[i][j] -= delta
    softmax_weights_grad = np.zeros(softmax_weights.shape)
    for i in range(softmax_weights.shape[0]):
        for j in range(softmax_weights.shape[1]):
            softmax_weights[i][j] -= delta
            cost_minus = cost(forward_pass(image)[-1], target)
            softmax_weights[i][j] += 2 * delta
            cost_plus = cost(forward_pass(image)[-1], target)
            softmax_weights_grad[i][j] = (cost_plus - cost_minus) / (2 * delta)
            softmax_weights[i][j] -= delta
    return c1_weights_grad, f3_weights_grad, softmax_weights_grad
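# Gradient checking (a minimal sketch): comparing the backprop gradients to
# the finite-difference estimates on a single example. Kept as a comment
# because numerical_gradient is very slow (two forward passes per weight).
#   analytic = gradient(data_train[0, :], targets_train[0, :])
#   numeric = numerical_gradient(data_train[0, :], targets_train[0, :])
#   for a, n in zip(analytic, numeric):
#       print(np.max(np.abs(a - n)))  # each should be close to zero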
def cost(hypothesis, target):
    # Cross-entropy cost of a single example (used by numerical_gradient);
    # hypothesis has shape (1, 10), target is a one-hot vector
    return -np.log(hypothesis.ravel()[target.argmax()])
def mean_cost(hypothesis_ary, targets):
    n = hypothesis_ary.shape[0]
    target_column_ids = targets.argmax(1)
    target_probabilities = hypothesis_ary[np.arange(n), target_column_ids]
    return -np.sum(np.log(target_probabilities)) / n
def classification_error(hypothesis_ary, targets):
    labels = targets.argmax(1)
    best_guesses = hypothesis_ary.argmax(1)
    return 100 - np.mean(labels == best_guesses).round(4) * 100
def cost_on_100_test_cases():
    h = hypotheses(data_test[:100, :])
    t = targets_test[:100, :]
    return mean_cost(h, t)
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
def hypotheses(data):
    h = [forward_pass(img)[-1] for img in data]
    return np.array(h).squeeze()
def train(epoch_count=1, learning_rate=0.01):
    global c1_weights, f3_weights, softmax_weights
    print('Training for', epoch_count, 'epoch(s) with learning_rate', learning_rate)
    for e in range(epoch_count):
        for i in range(data_train.shape[0]):
            c1_weights_grad, f3_weights_grad, softmax_weights_grad = gradient(data_train[i, :], targets_train[i, :])
            c1_weights -= learning_rate * c1_weights_grad
            f3_weights -= learning_rate * f3_weights_grad
            softmax_weights -= learning_rate * softmax_weights_grad
        print('Completed epoch', e + 1)
        print('Classification error on the test set: %0.2f%%' % classification_error(hypotheses(data_test), targets_test))
################################
# Training
################################
train(1, 0.01)
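# To experiment with other settings (a sketch): call reset_weights() first so
# each run starts from a fresh random initialization, e.g.
#   reset_weights()
#   train(epoch_count=2, learning_rate=0.005)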