import numpy as np
import pickle
import torch as t
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim import Adam
#from torch.optim.lr_scheduler import MultiplicativeLR
import matplotlib.pyplot as plt
from itertools import chain
#from scipy.stats import pearsonr
#import seaborn as sns
batch_factor = 2
batch_size = 32 * batch_factor
lr = 5e-4 * batch_factor
b1, b2 = 0.7, 0.999
weight_decay = 1.6e-3 #0.0012
n_epochs = 100
train_frac = 0.75
conv_channels = 4
board_size = (18, 11)
drop_top_n = 4
drop_bottom_n = 1
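# Fontainebleau boulder grades mapped to ordinal integers 1-16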
grade_dict = {'6A':1, '6A+':2, '6B':3, '6B+':4, '6C':5, '6C+':6, '7A':7, '7A+':8, '7B':9, '7B+':10, '7C':11, '7C+':12, '8A':13, '8A+':14, '8B':15, '8B+':16}
# Convert each integer grade n to a 1D tensor with ones in the first n - drop_bottom_n positions and zeros elsewhere.
# Ex. 3 -> [1,1,0,0,...] when drop_bottom_n = 1
# This allows for ordinal regression in the loss function,
# i.e. the loss is greater for predictions further from the true grade.
grade_dict_vec = {}
for grade in grade_dict:
    vec = t.zeros(len(grade_dict) - drop_bottom_n - drop_top_n)
    vec[0:grade_dict[grade] - drop_bottom_n] = 1
    grade_dict_vec[grade] = vec
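# Example with the settings above (drop_bottom_n=1, drop_top_n=4, so 11 kept grades):
#   '6A+' (value 2) -> [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
#   '7A'  (value 7) -> [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]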
num_grades = len(grade_dict) - drop_bottom_n - drop_top_n
def prediction2grade(prediction):
    # Convert ordinal predictions to grades: count the leading positions above 0.5.
    # Cast the boolean mask to long so cumprod is defined on all torch versions.
    return (prediction > 0.5).long().cumprod(axis=1).sum(axis=1).tolist()
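# Example: a row of outputs [0.9, 0.8, 0.6, 0.3, 0.7, ...] thresholds to
# [1, 1, 1, 0, 1, ...]; cumprod zeroes everything after the first 0,
# giving [1, 1, 1, 0, 0, ...], so the predicted grade index is 3.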
def chain_lists(lists) -> np.ndarray:
    return np.array(list(chain(*lists)))
def kl_divergence(p: np.ndarray, q: np.ndarray) -> float:
    # p represents the data, q represents the model predictions.
    # Drop bins where either distribution is zero: p == 0 terms contribute nothing
    # (but would otherwise produce NaN), and q == 0 terms would make the divergence infinite.
    mask = (p != 0) & (q != 0)
    p, q = p[mask], q[mask]
    return np.sum(p * np.log2(p / q))
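# Example: p = [0.5, 0.5], q = [0.25, 0.75]
#   -> 0.5*log2(0.5/0.25) + 0.5*log2(0.5/0.75) ≈ 0.5 - 0.29 ≈ 0.21 bits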
class Data(Dataset):
    def __init__(self):
        # Each problems*.pkl is assumed to map a problem name to a tuple of
        # (grade string, start-hold grid, mid-hold grid, end-hold grid, all-hold grid),
        # where each grid is an 18x11 (board_size) binary array; this layout is
        # inferred from the indexing below.
        with open('problems1.pkl', 'rb') as f:
            data1 = pickle.load(f)
        with open('problems2.pkl', 'rb') as f:
            data2 = pickle.load(f)
        data = {**data1, **data2}
        self.names = data.keys()
        self.grades, self.start_holds, self.mid_holds, self.end_holds, self.all_holds = [], [], [], [], []
        for name in self.names:
            problem = data[name]
            if problem[0] in grade_dict.keys():
                # Skip the hardest drop_top_n and easiest drop_bottom_n grades
                if grade_dict[problem[0]] in (max(grade_dict.values()) - np.array(range(0, drop_top_n))):
                    continue
                if grade_dict[problem[0]] in range(1, drop_bottom_n + 1):
                    continue
                self.grades.append(problem[0])
                self.start_holds.append(problem[1])
                self.mid_holds.append(problem[2])
                self.end_holds.append(problem[3])
                self.all_holds.append(problem[4])
        # Stack the start/mid/end grids into a 3-channel tensor per problem
        self.all_holds_split_channels = t.Tensor([[self.start_holds[i], self.mid_holds[i], self.end_holds[i]] for i in range(len(self.grades))])
        self.start_holds = t.Tensor(self.start_holds)
        self.mid_holds = t.Tensor(self.mid_holds)
        self.end_holds = t.Tensor(self.end_holds)
        self.all_holds = t.Tensor(self.all_holds)
        self.all_holds_neg_ends = self.mid_holds - self.start_holds - self.end_holds
        self.grades_numeric = [grade_dict[grade] for grade in self.grades]
        self.grades = [grade_dict_vec[grade] for grade in self.grades]
        self.len = len(self.grades)

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        return (self.all_holds_split_channels[index], self.grades[index])
# Random split of data
dataset = Data()
# sns.violinplot(x=dataset.grades_numeric)
# plt.show()
# grades = [int(grade.sum().item()) for grade in dataset.grades]
# hist = plt.hist(grades, bins=np.array(range(min(grades), max(grades)+2))-0.5, rwidth=0.9)
# plt.show()
train_len = int(train_frac * dataset.len)
valid_len = dataset.len - train_len
train_data, valid_data = random_split(dataset, (train_len, valid_len))
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(dataset=valid_data, batch_size=batch_size, shuffle=True)
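# Each batch yielded by these loaders is a (problems, grades) pair (assuming the
# 18x11 board_size grids described above):
#   problems: (batch_size, 3, 18, 11) float tensor of start/mid/end hold grids
#   grades:   (batch_size, num_grades) ordinal 0/1 target vectors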
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.conv_layer = nn.Conv2d(in_channels=1, out_channels=conv_channels, kernel_size=(11, 7), padding=(5, 3), stride=1)  # Convolution with 4 filters of size 11x7
        self.bypass_layer = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=1, stride=1, bias=False)  # Convolution with 1 filter of size 1 to feed info directly to the next layer. Bias set to False so that non-holds stay zero
        self.fc1 = nn.Sequential(nn.Linear((conv_channels + 1) * np.prod(board_size), 50), nn.Sigmoid())  # First fc layer with 5x18x11 neurons -> 50 neurons
        self.dropout = nn.Dropout(p=0.5)  # Dropout layer to reduce overfitting
        self.fc2 = nn.Sequential(nn.Linear(50, num_grades), nn.Sigmoid())  # Second fc layer
        # Alternative: keep the start/mid/end channels separate instead of summing them
        # self.conv_layer = nn.Conv2d(in_channels=3, out_channels=conv_channels, kernel_size=(11, 7), padding=(5, 3), stride=1)
        # self.bypass_layer = nn.Conv2d(in_channels=3, out_channels=3, kernel_size=1, stride=1, bias=False)
        # self.fc1 = nn.Sequential(nn.Linear((conv_channels + 3) * np.prod(board_size), 50), nn.Sigmoid())
        # self.dropout = nn.Dropout(p=0.5)
        # self.fc2 = nn.Sequential(nn.Linear(50, num_grades), nn.Sigmoid())

    def forward(self, problem):
        # Collapse the start/mid/end channels into a single hold map
        p = (problem[:, 0] + problem[:, 1] + problem[:, 2]).unsqueeze(1)
        conv = self.conv_layer(p)  #* p
        bypass = self.bypass_layer(p)
        conv = conv.view(conv.shape[0], -1)
        bypass = bypass.view(bypass.shape[0], -1)
        inter = t.cat((conv, bypass), 1)
        inter = self.fc1(inter)
        inter = self.dropout(inter)
        return self.fc2(inter)
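# Shape walkthrough for the forward pass (batch size N, 18x11 board):
#   input:  (N, 3, 18, 11) -> channels summed into p: (N, 1, 18, 11)
#   conv:   (N, conv_channels, 18, 11) -> flattened to (N, conv_channels*198)
#   bypass: (N, 1, 18, 11)             -> flattened to (N, 198)
#   concat: (N, (conv_channels+1)*198) = (N, 990) -> fc1 -> (N, 50) -> fc2 -> (N, num_grades)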
def ordinal_regression_loss(prediction, target):
    return t.pow(nn.MSELoss(reduction='none')(prediction, target).sum(axis=1), 2).mean()
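# The loss squares each sample's summed squared error over the ordinal vector,
# then averages over the batch, so predictions with many mismatched positions
# (i.e. far from the true grade) are penalised super-linearly.
# Example: target [1,1,1,0,...] vs. a hard prediction [1,1,1,1,1,0,...]
# differs in two positions -> per-sample term (1+1)^2 = 4.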
model = Model()
# Adam optimizer
optimizer = Adam(model.parameters(), lr=lr, betas=(b1, b2), weight_decay=weight_decay)
# Change the learning rate dynamically
# lmbda = lambda batch: 1.018
# scheduler = MultiplicativeLR(optimizer, lr_lambda=lmbda)
def evaluate():
    valid_loss, predictions, actual = [], [], []
    # Evaluation mode (disables dropout)
    model.eval()
    # No gradients are needed for validation
    with t.no_grad():
        for data in valid_loader:
            # Split batch into input (boulder problem) and targets (grade)
            problem = data[0]
            grade = data[1]
            # Forward propagation
            prediction = model(problem)
            # Loss computation
            loss = ordinal_regression_loss(prediction, grade)
            valid_loss.append(loss.item())
            predictions.append(prediction2grade(prediction))
            actual.append(prediction2grade(grade))
    return valid_loss, chain_lists(predictions), chain_lists(actual)
def train():
    train_loss, predictions, actual = [], [], []
    #lr = []
    # Training mode
    model.train()
    for data in train_loader:
        # Split batch into input (boulder problem) and targets (grade)
        problem = data[0]
        grade = data[1]
        # Zero the gradients
        optimizer.zero_grad()
        # Forward propagation
        prediction = model(problem)
        # Loss computation
        loss = ordinal_regression_loss(prediction, grade)
        # Backpropagation
        loss.backward()
        # Weight optimization
        optimizer.step()
        train_loss.append(loss.item())
        predictions.append(prediction2grade(prediction))
        actual.append(prediction2grade(grade))
        # lr.append(scheduler.get_last_lr())
        # scheduler.step()
    return train_loss, chain_lists(predictions), chain_lists(actual)
# Training and validation loop
for epoch in range(n_epochs):  # An epoch is one pass over the entire training dataset
    train_loss, predictions_train, actual_train = train()
    # plt.scatter(lr, train_loss)
    # plt.xscale('log')
    # plt.xlim(1e-4, 2e-3)
    # plt.show()
    valid_loss, predictions_valid, actual_valid = evaluate()
    grade_diff_valid = predictions_valid - actual_valid
    grade_diff_train = predictions_train - actual_train
    # Grade distributions of the validation targets (p) and predictions (q) for the KL divergence
    bins = np.array(list(range(1, 18))) - 0.5
    p = np.histogram(actual_valid, bins, density=True)[0]
    q = np.histogram(predictions_valid, bins, density=True)[0]
    print('Epoch: %i' % epoch)
    print(' Std of training error distribution: %.3f' % np.std(grade_diff_train))
    print(' Std of validation error distribution: %.3f' % np.std(grade_diff_valid))
    print(' Mean training loss: %.3f' % np.mean(train_loss))
    print(' Mean validation loss: %.3f' % np.mean(valid_loss))
    print(' Validation KL divergence: %.3f' % kl_divergence(p, q))
    n_correct = len(predictions_valid[predictions_valid == actual_valid])
    print(' Correct predictions: %i' % n_correct)
    proportion_correct = n_correct / len(predictions_valid)
    print(' Proportion correct: %.3f' % proportion_correct)
    # bins = np.array(range(min(grade_diff), max(grade_diff)+2)) - 0.5
    # plt.hist(grade_diff, align='mid', rwidth=.9, bins=bins)
    # plt.title('epoch: %d' % (epoch+1))
    # plt.show()
actual_errors = actual_valid[actual_valid != predictions_valid]
predictions_errors = predictions_valid[actual_valid != predictions_valid]
print('Number of problems: %i' % len(actual_valid))
print('Number of incorrect predictions: %i' % len(actual_errors))
print('Number of correct predictions: %i' % (len(actual_valid) - len(actual_errors)))
#corr, _ = pearsonr(actual_valid, predictions_valid)
#print('Correlation coefficient: %.3f' % corr)
bins = np.array(range(min(grade_diff_valid), max(grade_diff_valid)+2)) - 0.5
plt.hist(grade_diff_valid, rwidth=.9, bins=bins, density=True)
plt.title('Difference in actual and predicted grades')
plt.show()
bins = np.array(range(min(actual_errors), max(actual_errors) + 2)) - 0.5
plt.hist2d(actual_errors,
           predictions_errors,
           bins=bins,
           cmin=0.00001,
           density=True,
           zorder=1)
ticks = np.array(range(min(actual_errors), max(actual_errors)+2))
plt.xticks(ticks)
plt.yticks(ticks)
plt.xlabel('Actual Grade (ordinal index; 1=6A+, 11=7C+ with the drop settings above)')
plt.ylabel('Predicted Grade (ordinal index; 1=6A+, 11=7C+ with the drop settings above)')
plt.grid(alpha=0.2, zorder=0)
plt.colorbar(label='Proportion of errors')
plt.gca().invert_yaxis()
plt.show()