optimizers.py
# Authors: Kyle Kastner
# License: BSD 3-clause
import numpy as np
import theano
import theano.tensor as T


class rmsprop(object):
"""
RMSProp with nesterov momentum and gradient rescaling
"""
def __init__(self, params):
self.running_square_ = [theano.shared(np.zeros_like(p.get_value()))
for p in params]
self.running_avg_ = [theano.shared(np.zeros_like(p.get_value()))
for p in params]
self.memory_ = [theano.shared(np.zeros_like(p.get_value()))
for p in params]
def updates(self, params, grads, learning_rate, momentum, rescale=5.):
        # Global gradient L2 norm, used to rescale the gradients whenever
        # the norm exceeds `rescale`. Non-finite norms are detected here and
        # handled inside the loop below.
        grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
        not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
        scaling_num = rescale
        scaling_den = T.maximum(rescale, grad_norm)
        # Smoothing and numerical-stability constants
        combination_coeff = 0.9
        minimum_grad = 1E-4
        updates = []
        for n, (param, grad) in enumerate(zip(params, grads)):
            # If the gradient norm is not finite, fall back to a small
            # shrinking step (0.1 * param); otherwise rescale the gradient
            # so its global norm does not exceed `rescale`.
            grad = T.switch(not_finite, 0.1 * param,
                            grad * (scaling_num / scaling_den))
            # Exponential moving averages of the squared gradient and of
            # the gradient itself, combined into a centred RMS estimate.
            old_square = self.running_square_[n]
            new_square = combination_coeff * old_square + (
                1. - combination_coeff) * T.sqr(grad)
            old_avg = self.running_avg_[n]
            new_avg = combination_coeff * old_avg + (
                1. - combination_coeff) * grad
            rms_grad = T.sqrt(new_square - new_avg ** 2)
            rms_grad = T.maximum(rms_grad, minimum_grad)
            # Nesterov momentum: `update` advances the velocity, `update2`
            # is the look-ahead step actually applied to the parameter.
            memory = self.memory_[n]
            update = momentum * memory - learning_rate * grad / rms_grad
            update2 = momentum * momentum * memory - (
                1 + momentum) * learning_rate * grad / rms_grad
            updates.append((old_square, new_square))
            updates.append((old_avg, new_avg))
            updates.append((memory, update))
            updates.append((param, param + update2))
        return updates
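
# Summary of the rmsprop.updates rule above, for each parameter p with
# (rescaled) gradient g:
#     E[g^2] <- 0.9 * E[g^2] + 0.1 * g^2
#     E[g]   <- 0.9 * E[g]   + 0.1 * g
#     rms    <- max(sqrt(E[g^2] - E[g]^2), 1e-4)
# followed by a Nesterov-momentum step built from learning_rate * g / rms.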


class sgd_nesterov(object):
    """
    Plain SGD with Nesterov momentum.
    """
    def __init__(self, params):
        # Momentum "memory" (velocity) term for each parameter.
        self.memory_ = [theano.shared(np.zeros_like(p.get_value()))
                        for p in params]

    def updates(self, params, grads, learning_rate, momentum):
        updates = []
        for n, (param, grad) in enumerate(zip(params, grads)):
            memory = self.memory_[n]
            # Advance the velocity, then apply the look-ahead corrected
            # step to the parameter (same Nesterov form as in rmsprop).
            update = momentum * memory - learning_rate * grad
            update2 = momentum * momentum * memory - (
                1 + momentum) * learning_rate * grad
            updates.append((memory, update))
            updates.append((param, param + update2))
        return updates


class sgd(object):
    # Only here for API conformity with other optimizers
    def __init__(self, params):
        pass

    def updates(self, params, grads, learning_rate):
        updates = []
        for param, grad in zip(params, grads):
            updates.append((param, param - grad * learning_rate))
        return updates
"""
Usage:
grads = T.grad(cost, self.params)
#opt = sgd_nesterov(self.params)
opt = rmsprop(self.params)
updates = opt.updates(self.params, grads,
learning_rate / np.cast['float32'](self.batch_size),
momentum)
"""