optimizers.py
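"""Minimal from-scratch optimizers (SGD with momentum, RMSprop, Adam).

Each optimizer updates the ``weights`` and ``biases`` of a fully connected
layer directly from a manually supplied output gradient, without relying on
torch.autograd. Layers are expected to expose ``weights``, ``biases``, and the
cached forward ``input``.
"""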
import torch


def calculate_gradients(output_gradient, layer) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    # Backprop through a fully connected layer. `layer` must expose `input`,
    # `weights`, and `biases`; returns (dW, db, dX) for the supplied dL/dY.
    weights_gradient = torch.matmul(layer.input.T, output_gradient)
    biases_gradient = torch.sum(output_gradient, dim=0, keepdim=True)
    input_gradient = torch.matmul(output_gradient, layer.weights.T)
    return weights_gradient, biases_gradient, input_gradient
class Optimizer:
    """Base class; concrete optimizers implement update(layer, output_gradient)."""

    def __init__(self, learning_rate=0.001):
        self.learning_rate = learning_rate

    def update(self, layer, output_gradient):
        raise NotImplementedError
class SGD(Optimizer):
    """Stochastic gradient descent with optional classical momentum."""

    def __init__(self, learning_rate=0.001, momentum=0):
        super().__init__(learning_rate)
        self.momentum = momentum

    def update(self, layer, output_gradient):
        # Lazily create per-layer velocity buffers on the first update.
        if not hasattr(layer, "weights_velocity") or not hasattr(layer, "biases_velocity"):
            layer.weights_velocity = torch.zeros_like(layer.weights)
            layer.biases_velocity = torch.zeros_like(layer.biases)
        weights_gradient, biases_gradient, input_gradient = calculate_gradients(
            output_gradient, layer
        )
        # v <- momentum * v + lr * grad, then step against the velocity.
        layer.weights_velocity = (
            self.momentum * layer.weights_velocity
            + self.learning_rate * weights_gradient
        )
        layer.biases_velocity = (
            self.momentum * layer.biases_velocity
            + self.learning_rate * biases_gradient
        )
        layer.weights -= layer.weights_velocity
        layer.biases -= layer.biases_velocity
        return input_gradient
class RMSprop(Optimizer):
    """RMSprop: scale each step by a running average of squared gradients."""

    def __init__(self, learning_rate=0.001, decay=0.9, epsilon=1e-7):
        super().__init__(learning_rate)
        self.decay = decay
        self.epsilon = epsilon

    def update(self, layer, output_gradient):
        # Despite the name, these buffers hold the exponential moving average
        # of the squared gradients (the RMSprop "cache").
        if not hasattr(layer, "weights_velocity") or not hasattr(layer, "biases_velocity"):
            layer.weights_velocity = torch.zeros_like(layer.weights)
            layer.biases_velocity = torch.zeros_like(layer.biases)
        weights_gradient, biases_gradient, input_gradient = calculate_gradients(
            output_gradient, layer
        )
        layer.weights_velocity = (
            self.decay * layer.weights_velocity
            + (1 - self.decay) * weights_gradient ** 2
        )
        layer.biases_velocity = (
            self.decay * layer.biases_velocity
            + (1 - self.decay) * biases_gradient ** 2
        )
        # Per-parameter step: lr * grad / (sqrt(cache) + eps).
        layer.weights -= (
            self.learning_rate * weights_gradient
            / (torch.sqrt(layer.weights_velocity) + self.epsilon)
        )
        layer.biases -= (
            self.learning_rate * biases_gradient
            / (torch.sqrt(layer.biases_velocity) + self.epsilon)
        )
        return input_gradient
class Adam(Optimizer):
    """Adam: momentum on the gradient plus RMSprop-style scaling.

    Note: this implementation uses the raw first/second moment estimates and
    does not apply the usual bias correction for early steps.
    """

    def __init__(
        self,
        learning_rate=0.001,
        momentum_decay=0.9,
        rms_decay=0.999,
        epsilon=1e-7,
    ):
        super().__init__(learning_rate)
        self.momentum_decay = momentum_decay
        self.rms_decay = rms_decay
        self.epsilon = epsilon

    def update(self, layer, output_gradient):
        # Lazily create per-layer first-moment (momentum) and second-moment
        # (rms) buffers on the first update.
        if (
            not hasattr(layer, "weights_momentum")
            or not hasattr(layer, "biases_momentum")
            or not hasattr(layer, "weights_rms")
            or not hasattr(layer, "biases_rms")
        ):
            layer.weights_momentum = torch.zeros_like(layer.weights)
            layer.biases_momentum = torch.zeros_like(layer.biases)
            layer.weights_rms = torch.zeros_like(layer.weights)
            layer.biases_rms = torch.zeros_like(layer.biases)
        weights_gradient, biases_gradient, input_gradient = calculate_gradients(
            output_gradient, layer
        )
        # First moment: exponential moving average of the gradients.
        layer.weights_momentum = (
            self.momentum_decay * layer.weights_momentum
            + (1 - self.momentum_decay) * weights_gradient
        )
        layer.biases_momentum = (
            self.momentum_decay * layer.biases_momentum
            + (1 - self.momentum_decay) * biases_gradient
        )
        # Second moment: exponential moving average of the squared gradients.
        layer.weights_rms = (
            self.rms_decay * layer.weights_rms
            + (1 - self.rms_decay) * weights_gradient ** 2
        )
        layer.biases_rms = (
            self.rms_decay * layer.biases_rms
            + (1 - self.rms_decay) * biases_gradient ** 2
        )
        # Step: lr * m / (sqrt(v) + eps).
        layer.weights -= (
            self.learning_rate * layer.weights_momentum
            / (torch.sqrt(layer.weights_rms) + self.epsilon)
        )
        layer.biases -= (
            self.learning_rate * layer.biases_momentum
            / (torch.sqrt(layer.biases_rms) + self.epsilon)
        )
        return input_gradient
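

# A minimal usage sketch, not part of the original module: `DenseLayer` below
# is a hypothetical fully connected layer, assumed only to expose the
# `weights`, `biases`, and cached `input` attributes that `calculate_gradients`
# reads. It shows how `update` both applies a step and returns dL/d(input) so
# the gradient can be chained to the previous layer.
if __name__ == "__main__":
    class DenseLayer:
        def __init__(self, in_features, out_features):
            self.weights = torch.randn(in_features, out_features) * 0.01
            self.biases = torch.zeros(1, out_features)

        def forward(self, x):
            self.input = x  # cached for the backward pass
            return torch.matmul(x, self.weights) + self.biases

    layer = DenseLayer(4, 3)
    optimizer = Adam(learning_rate=0.01)

    x = torch.randn(8, 4)
    output = layer.forward(x)
    # Pretend dL/d(output) arrived from the loss or the next layer.
    output_gradient = torch.ones_like(output) / output.numel()
    input_gradient = optimizer.update(layer, output_gradient)
    print(input_gradient.shape)  # torch.Size([8, 4])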