AC_brain.py
from keras import backend as K
from keras.layers import Dense, Activation, Input
from keras.models import Model, load_model
from keras.optimizers import Adam
import numpy as np
import tensorflow as tf
class AC(object):
    def __init__(self, lr, gamma, state_size, action_size=3):
        self.gamma = gamma                # discount factor
        self.lr = lr                      # learning rate shared by actor and critic
        self.action_size = action_size
        self.state_size = state_size
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        self.actor_train, self.actor_predict = self.build_actor()
        self.critic = self.build_critic()
        self.action_space = [i for i in range(self.action_size)]
        self.mdl_name = 'my_temp_mdl.h5'
        self.buyrecord = []
        self.eps = 0.99
        self.G = 0
    def build_actor(self):
        # The actor is almost the same as in plain policy gradient (PG); the only
        # difference is that the advantage G weights the gradient.
        real_input = Input(shape=(self.state_size,))
        G = Input(shape=[1])  # advantage, fed in at training time only
        l1 = Dense(16, kernel_initializer='random_uniform', activation='relu')(
            real_input)  # the layers only consume real_input; G enters through the loss
        l2 = Dense(32, kernel_initializer='random_uniform', activation='relu')(l1)
        actions = Dense(self.action_size, kernel_initializer='random_uniform', activation='softmax')(l2)

        def my_loss(y_true, y_pred):
            # negative log-likelihood of the taken action, weighted by the advantage G
            log_like = y_true * K.log(K.clip(y_pred, 1e-8, 1 - 1e-8))
            return K.sum(-log_like * G)

        actor_train = Model(inputs=[real_input, G], outputs=[actions])
        actor_train.compile(loss=my_loss, optimizer=Adam(lr=self.lr))
        actor_predict = Model(inputs=[real_input], outputs=[actions])
        return actor_train, actor_predict
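    # The custom loss above is the standard policy-gradient objective weighted by
    # the advantage: y_true is a one-hot encoding of the taken action, so
    # sum(y_true * log(y_pred)) picks out log pi(a|s) and the loss becomes
    # -G * log pi(a|s).  A rough sanity check with made-up numbers (the
    # probabilities and advantage below are illustrative, not from this file):
    #
    #   y_pred = [0.2, 0.5, 0.3]            # softmax output
    #   y_true = [0.0, 1.0, 0.0]            # action 1 was taken
    #   G      = 2.0                        # positive advantage
    #   loss   = -2.0 * log(0.5) ≈ 1.386    # minimizing it raises pi(a=1|s)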
    def build_critic(self):
        # The critic estimates the state value V(s) with a small MLP trained on MSE.
        real_input = Input(shape=(self.state_size,))
        l1 = Dense(16, kernel_initializer='random_uniform', activation='relu')(real_input)
        l2 = Dense(32, kernel_initializer='random_uniform', activation='relu')(l1)
        V_val = Dense(1, activation='linear')(l2)
        critic = Model(inputs=[real_input], outputs=[V_val])
        critic.compile(loss='mean_squared_error', optimizer=Adam(lr=self.lr))
        return critic
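    # The critic is fitted in learn() by regressing V(s) toward the one-step TD
    # target r + gamma * V(s'); the residual (target - V(s)) is reused as the
    # advantage G for the actor update.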
    def act(self, state):
        # Sample an action from the softmax policy for the current state.
        state = state[np.newaxis, :]
        action_prob = self.actor_predict.predict(state)[0]
        # if np.random.uniform() < self.eps:
        #     action = np.random.choice(self.action_space, p=action_prob)
        # else:
        #     action = np.random.choice(self.action_space)
        return np.random.choice(self.action_space, p=action_prob)
    def learn(self, s, a, r, sp):
        # Unlike PG, which has to run through an entire episode before updating,
        # actor-critic can update both networks at every step.
        s = s[np.newaxis, :]
        sp = sp[np.newaxis, :]
        V_s = self.critic.predict(s)
        V_sp = self.critic.predict(sp)
        target = r + self.gamma * V_sp     # one-step TD target for the critic
        G = target - V_s                   # TD error, used as the advantage
        actions = np.zeros([1, self.action_size])
        actions[np.arange(1), a] = 1       # one-hot encoding of the taken action
        self.critic.train_on_batch([s], target)
        self.actor_train.train_on_batch([s, G], actions)
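# A minimal usage sketch, assuming an episodic environment that returns a numpy
# state vector of length state_size, a scalar reward, and a done flag.  The
# dummy random-walk environment and all hyperparameters below are illustrative
# placeholders, not part of the original project.
if __name__ == '__main__':
    class DummyEnv(object):
        """Stand-in environment that emits random states and rewards."""

        def __init__(self, state_size, episode_len=50):
            self.state_size = state_size
            self.episode_len = episode_len
            self.t = 0

        def reset(self):
            self.t = 0
            return np.random.randn(self.state_size)

        def step(self, action):
            self.t += 1
            next_state = np.random.randn(self.state_size)
            reward = float(np.random.randn())
            done = self.t >= self.episode_len
            return next_state, reward, done

    state_size = 10
    env = DummyEnv(state_size)
    agent = AC(lr=0.001, gamma=0.99, state_size=state_size, action_size=3)

    for episode in range(3):
        s = env.reset()
        done = False
        while not done:
            a = agent.act(s)
            sp, r, done = env.step(a)
            agent.learn(s, a, r, sp)
            s = sp
        print('finished episode', episode)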