add ActorCritic
shubhamjha97 committed Nov 10, 2018
1 parent 390cdf0 commit b4d7ba2
Showing 12 changed files with 340 additions and 173 deletions.
1 change: 0 additions & 1 deletion RLkit/__init__.py
@@ -1,4 +1,3 @@
print("Import from directory")
from .algorithms.random_agent import RandomAgent
from .algorithms.dqn import DQN
from .algorithms.policy_gradients import REINFORCE
2 changes: 1 addition & 1 deletion RLkit/algorithms/__init__.py
@@ -1,4 +1,4 @@
from .random_agent import RandomAgent
from .dqn import DQN
from .policy_gradients import REINFORCE
# from .policy_gradients import REINFORCE
from .agent import Agent
18 changes: 12 additions & 6 deletions RLkit/algorithms/agent.py
@@ -1,5 +1,4 @@
import tensorflow as tf
import pdb

class Agent:
def __init__():
@@ -12,17 +11,24 @@ def test():
raise NotImplementedError

def _add_model(self, scope_name='model', input_placeholder = None, network_specs=None):
activations_map = {
'linear':None,
'relu':tf.nn.relu,
'sigmoid':tf.nn.sigmoid,
'tanh':tf.nn.tanh
}
layers = []
with tf.name_scope(scope_name):
with tf.variable_scope(scope_name):
for ix, layer in enumerate(network_specs):
if layer['type']=='dense':
if ix==0:
layer = tf.layers.dense(inputs = input_placeholder, units = layer['size'])
layer = tf.layers.dense(inputs = input_placeholder, units = layer['size'], activation = activations_map[layer['activation']])
layers.append(layer)
if ix == len(network_specs)-1:
return layer
elif ix == len(network_specs)-1:
final_layer = tf.layers.dense(inputs = layers[-1], units = layer['size'])
final_layer = tf.layers.dense(inputs = layers[-1], units = layer['size'], activation = activations_map[layer['activation']])
return final_layer
else:
pdb.set_trace()
layer = tf.layers.dense(inputs = layers[-1], units = layer['size'])
layer = tf.layers.dense(inputs = layers[-1], units = layer['size'], activation = activations_map[layer['activation']])
layers.append(layer)
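
The rewritten Agent._add_model builds the network from a declarative network_specs list, mapping each layer's 'activation' string through activations_map (with 'linear' meaning activation=None). Below is a minimal standalone sketch of the same idea; the keys 'type', 'size', and 'activation' are taken from the diff, while the layer sizes, the build_mlp name, and the placeholder shape are illustrative assumptions, not part of the commit.

# Hypothetical usage sketch (not part of the commit).
import tensorflow as tf

network_specs = [
    {'type': 'dense', 'size': 64, 'activation': 'relu'},
    {'type': 'dense', 'size': 64, 'activation': 'relu'},
    {'type': 'dense', 'size': 2,  'activation': 'linear'},  # 'linear' -> activation=None
]

def build_mlp(input_placeholder, network_specs, scope_name='model'):
    # Simplified standalone version of the spec-driven builder in Agent._add_model.
    activations_map = {'linear': None, 'relu': tf.nn.relu,
                       'sigmoid': tf.nn.sigmoid, 'tanh': tf.nn.tanh}
    out = input_placeholder
    with tf.variable_scope(scope_name):
        for spec in network_specs:
            if spec['type'] == 'dense':
                out = tf.layers.dense(inputs=out, units=spec['size'],
                                      activation=activations_map[spec['activation']])
    return out

state_ph = tf.placeholder(tf.float32, shape=[None, 4], name='state')
q_head = build_mlp(state_ph, network_specs, scope_name='dqn')
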
97 changes: 52 additions & 45 deletions RLkit/algorithms/dqn.py
@@ -1,12 +1,21 @@
from .agent import Agent
import tensorflow as tf
from .utils import *
import pdb


class DQN(Agent):
def __init__(self, env_, network_specs, buffer_size = 10000, batch_size = 128, gamma = 0.95, eps = 0.01):
def __init__(self, env_, network_specs, buffer_size = 10000, batch_size = 128, gamma = 0.95, eps = 0.01, update_target_every_n = 1000, update_every_n = 300, tau = 0.001, logdir = '.', loss_fn='mse'):
# TODO(shubham): Add option to disable Tensorboard
# TODO(shubham): Add logging
# TODO(shubham): ADD L2 REG
self.env_ = env_
self.logdir = logdir
self.loss_fn = loss_fn
self.network_specs = network_specs
self.buffer_size = buffer_size
self.update_every_n = update_every_n
self.tau = tau
self.buffer_ = []
self.buffer_index = None
self.update_target_every_n = 100
@@ -19,20 +28,24 @@ def __init__(self, env_, network_specs, buffer_size = 10000, batch_size = 128, g
self.batch_size = batch_size
self.moving_reward = None

self.hidden1_size = 64
self.hidden2_size = 64

self.layers = []
self.dqn_scope = 'dqn'
self.target_dqn_scope = 'target_dqn'

self._add_placeholders()
self._add_model()
self._add_target_model()
print("{} layers".format(len(self.layers)))

self.dqn_final_layer = self._add_model(self.dqn_scope, self.state_placeholder, self.network_specs)
self.target_dqn_final_layer = self._add_model(self.target_dqn_scope, self.state_placeholder, self.network_specs)

self.q_values = tf.layers.dense(self.layers[-1], self.num_actions, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='q_values')
self.max_q_values = tf.reshape(tf.reduce_max(self.q_values, axis=1, name='max_q_values'), [-1,1])
self.selected_q_values = tf.reshape(tf.reduce_sum(tf.multiply(self.q_values, self.actions_placeholder, name='selected_q_values'), axis=1), [-1,1])
with tf.variable_scope(self.dqn_scope):
self.q_values = tf.layers.dense(self.dqn_final_layer, self.num_actions, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='q_values')
self.max_q_values = tf.reshape(tf.reduce_max(self.q_values, axis=1, name='max_q_values'), [-1,1])
self.selected_q_values = tf.reshape(tf.reduce_sum(tf.multiply(self.q_values, self.actions_placeholder, name='selected_q_values'), axis=1), [-1,1])
with tf.variable_scope(self.target_dqn_scope):
self.target_q_values = tf.layers.dense(self.target_dqn_final_layer, self.num_actions, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='q_values')
self.target_max_q_values = tf.reshape(tf.reduce_max(self.target_q_values, axis=1, name='max_q_values'), [-1,1])
self.target_selected_q_values = tf.reshape(tf.reduce_sum(tf.multiply(self.target_q_values, self.actions_placeholder, name='selected_q_values'), axis=1), [-1,1])

self._add_target_update(self.tau)
self._add_loss()
self._add_optim()
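
For reference, a small NumPy illustration (not from the repo) of how the one-hot actions_placeholder picks out Q(s, a) through multiply plus reduce_sum, and how max_q_values is the per-row greedy value; the numbers are made up.

import numpy as np

q_values = np.array([[1.0, 3.0, 2.0],      # one row of Q-values per state in the batch
                     [0.5, 0.1, 0.9]])
actions_one_hot = np.array([[0, 1, 0],     # action 1 was taken in the first state
                            [0, 0, 1]])    # action 2 was taken in the second state

selected_q = np.sum(q_values * actions_one_hot, axis=1, keepdims=True)  # [[3.0], [0.9]]
max_q = np.max(q_values, axis=1, keepdims=True)                         # [[3.0], [0.9]]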

@@ -41,38 +54,26 @@ def _add_placeholders(self):
self.actions_placeholder = tf.placeholder(shape=[None, self.num_actions], dtype=tf.float32, name='actions')
self.learning_rate = tf.placeholder(dtype=tf.float32, name='lr')
self.targets = tf.placeholder(shape=[None, 1], dtype=tf.float32, name='targets')
self.layers.append(self.state_placeholder)

def _add_model(self):
with tf.name_scope('model'):
self.hidden1 = tf.nn.relu(tf.layers.dense(self.state_placeholder, self.hidden1_size, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='hidden1'))
pdb.set_trace()
self.hidden2 = tf.nn.relu(tf.layers.dense(self.hidden1, self.hidden2_size, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='hidden2'))
self.q_values = tf.layers.dense(self.hidden2, self.num_actions, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='q_values')

self.max_q_values = tf.reshape(tf.reduce_max(self.q_values, axis=1, name='max_q_values'), [-1,1])
self.selected_q_values = tf.reshape(tf.reduce_sum(tf.multiply(self.q_values, self.actions_placeholder, name='selected_q_values'), axis=1), [-1,1])

def _add_target_model(self):
with tf.name_scope('target_model'):
self.hidden1 = tf.nn.relu(tf.layers.dense(self.state_placeholder, self.hidden1_size, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='hidden1'))
self.hidden2 = tf.nn.relu(tf.layers.dense(self.hidden1, self.hidden2_size, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='hidden2'))
self.q_values = tf.layers.dense(self.hidden2, self.num_actions, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='q_values')

self.max_q_values = tf.reshape(tf.reduce_max(self.q_values, axis=1, name='max_q_values'), [-1,1])
self.selected_q_values = tf.reshape(tf.reduce_sum(tf.multiply(self.q_values, self.actions_placeholder, name='selected_q_values'), axis=1), [-1,1])

def _add_loss(self):
with tf.name_scope("loss_fn"):
self.loss = tf.reduce_mean(tf.square(tf.subtract(self.targets, self.selected_q_values)))

if self.loss_fn == 'huber':
self.loss = tf.clip_by_value(tf.losses.huber_loss(self.targets, self.selected_q_values), -1, 1)
else:
self.loss = tf.clip_by_value(tf.losses.mean_squared_error(self.targets, self.selected_q_values), -1, 1)

def _add_optim(self):
self.optim_step = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)

def _add_target_update(self, tau):
self.update_op_holder = []
for var, target_var in zip(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.dqn_scope), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.target_dqn_scope)):
self.update_op_holder.append(target_var.assign(var.value() * tau + ((1 - tau) * var.value())))

def _start_session(self):
self.sess = tf.Session()
self.summary_op = tf.summary.merge_all()
self.writer = tf.summary.FileWriter("tensorboard/dqn/", self.sess.graph)
self.writer = tf.summary.FileWriter(os.path.join(self.logdir,"tensorboard/dqn/"), self.sess.graph)
self.writer.close()
self.sess.run(tf.global_variables_initializer())
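
_add_target_update pairs variables from the 'dqn' and 'target_dqn' scopes and builds assign ops scaled by tau. For reference, here is a minimal standalone sketch of the conventional soft (Polyak) update, which blends the online weights with the target network's own current weights, theta_target <- tau * theta_online + (1 - tau) * theta_target; only the scope names and tau come from the diff, the function name and defaults are illustrative.

import tensorflow as tf

def make_soft_update_ops(online_scope='dqn', target_scope='target_dqn', tau=0.001):
    # Assumes both networks were built in the same order, so zip pairs matching variables.
    online_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=online_scope)
    target_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=target_scope)
    ops = []
    for var, target_var in zip(online_vars, target_vars):
        ops.append(target_var.assign(tau * var + (1.0 - tau) * target_var))
    return ops

# Running the returned ops nudges the target network toward the online network:
#   for op in make_soft_update_ops(): sess.run(op)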

@@ -84,6 +85,7 @@ def train(self, env, episodes = 10, lr = 0.01, gamma = 0.95, eps = 0.01):
self._start_session()

update_steps = 0
target_update_steps = 0
for episode in range(episodes):
done = False
obs = self.env_.reset()
@@ -93,25 +95,25 @@ def train(self, env, episodes = 10, lr = 0.01, gamma = 0.95, eps = 0.01):
while not done:
step+=1
update_steps += 1
target_update_steps += 1
experience = {}
action = self.action(obs)
experience['state'] = obs
experience['action'] = action
obs, reward, done, info = self.env_.step(action)
reward_sum += reward
experience['reward'] = reward
experience['next_state'] = obs
experience['done'] = done
self.store_experience(experience)
if len(self.buffer_) > self.batch_size+1:
if len(self.buffer_) < self.batch_size+1:
continue
if update_steps == self.update_every_n:
self.update_net(self.lr)
if update_steps == self.update_target_every_n:
# update target net
print("UPDATE TARGET")
update_steps = 0
pdb.set_trace()
for var, target_var in zip(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model'), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_model')):
pass

if target_update_steps == self.update_target_every_n:
target_update_steps = 0
self.update_target()
if self.moving_reward is None:
self.moving_reward = reward_sum
else:
@@ -122,6 +124,10 @@ def train(self, env, episodes = 10, lr = 0.01, gamma = 0.95, eps = 0.01):
def test():
pass

def update_target(self):
for op in self.update_op_holder:
self.sess.run(op)

def store_experience(self, exp):
if len(self.buffer_)>=self.buffer_size:
if self.buffer_index is None:
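
store_experience appears to append until buffer_size is reached and then reuse slots through buffer_index; the diff only shows the start of the method. A minimal ring-buffer sketch of that pattern, with illustrative names that are not from the repo:

import random

class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer_ = []
        self.index = 0

    def store(self, experience):
        if len(self.buffer_) < self.capacity:
            self.buffer_.append(experience)        # still filling up
        else:
            self.buffer_[self.index] = experience  # overwrite the oldest slot
            self.index = (self.index + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.buffer_, min(batch_size, len(self.buffer_)))
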
@@ -143,6 +149,7 @@ def action(self, state):
def update_net(self, lr = 0.001):
sampled_buffer = random.sample(self.buffer_, min(self.batch_size, len(self.buffer_)))
states = np.array([x['state'] for x in sampled_buffer])
next_states = np.array([x['next_state'] for x in sampled_buffer])
rewards = np.array([x['reward'] for x in sampled_buffer]).reshape([-1, 1])
done_arr = np.array([x['done'] for x in sampled_buffer]).reshape([-1, 1])

@@ -151,7 +158,7 @@ def update_net(self, lr = 0.001):
temp_action = x['action']
actions[i, temp_action] = 1

q_vals = self.sess.run(self.q_values, feed_dict={self.state_placeholder:states})
max_q = np.amax(q_vals, axis=1).reshape([-1,1])
target_q_vals = self.sess.run(self.target_q_values, feed_dict={self.state_placeholder:next_states})
max_q = np.amax(target_q_vals, axis=1).reshape([-1,1])
targets = rewards + self.gamma * np.multiply((1-done_arr), max_q)
__, loss_ = self.sess.run([self.optim_step, self.loss], feed_dict={self.state_placeholder: states, self.actions_placeholder:actions, self.targets:targets, self.learning_rate:lr})
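
update_net now bootstraps from the target network on next_states: targets = rewards + gamma * (1 - done) * max_a Q_target(s', a). A small worked example with made-up numbers, showing the shapes and the terminal-state masking:

import numpy as np

gamma = 0.95
rewards  = np.array([[1.0], [0.0]])
done_arr = np.array([[0],   [1]])                  # the second transition is terminal
target_q = np.array([[2.0, 4.0], [3.0, 1.0]])      # Q_target(s', .) from the target network

max_q   = np.amax(target_q, axis=1).reshape([-1, 1])          # [[4.0], [3.0]]
targets = rewards + gamma * np.multiply(1 - done_arr, max_q)  # [[4.8], [0.0]]
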
113 changes: 0 additions & 113 deletions RLkit/algorithms/policy_gradients.py

This file was deleted.
