Commit

added REINFORCE with baseline
shubhamjha97 committed Nov 9, 2018
1 parent 2dfa3c8 commit 390cdf0
Showing 6 changed files with 68 additions and 26 deletions.
1 change: 1 addition & 0 deletions RLkit/__init__.py
@@ -1,3 +1,4 @@
print("Import from directory")
from .algorithms.random_agent import RandomAgent
from .algorithms.dqn import DQN
from .algorithms.policy_gradients import REINFORCE
21 changes: 14 additions & 7 deletions RLkit/algorithms/agent.py
@@ -1,4 +1,5 @@
 import tensorflow as tf
+import pdb

 class Agent:
     def __init__():
@@ -10,12 +11,18 @@ def train():
     def test():
         raise NotImplementedError

-    def _add_model(self, scope_name = 'model'):
-        print(scope_name)
+    def _add_model(self, scope_name='model', input_placeholder = None, network_specs=None):
+        layers = []
         with tf.name_scope(scope_name):
-            for ix, layer in enumerate(self.network_specs):
+            for ix, layer in enumerate(network_specs):
                 if layer['type']=='dense':
-                    current_layer = tf.layers.dense(inputs = self.layers[-1], units = layer['size'], name = "dense_{}".format(ix))
-                elif layer["type"]=="conv":
-                    pdb.set_trace()
-                self.layers.append(current_layer)
+                    if ix==0:
+                        layer = tf.layers.dense(inputs = input_placeholder, units = layer['size'])
+                        layers.append(layer)
+                    elif ix == len(network_specs)-1:
+                        final_layer = tf.layers.dense(inputs = layers[-1], units = layer['size'])
+                        return final_layer
+                    else:
+                        pdb.set_trace()
+                        layer = tf.layers.dense(inputs = layers[-1], units = layer['size'])
+                        layers.append(layer)
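For reference, a minimal standalone sketch of how the reworked _add_model is meant to be driven, assuming TensorFlow 1.x; the spec list, placeholder shape, and the build_mlp name below are illustrative and not part of the commit (note that, as in the commit, the "activation" field of each spec is not applied):

import tensorflow as tf

# Hypothetical spec list in the same format the examples use.
network_specs = [
    {"type": "dense", "size": 64, "activation": "relu"},
    {"type": "dense", "size": 32, "activation": "relu"},
]

# CartPole-sized observation placeholder (illustrative shape).
state_placeholder = tf.placeholder(tf.float32, shape=[None, 4], name="state")

def build_mlp(scope_name, input_placeholder, specs):
    # Same idea as the new Agent._add_model: stack dense layers and
    # return the final one; the first layer consumes the placeholder.
    layers = [input_placeholder]
    with tf.name_scope(scope_name):
        for spec in specs:
            if spec["type"] == "dense":
                layers.append(tf.layers.dense(inputs=layers[-1], units=spec["size"]))
    return layers[-1]

final_hidden = build_mlp("policy_net", state_placeholder, network_specs)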
4 changes: 0 additions & 4 deletions RLkit/algorithms/gpi.py

This file was deleted.

30 changes: 16 additions & 14 deletions RLkit/algorithms/policy_gradients.py
@@ -2,9 +2,13 @@
 from .utils import *

 class REINFORCE(Agent):
-    def __init__(self, env_, network_specs, gamma = 0.95):
+    def __init__(self, env_, network_specs, value_estimator_specs=None, gamma = 0.95):
         self.env_ = env_
         self.network_specs = network_specs
+        self.use_baseline = False
+        if value_estimator_specs is not None:
+            self.value_estimator_specs = value_estimator_specs
+            self.use_baseline = True
         self.gamma = gamma
         self.action_space = self.env_.env.action_space
         self.num_actions = self.action_space.n
@@ -14,12 +18,16 @@ def __init__(self, env_, network_specs, gamma = 0.95):
         self.layers = []

         self._add_placeholders()
-        self._add_model()
-        print("{} layers".format(len(self.layers)))

-        self.action_logits = tf.layers.dense(self.layers[-1], self.num_actions, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='action_logits')
+        self.policy_final_layer = self._add_model('policy_net', self.state_placeholder, network_specs)
+        if self.use_baseline:
+            self.value_final_layer = self._add_model('value_estimator', self.state_placeholder, value_estimator_specs)
+
+        self.action_logits = tf.layers.dense(self.policy_final_layer, self.num_actions, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='action_logits')
         self.action_probs = tf.nn.softmax(self.action_logits, axis=1, name='action_probs')
         self.log_likelihood = tf.log(tf.clip_by_value(self.action_probs, 0.000001, 0.999999, name='clip'), name='log_likelihood')
+        if self.use_baseline:
+            self.state_values = tf.layers.dense(self.value_final_layer, 1, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='state_values')

         self._add_loss()
         self._add_optim()
@@ -29,19 +37,13 @@ def _add_placeholders(self):
         self.returns_placeholder = tf.placeholder(shape=[None, 1], dtype=tf.float32, name='returns')
         self.actions_placeholder = tf.placeholder(shape=[None, self.num_actions], dtype=tf.float32, name='actions')
         self.learning_rate = tf.placeholder(dtype=tf.float32, name='lr')
-        self.layers.append(self.state_placeholder)
-
-
-    # def _add_model(self):
-    #     self.hidden1 = tf.nn.relu(tf.layers.dense(self.state_placeholder, self.hidden1_size, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='hidden1'))
-    #     self.hidden2 = tf.nn.relu(tf.layers.dense(self.hidden1, self.hidden2_size, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='hidden2'))
-    #     self.action_logits = tf.layers.dense(self.hidden2, self.num_actions, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='action_logits')
-    #     self.action_probs = tf.nn.softmax(self.action_logits, axis=1, name='action_probs')
-    #     self.log_likelihood = tf.log(tf.clip_by_value(self.action_probs, 0.000001, 0.999999, name='clip'), name='log_likelihood')

     def _add_loss(self):
         with tf.name_scope("loss_fn"):
-            self.loss = -tf.reduce_mean(tf.multiply(self.returns_placeholder, tf.reshape(tf.reduce_sum(tf.multiply(self.log_likelihood, self.actions_placeholder), axis=1), [-1, 1])), axis=0)
+            if self.use_baseline:
+                self.loss = -tf.reduce_mean(tf.multiply(tf.subtract(self.returns_placeholder, self.state_values), tf.reshape(tf.reduce_sum(tf.multiply(self.log_likelihood, self.actions_placeholder), axis=1), [-1, 1])), axis=0)
+            else:
+                self.loss = -tf.reduce_mean(tf.multiply(self.returns_placeholder, tf.reshape(tf.reduce_sum(tf.multiply(self.log_likelihood, self.actions_placeholder), axis=1), [-1, 1])), axis=0)

     def _add_optim(self):
         self.optim_step = tf.train.AdamOptimizer(learning_rate = self.learning_rate).minimize(self.loss)
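In words, the baseline variant weights each step's log-probability by the return minus the predicted state value rather than by the raw return. A small NumPy sketch of the same computation (shapes and numbers are illustrative; the commit builds this in TensorFlow):

import numpy as np

# One batch of 3 timesteps in a 2-action environment (made-up numbers).
returns      = np.array([[10.0], [6.0], [1.0]])                 # G_t, shape (T, 1)
state_values = np.array([[8.0], [5.0], [2.0]])                  # baseline V(s_t), shape (T, 1)
action_probs = np.array([[0.7, 0.3], [0.4, 0.6], [0.5, 0.5]])
actions      = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0]])   # one-hot actions taken

log_likelihood = np.log(np.clip(action_probs, 1e-6, 1 - 1e-6))
# log-probability of the action actually taken at each step, shape (T, 1)
chosen_logp = np.sum(log_likelihood * actions, axis=1, keepdims=True)

advantage = returns - state_values            # baseline-subtracted weight
loss = -np.mean(advantage * chosen_logp)      # REINFORCE-with-baseline policy loss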
36 changes: 36 additions & 0 deletions examples/REINFORCE_baseline_run.py
@@ -0,0 +1,36 @@
+import numpy as np
+import os, sys
+import RLkit
+from RLkit.environment import Environment
+from RLkit.algorithms import REINFORCE
+
+network_specs = [
+    {
+        "type": "dense",
+        "size": 64,
+        "activation":"relu"
+    },
+    {
+        "type": "dense",
+        "size": 32,
+        "activation":"relu"
+    }
+]
+
+
+value_estimator_specs = [
+    {
+        "type": "dense",
+        "size": 64,
+        "activation":"relu"
+    },
+    {
+        "type": "dense",
+        "size": 32,
+        "activation":"relu"
+    }
+]
+
+env_ = Environment(env_name="CartPole-v1", render = False)
+agent = REINFORCE(env_, network_specs, value_estimator_specs)
+agent.train(episodes=1000, lr=0.001, gamma=1)
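The train() implementation itself is not part of this diff; for context, the values fed into returns_placeholder in REINFORCE are typically the rewards-to-go of each episode, which a helper along these lines could compute (function name hypothetical, gamma=1 as in the call above):

def rewards_to_go(rewards, gamma=1.0):
    # G_t = r_t + gamma * r_{t+1} + ... , computed backwards over one episode.
    returns, running = [], 0.0
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    return list(reversed(returns))

# e.g. rewards_to_go([1.0, 1.0, 1.0]) == [3.0, 2.0, 1.0]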
2 changes: 1 addition & 1 deletion examples/REINFORCE_run.py
@@ -18,4 +18,4 @@
 ]
 env_ = Environment(env_name="CartPole-v1", render = False)
 agent = REINFORCE(env_, network_specs)
-agent.train(episodes=6000, lr=0.01, gamma=1)
+agent.train(episodes=1000, lr=0.001, gamma=1)
