diff --git a/RLkit/__init__.py b/RLkit/__init__.py
index 6d2bbad..1d5f453 100755
--- a/RLkit/__init__.py
+++ b/RLkit/__init__.py
@@ -1,4 +1,3 @@
-print("Import from directory")
 from .algorithms.random_agent import RandomAgent
 from .algorithms.dqn import DQN
 from .algorithms.policy_gradients import REINFORCE
diff --git a/RLkit/algorithms/__init__.py b/RLkit/algorithms/__init__.py
index 9484478..28758b7 100755
--- a/RLkit/algorithms/__init__.py
+++ b/RLkit/algorithms/__init__.py
@@ -1,4 +1,4 @@
 from .random_agent import RandomAgent
 from .dqn import DQN
-from .policy_gradients import REINFORCE
+# from .policy_gradients import REINFORCE
 from .agent import Agent
\ No newline at end of file
diff --git a/RLkit/algorithms/agent.py b/RLkit/algorithms/agent.py
index 4cd5542..24d19de 100755
--- a/RLkit/algorithms/agent.py
+++ b/RLkit/algorithms/agent.py
@@ -1,5 +1,4 @@
 import tensorflow as tf
-import pdb
 
 class Agent:
     def __init__():
@@ -12,17 +11,24 @@
     def test():
         raise NotImplementedError
     def _add_model(self, scope_name='model', input_placeholder = None, network_specs=None):
+        activations_map = {
+            'linear':None,
+            'relu':tf.nn.relu,
+            'sigmoid':tf.nn.sigmoid,
+            'tanh':tf.nn.tanh
+        }
         layers = []
-        with tf.name_scope(scope_name):
+        with tf.variable_scope(scope_name):
             for ix, layer in enumerate(network_specs):
                 if layer['type']=='dense':
                     if ix==0:
-                        layer = tf.layers.dense(inputs = input_placeholder, units = layer['size'])
+                        layer = tf.layers.dense(inputs = input_placeholder, units = layer['size'], activation = activations_map[layer['activation']])
                         layers.append(layer)
+                        if ix == len(network_specs)-1:
+                            return layer
                     elif ix == len(network_specs)-1:
-                        final_layer = tf.layers.dense(inputs = layers[-1], units = layer['size'])
+                        final_layer = tf.layers.dense(inputs = layers[-1], units = layer['size'], activation = activations_map[layer['activation']])
                         return final_layer
                     else:
-                        pdb.set_trace()
-                        layer = tf.layers.dense(inputs = layers[-1], units = layer['size'])
+                        layer = tf.layers.dense(inputs = layers[-1], units = layer['size'], activation = activations_map[layer['activation']])
                         layers.append(layer)
\ No newline at end of file
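For reference, the reworked `_add_model` above looks up each layer's activation in `activations_map`, so every entry in a spec list now needs "type", "size", and "activation" keys, with "activation" one of 'linear', 'relu', 'sigmoid', or 'tanh'. A minimal illustrative sketch of a spec it accepts (not part of the patch; the commented call assumes a subclass that has already built `self.state_placeholder`):

    specs = [
        {"type": "dense", "size": 64, "activation": "relu"},
        {"type": "dense", "size": 32, "activation": "linear"},  # 'linear' maps to activation=None
    ]
    # final_layer = self._add_model('policy_net', self.state_placeholder, specs)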
diff --git a/RLkit/algorithms/dqn.py b/RLkit/algorithms/dqn.py
index 0703f38..b28ce2a 100755
--- a/RLkit/algorithms/dqn.py
+++ b/RLkit/algorithms/dqn.py
@@ -1,12 +1,21 @@
 from .agent import Agent
 import tensorflow as tf
 from .utils import *
+import pdb
+
 
 class DQN(Agent):
-    def __init__(self, env_, network_specs, buffer_size = 10000, batch_size = 128, gamma = 0.95, eps = 0.01):
+    def __init__(self, env_, network_specs, buffer_size = 10000, batch_size = 128, gamma = 0.95, eps = 0.01, update_target_every_n = 1000, update_every_n = 300, tau = 0.001, logdir = '.', loss_fn='mse'):
+        # TODO(shubham): Add option to disable Tensorboard
+        # TODO(shubham): Add logging
+        # TODO(shubham): ADD L2 REG
         self.env_ = env_
+        self.logdir = logdir
+        self.loss_fn = loss_fn
         self.network_specs = network_specs
         self.buffer_size = buffer_size
+        self.update_every_n = update_every_n
+        self.tau = tau
         self.buffer_ = []
         self.buffer_index = None
-        self.update_target_every_n = 100
+        self.update_target_every_n = update_target_every_n
@@ -19,20 +28,24 @@ def __init__(self, env_, network_specs, buffer_size = 10000, batch_size = 128, g
         self.batch_size = batch_size
         self.moving_reward = None
 
-        self.hidden1_size = 64
-        self.hidden2_size = 64
-
-        self.layers = []
+        self.dqn_scope = 'dqn'
+        self.target_dqn_scope = 'target_dqn'
 
         self._add_placeholders()
 
-        self._add_model()
-        self._add_target_model()
-        print("{} layers".format(len(self.layers)))
+
+        self.dqn_final_layer = self._add_model(self.dqn_scope, self.state_placeholder, self.network_specs)
+        self.target_dqn_final_layer = self._add_model(self.target_dqn_scope, self.state_placeholder, self.network_specs)
 
-        self.q_values = tf.layers.dense(self.layers[-1], self.num_actions, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='q_values')
-        self.max_q_values = tf.reshape(tf.reduce_max(self.q_values, axis=1, name='max_q_values'), [-1,1])
-        self.selected_q_values = tf.reshape(tf.reduce_sum(tf.multiply(self.q_values, self.actions_placeholder, name='selected_q_values'), axis=1), [-1,1])
+        with tf.variable_scope(self.dqn_scope):
+            self.q_values = tf.layers.dense(self.dqn_final_layer, self.num_actions, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='q_values')
+            self.max_q_values = tf.reshape(tf.reduce_max(self.q_values, axis=1, name='max_q_values'), [-1,1])
+            self.selected_q_values = tf.reshape(tf.reduce_sum(tf.multiply(self.q_values, self.actions_placeholder, name='selected_q_values'), axis=1), [-1,1])
+        with tf.variable_scope(self.target_dqn_scope):
+            self.target_q_values = tf.layers.dense(self.target_dqn_final_layer, self.num_actions, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='q_values')
+            self.target_max_q_values = tf.reshape(tf.reduce_max(self.target_q_values, axis=1, name='max_q_values'), [-1,1])
+            self.target_selected_q_values = tf.reshape(tf.reduce_sum(tf.multiply(self.target_q_values, self.actions_placeholder, name='selected_q_values'), axis=1), [-1,1])
+        self._add_target_update(self.tau)
 
         self._add_loss()
         self._add_optim()
@@ -41,38 +54,26 @@ def _add_placeholders(self):
         self.actions_placeholder = tf.placeholder(shape=[None, self.num_actions], dtype=tf.float32, name='actions')
         self.learning_rate = tf.placeholder(dtype=tf.float32, name='lr')
         self.targets = tf.placeholder(shape=[None, 1], dtype=tf.float32, name='targets')
-        self.layers.append(self.state_placeholder)
-
-    def _add_model(self):
-        with tf.name_scope('model'):
-            self.hidden1 = tf.nn.relu(tf.layers.dense(self.state_placeholder, self.hidden1_size, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='hidden1'))
-            pdb.set_trace()
-            self.hidden2 = tf.nn.relu(tf.layers.dense(self.hidden1, self.hidden2_size, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='hidden2'))
-            self.q_values = tf.layers.dense(self.hidden2, self.num_actions, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='q_values')
-
-            self.max_q_values = tf.reshape(tf.reduce_max(self.q_values, axis=1, name='max_q_values'), [-1,1])
-            self.selected_q_values = tf.reshape(tf.reduce_sum(tf.multiply(self.q_values, self.actions_placeholder, name='selected_q_values'), axis=1), [-1,1])
-
-    def _add_target_model(self):
-        with tf.name_scope('target_model'):
-            self.hidden1 = tf.nn.relu(tf.layers.dense(self.state_placeholder, self.hidden1_size, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='hidden1'))
-            self.hidden2 = tf.nn.relu(tf.layers.dense(self.hidden1, self.hidden2_size, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='hidden2'))
-            self.q_values = tf.layers.dense(self.hidden2, self.num_actions, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='q_values')
-
-            self.max_q_values = tf.reshape(tf.reduce_max(self.q_values, axis=1, name='max_q_values'), [-1,1])
-            self.selected_q_values = tf.reshape(tf.reduce_sum(tf.multiply(self.q_values, self.actions_placeholder, name='selected_q_values'), axis=1), [-1,1])
 
     def _add_loss(self):
         with tf.name_scope("loss_fn"):
-            self.loss = tf.reduce_mean(tf.square(tf.subtract(self.targets, self.selected_q_values)))
-
+            if self.loss_fn == 'huber':
+                self.loss = tf.clip_by_value(tf.losses.huber_loss(self.targets, self.selected_q_values), -1, 1)
+            else:
+                self.loss = tf.clip_by_value(tf.losses.mean_squared_error(self.targets, self.selected_q_values), -1, 1)
+
     def _add_optim(self):
         self.optim_step = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
 
+    def _add_target_update(self, tau):
+        self.update_op_holder = []
+        for var, target_var in zip(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.dqn_scope), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.target_dqn_scope)):
+            self.update_op_holder.append(target_var.assign(tau * var.value() + (1 - tau) * target_var.value()))
+
     def _start_session(self):
         self.sess = tf.Session()
         self.summary_op = tf.summary.merge_all()
-        self.writer = tf.summary.FileWriter("tensorboard/dqn/", self.sess.graph)
+        self.writer = tf.summary.FileWriter(os.path.join(self.logdir,"tensorboard/dqn/"), self.sess.graph)
         self.writer.close()
 
         self.sess.run(tf.global_variables_initializer())
@@ -84,6 +85,7 @@ def train(self, env, episodes = 10, lr = 0.01, gamma = 0.95, eps = 0.01):
         self._start_session()
 
         update_steps = 0
+        target_update_steps = 0
         for episode in range(episodes):
             done = False
             obs = self.env_.reset()
@@ -93,6 +95,7 @@ def train(self, env, episodes = 10, lr = 0.01, gamma = 0.95, eps = 0.01):
             while not done:
                 step+=1
                 update_steps += 1
+                target_update_steps += 1
                 experience = {}
                 action = self.action(obs)
                 experience['state'] = obs
@@ -100,18 +103,17 @@ def train(self, env, episodes = 10, lr = 0.01, gamma = 0.95, eps = 0.01):
                 obs, reward, done, info = self.env_.step(action)
                 reward_sum += reward
                 experience['reward'] = reward
+                experience['next_state'] = obs
                 experience['done'] = done
                 self.store_experience(experience)
-                if len(self.buffer_) > self.batch_size+1:
+                if len(self.buffer_) < self.batch_size+1:
+                    continue
+                if update_steps == self.update_every_n:
                     self.update_net(self.lr)
-                    if update_steps == self.update_target_every_n:
-                        # update target net
-                        print("UPDATE TARGET")
                     update_steps = 0
-                    pdb.set_trace()
-                    for var, target_var in zip(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model'), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_model')):
-                        pass
-
+                if target_update_steps == self.update_target_every_n:
+                    target_update_steps = 0
+                    self.update_target()
             if self.moving_reward is None:
                 self.moving_reward = reward_sum
             else:
@@ -122,6 +124,10 @@ def train(self, env, episodes = 10, lr = 0.01, gamma = 0.95, eps = 0.01):
     def test():
         pass
 
+    def update_target(self):
+        for op in self.update_op_holder:
+            self.sess.run(op)
+
     def store_experience(self, exp):
         if len(self.buffer_)>=self.buffer_size:
             if self.buffer_index is None:
@@ -143,6 +149,7 @@ def action(self, state):
     def update_net(self, lr = 0.001):
         sampled_buffer = random.sample(self.buffer_, min(self.batch_size, len(self.buffer_)))
         states = np.array([x['state'] for x in sampled_buffer])
+        next_states = np.array([x['next_state'] for x in sampled_buffer])
         rewards = np.array([x['reward'] for x in sampled_buffer]).reshape([-1, 1])
         done_arr = np.array([x['done'] for x in sampled_buffer]).reshape([-1, 1])
 
@@ -151,7 +158,7 @@ def update_net(self, lr = 0.001):
             temp_action = x['action']
             actions[i, temp_action] = 1
 
-        q_vals = self.sess.run(self.q_values, feed_dict={self.state_placeholder:states})
-        max_q = np.amax(q_vals, axis=1).reshape([-1,1])
+        target_q_vals = self.sess.run(self.target_q_values, feed_dict={self.state_placeholder:next_states})
+        max_q = np.amax(target_q_vals, axis=1).reshape([-1,1])
         targets = rewards + self.gamma * np.multiply((1-done_arr), max_q)
-        __, loss_ = self.sess.run([self.optim_step, self.loss], feed_dict={self.state_placeholder: states, self.actions_placeholder:actions, self.targets:targets, self.learning_rate:lr})
\ No newline at end of file
+        __, loss_ = self.sess.run([self.optim_step, self.loss], feed_dict={self.state_placeholder: states, self.actions_placeholder:actions, self.targets:targets, self.learning_rate:lr})
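For reference, `_add_target_update` above builds one assign op per variable pair and `update_target` runs them every `update_target_every_n` steps. A minimal NumPy sketch of that soft (Polyak) blend, illustrative only and not part of the patch:

    import numpy as np

    def soft_update(target_weights, online_weights, tau=0.001):
        # target <- tau * online + (1 - tau) * target, applied weight tensor by weight tensor
        return [tau * w + (1.0 - tau) * t for w, t in zip(online_weights, target_weights)]

    target = [np.zeros((4, 64)), np.zeros(64)]
    online = [np.ones((4, 64)), np.ones(64)]
    target = soft_update(target, online, tau=0.001)  # each entry moves 0.1% toward the online value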
self.writer = tf.summary.FileWriter("tensorboard/pg/", self.sess.graph) - self.writer.close() - self.sess.run(tf.global_variables_initializer()) - - def train(self, episodes = 10, lr = 0.01, gamma = 0.95, update_steps = 10): - self.gamma = gamma - self.lr = lr - all_moving_rewards=[] - - self._start_session() - - for episode in range(episodes): - done = False - obs = self.env_.reset() - step = 0 - ep_start_time = time() - self.buffer_ = [] - while not done: - step+=1 - temp = {} - action = self.action(obs) - temp['state'] = obs - temp['action'] = action - obs, reward, done, info = self.env_.step(action) - temp['reward'] = reward - self.buffer_.append(temp) - if self.moving_reward is None: - self.moving_reward = float(sum(x['reward'] for x in self.buffer_)) - else: - self.moving_reward = 0.99 * self.moving_reward + 0.01 * float(sum(x['reward'] for x in self.buffer_)) - all_moving_rewards.append(self.moving_reward) - print("Episode:", episode, "Steps:", step, "reward:", self.moving_reward, "lr", self.lr, "Time:", time()-ep_start_time) - self.update_net(self.lr) - - def action(self, state): - action_probs = self.sess.run(self.action_probs, feed_dict={self.state_placeholder:np.array(state).reshape(1, -1)}) - action = np.random.choice(list(range(self.num_actions)), p=action_probs[0]) - return action - - def update_net(self, lr = 0.001): - states = np.array([x['state'] for x in self.buffer_]) - rewards = np.array([x['reward'] for x in self.buffer_]) - - discounted_r = np.zeros_like(rewards) - running_add = 0 - for t in reversed(range(0, rewards.size)): - running_add = running_add * self.gamma + rewards[t] - discounted_r[t] = running_add - returns = discounted_r.reshape([-1, 1]) - - actions = np.zeros([len(self.buffer_), self.num_actions]) - for i, x in enumerate(self.buffer_): - temp_action = x['action'] - actions[i, temp_action] = 1 - - __, loss_ = self.sess.run([self.optim_step, self.loss], feed_dict={self.state_placeholder: states, self.returns_placeholder:returns, self.actions_placeholder:actions, self.learning_rate:lr}) - - - def test(): - pass \ No newline at end of file diff --git a/RLkit/algorithms/policy_gradients/ActorCritic.py b/RLkit/algorithms/policy_gradients/ActorCritic.py new file mode 100644 index 0000000..5bae0e5 --- /dev/null +++ b/RLkit/algorithms/policy_gradients/ActorCritic.py @@ -0,0 +1,119 @@ +from ..agent import Agent +from ..utils import * + +class ActorCritic(Agent): + def __init__(self, env_, actor_specs, critic_specs=None, gamma = 0.95, logdir = '.', inertia = 0.99): + self.env_ = env_ + self.inertia = inertia + self.actor_specs = actor_specs + self.critic_specs = critic_specs + self.logdir = logdir + self.gamma = gamma + self.action_space = self.env_.env.action_space + self.num_actions = self.action_space.n + self.state_size = self.env_.env.observation_space.shape[0] + self.moving_reward = None + + self._add_placeholders() + + # Add models + self.policy_final_layer = self._add_model('actor', self.state_placeholder, actor_specs) + self.value_final_layer = self._add_model('critic', self.state_placeholder, critic_specs) + self.state_values = tf.layers.dense(self.value_final_layer, 1, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='state_values') + + self.action_logits = tf.layers.dense(self.policy_final_layer, self.num_actions, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='action_logits') + self.action_probs = tf.nn.softmax(self.action_logits, axis=1, name='action_probs') + self.log_likelihood = 
+        self.log_likelihood = tf.log(tf.clip_by_value(self.action_probs, 0.000001, 0.999999, name='clip'), name='log_likelihood')
+
+        self._add_loss()
+        self._add_optim()
+
+    def _add_placeholders(self):
+        self.state_placeholder = tf.placeholder(shape=[None, self.state_size], dtype=tf.float32, name='state')
+        self.returns_placeholder = tf.placeholder(shape=[None, 1], dtype=tf.float32, name='returns')
+        self.actions_placeholder = tf.placeholder(shape=[None, self.num_actions], dtype=tf.float32, name='actions')
+        self.target_state_val = tf.placeholder(shape=[None, 1], dtype=tf.float32, name='target_state_val')
+        # Learning rates
+        self.actor_learning_rate = tf.placeholder(dtype=tf.float32, name='actor_lr')
+        self.critic_learning_rate = tf.placeholder(dtype=tf.float32, name='critic_lr')
+
+    def _add_loss(self):
+        with tf.name_scope("loss_fn"):
+            self.actor_loss = -tf.reduce_mean(tf.multiply(tf.subtract(self.returns_placeholder, self.state_values), tf.reshape(tf.reduce_sum(tf.multiply(self.log_likelihood, self.actions_placeholder), axis=1), [-1, 1])), axis=0)
+            self.critic_loss = tf.losses.mean_squared_error(self.target_state_val, self.state_values)
+
+    def _add_optim(self):
+        self.actor_optim_step = tf.train.AdamOptimizer(learning_rate = self.actor_learning_rate).minimize(self.actor_loss)
+        self.critic_optim_step = tf.train.AdamOptimizer(learning_rate = self.critic_learning_rate).minimize(self.critic_loss)
+
+    def _start_session(self):
+        self.sess = tf.Session()
+        self.summary_op = tf.summary.merge_all()
+        self.writer = tf.summary.FileWriter(os.path.join(self.logdir, "tensorboard/AC/"), self.sess.graph)
+        self.writer.close()
+
+        self.sess.run(tf.global_variables_initializer())
+
+    def train(self, episodes = 10, actor_lr = 0.01, critic_lr = 0.1, gamma = 0.95, update_steps = 10):
+        self.gamma = gamma
+        all_moving_rewards = []
+
+        self._start_session()
+
+        for episode in range(episodes):
+            done = False
+            obs = self.env_.reset()
+            step = 0
+            ep_start_time = time()
+            self.buffer_ = []
+            while not done:
+                step+=1
+                experience = {}
+                action = self.action(obs)
+                experience['state'] = obs
+                experience['action'] = action
+                obs, reward, done, info = self.env_.step(action)
+                experience['reward'] = reward
+                experience['next_state'] = obs
+                self.buffer_.append(experience)
+            if self.moving_reward is None:
+                self.moving_reward = float(sum(x['reward'] for x in self.buffer_))
+            else:
+                self.moving_reward = self.inertia * self.moving_reward + (1-self.inertia) * float(sum(x['reward'] for x in self.buffer_))
+            all_moving_rewards.append(self.moving_reward)
+            print("Episode:{}\t Steps:{}\t Reward:{}\t Time:{}".format(episode, step, self.moving_reward, time()-ep_start_time))
+            self.update_net(actor_lr, critic_lr)
+
+    def action(self, state):
+        action_probs = self.sess.run(self.action_probs, feed_dict={self.state_placeholder:np.array(state).reshape(1, -1)})
+        action = np.random.choice(list(range(self.num_actions)), p=action_probs[0])
+        return action
+
+    def update_net(self, actor_lr, critic_lr):
+        states = np.array([x['state'] for x in self.buffer_])
+        rewards = np.array([x['reward'] for x in self.buffer_])
+        next_states = np.array([x['next_state'] for x in self.buffer_])
+
+        next_state_val = self.sess.run(self.state_values, feed_dict={self.state_placeholder:next_states})
+        # temp = self.sess.run(self.state_values, feed_dict={self.state_placeholder:states})
+
+        target_state_val = rewards.reshape([-1,1]) + self.gamma*next_state_val
+
+        discounted_r = np.zeros_like(rewards)
+        running_add = 0
+        for t in reversed(range(0, rewards.size)):
+            running_add = running_add * self.gamma + rewards[t]
+            discounted_r[t] = running_add
+        returns = discounted_r.reshape([-1, 1])
+
+        actions = np.zeros([len(self.buffer_), self.num_actions])
+        for i, x in enumerate(self.buffer_):
+            temp_action = x['action']
+            actions[i, temp_action] = 1
+
+        a_, c_, actor_loss_, critic_loss_ = self.sess.run([self.actor_optim_step, self.critic_optim_step, self.actor_loss,
+            self.critic_loss], feed_dict={self.state_placeholder: states,
+            self.returns_placeholder:returns, self.actions_placeholder:actions, self.actor_learning_rate:actor_lr,
+            self.critic_learning_rate:critic_lr, self.target_state_val:target_state_val})
+
+    def test():
+        pass
\ No newline at end of file
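For reference, ActorCritic above and REINFORCE below both compute discounted Monte-Carlo returns with the same reversed loop over episode rewards. A standalone NumPy sketch of that computation, illustrative only and not part of the patch:

    import numpy as np

    def discounted_returns(rewards, gamma):
        # Walk backwards so each step's return bootstraps off the following step's return.
        returns = np.zeros_like(rewards, dtype=np.float64)
        running_add = 0.0
        for t in reversed(range(rewards.size)):
            running_add = running_add * gamma + rewards[t]
            returns[t] = running_add
        return returns

    print(discounted_returns(np.array([1.0, 1.0, 1.0]), 0.9))  # [2.71 1.9  1.  ]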
diff --git a/RLkit/algorithms/policy_gradients/REINFORCE.py b/RLkit/algorithms/policy_gradients/REINFORCE.py
new file mode 100755
index 0000000..672b791
--- /dev/null
+++ b/RLkit/algorithms/policy_gradients/REINFORCE.py
@@ -0,0 +1,117 @@
+from ..agent import Agent
+from ..utils import *
+
+class REINFORCE(Agent):
+    def __init__(self, env_, network_specs, value_estimator_specs=None, gamma = 0.95, logdir = '.', inertia = 0.99):
+        self.env_ = env_
+        self.inertia = inertia
+        self.network_specs = network_specs
+        self.use_baseline = False
+        if value_estimator_specs is not None:
+            self.value_estimator_specs = value_estimator_specs
+            self.use_baseline = True
+        self.logdir = logdir
+        self.gamma = gamma
+        self.action_space = self.env_.env.action_space
+        self.num_actions = self.action_space.n
+        self.state_size = self.env_.env.observation_space.shape[0]
+        self.moving_reward = None
+
+        self._add_placeholders()
+
+        # Add models
+        self.policy_final_layer = self._add_model('policy_net', self.state_placeholder, network_specs)
+        if self.use_baseline:
+            self.value_final_layer = self._add_model('value_estimator', self.state_placeholder, value_estimator_specs)
+            self.state_values = tf.layers.dense(self.value_final_layer, 1, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='state_values')
+
+        self.action_logits = tf.layers.dense(self.policy_final_layer, self.num_actions, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='action_logits')
+        self.action_probs = tf.nn.softmax(self.action_logits, axis=1, name='action_probs')
+        self.log_likelihood = tf.log(tf.clip_by_value(self.action_probs, 0.000001, 0.999999, name='clip'), name='log_likelihood')
+
+        self._add_loss()
+        self._add_optim()
+
+    def _add_placeholders(self):
+        self.state_placeholder = tf.placeholder(shape=[None, self.state_size], dtype=tf.float32, name='state')
+        self.returns_placeholder = tf.placeholder(shape=[None, 1], dtype=tf.float32, name='returns')
+        self.actions_placeholder = tf.placeholder(shape=[None, self.num_actions], dtype=tf.float32, name='actions')
+        self.learning_rate = tf.placeholder(dtype=tf.float32, name='lr')
+        self.value_est_learning_rate = tf.placeholder(dtype=tf.float32, name='value_est_lr')
+
+    def _add_loss(self):
+        with tf.name_scope("loss_fn"):
+            if self.use_baseline:
+                self.loss = -tf.reduce_mean(tf.multiply(tf.subtract(self.returns_placeholder, self.state_values), tf.reshape(tf.reduce_sum(tf.multiply(self.log_likelihood, self.actions_placeholder), axis=1), [-1, 1])), axis=0)
+                self.value_estimator_loss = tf.losses.mean_squared_error(self.returns_placeholder, self.state_values)
+            else:
+                self.loss = -tf.reduce_mean(tf.multiply(self.returns_placeholder, tf.reshape(tf.reduce_sum(tf.multiply(self.log_likelihood, self.actions_placeholder), axis=1), [-1, 1])), axis=0)
+
+    def _add_optim(self):
+        self.optim_step = tf.train.AdamOptimizer(learning_rate = self.learning_rate).minimize(self.loss)
+        if self.use_baseline:
+            self.optim_step_value_est = tf.train.AdamOptimizer(learning_rate = self.value_est_learning_rate).minimize(self.value_estimator_loss)
+
+    def _start_session(self):
+        self.sess = tf.Session()
+        self.summary_op = tf.summary.merge_all()
+        self.writer = tf.summary.FileWriter(os.path.join(self.logdir, "tensorboard/pg/"), self.sess.graph)
+        self.writer.close()
+
+        self.sess.run(tf.global_variables_initializer())
+
+    def train(self, episodes = 10, lr = 0.01, value_est_lr = 0.01, gamma = 0.95, update_steps = 10):
+        self.gamma = gamma
+        self.lr = lr
+        all_moving_rewards=[]
+
+        self._start_session()
+
+        for episode in range(episodes):
+            done = False
+            obs = self.env_.reset()
+            step = 0
+            ep_start_time = time()
+            self.buffer_ = []
+            while not done:
+                step+=1
+                temp = {}
+                action = self.action(obs)
+                temp['state'] = obs
+                temp['action'] = action
+                obs, reward, done, info = self.env_.step(action)
+                temp['reward'] = reward
+                self.buffer_.append(temp)
+            if self.moving_reward is None:
+                self.moving_reward = float(sum(x['reward'] for x in self.buffer_))
+            else:
+                self.moving_reward = self.inertia * self.moving_reward + (1-self.inertia) * float(sum(x['reward'] for x in self.buffer_))
+            all_moving_rewards.append(self.moving_reward)
+            print("Episode:{}\t Steps:{}\t Reward:{}\t Time:{}".format(episode, step, self.moving_reward, time()-ep_start_time))
+            self.update_net(lr, value_est_lr)
+
+    def action(self, state):
+        action_probs = self.sess.run(self.action_probs, feed_dict={self.state_placeholder:np.array(state).reshape(1, -1)})
+        action = np.random.choice(list(range(self.num_actions)), p=action_probs[0])
+        return action
+
+    def update_net(self, lr, value_est_lr):
+        states = np.array([x['state'] for x in self.buffer_])
+        rewards = np.array([x['reward'] for x in self.buffer_])
+
+        discounted_r = np.zeros_like(rewards)
+        running_add = 0
+        for t in reversed(range(0, rewards.size)):
+            running_add = running_add * self.gamma + rewards[t]
+            discounted_r[t] = running_add
+        returns = discounted_r.reshape([-1, 1])
+
+        actions = np.zeros([len(self.buffer_), self.num_actions])
+        for i, x in enumerate(self.buffer_):
+            temp_action = x['action']
+            actions[i, temp_action] = 1
+
+        __, v__, loss_, loss_value_est = self.sess.run([self.optim_step, self.optim_step_value_est, self.loss, self.value_estimator_loss], feed_dict={self.state_placeholder: states,
+            self.returns_placeholder:returns, self.actions_placeholder:actions, self.learning_rate:lr, self.value_est_learning_rate:value_est_lr})
+
+    def test():
+        pass
\ No newline at end of file
diff --git a/RLkit/algorithms/policy_gradients/__init__.py b/RLkit/algorithms/policy_gradients/__init__.py
new file mode 100644
index 0000000..86d71ee
--- /dev/null
+++ b/RLkit/algorithms/policy_gradients/__init__.py
@@ -0,0 +1,2 @@
+from .ActorCritic import ActorCritic
+from .REINFORCE import REINFORCE
\ No newline at end of file
+ "activation":"sigmoid" + } +] + +env_ = Environment(env_name="CartPole-v1", render = False) +agent = ActorCritic(env_, actor_specs, critic_specs) +agent.train(episodes=1000, actor_lr=0.001, critic_lr=0.1, gamma=1) \ No newline at end of file diff --git a/examples/DQN_run.py b/examples/DQN_run.py index ea4ed60..85b55fa 100755 --- a/examples/DQN_run.py +++ b/examples/DQN_run.py @@ -1,14 +1,14 @@ import numpy as np import os, sys -# sys.path.insert(0, 'C:/Users/Shubham/Documents/Shubham/Projects/rllib/rllib') +import RLkit from RLkit.environment import Environment -from RLkit.algorithms import RandomAgent, REINFORCE, DQN +from RLkit.algorithms import DQN network_specs = [ { "type": "dense", "size": 64, - "activation":"relu" + "activation":"sigmoid" }, { "type": "dense", @@ -17,5 +17,5 @@ } ] env_ = Environment(env_name="CartPole-v1", render = False) -agent = DQN(env_, network_specs, buffer_size = 5000, batch_size = 128) -agent.train(env_, episodes=6000, lr=0.01, gamma=1) \ No newline at end of file +agent = DQN(env_, network_specs, buffer_size = 100000, batch_size = 10, tau=0.001, update_target_every_n = 2000, eps=0.9, update_every_n = 200) +agent.train(env_, episodes=1000, lr=0.01, gamma=1) \ No newline at end of file diff --git a/examples/REINFORCE_baseline_run.py b/examples/REINFORCE_baseline_run.py index 1d612ad..79f7204 100755 --- a/examples/REINFORCE_baseline_run.py +++ b/examples/REINFORCE_baseline_run.py @@ -2,7 +2,7 @@ import os, sys import RLkit from RLkit.environment import Environment -from RLkit.algorithms import REINFORCE +from RLkit.algorithms.policy_gradients import REINFORCE network_specs = [ { diff --git a/examples/REINFORCE_run.py b/examples/REINFORCE_run.py index 4fe5620..0be8cf5 100755 --- a/examples/REINFORCE_run.py +++ b/examples/REINFORCE_run.py @@ -2,7 +2,7 @@ import os, sys import RLkit from RLkit.environment import Environment -from RLkit.algorithms import REINFORCE +from RLkit.algorithms.policy_gradients import REINFORCE network_specs = [ {