From 4571b7fa26e900bd25d993f1274ebdf1e0117426 Mon Sep 17 00:00:00 2001 From: Shubham Jha Date: Sun, 7 Jun 2020 21:36:24 +0530 Subject: [PATCH 1/6] Add venv to gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index b947a90..bd4ec27 100755 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ sftp-config.json tensorboard/* __pycache__/* -TO_DO.txt \ No newline at end of file +TO_DO.txt +venv/* \ No newline at end of file From 339ccff1cc53652c0efcb65290eee63032db2092 Mon Sep 17 00:00:00 2001 From: Shubham Jha Date: Mon, 8 Jun 2020 00:47:36 +0530 Subject: [PATCH 2/6] Rename library to rlkit --- examples/ActorCritic_run.py | 6 +++--- examples/DQN_run.py | 6 +++--- examples/REINFORCE_baseline_run.py | 6 +++--- examples/REINFORCE_run.py | 6 +++--- {RLkit => rlkit}/__init__.py | 0 {RLkit => rlkit}/algorithms/__init__.py | 0 {RLkit => rlkit}/algorithms/agent.py | 0 {RLkit => rlkit}/algorithms/dqn.py | 0 {RLkit => rlkit}/algorithms/policy_gradients/ActorCritic.py | 0 {RLkit => rlkit}/algorithms/policy_gradients/REINFORCE.py | 0 {RLkit => rlkit}/algorithms/policy_gradients/__init__.py | 0 {RLkit => rlkit}/algorithms/random_agent.py | 0 {RLkit => rlkit}/algorithms/utils.py | 0 {RLkit => rlkit}/environment.py | 0 setup.py | 2 +- 15 files changed, 13 insertions(+), 13 deletions(-) rename {RLkit => rlkit}/__init__.py (100%) rename {RLkit => rlkit}/algorithms/__init__.py (100%) rename {RLkit => rlkit}/algorithms/agent.py (100%) rename {RLkit => rlkit}/algorithms/dqn.py (100%) rename {RLkit => rlkit}/algorithms/policy_gradients/ActorCritic.py (100%) rename {RLkit => rlkit}/algorithms/policy_gradients/REINFORCE.py (100%) rename {RLkit => rlkit}/algorithms/policy_gradients/__init__.py (100%) rename {RLkit => rlkit}/algorithms/random_agent.py (100%) rename {RLkit => rlkit}/algorithms/utils.py (100%) rename {RLkit => rlkit}/environment.py (100%) diff --git a/examples/ActorCritic_run.py b/examples/ActorCritic_run.py index 8920048..6cdd8d2 100755 --- a/examples/ActorCritic_run.py +++ b/examples/ActorCritic_run.py @@ -1,8 +1,8 @@ import numpy as np import os, sys -import RLkit -from RLkit.environment import Environment -from RLkit.algorithms.policy_gradients import ActorCritic +import rlkit +from rlkit.environment import Environment +from rlkit.algorithms.policy_gradients import ActorCritic actor_specs = [ { diff --git a/examples/DQN_run.py b/examples/DQN_run.py index 85b55fa..8201501 100755 --- a/examples/DQN_run.py +++ b/examples/DQN_run.py @@ -1,8 +1,8 @@ import numpy as np import os, sys -import RLkit -from RLkit.environment import Environment -from RLkit.algorithms import DQN +import rlkit +from rlkit.environment import Environment +from rlkit.algorithms import DQN network_specs = [ { diff --git a/examples/REINFORCE_baseline_run.py b/examples/REINFORCE_baseline_run.py index 79f7204..1a8da2e 100755 --- a/examples/REINFORCE_baseline_run.py +++ b/examples/REINFORCE_baseline_run.py @@ -1,8 +1,8 @@ import numpy as np import os, sys -import RLkit -from RLkit.environment import Environment -from RLkit.algorithms.policy_gradients import REINFORCE +import rlkit +from rlkit.environment import Environment +from rlkit.algorithms.policy_gradients import REINFORCE network_specs = [ { diff --git a/examples/REINFORCE_run.py b/examples/REINFORCE_run.py index 0be8cf5..eb0e62d 100755 --- a/examples/REINFORCE_run.py +++ b/examples/REINFORCE_run.py @@ -1,8 +1,8 @@ import numpy as np import os, sys -import RLkit -from RLkit.environment import Environment -from 
RLkit.algorithms.policy_gradients import REINFORCE +import rlkit +from rlkit.environment import Environment +from rlkit.algorithms.policy_gradients import REINFORCE network_specs = [ { diff --git a/RLkit/__init__.py b/rlkit/__init__.py similarity index 100% rename from RLkit/__init__.py rename to rlkit/__init__.py diff --git a/RLkit/algorithms/__init__.py b/rlkit/algorithms/__init__.py similarity index 100% rename from RLkit/algorithms/__init__.py rename to rlkit/algorithms/__init__.py diff --git a/RLkit/algorithms/agent.py b/rlkit/algorithms/agent.py similarity index 100% rename from RLkit/algorithms/agent.py rename to rlkit/algorithms/agent.py diff --git a/RLkit/algorithms/dqn.py b/rlkit/algorithms/dqn.py similarity index 100% rename from RLkit/algorithms/dqn.py rename to rlkit/algorithms/dqn.py diff --git a/RLkit/algorithms/policy_gradients/ActorCritic.py b/rlkit/algorithms/policy_gradients/ActorCritic.py similarity index 100% rename from RLkit/algorithms/policy_gradients/ActorCritic.py rename to rlkit/algorithms/policy_gradients/ActorCritic.py diff --git a/RLkit/algorithms/policy_gradients/REINFORCE.py b/rlkit/algorithms/policy_gradients/REINFORCE.py similarity index 100% rename from RLkit/algorithms/policy_gradients/REINFORCE.py rename to rlkit/algorithms/policy_gradients/REINFORCE.py diff --git a/RLkit/algorithms/policy_gradients/__init__.py b/rlkit/algorithms/policy_gradients/__init__.py similarity index 100% rename from RLkit/algorithms/policy_gradients/__init__.py rename to rlkit/algorithms/policy_gradients/__init__.py diff --git a/RLkit/algorithms/random_agent.py b/rlkit/algorithms/random_agent.py similarity index 100% rename from RLkit/algorithms/random_agent.py rename to rlkit/algorithms/random_agent.py diff --git a/RLkit/algorithms/utils.py b/rlkit/algorithms/utils.py similarity index 100% rename from RLkit/algorithms/utils.py rename to rlkit/algorithms/utils.py diff --git a/RLkit/environment.py b/rlkit/environment.py similarity index 100% rename from RLkit/environment.py rename to rlkit/environment.py diff --git a/setup.py b/setup.py index 550cf7c..e2ac341 100755 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup, find_packages -setup(name='RLkit', +setup(name='rlkit', version='0.2.0', description='A simple RL library.', url='http://github.com/shubhamjha97/RLkit', From 37adf3b4517fdb7ccfc7d089cdfb8654ff46a8c1 Mon Sep 17 00:00:00 2001 From: Shubham Jha Date: Mon, 8 Jun 2020 00:56:38 +0530 Subject: [PATCH 3/6] Create basic v2 skeleton --- examples/ActorCritic_run.py | 30 ---- examples/DQN_run.py | 21 --- examples/REINFORCE_baseline_run.py | 36 ---- examples/REINFORCE_run.py | 21 --- rlkit/__main__.py | 0 rlkit/agents/__init__.py | 0 rlkit/algorithms/__init__.py | 4 - rlkit/algorithms/agent.py | 34 ---- rlkit/algorithms/dqn.py | 164 ------------------ .../policy_gradients/ActorCritic.py | 119 ------------- .../algorithms/policy_gradients/REINFORCE.py | 117 ------------- rlkit/algorithms/policy_gradients/__init__.py | 2 - rlkit/algorithms/random_agent.py | 11 -- rlkit/algorithms/utils.py | 6 - rlkit/core/__init__.py | 0 rlkit/core/base_action.py | 0 rlkit/core/base_agent.py | 0 rlkit/core/base_environment.py | 0 rlkit/core/base_policy.py | 0 rlkit/core/base_state.py | 0 rlkit/core/base_trainer.py | 9 + rlkit/core/base_value.py | 0 rlkit/environment.py | 22 --- rlkit/environments/__init__.py | 0 rlkit/environments/gym_environment.py | 0 rlkit/trainers/__init__.py | 0 rlkit/trainers/basic_trainer.py | 3 + 27 files changed, 12 insertions(+), 587 deletions(-) 
delete mode 100755 examples/ActorCritic_run.py delete mode 100755 examples/DQN_run.py delete mode 100755 examples/REINFORCE_baseline_run.py delete mode 100755 examples/REINFORCE_run.py create mode 100644 rlkit/__main__.py create mode 100644 rlkit/agents/__init__.py delete mode 100755 rlkit/algorithms/__init__.py delete mode 100755 rlkit/algorithms/agent.py delete mode 100755 rlkit/algorithms/dqn.py delete mode 100644 rlkit/algorithms/policy_gradients/ActorCritic.py delete mode 100755 rlkit/algorithms/policy_gradients/REINFORCE.py delete mode 100644 rlkit/algorithms/policy_gradients/__init__.py delete mode 100755 rlkit/algorithms/random_agent.py delete mode 100755 rlkit/algorithms/utils.py create mode 100644 rlkit/core/__init__.py create mode 100644 rlkit/core/base_action.py create mode 100644 rlkit/core/base_agent.py create mode 100644 rlkit/core/base_environment.py create mode 100644 rlkit/core/base_policy.py create mode 100644 rlkit/core/base_state.py create mode 100644 rlkit/core/base_trainer.py create mode 100644 rlkit/core/base_value.py delete mode 100755 rlkit/environment.py create mode 100644 rlkit/environments/__init__.py create mode 100644 rlkit/environments/gym_environment.py create mode 100644 rlkit/trainers/__init__.py create mode 100644 rlkit/trainers/basic_trainer.py diff --git a/examples/ActorCritic_run.py b/examples/ActorCritic_run.py deleted file mode 100755 index 6cdd8d2..0000000 --- a/examples/ActorCritic_run.py +++ /dev/null @@ -1,30 +0,0 @@ -import numpy as np -import os, sys -import rlkit -from rlkit.environment import Environment -from rlkit.algorithms.policy_gradients import ActorCritic - -actor_specs = [ - { - "type": "dense", - "size": 64, - "activation":"relu" - }, - { - "type": "dense", - "size": 32, - "activation":"relu" - } -] - -critic_specs = [ - { - "type": "dense", - "size": 20, - "activation":"sigmoid" - } -] - -env_ = Environment(env_name="CartPole-v1", render = False) -agent = ActorCritic(env_, actor_specs, critic_specs) -agent.train(episodes=1000, actor_lr=0.001, critic_lr=0.1, gamma=1) \ No newline at end of file diff --git a/examples/DQN_run.py b/examples/DQN_run.py deleted file mode 100755 index 8201501..0000000 --- a/examples/DQN_run.py +++ /dev/null @@ -1,21 +0,0 @@ -import numpy as np -import os, sys -import rlkit -from rlkit.environment import Environment -from rlkit.algorithms import DQN - -network_specs = [ - { - "type": "dense", - "size": 64, - "activation":"sigmoid" - }, - { - "type": "dense", - "size": 64, - "activation":"relu" - } -] -env_ = Environment(env_name="CartPole-v1", render = False) -agent = DQN(env_, network_specs, buffer_size = 100000, batch_size = 10, tau=0.001, update_target_every_n = 2000, eps=0.9, update_every_n = 200) -agent.train(env_, episodes=1000, lr=0.01, gamma=1) \ No newline at end of file diff --git a/examples/REINFORCE_baseline_run.py b/examples/REINFORCE_baseline_run.py deleted file mode 100755 index 1a8da2e..0000000 --- a/examples/REINFORCE_baseline_run.py +++ /dev/null @@ -1,36 +0,0 @@ -import numpy as np -import os, sys -import rlkit -from rlkit.environment import Environment -from rlkit.algorithms.policy_gradients import REINFORCE - -network_specs = [ - { - "type": "dense", - "size": 64, - "activation":"relu" - }, - { - "type": "dense", - "size": 32, - "activation":"relu" - } -] - - -value_estimator_specs = [ - { - "type": "dense", - "size": 64, - "activation":"relu" - }, - { - "type": "dense", - "size": 32, - "activation":"relu" - } -] - -env_ = Environment(env_name="CartPole-v1", render = False) -agent = 
REINFORCE(env_, network_specs, value_estimator_specs) -agent.train(episodes=1000, lr=0.001, gamma=1) \ No newline at end of file diff --git a/examples/REINFORCE_run.py b/examples/REINFORCE_run.py deleted file mode 100755 index eb0e62d..0000000 --- a/examples/REINFORCE_run.py +++ /dev/null @@ -1,21 +0,0 @@ -import numpy as np -import os, sys -import rlkit -from rlkit.environment import Environment -from rlkit.algorithms.policy_gradients import REINFORCE - -network_specs = [ - { - "type": "dense", - "size": 64, - "activation":"relu" - }, - { - "type": "dense", - "size": 32, - "activation":"relu" - } -] -env_ = Environment(env_name="CartPole-v1", render = False) -agent = REINFORCE(env_, network_specs) -agent.train(episodes=1000, lr=0.001, gamma=1) \ No newline at end of file diff --git a/rlkit/__main__.py b/rlkit/__main__.py new file mode 100644 index 0000000..e69de29 diff --git a/rlkit/agents/__init__.py b/rlkit/agents/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/rlkit/algorithms/__init__.py b/rlkit/algorithms/__init__.py deleted file mode 100755 index 28758b7..0000000 --- a/rlkit/algorithms/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .random_agent import RandomAgent -from .dqn import DQN -# from .policy_gradients import REINFORCE -from .agent import Agent \ No newline at end of file diff --git a/rlkit/algorithms/agent.py b/rlkit/algorithms/agent.py deleted file mode 100755 index 24d19de..0000000 --- a/rlkit/algorithms/agent.py +++ /dev/null @@ -1,34 +0,0 @@ -import tensorflow as tf - -class Agent: - def __init__(): - raise NotImplementedError - - def train(): - raise NotImplementedError - - def test(): - raise NotImplementedError - - def _add_model(self, scope_name='model', input_placeholder = None, network_specs=None): - activations_map = { - 'linear':None, - 'relu':tf.nn.relu, - 'sigmoid':tf.nn.sigmoid, - 'tanh':tf.nn.tanh - } - layers = [] - with tf.variable_scope(scope_name): - for ix, layer in enumerate(network_specs): - if layer['type']=='dense': - if ix==0: - layer = tf.layers.dense(inputs = input_placeholder, units = layer['size'], activation = activations_map[layer['activation']]) - layers.append(layer) - if ix == len(network_specs)-1: - return layer - elif ix == len(network_specs)-1: - final_layer = tf.layers.dense(inputs = layers[-1], units = layer['size'], activation = activations_map[layer['activation']]) - return final_layer - else: - layer = tf.layers.dense(inputs = layers[-1], units = layer['size'], activation = activations_map[layer['activation']]) - layers.append(layer) \ No newline at end of file diff --git a/rlkit/algorithms/dqn.py b/rlkit/algorithms/dqn.py deleted file mode 100755 index b28ce2a..0000000 --- a/rlkit/algorithms/dqn.py +++ /dev/null @@ -1,164 +0,0 @@ -from .agent import Agent -import tensorflow as tf -from .utils import * -import pdb - - -class DQN(Agent): - def __init__(self, env_, network_specs, buffer_size = 10000, batch_size = 128, gamma = 0.95, eps = 0.01, update_target_every_n = 1000, update_every_n = 300, tau = 0.001, logdir = '.', loss_fn='mse'): - # TODO(shubham): Add option to disable Tensorboard - # TODO(shubham): Add logging - # TODO(shubham): ADD L2 REG - self.env_ = env_ - self.logdir = logdir - self.loss_fn = loss_fn - self.network_specs = network_specs - self.buffer_size = buffer_size - self.update_every_n = update_every_n - self.tau = tau - self.buffer_ = [] - self.buffer_index = None - self.update_target_every_n = 100 - self.action_space = self.env_.env.action_space - self.num_actions = self.action_space.n - 
self.state_size = self.env_.env.observation_space.shape[0] - self.moving_reward = None - self.gamma = gamma - self.eps = eps - self.batch_size = batch_size - self.moving_reward = None - - self.dqn_scope = 'dqn' - self.target_dqn_scope = 'target_dqn' - - self._add_placeholders() - - self.dqn_final_layer = self._add_model(self.dqn_scope, self.state_placeholder, self.network_specs) - self.target_dqn_final_layer = self._add_model(self.target_dqn_scope, self.state_placeholder, self.network_specs) - - with tf.variable_scope(self.dqn_scope): - self.q_values = tf.layers.dense(self.dqn_final_layer, self.num_actions, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='q_values') - self.max_q_values = tf.reshape(tf.reduce_max(self.q_values, axis=1, name='max_q_values'), [-1,1]) - self.selected_q_values = tf.reshape(tf.reduce_sum(tf.multiply(self.q_values, self.actions_placeholder, name='selected_q_values'), axis=1), [-1,1]) - with tf.variable_scope(self.target_dqn_scope): - self.target_q_values = tf.layers.dense(self.target_dqn_final_layer, self.num_actions, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='q_values') - self.target_max_q_values = tf.reshape(tf.reduce_max(self.target_q_values, axis=1, name='max_q_values'), [-1,1]) - self.target_selected_q_values = tf.reshape(tf.reduce_sum(tf.multiply(self.target_q_values, self.actions_placeholder, name='selected_q_values'), axis=1), [-1,1]) - - self._add_target_update(self.tau) - self._add_loss() - self._add_optim() - - def _add_placeholders(self): - self.state_placeholder = tf.placeholder(shape=[None, self.state_size], dtype=tf.float32, name='state') - self.actions_placeholder = tf.placeholder(shape=[None, self.num_actions], dtype=tf.float32, name='actions') - self.learning_rate = tf.placeholder(dtype=tf.float32, name='lr') - self.targets = tf.placeholder(shape=[None, 1], dtype=tf.float32, name='targets') - - def _add_loss(self): - with tf.name_scope("loss_fn"): - if self.loss_fn == 'huber': - self.loss = tf.clip_by_value(tf.losses.huber_loss(self.targets, self.selected_q_values), -1, 1) - else: - self.loss = tf.clip_by_value(tf.losses.mean_squared_error(self.targets, self.selected_q_values), -1, 1) - - def _add_optim(self): - self.optim_step = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss) - - def _add_target_update(self, tau): - self.update_op_holder = [] - for var, target_var in zip(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.dqn_scope), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.target_dqn_scope)): - self.update_op_holder.append(target_var.assign(var.value() * tau + ((1 - tau) * var.value()))) - - def _start_session(self): - self.sess = tf.Session() - self.summary_op = tf.summary.merge_all() - self.writer = tf.summary.FileWriter(os.path.join(self.logdir,"tensorboard/dqn/"), self.sess.graph) - self.writer.close() - self.sess.run(tf.global_variables_initializer()) - - def train(self, env, episodes = 10, lr = 0.01, gamma = 0.95, eps = 0.01): - self.gamma = gamma - self.lr = lr - self.eps = eps - - self._start_session() - - update_steps = 0 - target_update_steps = 0 - for episode in range(episodes): - done = False - obs = self.env_.reset() - step = 0 - reward_sum = 0 - ep_start_time = time() - while not done: - step+=1 - update_steps += 1 - target_update_steps += 1 - experience = {} - action = self.action(obs) - experience['state'] = obs - experience['action'] = action - obs, reward, done, info = self.env_.step(action) - 
reward_sum += reward - experience['reward'] = reward - experience['next_state'] = obs - experience['done'] = done - self.store_experience(experience) - if len(self.buffer_) < self.batch_size+1: - continue - if update_steps == self.update_every_n: - self.update_net(self.lr) - update_steps = 0 - if target_update_steps == self.update_target_every_n: - target_update_steps = 0 - self.update_target() - if self.moving_reward is None: - self.moving_reward = reward_sum - else: - self.moving_reward = 0.99 * self.moving_reward + 0.01 * reward_sum - - print("Episode:", episode, "Steps:", step, "reward:", self.moving_reward, "lr", self.lr, "Time:", time()-ep_start_time) - - def test(): - pass - - def update_target(self): - for op in self.update_op_holder: - self.sess.run(op) - - def store_experience(self, exp): - if len(self.buffer_)>=self.buffer_size: - if self.buffer_index is None: - self.buffer_index = 0 - if self.buffer_index >= self.buffer_size: - self.buffer_index = 0 - self.buffer_[self.buffer_index] = exp - self.buffer_index+=1 - else: - self.buffer_.append(exp) - - def action(self, state): - if random.uniform(0,1) < self.eps: - return random.sample(range(self.num_actions), 1)[0] - q_values = self.sess.run(self.q_values, feed_dict={self.state_placeholder:np.array(state).reshape(1, -1)}) - action = np.argmax(q_values[0]) - return action - - def update_net(self, lr = 0.001): - sampled_buffer = random.sample(self.buffer_, min(self.batch_size, len(self.buffer_))) - states = np.array([x['state'] for x in sampled_buffer]) - next_states = np.array([x['next_state'] for x in sampled_buffer]) - rewards = np.array([x['reward'] for x in sampled_buffer]).reshape([-1, 1]) - done_arr = np.array([x['done'] for x in sampled_buffer]).reshape([-1, 1]) - - actions = np.zeros([states.shape[0], self.num_actions]) - for i, x in enumerate(sampled_buffer): - temp_action = x['action'] - actions[i, temp_action] = 1 - - target_q_vals = self.sess.run(self.target_q_values, feed_dict={self.state_placeholder:next_states}) - max_q = np.amax(target_q_vals, axis=1).reshape([-1,1]) - targets = rewards + self.gamma * np.multiply((1-done_arr), max_q) - __, loss_ = self.sess.run([self.optim_step, self.loss], feed_dict={self.state_placeholder: states, self.actions_placeholder:actions, self.targets:targets, self.learning_rate:lr}) diff --git a/rlkit/algorithms/policy_gradients/ActorCritic.py b/rlkit/algorithms/policy_gradients/ActorCritic.py deleted file mode 100644 index 5bae0e5..0000000 --- a/rlkit/algorithms/policy_gradients/ActorCritic.py +++ /dev/null @@ -1,119 +0,0 @@ -from ..agent import Agent -from ..utils import * - -class ActorCritic(Agent): - def __init__(self, env_, actor_specs, critic_specs=None, gamma = 0.95, logdir = '.', inertia = 0.99): - self.env_ = env_ - self.inertia = inertia - self.actor_specs = actor_specs - self.critic_specs = critic_specs - self.logdir = logdir - self.gamma = gamma - self.action_space = self.env_.env.action_space - self.num_actions = self.action_space.n - self.state_size = self.env_.env.observation_space.shape[0] - self.moving_reward = None - - self._add_placeholders() - - # Add models - self.policy_final_layer = self._add_model('actor', self.state_placeholder, actor_specs) - self.value_final_layer = self._add_model('critic', self.state_placeholder, critic_specs) - self.state_values = tf.layers.dense(self.value_final_layer, 1, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='state_values') - - self.action_logits = tf.layers.dense(self.policy_final_layer, 
self.num_actions, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='action_logits') - self.action_probs = tf.nn.softmax(self.action_logits, axis=1, name='action_probs') - self.log_likelihood = tf.log(tf.clip_by_value(self.action_probs, 0.000001, 0.999999, name='clip'), name='log_likelihood') - - self._add_loss() - self._add_optim() - - def _add_placeholders(self): - self.state_placeholder = tf.placeholder(shape=[None, self.state_size], dtype=tf.float32, name='state') - self.returns_placeholder = tf.placeholder(shape=[None, 1], dtype=tf.float32, name='returns') - self.actions_placeholder = tf.placeholder(shape=[None, self.num_actions], dtype=tf.float32, name='actions') - self.target_state_val = tf.placeholder(shape=[None, 1], dtype=tf.float32, name='target_state_val') - # Learning rates - self.actor_learning_rate = tf.placeholder(dtype=tf.float32, name='actor_lr') - self.critic_learning_rate = tf.placeholder(dtype=tf.float32, name='critic_lr') - - def _add_loss(self): - with tf.name_scope("loss_fn"): - self.actor_loss = -tf.reduce_mean(tf.multiply(tf.subtract(self.returns_placeholder, self.state_values), tf.reshape(tf.reduce_sum(tf.multiply(self.log_likelihood, self.actions_placeholder), axis=1), [-1, 1])), axis=0) - self.critic_loss = tf.losses.mean_squared_error(self.target_state_val, self.state_values) - - def _add_optim(self): - self.actor_optim_step = tf.train.AdamOptimizer(learning_rate = self.actor_learning_rate).minimize(self.actor_loss) - self.critic_optim_step = tf.train.AdamOptimizer(learning_rate = self.critic_learning_rate).minimize(self.critic_loss) - - def _start_session(self): - self.sess = tf.Session() - self.summary_op = tf.summary.merge_all() - self.writer = tf.summary.FileWriter(os.path.join(self.logdir, "tensorboard/AC/"), self.sess.graph) - self.writer.close() - self.sess.run(tf.global_variables_initializer()) - - def train(self, episodes = 10, actor_lr = 0.01, critic_lr = 0.1, gamma = 0.95, update_steps = 10): - self.gamma = gamma - all_moving_rewards = [] - - self._start_session() - - for episode in range(episodes): - done = False - obs = self.env_.reset() - step = 0 - ep_start_time = time() - self.buffer_ = [] - while not done: - step+=1 - experience = {} - action = self.action(obs) - experience['state'] = obs - experience['action'] = action - obs, reward, done, info = self.env_.step(action) - experience['reward'] = reward - experience['next_state'] = obs - self.buffer_.append(experience) - if self.moving_reward is None: - self.moving_reward = float(sum(x['reward'] for x in self.buffer_)) - else: - self.moving_reward = self.inertia * self.moving_reward + (1-self.inertia) * float(sum(x['reward'] for x in self.buffer_)) - all_moving_rewards.append(self.moving_reward) - print("Episode:{}\t Steps:{}\t Reward:{}\t Time:{}".format(episode, step, self.moving_reward, time()-ep_start_time)) - self.update_net(actor_lr, critic_lr) - - def action(self, state): - action_probs = self.sess.run(self.action_probs, feed_dict={self.state_placeholder:np.array(state).reshape(1, -1)}) - action = np.random.choice(list(range(self.num_actions)), p=action_probs[0]) - return action - - def update_net(self, actor_lr, critic_lr): - states = np.array([x['state'] for x in self.buffer_]) - rewards = np.array([x['reward'] for x in self.buffer_]) - next_states = np.array([x['next_state'] for x in self.buffer_]) - - next_state_val = self.sess.run(self.state_values, feed_dict={self.state_placeholder:next_states}) - # temp = self.sess.run(self.state_values, 
feed_dict={self.state_placeholder:states}) - - target_state_val = rewards.reshape([-1,1]) + self.gamma*next_state_val - - discounted_r = np.zeros_like(rewards) - running_add = 0 - for t in reversed(range(0, rewards.size)): - running_add = running_add * self.gamma + rewards[t] - discounted_r[t] = running_add - returns = discounted_r.reshape([-1, 1]) - - actions = np.zeros([len(self.buffer_), self.num_actions]) - for i, x in enumerate(self.buffer_): - temp_action = x['action'] - actions[i, temp_action] = 1 - - a_, c_, actor_loss_, critic_loss_ = self.sess.run([self.actor_optim_step, self.critic_optim_step, self.actor_loss, - self.critic_loss], feed_dict={self.state_placeholder: states, - self.returns_placeholder:returns, self.actions_placeholder:actions, self.actor_learning_rate:actor_lr, - self.critic_learning_rate:critic_lr, self.target_state_val:target_state_val}) - - def test(): - pass \ No newline at end of file diff --git a/rlkit/algorithms/policy_gradients/REINFORCE.py b/rlkit/algorithms/policy_gradients/REINFORCE.py deleted file mode 100755 index 672b791..0000000 --- a/rlkit/algorithms/policy_gradients/REINFORCE.py +++ /dev/null @@ -1,117 +0,0 @@ -from ..agent import Agent -from ..utils import * - -class REINFORCE(Agent): - def __init__(self, env_, network_specs, value_estimator_specs=None, gamma = 0.95, logdir = '.', inertia = 0.99): - self.env_ = env_ - self.inertia = inertia - self.network_specs = network_specs - self.use_baseline = False - if value_estimator_specs is not None: - self.value_estimator_specs = value_estimator_specs - self.use_baseline = True - self.logdir = logdir - self.gamma = gamma - self.action_space = self.env_.env.action_space - self.num_actions = self.action_space.n - self.state_size = self.env_.env.observation_space.shape[0] - self.moving_reward = None - - self._add_placeholders() - - # Add models - self.policy_final_layer = self._add_model('policy_net', self.state_placeholder, network_specs) - if self.use_baseline: - self.value_final_layer = self._add_model('value_estimator', self.state_placeholder, value_estimator_specs) - self.state_values = tf.layers.dense(self.value_final_layer, 1, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='state_values') - - self.action_logits = tf.layers.dense(self.policy_final_layer, self.num_actions, kernel_initializer = tf.contrib.layers.xavier_initializer(), activation=None, name='action_logits') - self.action_probs = tf.nn.softmax(self.action_logits, axis=1, name='action_probs') - self.log_likelihood = tf.log(tf.clip_by_value(self.action_probs, 0.000001, 0.999999, name='clip'), name='log_likelihood') - - self._add_loss() - self._add_optim() - - def _add_placeholders(self): - self.state_placeholder = tf.placeholder(shape=[None, self.state_size], dtype=tf.float32, name='state') - self.returns_placeholder = tf.placeholder(shape=[None, 1], dtype=tf.float32, name='returns') - self.actions_placeholder = tf.placeholder(shape=[None, self.num_actions], dtype=tf.float32, name='actions') - self.learning_rate = tf.placeholder(dtype=tf.float32, name='lr') - self.value_est_learning_rate = tf.placeholder(dtype=tf.float32, name='value_est_lr') - - def _add_loss(self): - with tf.name_scope("loss_fn"): - if self.use_baseline: - self.loss = -tf.reduce_mean(tf.multiply(tf.subtract(self.returns_placeholder, self.state_values), tf.reshape(tf.reduce_sum(tf.multiply(self.log_likelihood, self.actions_placeholder), axis=1), [-1, 1])), axis=0) - self.value_estimator_loss = 
tf.losses.mean_squared_error(self.returns_placeholder, self.state_values) - else: - self.loss = -tf.reduce_mean(tf.multiply(self.returns_placeholder, tf.reshape(tf.reduce_sum(tf.multiply(self.log_likelihood, self.actions_placeholder), axis=1), [-1, 1])), axis=0) - - def _add_optim(self): - self.optim_step = tf.train.AdamOptimizer(learning_rate = self.learning_rate).minimize(self.loss) - if self.use_baseline: - self.optim_step_value_est = tf.train.AdamOptimizer(learning_rate = self.value_est_learning_rate).minimize(self.value_estimator_loss) - - def _start_session(self): - self.sess = tf.Session() - self.summary_op = tf.summary.merge_all() - self.writer = tf.summary.FileWriter(os.path.join(self.logdir, "tensorboard/pg/"), self.sess.graph) - self.writer.close() - self.sess.run(tf.global_variables_initializer()) - - def train(self, episodes = 10, lr = 0.01, value_est_lr = 0.01, gamma = 0.95, update_steps = 10): - self.gamma = gamma - self.lr = lr - all_moving_rewards=[] - - self._start_session() - - for episode in range(episodes): - done = False - obs = self.env_.reset() - step = 0 - ep_start_time = time() - self.buffer_ = [] - while not done: - step+=1 - temp = {} - action = self.action(obs) - temp['state'] = obs - temp['action'] = action - obs, reward, done, info = self.env_.step(action) - temp['reward'] = reward - self.buffer_.append(temp) - if self.moving_reward is None: - self.moving_reward = float(sum(x['reward'] for x in self.buffer_)) - else: - self.moving_reward = self.inertia * self.moving_reward + (1-self.inertia) * float(sum(x['reward'] for x in self.buffer_)) - all_moving_rewards.append(self.moving_reward) - print("Episode:{}\t Steps:{}\t Reward:{}\t Time:{}".format(episode, step, self.moving_reward, time()-ep_start_time)) - self.update_net(lr, value_est_lr) - - def action(self, state): - action_probs = self.sess.run(self.action_probs, feed_dict={self.state_placeholder:np.array(state).reshape(1, -1)}) - action = np.random.choice(list(range(self.num_actions)), p=action_probs[0]) - return action - - def update_net(self, lr, value_est_lr): - states = np.array([x['state'] for x in self.buffer_]) - rewards = np.array([x['reward'] for x in self.buffer_]) - - discounted_r = np.zeros_like(rewards) - running_add = 0 - for t in reversed(range(0, rewards.size)): - running_add = running_add * self.gamma + rewards[t] - discounted_r[t] = running_add - returns = discounted_r.reshape([-1, 1]) - - actions = np.zeros([len(self.buffer_), self.num_actions]) - for i, x in enumerate(self.buffer_): - temp_action = x['action'] - actions[i, temp_action] = 1 - - __, v__, loss_, loss_value_est = self.sess.run([self.optim_step, self.optim_step_value_est, self.loss, self.value_estimator_loss], feed_dict={self.state_placeholder: states, - self.returns_placeholder:returns, self.actions_placeholder:actions, self.learning_rate:lr, self.value_est_learning_rate:value_est_lr}) - - def test(): - pass \ No newline at end of file diff --git a/rlkit/algorithms/policy_gradients/__init__.py b/rlkit/algorithms/policy_gradients/__init__.py deleted file mode 100644 index 86d71ee..0000000 --- a/rlkit/algorithms/policy_gradients/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .ActorCritic import ActorCritic -from .REINFORCE import REINFORCE \ No newline at end of file diff --git a/rlkit/algorithms/random_agent.py b/rlkit/algorithms/random_agent.py deleted file mode 100755 index 7a66cee..0000000 --- a/rlkit/algorithms/random_agent.py +++ /dev/null @@ -1,11 +0,0 @@ -class RandomAgent: - def __init__(self, env_): - self.env_ = 
env_ - self.action_space = self.env_.env.action_space - self.num_actions = self.action_space.n - - def train(self, env): - pass - - def action(self, state = None): - return self.action_space.sample() \ No newline at end of file diff --git a/rlkit/algorithms/utils.py b/rlkit/algorithms/utils.py deleted file mode 100755 index 7990fc5..0000000 --- a/rlkit/algorithms/utils.py +++ /dev/null @@ -1,6 +0,0 @@ -import tensorflow as tf -import gym -from time import time -import random -import numpy as np -import os, sys \ No newline at end of file diff --git a/rlkit/core/__init__.py b/rlkit/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/rlkit/core/base_action.py b/rlkit/core/base_action.py new file mode 100644 index 0000000..e69de29 diff --git a/rlkit/core/base_agent.py b/rlkit/core/base_agent.py new file mode 100644 index 0000000..e69de29 diff --git a/rlkit/core/base_environment.py b/rlkit/core/base_environment.py new file mode 100644 index 0000000..e69de29 diff --git a/rlkit/core/base_policy.py b/rlkit/core/base_policy.py new file mode 100644 index 0000000..e69de29 diff --git a/rlkit/core/base_state.py b/rlkit/core/base_state.py new file mode 100644 index 0000000..e69de29 diff --git a/rlkit/core/base_trainer.py b/rlkit/core/base_trainer.py new file mode 100644 index 0000000..0046369 --- /dev/null +++ b/rlkit/core/base_trainer.py @@ -0,0 +1,9 @@ +class BaseTrainer: + def __init__(self): + pass + + def step(self): + pass + + def train(self): + pass diff --git a/rlkit/core/base_value.py b/rlkit/core/base_value.py new file mode 100644 index 0000000..e69de29 diff --git a/rlkit/environment.py b/rlkit/environment.py deleted file mode 100755 index c6c676b..0000000 --- a/rlkit/environment.py +++ /dev/null @@ -1,22 +0,0 @@ -import gym - -class Environment: - def __init__(self, env_name, render=False): - self.env = gym.make(env_name) - self.render = render - self.timestep = 0 - self.done = False - self.reset() - - def reset(self): - observation = self.env.reset() - return observation - - def step(self, action): - if self.render: - self.env.render() - observation, reward, done, info = self.env.step(action) - return observation, reward, done, info - - def close(self): - self.env.close() \ No newline at end of file diff --git a/rlkit/environments/__init__.py b/rlkit/environments/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/rlkit/environments/gym_environment.py b/rlkit/environments/gym_environment.py new file mode 100644 index 0000000..e69de29 diff --git a/rlkit/trainers/__init__.py b/rlkit/trainers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/rlkit/trainers/basic_trainer.py b/rlkit/trainers/basic_trainer.py new file mode 100644 index 0000000..b301e63 --- /dev/null +++ b/rlkit/trainers/basic_trainer.py @@ -0,0 +1,3 @@ +class BasicTrainer: + def __init__(self): + pass \ No newline at end of file From 702637874baae9981267f242f5e836be1c6ff66a Mon Sep 17 00:00:00 2001 From: Shubham Jha Date: Thu, 11 Jun 2020 20:19:24 +0530 Subject: [PATCH 4/6] Add basic training iteration --- requirements.txt | 2 +- rlkit/__init__.py | 4 -- rlkit/__main__.py | 23 ++++++++++++ rlkit/agents/__init__.py | 1 + rlkit/agents/random_agent.py | 9 +++++ rlkit/core/__init__.py | 3 ++ rlkit/core/base_action.py | 1 + rlkit/core/base_agent.py | 10 +++++ rlkit/core/base_environment.py | 16 ++++++++ rlkit/core/base_trainer.py | 8 ++-- rlkit/environments/gym_environment.py | 45 +++++++++++++++++++++++ rlkit/environments/vizdoom_environment.py | 14 +++++++ 
rlkit/trainers/__init__.py | 1 + rlkit/trainers/basic_trainer.py | 40 ++++++++++++++++++-- 14 files changed, 166 insertions(+), 11 deletions(-) create mode 100644 rlkit/agents/random_agent.py create mode 100644 rlkit/environments/vizdoom_environment.py diff --git a/requirements.txt b/requirements.txt index 660403b..96bf383 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ tensorflow==1.11.0 -gym==0.10.8 +gym==0.17.2 numpy==1.15.4 \ No newline at end of file diff --git a/rlkit/__init__.py b/rlkit/__init__.py index 1d5f453..e69de29 100755 --- a/rlkit/__init__.py +++ b/rlkit/__init__.py @@ -1,4 +0,0 @@ -from .algorithms.random_agent import RandomAgent -from .algorithms.dqn import DQN -from .algorithms.policy_gradients import REINFORCE -from .algorithms.agent import Agent \ No newline at end of file diff --git a/rlkit/__main__.py b/rlkit/__main__.py index e69de29..272df3f 100644 --- a/rlkit/__main__.py +++ b/rlkit/__main__.py @@ -0,0 +1,23 @@ +from rlkit.agents import RandomAgent +from rlkit.environments.gym_environment import GymEnvironment +from rlkit.trainers import BasicTrainer + +params = { + "environment_params": { + "env_name": "SpaceInvaders-v0", + }, + "agent_params": { + + }, + "training_params": { + "run_name": "test_run", + "train_interval": 10, + "episodes": 5, + "steps": 500, + }, +} + +env = GymEnvironment(params["environment_params"]) +agent = RandomAgent(params["agent_params"], env.get_action_space()) +trainer = BasicTrainer(params["training_params"], agent, env) +trainer.train() diff --git a/rlkit/agents/__init__.py b/rlkit/agents/__init__.py index e69de29..fe0737e 100644 --- a/rlkit/agents/__init__.py +++ b/rlkit/agents/__init__.py @@ -0,0 +1 @@ +from .random_agent import RandomAgent \ No newline at end of file diff --git a/rlkit/agents/random_agent.py b/rlkit/agents/random_agent.py new file mode 100644 index 0000000..615b1a9 --- /dev/null +++ b/rlkit/agents/random_agent.py @@ -0,0 +1,9 @@ +from rlkit.core.base_agent import BaseAgent + + +class RandomAgent(BaseAgent): + def __init__(self, params, action_space): + super(RandomAgent, self).__init__(params, action_space) + + def get_action(self, state): + return self.action_space.sample() \ No newline at end of file diff --git a/rlkit/core/__init__.py b/rlkit/core/__init__.py index e69de29..8c7f768 100644 --- a/rlkit/core/__init__.py +++ b/rlkit/core/__init__.py @@ -0,0 +1,3 @@ +from .base_agent import BaseAgent +from .base_environment import BaseEnvironment +from .base_trainer import BaseTrainer \ No newline at end of file diff --git a/rlkit/core/base_action.py b/rlkit/core/base_action.py index e69de29..11109cb 100644 --- a/rlkit/core/base_action.py +++ b/rlkit/core/base_action.py @@ -0,0 +1 @@ +class BaseAction \ No newline at end of file diff --git a/rlkit/core/base_agent.py b/rlkit/core/base_agent.py index e69de29..93c99cf 100644 --- a/rlkit/core/base_agent.py +++ b/rlkit/core/base_agent.py @@ -0,0 +1,10 @@ +class BaseAgent: + def __init__(self, params, action_space): + self.params = params + self.action_space = action_space + + def train(self): + pass + + def get_action(self, state): + pass \ No newline at end of file diff --git a/rlkit/core/base_environment.py b/rlkit/core/base_environment.py index e69de29..318519e 100644 --- a/rlkit/core/base_environment.py +++ b/rlkit/core/base_environment.py @@ -0,0 +1,16 @@ +class BaseEnvironment: + def __init__(self): + self.to_render = False + self.reset() + + def execute_action(self, action): + pass + + def reset(self): + pass + + def render(self): + pass + + def 
setRender(self, to_render): + self.to_render = to_render diff --git a/rlkit/core/base_trainer.py b/rlkit/core/base_trainer.py index 0046369..f88eaca 100644 --- a/rlkit/core/base_trainer.py +++ b/rlkit/core/base_trainer.py @@ -1,8 +1,10 @@ class BaseTrainer: - def __init__(self): - pass + def __init__(self, params): + self.global_step = 0 + self.episodes = params.get("episodes", 10); + self.steps = params.get("steps", 100) - def step(self): + def do_step(self): pass def train(self): diff --git a/rlkit/environments/gym_environment.py b/rlkit/environments/gym_environment.py index e69de29..1b44c21 100644 --- a/rlkit/environments/gym_environment.py +++ b/rlkit/environments/gym_environment.py @@ -0,0 +1,45 @@ +import gym +from rlkit.core import BaseEnvironment + + +class GymEnvironment(BaseEnvironment): + def __init__(self, params): + self.params = params + self.env_name = params["env_name"] + self.env = gym.make(self.env_name) + super(GymEnvironment, self).__init__() + + def execute_action(self, action): + self.env.step(action) + + def get_action_space(self): + return self.env.action_space + + def reset(self, reset_values=True): + if reset_values: + self.reset_values() + self.reset_env() + + def reset_values(self): + self.state = None + self.reward = None + self.done = False + self.info = None + + def reset_env(self): + self.env.reset() + + def close(self): + print("closing env") + return self.env.close() + + def render(self): + self.env.render() + + def step(self, action): + self.state, self.reward, self.done, self.info = self.env.step(action) + return (self.state, self.reward, self.done, self.info, ) + + +if __name__ == "__main__": + test_env = GymEnvironment("MountainCarContinuous-v0") diff --git a/rlkit/environments/vizdoom_environment.py b/rlkit/environments/vizdoom_environment.py new file mode 100644 index 0000000..b60a174 --- /dev/null +++ b/rlkit/environments/vizdoom_environment.py @@ -0,0 +1,14 @@ +from rlkit.core import BaseEnvironment +from vizdoom import * + +class VizDoomEnvironment(BaseEnvironment): + def __init__(self, params): + super(VizDoomEnvironment, self).__init__() + self.env_name = params["env_name"] + + pass + + def initialize_env(self): + self.env = DoomGame() + self.env.load_config("../config/basic.cfg") + self.env.init() \ No newline at end of file diff --git a/rlkit/trainers/__init__.py b/rlkit/trainers/__init__.py index e69de29..6df86b3 100644 --- a/rlkit/trainers/__init__.py +++ b/rlkit/trainers/__init__.py @@ -0,0 +1 @@ +from .basic_trainer import BasicTrainer \ No newline at end of file diff --git a/rlkit/trainers/basic_trainer.py b/rlkit/trainers/basic_trainer.py index b301e63..9123e15 100644 --- a/rlkit/trainers/basic_trainer.py +++ b/rlkit/trainers/basic_trainer.py @@ -1,3 +1,37 @@ -class BasicTrainer: - def __init__(self): - pass \ No newline at end of file +from rlkit.core import BaseTrainer + + +class BasicTrainer(BaseTrainer): + def __init__(self, params, agent, environment): + self.agent = agent + self.environment = environment + super(BasicTrainer, self).__init__(params) + + self.train_interval = params["train_interval"] + self.run_name = params["run_name"] + self.episodes = params["episodes"] + self.steps = params["steps"] + + def do_step(self): + action = self.agent.get_action(self.environment.state) + self.environment.step(action) + self.environment.render() # TODO: find better solution + + def train(self): + try: + for episode in range(1, self.episodes+1): + step = 0 + self.environment.reset() + while step < self.steps and not self.environment.done: 
+ print("episode: {}, step: {}".format(episode, step)) + self.do_step() + + # Train agent + if self.global_step > 0 and not self.global_step % self.train_interval: + self.agent.train() + + # Increment step counts + step += 1 + self.global_step += 1 + finally: + self.environment.close() From 974d52db04a69c1c417804b6a8d8c46e35837212 Mon Sep 17 00:00:00 2001 From: Shubham Jha Date: Thu, 11 Jun 2020 22:14:27 +0530 Subject: [PATCH 5/6] Add initial Vizdoom environment --- examples/_vizdoom.ini | 568 ++++++++++++++++++++++ examples/basic.cfg | 38 ++ examples/random_agent_example.py | 25 + rlkit/core/__init__.py | 1 + rlkit/core/base_action.py | 1 - rlkit/core/base_action_space.py | 6 + rlkit/core/base_environment.py | 7 + rlkit/environments/vizdoom_environment.py | 62 ++- rlkit/trainers/basic_trainer.py | 2 +- 9 files changed, 702 insertions(+), 8 deletions(-) create mode 100644 examples/_vizdoom.ini create mode 100644 examples/basic.cfg create mode 100644 examples/random_agent_example.py delete mode 100644 rlkit/core/base_action.py create mode 100644 rlkit/core/base_action_space.py diff --git a/examples/_vizdoom.ini b/examples/_vizdoom.ini new file mode 100644 index 0000000..9dd8946 --- /dev/null +++ b/examples/_vizdoom.ini @@ -0,0 +1,568 @@ +# This file was generated by ViZDoom 1.1.8 (ZDOOM 2.8.1) on Thu Jun 11 22:00:16 2020 + +# These are the directories to automatically search for IWADs. +# Each directory should be on a separate line, preceded by Path= +[IWADSearch.Directories] +Path=. +Path=$DOOMWADDIR +Path=/Users/sjha/Documents/_vizdoom +Path=/Users/sjha/Library/Application Support/_vizdoom +Path=$PROGDIR +Path=/Library/Application Support/_vizdoom + +# These are the directories to search for wads added with the -file +# command line parameter, if they cannot be found with the path +# as-is. Layout is the same as for IWADSearch.Directories +[FileSearch.Directories] +Path=$PROGDIR +Path=/Library/Application Support/_vizdoom +Path=$DOOMWADDIR + +# Files to automatically execute when running the corresponding game. +# Each file should be on its own line, preceded by Path= + +[Doom.AutoExec] +Path=/Users/sjha/Documents/_vizdoom/autoexec.cfg + +[Heretic.AutoExec] +Path=/Users/sjha/Documents/_vizdoom/autoexec.cfg + +[Hexen.AutoExec] +Path=/Users/sjha/Documents/_vizdoom/autoexec.cfg + +[Strife.AutoExec] +Path=/Users/sjha/Documents/_vizdoom/autoexec.cfg + +[Chex.AutoExec] +Path=/Users/sjha/Documents/_vizdoom/autoexec.cfg + +# WAD files to always load. These are loaded after the IWAD but before +# any files added with -file. Place each file on its own line, preceded +# by Path= +[Global.Autoload] + +# Wad files to automatically load depending on the game and IWAD you are +# playing. You may have have files that are loaded for all similar IWADs +# (the game) and files that are only loaded for particular IWADs. For example, +# any files listed under 'doom.Autoload' will be loaded for any version of Doom, +# but files listed under 'doom.doom2.Autoload' will only load when you are +# playing a Doom 2 based game (doom2.wad, tnt.wad or plutonia.wad), and files listed under +# 'doom.doom2.commercial.Autoload' only when playing doom2.wad. 
+ +[doom.Autoload] + +[doom.doom2.Autoload] + +[doom.doom2.commercial.Autoload] + +[doom.doom2.bfg.Autoload] + +[doom.doom2.plutonia.Autoload] + +[doom.doom2.tnt.Autoload] + +[doom.doom1.Autoload] + +[doom.doom1.registered.Autoload] + +[doom.doom1.ultimate.Autoload] + +[doom.doom1.bfg.Autoload] + +[doom.freedoom.Autoload] + +[doom.freedoom.demo.Autoload] + +[doom.freedoom.phase1.Autoload] + +[doom.freedoom.phase2.Autoload] + +[doom.freedoom.freedm.Autoload] + +[heretic.Autoload] + +[heretic.heretic.Autoload] + +[heretic.shadow.Autoload] + +[blasphemer.Autoload] + +[hexen.Autoload] + +[hexen.deathkings.Autoload] + +[hexen.hexen.Autoload] + +[strife.Autoload] + +[chex.Autoload] + +[chex.chex1.Autoload] + +[chex.chex3.Autoload] + +[urbanbrawl.Autoload] + +[hacx.Autoload] + +[hacx.hacx1.Autoload] + +[hacx.hacx2.Autoload] + +[harmony.Autoload] + +[square.Autoload] + +[square.squareware.Autoload] + +[square.square.Autoload] + +[LastRun] +Version=211 + +[GlobalSettings] +gus_memsize=0 +midi_dmxgus=true +gus_patchdir= +midi_voices=32 +midi_config=timidity.cfg +snd_efx=true +snd_aldevice=Default +wildmidi_enhanced_resampling=true +wildmidi_reverb=false +wildmidi_frequency=0 +wildmidi_config= +fluid_chorus_type=0 +fluid_chorus_depth=8 +fluid_chorus_speed=0.3 +fluid_chorus_level=1 +fluid_chorus_voices=3 +fluid_reverb_level=0.57 +fluid_reverb_width=0.76 +fluid_reverb_damping=0.23 +fluid_reverb_roomsize=0.61 +fluid_threads=1 +fluid_samplerate=0 +fluid_interp=1 +fluid_voices=128 +fluid_chorus=true +fluid_reverb=true +fluid_gain=0.5 +fluid_patchset= +opl_core=0 +opl_numchips=2 +timidity_frequency=44100 +timidity_pipe=90 +timidity_mastervolume=1 +timidity_byteswap=false +timidity_8bit=false +timidity_stereo=true +timidity_reverb=0 +timidity_chorus=0 +timidity_extargs= +timidity_exe=timidity +snd_mididevice=-1 +spc_amp=1.875 +mod_dumb_mastervolume=1 +mod_autochip_scan_threshold=12 +mod_autochip_size_scan=500 +mod_autochip_size_force=100 +mod_autochip=false +mod_interp=2 +mod_volramp=2 +mod_samplerate=0 +mod_dumb=true +snd_sfxvolume=1 +snd_backend=openal +snd_output=default +snd_buffersize=0 +snd_samplerate=0 +snd_musicvolume=0.5 +snd_waterlp=250 +snd_midipatchset= +snd_output_format=PCM-16 +snd_speakermode=Auto +snd_resampler=Linear +snd_waterreverb=true +snd_hrtf=false +snd_buffercount=0 +snd_driver=0 +opl_fullpan=true +vid_tft=true +m_showinputgrid=false +m_show_backbutton=0 +m_use_mouse=1 +show_messages=true +mouse_sensitivity=1 +map_point_coordinates=true +vid_aspect=3 +vid_nowidescreen=false +vid_refreshrate=0 +vid_vsync=false +vid_defbits=8 +vid_defheight=480 +vid_defwidth=640 +Gamma=1 +statfile=zdoomstat.txt +savestatistics=0 +snd_flipstereo=false +snd_channels=32 +r_columnmethod=1 +r_quakeintensity=1 +cl_predict_lerpthreshold=2 +cl_predict_lerpscale=0.05 +cl_predict_specials=true +cl_noprediction=false +telezoom=true +r_fakecontrast=1 +chase_dist=90 +chase_height=-8 +gl_cachetime=0.6 +gl_cachenodes=true +nomonsterinterpolation=false +png_gamma=0 +png_level=5 +screenshot_dir= +screenshot_type=png +screenshot_quiet=false +use_joystick=false +autosavecount=4 +disableautosave=0 +autosavenum=0 +smooth_mouse=false +m_side=2 +m_forward=1 +m_yaw=1 +m_pitch=1 +lookstrafe=false +freelook=false +invertmouse=false +cl_run=false +demo_compress=true +cl_waitforsave=true +save_dir= +longsavemessages=true +storesavepic=true +nofilecompression=false +cl_capfps=true +defaultiwad= +queryiwad=true +con_ctrl_d= +con_buffersize=-1 +osx_additional_parameters= +showendoom=0 +bgamma=1 +ggamma=1 +rgamma=1 
+vid_forcesurface=false +vid_displaybits=32 +vid_adapter=0 +mouse_capturemode=1 +m_filter=false +m_noprescale=false +use_mouse=false +vid_winscale=1 +fullscreen=false +vid_maxfps=200 + +[GlobalSettings.Unknown] + +[Doom.Player] +wi_noautostartmap=false +playerclass=Fighter +stillbob=0 +movebob=0.25 +neverswitchonpickup=false +gender=male +team=255 +skin=base +colorset=0 +color=40 cf 00 +name=Player +autoaim=35 + +[Doom.ConsoleVariables] +r_drawfuzz=1 +vid_nopalsubstitutions=false +snd_pitched=false +menu_screenratios=-1 +snd_menuvolume=0.6 +show_obituaries=true +am_showmaplabel=2 +crosshairgrow=false +crosshairscale=false +crosshairhealth=true +crosshaircolor=ff 00 00 +crosshairforce=false +crosshair=0 +st_scale=true +paletteflash=0 +hudcolor_stats=3 +hudcolor_statnames=6 +hudcolor_xyco=3 +hudcolor_ttim=5 +hudcolor_ltim=8 +hudcolor_time=6 +hudcolor_titl=10 +hud_berserk_health=true +hud_armor_green=100 +hud_armor_yellow=50 +hud_armor_red=25 +hud_health_green=100 +hud_health_yellow=50 +hud_health_red=25 +hud_ammo_yellow=50 +hud_ammo_red=25 +hud_showlag=0 +hud_timecolor=5 +hud_showtime=0 +hud_showammo=2 +hud_showweapons=true +hud_showscore=false +hud_showstats=false +hud_showitems=false +hud_showmonsters=true +hud_showsecrets=true +hud_althud=false +hud_althudscale=2 +st_oldouch=false +cl_maxdecals=1024 +cl_spreaddecals=true +transsouls=0.75 +wi_showtotaltime=true +wi_percents=true +dimcolor=ff d7 00 +dimamount=-1 +hud_scale=true +allcheats=false +r_stretchsky=true +r_shadercolormaps=true +screenblocks=12 +r_deathcamera=false +cl_showsecretmessage=true +cl_bloodtype=1 +cl_pufftype=0 +addrocketexplosion=false +cl_missiledecals=true +cl_doautoaim=false +cl_bloodsplats=true +cl_showmultikills=false +cl_showsprees=false +r_maxparticles=4092 +r_rail_trailsparsity=1 +r_rail_spiralsparsity=1 +r_rail_smartspiral=false +cl_rockettrails=3 +dlg_musicvolume=1 +sb_teamdeathmatch_headingcolor=6 +sb_teamdeathmatch_enable=true +sb_deathmatch_otherplayercolor=2 +sb_deathmatch_yourplayercolor=3 +sb_deathmatch_headingcolor=6 +sb_deathmatch_enable=true +sb_cooperative_otherplayercolor=2 +sb_cooperative_yourplayercolor=3 +sb_cooperative_headingcolor=6 +sb_cooperative_enable=true +nametagcolor=5 +displaynametags=0 +language=auto +compatmode=0 +vid_cursor=None +wipetype=0 +dehload=0 +chat_substitution=false +chatmacro0=No +chatmacro9=Yes +chatmacro8=I'll take care of it. +chatmacro7=Come here! +chatmacro6=Next time, scumbag... +chatmacro5=You suck! +chatmacro4=Help! +chatmacro3=I'm not looking too good! +chatmacro2=I'm OK. +chatmacro1=I'm ready to kick butt! 
+lookspring=true +con_midtime=0 +msgmidcolor2=4 +msgmidcolor=5 +msg4color=3 +msg3color=3 +msg2color=2 +msg1color=5 +msg0color=6 +msg=0 +con_alpha=0.75 +con_scaletext=0 +con_centernotify=false +con_notifytime=0 +con_notablist=false +cl_bbannounce=false +am_followplayer=true +am_textured=true +am_ovthingcolor_citem=e8 88 00 +am_ovthingcolor_item=e8 88 00 +am_ovthingcolor_ncmonster=e8 88 00 +am_ovthingcolor_monster=e8 88 00 +am_ovthingcolor_friend=e8 88 00 +am_ovthingcolor=e8 88 00 +am_ovsecretsectorcolor=00 ff ff +am_ovinterlevelcolor=ff ff 00 +am_ovtelecolor=ff ff 00 +am_ovunseencolor=00 22 6e +am_ovcdwallcolor=00 88 44 +am_ovfdwallcolor=00 88 44 +am_ovefwallcolor=00 88 44 +am_ovlockedcolor=00 88 44 +am_ovotherwallscolor=00 88 44 +am_ovspecialwallcolor=ff ff ff +am_ovsecretwallcolor=00 88 44 +am_ovwallcolor=00 ff 00 +am_ovyourcolor=fc e8 d8 +am_thingcolor_citem=fc fc fc +am_thingcolor_item=fc fc fc +am_thingcolor_ncmonster=fc fc fc +am_thingcolor_monster=fc fc fc +am_thingcolor_friend=fc fc fc +am_secretsectorcolor=ff 00 ff +am_interlevelcolor=ff 00 00 +am_intralevelcolor=00 00 ff +am_lockedcolor=00 78 00 +am_notseencolor=6c 6c 6c +am_xhaircolor=80 80 80 +am_gridcolor=8b 5a 2b +am_thingcolor=fc fc fc +am_efwallcolor=66 55 55 +am_cdwallcolor=4c 38 20 +am_fdwallcolor=88 70 58 +am_tswallcolor=88 88 88 +am_specialwallcolor=ff ff ff +am_secretwallcolor=00 00 00 +am_wallcolor=2c 18 08 +am_yourcolor=fc e8 d8 +am_backcolor=6c 54 40 +am_showthingsprites=0 +am_showtriggerlines=true +am_showkeys=true +am_drawmapback=0 +am_map_secrets=1 +am_customcolors=true +am_colorset=0 +am_showtotaltime=false +am_showtime=false +am_showitems=false +am_showmonsters=false +am_showsecrets=false +am_overlay=0 +am_rotate=0 + +[Doom.LocalServerInfo] +sv_corpsequeuesize=64 +forcewater=false +sv_smartaim=0 +sv_disableautohealth=false +sv_dropstyle=0 +compatflags2=0 +compatflags=0 + +[Doom.UnknownConsoleVariables] + +[Doom.ConsoleAliases] + +[Doom.Bindings] +1=slot 1 +2=slot 2 +3=slot 3 +4=slot 4 +5=slot 5 +6=slot 6 +7=slot 7 +8=slot 8 +9=slot 9 +0=slot 0 +-=sizedown +Equals=sizeup +tab=togglemap +t=messagemode +LeftBracket=invprev +RightBracket=invnext +enter=invuse +ctrl=+attack +`=toggleconsole +shift=+speed +\=+showscores +,=+moveleft +.=+moveright +alt=+strafe +space=+use +capslock=toggle cl_run +f1=menu_help +f2=menu_save +f3=menu_load +f4=menu_options +f5=menu_display +f6=quicksave +f7=menu_endgame +f8=togglemessages +f9=quickload +f10=menu_quit +f11=bumpgamma +f12=spynext +sysrq=screenshot +pause=pause +home=land +uparrow=+forward +pgup=+moveup +leftarrow=+left +rightarrow=+right +end=centerview +downarrow=+back +pgdn=+lookup +ins=+movedown +del=+lookdown +mouse1=+attack +mouse2=+strafe +mouse3=+forward +mouse4=+speed +joy1=+attack +joy2=+strafe +joy3=+speed +joy4=+use +mwheelup=weapprev +mwheeldown=weapnext +mwheelright=invnext +mwheelleft=invprev +dpadup=togglemap +dpaddown=invuse +dpadleft=invprev +dpadright=invnext +pad_start=pause +pad_back=menu_main +lthumb=crouch +lshoulder=weapprev +rshoulder=weapnext +ltrigger=+altattack +rtrigger=+attack +pad_a=+use +pad_y=+jump + +[Doom.DoubleBindings] + +[Doom.AutomapBindings] +0=am_gobig +-=+am_zoomout +Equals=+am_zoomin +p=am_toggletexture +f=am_togglefollow +g=am_togglegrid +c=am_clearmarks +m=am_setmark +kp-=+am_zoomout +kp+=+am_zoomin +uparrow=+am_panup +leftarrow=+am_panleft +rightarrow=+am_panright +downarrow=+am_pandown +mwheelup=am_zoom 1.2 +mwheeldown=am_zoom -1.2 + diff --git a/examples/basic.cfg b/examples/basic.cfg new file mode 100644 index 0000000..a21ea9e 
--- /dev/null
+++ b/examples/basic.cfg
@@ -0,0 +1,38 @@
+# Lines starting with # are treated as comments (or with whitespaces+#).
+# It doesn't matter if you use capital letters or not.
+# It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout.
+
+doom_map = map01
+
+# Rewards
+living_reward = -1
+
+# Rendering options
+screen_resolution = RES_320X240
+screen_format = CRCGCB
+render_hud = True
+render_crosshair = false
+render_weapon = true
+render_decals = false
+render_particles = false
+window_visible = true
+
+# make episodes start after 20 tics (after unholstering the gun)
+episode_start_time = 14
+
+# make episodes finish after 300 actions (tics)
+episode_timeout = 300
+
+# Available buttons
+available_buttons =
+	{
+		MOVE_LEFT
+		MOVE_RIGHT
+		ATTACK
+	}
+
+# Game variables that will be in the state
+available_game_variables = { AMMO2}
+
+mode = PLAYER
+doom_skill = 5
\ No newline at end of file
diff --git a/examples/random_agent_example.py b/examples/random_agent_example.py
new file mode 100644
index 0000000..2e039b0
--- /dev/null
+++ b/examples/random_agent_example.py
@@ -0,0 +1,25 @@
+from rlkit.agents import RandomAgent
+from rlkit.environments.gym_environment import GymEnvironment
+from rlkit.environments.vizdoom_environment import VizDoomEnvironment
+from rlkit.trainers import BasicTrainer
+
+params = {
+    "environment_params": {
+        # "env_name": "SpaceInvaders-v0",
+    },
+    "agent_params": {
+
+    },
+    "training_params": {
+        "run_name": "test_run",
+        "train_interval": 10,
+        "episodes": 5,
+        "steps": 500,
+    },
+}
+
+# env = GymEnvironment(params["environment_params"])
+env = VizDoomEnvironment(params["environment_params"])
+agent = RandomAgent(params["agent_params"], env.get_action_space())
+trainer = BasicTrainer(params["training_params"], agent, env)
+trainer.train()
diff --git a/rlkit/core/__init__.py b/rlkit/core/__init__.py
index 8c7f768..dbe1e9b 100644
--- a/rlkit/core/__init__.py
+++ b/rlkit/core/__init__.py
@@ -1,3 +1,4 @@
+from .base_action_space import BaseActionSpace
 from .base_agent import BaseAgent
 from .base_environment import BaseEnvironment
 from .base_trainer import BaseTrainer
\ No newline at end of file
diff --git a/rlkit/core/base_action.py b/rlkit/core/base_action.py
deleted file mode 100644
index 11109cb..0000000
--- a/rlkit/core/base_action.py
+++ /dev/null
@@ -1 +0,0 @@
-class BaseAction
\ No newline at end of file
diff --git a/rlkit/core/base_action_space.py b/rlkit/core/base_action_space.py
new file mode 100644
index 0000000..b138e2c
--- /dev/null
+++ b/rlkit/core/base_action_space.py
@@ -0,0 +1,6 @@
+class BaseActionSpace:
+    def __init__(self):
+        pass
+
+    def sample(self):
+        pass
\ No newline at end of file
diff --git a/rlkit/core/base_environment.py b/rlkit/core/base_environment.py
index 318519e..313e10e 100644
--- a/rlkit/core/base_environment.py
+++ b/rlkit/core/base_environment.py
@@ -1,8 +1,12 @@
 class BaseEnvironment:
     def __init__(self):
         self.to_render = False
+        self.done = False
         self.reset()
 
+    def close(self):
+        pass
+
     def execute_action(self, action):
         pass
 
@@ -14,3 +18,6 @@ def render(self):
 
     def setRender(self, to_render):
         self.to_render = to_render
+
+    def get_action_space(self):
+        pass
diff --git a/rlkit/environments/vizdoom_environment.py b/rlkit/environments/vizdoom_environment.py
index b60a174..1c6e559 100644
--- a/rlkit/environments/vizdoom_environment.py
+++ b/rlkit/environments/vizdoom_environment.py
@@ -1,14 +1,64 @@
-from rlkit.core import BaseEnvironment
+import random
+import time
+
+from rlkit.core import BaseEnvironment, BaseActionSpace
 from vizdoom import *
 
 class VizDoomEnvironment(BaseEnvironment):
+
+    class VizDoomActionSpace(BaseActionSpace):
+        def __init__(self):
+            self.actions = [
+                # http://www.cs.put.poznan.pl/visualdoomai/tutorial.html
+                [0, 0, 1], # shoot
+                [1, 0, 0], # left
+                [0, 1, 0], # right
+            ]
+            super(VizDoomEnvironment.VizDoomActionSpace, self).__init__()
+
+        def sample(self):
+            return random.sample(self.actions, 1)[0]
+
     def __init__(self, params):
+        self.action_space = self.VizDoomActionSpace()
+        self.initialize_env()
         super(VizDoomEnvironment, self).__init__()
-        self.env_name = params["env_name"]
-
-        pass
 
     def initialize_env(self):
         self.env = DoomGame()
-        self.env.load_config("../config/basic.cfg")
-        self.env.init()
\ No newline at end of file
+        self.env.load_config("./basic.cfg") # TODO: load via params
+        self.env.init()
+
+    def get_action_space(self):
+        return self.action_space
+
+    def reset(self, reset_values=True):
+        if reset_values:
+            self.reset_values()
+        self.reset_env()
+
+    def reset_values(self):
+        self.state = None
+        self.reward = None
+        self.done = False
+        self.info = None
+
+    def reset_env(self):
+        self.env.new_episode()
+
+    def step(self, action):
+        self.reward = self.env.make_action(action)
+
+        # TODO: see if need to get image buffer
+        # TODO: see if this happens before/after reward
+        self.state = self.env.get_state()
+
+        self.done = self.env.is_episode_finished()
+        if not self.done:
+            self.info = self.state.game_variables
+        else:
+            self.info = None
+
+        print(action, self.done, self.env.get_total_reward(), self.info)
+        time.sleep(0.02) # TODO: remove
+        return (self.state, self.reward, self.done, self.info, )
diff --git a/rlkit/trainers/basic_trainer.py b/rlkit/trainers/basic_trainer.py
index 9123e15..02bdac0 100644
--- a/rlkit/trainers/basic_trainer.py
+++ b/rlkit/trainers/basic_trainer.py
@@ -15,7 +15,7 @@ def __init__(self, params, agent, environment):
     def do_step(self):
         action = self.agent.get_action(self.environment.state)
         self.environment.step(action)
-        self.environment.render() # TODO: find better solution
+        # self.environment.render() # TODO: find better solution
 
     def train(self):
         try:

From 0cd91a5c85a581508dc31c4a90adfb855b8221e9 Mon Sep 17 00:00:00 2001
From: Shubham Jha
Date: Sat, 20 Jun 2020 09:35:34 +0530
Subject: [PATCH 6/6] Change initialization syntax of params dict

---
 rlkit/__main__.py | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/rlkit/__main__.py b/rlkit/__main__.py
index 272df3f..088295e 100644
--- a/rlkit/__main__.py
+++ b/rlkit/__main__.py
@@ -2,20 +2,23 @@
 from rlkit.environments.gym_environment import GymEnvironment
 from rlkit.trainers import BasicTrainer
 
-params = {
-    "environment_params": {
-        "env_name": "SpaceInvaders-v0",
-    },
-    "agent_params": {
-
-    },
-    "training_params": {
-        "run_name": "test_run",
-        "train_interval": 10,
-        "episodes": 5,
-        "steps": 500,
-    },
-}
+SEED = 1234
+params = dict(
+    environment_params = dict(
+        env_name = "SpaceInvaders-v0",
+        seed = SEED,
+    ),
+    agent_params = dict(
+        seed = SEED,
+    ),
+    training_params= dict(
+        run_name = "test_run",
+        train_interval = 10,
+        episodes = 5,
+        steps = 500,
+        seed = SEED,
+    ),
+)
 
 env = GymEnvironment(params["environment_params"])
 agent = RandomAgent(params["agent_params"], env.get_action_space())