
Commit

Rewrite A2C example
Former-commit-id: 5d066a578e771161671328f3c803f419dd2e16d1 [formerly e00cfb322a3cb8a977a52eedad7a101b6aa4aaf5]
Former-commit-id: c564d1a875f7cbd6d2aa1ad5700f2e24c63ee8a5
zuoxingdong committed Sep 20, 2018
1 parent 08252bc commit 5d5bd56
Showing 10 changed files with 284 additions and 458 deletions.
174 changes: 93 additions & 81 deletions examples/policy_gradient/a2c/algo.py
@@ -5,135 +5,147 @@
 import numpy as np
 
 import torch
-import torch.optim as optim
 import torch.nn as nn
 import torch.nn.functional as F
+import torch.optim as optim
 
 from lagom import set_global_seeds
 from lagom import BaseAlgorithm
-from lagom.envs import EnvSpec
+from lagom import pickle_dump
 
-from lagom.envs import make_envs
 from lagom.envs import make_gym_env
+from lagom.envs import make_vec_env
+from lagom.envs import EnvSpec
 from lagom.envs.vec_env import SerialVecEnv
-from lagom.envs.vec_env import StandardizeVecEnv
+from lagom.envs.vec_env import VecStandardize
 
-from lagom.core.policies import CategoricalPolicy
-from lagom.core.policies import GaussianPolicy
-
+from lagom.runner import TrajectoryRunner
 from lagom.runner import SegmentRunner
 
 from lagom.agents import A2CAgent
 
 from engine import Engine
+from policy import CategoricalMLP
+from policy import CategoricalPolicy
+from policy import GaussianMLP
+from policy import GaussianPolicy
-from policy import Network


 class Algorithm(BaseAlgorithm):
-    def __call__(self, config):
-        # Set random seeds: PyTorch, numpy.random, random
-        set_global_seeds(seed=config['seed'])
+    def __call__(self, config, seed, device_str):
+        # Set random seeds
+        set_global_seeds(seed)
+        # Create device
+        device = torch.device(device_str)
+        # Use log dir for current job (run_experiment)
+        logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

-        # Create an VecEnv environment
-        list_make_env = make_envs(make_env=make_gym_env,
-                                  env_id=config['env:id'],
-                                  num_env=config['train:N'],
-                                  init_seed=config['seed'])
-        env = SerialVecEnv(list_make_env)
-        # Wrapper to standardize observation and reward from running average
-        if config['env:normalize']:
-            env = StandardizeVecEnv(venv=env,
-                                    use_obs=True,
-                                    use_reward=True,
-                                    clip_obs=10.,
-                                    clip_reward=10.,
-                                    gamma=0.99,
-                                    eps=1e-8)
-        # Create environment specification
+        # Make environment (VecEnv) for training and evaluating
+        env = make_vec_env(vec_env_class=SerialVecEnv,
+                           make_env=make_gym_env,
+                           env_id=config['env.id'],
+                           num_env=1,
+                           init_seed=seed)
+        eval_env = make_vec_env(vec_env_class=SerialVecEnv,
+                                make_env=make_gym_env,
+                                env_id=config['env.id'],
+                                num_env=1,
+                                init_seed=seed)
+        if config['env.standardize']:  # wrap with VecStandardize for running averages of observation and rewards
+            env = VecStandardize(venv=env,
+                                 use_obs=True,
+                                 use_reward=True,
+                                 clip_obs=10.,
+                                 clip_reward=10.,
+                                 gamma=0.99,
+                                 eps=1e-8)
+            eval_env = VecStandardize(venv=eval_env,  # remember to synchronize running averages during evaluation !!!
+                                      use_obs=True,
+                                      use_reward=False,  # do not process rewards, no training
+                                      clip_obs=env.clip_obs,
+                                      clip_reward=env.clip_reward,
+                                      gamma=env.gamma,
+                                      eps=env.eps,
+                                      constant_obs_mean=env.obs_runningavg.mu,  # use current running average as constant
+                                      constant_obs_std=env.obs_runningavg.sigma)
         env_spec = EnvSpec(env)
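        # Note: VecStandardize maintains running estimates of the observation mean/std
        # (env.obs_runningavg above); the eval env is frozen to the training env's current
        # statistics via constant_obs_mean/constant_obs_std, so evaluation applies the same
        # normalization without updating it.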

-        # Create device object, note that in BaseExperimentWorker already assigns a specific GPU for this task
-        device = torch.device(f'cuda:{torch.cuda.current_device()}' if config['cuda'] else 'cpu')
-
         # Create policy
-        network = Network(config=config, env_spec=env_spec)
         if env_spec.control_type == 'Discrete':
+            network = CategoricalMLP(config=config, env_spec=env_spec).to(device)
-            policy = CategoricalPolicy(network=network,
-                                       env_spec=env_spec,
-                                       config=config)
+            policy = CategoricalPolicy(config=config, network=network, env_spec=env_spec, learn_V=True)
         elif env_spec.control_type == 'Continuous':
+            network = GaussianMLP(config=config, env_spec=env_spec).to(device)
-            policy = GaussianPolicy(network=network,
+            policy = GaussianPolicy(config=config,
+                                    network=network,
                                     env_spec=env_spec,
-                                    config=config,
-                                    min_std=config['agent:min_std'],
-                                    std_style=config['agent:std_style'],
-                                    constant_std=config['agent:constant_std'])
-
-        # Create optimizer
-        optimizer = optim.Adam(policy.network.parameters(), lr=config['algo:lr'])
-        # Create learning rate scheduler
-        if config['algo:use_lr_scheduler']:
-            # Define max number of lr decay
-            if 'train:iter' in config:  # iteration-based training
-                max_epoch = config['train:iter']
-            elif 'train:timestep' in config:  # timestep-based training
-                max_epoch = config['train:timestep'] + 1  # plus 1 avoid having 0.0 lr in final iteration
+                                    learn_V=True,
+                                    min_std=config['agent.min_std'],
+                                    std_style=config['agent.std_style'],
+                                    constant_std=config['agent.constant_std'],
+                                    std_state_dependent=config['agent.std_state_dependent'],
+                                    init_std=config['agent.init_std'])
+        network = network.to(device)
+
+        # Create optimizer and learning rate scheduler
+        optimizer = optim.Adam(policy.network.parameters(), lr=config['algo.lr'])
+        if config['algo.use_lr_scheduler']:
+            if 'train.iter' in config:  # iteration-based training
+                max_epoch = config['train.iter']
+            elif 'train.timestep' in config:  # timestep-based training
+                max_epoch = config['train.timestep'] + 1  # +1 to avoid 0.0 lr in final iteration
             lambda_f = lambda epoch: 1 - epoch/max_epoch  # decay learning rate for each training epoch
             lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_f)
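            # With LambdaLR the effective learning rate at update k is algo.lr * lambda_f(k),
            # so it decays linearly from config['algo.lr'] towards zero over max_epoch updates
            # (the +1 above keeps the final factor positive); see the standalone snippet after
            # the diff for this schedule in isolation.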

         # Create agent
         kwargs = {'device': device}
-        if config['algo:use_lr_scheduler']:
+        if config['algo.use_lr_scheduler']:
             kwargs['lr_scheduler'] = lr_scheduler
-        agent = A2CAgent(policy=policy,
+        agent = A2CAgent(config=config,
+                         policy=policy,
                          optimizer=optimizer,
-                         config=config,
                          **kwargs)

         # Create runner
         runner = SegmentRunner(agent=agent,
                                env=env,
-                               gamma=config['algo:gamma'])
+                               gamma=config['algo.gamma'])
+        eval_runner = TrajectoryRunner(agent=agent,
+                                       env=eval_env,
+                                       gamma=1.0)
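        # SegmentRunner gathers the training rollouts (discounted with algo.gamma), while
        # TrajectoryRunner rolls out complete episodes on eval_env with gamma=1.0, so the
        # evaluation returns are reported undiscounted.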

         # Create engine
         engine = Engine(agent=agent,
                         runner=runner,
                         config=config,
-                        logger=None)
+                        eval_runner=eval_runner)

         # Training and evaluation
         train_logs = []
         eval_logs = []
 
-        for i in count():  # successively increment iteration
-            # Terminate until condition is met
-            if 'train:iter' in config and i >= config['train:iter']:  # enough iteration, terminate
+        for i in count():  # incremental iteration
+            if 'train.iter' in config and i >= config['train.iter']:  # enough iterations
                 break
-            elif 'train:timestep' in config and agent.accumulated_trained_timesteps >= config['train:timestep']:
+            elif 'train.timestep' in config and agent.total_T >= config['train.timestep']:  # enough timesteps
                 break
 
-            # Do training
-            train_output = engine.train(i)
-
-            # Logging and evaluation
-            if i == 0 or (i+1) % config['log:interval'] == 0:
-                # Log training and record the loggings
-                train_logger = engine.log_train(train_output)
-                train_logs.append(train_logger.logs)
-                # Log evaluation and record the loggings
-                with torch.no_grad():  # no need to have gradient, save memory
-                    eval_output = engine.eval(i)
-                    eval_logger = engine.log_eval(eval_output)
-                    eval_logs.append(eval_logger.logs)
-
-                # Save the logging periodically
-                # This is good to avoid saving very large file at once, because the program might get stuck
-                # The file name is augmented with current iteration
-                np.save(Path(config['log:dir']) / str(config['ID']) / f'train:{i}', train_logs)
-                np.save(Path(config['log:dir']) / str(config['ID']) / f'eval:{i}', eval_logs)
-                # Clear the logging list
-                train_logs.clear()
-                eval_logs.clear()
+            # train and evaluation
+            train_output = engine.train(n=i)
+
+            # logging
+            if i == 0 or (i+1) % config['log.record_interval'] == 0 or (i+1) % config['log.print_interval'] == 0:
+                train_log = engine.log_train(train_output)
+
+                with torch.no_grad():  # disable grad, save memory
+                    eval_output = engine.eval(n=i)
+                    eval_log = engine.log_eval(eval_output)
+
+                if i == 0 or (i+1) % config['log.record_interval'] == 0:  # record loggings
+                    train_logs.append(train_log)
+                    eval_logs.append(eval_log)
+
+        # Save all loggings
+        pickle_dump(obj=train_logs, f=logdir/'train_logs', ext='.pkl')
+        pickle_dump(obj=eval_logs, f=logdir/'eval_logs', ext='.pkl')
 
         return None
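The learning-rate schedule above uses only standard PyTorch machinery. As a reference, here is a self-contained sketch of that linear decay in isolation; the parameter list and max_epoch are placeholders standing in for policy.network.parameters() and the training horizon.

import torch
import torch.optim as optim

params = [torch.nn.Parameter(torch.zeros(3))]  # stand-in for policy.network.parameters()
optimizer = optim.Adam(params, lr=1e-3)

max_epoch = 5
lambda_f = lambda epoch: 1 - epoch/max_epoch   # multiplicative factor: 1.0, 0.8, ..., 0.0
lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_f)

for epoch in range(max_epoch):
    optimizer.step()                           # the real A2C update would happen here
    lr_scheduler.step()
    print(optimizer.param_groups[0]['lr'])     # ~0.0008, 0.0006, 0.0004, 0.0002, 0.0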
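Finally, for anyone who wants to run this example outside lagom's run_experiment pipeline, a minimal illustrative driver might look like the sketch below. It is not part of the commit: the config keys are those referenced in the diff, the values are placeholders, Algorithm is assumed to be constructible without arguments, and the Engine/policy classes may read additional keys not shown here.

from algo import Algorithm

config = {
    'ID': 0,
    'log.dir': 'logs',
    'log.record_interval': 10,
    'log.print_interval': 10,
    'env.id': 'CartPole-v1',
    'env.standardize': False,
    'algo.lr': 1e-3,
    'algo.gamma': 0.99,
    'algo.use_lr_scheduler': True,
    'train.timestep': 10000,
    # continuous-control keys, only read when env_spec.control_type == 'Continuous'
    'agent.min_std': 1e-6,
    'agent.std_style': 'exp',
    'agent.constant_std': None,
    'agent.std_state_dependent': False,
    'agent.init_std': 1.0,
}

Algorithm()(config, seed=0, device_str='cpu')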
