From 24a41d4d85ff0ba7d963490a6d2f00072b5c88bd Mon Sep 17 00:00:00 2001 From: Ronaldo Vieira Date: Tue, 26 Apr 2022 12:46:22 -0300 Subject: [PATCH 01/27] Implement abstract reward function class --- gym_locm/envs/rewards.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 gym_locm/envs/rewards.py diff --git a/gym_locm/envs/rewards.py b/gym_locm/envs/rewards.py new file mode 100644 index 0000000..71e932c --- /dev/null +++ b/gym_locm/envs/rewards.py @@ -0,0 +1,16 @@ +from abc import ABC, abstractmethod + +from gym_locm.engine import State, PlayerOrder + + +class RewardFunction(ABC): + @abstractmethod + def calculate(self, state: State, for_player: PlayerOrder = PlayerOrder.FIRST): + pass + + +available_rewards = {} + + +def parse_reward(reward_name: str): + return available_rewards[reward_name.lower().replace(" ", "-")] From 351ee77a51c668e9f02c2b8cc9a0bb7d0d984f4b Mon Sep 17 00:00:00 2001 From: Ronaldo Vieira Date: Tue, 26 Apr 2022 12:46:53 -0300 Subject: [PATCH 02/27] Implement win-loss reward function --- gym_locm/envs/rewards.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/gym_locm/envs/rewards.py b/gym_locm/envs/rewards.py index 71e932c..3e542ed 100644 --- a/gym_locm/envs/rewards.py +++ b/gym_locm/envs/rewards.py @@ -9,7 +9,19 @@ def calculate(self, state: State, for_player: PlayerOrder = PlayerOrder.FIRST): pass -available_rewards = {} +class WinLossRewardFunction(RewardFunction): + def calculate(self, state: State, for_player: PlayerOrder = PlayerOrder.FIRST): + if state.winner == for_player: + return 1 + elif state.winner == for_player.opposing(): + return -1 + else: + return 0 + + +available_rewards = { + "win-loss": WinLossRewardFunction +} def parse_reward(reward_name: str): From ef9e635a15bc0f5178e467f32d33a6363f3b79fe Mon Sep 17 00:00:00 2001 From: Ronaldo Vieira Date: Tue, 26 Apr 2022 12:47:18 -0300 Subject: [PATCH 03/27] Implement player health reward function --- gym_locm/envs/rewards.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gym_locm/envs/rewards.py b/gym_locm/envs/rewards.py index 3e542ed..8494725 100644 --- a/gym_locm/envs/rewards.py +++ b/gym_locm/envs/rewards.py @@ -19,8 +19,14 @@ def calculate(self, state: State, for_player: PlayerOrder = PlayerOrder.FIRST): return 0 +class PlayerHealthRewardFunction(RewardFunction): + def calculate(self, state: State, for_player: PlayerOrder = PlayerOrder.FIRST): + return state.players[for_player].health / 30 + + available_rewards = { - "win-loss": WinLossRewardFunction + "win-loss": WinLossRewardFunction, + "player-health": PlayerHealthRewardFunction } From 85484eb635a8ad561f6cbad1badc8eb74fc6438c Mon Sep 17 00:00:00 2001 From: Ronaldo Vieira Date: Tue, 26 Apr 2022 12:47:35 -0300 Subject: [PATCH 04/27] Implement opponent health reward function --- gym_locm/envs/rewards.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/gym_locm/envs/rewards.py b/gym_locm/envs/rewards.py index 8494725..24d4a97 100644 --- a/gym_locm/envs/rewards.py +++ b/gym_locm/envs/rewards.py @@ -24,9 +24,15 @@ def calculate(self, state: State, for_player: PlayerOrder = PlayerOrder.FIRST): return state.players[for_player].health / 30 +class OpponentHealthRewardFunction(RewardFunction): + def calculate(self, state: State, for_player: PlayerOrder = PlayerOrder.FIRST): + return -max(0, state.players[for_player.opposing()].health) / 30 + + available_rewards = { "win-loss": WinLossRewardFunction, - "player-health": PlayerHealthRewardFunction + 
"player-health": PlayerHealthRewardFunction, + "opponent-health": OpponentHealthRewardFunction } From a335c54530ab197a7d073379315a7fde22a2dead Mon Sep 17 00:00:00 2001 From: Ronaldo Vieira Date: Tue, 26 Apr 2022 12:48:07 -0300 Subject: [PATCH 05/27] Add reward function and weights as parameters on base env --- gym_locm/envs/base_env.py | 11 ++++++++++- gym_locm/envs/battle.py | 2 -- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/gym_locm/envs/base_env.py b/gym_locm/envs/base_env.py index de9d1ca..9488c6a 100644 --- a/gym_locm/envs/base_env.py +++ b/gym_locm/envs/base_env.py @@ -6,18 +6,27 @@ from prettytable import PrettyTable from gym_locm.engine import Creature, GreenItem, RedItem, BlueItem, State, Phase, ActionType, Action, Lane +from gym_locm.envs.rewards import parse_reward from gym_locm.exceptions import MalformedActionError class LOCMEnv(gym.Env, ABC): card_types = {Creature: 0, GreenItem: 1, RedItem: 2, BlueItem: 3} - def __init__(self, seed=None, items=True, k=3, n=30): + def __init__(self, seed=None, items=True, k=3, n=30, reward_functions=('win-loss',), reward_weights=(1.0,)): self._seed = seed self.episodes = 0 self.items = items self.k, self.n = k, n + assert len(reward_functions) == len(reward_weights), \ + "The length of reward_functions and reward_weights must be the same" + + self.reward_functions = tuple([parse_reward(function_name)() for function_name in reward_functions]) + self.reward_weights = reward_weights + + self.reward_range = (-max(reward_weights), max(reward_weights)) + self.state = State(seed=seed, items=items, k=k, n=n) def seed(self, seed=None): diff --git a/gym_locm/envs/battle.py b/gym_locm/envs/battle.py index c2be698..91d2cf0 100644 --- a/gym_locm/envs/battle.py +++ b/gym_locm/envs/battle.py @@ -49,8 +49,6 @@ def __init__(self, # 41 possible actions self.action_space = gym.spaces.Discrete(41) - self.reward_range = (-1, 1) - # play through draft while self.state.phase == Phase.DRAFT: for agent in self.draft_agents: From f2d0e63ed53cbf37f0b0ef2d88cc15303de65921 Mon Sep 17 00:00:00 2001 From: Ronaldo Vieira Date: Tue, 26 Apr 2022 12:48:51 -0300 Subject: [PATCH 06/27] Support reward functions and weights on battle envs --- gym_locm/envs/battle.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/gym_locm/envs/battle.py b/gym_locm/envs/battle.py index 91d2cf0..18db1ab 100644 --- a/gym_locm/envs/battle.py +++ b/gym_locm/envs/battle.py @@ -12,8 +12,10 @@ class LOCMBattleEnv(LOCMEnv): def __init__(self, draft_agents=(RandomDraftAgent(), RandomDraftAgent()), - return_action_mask=False, seed=None, items=True, k=3, n=30): - super().__init__(seed=seed, items=items, k=k, n=n) + return_action_mask=False, seed=None, items=True, k=3, n=30, + reward_functions=('win-loss',), reward_weights=(1.0,)): + super().__init__(seed=seed, items=items, k=k, n=n, + reward_functions=reward_functions, reward_weights=reward_weights) self.rewards = [0.0] @@ -76,6 +78,10 @@ def step(self, action): # less property accesses state = self.state + current_player_id = state.current_player.id + + reward_before = [weight * function.calculate(state, for_player=current_player_id) + for function, weight in zip(self.reward_functions, self.reward_weights)] # execute the action if action is not None: @@ -83,23 +89,25 @@ def step(self, action): else: state.was_last_action_invalid = True + reward_after = [weight * function.calculate(state, for_player=current_player_id) + for function, weight in zip(self.reward_functions, self.reward_weights)] + # 
build return info winner = state.winner - reward = 0 + raw_rewards = tuple([after - before for before, after in zip(reward_before, reward_after)]) + reward = sum(raw_rewards) done = winner is not None info = {'phase': state.phase, 'turn': state.turn, 'winner': winner, - 'invalid': state.was_last_action_invalid} + 'invalid': state.was_last_action_invalid, + 'raw_rewards': raw_rewards} if self.return_action_mask: info['action_mask'] = self.state.action_mask - if winner is not None: - reward = 1 if winner == PlayerOrder.FIRST else -1 - - self.rewards[-1] += reward + self.rewards[-1] += reward return self.encode_state(), reward, done, info From 08b9808090eeadd9a5d89bd8d19c7a5b7d5e5e57 Mon Sep 17 00:00:00 2001 From: Ronaldo Vieira Date: Tue, 26 Apr 2022 12:49:01 -0300 Subject: [PATCH 07/27] Support reward functions and weights on draft envs --- gym_locm/envs/draft.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/gym_locm/envs/draft.py b/gym_locm/envs/draft.py index 2f2b51e..4080501 100644 --- a/gym_locm/envs/draft.py +++ b/gym_locm/envs/draft.py @@ -14,8 +14,10 @@ def __init__(self, battle_agents=(RandomBattleAgent(), RandomBattleAgent()), use_draft_history=False, use_mana_curve=False, sort_cards=False, evaluation_battles=1, - seed=None, items=True, k=3, n=30): - super().__init__(seed=seed, items=items, k=k, n=n) + seed=None, items=True, k=3, n=30, + reward_functions=('win-loss',), reward_weights=(1.0,)): + super().__init__(seed=seed, items=items, k=k, n=n, + reward_functions=reward_functions, reward_weights=reward_weights) # init bookkeeping structures self.results = [] @@ -93,6 +95,10 @@ def step(self, action: Union[int, Action]) -> (np.array, int, bool, dict): # less property accesses state = self.state + current_player_id = state.current_player.id + + reward_before = [weight * function.calculate(state, for_player=current_player_id) + for function, weight in zip(self.reward_functions, self.reward_weights)] # find appropriate value for the provided card index if 0 <= action.origin < self.k: @@ -107,8 +113,10 @@ def step(self, action: Union[int, Action]) -> (np.array, int, bool, dict): # execute the action state.act(action) + reward_after = [weight * function.calculate(state, for_player=current_player_id) + for function, weight in zip(self.reward_functions, self.reward_weights)] + # init return info - reward = 0 done = False info = {'phase': state.phase, 'turn': state.turn, @@ -134,11 +142,20 @@ def step(self, action: Union[int, Action]) -> (np.array, int, bool, dict): self.results.append(1 if winner == PlayerOrder.FIRST else -1) info['winner'].append(winner) - reward = np.mean(self.results) + try: + win_loss_reward_index = self.reward_functions.index("win-loss") + reward_after[win_loss_reward_index] = np.mean(self.results) + except ValueError: + pass + done = True del info['turn'] + raw_rewards = tuple([after - before for before, after in zip(reward_before, reward_after)]) + info['raw_rewards'] = raw_rewards + reward = sum(raw_rewards) + return self.encode_state(), reward, done, info def do_match(self, state): From 9f89f636ce36d99cb40ad7db2ba5b933c179d661 Mon Sep 17 00:00:00 2001 From: Ronaldo Vieira Date: Tue, 26 Apr 2022 13:16:38 -0300 Subject: [PATCH 08/27] Fix state cloning when RNG was not initialized --- gym_locm/engine.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gym_locm/engine.py b/gym_locm/engine.py index 7ba840d..522873d 100644 --- a/gym_locm/engine.py +++ b/gym_locm/engine.py @@ -922,7 +922,11 @@ def 
clone(self) -> 'State': cloned_state = State.empty_copy() cloned_state.np_random = np.random.RandomState() - cloned_state.np_random.set_state(self.np_random.get_state()) + + try: + cloned_state.np_random.set_state(self.np_random.get_state()) + except ValueError: + pass cloned_state.instance_counter = self.instance_counter cloned_state.summon_counter = self.summon_counter From 1287cc9a04de0741c11e558da6f66866b4cd26c1 Mon Sep 17 00:00:00 2001 From: Ronaldo Vieira Date: Tue, 26 Apr 2022 13:16:52 -0300 Subject: [PATCH 09/27] Add hello world battle script --- hello_world_battle.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 hello_world_battle.py diff --git a/hello_world_battle.py b/hello_world_battle.py new file mode 100644 index 0000000..562bb5d --- /dev/null +++ b/hello_world_battle.py @@ -0,0 +1,31 @@ +import gym + +from gym_locm import agents + + +def hello_world(): + env = gym.make( + "LOCM-battle-v0", + draft_agents=(agents.RandomDraftAgent(), agents.RandomDraftAgent()), + battle_agent=(agents.RandomBattleAgent()), + reward_functions=["win-loss", "opponent-health"], reward_weights=[1.0, 1.0], + seed=42 + ) + + agent = agents.GreedyBattleAgent() + + obs = env.reset() + done = False + + while not done: + env.render(mode='text') + action = agent.act(env.state) + print("Action:", action) + + obs, reward, done, info = env.step(action) + + print("Reward:", reward, info['raw_rewards']) + + +if __name__ == '__main__': + hello_world() From f392c7fdf8d4a954f9679c2ab7b5f300445de4b0 Mon Sep 17 00:00:00 2001 From: Ronaldo Vieira Date: Thu, 28 Apr 2022 10:49:15 -0300 Subject: [PATCH 10/27] Remove recurrent done masks from RLBattleAgent --- gym_locm/agents.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gym_locm/agents.py b/gym_locm/agents.py index 366878e..2c52416 100644 --- a/gym_locm/agents.py +++ b/gym_locm/agents.py @@ -1145,8 +1145,7 @@ def reset(self): def act(self, state, action_masks): action, self.hidden_states = \ self.model.predict(state, state=self.hidden_states, - mask=self.dones, deterministic=True, - action_masks=action_masks) + deterministic=True, action_masks=action_masks) return action From dc57c039f94e8cde96639650c28915022c142459 Mon Sep 17 00:00:00 2001 From: Ronaldo Vieira Date: Thu, 28 Apr 2022 10:51:23 -0300 Subject: [PATCH 11/27] Fix reward range on battle envs --- gym_locm/envs/base_env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gym_locm/envs/base_env.py b/gym_locm/envs/base_env.py index 9488c6a..652f3b9 100644 --- a/gym_locm/envs/base_env.py +++ b/gym_locm/envs/base_env.py @@ -25,7 +25,7 @@ def __init__(self, seed=None, items=True, k=3, n=30, reward_functions=('win-loss self.reward_functions = tuple([parse_reward(function_name)() for function_name in reward_functions]) self.reward_weights = reward_weights - self.reward_range = (-max(reward_weights), max(reward_weights)) + self.reward_range = (-sum(reward_weights), sum(reward_weights)) self.state = State(seed=seed, items=items, k=k, n=n) From 9a0e0d4e29fdb05c699635195582870a17d2b5c5 Mon Sep 17 00:00:00 2001 From: Ronaldo Vieira Date: Thu, 28 Apr 2022 10:54:18 -0300 Subject: [PATCH 12/27] Remove unused battle training script --- gym_locm/experiments/training-battle.py | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 gym_locm/experiments/training-battle.py diff --git a/gym_locm/experiments/training-battle.py b/gym_locm/experiments/training-battle.py deleted file mode 100644 index 705e483..0000000 --- 
a/gym_locm/experiments/training-battle.py +++ /dev/null @@ -1,19 +0,0 @@ -import gym -import gym_locm -from sb3_contrib import MaskablePPO -from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback -from stable_baselines3.common.env_util import make_vec_env - -path = 'trained_models/battle/ppo_mask' -n_envs = 4 - -env = make_vec_env('LOCM-battle-v0', n_envs=n_envs) -eval_env = make_vec_env('LOCM-battle-v0', n_envs=n_envs) - -model = MaskablePPO("MlpPolicy", env, gamma=1, verbose=1) - -checkpoint_callback = CheckpointCallback(save_freq=10_000 // n_envs, save_path=path + '/models') - -model.learn(1_000_000, callback=checkpoint_callback) - -model.save('trained_models/battle/ppo_mask/final') From d70312b6937b2d12ae54d3d6cd554b9de1cfcd54 Mon Sep 17 00:00:00 2001 From: Ronaldo Vieira Date: Thu, 28 Apr 2022 12:49:46 -0300 Subject: [PATCH 13/27] Separate battle and draft trainer scripts Resolves #2 --- gym_locm/experiments/hyp-search.py | 2 +- gym_locm/experiments/training.py | 7 +- gym_locm/toolbox/trainer_battle.py | 862 ++++++++++++++++++ .../toolbox/{trainer.py => trainer_draft.py} | 0 4 files changed, 868 insertions(+), 3 deletions(-) create mode 100644 gym_locm/toolbox/trainer_battle.py rename gym_locm/toolbox/{trainer.py => trainer_draft.py} (100%) diff --git a/gym_locm/experiments/hyp-search.py b/gym_locm/experiments/hyp-search.py index 9979405..befc098 100644 --- a/gym_locm/experiments/hyp-search.py +++ b/gym_locm/experiments/hyp-search.py @@ -8,7 +8,7 @@ from hyperopt.pyll import scope from gym_locm.agents import MaxAttackBattleAgent, GreedyBattleAgent, MaxAttackDraftAgent -from gym_locm.toolbox.trainer import AsymmetricSelfPlay, model_builder_mlp, model_builder_lstm +from gym_locm.toolbox.trainer_draft import AsymmetricSelfPlay, model_builder_mlp, model_builder_lstm hyperparameter_space = { 'switch_freq': hp.choice('switch_freq', [10, 100, 1000]), diff --git a/gym_locm/experiments/training.py b/gym_locm/experiments/training.py index 62e44b5..ef48f43 100644 --- a/gym_locm/experiments/training.py +++ b/gym_locm/experiments/training.py @@ -6,8 +6,6 @@ import wandb from gym_locm import agents -from gym_locm.toolbox.trainer import AsymmetricSelfPlay, model_builder_mlp, \ - model_builder_lstm, model_builder_mlp_masked, SelfPlay, FixedAdversary _counter = 0 @@ -86,6 +84,9 @@ def run(): if args.task == 'draft': + from gym_locm.toolbox.trainer_draft import AsymmetricSelfPlay, SelfPlay, FixedAdversary, \ + model_builder_mlp, model_builder_lstm + if args.approach == 'lstm': model_builder = model_builder_lstm else: @@ -109,6 +110,8 @@ def run(): elif args.task == 'battle': + from gym_locm.toolbox.trainer_battle import AsymmetricSelfPlay, SelfPlay, FixedAdversary, model_builder_mlp_masked + model_builder = model_builder_mlp_masked draft_agent = agents.parse_draft_agent(args.draft_agent) battle_agent = agents.parse_battle_agent(args.battle_agent) diff --git a/gym_locm/toolbox/trainer_battle.py b/gym_locm/toolbox/trainer_battle.py new file mode 100644 index 0000000..64fd43d --- /dev/null +++ b/gym_locm/toolbox/trainer_battle.py @@ -0,0 +1,862 @@ +import json +import logging +import math +import os +import time +import numpy as np +from abc import abstractmethod +from datetime import datetime +from statistics import mean + +import torch as th + +from stable_baselines3.common.vec_env import VecEnv as VecEnv3, DummyVecEnv as DummyVecEnv3 +from stable_baselines3.common.callbacks import BaseCallback, CallbackList +from sb3_contrib import MaskablePPO +from wandb.integration.sb3 import 
WandbCallback + +from gym_locm.agents import Agent, MaxAttackDraftAgent, MaxAttackBattleAgent, RLBattleAgent, RLDraftAgent +from gym_locm.envs import LOCMBattleSingleEnv +from gym_locm.envs.battle import LOCMBattleSelfPlayEnv + +verbose = True +REALLY_BIG_INT = 1_000_000_000 + +if verbose: + logging.basicConfig(level=logging.DEBUG) + + +class TrainingSession: + def __init__(self, task, params, path, seed, wandb_run=None): + # initialize logger + self.logger = logging.getLogger('{0}.{1}'.format(__name__, + type(self).__name__)) + + # initialize results + self.checkpoints = [] + self.win_rates = [] + self.episode_lengths = [] + self.battle_lengths = [] + self.action_histograms = [] + self.start_time, self.end_time = None, None + self.wandb_run = wandb_run + + # save parameters + self.task = task + self.params = params + self.path = os.path.dirname(__file__) + "/../../" + path + self.seed = seed + + @abstractmethod + def _train(self): + pass + + def _save_results(self): + results_path = self.path + '/results.json' + + with open(results_path, 'w') as file: + info = dict(task=self.task, **self.params, seed=self.seed, checkpoints=self.checkpoints, + win_rates=self.win_rates, ep_lengths=self.episode_lengths, + battle_lengths=self.battle_lengths, + action_histograms=self.action_histograms, + start_time=str(self.start_time), end_time=str(self.end_time)) + info = json.dumps(info, indent=2) + + file.write(info) + + self.logger.debug(f"Results saved at {results_path}.") + + def run(self): + # log start time + self.start_time = datetime.now() + self.logger.info(f"Training a {self.task} agent...") + + # do the training + self._train() + + # log end time + self.end_time = datetime.now() + self.logger.info(f"End of training. Time elapsed: {self.end_time - self.start_time}.") + + # save model info to results file + self._save_results() + + +class FixedAdversary(TrainingSession): + def __init__(self, task, model_builder, model_params, env_params, + eval_env_params, train_episodes, eval_episodes, num_evals, + play_first, path, seed, num_envs=1, wandb_run=None): + super(FixedAdversary, self).__init__( + task, model_params, path, seed, wandb_run=wandb_run) + + # log start time + start_time = time.perf_counter() + + # initialize parallel environments + self.logger.debug("Initializing training env...") + env = [] + + env_class = LOCMBattleSingleEnv + + for i in range(num_envs): + # no overlap between episodes at each concurrent env + if seed is not None: + current_seed = seed + (train_episodes // num_envs) * i + else: + current_seed = None + + # create the env + env.append(lambda: env_class(seed=current_seed, play_first=play_first, **env_params)) + + # wrap envs in a vectorized env + self.env: VecEnv3 = DummyVecEnv3(env) + + # initialize evaluator + self.logger.debug("Initializing evaluator...") + eval_seed = seed + train_episodes if seed is not None else None + self.evaluator: Evaluator = Evaluator(task, eval_env_params, eval_episodes, + eval_seed, num_envs) + + # build the model + self.logger.debug("Building the model...") + self.model = model_builder(self.env, seed, **model_params) + + # create necessary folders + os.makedirs(self.path, exist_ok=True) + + # set tensorflow log dir + # todo: check later if this was meant to be 'tensorboard_log' instead + self.model.tensorflow_log = self.path + + # save parameters + self.train_episodes = train_episodes + self.num_evals = num_evals + self.eval_frequency = train_episodes / num_evals + + # initialize control attributes + self.model.last_eval = None + 
self.model.next_eval = 0 + self.model.role_id = 0 if play_first else 1 + + # log end time + end_time = time.perf_counter() + + self.logger.debug("Finished initializing training session " + f"({round(end_time - start_time, ndigits=3)}s).") + + def _training_callback(self, _locals=None, _globals=None): + episodes_so_far = sum(self.env.get_attr('episodes')) + + # if it is time to evaluate, do so + if episodes_so_far >= self.model.next_eval: + # save model + model_path = self.path + f'/{episodes_so_far}' + self.model.save(model_path) + save_model_as_json(self.model, self.params['activation'], model_path) + self.logger.debug(f"Saved model at {model_path}.zip/json.") + + # evaluate the model + self.logger.info(f"Evaluating model ({episodes_so_far} episodes)...") + start_time = time.perf_counter() + + agent_class = RLBattleAgent + + agent = agent_class(self.model) + + win_rate, mean_reward, ep_length, battle_length, act_hist = \ + self.evaluator.run(agent, play_first=self.model.role_id == 0) + + end_time = time.perf_counter() + self.logger.info(f"Finished evaluating " + f"({round(end_time - start_time, 3)}s). " + f"Avg. reward: {mean_reward}") + + # save the results + self.checkpoints.append(episodes_so_far) + self.win_rates.append(win_rate) + self.episode_lengths.append(ep_length) + self.battle_lengths.append(battle_length) + self.action_histograms.append(act_hist) + + # update control attributes + self.model.last_eval = episodes_so_far + self.model.next_eval += self.eval_frequency + + # write partial results to file + self._save_results() + + # upload stats to wandb, if enabled + if self.wandb_run: + info = dict(checkpoint=episodes_so_far, mean_reward=mean_reward, + win_rate=win_rate, mean_ep_length=ep_length, + mean_battle_length=battle_length) + + info['pass_actions'] = act_hist[0] + info['summon_actions'] = sum(act_hist[1:17]) + + if self.env.get_attr('items', indices=[0])[0]: + info['use_actions'] = sum(act_hist[17:121]) + info['attack_actions'] = sum(act_hist[121:]) + else: + info['attack_actions'] = sum(act_hist[17:]) + + self.wandb_run.log(info) + + # if training should end, return False to end training + training_is_finished = episodes_so_far >= self.train_episodes + + if training_is_finished: + self.logger.debug(f"Training ended at {episodes_so_far} episodes") + + return not training_is_finished + + def _train(self): + # save and evaluate starting model + self._training_callback() + + callbacks = [TrainingCallback(self._training_callback)] + + if self.wandb_run: + callbacks.append(WandbCallback(gradient_save_freq=0, verbose=0)) + + try: + # train the model + # note: dynamic learning or clip rates will require accurate # of timesteps + self.model.learn(total_timesteps=REALLY_BIG_INT, # we'll stop manually + callback=CallbackList(callbacks)) + except KeyboardInterrupt: + pass + + # save and evaluate final model, if not done yet + if len(self.win_rates) < self.num_evals: + self._training_callback() + + # close the envs + for e in (self.env, self.evaluator): + e.close() + + +class SelfPlay(TrainingSession): + def __init__(self, task, model_builder, model_params, env_params, + eval_env_params, train_episodes, eval_episodes, num_evals, + switch_frequency, path, seed, num_envs=1, wandb_run=None): + super(SelfPlay, self).__init__( + task, model_params, path, seed, wandb_run=wandb_run) + + # log start time + start_time = time.perf_counter() + + # initialize parallel training environments + self.logger.debug("Initializing training envs...") + env = [] + + env_class = LOCMBattleSelfPlayEnv + + 
for i in range(num_envs): + # no overlap between episodes at each process + if seed is not None: + current_seed = seed + (train_episodes // num_envs) * i + else: + current_seed = None + + # create one env per process + env.append(lambda: env_class(seed=current_seed, play_first=True, **env_params)) + + # wrap envs in a vectorized env + self.env: VecEnv3 = DummyVecEnv3(env) + + # initialize parallel evaluating environments + self.logger.debug("Initializing evaluation envs...") + eval_seed = seed + train_episodes if seed is not None else None + self.evaluator: Evaluator = Evaluator(task, eval_env_params, eval_episodes // 2, + eval_seed, num_envs) + + # build the models + self.logger.debug("Building the models...") + self.model = model_builder(self.env, seed, **model_params) + self.model.adversary = model_builder(self.env, seed, **model_params) + + # initialize parameters of adversary models accordingly + self.model.adversary.set_parameters(self.model.get_parameters(), exact_match=True) + + # set adversary models as adversary policies of the self-play envs + def make_adversary_policy(model, env): + def adversary_policy(obs): + actions, _ = model.adversary.predict( + obs, deterministic=True, action_masks=env.env_method('action_masks')[0]) + + return actions + + return adversary_policy + + self.env.set_attr('adversary_policy', make_adversary_policy(self.model, self.env)) + + # create necessary folders + os.makedirs(self.path, exist_ok=True) + + # set tensorflow log dirs + self.model.tensorflow_log = self.path + + # save parameters + self.task = task + self.train_episodes = train_episodes + self.eval_episodes = eval_episodes + self.num_evals = num_evals + self.switch_frequency = switch_frequency + self.eval_frequency = train_episodes / num_evals + self.num_switches = math.ceil(train_episodes / switch_frequency) + + # initialize control attributes + self.model.last_eval, self.model.next_eval = None, 0 + self.model.last_switch, self.model.next_switch = None, self.switch_frequency + + # initialize results + self.checkpoints = [] + self.win_rates = [] + self.episode_lengths = [] + self.action_histograms = [] + + # log end time + end_time = time.perf_counter() + + self.logger.debug("Finished initializing training session " + f"({round(end_time - start_time, ndigits=3)}s).") + + def _training_callback(self, _locals=None, _globals=None): + model = self.model + episodes_so_far = sum(self.env.get_attr('episodes')) + + # note: wtf was this code about, ronaldo??? 
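# Editor's note: the sketch below is not part of this patch. It restates, under assumed
# constructor defaults and a single (non-vectorized) env, the self-play pattern the
# SelfPlay session sets up above: the learner trains against a frozen copy of itself
# whose parameters are re-synced at every switch. Hyperparameters, the switch count and
# the timestep budget are illustrative assumptions only.
from sb3_contrib import MaskablePPO
from gym_locm.envs.battle import LOCMBattleSelfPlayEnv

env = LOCMBattleSelfPlayEnv(seed=0, play_first=True)
learner = MaskablePPO("MlpPolicy", env, gamma=1, verbose=0)
adversary = MaskablePPO("MlpPolicy", env, gamma=1, verbose=0)
adversary.set_parameters(learner.get_parameters(), exact_match=True)

def adversary_policy(obs):
    # the frozen copy answers for the opponent, respecting the env's current action mask
    actions, _ = adversary.predict(obs, deterministic=True,
                                   action_masks=env.action_masks())
    return actions

env.adversary_policy = adversary_policy

for _ in range(3):  # a few switch cycles
    learner.learn(total_timesteps=10_000)                                 # improve vs. the frozen copy
    adversary.set_parameters(learner.get_parameters(), exact_match=True)  # sync the copy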
+ # turns = model.env.get_attr('turn') + # playing_first = model.env.get_attr('play_first') + # + # for i in range(model.env.num_envs): + # if turns[i] in range(0, model.env.num_envs): + # model.env.set_attr('play_first', not playing_first[i], indices=[i]) + + # if it is time to evaluate, do so + if episodes_so_far >= model.next_eval: + # save model + model_path = self.path + f'/{episodes_so_far}' + + model.save(model_path, exclude=['adversary']) + save_model_as_json(model, self.params['activation'], model_path) + self.logger.debug(f"Saved model at {model_path}.zip/json.") + + # evaluate the model + self.logger.info(f"Evaluating model ({episodes_so_far} episodes)...") + start_time = time.perf_counter() + + agent_class = RLBattleAgent + + if self.evaluator.seed is not None: + self.evaluator.seed = self.seed + self.train_episodes + + win_rate, mean_reward, ep_length, battle_length, act_hist = \ + self.evaluator.run(agent_class(model), play_first=True) + + if self.evaluator.seed is not None: + self.evaluator.seed += self.eval_episodes + + win_rate2, mean_reward2, ep_length2, battle_length2, act_hist2 = \ + self.evaluator.run(agent_class(model), play_first=False) + + mean_reward = (mean_reward + mean_reward2) / 2 + win_rate = (win_rate + win_rate2) / 2 + ep_length = (ep_length + ep_length2) / 2 + battle_length = (battle_length + battle_length2) / 2 + act_hist = [(act_hist[i] + act_hist2[i]) / 2 for i in range(model.env.get_attr('action_space', indices=[0])[0].n)] + + end_time = time.perf_counter() + self.logger.info(f"Finished evaluating " + f"({round(end_time - start_time, 3)}s). " + f"Avg. reward: {mean_reward}") + + # save the results + self.checkpoints.append(episodes_so_far) + self.win_rates.append(win_rate) + self.episode_lengths.append(ep_length) + self.battle_lengths.append(battle_length) + self.action_histograms.append(act_hist) + + # update control attributes + model.last_eval = episodes_so_far + model.next_eval += self.eval_frequency + + # write partial results to file + self._save_results() + + # upload stats to wandb, if enabled + if self.wandb_run: + info = dict(checkpoint=episodes_so_far, mean_reward=mean_reward, + win_rate=win_rate, mean_ep_length=ep_length, + mean_battle_length=battle_length) + + info['pass_actions'] = act_hist[0] + info['summon_actions'] = sum(act_hist[1:17]) + + if self.env.get_attr('items', indices=[0])[0]: + info['use_actions'] = sum(act_hist[17:121]) + info['attack_actions'] = sum(act_hist[121:]) + else: + info['attack_actions'] = sum(act_hist[17:]) + + self.wandb_run.log(info) + + # if it is time to update the adversary model, do so + if episodes_so_far >= model.next_switch: + model.last_switch = episodes_so_far + model.next_switch += self.switch_frequency + + # log training win rate at the time of the switch + train_mean_reward = np.mean([np.mean(rewards) for rewards in model.env.env_method('get_episode_rewards')]) + self.wandb_run.log({'train_mean_reward': train_mean_reward}) + + self.logger.debug(f"Model trained for " + f"{sum(model.env.get_attr('episodes'))} episodes. 
" + f"Train reward: {train_mean_reward}") + + # reset training env rewards + for i in range(model.env.num_envs): + model.env.set_attr('rewards', [0.0], indices=[i]) + + # update parameters of adversary models + model.adversary.set_parameters(model.get_parameters(), exact_match=True) + + self.logger.debug("Parameters of adversary network updated.") + + # if training should end, return False to end training + training_is_finished = episodes_so_far >= self.train_episodes + + return not training_is_finished + + def _train(self): + # save and evaluate starting models + self._training_callback({'self': self.model}) + + callbacks = [TrainingCallback(self._training_callback)] + + if self.wandb_run: + callbacks.append(WandbCallback(gradient_save_freq=0, verbose=0)) + + try: + self.logger.debug(f"Training will switch models every " + f"{self.switch_frequency} episodes") + + # train the model + self.model.learn(total_timesteps=REALLY_BIG_INT, + reset_num_timesteps=False, + callback=CallbackList(callbacks)) + + except KeyboardInterrupt: + pass + + self.logger.debug(f"Training ended at {sum(self.env.get_attr('episodes'))} " + f"episodes") + + # save and evaluate final models, if not done yet + if len(self.win_rates) < self.num_evals: + self._training_callback({'self': self.model}) + + if len(self.win_rates) < self.num_evals: + self._training_callback({'self': self.model}) + + # close the envs + for e in (self.env, self.evaluator): + e.close() + + +class AsymmetricSelfPlay(TrainingSession): + def __init__(self, task, model_builder, model_params, env_params, + eval_env_params, train_episodes, eval_episodes, num_evals, + switch_frequency, path, seed, num_envs=1, wandb_run=None): + super(AsymmetricSelfPlay, self).__init__( + task, model_params, path, seed, wandb_run=wandb_run) + + # log start time + start_time = time.perf_counter() + + # initialize parallel training environments + self.logger.debug("Initializing training envs...") + env1, env2 = [], [] + + env_class = LOCMBattleSelfPlayEnv + + for i in range(num_envs): + # no overlap between episodes at each process + if seed is not None: + current_seed = seed + (train_episodes // num_envs) * i + else: + current_seed = None + + # create one env per process + env1.append(lambda: env_class(seed=current_seed, play_first=True, **env_params)) + env2.append(lambda: env_class(seed=current_seed, play_first=False, **env_params)) + + # wrap envs in a vectorized env + self.env1: VecEnv3 = DummyVecEnv3(env1) + self.env2: VecEnv3 = DummyVecEnv3(env2) + + # initialize parallel evaluating environments + self.logger.debug("Initializing evaluation envs...") + eval_seed = seed + train_episodes if seed is not None else None + self.evaluator: Evaluator = Evaluator(task, eval_env_params, eval_episodes, + eval_seed, num_envs) + + # build the models + self.logger.debug("Building the models...") + self.model1 = model_builder(self.env1, seed, **model_params) + self.model1.adversary = model_builder(self.env2, seed, **model_params) + self.model2 = model_builder(self.env2, seed, **model_params) + self.model2.adversary = model_builder(self.env1, seed, **model_params) + + # initialize parameters of adversary models accordingly + self.model1.adversary.set_parameters(self.model2.get_parameters(), exact_match=True) + self.model2.adversary.set_parameters(self.model1.get_parameters(), exact_match=True) + + # set adversary models as adversary policies of the self-play envs + def make_adversary_policy(model, env): + def adversary_policy(obs): + actions, _ = model.adversary.predict( + obs, 
deterministic=True, action_masks=env.env_method('action_masks')[0]) + + return actions + + return adversary_policy + + self.env1.set_attr('adversary_policy', make_adversary_policy(self.model1, self.env1)) + self.env2.set_attr('adversary_policy', make_adversary_policy(self.model2, self.env2)) + + # create necessary folders + os.makedirs(self.path + '/role0', exist_ok=True) + os.makedirs(self.path + '/role1', exist_ok=True) + + # set tensorflow log dirs + self.model1.tensorflow_log = self.path + '/role0' + self.model2.tensorflow_log = self.path + '/role1' + + # save parameters + self.train_episodes = train_episodes + self.eval_episodes = eval_episodes + self.num_evals = num_evals + self.switch_frequency = switch_frequency + self.eval_frequency = train_episodes / num_evals + self.num_switches = math.ceil(train_episodes / switch_frequency) + + # initialize control attributes + self.model1.role_id, self.model2.role_id = 0, 1 + self.model1.last_eval, self.model1.next_eval = None, 0 + self.model2.last_eval, self.model2.next_eval = None, 0 + self.model1.last_switch, self.model1.next_switch = 0, self.switch_frequency + self.model2.last_switch, self.model2.next_switch = 0, self.switch_frequency + + # initialize results + self.checkpoints = [], [] + self.win_rates = [], [] + self.episode_lengths = [], [] + self.action_histograms = [], [] + + # log end time + end_time = time.perf_counter() + + self.logger.debug("Finished initializing training session " + f"({round(end_time - start_time, ndigits=3)}s).") + + def _training_callback(self, _locals=None, _globals=None): + model = _locals['self'] + episodes_so_far = sum(model.env.get_attr('episodes')) + + # if it is time to evaluate, do so + if episodes_so_far >= model.next_eval: + # save model + model_path = f'{self.path}/role{model.role_id}/{episodes_so_far}' + + model.save(model_path, exclude=['adversary']) + save_model_as_json(model, self.params['activation'], model_path) + self.logger.debug(f"Saved model at {model_path}.zip/json.") + + # evaluate the model + self.logger.info(f"Evaluating model {model.role_id} " + f"({episodes_so_far} episodes)...") + start_time = time.perf_counter() + + agent_class = RLBattleAgent + + win_rate, mean_reward, ep_length, battle_length, act_hist = \ + self.evaluator.run(agent_class(model), play_first=model.role_id == 0) + + end_time = time.perf_counter() + self.logger.info(f"Finished evaluating " + f"({round(end_time - start_time, 3)}s). " + f"Avg. 
reward: {mean_reward}") + + # save the results + self.checkpoints[model.role_id].append(episodes_so_far) + self.win_rates[model.role_id].append(win_rate) + self.episode_lengths[model.role_id].append(ep_length) + self.battle_lengths[model.role_id].append(battle_length) + self.action_histograms[model.role_id].append(act_hist) + + # update control attributes + model.last_eval = episodes_so_far + model.next_eval += self.eval_frequency + + # write partial results to file + self._save_results() + + # upload stats to wandb, if enabled + if self.wandb_run: + info = {'checkpoint_' + model.role_id: episodes_so_far, + 'mean_reward_' + model.role_id: mean_reward, + 'win_rate_' + model.role_id: win_rate, + 'mean_ep_length_' + model.role_id: ep_length, + 'mean_battle_length_' + model.role_id: battle_length, + 'pass_actions_' + model.role_id: act_hist[0], + 'summon_actions_' + model.role_id: sum(act_hist[1:17])} + + if model.env.get_attr('items', indices=[0])[0]: + info['use_actions'] = sum(act_hist[17:121]) + info['attack_actions'] = sum(act_hist[121:]) + else: + info['attack_actions'] = sum(act_hist[17:]) + + self.wandb_run.log(info) + + # if training should end, return False to end training + training_is_finished = episodes_so_far >= model.next_switch or episodes_so_far >= self.train_episodes + + if training_is_finished: + model.last_switch = episodes_so_far + model.next_switch += self.switch_frequency + + return not training_is_finished + + def _train(self): + # save and evaluate starting models + self._training_callback({'self': self.model1}) + self._training_callback({'self': self.model2}) + + try: + self.logger.debug(f"Training will switch models every " + f"{self.switch_frequency} episodes") + + callbacks1 = [TrainingCallback(lambda: self._training_callback({'self': self.model1}))] + callbacks2 = [TrainingCallback(lambda: self._training_callback({'self': self.model2}))] + + if self.wandb_run: + callbacks1.append(WandbCallback(gradient_save_freq=0, verbose=0)) + callbacks2.append(WandbCallback(gradient_save_freq=0, verbose=0)) + + for _ in range(self.num_switches): + # train the first player model + self.model1.learn(total_timesteps=REALLY_BIG_INT, + reset_num_timesteps=False, + callback=CallbackList(callbacks1)) + + # log training win rate at the time of the switch + train_mean_reward1 = np.mean([np.mean(rewards) for rewards in self.env1.env_method('get_episode_rewards')]) + self.wandb_run.log({'train_mean_reward_0': train_mean_reward1}) + + # reset training env rewards + for i in range(self.env1.num_envs): + self.env1.set_attr('rewards', [0.0], indices=[i]) + + self.logger.debug(f"Model {self.model1.role_id} trained for " + f"{sum(self.env1.get_attr('episodes'))} episodes. " + f"Train reward: {train_mean_reward1}. " + f"Switching to model {self.model2.role_id}.") + + # train the second player model + self.model2.learn(total_timesteps=REALLY_BIG_INT, + reset_num_timesteps=False, + callback=CallbackList(callbacks2)) + + # log training win rate at the time of the switch + train_mean_reward2 = np.mean([np.mean(rewards) for rewards in self.env2.env_method('get_episode_rewards')]) + self.wandb_run.log({'train_mean_reward_1': train_mean_reward2}) + + # reset training env rewards + for i in range(self.env2.num_envs): + self.env2.set_attr('rewards', [0.0], indices=[i]) + + self.logger.debug(f"Model {self.model2.role_id} trained for " + f"{sum(self.env2.get_attr('episodes'))} episodes. " + f"Train reward: {train_mean_reward2}. 
" + f"Switching to model {self.model1.role_id}.") + + # update parameters of adversary models + self.model1.adversary.set_parameters(self.model2.get_parameters(), exact_match=True) + self.model2.adversary.set_parameters(self.model1.get_parameters(), exact_match=True) + + self.logger.debug("Parameters of adversary networks updated.") + except KeyboardInterrupt: + pass + + self.logger.debug(f"Training ended at {sum(self.env1.get_attr('episodes'))} " + f"episodes") + + # save and evaluate final models, if not done yet + if len(self.win_rates[0]) < self.num_evals: + self._training_callback({'self': self.model1}) + + if len(self.win_rates[1]) < self.num_evals: + self._training_callback({'self': self.model1}) + + # close the envs + for e in (self.env1, self.env2, self.evaluator): + e.close() + + +class Evaluator: + def __init__(self, task, env_params, episodes, seed, num_envs): + # log start time + start_time = time.perf_counter() + + # initialize logger + self.logger = logging.getLogger('{0}.{1}'.format(__name__, type(self).__name__)) + + # initialize parallel environments + self.logger.debug("Initializing envs...") + + env_class = LOCMBattleSingleEnv + + self.env = [lambda: env_class(**env_params) for _ in range(num_envs)] + + self.env: VecEnv3 = DummyVecEnv3(self.env) + + # save parameters + self.episodes = episodes + self.seed = seed + + # log end time + end_time = time.perf_counter() + + self.logger.debug("Finished initializing evaluator " + f"({round(end_time - start_time, ndigits=3)}s).") + + def run(self, agent: Agent, play_first=True): + """ + Evaluates an agent. + :param agent: (gym_locm.agents.Agent) Agent to be evaluated. + :param play_first: Whether the agent will be playing first. + :return: A tuple containing the `win_rate`, the `mean_reward`, + the `mean_length` and the `action_histogram` of the evaluation episodes. 
+ """ + # set appropriate seeds + if self.seed is not None: + for i in range(self.env.num_envs): + current_seed = self.seed + current_seed += (self.episodes // self.env.num_envs) * i + current_seed -= 1 # resetting the env increases the seed by one + + self.env.env_method('seed', current_seed, indices=[i]) + + # set agent role + self.env.set_attr('play_first', play_first) + + player = 0 if play_first else 1 + + # reset the env + observations = self.env.reset() + + # initialize metrics + episodes_so_far = 0 + episode_wins = [[] for _ in range(self.env.num_envs)] + episode_rewards = [[0.0] for _ in range(self.env.num_envs)] + episode_lengths = [[0] for _ in range(self.env.num_envs)] + episode_turns = [[] for _ in range(self.env.num_envs)] + action_histogram = [0] * self.env.action_space.n + + # run the episodes + while True: + # get the agent's action for all parallel envs + # todo: do this in a more elegant way + if isinstance(agent, RLDraftAgent): + actions = agent.act(observations) + elif isinstance(agent, RLBattleAgent): + action_masks = self.env.env_method('action_masks') + actions = agent.act(observations, action_masks) + else: + observations = self.env.get_attr('state') + actions = [agent.act(observation) for observation in observations] + + # update the action histogram + for action in actions: + action_histogram[action] += 1 + + # perform the action and get the outcome + observations, rewards, dones, infos = self.env.step(actions) + + # update metrics + for i in range(self.env.num_envs): + episode_rewards[i][-1] += rewards[i] + episode_lengths[i][-1] += 1 + + if dones[i]: + episode_wins[i].append(1 if infos[i]['winner'] == player else 0) + episode_rewards[i].append(0.0) + episode_lengths[i].append(0) + episode_turns[i].append(infos[i]['turn']) + + episodes_so_far += 1 + + # check exiting condition + if episodes_so_far >= self.episodes: + break + + # join all parallel metrics + all_wins = [win for wins in episode_wins + for win in wins[:-1]] + all_rewards = [reward for rewards in episode_rewards + for reward in rewards[:-1]] + all_lengths = [length for lengths in episode_lengths + for length in lengths[:-1]] + all_turns = [turn for turns in episode_turns for turn in turns] + + # todo: fix -- sometimes we miss self.episodes by one + # assert len(all_rewards) == self.episodes + # assert len(all_lengths) == self.episodes + # assert len(all_turns) == self.episodes + + # transform the action histogram in a probability distribution + action_histogram = [action_freq / sum(action_histogram) + for action_freq in action_histogram] + + # cap any unsolicited additional episodes + all_wins = all_wins[:self.episodes] + all_rewards = all_rewards[:self.episodes] + all_lengths = all_lengths[:self.episodes] + all_turns = all_turns[:self.episodes] + + return mean(all_wins), mean(all_rewards), mean(all_lengths), mean(all_turns), action_histogram + + def close(self): + self.env.close() + + +class TrainingCallback(BaseCallback): + def __init__(self, callback_func, verbose=0): + super(TrainingCallback, self).__init__(verbose) + + self.callback_func = callback_func + + def _on_step(self): + return self.callback_func() + + +def save_model_as_json(model, act_fun, path): + pass # todo: reimplement this supporting stable-baselines 2 and 3 + + +def model_builder_mlp_masked(env, seed, neurons, layers, activation, n_steps, + nminibatches, noptepochs, cliprange, vf_coef, ent_coef, + learning_rate, tensorboard_log=None): + net_arch = [neurons] * layers + activation = dict(tanh=th.nn.Tanh, relu=th.nn.ReLU, 
elu=th.nn.ELU)[activation] + + return MaskablePPO("MlpPolicy", env, learning_rate=learning_rate, n_steps=n_steps, + batch_size=nminibatches, n_epochs=noptepochs, gamma=1, + clip_range=cliprange, ent_coef=ent_coef, vf_coef=vf_coef, + verbose=0, seed=seed, + policy_kwargs=dict(net_arch=net_arch, activation_fn=activation), + tensorboard_log=tensorboard_log) diff --git a/gym_locm/toolbox/trainer.py b/gym_locm/toolbox/trainer_draft.py similarity index 100% rename from gym_locm/toolbox/trainer.py rename to gym_locm/toolbox/trainer_draft.py From 50ccd5bd2a4e78b8ab9188b621479ac95a8b3052 Mon Sep 17 00:00:00 2001 From: Ronaldo Vieira Date: Thu, 28 Apr 2022 12:58:25 -0300 Subject: [PATCH 14/27] Support W&B entity and project names as parameter on training script --- gym_locm/experiments/training.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/gym_locm/experiments/training.py b/gym_locm/experiments/training.py index ef48f43..1f5bfcb 100644 --- a/gym_locm/experiments/training.py +++ b/gym_locm/experiments/training.py @@ -67,6 +67,11 @@ def get_arg_parser(): p.add_argument("--concurrency", type=int, default=1, help="amount of environments to use") + p.add_argument("--wandb-entity", type=str, default="j-ufmg", + help="entity name on W&B") + p.add_argument("--wandb-project", type=str, default="gym-locm", + help="project name on W&B") + return p @@ -144,8 +149,8 @@ def run(): 'tensorboard_log': args.path + '/tf_logs'} run = wandb.init( - project='gym-locm', - entity='j-ufmg', + project=args.wandb_project, + entity=args.wandb_entity, sync_tensorboard=True, config=vars(args) ) From cb262babbdff7abfb6673a48d896f197474f62ae Mon Sep 17 00:00:00 2001 From: Ronaldo Vieira Date: Thu, 28 Apr 2022 13:02:36 -0300 Subject: [PATCH 15/27] Add determinism as a parameter on RL battle agents Resolves #5 --- gym_locm/agents.py | 6 ++++-- gym_locm/toolbox/trainer_battle.py | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/gym_locm/agents.py b/gym_locm/agents.py index 2c52416..62dd8fa 100644 --- a/gym_locm/agents.py +++ b/gym_locm/agents.py @@ -1129,8 +1129,9 @@ def act(self, state): class RLBattleAgent(Agent): - def __init__(self, model): + def __init__(self, model, deterministic=False): self.model = model + self.deterministic = deterministic self.hidden_states = None self.dones = None @@ -1145,7 +1146,8 @@ def reset(self): def act(self, state, action_masks): action, self.hidden_states = \ self.model.predict(state, state=self.hidden_states, - deterministic=True, action_masks=action_masks) + deterministic=self.deterministic, + action_masks=action_masks) return action diff --git a/gym_locm/toolbox/trainer_battle.py b/gym_locm/toolbox/trainer_battle.py index 64fd43d..4c921dc 100644 --- a/gym_locm/toolbox/trainer_battle.py +++ b/gym_locm/toolbox/trainer_battle.py @@ -161,7 +161,7 @@ def _training_callback(self, _locals=None, _globals=None): agent_class = RLBattleAgent - agent = agent_class(self.model) + agent = agent_class(self.model, deterministic=True) win_rate, mean_reward, ep_length, battle_length, act_hist = \ self.evaluator.run(agent, play_first=self.model.role_id == 0) @@ -353,7 +353,7 @@ def _training_callback(self, _locals=None, _globals=None): self.evaluator.seed = self.seed + self.train_episodes win_rate, mean_reward, ep_length, battle_length, act_hist = \ - self.evaluator.run(agent_class(model), play_first=True) + self.evaluator.run(agent_class(model, deterministic=True), play_first=True) if self.evaluator.seed is not None: self.evaluator.seed += self.eval_episodes 
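# Editor's note: not part of the diff -- an illustrative use of the `deterministic`
# flag added to RLBattleAgent in this patch. The checkpoint path is an assumption.
from sb3_contrib import MaskablePPO
from gym_locm.agents import RLBattleAgent

model = MaskablePPO.load("trained_models/battle/ppo_mask/final")
eval_agent = RLBattleAgent(model, deterministic=True)  # greedy (argmax) actions for evaluation
sampling_agent = RLBattleAgent(model)                  # default: sample from the policy distribution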
@@ -583,7 +583,7 @@ def _training_callback(self, _locals=None, _globals=None): agent_class = RLBattleAgent win_rate, mean_reward, ep_length, battle_length, act_hist = \ - self.evaluator.run(agent_class(model), play_first=model.role_id == 0) + self.evaluator.run(agent_class(model, deterministic=True), play_first=model.role_id == 0) end_time = time.perf_counter() self.logger.info(f"Finished evaluating " From f42d54223e02eff42ed0db235fcfae3d4d6e8627 Mon Sep 17 00:00:00 2001 From: Ronaldo Vieira Date: Tue, 3 May 2022 09:50:12 -0300 Subject: [PATCH 16/27] Fix self-play battle evaluation --- gym_locm/toolbox/trainer_battle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gym_locm/toolbox/trainer_battle.py b/gym_locm/toolbox/trainer_battle.py index 4c921dc..4f39693 100644 --- a/gym_locm/toolbox/trainer_battle.py +++ b/gym_locm/toolbox/trainer_battle.py @@ -359,7 +359,7 @@ def _training_callback(self, _locals=None, _globals=None): self.evaluator.seed += self.eval_episodes win_rate2, mean_reward2, ep_length2, battle_length2, act_hist2 = \ - self.evaluator.run(agent_class(model), play_first=False) + self.evaluator.run(agent_class(model, deterministic=True), play_first=False) mean_reward = (mean_reward + mean_reward2) / 2 win_rate = (win_rate + win_rate2) / 2 From 087e33d1757309cc929623c402ff4de247128be4 Mon Sep 17 00:00:00 2001 From: Ronaldo Vieira Date: Tue, 3 May 2022 09:50:25 -0300 Subject: [PATCH 17/27] Fix win rate stat on evaluator --- gym_locm/toolbox/trainer_battle.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gym_locm/toolbox/trainer_battle.py b/gym_locm/toolbox/trainer_battle.py index 4f39693..a6af8cf 100644 --- a/gym_locm/toolbox/trainer_battle.py +++ b/gym_locm/toolbox/trainer_battle.py @@ -805,13 +805,12 @@ def run(self, agent: Agent, play_first=True): break # join all parallel metrics - all_wins = [win for wins in episode_wins - for win in wins[:-1]] all_rewards = [reward for rewards in episode_rewards for reward in rewards[:-1]] all_lengths = [length for lengths in episode_lengths for length in lengths[:-1]] all_turns = [turn for turns in episode_turns for turn in turns] + all_wins = [win for wins in episode_wins for win in wins] # todo: fix -- sometimes we miss self.episodes by one # assert len(all_rewards) == self.episodes From ef68d5b6d51ea2b4975c453bbe6fe3eba60b177c Mon Sep 17 00:00:00 2001 From: Ronaldo Vieira Date: Tue, 3 May 2022 10:09:18 -0300 Subject: [PATCH 18/27] Fix reward calculation --- gym_locm/envs/battle.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/gym_locm/envs/battle.py b/gym_locm/envs/battle.py index 18db1ab..31d558a 100644 --- a/gym_locm/envs/battle.py +++ b/gym_locm/envs/battle.py @@ -78,9 +78,8 @@ def step(self, action): # less property accesses state = self.state - current_player_id = state.current_player.id - reward_before = [weight * function.calculate(state, for_player=current_player_id) + reward_before = [weight * function.calculate(state, for_player=PlayerOrder.FIRST) for function, weight in zip(self.reward_functions, self.reward_weights)] # execute the action @@ -89,7 +88,7 @@ def step(self, action): else: state.was_last_action_invalid = True - reward_after = [weight * function.calculate(state, for_player=current_player_id) + reward_after = [weight * function.calculate(state, for_player=PlayerOrder.FIRST) for function, weight in zip(self.reward_functions, self.reward_weights)] # build return info From ebbb782e43bf4eb04bcf1ea785dac669d54a5e44 Mon Sep 17 00:00:00 2001 From: Ronaldo 
Vieira
Date: Tue, 3 May 2022 11:51:23 -0300
Subject: [PATCH 19/27] Add reward parameters to training script

---
 gym_locm/experiments/training.py | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/gym_locm/experiments/training.py b/gym_locm/experiments/training.py
index 1f5bfcb..47cbfc2 100644
--- a/gym_locm/experiments/training.py
+++ b/gym_locm/experiments/training.py
@@ -6,6 +6,7 @@
 import wandb
 
 from gym_locm import agents
+from gym_locm.envs import rewards
 
 _counter = 0
@@ -26,6 +27,10 @@ def get_arg_parser():
                    default="max-attack")
     p.add_argument("--battle-agent", "-b", choices=battle_agents,
                    default="max-attack")
+    p.add_argument("--reward-functions", "-rf", nargs="+", choices=list(rewards.available_rewards.keys()),
+                   default=("win-loss",), help="reward functions to use")
+    p.add_argument("--reward-weights", "-rw", nargs="+", type=float,
+                   default=None, help="weights of the reward functions")
     p.add_argument("--path", "-p", help="path to save models and results",
                    required=True)
@@ -87,6 +92,12 @@ def run():
     os.makedirs(args.path, exist_ok=True)
 
+    if args.reward_weights is None:
+        args.reward_weights = tuple([1.0 for _ in range(len(args.reward_functions))])
+
+    assert len(args.reward_weights) == len(args.reward_functions), \
+        f"The amount of reward weights should be the same as those of reward functions"
+
     if args.task == 'draft':
         from gym_locm.toolbox.trainer_draft import AsymmetricSelfPlay, SelfPlay, FixedAdversary, \
@@ -104,13 +115,17 @@ def run():
 
         env_params = {
             'battle_agents': (battle_agent(), battle_agent()),
-            'use_draft_history': args.approach == 'history'
+            'use_draft_history': args.approach == 'history',
+            'reward_functions': args.reward_functions,
+            'reward_weights': args.reward_weights
         }
 
         eval_env_params = {
             'draft_agent': agents.MaxAttackDraftAgent(),
             'battle_agents': (battle_agent(), battle_agent()),
-            'use_draft_history': args.approach == 'history'
+            'use_draft_history': args.approach == 'history',
+            'reward_functions': args.reward_functions,
+            'reward_weights': args.reward_weights
         }
 
     elif args.task == 'battle':
@@ -122,15 +137,19 @@ def run():
         battle_agent = agents.parse_battle_agent(args.battle_agent)
 
         env_params = {
-            'draft_agents': (draft_agent(), draft_agent())
+            'draft_agents': (draft_agent(), draft_agent()),
+            'reward_functions': args.reward_functions,
+            'reward_weights': args.reward_weights
         }
 
         if args.adversary == 'fixed':
-            env_params['battle_agent'] = battle_agent()
+            env_params['battle_agent'] = battle_agent(),
 
             eval_env_params = {
                 'draft_agents': (draft_agent(), draft_agent()),
-                'battle_agent': battle_agent()
+                'battle_agent': battle_agent(),
+                'reward_functions': args.reward_functions,
+                'reward_weights': args.reward_weights
             }
 
         else:

From 3ce01f8624dd26bfd105fe1ef7f7c229cd24d8fd Mon Sep 17 00:00:00 2001
From: Ronaldo Vieira
Date: Tue, 3 May 2022 12:05:38 -0300
Subject: [PATCH 20/27] Add the gamma parameter to training script

---
 gym_locm/experiments/training.py   | 3 ++-
 gym_locm/toolbox/trainer_battle.py | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/gym_locm/experiments/training.py b/gym_locm/experiments/training.py
index 47cbfc2..73ca97f 100644
--- a/gym_locm/experiments/training.py
+++ b/gym_locm/experiments/training.py
@@ -41,6 +41,7 @@ def get_arg_parser():
     p.add_argument("--num-evals", "-ne", type=int, default=12,
                    help="how many evaluations to perform throughout training")
+    p.add_argument("--gamma", type=float, default=1.0, help="gamma (discount factor)")
     p.add_argument("--switch-freq", type=int, default=1000,
                    help="how many episodes to run before updating opponent networks")
     p.add_argument("--layers", type=int, default=1,
@@ -165,7 +166,7 @@ def run():
                     'noptepochs': args.noptepochs, 'cliprange': args.cliprange,
                     'vf_coef': args.vf_coef, 'ent_coef': args.ent_coef,
                     'activation': args.act_fun, 'learning_rate': args.learning_rate,
-                    'tensorboard_log': args.path + '/tf_logs'}
+                    'tensorboard_log': args.path + '/tf_logs', 'gamma': args.gamma}
 
     run = wandb.init(
         project=args.wandb_project,
diff --git a/gym_locm/toolbox/trainer_battle.py b/gym_locm/toolbox/trainer_battle.py
index a6af8cf..de22688 100644
--- a/gym_locm/toolbox/trainer_battle.py
+++ b/gym_locm/toolbox/trainer_battle.py
@@ -849,12 +849,12 @@ def save_model_as_json(model, act_fun, path):
 
 def model_builder_mlp_masked(env, seed, neurons, layers, activation, n_steps,
                              nminibatches, noptepochs, cliprange, vf_coef, ent_coef,
-                             learning_rate, tensorboard_log=None):
+                             learning_rate, gamma, tensorboard_log=None):
     net_arch = [neurons] * layers
     activation = dict(tanh=th.nn.Tanh, relu=th.nn.ReLU, elu=th.nn.ELU)[activation]
 
     return MaskablePPO("MlpPolicy", env, learning_rate=learning_rate, n_steps=n_steps,
-                       batch_size=nminibatches, n_epochs=noptepochs, gamma=1,
+                       batch_size=nminibatches, n_epochs=noptepochs, gamma=gamma,
                        clip_range=cliprange, ent_coef=ent_coef, vf_coef=vf_coef,
                        verbose=0, seed=seed,
                        policy_kwargs=dict(net_arch=net_arch, activation_fn=activation),

From d8265d3f05b02ad0b26e3e777e12e8f423c7de2d Mon Sep 17 00:00:00 2001
From: Ronaldo Vieira
Date: Thu, 5 May 2022 12:48:18 -0300
Subject: [PATCH 21/27] Implement player board presence reward function

---
 gym_locm/envs/rewards.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/gym_locm/envs/rewards.py b/gym_locm/envs/rewards.py
index 24d4a97..b3dbe6d 100644
--- a/gym_locm/envs/rewards.py
+++ b/gym_locm/envs/rewards.py
@@ -29,10 +29,16 @@ def calculate(self, state: State, for_player: PlayerOrder = PlayerOrder.FIRST):
         return -max(0, state.players[for_player.opposing()].health) / 30
 
 
+class PlayerBoardPresenceRewardFunction(RewardFunction):
+    def calculate(self, state: State, for_player: PlayerOrder = PlayerOrder.FIRST):
+        return sum(creature.attack for lane in state.players[for_player].lanes for creature in lane)
+
+
 available_rewards = {
     "win-loss": WinLossRewardFunction,
     "player-health": PlayerHealthRewardFunction,
-    "opponent-health": OpponentHealthRewardFunction
+    "opponent-health": OpponentHealthRewardFunction,
+    "player-board-presence": PlayerBoardPresenceRewardFunction
 }

From 4118ca648f434385f50486975af5863449fabf89 Mon Sep 17 00:00:00 2001
From: Ronaldo Vieira
Date: Thu, 5 May 2022 12:48:36 -0300
Subject: [PATCH 22/27] Implement opponent board presence reward function

---
 gym_locm/envs/rewards.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/gym_locm/envs/rewards.py b/gym_locm/envs/rewards.py
index b3dbe6d..8c7b539 100644
--- a/gym_locm/envs/rewards.py
+++ b/gym_locm/envs/rewards.py
@@ -34,11 +34,17 @@ def calculate(self, state: State, for_player: PlayerOrder = PlayerOrder.FIRST):
         return sum(creature.attack for lane in state.players[for_player].lanes for creature in lane)
 
 
+class OpponentBoardPresenceRewardFunction(RewardFunction):
+    def calculate(self, state: State, for_player: PlayerOrder = PlayerOrder.FIRST):
+        return -sum(creature.attack for lane in state.players[for_player.opposing()].lanes for creature in lane)
+
+
 available_rewards = {
     "win-loss": WinLossRewardFunction,
     "player-health": PlayerHealthRewardFunction,
     "opponent-health": OpponentHealthRewardFunction,
-    "player-board-presence": PlayerBoardPresenceRewardFunction
+    "player-board-presence": PlayerBoardPresenceRewardFunction,
+    "opponent-board-presence": OpponentBoardPresenceRewardFunction
 }

From 405fe88e3751e9c2d44ba6b81fcde2db4a49c6c3 Mon Sep 17 00:00:00 2001
From: Ronaldo Vieira
Date: Thu, 5 May 2022 13:07:51 -0300
Subject: [PATCH 23/27] Make Coac battle agent's state eval method public

---
 gym_locm/agents.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gym_locm/agents.py b/gym_locm/agents.py
index 62dd8fa..94c42f4 100644
--- a/gym_locm/agents.py
+++ b/gym_locm/agents.py
@@ -286,7 +286,7 @@ def _eval_creature(creature):
         return score
 
     @staticmethod
-    def _eval_state(state):
+    def eval_state(state):
         score = 0
 
         player, enemy = state.current_player, state.opposing_player
@@ -353,7 +353,7 @@ def _brute_force_leaf(self, state, alpha):
 
         self.leaf += 1
 
-        return best_action, -self._eval_state(state)
+        return best_action, -self.eval_state(state)
 
     def _brute_force(self, state, depth, alpha):
         state = state.clone()
@@ -415,7 +415,7 @@ def _run_brute_force(self, state, depth, alpha):
             else:
                 return action, -100000
 
-        return action, self._eval_state(state)
+        return action, self.eval_state(state)
 
     def act(self, state, time_limit_ms=1000):
         self.leaf = 0

From 29dc1f0e8498956c1c8efb1fc46a85bc151747d7 Mon Sep 17 00:00:00 2001
From: Ronaldo Vieira
Date: Thu, 5 May 2022 13:08:26 -0300
Subject: [PATCH 24/27] Implement Coac state eval reward function

---
 gym_locm/envs/rewards.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/gym_locm/envs/rewards.py b/gym_locm/envs/rewards.py
index 8c7b539..96f5c2e 100644
--- a/gym_locm/envs/rewards.py
+++ b/gym_locm/envs/rewards.py
@@ -1,5 +1,6 @@
 from abc import ABC, abstractmethod
 
+from gym_locm.agents import CoacBattleAgent
 from gym_locm.engine import State, PlayerOrder
 
@@ -39,12 +40,20 @@ def calculate(self, state: State, for_player: PlayerOrder = PlayerOrder.FIRST):
         return -sum(creature.attack for lane in state.players[for_player.opposing()].lanes for creature in lane)
 
 
+class CoacRewardFunction(RewardFunction):
+    def calculate(self, state: State, for_player: PlayerOrder = PlayerOrder.FIRST):
+        signal = 1 if state.current_player.id == for_player else -1
+
+        return min(1, max(-1, signal * CoacBattleAgent.eval_state(state) / 2000))
+
+
 available_rewards = {
     "win-loss": WinLossRewardFunction,
     "player-health": PlayerHealthRewardFunction,
     "opponent-health": OpponentHealthRewardFunction,
     "player-board-presence": PlayerBoardPresenceRewardFunction,
-    "opponent-board-presence": OpponentBoardPresenceRewardFunction
+    "opponent-board-presence": OpponentBoardPresenceRewardFunction,
+    "coac": CoacRewardFunction
 }

From cddfb47438ee9487ab0b1beb8907b2ecae4a4265 Mon Sep 17 00:00:00 2001
From: Ronaldo Vieira
Date: Tue, 10 May 2022 17:54:39 -0300
Subject: [PATCH 25/27] Fix rewards after turn passing actions

---
 gym_locm/envs/base_env.py |  3 +++
 gym_locm/envs/battle.py   | 12 +++++++++---
 gym_locm/envs/draft.py    | 12 +++++++++---
 3 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/gym_locm/envs/base_env.py b/gym_locm/envs/base_env.py
index 652f3b9..47a1b17 100644
--- a/gym_locm/envs/base_env.py
+++ b/gym_locm/envs/base_env.py
@@ -25,6 +25,8 @@ def __init__(self, seed=None, items=True, k=3, n=30, reward_functions=('win-loss
         self.reward_functions = tuple([parse_reward(function_name)() for function_name in reward_functions])
         self.reward_weights = reward_weights
 
+        self.last_player_rewards = [None, None]
+
         self.reward_range = (-sum(reward_weights), sum(reward_weights))
 
         self.state = State(seed=seed, items=items, k=k, n=n)
@@ -55,6 +57,7 @@ def reset(self):
         self.state = State(seed=self._seed, items=self.items)
 
         self.episodes += 1
+        self.last_player_rewards = [None, None]
 
     def render(self, mode: str = 'text'):
         """Builds a representation of the current state."""
diff --git a/gym_locm/envs/battle.py b/gym_locm/envs/battle.py
index 31d558a..4a8eaa7 100644
--- a/gym_locm/envs/battle.py
+++ b/gym_locm/envs/battle.py
@@ -79,8 +79,9 @@ def step(self, action):
         # less property accesses
         state = self.state
 
-        reward_before = [weight * function.calculate(state, for_player=PlayerOrder.FIRST)
-                         for function, weight in zip(self.reward_functions, self.reward_weights)]
+        self.last_player_rewards[state.current_player.id] = \
+            [weight * function.calculate(state, for_player=PlayerOrder.FIRST)
+             for function, weight in zip(self.reward_functions, self.reward_weights)]
 
         # execute the action
         if action is not None:
@@ -88,13 +89,18 @@ def step(self, action):
         else:
             state.was_last_action_invalid = True
 
+        reward_before = self.last_player_rewards[state.current_player.id]
         reward_after = [weight * function.calculate(state, for_player=PlayerOrder.FIRST)
                         for function, weight in zip(self.reward_functions, self.reward_weights)]
 
         # build return info
         winner = state.winner
 
-        raw_rewards = tuple([after - before for before, after in zip(reward_before, reward_after)])
+        if reward_before is None:
+            raw_rewards = (0.0,) * len(self.reward_functions)
+        else:
+            raw_rewards = tuple([after - before for before, after in zip(reward_before, reward_after)])
+
         reward = sum(raw_rewards)
         done = winner is not None
         info = {'phase': state.phase,
diff --git a/gym_locm/envs/draft.py b/gym_locm/envs/draft.py
index 4080501..2e158cf 100644
--- a/gym_locm/envs/draft.py
+++ b/gym_locm/envs/draft.py
@@ -97,8 +97,9 @@ def step(self, action: Union[int, Action]) -> (np.array, int, bool, dict):
         state = self.state
         current_player_id = state.current_player.id
 
-        reward_before = [weight * function.calculate(state, for_player=current_player_id)
-                         for function, weight in zip(self.reward_functions, self.reward_weights)]
+        self.last_player_rewards[state.current_player.id] = \
+            [weight * function.calculate(state, for_player=current_player_id)
+             for function, weight in zip(self.reward_functions, self.reward_weights)]
 
         # find appropriate value for the provided card index
         if 0 <= action.origin < self.k:
@@ -113,6 +114,7 @@ def step(self, action: Union[int, Action]) -> (np.array, int, bool, dict):
         # execute the action
         state.act(action)
 
+        reward_before = self.last_player_rewards[state.current_player.id]
         reward_after = [weight * function.calculate(state, for_player=current_player_id)
                         for function, weight in zip(self.reward_functions, self.reward_weights)]
 
@@ -152,7 +154,11 @@ def step(self, action: Union[int, Action]) -> (np.array, int, bool, dict):
 
         del info['turn']
 
-        raw_rewards = tuple([after - before for before, after in zip(reward_before, reward_after)])
+        if reward_before is None:
+            raw_rewards = (0.0,) * len(self.reward_functions)
+        else:
+            raw_rewards = tuple([after - before for before, after in zip(reward_before, reward_after)])
+
         info['raw_rewards'] = raw_rewards
 
         reward = sum(raw_rewards)

From 531a02c1559a0ebab5e9f194e02b0a8762ec4d26 Mon Sep 17 00:00:00 2001
From: Ronaldo e Silva Vieira
Date: Fri, 20 May 2022 09:58:28 -0300
Subject: [PATCH 26/27] Fix training against fixed adversary

---
 gym_locm/experiments/training.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gym_locm/experiments/training.py b/gym_locm/experiments/training.py
index 73ca97f..58b6745 100644
--- a/gym_locm/experiments/training.py
+++ b/gym_locm/experiments/training.py
@@ -144,7 +144,7 @@ def run():
         }
 
         if args.adversary == 'fixed':
-            env_params['battle_agent'] = battle_agent(),
+            env_params['battle_agent'] = battle_agent()
 
             eval_env_params = {
                 'draft_agents': (draft_agent(), draft_agent()),

From 53f416ff7970b1df58f6416713dc7415e22c0c43 Mon Sep 17 00:00:00 2001
From: Ronaldo Vieira
Date: Tue, 31 May 2022 10:29:57 -0300
Subject: [PATCH 27/27] Bump version to 1.2.0

---
 CITATION.cff | 2 +-
 setup.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CITATION.cff b/CITATION.cff
index 351e611..62bd3f2 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -19,5 +19,5 @@ license: MIT
 message: "If you use this software, please cite it as below."
 repository-code: "https://github.com/ronaldosvieira/gym-locm"
 title: "OpenAI Gym Environments for Legends of Code and Magic"
-version: "1.1.0"
+version: "1.2.0"
 ...
diff --git a/setup.py b/setup.py
index 7346b47..64113ce 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup
 
 setup(name='gym_locm',
-      version='1.0.0',
+      version='1.2.0',
       install_requires=['gym', 'numpy', 'prettytable', 'pexpect', 'sty'],
       extras_require={
           'experiments': ['numpy', 'scipy', 'stable_baselines', 'hyperopt',