diff --git a/CITATION.cff b/CITATION.cff index 62bd3f2..e130815 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -19,5 +19,5 @@ license: MIT message: "If you use this software, please cite it as below." repository-code: "https://github.com/ronaldosvieira/gym-locm" title: "OpenAI Gym Environments for Legends of Code and Magic" -version: "1.2.0" +version: "1.3.0" ... diff --git a/README.md b/README.md index 114f943..ed7290d 100644 --- a/README.md +++ b/README.md @@ -243,8 +243,9 @@ engine, and with a specific random seed: ### Train draft agents with deep reinforcement learning We provide scripts to train deep reinforcement learning draft agents as described in our -thesis [2] and paper [3]. Further instructions are available in the README.md in -the [experiments](https://github.com/ronaldosvieira/gym-locm/tree/master/gym_locm/experiments) +thesis [2] and SBGames 2020 paper [3]. +Further instructions are available in the README.md file in +the [experiments](gym_locm/experiments) package. To install the dependencies necessary to run the scripts, install @@ -253,16 +254,25 @@ the repository with pip install -e .['experiments'] ``` -### Use trained draft agents - -We provide a collection of draft agents trained with deep +We also provide a collection of draft agents trained with deep reinforcement learning, and a script to use them in the LOCM's original engine. Further details on these agents and instructions for the script are available in the README.md in the -[trained_models](https://github.com/ronaldosvieira/gym-locm/tree/master/gym_locm/trained_models) +[trained_models](gym_locm/trained_models) +package. The use of these draft agents with the Runner script is not implemented yet. + +### Train battle agents with deep reinforcement learning + +We provide scripts to train deep reinforcement learning battle agents as described in our +SBGames 2022 paper [4]. Further instructions are available +in the README.md file in the [experiments/papers/sbgames-2022](gym_locm/experiments/papers/sbgames-2022) package. -The use of these draft agents with the Runner script is not implemented yet. +To install the dependencies necessary to run the scripts, install +the repository with +```python +pip install -e .['experiments'] +``` ## References 1. Kowalski, J., Miernik, R. (2020). Evolutionary @@ -276,5 +286,9 @@ of Minas Gerais, Belo Horizonte, Brazil. Collectible Card Games via Reinforcement Learning. 19th Brazilian Symposium of Computer Games and Digital Entertainment (SBGames). +4. Vieira, R., Tavares, A. R., Chaimowicz, L. (2022). Exploring Deep + Reinforcement Learning for Battling in Collectible Card Games. 19th Brazilian Symposium + of Computer Games and Digital Entertainment (SBGames). 
+ ## License [MIT](https://choosealicense.com/licenses/mit/) diff --git a/gym_locm/agents.py b/gym_locm/agents.py index 94c42f4..e32ee89 100644 --- a/gym_locm/agents.py +++ b/gym_locm/agents.py @@ -1198,8 +1198,10 @@ def act(self, state): "pass": PassBattleAgent, "random": RandomBattleAgent, "greedy": GreedyBattleAgent, + "osl": GreedyBattleAgent, "rule-based": RuleBasedBattleAgent, "max-attack": MaxAttackBattleAgent, + "ma": MaxAttackBattleAgent, "coac": CoacBattleAgent, "mcts": MCTSBattleAgent } diff --git a/gym_locm/envs/battle.py b/gym_locm/envs/battle.py index 4a8eaa7..29bdb57 100644 --- a/gym_locm/envs/battle.py +++ b/gym_locm/envs/battle.py @@ -200,13 +200,15 @@ def get_episode_rewards(self): class LOCMBattleSingleEnv(LOCMBattleEnv): def __init__(self, battle_agent=RandomBattleAgent(), - play_first=True, **kwargs): + play_first=True, alternate_roles=False, **kwargs): # init the env super().__init__(**kwargs) - # also init the battle agent and the new parameter + # also init the battle agent and the new parameters self.battle_agent = battle_agent self.play_first = play_first + self.alternate_roles = alternate_roles + self.rewards_single_player = [] # reset the battle agent self.battle_agent.reset() @@ -216,6 +218,9 @@ def reset(self) -> np.array: Resets the environment. The game is put into its initial state and all agents are reset. """ + if self.alternate_roles: + self.play_first = not self.play_first + # reset what is needed encoded_state = super().reset() @@ -227,6 +232,8 @@ def reset(self) -> np.array: while self.state.current_player.id != PlayerOrder.SECOND: super().step(self.battle_agent.act(self.state)) + self.rewards_single_player.append(0.0) + return encoded_state def step(self, action): @@ -253,17 +260,27 @@ def step(self, action): if not self.play_first: reward = -reward + try: + self.rewards_single_player[-1] += reward + except IndexError: + self.rewards_single_player = [reward] + return state, reward, done, info + def get_episode_rewards(self): + return self.rewards_single_player + class LOCMBattleSelfPlayEnv(LOCMBattleEnv): - def __init__(self, play_first=True, adversary_policy=None, **kwargs): + def __init__(self, play_first=True, alternate_roles=True, adversary_policy=None, **kwargs): # init the env super().__init__(**kwargs) # also init the new parameters self.play_first = play_first self.adversary_policy = adversary_policy + self.alternate_roles = alternate_roles + self.rewards_single_player = [] def reset(self) -> np.array: """ @@ -273,8 +290,8 @@ def reset(self) -> np.array: # reset what is needed encoded_state = super().reset() - # also reset the battle agent - self.play_first = not self.play_first + if self.alternate_roles: + self.play_first = not self.play_first # if playing second, have first player play if not self.play_first: @@ -288,6 +305,8 @@ def reset(self) -> np.array: state, reward, done, info = super().step(0) break + self.rewards_single_player.append(0.0) + return encoded_state def step(self, action): @@ -315,4 +334,12 @@ def step(self, action): if not self.play_first: reward = -reward + try: + self.rewards_single_player[-1] += reward + except IndexError: + self.rewards_single_player = [reward] + return state, reward, done, info + + def get_episode_rewards(self): + return self.rewards_single_player diff --git a/gym_locm/envs/draft.py b/gym_locm/envs/draft.py index 2e158cf..1a5a05a 100644 --- a/gym_locm/envs/draft.py +++ b/gym_locm/envs/draft.py @@ -152,8 +152,6 @@ def step(self, action: Union[int, Action]) -> (np.array, int, bool, dict): done = True 
- del info['turn'] - if reward_before is None: raw_rewards = (0.0,) * len(self.reward_functions) else: diff --git a/gym_locm/envs/full_game.py b/gym_locm/envs/full_game.py index 07367e0..027c085 100644 --- a/gym_locm/envs/full_game.py +++ b/gym_locm/envs/full_game.py @@ -107,8 +107,6 @@ def step(self, action): if winner is not None: reward = 1 if winner == PlayerOrder.FIRST else -1 - del info['turn'] - return self.encode_state(), reward, done, info def _encode_state_battle(self): diff --git a/gym_locm/experiments/README.md b/gym_locm/experiments/README.md index 7d5d373..cd70fcb 100644 --- a/gym_locm/experiments/README.md +++ b/gym_locm/experiments/README.md @@ -17,7 +17,7 @@ python3 gym_locm/experiments/hyp-search.py --approach --battle-agent --path hyp_search_results/ --seed 96765 --processes 4 ``` -The list and range of hyperparameted explored is available in the Appendix of our paper and in Attachment A of +The list and range of hyperparameters explored is available in the Appendix of our paper and in Attachment A of our thesis. we performed hyperparameter tunings for all combinations of `` (`immediate`, `history` and `lstm`) and `` (`max-attack` and `greedy`). Each run of the script took around 2 days with the `max-attack` battle agent and more than a week with the `greedy` battle agent. To learn about other script's @@ -37,7 +37,7 @@ python3 gym_locm/experiments/training.py --approach --battle-agent ` and -``, using the best sets of hyperparameters found for them in the previous experiment. That comprises +``, using the best sets of hyperparameters found for them in the previous experiment. That comprises ten runs of the script, in which we used the seeds 32359627, 91615349, 88803987, 83140551, 50731732, 19279988, 35717793, 48046766, 86798618 and 62644993. diff --git a/gym_locm/experiments/papers/entcom-2022/README.md b/gym_locm/experiments/papers/entcom-2022/README.md new file mode 100644 index 0000000..e5ab9e5 --- /dev/null +++ b/gym_locm/experiments/papers/entcom-2022/README.md @@ -0,0 +1,109 @@ +# Reproducing the experiments from our Entertainment Computing 2022 paper + +This readme file contains the information necessary to reproduce the experiments +from our paper in Entertainment Computing 2022 named "_Exploring Deep Reinforcement Learning for +Drafting in Collectible Card Games_." Please contact +me at [ronaldo.vieira@dcc.ufmg.br](mailto:ronaldo.vieira@dcc.ufmg.br) in case any +of the instructions below do not work. + +The game engine for LOCM 1.2 can be found at [engine.py](../../../engine.py), which is used by the OpenAI +Gym environments (more info on the repository's main page). The implementation of our +approaches can be found in the experiment files mentioned below. The resulting agents can be found in the +[trained_models](../../../trained_models) folder, along with instructions on how to use them. + +## Section 4.1: hyperparameter search + +To perform a hyperparameter search, simply execute the [hyp-search.py](../../../experiments/hyp-search.py) script: + +``` +python3 gym_locm/experiments/hyp-search.py --approach <approach> --battle-agent <battle-agent> \ + --path hyp_search_results/ --seed 96765 --processes 4 +``` + +The list and range of hyperparameters explored is available in Appendix A of our paper. We performed +hyperparameter tunings for all combinations of `<approach>` (`immediate`, `history` and `lstm`) and +`<battle-agent>` (`ma` and `osl`). To learn about the script's other parameters, execute it with the +`--help` flag.
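+
+For convenience, all six `<approach>`/`<battle-agent>` combinations can be launched with a small shell loop such as
+the sketch below. The loop and the per-combination output folders are not part of the original script; they are just
+one assumed way of organizing the runs:
+
+```bash
+#!/usr/bin/env bash
+# Sketch: run the hyperparameter search for every approach/battle-agent pair.
+# Flags mirror the command above; the per-combination --path layout is our own choice.
+for approach in immediate history lstm; do
+  for battle_agent in ma osl; do
+    python3 gym_locm/experiments/hyp-search.py \
+      --approach "$approach" --battle-agent "$battle_agent" \
+      --path "hyp_search_results/${approach}-${battle_agent}/" \
+      --seed 96765 --processes 4
+  done
+done
+```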
+ +## Section 4.2: comparison between our approaches + +To train **two** draft agents (a 1st player and a 2nd player) with a specific draft approach and battle agent, +in asymmetric self-play, simply execute the [training.py](../../../experiments/training.py) script: + +``` +python3 gym_locm/experiments/training.py --approach <approach> --battle-agent <battle-agent> \ + --path training_results/ --switch-freq <switch-freq> --layers <layers> --neurons <neurons> \ + --act-fun <act-fun> --n-steps <n-steps> --nminibatches <nminibatches> \ + --noptepochs <noptepochs> --cliprange <cliprange> --vf-coef <vf-coef> --ent-coef <ent-coef> \ + --learning-rate <learning-rate> --seed 32359627 --concurrency 4 +``` + +We trained ten draft agents (five 1st players and five 2nd players) for each combination of `<approach>` and +`<battle-agent>`, using the best sets of hyperparameters found for them in the previous experiment. That comprises +five runs of the script, in which we used the seeds `32359627`, `91615349`, `88803987`, `83140551` and `50731732`. + +To learn about the script's other parameters, execute it with the `--help` flag. Running the script with all default +parameters will train an `immediate` drafter with the `ma` battler, using the best set of hyperparameters +we found for that combination. The best sets of hyperparameters for the other combinations are available in +Appendix A of our paper. + +## Section 4.3: comparison with other draft agents + +To run one of the tournaments, simply execute the [tournament.py](../../../experiments/tournament.py) script: +``` +python3 gym_locm/experiments/tournament.py \ + --drafters random max-attack coac closet-ai icebox chad \ + gym_locm/trained_models/<battler>/immediate-1M/ \ + gym_locm/trained_models/<battler>/lstm-1M/ \ + gym_locm/trained_models/<battler>/history-1M/ \ + --battler <battler> --concurrency 4 --games 1000 --path tournament_results/ \ + --seeds 32359627 91615349 88803987 83140551 50731732 +``` +replacing `<battler>` with either `ma` or `osl`, respectively, to run either tournament as +depicted in our paper. The script will create files at `tournament_results/` describing +the individual win rates of every set of matches, the aggregate win rates, average mana curves (section 4.3.2) +and every individual draft choice made by every agent, in CSV format, for human inspection, and as serialized +Pandas data frames (PKL format), for further data manipulation. To learn about the script's other +parameters, execute it with the `--help` flag. + +To reproduce the table of agent similarities and the plot containing the agents' three-dimensional coordinates +found via Principal Component Analysis and grouped via K-Means (section 4.3.3), simply execute the +[similarities.py](../../../experiments/similarities.py) script: +``` +python3 gym_locm/experiments/similarities.py \ + --files ma_tournament_results/choices.csv osl_tournament_results/choices.csv +``` +which will create files containing the similarities table (in CSV and PKL formats) and the plot (in PNG format) +in the current folder. + +## Section 4.4: agent improvement in the SCGAI competition + +We used the source code of the Strategy Card Game AI competition +([2019](https://github.com/acatai/Strategy-Card-Game-AI-Competition/tree/master/contest-2019-08-COG) and +[2020](https://github.com/acatai/Strategy-Card-Game-AI-Competition/tree/master/contest-2020-08-COG) editions) +to re-run the matches, replacing the *max-attack* player (named Baseline2) with a personalized player featuring +our best draft agent and the battle portion of the *max-attack* player.
This can be reproduced by altering line 11 (2019) or line 2 (2020) of the runner script (`run.sh`) from `AGENTS[10]="python3 Baseline2/main.py"` to +```bash +AGENTS[10]="python3 gym_locm/toolbox/predictor.py --battle \"python3 Baseline2/main.py\" \ + --draft-1 path/to/gym_locm/trained_models/max-attack/immediate-1M/1st/6.json \ + --draft-2 path/to/gym_locm/trained_models/max-attack/immediate-1M/2nd/8.json" +``` +then executing it. Parallelism can be achieved by running the script in multiple processes/machines. Save the +output to text files named `out-*.txt` (with a number instead of `*`) in the same folder, then run `analyze.py` +to extract win rates. The runner script can take up to several days, and the analyze script can take up to a few hours. +See the [trained_models](../../../trained_models) package for more information on the predictor script. + +## Section 4.5: importance of being history-aware in LOCM + +This experiment is simply a re-execution of the OSL tournament from section 4.3, adding a new draft agent to the +tournament (`historyless`). To reproduce it, execute the following script: +``` +python3 gym_locm/experiments/tournament.py \ + --drafters random max-attack coac closet-ai icebox chad historyless \ + gym_locm/trained_models/<battler>/immediate-1M/ \ + gym_locm/trained_models/<battler>/lstm-1M/ \ + gym_locm/trained_models/<battler>/history-1M/ \ + --battler osl --concurrency 4 --games 1000 --path osl_historyless_tournament_results/ \ + --seeds 32359627 91615349 88803987 83140551 50731732 +``` \ No newline at end of file diff --git a/gym_locm/experiments/papers/sbgames-2022/README.md b/gym_locm/experiments/papers/sbgames-2022/README.md new file mode 100644 index 0000000..aa66e69 --- /dev/null +++ b/gym_locm/experiments/papers/sbgames-2022/README.md @@ -0,0 +1,74 @@ +# Reproducing the experiments from our SBGames 2022 paper + +This readme file contains the information necessary to reproduce the experiments +from our paper in SBGames 2022 named "_Exploring Deep Reinforcement Learning for +Battling in Collectible Card Games_." Although we mention in the paper that we use +gym-locm's version 1.3.0, any future version should also suffice. Please contact +me at [ronaldo.vieira@dcc.ufmg.br](mailto:ronaldo.vieira@dcc.ufmg.br) in case any +of the instructions below do not work. + +## Experiment 1: hyperparameter search + +We use Weights and Biases (W&B) to orchestrate our hyperparameter search. The +`hyp-search.yaml` file contains the search configuration, including hyperparameter +ranges. With W&B installed, executing the following command in a terminal will +create a "sweep" on W&B: + +```commandline +wandb sweep gym_locm/experiments/sbgames-2022/hyp-search.yaml +``` + +This command will output a _sweep ID_, including the entity and project names. +Save it for the next step. +From this moment on, the hyperparameter search can be observed on W&B's website. +However, no training sessions will happen until you "recruit" one or more +computers to run them. That can be done by executing the following +command in a terminal: + +```commandline +wandb agent <sweep_id> +``` + +The `<sweep_id>` parameter should be the _sweep ID_ saved from the output of +the previous command. From now on, the recruited computers will run training sessions +continuously until you tell them to stop. That can be done on W&B's website or by +issuing a CTRL + C on the terminal where the training sessions are being executed. +In our paper, we executed 35 training sessions.
All the statistics can be seen on +W&B's website, including which sets of hyperparameters yielded the best results. +For more info on W&B sweeps, see [the docs](https://docs.wandb.ai/guides/sweeps). + +## Experiment 2: training in self-play + +Using the best set of hyperparameters found in the previous experiment, we executed +five training sessions, each with a different random seed. To reproduce the training +sessions we used for the paper, execute the following command in a terminal: + +```commandline +python gym_locm/experiments/training.py --act-fun=relu --adversary=self-play \ +--cliprange=0.2 --concurrency=4 --draft-agent=random --ent-coef=0.005 \ +--eval-episodes=500 --gamma=0.99 --layers=7 --learning-rate=0.0041142387646692325 \ +--n-steps=512 --neurons=455 --nminibatches-divider=1 --noptepochs=1 --num-evals=100 \ +--path=gym_locm/experiments/papers/sbgames-2022/self-play --role=alternate \ +--seed=<seed> --switch-freq=10 --task=battle --train-episodes=100000 --vf-coef=1 +``` + +Repeat the command five times, each time with a different `seed` parameter. The seeds we used were: +`91577453`, `688183`, `63008694`, `4662087`, and `58793266`. + +## Experiment 3: training against a fixed battle agent + +This experiment uses almost the same command as the previous one: + +```commandline +python gym_locm/experiments/training.py --act-fun=relu --adversary=fixed \ +--battle-agent=<battle_agent> --cliprange=0.2 --concurrency=4 --draft-agent=random \ +--ent-coef=0.005 --eval-episodes=500 --gamma=0.99 --layers=7 \ +--learning-rate=0.0041142387646692325 --n-steps=512 --neurons=455 \ +--nminibatches-divider=1 --noptepochs=1 --num-evals=100 \ +--path=gym_locm/experiments/papers/sbgames-2022/fixed --role=alternate --seed=<seed> \ + --switch-freq=10 --task=battle --train-episodes=100000 --vf-coef=1 +``` + +Repeat the command ten times, each time with a different combination of the `battle_agent` and `seed` +parameters. The seeds we used were: `91577453`, `688183`, `63008694`, `4662087`, +and `58793266`. The battle agents we used were `max-attack` (MA) and `greedy` (OSL).
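+
+For reference, the ten runs of Experiment 3 (two battle agents times five seeds) can be scripted with a loop like the
+sketch below. The loop and the per-run `--path` naming are not part of the original instructions; the remaining flags
+mirror the command above:
+
+```bash
+#!/usr/bin/env bash
+# Sketch: Experiment 3 runs, one per (battle agent, seed) pair.
+# The per-run --path suffix is our own naming convention.
+for battle_agent in max-attack greedy; do
+  for seed in 91577453 688183 63008694 4662087 58793266; do
+    python gym_locm/experiments/training.py --act-fun=relu --adversary=fixed \
+      --battle-agent="$battle_agent" --cliprange=0.2 --concurrency=4 --draft-agent=random \
+      --ent-coef=0.005 --eval-episodes=500 --gamma=0.99 --layers=7 \
+      --learning-rate=0.0041142387646692325 --n-steps=512 --neurons=455 \
+      --nminibatches-divider=1 --noptepochs=1 --num-evals=100 \
+      --path="gym_locm/experiments/papers/sbgames-2022/fixed-${battle_agent}-${seed}" \
+      --role=alternate --seed="$seed" --switch-freq=10 --task=battle \
+      --train-episodes=100000 --vf-coef=1
+  done
+done
+```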
diff --git a/gym_locm/experiments/papers/sbgames-2022/hyp-search.yaml b/gym_locm/experiments/papers/sbgames-2022/hyp-search.yaml new file mode 100644 index 0000000..091bddd --- /dev/null +++ b/gym_locm/experiments/papers/sbgames-2022/hyp-search.yaml @@ -0,0 +1,74 @@ +method: bayes +metric: + goal: maximize + name: eval_vs_GreedyBattleAgent/win_rate +name: sbgames-2022 +parameters: + act-fun: + value: relu + adversary: + value: self-play + cliprange: + value: 0.2 + concurrency: + value: 4 + draft-agent: + value: random + ent-coef: + value: 0.005 + eval-episodes: + value: 500 + gamma: + value: 0.99 + layers: + distribution: int_uniform + max: 12 + min: 3 + learning-rate: + distribution: uniform + max: 0.01 + min: 1e-06 + n-steps: + values: + - 64 + - 128 + - 256 + - 512 + - 1024 + - 2048 + neurons: + distribution: int_uniform + max: 512 + min: 32 + nminibatches-divider: + values: + - 1 + - 2 + - 4 + - 8 + - "n" + noptepochs: + distribution: int_uniform + max: 24 + min: 1 + num-evals: + value: 100 + path: + value: papers/sbgames-2022/sweep + role: + value: alternate + seed: + value: 91577453 + switch-freq: + values: + - 10 + - 100 + - 1000 + task: + value: battle + train-episodes: + value: 100000 + vf-coef: + value: 1 +program: gym_locm/experiments/training.py +project: sbgames-2022 \ No newline at end of file diff --git a/gym_locm/experiments/training.py b/gym_locm/experiments/training.py index 58b6745..469dfd6 100644 --- a/gym_locm/experiments/training.py +++ b/gym_locm/experiments/training.py @@ -19,14 +19,19 @@ def get_arg_parser(): approach = ['immediate', 'lstm', 'history'] battle_agents = ['max-attack', 'greedy'] adversary = ['fixed', 'self-play', 'asymmetric-self-play'] + roles = ['first', 'second', 'alternate'] p.add_argument("--task", "-t", choices=tasks, default="draft") p.add_argument("--approach", "-ap", choices=approach, default="immediate") p.add_argument("--adversary", "-ad", choices=adversary, default="asymmetric-self-play") p.add_argument("--draft-agent", "-d", choices=list(agents.draft_agents.keys()), default="max-attack") - p.add_argument("--battle-agent", "-b", choices=battle_agents, + p.add_argument("--battle-agent", "-b", choices=list(agents.battle_agents.keys()), default="max-attack") + p.add_argument("--eval-battle-agents", "-eb", choices=list(agents.battle_agents.keys()), + nargs="+", default=["max-attack", "greedy"], help="battle agents to use on evaluation") + p.add_argument("--role", "-r", choices=roles, default="alternate", + help="whether to train as first player, second player or alternate") p.add_argument("--reward-functions", "-rf", nargs="+", choices=list(rewards.available_rewards.keys()), default=("win-loss",), help="reward functions to use") p.add_argument("--reward-weights", "-rw", nargs="+", type=float, @@ -109,10 +114,7 @@ def run(): else: model_builder = model_builder_mlp - if args.battle_agent == 'greedy': - battle_agent = agents.GreedyBattleAgent - else: - battle_agent = agents.MaxAttackBattleAgent + battle_agent = agents.parse_battle_agent(args.battle_agent) env_params = { 'battle_agents': (battle_agent(), battle_agent()), @@ -137,6 +139,11 @@ def run(): draft_agent = agents.parse_draft_agent(args.draft_agent) battle_agent = agents.parse_battle_agent(args.battle_agent) + if args.eval_battle_agents is None: + args.eval_battle_agents = [args.battle_agent] + + eval_battle_agents = list(map(agents.parse_battle_agent, args.eval_battle_agents)) + env_params = { 'draft_agents': (draft_agent(), draft_agent()), 'reward_functions': args.reward_functions, @@ 
-146,12 +153,15 @@ def run(): if args.adversary == 'fixed': env_params['battle_agent'] = battle_agent() - eval_env_params = { - 'draft_agents': (draft_agent(), draft_agent()), - 'battle_agent': battle_agent(), - 'reward_functions': args.reward_functions, - 'reward_weights': args.reward_weights - } + eval_env_params = [] + + for eval_battle_agent in eval_battle_agents: + eval_env_params.append({ + 'draft_agents': (draft_agent(), draft_agent()), + 'battle_agent': eval_battle_agent(), + 'reward_functions': args.reward_functions, + 'reward_weights': args.reward_weights + }) else: raise Exception("Invalid task") @@ -168,15 +178,18 @@ def run(): 'activation': args.act_fun, 'learning_rate': args.learning_rate, 'tensorboard_log': args.path + '/tf_logs', 'gamma': args.gamma} - run = wandb.init( - project=args.wandb_project, - entity=args.wandb_entity, - sync_tensorboard=True, - config=vars(args) - ) + if args.task == 'battle': + run = wandb.init( + project=args.wandb_project, + entity=args.wandb_entity, + sync_tensorboard=True, + config=vars(args) + ) - # enable the use of wandb sweeps - args = wandb.config + # enable the use of wandb sweeps + args = wandb.config + else: + run = None if args.adversary == 'asymmetric-self-play': trainer = AsymmetricSelfPlay( @@ -189,14 +202,14 @@ def run(): trainer = SelfPlay( args.task, model_builder, model_params, env_params, eval_env_params, args.train_episodes, args.eval_episodes, args.num_evals, - args.switch_freq, args.path, args.seed, args.concurrency, + args.role, args.switch_freq, args.path, args.seed, args.concurrency, wandb_run=run ) elif args.adversary == 'fixed': trainer = FixedAdversary( args.task, model_builder, model_params, env_params, eval_env_params, args.train_episodes, args.eval_episodes, args.num_evals, - True, args.path, args.seed, args.concurrency, wandb_run=run + args.role, args.path, args.seed, args.concurrency, wandb_run=run ) else: raise Exception("Invalid adversary") diff --git a/gym_locm/toolbox/trainer_battle.py b/gym_locm/toolbox/trainer_battle.py index de22688..dfeb06c 100644 --- a/gym_locm/toolbox/trainer_battle.py +++ b/gym_locm/toolbox/trainer_battle.py @@ -3,6 +3,8 @@ import math import os import time +from typing import List + import numpy as np from abc import abstractmethod from datetime import datetime @@ -15,7 +17,7 @@ from sb3_contrib import MaskablePPO from wandb.integration.sb3 import WandbCallback -from gym_locm.agents import Agent, MaxAttackDraftAgent, MaxAttackBattleAgent, RLBattleAgent, RLDraftAgent +from gym_locm.agents import Agent, RLBattleAgent, RLDraftAgent from gym_locm.envs import LOCMBattleSingleEnv from gym_locm.envs.battle import LOCMBattleSelfPlayEnv @@ -51,21 +53,6 @@ def __init__(self, task, params, path, seed, wandb_run=None): def _train(self): pass - def _save_results(self): - results_path = self.path + '/results.json' - - with open(results_path, 'w') as file: - info = dict(task=self.task, **self.params, seed=self.seed, checkpoints=self.checkpoints, - win_rates=self.win_rates, ep_lengths=self.episode_lengths, - battle_lengths=self.battle_lengths, - action_histograms=self.action_histograms, - start_time=str(self.start_time), end_time=str(self.end_time)) - info = json.dumps(info, indent=2) - - file.write(info) - - self.logger.debug(f"Results saved at {results_path}.") - def run(self): # log start time self.start_time = datetime.now() @@ -78,14 +65,11 @@ def run(self): self.end_time = datetime.now() self.logger.info(f"End of training. 
Time elapsed: {self.end_time - self.start_time}.") - # save model info to results file - self._save_results() - class FixedAdversary(TrainingSession): def __init__(self, task, model_builder, model_params, env_params, eval_env_params, train_episodes, eval_episodes, num_evals, - play_first, path, seed, num_envs=1, wandb_run=None): + role, path, seed, num_envs=1, wandb_run=None): super(FixedAdversary, self).__init__( task, model_params, path, seed, wandb_run=wandb_run) @@ -106,7 +90,8 @@ def __init__(self, task, model_builder, model_params, env_params, current_seed = None # create the env - env.append(lambda: env_class(seed=current_seed, play_first=play_first, **env_params)) + env.append(lambda: env_class( + seed=current_seed, play_first=role == 'first', alternate_roles=role == 'alternate', **env_params)) # wrap envs in a vectorized env self.env: VecEnv3 = DummyVecEnv3(env) @@ -114,8 +99,8 @@ def __init__(self, task, model_builder, model_params, env_params, # initialize evaluator self.logger.debug("Initializing evaluator...") eval_seed = seed + train_episodes if seed is not None else None - self.evaluator: Evaluator = Evaluator(task, eval_env_params, eval_episodes, - eval_seed, num_envs) + self.evaluators: List[Evaluator] = \ + [Evaluator(task, e, eval_episodes, eval_seed, num_envs) for e in eval_env_params] # build the model self.logger.debug("Building the model...") @@ -132,11 +117,13 @@ def __init__(self, task, model_builder, model_params, env_params, self.train_episodes = train_episodes self.num_evals = num_evals self.eval_frequency = train_episodes / num_evals + self.eval_adversaries = [type(e['battle_agent']).__name__ for e in eval_env_params] + self.role = role # initialize control attributes self.model.last_eval = None self.model.next_eval = 0 - self.model.role_id = 0 if play_first else 1 + self.model.role_id = 0 if role == 'first' else 1 # log end time end_time = time.perf_counter() @@ -163,44 +150,48 @@ def _training_callback(self, _locals=None, _globals=None): agent = agent_class(self.model, deterministic=True) - win_rate, mean_reward, ep_length, battle_length, act_hist = \ - self.evaluator.run(agent, play_first=self.model.role_id == 0) + for evaluator, eval_adversary in zip(self.evaluators, self.eval_adversaries): + win_rate, mean_reward, ep_length, battle_length, act_hist = \ + evaluator.run(agent, play_first=self.role == 'first', alternate_roles=self.role == 'alternate') - end_time = time.perf_counter() - self.logger.info(f"Finished evaluating " - f"({round(end_time - start_time, 3)}s). " - f"Avg. reward: {mean_reward}") + end_time = time.perf_counter() + self.logger.info(f"Finished evaluating vs {eval_adversary} " + f"({round(end_time - start_time, 3)}s). " + f"Avg. 
reward: {mean_reward}") - # save the results - self.checkpoints.append(episodes_so_far) - self.win_rates.append(win_rate) - self.episode_lengths.append(ep_length) - self.battle_lengths.append(battle_length) - self.action_histograms.append(act_hist) + # save the results + self.checkpoints.append(episodes_so_far) + self.win_rates.append(win_rate) + self.episode_lengths.append(ep_length) + self.battle_lengths.append(battle_length) + self.action_histograms.append(act_hist) - # update control attributes - self.model.last_eval = episodes_so_far - self.model.next_eval += self.eval_frequency + # update control attributes + self.model.last_eval = episodes_so_far + self.model.next_eval += self.eval_frequency - # write partial results to file - self._save_results() + # upload stats to wandb, if enabled + if self.wandb_run: + panel_name = f"eval_vs_{eval_adversary}" - # upload stats to wandb, if enabled - if self.wandb_run: - info = dict(checkpoint=episodes_so_far, mean_reward=mean_reward, - win_rate=win_rate, mean_ep_length=ep_length, - mean_battle_length=battle_length) + info = dict() + + info['checkpoint'] = episodes_so_far + info[panel_name + '/mean_reward'] = mean_reward + info[panel_name + '/win_rate'] = win_rate + info[panel_name + '/mean_ep_length'] = ep_length + info[panel_name + '/mean_battle_length'] = battle_length - info['pass_actions'] = act_hist[0] - info['summon_actions'] = sum(act_hist[1:17]) + info[panel_name + '/pass_actions'] = act_hist[0] + info[panel_name + '/summon_actions'] = sum(act_hist[1:17]) - if self.env.get_attr('items', indices=[0])[0]: - info['use_actions'] = sum(act_hist[17:121]) - info['attack_actions'] = sum(act_hist[121:]) - else: - info['attack_actions'] = sum(act_hist[17:]) + if self.env.get_attr('items', indices=[0])[0]: + info[panel_name + '/use_actions'] = sum(act_hist[17:121]) + info[panel_name + '/attack_actions'] = sum(act_hist[121:]) + else: + info[panel_name + '/attack_actions'] = sum(act_hist[17:]) - self.wandb_run.log(info) + self.wandb_run.log(info) # if training should end, return False to end training training_is_finished = episodes_so_far >= self.train_episodes @@ -231,15 +222,17 @@ def _train(self): if len(self.win_rates) < self.num_evals: self._training_callback() - # close the envs - for e in (self.env, self.evaluator): + # close all envs + self.env.close() + + for e in self.evaluators: e.close() class SelfPlay(TrainingSession): def __init__(self, task, model_builder, model_params, env_params, eval_env_params, train_episodes, eval_episodes, num_evals, - switch_frequency, path, seed, num_envs=1, wandb_run=None): + role, switch_frequency, path, seed, num_envs=1, wandb_run=None): super(SelfPlay, self).__init__( task, model_params, path, seed, wandb_run=wandb_run) @@ -260,7 +253,8 @@ def __init__(self, task, model_builder, model_params, env_params, current_seed = None # create one env per process - env.append(lambda: env_class(seed=current_seed, play_first=True, **env_params)) + env.append(lambda: env_class( + seed=current_seed, play_first=role == 'first', alternate_roles=role == 'alternate', **env_params)) # wrap envs in a vectorized env self.env: VecEnv3 = DummyVecEnv3(env) @@ -268,8 +262,8 @@ def __init__(self, task, model_builder, model_params, env_params, # initialize parallel evaluating environments self.logger.debug("Initializing evaluation envs...") eval_seed = seed + train_episodes if seed is not None else None - self.evaluator: Evaluator = Evaluator(task, eval_env_params, eval_episodes // 2, - eval_seed, num_envs) + self.evaluators: 
List[Evaluator] = \ + [Evaluator(task, e, eval_episodes, eval_seed, num_envs) for e in eval_env_params] # build the models self.logger.debug("Building the models...") @@ -283,7 +277,7 @@ def __init__(self, task, model_builder, model_params, env_params, def make_adversary_policy(model, env): def adversary_policy(obs): actions, _ = model.adversary.predict( - obs, deterministic=True, action_masks=env.env_method('action_masks')[0]) + obs, action_masks=env.env_method('action_masks')[0]) return actions @@ -305,6 +299,8 @@ def adversary_policy(obs): self.switch_frequency = switch_frequency self.eval_frequency = train_episodes / num_evals self.num_switches = math.ceil(train_episodes / switch_frequency) + self.eval_adversaries = [type(e['battle_agent']).__name__ for e in eval_env_params] + self.role = role # initialize control attributes self.model.last_eval, self.model.next_eval = None, 0 @@ -326,14 +322,6 @@ def _training_callback(self, _locals=None, _globals=None): model = self.model episodes_so_far = sum(self.env.get_attr('episodes')) - # note: wtf was this code about, ronaldo??? - # turns = model.env.get_attr('turn') - # playing_first = model.env.get_attr('play_first') - # - # for i in range(model.env.num_envs): - # if turns[i] in range(0, model.env.num_envs): - # model.env.set_attr('play_first', not playing_first[i], indices=[i]) - # if it is time to evaluate, do so if episodes_so_far >= model.next_eval: # save model @@ -349,59 +337,52 @@ def _training_callback(self, _locals=None, _globals=None): agent_class = RLBattleAgent - if self.evaluator.seed is not None: - self.evaluator.seed = self.seed + self.train_episodes + for evaluator, eval_adversary in zip(self.evaluators, self.eval_adversaries): + if evaluator.seed is not None: + evaluator.seed = self.seed + self.train_episodes - win_rate, mean_reward, ep_length, battle_length, act_hist = \ - self.evaluator.run(agent_class(model, deterministic=True), play_first=True) + win_rate, mean_reward, ep_length, battle_length, act_hist = \ + evaluator.run(agent_class(model, deterministic=True), + play_first=self.role == 'first', alternate_roles=self.role == 'alternate') - if self.evaluator.seed is not None: - self.evaluator.seed += self.eval_episodes + end_time = time.perf_counter() + self.logger.info(f"Finished evaluating vs {eval_adversary} " + f"({round(end_time - start_time, 3)}s). " + f"Avg. reward: {mean_reward}") - win_rate2, mean_reward2, ep_length2, battle_length2, act_hist2 = \ - self.evaluator.run(agent_class(model, deterministic=True), play_first=False) + # save the results + self.checkpoints.append(episodes_so_far) + self.win_rates.append(win_rate) + self.episode_lengths.append(ep_length) + self.battle_lengths.append(battle_length) + self.action_histograms.append(act_hist) - mean_reward = (mean_reward + mean_reward2) / 2 - win_rate = (win_rate + win_rate2) / 2 - ep_length = (ep_length + ep_length2) / 2 - battle_length = (battle_length + battle_length2) / 2 - act_hist = [(act_hist[i] + act_hist2[i]) / 2 for i in range(model.env.get_attr('action_space', indices=[0])[0].n)] + # update control attributes + model.last_eval = episodes_so_far + model.next_eval += self.eval_frequency - end_time = time.perf_counter() - self.logger.info(f"Finished evaluating " - f"({round(end_time - start_time, 3)}s). " - f"Avg. 
reward: {mean_reward}") + # upload stats to wandb, if enabled + if self.wandb_run: + panel_name = f"eval_vs_{eval_adversary}" - # save the results - self.checkpoints.append(episodes_so_far) - self.win_rates.append(win_rate) - self.episode_lengths.append(ep_length) - self.battle_lengths.append(battle_length) - self.action_histograms.append(act_hist) + info = dict() - # update control attributes - model.last_eval = episodes_so_far - model.next_eval += self.eval_frequency - - # write partial results to file - self._save_results() - - # upload stats to wandb, if enabled - if self.wandb_run: - info = dict(checkpoint=episodes_so_far, mean_reward=mean_reward, - win_rate=win_rate, mean_ep_length=ep_length, - mean_battle_length=battle_length) + info['checkpoint'] = episodes_so_far + info[panel_name + '/mean_reward'] = mean_reward + info[panel_name + '/win_rate'] = win_rate + info[panel_name + '/mean_ep_length'] = ep_length + info[panel_name + '/mean_battle_length'] = battle_length - info['pass_actions'] = act_hist[0] - info['summon_actions'] = sum(act_hist[1:17]) + info[panel_name + '/pass_actions'] = act_hist[0] + info[panel_name + '/summon_actions'] = sum(act_hist[1:17]) - if self.env.get_attr('items', indices=[0])[0]: - info['use_actions'] = sum(act_hist[17:121]) - info['attack_actions'] = sum(act_hist[121:]) - else: - info['attack_actions'] = sum(act_hist[17:]) + if self.env.get_attr('items', indices=[0])[0]: + info[panel_name + '/use_actions'] = sum(act_hist[17:121]) + info[panel_name + '/attack_actions'] = sum(act_hist[121:]) + else: + info[panel_name + '/attack_actions'] = sum(act_hist[17:]) - self.wandb_run.log(info) + self.wandb_run.log(info) # if it is time to update the adversary model, do so if episodes_so_far >= model.next_switch: @@ -418,7 +399,7 @@ def _training_callback(self, _locals=None, _globals=None): # reset training env rewards for i in range(model.env.num_envs): - model.env.set_attr('rewards', [0.0], indices=[i]) + model.env.set_attr('rewards_single_player', [], indices=[i]) # update parameters of adversary models model.adversary.set_parameters(model.get_parameters(), exact_match=True) @@ -440,9 +421,6 @@ def _train(self): callbacks.append(WandbCallback(gradient_save_freq=0, verbose=0)) try: - self.logger.debug(f"Training will switch models every " - f"{self.switch_frequency} episodes") - # train the model self.model.learn(total_timesteps=REALLY_BIG_INT, reset_num_timesteps=False, @@ -461,8 +439,10 @@ def _train(self): if len(self.win_rates) < self.num_evals: self._training_callback({'self': self.model}) - # close the envs - for e in (self.env, self.evaluator): + # close all envs + self.env.close() + + for e in self.evaluators: e.close() @@ -490,8 +470,8 @@ def __init__(self, task, model_builder, model_params, env_params, current_seed = None # create one env per process - env1.append(lambda: env_class(seed=current_seed, play_first=True, **env_params)) - env2.append(lambda: env_class(seed=current_seed, play_first=False, **env_params)) + env1.append(lambda: env_class(seed=current_seed, play_first=True, alternate_role=False, **env_params)) + env2.append(lambda: env_class(seed=current_seed, play_first=False, alternate_role=False, **env_params)) # wrap envs in a vectorized env self.env1: VecEnv3 = DummyVecEnv3(env1) @@ -500,8 +480,8 @@ def __init__(self, task, model_builder, model_params, env_params, # initialize parallel evaluating environments self.logger.debug("Initializing evaluation envs...") eval_seed = seed + train_episodes if seed is not None else None - self.evaluator: 
Evaluator = Evaluator(task, eval_env_params, eval_episodes, - eval_seed, num_envs) + self.evaluators: List[Evaluator] = \ + [Evaluator(task, e, eval_episodes, eval_seed, num_envs) for e in eval_env_params] # build the models self.logger.debug("Building the models...") @@ -518,7 +498,7 @@ def __init__(self, task, model_builder, model_params, env_params, def make_adversary_policy(model, env): def adversary_policy(obs): actions, _ = model.adversary.predict( - obs, deterministic=True, action_masks=env.env_method('action_masks')[0]) + obs, action_masks=env.env_method('action_masks')[0]) return actions @@ -542,6 +522,7 @@ def adversary_policy(obs): self.switch_frequency = switch_frequency self.eval_frequency = train_episodes / num_evals self.num_switches = math.ceil(train_episodes / switch_frequency) + self.eval_adversaries = [type(e['battle_agent']).__name__ for e in eval_env_params] # initialize control attributes self.model1.role_id, self.model2.role_id = 0, 1 @@ -582,45 +563,45 @@ def _training_callback(self, _locals=None, _globals=None): agent_class = RLBattleAgent - win_rate, mean_reward, ep_length, battle_length, act_hist = \ - self.evaluator.run(agent_class(model, deterministic=True), play_first=model.role_id == 0) - - end_time = time.perf_counter() - self.logger.info(f"Finished evaluating " - f"({round(end_time - start_time, 3)}s). " - f"Avg. reward: {mean_reward}") - - # save the results - self.checkpoints[model.role_id].append(episodes_so_far) - self.win_rates[model.role_id].append(win_rate) - self.episode_lengths[model.role_id].append(ep_length) - self.battle_lengths[model.role_id].append(battle_length) - self.action_histograms[model.role_id].append(act_hist) - - # update control attributes - model.last_eval = episodes_so_far - model.next_eval += self.eval_frequency - - # write partial results to file - self._save_results() - - # upload stats to wandb, if enabled - if self.wandb_run: - info = {'checkpoint_' + model.role_id: episodes_so_far, - 'mean_reward_' + model.role_id: mean_reward, - 'win_rate_' + model.role_id: win_rate, - 'mean_ep_length_' + model.role_id: ep_length, - 'mean_battle_length_' + model.role_id: battle_length, - 'pass_actions_' + model.role_id: act_hist[0], - 'summon_actions_' + model.role_id: sum(act_hist[1:17])} - - if model.env.get_attr('items', indices=[0])[0]: - info['use_actions'] = sum(act_hist[17:121]) - info['attack_actions'] = sum(act_hist[121:]) - else: - info['attack_actions'] = sum(act_hist[17:]) - - self.wandb_run.log(info) + for evaluator, eval_adversary in zip(self.evaluators, self.eval_adversaries): + win_rate, mean_reward, ep_length, battle_length, act_hist = \ + evaluator.run(agent_class(model, deterministic=True), play_first=model.role_id == 0) + + end_time = time.perf_counter() + self.logger.info(f"Finished evaluating vs {eval_adversary} " + f"({round(end_time - start_time, 3)}s). " + f"Avg. 
reward: {mean_reward}") + + # save the results + self.checkpoints[model.role_id].append(episodes_so_far) + self.win_rates[model.role_id].append(win_rate) + self.episode_lengths[model.role_id].append(ep_length) + self.battle_lengths[model.role_id].append(battle_length) + self.action_histograms[model.role_id].append(act_hist) + + # update control attributes + model.last_eval = episodes_so_far + model.next_eval += self.eval_frequency + + # upload stats to wandb, if enabled + if self.wandb_run: + panel_name = f"eval_vs_{eval_adversary}" + + info = {'checkpoint_' + model.role_id: episodes_so_far, + panel_name + '/mean_reward_' + model.role_id: mean_reward, + panel_name + '/win_rate_' + model.role_id: win_rate, + panel_name + '/mean_ep_length_' + model.role_id: ep_length, + panel_name + '/mean_battle_length_' + model.role_id: battle_length, + panel_name + '/pass_actions_' + model.role_id: act_hist[0], + panel_name + '/summon_actions_' + model.role_id: sum(act_hist[1:17])} + + if model.env.get_attr('items', indices=[0])[0]: + info[panel_name + '/use_actions' + model.role_id] = sum(act_hist[17:121]) + info[panel_name + '/attack_actions' + model.role_id] = sum(act_hist[121:]) + else: + info[panel_name + '/attack_actions' + model.role_id] = sum(act_hist[17:]) + + self.wandb_run.log(info) # if training should end, return False to end training training_is_finished = episodes_so_far >= model.next_switch or episodes_so_far >= self.train_episodes @@ -702,8 +683,11 @@ def _train(self): if len(self.win_rates[1]) < self.num_evals: self._training_callback({'self': self.model1}) - # close the envs - for e in (self.env1, self.env2, self.evaluator): + # close all envs + self.env1.close() + self.env2.close() + + for e in self.evaluators: e.close() @@ -734,11 +718,13 @@ def __init__(self, task, env_params, episodes, seed, num_envs): self.logger.debug("Finished initializing evaluator " f"({round(end_time - start_time, ndigits=3)}s).") - def run(self, agent: Agent, play_first=True): + def run(self, agent: Agent, play_first=True, alternate_roles=False): """ Evaluates an agent. :param agent: (gym_locm.agents.Agent) Agent to be evaluated. :param play_first: Whether the agent will be playing first. + :param alternate_roles: Whether the agent should be alternating + between playing first and second :return: A tuple containing the `win_rate`, the `mean_reward`, the `mean_length` and the `action_histogram` of the evaluation episodes. 
""" @@ -753,8 +739,7 @@ def run(self, agent: Agent, play_first=True): # set agent role self.env.set_attr('play_first', play_first) - - player = 0 if play_first else 1 + self.env.set_attr('alternate_roles', alternate_roles) # reset the env observations = self.env.reset() @@ -769,6 +754,9 @@ def run(self, agent: Agent, play_first=True): # run the episodes while True: + # get current role info + roles = [0 if play_first else 1 for play_first in self.env.get_attr('play_first')] + # get the agent's action for all parallel envs # todo: do this in a more elegant way if isinstance(agent, RLDraftAgent): @@ -793,7 +781,7 @@ def run(self, agent: Agent, play_first=True): episode_lengths[i][-1] += 1 if dones[i]: - episode_wins[i].append(1 if infos[i]['winner'] == player else 0) + episode_wins[i].append(1 if infos[i]['winner'] == roles[i] else 0) episode_rewards[i].append(0.0) episode_lengths[i].append(0) episode_turns[i].append(infos[i]['turn']) @@ -849,7 +837,7 @@ def save_model_as_json(model, act_fun, path): def model_builder_mlp_masked(env, seed, neurons, layers, activation, n_steps, nminibatches, noptepochs, cliprange, vf_coef, ent_coef, - learning_rate, gamma, tensorboard_log=None): + learning_rate, gamma=1, tensorboard_log=None): net_arch = [neurons] * layers activation = dict(tanh=th.nn.Tanh, relu=th.nn.ReLU, elu=th.nn.ELU)[activation] diff --git a/gym_locm/toolbox/trainer_draft.py b/gym_locm/toolbox/trainer_draft.py index 3d40efb..98e8360 100644 --- a/gym_locm/toolbox/trainer_draft.py +++ b/gym_locm/toolbox/trainer_draft.py @@ -15,7 +15,6 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # or any {'0', '1', '2'} import tensorflow as tf -import torch as th tf.get_logger().setLevel('INFO') tf.get_logger().setLevel(logging.ERROR) @@ -23,15 +22,11 @@ from stable_baselines import PPO2 from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy from stable_baselines.common.vec_env import VecEnv, DummyVecEnv -from stable_baselines3.common.vec_env import VecEnv as VecEnv3, DummyVecEnv as DummyVecEnv3 -from stable_baselines3.common.callbacks import BaseCallback -from sb3_contrib import MaskablePPO -from wandb.integration.sb3 import WandbCallback +from stable_baselines.common.callbacks import CallbackList, BaseCallback from gym_locm.agents import Agent, MaxAttackDraftAgent, MaxAttackBattleAgent, RLDraftAgent, RLBattleAgent -from gym_locm.envs import LOCMDraftSingleEnv, LOCMBattleSingleEnv +from gym_locm.envs import LOCMDraftSingleEnv from gym_locm.envs.draft import LOCMDraftSelfPlayEnv -from gym_locm.envs.battle import LOCMBattleSelfPlayEnv verbose = True REALLY_BIG_INT = 1_000_000_000 @@ -110,10 +105,7 @@ def __init__(self, task, model_builder, model_params, env_params, self.logger.debug("Initializing training env...") env = [] - if task == 'battle': - env_class = LOCMBattleSingleEnv - else: - env_class = LOCMDraftSingleEnv + env_class = LOCMDraftSingleEnv for i in range(num_envs): # no overlap between episodes at each concurrent env @@ -126,10 +118,7 @@ def __init__(self, task, model_builder, model_params, env_params, env.append(lambda: env_class(seed=current_seed, play_first=play_first, **env_params)) # wrap envs in a vectorized env - if task == 'battle': - self.env: VecEnv3 = DummyVecEnv3(env) - else: - self.env: VecEnv = DummyVecEnv(env) + self.env: VecEnv = DummyVecEnv(env) # initialize evaluator self.logger.debug("Initializing evaluator...") @@ -178,12 +167,7 @@ def _training_callback(self, _locals=None, _globals=None): self.logger.info(f"Evaluating model ({episodes_so_far} episodes)...") 
start_time = time.perf_counter() - if self.task == 'battle': - agent_class = RLBattleAgent - else: - agent_class = RLDraftAgent - - agent = agent_class(self.model) + agent = RLDraftAgent(self.model) mean_reward, ep_length, battle_length, act_hist = \ self.evaluator.run(agent, play_first=self.model.role_id == 0) @@ -208,24 +192,6 @@ def _training_callback(self, _locals=None, _globals=None): # write partial results to file self._save_results() - # upload stats to wandb, if enabled - if self.wandb_run: - info = dict(checkpoint=episodes_so_far, mean_reward=mean_reward, - win_rate=win_rate, mean_ep_length=ep_length, - mean_battle_length=battle_length) - - if self.task == 'battle': - info['pass_actions'] = act_hist[0] - info['summon_actions'] = sum(act_hist[1:17]) - - if self.env.get_attr('items', indices=[0])[0]: - info['use_actions'] = sum(act_hist[17:121]) - info['attack_actions'] = sum(act_hist[121:]) - else: - info['attack_actions'] = sum(act_hist[17:]) - - self.wandb_run.log(info) - # if training should end, return False to end training training_is_finished = episodes_so_far >= self.train_episodes @@ -238,16 +204,8 @@ def _train(self): # save and evaluate starting model self._training_callback() - if self.task == 'battle': - from stable_baselines3.common.callbacks import CallbackList - else: - from stable_baselines.common.callbacks import CallbackList - callbacks = [TrainingCallback(self._training_callback)] - if self.wandb_run: - callbacks.append(WandbCallback(gradient_save_freq=0, verbose=0)) - try: # train the model # note: dynamic learning or clip rates will require accurate # of timesteps @@ -279,10 +237,7 @@ def __init__(self, task, model_builder, model_params, env_params, self.logger.debug("Initializing training envs...") env = [] - if task == 'battle': - env_class = LOCMBattleSelfPlayEnv - else: - env_class = LOCMDraftSelfPlayEnv + env_class = LOCMDraftSelfPlayEnv for i in range(num_envs): # no overlap between episodes at each process @@ -295,10 +250,7 @@ def __init__(self, task, model_builder, model_params, env_params, env.append(lambda: env_class(seed=current_seed, play_first=True, **env_params)) # wrap envs in a vectorized env - if task == 'battle': - self.env: VecEnv3 = DummyVecEnv3(env) - else: - self.env: VecEnv = DummyVecEnv(env) + self.env: VecEnv = DummyVecEnv(env) # initialize parallel evaluating environments self.logger.debug("Initializing evaluation envs...") @@ -312,35 +264,21 @@ def __init__(self, task, model_builder, model_params, env_params, self.model.adversary = model_builder(self.env, seed, **model_params) # initialize parameters of adversary models accordingly - try: - self.model.adversary.load_parameters(self.model.get_parameters(), exact_match=True) - except AttributeError: - self.model.adversary.set_parameters(self.model.get_parameters(), exact_match=True) + self.model.adversary.load_parameters(self.model.get_parameters(), exact_match=True) # set adversary models as adversary policies of the self-play envs - if self.task == 'battle': - def make_adversary_policy(model, env): - def adversary_policy(obs): - actions, _ = model.adversary.predict( - obs, deterministic=True, action_masks=env.env_method('action_masks')[0]) + def make_adversary_policy(model, env): + def adversary_policy(obs): + zero_completed_obs = np.zeros((num_envs,) + env.observation_space.shape) + zero_completed_obs[0, :] = obs - return actions + actions, _ = model.adversary.predict(zero_completed_obs) - return adversary_policy - else: - def make_adversary_policy(model, env): - def 
adversary_policy(obs): - zero_completed_obs = np.zeros((num_envs,) + env.observation_space.shape) - zero_completed_obs[0, :] = obs + return actions[0] - actions, _ = model.adversary.predict(zero_completed_obs) + return adversary_policy - return actions[0] - - return adversary_policy - - self.env.set_attr('adversary_policy', - make_adversary_policy(self.model, self.env)) + self.env.set_attr('adversary_policy', make_adversary_policy(self.model, self.env)) # create necessary folders os.makedirs(self.path, exist_ok=True) @@ -390,10 +328,7 @@ def _training_callback(self, _locals=None, _globals=None): # save model model_path = self.path + f'/{episodes_so_far}' - if self.task == 'battle': - model.save(model_path, exclude=['adversary']) - else: - model.save(model_path) + model.save(model_path) save_model_as_json(model, self.params['activation'], model_path) self.logger.debug(f"Saved model at {model_path}.zip/json.") @@ -402,10 +337,7 @@ def _training_callback(self, _locals=None, _globals=None): self.logger.info(f"Evaluating model ({episodes_so_far} episodes)...") start_time = time.perf_counter() - if self.task == 'battle': - agent_class = RLBattleAgent - else: - agent_class = RLDraftAgent + agent_class = RLDraftAgent if self.evaluator.seed is not None: self.evaluator.seed = self.seed + self.train_episodes @@ -444,24 +376,6 @@ def _training_callback(self, _locals=None, _globals=None): # write partial results to file self._save_results() - # upload stats to wandb, if enabled - if self.wandb_run: - info = dict(checkpoint=episodes_so_far, mean_reward=mean_reward, - win_rate=win_rate, mean_ep_length=ep_length, - mean_battle_length=battle_length) - - if self.task == 'battle': - info['pass_actions'] = act_hist[0] - info['summon_actions'] = sum(act_hist[1:17]) - - if self.env.get_attr('items', indices=[0])[0]: - info['use_actions'] = sum(act_hist[17:121]) - info['attack_actions'] = sum(act_hist[121:]) - else: - info['attack_actions'] = sum(act_hist[17:]) - - self.wandb_run.log(info) - # if it is time to update the adversary model, do so if episodes_so_far >= model.next_switch: model.last_switch = episodes_so_far @@ -469,7 +383,6 @@ def _training_callback(self, _locals=None, _globals=None): # log training win rate at the time of the switch train_mean_reward = np.mean([np.mean(rewards) for rewards in model.env.env_method('get_episode_rewards')]) - self.wandb_run.log({'train_mean_reward': train_mean_reward}) self.logger.debug(f"Model trained for " f"{sum(model.env.get_attr('episodes'))} episodes. 
" @@ -480,10 +393,7 @@ def _training_callback(self, _locals=None, _globals=None): model.env.set_attr('rewards', [0.0], indices=[i]) # update parameters of adversary models - try: - model.adversary.load_parameters(model.get_parameters(), exact_match=True) - except AttributeError: - model.adversary.set_parameters(model.get_parameters(), exact_match=True) + model.adversary.load_parameters(model.get_parameters(), exact_match=True) self.logger.debug("Parameters of adversary network updated.") @@ -496,16 +406,8 @@ def _train(self): # save and evaluate starting models self._training_callback({'self': self.model}) - if self.task == 'battle': - from stable_baselines3.common.callbacks import CallbackList - else: - from stable_baselines.common.callbacks import CallbackList - callbacks = [TrainingCallback(self._training_callback)] - if self.wandb_run: - callbacks.append(WandbCallback(gradient_save_freq=0, verbose=0)) - try: self.logger.debug(f"Training will switch models every " f"{self.switch_frequency} episodes") @@ -547,10 +449,7 @@ def __init__(self, task, model_builder, model_params, env_params, self.logger.debug("Initializing training envs...") env1, env2 = [], [] - if task == 'battle': - env_class = LOCMBattleSelfPlayEnv - else: - env_class = LOCMDraftSelfPlayEnv + env_class = LOCMDraftSelfPlayEnv for i in range(num_envs): # no overlap between episodes at each process @@ -564,12 +463,8 @@ def __init__(self, task, model_builder, model_params, env_params, env2.append(lambda: env_class(seed=current_seed, play_first=False, **env_params)) # wrap envs in a vectorized env - if task == 'battle': - self.env1: VecEnv3 = DummyVecEnv3(env1) - self.env2: VecEnv3 = DummyVecEnv3(env2) - else: - self.env1: VecEnv = DummyVecEnv(env1) - self.env2: VecEnv = DummyVecEnv(env2) + self.env1: VecEnv = DummyVecEnv(env1) + self.env2: VecEnv = DummyVecEnv(env2) # initialize parallel evaluating environments self.logger.debug("Initializing evaluation envs...") @@ -585,34 +480,20 @@ def __init__(self, task, model_builder, model_params, env_params, self.model2.adversary = model_builder(self.env1, seed, **model_params) # initialize parameters of adversary models accordingly - try: - self.model1.adversary.load_parameters(self.model2.get_parameters(), exact_match=True) - self.model2.adversary.load_parameters(self.model1.get_parameters(), exact_match=True) - except AttributeError: - self.model1.adversary.set_parameters(self.model2.get_parameters(), exact_match=True) - self.model2.adversary.set_parameters(self.model1.get_parameters(), exact_match=True) + self.model1.adversary.load_parameters(self.model2.get_parameters(), exact_match=True) + self.model2.adversary.load_parameters(self.model1.get_parameters(), exact_match=True) # set adversary models as adversary policies of the self-play envs - if self.task == 'battle': - def make_adversary_policy(model, env): - def adversary_policy(obs): - actions, _ = model.adversary.predict( - obs, deterministic=True, action_masks=env.env_method('action_masks')[0]) - - return actions + def make_adversary_policy(model, env): + def adversary_policy(obs): + zero_completed_obs = np.zeros((num_envs,) + env.observation_space.shape) + zero_completed_obs[0, :] = obs - return adversary_policy - else: - def make_adversary_policy(model, env): - def adversary_policy(obs): - zero_completed_obs = np.zeros((num_envs,) + env.observation_space.shape) - zero_completed_obs[0, :] = obs + actions, _ = model.adversary.predict(zero_completed_obs) - actions, _ = model.adversary.predict(zero_completed_obs) + return 
actions[0] - return actions[0] - - return adversary_policy + return adversary_policy self.env1.set_attr('adversary_policy', make_adversary_policy(self.model1, self.env1)) @@ -647,6 +528,7 @@ def adversary_policy(obs): self.win_rates = [], [] self.episode_lengths = [], [] self.action_histograms = [], [] + self.battle_lengths = [], [] # log end time end_time = time.perf_counter() @@ -663,10 +545,7 @@ def _training_callback(self, _locals=None, _globals=None): # save model model_path = f'{self.path}/role{model.role_id}/{episodes_so_far}' - if self.task == 'battle': - model.save(model_path, exclude=['adversary']) - else: - model.save(model_path) + model.save(model_path) save_model_as_json(model, self.params['activation'], model_path) self.logger.debug(f"Saved model at {model_path}.zip/json.") @@ -676,10 +555,7 @@ def _training_callback(self, _locals=None, _globals=None): f"({episodes_so_far} episodes)...") start_time = time.perf_counter() - if self.task == 'battle': - agent_class = RLBattleAgent - else: - agent_class = RLDraftAgent + agent_class = RLDraftAgent mean_reward, ep_length, battle_length, act_hist = \ self.evaluator.run(agent_class(model), play_first=model.role_id == 0) @@ -704,28 +580,6 @@ def _training_callback(self, _locals=None, _globals=None): # write partial results to file self._save_results() - # upload stats to wandb, if enabled - if self.wandb_run: - info = { - 'checkpoint_' + model.role_id: episodes_so_far, - 'mean_reward_' + model.role_id: mean_reward, - 'win_rate_' + model.role_id: win_rate, - 'mean_ep_length_' + model.role_id: ep_length, - 'mean_battle_length_' + model.role_id: battle_length - } - - if self.task == 'battle': - info['pass_actions_' + model.role_id] = act_hist[0] - info['summon_actions_' + model.role_id] = sum(act_hist[1:17]) - - if model.env.get_attr('items', indices=[0])[0]: - info['use_actions'] = sum(act_hist[17:121]) - info['attack_actions'] = sum(act_hist[121:]) - else: - info['attack_actions'] = sum(act_hist[17:]) - - self.wandb_run.log(info) - # if training should end, return False to end training training_is_finished = episodes_so_far >= model.next_switch or episodes_so_far >= self.train_episodes @@ -740,11 +594,6 @@ def _train(self): self._training_callback({'self': self.model1}) self._training_callback({'self': self.model2}) - if self.task == 'battle': - from stable_baselines3.common.callbacks import CallbackList - else: - from stable_baselines.common.callbacks import CallbackList - try: self.logger.debug(f"Training will switch models every " f"{self.switch_frequency} episodes") @@ -752,10 +601,6 @@ def _train(self): callbacks1 = [TrainingCallback(lambda: self._training_callback({'self': self.model1}))] callbacks2 = [TrainingCallback(lambda: self._training_callback({'self': self.model2}))] - if self.wandb_run: - callbacks1.append(WandbCallback(gradient_save_freq=0, verbose=0)) - callbacks2.append(WandbCallback(gradient_save_freq=0, verbose=0)) - for _ in range(self.num_switches): # train the first player model self.model1.learn(total_timesteps=REALLY_BIG_INT, @@ -764,7 +609,6 @@ def _train(self): # log training win rate at the time of the switch train_mean_reward1 = np.mean([np.mean(rewards) for rewards in self.env1.env_method('get_episode_rewards')]) - self.wandb_run.log({'train_mean_reward_0': train_mean_reward1}) # reset training env rewards for i in range(self.env1.num_envs): @@ -782,7 +626,6 @@ def _train(self): # log training win rate at the time of the switch train_mean_reward2 = np.mean([np.mean(rewards) for rewards in 
self.env2.env_method('get_episode_rewards')]) - self.wandb_run.log({'train_mean_reward_1': train_mean_reward2}) # reset training env rewards for i in range(self.env2.num_envs): @@ -794,12 +637,8 @@ def _train(self): f"Switching to model {self.model1.role_id}.") # update parameters of adversary models - try: - self.model1.adversary.load_parameters(self.model2.get_parameters(), exact_match=True) - self.model2.adversary.load_parameters(self.model1.get_parameters(), exact_match=True) - except AttributeError: - self.model1.adversary.set_parameters(self.model2.get_parameters(), exact_match=True) - self.model2.adversary.set_parameters(self.model1.get_parameters(), exact_match=True) + self.model1.adversary.load_parameters(self.model2.get_parameters(), exact_match=True) + self.model2.adversary.load_parameters(self.model1.get_parameters(), exact_match=True) self.logger.debug("Parameters of adversary networks updated.") except KeyboardInterrupt: @@ -831,17 +670,11 @@ def __init__(self, task, env_params, episodes, seed, num_envs): # initialize parallel environments self.logger.debug("Initializing envs...") - if task == 'battle': - env_class = LOCMBattleSingleEnv - else: - env_class = LOCMDraftSingleEnv + env_class = LOCMDraftSingleEnv self.env = [lambda: env_class(**env_params) for _ in range(num_envs)] - if task == 'battle': - self.env: VecEnv3 = DummyVecEnv3(self.env) - else: - self.env: VecEnv = DummyVecEnv(self.env) + self.env: VecEnv = DummyVecEnv(self.env) # save parameters self.episodes = episodes @@ -961,12 +794,12 @@ def save_model_as_json(model, act_fun, path): def model_builder_mlp(env, seed, neurons, layers, activation, n_steps, nminibatches, - noptepochs, cliprange, vf_coef, ent_coef, learning_rate, + noptepochs, cliprange, vf_coef, ent_coef, learning_rate, gamma=1, tensorboard_log=None): net_arch = [neurons] * layers activation = dict(tanh=tf.nn.tanh, relu=tf.nn.relu, elu=tf.nn.elu)[activation] - return PPO2(MlpPolicy, env, verbose=0, gamma=1, seed=seed, + return PPO2(MlpPolicy, env, verbose=0, gamma=gamma, seed=seed, policy_kwargs=dict(net_arch=net_arch, act_fun=activation), n_steps=n_steps, nminibatches=nminibatches, noptepochs=noptepochs, cliprange=cliprange, @@ -975,12 +808,12 @@ def model_builder_mlp(env, seed, neurons, layers, activation, n_steps, nminibatc def model_builder_lstm(env, seed, neurons, layers, activation, n_steps, nminibatches, - noptepochs, cliprange, vf_coef, ent_coef, learning_rate, + noptepochs, cliprange, vf_coef, ent_coef, learning_rate, gamma=1, tensorboard_log=None): net_arch = ['lstm'] + [neurons] * (layers - 1) activation = dict(tanh=tf.nn.tanh, relu=tf.nn.relu, elu=tf.nn.elu)[activation] - return PPO2(MlpLstmPolicy, env, verbose=0, gamma=1, seed=seed, + return PPO2(MlpLstmPolicy, env, verbose=0, gamma=gamma, seed=seed, policy_kwargs=dict(net_arch=net_arch, n_lstm=neurons, act_fun=activation), n_steps=n_steps, nminibatches=nminibatches, noptepochs=noptepochs, cliprange=cliprange, @@ -989,20 +822,6 @@ def model_builder_lstm(env, seed, neurons, layers, activation, n_steps, nminibat tensorboard_log=tensorboard_log) -def model_builder_mlp_masked(env, seed, neurons, layers, activation, n_steps, - nminibatches, noptepochs, cliprange, vf_coef, ent_coef, - learning_rate, tensorboard_log=None): - net_arch = [neurons] * layers - activation = dict(tanh=th.nn.Tanh, relu=th.nn.ReLU, elu=th.nn.ELU)[activation] - - return MaskablePPO("MlpPolicy", env, learning_rate=learning_rate, n_steps=n_steps, - batch_size=nminibatches, n_epochs=noptepochs, gamma=1, - 
clip_range=cliprange, ent_coef=ent_coef, vf_coef=vf_coef, - verbose=0, seed=seed, - policy_kwargs=dict(net_arch=net_arch, activation_fn=activation), - tensorboard_log=tensorboard_log) - - if __name__ == '__main__': env_params = { 'battle_agents': (MaxAttackBattleAgent(), MaxAttackBattleAgent()), diff --git a/gym_locm/trained_models/README.md b/gym_locm/trained_models/README.md index 404e7ef..18cb713 100644 --- a/gym_locm/trained_models/README.md +++ b/gym_locm/trained_models/README.md @@ -2,13 +2,14 @@ In this folder, there are all draft agents trained and used in experiments for our paper and thesis. They are organized in the following folder structure: ``` -.../trained_models/<battle_agent>/<draft_approach>/<1st or 2nd player>/<file>.(zip|json) +.../trained_models/<battle_agent>/<draft_approach>-<training_episodes>/<1st or 2nd player>/<file>.(zip|json) ``` Where: -- `battle_agent` means which battle agent played the battles while they were being trained, +- `battle_agent` indicates which battle agent played the battles while they were being trained. It can be `max-attack` (MA) or `greedy` (OSL). - `draft_approach` is either `immediate` (disregards past picks), `history` (leverages past picks by enumerating them in the input) or `lstm` (leverages past picks via long short-term memory (LSTM) units). The `immediate` obtained best results. +- `training_episodes` is either `30k` or `1M`, meaning the number of training episodes used. - `1st or 2nd player` is either `1st` or `2nd`, meaning for which role they were trained. - `file` is a number from 1 to 10. For each combination of battle agent, draft approach and role, we trained ten different agents. @@ -22,13 +23,13 @@ The only use case implemented so far is replacing the draft portion of an AI pla To do so, execute `predictor.py` in one of the two scenarios below: 1. To use **different** models when playing first and second, with some AI player `player.py`. ``` - python3 gym_locm/toolbox/predictor.py --draft-1 gym_locm/trained_models/greedy/immediate/1st/4.json \ - --draft-2 gym_locm/trained_models/greedy/immediate/2nd/3.json \ + python3 gym_locm/toolbox/predictor.py --draft-1 gym_locm/trained_models/greedy/immediate-30k/1st/4.json \ + --draft-2 gym_locm/trained_models/greedy/immediate-30k/2nd/3.json \ --battle "python3 /path/to/player.py" ``` 2. To use the **same** model when playing first and second, with some AI player `player` ``` - python3 gym_locm/toolbox/predictor.py --draft gym_locm/trained_models/max-attack/history/1st/5.json \ + python3 gym_locm/toolbox/predictor.py --draft gym_locm/trained_models/max-attack/history-1M/1st/5.json \ --battle "./path/to/player" ``` The use of LSTM draft agents with predictor is not yet implemented. diff --git a/setup.py b/setup.py index 64113ce..e371427 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup setup(name='gym_locm', - version='1.2.0', + version='1.3.0', install_requires=['gym', 'numpy', 'prettytable', 'pexpect', 'sty'], extras_require={ 'experiments': ['numpy', 'scipy', 'stable_baselines', 'hyperopt',