Commit
Move agent-code to agents.py along with some generalization. Updated policy-based examples notebook.
1 parent 4a6c611 · commit 897147e
Showing 7 changed files with 610 additions and 973 deletions.
@@ -0,0 +1,234 @@
from abc import ABC, abstractmethod

import numpy as np
import torch

class Agent(ABC, torch.nn.Module):
    def __init__(self,
                 inputs=4,
                 outputs=2,
                 optimizer=torch.optim.RMSprop,
                 lr=0.00025,
                 discount=0.99
                 ):
        """
        Base Agent for reinforcement learning.

        Parameters
        ----------
        inputs : int, optional
            Number of input nodes (observations).
        outputs : int, optional
            Number of output nodes (actions).
        optimizer : torch.optim.Optimizer, optional
            Optimizer used by the Agent to learn.
        lr : float, optional
            Learning rate for the optimizer.
        discount : float, optional
            Discount factor for future rewards.
            --> 0: only consider immediate rewards
            --> 1: consider all future rewards equally
        """
        super().__init__()

        # ARCHITECTURE
        # --------------------------------------------------

        self.layer_in = torch.nn.Linear(inputs, 20)
        self.layer_hidden = torch.nn.Linear(20, 80)
        self.layer_out = torch.nn.Linear(80, outputs)

        # LEARNING
        # --------------------------------------------------

        self.discount = discount
        self.optimizer = optimizer(self.parameters(), lr=lr)

        self.memory = {}

    def forward(self, state):
        """
        Forward pass with unmodified (raw) output.

        Parameters
        ----------
        state : torch.Tensor
            Observed state.

        Returns
        -------
        output : torch.Tensor
        """
        _output = torch.relu(self.layer_in(state))
        _output = torch.relu(self.layer_hidden(_output))
        output = self.layer_out(_output)

        return output

    @abstractmethod
    def action(self, state):
        """
        Abstract method for action selection.

        Parameters
        ----------
        state : torch.Tensor
            Observed state.

        Returns
        -------
        action : int
            Selected action.
        """
        pass

    @abstractmethod
    def learn(self):
        """
        Abstract method for learning.

        Returns
        -------
        float
            Either the gradient, loss, Q-value, etc.
        """
        pass

    @abstractmethod
    def memorize(self, *args):
        """
        Abstract method for memorizing.

        Parameters
        ----------
        *args
            Observation, action, reward, etc.
        """
        pass

class PolicyGradientAgent(Agent):
    def __init__(self,
                 inputs=4,
                 outputs=2,
                 optimizer=torch.optim.RMSprop,
                 lr=0.00025,
                 discount=0.99
                 ):
        """
        Policy-based Agent for reinforcement learning.

        Parameters
        ----------
        inputs : int, optional
            Number of input nodes (observations).
        outputs : int, optional
            Number of output nodes (actions).
        optimizer : torch.optim.Optimizer, optional
            Optimizer used by the Agent to learn.
        lr : float, optional
            Learning rate for the optimizer.
        discount : float, optional
            Discount factor for future rewards.
            --> 0: only consider immediate rewards
            --> 1: consider all future rewards equally
        """
        super().__init__(inputs, outputs, optimizer, lr, discount)

        self.memory["logarithm"] = []
        self.memory["reward"] = []

    def action(self, state):
        """
        Stochastic action selection.

        Parameters
        ----------
        state : torch.Tensor
            Observed state.

        Returns
        -------
        action : int
            Selected action.
        logarithm : torch.Tensor
            Logarithm of the selected action probability.
        """
        actions = torch.softmax(self(state), dim=-1)

        action = np.random.choice(range(actions.shape[0]), 1,
                                  p=actions.detach().numpy())[0]
        logarithm = torch.log(actions[action])

        return action, logarithm
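
    # Illustrative example (assumes a CartPole-style 4-dimensional state, not taken
    # from this commit):
    #   action, logarithm = agent.action(torch.tensor([0.02, -0.01, 0.03, 0.04]))
    # returns an int in {0, ..., outputs - 1} and the log-probability of that action.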

    def learn(self):
        """
        REINFORCE algorithm; a policy-gradient method applied to the most recent episode.

        Returns
        -------
        gradient : float

        Notes
        -----
        For the Agent to learn the optimal actions, it is common to evaluate the expected
        future rewards. The Agent can then adjust its predicted action probabilities
        (policy) so that this expected reward is maximized. This is done through the
        REINFORCE algorithm, which computes the policy gradient. Algorithm modified from:
        https://medium.com/@thechrisyoon/deriving-policy-gradients-and-implementing-reinforce-f887949bd63
        """
        rewards = torch.tensor(self.memory["reward"], dtype=torch.float32)

        # EXPECTED FUTURE REWARDS
        # --------------------------------------------------
        # The expected reward given an action is the sum of all future (discounted) rewards.
        # This is achieved by iterating over the rewards in reverse, adding each observed
        # reward to the discounted cumulative future reward. The rewards are then
        # standardized.

        _reward = 0
        for i in reversed(range(len(rewards))):
            _reward = _reward * self.discount + rewards[i]
            rewards[i] = _reward
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-9)
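
        # Illustrative check: with discount=0.9 and episode rewards [1, 1, 1], the reversed
        # accumulation gives [2.71, 1.9, 1.0] before standardization (each entry is its own
        # reward plus the discounted sum of the rewards that follow it).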

        # POLICY GRADIENT
        # --------------------------------------------------
        # The policy gradient is the gradient of the expected reward with respect to the
        # action taken (policy). It is computed by multiplying the logarithm of the selected
        # action probability (see the `action` method) with the standardized expected reward
        # calculated above. The overall gradient is then the sum of all these products.

        gradient = torch.zeros_like(rewards)
        for i, (logarithm, reward) in enumerate(zip(self.memory["logarithm"], rewards)):
            gradient[i] = -logarithm * reward
        gradient = gradient.sum()
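
        # In effect this builds the (negated) REINFORCE objective,
        #   loss = -sum_t log(pi(a_t | s_t)) * R_t,
        # where R_t is the standardized discounted return, so minimizing it performs
        # gradient ascent on the expected return.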

        # BACKPROPAGATION
        # --------------------------------------------------
        # The gradient is then used to update the Agent's policy. This is done by
        # backpropagating from the accumulated gradient and stepping the optimizer.

        self.optimizer.zero_grad()
        gradient.backward()
        self.optimizer.step()

        self.memory["logarithm"] = []
        self.memory["reward"] = []

        return gradient.item()

    def memorize(self, logarithm, reward):
        """
        Append the log-probability and reward to the Agent's memory.

        Parameters
        ----------
        logarithm : torch.Tensor
            Logarithm of the selected action probability.
        reward : float
            Reward from the chosen action.
        """
        self.memory["logarithm"].append(logarithm)
        self.memory["reward"].append(reward)