Commit

Move agent-code to agents.py along with some generalization. Updated policy-based examples notebook.
hallvardnmbu committed Feb 5, 2024
1 parent 4a6c611 commit 897147e
Showing 7 changed files with 610 additions and 973 deletions.
234 changes: 234 additions & 0 deletions reinforcement-learning/agents.py
@@ -0,0 +1,234 @@
from abc import ABC, abstractmethod
import numpy as np
import torch


class Agent(ABC, torch.nn.Module):
    def __init__(self,
                 inputs=4,
                 outputs=2,
                 optimizer=torch.optim.RMSprop,
                 lr=0.00025,
                 discount=0.99
                 ):
        """
        Base Agent for reinforcement learning.

        Parameters
        ----------
        inputs : int, optional
            Number of input nodes (observations).
        outputs : int, optional
            Number of output nodes (actions).
        optimizer : torch.optim.X, optional
            Optimizer for the Agent to learn.
        lr : float, optional
            Learning rate for the optimizer.
        discount : float, optional
            Discount factor for future rewards.
            --> 0: only consider immediate rewards
            --> 1: consider all future rewards equally
        """
        super(Agent, self).__init__()

        # ARCHITECTURE
        # --------------------------------------------------

        self.layer_in = torch.nn.Linear(inputs, 20)
        self.layer_hidden = torch.nn.Linear(20, 80)
        self.layer_out = torch.nn.Linear(80, outputs)

        # LEARNING
        # --------------------------------------------------

        self.discount = discount
        self.optimizer = optimizer(self.parameters(), lr=lr)

        self.memory = {}

    def forward(self, state):
        """
        Forward pass with unmodified output.

        Parameters
        ----------
        state : torch.Tensor
            Observed state.

        Returns
        -------
        output : torch.Tensor
        """
        _output = torch.relu(self.layer_in(state))
        _output = torch.relu(self.layer_hidden(_output))
        output = self.layer_out(_output)

        return output

    @abstractmethod
    def action(self, state):
        """
        Abstract method for action selection.

        Parameters
        ----------
        state : torch.Tensor
            Observed state.

        Returns
        -------
        action : int
            Selected action.
        """
        pass

    @abstractmethod
    def learn(self):
        """
        Abstract method for learning.

        Returns
        -------
        float
            Either the gradient, loss, Q-value, etc.
        """
        pass

    @abstractmethod
    def memorize(self, *args):
        """
        Abstract method for memorizing.

        Parameters
        ----------
        *args : list
            Observation, action, reward, etc.
        """
        pass


class PolicyGradientAgent(Agent):
    def __init__(self,
                 inputs=4,
                 outputs=2,
                 optimizer=torch.optim.RMSprop,
                 lr=0.00025,
                 discount=0.99
                 ):
        """
        Policy-based Agent for reinforcement learning.

        Parameters
        ----------
        inputs : int, optional
            Number of input nodes (observations).
        outputs : int, optional
            Number of output nodes (actions).
        optimizer : torch.optim.X, optional
            Optimizer for the Agent to learn.
        lr : float, optional
            Learning rate for the optimizer.
        discount : float, optional
            Discount factor for future rewards.
            --> 0: only consider immediate rewards
            --> 1: consider all future rewards equally
        """
        super().__init__(inputs, outputs, optimizer, lr, discount)

        self.memory["logarithm"] = []
        self.memory["reward"] = []

    def action(self, state):
        """
        Stochastic action selection.

        Parameters
        ----------
        state : torch.Tensor
            Observed state.

        Returns
        -------
        action : int
            Selected action.
        logarithm : torch.Tensor
            Logarithm of the selected action probability.
        """
        actions = torch.softmax(self(state), dim=-1)

        action = np.random.choice(range(actions.shape[0]), 1,
                                  p=actions.detach().numpy())[0]
        logarithm = torch.log(actions[action])

        return action, logarithm

    def learn(self):
        """
        REINFORCE algorithm; a policy-gradient method, applied to the last game played.

        Returns
        -------
        gradient : float

        Notes
        -----
        In order for the Agent to best learn the optimal actions, it is common to evaluate the
        expected future rewards. Then, the Agent can adjust its predicted action probabilities
        (policy) so that this expected reward is maximized. This is done through the REINFORCE
        algorithm, which computes the policy gradient. Algorithm modified from:
        https://medium.com/@thechrisyoon/deriving-policy-gradients-and-implementing-reinforce-f887949bd63
        """
        rewards = torch.tensor(self.memory["reward"], dtype=torch.float32)

        # EXPECTED FUTURE REWARDS
        # --------------------------------------------------
        # The expected reward given an action is the sum of all future (discounted) rewards. This is
        # achieved by reversely adding the observed reward and the discounted cumulative future
        # rewards. The rewards are then standardized.

        _reward = 0
        for i in reversed(range(len(rewards))):
            _reward = _reward * self.discount + rewards[i]
            rewards[i] = _reward
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-9)
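        # In other words, the loop computes the discounted return recursively,
        #     G_t = r_t + discount * G_{t+1},
        # which is then shifted and scaled to zero mean and unit variance (a common
        # variance-reduction step for REINFORCE).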

        # POLICY GRADIENT
        # --------------------------------------------------
        # The policy gradient is the gradient of the expected reward with respect to the action
        # taken (policy). This is computed by multiplying the logarithm of the selected action
        # probability (see `action` method) with the standardized expected reward calculated
        # above. The overall gradient is then the sum of all these products.

        gradient = torch.zeros_like(rewards)
        for i, (logarithm, reward) in enumerate(zip(self.memory["logarithm"], rewards)):
            gradient[i] = -logarithm * reward
        gradient = gradient.sum()

        # BACKPROPAGATION
        # --------------------------------------------------
        # The gradient is then used to update the Agent's policy. This is done by backpropagating
        # with the optimizer using the gradient.

        self.optimizer.zero_grad()
        gradient.backward()
        self.optimizer.step()

        self.memory["logarithm"] = []
        self.memory["reward"] = []

        return gradient.item()

    def memorize(self, logarithm, reward):
        """
        Append the log-probability of the selected action and the obtained reward to memory.

        Parameters
        ----------
        logarithm : torch.Tensor
            Logarithm of the selected action probability.
        reward : int
            Reward from the chosen action.
        """
        self.memory["logarithm"].append(logarithm)
        self.memory["reward"].append(reward)
Binary file removed reinforcement-learning/cart-pole/policy-based.gif
Binary file not shown.
