Commit
Move agent-code to agents.py along with some generalization. Updated policy-based examples notebook.
1 parent 4a6c611 · commit 897147e
Showing 7 changed files with 610 additions and 973 deletions.
@@ -0,0 +1,234 @@
from abc import ABC, abstractmethod

import numpy as np
import torch

class Agent(ABC, torch.nn.Module):
    def __init__(self,
                 inputs=4,
                 outputs=2,
                 optimizer=torch.optim.RMSprop,
                 lr=0.00025,
                 discount=0.99
                 ):
        """
        Base Agent for reinforcement learning.

        Parameters
        ----------
        inputs : int, optional
            Number of input nodes (observations).
        outputs : int, optional
            Number of output nodes (actions).
        optimizer : torch.optim.Optimizer, optional
            Optimizer used by the Agent to learn.
        lr : float, optional
            Learning rate for the optimizer.
        discount : float, optional
            Discount factor for future rewards.
            --> 0: only consider immediate rewards
            --> 1: consider all future rewards equally
        """
        super().__init__()

        # ARCHITECTURE
        # --------------------------------------------------

        self.layer_in = torch.nn.Linear(inputs, 20)
        self.layer_hidden = torch.nn.Linear(20, 80)
        self.layer_out = torch.nn.Linear(80, outputs)

        # LEARNING
        # --------------------------------------------------

        self.discount = discount
        self.optimizer = optimizer(self.parameters(), lr=lr)

        self.memory = {}

    def forward(self, state):
        """
        Forward pass with unmodified (raw) output.

        Parameters
        ----------
        state : torch.Tensor
            Observed state.

        Returns
        -------
        output : torch.Tensor
        """
        _output = torch.relu(self.layer_in(state))
        _output = torch.relu(self.layer_hidden(_output))
        output = self.layer_out(_output)

        return output

    @abstractmethod
    def action(self, state):
        """
        Abstract method for action selection.

        Parameters
        ----------
        state : torch.Tensor
            Observed state.

        Returns
        -------
        action : int
            Selected action.
        """
        pass

    @abstractmethod
    def learn(self):
        """
        Abstract method for learning.

        Returns
        -------
        float
            Either the gradient, loss, Q-value, etc.
        """
        pass

    @abstractmethod
    def memorize(self, *args):
        """
        Abstract method for memorizing.

        Parameters
        ----------
        *args
            Observation, action, reward, etc.
        """
        pass

class PolicyGradientAgent(Agent):
    def __init__(self,
                 inputs=4,
                 outputs=2,
                 optimizer=torch.optim.RMSprop,
                 lr=0.00025,
                 discount=0.99
                 ):
        """
        Policy-based Agent for reinforcement learning.

        Parameters
        ----------
        inputs : int, optional
            Number of input nodes (observations).
        outputs : int, optional
            Number of output nodes (actions).
        optimizer : torch.optim.Optimizer, optional
            Optimizer used by the Agent to learn.
        lr : float, optional
            Learning rate for the optimizer.
        discount : float, optional
            Discount factor for future rewards.
            --> 0: only consider immediate rewards
            --> 1: consider all future rewards equally
        """
        super().__init__(inputs, outputs, optimizer, lr, discount)

        self.memory["logarithm"] = []
        self.memory["reward"] = []

    def action(self, state):
        """
        Stochastic action selection.

        Parameters
        ----------
        state : torch.Tensor
            Observed state.

        Returns
        -------
        action : int
            Selected action.
        logarithm : torch.Tensor
            Logarithm of the selected action probability.
        """
        actions = torch.softmax(self(state), dim=-1)

        action = np.random.choice(range(actions.shape[0]), 1,
                                  p=actions.detach().numpy())[0]
        logarithm = torch.log(actions[action])

        return action, logarithm
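
    # Illustrative example (assumes a CartPole-style 4-dimensional state, not taken
    # from this commit):
    #   action, logarithm = agent.action(torch.tensor([0.02, -0.01, 0.03, 0.04]))
    # returns an int in {0, ..., outputs - 1} and the log-probability of that action.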

    def learn(self):
        """
        REINFORCE algorithm; a policy-gradient method applied to the most recent episode.

        Returns
        -------
        gradient : float

        Notes
        -----
        For the Agent to learn the optimal actions, it is common to evaluate the expected
        future rewards. The Agent can then adjust its predicted action probabilities
        (policy) so that this expected reward is maximized. This is done through the
        REINFORCE algorithm, which computes the policy gradient. Algorithm modified from:
        https://medium.com/@thechrisyoon/deriving-policy-gradients-and-implementing-reinforce-f887949bd63
        """
        rewards = torch.tensor(self.memory["reward"], dtype=torch.float32)

        # EXPECTED FUTURE REWARDS
        # --------------------------------------------------
        # The expected reward given an action is the sum of all future (discounted) rewards.
        # This is achieved by iterating over the rewards in reverse, adding each observed
        # reward to the discounted cumulative future reward. The rewards are then
        # standardized.

        _reward = 0
        for i in reversed(range(len(rewards))):
            _reward = _reward * self.discount + rewards[i]
            rewards[i] = _reward
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-9)
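
        # Illustrative check: with discount=0.9 and episode rewards [1, 1, 1], the reversed
        # accumulation gives [2.71, 1.9, 1.0] before standardization (each entry is its own
        # reward plus the discounted sum of the rewards that follow it).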

        # POLICY GRADIENT
        # --------------------------------------------------
        # The policy gradient is the gradient of the expected reward with respect to the
        # action taken (policy). It is computed by multiplying the logarithm of the selected
        # action probability (see the `action` method) with the standardized expected reward
        # calculated above. The overall gradient is then the sum of all these products.

        gradient = torch.zeros_like(rewards)
        for i, (logarithm, reward) in enumerate(zip(self.memory["logarithm"], rewards)):
            gradient[i] = -logarithm * reward
        gradient = gradient.sum()
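
        # In effect this builds the (negated) REINFORCE objective,
        #   loss = -sum_t log(pi(a_t | s_t)) * R_t,
        # where R_t is the standardized discounted return, so minimizing it performs
        # gradient ascent on the expected return.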

        # BACKPROPAGATION
        # --------------------------------------------------
        # The gradient is then used to update the Agent's policy. This is done by
        # backpropagating from the accumulated gradient and stepping the optimizer.

        self.optimizer.zero_grad()
        gradient.backward()
        self.optimizer.step()

        self.memory["logarithm"] = []
        self.memory["reward"] = []

        return gradient.item()

    def memorize(self, logarithm, reward):
        """
        Append the log-probability and reward to the Agent's memory.

        Parameters
        ----------
        logarithm : torch.Tensor
            Logarithm of the selected action probability.
        reward : float
            Reward from the chosen action.
        """
        self.memory["logarithm"].append(logarithm)
        self.memory["reward"].append(reward)