bandit.py
from typing import Iterable

import numpy as np

class Bandit:
    """
    Simulates an n-armed bandit.

    Parameters
    ----------
    n : int
        Number of arms of the bandit
    alpha : float
        Step-size parameter of the exponential (recency-weighted) average. Takes a value from 0 to 1.
    action_values : array-like, optional
        True action values, one per arm. If omitted, they are drawn from independent
        unit-variance normal distributions centred at 1.
    observation_noise : float
        Standard deviation of the Gaussian noise added to the observed reward

    Methods
    -------
    get_reward(action)
        Returns the reward of the selected action
    update_estimate(action)
        Given a selected action, returns the reward and updates the estimate of the action value
    """

    def __init__(self, n: int, alpha: float = 0.1, action_values=None, observation_noise: float = 1.0):
        self.alpha = alpha
        self.observation_noise = observation_noise
        if action_values is None:
            # Draw the true action values from independent unit-variance normals centred at 1
            self.action_values = np.random.multivariate_normal(np.ones(n), np.eye(n))
        else:
            if len(action_values) != n:
                raise ValueError("Provide as many action values as bandit arms (n)")
            self.action_values = np.asarray(action_values, dtype=float)
        self.action_values_est = np.zeros(n)

    def get_reward(self, action: int) -> float:
        """
        Returns the reward of the selected action.

        Parameters
        ----------
        action : int
            Action index
        """
        # Observed reward = true action value + Gaussian observation noise
        return self.action_values[action] + np.random.normal(0, self.observation_noise)

    def update_estimate(self, action: int) -> float:
        """
        Given a selected action, returns the reward and updates the estimate of the action value.

        Parameters
        ----------
        action : int
            Action index
        """
        reward = self.get_reward(action)
        # Exponential recency-weighted update: Q <- Q + alpha * (reward - Q)
        self.action_values_est[action] += self.alpha * (reward - self.action_values_est[action])
        return reward


class EpsilonGreedy:
    """
    Implements the ε-greedy policy.

    Parameters
    ----------
    epsilon : float
        Exploration probability. With probability epsilon a random action is selected
        instead of the currently greedy action.

    Methods
    -------
    select_action(action_values)
        Returns an action based on the current action value estimates and epsilon
    """

    def __init__(self, epsilon: float):
        self.epsilon = epsilon

    def select_action(self, action_values: Iterable) -> int:
        """
        Returns an action based on the current action value estimates and epsilon.

        Parameters
        ----------
        action_values : Iterable
            Action values (true or estimated)
        """
        action_values = np.asarray(action_values)
        # Explore: with probability epsilon pick a uniformly random arm
        if np.random.random() < self.epsilon:
            return np.random.randint(action_values.shape[0])
        # Exploit: otherwise pick the arm with the highest value
        return np.argmax(action_values)
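

if __name__ == "__main__":
    # Minimal usage sketch: run an epsilon-greedy agent against the bandit for a
    # fixed number of steps. The arm count, step count, and epsilon below are
    # illustrative choices, not values prescribed by the module.
    n_arms = 10
    n_steps = 1000
    bandit = Bandit(n=n_arms, alpha=0.1, observation_noise=1.0)
    policy = EpsilonGreedy(epsilon=0.1)

    total_reward = 0.0
    for _ in range(n_steps):
        # Select an arm from the current value estimates, observe the reward,
        # and update the estimate for that arm.
        action = policy.select_action(bandit.action_values_est)
        total_reward += bandit.update_estimate(action)

    print(f"Average reward over {n_steps} steps: {total_reward / n_steps:.3f}")
    print(f"Best arm (true): {np.argmax(bandit.action_values)}, "
          f"best arm (estimated): {np.argmax(bandit.action_values_est)}")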