-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy paththompson_sampling.py
228 lines (156 loc) · 5.39 KB
/
thompson_sampling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
from coordination_graph import variable_elimination
import numpy as np
import pandas as pd
import scipy as sp
import random
import copy
class Random_policy():
    """
    Baseline policy that draws a random binary action for every agent.

    Methods
    -------
    pull(iter)
        Draw one random action (0 or 1) per agent.
    """

    def __init__(self, groups, n_agents, seed):
        """
        Parameters
        ----------
        groups : object
            Stored on the instance; not read by any method of this class.
        n_agents : int
            Number of agents. Agents are labeled 'A0', 'A1', ... in order.
        seed : int
            Seed applied to both NumPy's and Python's global random generators.
        """
        self._groups = groups
        self._n_agents = n_agents
        self.agents = [f'A{i}' for i in range(n_agents)]
        # Seeding is global: it affects every other user of np.random / random.
        random.seed(seed)
        np.random.seed(seed=seed)

    def pull(self, iter):
        """
        Parameters
        ----------
        iter : object
            Accepted for interface compatibility with the other policies; unused here.

        Returns
        -------
        pd.Series
            A joint arm with the agents' names as index and a value in {0, 1} per agent
        """
        # randint's upper bound is exclusive, so every draw is either 0 or 1.
        actions = np.random.randint(0, 2, size=self._n_agents)
        return pd.Series(actions, index=self.agents)
class ThompsonSampling():
    """
    Traditional Thompson sampling mechanism.

    Methods
    -------
    sample(mean=False)
        Sample a single value from each of the mean posteriors (or read their means).
    pull(iter)
        Pull an arm according to the probability matching mechanism of Thompson sampling.
    update(arm, reward)
        Update an arm's mean posterior with a given reward.
    """

    def __init__(self, arms, prior):
        """
        Parameters
        ----------
        arms : pd.DataFrame
            arms with entries labeled with the associated agent
        prior : list of objects with superclass 'posteriors.Posterior'
            prior for each arm (should be in the same order as arms)
        """
        self._arms = arms
        self._posteriors = prior  # Mean posteriors, one per row of `arms`

    def sample(self, mean=False):
        """
        Parameters
        ----------
        mean : bool
            If falsy, draw one sample from every posterior via ``sample()``;
            otherwise read every posterior's ``mean`` attribute instead.

        Returns
        -------
        pd.DataFrame
            A copy of the arms with an extra column 'mu' holding one value per arm.
        """
        if mean:
            values = [post.mean for post in self._posteriors]
        else:
            values = [post.sample() for post in self._posteriors]
        # Work on a copy so the stored arms never gain a 'mu' column.
        theta = self._arms.copy()
        theta['mu'] = values
        return theta

    def pull(self, iter):
        """
        Parameters
        ----------
        iter : object
            Unused; accepted so all policies share the same ``pull`` signature.

        Returns
        -------
        pd.DataFrame
            The arm(s) whose sampled mean is maximal, with the agents' names as
            columns. Contains one row per arm tied for the maximum.
        """
        # Sample
        means = self.sample()
        # Maximize
        is_best = means['mu'] == means['mu'].max()
        # drop() without inplace returns a new frame, so no write happens on a
        # slice of `means` (which would trigger pandas' SettingWithCopyWarning).
        return means.loc[is_best].drop(columns='mu')

    def update(self, arm, reward):
        """
        Parameters
        ----------
        arm : pd.Series
            arm with entries labeled with the associated agent
        reward : float
            reward received for executing the arm

        Raises
        ------
        IndexError
            If `arm` does not match any row of the stored arms.
        """
        # Positions of the rows that equal `arm` in every column.
        matches = np.where((self._arms == arm).all(axis=1))[0]
        if matches.size == 0:
            raise IndexError("arm does not match any of this sampler's arms")
        self._posteriors[matches[0]].update(reward)
class MultiAgentThompsonSampling():
    """
    Multi-agent Thompson sampling (MATS) mechanism (epsilon exploration)

    Methods
    -------
    sample(iter)
        For every group, sample from (or read the means of) the mean posteriors.
    pull(iter)
        Pull a joint arm according to the probability matching mechanism of MATS.
    update(joint_arm, local_rewards)
        Update every group's posterior for its local arm with the group's reward.
    """

    def __init__(self, groups, priors, epsilon, seed, algo, n_agents):
        """
        Parameters
        ----------
        groups : list of pd.DataFrame
            A data frame for each local group. The data frame consists of every possible local joint arm (rows) jointly over the agents (columns) within the group.
        priors : list of list of objects with superclass 'posteriors.Posterior'
            Each group has a list of priors, i.e., one for the mean of every local joint action.
        epsilon : float
            Threshold compared against a uniform draw in ``sample``; a draw at or
            below it makes a group use posterior samples instead of posterior means.
        seed : int
            Seed applied to both NumPy's and Python's global random generators.
        algo : object
            Stored on the instance; not read by any method of this class.
        n_agents : int
            Number of agents. Agents are labeled 'A0', 'A1', ... in order.
        """
        self._groups = groups
        self.n_agents = n_agents
        # Create local Thompson sampler per group
        samplers = []
        for local_arms, local_priors in zip(groups, priors):
            samplers.append(ThompsonSampling(local_arms, local_priors))
        self._groups_samplers = samplers
        self.epsilon = epsilon
        self.algo = algo
        # Seeding is global: it affects every other user of np.random / random.
        np.random.seed(seed=seed)
        random.seed(seed)
        self._n_agents = n_agents
        self.agents = [f'A{i}' for i in range(n_agents)]

    def sample(self, iter):
        """
        Parameters
        ----------
        iter : int
            Iteration counter; when it equals 0 every group uses posterior samples.

        Returns
        -------
        list of pd.DataFrame
            For every group, the frame returned by its sampler with the 'mu'
            column renamed to 'mu<group index>'.
        """
        theta = []
        # Sample per group
        for e, sampler in enumerate(self._groups_samplers):
            # One uniform draw is consumed per group, even on iteration 0.
            explore = random.random() <= self.epsilon
            use_mean = not (explore or iter == 0)
            theta_e = sampler.sample(use_mean)
            theta_e.rename(columns={'mu': f'mu{e}'}, inplace=True)
            theta.append(theta_e)
        return theta

    def pull(self, iter):
        """
        Parameters
        ----------
        iter : int
            Iteration counter, forwarded to ``sample``.

        Returns
        -------
        object
            Whatever ``variable_elimination`` returns for the per-group values
            (presumably a joint arm with the agents' names as labels - confirm
            against coordination_graph).
        """
        return variable_elimination(self.sample(iter))

    def update(self, joint_arm, local_rewards):
        """
        Parameters
        ----------
        joint_arm : pd.Series
            arm with entries labeled with the associated agent
        local_rewards : list of float
            For each group, the reward received for executing the local arm
        """
        per_group = zip(self._groups, self._groups_samplers, local_rewards)
        for group_arms, group_sampler, group_reward in per_group:
            # Restrict the joint arm to the agents that belong to this group.
            group_agents = group_arms.columns
            group_sampler.update(joint_arm[group_agents], group_reward)